{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999969938373666, "eval_steps": 500, "global_step": 16632, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.20073664, "auxiliary_loss_mlp": 1.09514689, "balance_loss_clip": 0.12871204, "balance_loss_mlp": 0.03704143, "epoch": 6.012325266796934e-05, "flos": 24462952254720.0, "grad_norm": 2868829.8988746256, "language_loss": 24.41374016, "learning_rate": 0.0, "loss": 16.9847374, "num_input_tokens_seen": 19155, "router_z_loss_clip": 71.90625, "router_z_loss_mlp": 1059.5, "step": 1, "time_per_iteration": 17.464494466781616 }, { "auxiliary_loss_clip": 0.13308005, "auxiliary_loss_mlp": 0.71657234, "balance_loss_clip": 0.08572355, "balance_loss_mlp": 0.02467782, "epoch": 0.00012024650533593868, "flos": 20231457598080.0, "grad_norm": 665202.6849804686, "language_loss": 16.12836075, "learning_rate": 4.4628432569317594e-07, "loss": 16.97801208, "num_input_tokens_seen": 36175, "router_z_loss_clip": 47.375, "router_z_loss_mlp": 692.5, "step": 2, "time_per_iteration": 2.7131378650665283 }, { "auxiliary_loss_clip": 0.13336843, "auxiliary_loss_mlp": 0.73071623, "balance_loss_clip": 0.0858888, "balance_loss_mlp": 0.02466154, "epoch": 0.000180369758003908, "flos": 22316532197760.0, "grad_norm": 111693.47316822955, "language_loss": 16.05858421, "learning_rate": 7.073439208833112e-07, "loss": 16.92266846, "num_input_tokens_seen": 54870, "router_z_loss_clip": 47.46875, "router_z_loss_mlp": 707.0, "step": 3, "time_per_iteration": 2.54451584815979 }, { "auxiliary_loss_clip": 0.13394475, "auxiliary_loss_mlp": 0.72687328, "balance_loss_clip": 0.0858528, "balance_loss_mlp": 0.02472486, "epoch": 0.00024049301067187735, "flos": 22420471587840.0, "grad_norm": 130763.61828990748, "language_loss": 15.92000198, "learning_rate": 8.925686513863519e-07, "loss": 16.78082085, "num_input_tokens_seen": 74575, "router_z_loss_clip": 48.0625, "router_z_loss_mlp": 703.0, "step": 4, "time_per_iteration": 2.5398571491241455 }, { "auxiliary_loss_clip": 0.13351913, "auxiliary_loss_mlp": 0.73019242, "balance_loss_clip": 0.08577777, "balance_loss_mlp": 0.02462606, "epoch": 0.0003006162633398467, "flos": 21403286547840.0, "grad_norm": 118092.05075564985, "language_loss": 16.01875877, "learning_rate": 1.0362401141348472e-06, "loss": 16.88246918, "num_input_tokens_seen": 92580, "router_z_loss_clip": 47.71875, "router_z_loss_mlp": 706.0, "step": 5, "time_per_iteration": 2.724152088165283 }, { "auxiliary_loss_clip": 0.13330972, "auxiliary_loss_mlp": 0.71458501, "balance_loss_clip": 0.08568175, "balance_loss_mlp": 0.02464359, "epoch": 0.000360739516007816, "flos": 21658725319680.0, "grad_norm": 654757.5819697931, "language_loss": 15.39735794, "learning_rate": 1.153628246576487e-06, "loss": 16.24525452, "num_input_tokens_seen": 109705, "router_z_loss_clip": 47.53125, "router_z_loss_mlp": 690.5, "step": 6, "time_per_iteration": 2.723098039627075 }, { "auxiliary_loss_clip": 0.13350911, "auxiliary_loss_mlp": 0.73311615, "balance_loss_clip": 0.08560076, "balance_loss_mlp": 0.02462004, "epoch": 0.0004208627686757854, "flos": 27166682407680.0, "grad_norm": 16048.732179209617, "language_loss": 15.14230919, "learning_rate": 1.2528784983718962e-06, "loss": 16.00893402, "num_input_tokens_seen": 129425, "router_z_loss_clip": 47.84375, "router_z_loss_mlp": 709.5, "step": 7, "time_per_iteration": 2.7535853385925293 }, { "auxiliary_loss_clip": 0.13365248, "auxiliary_loss_mlp": 0.73959029, "balance_loss_clip": 0.08570272, "balance_loss_mlp": 0.02474651, "epoch": 0.0004809860213437547, "flos": 31326727190400.0, "grad_norm": 29608.231569281303, "language_loss": 15.2764349, "learning_rate": 1.338852977079528e-06, "loss": 16.14967918, "num_input_tokens_seen": 149210, "router_z_loss_clip": 47.9375, "router_z_loss_mlp": 716.0, "step": 8, "time_per_iteration": 2.815904378890991 }, { "auxiliary_loss_clip": 0.13362633, "auxiliary_loss_mlp": 0.72973949, "balance_loss_clip": 0.08564633, "balance_loss_mlp": 0.02466134, "epoch": 0.000541109274011724, "flos": 32168541634560.0, "grad_norm": 9748.121759272448, "language_loss": 14.75504589, "learning_rate": 1.4146878417666224e-06, "loss": 15.61841106, "num_input_tokens_seen": 169055, "router_z_loss_clip": 47.90625, "router_z_loss_mlp": 706.0, "step": 9, "time_per_iteration": 2.799344778060913 }, { "auxiliary_loss_clip": 0.13306631, "auxiliary_loss_mlp": 0.73561567, "balance_loss_clip": 0.08548263, "balance_loss_mlp": 0.02467818, "epoch": 0.0006012325266796934, "flos": 18922845657600.0, "grad_norm": 6497.117698339061, "language_loss": 13.93240356, "learning_rate": 1.4825244398280232e-06, "loss": 14.80108547, "num_input_tokens_seen": 188045, "router_z_loss_clip": 47.53125, "router_z_loss_mlp": 712.0, "step": 10, "time_per_iteration": 2.7238149642944336 }, { "auxiliary_loss_clip": 0.13326591, "auxiliary_loss_mlp": 0.72933173, "balance_loss_clip": 0.08547086, "balance_loss_mlp": 0.02474184, "epoch": 0.0006613557793476627, "flos": 20780755038720.0, "grad_norm": 6578.2063676698, "language_loss": 13.58784485, "learning_rate": 1.5438901072051983e-06, "loss": 14.45044327, "num_input_tokens_seen": 207035, "router_z_loss_clip": 47.75, "router_z_loss_mlp": 705.0, "step": 11, "time_per_iteration": 2.779069185256958 }, { "auxiliary_loss_clip": 0.13335934, "auxiliary_loss_mlp": 0.73854089, "balance_loss_clip": 0.08561198, "balance_loss_mlp": 0.02467367, "epoch": 0.000721479032015632, "flos": 16587321603840.0, "grad_norm": 8289.423207680878, "language_loss": 12.80888844, "learning_rate": 1.5999125722696629e-06, "loss": 13.68078899, "num_input_tokens_seen": 223225, "router_z_loss_clip": 47.65625, "router_z_loss_mlp": 714.5, "step": 12, "time_per_iteration": 2.721177339553833 }, { "auxiliary_loss_clip": 0.1328945, "auxiliary_loss_mlp": 0.73649609, "balance_loss_clip": 0.08561218, "balance_loss_mlp": 0.02458198, "epoch": 0.0007816022846836014, "flos": 23812254305280.0, "grad_norm": 2683.6953426838113, "language_loss": 11.94140244, "learning_rate": 1.6514482443788434e-06, "loss": 12.81079197, "num_input_tokens_seen": 242570, "router_z_loss_clip": 47.28125, "router_z_loss_mlp": 712.5, "step": 13, "time_per_iteration": 2.880390167236328 }, { "auxiliary_loss_clip": 0.13300006, "auxiliary_loss_mlp": 0.73945683, "balance_loss_clip": 0.08571017, "balance_loss_mlp": 0.0246131, "epoch": 0.0008417255373515708, "flos": 19178284429440.0, "grad_norm": 1675.8387445492428, "language_loss": 11.13089561, "learning_rate": 1.6991628240650723e-06, "loss": 12.00335312, "num_input_tokens_seen": 261215, "router_z_loss_clip": 47.28125, "router_z_loss_mlp": 715.5, "step": 14, "time_per_iteration": 2.7245962619781494 }, { "auxiliary_loss_clip": 0.13376898, "auxiliary_loss_mlp": 0.75027448, "balance_loss_clip": 0.08592511, "balance_loss_mlp": 0.02468855, "epoch": 0.00090184879001954, "flos": 26402714006400.0, "grad_norm": 5776.048009526236, "language_loss": 11.36476898, "learning_rate": 1.7435840350181584e-06, "loss": 12.24881172, "num_input_tokens_seen": 280035, "router_z_loss_clip": 47.875, "router_z_loss_mlp": 726.0, "step": 15, "time_per_iteration": 2.8378286361694336 }, { "auxiliary_loss_clip": 0.13296548, "auxiliary_loss_mlp": 0.74440491, "balance_loss_clip": 0.0855456, "balance_loss_mlp": 0.02467833, "epoch": 0.0009619720426875094, "flos": 24686157663360.0, "grad_norm": 1521.8687773258057, "language_loss": 10.48291683, "learning_rate": 1.7851373027727038e-06, "loss": 11.36028671, "num_input_tokens_seen": 300265, "router_z_loss_clip": 47.34375, "router_z_loss_mlp": 721.0, "step": 16, "time_per_iteration": 2.77034854888916 }, { "auxiliary_loss_clip": 0.13309306, "auxiliary_loss_mlp": 0.75762796, "balance_loss_clip": 0.08564389, "balance_loss_mlp": 0.02471785, "epoch": 0.0010220952953554788, "flos": 18630454435200.0, "grad_norm": 2524.098416355543, "language_loss": 9.97653389, "learning_rate": 1.8241705979033208e-06, "loss": 10.86725521, "num_input_tokens_seen": 317375, "router_z_loss_clip": 47.4375, "router_z_loss_mlp": 733.5, "step": 17, "time_per_iteration": 5.926611661911011 }, { "auxiliary_loss_clip": 0.13298619, "auxiliary_loss_mlp": 0.75803769, "balance_loss_clip": 0.08551884, "balance_loss_mlp": 0.02463922, "epoch": 0.001082218548023448, "flos": 26150042419200.0, "grad_norm": 1826.7812157413425, "language_loss": 9.51778698, "learning_rate": 1.860972167459798e-06, "loss": 10.40881062, "num_input_tokens_seen": 337975, "router_z_loss_clip": 47.4375, "router_z_loss_mlp": 733.5, "step": 18, "time_per_iteration": 2.824043035507202 }, { "auxiliary_loss_clip": 0.13316831, "auxiliary_loss_mlp": 0.74243569, "balance_loss_clip": 0.08564163, "balance_loss_mlp": 0.02466224, "epoch": 0.0011423418006914173, "flos": 19615885977600.0, "grad_norm": 951.6333526514316, "language_loss": 8.42019558, "learning_rate": 1.89578346593066e-06, "loss": 9.29579926, "num_input_tokens_seen": 356635, "router_z_loss_clip": 47.5625, "router_z_loss_mlp": 718.0, "step": 19, "time_per_iteration": 4.240902662277222 }, { "auxiliary_loss_clip": 0.13302431, "auxiliary_loss_mlp": 0.74440825, "balance_loss_clip": 0.08565642, "balance_loss_mlp": 0.02468166, "epoch": 0.0012024650533593868, "flos": 17901258278400.0, "grad_norm": 3369.843349704104, "language_loss": 7.76120472, "learning_rate": 1.928808765521199e-06, "loss": 8.63863754, "num_input_tokens_seen": 375625, "router_z_loss_clip": 47.28125, "router_z_loss_mlp": 720.0, "step": 20, "time_per_iteration": 2.7386860847473145 }, { "auxiliary_loss_clip": 0.1334375, "auxiliary_loss_mlp": 0.76446283, "balance_loss_clip": 0.08570582, "balance_loss_mlp": 0.02471673, "epoch": 0.001262588306027356, "flos": 21258495492480.0, "grad_norm": 5048.265595293692, "language_loss": 8.15846729, "learning_rate": 1.9602224192552076e-06, "loss": 9.05636787, "num_input_tokens_seen": 394350, "router_z_loss_clip": 47.71875, "router_z_loss_mlp": 740.5, "step": 21, "time_per_iteration": 2.801339626312256 }, { "auxiliary_loss_clip": 0.13291712, "auxiliary_loss_mlp": 0.75857496, "balance_loss_clip": 0.08551858, "balance_loss_mlp": 0.02468826, "epoch": 0.0013227115586953253, "flos": 26111245178880.0, "grad_norm": 2843.6955508863693, "language_loss": 8.18930149, "learning_rate": 1.9901744328983746e-06, "loss": 9.08079338, "num_input_tokens_seen": 413255, "router_z_loss_clip": 47.34375, "router_z_loss_mlp": 735.0, "step": 22, "time_per_iteration": 2.791954517364502 }, { "auxiliary_loss_clip": 0.13294855, "auxiliary_loss_mlp": 0.74685329, "balance_loss_clip": 0.08561563, "balance_loss_mlp": 0.02468529, "epoch": 0.0013828348113632948, "flos": 23958177390720.0, "grad_norm": 1589.92113543109, "language_loss": 7.70024586, "learning_rate": 2.018794797290208e-06, "loss": 8.58004761, "num_input_tokens_seen": 433065, "router_z_loss_clip": 47.3125, "router_z_loss_mlp": 723.5, "step": 23, "time_per_iteration": 2.7276787757873535 }, { "auxiliary_loss_clip": 0.13283379, "auxiliary_loss_mlp": 0.75814998, "balance_loss_clip": 0.08537732, "balance_loss_mlp": 0.02475152, "epoch": 0.001442958064031264, "flos": 15965125511040.0, "grad_norm": 1044.5568324302108, "language_loss": 7.80618572, "learning_rate": 2.046196897962839e-06, "loss": 8.69717026, "num_input_tokens_seen": 451175, "router_z_loss_clip": 47.46875, "router_z_loss_mlp": 734.0, "step": 24, "time_per_iteration": 2.6865298748016357 }, { "auxiliary_loss_clip": 0.1323812, "auxiliary_loss_mlp": 0.75660467, "balance_loss_clip": 0.0854367, "balance_loss_mlp": 0.02467109, "epoch": 0.0015030813166992333, "flos": 18113287835520.0, "grad_norm": 2207.4953415637983, "language_loss": 7.2556057, "learning_rate": 2.0724802282696944e-06, "loss": 8.14459133, "num_input_tokens_seen": 468775, "router_z_loss_clip": 47.03125, "router_z_loss_mlp": 732.5, "step": 25, "time_per_iteration": 2.738314628601074 }, { "auxiliary_loss_clip": 0.13241503, "auxiliary_loss_mlp": 0.7565825, "balance_loss_clip": 0.08545682, "balance_loss_mlp": 0.02464887, "epoch": 0.0015632045693672028, "flos": 22240740579840.0, "grad_norm": 3768.983190695873, "language_loss": 7.89717627, "learning_rate": 2.0977325700720194e-06, "loss": 8.78617382, "num_input_tokens_seen": 488530, "router_z_loss_clip": 46.9375, "router_z_loss_mlp": 732.5, "step": 26, "time_per_iteration": 2.8125810623168945 }, { "auxiliary_loss_clip": 0.13250855, "auxiliary_loss_mlp": 0.79076016, "balance_loss_clip": 0.08555634, "balance_loss_mlp": 0.02464689, "epoch": 0.001623327822035172, "flos": 23999448326400.0, "grad_norm": 194.78777787363032, "language_loss": 8.28186131, "learning_rate": 2.122031762649933e-06, "loss": 9.20513058, "num_input_tokens_seen": 510495, "router_z_loss_clip": 46.90625, "router_z_loss_mlp": 765.5, "step": 27, "time_per_iteration": 2.8646275997161865 }, { "auxiliary_loss_clip": 0.13255733, "auxiliary_loss_mlp": 0.80354822, "balance_loss_clip": 0.08544519, "balance_loss_mlp": 0.02473963, "epoch": 0.0016834510747031415, "flos": 19682914844160.0, "grad_norm": 262.7816968698906, "language_loss": 7.06895828, "learning_rate": 2.1454471497582483e-06, "loss": 8.00506401, "num_input_tokens_seen": 528605, "router_z_loss_clip": 47.125, "router_z_loss_mlp": 778.5, "step": 28, "time_per_iteration": 2.7358648777008057 }, { "auxiliary_loss_clip": 0.13214478, "auxiliary_loss_mlp": 0.79759502, "balance_loss_clip": 0.08541814, "balance_loss_mlp": 0.02464583, "epoch": 0.0017435743273711108, "flos": 20930241922560.0, "grad_norm": 1230.4113853169245, "language_loss": 6.84652519, "learning_rate": 2.1680407726407727e-06, "loss": 7.77626514, "num_input_tokens_seen": 548515, "router_z_loss_clip": 46.78125, "router_z_loss_mlp": 771.5, "step": 29, "time_per_iteration": 2.753887176513672 }, { "auxiliary_loss_clip": 0.13207217, "auxiliary_loss_mlp": 0.80155778, "balance_loss_clip": 0.08527508, "balance_loss_mlp": 0.02470229, "epoch": 0.00180369758003908, "flos": 19533763376640.0, "grad_norm": 1139.8837732489794, "language_loss": 7.50153732, "learning_rate": 2.189868360711334e-06, "loss": 8.43516731, "num_input_tokens_seen": 564025, "router_z_loss_clip": 46.8125, "router_z_loss_mlp": 776.0, "step": 30, "time_per_iteration": 2.700237512588501 }, { "auxiliary_loss_clip": 0.13243839, "auxiliary_loss_mlp": 0.80398238, "balance_loss_clip": 0.08544054, "balance_loss_mlp": 0.02468548, "epoch": 0.0018638208327070496, "flos": 27460415295360.0, "grad_norm": 37310.92751794245, "language_loss": 6.40033245, "learning_rate": 2.2109801597326265e-06, "loss": 7.33675289, "num_input_tokens_seen": 583345, "router_z_loss_clip": 46.9375, "router_z_loss_mlp": 778.0, "step": 31, "time_per_iteration": 2.8351519107818604 }, { "auxiliary_loss_clip": 0.13250521, "auxiliary_loss_mlp": 0.80644637, "balance_loss_clip": 0.08546963, "balance_loss_mlp": 0.0247081, "epoch": 0.0019239440853750188, "flos": 13594535723520.0, "grad_norm": 79792.15432733182, "language_loss": 6.48489475, "learning_rate": 2.2314216284658796e-06, "loss": 7.42384624, "num_input_tokens_seen": 600010, "router_z_loss_clip": 46.96875, "router_z_loss_mlp": 780.5, "step": 32, "time_per_iteration": 2.924616813659668 }, { "auxiliary_loss_clip": 0.13232908, "auxiliary_loss_mlp": 0.81763446, "balance_loss_clip": 0.08556462, "balance_loss_mlp": 0.02466569, "epoch": 0.001984067338042988, "flos": 11258466618240.0, "grad_norm": 3981.3881552667613, "language_loss": 6.66624451, "learning_rate": 2.2512340280885094e-06, "loss": 7.6162076, "num_input_tokens_seen": 616295, "router_z_loss_clip": 46.75, "router_z_loss_mlp": 792.0, "step": 33, "time_per_iteration": 2.804166555404663 }, { "auxiliary_loss_clip": 0.1322647, "auxiliary_loss_mlp": 0.81283522, "balance_loss_clip": 0.08543752, "balance_loss_mlp": 0.02474929, "epoch": 0.0020441905907109576, "flos": 22393413918720.0, "grad_norm": 6324.884377847389, "language_loss": 6.88012981, "learning_rate": 2.270454923596497e-06, "loss": 7.82522964, "num_input_tokens_seen": 637640, "router_z_loss_clip": 46.8125, "router_z_loss_mlp": 787.0, "step": 34, "time_per_iteration": 2.8252575397491455 }, { "auxiliary_loss_clip": 0.13201287, "auxiliary_loss_mlp": 0.82014906, "balance_loss_clip": 0.08529929, "balance_loss_mlp": 0.02473894, "epoch": 0.0021043138433789266, "flos": 49788911427840.0, "grad_norm": 3052.1692121516103, "language_loss": 6.4965539, "learning_rate": 2.2891186125067434e-06, "loss": 7.44871569, "num_input_tokens_seen": 659710, "router_z_loss_clip": 46.75, "router_z_loss_mlp": 794.5, "step": 35, "time_per_iteration": 3.0796027183532715 }, { "auxiliary_loss_clip": 0.13256595, "auxiliary_loss_mlp": 0.81520545, "balance_loss_clip": 0.08541176, "balance_loss_mlp": 0.02467813, "epoch": 0.002164437096046896, "flos": 20564155434240.0, "grad_norm": 2505.715399099467, "language_loss": 6.40970325, "learning_rate": 2.307256493152974e-06, "loss": 7.3574748, "num_input_tokens_seen": 679670, "router_z_loss_clip": 47.09375, "router_z_loss_mlp": 789.5, "step": 36, "time_per_iteration": 2.7999444007873535 }, { "auxiliary_loss_clip": 0.13268444, "auxiliary_loss_mlp": 0.83140552, "balance_loss_clip": 0.08538517, "balance_loss_mlp": 0.02476493, "epoch": 0.0022245603487148656, "flos": 26549601413760.0, "grad_norm": 4199.2589276450035, "language_loss": 6.46337318, "learning_rate": 2.3248973825097614e-06, "loss": 7.42746305, "num_input_tokens_seen": 700170, "router_z_loss_clip": 47.21875, "router_z_loss_mlp": 806.0, "step": 37, "time_per_iteration": 2.7790980339050293 }, { "auxiliary_loss_clip": 0.13217646, "auxiliary_loss_mlp": 0.81081212, "balance_loss_clip": 0.0853001, "balance_loss_mlp": 0.02467934, "epoch": 0.0022846836013828346, "flos": 20344201666560.0, "grad_norm": 2422.2863004634614, "language_loss": 6.89877605, "learning_rate": 2.3420677916238357e-06, "loss": 7.8417654, "num_input_tokens_seen": 718545, "router_z_loss_clip": 46.84375, "router_z_loss_mlp": 785.5, "step": 38, "time_per_iteration": 2.725318193435669 }, { "auxiliary_loss_clip": 0.13204619, "auxiliary_loss_mlp": 0.81819534, "balance_loss_clip": 0.08533479, "balance_loss_mlp": 0.02473829, "epoch": 0.002344806854050804, "flos": 26254359152640.0, "grad_norm": 2071.304647577015, "language_loss": 6.83647823, "learning_rate": 2.358792165262154e-06, "loss": 7.78671932, "num_input_tokens_seen": 739865, "router_z_loss_clip": 46.6875, "router_z_loss_mlp": 792.5, "step": 39, "time_per_iteration": 2.7743327617645264 }, { "auxiliary_loss_clip": 0.13201952, "auxiliary_loss_mlp": 0.80107164, "balance_loss_clip": 0.0853734, "balance_loss_mlp": 0.02470449, "epoch": 0.0024049301067187736, "flos": 11806296612480.0, "grad_norm": 1614.6481425534148, "language_loss": 6.21051455, "learning_rate": 2.3750930912143747e-06, "loss": 7.14360571, "num_input_tokens_seen": 755770, "router_z_loss_clip": 46.625, "router_z_loss_mlp": 775.5, "step": 40, "time_per_iteration": 2.673217296600342 }, { "auxiliary_loss_clip": 0.13279408, "auxiliary_loss_mlp": 0.83091199, "balance_loss_clip": 0.08550765, "balance_loss_mlp": 0.02475969, "epoch": 0.0024650533593867426, "flos": 20637808773120.0, "grad_norm": 1521.0816828231002, "language_loss": 6.64805317, "learning_rate": 2.3909914837471044e-06, "loss": 7.61175919, "num_input_tokens_seen": 773440, "router_z_loss_clip": 47.28125, "router_z_loss_mlp": 805.5, "step": 41, "time_per_iteration": 2.8627371788024902 }, { "auxiliary_loss_clip": 0.13219067, "auxiliary_loss_mlp": 0.81333143, "balance_loss_clip": 0.08530853, "balance_loss_mlp": 0.02475721, "epoch": 0.002525176612054712, "flos": 18412093895040.0, "grad_norm": 399.70751901012807, "language_loss": 6.16824055, "learning_rate": 2.4065067449483835e-06, "loss": 7.11376238, "num_input_tokens_seen": 790455, "router_z_loss_clip": 46.875, "router_z_loss_mlp": 787.5, "step": 42, "time_per_iteration": 2.7327961921691895 }, { "auxiliary_loss_clip": 0.13247772, "auxiliary_loss_mlp": 0.81430697, "balance_loss_clip": 0.08529416, "balance_loss_mlp": 0.02475616, "epoch": 0.0025852998647226816, "flos": 28191582023040.0, "grad_norm": 1208.934616087294, "language_loss": 6.22202539, "learning_rate": 2.4216569070848724e-06, "loss": 7.16880989, "num_input_tokens_seen": 810645, "router_z_loss_clip": 47.125, "router_z_loss_mlp": 788.5, "step": 43, "time_per_iteration": 2.820488691329956 }, { "auxiliary_loss_clip": 0.13290325, "auxiliary_loss_mlp": 0.79381454, "balance_loss_clip": 0.08557805, "balance_loss_mlp": 0.02477159, "epoch": 0.0026454231173906506, "flos": 14288372657280.0, "grad_norm": 79.37091059236829, "language_loss": 5.88209057, "learning_rate": 2.4364587585915504e-06, "loss": 6.80880833, "num_input_tokens_seen": 827470, "router_z_loss_clip": 47.25, "router_z_loss_mlp": 768.5, "step": 44, "time_per_iteration": 2.7213616371154785 }, { "auxiliary_loss_clip": 0.13284674, "auxiliary_loss_mlp": 0.80063045, "balance_loss_clip": 0.08555049, "balance_loss_mlp": 0.02475151, "epoch": 0.00270554637005862, "flos": 22425796321920.0, "grad_norm": 744.7765774780087, "language_loss": 6.23250771, "learning_rate": 2.450927955901469e-06, "loss": 7.16598511, "num_input_tokens_seen": 847285, "router_z_loss_clip": 47.28125, "router_z_loss_mlp": 775.5, "step": 45, "time_per_iteration": 2.800656795501709 }, { "auxiliary_loss_clip": 0.13224691, "auxiliary_loss_mlp": 0.78446972, "balance_loss_clip": 0.08549362, "balance_loss_mlp": 0.02470412, "epoch": 0.0027656696227265896, "flos": 23992236875520.0, "grad_norm": 1433.4146902093082, "language_loss": 6.57723188, "learning_rate": 2.465079122983384e-06, "loss": 7.49394798, "num_input_tokens_seen": 867545, "router_z_loss_clip": 46.78125, "router_z_loss_mlp": 759.0, "step": 46, "time_per_iteration": 2.753516912460327 }, { "auxiliary_loss_clip": 0.13226762, "auxiliary_loss_mlp": 0.78698361, "balance_loss_clip": 0.0851877, "balance_loss_mlp": 0.02477654, "epoch": 0.0028257928753945586, "flos": 37678511220480.0, "grad_norm": 691.378984027035, "language_loss": 5.87511921, "learning_rate": 2.4789259401737868e-06, "loss": 6.79437065, "num_input_tokens_seen": 889915, "router_z_loss_clip": 47.0625, "router_z_loss_mlp": 761.5, "step": 47, "time_per_iteration": 2.905287981033325 }, { "auxiliary_loss_clip": 0.13228483, "auxiliary_loss_mlp": 0.79232681, "balance_loss_clip": 0.08518307, "balance_loss_mlp": 0.0247487, "epoch": 0.002885916128062528, "flos": 22460945909760.0, "grad_norm": 358.84443843043266, "language_loss": 5.9286871, "learning_rate": 2.492481223656015e-06, "loss": 6.85329866, "num_input_tokens_seen": 908975, "router_z_loss_clip": 47.0625, "router_z_loss_mlp": 766.5, "step": 48, "time_per_iteration": 2.729231595993042 }, { "auxiliary_loss_clip": 0.1320309, "auxiliary_loss_mlp": 0.78255546, "balance_loss_clip": 0.08525954, "balance_loss_mlp": 0.02474299, "epoch": 0.0029460393807304976, "flos": 27019543438080.0, "grad_norm": 505.93031580853744, "language_loss": 6.52600098, "learning_rate": 2.5057569967437924e-06, "loss": 7.440588, "num_input_tokens_seen": 929810, "router_z_loss_clip": 46.8125, "router_z_loss_mlp": 756.5, "step": 49, "time_per_iteration": 2.8463971614837646 }, { "auxiliary_loss_clip": 0.13241643, "auxiliary_loss_mlp": 0.75616109, "balance_loss_clip": 0.08534031, "balance_loss_mlp": 0.02471579, "epoch": 0.0030061626333984666, "flos": 15857328833280.0, "grad_norm": 292.3223525953008, "language_loss": 5.72161198, "learning_rate": 2.51876455396287e-06, "loss": 6.61018944, "num_input_tokens_seen": 948650, "router_z_loss_clip": 47.125, "router_z_loss_mlp": 732.5, "step": 50, "time_per_iteration": 2.7053110599517822 }, { "auxiliary_loss_clip": 0.13225487, "auxiliary_loss_mlp": 0.75810206, "balance_loss_clip": 0.08533954, "balance_loss_mlp": 0.0247036, "epoch": 0.003066285886066436, "flos": 31834292497920.0, "grad_norm": 611.1840994526164, "language_loss": 6.63009739, "learning_rate": 2.5315145187866316e-06, "loss": 7.52045441, "num_input_tokens_seen": 966455, "router_z_loss_clip": 46.84375, "router_z_loss_mlp": 734.0, "step": 51, "time_per_iteration": 2.824453830718994 }, { "auxiliary_loss_clip": 0.13236934, "auxiliary_loss_mlp": 0.76012999, "balance_loss_clip": 0.08534493, "balance_loss_mlp": 0.0247784, "epoch": 0.0031264091387344056, "flos": 41437110291840.0, "grad_norm": 362.24001641521465, "language_loss": 5.71871233, "learning_rate": 2.5440168957651953e-06, "loss": 6.61121178, "num_input_tokens_seen": 988110, "router_z_loss_clip": 47.03125, "router_z_loss_mlp": 736.0, "step": 52, "time_per_iteration": 2.8989784717559814 }, { "auxiliary_loss_clip": 0.13233, "auxiliary_loss_mlp": 0.74639487, "balance_loss_clip": 0.08529676, "balance_loss_mlp": 0.02471518, "epoch": 0.0031865323914023747, "flos": 23447719117440.0, "grad_norm": 1156.4830582475292, "language_loss": 5.53168821, "learning_rate": 2.5562811176888872e-06, "loss": 6.41041279, "num_input_tokens_seen": 1008550, "router_z_loss_clip": 47.03125, "router_z_loss_mlp": 722.5, "step": 53, "time_per_iteration": 2.895324945449829 }, { "auxiliary_loss_clip": 0.13253972, "auxiliary_loss_mlp": 0.73961848, "balance_loss_clip": 0.08527695, "balance_loss_mlp": 0.02477474, "epoch": 0.003246655644070344, "flos": 14434505377920.0, "grad_norm": 282.8242660489776, "language_loss": 5.04539824, "learning_rate": 2.5683160883431093e-06, "loss": 5.91755676, "num_input_tokens_seen": 1026840, "router_z_loss_clip": 47.25, "router_z_loss_mlp": 716.0, "step": 54, "time_per_iteration": 2.7772090435028076 }, { "auxiliary_loss_clip": 0.13245401, "auxiliary_loss_mlp": 0.74111426, "balance_loss_clip": 0.08528943, "balance_loss_mlp": 0.0248057, "epoch": 0.0033067788967383136, "flos": 35926972997760.0, "grad_norm": 997.6730208042513, "language_loss": 5.00962353, "learning_rate": 2.580130221340046e-06, "loss": 5.88319159, "num_input_tokens_seen": 1048875, "router_z_loss_clip": 47.125, "router_z_loss_mlp": 717.5, "step": 55, "time_per_iteration": 2.860889196395874 }, { "auxiliary_loss_clip": 0.13229325, "auxiliary_loss_mlp": 0.74299961, "balance_loss_clip": 0.08537658, "balance_loss_mlp": 0.02473791, "epoch": 0.003366902149406283, "flos": 22964108878080.0, "grad_norm": 13014.258295923459, "language_loss": 5.70745373, "learning_rate": 2.5917314754514246e-06, "loss": 6.58274603, "num_input_tokens_seen": 1066435, "router_z_loss_clip": 46.875, "router_z_loss_mlp": 719.5, "step": 56, "time_per_iteration": 2.7658848762512207 }, { "auxiliary_loss_clip": 0.13237409, "auxiliary_loss_mlp": 0.72102314, "balance_loss_clip": 0.08536699, "balance_loss_mlp": 0.02473408, "epoch": 0.003427025402074252, "flos": 26590830422400.0, "grad_norm": 541.506200338983, "language_loss": 6.32591534, "learning_rate": 2.6031273868139713e-06, "loss": 7.17931318, "num_input_tokens_seen": 1090330, "router_z_loss_clip": 46.96875, "router_z_loss_mlp": 697.0, "step": 57, "time_per_iteration": 4.981154680252075 }, { "auxiliary_loss_clip": 0.13219057, "auxiliary_loss_mlp": 0.72540784, "balance_loss_clip": 0.08520763, "balance_loss_mlp": 0.02472425, "epoch": 0.0034871486547422216, "flos": 23957967755520.0, "grad_norm": 417.4139941069721, "language_loss": 6.63180542, "learning_rate": 2.614325098333948e-06, "loss": 7.4894042, "num_input_tokens_seen": 1109840, "router_z_loss_clip": 46.96875, "router_z_loss_mlp": 701.5, "step": 58, "time_per_iteration": 4.221381902694702 }, { "auxiliary_loss_clip": 0.13277611, "auxiliary_loss_mlp": 0.71276021, "balance_loss_clip": 0.08549097, "balance_loss_mlp": 0.02477193, "epoch": 0.003547271907410191, "flos": 21221333406720.0, "grad_norm": 3634.8572800082643, "language_loss": 5.05342245, "learning_rate": 2.625331386578098e-06, "loss": 5.89895868, "num_input_tokens_seen": 1128415, "router_z_loss_clip": 47.28125, "router_z_loss_mlp": 688.5, "step": 59, "time_per_iteration": 2.743398904800415 }, { "auxiliary_loss_clip": 0.1328682, "auxiliary_loss_mlp": 0.72794712, "balance_loss_clip": 0.08546945, "balance_loss_mlp": 0.02482216, "epoch": 0.00360739516007816, "flos": 16509894831360.0, "grad_norm": 1899.601490149347, "language_loss": 5.9859457, "learning_rate": 2.63615268640451e-06, "loss": 6.84676075, "num_input_tokens_seen": 1146515, "router_z_loss_clip": 47.375, "router_z_loss_mlp": 704.0, "step": 60, "time_per_iteration": 2.6825168132781982 }, { "auxiliary_loss_clip": 0.13254309, "auxiliary_loss_mlp": 0.71475518, "balance_loss_clip": 0.08537132, "balance_loss_mlp": 0.0248138, "epoch": 0.0036675184127461296, "flos": 19471052995200.0, "grad_norm": 1199.54659592168, "language_loss": 5.98041821, "learning_rate": 2.6467951135575943e-06, "loss": 6.82771635, "num_input_tokens_seen": 1166330, "router_z_loss_clip": 47.15625, "router_z_loss_mlp": 690.5, "step": 61, "time_per_iteration": 2.697571039199829 }, { "auxiliary_loss_clip": 0.13208099, "auxiliary_loss_mlp": 0.71225941, "balance_loss_clip": 0.08522251, "balance_loss_mlp": 0.02475934, "epoch": 0.003727641665414099, "flos": 20963253231360.0, "grad_norm": 2245.4227927212323, "language_loss": 5.91332626, "learning_rate": 2.657264485425803e-06, "loss": 6.75766659, "num_input_tokens_seen": 1186010, "router_z_loss_clip": 46.78125, "router_z_loss_mlp": 688.0, "step": 62, "time_per_iteration": 2.713111162185669 }, { "auxiliary_loss_clip": 0.13215411, "auxiliary_loss_mlp": 0.71225369, "balance_loss_clip": 0.08534817, "balance_loss_mlp": 0.02475371, "epoch": 0.003787764918082068, "flos": 18412010040960.0, "grad_norm": 1230.9157460541894, "language_loss": 5.6049099, "learning_rate": 2.6675663401385186e-06, "loss": 6.44931793, "num_input_tokens_seen": 1204985, "router_z_loss_clip": 46.8125, "router_z_loss_mlp": 688.0, "step": 63, "time_per_iteration": 2.680806875228882 }, { "auxiliary_loss_clip": 0.1323109, "auxiliary_loss_mlp": 0.71321279, "balance_loss_clip": 0.08531044, "balance_loss_mlp": 0.02473621, "epoch": 0.0038478881707500376, "flos": 12464271198720.0, "grad_norm": 138.1077854658612, "language_loss": 5.19434977, "learning_rate": 2.677705954159056e-06, "loss": 6.03987312, "num_input_tokens_seen": 1223545, "router_z_loss_clip": 47.0, "router_z_loss_mlp": 689.0, "step": 64, "time_per_iteration": 2.7345190048217773 }, { "auxiliary_loss_clip": 0.13210054, "auxiliary_loss_mlp": 0.70640564, "balance_loss_clip": 0.08524624, "balance_loss_mlp": 0.02476507, "epoch": 0.003908011423418007, "flos": 13558463740800.0, "grad_norm": 114.58536441358979, "language_loss": 5.16492271, "learning_rate": 2.6876883585136904e-06, "loss": 6.00342894, "num_input_tokens_seen": 1241175, "router_z_loss_clip": 46.90625, "router_z_loss_mlp": 682.0, "step": 65, "time_per_iteration": 2.7209055423736572 }, { "auxiliary_loss_clip": 0.13168152, "auxiliary_loss_mlp": 0.70342863, "balance_loss_clip": 0.0852159, "balance_loss_mlp": 0.02471772, "epoch": 0.003968134676085976, "flos": 18339488732160.0, "grad_norm": 1418.4580577177485, "language_loss": 4.90314722, "learning_rate": 2.697518353781685e-06, "loss": 5.73825741, "num_input_tokens_seen": 1259315, "router_z_loss_clip": 46.40625, "router_z_loss_mlp": 679.0, "step": 66, "time_per_iteration": 2.7219431400299072 }, { "auxiliary_loss_clip": 0.13156243, "auxiliary_loss_mlp": 0.68734097, "balance_loss_clip": 0.08517988, "balance_loss_mlp": 0.02474336, "epoch": 0.004028257928753946, "flos": 20491466417280.0, "grad_norm": 714.1128533801146, "language_loss": 6.08120823, "learning_rate": 2.7072005239581103e-06, "loss": 6.90011215, "num_input_tokens_seen": 1277055, "router_z_loss_clip": 46.4375, "router_z_loss_mlp": 662.5, "step": 67, "time_per_iteration": 2.715069532394409 }, { "auxiliary_loss_clip": 0.13120888, "auxiliary_loss_mlp": 0.69322467, "balance_loss_clip": 0.0850184, "balance_loss_mlp": 0.02476763, "epoch": 0.004088381181421915, "flos": 18849863151360.0, "grad_norm": 192.60125516865367, "language_loss": 5.99375868, "learning_rate": 2.7167392492896727e-06, "loss": 6.81819248, "num_input_tokens_seen": 1294355, "router_z_loss_clip": 46.1875, "router_z_loss_mlp": 668.5, "step": 68, "time_per_iteration": 2.6923844814300537 }, { "auxiliary_loss_clip": 0.13132155, "auxiliary_loss_mlp": 0.68632603, "balance_loss_clip": 0.08494577, "balance_loss_mlp": 0.02470493, "epoch": 0.004148504434089885, "flos": 19433974763520.0, "grad_norm": 154.82886156939932, "language_loss": 5.72154331, "learning_rate": 2.7261387181735195e-06, "loss": 6.53919077, "num_input_tokens_seen": 1313525, "router_z_loss_clip": 46.375, "router_z_loss_mlp": 661.5, "step": 69, "time_per_iteration": 2.718217134475708 }, { "auxiliary_loss_clip": 0.13113901, "auxiliary_loss_mlp": 0.68434703, "balance_loss_clip": 0.08499341, "balance_loss_mlp": 0.02467903, "epoch": 0.004208627686757853, "flos": 20816868948480.0, "grad_norm": 2289.2375116863686, "language_loss": 6.8012948, "learning_rate": 2.7354029381999196e-06, "loss": 7.61678076, "num_input_tokens_seen": 1330505, "router_z_loss_clip": 46.09375, "router_z_loss_mlp": 659.5, "step": 70, "time_per_iteration": 2.692087173461914 }, { "auxiliary_loss_clip": 0.13087457, "auxiliary_loss_mlp": 0.67906868, "balance_loss_clip": 0.08496201, "balance_loss_mlp": 0.02477181, "epoch": 0.004268750939425823, "flos": 19104589163520.0, "grad_norm": 3363.249496275772, "language_loss": 5.55019951, "learning_rate": 2.7445357464116983e-06, "loss": 6.36014271, "num_input_tokens_seen": 1349615, "router_z_loss_clip": 45.8125, "router_z_loss_mlp": 654.0, "step": 71, "time_per_iteration": 2.70426082611084 }, { "auxiliary_loss_clip": 0.13806424, "auxiliary_loss_mlp": 0.59260559, "balance_loss_clip": 0.08872451, "balance_loss_mlp": 0.02473455, "epoch": 0.004328874192093792, "flos": 52456112340480.0, "grad_norm": 27.365087332216024, "language_loss": 0.75850952, "learning_rate": 2.75354081884615e-06, "loss": 1.48917937, "num_input_tokens_seen": 1410275, "router_z_loss_clip": 49.46875, "router_z_loss_mlp": 569.5, "step": 72, "time_per_iteration": 3.4798011779785156 }, { "auxiliary_loss_clip": 0.13816734, "auxiliary_loss_mlp": 0.58485699, "balance_loss_clip": 0.08875924, "balance_loss_mlp": 0.02479837, "epoch": 0.004388997444761762, "flos": 66495922260480.0, "grad_norm": 27.06855864438921, "language_loss": 0.70984924, "learning_rate": 2.7624216794188286e-06, "loss": 1.43287349, "num_input_tokens_seen": 1473020, "router_z_loss_clip": 49.5, "router_z_loss_mlp": 561.5, "step": 73, "time_per_iteration": 3.753922462463379 }, { "auxiliary_loss_clip": 0.13088223, "auxiliary_loss_mlp": 0.64395165, "balance_loss_clip": 0.08484054, "balance_loss_mlp": 0.02481099, "epoch": 0.004449120697429731, "flos": 18958959567360.0, "grad_norm": 213.49868519381914, "language_loss": 6.13619328, "learning_rate": 2.771181708202938e-06, "loss": 6.91102743, "num_input_tokens_seen": 1490385, "router_z_loss_clip": 46.0, "router_z_loss_mlp": 618.5, "step": 74, "time_per_iteration": 2.898646593093872 }, { "auxiliary_loss_clip": 0.1305978, "auxiliary_loss_mlp": 0.62825066, "balance_loss_clip": 0.08490635, "balance_loss_mlp": 0.02473507, "epoch": 0.004509243950097701, "flos": 21111817720320.0, "grad_norm": 111.73615936080266, "language_loss": 6.21756268, "learning_rate": 2.779824149153005e-06, "loss": 6.97641134, "num_input_tokens_seen": 1509725, "router_z_loss_clip": 45.6875, "router_z_loss_mlp": 602.0, "step": 75, "time_per_iteration": 2.6960086822509766 }, { "auxiliary_loss_clip": 0.13031921, "auxiliary_loss_mlp": 0.62532729, "balance_loss_clip": 0.08493434, "balance_loss_mlp": 0.02474137, "epoch": 0.004569367202765669, "flos": 20704082952960.0, "grad_norm": 208.11422081894122, "language_loss": 6.07705164, "learning_rate": 2.788352117317012e-06, "loss": 6.83269787, "num_input_tokens_seen": 1527245, "router_z_loss_clip": 45.40625, "router_z_loss_mlp": 599.0, "step": 76, "time_per_iteration": 2.7000839710235596 }, { "auxiliary_loss_clip": 0.13012415, "auxiliary_loss_mlp": 0.60529923, "balance_loss_clip": 0.08490606, "balance_loss_mlp": 0.0247328, "epoch": 0.004629490455433639, "flos": 28666136021760.0, "grad_norm": 68.27797233116135, "language_loss": 5.94905949, "learning_rate": 2.796768605577095e-06, "loss": 6.68448305, "num_input_tokens_seen": 1548930, "router_z_loss_clip": 45.1875, "router_z_loss_mlp": 580.0, "step": 77, "time_per_iteration": 2.8898820877075195 }, { "auxiliary_loss_clip": 0.12999924, "auxiliary_loss_mlp": 0.5911392, "balance_loss_clip": 0.08469264, "balance_loss_mlp": 0.02473293, "epoch": 0.004689613708101608, "flos": 11077142382720.0, "grad_norm": 76.0354455682205, "language_loss": 5.2337265, "learning_rate": 2.80507649095533e-06, "loss": 5.9548645, "num_input_tokens_seen": 1565695, "router_z_loss_clip": 45.28125, "router_z_loss_mlp": 567.5, "step": 78, "time_per_iteration": 2.724954605102539 }, { "auxiliary_loss_clip": 0.12948887, "auxiliary_loss_mlp": 0.57554126, "balance_loss_clip": 0.08460076, "balance_loss_mlp": 0.02475997, "epoch": 0.004749736960769578, "flos": 21805612727040.0, "grad_norm": 109.54045548408942, "language_loss": 4.90722847, "learning_rate": 2.813278540517843e-06, "loss": 5.61225843, "num_input_tokens_seen": 1582625, "router_z_loss_clip": 44.84375, "router_z_loss_mlp": 552.0, "step": 79, "time_per_iteration": 2.825841188430786 }, { "auxiliary_loss_clip": 0.12964176, "auxiliary_loss_mlp": 0.57404763, "balance_loss_clip": 0.08467558, "balance_loss_mlp": 0.02473124, "epoch": 0.004809860213437547, "flos": 19798803440640.0, "grad_norm": 59.55802984676335, "language_loss": 4.96699333, "learning_rate": 2.8213774169075505e-06, "loss": 5.67068291, "num_input_tokens_seen": 1601725, "router_z_loss_clip": 44.96875, "router_z_loss_mlp": 550.5, "step": 80, "time_per_iteration": 2.7265636920928955 }, { "auxiliary_loss_clip": 0.1294769, "auxiliary_loss_mlp": 0.53967392, "balance_loss_clip": 0.08467852, "balance_loss_mlp": 0.02478133, "epoch": 0.004869983466105517, "flos": 26580893713920.0, "grad_norm": 65.77693444526564, "language_loss": 5.44196844, "learning_rate": 2.829375683533245e-06, "loss": 6.11111927, "num_input_tokens_seen": 1622420, "router_z_loss_clip": 44.8125, "router_z_loss_mlp": 515.25, "step": 81, "time_per_iteration": 2.7202470302581787 }, { "auxiliary_loss_clip": 0.12890302, "auxiliary_loss_mlp": 0.52149391, "balance_loss_clip": 0.08454925, "balance_loss_mlp": 0.0246677, "epoch": 0.004930106718773485, "flos": 12828345189120.0, "grad_norm": 65.38563394862781, "language_loss": 4.32633924, "learning_rate": 2.8372758094402803e-06, "loss": 4.97673607, "num_input_tokens_seen": 1640715, "router_z_loss_clip": 44.34375, "router_z_loss_mlp": 496.25, "step": 82, "time_per_iteration": 2.7039830684661865 }, { "auxiliary_loss_clip": 0.12882753, "auxiliary_loss_mlp": 0.51274276, "balance_loss_clip": 0.08455303, "balance_loss_mlp": 0.02470566, "epoch": 0.004990229971441455, "flos": 25781901505920.0, "grad_norm": 75.47409186988203, "language_loss": 5.07570362, "learning_rate": 2.84508017388607e-06, "loss": 5.71727371, "num_input_tokens_seen": 1662210, "router_z_loss_clip": 44.25, "router_z_loss_mlp": 487.75, "step": 83, "time_per_iteration": 2.7550344467163086 }, { "auxiliary_loss_clip": 0.12884168, "auxiliary_loss_mlp": 0.48491812, "balance_loss_clip": 0.08472075, "balance_loss_mlp": 0.02471305, "epoch": 0.005050353224109424, "flos": 17463027824640.0, "grad_norm": 97.01321362759072, "language_loss": 4.5026207, "learning_rate": 2.852791070641559e-06, "loss": 5.11638021, "num_input_tokens_seen": 1681070, "router_z_loss_clip": 44.125, "router_z_loss_mlp": 460.0, "step": 84, "time_per_iteration": 2.7062366008758545 }, { "auxiliary_loss_clip": 0.13397604, "auxiliary_loss_mlp": 0.40453887, "balance_loss_clip": 0.0878343, "balance_loss_mlp": 0.02465607, "epoch": 0.005110476476777394, "flos": 69824607160320.0, "grad_norm": 20.768149428903122, "language_loss": 0.6453259, "learning_rate": 2.8604107120381682e-06, "loss": 1.18384075, "num_input_tokens_seen": 1747140, "router_z_loss_clip": 46.03125, "router_z_loss_mlp": 379.0, "step": 85, "time_per_iteration": 3.425076961517334 }, { "auxiliary_loss_clip": 0.127913, "auxiliary_loss_mlp": 0.43971938, "balance_loss_clip": 0.08451283, "balance_loss_mlp": 0.0246803, "epoch": 0.005170599729445363, "flos": 24796973088000.0, "grad_norm": 254.3132346802179, "language_loss": 4.64791298, "learning_rate": 2.8679412327780482e-06, "loss": 5.21554565, "num_input_tokens_seen": 1767475, "router_z_loss_clip": 43.40625, "router_z_loss_mlp": 415.0, "step": 86, "time_per_iteration": 2.791365385055542 }, { "auxiliary_loss_clip": 0.1280022, "auxiliary_loss_mlp": 0.40925997, "balance_loss_clip": 0.08443085, "balance_loss_mlp": 0.02473851, "epoch": 0.005230722982113333, "flos": 23264717800320.0, "grad_norm": 65.58987084955277, "language_loss": 5.13358688, "learning_rate": 2.8753846935240833e-06, "loss": 5.67084885, "num_input_tokens_seen": 1784980, "router_z_loss_clip": 43.59375, "router_z_loss_mlp": 384.25, "step": 87, "time_per_iteration": 2.6975181102752686 }, { "auxiliary_loss_clip": 0.12783667, "auxiliary_loss_mlp": 0.39117497, "balance_loss_clip": 0.08448306, "balance_loss_mlp": 0.02471991, "epoch": 0.005290846234781301, "flos": 16733622032640.0, "grad_norm": 86.62821297597523, "language_loss": 4.42919731, "learning_rate": 2.8827430842847267e-06, "loss": 4.94820881, "num_input_tokens_seen": 1803030, "router_z_loss_clip": 43.34375, "router_z_loss_mlp": 366.75, "step": 88, "time_per_iteration": 2.679661750793457 }, { "auxiliary_loss_clip": 0.12770416, "auxiliary_loss_mlp": 0.36628264, "balance_loss_clip": 0.08459744, "balance_loss_mlp": 0.02472993, "epoch": 0.005350969487449271, "flos": 20892283223040.0, "grad_norm": 77.99657475933643, "language_loss": 4.64495277, "learning_rate": 2.8900183276075957e-06, "loss": 5.1389389, "num_input_tokens_seen": 1822865, "router_z_loss_clip": 43.09375, "router_z_loss_mlp": 341.75, "step": 89, "time_per_iteration": 2.705355167388916 }, { "auxiliary_loss_clip": 0.1273649, "auxiliary_loss_mlp": 0.33960941, "balance_loss_clip": 0.08465879, "balance_loss_mlp": 0.02466799, "epoch": 0.00541109274011724, "flos": 26216568161280.0, "grad_norm": 154.3290879232552, "language_loss": 3.85976601, "learning_rate": 2.8972122815946455e-06, "loss": 4.32674026, "num_input_tokens_seen": 1842435, "router_z_loss_clip": 42.65625, "router_z_loss_mlp": 314.5, "step": 90, "time_per_iteration": 2.7374789714813232 }, { "auxiliary_loss_clip": 0.12763281, "auxiliary_loss_mlp": 0.31818292, "balance_loss_clip": 0.08447961, "balance_loss_mlp": 0.02472589, "epoch": 0.00547121599278521, "flos": 21184926007680.0, "grad_norm": 92.13562845778304, "language_loss": 4.86918736, "learning_rate": 2.90432674275074e-06, "loss": 5.31500244, "num_input_tokens_seen": 1860065, "router_z_loss_clip": 43.125, "router_z_loss_mlp": 293.25, "step": 91, "time_per_iteration": 2.7474875450134277 }, { "auxiliary_loss_clip": 0.12699604, "auxiliary_loss_mlp": 0.29665828, "balance_loss_clip": 0.08454539, "balance_loss_mlp": 0.02468564, "epoch": 0.005531339245453179, "flos": 19724856612480.0, "grad_norm": 112.67559316446786, "language_loss": 3.88830423, "learning_rate": 2.91136344867656e-06, "loss": 4.31195831, "num_input_tokens_seen": 1878135, "router_z_loss_clip": 42.4375, "router_z_loss_mlp": 272.5, "step": 92, "time_per_iteration": 2.714306116104126 }, { "auxiliary_loss_clip": 0.12730764, "auxiliary_loss_mlp": 0.27796683, "balance_loss_clip": 0.08462584, "balance_loss_mlp": 0.02467091, "epoch": 0.005591462498121149, "flos": 17641291386240.0, "grad_norm": 209.59504352740288, "language_loss": 4.04585791, "learning_rate": 2.918324080615938e-06, "loss": 4.4511323, "num_input_tokens_seen": 1894895, "router_z_loss_clip": 42.625, "router_z_loss_mlp": 253.25, "step": 93, "time_per_iteration": 2.6865789890289307 }, { "auxiliary_loss_clip": 0.12707059, "auxiliary_loss_mlp": 0.26600021, "balance_loss_clip": 0.08458553, "balance_loss_mlp": 0.02466719, "epoch": 0.005651585750789117, "flos": 20017415543040.0, "grad_norm": 119.31981343520108, "language_loss": 4.21546745, "learning_rate": 2.925210265866963e-06, "loss": 4.60853815, "num_input_tokens_seen": 1913220, "router_z_loss_clip": 42.46875, "router_z_loss_mlp": 241.125, "step": 94, "time_per_iteration": 2.7316513061523438 }, { "auxiliary_loss_clip": 0.13178506, "auxiliary_loss_mlp": 0.22774096, "balance_loss_clip": 0.08756742, "balance_loss_mlp": 0.02449389, "epoch": 0.005711709003457087, "flos": 59831202758400.0, "grad_norm": 14.129521265229965, "language_loss": 0.69451952, "learning_rate": 2.932023580065507e-06, "loss": 1.05404544, "num_input_tokens_seen": 1970970, "router_z_loss_clip": 44.25, "router_z_loss_mlp": 203.125, "step": 95, "time_per_iteration": 3.2335197925567627 }, { "auxiliary_loss_clip": 0.12688157, "auxiliary_loss_mlp": 0.23964925, "balance_loss_clip": 0.08467753, "balance_loss_mlp": 0.02468343, "epoch": 0.005771832256125056, "flos": 15564979537920.0, "grad_norm": 88.40426151672713, "language_loss": 3.69762373, "learning_rate": 2.9387655493491906e-06, "loss": 4.06415462, "num_input_tokens_seen": 1988930, "router_z_loss_clip": 42.1875, "router_z_loss_mlp": 215.125, "step": 96, "time_per_iteration": 4.176332473754883 }, { "auxiliary_loss_clip": 0.12648645, "auxiliary_loss_mlp": 0.22522679, "balance_loss_clip": 0.08446988, "balance_loss_mlp": 0.02466527, "epoch": 0.005831955508793026, "flos": 22534934664960.0, "grad_norm": 82.6454361993076, "language_loss": 4.51463318, "learning_rate": 2.9454376524092147e-06, "loss": 4.86634636, "num_input_tokens_seen": 2006285, "router_z_loss_clip": 42.0, "router_z_loss_mlp": 200.375, "step": 97, "time_per_iteration": 5.898942470550537 }, { "auxiliary_loss_clip": 0.12609984, "auxiliary_loss_mlp": 0.20984815, "balance_loss_clip": 0.08454677, "balance_loss_mlp": 0.02466748, "epoch": 0.005892078761460995, "flos": 22055600983680.0, "grad_norm": 91.88002301398242, "language_loss": 3.9030776, "learning_rate": 2.952041322436969e-06, "loss": 4.23902559, "num_input_tokens_seen": 2024905, "router_z_loss_clip": 41.5, "router_z_loss_mlp": 185.25, "step": 98, "time_per_iteration": 4.166609287261963 }, { "auxiliary_loss_clip": 0.12999861, "auxiliary_loss_mlp": 0.18133822, "balance_loss_clip": 0.08715963, "balance_loss_mlp": 0.02447787, "epoch": 0.005952202014128965, "flos": 68559865632000.0, "grad_norm": 12.184176378886272, "language_loss": 0.66453552, "learning_rate": 2.9585779489718204e-06, "loss": 0.9758724, "num_input_tokens_seen": 2086220, "router_z_loss_clip": 42.875, "router_z_loss_mlp": 156.625, "step": 99, "time_per_iteration": 3.455320358276367 }, { "auxiliary_loss_clip": 0.12551229, "auxiliary_loss_mlp": 0.18527201, "balance_loss_clip": 0.08451475, "balance_loss_mlp": 0.02462747, "epoch": 0.006012325266796933, "flos": 22966624500480.0, "grad_norm": 85.98169213815497, "language_loss": 4.00207853, "learning_rate": 2.9650488796560464e-06, "loss": 4.3128624, "num_input_tokens_seen": 2103365, "router_z_loss_clip": 40.9375, "router_z_loss_mlp": 160.5, "step": 100, "time_per_iteration": 2.7101457118988037 }, { "auxiliary_loss_clip": 0.12557042, "auxiliary_loss_mlp": 0.17203504, "balance_loss_clip": 0.08472102, "balance_loss_mlp": 0.02469618, "epoch": 0.006072448519464903, "flos": 17353721773440.0, "grad_norm": 83.38285847674591, "language_loss": 3.84722161, "learning_rate": 2.971455421902446e-06, "loss": 4.14482737, "num_input_tokens_seen": 2121995, "router_z_loss_clip": 40.8125, "router_z_loss_mlp": 147.125, "step": 101, "time_per_iteration": 2.6774775981903076 }, { "auxiliary_loss_clip": 0.1250564, "auxiliary_loss_mlp": 0.15857592, "balance_loss_clip": 0.08465028, "balance_loss_mlp": 0.02466478, "epoch": 0.006132571772132872, "flos": 24688044380160.0, "grad_norm": 126.46805182658686, "language_loss": 3.96392989, "learning_rate": 2.9777988444798075e-06, "loss": 4.24756241, "num_input_tokens_seen": 2141815, "router_z_loss_clip": 40.40625, "router_z_loss_mlp": 134.0, "step": 102, "time_per_iteration": 2.7474446296691895 }, { "auxiliary_loss_clip": 0.12488147, "auxiliary_loss_mlp": 0.14847699, "balance_loss_clip": 0.0846689, "balance_loss_mlp": 0.0246977, "epoch": 0.006192695024800842, "flos": 21471279736320.0, "grad_norm": 134.07957022415462, "language_loss": 3.74091101, "learning_rate": 2.9840803790210285e-06, "loss": 4.01426935, "num_input_tokens_seen": 2161125, "router_z_loss_clip": 40.21875, "router_z_loss_mlp": 123.625, "step": 103, "time_per_iteration": 2.7533724308013916 }, { "auxiliary_loss_clip": 0.12413687, "auxiliary_loss_mlp": 0.13732624, "balance_loss_clip": 0.08453064, "balance_loss_mlp": 0.02459431, "epoch": 0.006252818277468811, "flos": 17426117301120.0, "grad_norm": 205.77783718757073, "language_loss": 3.54425669, "learning_rate": 2.990301221458371e-06, "loss": 3.80571985, "num_input_tokens_seen": 2179510, "router_z_loss_clip": 39.625, "router_z_loss_mlp": 112.8125, "step": 104, "time_per_iteration": 2.696789503097534 }, { "auxiliary_loss_clip": 0.12437286, "auxiliary_loss_mlp": 0.13317139, "balance_loss_clip": 0.08486864, "balance_loss_mlp": 0.02458986, "epoch": 0.006312941530136781, "flos": 19105679266560.0, "grad_norm": 106.65157951415281, "language_loss": 3.23600483, "learning_rate": 2.9964625333900544e-06, "loss": 3.49354887, "num_input_tokens_seen": 2197870, "router_z_loss_clip": 39.46875, "router_z_loss_mlp": 108.6875, "step": 105, "time_per_iteration": 2.8186123371124268 }, { "auxiliary_loss_clip": 0.12429639, "auxiliary_loss_mlp": 0.12771098, "balance_loss_clip": 0.08482035, "balance_loss_mlp": 0.02468364, "epoch": 0.006373064782804749, "flos": 24067651150080.0, "grad_norm": 213.99846714159625, "language_loss": 3.50973082, "learning_rate": 3.002565443382063e-06, "loss": 3.76173782, "num_input_tokens_seen": 2217495, "router_z_loss_clip": 39.4375, "router_z_loss_mlp": 103.0, "step": 106, "time_per_iteration": 2.7928006649017334 }, { "auxiliary_loss_clip": 0.12367644, "auxiliary_loss_mlp": 0.12222727, "balance_loss_clip": 0.08467251, "balance_loss_mlp": 0.02457101, "epoch": 0.006433188035472719, "flos": 18338272848000.0, "grad_norm": 154.55891296759967, "language_loss": 2.81608582, "learning_rate": 3.008611048208843e-06, "loss": 3.06198931, "num_input_tokens_seen": 2236520, "router_z_loss_clip": 38.96875, "router_z_loss_mlp": 97.5625, "step": 107, "time_per_iteration": 2.7040483951568604 }, { "auxiliary_loss_clip": 0.12712547, "auxiliary_loss_mlp": 0.10412, "balance_loss_clip": 0.08728987, "balance_loss_mlp": 0.02446912, "epoch": 0.006493311288140688, "flos": 62583266257920.0, "grad_norm": 8.186979949401213, "language_loss": 0.65631682, "learning_rate": 3.014600414036285e-06, "loss": 0.88756233, "num_input_tokens_seen": 2300140, "router_z_loss_clip": 39.875, "router_z_loss_mlp": 79.5625, "step": 108, "time_per_iteration": 3.375021457672119 }, { "auxiliary_loss_clip": 0.12343232, "auxiliary_loss_mlp": 0.10634232, "balance_loss_clip": 0.08481926, "balance_loss_mlp": 0.02467728, "epoch": 0.006553434540808658, "flos": 19506202583040.0, "grad_norm": 184.83889451415823, "language_loss": 3.79070783, "learning_rate": 3.0205345775501937e-06, "loss": 4.02048206, "num_input_tokens_seen": 2317320, "router_z_loss_clip": 38.5625, "router_z_loss_mlp": 81.625, "step": 109, "time_per_iteration": 2.7063653469085693 }, { "auxiliary_loss_clip": 0.12333325, "auxiliary_loss_mlp": 0.09804593, "balance_loss_clip": 0.08488195, "balance_loss_mlp": 0.02468168, "epoch": 0.006613557793476627, "flos": 21111398449920.0, "grad_norm": 147.25328400998768, "language_loss": 2.94254875, "learning_rate": 3.0264145470332218e-06, "loss": 3.16392803, "num_input_tokens_seen": 2337820, "router_z_loss_clip": 38.4375, "router_z_loss_mlp": 73.25, "step": 110, "time_per_iteration": 2.793097734451294 }, { "auxiliary_loss_clip": 0.12337886, "auxiliary_loss_mlp": 0.09300542, "balance_loss_clip": 0.08477179, "balance_loss_mlp": 0.02464605, "epoch": 0.006673681046144597, "flos": 26037843402240.0, "grad_norm": 147.00972456675896, "language_loss": 3.59402847, "learning_rate": 3.032241303393073e-06, "loss": 3.81041265, "num_input_tokens_seen": 2358560, "router_z_loss_clip": 38.625, "router_z_loss_mlp": 68.4375, "step": 111, "time_per_iteration": 2.7643141746520996 }, { "auxiliary_loss_clip": 0.12284013, "auxiliary_loss_mlp": 0.09034258, "balance_loss_clip": 0.08471591, "balance_loss_mlp": 0.02466875, "epoch": 0.006733804298812566, "flos": 23154279719040.0, "grad_norm": 259.49700461079533, "language_loss": 3.55302191, "learning_rate": 3.0380158011446e-06, "loss": 3.7662046, "num_input_tokens_seen": 2379005, "router_z_loss_clip": 38.09375, "router_z_loss_mlp": 65.71875, "step": 112, "time_per_iteration": 2.7240653038024902 }, { "auxiliary_loss_clip": 0.12222753, "auxiliary_loss_mlp": 0.08641934, "balance_loss_clip": 0.08472199, "balance_loss_mlp": 0.02456021, "epoch": 0.006793927551480535, "flos": 11769092599680.0, "grad_norm": 181.88405501542795, "language_loss": 2.91473556, "learning_rate": 3.0437389693482466e-06, "loss": 3.12338257, "num_input_tokens_seen": 2395610, "router_z_loss_clip": 37.5, "router_z_loss_mlp": 61.84375, "step": 113, "time_per_iteration": 2.682826280593872 }, { "auxiliary_loss_clip": 0.1225155, "auxiliary_loss_mlp": 0.08362015, "balance_loss_clip": 0.08478779, "balance_loss_mlp": 0.02459915, "epoch": 0.006854050804148504, "flos": 19177990940160.0, "grad_norm": 199.38252994593094, "language_loss": 3.13487625, "learning_rate": 3.0494117125071475e-06, "loss": 3.34101176, "num_input_tokens_seen": 2415005, "router_z_loss_clip": 37.6875, "router_z_loss_mlp": 59.03125, "step": 114, "time_per_iteration": 2.714510679244995 }, { "auxiliary_loss_clip": 0.12178786, "auxiliary_loss_mlp": 0.08149155, "balance_loss_clip": 0.08470957, "balance_loss_mlp": 0.02436265, "epoch": 0.006914174056816474, "flos": 21988488263040.0, "grad_norm": 302.9237649905414, "language_loss": 3.39552879, "learning_rate": 3.055034911425055e-06, "loss": 3.59880805, "num_input_tokens_seen": 2433965, "router_z_loss_clip": 37.125, "router_z_loss_mlp": 57.1875, "step": 115, "time_per_iteration": 2.7574901580810547 }, { "auxiliary_loss_clip": 0.12134635, "auxiliary_loss_mlp": 0.08105126, "balance_loss_clip": 0.08458051, "balance_loss_mlp": 0.02450218, "epoch": 0.006974297309484443, "flos": 16294636892160.0, "grad_norm": 131.17593239184845, "language_loss": 2.87884021, "learning_rate": 3.0606094240271244e-06, "loss": 3.08123779, "num_input_tokens_seen": 2451605, "router_z_loss_clip": 36.78125, "router_z_loss_mlp": 56.5625, "step": 116, "time_per_iteration": 2.8412928581237793 }, { "auxiliary_loss_clip": 0.12087503, "auxiliary_loss_mlp": 0.08003856, "balance_loss_clip": 0.08450261, "balance_loss_mlp": 0.02440502, "epoch": 0.007034420562152413, "flos": 26111161324800.0, "grad_norm": 405.2571024551994, "language_loss": 3.14370418, "learning_rate": 3.0661360861454656e-06, "loss": 3.34461784, "num_input_tokens_seen": 2472035, "router_z_loss_clip": 36.40625, "router_z_loss_mlp": 55.6875, "step": 117, "time_per_iteration": 2.7617461681365967 }, { "auxiliary_loss_clip": 0.12120166, "auxiliary_loss_mlp": 0.07676817, "balance_loss_clip": 0.08469536, "balance_loss_mlp": 0.02446105, "epoch": 0.007094543814820382, "flos": 14208933386880.0, "grad_norm": 636.9911447990372, "language_loss": 3.02098989, "learning_rate": 3.071615712271274e-06, "loss": 3.21895981, "num_input_tokens_seen": 2489285, "router_z_loss_clip": 36.53125, "router_z_loss_mlp": 52.3125, "step": 118, "time_per_iteration": 2.6711666584014893 }, { "auxiliary_loss_clip": 0.12035839, "auxiliary_loss_mlp": 0.0741428, "balance_loss_clip": 0.08468533, "balance_loss_mlp": 0.02421604, "epoch": 0.007154667067488351, "flos": 14981329123200.0, "grad_norm": 415.38909469578357, "language_loss": 3.29147959, "learning_rate": 3.0770490962752172e-06, "loss": 3.48598075, "num_input_tokens_seen": 2506460, "router_z_loss_clip": 35.6875, "router_z_loss_mlp": 49.875, "step": 119, "time_per_iteration": 2.693372964859009 }, { "auxiliary_loss_clip": 0.12065814, "auxiliary_loss_mlp": 0.07500771, "balance_loss_clip": 0.08466005, "balance_loss_mlp": 0.02419594, "epoch": 0.00721479032015632, "flos": 20199452538240.0, "grad_norm": 187.30524612195882, "language_loss": 3.00418305, "learning_rate": 3.082437012097686e-06, "loss": 3.19984889, "num_input_tokens_seen": 2525565, "router_z_loss_clip": 36.0625, "router_z_loss_mlp": 50.78125, "step": 120, "time_per_iteration": 2.695387840270996 }, { "auxiliary_loss_clip": 0.12038741, "auxiliary_loss_mlp": 0.07134814, "balance_loss_clip": 0.08453342, "balance_loss_mlp": 0.02429003, "epoch": 0.00727491357282429, "flos": 23153650813440.0, "grad_norm": 255.9983318466725, "language_loss": 3.3493042, "learning_rate": 3.0877802144103967e-06, "loss": 3.54103971, "num_input_tokens_seen": 2546605, "router_z_loss_clip": 35.875, "router_z_loss_mlp": 47.0625, "step": 121, "time_per_iteration": 2.7926671504974365 }, { "auxiliary_loss_clip": 0.12008427, "auxiliary_loss_mlp": 0.0698216, "balance_loss_clip": 0.08456004, "balance_loss_mlp": 0.02422834, "epoch": 0.007335036825492259, "flos": 15526811203200.0, "grad_norm": 577.3530992130394, "language_loss": 3.01833296, "learning_rate": 3.09307943925077e-06, "loss": 3.20823908, "num_input_tokens_seen": 2560730, "router_z_loss_clip": 35.5625, "router_z_loss_mlp": 45.625, "step": 122, "time_per_iteration": 2.8421542644500732 }, { "auxiliary_loss_clip": 0.11967987, "auxiliary_loss_mlp": 0.06668985, "balance_loss_clip": 0.0844539, "balance_loss_mlp": 0.02411783, "epoch": 0.007395160078160229, "flos": 24250233196800.0, "grad_norm": 520.5228525099532, "language_loss": 3.28371286, "learning_rate": 3.0983354046304154e-06, "loss": 3.47008276, "num_input_tokens_seen": 2579550, "router_z_loss_clip": 35.1875, "router_z_loss_mlp": 42.5625, "step": 123, "time_per_iteration": 2.7508468627929688 }, { "auxiliary_loss_clip": 0.11949614, "auxiliary_loss_mlp": 0.06614668, "balance_loss_clip": 0.08438589, "balance_loss_mlp": 0.02403242, "epoch": 0.007455283330828198, "flos": 31767976391040.0, "grad_norm": 214.49829909206866, "language_loss": 2.75341105, "learning_rate": 3.103548811118979e-06, "loss": 2.93905401, "num_input_tokens_seen": 2600390, "router_z_loss_clip": 35.15625, "router_z_loss_mlp": 42.15625, "step": 124, "time_per_iteration": 2.7855255603790283 }, { "auxiliary_loss_clip": 0.12003073, "auxiliary_loss_mlp": 0.06465346, "balance_loss_clip": 0.08465698, "balance_loss_mlp": 0.02424818, "epoch": 0.007515406583496167, "flos": 26622458138880.0, "grad_norm": 578.6506612846227, "language_loss": 2.7959106, "learning_rate": 3.108720342404542e-06, "loss": 2.98059464, "num_input_tokens_seen": 2620770, "router_z_loss_clip": 35.40625, "router_z_loss_mlp": 40.375, "step": 125, "time_per_iteration": 2.7296698093414307 }, { "auxiliary_loss_clip": 0.11966403, "auxiliary_loss_mlp": 0.06346069, "balance_loss_clip": 0.08469936, "balance_loss_mlp": 0.02427612, "epoch": 0.007575529836164136, "flos": 18229637629440.0, "grad_norm": 1278.9286526965868, "language_loss": 2.86619425, "learning_rate": 3.1138506658316945e-06, "loss": 3.04931927, "num_input_tokens_seen": 2639900, "router_z_loss_clip": 34.96875, "router_z_loss_mlp": 39.15625, "step": 126, "time_per_iteration": 2.720729112625122 }, { "auxiliary_loss_clip": 0.11984246, "auxiliary_loss_mlp": 0.06358081, "balance_loss_clip": 0.08454584, "balance_loss_mlp": 0.02415211, "epoch": 0.007635653088832106, "flos": 21586916770560.0, "grad_norm": 504.9707040649663, "language_loss": 2.53610039, "learning_rate": 3.1189404329183404e-06, "loss": 2.71952391, "num_input_tokens_seen": 2657450, "router_z_loss_clip": 35.34375, "router_z_loss_mlp": 39.40625, "step": 127, "time_per_iteration": 2.7664260864257812 }, { "auxiliary_loss_clip": 0.11950956, "auxiliary_loss_mlp": 0.06346647, "balance_loss_clip": 0.0844048, "balance_loss_mlp": 0.02437345, "epoch": 0.007695776341500075, "flos": 25382216730240.0, "grad_norm": 275.611088999027, "language_loss": 2.74591923, "learning_rate": 3.1239902798522317e-06, "loss": 2.928895, "num_input_tokens_seen": 2678150, "router_z_loss_clip": 35.15625, "router_z_loss_mlp": 39.03125, "step": 128, "time_per_iteration": 2.799276828765869 }, { "auxiliary_loss_clip": 0.11924884, "auxiliary_loss_mlp": 0.06279474, "balance_loss_clip": 0.08440928, "balance_loss_mlp": 0.02431207, "epoch": 0.007755899594168045, "flos": 22350088558080.0, "grad_norm": 502.8536762011656, "language_loss": 3.05189991, "learning_rate": 3.129000827968184e-06, "loss": 3.23394346, "num_input_tokens_seen": 2698290, "router_z_loss_clip": 34.90625, "router_z_loss_mlp": 38.4375, "step": 129, "time_per_iteration": 2.738448143005371 }, { "auxiliary_loss_clip": 0.11899924, "auxiliary_loss_mlp": 0.06431475, "balance_loss_clip": 0.0843666, "balance_loss_mlp": 0.02509965, "epoch": 0.007816022846836013, "flos": 22644869621760.0, "grad_norm": 184.6266411186459, "language_loss": 3.14814043, "learning_rate": 3.133972684206866e-06, "loss": 3.33145428, "num_input_tokens_seen": 2717630, "router_z_loss_clip": 34.6875, "router_z_loss_mlp": 39.15625, "step": 130, "time_per_iteration": 2.7609336376190186 }, { "auxiliary_loss_clip": 0.11843424, "auxiliary_loss_mlp": 0.06229096, "balance_loss_clip": 0.08413848, "balance_loss_mlp": 0.02475434, "epoch": 0.007876146099503984, "flos": 18188115131520.0, "grad_norm": 2522.109177646642, "language_loss": 2.5444231, "learning_rate": 3.138906441556014e-06, "loss": 2.7251482, "num_input_tokens_seen": 2735835, "router_z_loss_clip": 34.34375, "router_z_loss_mlp": 37.5, "step": 131, "time_per_iteration": 2.6973793506622314 }, { "auxiliary_loss_clip": 0.11846272, "auxiliary_loss_mlp": 0.06038957, "balance_loss_clip": 0.08426226, "balance_loss_mlp": 0.0245009, "epoch": 0.007936269352171952, "flos": 27125788815360.0, "grad_norm": 321.5121108739286, "language_loss": 2.66214728, "learning_rate": 3.143802679474861e-06, "loss": 2.8409996, "num_input_tokens_seen": 2756335, "router_z_loss_clip": 34.25, "router_z_loss_mlp": 35.90625, "step": 132, "time_per_iteration": 2.7591896057128906 }, { "auxiliary_loss_clip": 0.11874332, "auxiliary_loss_mlp": 0.06040521, "balance_loss_clip": 0.08419663, "balance_loss_mlp": 0.0247912, "epoch": 0.007996392604839923, "flos": 19032403271040.0, "grad_norm": 343.3479548237288, "language_loss": 2.74569082, "learning_rate": 3.1486619643025565e-06, "loss": 2.9248395, "num_input_tokens_seen": 2775090, "router_z_loss_clip": 34.53125, "router_z_loss_mlp": 35.625, "step": 133, "time_per_iteration": 2.7065813541412354 }, { "auxiliary_loss_clip": 0.11848032, "auxiliary_loss_mlp": 0.06221145, "balance_loss_clip": 0.08394249, "balance_loss_mlp": 0.02494948, "epoch": 0.008056515857507891, "flos": 25491271219200.0, "grad_norm": 248.76376183223093, "language_loss": 2.04897356, "learning_rate": 3.153484849651286e-06, "loss": 2.22966528, "num_input_tokens_seen": 2795320, "router_z_loss_clip": 34.625, "router_z_loss_mlp": 37.25, "step": 134, "time_per_iteration": 2.7677738666534424 }, { "auxiliary_loss_clip": 0.11821242, "auxiliary_loss_mlp": 0.06254968, "balance_loss_clip": 0.08370227, "balance_loss_mlp": 0.02476892, "epoch": 0.00811663911017586, "flos": 20563694236800.0, "grad_norm": 216.2447239830176, "language_loss": 2.66527557, "learning_rate": 3.1582718767847806e-06, "loss": 2.84603786, "num_input_tokens_seen": 2812815, "router_z_loss_clip": 34.5, "router_z_loss_mlp": 37.71875, "step": 135, "time_per_iteration": 4.186769485473633 }, { "auxiliary_loss_clip": 0.11774851, "auxiliary_loss_mlp": 0.06416594, "balance_loss_clip": 0.08353916, "balance_loss_mlp": 0.02498137, "epoch": 0.00817676236284383, "flos": 18804483365760.0, "grad_norm": 217.34700259379386, "language_loss": 2.64891624, "learning_rate": 3.1630235749828485e-06, "loss": 2.83083081, "num_input_tokens_seen": 2830445, "router_z_loss_clip": 34.25, "router_z_loss_mlp": 39.15625, "step": 136, "time_per_iteration": 4.378416061401367 }, { "auxiliary_loss_clip": 0.11800981, "auxiliary_loss_mlp": 0.06336799, "balance_loss_clip": 0.08355659, "balance_loss_mlp": 0.02479377, "epoch": 0.008236885615511799, "flos": 23879576661120.0, "grad_norm": 649.1324934489451, "language_loss": 2.66790771, "learning_rate": 3.1677404618925676e-06, "loss": 2.8492856, "num_input_tokens_seen": 2846965, "router_z_loss_clip": 34.5, "router_z_loss_mlp": 38.53125, "step": 137, "time_per_iteration": 4.193670749664307 }, { "auxiliary_loss_clip": 0.11810952, "auxiliary_loss_mlp": 0.06460385, "balance_loss_clip": 0.08349136, "balance_loss_mlp": 0.02502255, "epoch": 0.00829700886817977, "flos": 24650379169920.0, "grad_norm": 456.2252900024314, "language_loss": 2.73583841, "learning_rate": 3.1724230438666953e-06, "loss": 2.91855168, "num_input_tokens_seen": 2867520, "router_z_loss_clip": 34.6875, "router_z_loss_mlp": 39.53125, "step": 138, "time_per_iteration": 2.7236080169677734 }, { "auxiliary_loss_clip": 0.11799063, "auxiliary_loss_mlp": 0.0641405, "balance_loss_clip": 0.08353324, "balance_loss_mlp": 0.02541369, "epoch": 0.008357132120847738, "flos": 25268550266880.0, "grad_norm": 387.7132010132663, "language_loss": 2.7448945, "learning_rate": 3.177071816289865e-06, "loss": 2.92702579, "num_input_tokens_seen": 2885675, "router_z_loss_clip": 34.53125, "router_z_loss_mlp": 38.65625, "step": 139, "time_per_iteration": 2.720029830932617 }, { "auxiliary_loss_clip": 0.11779921, "auxiliary_loss_mlp": 0.06837647, "balance_loss_clip": 0.08328707, "balance_loss_mlp": 0.02629273, "epoch": 0.008417255373515706, "flos": 27352325128320.0, "grad_norm": 503.11458222760035, "language_loss": 2.24712992, "learning_rate": 3.181687263893095e-06, "loss": 2.4333055, "num_input_tokens_seen": 2905960, "router_z_loss_clip": 34.53125, "router_z_loss_mlp": 42.09375, "step": 140, "time_per_iteration": 2.7691755294799805 }, { "auxiliary_loss_clip": 0.11790594, "auxiliary_loss_mlp": 0.0678352, "balance_loss_clip": 0.08337438, "balance_loss_mlp": 0.02645336, "epoch": 0.008477378626183677, "flos": 17644771330560.0, "grad_norm": 456.74114475846767, "language_loss": 2.30869794, "learning_rate": 3.186269861057098e-06, "loss": 2.49443913, "num_input_tokens_seen": 2922780, "router_z_loss_clip": 34.59375, "router_z_loss_mlp": 41.375, "step": 141, "time_per_iteration": 2.709735631942749 }, { "auxiliary_loss_clip": 0.11770265, "auxiliary_loss_mlp": 0.06960782, "balance_loss_clip": 0.0832642, "balance_loss_mlp": 0.02743252, "epoch": 0.008537501878851645, "flos": 13886465748480.0, "grad_norm": 466.0963561972635, "language_loss": 2.35469556, "learning_rate": 3.1908200721048745e-06, "loss": 2.54200625, "num_input_tokens_seen": 2938765, "router_z_loss_clip": 34.46875, "router_z_loss_mlp": 42.1875, "step": 142, "time_per_iteration": 2.7184841632843018 }, { "auxiliary_loss_clip": 0.12053505, "auxiliary_loss_mlp": 0.05489081, "balance_loss_clip": 0.08669233, "balance_loss_mlp": 0.02579229, "epoch": 0.008597625131519616, "flos": 71270783976960.0, "grad_norm": 4.1666978714498875, "language_loss": 0.6700815, "learning_rate": 3.195338351584042e-06, "loss": 0.84550738, "num_input_tokens_seen": 3006665, "router_z_loss_clip": 33.78125, "router_z_loss_mlp": 29.078125, "step": 143, "time_per_iteration": 3.589076280593872 }, { "auxiliary_loss_clip": 0.11742389, "auxiliary_loss_mlp": 0.07177937, "balance_loss_clip": 0.08313634, "balance_loss_mlp": 0.02850544, "epoch": 0.008657748384187584, "flos": 17608573566720.0, "grad_norm": 383.5684420198803, "language_loss": 2.41582465, "learning_rate": 3.1998251445393258e-06, "loss": 2.60502768, "num_input_tokens_seen": 3024335, "router_z_loss_clip": 34.3125, "router_z_loss_mlp": 43.28125, "step": 144, "time_per_iteration": 2.984539031982422 }, { "auxiliary_loss_clip": 0.11720376, "auxiliary_loss_mlp": 0.06702486, "balance_loss_clip": 0.08300847, "balance_loss_mlp": 0.02796235, "epoch": 0.008717871636855555, "flos": 19720789689600.0, "grad_norm": 414.0153397219062, "language_loss": 2.39112997, "learning_rate": 3.204280886775619e-06, "loss": 2.57535839, "num_input_tokens_seen": 3043300, "router_z_loss_clip": 34.15625, "router_z_loss_mlp": 39.0, "step": 145, "time_per_iteration": 2.6944291591644287 }, { "auxiliary_loss_clip": 0.11645032, "auxiliary_loss_mlp": 0.06257392, "balance_loss_clip": 0.08271384, "balance_loss_mlp": 0.02671578, "epoch": 0.008777994889523523, "flos": 24724325998080.0, "grad_norm": 234.10563445964561, "language_loss": 2.39550424, "learning_rate": 3.208706005112005e-06, "loss": 2.57452846, "num_input_tokens_seen": 3064610, "router_z_loss_clip": 33.75, "router_z_loss_mlp": 35.890625, "step": 146, "time_per_iteration": 2.7288639545440674 }, { "auxiliary_loss_clip": 0.11878775, "auxiliary_loss_mlp": 0.04782712, "balance_loss_clip": 0.08568791, "balance_loss_mlp": 0.02612913, "epoch": 0.008838118142191492, "flos": 70150974013440.0, "grad_norm": 3.494399791658663, "language_loss": 0.60447025, "learning_rate": 3.213100917627104e-06, "loss": 0.77108514, "num_input_tokens_seen": 3130385, "router_z_loss_clip": 33.0, "router_z_loss_mlp": 21.71875, "step": 147, "time_per_iteration": 3.425687551498413 }, { "auxiliary_loss_clip": 0.11622411, "auxiliary_loss_mlp": 0.06434272, "balance_loss_clip": 0.08269688, "balance_loss_mlp": 0.02845405, "epoch": 0.008898241394859462, "flos": 20050510705920.0, "grad_norm": 572.5249285755303, "language_loss": 2.45229673, "learning_rate": 3.2174660338961135e-06, "loss": 2.63286352, "num_input_tokens_seen": 3149760, "router_z_loss_clip": 33.625, "router_z_loss_mlp": 35.875, "step": 148, "time_per_iteration": 2.7068917751312256 }, { "auxiliary_loss_clip": 0.11488681, "auxiliary_loss_mlp": 0.06176116, "balance_loss_clip": 0.08215897, "balance_loss_mlp": 0.02839018, "epoch": 0.008958364647527431, "flos": 10748217980160.0, "grad_norm": 251.47707618481257, "language_loss": 2.42674017, "learning_rate": 3.2218017552198588e-06, "loss": 2.60338831, "num_input_tokens_seen": 3164500, "router_z_loss_clip": 32.75, "router_z_loss_mlp": 33.390625, "step": 149, "time_per_iteration": 2.683795213699341 }, { "auxiliary_loss_clip": 0.11533463, "auxiliary_loss_mlp": 0.06066919, "balance_loss_clip": 0.0824708, "balance_loss_mlp": 0.02827479, "epoch": 0.009018487900195401, "flos": 29134317110400.0, "grad_norm": 362.43414187287823, "language_loss": 2.9004488, "learning_rate": 3.226108474846181e-06, "loss": 3.07645273, "num_input_tokens_seen": 3182455, "router_z_loss_clip": 32.875, "router_z_loss_mlp": 32.453125, "step": 150, "time_per_iteration": 2.7913906574249268 }, { "auxiliary_loss_clip": 0.11469954, "auxiliary_loss_mlp": 0.06015911, "balance_loss_clip": 0.08219881, "balance_loss_mlp": 0.02906169, "epoch": 0.00907861115286337, "flos": 32972020035840.0, "grad_norm": 804.5874259433871, "language_loss": 2.03773189, "learning_rate": 3.2303865781839817e-06, "loss": 2.21259046, "num_input_tokens_seen": 3203995, "router_z_loss_clip": 32.5, "router_z_loss_mlp": 31.09375, "step": 151, "time_per_iteration": 2.8074684143066406 }, { "auxiliary_loss_clip": 0.1142254, "auxiliary_loss_mlp": 0.05607922, "balance_loss_clip": 0.08202653, "balance_loss_mlp": 0.02806408, "epoch": 0.009138734405531338, "flos": 21768911838720.0, "grad_norm": 3171.14607551049, "language_loss": 2.30351353, "learning_rate": 3.234636443010188e-06, "loss": 2.4738183, "num_input_tokens_seen": 3222575, "router_z_loss_clip": 32.1875, "router_z_loss_mlp": 28.046875, "step": 152, "time_per_iteration": 2.7401516437530518 }, { "auxiliary_loss_clip": 0.11446309, "auxiliary_loss_mlp": 0.05798356, "balance_loss_clip": 0.08216575, "balance_loss_mlp": 0.02973954, "epoch": 0.009198857658199309, "flos": 20847532343040.0, "grad_norm": 176.4980009136778, "language_loss": 2.34463573, "learning_rate": 3.238858439669943e-06, "loss": 2.51708221, "num_input_tokens_seen": 3240180, "router_z_loss_clip": 32.34375, "router_z_loss_mlp": 28.25, "step": 153, "time_per_iteration": 2.734022855758667 }, { "auxiliary_loss_clip": 0.11465098, "auxiliary_loss_mlp": 0.05509797, "balance_loss_clip": 0.08234373, "balance_loss_mlp": 0.02873079, "epoch": 0.009258980910867277, "flos": 24834386736000.0, "grad_norm": 287.55570030638347, "language_loss": 2.41556501, "learning_rate": 3.2430529312702712e-06, "loss": 2.58531404, "num_input_tokens_seen": 3259800, "router_z_loss_clip": 32.28125, "router_z_loss_mlp": 26.359375, "step": 154, "time_per_iteration": 2.7531468868255615 }, { "auxiliary_loss_clip": 0.11399396, "auxiliary_loss_mlp": 0.05085283, "balance_loss_clip": 0.08223251, "balance_loss_mlp": 0.02741533, "epoch": 0.009319104163535248, "flos": 28775442072960.0, "grad_norm": 822.5927976316603, "language_loss": 2.40112686, "learning_rate": 3.2472202738674737e-06, "loss": 2.56597328, "num_input_tokens_seen": 3280400, "router_z_loss_clip": 31.75, "router_z_loss_mlp": 23.421875, "step": 155, "time_per_iteration": 2.791504144668579 }, { "auxiliary_loss_clip": 0.11425503, "auxiliary_loss_mlp": 0.04995255, "balance_loss_clip": 0.08227465, "balance_loss_mlp": 0.02714066, "epoch": 0.009379227416203216, "flos": 16587698947200.0, "grad_norm": 444.28995295045866, "language_loss": 2.19901848, "learning_rate": 3.2513608166485063e-06, "loss": 2.36322594, "num_input_tokens_seen": 3297600, "router_z_loss_clip": 31.953125, "router_z_loss_mlp": 22.828125, "step": 156, "time_per_iteration": 2.7164008617401123 }, { "auxiliary_loss_clip": 0.11378187, "auxiliary_loss_mlp": 0.04768521, "balance_loss_clip": 0.08209029, "balance_loss_mlp": 0.02571255, "epoch": 0.009439350668871187, "flos": 18335337955200.0, "grad_norm": 4517.383932800753, "language_loss": 2.43901157, "learning_rate": 3.2554749021065498e-06, "loss": 2.60047865, "num_input_tokens_seen": 3313635, "router_z_loss_clip": 31.703125, "router_z_loss_mlp": 21.96875, "step": 157, "time_per_iteration": 2.693735361099243 }, { "auxiliary_loss_clip": 0.11385772, "auxiliary_loss_mlp": 0.04803238, "balance_loss_clip": 0.08225825, "balance_loss_mlp": 0.02659378, "epoch": 0.009499473921539155, "flos": 24356310865920.0, "grad_norm": 203.4769826274587, "language_loss": 2.32455492, "learning_rate": 3.2595628662110186e-06, "loss": 2.48644495, "num_input_tokens_seen": 3333735, "router_z_loss_clip": 31.625, "router_z_loss_mlp": 21.4375, "step": 158, "time_per_iteration": 2.7688815593719482 }, { "auxiliary_loss_clip": 0.11339363, "auxiliary_loss_mlp": 0.04651701, "balance_loss_clip": 0.08190882, "balance_loss_mlp": 0.02483428, "epoch": 0.009559597174207124, "flos": 16404949192320.0, "grad_norm": 552.9193650344022, "language_loss": 2.26490593, "learning_rate": 3.2636250385721982e-06, "loss": 2.42481661, "num_input_tokens_seen": 3348800, "router_z_loss_clip": 31.484375, "router_z_loss_mlp": 21.671875, "step": 159, "time_per_iteration": 2.6573240756988525 }, { "auxiliary_loss_clip": 0.11342955, "auxiliary_loss_mlp": 0.04420956, "balance_loss_clip": 0.08195815, "balance_loss_mlp": 0.02465542, "epoch": 0.009619720426875094, "flos": 22863523651200.0, "grad_norm": 216.0069765915514, "language_loss": 2.24749613, "learning_rate": 3.2676617426007263e-06, "loss": 2.40513563, "num_input_tokens_seen": 3368595, "router_z_loss_clip": 31.484375, "router_z_loss_mlp": 19.5703125, "step": 160, "time_per_iteration": 2.720640182495117 }, { "auxiliary_loss_clip": 0.1130629, "auxiliary_loss_mlp": 0.04082536, "balance_loss_clip": 0.08190325, "balance_loss_mlp": 0.02272843, "epoch": 0.009679843679543063, "flos": 19140954635520.0, "grad_norm": 638.0759345444727, "language_loss": 2.55352163, "learning_rate": 3.2716732956621042e-06, "loss": 2.70740986, "num_input_tokens_seen": 3384975, "router_z_loss_clip": 31.171875, "router_z_loss_mlp": 18.1015625, "step": 161, "time_per_iteration": 2.699381113052368 }, { "auxiliary_loss_clip": 0.11322825, "auxiliary_loss_mlp": 0.03989629, "balance_loss_clip": 0.08191037, "balance_loss_mlp": 0.02234105, "epoch": 0.009739966932211033, "flos": 20309219786880.0, "grad_norm": 328.67325310548205, "language_loss": 2.1751914, "learning_rate": 3.2756600092264203e-06, "loss": 2.32831621, "num_input_tokens_seen": 3404755, "router_z_loss_clip": 31.328125, "router_z_loss_mlp": 17.5625, "step": 162, "time_per_iteration": 2.6838431358337402 }, { "auxiliary_loss_clip": 0.11388724, "auxiliary_loss_mlp": 0.03665022, "balance_loss_clip": 0.08399337, "balance_loss_mlp": 0.02084975, "epoch": 0.009800090184879002, "flos": 67053200567040.0, "grad_norm": 3.3825494545949377, "language_loss": 0.72855979, "learning_rate": 3.279622189013474e-06, "loss": 0.87909722, "num_input_tokens_seen": 3467210, "router_z_loss_clip": 29.890625, "router_z_loss_mlp": 15.78125, "step": 163, "time_per_iteration": 3.3238282203674316 }, { "auxiliary_loss_clip": 0.11330494, "auxiliary_loss_mlp": 0.03866369, "balance_loss_clip": 0.08205608, "balance_loss_mlp": 0.02317602, "epoch": 0.00986021343754697, "flos": 17170301185920.0, "grad_norm": 587.6809186689462, "language_loss": 1.96475303, "learning_rate": 3.283560135133457e-06, "loss": 2.11672163, "num_input_tokens_seen": 3483220, "router_z_loss_clip": 31.21875, "router_z_loss_mlp": 15.5, "step": 164, "time_per_iteration": 2.6805226802825928 }, { "auxiliary_loss_clip": 0.11389627, "auxiliary_loss_mlp": 0.03919633, "balance_loss_clip": 0.08231495, "balance_loss_mlp": 0.02409012, "epoch": 0.00992033669021494, "flos": 17755293265920.0, "grad_norm": 363.4608807737949, "language_loss": 2.16569424, "learning_rate": 3.2874741422233565e-06, "loss": 2.31878662, "num_input_tokens_seen": 3501465, "router_z_loss_clip": 31.578125, "router_z_loss_mlp": 15.1015625, "step": 165, "time_per_iteration": 2.7040939331054688 }, { "auxiliary_loss_clip": 0.1141687, "auxiliary_loss_mlp": 0.03988666, "balance_loss_clip": 0.08239292, "balance_loss_mlp": 0.02460498, "epoch": 0.00998045994288291, "flos": 25303490219520.0, "grad_norm": 120.34112705789404, "language_loss": 1.95176649, "learning_rate": 3.2913644995792465e-06, "loss": 2.10582185, "num_input_tokens_seen": 3520480, "router_z_loss_clip": 31.796875, "router_z_loss_mlp": 15.2890625, "step": 166, "time_per_iteration": 2.7658779621124268 }, { "auxiliary_loss_clip": 0.11406423, "auxiliary_loss_mlp": 0.03656866, "balance_loss_clip": 0.08239384, "balance_loss_mlp": 0.02377416, "epoch": 0.01004058319555088, "flos": 32305869676800.0, "grad_norm": 206.85697775375314, "language_loss": 2.40086269, "learning_rate": 3.2952314912845914e-06, "loss": 2.55149579, "num_input_tokens_seen": 3539570, "router_z_loss_clip": 31.703125, "router_z_loss_mlp": 12.796875, "step": 167, "time_per_iteration": 2.913330078125 }, { "auxiliary_loss_clip": 0.11390907, "auxiliary_loss_mlp": 0.03732774, "balance_loss_clip": 0.08257819, "balance_loss_mlp": 0.02375505, "epoch": 0.010100706448218848, "flos": 11323399132800.0, "grad_norm": 155.8956268885179, "language_loss": 2.07265854, "learning_rate": 3.299075396334735e-06, "loss": 2.22389531, "num_input_tokens_seen": 3555465, "router_z_loss_clip": 31.3125, "router_z_loss_mlp": 13.5703125, "step": 168, "time_per_iteration": 2.6671464443206787 }, { "auxiliary_loss_clip": 0.11345768, "auxiliary_loss_mlp": 0.03570532, "balance_loss_clip": 0.08220493, "balance_loss_mlp": 0.02337622, "epoch": 0.010160829700886819, "flos": 29727820379520.0, "grad_norm": 2678.255597875989, "language_loss": 2.10412264, "learning_rate": 3.3028964887576868e-06, "loss": 2.25328565, "num_input_tokens_seen": 3578970, "router_z_loss_clip": 31.25, "router_z_loss_mlp": 12.34375, "step": 169, "time_per_iteration": 2.8901703357696533 }, { "auxiliary_loss_clip": 0.11379687, "auxiliary_loss_mlp": 0.035707, "balance_loss_clip": 0.08244219, "balance_loss_mlp": 0.02360678, "epoch": 0.010220952953554787, "flos": 20418567765120.0, "grad_norm": 158.52477446841945, "language_loss": 2.00271034, "learning_rate": 3.306695037731344e-06, "loss": 2.15221405, "num_input_tokens_seen": 3597275, "router_z_loss_clip": 31.359375, "router_z_loss_mlp": 12.1015625, "step": 170, "time_per_iteration": 2.7144839763641357 }, { "auxiliary_loss_clip": 0.11327772, "auxiliary_loss_mlp": 0.03291816, "balance_loss_clip": 0.08229955, "balance_loss_mlp": 0.02198523, "epoch": 0.010281076206222756, "flos": 31293170830080.0, "grad_norm": 266.7261121103048, "language_loss": 2.15153098, "learning_rate": 3.3104713076972827e-06, "loss": 2.29772687, "num_input_tokens_seen": 3618905, "router_z_loss_clip": 30.96875, "router_z_loss_mlp": 10.9296875, "step": 171, "time_per_iteration": 2.8870177268981934 }, { "auxiliary_loss_clip": 0.11403278, "auxiliary_loss_mlp": 0.03199054, "balance_loss_clip": 0.08256124, "balance_loss_mlp": 0.02210284, "epoch": 0.010341199458890726, "flos": 21988949460480.0, "grad_norm": 383.5208527317226, "language_loss": 2.11188293, "learning_rate": 3.314225558471224e-06, "loss": 2.2579062, "num_input_tokens_seen": 3639610, "router_z_loss_clip": 31.484375, "router_z_loss_mlp": 9.890625, "step": 172, "time_per_iteration": 2.7657642364501953 }, { "auxiliary_loss_clip": 0.11385289, "auxiliary_loss_mlp": 0.03303289, "balance_loss_clip": 0.08243956, "balance_loss_mlp": 0.02290106, "epoch": 0.010401322711558695, "flos": 30818449123200.0, "grad_norm": 177.81461738990703, "language_loss": 1.99950588, "learning_rate": 3.317958045350308e-06, "loss": 2.14639163, "num_input_tokens_seen": 3664030, "router_z_loss_clip": 31.453125, "router_z_loss_mlp": 10.125, "step": 173, "time_per_iteration": 2.813304901123047 }, { "auxiliary_loss_clip": 0.11389437, "auxiliary_loss_mlp": 0.03037732, "balance_loss_clip": 0.08244417, "balance_loss_mlp": 0.0210008, "epoch": 0.010461445964226665, "flos": 24721642667520.0, "grad_norm": 175.82375566529302, "language_loss": 1.99643493, "learning_rate": 3.3216690192172596e-06, "loss": 2.14070654, "num_input_tokens_seen": 3683615, "router_z_loss_clip": 31.40625, "router_z_loss_mlp": 9.3671875, "step": 174, "time_per_iteration": 2.7651207447052 }, { "auxiliary_loss_clip": 0.11394367, "auxiliary_loss_mlp": 0.02905973, "balance_loss_clip": 0.08244226, "balance_loss_mlp": 0.02037748, "epoch": 0.010521569216894634, "flos": 27717950419200.0, "grad_norm": 160.73758658914298, "language_loss": 1.8264966, "learning_rate": 3.325358726641591e-06, "loss": 1.96950006, "num_input_tokens_seen": 3704540, "router_z_loss_clip": 31.5, "router_z_loss_mlp": 8.6875, "step": 175, "time_per_iteration": 4.3024046421051025 }, { "auxiliary_loss_clip": 0.11383677, "auxiliary_loss_mlp": 0.02695759, "balance_loss_clip": 0.08248399, "balance_loss_mlp": 0.01993473, "epoch": 0.010581692469562603, "flos": 12463223022720.0, "grad_norm": 193.13759456149177, "language_loss": 2.029634, "learning_rate": 3.329027409977902e-06, "loss": 2.17042828, "num_input_tokens_seen": 3721320, "router_z_loss_clip": 31.359375, "router_z_loss_mlp": 7.0234375, "step": 176, "time_per_iteration": 5.780745506286621 }, { "auxiliary_loss_clip": 0.11398004, "auxiliary_loss_mlp": 0.0274248, "balance_loss_clip": 0.08248216, "balance_loss_mlp": 0.01980304, "epoch": 0.010641815722230573, "flos": 19433723201280.0, "grad_norm": 143.9065338927622, "language_loss": 1.87658668, "learning_rate": 3.3326753074614087e-06, "loss": 2.01799154, "num_input_tokens_seen": 3739385, "router_z_loss_clip": 31.453125, "router_z_loss_mlp": 7.62109375, "step": 177, "time_per_iteration": 4.19015908241272 }, { "auxiliary_loss_clip": 0.11396284, "auxiliary_loss_mlp": 0.02709674, "balance_loss_clip": 0.08247803, "balance_loss_mlp": 0.0195856, "epoch": 0.010701938974898541, "flos": 18338440556160.0, "grad_norm": 166.58280571902029, "language_loss": 1.83766198, "learning_rate": 3.3363026533007716e-06, "loss": 1.97872162, "num_input_tokens_seen": 3756360, "router_z_loss_clip": 31.4375, "router_z_loss_mlp": 7.5078125, "step": 178, "time_per_iteration": 2.7839162349700928 }, { "auxiliary_loss_clip": 0.11439433, "auxiliary_loss_mlp": 0.02670019, "balance_loss_clip": 0.08259946, "balance_loss_mlp": 0.01970022, "epoch": 0.010762062227566512, "flos": 19209283240320.0, "grad_norm": 141.2858163728389, "language_loss": 1.91303861, "learning_rate": 3.3399096777683303e-06, "loss": 2.05413318, "num_input_tokens_seen": 3773930, "router_z_loss_clip": 31.8125, "router_z_loss_mlp": 7.0, "step": 179, "time_per_iteration": 2.7622735500335693 }, { "auxiliary_loss_clip": 0.11402039, "auxiliary_loss_mlp": 0.02611784, "balance_loss_clip": 0.08244345, "balance_loss_mlp": 0.01940398, "epoch": 0.01082218548023448, "flos": 31432553297280.0, "grad_norm": 182.9763603695812, "language_loss": 2.02355862, "learning_rate": 3.3434966072878213e-06, "loss": 2.16369677, "num_input_tokens_seen": 3793630, "router_z_loss_clip": 31.5625, "router_z_loss_mlp": 6.71875, "step": 180, "time_per_iteration": 2.8384201526641846 }, { "auxiliary_loss_clip": 0.11390718, "auxiliary_loss_mlp": 0.02591032, "balance_loss_clip": 0.08254006, "balance_loss_mlp": 0.01955122, "epoch": 0.01088230873290245, "flos": 25053501962880.0, "grad_norm": 100.5365457081426, "language_loss": 1.7600801, "learning_rate": 3.3470636645196674e-06, "loss": 1.89989758, "num_input_tokens_seen": 3813610, "router_z_loss_clip": 31.40625, "router_z_loss_mlp": 6.359375, "step": 181, "time_per_iteration": 2.792621612548828 }, { "auxiliary_loss_clip": 0.11376683, "auxiliary_loss_mlp": 0.0257603, "balance_loss_clip": 0.08242378, "balance_loss_mlp": 0.0187527, "epoch": 0.01094243198557042, "flos": 22900056831360.0, "grad_norm": 153.52569302252203, "language_loss": 1.97075272, "learning_rate": 3.3506110684439156e-06, "loss": 2.1102798, "num_input_tokens_seen": 3831390, "router_z_loss_clip": 31.34375, "router_z_loss_mlp": 7.01171875, "step": 182, "time_per_iteration": 2.729404926300049 }, { "auxiliary_loss_clip": 0.11361333, "auxiliary_loss_mlp": 0.0257297, "balance_loss_clip": 0.08247926, "balance_loss_mlp": 0.01857714, "epoch": 0.011002555238238388, "flos": 17170720456320.0, "grad_norm": 843.480692143334, "language_loss": 1.8764925, "learning_rate": 3.3541390344409054e-06, "loss": 2.01583576, "num_input_tokens_seen": 3849705, "router_z_loss_clip": 31.109375, "router_z_loss_mlp": 7.15625, "step": 183, "time_per_iteration": 2.8035337924957275 }, { "auxiliary_loss_clip": 0.11325814, "auxiliary_loss_mlp": 0.02494063, "balance_loss_clip": 0.08233176, "balance_loss_mlp": 0.01798262, "epoch": 0.011062678490906358, "flos": 22316783760000.0, "grad_norm": 227.93985359609798, "language_loss": 1.94280648, "learning_rate": 3.357647774369736e-06, "loss": 2.08100533, "num_input_tokens_seen": 3869230, "router_z_loss_clip": 30.921875, "router_z_loss_mlp": 6.95703125, "step": 184, "time_per_iteration": 2.7576985359191895 }, { "auxiliary_loss_clip": 0.11259703, "auxiliary_loss_mlp": 0.02506667, "balance_loss_clip": 0.08210915, "balance_loss_mlp": 0.01775389, "epoch": 0.011122801743574327, "flos": 24395108106240.0, "grad_norm": 2535.758601307882, "language_loss": 1.83429599, "learning_rate": 3.3611374966446085e-06, "loss": 1.97195959, "num_input_tokens_seen": 3889735, "router_z_loss_clip": 30.484375, "router_z_loss_mlp": 7.31640625, "step": 185, "time_per_iteration": 2.7673726081848145 }, { "auxiliary_loss_clip": 0.11261868, "auxiliary_loss_mlp": 0.0255121, "balance_loss_clip": 0.08194633, "balance_loss_mlp": 0.01751268, "epoch": 0.011182924996242297, "flos": 18156110071680.0, "grad_norm": 602.6245746948024, "language_loss": 1.52334189, "learning_rate": 3.3646084063091142e-06, "loss": 1.6614728, "num_input_tokens_seen": 3908855, "router_z_loss_clip": 30.671875, "router_z_loss_mlp": 8.0, "step": 186, "time_per_iteration": 2.783914804458618 }, { "auxiliary_loss_clip": 0.11279814, "auxiliary_loss_mlp": 0.02616766, "balance_loss_clip": 0.08213215, "balance_loss_mlp": 0.01800421, "epoch": 0.011243048248910266, "flos": 15492206666880.0, "grad_norm": 303.8550931195418, "language_loss": 2.0932591, "learning_rate": 3.3680607051085194e-06, "loss": 2.2322247, "num_input_tokens_seen": 3923865, "router_z_loss_clip": 30.65625, "router_z_loss_mlp": 8.16796875, "step": 187, "time_per_iteration": 2.7237133979797363 }, { "auxiliary_loss_clip": 0.11310254, "auxiliary_loss_mlp": 0.02684779, "balance_loss_clip": 0.08219692, "balance_loss_mlp": 0.01877971, "epoch": 0.011303171501578235, "flos": 40926442383360.0, "grad_norm": 303.26054792454744, "language_loss": 1.55736661, "learning_rate": 3.371494591560139e-06, "loss": 1.69731712, "num_input_tokens_seen": 3946870, "router_z_loss_clip": 30.921875, "router_z_loss_mlp": 8.06640625, "step": 188, "time_per_iteration": 2.8625614643096924 }, { "auxiliary_loss_clip": 0.1122445, "auxiliary_loss_mlp": 0.02226817, "balance_loss_clip": 0.08356779, "balance_loss_mlp": 0.01638209, "epoch": 0.011363294754246205, "flos": 66321237225600.0, "grad_norm": 1.0080258300363176, "language_loss": 0.56106132, "learning_rate": 3.3749102610218297e-06, "loss": 0.69557393, "num_input_tokens_seen": 4010005, "router_z_loss_clip": 28.640625, "router_z_loss_mlp": 5.88671875, "step": 189, "time_per_iteration": 3.493482828140259 }, { "auxiliary_loss_clip": 0.11288576, "auxiliary_loss_mlp": 0.02715642, "balance_loss_clip": 0.08207373, "balance_loss_mlp": 0.01910741, "epoch": 0.011423418006914174, "flos": 24907285388160.0, "grad_norm": 497.7887236654595, "language_loss": 1.84588838, "learning_rate": 3.3783079057586833e-06, "loss": 1.98593044, "num_input_tokens_seen": 4029035, "router_z_loss_clip": 30.765625, "router_z_loss_mlp": 8.046875, "step": 190, "time_per_iteration": 2.7697582244873047 }, { "auxiliary_loss_clip": 0.11279334, "auxiliary_loss_mlp": 0.02702031, "balance_loss_clip": 0.0820808, "balance_loss_mlp": 0.01931462, "epoch": 0.011483541259582144, "flos": 19797964899840.0, "grad_norm": 130.50261533404438, "language_loss": 1.81741333, "learning_rate": 3.3816877150079665e-06, "loss": 1.95722723, "num_input_tokens_seen": 4046995, "router_z_loss_clip": 30.71875, "router_z_loss_mlp": 7.703125, "step": 191, "time_per_iteration": 2.71152663230896 }, { "auxiliary_loss_clip": 0.11242746, "auxiliary_loss_mlp": 0.02626935, "balance_loss_clip": 0.08208676, "balance_loss_mlp": 0.01929609, "epoch": 0.011543664512250112, "flos": 26184101904000.0, "grad_norm": 176.04853860218387, "language_loss": 1.89318991, "learning_rate": 3.385049875042367e-06, "loss": 2.03188658, "num_input_tokens_seen": 4065865, "router_z_loss_clip": 30.328125, "router_z_loss_mlp": 6.97265625, "step": 192, "time_per_iteration": 2.7821969985961914 }, { "auxiliary_loss_clip": 0.11254707, "auxiliary_loss_mlp": 0.02712884, "balance_loss_clip": 0.08199477, "balance_loss_mlp": 0.02002969, "epoch": 0.011603787764918083, "flos": 23775763052160.0, "grad_norm": 101.57234761564332, "language_loss": 1.78023255, "learning_rate": 3.3883945692315938e-06, "loss": 1.91990852, "num_input_tokens_seen": 4085305, "router_z_loss_clip": 30.578125, "router_z_loss_mlp": 7.09765625, "step": 193, "time_per_iteration": 2.7814338207244873 }, { "auxiliary_loss_clip": 0.11305298, "auxiliary_loss_mlp": 0.02626672, "balance_loss_clip": 0.08234039, "balance_loss_mlp": 0.02022424, "epoch": 0.011663911017586051, "flos": 25961255170560.0, "grad_norm": 112.51544589526844, "language_loss": 1.87978864, "learning_rate": 3.3917219781023906e-06, "loss": 2.0191083, "num_input_tokens_seen": 4105185, "router_z_loss_clip": 30.703125, "router_z_loss_mlp": 6.0390625, "step": 194, "time_per_iteration": 2.7648532390594482 }, { "auxiliary_loss_clip": 0.11204592, "auxiliary_loss_mlp": 0.02456707, "balance_loss_clip": 0.08184949, "balance_loss_mlp": 0.01897472, "epoch": 0.01172403427025402, "flos": 17901006716160.0, "grad_norm": 183.56421166357967, "language_loss": 1.89044344, "learning_rate": 3.3950322793970014e-06, "loss": 2.02705646, "num_input_tokens_seen": 4123160, "router_z_loss_clip": 30.203125, "router_z_loss_mlp": 5.58984375, "step": 195, "time_per_iteration": 2.7101588249206543 }, { "auxiliary_loss_clip": 0.11201677, "auxiliary_loss_mlp": 0.02362252, "balance_loss_clip": 0.08187875, "balance_loss_mlp": 0.01872635, "epoch": 0.01178415752292199, "flos": 17900293956480.0, "grad_norm": 131.84444956878764, "language_loss": 1.72705925, "learning_rate": 3.3983256481301445e-06, "loss": 1.86269855, "num_input_tokens_seen": 4140425, "router_z_loss_clip": 30.140625, "router_z_loss_mlp": 4.89648438, "step": 196, "time_per_iteration": 2.6882259845733643 }, { "auxiliary_loss_clip": 0.1119855, "auxiliary_loss_mlp": 0.02216405, "balance_loss_clip": 0.08183622, "balance_loss_mlp": 0.01803464, "epoch": 0.011844280775589959, "flos": 22900224539520.0, "grad_norm": 178.18231074838872, "language_loss": 1.7307272, "learning_rate": 3.4016022566445335e-06, "loss": 1.86487663, "num_input_tokens_seen": 4159555, "router_z_loss_clip": 30.1875, "router_z_loss_mlp": 4.13476562, "step": 197, "time_per_iteration": 2.7389159202575684 }, { "auxiliary_loss_clip": 0.1123178, "auxiliary_loss_mlp": 0.02167441, "balance_loss_clip": 0.08210452, "balance_loss_mlp": 0.01781012, "epoch": 0.01190440402825793, "flos": 26987748013440.0, "grad_norm": 153.82041745672925, "language_loss": 1.70140243, "learning_rate": 3.4048622746649966e-06, "loss": 1.83539462, "num_input_tokens_seen": 4180480, "router_z_loss_clip": 30.1875, "router_z_loss_mlp": 3.86523438, "step": 198, "time_per_iteration": 2.812265396118164 }, { "auxiliary_loss_clip": 0.11186399, "auxiliary_loss_mlp": 0.02114024, "balance_loss_clip": 0.08207051, "balance_loss_mlp": 0.01758304, "epoch": 0.011964527280925898, "flos": 20527789962240.0, "grad_norm": 103.4056302538264, "language_loss": 1.63981485, "learning_rate": 3.4081058693512278e-06, "loss": 1.77281904, "num_input_tokens_seen": 4198835, "router_z_loss_clip": 29.796875, "router_z_loss_mlp": 3.5546875, "step": 199, "time_per_iteration": 2.786724805831909 }, { "auxiliary_loss_clip": 0.11248228, "auxiliary_loss_mlp": 0.02118004, "balance_loss_clip": 0.0822373, "balance_loss_mlp": 0.01784027, "epoch": 0.012024650533593867, "flos": 27753435423360.0, "grad_norm": 92.66317173444641, "language_loss": 1.5373764, "learning_rate": 3.411333205349222e-06, "loss": 1.67103887, "num_input_tokens_seen": 4219335, "router_z_loss_clip": 30.21875, "router_z_loss_mlp": 3.34179688, "step": 200, "time_per_iteration": 2.7747623920440674 }, { "auxiliary_loss_clip": 0.11263117, "auxiliary_loss_mlp": 0.02092378, "balance_loss_clip": 0.08234014, "balance_loss_mlp": 0.01792352, "epoch": 0.012084773786261837, "flos": 10456623371520.0, "grad_norm": 99.64982265111412, "language_loss": 1.58174205, "learning_rate": 3.4145444448414217e-06, "loss": 1.7152971, "num_input_tokens_seen": 4236940, "router_z_loss_clip": 30.296875, "router_z_loss_mlp": 3.0, "step": 201, "time_per_iteration": 2.7248892784118652 }, { "auxiliary_loss_clip": 0.11232355, "auxiliary_loss_mlp": 0.02053753, "balance_loss_clip": 0.08229408, "balance_loss_mlp": 0.01763454, "epoch": 0.012144897038929806, "flos": 23111331701760.0, "grad_norm": 106.45794913566223, "language_loss": 1.62622523, "learning_rate": 3.4177397475956223e-06, "loss": 1.75908637, "num_input_tokens_seen": 4256755, "router_z_loss_clip": 30.0, "router_z_loss_mlp": 2.90039062, "step": 202, "time_per_iteration": 2.7696847915649414 }, { "auxiliary_loss_clip": 0.11270876, "auxiliary_loss_mlp": 0.02004651, "balance_loss_clip": 0.08243515, "balance_loss_mlp": 0.01724461, "epoch": 0.012205020291597776, "flos": 21039631827840.0, "grad_norm": 154.7819719678595, "language_loss": 1.64158762, "learning_rate": 3.4209192710126685e-06, "loss": 1.77434278, "num_input_tokens_seen": 4276505, "router_z_loss_clip": 30.28125, "router_z_loss_mlp": 2.80078125, "step": 203, "time_per_iteration": 2.7837016582489014 }, { "auxiliary_loss_clip": 0.10886835, "auxiliary_loss_mlp": 0.01818337, "balance_loss_clip": 0.0835187, "balance_loss_mlp": 0.01651921, "epoch": 0.012265143544265745, "flos": 68465416481280.0, "grad_norm": 1.0832212930923646, "language_loss": 0.611498, "learning_rate": 3.4240831701729837e-06, "loss": 0.73854971, "num_input_tokens_seen": 4330965, "router_z_loss_clip": 25.28125, "router_z_loss_mlp": 1.66503906, "step": 204, "time_per_iteration": 3.3247714042663574 }, { "auxiliary_loss_clip": 0.11189112, "auxiliary_loss_mlp": 0.01979091, "balance_loss_clip": 0.08211984, "balance_loss_mlp": 0.0166476, "epoch": 0.012325266796933715, "flos": 17024923152000.0, "grad_norm": 82.33285607890609, "language_loss": 1.68441916, "learning_rate": 3.4272315978819516e-06, "loss": 1.81610119, "num_input_tokens_seen": 4348200, "router_z_loss_clip": 29.765625, "router_z_loss_mlp": 3.14257812, "step": 205, "time_per_iteration": 2.7312228679656982 }, { "auxiliary_loss_clip": 0.11079738, "auxiliary_loss_mlp": 0.01959361, "balance_loss_clip": 0.08184563, "balance_loss_mlp": 0.01662577, "epoch": 0.012385390049601683, "flos": 20195679104640.0, "grad_norm": 127.5810371180984, "language_loss": 1.63888741, "learning_rate": 3.4303647047142043e-06, "loss": 1.76927829, "num_input_tokens_seen": 4365460, "router_z_loss_clip": 28.953125, "router_z_loss_mlp": 2.96875, "step": 206, "time_per_iteration": 2.8013710975646973 }, { "auxiliary_loss_clip": 0.11079603, "auxiliary_loss_mlp": 0.01954523, "balance_loss_clip": 0.08188254, "balance_loss_mlp": 0.01659265, "epoch": 0.012445513302269652, "flos": 16258690690560.0, "grad_norm": 127.33562494366276, "language_loss": 1.67344439, "learning_rate": 3.43348263905683e-06, "loss": 1.80378556, "num_input_tokens_seen": 4383650, "router_z_loss_clip": 28.953125, "router_z_loss_mlp": 2.953125, "step": 207, "time_per_iteration": 2.739304780960083 }, { "auxiliary_loss_clip": 0.11027829, "auxiliary_loss_mlp": 0.01951729, "balance_loss_clip": 0.08169329, "balance_loss_mlp": 0.01670204, "epoch": 0.012505636554937622, "flos": 23776224249600.0, "grad_norm": 120.34749004491694, "language_loss": 1.41682816, "learning_rate": 3.436585547151547e-06, "loss": 1.54662383, "num_input_tokens_seen": 4403765, "router_z_loss_clip": 28.59375, "router_z_loss_mlp": 2.81640625, "step": 208, "time_per_iteration": 2.7495081424713135 }, { "auxiliary_loss_clip": 0.11017898, "auxiliary_loss_mlp": 0.01958833, "balance_loss_clip": 0.08196604, "balance_loss_mlp": 0.01693712, "epoch": 0.012565759807605591, "flos": 30599417750400.0, "grad_norm": 224.61114228839082, "language_loss": 1.77280974, "learning_rate": 3.4396735731358586e-06, "loss": 1.90257692, "num_input_tokens_seen": 4421935, "router_z_loss_clip": 28.203125, "router_z_loss_mlp": 2.65039062, "step": 209, "time_per_iteration": 2.777177095413208 }, { "auxiliary_loss_clip": 0.10975216, "auxiliary_loss_mlp": 0.01995764, "balance_loss_clip": 0.08177136, "balance_loss_mlp": 0.01702223, "epoch": 0.012625883060273561, "flos": 40122838200960.0, "grad_norm": 115.96841236097424, "language_loss": 1.57956791, "learning_rate": 3.4427468590832302e-06, "loss": 1.70927763, "num_input_tokens_seen": 4441470, "router_z_loss_clip": 28.0, "router_z_loss_mlp": 2.93554688, "step": 210, "time_per_iteration": 2.9100658893585205 }, { "auxiliary_loss_clip": 0.10965836, "auxiliary_loss_mlp": 0.02020279, "balance_loss_clip": 0.08172574, "balance_loss_mlp": 0.01740852, "epoch": 0.01268600631294153, "flos": 27096509013120.0, "grad_norm": 100.50494979549721, "language_loss": 1.64161384, "learning_rate": 3.445805545042314e-06, "loss": 1.77147508, "num_input_tokens_seen": 4459950, "router_z_loss_clip": 27.90625, "router_z_loss_mlp": 2.79296875, "step": 211, "time_per_iteration": 2.8991847038269043 }, { "auxiliary_loss_clip": 0.10949131, "auxiliary_loss_mlp": 0.02096077, "balance_loss_clip": 0.0817456, "balance_loss_mlp": 0.01794907, "epoch": 0.012746129565609499, "flos": 16988431898880.0, "grad_norm": 58.191935080588706, "language_loss": 1.62653613, "learning_rate": 3.448849769075239e-06, "loss": 1.75698817, "num_input_tokens_seen": 4478390, "router_z_loss_clip": 27.75, "router_z_loss_mlp": 3.00976562, "step": 212, "time_per_iteration": 2.9669017791748047 }, { "auxiliary_loss_clip": 0.10912459, "auxiliary_loss_mlp": 0.0213514, "balance_loss_clip": 0.08158834, "balance_loss_mlp": 0.01829392, "epoch": 0.012806252818277469, "flos": 46543621668480.0, "grad_norm": 203.52397909758855, "language_loss": 1.39114833, "learning_rate": 3.4518796672950093e-06, "loss": 1.52162445, "num_input_tokens_seen": 4501665, "router_z_loss_clip": 27.515625, "router_z_loss_mlp": 3.05664062, "step": 213, "time_per_iteration": 2.977318286895752 }, { "auxiliary_loss_clip": 0.10917506, "auxiliary_loss_mlp": 0.02143429, "balance_loss_clip": 0.08151287, "balance_loss_mlp": 0.01848171, "epoch": 0.012866376070945438, "flos": 14393234442240.0, "grad_norm": 77.15593333920886, "language_loss": 1.46743393, "learning_rate": 3.4548953739020187e-06, "loss": 1.59804332, "num_input_tokens_seen": 4519055, "router_z_loss_clip": 27.671875, "router_z_loss_mlp": 2.95117188, "step": 214, "time_per_iteration": 2.7443134784698486 }, { "auxiliary_loss_clip": 0.10901947, "auxiliary_loss_mlp": 0.0215728, "balance_loss_clip": 0.08144978, "balance_loss_mlp": 0.01851914, "epoch": 0.012926499323613408, "flos": 26148029921280.0, "grad_norm": 80.63031755504639, "language_loss": 1.35821486, "learning_rate": 3.4578970212197196e-06, "loss": 1.4888072, "num_input_tokens_seen": 4540870, "router_z_loss_clip": 27.5625, "router_z_loss_mlp": 3.0546875, "step": 215, "time_per_iteration": 5.709124565124512 }, { "auxiliary_loss_clip": 0.10901303, "auxiliary_loss_mlp": 0.02162593, "balance_loss_clip": 0.0815216, "balance_loss_mlp": 0.01886218, "epoch": 0.012986622576281377, "flos": 30124989532800.0, "grad_norm": 76.98934528398533, "language_loss": 1.58053398, "learning_rate": 3.460884739729461e-06, "loss": 1.71117282, "num_input_tokens_seen": 4560395, "router_z_loss_clip": 27.515625, "router_z_loss_mlp": 2.76367188, "step": 216, "time_per_iteration": 5.871690273284912 }, { "auxiliary_loss_clip": 0.10919537, "auxiliary_loss_mlp": 0.02142251, "balance_loss_clip": 0.08159825, "balance_loss_mlp": 0.0186683, "epoch": 0.013046745828949347, "flos": 13959112838400.0, "grad_norm": 74.82653520573953, "language_loss": 1.61256909, "learning_rate": 3.463858658104523e-06, "loss": 1.74318695, "num_input_tokens_seen": 4575785, "router_z_loss_clip": 27.59375, "router_z_loss_mlp": 2.75195312, "step": 217, "time_per_iteration": 2.7133610248565674 }, { "auxiliary_loss_clip": 0.10919586, "auxiliary_loss_mlp": 0.02145747, "balance_loss_clip": 0.08150701, "balance_loss_mlp": 0.0189474, "epoch": 0.013106869081617315, "flos": 17353595992320.0, "grad_norm": 88.47028436974244, "language_loss": 1.44425917, "learning_rate": 3.4668189032433696e-06, "loss": 1.57491243, "num_input_tokens_seen": 4594985, "router_z_loss_clip": 27.65625, "router_z_loss_mlp": 2.51171875, "step": 218, "time_per_iteration": 2.8007471561431885 }, { "auxiliary_loss_clip": 0.10930814, "auxiliary_loss_mlp": 0.02112394, "balance_loss_clip": 0.08146541, "balance_loss_mlp": 0.01877027, "epoch": 0.013166992334285284, "flos": 25892004170880.0, "grad_norm": 137.65004916175144, "language_loss": 1.40982533, "learning_rate": 3.46976560030214e-06, "loss": 1.54025733, "num_input_tokens_seen": 4616125, "router_z_loss_clip": 27.859375, "router_z_loss_mlp": 2.35449219, "step": 219, "time_per_iteration": 2.7860114574432373 }, { "auxiliary_loss_clip": 0.10935922, "auxiliary_loss_mlp": 0.02094616, "balance_loss_clip": 0.0815215, "balance_loss_mlp": 0.01856769, "epoch": 0.013227115586953254, "flos": 31184032487040.0, "grad_norm": 242.12448717719937, "language_loss": 1.53183818, "learning_rate": 3.4726988727263976e-06, "loss": 1.66214347, "num_input_tokens_seen": 4637795, "router_z_loss_clip": 27.828125, "router_z_loss_mlp": 2.37890625, "step": 220, "time_per_iteration": 2.8548035621643066 }, { "auxiliary_loss_clip": 0.10902297, "auxiliary_loss_mlp": 0.02076837, "balance_loss_clip": 0.08144423, "balance_loss_mlp": 0.0183613, "epoch": 0.013287238839621223, "flos": 20415213601920.0, "grad_norm": 78.46148496572852, "language_loss": 1.41299176, "learning_rate": 3.475618842282164e-06, "loss": 1.54278302, "num_input_tokens_seen": 4656835, "router_z_loss_clip": 27.578125, "router_z_loss_mlp": 2.40820312, "step": 221, "time_per_iteration": 2.739452362060547 }, { "auxiliary_loss_clip": 0.10931256, "auxiliary_loss_mlp": 0.02012765, "balance_loss_clip": 0.08152997, "balance_loss_mlp": 0.01783883, "epoch": 0.013347362092289193, "flos": 14142365717760.0, "grad_norm": 122.64249097425026, "language_loss": 1.50664437, "learning_rate": 3.4785256290862486e-06, "loss": 1.63608456, "num_input_tokens_seen": 4673015, "router_z_loss_clip": 27.765625, "router_z_loss_mlp": 2.2890625, "step": 222, "time_per_iteration": 2.874008893966675 }, { "auxiliary_loss_clip": 0.10974105, "auxiliary_loss_mlp": 0.01994867, "balance_loss_clip": 0.08166558, "balance_loss_mlp": 0.01795835, "epoch": 0.013407485344957162, "flos": 21803977572480.0, "grad_norm": 195.0272790549294, "language_loss": 1.45205688, "learning_rate": 3.481419351635897e-06, "loss": 1.5817467, "num_input_tokens_seen": 4692355, "router_z_loss_clip": 28.0625, "router_z_loss_mlp": 1.99121094, "step": 223, "time_per_iteration": 2.71883487701416 }, { "auxiliary_loss_clip": 0.10972212, "auxiliary_loss_mlp": 0.01992263, "balance_loss_clip": 0.08147465, "balance_loss_mlp": 0.0179037, "epoch": 0.013467608597625132, "flos": 18627058344960.0, "grad_norm": 101.09878013583065, "language_loss": 1.44547963, "learning_rate": 3.484300126837776e-06, "loss": 1.5751245, "num_input_tokens_seen": 4710080, "router_z_loss_clip": 28.21875, "router_z_loss_mlp": 2.02148438, "step": 224, "time_per_iteration": 2.712034225463867 }, { "auxiliary_loss_clip": 0.1098446, "auxiliary_loss_mlp": 0.02029454, "balance_loss_clip": 0.08160213, "balance_loss_mlp": 0.01841675, "epoch": 0.013527731850293101, "flos": 18558352396800.0, "grad_norm": 111.77585724669879, "language_loss": 1.42205453, "learning_rate": 3.487168070036317e-06, "loss": 1.55219376, "num_input_tokens_seen": 4728980, "router_z_loss_clip": 28.234375, "router_z_loss_mlp": 1.87792969, "step": 225, "time_per_iteration": 2.7924132347106934 }, { "auxiliary_loss_clip": 0.10969555, "auxiliary_loss_mlp": 0.02081825, "balance_loss_clip": 0.08152533, "balance_loss_mlp": 0.01898433, "epoch": 0.01358785510296107, "flos": 19170318291840.0, "grad_norm": 125.78489526607811, "language_loss": 1.5010227, "learning_rate": 3.4900232950414224e-06, "loss": 1.63153648, "num_input_tokens_seen": 4747020, "router_z_loss_clip": 28.171875, "router_z_loss_mlp": 1.83496094, "step": 226, "time_per_iteration": 2.7119739055633545 }, { "auxiliary_loss_clip": 0.1098239, "auxiliary_loss_mlp": 0.0205101, "balance_loss_clip": 0.08159718, "balance_loss_mlp": 0.01871147, "epoch": 0.01364797835562904, "flos": 23336442495360.0, "grad_norm": 105.50493268922995, "language_loss": 1.42007875, "learning_rate": 3.4928659141555727e-06, "loss": 1.55041277, "num_input_tokens_seen": 4765000, "router_z_loss_clip": 28.21875, "router_z_loss_mlp": 1.79882812, "step": 227, "time_per_iteration": 2.7505042552948 }, { "auxiliary_loss_clip": 0.10413106, "auxiliary_loss_mlp": 0.0189137, "balance_loss_clip": 0.08180417, "balance_loss_mlp": 0.01759, "epoch": 0.013708101608297009, "flos": 71016561089280.0, "grad_norm": 1.3573310395218086, "language_loss": 0.57872188, "learning_rate": 3.4956960382003234e-06, "loss": 0.70176667, "num_input_tokens_seen": 4833210, "router_z_loss_clip": 22.265625, "router_z_loss_mlp": 1.32421875, "step": 228, "time_per_iteration": 3.4243690967559814 }, { "auxiliary_loss_clip": 0.10934681, "auxiliary_loss_mlp": 0.02109737, "balance_loss_clip": 0.08141253, "balance_loss_mlp": 0.01932926, "epoch": 0.013768224860964979, "flos": 16330583093760.0, "grad_norm": 100.31629076730684, "language_loss": 1.35243988, "learning_rate": 3.4985137765422354e-06, "loss": 1.48288405, "num_input_tokens_seen": 4850120, "router_z_loss_clip": 27.90625, "router_z_loss_mlp": 1.76855469, "step": 229, "time_per_iteration": 2.7118582725524902 }, { "auxiliary_loss_clip": 0.1091322, "auxiliary_loss_mlp": 0.02043134, "balance_loss_clip": 0.08134322, "balance_loss_mlp": 0.01868898, "epoch": 0.013828348113632948, "flos": 20199159048960.0, "grad_norm": 80.97898565316619, "language_loss": 1.3485918, "learning_rate": 3.501319237118231e-06, "loss": 1.47815549, "num_input_tokens_seen": 4866215, "router_z_loss_clip": 27.8125, "router_z_loss_mlp": 1.74316406, "step": 230, "time_per_iteration": 2.7079660892486572 }, { "auxiliary_loss_clip": 0.10932577, "auxiliary_loss_mlp": 0.02066154, "balance_loss_clip": 0.08155647, "balance_loss_mlp": 0.01905365, "epoch": 0.013888471366300916, "flos": 20747408313600.0, "grad_norm": 195.4095338784864, "language_loss": 1.29123521, "learning_rate": 3.5041125264604056e-06, "loss": 1.42122257, "num_input_tokens_seen": 4885630, "router_z_loss_clip": 27.765625, "router_z_loss_mlp": 1.60839844, "step": 231, "time_per_iteration": 2.824749708175659 }, { "auxiliary_loss_clip": 0.10971954, "auxiliary_loss_mlp": 0.02071127, "balance_loss_clip": 0.08165792, "balance_loss_mlp": 0.01913008, "epoch": 0.013948594618968886, "flos": 22097123481600.0, "grad_norm": 50.59255945473631, "language_loss": 1.30814672, "learning_rate": 3.5068937497203002e-06, "loss": 1.43857741, "num_input_tokens_seen": 4905570, "router_z_loss_clip": 28.078125, "router_z_loss_mlp": 1.58203125, "step": 232, "time_per_iteration": 2.719578742980957 }, { "auxiliary_loss_clip": 0.10969584, "auxiliary_loss_mlp": 0.02135091, "balance_loss_clip": 0.08169422, "balance_loss_mlp": 0.01985364, "epoch": 0.014008717871636855, "flos": 19069229940480.0, "grad_norm": 60.67396516511377, "language_loss": 1.23404992, "learning_rate": 3.509663010692652e-06, "loss": 1.36509669, "num_input_tokens_seen": 4923535, "router_z_loss_clip": 28.0, "router_z_loss_mlp": 1.49707031, "step": 233, "time_per_iteration": 2.824281692504883 }, { "auxiliary_loss_clip": 0.11050306, "auxiliary_loss_mlp": 0.02229309, "balance_loss_clip": 0.08196901, "balance_loss_mlp": 0.02104425, "epoch": 0.014068841124304825, "flos": 14534839042560.0, "grad_norm": 2183.452126493528, "language_loss": 1.33000302, "learning_rate": 3.512420411838642e-06, "loss": 1.46279931, "num_input_tokens_seen": 4939200, "router_z_loss_clip": 28.5, "router_z_loss_mlp": 1.24755859, "step": 234, "time_per_iteration": 2.7762162685394287 }, { "auxiliary_loss_clip": 0.10957129, "auxiliary_loss_mlp": 0.02131996, "balance_loss_clip": 0.08172818, "balance_loss_mlp": 0.01998386, "epoch": 0.014128964376972794, "flos": 18083253346560.0, "grad_norm": 116.9130490437534, "language_loss": 1.38589096, "learning_rate": 3.515166054308634e-06, "loss": 1.51678228, "num_input_tokens_seen": 4956620, "router_z_loss_clip": 27.828125, "router_z_loss_mlp": 1.33496094, "step": 235, "time_per_iteration": 2.75978684425354 }, { "auxiliary_loss_clip": 0.11010915, "auxiliary_loss_mlp": 0.02282195, "balance_loss_clip": 0.08182159, "balance_loss_mlp": 0.02159362, "epoch": 0.014189087629640764, "flos": 25340778086400.0, "grad_norm": 743.0346884585028, "language_loss": 1.2382437, "learning_rate": 3.5179000379644498e-06, "loss": 1.37117481, "num_input_tokens_seen": 4975650, "router_z_loss_clip": 28.28125, "router_z_loss_mlp": 1.22851562, "step": 236, "time_per_iteration": 2.7732717990875244 }, { "auxiliary_loss_clip": 0.10966392, "auxiliary_loss_mlp": 0.02303015, "balance_loss_clip": 0.08185383, "balance_loss_mlp": 0.02175318, "epoch": 0.014249210882308733, "flos": 36148939263360.0, "grad_norm": 56.66924345531136, "language_loss": 1.21429288, "learning_rate": 3.520622461401154e-06, "loss": 1.34698701, "num_input_tokens_seen": 4997415, "router_z_loss_clip": 27.828125, "router_z_loss_mlp": 1.27636719, "step": 237, "time_per_iteration": 2.8684213161468506 }, { "auxiliary_loss_clip": 0.10934368, "auxiliary_loss_mlp": 0.02271314, "balance_loss_clip": 0.08181767, "balance_loss_mlp": 0.02141614, "epoch": 0.014309334134976702, "flos": 12937986656640.0, "grad_norm": 214.2262827394738, "language_loss": 1.21656013, "learning_rate": 3.5233334219683935e-06, "loss": 1.34861696, "num_input_tokens_seen": 5013905, "router_z_loss_clip": 27.515625, "router_z_loss_mlp": 1.29785156, "step": 238, "time_per_iteration": 2.736187696456909 }, { "auxiliary_loss_clip": 0.10882899, "auxiliary_loss_mlp": 0.02200053, "balance_loss_clip": 0.08172622, "balance_loss_mlp": 0.02060244, "epoch": 0.014369457387644672, "flos": 20783857639680.0, "grad_norm": 31.52786949994366, "language_loss": 1.27088118, "learning_rate": 3.526033015791284e-06, "loss": 1.40171075, "num_input_tokens_seen": 5033645, "router_z_loss_clip": 27.125, "router_z_loss_mlp": 1.3984375, "step": 239, "time_per_iteration": 2.7793784141540527 }, { "auxiliary_loss_clip": 0.10856512, "auxiliary_loss_mlp": 0.02187028, "balance_loss_clip": 0.08159775, "balance_loss_mlp": 0.02047315, "epoch": 0.01442958064031264, "flos": 25855638698880.0, "grad_norm": 36.447334541047695, "language_loss": 1.2945801, "learning_rate": 3.528721337790862e-06, "loss": 1.42501557, "num_input_tokens_seen": 5052875, "router_z_loss_clip": 26.953125, "router_z_loss_mlp": 1.39746094, "step": 240, "time_per_iteration": 2.7800652980804443 }, { "auxiliary_loss_clip": 0.10804121, "auxiliary_loss_mlp": 0.02153849, "balance_loss_clip": 0.08153222, "balance_loss_mlp": 0.0199325, "epoch": 0.014489703892980611, "flos": 28227150881280.0, "grad_norm": 31.075067570990175, "language_loss": 1.21250105, "learning_rate": 3.531398481704111e-06, "loss": 1.34208071, "num_input_tokens_seen": 5075005, "router_z_loss_clip": 26.546875, "router_z_loss_mlp": 1.60546875, "step": 241, "time_per_iteration": 2.7983202934265137 }, { "auxiliary_loss_clip": 0.10725023, "auxiliary_loss_mlp": 0.02170284, "balance_loss_clip": 0.08119947, "balance_loss_mlp": 0.01996429, "epoch": 0.01454982714564858, "flos": 22497311381760.0, "grad_norm": 117.45278349513595, "language_loss": 1.28913331, "learning_rate": 3.534064540103573e-06, "loss": 1.41808641, "num_input_tokens_seen": 5091875, "router_z_loss_clip": 26.0625, "router_z_loss_mlp": 1.73730469, "step": 242, "time_per_iteration": 2.8302454948425293 }, { "auxiliary_loss_clip": 0.10722608, "auxiliary_loss_mlp": 0.02217712, "balance_loss_clip": 0.08118095, "balance_loss_mlp": 0.02042809, "epoch": 0.014609950398316548, "flos": 21659689641600.0, "grad_norm": 289.89181145564015, "language_loss": 1.37394333, "learning_rate": 3.536719604416555e-06, "loss": 1.50334656, "num_input_tokens_seen": 5111290, "router_z_loss_clip": 26.03125, "router_z_loss_mlp": 1.74902344, "step": 243, "time_per_iteration": 2.812532424926758 }, { "auxiliary_loss_clip": 0.10645698, "auxiliary_loss_mlp": 0.0225419, "balance_loss_clip": 0.08093758, "balance_loss_mlp": 0.02046289, "epoch": 0.014670073650984519, "flos": 21876163464960.0, "grad_norm": 206.45342782685617, "language_loss": 1.42209065, "learning_rate": 3.5393637649439464e-06, "loss": 1.55108953, "num_input_tokens_seen": 5132265, "router_z_loss_clip": 25.5, "router_z_loss_mlp": 2.08007812, "step": 244, "time_per_iteration": 2.7669363021850586 }, { "auxiliary_loss_clip": 0.1065483, "auxiliary_loss_mlp": 0.02272133, "balance_loss_clip": 0.08092082, "balance_loss_mlp": 0.02055267, "epoch": 0.014730196903652487, "flos": 23190142066560.0, "grad_norm": 182.4717286398996, "language_loss": 1.35098445, "learning_rate": 3.54199711087864e-06, "loss": 1.48025417, "num_input_tokens_seen": 5148575, "router_z_loss_clip": 25.59375, "router_z_loss_mlp": 2.16699219, "step": 245, "time_per_iteration": 2.7266640663146973 }, { "auxiliary_loss_clip": 0.10630374, "auxiliary_loss_mlp": 0.02271285, "balance_loss_clip": 0.08096874, "balance_loss_mlp": 0.02028289, "epoch": 0.014790320156320457, "flos": 23229442431360.0, "grad_norm": 46.464969194095275, "language_loss": 1.25353289, "learning_rate": 3.5446197303235913e-06, "loss": 1.38254952, "num_input_tokens_seen": 5170415, "router_z_loss_clip": 25.328125, "router_z_loss_mlp": 2.4296875, "step": 246, "time_per_iteration": 2.7484755516052246 }, { "auxiliary_loss_clip": 0.10633783, "auxiliary_loss_mlp": 0.02252098, "balance_loss_clip": 0.08098595, "balance_loss_mlp": 0.02006241, "epoch": 0.014850443408988426, "flos": 15821005288320.0, "grad_norm": 51.90262878133645, "language_loss": 1.25566173, "learning_rate": 3.5472317103095034e-06, "loss": 1.38452053, "num_input_tokens_seen": 5188565, "router_z_loss_clip": 25.328125, "router_z_loss_mlp": 2.45605469, "step": 247, "time_per_iteration": 2.746431350708008 }, { "auxiliary_loss_clip": 0.10617188, "auxiliary_loss_mlp": 0.02326912, "balance_loss_clip": 0.08111985, "balance_loss_mlp": 0.0206427, "epoch": 0.014910566661656396, "flos": 22787899741440.0, "grad_norm": 28.92518397889307, "language_loss": 1.16154099, "learning_rate": 3.549833136812155e-06, "loss": 1.29098201, "num_input_tokens_seen": 5207810, "router_z_loss_clip": 25.0625, "router_z_loss_mlp": 2.625, "step": 248, "time_per_iteration": 2.749980926513672 }, { "auxiliary_loss_clip": 0.10583431, "auxiliary_loss_mlp": 0.0230246, "balance_loss_clip": 0.08085144, "balance_loss_mlp": 0.02020364, "epoch": 0.014970689914324365, "flos": 26871440146560.0, "grad_norm": 96.23832689788304, "language_loss": 1.19038117, "learning_rate": 3.552424094769381e-06, "loss": 1.31924009, "num_input_tokens_seen": 5226210, "router_z_loss_clip": 24.984375, "router_z_loss_mlp": 2.82226562, "step": 249, "time_per_iteration": 2.755146026611328 }, { "auxiliary_loss_clip": 0.10582642, "auxiliary_loss_mlp": 0.02218426, "balance_loss_clip": 0.08082327, "balance_loss_mlp": 0.01942433, "epoch": 0.015030813166992334, "flos": 13989943941120.0, "grad_norm": 75.70676955526795, "language_loss": 1.32026982, "learning_rate": 3.5550046680977174e-06, "loss": 1.44828057, "num_input_tokens_seen": 5241660, "router_z_loss_clip": 25.015625, "router_z_loss_mlp": 2.75976562, "step": 250, "time_per_iteration": 2.710949659347534 }, { "auxiliary_loss_clip": 0.10598561, "auxiliary_loss_mlp": 0.02144517, "balance_loss_clip": 0.0809335, "balance_loss_mlp": 0.0189618, "epoch": 0.015090936419660304, "flos": 24724787195520.0, "grad_norm": 506.43430998276597, "language_loss": 1.35167825, "learning_rate": 3.5575749397087034e-06, "loss": 1.47910905, "num_input_tokens_seen": 5261090, "router_z_loss_clip": 25.03125, "router_z_loss_mlp": 2.484375, "step": 251, "time_per_iteration": 2.7576348781585693 }, { "auxiliary_loss_clip": 0.10596387, "auxiliary_loss_mlp": 0.02143365, "balance_loss_clip": 0.08081001, "balance_loss_mlp": 0.01894456, "epoch": 0.015151059672328273, "flos": 25745829523200.0, "grad_norm": 1191.7410561950649, "language_loss": 1.21585989, "learning_rate": 3.5601349915248707e-06, "loss": 1.34325743, "num_input_tokens_seen": 5279175, "router_z_loss_clip": 25.125, "router_z_loss_mlp": 2.49023438, "step": 252, "time_per_iteration": 2.76078462600708 }, { "auxiliary_loss_clip": 0.10597921, "auxiliary_loss_mlp": 0.02087212, "balance_loss_clip": 0.08089147, "balance_loss_mlp": 0.01862622, "epoch": 0.015211182924996243, "flos": 21877588984320.0, "grad_norm": 294.21762268014334, "language_loss": 1.40722466, "learning_rate": 3.5626849044954064e-06, "loss": 1.53407609, "num_input_tokens_seen": 5296975, "router_z_loss_clip": 25.09375, "router_z_loss_mlp": 2.24609375, "step": 253, "time_per_iteration": 2.7428462505340576 }, { "auxiliary_loss_clip": 0.09942168, "auxiliary_loss_mlp": 0.01814742, "balance_loss_clip": 0.08013746, "balance_loss_mlp": 0.01533408, "epoch": 0.015271306177664212, "flos": 66915159765120.0, "grad_norm": 1.1623089082959217, "language_loss": 0.56290162, "learning_rate": 3.5652247586115167e-06, "loss": 0.68047071, "num_input_tokens_seen": 5358375, "router_z_loss_clip": 19.25, "router_z_loss_mlp": 2.81640625, "step": 254, "time_per_iteration": 4.792823553085327 }, { "auxiliary_loss_clip": 0.10584889, "auxiliary_loss_mlp": 0.02052567, "balance_loss_clip": 0.08070633, "balance_loss_mlp": 0.01821587, "epoch": 0.01533142943033218, "flos": 26841405657600.0, "grad_norm": 77.28125294274467, "language_loss": 1.26056254, "learning_rate": 3.567754632921479e-06, "loss": 1.38693702, "num_input_tokens_seen": 5377255, "router_z_loss_clip": 25.140625, "router_z_loss_mlp": 2.31054688, "step": 255, "time_per_iteration": 4.415185213088989 }, { "auxiliary_loss_clip": 0.10562941, "auxiliary_loss_mlp": 0.02009043, "balance_loss_clip": 0.08074763, "balance_loss_mlp": 0.01770625, "epoch": 0.01539155268300015, "flos": 20820055403520.0, "grad_norm": 105.63408963941578, "language_loss": 1.25605464, "learning_rate": 3.5702746055454075e-06, "loss": 1.38177443, "num_input_tokens_seen": 5395320, "router_z_loss_clip": 24.890625, "router_z_loss_mlp": 2.38671875, "step": 256, "time_per_iteration": 5.8118555545806885 }, { "auxiliary_loss_clip": 0.10544074, "auxiliary_loss_mlp": 0.01961974, "balance_loss_clip": 0.08064944, "balance_loss_mlp": 0.01734332, "epoch": 0.01545167593566812, "flos": 15967473425280.0, "grad_norm": 123.56244588935442, "language_loss": 1.19285119, "learning_rate": 3.5727847536897254e-06, "loss": 1.31791162, "num_input_tokens_seen": 5411970, "router_z_loss_clip": 24.796875, "router_z_loss_mlp": 2.27539062, "step": 257, "time_per_iteration": 2.710686683654785 }, { "auxiliary_loss_clip": 0.10580529, "auxiliary_loss_mlp": 0.01913402, "balance_loss_clip": 0.08070882, "balance_loss_mlp": 0.01714847, "epoch": 0.01551179918833609, "flos": 22608378368640.0, "grad_norm": 75.49816181591198, "language_loss": 1.30932927, "learning_rate": 3.5752851536613596e-06, "loss": 1.43426859, "num_input_tokens_seen": 5430245, "router_z_loss_clip": 25.09375, "router_z_loss_mlp": 1.98632812, "step": 258, "time_per_iteration": 2.743995189666748 }, { "auxiliary_loss_clip": 0.10580976, "auxiliary_loss_mlp": 0.01871293, "balance_loss_clip": 0.08071049, "balance_loss_mlp": 0.01676839, "epoch": 0.015571922441004058, "flos": 22822713912960.0, "grad_norm": 151.47279040727867, "language_loss": 1.27403307, "learning_rate": 3.577775880881658e-06, "loss": 1.39855576, "num_input_tokens_seen": 5448905, "router_z_loss_clip": 25.109375, "router_z_loss_mlp": 1.94433594, "step": 259, "time_per_iteration": 2.743861198425293 }, { "auxiliary_loss_clip": 0.10576621, "auxiliary_loss_mlp": 0.01851958, "balance_loss_clip": 0.08067229, "balance_loss_mlp": 0.01662463, "epoch": 0.015632045693672027, "flos": 18952502803200.0, "grad_norm": 77.24599294684478, "language_loss": 1.28941512, "learning_rate": 3.5802570099000424e-06, "loss": 1.41370082, "num_input_tokens_seen": 5466405, "router_z_loss_clip": 25.109375, "router_z_loss_mlp": 1.89453125, "step": 260, "time_per_iteration": 2.7531046867370605 }, { "auxiliary_loss_clip": 0.1063319, "auxiliary_loss_mlp": 0.0187014, "balance_loss_clip": 0.0808953, "balance_loss_mlp": 0.01687321, "epoch": 0.015692168946339995, "flos": 29979569571840.0, "grad_norm": 92.89918077688223, "language_loss": 1.24756277, "learning_rate": 3.5827286144073947e-06, "loss": 1.37259603, "num_input_tokens_seen": 5487055, "router_z_loss_clip": 25.421875, "router_z_loss_mlp": 1.83007812, "step": 261, "time_per_iteration": 2.9224071502685547 }, { "auxiliary_loss_clip": 0.10699748, "auxiliary_loss_mlp": 0.01871619, "balance_loss_clip": 0.08128223, "balance_loss_mlp": 0.01694521, "epoch": 0.015752292199007967, "flos": 19398363978240.0, "grad_norm": 52.14430147139385, "language_loss": 0.99231029, "learning_rate": 3.5851907672491904e-06, "loss": 1.11802411, "num_input_tokens_seen": 5506600, "router_z_loss_clip": 25.703125, "router_z_loss_mlp": 1.77050781, "step": 262, "time_per_iteration": 2.7580909729003906 }, { "auxiliary_loss_clip": 0.10683639, "auxiliary_loss_mlp": 0.0191358, "balance_loss_clip": 0.08109702, "balance_loss_mlp": 0.01736864, "epoch": 0.015812415451675936, "flos": 20346088383360.0, "grad_norm": 57.42724262094468, "language_loss": 1.03127134, "learning_rate": 3.587643540438383e-06, "loss": 1.15724349, "num_input_tokens_seen": 5524350, "router_z_loss_clip": 25.734375, "router_z_loss_mlp": 1.76757812, "step": 263, "time_per_iteration": 2.7515785694122314 }, { "auxiliary_loss_clip": 0.10696459, "auxiliary_loss_mlp": 0.01957213, "balance_loss_clip": 0.08118658, "balance_loss_mlp": 0.01766478, "epoch": 0.015872538704343905, "flos": 17530392107520.0, "grad_norm": 205.25219111390956, "language_loss": 1.27436137, "learning_rate": 3.590087005168037e-06, "loss": 1.40089798, "num_input_tokens_seen": 5542145, "router_z_loss_clip": 25.796875, "router_z_loss_mlp": 1.90820312, "step": 264, "time_per_iteration": 2.7773938179016113 }, { "auxiliary_loss_clip": 0.10670541, "auxiliary_loss_mlp": 0.02009335, "balance_loss_clip": 0.08108748, "balance_loss_mlp": 0.01811448, "epoch": 0.015932661957011873, "flos": 15264622177920.0, "grad_norm": 53.08383075153608, "language_loss": 1.38373995, "learning_rate": 3.5925212318237344e-06, "loss": 1.5105387, "num_input_tokens_seen": 5557920, "router_z_loss_clip": 25.625, "router_z_loss_mlp": 1.98144531, "step": 265, "time_per_iteration": 2.7439019680023193 }, { "auxiliary_loss_clip": 0.10611819, "auxiliary_loss_mlp": 0.01955072, "balance_loss_clip": 0.0806461, "balance_loss_mlp": 0.0175337, "epoch": 0.015992785209679845, "flos": 20308674735360.0, "grad_norm": 42.50767703314507, "language_loss": 1.11200345, "learning_rate": 3.5949462899957323e-06, "loss": 1.23767233, "num_input_tokens_seen": 5576290, "router_z_loss_clip": 25.453125, "router_z_loss_mlp": 2.01855469, "step": 266, "time_per_iteration": 2.7124879360198975 }, { "auxiliary_loss_clip": 0.10598169, "auxiliary_loss_mlp": 0.01955708, "balance_loss_clip": 0.08073585, "balance_loss_mlp": 0.01749905, "epoch": 0.016052908462347814, "flos": 23368195992960.0, "grad_norm": 99.66443114656697, "language_loss": 1.20787168, "learning_rate": 3.5973622484909068e-06, "loss": 1.3334105, "num_input_tokens_seen": 5595205, "router_z_loss_clip": 25.28125, "router_z_loss_mlp": 2.05664062, "step": 267, "time_per_iteration": 2.73384690284729 }, { "auxiliary_loss_clip": 0.10616863, "auxiliary_loss_mlp": 0.01931033, "balance_loss_clip": 0.0808523, "balance_loss_mlp": 0.01721415, "epoch": 0.016113031715015783, "flos": 21292722685440.0, "grad_norm": 73.0493088325239, "language_loss": 1.23023081, "learning_rate": 3.599769175344462e-06, "loss": 1.35570979, "num_input_tokens_seen": 5612645, "router_z_loss_clip": 25.3125, "router_z_loss_mlp": 2.09863281, "step": 268, "time_per_iteration": 2.7623796463012695 }, { "auxiliary_loss_clip": 0.10552512, "auxiliary_loss_mlp": 0.0194135, "balance_loss_clip": 0.08062648, "balance_loss_mlp": 0.01715139, "epoch": 0.01617315496768375, "flos": 18920371962240.0, "grad_norm": 65.96195322726, "language_loss": 1.20586538, "learning_rate": 3.602167137831432e-06, "loss": 1.33080399, "num_input_tokens_seen": 5628345, "router_z_loss_clip": 24.875, "router_z_loss_mlp": 2.265625, "step": 269, "time_per_iteration": 2.7403147220611572 }, { "auxiliary_loss_clip": 0.10617341, "auxiliary_loss_mlp": 0.01910561, "balance_loss_clip": 0.08095752, "balance_loss_mlp": 0.01703423, "epoch": 0.01623327822035172, "flos": 16552339724160.0, "grad_norm": 178.01097567134497, "language_loss": 1.27943325, "learning_rate": 3.6045562024779565e-06, "loss": 1.4047122, "num_input_tokens_seen": 5645940, "router_z_loss_clip": 25.234375, "router_z_loss_mlp": 2.0703125, "step": 270, "time_per_iteration": 2.749847412109375 }, { "auxiliary_loss_clip": 0.10580981, "auxiliary_loss_mlp": 0.01891213, "balance_loss_clip": 0.08066227, "balance_loss_mlp": 0.01700002, "epoch": 0.016293401473019692, "flos": 23520198499200.0, "grad_norm": 45.68895546960623, "language_loss": 1.22518063, "learning_rate": 3.606936435072361e-06, "loss": 1.34990263, "num_input_tokens_seen": 5665690, "router_z_loss_clip": 25.140625, "router_z_loss_mlp": 1.91308594, "step": 271, "time_per_iteration": 2.737215757369995 }, { "auxiliary_loss_clip": 0.1061901, "auxiliary_loss_mlp": 0.01927922, "balance_loss_clip": 0.08088507, "balance_loss_mlp": 0.01719067, "epoch": 0.01635352472568766, "flos": 29022579290880.0, "grad_norm": 84.32687840510764, "language_loss": 1.19829226, "learning_rate": 3.609307900676025e-06, "loss": 1.3237617, "num_input_tokens_seen": 5683190, "router_z_loss_clip": 25.328125, "router_z_loss_mlp": 2.08886719, "step": 272, "time_per_iteration": 2.778616428375244 }, { "auxiliary_loss_clip": 0.10649274, "auxiliary_loss_mlp": 0.01943746, "balance_loss_clip": 0.08120243, "balance_loss_mlp": 0.01748624, "epoch": 0.01641364797835563, "flos": 13375546277760.0, "grad_norm": 72.85190541264907, "language_loss": 1.12030125, "learning_rate": 3.611670663634051e-06, "loss": 1.24623156, "num_input_tokens_seen": 5699780, "router_z_loss_clip": 25.296875, "router_z_loss_mlp": 1.95117188, "step": 273, "time_per_iteration": 2.7111518383026123 }, { "auxiliary_loss_clip": 0.10657561, "auxiliary_loss_mlp": 0.01963974, "balance_loss_clip": 0.08108483, "balance_loss_mlp": 0.01760174, "epoch": 0.016473771231023598, "flos": 18883922636160.0, "grad_norm": 1308.4881702295816, "language_loss": 1.31543612, "learning_rate": 3.614024787585744e-06, "loss": 1.44165158, "num_input_tokens_seen": 5716980, "router_z_loss_clip": 25.515625, "router_z_loss_mlp": 2.03808594, "step": 274, "time_per_iteration": 2.695049285888672 }, { "auxiliary_loss_clip": 0.10602801, "auxiliary_loss_mlp": 0.01961263, "balance_loss_clip": 0.08088836, "balance_loss_mlp": 0.01742299, "epoch": 0.016533894483691566, "flos": 22608252587520.0, "grad_norm": 73.61961492447037, "language_loss": 1.19702959, "learning_rate": 3.6163703354748927e-06, "loss": 1.32267034, "num_input_tokens_seen": 5737780, "router_z_loss_clip": 25.15625, "router_z_loss_mlp": 2.19238281, "step": 275, "time_per_iteration": 2.7535433769226074 }, { "auxiliary_loss_clip": 0.10667402, "auxiliary_loss_mlp": 0.01957114, "balance_loss_clip": 0.08112603, "balance_loss_mlp": 0.01775153, "epoch": 0.01659401773635954, "flos": 21513640775040.0, "grad_norm": 47.03007484204966, "language_loss": 1.07216835, "learning_rate": 3.6187073695598707e-06, "loss": 1.19841361, "num_input_tokens_seen": 5758330, "router_z_loss_clip": 25.5625, "router_z_loss_mlp": 1.81933594, "step": 276, "time_per_iteration": 2.7853293418884277 }, { "auxiliary_loss_clip": 0.10708412, "auxiliary_loss_mlp": 0.01970091, "balance_loss_clip": 0.08144559, "balance_loss_mlp": 0.01783171, "epoch": 0.016654140989027507, "flos": 32858772842880.0, "grad_norm": 58.128408735876974, "language_loss": 1.09463584, "learning_rate": 3.621035951423551e-06, "loss": 1.22142076, "num_input_tokens_seen": 5778340, "router_z_loss_clip": 25.640625, "router_z_loss_mlp": 1.8671875, "step": 277, "time_per_iteration": 2.8669304847717285 }, { "auxiliary_loss_clip": 0.1076626, "auxiliary_loss_mlp": 0.02018869, "balance_loss_clip": 0.08169557, "balance_loss_mlp": 0.01846826, "epoch": 0.016714264241695476, "flos": 12310046559360.0, "grad_norm": 37.54685416873824, "language_loss": 1.07976103, "learning_rate": 3.623356141983041e-06, "loss": 1.20761228, "num_input_tokens_seen": 5794295, "router_z_loss_clip": 25.96875, "router_z_loss_mlp": 1.72070312, "step": 278, "time_per_iteration": 2.7667062282562256 }, { "auxiliary_loss_clip": 0.10779934, "auxiliary_loss_mlp": 0.01978563, "balance_loss_clip": 0.08170436, "balance_loss_mlp": 0.01814913, "epoch": 0.016774387494363444, "flos": 27130820060160.0, "grad_norm": 106.68337884953986, "language_loss": 1.2415359, "learning_rate": 3.6256680014992486e-06, "loss": 1.36912107, "num_input_tokens_seen": 5814405, "router_z_loss_clip": 26.125, "router_z_loss_mlp": 1.63671875, "step": 279, "time_per_iteration": 2.8102805614471436 }, { "auxiliary_loss_clip": 0.10829133, "auxiliary_loss_mlp": 0.01991597, "balance_loss_clip": 0.08201596, "balance_loss_mlp": 0.01841966, "epoch": 0.016834510747031413, "flos": 20197356186240.0, "grad_norm": 24.02615870427978, "language_loss": 1.25858295, "learning_rate": 3.6279715895862713e-06, "loss": 1.38679028, "num_input_tokens_seen": 5832795, "router_z_loss_clip": 26.265625, "router_z_loss_mlp": 1.49609375, "step": 280, "time_per_iteration": 2.7282345294952393 }, { "auxiliary_loss_clip": 0.1090824, "auxiliary_loss_mlp": 0.0202826, "balance_loss_clip": 0.08225574, "balance_loss_mlp": 0.01887307, "epoch": 0.016894633999699385, "flos": 27282067879680.0, "grad_norm": 25.593182514611424, "language_loss": 1.0111661, "learning_rate": 3.6302669652206183e-06, "loss": 1.14053106, "num_input_tokens_seen": 5855750, "router_z_loss_clip": 26.828125, "router_z_loss_mlp": 1.41113281, "step": 281, "time_per_iteration": 2.804511785507202 }, { "auxiliary_loss_clip": 0.10922664, "auxiliary_loss_mlp": 0.01998029, "balance_loss_clip": 0.08246116, "balance_loss_mlp": 0.01863561, "epoch": 0.016954757252367354, "flos": 14908262762880.0, "grad_norm": 75.07053969664524, "language_loss": 1.22864521, "learning_rate": 3.632554186750274e-06, "loss": 1.3578521, "num_input_tokens_seen": 5872610, "router_z_loss_clip": 26.78125, "router_z_loss_mlp": 1.34472656, "step": 282, "time_per_iteration": 2.7086706161499023 }, { "auxiliary_loss_clip": 0.10971005, "auxiliary_loss_mlp": 0.02038743, "balance_loss_clip": 0.08251718, "balance_loss_mlp": 0.01910474, "epoch": 0.017014880505035322, "flos": 21364824723840.0, "grad_norm": 180.29082014124324, "language_loss": 1.1593976, "learning_rate": 3.6348333119035937e-06, "loss": 1.28949499, "num_input_tokens_seen": 5892985, "router_z_loss_clip": 27.1875, "router_z_loss_mlp": 1.28320312, "step": 283, "time_per_iteration": 2.763383626937866 }, { "auxiliary_loss_clip": 0.10927565, "auxiliary_loss_mlp": 0.0198477, "balance_loss_clip": 0.08254448, "balance_loss_mlp": 0.01845534, "epoch": 0.01707500375770329, "flos": 35341561647360.0, "grad_norm": 129.03781204583257, "language_loss": 1.14479685, "learning_rate": 3.6371043977980503e-06, "loss": 1.27392018, "num_input_tokens_seen": 5914060, "router_z_loss_clip": 26.765625, "router_z_loss_mlp": 1.39257812, "step": 284, "time_per_iteration": 2.854335308074951 }, { "auxiliary_loss_clip": 0.11015406, "auxiliary_loss_mlp": 0.0198828, "balance_loss_clip": 0.08298483, "balance_loss_mlp": 0.01853716, "epoch": 0.01713512701037126, "flos": 23588065906560.0, "grad_norm": 21.298712882849582, "language_loss": 1.29111028, "learning_rate": 3.639367500948819e-06, "loss": 1.42114711, "num_input_tokens_seen": 5932860, "router_z_loss_clip": 27.171875, "router_z_loss_mlp": 1.34667969, "step": 285, "time_per_iteration": 2.736524820327759 }, { "auxiliary_loss_clip": 0.11003432, "auxiliary_loss_mlp": 0.01982556, "balance_loss_clip": 0.08289581, "balance_loss_mlp": 0.01840363, "epoch": 0.01719525026303923, "flos": 27641781457920.0, "grad_norm": 38.58743976151074, "language_loss": 1.23023808, "learning_rate": 3.6416226772772178e-06, "loss": 1.360098, "num_input_tokens_seen": 5952725, "router_z_loss_clip": 27.1875, "router_z_loss_mlp": 1.421875, "step": 286, "time_per_iteration": 2.7615838050842285 }, { "auxiliary_loss_clip": 0.11025167, "auxiliary_loss_mlp": 0.0195934, "balance_loss_clip": 0.08301717, "balance_loss_mlp": 0.01827352, "epoch": 0.0172553735157072, "flos": 26987035253760.0, "grad_norm": 27.560684417213817, "language_loss": 1.1615119, "learning_rate": 3.643869982119001e-06, "loss": 1.2913568, "num_input_tokens_seen": 5970560, "router_z_loss_clip": 27.25, "router_z_loss_mlp": 1.3203125, "step": 287, "time_per_iteration": 2.742321729660034 }, { "auxiliary_loss_clip": 0.11026175, "auxiliary_loss_mlp": 0.01975251, "balance_loss_clip": 0.08300602, "balance_loss_mlp": 0.01841355, "epoch": 0.01731549676837517, "flos": 14060578533120.0, "grad_norm": 32.57209961704188, "language_loss": 1.3804822, "learning_rate": 3.646109470232502e-06, "loss": 1.51049662, "num_input_tokens_seen": 5982980, "router_z_loss_clip": 27.25, "router_z_loss_mlp": 1.33984375, "step": 288, "time_per_iteration": 2.7507340908050537 }, { "auxiliary_loss_clip": 0.09814686, "auxiliary_loss_mlp": 0.01997156, "balance_loss_clip": 0.07997867, "balance_loss_mlp": 0.01804799, "epoch": 0.017375620021043137, "flos": 66533545543680.0, "grad_norm": 1.404415418204711, "language_loss": 0.6476754, "learning_rate": 3.6483411958066417e-06, "loss": 0.7657938, "num_input_tokens_seen": 6049445, "router_z_loss_clip": 18.171875, "router_z_loss_mlp": 1.92285156, "step": 289, "time_per_iteration": 3.4570860862731934 }, { "auxiliary_loss_clip": 0.10943717, "auxiliary_loss_mlp": 0.01858325, "balance_loss_clip": 0.08290383, "balance_loss_mlp": 0.01706596, "epoch": 0.01743574327371111, "flos": 15229472590080.0, "grad_norm": 34.43726256314337, "language_loss": 1.20645785, "learning_rate": 3.6505652124687957e-06, "loss": 1.33447838, "num_input_tokens_seen": 6064150, "router_z_loss_clip": 26.546875, "router_z_loss_mlp": 1.51660156, "step": 290, "time_per_iteration": 2.6829311847686768 }, { "auxiliary_loss_clip": 0.1092225, "auxiliary_loss_mlp": 0.0180376, "balance_loss_clip": 0.08292429, "balance_loss_mlp": 0.01662807, "epoch": 0.017495866526379078, "flos": 25380833137920.0, "grad_norm": 59.490006665439104, "language_loss": 1.14332092, "learning_rate": 3.6527815732925258e-06, "loss": 1.27058101, "num_input_tokens_seen": 6083920, "router_z_loss_clip": 26.328125, "router_z_loss_mlp": 1.40917969, "step": 291, "time_per_iteration": 2.7716500759124756 }, { "auxiliary_loss_clip": 0.10808207, "auxiliary_loss_mlp": 0.01790578, "balance_loss_clip": 0.08249392, "balance_loss_mlp": 0.01661165, "epoch": 0.017555989779047047, "flos": 26366683950720.0, "grad_norm": 273.56194241998077, "language_loss": 0.9911502, "learning_rate": 3.6549903308051806e-06, "loss": 1.11713815, "num_input_tokens_seen": 6105460, "router_z_loss_clip": 25.609375, "router_z_loss_mlp": 1.29492188, "step": 292, "time_per_iteration": 2.7938642501831055 }, { "auxiliary_loss_clip": 0.10815758, "auxiliary_loss_mlp": 0.0181448, "balance_loss_clip": 0.08259781, "balance_loss_mlp": 0.01680584, "epoch": 0.017616113031715015, "flos": 22344134918400.0, "grad_norm": 93.98498977921987, "language_loss": 1.18506086, "learning_rate": 3.6571915369953646e-06, "loss": 1.31136322, "num_input_tokens_seen": 6122890, "router_z_loss_clip": 25.546875, "router_z_loss_mlp": 1.33886719, "step": 293, "time_per_iteration": 2.7122936248779297 }, { "auxiliary_loss_clip": 0.10772584, "auxiliary_loss_mlp": 0.01868709, "balance_loss_clip": 0.08247141, "balance_loss_mlp": 0.01732334, "epoch": 0.017676236284382984, "flos": 20163087066240.0, "grad_norm": 261.6868410925989, "language_loss": 1.13228333, "learning_rate": 3.6593852433202797e-06, "loss": 1.25869632, "num_input_tokens_seen": 6142890, "router_z_loss_clip": 25.25, "router_z_loss_mlp": 1.36328125, "step": 294, "time_per_iteration": 4.184765338897705 }, { "auxiliary_loss_clip": 0.10759985, "auxiliary_loss_mlp": 0.01919133, "balance_loss_clip": 0.08252198, "balance_loss_mlp": 0.01775986, "epoch": 0.017736359537050956, "flos": 25229501464320.0, "grad_norm": 59.2900451626738, "language_loss": 1.11089289, "learning_rate": 3.6615715007129453e-06, "loss": 1.23768401, "num_input_tokens_seen": 6162030, "router_z_loss_clip": 25.0625, "router_z_loss_mlp": 1.43066406, "step": 295, "time_per_iteration": 5.74894380569458 }, { "auxiliary_loss_clip": 0.10722776, "auxiliary_loss_mlp": 0.01930268, "balance_loss_clip": 0.08228456, "balance_loss_mlp": 0.01799996, "epoch": 0.017796482789718925, "flos": 20344914426240.0, "grad_norm": 148.64744030454474, "language_loss": 1.15460384, "learning_rate": 3.6637503595892897e-06, "loss": 1.28113437, "num_input_tokens_seen": 6180540, "router_z_loss_clip": 24.953125, "router_z_loss_mlp": 1.30175781, "step": 296, "time_per_iteration": 4.178096771240234 }, { "auxiliary_loss_clip": 0.10744089, "auxiliary_loss_mlp": 0.01909433, "balance_loss_clip": 0.08256245, "balance_loss_mlp": 0.01776681, "epoch": 0.017856606042386893, "flos": 22385196218880.0, "grad_norm": 73.07283025366343, "language_loss": 1.12729955, "learning_rate": 3.665921869855132e-06, "loss": 1.25383472, "num_input_tokens_seen": 6199425, "router_z_loss_clip": 24.875, "router_z_loss_mlp": 1.32714844, "step": 297, "time_per_iteration": 2.7258141040802 }, { "auxiliary_loss_clip": 0.1067224, "auxiliary_loss_mlp": 0.01912254, "balance_loss_clip": 0.08232345, "balance_loss_mlp": 0.01771397, "epoch": 0.017916729295054862, "flos": 20236279207680.0, "grad_norm": 33.86653613403749, "language_loss": 1.20875561, "learning_rate": 3.6680860809130346e-06, "loss": 1.33460069, "num_input_tokens_seen": 6219170, "router_z_loss_clip": 24.4375, "router_z_loss_mlp": 1.40917969, "step": 298, "time_per_iteration": 2.7635698318481445 }, { "auxiliary_loss_clip": 0.10655981, "auxiliary_loss_mlp": 0.01870317, "balance_loss_clip": 0.08231036, "balance_loss_mlp": 0.01728791, "epoch": 0.01797685254772283, "flos": 19397064240000.0, "grad_norm": 124.78945743392137, "language_loss": 1.10024941, "learning_rate": 3.6702430416690516e-06, "loss": 1.22551239, "num_input_tokens_seen": 6237930, "router_z_loss_clip": 24.234375, "router_z_loss_mlp": 1.41601562, "step": 299, "time_per_iteration": 2.8744893074035645 }, { "auxiliary_loss_clip": 0.10637853, "auxiliary_loss_mlp": 0.01913224, "balance_loss_clip": 0.08221854, "balance_loss_mlp": 0.01776181, "epoch": 0.018036975800390802, "flos": 24432941024640.0, "grad_norm": 36.72459700928439, "language_loss": 0.93439728, "learning_rate": 3.672392800539357e-06, "loss": 1.05990803, "num_input_tokens_seen": 6257170, "router_z_loss_clip": 24.125, "router_z_loss_mlp": 1.37011719, "step": 300, "time_per_iteration": 2.817115068435669 }, { "auxiliary_loss_clip": 0.10591716, "auxiliary_loss_mlp": 0.01948295, "balance_loss_clip": 0.0822611, "balance_loss_mlp": 0.01811156, "epoch": 0.01809709905305877, "flos": 15784430181120.0, "grad_norm": 38.50770895791376, "language_loss": 1.20154452, "learning_rate": 3.6745354054567686e-06, "loss": 1.32694459, "num_input_tokens_seen": 6274780, "router_z_loss_clip": 23.65625, "router_z_loss_mlp": 1.37207031, "step": 301, "time_per_iteration": 2.7316067218780518 }, { "auxiliary_loss_clip": 0.09557892, "auxiliary_loss_mlp": 0.01923592, "balance_loss_clip": 0.08003455, "balance_loss_mlp": 0.01772626, "epoch": 0.01815722230572674, "flos": 67371125356800.0, "grad_norm": 1.085303110669548, "language_loss": 0.62705791, "learning_rate": 3.676670903877158e-06, "loss": 0.74187279, "num_input_tokens_seen": 6340435, "router_z_loss_clip": 15.53125, "router_z_loss_mlp": 1.5078125, "step": 302, "time_per_iteration": 3.49729323387146 }, { "auxiliary_loss_clip": 0.105156, "auxiliary_loss_mlp": 0.02021028, "balance_loss_clip": 0.08205092, "balance_loss_mlp": 0.01885511, "epoch": 0.01821734555839471, "flos": 15490823074560.0, "grad_norm": 24.011243365254675, "language_loss": 1.18474972, "learning_rate": 3.6787993427857567e-06, "loss": 1.31011605, "num_input_tokens_seen": 6358160, "router_z_loss_clip": 23.125, "router_z_loss_mlp": 1.35546875, "step": 303, "time_per_iteration": 2.803725242614746 }, { "auxiliary_loss_clip": 0.10420055, "auxiliary_loss_mlp": 0.02063214, "balance_loss_clip": 0.08160749, "balance_loss_mlp": 0.01918828, "epoch": 0.018277468811062677, "flos": 24104268184320.0, "grad_norm": 30.69374285506819, "language_loss": 1.07154536, "learning_rate": 3.680920768703364e-06, "loss": 1.19637811, "num_input_tokens_seen": 6378485, "router_z_loss_clip": 22.578125, "router_z_loss_mlp": 1.44335938, "step": 304, "time_per_iteration": 2.7633535861968994 }, { "auxiliary_loss_clip": 0.10312397, "auxiliary_loss_mlp": 0.02036659, "balance_loss_clip": 0.08143976, "balance_loss_mlp": 0.01887981, "epoch": 0.01833759206373065, "flos": 20965601145600.0, "grad_norm": 46.00175459934731, "language_loss": 1.03654563, "learning_rate": 3.6830352276924415e-06, "loss": 1.16003621, "num_input_tokens_seen": 6397845, "router_z_loss_clip": 21.640625, "router_z_loss_mlp": 1.48632812, "step": 305, "time_per_iteration": 2.7400009632110596 }, { "auxiliary_loss_clip": 0.10383353, "auxiliary_loss_mlp": 0.02143375, "balance_loss_clip": 0.08174411, "balance_loss_mlp": 0.01991646, "epoch": 0.018397715316398618, "flos": 19396812677760.0, "grad_norm": 46.40945063700133, "language_loss": 1.13804626, "learning_rate": 3.685142765363119e-06, "loss": 1.26331353, "num_input_tokens_seen": 6416475, "router_z_loss_clip": 22.109375, "router_z_loss_mlp": 1.51660156, "step": 306, "time_per_iteration": 2.7159414291381836 }, { "auxiliary_loss_clip": 0.10306612, "auxiliary_loss_mlp": 0.02142666, "balance_loss_clip": 0.08147487, "balance_loss_mlp": 0.01990078, "epoch": 0.018457838569066586, "flos": 29140228823040.0, "grad_norm": 48.83850504461263, "language_loss": 1.15796888, "learning_rate": 3.687243426879095e-06, "loss": 1.28246176, "num_input_tokens_seen": 6437520, "router_z_loss_clip": 21.59375, "router_z_loss_mlp": 1.52636719, "step": 307, "time_per_iteration": 2.8441402912139893 }, { "auxiliary_loss_clip": 0.10293026, "auxiliary_loss_mlp": 0.02163863, "balance_loss_clip": 0.08144858, "balance_loss_mlp": 0.02008223, "epoch": 0.018517961821734555, "flos": 19214733755520.0, "grad_norm": 695.1547334294014, "language_loss": 1.0086031, "learning_rate": 3.6893372569634466e-06, "loss": 1.1331718, "num_input_tokens_seen": 6455680, "router_z_loss_clip": 21.515625, "router_z_loss_mlp": 1.55566406, "step": 308, "time_per_iteration": 2.6787145137786865 }, { "auxiliary_loss_clip": 0.10256235, "auxiliary_loss_mlp": 0.02085664, "balance_loss_clip": 0.08143084, "balance_loss_mlp": 0.01934602, "epoch": 0.018578085074402523, "flos": 19868809127040.0, "grad_norm": 49.514339764604365, "language_loss": 1.16175699, "learning_rate": 3.6914242999043395e-06, "loss": 1.28517592, "num_input_tokens_seen": 6474880, "router_z_loss_clip": 21.15625, "router_z_loss_mlp": 1.51074219, "step": 309, "time_per_iteration": 2.758892774581909 }, { "auxiliary_loss_clip": 0.10229975, "auxiliary_loss_mlp": 0.02084341, "balance_loss_clip": 0.08127443, "balance_loss_mlp": 0.01932802, "epoch": 0.018638208327070496, "flos": 29614740894720.0, "grad_norm": 36.27262085209735, "language_loss": 1.04507506, "learning_rate": 3.69350459956065e-06, "loss": 1.16821814, "num_input_tokens_seen": 6495945, "router_z_loss_clip": 21.0, "router_z_loss_mlp": 1.515625, "step": 310, "time_per_iteration": 2.779299259185791 }, { "auxiliary_loss_clip": 0.10194056, "auxiliary_loss_mlp": 0.02101955, "balance_loss_clip": 0.08130715, "balance_loss_mlp": 0.01937351, "epoch": 0.018698331579738464, "flos": 45741694567680.0, "grad_norm": 62.73195980754013, "language_loss": 1.02574718, "learning_rate": 3.695578199367497e-06, "loss": 1.14870739, "num_input_tokens_seen": 6519930, "router_z_loss_clip": 20.59375, "router_z_loss_mlp": 1.64746094, "step": 311, "time_per_iteration": 2.917358636856079 }, { "auxiliary_loss_clip": 0.10166678, "auxiliary_loss_mlp": 0.02083976, "balance_loss_clip": 0.08133545, "balance_loss_mlp": 0.01909263, "epoch": 0.018758454832406433, "flos": 20489621627520.0, "grad_norm": 33.52497735262052, "language_loss": 1.18419766, "learning_rate": 3.6976451423416825e-06, "loss": 1.30670428, "num_input_tokens_seen": 6535070, "router_z_loss_clip": 20.34375, "router_z_loss_mlp": 1.74804688, "step": 312, "time_per_iteration": 2.6969099044799805 }, { "auxiliary_loss_clip": 0.10187265, "auxiliary_loss_mlp": 0.02052368, "balance_loss_clip": 0.08135178, "balance_loss_mlp": 0.01863731, "epoch": 0.0188185780850744, "flos": 15783088515840.0, "grad_norm": 41.064636148765885, "language_loss": 1.19993901, "learning_rate": 3.699705471087043e-06, "loss": 1.32233524, "num_input_tokens_seen": 6554135, "router_z_loss_clip": 20.484375, "router_z_loss_mlp": 1.88574219, "step": 313, "time_per_iteration": 2.691932201385498 }, { "auxiliary_loss_clip": 0.10093386, "auxiliary_loss_mlp": 0.02051153, "balance_loss_clip": 0.08101605, "balance_loss_mlp": 0.01850881, "epoch": 0.018878701337742373, "flos": 22462329502080.0, "grad_norm": 35.767837518526306, "language_loss": 1.1651026, "learning_rate": 3.7017592277997256e-06, "loss": 1.28654814, "num_input_tokens_seen": 6572275, "router_z_loss_clip": 19.90625, "router_z_loss_mlp": 2.00390625, "step": 314, "time_per_iteration": 2.7403862476348877 }, { "auxiliary_loss_clip": 0.10136703, "auxiliary_loss_mlp": 0.02038959, "balance_loss_clip": 0.08120742, "balance_loss_mlp": 0.01857189, "epoch": 0.018938824590410342, "flos": 31001576221440.0, "grad_norm": 57.78711297283827, "language_loss": 1.22431242, "learning_rate": 3.7038064542733654e-06, "loss": 1.3460691, "num_input_tokens_seen": 6594520, "router_z_loss_clip": 20.15625, "router_z_loss_mlp": 1.81835938, "step": 315, "time_per_iteration": 2.804436445236206 }, { "auxiliary_loss_clip": 0.10055003, "auxiliary_loss_mlp": 0.02070671, "balance_loss_clip": 0.08090051, "balance_loss_mlp": 0.01876693, "epoch": 0.01899894784307831, "flos": 23265724049280.0, "grad_norm": 52.048537427827284, "language_loss": 1.09657371, "learning_rate": 3.7058471919041945e-06, "loss": 1.21783042, "num_input_tokens_seen": 6614245, "router_z_loss_clip": 19.65625, "router_z_loss_mlp": 1.93945312, "step": 316, "time_per_iteration": 2.7407314777374268 }, { "auxiliary_loss_clip": 0.10055124, "auxiliary_loss_mlp": 0.02135306, "balance_loss_clip": 0.08087708, "balance_loss_mlp": 0.01931315, "epoch": 0.01905907109574628, "flos": 17463782511360.0, "grad_norm": 72.65367122659349, "language_loss": 1.09652281, "learning_rate": 3.7078814816960605e-06, "loss": 1.21842694, "num_input_tokens_seen": 6632015, "router_z_loss_clip": 19.6875, "router_z_loss_mlp": 2.04199219, "step": 317, "time_per_iteration": 2.7164266109466553 }, { "auxiliary_loss_clip": 0.10020253, "auxiliary_loss_mlp": 0.02108615, "balance_loss_clip": 0.08088815, "balance_loss_mlp": 0.01904148, "epoch": 0.019119194348414248, "flos": 14974578869760.0, "grad_norm": 41.64527228998052, "language_loss": 1.22575903, "learning_rate": 3.709909364265374e-06, "loss": 1.34704769, "num_input_tokens_seen": 6649015, "router_z_loss_clip": 19.34375, "router_z_loss_mlp": 2.04589844, "step": 318, "time_per_iteration": 2.761678695678711 }, { "auxiliary_loss_clip": 0.09947887, "auxiliary_loss_mlp": 0.02099726, "balance_loss_clip": 0.08056363, "balance_loss_mlp": 0.01889918, "epoch": 0.01917931760108222, "flos": 25489719918720.0, "grad_norm": 44.868219083999236, "language_loss": 1.22346115, "learning_rate": 3.7119308798459706e-06, "loss": 1.3439374, "num_input_tokens_seen": 6669225, "router_z_loss_clip": 18.90625, "router_z_loss_mlp": 2.09570312, "step": 319, "time_per_iteration": 2.746798276901245 }, { "auxiliary_loss_clip": 0.08966947, "auxiliary_loss_mlp": 0.02046217, "balance_loss_clip": 0.07772721, "balance_loss_mlp": 0.01761259, "epoch": 0.01923944085375019, "flos": 71576438872320.0, "grad_norm": 1.0648204416629283, "language_loss": 0.59694874, "learning_rate": 3.7139460682939026e-06, "loss": 0.70708036, "num_input_tokens_seen": 6725775, "router_z_loss_clip": 11.9609375, "router_z_loss_mlp": 2.85351562, "step": 320, "time_per_iteration": 3.274454355239868 }, { "auxiliary_loss_clip": 0.09942803, "auxiliary_loss_mlp": 0.02162024, "balance_loss_clip": 0.08059976, "balance_loss_mlp": 0.01960608, "epoch": 0.019299564106418157, "flos": 19688574994560.0, "grad_norm": 36.85324073458194, "language_loss": 1.21235991, "learning_rate": 3.715954969092154e-06, "loss": 1.33340812, "num_input_tokens_seen": 6744170, "router_z_loss_clip": 18.84375, "router_z_loss_mlp": 2.01367188, "step": 321, "time_per_iteration": 2.7055981159210205 }, { "auxiliary_loss_clip": 0.09851484, "auxiliary_loss_mlp": 0.02195589, "balance_loss_clip": 0.0802213, "balance_loss_mlp": 0.01971952, "epoch": 0.019359687359086126, "flos": 24393682586880.0, "grad_norm": 37.005863915261685, "language_loss": 1.0884496, "learning_rate": 3.7179576213552805e-06, "loss": 1.20892036, "num_input_tokens_seen": 6764565, "router_z_loss_clip": 18.3125, "router_z_loss_mlp": 2.23535156, "step": 322, "time_per_iteration": 2.8042454719543457 }, { "auxiliary_loss_clip": 0.09869438, "auxiliary_loss_mlp": 0.02208443, "balance_loss_clip": 0.08035862, "balance_loss_mlp": 0.02007886, "epoch": 0.019419810611754094, "flos": 23958177390720.0, "grad_norm": 23.451534466221666, "language_loss": 1.00615501, "learning_rate": 3.719954063833981e-06, "loss": 1.12693381, "num_input_tokens_seen": 6785310, "router_z_loss_clip": 18.328125, "router_z_loss_mlp": 2.00292969, "step": 323, "time_per_iteration": 2.771831512451172 }, { "auxiliary_loss_clip": 0.09845757, "auxiliary_loss_mlp": 0.0218247, "balance_loss_clip": 0.08017131, "balance_loss_mlp": 0.019863, "epoch": 0.019479933864422067, "flos": 22166164846080.0, "grad_norm": 27.71303205775148, "language_loss": 1.15132666, "learning_rate": 3.721944334919596e-06, "loss": 1.27160883, "num_input_tokens_seen": 6803290, "router_z_loss_clip": 18.28125, "router_z_loss_mlp": 1.96191406, "step": 324, "time_per_iteration": 2.78174090385437 }, { "auxiliary_loss_clip": 0.09812874, "auxiliary_loss_mlp": 0.0225142, "balance_loss_clip": 0.08001637, "balance_loss_mlp": 0.02054487, "epoch": 0.019540057117090035, "flos": 22243381983360.0, "grad_norm": 28.675124424704663, "language_loss": 0.95210946, "learning_rate": 3.7239284726485375e-06, "loss": 1.07275236, "num_input_tokens_seen": 6822570, "router_z_loss_clip": 18.125, "router_z_loss_mlp": 1.96972656, "step": 325, "time_per_iteration": 2.8028931617736816 }, { "auxiliary_loss_clip": 0.09796478, "auxiliary_loss_mlp": 0.02210868, "balance_loss_clip": 0.08005644, "balance_loss_mlp": 0.02028621, "epoch": 0.019600180369758004, "flos": 23083603200000.0, "grad_norm": 25.42324884128057, "language_loss": 1.00108218, "learning_rate": 3.72590651470665e-06, "loss": 1.12115562, "num_input_tokens_seen": 6841910, "router_z_loss_clip": 17.890625, "router_z_loss_mlp": 1.82128906, "step": 326, "time_per_iteration": 2.7459192276000977 }, { "auxiliary_loss_clip": 0.0976932, "auxiliary_loss_mlp": 0.02270741, "balance_loss_clip": 0.07974759, "balance_loss_mlp": 0.0209145, "epoch": 0.019660303622425972, "flos": 25417911369600.0, "grad_norm": 40.24692701397352, "language_loss": 1.01830983, "learning_rate": 3.727878498433505e-06, "loss": 1.13871038, "num_input_tokens_seen": 6862480, "router_z_loss_clip": 17.9375, "router_z_loss_mlp": 1.79101562, "step": 327, "time_per_iteration": 2.751791477203369 }, { "auxiliary_loss_clip": 0.09730887, "auxiliary_loss_mlp": 0.02288714, "balance_loss_clip": 0.07983745, "balance_loss_mlp": 0.02102843, "epoch": 0.01972042687509394, "flos": 23663941378560.0, "grad_norm": 42.499945245351924, "language_loss": 1.02594519, "learning_rate": 3.7298444608266328e-06, "loss": 1.14614129, "num_input_tokens_seen": 6882015, "router_z_loss_clip": 17.484375, "router_z_loss_mlp": 1.85742188, "step": 328, "time_per_iteration": 2.7651679515838623 }, { "auxiliary_loss_clip": 0.09768753, "auxiliary_loss_mlp": 0.02255759, "balance_loss_clip": 0.07996632, "balance_loss_mlp": 0.02074084, "epoch": 0.019780550127761913, "flos": 18229386067200.0, "grad_norm": 26.138195362990437, "language_loss": 1.29962397, "learning_rate": 3.731804438545683e-06, "loss": 1.41986918, "num_input_tokens_seen": 6899785, "router_z_loss_clip": 17.75, "router_z_loss_mlp": 1.81738281, "step": 329, "time_per_iteration": 2.68766188621521 }, { "auxiliary_loss_clip": 0.09740511, "auxiliary_loss_mlp": 0.02257826, "balance_loss_clip": 0.07975982, "balance_loss_mlp": 0.02070333, "epoch": 0.01984067338042988, "flos": 22425293197440.0, "grad_norm": 36.26599735611846, "language_loss": 1.04502726, "learning_rate": 3.7337584679165324e-06, "loss": 1.16501057, "num_input_tokens_seen": 6918575, "router_z_loss_clip": 17.671875, "router_z_loss_mlp": 1.875, "step": 330, "time_per_iteration": 2.739821434020996 }, { "auxiliary_loss_clip": 0.09662101, "auxiliary_loss_mlp": 0.02287996, "balance_loss_clip": 0.07944447, "balance_loss_mlp": 0.02071417, "epoch": 0.01990079663309785, "flos": 17060785499520.0, "grad_norm": 45.37673172787676, "language_loss": 1.26239204, "learning_rate": 3.7357065849353186e-06, "loss": 1.38189304, "num_input_tokens_seen": 6936965, "router_z_loss_clip": 17.140625, "router_z_loss_mlp": 2.16601562, "step": 331, "time_per_iteration": 2.6723287105560303 }, { "auxiliary_loss_clip": 0.09644835, "auxiliary_loss_mlp": 0.02401711, "balance_loss_clip": 0.07945098, "balance_loss_mlp": 0.02143265, "epoch": 0.01996091988576582, "flos": 15967389571200.0, "grad_norm": 30.951538915521077, "language_loss": 1.18671632, "learning_rate": 3.737648825272422e-06, "loss": 1.30718172, "num_input_tokens_seen": 6953475, "router_z_loss_clip": 17.03125, "router_z_loss_mlp": 2.58398438, "step": 332, "time_per_iteration": 2.701328992843628 }, { "auxiliary_loss_clip": 0.0957718, "auxiliary_loss_mlp": 0.02419832, "balance_loss_clip": 0.07887793, "balance_loss_mlp": 0.02106264, "epoch": 0.02002104313843379, "flos": 23593181005440.0, "grad_norm": 29.704031487722055, "language_loss": 1.10019553, "learning_rate": 3.739585224276384e-06, "loss": 1.22016561, "num_input_tokens_seen": 6971630, "router_z_loss_clip": 16.90625, "router_z_loss_mlp": 3.13671875, "step": 333, "time_per_iteration": 2.7498457431793213 }, { "auxiliary_loss_clip": 0.09516162, "auxiliary_loss_mlp": 0.02439835, "balance_loss_clip": 0.07871465, "balance_loss_mlp": 0.02097657, "epoch": 0.02008116639110176, "flos": 34103458517760.0, "grad_norm": 24.66534413589763, "language_loss": 1.07575607, "learning_rate": 3.7415158169777673e-06, "loss": 1.19531596, "num_input_tokens_seen": 6992775, "router_z_loss_clip": 16.453125, "router_z_loss_mlp": 3.421875, "step": 334, "time_per_iteration": 5.679600954055786 }, { "auxiliary_loss_clip": 0.09474801, "auxiliary_loss_mlp": 0.02526928, "balance_loss_clip": 0.07841562, "balance_loss_mlp": 0.02129627, "epoch": 0.020141289643769728, "flos": 19690000513920.0, "grad_norm": 37.97337722379276, "language_loss": 1.03167653, "learning_rate": 3.7434406380929575e-06, "loss": 1.15169382, "num_input_tokens_seen": 7011425, "router_z_loss_clip": 16.3046875, "router_z_loss_mlp": 3.97265625, "step": 335, "time_per_iteration": 4.322086334228516 }, { "auxiliary_loss_clip": 0.09436272, "auxiliary_loss_mlp": 0.02538286, "balance_loss_clip": 0.07827939, "balance_loss_mlp": 0.02117334, "epoch": 0.020201412896437697, "flos": 20746821335040.0, "grad_norm": 270.45843087223557, "language_loss": 1.15931618, "learning_rate": 3.745359722027911e-06, "loss": 1.27906168, "num_input_tokens_seen": 7029450, "router_z_loss_clip": 16.0859375, "router_z_loss_mlp": 4.2109375, "step": 336, "time_per_iteration": 4.3615100383758545 }, { "auxiliary_loss_clip": 0.09486409, "auxiliary_loss_mlp": 0.02532829, "balance_loss_clip": 0.07864712, "balance_loss_mlp": 0.02138199, "epoch": 0.020261536149105665, "flos": 20272728533760.0, "grad_norm": 28.730889481377506, "language_loss": 1.08086991, "learning_rate": 3.7472731028818428e-06, "loss": 1.20106232, "num_input_tokens_seen": 7047555, "router_z_loss_clip": 16.2265625, "router_z_loss_mlp": 3.94921875, "step": 337, "time_per_iteration": 2.7400906085968018 }, { "auxiliary_loss_clip": 0.09441007, "auxiliary_loss_mlp": 0.02523169, "balance_loss_clip": 0.07846677, "balance_loss_mlp": 0.02109275, "epoch": 0.020321659401773638, "flos": 25855890261120.0, "grad_norm": 115.1302266567086, "language_loss": 1.0481236, "learning_rate": 3.7491808144508626e-06, "loss": 1.16776538, "num_input_tokens_seen": 7068185, "router_z_loss_clip": 15.96875, "router_z_loss_mlp": 4.1328125, "step": 338, "time_per_iteration": 2.7506327629089355 }, { "auxiliary_loss_clip": 0.09387235, "auxiliary_loss_mlp": 0.02553876, "balance_loss_clip": 0.07816803, "balance_loss_mlp": 0.02104504, "epoch": 0.020381782654441606, "flos": 17501028451200.0, "grad_norm": 21.990627858504098, "language_loss": 1.06974554, "learning_rate": 3.7510828902315576e-06, "loss": 1.18915665, "num_input_tokens_seen": 7085955, "router_z_loss_clip": 15.71875, "router_z_loss_mlp": 4.49023438, "step": 339, "time_per_iteration": 2.729308843612671 }, { "auxiliary_loss_clip": 0.0937392, "auxiliary_loss_mlp": 0.02613015, "balance_loss_clip": 0.07790822, "balance_loss_mlp": 0.02104897, "epoch": 0.020441905907109575, "flos": 24250904029440.0, "grad_norm": 23.9631531907409, "language_loss": 1.08953905, "learning_rate": 3.75297936342452e-06, "loss": 1.20940852, "num_input_tokens_seen": 7106345, "router_z_loss_clip": 15.8359375, "router_z_loss_mlp": 5.078125, "step": 340, "time_per_iteration": 2.7526721954345703 }, { "auxiliary_loss_clip": 0.09345865, "auxiliary_loss_mlp": 0.02598748, "balance_loss_clip": 0.07777454, "balance_loss_mlp": 0.02115426, "epoch": 0.020502029159777543, "flos": 22239273133440.0, "grad_norm": 34.461431345982476, "language_loss": 1.07032013, "learning_rate": 3.7548702669378253e-06, "loss": 1.18976617, "num_input_tokens_seen": 7125070, "router_z_loss_clip": 15.6796875, "router_z_loss_mlp": 4.83203125, "step": 341, "time_per_iteration": 2.7135770320892334 }, { "auxiliary_loss_clip": 0.09341425, "auxiliary_loss_mlp": 0.02588684, "balance_loss_clip": 0.07779245, "balance_loss_mlp": 0.02101165, "epoch": 0.020562152412445512, "flos": 23994668643840.0, "grad_norm": 28.717773707902648, "language_loss": 1.089607, "learning_rate": 3.756755633390458e-06, "loss": 1.20890808, "num_input_tokens_seen": 7144675, "router_z_loss_clip": 15.6328125, "router_z_loss_mlp": 4.875, "step": 342, "time_per_iteration": 2.740600824356079 }, { "auxiliary_loss_clip": 0.09346582, "auxiliary_loss_mlp": 0.0256977, "balance_loss_clip": 0.0777574, "balance_loss_mlp": 0.02097892, "epoch": 0.020622275665113484, "flos": 26981878227840.0, "grad_norm": 18.43933537060389, "language_loss": 1.07006097, "learning_rate": 3.7586354951156886e-06, "loss": 1.18922448, "num_input_tokens_seen": 7165505, "router_z_loss_clip": 15.7109375, "router_z_loss_mlp": 4.72265625, "step": 343, "time_per_iteration": 2.896061420440674 }, { "auxiliary_loss_clip": 0.09312961, "auxiliary_loss_mlp": 0.02587527, "balance_loss_clip": 0.07773486, "balance_loss_mlp": 0.02098864, "epoch": 0.020682398917781453, "flos": 22607162484480.0, "grad_norm": 37.340858059648944, "language_loss": 0.97079968, "learning_rate": 3.7605098841644e-06, "loss": 1.08980453, "num_input_tokens_seen": 7184605, "router_z_loss_clip": 15.3828125, "router_z_loss_mlp": 4.8828125, "step": 344, "time_per_iteration": 2.7618842124938965 }, { "auxiliary_loss_clip": 0.09300768, "auxiliary_loss_mlp": 0.02552643, "balance_loss_clip": 0.0775435, "balance_loss_mlp": 0.02081337, "epoch": 0.02074252217044942, "flos": 15019120114560.0, "grad_norm": 67.96700881373285, "language_loss": 0.98176152, "learning_rate": 3.7623788323083666e-06, "loss": 1.10029554, "num_input_tokens_seen": 7203065, "router_z_loss_clip": 15.4765625, "router_z_loss_mlp": 4.70898438, "step": 345, "time_per_iteration": 2.720573663711548 }, { "auxiliary_loss_clip": 0.09329296, "auxiliary_loss_mlp": 0.02513499, "balance_loss_clip": 0.07764676, "balance_loss_mlp": 0.0207767, "epoch": 0.02080264542311739, "flos": 25345012717440.0, "grad_norm": 40.30681169643748, "language_loss": 1.05799294, "learning_rate": 3.7642423710434837e-06, "loss": 1.17642093, "num_input_tokens_seen": 7222995, "router_z_loss_clip": 15.625, "router_z_loss_mlp": 4.35351562, "step": 346, "time_per_iteration": 2.7794384956359863 }, { "auxiliary_loss_clip": 0.09296279, "auxiliary_loss_mlp": 0.02486025, "balance_loss_clip": 0.07752907, "balance_loss_mlp": 0.02059351, "epoch": 0.02086276867578536, "flos": 24395611230720.0, "grad_norm": 27.889965270305403, "language_loss": 0.9859103, "learning_rate": 3.7661005315929563e-06, "loss": 1.1037333, "num_input_tokens_seen": 7244625, "router_z_loss_clip": 15.40625, "router_z_loss_mlp": 4.265625, "step": 347, "time_per_iteration": 2.7792587280273438 }, { "auxiliary_loss_clip": 0.09273382, "auxiliary_loss_mlp": 0.02482995, "balance_loss_clip": 0.07742353, "balance_loss_mlp": 0.02066811, "epoch": 0.02092289192845333, "flos": 24469096861440.0, "grad_norm": 37.32751685117863, "language_loss": 0.93295753, "learning_rate": 3.7679533449104354e-06, "loss": 1.05052114, "num_input_tokens_seen": 7263255, "router_z_loss_clip": 15.296875, "router_z_loss_mlp": 4.16015625, "step": 348, "time_per_iteration": 2.7531614303588867 }, { "auxiliary_loss_clip": 0.09335239, "auxiliary_loss_mlp": 0.02435036, "balance_loss_clip": 0.0777099, "balance_loss_mlp": 0.02051278, "epoch": 0.0209830151811213, "flos": 17455942154880.0, "grad_norm": 19.23397042729009, "language_loss": 1.01866364, "learning_rate": 3.7698008416831116e-06, "loss": 1.13636637, "num_input_tokens_seen": 7279275, "router_z_loss_clip": 15.640625, "router_z_loss_mlp": 3.83984375, "step": 349, "time_per_iteration": 2.7531864643096924 }, { "auxiliary_loss_clip": 0.09266762, "auxiliary_loss_mlp": 0.0252588, "balance_loss_clip": 0.07729967, "balance_loss_mlp": 0.02092149, "epoch": 0.021043138433789268, "flos": 24581295878400.0, "grad_norm": 16.253000677507725, "language_loss": 1.00458372, "learning_rate": 3.7716430523347664e-06, "loss": 1.12251019, "num_input_tokens_seen": 7300180, "router_z_loss_clip": 15.3671875, "router_z_loss_mlp": 4.33203125, "step": 350, "time_per_iteration": 2.8689963817596436 }, { "auxiliary_loss_clip": 0.09263526, "auxiliary_loss_mlp": 0.02518824, "balance_loss_clip": 0.07723217, "balance_loss_mlp": 0.02092341, "epoch": 0.021103261686457236, "flos": 24459579423360.0, "grad_norm": 21.13628660113763, "language_loss": 0.9690336, "learning_rate": 3.773480007028776e-06, "loss": 1.08685708, "num_input_tokens_seen": 7317430, "router_z_loss_clip": 15.40625, "router_z_loss_mlp": 4.265625, "step": 351, "time_per_iteration": 2.7088723182678223 }, { "auxiliary_loss_clip": 0.09272309, "auxiliary_loss_mlp": 0.02456919, "balance_loss_clip": 0.07742499, "balance_loss_mlp": 0.02066294, "epoch": 0.021163384939125205, "flos": 14688183214080.0, "grad_norm": 7.907669454914101, "language_loss": 1.02571297, "learning_rate": 3.775311735671078e-06, "loss": 1.14300525, "num_input_tokens_seen": 7334875, "router_z_loss_clip": 15.296875, "router_z_loss_mlp": 3.90625, "step": 352, "time_per_iteration": 2.7053749561309814 }, { "auxiliary_loss_clip": 0.0925562, "auxiliary_loss_mlp": 0.02490899, "balance_loss_clip": 0.07719561, "balance_loss_mlp": 0.02079675, "epoch": 0.021223508191793177, "flos": 24499173277440.0, "grad_norm": 27.250252541594197, "language_loss": 0.98564088, "learning_rate": 3.7771382679130878e-06, "loss": 1.10310602, "num_input_tokens_seen": 7355185, "router_z_loss_clip": 15.3671875, "router_z_loss_mlp": 4.11523438, "step": 353, "time_per_iteration": 2.742327928543091 }, { "auxiliary_loss_clip": 0.09172569, "auxiliary_loss_mlp": 0.02443618, "balance_loss_clip": 0.07694063, "balance_loss_mlp": 0.02058334, "epoch": 0.021283631444461146, "flos": 24132667518720.0, "grad_norm": 34.65031783744928, "language_loss": 0.9755829, "learning_rate": 3.7789596331545845e-06, "loss": 1.09174478, "num_input_tokens_seen": 7374425, "router_z_loss_clip": 14.78125, "router_z_loss_mlp": 3.85742188, "step": 354, "time_per_iteration": 2.75799560546875 }, { "auxiliary_loss_clip": 0.0923726, "auxiliary_loss_mlp": 0.02486869, "balance_loss_clip": 0.07710594, "balance_loss_mlp": 0.02088614, "epoch": 0.021343754697129114, "flos": 25199299267200.0, "grad_norm": 18.287925743197274, "language_loss": 0.99752176, "learning_rate": 3.780775860546545e-06, "loss": 1.11476302, "num_input_tokens_seen": 7394175, "router_z_loss_clip": 15.265625, "router_z_loss_mlp": 3.984375, "step": 355, "time_per_iteration": 2.7667505741119385 }, { "auxiliary_loss_clip": 0.09163535, "auxiliary_loss_mlp": 0.02506391, "balance_loss_clip": 0.07664894, "balance_loss_mlp": 0.02087728, "epoch": 0.021403877949797083, "flos": 17279816872320.0, "grad_norm": 28.94656234097247, "language_loss": 1.09084773, "learning_rate": 3.7825869789939474e-06, "loss": 1.20754695, "num_input_tokens_seen": 7412645, "router_z_loss_clip": 14.984375, "router_z_loss_mlp": 4.1875, "step": 356, "time_per_iteration": 2.7092108726501465 }, { "auxiliary_loss_clip": 0.09168819, "auxiliary_loss_mlp": 0.02590391, "balance_loss_clip": 0.07650582, "balance_loss_mlp": 0.02132437, "epoch": 0.021464001202465055, "flos": 30924946062720.0, "grad_norm": 23.46156854415092, "language_loss": 0.97759289, "learning_rate": 3.784393017158528e-06, "loss": 1.09518504, "num_input_tokens_seen": 7432275, "router_z_loss_clip": 15.1875, "router_z_loss_mlp": 4.58007812, "step": 357, "time_per_iteration": 2.8124947547912598 }, { "auxiliary_loss_clip": 0.09142401, "auxiliary_loss_mlp": 0.02575084, "balance_loss_clip": 0.07646303, "balance_loss_mlp": 0.02121517, "epoch": 0.021524124455133024, "flos": 18192182054400.0, "grad_norm": 25.76233023983871, "language_loss": 0.97297597, "learning_rate": 3.786194003461506e-06, "loss": 1.09015083, "num_input_tokens_seen": 7450245, "router_z_loss_clip": 14.9609375, "router_z_loss_mlp": 4.53710938, "step": 358, "time_per_iteration": 2.729569911956787 }, { "auxiliary_loss_clip": 0.09157528, "auxiliary_loss_mlp": 0.02502821, "balance_loss_clip": 0.07654461, "balance_loss_mlp": 0.0209503, "epoch": 0.021584247707800992, "flos": 13810464495360.0, "grad_norm": 17.250401070501063, "language_loss": 1.08638048, "learning_rate": 3.787989966086264e-06, "loss": 1.20298409, "num_input_tokens_seen": 7466845, "router_z_loss_clip": 15.0390625, "router_z_loss_mlp": 4.07617188, "step": 359, "time_per_iteration": 2.687722682952881 }, { "auxiliary_loss_clip": 0.09118073, "auxiliary_loss_mlp": 0.02518971, "balance_loss_clip": 0.0763142, "balance_loss_mlp": 0.0209821, "epoch": 0.02164437096046896, "flos": 23301418688640.0, "grad_norm": 21.560423537157583, "language_loss": 1.02815473, "learning_rate": 3.789780932980997e-06, "loss": 1.14452529, "num_input_tokens_seen": 7485450, "router_z_loss_clip": 14.8671875, "router_z_loss_mlp": 4.20703125, "step": 360, "time_per_iteration": 2.7178635597229004 }, { "auxiliary_loss_clip": 0.08545981, "auxiliary_loss_mlp": 0.02131845, "balance_loss_clip": 0.07463808, "balance_loss_mlp": 0.01895334, "epoch": 0.02170449421313693, "flos": 68919621137280.0, "grad_norm": 1.054290370718647, "language_loss": 0.65320539, "learning_rate": 3.79156693186132e-06, "loss": 0.75998366, "num_input_tokens_seen": 7553780, "router_z_loss_clip": 10.828125, "router_z_loss_mlp": 2.359375, "step": 361, "time_per_iteration": 3.4505438804626465 }, { "auxiliary_loss_clip": 0.09096563, "auxiliary_loss_mlp": 0.02411331, "balance_loss_clip": 0.07631494, "balance_loss_mlp": 0.02065719, "epoch": 0.0217646174658049, "flos": 25235580885120.0, "grad_norm": 19.712728933313024, "language_loss": 1.03243005, "learning_rate": 3.7933479902128433e-06, "loss": 1.14750898, "num_input_tokens_seen": 7574155, "router_z_loss_clip": 14.6484375, "router_z_loss_mlp": 3.453125, "step": 362, "time_per_iteration": 2.789533853530884 }, { "auxiliary_loss_clip": 0.09099672, "auxiliary_loss_mlp": 0.02301515, "balance_loss_clip": 0.07632374, "balance_loss_mlp": 0.01992906, "epoch": 0.02182474071847287, "flos": 22899721415040.0, "grad_norm": 23.857737858895046, "language_loss": 1.08070135, "learning_rate": 3.7951241352937077e-06, "loss": 1.19471323, "num_input_tokens_seen": 7592320, "router_z_loss_clip": 14.671875, "router_z_loss_mlp": 3.08789062, "step": 363, "time_per_iteration": 2.7415597438812256 }, { "auxiliary_loss_clip": 0.09083772, "auxiliary_loss_mlp": 0.02270687, "balance_loss_clip": 0.07630498, "balance_loss_mlp": 0.01985157, "epoch": 0.02188486397114084, "flos": 23665660387200.0, "grad_norm": 20.287345119604943, "language_loss": 1.07456708, "learning_rate": 3.7968953941370915e-06, "loss": 1.18811178, "num_input_tokens_seen": 7611185, "router_z_loss_clip": 14.5546875, "router_z_loss_mlp": 2.85742188, "step": 364, "time_per_iteration": 2.722900390625 }, { "auxiliary_loss_clip": 0.09067199, "auxiliary_loss_mlp": 0.0225283, "balance_loss_clip": 0.076175, "balance_loss_mlp": 0.01965964, "epoch": 0.021944987223808807, "flos": 21550090101120.0, "grad_norm": 18.79644877771038, "language_loss": 0.98372221, "learning_rate": 3.798661793553676e-06, "loss": 1.0969224, "num_input_tokens_seen": 7631970, "router_z_loss_clip": 14.5078125, "router_z_loss_mlp": 2.8671875, "step": 365, "time_per_iteration": 2.7064945697784424 }, { "auxiliary_loss_clip": 0.09032331, "auxiliary_loss_mlp": 0.02158501, "balance_loss_clip": 0.0761082, "balance_loss_mlp": 0.01908448, "epoch": 0.022005110476476776, "flos": 16076444060160.0, "grad_norm": 17.600301904671007, "language_loss": 0.97869253, "learning_rate": 3.8004233601340808e-06, "loss": 1.09060085, "num_input_tokens_seen": 7649745, "router_z_loss_clip": 14.21875, "router_z_loss_mlp": 2.50195312, "step": 366, "time_per_iteration": 2.764807939529419 }, { "auxiliary_loss_clip": 0.09026615, "auxiliary_loss_mlp": 0.02191772, "balance_loss_clip": 0.07615771, "balance_loss_mlp": 0.01937904, "epoch": 0.022065233729144748, "flos": 21440071290240.0, "grad_norm": 21.569866588057813, "language_loss": 1.09448373, "learning_rate": 3.8021801202512694e-06, "loss": 1.20666766, "num_input_tokens_seen": 7668830, "router_z_loss_clip": 14.109375, "router_z_loss_mlp": 2.53710938, "step": 367, "time_per_iteration": 2.846562147140503 }, { "auxiliary_loss_clip": 0.09041451, "auxiliary_loss_mlp": 0.02132641, "balance_loss_clip": 0.07625143, "balance_loss_mlp": 0.01905094, "epoch": 0.022125356981812717, "flos": 21550173955200.0, "grad_norm": 12.974609891708685, "language_loss": 1.04753137, "learning_rate": 3.803932100062912e-06, "loss": 1.15927243, "num_input_tokens_seen": 7687240, "router_z_loss_clip": 14.1875, "router_z_loss_mlp": 2.27734375, "step": 368, "time_per_iteration": 2.7882933616638184 }, { "auxiliary_loss_clip": 0.08988454, "auxiliary_loss_mlp": 0.02149549, "balance_loss_clip": 0.07594614, "balance_loss_mlp": 0.01900258, "epoch": 0.022185480234480685, "flos": 20710413936000.0, "grad_norm": 41.47001770700195, "language_loss": 1.03951311, "learning_rate": 3.8056793255137264e-06, "loss": 1.15089309, "num_input_tokens_seen": 7704440, "router_z_loss_clip": 13.9453125, "router_z_loss_mlp": 2.49023438, "step": 369, "time_per_iteration": 2.7233164310455322 }, { "auxiliary_loss_clip": 0.09007177, "auxiliary_loss_mlp": 0.02155857, "balance_loss_clip": 0.07596335, "balance_loss_mlp": 0.0190237, "epoch": 0.022245603487148654, "flos": 25200431297280.0, "grad_norm": 14.23296892357352, "language_loss": 1.00631571, "learning_rate": 3.8074218223377844e-06, "loss": 1.11794603, "num_input_tokens_seen": 7727160, "router_z_loss_clip": 14.125, "router_z_loss_mlp": 2.53515625, "step": 370, "time_per_iteration": 2.768825054168701 }, { "auxiliary_loss_clip": 0.08925571, "auxiliary_loss_mlp": 0.02117698, "balance_loss_clip": 0.07556547, "balance_loss_mlp": 0.0187432, "epoch": 0.022305726739816623, "flos": 21402070663680.0, "grad_norm": 11.3763928032969, "language_loss": 0.96937811, "learning_rate": 3.8091596160607834e-06, "loss": 1.07981086, "num_input_tokens_seen": 7747730, "router_z_loss_clip": 13.6875, "router_z_loss_mlp": 2.43359375, "step": 371, "time_per_iteration": 2.756399393081665 }, { "auxiliary_loss_clip": 0.08893323, "auxiliary_loss_mlp": 0.02092827, "balance_loss_clip": 0.07547152, "balance_loss_mlp": 0.01851738, "epoch": 0.022365849992484595, "flos": 22498736901120.0, "grad_norm": 9.484951862945636, "language_loss": 1.04679894, "learning_rate": 3.8108927320022896e-06, "loss": 1.15666032, "num_input_tokens_seen": 7766765, "router_z_loss_clip": 13.4453125, "router_z_loss_mlp": 2.40820312, "step": 372, "time_per_iteration": 2.7844929695129395 }, { "auxiliary_loss_clip": 0.08874454, "auxiliary_loss_mlp": 0.02088881, "balance_loss_clip": 0.07535434, "balance_loss_mlp": 0.01848364, "epoch": 0.022425973245152563, "flos": 17862083694720.0, "grad_norm": 14.771615836572277, "language_loss": 1.01113474, "learning_rate": 3.8126211952779548e-06, "loss": 1.12076807, "num_input_tokens_seen": 7784010, "router_z_loss_clip": 13.390625, "router_z_loss_mlp": 2.40332031, "step": 373, "time_per_iteration": 4.1343770027160645 }, { "auxiliary_loss_clip": 0.08855486, "auxiliary_loss_mlp": 0.02071489, "balance_loss_clip": 0.07528085, "balance_loss_mlp": 0.01833834, "epoch": 0.022486096497820532, "flos": 15487804327680.0, "grad_norm": 14.425992343466291, "language_loss": 1.03705406, "learning_rate": 3.8143450308016952e-06, "loss": 1.14632392, "num_input_tokens_seen": 7801305, "router_z_loss_clip": 13.265625, "router_z_loss_mlp": 2.37695312, "step": 374, "time_per_iteration": 5.75036883354187 }, { "auxiliary_loss_clip": 0.0883513, "auxiliary_loss_mlp": 0.020618, "balance_loss_clip": 0.07515097, "balance_loss_mlp": 0.01831773, "epoch": 0.0225462197504885, "flos": 27791897247360.0, "grad_norm": 10.953995201171189, "language_loss": 0.98251975, "learning_rate": 3.8160642632878525e-06, "loss": 1.09148908, "num_input_tokens_seen": 7823965, "router_z_loss_clip": 13.203125, "router_z_loss_mlp": 2.30078125, "step": 375, "time_per_iteration": 4.29323673248291 }, { "auxiliary_loss_clip": 0.08847529, "auxiliary_loss_mlp": 0.0210359, "balance_loss_clip": 0.07524449, "balance_loss_mlp": 0.01881384, "epoch": 0.02260634300315647, "flos": 19981804757760.0, "grad_norm": 19.449409618275485, "language_loss": 1.04142499, "learning_rate": 3.817778917253314e-06, "loss": 1.15093625, "num_input_tokens_seen": 7842115, "router_z_loss_clip": 13.2265625, "router_z_loss_mlp": 2.22460938, "step": 376, "time_per_iteration": 2.773331880569458 }, { "auxiliary_loss_clip": 0.08807029, "auxiliary_loss_mlp": 0.02107831, "balance_loss_clip": 0.07509544, "balance_loss_mlp": 0.01894112, "epoch": 0.02266646625582444, "flos": 16032699429120.0, "grad_norm": 15.174704313946519, "language_loss": 0.98384166, "learning_rate": 3.8194890170196155e-06, "loss": 1.09299028, "num_input_tokens_seen": 7857830, "router_z_loss_clip": 12.9609375, "router_z_loss_mlp": 2.13867188, "step": 377, "time_per_iteration": 2.6545400619506836 }, { "auxiliary_loss_clip": 0.08781163, "auxiliary_loss_mlp": 0.02082802, "balance_loss_clip": 0.0749824, "balance_loss_mlp": 0.01891209, "epoch": 0.02272658950849241, "flos": 20409553451520.0, "grad_norm": 10.78367904673643, "language_loss": 1.13654113, "learning_rate": 3.8211945867150055e-06, "loss": 1.24518085, "num_input_tokens_seen": 7875840, "router_z_loss_clip": 12.8359375, "router_z_loss_mlp": 1.91796875, "step": 378, "time_per_iteration": 2.7296626567840576 }, { "auxiliary_loss_clip": 0.0822655, "auxiliary_loss_mlp": 0.01526715, "balance_loss_clip": 0.07317516, "balance_loss_mlp": 0.01393963, "epoch": 0.02278671276116038, "flos": 69867387469440.0, "grad_norm": 0.9778815657344732, "language_loss": 0.75473452, "learning_rate": 3.822895650276492e-06, "loss": 0.85226715, "num_input_tokens_seen": 7940190, "router_z_loss_clip": 9.109375, "router_z_loss_mlp": 1.328125, "step": 379, "time_per_iteration": 3.3721580505371094 }, { "auxiliary_loss_clip": 0.08795945, "auxiliary_loss_mlp": 0.02103198, "balance_loss_clip": 0.07516603, "balance_loss_mlp": 0.01920187, "epoch": 0.022846836013828347, "flos": 38517935823360.0, "grad_norm": 13.960269217680894, "language_loss": 0.99256343, "learning_rate": 3.824592231451859e-06, "loss": 1.10155487, "num_input_tokens_seen": 7960840, "router_z_loss_clip": 12.796875, "router_z_loss_mlp": 1.83007812, "step": 380, "time_per_iteration": 2.886889934539795 }, { "auxiliary_loss_clip": 0.08808355, "auxiliary_loss_mlp": 0.0207914, "balance_loss_clip": 0.0752324, "balance_loss_mlp": 0.01904141, "epoch": 0.02290695926649632, "flos": 20965768853760.0, "grad_norm": 21.640219370343612, "language_loss": 1.14180756, "learning_rate": 3.826284353801652e-06, "loss": 1.25068259, "num_input_tokens_seen": 7975500, "router_z_loss_clip": 12.84375, "router_z_loss_mlp": 1.75097656, "step": 381, "time_per_iteration": 2.683680295944214 }, { "auxiliary_loss_clip": 0.08820011, "auxiliary_loss_mlp": 0.02139106, "balance_loss_clip": 0.07529296, "balance_loss_mlp": 0.01964489, "epoch": 0.022967082519164288, "flos": 24028895836800.0, "grad_norm": 6.808444259381938, "language_loss": 1.03946352, "learning_rate": 3.827972040701142e-06, "loss": 1.14905477, "num_input_tokens_seen": 7993880, "router_z_loss_clip": 12.9140625, "router_z_loss_mlp": 1.74707031, "step": 382, "time_per_iteration": 2.7551612854003906 }, { "auxiliary_loss_clip": 0.08803385, "auxiliary_loss_mlp": 0.02103384, "balance_loss_clip": 0.0753744, "balance_loss_mlp": 0.01933058, "epoch": 0.023027205771832256, "flos": 21003643699200.0, "grad_norm": 8.229934834045526, "language_loss": 1.02564669, "learning_rate": 3.829655315342268e-06, "loss": 1.13471437, "num_input_tokens_seen": 8012730, "router_z_loss_clip": 12.65625, "router_z_loss_mlp": 1.703125, "step": 383, "time_per_iteration": 2.714245080947876 }, { "auxiliary_loss_clip": 0.08744098, "auxiliary_loss_mlp": 0.02138166, "balance_loss_clip": 0.07485208, "balance_loss_mlp": 0.01948671, "epoch": 0.023087329024500225, "flos": 21367172638080.0, "grad_norm": 8.371238333500088, "language_loss": 1.02199805, "learning_rate": 3.831334200735543e-06, "loss": 1.13082063, "num_input_tokens_seen": 8031275, "router_z_loss_clip": 12.59375, "router_z_loss_mlp": 1.89257812, "step": 384, "time_per_iteration": 2.7247872352600098 }, { "auxiliary_loss_clip": 0.08692361, "auxiliary_loss_mlp": 0.02121918, "balance_loss_clip": 0.07462803, "balance_loss_mlp": 0.01938622, "epoch": 0.023147452277168194, "flos": 21879014503680.0, "grad_norm": 10.357338491228886, "language_loss": 1.03473544, "learning_rate": 3.8330087197119426e-06, "loss": 1.14287829, "num_input_tokens_seen": 8051600, "router_z_loss_clip": 12.296875, "router_z_loss_mlp": 1.83300781, "step": 385, "time_per_iteration": 2.800464153289795 }, { "auxiliary_loss_clip": 0.08709672, "auxiliary_loss_mlp": 0.02135559, "balance_loss_clip": 0.07468034, "balance_loss_mlp": 0.01958843, "epoch": 0.023207575529836166, "flos": 18922719876480.0, "grad_norm": 61.59106536135228, "language_loss": 0.81051028, "learning_rate": 3.83467889492477e-06, "loss": 0.9189626, "num_input_tokens_seen": 8070600, "router_z_loss_clip": 12.40625, "router_z_loss_mlp": 1.76757812, "step": 386, "time_per_iteration": 2.721559762954712 }, { "auxiliary_loss_clip": 0.08754762, "auxiliary_loss_mlp": 0.02164058, "balance_loss_clip": 0.07497977, "balance_loss_mlp": 0.0198286, "epoch": 0.023267698782504134, "flos": 25052998838400.0, "grad_norm": 5.6642604959294225, "language_loss": 1.00869286, "learning_rate": 3.836344748851495e-06, "loss": 1.11788106, "num_input_tokens_seen": 8090680, "router_z_loss_clip": 12.5625, "router_z_loss_mlp": 1.8125, "step": 387, "time_per_iteration": 2.7987332344055176 }, { "auxiliary_loss_clip": 0.08759755, "auxiliary_loss_mlp": 0.02156811, "balance_loss_clip": 0.0752253, "balance_loss_mlp": 0.019841, "epoch": 0.023327822035172103, "flos": 28887221819520.0, "grad_norm": 10.203095464786248, "language_loss": 0.98497391, "learning_rate": 3.838006303795566e-06, "loss": 1.09413958, "num_input_tokens_seen": 8114610, "router_z_loss_clip": 12.375, "router_z_loss_mlp": 1.72753906, "step": 388, "time_per_iteration": 2.9316813945770264 }, { "auxiliary_loss_clip": 0.08814891, "auxiliary_loss_mlp": 0.02120944, "balance_loss_clip": 0.07554325, "balance_loss_mlp": 0.01958724, "epoch": 0.02338794528784007, "flos": 27128178656640.0, "grad_norm": 10.780272865075728, "language_loss": 1.11646926, "learning_rate": 3.839663581888206e-06, "loss": 1.22582769, "num_input_tokens_seen": 8133975, "router_z_loss_clip": 12.59375, "router_z_loss_mlp": 1.62402344, "step": 389, "time_per_iteration": 2.877930164337158 }, { "auxiliary_loss_clip": 0.0883325, "auxiliary_loss_mlp": 0.02134253, "balance_loss_clip": 0.07575743, "balance_loss_mlp": 0.01981761, "epoch": 0.02344806854050804, "flos": 21328375397760.0, "grad_norm": 10.814109957510984, "language_loss": 1.02814746, "learning_rate": 3.841316605090178e-06, "loss": 1.13782251, "num_input_tokens_seen": 8153570, "router_z_loss_clip": 12.5625, "router_z_loss_mlp": 1.52539062, "step": 390, "time_per_iteration": 2.7848708629608154 }, { "auxiliary_loss_clip": 0.08840166, "auxiliary_loss_mlp": 0.0211514, "balance_loss_clip": 0.07578407, "balance_loss_mlp": 0.01965699, "epoch": 0.023508191793176012, "flos": 24796847306880.0, "grad_norm": 10.982503897115624, "language_loss": 1.10594165, "learning_rate": 3.842965395193529e-06, "loss": 1.21549475, "num_input_tokens_seen": 8170075, "router_z_loss_clip": 12.6328125, "router_z_loss_mlp": 1.49609375, "step": 391, "time_per_iteration": 2.748985767364502 }, { "auxiliary_loss_clip": 0.08896914, "auxiliary_loss_mlp": 0.02074488, "balance_loss_clip": 0.07627341, "balance_loss_mlp": 0.01926097, "epoch": 0.02356831504584398, "flos": 26002651887360.0, "grad_norm": 3.817786386403993, "language_loss": 1.03600502, "learning_rate": 3.84460997382332e-06, "loss": 1.14571905, "num_input_tokens_seen": 8190420, "router_z_loss_clip": 12.703125, "router_z_loss_mlp": 1.484375, "step": 392, "time_per_iteration": 2.8258187770843506 }, { "auxiliary_loss_clip": 0.08940631, "auxiliary_loss_mlp": 0.0209703, "balance_loss_clip": 0.07650009, "balance_loss_mlp": 0.01955981, "epoch": 0.02362843829851195, "flos": 19068475253760.0, "grad_norm": 4.848308014971396, "language_loss": 1.02524877, "learning_rate": 3.8462503624393256e-06, "loss": 1.13562536, "num_input_tokens_seen": 8208790, "router_z_loss_clip": 12.8984375, "router_z_loss_mlp": 1.41015625, "step": 393, "time_per_iteration": 2.6984028816223145 }, { "auxiliary_loss_clip": 0.08961076, "auxiliary_loss_mlp": 0.02032299, "balance_loss_clip": 0.07674353, "balance_loss_mlp": 0.01896115, "epoch": 0.023688561551179918, "flos": 16076611768320.0, "grad_norm": 3.509240477878542, "language_loss": 0.95718193, "learning_rate": 3.84788658233771e-06, "loss": 1.06711566, "num_input_tokens_seen": 8226885, "router_z_loss_clip": 12.8671875, "router_z_loss_mlp": 1.36230469, "step": 394, "time_per_iteration": 2.719968318939209 }, { "auxiliary_loss_clip": 0.08946715, "auxiliary_loss_mlp": 0.02018129, "balance_loss_clip": 0.07672442, "balance_loss_mlp": 0.0187689, "epoch": 0.023748684803847887, "flos": 21730575795840.0, "grad_norm": 12.137866363743248, "language_loss": 0.98482603, "learning_rate": 3.84951865465269e-06, "loss": 1.09447455, "num_input_tokens_seen": 8246825, "router_z_loss_clip": 12.75, "router_z_loss_mlp": 1.41308594, "step": 395, "time_per_iteration": 2.72884464263916 }, { "auxiliary_loss_clip": 0.08008762, "auxiliary_loss_mlp": 0.01706313, "balance_loss_clip": 0.07277077, "balance_loss_mlp": 0.01561736, "epoch": 0.02380880805651586, "flos": 61944299349120.0, "grad_norm": 1.0669143848205522, "language_loss": 0.64365673, "learning_rate": 3.851146600358172e-06, "loss": 0.74080747, "num_input_tokens_seen": 8302835, "router_z_loss_clip": 7.31640625, "router_z_loss_mlp": 1.44726562, "step": 396, "time_per_iteration": 3.189471960067749 }, { "auxiliary_loss_clip": 0.08895213, "auxiliary_loss_mlp": 0.01963343, "balance_loss_clip": 0.07650186, "balance_loss_mlp": 0.01816859, "epoch": 0.023868931309183827, "flos": 20272518898560.0, "grad_norm": 7.931672956328554, "language_loss": 1.04430842, "learning_rate": 3.852770440269372e-06, "loss": 1.1528939, "num_input_tokens_seen": 8320745, "router_z_loss_clip": 12.453125, "router_z_loss_mlp": 1.46484375, "step": 397, "time_per_iteration": 2.7050302028656006 }, { "auxiliary_loss_clip": 0.08901544, "auxiliary_loss_mlp": 0.01944167, "balance_loss_clip": 0.07645249, "balance_loss_mlp": 0.01783473, "epoch": 0.023929054561851796, "flos": 21144954810240.0, "grad_norm": 17.886586955332092, "language_loss": 1.05454719, "learning_rate": 3.854390195044404e-06, "loss": 1.1630044, "num_input_tokens_seen": 8339540, "router_z_loss_clip": 12.5625, "router_z_loss_mlp": 1.60742188, "step": 398, "time_per_iteration": 2.736851453781128 }, { "auxiliary_loss_clip": 0.08838461, "auxiliary_loss_mlp": 0.01905877, "balance_loss_clip": 0.07620921, "balance_loss_mlp": 0.01739938, "epoch": 0.023989177814519765, "flos": 13703548285440.0, "grad_norm": 8.882878175070198, "language_loss": 1.12087321, "learning_rate": 3.856005885185868e-06, "loss": 1.22831666, "num_input_tokens_seen": 8354890, "router_z_loss_clip": 12.1796875, "router_z_loss_mlp": 1.66015625, "step": 399, "time_per_iteration": 2.699446201324463 }, { "auxiliary_loss_clip": 0.0879382, "auxiliary_loss_mlp": 0.01913722, "balance_loss_clip": 0.07599196, "balance_loss_mlp": 0.0173014, "epoch": 0.024049301067187733, "flos": 26329060667520.0, "grad_norm": 5.725661750373347, "language_loss": 0.98875171, "learning_rate": 3.857617531042398e-06, "loss": 1.0958271, "num_input_tokens_seen": 8375845, "router_z_loss_clip": 11.953125, "router_z_loss_mlp": 1.83691406, "step": 400, "time_per_iteration": 2.8085501194000244 }, { "auxiliary_loss_clip": 0.08664083, "auxiliary_loss_mlp": 0.01925357, "balance_loss_clip": 0.07516336, "balance_loss_mlp": 0.01736435, "epoch": 0.024109424319855705, "flos": 24432270192000.0, "grad_norm": 22.991250584627956, "language_loss": 0.94753551, "learning_rate": 3.8592251528102065e-06, "loss": 1.05342984, "num_input_tokens_seen": 8395240, "router_z_loss_clip": 11.484375, "router_z_loss_mlp": 1.88964844, "step": 401, "time_per_iteration": 2.7653586864471436 }, { "auxiliary_loss_clip": 0.08672163, "auxiliary_loss_mlp": 0.01975411, "balance_loss_clip": 0.07524493, "balance_loss_mlp": 0.0176379, "epoch": 0.024169547572523674, "flos": 29611764074880.0, "grad_norm": 7.680357027061313, "language_loss": 0.94644737, "learning_rate": 3.8608287705345976e-06, "loss": 1.05292308, "num_input_tokens_seen": 8416950, "router_z_loss_clip": 11.4921875, "router_z_loss_mlp": 2.11621094, "step": 402, "time_per_iteration": 2.773346185684204 }, { "auxiliary_loss_clip": 0.08590572, "auxiliary_loss_mlp": 0.01896812, "balance_loss_clip": 0.0746823, "balance_loss_mlp": 0.01690056, "epoch": 0.024229670825191642, "flos": 22608042952320.0, "grad_norm": 13.080969842515431, "language_loss": 1.12640262, "learning_rate": 3.86242840411147e-06, "loss": 1.23127651, "num_input_tokens_seen": 8433660, "router_z_loss_clip": 11.2265625, "router_z_loss_mlp": 2.06933594, "step": 403, "time_per_iteration": 2.7166121006011963 }, { "auxiliary_loss_clip": 0.08582766, "auxiliary_loss_mlp": 0.01926887, "balance_loss_clip": 0.07450296, "balance_loss_mlp": 0.01706206, "epoch": 0.02428979407785961, "flos": 18156110071680.0, "grad_norm": 15.638467931917857, "language_loss": 1.16199422, "learning_rate": 3.864024073288798e-06, "loss": 1.2670908, "num_input_tokens_seen": 8450180, "router_z_loss_clip": 11.3359375, "router_z_loss_mlp": 2.20898438, "step": 404, "time_per_iteration": 2.6974244117736816 }, { "auxiliary_loss_clip": 0.08484534, "auxiliary_loss_mlp": 0.01916243, "balance_loss_clip": 0.07379355, "balance_loss_mlp": 0.0168183, "epoch": 0.024349917330527583, "flos": 15310463160960.0, "grad_norm": 7.589048720689155, "language_loss": 1.04564297, "learning_rate": 3.865615797668091e-06, "loss": 1.14965081, "num_input_tokens_seen": 8467775, "router_z_loss_clip": 11.0390625, "router_z_loss_mlp": 2.34375, "step": 405, "time_per_iteration": 2.6993160247802734 }, { "auxiliary_loss_clip": 0.08434704, "auxiliary_loss_mlp": 0.01835907, "balance_loss_clip": 0.07341028, "balance_loss_mlp": 0.01609505, "epoch": 0.024410040583195552, "flos": 20779623008640.0, "grad_norm": 663.1550346404129, "language_loss": 1.10934949, "learning_rate": 3.867203596705844e-06, "loss": 1.21205544, "num_input_tokens_seen": 8486765, "router_z_loss_clip": 10.9375, "router_z_loss_mlp": 2.26269531, "step": 406, "time_per_iteration": 2.7323110103607178 }, { "auxiliary_loss_clip": 0.08439983, "auxiliary_loss_mlp": 0.01875439, "balance_loss_clip": 0.07333603, "balance_loss_mlp": 0.01632061, "epoch": 0.02447016383586352, "flos": 21805319237760.0, "grad_norm": 11.47182456603053, "language_loss": 1.01831198, "learning_rate": 3.86878748971496e-06, "loss": 1.12146628, "num_input_tokens_seen": 8506515, "router_z_loss_clip": 11.0703125, "router_z_loss_mlp": 2.43359375, "step": 407, "time_per_iteration": 2.7526111602783203 }, { "auxiliary_loss_clip": 0.0842638, "auxiliary_loss_mlp": 0.01854262, "balance_loss_clip": 0.07328645, "balance_loss_mlp": 0.01621947, "epoch": 0.02453028708853149, "flos": 33956529183360.0, "grad_norm": 20.422732686586677, "language_loss": 0.87534207, "learning_rate": 3.8703674958661596e-06, "loss": 0.97814852, "num_input_tokens_seen": 8528035, "router_z_loss_clip": 10.96875, "router_z_loss_mlp": 2.32421875, "step": 408, "time_per_iteration": 2.8248300552368164 }, { "auxiliary_loss_clip": 0.08432455, "auxiliary_loss_mlp": 0.01882386, "balance_loss_clip": 0.07319137, "balance_loss_mlp": 0.01618409, "epoch": 0.024590410341199458, "flos": 21798485130240.0, "grad_norm": 72.10927979496245, "language_loss": 1.12008786, "learning_rate": 3.871943634189376e-06, "loss": 1.22323632, "num_input_tokens_seen": 8546455, "router_z_loss_clip": 11.1484375, "router_z_loss_mlp": 2.63867188, "step": 409, "time_per_iteration": 2.825770378112793 }, { "auxiliary_loss_clip": 0.08439121, "auxiliary_loss_mlp": 0.01837884, "balance_loss_clip": 0.07319718, "balance_loss_mlp": 0.0158249, "epoch": 0.02465053359386743, "flos": 35123243034240.0, "grad_norm": 15.88447171211352, "language_loss": 0.99107659, "learning_rate": 3.873515923575128e-06, "loss": 1.09384656, "num_input_tokens_seen": 8568450, "router_z_loss_clip": 11.1875, "router_z_loss_mlp": 2.55273438, "step": 410, "time_per_iteration": 2.851783037185669 }, { "auxiliary_loss_clip": 0.08426885, "auxiliary_loss_mlp": 0.01876917, "balance_loss_clip": 0.07304651, "balance_loss_mlp": 0.01606837, "epoch": 0.0247106568465354, "flos": 27458360870400.0, "grad_norm": 20.387806067701575, "language_loss": 0.96514916, "learning_rate": 3.875084382775879e-06, "loss": 1.06818724, "num_input_tokens_seen": 8589340, "router_z_loss_clip": 11.2421875, "router_z_loss_mlp": 2.69921875, "step": 411, "time_per_iteration": 2.8029165267944336 }, { "auxiliary_loss_clip": 0.08414158, "auxiliary_loss_mlp": 0.01864767, "balance_loss_clip": 0.07278149, "balance_loss_mlp": 0.01584578, "epoch": 0.024770780099203367, "flos": 20709994665600.0, "grad_norm": 37.0234858880943, "language_loss": 1.03647625, "learning_rate": 3.87664903040738e-06, "loss": 1.13926554, "num_input_tokens_seen": 8607150, "router_z_loss_clip": 11.375, "router_z_loss_mlp": 2.80273438, "step": 412, "time_per_iteration": 4.389789581298828 }, { "auxiliary_loss_clip": 0.08061095, "auxiliary_loss_mlp": 0.01506143, "balance_loss_clip": 0.07260823, "balance_loss_mlp": 0.01336485, "epoch": 0.024830903351871336, "flos": 69571264740480.0, "grad_norm": 1.09362025659424, "language_loss": 0.58883524, "learning_rate": 3.878209884949994e-06, "loss": 0.68450761, "num_input_tokens_seen": 8669865, "router_z_loss_clip": 8.0, "router_z_loss_mlp": 1.69824219, "step": 413, "time_per_iteration": 4.934236288070679 }, { "auxiliary_loss_clip": 0.08408618, "auxiliary_loss_mlp": 0.01818727, "balance_loss_clip": 0.07268016, "balance_loss_mlp": 0.01546166, "epoch": 0.024891026604539304, "flos": 32278728153600.0, "grad_norm": 13.384721180224536, "language_loss": 0.92307699, "learning_rate": 3.879766964750006e-06, "loss": 1.02535033, "num_input_tokens_seen": 8690235, "router_z_loss_clip": 11.3984375, "router_z_loss_mlp": 2.72460938, "step": 414, "time_per_iteration": 4.372466325759888 }, { "auxiliary_loss_clip": 0.08361179, "auxiliary_loss_mlp": 0.01749862, "balance_loss_clip": 0.07245225, "balance_loss_mlp": 0.01494086, "epoch": 0.024951149857207276, "flos": 18845712374400.0, "grad_norm": 56.644838900777344, "language_loss": 0.96696556, "learning_rate": 3.881320288020917e-06, "loss": 1.0680759, "num_input_tokens_seen": 8706295, "router_z_loss_clip": 11.171875, "router_z_loss_mlp": 2.55664062, "step": 415, "time_per_iteration": 4.095777273178101 }, { "auxiliary_loss_clip": 0.08420383, "auxiliary_loss_mlp": 0.01810087, "balance_loss_clip": 0.07274027, "balance_loss_mlp": 0.01523413, "epoch": 0.025011273109875245, "flos": 15382565199360.0, "grad_norm": 31.328662190304307, "language_loss": 1.21718085, "learning_rate": 3.882869872844723e-06, "loss": 1.31948566, "num_input_tokens_seen": 8724200, "router_z_loss_clip": 11.4609375, "router_z_loss_mlp": 2.86914062, "step": 416, "time_per_iteration": 2.6890594959259033 }, { "auxiliary_loss_clip": 0.08417129, "auxiliary_loss_mlp": 0.01781868, "balance_loss_clip": 0.07278772, "balance_loss_mlp": 0.01493286, "epoch": 0.025071396362543213, "flos": 18921336284160.0, "grad_norm": 35.76043439262441, "language_loss": 0.9024933, "learning_rate": 3.884415737173176e-06, "loss": 1.00448322, "num_input_tokens_seen": 8744170, "router_z_loss_clip": 11.390625, "router_z_loss_mlp": 2.88476562, "step": 417, "time_per_iteration": 2.7173030376434326 }, { "auxiliary_loss_clip": 0.0838825, "auxiliary_loss_mlp": 0.01747444, "balance_loss_clip": 0.07277165, "balance_loss_mlp": 0.01487472, "epoch": 0.025131519615211182, "flos": 25345012717440.0, "grad_norm": 8.119113085136403, "language_loss": 0.90497577, "learning_rate": 3.8859578988290344e-06, "loss": 1.00633276, "num_input_tokens_seen": 8765120, "router_z_loss_clip": 11.1171875, "router_z_loss_mlp": 2.60351562, "step": 418, "time_per_iteration": 2.7951149940490723 }, { "auxiliary_loss_clip": 0.08401385, "auxiliary_loss_mlp": 0.01750493, "balance_loss_clip": 0.0728187, "balance_loss_mlp": 0.01485181, "epoch": 0.02519164286787915, "flos": 18959169202560.0, "grad_norm": 12.681471269773734, "language_loss": 1.06282973, "learning_rate": 3.887496375507294e-06, "loss": 1.1643486, "num_input_tokens_seen": 8783500, "router_z_loss_clip": 11.1953125, "router_z_loss_mlp": 2.65234375, "step": 419, "time_per_iteration": 2.896268606185913 }, { "auxiliary_loss_clip": 0.083846, "auxiliary_loss_mlp": 0.01713251, "balance_loss_clip": 0.07271554, "balance_loss_mlp": 0.01451182, "epoch": 0.025251766120547123, "flos": 17426913914880.0, "grad_norm": 257.43679760974385, "language_loss": 0.88551891, "learning_rate": 3.8890311847764065e-06, "loss": 0.9864974, "num_input_tokens_seen": 8801175, "router_z_loss_clip": 11.140625, "router_z_loss_mlp": 2.62109375, "step": 420, "time_per_iteration": 2.7617626190185547 }, { "auxiliary_loss_clip": 0.08405273, "auxiliary_loss_mlp": 0.01726557, "balance_loss_clip": 0.07286938, "balance_loss_mlp": 0.01472498, "epoch": 0.02531188937321509, "flos": 25052328005760.0, "grad_norm": 10.94133527795272, "language_loss": 0.9227348, "learning_rate": 3.890562344079484e-06, "loss": 1.0240531, "num_input_tokens_seen": 8820215, "router_z_loss_clip": 11.1796875, "router_z_loss_mlp": 2.54101562, "step": 421, "time_per_iteration": 2.7364959716796875 }, { "auxiliary_loss_clip": 0.08388168, "auxiliary_loss_mlp": 0.01709363, "balance_loss_clip": 0.07287371, "balance_loss_mlp": 0.01462075, "epoch": 0.02537201262588306, "flos": 30600214364160.0, "grad_norm": 81.18294454620147, "language_loss": 1.01825976, "learning_rate": 3.89208987073549e-06, "loss": 1.11923504, "num_input_tokens_seen": 8839660, "router_z_loss_clip": 11.0234375, "router_z_loss_mlp": 2.47363281, "step": 422, "time_per_iteration": 2.83280611038208 }, { "auxiliary_loss_clip": 0.08410043, "auxiliary_loss_mlp": 0.0169572, "balance_loss_clip": 0.0730456, "balance_loss_mlp": 0.01452914, "epoch": 0.02543213587855103, "flos": 26072154449280.0, "grad_norm": 23.012178957255234, "language_loss": 0.97509295, "learning_rate": 3.893613781940409e-06, "loss": 1.07615066, "num_input_tokens_seen": 8859280, "router_z_loss_clip": 11.0625, "router_z_loss_mlp": 2.42578125, "step": 423, "time_per_iteration": 2.769129753112793 }, { "auxiliary_loss_clip": 0.084075, "auxiliary_loss_mlp": 0.01753515, "balance_loss_clip": 0.07321095, "balance_loss_mlp": 0.01504988, "epoch": 0.025492259131218997, "flos": 36030744679680.0, "grad_norm": 11.953165936607233, "language_loss": 0.87479258, "learning_rate": 3.895134094768415e-06, "loss": 0.97640276, "num_input_tokens_seen": 8880560, "router_z_loss_clip": 10.859375, "router_z_loss_mlp": 2.484375, "step": 424, "time_per_iteration": 2.845954656600952 }, { "auxiliary_loss_clip": 0.08407094, "auxiliary_loss_mlp": 0.01756681, "balance_loss_clip": 0.07324042, "balance_loss_mlp": 0.01503576, "epoch": 0.02555238238388697, "flos": 18593963182080.0, "grad_norm": 10.023071274729306, "language_loss": 1.01591742, "learning_rate": 3.896650826173015e-06, "loss": 1.11755514, "num_input_tokens_seen": 8899155, "router_z_loss_clip": 10.8359375, "router_z_loss_mlp": 2.53125, "step": 425, "time_per_iteration": 2.72218918800354 }, { "auxiliary_loss_clip": 0.08392456, "auxiliary_loss_mlp": 0.01791741, "balance_loss_clip": 0.0730297, "balance_loss_mlp": 0.01509454, "epoch": 0.025612505636554938, "flos": 24250023561600.0, "grad_norm": 24.855679209155984, "language_loss": 1.02044201, "learning_rate": 3.898163992988186e-06, "loss": 1.12228394, "num_input_tokens_seen": 8917890, "router_z_loss_clip": 10.8984375, "router_z_loss_mlp": 2.82421875, "step": 426, "time_per_iteration": 2.724874973297119 }, { "auxiliary_loss_clip": 0.07811658, "auxiliary_loss_mlp": 0.0162531, "balance_loss_clip": 0.07109191, "balance_loss_mlp": 0.01505624, "epoch": 0.025672628889222907, "flos": 60606617241600.0, "grad_norm": 0.8828441182575233, "language_loss": 0.57222718, "learning_rate": 3.899673611929491e-06, "loss": 0.66659689, "num_input_tokens_seen": 8978260, "router_z_loss_clip": 7.015625, "router_z_loss_mlp": 1.19433594, "step": 427, "time_per_iteration": 3.441506862640381 }, { "auxiliary_loss_clip": 0.08378599, "auxiliary_loss_mlp": 0.01851589, "balance_loss_clip": 0.07277112, "balance_loss_mlp": 0.0155347, "epoch": 0.025732752141890875, "flos": 19579352797440.0, "grad_norm": 13.819012731574857, "language_loss": 1.04156351, "learning_rate": 3.901179699595194e-06, "loss": 1.14386535, "num_input_tokens_seen": 8994460, "router_z_loss_clip": 11.015625, "router_z_loss_mlp": 2.98046875, "step": 428, "time_per_iteration": 2.6875052452087402 }, { "auxiliary_loss_clip": 0.08346203, "auxiliary_loss_mlp": 0.01807252, "balance_loss_clip": 0.07259046, "balance_loss_mlp": 0.01511232, "epoch": 0.025792875394558847, "flos": 31292164581120.0, "grad_norm": 9.713352182957394, "language_loss": 0.95669609, "learning_rate": 3.902682272467353e-06, "loss": 1.05823064, "num_input_tokens_seen": 9016670, "router_z_loss_clip": 10.875, "router_z_loss_mlp": 2.95898438, "step": 429, "time_per_iteration": 2.841230869293213 }, { "auxiliary_loss_clip": 0.08344866, "auxiliary_loss_mlp": 0.01866202, "balance_loss_clip": 0.07258245, "balance_loss_mlp": 0.01539854, "epoch": 0.025852998647226816, "flos": 32387824569600.0, "grad_norm": 15.427505328741477, "language_loss": 0.9995032, "learning_rate": 3.904181346912895e-06, "loss": 1.101614, "num_input_tokens_seen": 9039720, "router_z_loss_clip": 10.8671875, "router_z_loss_mlp": 3.26171875, "step": 430, "time_per_iteration": 2.7961831092834473 }, { "auxiliary_loss_clip": 0.08276956, "auxiliary_loss_mlp": 0.01812775, "balance_loss_clip": 0.07214629, "balance_loss_mlp": 0.01505692, "epoch": 0.025913121899894784, "flos": 20199452538240.0, "grad_norm": 15.979784585161854, "language_loss": 0.96008289, "learning_rate": 3.905676939184698e-06, "loss": 1.0609802, "num_input_tokens_seen": 9059850, "router_z_loss_clip": 10.6328125, "router_z_loss_mlp": 3.0703125, "step": 431, "time_per_iteration": 2.7295918464660645 }, { "auxiliary_loss_clip": 0.08276859, "auxiliary_loss_mlp": 0.0181774, "balance_loss_clip": 0.0720863, "balance_loss_mlp": 0.01498832, "epoch": 0.025973245152562753, "flos": 14725680716160.0, "grad_norm": 23.064503690313426, "language_loss": 1.05336046, "learning_rate": 3.907169065422638e-06, "loss": 1.15430641, "num_input_tokens_seen": 9077590, "router_z_loss_clip": 10.6796875, "router_z_loss_mlp": 3.18945312, "step": 432, "time_per_iteration": 2.781553268432617 }, { "auxiliary_loss_clip": 0.08281875, "auxiliary_loss_mlp": 0.01736632, "balance_loss_clip": 0.07200839, "balance_loss_mlp": 0.01438322, "epoch": 0.02603336840523072, "flos": 31000947315840.0, "grad_norm": 128.6590210260777, "language_loss": 0.89968473, "learning_rate": 3.908657741654636e-06, "loss": 0.9998697, "num_input_tokens_seen": 9099880, "router_z_loss_clip": 10.8203125, "router_z_loss_mlp": 2.98242188, "step": 433, "time_per_iteration": 3.0130672454833984 }, { "auxiliary_loss_clip": 0.08263177, "auxiliary_loss_mlp": 0.01714673, "balance_loss_clip": 0.07179515, "balance_loss_mlp": 0.0141808, "epoch": 0.026093491657898694, "flos": 17679753210240.0, "grad_norm": 185.82644394838275, "language_loss": 1.03115284, "learning_rate": 3.910142983797699e-06, "loss": 1.13093138, "num_input_tokens_seen": 9118620, "router_z_loss_clip": 10.8359375, "router_z_loss_mlp": 2.96484375, "step": 434, "time_per_iteration": 2.7954752445220947 }, { "auxiliary_loss_clip": 0.08270057, "auxiliary_loss_mlp": 0.01636649, "balance_loss_clip": 0.07164779, "balance_loss_mlp": 0.01352454, "epoch": 0.026153614910566662, "flos": 17863593068160.0, "grad_norm": 33.15493768432381, "language_loss": 0.96225137, "learning_rate": 3.9116248076589305e-06, "loss": 1.0613184, "num_input_tokens_seen": 9135655, "router_z_loss_clip": 11.0546875, "router_z_loss_mlp": 2.83984375, "step": 435, "time_per_iteration": 2.697329521179199 }, { "auxiliary_loss_clip": 0.08282125, "auxiliary_loss_mlp": 0.0164971, "balance_loss_clip": 0.07180528, "balance_loss_mlp": 0.01363226, "epoch": 0.02621373816323463, "flos": 20017289761920.0, "grad_norm": 138.90398230831343, "language_loss": 1.02647448, "learning_rate": 3.913103228936546e-06, "loss": 1.12579274, "num_input_tokens_seen": 9153520, "router_z_loss_clip": 11.03125, "router_z_loss_mlp": 2.86523438, "step": 436, "time_per_iteration": 2.733081340789795 }, { "auxiliary_loss_clip": 0.082919, "auxiliary_loss_mlp": 0.01575514, "balance_loss_clip": 0.07181831, "balance_loss_mlp": 0.01297805, "epoch": 0.0262738614159026, "flos": 19287213137280.0, "grad_norm": 1871.6594367847856, "language_loss": 0.94274378, "learning_rate": 3.914578263220868e-06, "loss": 1.04141784, "num_input_tokens_seen": 9170750, "router_z_loss_clip": 11.1015625, "router_z_loss_mlp": 2.77734375, "step": 437, "time_per_iteration": 2.750288963317871 }, { "auxiliary_loss_clip": 0.08302096, "auxiliary_loss_mlp": 0.01582782, "balance_loss_clip": 0.07158275, "balance_loss_mlp": 0.01280658, "epoch": 0.026333984668570568, "flos": 18813204190080.0, "grad_norm": 54.446580011025446, "language_loss": 1.05407143, "learning_rate": 3.916049925995316e-06, "loss": 1.15292025, "num_input_tokens_seen": 9188430, "router_z_loss_clip": 11.4453125, "router_z_loss_mlp": 3.02148438, "step": 438, "time_per_iteration": 2.6948108673095703 }, { "auxiliary_loss_clip": 0.07776922, "auxiliary_loss_mlp": 0.01679164, "balance_loss_clip": 0.07077076, "balance_loss_mlp": 0.0140794, "epoch": 0.02639410792123854, "flos": 64593723196800.0, "grad_norm": 1.1089898127668145, "language_loss": 0.62909126, "learning_rate": 3.917518232637377e-06, "loss": 0.72365212, "num_input_tokens_seen": 9255835, "router_z_loss_clip": 7.0, "router_z_loss_mlp": 2.71484375, "step": 439, "time_per_iteration": 3.408757448196411 }, { "auxiliary_loss_clip": 0.08282258, "auxiliary_loss_mlp": 0.01569013, "balance_loss_clip": 0.07159215, "balance_loss_mlp": 0.0128291, "epoch": 0.02645423117390651, "flos": 28480661009280.0, "grad_norm": 250.58291782054113, "language_loss": 0.91598636, "learning_rate": 3.918983198419573e-06, "loss": 1.01449907, "num_input_tokens_seen": 9276835, "router_z_loss_clip": 11.234375, "router_z_loss_mlp": 2.86328125, "step": 440, "time_per_iteration": 2.767333984375 }, { "auxiliary_loss_clip": 0.08261073, "auxiliary_loss_mlp": 0.01546856, "balance_loss_clip": 0.07138279, "balance_loss_mlp": 0.01277348, "epoch": 0.026514354426574478, "flos": 18557094585600.0, "grad_norm": 95.13990367909632, "language_loss": 0.97542405, "learning_rate": 3.920444838510415e-06, "loss": 1.07350326, "num_input_tokens_seen": 9295075, "router_z_loss_clip": 11.21875, "router_z_loss_mlp": 2.6953125, "step": 441, "time_per_iteration": 2.6861679553985596 }, { "auxiliary_loss_clip": 0.08269531, "auxiliary_loss_mlp": 0.01580771, "balance_loss_clip": 0.07144554, "balance_loss_mlp": 0.01284369, "epoch": 0.026574477679242446, "flos": 20674090391040.0, "grad_norm": 149.448529615829, "language_loss": 0.93086505, "learning_rate": 3.92190316797534e-06, "loss": 1.02936816, "num_input_tokens_seen": 9314205, "router_z_loss_clip": 11.25, "router_z_loss_mlp": 2.96484375, "step": 442, "time_per_iteration": 2.716130018234253 }, { "auxiliary_loss_clip": 0.07617158, "auxiliary_loss_mlp": 0.0134971, "balance_loss_clip": 0.06983034, "balance_loss_mlp": 0.0125172, "epoch": 0.026634600931910415, "flos": 57974718896640.0, "grad_norm": 0.9511724543911444, "language_loss": 0.64699095, "learning_rate": 3.92335820177765e-06, "loss": 0.73665965, "num_input_tokens_seen": 9367395, "router_z_loss_clip": 6.33984375, "router_z_loss_mlp": 0.97949219, "step": 443, "time_per_iteration": 3.2211360931396484 }, { "auxiliary_loss_clip": 0.0824962, "auxiliary_loss_mlp": 0.01587735, "balance_loss_clip": 0.07142894, "balance_loss_mlp": 0.01306019, "epoch": 0.026694724184578387, "flos": 15820586017920.0, "grad_norm": 43.910725718542054, "language_loss": 1.01581132, "learning_rate": 3.924809954779425e-06, "loss": 1.11418486, "num_input_tokens_seen": 9385185, "router_z_loss_clip": 11.0703125, "router_z_loss_mlp": 2.81835938, "step": 444, "time_per_iteration": 2.678849458694458 }, { "auxiliary_loss_clip": 0.08280735, "auxiliary_loss_mlp": 0.01574288, "balance_loss_clip": 0.07152508, "balance_loss_mlp": 0.01306306, "epoch": 0.026754847437246355, "flos": 23446922503680.0, "grad_norm": 32.620311619870414, "language_loss": 1.08006191, "learning_rate": 3.9262584417424425e-06, "loss": 1.17861211, "num_input_tokens_seen": 9403225, "router_z_loss_clip": 11.2890625, "router_z_loss_mlp": 2.67773438, "step": 445, "time_per_iteration": 2.739989757537842 }, { "auxiliary_loss_clip": 0.08227271, "auxiliary_loss_mlp": 0.01585422, "balance_loss_clip": 0.07145882, "balance_loss_mlp": 0.01323543, "epoch": 0.026814970689914324, "flos": 17346552249600.0, "grad_norm": 48.39965763099443, "language_loss": 1.07364511, "learning_rate": 3.9277036773290725e-06, "loss": 1.171772, "num_input_tokens_seen": 9420540, "router_z_loss_clip": 10.8125, "router_z_loss_mlp": 2.62109375, "step": 446, "time_per_iteration": 2.8004019260406494 }, { "auxiliary_loss_clip": 0.08224732, "auxiliary_loss_mlp": 0.01561164, "balance_loss_clip": 0.0714457, "balance_loss_mlp": 0.01321792, "epoch": 0.026875093942582293, "flos": 17900503591680.0, "grad_norm": 34.910607067631595, "language_loss": 0.92348784, "learning_rate": 3.92914567610317e-06, "loss": 1.02134681, "num_input_tokens_seen": 9438840, "router_z_loss_clip": 10.796875, "router_z_loss_mlp": 2.39453125, "step": 447, "time_per_iteration": 2.725327968597412 }, { "auxiliary_loss_clip": 0.08227077, "auxiliary_loss_mlp": 0.01605478, "balance_loss_clip": 0.07145772, "balance_loss_mlp": 0.0135676, "epoch": 0.026935217195250265, "flos": 21730114598400.0, "grad_norm": 78.5123653318011, "language_loss": 0.99150318, "learning_rate": 3.930584452530952e-06, "loss": 1.08982873, "num_input_tokens_seen": 9457215, "router_z_loss_clip": 10.8046875, "router_z_loss_mlp": 2.48828125, "step": 448, "time_per_iteration": 2.7394907474517822 }, { "auxiliary_loss_clip": 0.0818538, "auxiliary_loss_mlp": 0.01603523, "balance_loss_clip": 0.07132646, "balance_loss_mlp": 0.01369873, "epoch": 0.026995340447918233, "flos": 23629378769280.0, "grad_norm": 9.788919647344033, "language_loss": 0.99402559, "learning_rate": 3.9320200209818755e-06, "loss": 1.09191465, "num_input_tokens_seen": 9475615, "router_z_loss_clip": 10.5390625, "router_z_loss_mlp": 2.33789062, "step": 449, "time_per_iteration": 2.7648239135742188 }, { "auxiliary_loss_clip": 0.08204612, "auxiliary_loss_mlp": 0.01617269, "balance_loss_clip": 0.07140334, "balance_loss_mlp": 0.01378278, "epoch": 0.027055463700586202, "flos": 17937078698880.0, "grad_norm": 18.399084028352068, "language_loss": 0.98054314, "learning_rate": 3.933452395729493e-06, "loss": 1.07876194, "num_input_tokens_seen": 9493975, "router_z_loss_clip": 10.6484375, "router_z_loss_mlp": 2.38867188, "step": 450, "time_per_iteration": 2.763979196548462 }, { "auxiliary_loss_clip": 0.08170174, "auxiliary_loss_mlp": 0.01569962, "balance_loss_clip": 0.07142225, "balance_loss_mlp": 0.01346993, "epoch": 0.02711558695325417, "flos": 25125897490560.0, "grad_norm": 15.83021559350231, "language_loss": 0.89707726, "learning_rate": 3.934881590952304e-06, "loss": 0.99447864, "num_input_tokens_seen": 9514810, "router_z_loss_clip": 10.28125, "router_z_loss_mlp": 2.23144531, "step": 451, "time_per_iteration": 2.793919086456299 }, { "auxiliary_loss_clip": 0.08166996, "auxiliary_loss_mlp": 0.01649855, "balance_loss_clip": 0.07154252, "balance_loss_mlp": 0.01420019, "epoch": 0.02717571020592214, "flos": 24245788930560.0, "grad_norm": 7.6332269947901334, "language_loss": 0.84353113, "learning_rate": 3.936307620734599e-06, "loss": 0.94169962, "num_input_tokens_seen": 9533635, "router_z_loss_clip": 10.1328125, "router_z_loss_mlp": 2.296875, "step": 452, "time_per_iteration": 4.191622018814087 }, { "auxiliary_loss_clip": 0.08187561, "auxiliary_loss_mlp": 0.01616942, "balance_loss_clip": 0.07172111, "balance_loss_mlp": 0.01381575, "epoch": 0.02723583345859011, "flos": 25125939417600.0, "grad_norm": 14.956930166184293, "language_loss": 0.81609267, "learning_rate": 3.937730499067294e-06, "loss": 0.91413766, "num_input_tokens_seen": 9555420, "router_z_loss_clip": 10.15625, "router_z_loss_mlp": 2.35546875, "step": 453, "time_per_iteration": 4.279350757598877 }, { "auxiliary_loss_clip": 0.08210248, "auxiliary_loss_mlp": 0.01609492, "balance_loss_clip": 0.07198013, "balance_loss_mlp": 0.01388812, "epoch": 0.02729595671125808, "flos": 42751550090880.0, "grad_norm": 8.563508920073836, "language_loss": 0.9488827, "learning_rate": 3.939150239848748e-06, "loss": 1.04708004, "num_input_tokens_seen": 9578950, "router_z_loss_clip": 10.109375, "router_z_loss_mlp": 2.20898438, "step": 454, "time_per_iteration": 4.443447113037109 }, { "auxiliary_loss_clip": 0.08213249, "auxiliary_loss_mlp": 0.01621559, "balance_loss_clip": 0.07206814, "balance_loss_mlp": 0.01392868, "epoch": 0.02735607996392605, "flos": 21436884835200.0, "grad_norm": 19.31720171440933, "language_loss": 0.84946293, "learning_rate": 3.9405668568855866e-06, "loss": 0.94781101, "num_input_tokens_seen": 9598160, "router_z_loss_clip": 10.0703125, "router_z_loss_mlp": 2.28515625, "step": 455, "time_per_iteration": 2.891237497329712 }, { "auxiliary_loss_clip": 0.0821996, "auxiliary_loss_mlp": 0.01621749, "balance_loss_clip": 0.07199726, "balance_loss_mlp": 0.01387336, "epoch": 0.027416203216594017, "flos": 20857762540800.0, "grad_norm": 8.214958367439634, "language_loss": 0.93458587, "learning_rate": 3.941980363893499e-06, "loss": 1.03300285, "num_input_tokens_seen": 9616010, "router_z_loss_clip": 10.1875, "router_z_loss_mlp": 2.34570312, "step": 456, "time_per_iteration": 2.696747064590454 }, { "auxiliary_loss_clip": 0.08199795, "auxiliary_loss_mlp": 0.0156898, "balance_loss_clip": 0.07175269, "balance_loss_mlp": 0.0135097, "epoch": 0.027476326469261986, "flos": 13229497411200.0, "grad_norm": 12.137256936190553, "language_loss": 0.91643143, "learning_rate": 3.9433907744980384e-06, "loss": 1.01411927, "num_input_tokens_seen": 9634000, "router_z_loss_clip": 10.2578125, "router_z_loss_mlp": 2.18164062, "step": 457, "time_per_iteration": 2.6710782051086426 }, { "auxiliary_loss_clip": 0.0820792, "auxiliary_loss_mlp": 0.01595701, "balance_loss_clip": 0.07174765, "balance_loss_mlp": 0.01369299, "epoch": 0.027536449721929958, "flos": 24031369532160.0, "grad_norm": 11.889788108015722, "language_loss": 1.04596925, "learning_rate": 3.944798102235412e-06, "loss": 1.14400554, "num_input_tokens_seen": 9653455, "router_z_loss_clip": 10.34375, "router_z_loss_mlp": 2.26171875, "step": 458, "time_per_iteration": 2.738854169845581 }, { "auxiliary_loss_clip": 0.08180498, "auxiliary_loss_mlp": 0.01618791, "balance_loss_clip": 0.07169077, "balance_loss_mlp": 0.0138228, "epoch": 0.027596572974597926, "flos": 13011094944000.0, "grad_norm": 8.626110224002513, "language_loss": 0.9590916, "learning_rate": 3.9462023605532545e-06, "loss": 1.05708456, "num_input_tokens_seen": 9669650, "router_z_loss_clip": 10.125, "router_z_loss_mlp": 2.36523438, "step": 459, "time_per_iteration": 2.696697473526001 }, { "auxiliary_loss_clip": 0.08143491, "auxiliary_loss_mlp": 0.01610058, "balance_loss_clip": 0.07150031, "balance_loss_mlp": 0.01380795, "epoch": 0.027656696227265895, "flos": 26150671324800.0, "grad_norm": 13.741858810567269, "language_loss": 0.91584772, "learning_rate": 3.947603562811407e-06, "loss": 1.01338315, "num_input_tokens_seen": 9691415, "router_z_loss_clip": 9.9296875, "router_z_loss_mlp": 2.29101562, "step": 460, "time_per_iteration": 2.820133686065674 }, { "auxiliary_loss_clip": 0.07641406, "auxiliary_loss_mlp": 0.01687844, "balance_loss_clip": 0.07066333, "balance_loss_mlp": 0.0153497, "epoch": 0.027716819479933864, "flos": 60717055322880.0, "grad_norm": 1.4162327624946587, "language_loss": 0.73782349, "learning_rate": 3.949001722282675e-06, "loss": 0.83111596, "num_input_tokens_seen": 9755605, "router_z_loss_clip": 5.75, "router_z_loss_mlp": 1.52636719, "step": 461, "time_per_iteration": 3.3262434005737305 }, { "auxiliary_loss_clip": 0.08104984, "auxiliary_loss_mlp": 0.01596736, "balance_loss_clip": 0.07127225, "balance_loss_mlp": 0.01386642, "epoch": 0.027776942732601832, "flos": 31219936761600.0, "grad_norm": 16.511436069608187, "language_loss": 0.96007705, "learning_rate": 3.950396852153582e-06, "loss": 1.05709434, "num_input_tokens_seen": 9776270, "router_z_loss_clip": 9.796875, "router_z_loss_mlp": 2.1015625, "step": 462, "time_per_iteration": 2.8136584758758545 }, { "auxiliary_loss_clip": 0.08092098, "auxiliary_loss_mlp": 0.01631683, "balance_loss_clip": 0.07111506, "balance_loss_mlp": 0.01406235, "epoch": 0.027837065985269804, "flos": 22681277020800.0, "grad_norm": 12.768428142922598, "language_loss": 1.04160738, "learning_rate": 3.951788965525118e-06, "loss": 1.13884521, "num_input_tokens_seen": 9794465, "router_z_loss_clip": 9.8125, "router_z_loss_mlp": 2.25390625, "step": 463, "time_per_iteration": 2.7015275955200195 }, { "auxiliary_loss_clip": 0.07537781, "auxiliary_loss_mlp": 0.01634643, "balance_loss_clip": 0.06993717, "balance_loss_mlp": 0.01509235, "epoch": 0.027897189237937773, "flos": 62200786296960.0, "grad_norm": 0.8946160874369528, "language_loss": 0.5915041, "learning_rate": 3.953178075413476e-06, "loss": 0.68322837, "num_input_tokens_seen": 9849685, "router_z_loss_clip": 5.4375, "router_z_loss_mlp": 1.25292969, "step": 464, "time_per_iteration": 3.330556869506836 }, { "auxiliary_loss_clip": 0.08120599, "auxiliary_loss_mlp": 0.01668217, "balance_loss_clip": 0.07109532, "balance_loss_mlp": 0.01416638, "epoch": 0.02795731249060574, "flos": 24499131350400.0, "grad_norm": 19.721912364450386, "language_loss": 0.96524417, "learning_rate": 3.954564194750784e-06, "loss": 1.06313241, "num_input_tokens_seen": 9869505, "router_z_loss_clip": 10.1171875, "router_z_loss_mlp": 2.51757812, "step": 465, "time_per_iteration": 2.7661914825439453 }, { "auxiliary_loss_clip": 0.08107261, "auxiliary_loss_mlp": 0.01659218, "balance_loss_clip": 0.0709277, "balance_loss_mlp": 0.01413361, "epoch": 0.02801743574327371, "flos": 23739858777600.0, "grad_norm": 28.657213296820125, "language_loss": 0.88399237, "learning_rate": 3.955947336385828e-06, "loss": 0.98165715, "num_input_tokens_seen": 9890950, "router_z_loss_clip": 10.1484375, "router_z_loss_mlp": 2.45703125, "step": 466, "time_per_iteration": 2.7363779544830322 }, { "auxiliary_loss_clip": 0.08113077, "auxiliary_loss_mlp": 0.01707976, "balance_loss_clip": 0.07099996, "balance_loss_mlp": 0.01458495, "epoch": 0.02807755899594168, "flos": 20634999661440.0, "grad_norm": 18.57915696130606, "language_loss": 0.95353818, "learning_rate": 3.957327513084761e-06, "loss": 1.05174863, "num_input_tokens_seen": 9911265, "router_z_loss_clip": 10.1328125, "router_z_loss_mlp": 2.49414062, "step": 467, "time_per_iteration": 2.727511405944824 }, { "auxiliary_loss_clip": 0.0816057, "auxiliary_loss_mlp": 0.01732923, "balance_loss_clip": 0.0711654, "balance_loss_mlp": 0.0145979, "epoch": 0.02813768224860965, "flos": 19250554176000.0, "grad_norm": 19.839982350107768, "language_loss": 0.97071081, "learning_rate": 3.958704737531818e-06, "loss": 1.06964564, "num_input_tokens_seen": 9929025, "router_z_loss_clip": 10.4609375, "router_z_loss_mlp": 2.73242188, "step": 468, "time_per_iteration": 2.686372756958008 }, { "auxiliary_loss_clip": 0.08195222, "auxiliary_loss_mlp": 0.01797256, "balance_loss_clip": 0.07125561, "balance_loss_mlp": 0.01495704, "epoch": 0.02819780550127762, "flos": 20820306965760.0, "grad_norm": 9.823621276317217, "language_loss": 1.04051948, "learning_rate": 3.9600790223300065e-06, "loss": 1.14044428, "num_input_tokens_seen": 9945190, "router_z_loss_clip": 10.6875, "router_z_loss_mlp": 3.015625, "step": 469, "time_per_iteration": 2.708902597427368 }, { "auxiliary_loss_clip": 0.08160621, "auxiliary_loss_mlp": 0.0182202, "balance_loss_clip": 0.07108016, "balance_loss_mlp": 0.01519514, "epoch": 0.028257928753945588, "flos": 19980211530240.0, "grad_norm": 52.346256304152554, "language_loss": 0.96462166, "learning_rate": 3.96145038000181e-06, "loss": 1.06444812, "num_input_tokens_seen": 9962820, "router_z_loss_clip": 10.5546875, "router_z_loss_mlp": 3.02539062, "step": 470, "time_per_iteration": 2.724116325378418 }, { "auxiliary_loss_clip": 0.08222574, "auxiliary_loss_mlp": 0.0190628, "balance_loss_clip": 0.07122838, "balance_loss_mlp": 0.01562195, "epoch": 0.028318052006613557, "flos": 20490585949440.0, "grad_norm": 45.44253924357528, "language_loss": 1.04839385, "learning_rate": 3.962818822989861e-06, "loss": 1.1496824, "num_input_tokens_seen": 9982595, "router_z_loss_clip": 10.984375, "router_z_loss_mlp": 3.44140625, "step": 471, "time_per_iteration": 2.7583234310150146 }, { "auxiliary_loss_clip": 0.08207499, "auxiliary_loss_mlp": 0.01952525, "balance_loss_clip": 0.07116827, "balance_loss_mlp": 0.01594516, "epoch": 0.02837817525928153, "flos": 28522854339840.0, "grad_norm": 49.627680340139015, "language_loss": 0.86643308, "learning_rate": 3.964184363657625e-06, "loss": 0.96803331, "num_input_tokens_seen": 10004645, "router_z_loss_clip": 10.921875, "router_z_loss_mlp": 3.58203125, "step": 472, "time_per_iteration": 2.810147285461426 }, { "auxiliary_loss_clip": 0.08279589, "auxiliary_loss_mlp": 0.02000526, "balance_loss_clip": 0.07145083, "balance_loss_mlp": 0.01609329, "epoch": 0.028438298511949497, "flos": 18557597710080.0, "grad_norm": 20.829621880006503, "language_loss": 1.04128265, "learning_rate": 3.965547014290071e-06, "loss": 1.14408386, "num_input_tokens_seen": 10022555, "router_z_loss_clip": 11.3359375, "router_z_loss_mlp": 3.91210938, "step": 473, "time_per_iteration": 2.74711537361145 }, { "auxiliary_loss_clip": 0.08283779, "auxiliary_loss_mlp": 0.02052871, "balance_loss_clip": 0.07145297, "balance_loss_mlp": 0.01648704, "epoch": 0.028498421764617466, "flos": 16915952517120.0, "grad_norm": 15.877107183891189, "language_loss": 1.02892244, "learning_rate": 3.96690678709433e-06, "loss": 1.13228881, "num_input_tokens_seen": 10041025, "router_z_loss_clip": 11.375, "router_z_loss_mlp": 4.04296875, "step": 474, "time_per_iteration": 2.7547295093536377 }, { "auxiliary_loss_clip": 0.08251248, "auxiliary_loss_mlp": 0.02039377, "balance_loss_clip": 0.07128356, "balance_loss_mlp": 0.01649324, "epoch": 0.028558545017285435, "flos": 27785524337280.0, "grad_norm": 21.628822330205896, "language_loss": 0.90228081, "learning_rate": 3.968263694200355e-06, "loss": 1.00518703, "num_input_tokens_seen": 10060775, "router_z_loss_clip": 11.2265625, "router_z_loss_mlp": 3.8984375, "step": 475, "time_per_iteration": 2.7934505939483643 }, { "auxiliary_loss_clip": 0.07393242, "auxiliary_loss_mlp": 0.01473408, "balance_loss_clip": 0.06863622, "balance_loss_mlp": 0.01379996, "epoch": 0.028618668269953403, "flos": 65674205596800.0, "grad_norm": 0.9868938911093398, "language_loss": 0.66927224, "learning_rate": 3.969617747661569e-06, "loss": 0.75793874, "num_input_tokens_seen": 10120225, "router_z_loss_clip": 5.30078125, "router_z_loss_mlp": 0.93359375, "step": 476, "time_per_iteration": 3.3112971782684326 }, { "auxiliary_loss_clip": 0.08203082, "auxiliary_loss_mlp": 0.02020787, "balance_loss_clip": 0.07113506, "balance_loss_mlp": 0.01650189, "epoch": 0.028678791522621375, "flos": 21942269936640.0, "grad_norm": 9.094268410017984, "language_loss": 0.97502005, "learning_rate": 3.970968959455509e-06, "loss": 1.07725883, "num_input_tokens_seen": 10137880, "router_z_loss_clip": 10.890625, "router_z_loss_mlp": 3.70507812, "step": 477, "time_per_iteration": 2.7825121879577637 }, { "auxiliary_loss_clip": 0.08196925, "auxiliary_loss_mlp": 0.01986181, "balance_loss_clip": 0.07117103, "balance_loss_mlp": 0.01617491, "epoch": 0.028738914775289344, "flos": 24579115672320.0, "grad_norm": 65.78408322824754, "language_loss": 0.94668734, "learning_rate": 3.97231734148446e-06, "loss": 1.04851842, "num_input_tokens_seen": 10156930, "router_z_loss_clip": 10.8046875, "router_z_loss_mlp": 3.68359375, "step": 478, "time_per_iteration": 2.762369155883789 }, { "auxiliary_loss_clip": 0.08192317, "auxiliary_loss_mlp": 0.01965098, "balance_loss_clip": 0.07080325, "balance_loss_mlp": 0.01593928, "epoch": 0.028799038027957313, "flos": 23264633946240.0, "grad_norm": 14.019385148576825, "language_loss": 0.92471093, "learning_rate": 3.973662905576082e-06, "loss": 1.02628505, "num_input_tokens_seen": 10176295, "router_z_loss_clip": 11.1171875, "router_z_loss_mlp": 3.71289062, "step": 479, "time_per_iteration": 2.715630531311035 }, { "auxiliary_loss_clip": 0.08169413, "auxiliary_loss_mlp": 0.01915991, "balance_loss_clip": 0.07076322, "balance_loss_mlp": 0.0156771, "epoch": 0.02885916128062528, "flos": 22170692966400.0, "grad_norm": 36.18918281897142, "language_loss": 0.82578695, "learning_rate": 3.975005663484038e-06, "loss": 0.92664105, "num_input_tokens_seen": 10195790, "router_z_loss_clip": 10.9375, "router_z_loss_mlp": 3.48046875, "step": 480, "time_per_iteration": 2.703430652618408 }, { "auxiliary_loss_clip": 0.08179657, "auxiliary_loss_mlp": 0.01937273, "balance_loss_clip": 0.07069217, "balance_loss_mlp": 0.0157831, "epoch": 0.02891928453329325, "flos": 22939986101760.0, "grad_norm": 13.569637391673249, "language_loss": 0.94778454, "learning_rate": 3.976345626888605e-06, "loss": 1.04895389, "num_input_tokens_seen": 10218405, "router_z_loss_clip": 11.109375, "router_z_loss_mlp": 3.58789062, "step": 481, "time_per_iteration": 2.7663185596466064 }, { "auxiliary_loss_clip": 0.07342237, "auxiliary_loss_mlp": 0.01380654, "balance_loss_clip": 0.06830175, "balance_loss_mlp": 0.01297207, "epoch": 0.028979407785961222, "flos": 57449376524160.0, "grad_norm": 0.8784729547470895, "language_loss": 0.66100943, "learning_rate": 3.9776828073972864e-06, "loss": 0.74823833, "num_input_tokens_seen": 10271005, "router_z_loss_clip": 5.125, "router_z_loss_mlp": 0.83496094, "step": 482, "time_per_iteration": 3.12870454788208 }, { "auxiliary_loss_clip": 0.08160948, "auxiliary_loss_mlp": 0.01836414, "balance_loss_clip": 0.07062453, "balance_loss_mlp": 0.01492328, "epoch": 0.02903953103862919, "flos": 16727584538880.0, "grad_norm": 41.267354478183144, "language_loss": 0.93697101, "learning_rate": 3.979017216545415e-06, "loss": 1.03694463, "num_input_tokens_seen": 10288405, "router_z_loss_clip": 10.96875, "router_z_loss_mlp": 3.43945312, "step": 483, "time_per_iteration": 2.6712028980255127 }, { "auxiliary_loss_clip": 0.0817885, "auxiliary_loss_mlp": 0.01844926, "balance_loss_clip": 0.07065724, "balance_loss_mlp": 0.01496644, "epoch": 0.02909965429129716, "flos": 16769232817920.0, "grad_norm": 1444.6354497277678, "language_loss": 0.87388277, "learning_rate": 3.980348865796749e-06, "loss": 0.9741205, "num_input_tokens_seen": 10306875, "router_z_loss_clip": 11.125, "router_z_loss_mlp": 3.48242188, "step": 484, "time_per_iteration": 2.6866753101348877 }, { "auxiliary_loss_clip": 0.08148923, "auxiliary_loss_mlp": 0.01846368, "balance_loss_clip": 0.07049301, "balance_loss_mlp": 0.0149904, "epoch": 0.029159777543965128, "flos": 19790334178560.0, "grad_norm": 12.324139401341426, "language_loss": 0.93142998, "learning_rate": 3.9816777665440615e-06, "loss": 1.03138292, "num_input_tokens_seen": 10323965, "router_z_loss_clip": 11.0078125, "router_z_loss_mlp": 3.47265625, "step": 485, "time_per_iteration": 2.692122459411621 }, { "auxiliary_loss_clip": 0.08162046, "auxiliary_loss_mlp": 0.01850293, "balance_loss_clip": 0.07045294, "balance_loss_mlp": 0.0150697, "epoch": 0.029219900796633096, "flos": 19648184526720.0, "grad_norm": 10.792474005901179, "language_loss": 0.96805763, "learning_rate": 3.983003930109732e-06, "loss": 1.06818104, "num_input_tokens_seen": 10342620, "router_z_loss_clip": 11.1796875, "router_z_loss_mlp": 3.43359375, "step": 486, "time_per_iteration": 2.7626545429229736 }, { "auxiliary_loss_clip": 0.08157975, "auxiliary_loss_mlp": 0.01836506, "balance_loss_clip": 0.07071027, "balance_loss_mlp": 0.01496044, "epoch": 0.02928002404930107, "flos": 25892926565760.0, "grad_norm": 11.19842672784126, "language_loss": 0.96472645, "learning_rate": 3.984327367746315e-06, "loss": 1.06467128, "num_input_tokens_seen": 10364610, "router_z_loss_clip": 10.875, "router_z_loss_mlp": 3.40625, "step": 487, "time_per_iteration": 2.729121208190918 }, { "auxiliary_loss_clip": 0.08143252, "auxiliary_loss_mlp": 0.01806545, "balance_loss_clip": 0.07066137, "balance_loss_mlp": 0.01496983, "epoch": 0.029340147301969037, "flos": 20665243785600.0, "grad_norm": 28.5028071043778, "language_loss": 1.00307643, "learning_rate": 3.985648090637122e-06, "loss": 1.10257447, "num_input_tokens_seen": 10380910, "router_z_loss_clip": 10.765625, "router_z_loss_mlp": 3.09375, "step": 488, "time_per_iteration": 2.709808588027954 }, { "auxiliary_loss_clip": 0.08181503, "auxiliary_loss_mlp": 0.01812769, "balance_loss_clip": 0.07098946, "balance_loss_mlp": 0.01492335, "epoch": 0.029400270554637006, "flos": 24435288938880.0, "grad_norm": 17.550446592484736, "language_loss": 0.95168841, "learning_rate": 3.986966109896785e-06, "loss": 1.05163121, "num_input_tokens_seen": 10400665, "router_z_loss_clip": 10.8359375, "router_z_loss_mlp": 3.20507812, "step": 489, "time_per_iteration": 2.760267972946167 }, { "auxiliary_loss_clip": 0.08159917, "auxiliary_loss_mlp": 0.01806183, "balance_loss_clip": 0.07084038, "balance_loss_mlp": 0.01510162, "epoch": 0.029460393807304974, "flos": 20127140864640.0, "grad_norm": 10.127478714299635, "language_loss": 0.95046794, "learning_rate": 3.988281436571815e-06, "loss": 1.05012894, "num_input_tokens_seen": 10420150, "router_z_loss_clip": 10.765625, "router_z_loss_mlp": 2.96289062, "step": 490, "time_per_iteration": 2.708706855773926 }, { "auxiliary_loss_clip": 0.08179517, "auxiliary_loss_mlp": 0.01817266, "balance_loss_clip": 0.07097362, "balance_loss_mlp": 0.01510755, "epoch": 0.029520517059972943, "flos": 17681681854080.0, "grad_norm": 13.732893133950075, "language_loss": 1.01595044, "learning_rate": 3.989594081641164e-06, "loss": 1.11591816, "num_input_tokens_seen": 10438210, "router_z_loss_clip": 10.8203125, "router_z_loss_mlp": 3.06445312, "step": 491, "time_per_iteration": 4.143407344818115 }, { "auxiliary_loss_clip": 0.08152616, "auxiliary_loss_mlp": 0.01797546, "balance_loss_clip": 0.07091749, "balance_loss_mlp": 0.01514877, "epoch": 0.029580640312640915, "flos": 18959211129600.0, "grad_norm": 8.134999259516295, "language_loss": 0.91352236, "learning_rate": 3.9909040560167675e-06, "loss": 1.01302409, "num_input_tokens_seen": 10455125, "router_z_loss_clip": 10.6171875, "router_z_loss_mlp": 2.828125, "step": 492, "time_per_iteration": 4.174432754516602 }, { "auxiliary_loss_clip": 0.08114009, "auxiliary_loss_mlp": 0.01726811, "balance_loss_clip": 0.07082409, "balance_loss_mlp": 0.01455968, "epoch": 0.029640763565308884, "flos": 18730746172800.0, "grad_norm": 17.04026989931895, "language_loss": 0.95503974, "learning_rate": 3.992211370544093e-06, "loss": 1.05344796, "num_input_tokens_seen": 10470990, "router_z_loss_clip": 10.3125, "router_z_loss_mlp": 2.7109375, "step": 493, "time_per_iteration": 6.304533243179321 }, { "auxiliary_loss_clip": 0.08150046, "auxiliary_loss_mlp": 0.01703127, "balance_loss_clip": 0.07107189, "balance_loss_mlp": 0.0145727, "epoch": 0.029700886817976852, "flos": 20601652936320.0, "grad_norm": 8.02162697754432, "language_loss": 0.95138353, "learning_rate": 3.99351603600268e-06, "loss": 1.04991531, "num_input_tokens_seen": 10490685, "router_z_loss_clip": 10.4375, "router_z_loss_mlp": 2.45898438, "step": 494, "time_per_iteration": 2.7332475185394287 }, { "auxiliary_loss_clip": 0.08108614, "auxiliary_loss_mlp": 0.01662632, "balance_loss_clip": 0.07090655, "balance_loss_mlp": 0.01421734, "epoch": 0.02976101007064482, "flos": 22243423910400.0, "grad_norm": 7.144517662419635, "language_loss": 0.94971097, "learning_rate": 3.994818063106668e-06, "loss": 1.0474236, "num_input_tokens_seen": 10509435, "router_z_loss_clip": 10.1875, "router_z_loss_mlp": 2.40820312, "step": 495, "time_per_iteration": 2.7615957260131836 }, { "auxiliary_loss_clip": 0.08069725, "auxiliary_loss_mlp": 0.01626448, "balance_loss_clip": 0.07077596, "balance_loss_mlp": 0.01410632, "epoch": 0.029821133323312793, "flos": 23739439507200.0, "grad_norm": 39.354529915542365, "language_loss": 0.70709372, "learning_rate": 3.99611746250533e-06, "loss": 0.80405545, "num_input_tokens_seen": 10530050, "router_z_loss_clip": 9.9140625, "router_z_loss_mlp": 2.16015625, "step": 496, "time_per_iteration": 2.814415693283081 }, { "auxiliary_loss_clip": 0.08085234, "auxiliary_loss_mlp": 0.01583766, "balance_loss_clip": 0.07100055, "balance_loss_mlp": 0.01376819, "epoch": 0.02988125657598076, "flos": 22426131738240.0, "grad_norm": 33.85441341898987, "language_loss": 0.95965928, "learning_rate": 3.997414244783595e-06, "loss": 1.05634928, "num_input_tokens_seen": 10551370, "router_z_loss_clip": 9.84375, "router_z_loss_mlp": 2.0703125, "step": 497, "time_per_iteration": 2.769326686859131 }, { "auxiliary_loss_clip": 0.0810876, "auxiliary_loss_mlp": 0.01568837, "balance_loss_clip": 0.07127016, "balance_loss_mlp": 0.01363511, "epoch": 0.02994137982864873, "flos": 13850267984640.0, "grad_norm": 27.97736330132271, "language_loss": 0.97655296, "learning_rate": 3.998708420462557e-06, "loss": 1.07332885, "num_input_tokens_seen": 10569225, "router_z_loss_clip": 9.8125, "router_z_loss_mlp": 2.0546875, "step": 498, "time_per_iteration": 2.6626296043395996 }, { "auxiliary_loss_clip": 0.08098714, "auxiliary_loss_mlp": 0.01577948, "balance_loss_clip": 0.07097377, "balance_loss_mlp": 0.01370524, "epoch": 0.0300015030813167, "flos": 23914055416320.0, "grad_norm": 13.81254132720896, "language_loss": 0.92231667, "learning_rate": 4e-06, "loss": 1.01908326, "num_input_tokens_seen": 10586170, "router_z_loss_clip": 10.015625, "router_z_loss_mlp": 2.07617188, "step": 499, "time_per_iteration": 2.7434065341949463 }, { "auxiliary_loss_clip": 0.08117321, "auxiliary_loss_mlp": 0.01585597, "balance_loss_clip": 0.07110468, "balance_loss_mlp": 0.01381701, "epoch": 0.030061626333984667, "flos": 22023134726400.0, "grad_norm": 8.163487530617427, "language_loss": 0.89554548, "learning_rate": 3.9999999620799e-06, "loss": 0.99257469, "num_input_tokens_seen": 10606205, "router_z_loss_clip": 10.0703125, "router_z_loss_mlp": 2.0390625, "step": 500, "time_per_iteration": 2.771660327911377 }, { "auxiliary_loss_clip": 0.08112106, "auxiliary_loss_mlp": 0.01635535, "balance_loss_clip": 0.07099217, "balance_loss_mlp": 0.0139025, "epoch": 0.03012174958665264, "flos": 23046483041280.0, "grad_norm": 12.784965014350787, "language_loss": 0.99664813, "learning_rate": 3.9999998483196e-06, "loss": 1.09412456, "num_input_tokens_seen": 10625995, "router_z_loss_clip": 10.1328125, "router_z_loss_mlp": 2.45117188, "step": 501, "time_per_iteration": 2.731306314468384 }, { "auxiliary_loss_clip": 0.08181497, "auxiliary_loss_mlp": 0.01641896, "balance_loss_clip": 0.07149707, "balance_loss_mlp": 0.01403287, "epoch": 0.030181872839320608, "flos": 18959294983680.0, "grad_norm": 14.991636074292822, "language_loss": 0.955791, "learning_rate": 3.9999996587191065e-06, "loss": 1.05402493, "num_input_tokens_seen": 10644105, "router_z_loss_clip": 10.3359375, "router_z_loss_mlp": 2.38476562, "step": 502, "time_per_iteration": 2.7441954612731934 }, { "auxiliary_loss_clip": 0.08159538, "auxiliary_loss_mlp": 0.0165419, "balance_loss_clip": 0.07137958, "balance_loss_mlp": 0.01421302, "epoch": 0.030241996091988577, "flos": 16733747813760.0, "grad_norm": 7.640221347770445, "language_loss": 0.92283404, "learning_rate": 3.999999393278425e-06, "loss": 1.0209713, "num_input_tokens_seen": 10661090, "router_z_loss_clip": 10.2109375, "router_z_loss_mlp": 2.33007812, "step": 503, "time_per_iteration": 2.7242472171783447 }, { "auxiliary_loss_clip": 0.08168961, "auxiliary_loss_mlp": 0.01646633, "balance_loss_clip": 0.07144021, "balance_loss_mlp": 0.01422138, "epoch": 0.030302119344656545, "flos": 28628806227840.0, "grad_norm": 14.245820665989804, "language_loss": 0.94205093, "learning_rate": 3.999999051997567e-06, "loss": 1.04020691, "num_input_tokens_seen": 10682380, "router_z_loss_clip": 10.25, "router_z_loss_mlp": 2.24804688, "step": 504, "time_per_iteration": 2.786360025405884 }, { "auxiliary_loss_clip": 0.0823727, "auxiliary_loss_mlp": 0.01668091, "balance_loss_clip": 0.0716431, "balance_loss_mlp": 0.01430054, "epoch": 0.030362242597324514, "flos": 15674788713600.0, "grad_norm": 7.4855706282670536, "language_loss": 0.85064822, "learning_rate": 3.9999986348765425e-06, "loss": 0.94970185, "num_input_tokens_seen": 10699925, "router_z_loss_clip": 10.7421875, "router_z_loss_mlp": 2.37890625, "step": 505, "time_per_iteration": 2.725151777267456 }, { "auxiliary_loss_clip": 0.07403059, "auxiliary_loss_mlp": 0.01367935, "balance_loss_clip": 0.06898427, "balance_loss_mlp": 0.01290593, "epoch": 0.030422365849992486, "flos": 72149173528320.0, "grad_norm": 0.9604872267066594, "language_loss": 0.55508077, "learning_rate": 3.999998141915371e-06, "loss": 0.64279068, "num_input_tokens_seen": 10766525, "router_z_loss_clip": 5.0625, "router_z_loss_mlp": 0.77246094, "step": 506, "time_per_iteration": 3.5226175785064697 }, { "auxiliary_loss_clip": 0.08278921, "auxiliary_loss_mlp": 0.01721787, "balance_loss_clip": 0.07167426, "balance_loss_mlp": 0.01475549, "epoch": 0.030482489102660455, "flos": 19433974763520.0, "grad_norm": 5.898797843897536, "language_loss": 0.8822906, "learning_rate": 3.999997573114069e-06, "loss": 0.98229772, "num_input_tokens_seen": 10786725, "router_z_loss_clip": 11.125, "router_z_loss_mlp": 2.4609375, "step": 507, "time_per_iteration": 2.7399346828460693 }, { "auxiliary_loss_clip": 0.08250935, "auxiliary_loss_mlp": 0.01721918, "balance_loss_clip": 0.07170868, "balance_loss_mlp": 0.01459466, "epoch": 0.030542612355328423, "flos": 20382034584960.0, "grad_norm": 8.041474661100672, "language_loss": 0.96697211, "learning_rate": 3.999996928472659e-06, "loss": 1.0667007, "num_input_tokens_seen": 10805390, "router_z_loss_clip": 10.8125, "router_z_loss_mlp": 2.625, "step": 508, "time_per_iteration": 2.7214508056640625 }, { "auxiliary_loss_clip": 0.0828799, "auxiliary_loss_mlp": 0.0172797, "balance_loss_clip": 0.07190797, "balance_loss_mlp": 0.01460368, "epoch": 0.030602735607996392, "flos": 34685809194240.0, "grad_norm": 56.44043527093789, "language_loss": 0.7755487, "learning_rate": 3.999996207991165e-06, "loss": 0.87570834, "num_input_tokens_seen": 10828030, "router_z_loss_clip": 10.9765625, "router_z_loss_mlp": 2.67382812, "step": 509, "time_per_iteration": 2.8615329265594482 }, { "auxiliary_loss_clip": 0.08307496, "auxiliary_loss_mlp": 0.01741815, "balance_loss_clip": 0.07206818, "balance_loss_mlp": 0.01490235, "epoch": 0.03066285886066436, "flos": 23665283043840.0, "grad_norm": 6.914584988633158, "language_loss": 0.87427461, "learning_rate": 3.999995411669614e-06, "loss": 0.97476774, "num_input_tokens_seen": 10845240, "router_z_loss_clip": 11.0078125, "router_z_loss_mlp": 2.515625, "step": 510, "time_per_iteration": 2.739849328994751 }, { "auxiliary_loss_clip": 0.0830367, "auxiliary_loss_mlp": 0.01740919, "balance_loss_clip": 0.0721366, "balance_loss_mlp": 0.01483046, "epoch": 0.030722982113332332, "flos": 23009656371840.0, "grad_norm": 9.02873061780712, "language_loss": 0.91045892, "learning_rate": 3.999994539508036e-06, "loss": 1.01090479, "num_input_tokens_seen": 10864325, "router_z_loss_clip": 10.90625, "router_z_loss_mlp": 2.578125, "step": 511, "time_per_iteration": 2.7374744415283203 }, { "auxiliary_loss_clip": 0.08278815, "auxiliary_loss_mlp": 0.01767161, "balance_loss_clip": 0.07181448, "balance_loss_mlp": 0.01508715, "epoch": 0.0307831053660003, "flos": 24757253452800.0, "grad_norm": 5.306236068466569, "language_loss": 0.89211214, "learning_rate": 3.9999935915064655e-06, "loss": 0.99257189, "num_input_tokens_seen": 10883860, "router_z_loss_clip": 10.984375, "router_z_loss_mlp": 2.58398438, "step": 512, "time_per_iteration": 2.8061001300811768 }, { "auxiliary_loss_clip": 0.08323276, "auxiliary_loss_mlp": 0.01778541, "balance_loss_clip": 0.07207321, "balance_loss_mlp": 0.01498351, "epoch": 0.03084322861866827, "flos": 26148113775360.0, "grad_norm": 14.13873925969336, "language_loss": 0.93603063, "learning_rate": 3.9999925676649374e-06, "loss": 1.03704882, "num_input_tokens_seen": 10904555, "router_z_loss_clip": 11.171875, "router_z_loss_mlp": 2.8046875, "step": 513, "time_per_iteration": 2.7962169647216797 }, { "auxiliary_loss_clip": 0.08313082, "auxiliary_loss_mlp": 0.01755267, "balance_loss_clip": 0.07207762, "balance_loss_mlp": 0.0148614, "epoch": 0.03090335187133624, "flos": 18777383769600.0, "grad_norm": 25.380649331347275, "language_loss": 0.85998952, "learning_rate": 3.999991467983491e-06, "loss": 0.96067297, "num_input_tokens_seen": 10923700, "router_z_loss_clip": 11.0546875, "router_z_loss_mlp": 2.69140625, "step": 514, "time_per_iteration": 2.73396635055542 }, { "auxiliary_loss_clip": 0.08291554, "auxiliary_loss_mlp": 0.01735348, "balance_loss_clip": 0.0719616, "balance_loss_mlp": 0.01476902, "epoch": 0.030963475124004207, "flos": 23228603890560.0, "grad_norm": 13.780966720284345, "language_loss": 0.85589135, "learning_rate": 3.999990292462167e-06, "loss": 0.95616031, "num_input_tokens_seen": 10942730, "router_z_loss_clip": 10.9453125, "router_z_loss_mlp": 2.58398438, "step": 515, "time_per_iteration": 2.756216287612915 }, { "auxiliary_loss_clip": 0.08270891, "auxiliary_loss_mlp": 0.01721657, "balance_loss_clip": 0.0718703, "balance_loss_mlp": 0.01466072, "epoch": 0.03102359837667218, "flos": 42535998662400.0, "grad_norm": 22.418791940739503, "language_loss": 0.90623331, "learning_rate": 3.999989041101011e-06, "loss": 1.00615883, "num_input_tokens_seen": 10967120, "router_z_loss_clip": 10.8359375, "router_z_loss_mlp": 2.55664062, "step": 516, "time_per_iteration": 2.950775384902954 }, { "auxiliary_loss_clip": 0.08241457, "auxiliary_loss_mlp": 0.01705198, "balance_loss_clip": 0.07162091, "balance_loss_mlp": 0.01454763, "epoch": 0.031083721629340148, "flos": 21183039290880.0, "grad_norm": 56.49460524221073, "language_loss": 0.84937477, "learning_rate": 3.999987713900071e-06, "loss": 0.94884133, "num_input_tokens_seen": 10986775, "router_z_loss_clip": 10.796875, "router_z_loss_mlp": 2.50585938, "step": 517, "time_per_iteration": 2.7979519367218018 }, { "auxiliary_loss_clip": 0.08247378, "auxiliary_loss_mlp": 0.01706756, "balance_loss_clip": 0.07180919, "balance_loss_mlp": 0.01469863, "epoch": 0.031143844882008116, "flos": 29723963091840.0, "grad_norm": 10.944638698690998, "language_loss": 0.94907433, "learning_rate": 3.999986310859396e-06, "loss": 1.04861569, "num_input_tokens_seen": 11011360, "router_z_loss_clip": 10.65625, "router_z_loss_mlp": 2.3671875, "step": 518, "time_per_iteration": 2.8410794734954834 }, { "auxiliary_loss_clip": 0.08258124, "auxiliary_loss_mlp": 0.0170597, "balance_loss_clip": 0.07176487, "balance_loss_mlp": 0.01477088, "epoch": 0.031203968134676085, "flos": 23119172058240.0, "grad_norm": 21.133576944641646, "language_loss": 0.93873817, "learning_rate": 3.999984831979039e-06, "loss": 1.03837919, "num_input_tokens_seen": 11030150, "router_z_loss_clip": 10.8203125, "router_z_loss_mlp": 2.29101562, "step": 519, "time_per_iteration": 2.7405221462249756 }, { "auxiliary_loss_clip": 0.08283552, "auxiliary_loss_mlp": 0.01701834, "balance_loss_clip": 0.07177985, "balance_loss_mlp": 0.01454642, "epoch": 0.03126409138734405, "flos": 20959815214080.0, "grad_norm": 4.667019392963239, "language_loss": 0.92549896, "learning_rate": 3.999983277259057e-06, "loss": 1.02535284, "num_input_tokens_seen": 11049145, "router_z_loss_clip": 11.0625, "router_z_loss_mlp": 2.47265625, "step": 520, "time_per_iteration": 2.8051681518554688 }, { "auxiliary_loss_clip": 0.08241201, "auxiliary_loss_mlp": 0.0169714, "balance_loss_clip": 0.07158608, "balance_loss_mlp": 0.01451664, "epoch": 0.031324214640012026, "flos": 21656083916160.0, "grad_norm": 7.503148818605688, "language_loss": 0.93824017, "learning_rate": 3.999981646699509e-06, "loss": 1.03762364, "num_input_tokens_seen": 11068835, "router_z_loss_clip": 10.828125, "router_z_loss_mlp": 2.45507812, "step": 521, "time_per_iteration": 2.7821803092956543 }, { "auxiliary_loss_clip": 0.08185411, "auxiliary_loss_mlp": 0.01671739, "balance_loss_clip": 0.07138726, "balance_loss_mlp": 0.01439519, "epoch": 0.03138433789267999, "flos": 23448180314880.0, "grad_norm": 9.42738992368903, "language_loss": 0.76085907, "learning_rate": 3.999979940300456e-06, "loss": 0.85943055, "num_input_tokens_seen": 11088980, "router_z_loss_clip": 10.4609375, "router_z_loss_mlp": 2.32226562, "step": 522, "time_per_iteration": 2.8791234493255615 }, { "auxiliary_loss_clip": 0.08193403, "auxiliary_loss_mlp": 0.01658961, "balance_loss_clip": 0.07134453, "balance_loss_mlp": 0.01433703, "epoch": 0.03144446114534796, "flos": 18986939631360.0, "grad_norm": 10.568535061941095, "language_loss": 0.93453163, "learning_rate": 3.999978158061963e-06, "loss": 1.03305531, "num_input_tokens_seen": 11104300, "router_z_loss_clip": 10.6015625, "router_z_loss_mlp": 2.25390625, "step": 523, "time_per_iteration": 2.8870973587036133 }, { "auxiliary_loss_clip": 0.0818169, "auxiliary_loss_mlp": 0.01648659, "balance_loss_clip": 0.07121208, "balance_loss_mlp": 0.01416916, "epoch": 0.031504584398015935, "flos": 22644240716160.0, "grad_norm": 7.121161087519675, "language_loss": 0.97676986, "learning_rate": 3.999976299984099e-06, "loss": 1.07507336, "num_input_tokens_seen": 11123335, "router_z_loss_clip": 10.609375, "router_z_loss_mlp": 2.31445312, "step": 524, "time_per_iteration": 2.7664198875427246 }, { "auxiliary_loss_clip": 0.08161123, "auxiliary_loss_mlp": 0.01629467, "balance_loss_clip": 0.07103601, "balance_loss_mlp": 0.01411552, "epoch": 0.0315647076506839, "flos": 25303364438400.0, "grad_norm": 17.076349674497056, "language_loss": 0.87808192, "learning_rate": 3.999974366066933e-06, "loss": 0.97598785, "num_input_tokens_seen": 11140880, "router_z_loss_clip": 10.5703125, "router_z_loss_mlp": 2.17871094, "step": 525, "time_per_iteration": 2.751093626022339 }, { "auxiliary_loss_clip": 0.08129942, "auxiliary_loss_mlp": 0.01599941, "balance_loss_clip": 0.0709652, "balance_loss_mlp": 0.01387844, "epoch": 0.03162483090335187, "flos": 16988515752960.0, "grad_norm": 7.603781292513342, "language_loss": 0.8669728, "learning_rate": 3.999972356310538e-06, "loss": 0.96427166, "num_input_tokens_seen": 11158710, "router_z_loss_clip": 10.3359375, "router_z_loss_mlp": 2.12207031, "step": 526, "time_per_iteration": 2.709425687789917 }, { "auxiliary_loss_clip": 0.08178101, "auxiliary_loss_mlp": 0.01611592, "balance_loss_clip": 0.07108223, "balance_loss_mlp": 0.01401879, "epoch": 0.03168495415601984, "flos": 18740515173120.0, "grad_norm": 18.2725053416285, "language_loss": 0.88442177, "learning_rate": 3.999970270714991e-06, "loss": 0.9823187, "num_input_tokens_seen": 11177550, "router_z_loss_clip": 10.7109375, "router_z_loss_mlp": 2.09863281, "step": 527, "time_per_iteration": 2.7390432357788086 }, { "auxiliary_loss_clip": 0.08113208, "auxiliary_loss_mlp": 0.01597729, "balance_loss_clip": 0.07090626, "balance_loss_mlp": 0.01388398, "epoch": 0.03174507740868781, "flos": 21221207625600.0, "grad_norm": 9.249883209862883, "language_loss": 1.0081563, "learning_rate": 3.999968109280371e-06, "loss": 1.10526586, "num_input_tokens_seen": 11196230, "router_z_loss_clip": 10.2109375, "router_z_loss_mlp": 2.09472656, "step": 528, "time_per_iteration": 2.691324234008789 }, { "auxiliary_loss_clip": 0.08098789, "auxiliary_loss_mlp": 0.01595756, "balance_loss_clip": 0.07070777, "balance_loss_mlp": 0.0139949, "epoch": 0.03180520066135578, "flos": 24794122049280.0, "grad_norm": 5.802627265768085, "language_loss": 0.88882554, "learning_rate": 3.99996587200676e-06, "loss": 0.985771, "num_input_tokens_seen": 11214935, "router_z_loss_clip": 10.2890625, "router_z_loss_mlp": 1.96582031, "step": 529, "time_per_iteration": 2.7876157760620117 }, { "auxiliary_loss_clip": 0.0803078, "auxiliary_loss_mlp": 0.01589954, "balance_loss_clip": 0.070584, "balance_loss_mlp": 0.01406086, "epoch": 0.03186532391402375, "flos": 24871339186560.0, "grad_norm": 7.474442221850896, "language_loss": 0.95449364, "learning_rate": 3.999963558894243e-06, "loss": 1.05070102, "num_input_tokens_seen": 11235310, "router_z_loss_clip": 9.7265625, "router_z_loss_mlp": 1.83789062, "step": 530, "time_per_iteration": 2.7428765296936035 }, { "auxiliary_loss_clip": 0.08057764, "auxiliary_loss_mlp": 0.01574994, "balance_loss_clip": 0.0706311, "balance_loss_mlp": 0.01393224, "epoch": 0.03192544716669172, "flos": 21221417260800.0, "grad_norm": 5.796559213049824, "language_loss": 0.82788658, "learning_rate": 3.999961169942907e-06, "loss": 0.92421424, "num_input_tokens_seen": 11254425, "router_z_loss_clip": 9.9609375, "router_z_loss_mlp": 1.8203125, "step": 531, "time_per_iteration": 4.172942876815796 }, { "auxiliary_loss_clip": 0.08036035, "auxiliary_loss_mlp": 0.01584894, "balance_loss_clip": 0.07060225, "balance_loss_mlp": 0.01423342, "epoch": 0.03198557041935969, "flos": 24360168153600.0, "grad_norm": 9.325635517545745, "language_loss": 0.97642684, "learning_rate": 3.999958705152843e-06, "loss": 1.07263613, "num_input_tokens_seen": 11274595, "router_z_loss_clip": 9.7578125, "router_z_loss_mlp": 1.61621094, "step": 532, "time_per_iteration": 5.759657382965088 }, { "auxiliary_loss_clip": 0.07256257, "auxiliary_loss_mlp": 0.01497298, "balance_loss_clip": 0.0676978, "balance_loss_mlp": 0.01416474, "epoch": 0.032045693672027656, "flos": 61847235993600.0, "grad_norm": 0.7439510894751772, "language_loss": 0.58053553, "learning_rate": 3.9999561645241445e-06, "loss": 0.66807103, "num_input_tokens_seen": 11336705, "router_z_loss_clip": 4.875, "router_z_loss_mlp": 0.80810547, "step": 533, "time_per_iteration": 5.703488826751709 }, { "auxiliary_loss_clip": 0.07979153, "auxiliary_loss_mlp": 0.01539427, "balance_loss_clip": 0.07043815, "balance_loss_mlp": 0.01401049, "epoch": 0.03210581692469563, "flos": 28408475116800.0, "grad_norm": 4.814958310431301, "language_loss": 0.92700481, "learning_rate": 3.999953548056907e-06, "loss": 1.02219057, "num_input_tokens_seen": 11356820, "router_z_loss_clip": 9.359375, "router_z_loss_mlp": 1.3828125, "step": 534, "time_per_iteration": 2.7976458072662354 }, { "auxiliary_loss_clip": 0.0798554, "auxiliary_loss_mlp": 0.01525176, "balance_loss_clip": 0.07026511, "balance_loss_mlp": 0.01385749, "epoch": 0.03216594017736359, "flos": 24724661414400.0, "grad_norm": 5.452349178548391, "language_loss": 0.85188043, "learning_rate": 3.999950855751232e-06, "loss": 0.94698757, "num_input_tokens_seen": 11376645, "router_z_loss_clip": 9.5859375, "router_z_loss_mlp": 1.39453125, "step": 535, "time_per_iteration": 2.7737655639648438 }, { "auxiliary_loss_clip": 0.07985443, "auxiliary_loss_mlp": 0.01560111, "balance_loss_clip": 0.07040181, "balance_loss_mlp": 0.01420779, "epoch": 0.032226063430031565, "flos": 31183445508480.0, "grad_norm": 5.945881877822285, "language_loss": 0.86379123, "learning_rate": 3.999948087607219e-06, "loss": 0.95924687, "num_input_tokens_seen": 11397310, "router_z_loss_clip": 9.4296875, "router_z_loss_mlp": 1.39355469, "step": 536, "time_per_iteration": 2.8431649208068848 }, { "auxiliary_loss_clip": 0.0796698, "auxiliary_loss_mlp": 0.01530287, "balance_loss_clip": 0.0702485, "balance_loss_mlp": 0.01387141, "epoch": 0.03228618668269954, "flos": 32206584188160.0, "grad_norm": 8.234420763473974, "language_loss": 0.76730812, "learning_rate": 3.999945243624975e-06, "loss": 0.86228079, "num_input_tokens_seen": 11418475, "router_z_loss_clip": 9.4296875, "router_z_loss_mlp": 1.43261719, "step": 537, "time_per_iteration": 2.858151435852051 }, { "auxiliary_loss_clip": 0.07960014, "auxiliary_loss_mlp": 0.01556563, "balance_loss_clip": 0.0702926, "balance_loss_mlp": 0.01424289, "epoch": 0.0323463099353675, "flos": 22676036140800.0, "grad_norm": 3.821780300268333, "language_loss": 0.89469683, "learning_rate": 3.999942323804607e-06, "loss": 0.98986256, "num_input_tokens_seen": 11436630, "router_z_loss_clip": 9.3125, "router_z_loss_mlp": 1.32324219, "step": 538, "time_per_iteration": 2.7075653076171875 }, { "auxiliary_loss_clip": 0.07987617, "auxiliary_loss_mlp": 0.01585312, "balance_loss_clip": 0.07031371, "balance_loss_mlp": 0.0144989, "epoch": 0.032406433188035474, "flos": 26912207957760.0, "grad_norm": 7.4462903705123304, "language_loss": 0.84769905, "learning_rate": 3.999939328146225e-06, "loss": 0.94342834, "num_input_tokens_seen": 11457275, "router_z_loss_clip": 9.5859375, "router_z_loss_mlp": 1.35546875, "step": 539, "time_per_iteration": 2.753883123397827 }, { "auxiliary_loss_clip": 0.07955803, "auxiliary_loss_mlp": 0.01568141, "balance_loss_clip": 0.07021256, "balance_loss_mlp": 0.01429858, "epoch": 0.03246655644070344, "flos": 31511992567680.0, "grad_norm": 8.046826004117069, "language_loss": 0.8342917, "learning_rate": 3.999936256649943e-06, "loss": 0.9295311, "num_input_tokens_seen": 11476925, "router_z_loss_clip": 9.3515625, "router_z_loss_mlp": 1.38183594, "step": 540, "time_per_iteration": 2.7699813842773438 }, { "auxiliary_loss_clip": 0.07974392, "auxiliary_loss_mlp": 0.01592068, "balance_loss_clip": 0.07032871, "balance_loss_mlp": 0.01449112, "epoch": 0.03252667969337141, "flos": 23224453113600.0, "grad_norm": 8.995894721047451, "language_loss": 0.93457413, "learning_rate": 3.999933109315878e-06, "loss": 1.03023863, "num_input_tokens_seen": 11496830, "router_z_loss_clip": 9.4140625, "router_z_loss_mlp": 1.42871094, "step": 541, "time_per_iteration": 2.71942138671875 }, { "auxiliary_loss_clip": 0.0793568, "auxiliary_loss_mlp": 0.01563942, "balance_loss_clip": 0.07013851, "balance_loss_mlp": 0.01416504, "epoch": 0.032586802946039384, "flos": 14762800874880.0, "grad_norm": 5.644825679700489, "language_loss": 0.9476245, "learning_rate": 3.9999298861441496e-06, "loss": 1.04262066, "num_input_tokens_seen": 11515605, "router_z_loss_clip": 9.203125, "router_z_loss_mlp": 1.47460938, "step": 542, "time_per_iteration": 2.6974472999572754 }, { "auxiliary_loss_clip": 0.07938398, "auxiliary_loss_mlp": 0.01578446, "balance_loss_clip": 0.07004233, "balance_loss_mlp": 0.01439973, "epoch": 0.03264692619870735, "flos": 24287688771840.0, "grad_norm": 12.39226083593915, "language_loss": 0.77156484, "learning_rate": 3.999926587134879e-06, "loss": 0.86673331, "num_input_tokens_seen": 11536230, "router_z_loss_clip": 9.3515625, "router_z_loss_mlp": 1.38476562, "step": 543, "time_per_iteration": 2.761361598968506 }, { "auxiliary_loss_clip": 0.07947888, "auxiliary_loss_mlp": 0.01559843, "balance_loss_clip": 0.07004046, "balance_loss_mlp": 0.01414408, "epoch": 0.03270704945137532, "flos": 22899763342080.0, "grad_norm": 14.091470913603855, "language_loss": 1.00389326, "learning_rate": 3.999923212288192e-06, "loss": 1.09897053, "num_input_tokens_seen": 11554715, "router_z_loss_clip": 9.4453125, "router_z_loss_mlp": 1.453125, "step": 544, "time_per_iteration": 2.8048763275146484 }, { "auxiliary_loss_clip": 0.07935792, "auxiliary_loss_mlp": 0.01563656, "balance_loss_clip": 0.06992707, "balance_loss_mlp": 0.01412308, "epoch": 0.032767172704043286, "flos": 18046887874560.0, "grad_norm": 9.192386884295905, "language_loss": 0.74527502, "learning_rate": 3.999919761604216e-06, "loss": 0.84026945, "num_input_tokens_seen": 11571370, "router_z_loss_clip": 9.4296875, "router_z_loss_mlp": 1.51367188, "step": 545, "time_per_iteration": 2.677480936050415 }, { "auxiliary_loss_clip": 0.07937724, "auxiliary_loss_mlp": 0.01549668, "balance_loss_clip": 0.07000428, "balance_loss_mlp": 0.01401562, "epoch": 0.03282729595671126, "flos": 22535353935360.0, "grad_norm": 8.186413512710459, "language_loss": 0.98691964, "learning_rate": 3.999916235083083e-06, "loss": 1.08179355, "num_input_tokens_seen": 11588560, "router_z_loss_clip": 9.390625, "router_z_loss_mlp": 1.48046875, "step": 546, "time_per_iteration": 2.7174832820892334 }, { "auxiliary_loss_clip": 0.07906722, "auxiliary_loss_mlp": 0.01561749, "balance_loss_clip": 0.06990965, "balance_loss_mlp": 0.01423276, "epoch": 0.03288741920937923, "flos": 20416555267200.0, "grad_norm": 5.810425617725829, "language_loss": 0.91560805, "learning_rate": 3.999912632724925e-06, "loss": 1.01029277, "num_input_tokens_seen": 11605685, "router_z_loss_clip": 9.1640625, "router_z_loss_mlp": 1.38378906, "step": 547, "time_per_iteration": 2.692147970199585 }, { "auxiliary_loss_clip": 0.0789326, "auxiliary_loss_mlp": 0.01514459, "balance_loss_clip": 0.06978238, "balance_loss_mlp": 0.0137446, "epoch": 0.032947542462047195, "flos": 20784402691200.0, "grad_norm": 12.606985782719034, "language_loss": 0.87186551, "learning_rate": 3.999908954529881e-06, "loss": 0.96594268, "num_input_tokens_seen": 11626290, "router_z_loss_clip": 9.1484375, "router_z_loss_mlp": 1.40039062, "step": 548, "time_per_iteration": 2.7189507484436035 }, { "auxiliary_loss_clip": 0.07915475, "auxiliary_loss_mlp": 0.01519542, "balance_loss_clip": 0.06981654, "balance_loss_mlp": 0.01388793, "epoch": 0.03300766571471517, "flos": 19907354805120.0, "grad_norm": 12.208599658848868, "language_loss": 0.77665782, "learning_rate": 3.999905200498087e-06, "loss": 0.87100798, "num_input_tokens_seen": 11643950, "router_z_loss_clip": 9.359375, "router_z_loss_mlp": 1.30761719, "step": 549, "time_per_iteration": 2.727060317993164 }, { "auxiliary_loss_clip": 0.07895661, "auxiliary_loss_mlp": 0.01473924, "balance_loss_clip": 0.06970972, "balance_loss_mlp": 0.01351949, "epoch": 0.03306778896738313, "flos": 17973569952000.0, "grad_norm": 8.378394567018125, "language_loss": 0.91695094, "learning_rate": 3.999901370629689e-06, "loss": 1.01064682, "num_input_tokens_seen": 11662560, "router_z_loss_clip": 9.2578125, "router_z_loss_mlp": 1.22070312, "step": 550, "time_per_iteration": 2.691377878189087 }, { "auxiliary_loss_clip": 0.07843034, "auxiliary_loss_mlp": 0.0147157, "balance_loss_clip": 0.06956752, "balance_loss_mlp": 0.01354364, "epoch": 0.033127912220051105, "flos": 21659899276800.0, "grad_norm": 6.646148110200237, "language_loss": 0.87097764, "learning_rate": 3.99989746492483e-06, "loss": 0.96412373, "num_input_tokens_seen": 11682265, "router_z_loss_clip": 8.8671875, "router_z_loss_mlp": 1.17089844, "step": 551, "time_per_iteration": 2.759091377258301 }, { "auxiliary_loss_clip": 0.07893354, "auxiliary_loss_mlp": 0.01481928, "balance_loss_clip": 0.069647, "balance_loss_mlp": 0.0135385, "epoch": 0.03318803547271908, "flos": 30195875687040.0, "grad_norm": 6.752696461363376, "language_loss": 0.93977439, "learning_rate": 3.999893483383658e-06, "loss": 1.03352726, "num_input_tokens_seen": 11699300, "router_z_loss_clip": 9.2890625, "router_z_loss_mlp": 1.28125, "step": 552, "time_per_iteration": 2.7856762409210205 }, { "auxiliary_loss_clip": 0.07855408, "auxiliary_loss_mlp": 0.01452826, "balance_loss_clip": 0.06947725, "balance_loss_mlp": 0.01332281, "epoch": 0.03324815872538704, "flos": 20382286147200.0, "grad_norm": 14.853579491603334, "language_loss": 0.99182695, "learning_rate": 3.999889426006326e-06, "loss": 1.0849092, "num_input_tokens_seen": 11716955, "router_z_loss_clip": 9.078125, "router_z_loss_mlp": 1.20507812, "step": 553, "time_per_iteration": 2.70064640045166 }, { "auxiliary_loss_clip": 0.07846013, "auxiliary_loss_mlp": 0.01470318, "balance_loss_clip": 0.0694893, "balance_loss_mlp": 0.01349964, "epoch": 0.033308281978055014, "flos": 24500766504960.0, "grad_norm": 9.119578889394878, "language_loss": 0.8474611, "learning_rate": 3.999885292792986e-06, "loss": 0.94062436, "num_input_tokens_seen": 11736130, "router_z_loss_clip": 8.96875, "router_z_loss_mlp": 1.203125, "step": 554, "time_per_iteration": 2.7942843437194824 }, { "auxiliary_loss_clip": 0.07852865, "auxiliary_loss_mlp": 0.01455819, "balance_loss_clip": 0.06958357, "balance_loss_mlp": 0.01342856, "epoch": 0.03336840523072298, "flos": 23406406254720.0, "grad_norm": 37.933481707839256, "language_loss": 0.87970865, "learning_rate": 3.999881083743795e-06, "loss": 0.97279555, "num_input_tokens_seen": 11754425, "router_z_loss_clip": 8.953125, "router_z_loss_mlp": 1.12939453, "step": 555, "time_per_iteration": 2.734922409057617 }, { "auxiliary_loss_clip": 0.07877155, "auxiliary_loss_mlp": 0.01478887, "balance_loss_clip": 0.06944031, "balance_loss_mlp": 0.01356054, "epoch": 0.03342852848339095, "flos": 30557685617280.0, "grad_norm": 5.1902431709741, "language_loss": 0.96334755, "learning_rate": 3.999876798858914e-06, "loss": 1.05690789, "num_input_tokens_seen": 11772845, "router_z_loss_clip": 9.3359375, "router_z_loss_mlp": 1.22949219, "step": 556, "time_per_iteration": 2.7987170219421387 }, { "auxiliary_loss_clip": 0.0785365, "auxiliary_loss_mlp": 0.01450941, "balance_loss_clip": 0.06953613, "balance_loss_mlp": 0.01332399, "epoch": 0.03348865173605892, "flos": 22899931050240.0, "grad_norm": 552.0413908178386, "language_loss": 0.91523594, "learning_rate": 3.999872438138503e-06, "loss": 1.00828171, "num_input_tokens_seen": 11792850, "router_z_loss_clip": 9.0078125, "router_z_loss_mlp": 1.18652344, "step": 557, "time_per_iteration": 2.713655471801758 }, { "auxiliary_loss_clip": 0.0788148, "auxiliary_loss_mlp": 0.01479643, "balance_loss_clip": 0.06944586, "balance_loss_mlp": 0.01356142, "epoch": 0.03354877498872689, "flos": 17681807635200.0, "grad_norm": 6.945068786000008, "language_loss": 1.02668595, "learning_rate": 3.999868001582729e-06, "loss": 1.12029719, "num_input_tokens_seen": 11809670, "router_z_loss_clip": 9.375, "router_z_loss_mlp": 1.234375, "step": 558, "time_per_iteration": 2.7286341190338135 }, { "auxiliary_loss_clip": 0.07863195, "auxiliary_loss_mlp": 0.01458265, "balance_loss_clip": 0.06941472, "balance_loss_mlp": 0.01343395, "epoch": 0.03360889824139486, "flos": 21659438079360.0, "grad_norm": 17.729072790544613, "language_loss": 0.84501421, "learning_rate": 3.99986348919176e-06, "loss": 0.93822885, "num_input_tokens_seen": 11829665, "router_z_loss_clip": 9.21875, "router_z_loss_mlp": 1.14892578, "step": 559, "time_per_iteration": 2.738365650177002 }, { "auxiliary_loss_clip": 0.07885552, "auxiliary_loss_mlp": 0.01460357, "balance_loss_clip": 0.06955631, "balance_loss_mlp": 0.01344772, "epoch": 0.033669021494062826, "flos": 21801671585280.0, "grad_norm": 3.4974428639972195, "language_loss": 0.93082201, "learning_rate": 3.9998589009657675e-06, "loss": 1.02428102, "num_input_tokens_seen": 11848190, "router_z_loss_clip": 9.3046875, "router_z_loss_mlp": 1.15722656, "step": 560, "time_per_iteration": 2.737426996231079 }, { "auxiliary_loss_clip": 0.07849163, "auxiliary_loss_mlp": 0.01428713, "balance_loss_clip": 0.06933156, "balance_loss_mlp": 0.01314797, "epoch": 0.0337291447467308, "flos": 21871761125760.0, "grad_norm": 13.979886710109978, "language_loss": 0.87404859, "learning_rate": 3.999854236904925e-06, "loss": 0.96682739, "num_input_tokens_seen": 11864795, "router_z_loss_clip": 9.15625, "router_z_loss_mlp": 1.13964844, "step": 561, "time_per_iteration": 2.7188100814819336 }, { "auxiliary_loss_clip": 0.07876473, "auxiliary_loss_mlp": 0.01435641, "balance_loss_clip": 0.06954326, "balance_loss_mlp": 0.01321867, "epoch": 0.03378926799939877, "flos": 24253251943680.0, "grad_norm": 4.468694349876602, "language_loss": 0.86828172, "learning_rate": 3.999849497009409e-06, "loss": 0.96140283, "num_input_tokens_seen": 11885275, "router_z_loss_clip": 9.21875, "router_z_loss_mlp": 1.13964844, "step": 562, "time_per_iteration": 2.7436530590057373 }, { "auxiliary_loss_clip": 0.07891943, "auxiliary_loss_mlp": 0.01440651, "balance_loss_clip": 0.06951986, "balance_loss_mlp": 0.01323635, "epoch": 0.033849391252066735, "flos": 16513290921600.0, "grad_norm": 11.410475108141098, "language_loss": 0.89636308, "learning_rate": 3.999844681279401e-06, "loss": 0.98968905, "num_input_tokens_seen": 11903595, "router_z_loss_clip": 9.3828125, "router_z_loss_mlp": 1.16894531, "step": 563, "time_per_iteration": 2.6834917068481445 }, { "auxiliary_loss_clip": 0.07869986, "auxiliary_loss_mlp": 0.01404697, "balance_loss_clip": 0.06939585, "balance_loss_mlp": 0.01293689, "epoch": 0.03390951450473471, "flos": 15674746786560.0, "grad_norm": 7.216841613929315, "language_loss": 0.98824751, "learning_rate": 3.99983978971508e-06, "loss": 1.08099437, "num_input_tokens_seen": 11917815, "router_z_loss_clip": 9.3203125, "router_z_loss_mlp": 1.109375, "step": 564, "time_per_iteration": 2.6626014709472656 }, { "auxiliary_loss_clip": 0.07898393, "auxiliary_loss_mlp": 0.01446015, "balance_loss_clip": 0.06948674, "balance_loss_mlp": 0.01322991, "epoch": 0.03396963775740267, "flos": 22681444728960.0, "grad_norm": 4.426273840046642, "language_loss": 0.99916983, "learning_rate": 3.999834822316635e-06, "loss": 1.09261394, "num_input_tokens_seen": 11936305, "router_z_loss_clip": 9.5078125, "router_z_loss_mlp": 1.22949219, "step": 565, "time_per_iteration": 2.728072166442871 }, { "auxiliary_loss_clip": 0.07459943, "auxiliary_loss_mlp": 0.01439365, "balance_loss_clip": 0.06954996, "balance_loss_mlp": 0.01360591, "epoch": 0.034029761010070644, "flos": 64414872656640.0, "grad_norm": 1.7409474404750522, "language_loss": 0.56881332, "learning_rate": 3.9998297790842535e-06, "loss": 0.6578064, "num_input_tokens_seen": 11998940, "router_z_loss_clip": 5.0625, "router_z_loss_mlp": 0.78710938, "step": 566, "time_per_iteration": 3.3710274696350098 }, { "auxiliary_loss_clip": 0.07921585, "auxiliary_loss_mlp": 0.01449153, "balance_loss_clip": 0.06954443, "balance_loss_mlp": 0.01323268, "epoch": 0.034089884262738616, "flos": 25010302383360.0, "grad_norm": 7.126994182895947, "language_loss": 0.8351109, "learning_rate": 3.999824660018126e-06, "loss": 0.92881829, "num_input_tokens_seen": 12018860, "router_z_loss_clip": 9.6796875, "router_z_loss_mlp": 1.25976562, "step": 567, "time_per_iteration": 2.838259696960449 }, { "auxiliary_loss_clip": 0.07822628, "auxiliary_loss_mlp": 0.01431579, "balance_loss_clip": 0.06927978, "balance_loss_mlp": 0.01321143, "epoch": 0.03415000751540658, "flos": 28446643451520.0, "grad_norm": 23.733755490987164, "language_loss": 0.86081904, "learning_rate": 3.999819465118447e-06, "loss": 0.95336115, "num_input_tokens_seen": 12039675, "router_z_loss_clip": 8.9609375, "router_z_loss_mlp": 1.10498047, "step": 568, "time_per_iteration": 2.717855453491211 }, { "auxiliary_loss_clip": 0.07880579, "auxiliary_loss_mlp": 0.01452823, "balance_loss_clip": 0.06940256, "balance_loss_mlp": 0.01331134, "epoch": 0.034210130768074554, "flos": 21474843534720.0, "grad_norm": 5.1793936085154, "language_loss": 0.91261148, "learning_rate": 3.999814194385413e-06, "loss": 1.00594544, "num_input_tokens_seen": 12057680, "router_z_loss_clip": 9.40625, "router_z_loss_mlp": 1.21679688, "step": 569, "time_per_iteration": 2.6854450702667236 }, { "auxiliary_loss_clip": 0.07893647, "auxiliary_loss_mlp": 0.01465778, "balance_loss_clip": 0.0696155, "balance_loss_mlp": 0.0134838, "epoch": 0.03427025402074252, "flos": 18703436941440.0, "grad_norm": 11.730792855503013, "language_loss": 0.99669755, "learning_rate": 3.9998088478192255e-06, "loss": 1.09029186, "num_input_tokens_seen": 12076135, "router_z_loss_clip": 9.3203125, "router_z_loss_mlp": 1.17285156, "step": 570, "time_per_iteration": 4.1476476192474365 }, { "auxiliary_loss_clip": 0.07906075, "auxiliary_loss_mlp": 0.01461307, "balance_loss_clip": 0.06956837, "balance_loss_mlp": 0.01339332, "epoch": 0.03433037727341049, "flos": 20856253167360.0, "grad_norm": 11.937042187055825, "language_loss": 0.86007202, "learning_rate": 3.9998034254200846e-06, "loss": 0.95374584, "num_input_tokens_seen": 12094785, "router_z_loss_clip": 9.5078125, "router_z_loss_mlp": 1.22167969, "step": 571, "time_per_iteration": 4.100546598434448 }, { "auxiliary_loss_clip": 0.07851475, "auxiliary_loss_mlp": 0.01428469, "balance_loss_clip": 0.069406, "balance_loss_mlp": 0.01314219, "epoch": 0.03439050052607846, "flos": 25417240536960.0, "grad_norm": 25.04601295356388, "language_loss": 0.88407302, "learning_rate": 3.999797927188199e-06, "loss": 0.97687244, "num_input_tokens_seen": 12114590, "router_z_loss_clip": 9.1171875, "router_z_loss_mlp": 1.14306641, "step": 572, "time_per_iteration": 5.83282208442688 }, { "auxiliary_loss_clip": 0.07827017, "auxiliary_loss_mlp": 0.0143009, "balance_loss_clip": 0.0692388, "balance_loss_mlp": 0.01311739, "epoch": 0.03445062377874643, "flos": 17646029141760.0, "grad_norm": 7.08848339441883, "language_loss": 0.90657026, "learning_rate": 3.999792353123774e-06, "loss": 0.99914134, "num_input_tokens_seen": 12132390, "router_z_loss_clip": 9.03125, "router_z_loss_mlp": 1.18310547, "step": 573, "time_per_iteration": 2.6901252269744873 }, { "auxiliary_loss_clip": 0.07866041, "auxiliary_loss_mlp": 0.01454799, "balance_loss_clip": 0.06945365, "balance_loss_mlp": 0.01327864, "epoch": 0.0345107470314144, "flos": 16770239066880.0, "grad_norm": 8.449525341559722, "language_loss": 0.84608436, "learning_rate": 3.999786703227023e-06, "loss": 0.93929279, "num_input_tokens_seen": 12149035, "router_z_loss_clip": 9.203125, "router_z_loss_mlp": 1.26953125, "step": 574, "time_per_iteration": 2.743028402328491 }, { "auxiliary_loss_clip": 0.07864258, "auxiliary_loss_mlp": 0.01448937, "balance_loss_clip": 0.06942337, "balance_loss_mlp": 0.01324768, "epoch": 0.03457087028408237, "flos": 14689776441600.0, "grad_norm": 4.650824751681945, "language_loss": 0.89659417, "learning_rate": 3.9997809774981606e-06, "loss": 0.98972607, "num_input_tokens_seen": 12167530, "router_z_loss_clip": 9.234375, "router_z_loss_mlp": 1.2421875, "step": 575, "time_per_iteration": 2.7682435512542725 }, { "auxiliary_loss_clip": 0.07855296, "auxiliary_loss_mlp": 0.01444756, "balance_loss_clip": 0.06940714, "balance_loss_mlp": 0.01320301, "epoch": 0.03463099353675034, "flos": 20017499397120.0, "grad_norm": 6.597889303506997, "language_loss": 0.87923062, "learning_rate": 3.9997751759374025e-06, "loss": 0.97223115, "num_input_tokens_seen": 12186340, "router_z_loss_clip": 9.15625, "router_z_loss_mlp": 1.24414062, "step": 576, "time_per_iteration": 2.7009005546569824 }, { "auxiliary_loss_clip": 0.07791448, "auxiliary_loss_mlp": 0.01435935, "balance_loss_clip": 0.06916034, "balance_loss_mlp": 0.01316058, "epoch": 0.03469111678941831, "flos": 25308144120960.0, "grad_norm": 4.110525760720056, "language_loss": 0.90955788, "learning_rate": 3.99976929854497e-06, "loss": 1.00183165, "num_input_tokens_seen": 12204090, "router_z_loss_clip": 8.7421875, "router_z_loss_mlp": 1.19921875, "step": 577, "time_per_iteration": 2.7583096027374268 }, { "auxiliary_loss_clip": 0.07823181, "auxiliary_loss_mlp": 0.01444711, "balance_loss_clip": 0.06929369, "balance_loss_mlp": 0.01321305, "epoch": 0.034751240042086275, "flos": 23266311027840.0, "grad_norm": 3.237765660737881, "language_loss": 0.76631224, "learning_rate": 3.9997633453210845e-06, "loss": 0.85899121, "num_input_tokens_seen": 12224850, "router_z_loss_clip": 8.9296875, "router_z_loss_mlp": 1.234375, "step": 578, "time_per_iteration": 2.7504096031188965 }, { "auxiliary_loss_clip": 0.07843652, "auxiliary_loss_mlp": 0.0145746, "balance_loss_clip": 0.0692562, "balance_loss_mlp": 0.01325567, "epoch": 0.03481136329475425, "flos": 23776056541440.0, "grad_norm": 4.477958510583946, "language_loss": 0.82751846, "learning_rate": 3.999757316265973e-06, "loss": 0.9205296, "num_input_tokens_seen": 12244935, "router_z_loss_clip": 9.1875, "router_z_loss_mlp": 1.31933594, "step": 579, "time_per_iteration": 2.737015962600708 }, { "auxiliary_loss_clip": 0.0782055, "auxiliary_loss_mlp": 0.0143168, "balance_loss_clip": 0.06919888, "balance_loss_mlp": 0.01301598, "epoch": 0.03487148654742222, "flos": 20163799825920.0, "grad_norm": 3.5773424656908683, "language_loss": 0.91410613, "learning_rate": 3.999751211379863e-06, "loss": 1.00662851, "num_input_tokens_seen": 12262140, "router_z_loss_clip": 8.9921875, "router_z_loss_mlp": 1.30175781, "step": 580, "time_per_iteration": 2.738572120666504 }, { "auxiliary_loss_clip": 0.07828015, "auxiliary_loss_mlp": 0.01436643, "balance_loss_clip": 0.06928071, "balance_loss_mlp": 0.01305036, "epoch": 0.034931609800090184, "flos": 15675082202880.0, "grad_norm": 5.563191422175432, "language_loss": 0.89103639, "learning_rate": 3.999745030662987e-06, "loss": 0.98368299, "num_input_tokens_seen": 12280930, "router_z_loss_clip": 9.0, "router_z_loss_mlp": 1.31640625, "step": 581, "time_per_iteration": 2.6869733333587646 }, { "auxiliary_loss_clip": 0.07802528, "auxiliary_loss_mlp": 0.01434433, "balance_loss_clip": 0.06916791, "balance_loss_mlp": 0.0130645, "epoch": 0.034991733052758156, "flos": 16367912887680.0, "grad_norm": 4.290945484645226, "language_loss": 0.82872576, "learning_rate": 3.99973877411558e-06, "loss": 0.92109537, "num_input_tokens_seen": 12299125, "router_z_loss_clip": 8.859375, "router_z_loss_mlp": 1.27929688, "step": 582, "time_per_iteration": 2.6822171211242676 }, { "auxiliary_loss_clip": 0.07762548, "auxiliary_loss_mlp": 0.01414958, "balance_loss_clip": 0.06896524, "balance_loss_mlp": 0.01284305, "epoch": 0.03505185630542612, "flos": 19392787681920.0, "grad_norm": 5.7491176426087796, "language_loss": 0.93518543, "learning_rate": 3.999732441737877e-06, "loss": 1.02696049, "num_input_tokens_seen": 12316905, "router_z_loss_clip": 8.671875, "router_z_loss_mlp": 1.3046875, "step": 583, "time_per_iteration": 2.6823132038116455 }, { "auxiliary_loss_clip": 0.07814728, "auxiliary_loss_mlp": 0.01436377, "balance_loss_clip": 0.06918105, "balance_loss_mlp": 0.01294756, "epoch": 0.03511197955809409, "flos": 21330094406400.0, "grad_norm": 9.953448488565583, "language_loss": 0.87155038, "learning_rate": 3.99972603353012e-06, "loss": 0.96406138, "num_input_tokens_seen": 12335070, "router_z_loss_clip": 8.9609375, "router_z_loss_mlp": 1.41699219, "step": 584, "time_per_iteration": 2.6822144985198975 }, { "auxiliary_loss_clip": 0.07797576, "auxiliary_loss_mlp": 0.01424053, "balance_loss_clip": 0.06902643, "balance_loss_mlp": 0.01291588, "epoch": 0.035172102810762065, "flos": 14141736812160.0, "grad_norm": 5.034212917915482, "language_loss": 1.00025153, "learning_rate": 3.999719549492551e-06, "loss": 1.0924679, "num_input_tokens_seen": 12350315, "router_z_loss_clip": 8.96875, "router_z_loss_mlp": 1.32617188, "step": 585, "time_per_iteration": 2.6495814323425293 }, { "auxiliary_loss_clip": 0.07776659, "auxiliary_loss_mlp": 0.01424257, "balance_loss_clip": 0.06892089, "balance_loss_mlp": 0.01291315, "epoch": 0.03523222606343003, "flos": 20302092190080.0, "grad_norm": 4.220440045465305, "language_loss": 0.93666804, "learning_rate": 3.9997129896254165e-06, "loss": 1.02867723, "num_input_tokens_seen": 12366030, "router_z_loss_clip": 8.8515625, "router_z_loss_mlp": 1.32910156, "step": 586, "time_per_iteration": 2.659757137298584 }, { "auxiliary_loss_clip": 0.0780828, "auxiliary_loss_mlp": 0.01435096, "balance_loss_clip": 0.06914581, "balance_loss_mlp": 0.01293284, "epoch": 0.035292349316098, "flos": 20382034584960.0, "grad_norm": 3.603214241489832, "language_loss": 0.81692171, "learning_rate": 3.999706353928965e-06, "loss": 0.9093554, "num_input_tokens_seen": 12384895, "router_z_loss_clip": 8.9375, "router_z_loss_mlp": 1.41894531, "step": 587, "time_per_iteration": 2.7310309410095215 }, { "auxiliary_loss_clip": 0.07822067, "auxiliary_loss_mlp": 0.01417531, "balance_loss_clip": 0.06911913, "balance_loss_mlp": 0.0127839, "epoch": 0.03535247256876597, "flos": 21475011242880.0, "grad_norm": 2.9875200352935516, "language_loss": 0.83931899, "learning_rate": 3.999699642403449e-06, "loss": 0.93171495, "num_input_tokens_seen": 12404980, "router_z_loss_clip": 9.09375, "router_z_loss_mlp": 1.390625, "step": 588, "time_per_iteration": 2.6755521297454834 }, { "auxiliary_loss_clip": 0.07795858, "auxiliary_loss_mlp": 0.01402639, "balance_loss_clip": 0.06907371, "balance_loss_mlp": 0.0125892, "epoch": 0.03541259582143394, "flos": 23629798039680.0, "grad_norm": 4.723710869765527, "language_loss": 1.00389087, "learning_rate": 3.99969285504912e-06, "loss": 1.09587586, "num_input_tokens_seen": 12423835, "router_z_loss_clip": 8.875, "router_z_loss_mlp": 1.43652344, "step": 589, "time_per_iteration": 2.7270853519439697 }, { "auxiliary_loss_clip": 0.07762802, "auxiliary_loss_mlp": 0.01390756, "balance_loss_clip": 0.06890289, "balance_loss_mlp": 0.01257433, "epoch": 0.03547271907410191, "flos": 33734269428480.0, "grad_norm": 3.458883711339883, "language_loss": 0.89686447, "learning_rate": 3.99968599186624e-06, "loss": 0.98839998, "num_input_tokens_seen": 12443135, "router_z_loss_clip": 8.7265625, "router_z_loss_mlp": 1.33398438, "step": 590, "time_per_iteration": 2.8615758419036865 }, { "auxiliary_loss_clip": 0.07771927, "auxiliary_loss_mlp": 0.01403905, "balance_loss_clip": 0.06896152, "balance_loss_mlp": 0.01261235, "epoch": 0.03553284232676988, "flos": 21149147514240.0, "grad_norm": 2.901107526183973, "language_loss": 0.9135772, "learning_rate": 3.999679052855065e-06, "loss": 1.00533557, "num_input_tokens_seen": 12462895, "router_z_loss_clip": 8.75, "router_z_loss_mlp": 1.42675781, "step": 591, "time_per_iteration": 2.7662274837493896 }, { "auxiliary_loss_clip": 0.07766397, "auxiliary_loss_mlp": 0.01400425, "balance_loss_clip": 0.06889464, "balance_loss_mlp": 0.01263, "epoch": 0.03559296557943785, "flos": 20052607057920.0, "grad_norm": 2.615323392086512, "language_loss": 0.88361806, "learning_rate": 3.999672038015861e-06, "loss": 0.97528636, "num_input_tokens_seen": 12481515, "router_z_loss_clip": 8.7734375, "router_z_loss_mlp": 1.375, "step": 592, "time_per_iteration": 2.6612040996551514 }, { "auxiliary_loss_clip": 0.07311257, "auxiliary_loss_mlp": 0.0148543, "balance_loss_clip": 0.06807105, "balance_loss_mlp": 0.01386868, "epoch": 0.035653088832105814, "flos": 60354742268160.0, "grad_norm": 1.3806618377406474, "language_loss": 0.61070937, "learning_rate": 3.999664947348893e-06, "loss": 0.69867623, "num_input_tokens_seen": 12548220, "router_z_loss_clip": 5.0625, "router_z_loss_mlp": 0.984375, "step": 593, "time_per_iteration": 3.3231780529022217 }, { "auxiliary_loss_clip": 0.07726936, "auxiliary_loss_mlp": 0.01370819, "balance_loss_clip": 0.06885384, "balance_loss_mlp": 0.01249511, "epoch": 0.035713212084773786, "flos": 20118084624000.0, "grad_norm": 2.7082087726253934, "language_loss": 0.90936136, "learning_rate": 3.999657780854429e-06, "loss": 1.00033891, "num_input_tokens_seen": 12566105, "router_z_loss_clip": 8.4296875, "router_z_loss_mlp": 1.21191406, "step": 594, "time_per_iteration": 2.6624724864959717 }, { "auxiliary_loss_clip": 0.07731242, "auxiliary_loss_mlp": 0.01378913, "balance_loss_clip": 0.06886677, "balance_loss_mlp": 0.01251788, "epoch": 0.03577333533744176, "flos": 26292862903680.0, "grad_norm": 3.931540630431008, "language_loss": 0.87739408, "learning_rate": 3.999650538532742e-06, "loss": 0.96849573, "num_input_tokens_seen": 12586680, "router_z_loss_clip": 8.4453125, "router_z_loss_mlp": 1.27246094, "step": 595, "time_per_iteration": 2.716637134552002 }, { "auxiliary_loss_clip": 0.07721841, "auxiliary_loss_mlp": 0.01381107, "balance_loss_clip": 0.06874892, "balance_loss_mlp": 0.01253601, "epoch": 0.035833458590109724, "flos": 10894392627840.0, "grad_norm": 7.706232442092288, "language_loss": 1.02263343, "learning_rate": 3.999643220384106e-06, "loss": 1.11366296, "num_input_tokens_seen": 12601605, "router_z_loss_clip": 8.46875, "router_z_loss_mlp": 1.27636719, "step": 596, "time_per_iteration": 2.6478211879730225 }, { "auxiliary_loss_clip": 0.0769853, "auxiliary_loss_mlp": 0.01377703, "balance_loss_clip": 0.06867382, "balance_loss_mlp": 0.01254869, "epoch": 0.035893581842777696, "flos": 22096620357120.0, "grad_norm": 4.053401601038262, "language_loss": 0.88788092, "learning_rate": 3.999635826408799e-06, "loss": 0.97864318, "num_input_tokens_seen": 12620365, "router_z_loss_clip": 8.30859375, "router_z_loss_mlp": 1.22753906, "step": 597, "time_per_iteration": 2.6949715614318848 }, { "auxiliary_loss_clip": 0.07617883, "auxiliary_loss_mlp": 0.01378047, "balance_loss_clip": 0.06834701, "balance_loss_mlp": 0.01255213, "epoch": 0.03595370509544566, "flos": 23044847886720.0, "grad_norm": 2.7139118311107957, "language_loss": 0.84727561, "learning_rate": 3.999628356607101e-06, "loss": 0.93723488, "num_input_tokens_seen": 12641140, "router_z_loss_clip": 7.828125, "router_z_loss_mlp": 1.22851562, "step": 598, "time_per_iteration": 2.673550844192505 }, { "auxiliary_loss_clip": 0.07626826, "auxiliary_loss_mlp": 0.0137463, "balance_loss_clip": 0.06847554, "balance_loss_mlp": 0.01258281, "epoch": 0.03601382834811363, "flos": 20784109201920.0, "grad_norm": 2.7977171982392184, "language_loss": 0.85572684, "learning_rate": 3.999620810979295e-06, "loss": 0.94574136, "num_input_tokens_seen": 12661080, "router_z_loss_clip": 7.796875, "router_z_loss_mlp": 1.16308594, "step": 599, "time_per_iteration": 2.6529829502105713 }, { "auxiliary_loss_clip": 0.07704739, "auxiliary_loss_mlp": 0.01379324, "balance_loss_clip": 0.06859669, "balance_loss_mlp": 0.01256109, "epoch": 0.036073951600781605, "flos": 23958470880000.0, "grad_norm": 7.909579881109847, "language_loss": 0.92701644, "learning_rate": 3.999613189525668e-06, "loss": 1.01785707, "num_input_tokens_seen": 12678270, "router_z_loss_clip": 8.4609375, "router_z_loss_mlp": 1.23144531, "step": 600, "time_per_iteration": 2.7463738918304443 }, { "auxiliary_loss_clip": 0.07627887, "auxiliary_loss_mlp": 0.01370736, "balance_loss_clip": 0.0684543, "balance_loss_mlp": 0.01258155, "epoch": 0.03613407485344957, "flos": 18917562850560.0, "grad_norm": 3.9450028375937465, "language_loss": 0.88050455, "learning_rate": 3.999605492246508e-06, "loss": 0.97049075, "num_input_tokens_seen": 12697295, "router_z_loss_clip": 7.8359375, "router_z_loss_mlp": 1.125, "step": 601, "time_per_iteration": 2.667297601699829 }, { "auxiliary_loss_clip": 0.07637613, "auxiliary_loss_mlp": 0.01383404, "balance_loss_clip": 0.06830598, "balance_loss_mlp": 0.01264767, "epoch": 0.03619419810611754, "flos": 23045057521920.0, "grad_norm": 4.315738934673544, "language_loss": 0.80681419, "learning_rate": 3.999597719142107e-06, "loss": 0.89702439, "num_input_tokens_seen": 12716165, "router_z_loss_clip": 8.07421875, "router_z_loss_mlp": 1.18652344, "step": 602, "time_per_iteration": 2.6844820976257324 }, { "auxiliary_loss_clip": 0.07623202, "auxiliary_loss_mlp": 0.01377227, "balance_loss_clip": 0.06835207, "balance_loss_mlp": 0.01262357, "epoch": 0.03625432135878551, "flos": 29465002448640.0, "grad_norm": 3.703311527571735, "language_loss": 0.85070622, "learning_rate": 3.999589870212761e-06, "loss": 0.94071054, "num_input_tokens_seen": 12735475, "router_z_loss_clip": 7.890625, "router_z_loss_mlp": 1.14746094, "step": 603, "time_per_iteration": 2.751957416534424 }, { "auxiliary_loss_clip": 0.07610631, "auxiliary_loss_mlp": 0.01364771, "balance_loss_clip": 0.06826615, "balance_loss_mlp": 0.01255862, "epoch": 0.03631444461145348, "flos": 23514412567680.0, "grad_norm": 2.607493186808741, "language_loss": 0.90734184, "learning_rate": 3.9995819454587664e-06, "loss": 0.99709582, "num_input_tokens_seen": 12754540, "router_z_loss_clip": 7.83984375, "router_z_loss_mlp": 1.08886719, "step": 604, "time_per_iteration": 2.7002077102661133 }, { "auxiliary_loss_clip": 0.07600254, "auxiliary_loss_mlp": 0.01374708, "balance_loss_clip": 0.06807975, "balance_loss_mlp": 0.01255403, "epoch": 0.03637456786412145, "flos": 16623770929920.0, "grad_norm": 6.280658076578956, "language_loss": 0.8707459, "learning_rate": 3.999573944880424e-06, "loss": 0.96049553, "num_input_tokens_seen": 12773050, "router_z_loss_clip": 7.92578125, "router_z_loss_mlp": 1.19287109, "step": 605, "time_per_iteration": 2.6469318866729736 }, { "auxiliary_loss_clip": 0.07619411, "auxiliary_loss_mlp": 0.01369806, "balance_loss_clip": 0.06814868, "balance_loss_mlp": 0.01261278, "epoch": 0.03643469111678942, "flos": 15857328833280.0, "grad_norm": 5.329237358790447, "language_loss": 0.92930192, "learning_rate": 3.9995658684780375e-06, "loss": 1.01919413, "num_input_tokens_seen": 12791240, "router_z_loss_clip": 8.046875, "router_z_loss_mlp": 1.0859375, "step": 606, "time_per_iteration": 2.6339194774627686 }, { "auxiliary_loss_clip": 0.07632896, "auxiliary_loss_mlp": 0.01389298, "balance_loss_clip": 0.0682153, "balance_loss_mlp": 0.01273045, "epoch": 0.03649481436945739, "flos": 23626695438720.0, "grad_norm": 5.322023572943129, "language_loss": 0.88234091, "learning_rate": 3.999557716251912e-06, "loss": 0.97256279, "num_input_tokens_seen": 12812245, "router_z_loss_clip": 8.12109375, "router_z_loss_mlp": 1.16210938, "step": 607, "time_per_iteration": 2.6670620441436768 }, { "auxiliary_loss_clip": 0.07595222, "auxiliary_loss_mlp": 0.01387417, "balance_loss_clip": 0.06815962, "balance_loss_mlp": 0.01272976, "epoch": 0.036554937622125354, "flos": 21760903774080.0, "grad_norm": 3.7842025697740374, "language_loss": 0.88215786, "learning_rate": 3.999549488202358e-06, "loss": 0.97198427, "num_input_tokens_seen": 12831085, "router_z_loss_clip": 7.80078125, "router_z_loss_mlp": 1.14453125, "step": 608, "time_per_iteration": 2.7443697452545166 }, { "auxiliary_loss_clip": 0.07609922, "auxiliary_loss_mlp": 0.01393452, "balance_loss_clip": 0.06823076, "balance_loss_mlp": 0.0128049, "epoch": 0.036615060874793326, "flos": 17825215098240.0, "grad_norm": 3.1607404180468333, "language_loss": 0.87030363, "learning_rate": 3.999541184329688e-06, "loss": 0.9603374, "num_input_tokens_seen": 12849115, "router_z_loss_clip": 7.86328125, "router_z_loss_mlp": 1.12988281, "step": 609, "time_per_iteration": 4.070490598678589 }, { "auxiliary_loss_clip": 0.07632478, "auxiliary_loss_mlp": 0.01398239, "balance_loss_clip": 0.06830801, "balance_loss_mlp": 0.01290187, "epoch": 0.0366751841274613, "flos": 26759911962240.0, "grad_norm": 4.298629669876905, "language_loss": 0.85686368, "learning_rate": 3.999532804634215e-06, "loss": 0.94717085, "num_input_tokens_seen": 12868005, "router_z_loss_clip": 8.01953125, "router_z_loss_mlp": 1.08105469, "step": 610, "time_per_iteration": 2.7082746028900146 }, { "auxiliary_loss_clip": 0.07665572, "auxiliary_loss_mlp": 0.01442841, "balance_loss_clip": 0.06847397, "balance_loss_mlp": 0.01318863, "epoch": 0.03673530738012926, "flos": 22202949588480.0, "grad_norm": 2.8616151112051984, "language_loss": 0.91407102, "learning_rate": 3.9995243491162575e-06, "loss": 1.00515521, "num_input_tokens_seen": 12886890, "router_z_loss_clip": 8.171875, "router_z_loss_mlp": 1.24121094, "step": 611, "time_per_iteration": 5.645262956619263 }, { "auxiliary_loss_clip": 0.07641716, "auxiliary_loss_mlp": 0.01442711, "balance_loss_clip": 0.06839284, "balance_loss_mlp": 0.0133218, "epoch": 0.036795430632797235, "flos": 24688673285760.0, "grad_norm": 10.450824150167211, "language_loss": 0.78231227, "learning_rate": 3.999515817776136e-06, "loss": 0.87315655, "num_input_tokens_seen": 12906130, "router_z_loss_clip": 8.0234375, "router_z_loss_mlp": 1.10400391, "step": 612, "time_per_iteration": 4.232727289199829 }, { "auxiliary_loss_clip": 0.07683285, "auxiliary_loss_mlp": 0.01458626, "balance_loss_clip": 0.06842151, "balance_loss_mlp": 0.01334744, "epoch": 0.0368555538854652, "flos": 17754706287360.0, "grad_norm": 8.531393222732717, "language_loss": 0.84625483, "learning_rate": 3.999507210614175e-06, "loss": 0.93767393, "num_input_tokens_seen": 12925260, "router_z_loss_clip": 8.41015625, "router_z_loss_mlp": 1.23828125, "step": 613, "time_per_iteration": 2.8269193172454834 }, { "auxiliary_loss_clip": 0.07655013, "auxiliary_loss_mlp": 0.01479067, "balance_loss_clip": 0.06842783, "balance_loss_mlp": 0.01351084, "epoch": 0.03691567713813317, "flos": 20600772468480.0, "grad_norm": 22.941153108291925, "language_loss": 0.99018866, "learning_rate": 3.9994985276307e-06, "loss": 1.08152938, "num_input_tokens_seen": 12944590, "router_z_loss_clip": 8.140625, "router_z_loss_mlp": 1.28027344, "step": 614, "time_per_iteration": 2.695507526397705 }, { "auxiliary_loss_clip": 0.0767267, "auxiliary_loss_mlp": 0.01508676, "balance_loss_clip": 0.06846563, "balance_loss_mlp": 0.01374303, "epoch": 0.036975800390801145, "flos": 33657765050880.0, "grad_norm": 4.545636575716966, "language_loss": 0.7861551, "learning_rate": 3.999489768826041e-06, "loss": 0.87796855, "num_input_tokens_seen": 12964785, "router_z_loss_clip": 8.26953125, "router_z_loss_mlp": 1.34375, "step": 615, "time_per_iteration": 2.7841460704803467 }, { "auxiliary_loss_clip": 0.07690837, "auxiliary_loss_mlp": 0.01505864, "balance_loss_clip": 0.06852247, "balance_loss_mlp": 0.01377261, "epoch": 0.03703592364346911, "flos": 28301307344640.0, "grad_norm": 4.643859170710883, "language_loss": 0.86154211, "learning_rate": 3.999480934200528e-06, "loss": 0.95350915, "num_input_tokens_seen": 12986705, "router_z_loss_clip": 8.39453125, "router_z_loss_mlp": 1.28515625, "step": 616, "time_per_iteration": 2.760497570037842 }, { "auxiliary_loss_clip": 0.07688581, "auxiliary_loss_mlp": 0.01503059, "balance_loss_clip": 0.06862646, "balance_loss_mlp": 0.01372978, "epoch": 0.03709604689613708, "flos": 31512327984000.0, "grad_norm": 5.246888588918579, "language_loss": 0.75041842, "learning_rate": 3.999472023754499e-06, "loss": 0.84233475, "num_input_tokens_seen": 13010560, "router_z_loss_clip": 8.25, "router_z_loss_mlp": 1.30175781, "step": 617, "time_per_iteration": 2.7738265991210938 }, { "auxiliary_loss_clip": 0.07660149, "auxiliary_loss_mlp": 0.01505665, "balance_loss_clip": 0.06853727, "balance_loss_mlp": 0.01369862, "epoch": 0.03715617014880505, "flos": 19615424780160.0, "grad_norm": 4.308298175605395, "language_loss": 0.85427976, "learning_rate": 3.99946303748829e-06, "loss": 0.94593787, "num_input_tokens_seen": 13028935, "router_z_loss_clip": 8.05859375, "router_z_loss_mlp": 1.35839844, "step": 618, "time_per_iteration": 2.682257890701294 }, { "auxiliary_loss_clip": 0.07688208, "auxiliary_loss_mlp": 0.01486156, "balance_loss_clip": 0.06852958, "balance_loss_mlp": 0.01351306, "epoch": 0.03721629340147302, "flos": 15929598579840.0, "grad_norm": 25.33867918965015, "language_loss": 0.96795291, "learning_rate": 3.999453975402242e-06, "loss": 1.05969644, "num_input_tokens_seen": 13046000, "router_z_loss_clip": 8.3515625, "router_z_loss_mlp": 1.34863281, "step": 619, "time_per_iteration": 2.6275417804718018 }, { "auxiliary_loss_clip": 0.07665384, "auxiliary_loss_mlp": 0.01456165, "balance_loss_clip": 0.06847725, "balance_loss_mlp": 0.01330661, "epoch": 0.03727641665414099, "flos": 21110182565760.0, "grad_norm": 6.594866768776284, "language_loss": 0.99370271, "learning_rate": 3.9994448374967e-06, "loss": 1.08491814, "num_input_tokens_seen": 13062995, "router_z_loss_clip": 8.18359375, "router_z_loss_mlp": 1.25488281, "step": 620, "time_per_iteration": 2.6706383228302 }, { "auxiliary_loss_clip": 0.07689012, "auxiliary_loss_mlp": 0.0146967, "balance_loss_clip": 0.06854448, "balance_loss_mlp": 0.0134245, "epoch": 0.037336539906808956, "flos": 24138159960960.0, "grad_norm": 2.7185586472665726, "language_loss": 0.81752932, "learning_rate": 3.999435623772008e-06, "loss": 0.90911615, "num_input_tokens_seen": 13084120, "router_z_loss_clip": 8.35546875, "router_z_loss_mlp": 1.27148438, "step": 621, "time_per_iteration": 2.739349603652954 }, { "auxiliary_loss_clip": 0.07621565, "auxiliary_loss_mlp": 0.01404903, "balance_loss_clip": 0.06828419, "balance_loss_mlp": 0.01294467, "epoch": 0.03739666315947693, "flos": 22352981523840.0, "grad_norm": 4.496902308523792, "language_loss": 0.90666676, "learning_rate": 3.999426334228518e-06, "loss": 0.99693143, "num_input_tokens_seen": 13100035, "router_z_loss_clip": 7.92578125, "router_z_loss_mlp": 1.10400391, "step": 622, "time_per_iteration": 2.67120099067688 }, { "auxiliary_loss_clip": 0.07650566, "auxiliary_loss_mlp": 0.01409733, "balance_loss_clip": 0.06839268, "balance_loss_mlp": 0.01297485, "epoch": 0.0374567864121449, "flos": 20455855632000.0, "grad_norm": 3.800678504338413, "language_loss": 0.94893229, "learning_rate": 3.999416968866581e-06, "loss": 1.03953528, "num_input_tokens_seen": 13118070, "router_z_loss_clip": 8.1015625, "router_z_loss_mlp": 1.12304688, "step": 623, "time_per_iteration": 2.6548471450805664 }, { "auxiliary_loss_clip": 0.07638848, "auxiliary_loss_mlp": 0.01383267, "balance_loss_clip": 0.06837358, "balance_loss_mlp": 0.01279221, "epoch": 0.037516909664812866, "flos": 19214020995840.0, "grad_norm": 3.076932209069846, "language_loss": 0.88079208, "learning_rate": 3.999407527686551e-06, "loss": 0.97101319, "num_input_tokens_seen": 13136355, "router_z_loss_clip": 8.0234375, "router_z_loss_mlp": 1.04101562, "step": 624, "time_per_iteration": 2.6601412296295166 }, { "auxiliary_loss_clip": 0.07614884, "auxiliary_loss_mlp": 0.01380508, "balance_loss_clip": 0.06835255, "balance_loss_mlp": 0.01277177, "epoch": 0.03757703291748084, "flos": 35013643493760.0, "grad_norm": 5.206189838509693, "language_loss": 0.74194682, "learning_rate": 3.999398010688788e-06, "loss": 0.83190072, "num_input_tokens_seen": 13155435, "router_z_loss_clip": 7.7890625, "router_z_loss_mlp": 1.03369141, "step": 625, "time_per_iteration": 2.7410738468170166 }, { "auxiliary_loss_clip": 0.0758748, "auxiliary_loss_mlp": 0.01370684, "balance_loss_clip": 0.06817469, "balance_loss_mlp": 0.01274744, "epoch": 0.0376371561701488, "flos": 25490977729920.0, "grad_norm": 5.619432245142284, "language_loss": 0.81553912, "learning_rate": 3.999388417873652e-06, "loss": 0.90512073, "num_input_tokens_seen": 13174295, "router_z_loss_clip": 7.70703125, "router_z_loss_mlp": 0.96044922, "step": 626, "time_per_iteration": 2.7021541595458984 }, { "auxiliary_loss_clip": 0.07587172, "auxiliary_loss_mlp": 0.01375168, "balance_loss_clip": 0.06815329, "balance_loss_mlp": 0.01274842, "epoch": 0.037697279422816775, "flos": 18191301586560.0, "grad_norm": 27.588517916130247, "language_loss": 0.84648871, "learning_rate": 3.999378749241506e-06, "loss": 0.93611217, "num_input_tokens_seen": 13192500, "router_z_loss_clip": 7.7109375, "router_z_loss_mlp": 1.00244141, "step": 627, "time_per_iteration": 2.637824296951294 }, { "auxiliary_loss_clip": 0.07571504, "auxiliary_loss_mlp": 0.0137071, "balance_loss_clip": 0.06802179, "balance_loss_mlp": 0.01270145, "epoch": 0.03775740267548475, "flos": 24651133856640.0, "grad_norm": 4.393307916711652, "language_loss": 0.92302209, "learning_rate": 3.999369004792719e-06, "loss": 1.01244426, "num_input_tokens_seen": 13213470, "router_z_loss_clip": 7.703125, "router_z_loss_mlp": 1.00488281, "step": 628, "time_per_iteration": 2.7257795333862305 }, { "auxiliary_loss_clip": 0.07559849, "auxiliary_loss_mlp": 0.01377927, "balance_loss_clip": 0.06806828, "balance_loss_mlp": 0.0128461, "epoch": 0.03781752592815271, "flos": 21294609402240.0, "grad_norm": 3.4319794430384603, "language_loss": 0.83957946, "learning_rate": 3.999359184527658e-06, "loss": 0.92895722, "num_input_tokens_seen": 13232365, "router_z_loss_clip": 7.53125, "router_z_loss_mlp": 0.93310547, "step": 629, "time_per_iteration": 2.6576297283172607 }, { "auxiliary_loss_clip": 0.07570235, "auxiliary_loss_mlp": 0.01383082, "balance_loss_clip": 0.06808193, "balance_loss_mlp": 0.01284282, "epoch": 0.037877649180820684, "flos": 22095949524480.0, "grad_norm": 2.53304111913576, "language_loss": 0.80872911, "learning_rate": 3.999349288446696e-06, "loss": 0.89826226, "num_input_tokens_seen": 13251920, "router_z_loss_clip": 7.62109375, "router_z_loss_mlp": 0.98925781, "step": 630, "time_per_iteration": 2.6696414947509766 }, { "auxiliary_loss_clip": 0.07566547, "auxiliary_loss_mlp": 0.01387011, "balance_loss_clip": 0.06801605, "balance_loss_mlp": 0.01294027, "epoch": 0.03793777243348865, "flos": 14506523562240.0, "grad_norm": 5.1335334779605395, "language_loss": 0.97525644, "learning_rate": 3.99933931655021e-06, "loss": 1.06479192, "num_input_tokens_seen": 13267440, "router_z_loss_clip": 7.65234375, "router_z_loss_mlp": 0.9296875, "step": 631, "time_per_iteration": 2.6129579544067383 }, { "auxiliary_loss_clip": 0.07512769, "auxiliary_loss_mlp": 0.0138869, "balance_loss_clip": 0.06806379, "balance_loss_mlp": 0.01302382, "epoch": 0.03799789568615662, "flos": 21914918778240.0, "grad_norm": 2.547041753805319, "language_loss": 0.95604569, "learning_rate": 3.999329268838575e-06, "loss": 1.04506028, "num_input_tokens_seen": 13287850, "router_z_loss_clip": 7.05859375, "router_z_loss_mlp": 0.86328125, "step": 632, "time_per_iteration": 2.6856651306152344 }, { "auxiliary_loss_clip": 0.07533744, "auxiliary_loss_mlp": 0.01390492, "balance_loss_clip": 0.06808687, "balance_loss_mlp": 0.01299273, "epoch": 0.03805801893882459, "flos": 24833967465600.0, "grad_norm": 2.61580348943968, "language_loss": 0.87126696, "learning_rate": 3.999319145312175e-06, "loss": 0.9605093, "num_input_tokens_seen": 13307760, "router_z_loss_clip": 7.25390625, "router_z_loss_mlp": 0.91259766, "step": 633, "time_per_iteration": 2.703340768814087 }, { "auxiliary_loss_clip": 0.07522996, "auxiliary_loss_mlp": 0.01372714, "balance_loss_clip": 0.06797203, "balance_loss_mlp": 0.01285262, "epoch": 0.03811814219149256, "flos": 30490950240000.0, "grad_norm": 3.646868991440272, "language_loss": 0.73184884, "learning_rate": 3.999308945971392e-06, "loss": 0.82080591, "num_input_tokens_seen": 13331230, "router_z_loss_clip": 7.2734375, "router_z_loss_mlp": 0.875, "step": 634, "time_per_iteration": 2.7660887241363525 }, { "auxiliary_loss_clip": 0.07511846, "auxiliary_loss_mlp": 0.01471529, "balance_loss_clip": 0.07052977, "balance_loss_mlp": 0.01427183, "epoch": 0.03817826544416053, "flos": 67010671820160.0, "grad_norm": 1.2081837543407326, "language_loss": 0.6249069, "learning_rate": 3.999298670816614e-06, "loss": 0.71474063, "num_input_tokens_seen": 13394760, "router_z_loss_clip": 4.59375, "router_z_loss_mlp": 0.44384766, "step": 635, "time_per_iteration": 3.3306891918182373 }, { "auxiliary_loss_clip": 0.07517649, "auxiliary_loss_mlp": 0.01369577, "balance_loss_clip": 0.06811251, "balance_loss_mlp": 0.01282506, "epoch": 0.038238388696828496, "flos": 20491592198400.0, "grad_norm": 4.938176974574923, "language_loss": 0.8847245, "learning_rate": 3.9992883198482294e-06, "loss": 0.97359675, "num_input_tokens_seen": 13412775, "router_z_loss_clip": 7.0625, "router_z_loss_mlp": 0.87109375, "step": 636, "time_per_iteration": 2.701509475708008 }, { "auxiliary_loss_clip": 0.07536258, "auxiliary_loss_mlp": 0.0137573, "balance_loss_clip": 0.06819088, "balance_loss_mlp": 0.01291187, "epoch": 0.03829851194949647, "flos": 17971389745920.0, "grad_norm": 7.358994178563381, "language_loss": 0.86340225, "learning_rate": 3.999277893066632e-06, "loss": 0.95252204, "num_input_tokens_seen": 13427835, "router_z_loss_clip": 7.171875, "router_z_loss_mlp": 0.84716797, "step": 637, "time_per_iteration": 2.6422460079193115 }, { "auxiliary_loss_clip": 0.0753096, "auxiliary_loss_mlp": 0.01344111, "balance_loss_clip": 0.06806178, "balance_loss_mlp": 0.01267674, "epoch": 0.03835863520216444, "flos": 22463251896960.0, "grad_norm": 3.463859051501116, "language_loss": 0.88202983, "learning_rate": 3.999267390472215e-06, "loss": 0.97078061, "num_input_tokens_seen": 13447295, "router_z_loss_clip": 7.2421875, "router_z_loss_mlp": 0.76416016, "step": 638, "time_per_iteration": 2.6910884380340576 }, { "auxiliary_loss_clip": 0.07552798, "auxiliary_loss_mlp": 0.01358918, "balance_loss_clip": 0.06815432, "balance_loss_mlp": 0.01270274, "epoch": 0.038418758454832405, "flos": 22171070309760.0, "grad_norm": 4.772938001283959, "language_loss": 0.74682933, "learning_rate": 3.999256812065381e-06, "loss": 0.83594644, "num_input_tokens_seen": 13468455, "router_z_loss_clip": 7.3671875, "router_z_loss_mlp": 0.88623047, "step": 639, "time_per_iteration": 2.6673524379730225 }, { "auxiliary_loss_clip": 0.07554536, "auxiliary_loss_mlp": 0.01346278, "balance_loss_clip": 0.06817013, "balance_loss_mlp": 0.01258015, "epoch": 0.03847888170750038, "flos": 22754049891840.0, "grad_norm": 15.458069006228987, "language_loss": 0.91368532, "learning_rate": 3.999246157846526e-06, "loss": 1.00269341, "num_input_tokens_seen": 13489085, "router_z_loss_clip": 7.390625, "router_z_loss_mlp": 0.8828125, "step": 640, "time_per_iteration": 2.692963123321533 }, { "auxiliary_loss_clip": 0.07541686, "auxiliary_loss_mlp": 0.01349032, "balance_loss_clip": 0.0680511, "balance_loss_mlp": 0.01261723, "epoch": 0.03853900496016834, "flos": 22717852128000.0, "grad_norm": 5.389558921795994, "language_loss": 0.8668502, "learning_rate": 3.9992354278160574e-06, "loss": 0.95575732, "num_input_tokens_seen": 13509120, "router_z_loss_clip": 7.375, "router_z_loss_mlp": 0.87255859, "step": 641, "time_per_iteration": 2.658036708831787 }, { "auxiliary_loss_clip": 0.07413899, "auxiliary_loss_mlp": 0.0137231, "balance_loss_clip": 0.06963582, "balance_loss_mlp": 0.0131571, "epoch": 0.038599128212836314, "flos": 70420039073280.0, "grad_norm": 0.8990695083510251, "language_loss": 0.65252364, "learning_rate": 3.999224621974381e-06, "loss": 0.74038577, "num_input_tokens_seen": 13562005, "router_z_loss_clip": 4.5, "router_z_loss_mlp": 0.56689453, "step": 642, "time_per_iteration": 3.2853758335113525 }, { "auxiliary_loss_clip": 0.0750014, "auxiliary_loss_mlp": 0.01341474, "balance_loss_clip": 0.06793246, "balance_loss_mlp": 0.01260364, "epoch": 0.03865925146550429, "flos": 23301921813120.0, "grad_norm": 4.694741469990202, "language_loss": 0.83989561, "learning_rate": 3.999213740321906e-06, "loss": 0.92831171, "num_input_tokens_seen": 13582185, "router_z_loss_clip": 7.05859375, "router_z_loss_mlp": 0.81005859, "step": 643, "time_per_iteration": 2.699235439300537 }, { "auxiliary_loss_clip": 0.07485759, "auxiliary_loss_mlp": 0.01342753, "balance_loss_clip": 0.06781226, "balance_loss_mlp": 0.01258639, "epoch": 0.03871937471817225, "flos": 21436255929600.0, "grad_norm": 14.498651347145046, "language_loss": 0.86810982, "learning_rate": 3.999202782859046e-06, "loss": 0.95639491, "num_input_tokens_seen": 13599555, "router_z_loss_clip": 7.046875, "router_z_loss_mlp": 0.84033203, "step": 644, "time_per_iteration": 2.6727428436279297 }, { "auxiliary_loss_clip": 0.07483482, "auxiliary_loss_mlp": 0.01341981, "balance_loss_clip": 0.06789058, "balance_loss_mlp": 0.01257915, "epoch": 0.038779497970840224, "flos": 34285914783360.0, "grad_norm": 5.475640184776624, "language_loss": 0.86271727, "learning_rate": 3.9991917495862165e-06, "loss": 0.9509719, "num_input_tokens_seen": 13621160, "router_z_loss_clip": 6.9453125, "router_z_loss_mlp": 0.84082031, "step": 645, "time_per_iteration": 2.763963460922241 }, { "auxiliary_loss_clip": 0.0746941, "auxiliary_loss_mlp": 0.01343614, "balance_loss_clip": 0.06773508, "balance_loss_mlp": 0.01260501, "epoch": 0.03883962122350819, "flos": 22754930359680.0, "grad_norm": 4.187430758437384, "language_loss": 0.86855042, "learning_rate": 3.9991806405038345e-06, "loss": 0.95668066, "num_input_tokens_seen": 13641915, "router_z_loss_clip": 6.96484375, "router_z_loss_mlp": 0.83056641, "step": 646, "time_per_iteration": 2.71628737449646 }, { "auxiliary_loss_clip": 0.07476978, "auxiliary_loss_mlp": 0.01349436, "balance_loss_clip": 0.06773661, "balance_loss_mlp": 0.01261316, "epoch": 0.03889974447617616, "flos": 21952500134400.0, "grad_norm": 5.727659131778787, "language_loss": 0.87272036, "learning_rate": 3.999169455612323e-06, "loss": 0.96098447, "num_input_tokens_seen": 13661410, "router_z_loss_clip": 7.03515625, "router_z_loss_mlp": 0.87988281, "step": 647, "time_per_iteration": 2.6699793338775635 }, { "auxiliary_loss_clip": 0.07462803, "auxiliary_loss_mlp": 0.01349777, "balance_loss_clip": 0.06778196, "balance_loss_mlp": 0.01265711, "epoch": 0.03895986772884413, "flos": 31513040743680.0, "grad_norm": 5.062187800744724, "language_loss": 0.89260381, "learning_rate": 3.999158194912106e-06, "loss": 0.98072958, "num_input_tokens_seen": 13681705, "router_z_loss_clip": 6.84375, "router_z_loss_mlp": 0.84082031, "step": 648, "time_per_iteration": 2.762803316116333 }, { "auxiliary_loss_clip": 0.07460001, "auxiliary_loss_mlp": 0.01359205, "balance_loss_clip": 0.0677515, "balance_loss_mlp": 0.01268701, "epoch": 0.0390199909815121, "flos": 19907061315840.0, "grad_norm": 10.87265903757942, "language_loss": 0.88136256, "learning_rate": 3.9991468584036086e-06, "loss": 0.96955466, "num_input_tokens_seen": 13700400, "router_z_loss_clip": 6.84375, "router_z_loss_mlp": 0.90478516, "step": 649, "time_per_iteration": 4.092935562133789 }, { "auxiliary_loss_clip": 0.07470724, "auxiliary_loss_mlp": 0.01351685, "balance_loss_clip": 0.06771737, "balance_loss_mlp": 0.01265997, "epoch": 0.03908011423418007, "flos": 21618250997760.0, "grad_norm": 8.111674241453183, "language_loss": 0.83429939, "learning_rate": 3.999135446087263e-06, "loss": 0.92252344, "num_input_tokens_seen": 13720145, "router_z_loss_clip": 6.9921875, "router_z_loss_mlp": 0.85644531, "step": 650, "time_per_iteration": 2.6678836345672607 }, { "auxiliary_loss_clip": 0.07429118, "auxiliary_loss_mlp": 0.01346585, "balance_loss_clip": 0.06754167, "balance_loss_mlp": 0.01264235, "epoch": 0.039140237486848035, "flos": 18667406885760.0, "grad_norm": 5.262705962139183, "language_loss": 0.8328563, "learning_rate": 3.9991239579635e-06, "loss": 0.92061335, "num_input_tokens_seen": 13737500, "router_z_loss_clip": 6.75, "router_z_loss_mlp": 0.82373047, "step": 651, "time_per_iteration": 6.7488932609558105 }, { "auxiliary_loss_clip": 0.07419126, "auxiliary_loss_mlp": 0.01345627, "balance_loss_clip": 0.06750219, "balance_loss_mlp": 0.01258843, "epoch": 0.03920036073951601, "flos": 18667071469440.0, "grad_norm": 11.825813206974932, "language_loss": 0.93389237, "learning_rate": 3.999112394032757e-06, "loss": 1.02153993, "num_input_tokens_seen": 13754750, "router_z_loss_clip": 6.6953125, "router_z_loss_mlp": 0.86865234, "step": 652, "time_per_iteration": 2.6469109058380127 }, { "auxiliary_loss_clip": 0.07429454, "auxiliary_loss_mlp": 0.01359515, "balance_loss_clip": 0.06747465, "balance_loss_mlp": 0.01271157, "epoch": 0.03926048399218398, "flos": 31361918705280.0, "grad_norm": 5.749243110771883, "language_loss": 0.84714293, "learning_rate": 3.999100754295471e-06, "loss": 0.93503267, "num_input_tokens_seen": 13771990, "router_z_loss_clip": 6.8203125, "router_z_loss_mlp": 0.88330078, "step": 653, "time_per_iteration": 2.714426279067993 }, { "auxiliary_loss_clip": 0.07462627, "auxiliary_loss_mlp": 0.01356462, "balance_loss_clip": 0.06757441, "balance_loss_mlp": 0.0126758, "epoch": 0.039320607244851945, "flos": 29610715898880.0, "grad_norm": 6.720303062872896, "language_loss": 0.90553141, "learning_rate": 3.999089038752085e-06, "loss": 0.99372232, "num_input_tokens_seen": 13792750, "router_z_loss_clip": 7.046875, "router_z_loss_mlp": 0.88818359, "step": 654, "time_per_iteration": 2.7201974391937256 }, { "auxiliary_loss_clip": 0.07144373, "auxiliary_loss_mlp": 0.01360948, "balance_loss_clip": 0.06698178, "balance_loss_mlp": 0.01291425, "epoch": 0.03938073049751992, "flos": 66555362332800.0, "grad_norm": 0.7506182763362922, "language_loss": 0.50199664, "learning_rate": 3.999077247403041e-06, "loss": 0.58704984, "num_input_tokens_seen": 13858570, "router_z_loss_clip": 4.45703125, "router_z_loss_mlp": 0.69628906, "step": 655, "time_per_iteration": 3.314666509628296 }, { "auxiliary_loss_clip": 0.07371997, "auxiliary_loss_mlp": 0.0135723, "balance_loss_clip": 0.06729065, "balance_loss_mlp": 0.01280698, "epoch": 0.03944085375018788, "flos": 23374568903040.0, "grad_norm": 5.084000714734093, "language_loss": 0.84213579, "learning_rate": 3.9990653802487886e-06, "loss": 0.92942804, "num_input_tokens_seen": 13876335, "router_z_loss_clip": 6.43359375, "router_z_loss_mlp": 0.765625, "step": 656, "time_per_iteration": 2.7047102451324463 }, { "auxiliary_loss_clip": 0.07430609, "auxiliary_loss_mlp": 0.01364868, "balance_loss_clip": 0.06736114, "balance_loss_mlp": 0.01272076, "epoch": 0.039500977002855854, "flos": 18553656568320.0, "grad_norm": 3.719987700835855, "language_loss": 0.82078624, "learning_rate": 3.999053437289776e-06, "loss": 0.90874094, "num_input_tokens_seen": 13892640, "router_z_loss_clip": 6.94140625, "router_z_loss_mlp": 0.92773438, "step": 657, "time_per_iteration": 2.6856720447540283 }, { "auxiliary_loss_clip": 0.0740719, "auxiliary_loss_mlp": 0.0139627, "balance_loss_clip": 0.06738829, "balance_loss_mlp": 0.01303764, "epoch": 0.039561100255523826, "flos": 25345264279680.0, "grad_norm": 6.312653733083569, "language_loss": 0.85947692, "learning_rate": 3.999041418526457e-06, "loss": 0.94751149, "num_input_tokens_seen": 13910085, "router_z_loss_clip": 6.6953125, "router_z_loss_mlp": 0.92480469, "step": 658, "time_per_iteration": 2.756199836730957 }, { "auxiliary_loss_clip": 0.07359891, "auxiliary_loss_mlp": 0.01373684, "balance_loss_clip": 0.06707886, "balance_loss_mlp": 0.01286661, "epoch": 0.03962122350819179, "flos": 18225193363200.0, "grad_norm": 3.5943642782157763, "language_loss": 0.95807093, "learning_rate": 3.999029323959287e-06, "loss": 1.0454067, "num_input_tokens_seen": 13928800, "router_z_loss_clip": 6.5234375, "router_z_loss_mlp": 0.86914062, "step": 659, "time_per_iteration": 2.6678850650787354 }, { "auxiliary_loss_clip": 0.07396142, "auxiliary_loss_mlp": 0.01361406, "balance_loss_clip": 0.06732086, "balance_loss_mlp": 0.01274621, "epoch": 0.03968134676085976, "flos": 20528544648960.0, "grad_norm": 3.746930355050233, "language_loss": 0.82839066, "learning_rate": 3.999017153588724e-06, "loss": 0.91596615, "num_input_tokens_seen": 13948325, "router_z_loss_clip": 6.640625, "router_z_loss_mlp": 0.86816406, "step": 660, "time_per_iteration": 2.6968488693237305 }, { "auxiliary_loss_clip": 0.0738163, "auxiliary_loss_mlp": 0.01356241, "balance_loss_clip": 0.06731005, "balance_loss_mlp": 0.01267263, "epoch": 0.03974147001352773, "flos": 22429737463680.0, "grad_norm": 2.8738070808666953, "language_loss": 0.85670698, "learning_rate": 3.999004907415231e-06, "loss": 0.94408572, "num_input_tokens_seen": 13969090, "router_z_loss_clip": 6.51171875, "router_z_loss_mlp": 0.89013672, "step": 661, "time_per_iteration": 2.7147953510284424 }, { "auxiliary_loss_clip": 0.07028852, "auxiliary_loss_mlp": 0.01350387, "balance_loss_clip": 0.06596245, "balance_loss_mlp": 0.01287302, "epoch": 0.0398015932661957, "flos": 71149780281600.0, "grad_norm": 0.9008815591618644, "language_loss": 0.69765049, "learning_rate": 3.998992585439272e-06, "loss": 0.78144282, "num_input_tokens_seen": 14037555, "router_z_loss_clip": 4.3125, "router_z_loss_mlp": 0.63037109, "step": 662, "time_per_iteration": 3.471202850341797 }, { "auxiliary_loss_clip": 0.07377633, "auxiliary_loss_mlp": 0.01357908, "balance_loss_clip": 0.06709532, "balance_loss_mlp": 0.01270218, "epoch": 0.03986171651886367, "flos": 16806688392960.0, "grad_norm": 3.306023713967296, "language_loss": 0.8741495, "learning_rate": 3.998980187661314e-06, "loss": 0.96150494, "num_input_tokens_seen": 14055765, "router_z_loss_clip": 6.69140625, "router_z_loss_mlp": 0.87744141, "step": 663, "time_per_iteration": 2.673849582672119 }, { "auxiliary_loss_clip": 0.07397956, "auxiliary_loss_mlp": 0.01354475, "balance_loss_clip": 0.06713245, "balance_loss_mlp": 0.01264114, "epoch": 0.03992183977153164, "flos": 24541953586560.0, "grad_norm": 46.644085920607594, "language_loss": 0.9070189, "learning_rate": 3.998967714081826e-06, "loss": 0.99454314, "num_input_tokens_seen": 14074195, "router_z_loss_clip": 6.85546875, "router_z_loss_mlp": 0.90429688, "step": 664, "time_per_iteration": 2.6985132694244385 }, { "auxiliary_loss_clip": 0.07324643, "auxiliary_loss_mlp": 0.01346049, "balance_loss_clip": 0.06696418, "balance_loss_mlp": 0.01268038, "epoch": 0.03998196302419961, "flos": 15601261155840.0, "grad_norm": 8.322105760090134, "language_loss": 0.88795483, "learning_rate": 3.998955164701281e-06, "loss": 0.97466171, "num_input_tokens_seen": 14090215, "router_z_loss_clip": 6.28515625, "router_z_loss_mlp": 0.78027344, "step": 665, "time_per_iteration": 2.615748405456543 }, { "auxiliary_loss_clip": 0.07373211, "auxiliary_loss_mlp": 0.01361759, "balance_loss_clip": 0.06692147, "balance_loss_mlp": 0.01273068, "epoch": 0.04004208627686758, "flos": 25312714168320.0, "grad_norm": 7.375834132736909, "language_loss": 0.84860396, "learning_rate": 3.998942539520158e-06, "loss": 0.93595368, "num_input_tokens_seen": 14112150, "router_z_loss_clip": 6.8203125, "router_z_loss_mlp": 0.88671875, "step": 666, "time_per_iteration": 2.75258469581604 }, { "auxiliary_loss_clip": 0.07372509, "auxiliary_loss_mlp": 0.01360949, "balance_loss_clip": 0.067082, "balance_loss_mlp": 0.01271351, "epoch": 0.04010220952953555, "flos": 23482365580800.0, "grad_norm": 2.8841042179704637, "language_loss": 0.89869118, "learning_rate": 3.998929838538932e-06, "loss": 0.98602581, "num_input_tokens_seen": 14131475, "router_z_loss_clip": 6.64453125, "router_z_loss_mlp": 0.89599609, "step": 667, "time_per_iteration": 2.6780028343200684 }, { "auxiliary_loss_clip": 0.07330574, "auxiliary_loss_mlp": 0.01353623, "balance_loss_clip": 0.06689723, "balance_loss_mlp": 0.01266791, "epoch": 0.04016233278220352, "flos": 18621691683840.0, "grad_norm": 3.256336816133801, "language_loss": 0.85285783, "learning_rate": 3.998917061758087e-06, "loss": 0.93969977, "num_input_tokens_seen": 14146165, "router_z_loss_clip": 6.41015625, "router_z_loss_mlp": 0.86816406, "step": 668, "time_per_iteration": 2.6417434215545654 }, { "auxiliary_loss_clip": 0.06974136, "auxiliary_loss_mlp": 0.01329121, "balance_loss_clip": 0.06558014, "balance_loss_mlp": 0.01281818, "epoch": 0.040222456034871484, "flos": 70926556204800.0, "grad_norm": 2.2676143725819404, "language_loss": 0.61240685, "learning_rate": 3.998904209178107e-06, "loss": 0.69543946, "num_input_tokens_seen": 14215005, "router_z_loss_clip": 4.15625, "router_z_loss_mlp": 0.47265625, "step": 669, "time_per_iteration": 3.360527753829956 }, { "auxiliary_loss_clip": 0.07373828, "auxiliary_loss_mlp": 0.01400177, "balance_loss_clip": 0.06695002, "balance_loss_mlp": 0.01306431, "epoch": 0.040282579287539456, "flos": 23770773734400.0, "grad_norm": 16.424660152172752, "language_loss": 0.90356255, "learning_rate": 3.9988912807994785e-06, "loss": 0.99130261, "num_input_tokens_seen": 14235510, "router_z_loss_clip": 6.80078125, "router_z_loss_mlp": 0.9375, "step": 670, "time_per_iteration": 2.6873295307159424 }, { "auxiliary_loss_clip": 0.07362077, "auxiliary_loss_mlp": 0.01395872, "balance_loss_clip": 0.06699031, "balance_loss_mlp": 0.01301506, "epoch": 0.04034270254020743, "flos": 18484405568640.0, "grad_norm": 3.444380599471723, "language_loss": 0.79433954, "learning_rate": 3.998878276622692e-06, "loss": 0.88191903, "num_input_tokens_seen": 14254565, "router_z_loss_clip": 6.6328125, "router_z_loss_mlp": 0.94287109, "step": 671, "time_per_iteration": 2.7156693935394287 }, { "auxiliary_loss_clip": 0.0735767, "auxiliary_loss_mlp": 0.01397573, "balance_loss_clip": 0.06697436, "balance_loss_mlp": 0.01301394, "epoch": 0.040402825792875394, "flos": 17207589052800.0, "grad_norm": 10.329070323900924, "language_loss": 0.96831805, "learning_rate": 3.998865196648242e-06, "loss": 1.05587041, "num_input_tokens_seen": 14271885, "router_z_loss_clip": 6.609375, "router_z_loss_mlp": 0.96142578, "step": 672, "time_per_iteration": 2.639333486557007 }, { "auxiliary_loss_clip": 0.07386683, "auxiliary_loss_mlp": 0.01429563, "balance_loss_clip": 0.06701782, "balance_loss_mlp": 0.01323705, "epoch": 0.040462949045543366, "flos": 19178242502400.0, "grad_norm": 13.220399491648788, "language_loss": 0.9287594, "learning_rate": 3.998852040876622e-06, "loss": 1.01692188, "num_input_tokens_seen": 14289670, "router_z_loss_clip": 6.85546875, "router_z_loss_mlp": 1.05712891, "step": 673, "time_per_iteration": 2.6545166969299316 }, { "auxiliary_loss_clip": 0.07380789, "auxiliary_loss_mlp": 0.01451452, "balance_loss_clip": 0.06718417, "balance_loss_mlp": 0.01341637, "epoch": 0.04052307229821133, "flos": 24025877089920.0, "grad_norm": 3.2586096608913384, "language_loss": 0.79200137, "learning_rate": 3.998838809308334e-06, "loss": 0.88032377, "num_input_tokens_seen": 14309285, "router_z_loss_clip": 6.62890625, "router_z_loss_mlp": 1.09716797, "step": 674, "time_per_iteration": 2.68906569480896 }, { "auxiliary_loss_clip": 0.07417461, "auxiliary_loss_mlp": 0.01471202, "balance_loss_clip": 0.06718877, "balance_loss_mlp": 0.01360433, "epoch": 0.0405831955508793, "flos": 16442362840320.0, "grad_norm": 3.8001504766035032, "language_loss": 0.83278131, "learning_rate": 3.9988255019438766e-06, "loss": 0.92166799, "num_input_tokens_seen": 14328300, "router_z_loss_clip": 6.984375, "router_z_loss_mlp": 1.10839844, "step": 675, "time_per_iteration": 2.6681389808654785 }, { "auxiliary_loss_clip": 0.07390545, "auxiliary_loss_mlp": 0.01468283, "balance_loss_clip": 0.06715566, "balance_loss_mlp": 0.01359755, "epoch": 0.040643318803547275, "flos": 24286808304000.0, "grad_norm": 2.8161475012361024, "language_loss": 0.80486161, "learning_rate": 3.998812118783757e-06, "loss": 0.8934499, "num_input_tokens_seen": 14346395, "router_z_loss_clip": 6.7578125, "router_z_loss_mlp": 1.08349609, "step": 676, "time_per_iteration": 2.6935548782348633 }, { "auxiliary_loss_clip": 0.07404241, "auxiliary_loss_mlp": 0.01473558, "balance_loss_clip": 0.06713233, "balance_loss_mlp": 0.01357115, "epoch": 0.04070344205621524, "flos": 17717795763840.0, "grad_norm": 3.883116562929528, "language_loss": 0.89649886, "learning_rate": 3.9987986598284804e-06, "loss": 0.98527682, "num_input_tokens_seen": 14364605, "router_z_loss_clip": 6.91015625, "router_z_loss_mlp": 1.16503906, "step": 677, "time_per_iteration": 2.631305694580078 }, { "auxiliary_loss_clip": 0.07385714, "auxiliary_loss_mlp": 0.01461402, "balance_loss_clip": 0.06718226, "balance_loss_mlp": 0.01353256, "epoch": 0.04076356530888321, "flos": 26184940444800.0, "grad_norm": 5.142148723106733, "language_loss": 0.80289865, "learning_rate": 3.998785125078559e-06, "loss": 0.8913697, "num_input_tokens_seen": 14385265, "router_z_loss_clip": 6.67578125, "router_z_loss_mlp": 1.08154297, "step": 678, "time_per_iteration": 2.6865859031677246 }, { "auxiliary_loss_clip": 0.07363702, "auxiliary_loss_mlp": 0.01436573, "balance_loss_clip": 0.06700337, "balance_loss_mlp": 0.01339775, "epoch": 0.04082368856155118, "flos": 35782349650560.0, "grad_norm": 3.536723941978954, "language_loss": 0.85626304, "learning_rate": 3.998771514534505e-06, "loss": 0.94426578, "num_input_tokens_seen": 14406090, "router_z_loss_clip": 6.6328125, "router_z_loss_mlp": 0.96728516, "step": 679, "time_per_iteration": 2.7917263507843018 }, { "auxiliary_loss_clip": 0.07330166, "auxiliary_loss_mlp": 0.01418035, "balance_loss_clip": 0.06688674, "balance_loss_mlp": 0.01332872, "epoch": 0.04088381181421915, "flos": 28154042593920.0, "grad_norm": 2.92608992456938, "language_loss": 0.80778247, "learning_rate": 3.998757828196835e-06, "loss": 0.89526439, "num_input_tokens_seen": 14425130, "router_z_loss_clip": 6.42578125, "router_z_loss_mlp": 0.85107422, "step": 680, "time_per_iteration": 2.72206711769104 }, { "auxiliary_loss_clip": 0.07375951, "auxiliary_loss_mlp": 0.01405667, "balance_loss_clip": 0.06709567, "balance_loss_mlp": 0.01312207, "epoch": 0.04094393506688712, "flos": 27604703226240.0, "grad_norm": 2.555892166534676, "language_loss": 0.86239433, "learning_rate": 3.9987440660660685e-06, "loss": 0.95021045, "num_input_tokens_seen": 14447355, "router_z_loss_clip": 6.66796875, "router_z_loss_mlp": 0.93505859, "step": 681, "time_per_iteration": 2.823004722595215 }, { "auxiliary_loss_clip": 0.07332181, "auxiliary_loss_mlp": 0.01401215, "balance_loss_clip": 0.06682517, "balance_loss_mlp": 0.0131052, "epoch": 0.04100405831955509, "flos": 23118668933760.0, "grad_norm": 2.987432461092481, "language_loss": 0.75663465, "learning_rate": 3.998730228142726e-06, "loss": 0.84396863, "num_input_tokens_seen": 14466790, "router_z_loss_clip": 6.49609375, "router_z_loss_mlp": 0.90673828, "step": 682, "time_per_iteration": 2.711036205291748 }, { "auxiliary_loss_clip": 0.07326129, "auxiliary_loss_mlp": 0.01388919, "balance_loss_clip": 0.06689399, "balance_loss_mlp": 0.01310622, "epoch": 0.04106418157222306, "flos": 20162877431040.0, "grad_norm": 3.9770067562910323, "language_loss": 0.76284611, "learning_rate": 3.998716314427333e-06, "loss": 0.84999663, "num_input_tokens_seen": 14485195, "router_z_loss_clip": 6.359375, "router_z_loss_mlp": 0.78320312, "step": 683, "time_per_iteration": 2.6761462688446045 }, { "auxiliary_loss_clip": 0.07315259, "auxiliary_loss_mlp": 0.01399435, "balance_loss_clip": 0.06681891, "balance_loss_mlp": 0.0131494, "epoch": 0.041124304824891024, "flos": 17426452717440.0, "grad_norm": 4.391258367189773, "language_loss": 0.85315913, "learning_rate": 3.998702324920417e-06, "loss": 0.94030607, "num_input_tokens_seen": 14503370, "router_z_loss_clip": 6.3359375, "router_z_loss_mlp": 0.84472656, "step": 684, "time_per_iteration": 2.66725754737854 }, { "auxiliary_loss_clip": 0.07311833, "auxiliary_loss_mlp": 0.0140251, "balance_loss_clip": 0.06688397, "balance_loss_mlp": 0.01319445, "epoch": 0.041184428077558996, "flos": 25788022853760.0, "grad_norm": 2.5470475149872285, "language_loss": 0.93604738, "learning_rate": 3.9986882596225085e-06, "loss": 1.02319074, "num_input_tokens_seen": 14526415, "router_z_loss_clip": 6.2421875, "router_z_loss_mlp": 0.83056641, "step": 685, "time_per_iteration": 2.7492470741271973 }, { "auxiliary_loss_clip": 0.07336061, "auxiliary_loss_mlp": 0.0141696, "balance_loss_clip": 0.06692488, "balance_loss_mlp": 0.01324025, "epoch": 0.04124455133022697, "flos": 22971152620800.0, "grad_norm": 4.405222018626464, "language_loss": 0.92711079, "learning_rate": 3.998674118534141e-06, "loss": 1.01464105, "num_input_tokens_seen": 14546595, "router_z_loss_clip": 6.4453125, "router_z_loss_mlp": 0.92822266, "step": 686, "time_per_iteration": 2.674330949783325 }, { "auxiliary_loss_clip": 0.07323459, "auxiliary_loss_mlp": 0.0140661, "balance_loss_clip": 0.06682873, "balance_loss_mlp": 0.0132016, "epoch": 0.04130467458289493, "flos": 21295615651200.0, "grad_norm": 3.3619155638303777, "language_loss": 0.75051665, "learning_rate": 3.998659901655851e-06, "loss": 0.83781737, "num_input_tokens_seen": 14566590, "router_z_loss_clip": 6.4140625, "router_z_loss_mlp": 0.86376953, "step": 687, "time_per_iteration": 2.675309658050537 }, { "auxiliary_loss_clip": 0.07296701, "auxiliary_loss_mlp": 0.01400046, "balance_loss_clip": 0.06680067, "balance_loss_mlp": 0.01319508, "epoch": 0.041364797835562905, "flos": 19980337311360.0, "grad_norm": 6.0874472120101055, "language_loss": 0.89916396, "learning_rate": 3.998645608988177e-06, "loss": 0.98613137, "num_input_tokens_seen": 14585965, "router_z_loss_clip": 6.17578125, "router_z_loss_mlp": 0.80517578, "step": 688, "time_per_iteration": 4.084171295166016 }, { "auxiliary_loss_clip": 0.07271288, "auxiliary_loss_mlp": 0.01389412, "balance_loss_clip": 0.06662028, "balance_loss_mlp": 0.01313309, "epoch": 0.04142492108823087, "flos": 21912361228800.0, "grad_norm": 4.747338227032698, "language_loss": 0.86950773, "learning_rate": 3.998631240531661e-06, "loss": 0.95611471, "num_input_tokens_seen": 14606015, "router_z_loss_clip": 6.09375, "router_z_loss_mlp": 0.76171875, "step": 689, "time_per_iteration": 2.6739342212677 }, { "auxiliary_loss_clip": 0.07253118, "auxiliary_loss_mlp": 0.01370183, "balance_loss_clip": 0.06643637, "balance_loss_mlp": 0.01293794, "epoch": 0.04148504434089884, "flos": 27647567389440.0, "grad_norm": 4.910108572604327, "language_loss": 0.71716589, "learning_rate": 3.998616796286848e-06, "loss": 0.80339885, "num_input_tokens_seen": 14629955, "router_z_loss_clip": 6.08984375, "router_z_loss_mlp": 0.76367188, "step": 690, "time_per_iteration": 4.18480920791626 }, { "auxiliary_loss_clip": 0.07223171, "auxiliary_loss_mlp": 0.0135986, "balance_loss_clip": 0.06622484, "balance_loss_mlp": 0.0128595, "epoch": 0.041545167593566815, "flos": 20524058455680.0, "grad_norm": 4.352749052755786, "language_loss": 0.78210062, "learning_rate": 3.998602276254286e-06, "loss": 0.86793089, "num_input_tokens_seen": 14648000, "router_z_loss_clip": 6.0078125, "router_z_loss_mlp": 0.73925781, "step": 691, "time_per_iteration": 5.516546964645386 }, { "auxiliary_loss_clip": 0.07214737, "auxiliary_loss_mlp": 0.01351047, "balance_loss_clip": 0.06620452, "balance_loss_mlp": 0.01280237, "epoch": 0.04160529084623478, "flos": 11872738500480.0, "grad_norm": 200.9885754114551, "language_loss": 0.87282997, "learning_rate": 3.998587680434526e-06, "loss": 0.95848787, "num_input_tokens_seen": 14662235, "router_z_loss_clip": 5.9453125, "router_z_loss_mlp": 0.70849609, "step": 692, "time_per_iteration": 2.6082451343536377 }, { "auxiliary_loss_clip": 0.07247378, "auxiliary_loss_mlp": 0.01351947, "balance_loss_clip": 0.06612961, "balance_loss_mlp": 0.01272125, "epoch": 0.04166541409890275, "flos": 14833309685760.0, "grad_norm": 28.94680779962916, "language_loss": 0.93099695, "learning_rate": 3.99857300882812e-06, "loss": 1.01699018, "num_input_tokens_seen": 14676065, "router_z_loss_clip": 6.34765625, "router_z_loss_mlp": 0.79833984, "step": 693, "time_per_iteration": 2.596867322921753 }, { "auxiliary_loss_clip": 0.07215904, "auxiliary_loss_mlp": 0.01332998, "balance_loss_clip": 0.06603514, "balance_loss_mlp": 0.01258039, "epoch": 0.04172553735157072, "flos": 25814577398400.0, "grad_norm": 7.90246841627214, "language_loss": 0.86524689, "learning_rate": 3.998558261435626e-06, "loss": 0.95073593, "num_input_tokens_seen": 14694955, "router_z_loss_clip": 6.125, "router_z_loss_mlp": 0.74951172, "step": 694, "time_per_iteration": 2.678952217102051 }, { "auxiliary_loss_clip": 0.07228087, "auxiliary_loss_mlp": 0.01332304, "balance_loss_clip": 0.06598382, "balance_loss_mlp": 0.01255867, "epoch": 0.04178566060423869, "flos": 24286682522880.0, "grad_norm": 3.7451308144723816, "language_loss": 0.88445711, "learning_rate": 3.9985434382576015e-06, "loss": 0.970061, "num_input_tokens_seen": 14715510, "router_z_loss_clip": 6.296875, "router_z_loss_mlp": 0.76464844, "step": 695, "time_per_iteration": 2.695157527923584 }, { "auxiliary_loss_clip": 0.07187828, "auxiliary_loss_mlp": 0.01340377, "balance_loss_clip": 0.06586383, "balance_loss_mlp": 0.01262652, "epoch": 0.04184578385690666, "flos": 18227667058560.0, "grad_norm": 11.072216860254212, "language_loss": 0.89863694, "learning_rate": 3.99852853929461e-06, "loss": 0.98391896, "num_input_tokens_seen": 14731755, "router_z_loss_clip": 6.015625, "router_z_loss_mlp": 0.77734375, "step": 696, "time_per_iteration": 2.633845329284668 }, { "auxiliary_loss_clip": 0.07179459, "auxiliary_loss_mlp": 0.01350006, "balance_loss_clip": 0.06581238, "balance_loss_mlp": 0.0127109, "epoch": 0.041905907109574626, "flos": 22781694539520.0, "grad_norm": 3.4698136389342866, "language_loss": 0.96004307, "learning_rate": 3.998513564547216e-06, "loss": 1.04533768, "num_input_tokens_seen": 14750810, "router_z_loss_clip": 5.98828125, "router_z_loss_mlp": 0.7890625, "step": 697, "time_per_iteration": 2.6711642742156982 }, { "auxiliary_loss_clip": 0.07162187, "auxiliary_loss_mlp": 0.01357329, "balance_loss_clip": 0.06565037, "balance_loss_mlp": 0.01278651, "epoch": 0.0419660303622426, "flos": 20163128993280.0, "grad_norm": 9.785842078350347, "language_loss": 0.88087916, "learning_rate": 3.998498514015987e-06, "loss": 0.96607435, "num_input_tokens_seen": 14768435, "router_z_loss_clip": 5.9765625, "router_z_loss_mlp": 0.78710938, "step": 698, "time_per_iteration": 2.646597146987915 }, { "auxiliary_loss_clip": 0.07194079, "auxiliary_loss_mlp": 0.01347491, "balance_loss_clip": 0.06591865, "balance_loss_mlp": 0.01269815, "epoch": 0.042026153614910564, "flos": 23083142002560.0, "grad_norm": 8.653372791471378, "language_loss": 0.95163286, "learning_rate": 3.998483387701495e-06, "loss": 1.03704858, "num_input_tokens_seen": 14786690, "router_z_loss_clip": 6.02734375, "router_z_loss_mlp": 0.77734375, "step": 699, "time_per_iteration": 2.6465659141540527 }, { "auxiliary_loss_clip": 0.06880753, "auxiliary_loss_mlp": 0.01476162, "balance_loss_clip": 0.0646016, "balance_loss_mlp": 0.01394336, "epoch": 0.042086276867578536, "flos": 64516296424320.0, "grad_norm": 1.588629406214863, "language_loss": 0.68719864, "learning_rate": 3.998468185604312e-06, "loss": 0.77076787, "num_input_tokens_seen": 14853840, "router_z_loss_clip": 4.1953125, "router_z_loss_mlp": 0.81835938, "step": 700, "time_per_iteration": 3.2867801189422607 }, { "auxiliary_loss_clip": 0.07191842, "auxiliary_loss_mlp": 0.0135334, "balance_loss_clip": 0.06580989, "balance_loss_mlp": 0.0127285, "epoch": 0.04214640012024651, "flos": 15492458229120.0, "grad_norm": 634.3715616056356, "language_loss": 0.92532325, "learning_rate": 3.998452907725016e-06, "loss": 1.01077509, "num_input_tokens_seen": 14869580, "router_z_loss_clip": 6.109375, "router_z_loss_mlp": 0.80517578, "step": 701, "time_per_iteration": 2.6180334091186523 }, { "auxiliary_loss_clip": 0.07175699, "auxiliary_loss_mlp": 0.01360469, "balance_loss_clip": 0.06568789, "balance_loss_mlp": 0.0128346, "epoch": 0.04220652337291447, "flos": 23883601656960.0, "grad_norm": 9.816585894531626, "language_loss": 0.71508706, "learning_rate": 3.998437554064184e-06, "loss": 0.80044878, "num_input_tokens_seen": 14891065, "router_z_loss_clip": 6.078125, "router_z_loss_mlp": 0.77050781, "step": 702, "time_per_iteration": 2.681915283203125 }, { "auxiliary_loss_clip": 0.06853853, "auxiliary_loss_mlp": 0.01429968, "balance_loss_clip": 0.064373, "balance_loss_mlp": 0.01355153, "epoch": 0.042266646625582445, "flos": 63815289966720.0, "grad_norm": 0.9021395245332886, "language_loss": 0.61408651, "learning_rate": 3.9984221246224006e-06, "loss": 0.69692469, "num_input_tokens_seen": 14954815, "router_z_loss_clip": 4.17578125, "router_z_loss_mlp": 0.74658203, "step": 703, "time_per_iteration": 3.3909740447998047 }, { "auxiliary_loss_clip": 0.06846362, "auxiliary_loss_mlp": 0.01398504, "balance_loss_clip": 0.0643204, "balance_loss_mlp": 0.01333749, "epoch": 0.04232676987825041, "flos": 50038912154880.0, "grad_norm": 1.0743039389319724, "language_loss": 0.58221889, "learning_rate": 3.9984066194002494e-06, "loss": 0.66466755, "num_input_tokens_seen": 15003050, "router_z_loss_clip": 4.125, "router_z_loss_mlp": 0.64746094, "step": 704, "time_per_iteration": 3.1630377769470215 }, { "auxiliary_loss_clip": 0.07194979, "auxiliary_loss_mlp": 0.01385648, "balance_loss_clip": 0.06572917, "balance_loss_mlp": 0.01304109, "epoch": 0.04238689313091838, "flos": 21622485628800.0, "grad_norm": 4.177662397101252, "language_loss": 0.9254896, "learning_rate": 3.998391038398319e-06, "loss": 1.0112958, "num_input_tokens_seen": 15021990, "router_z_loss_clip": 6.21875, "router_z_loss_mlp": 0.81591797, "step": 705, "time_per_iteration": 2.690131664276123 }, { "auxiliary_loss_clip": 0.07169293, "auxiliary_loss_mlp": 0.01378754, "balance_loss_clip": 0.06584113, "balance_loss_mlp": 0.01303414, "epoch": 0.042447016383586354, "flos": 19141080416640.0, "grad_norm": 7.313077397546701, "language_loss": 0.74796784, "learning_rate": 3.998375381617201e-06, "loss": 0.83344829, "num_input_tokens_seen": 15040700, "router_z_loss_clip": 5.85546875, "router_z_loss_mlp": 0.75390625, "step": 706, "time_per_iteration": 2.69670033454895 }, { "auxiliary_loss_clip": 0.07180236, "auxiliary_loss_mlp": 0.01364164, "balance_loss_clip": 0.06569722, "balance_loss_mlp": 0.01289444, "epoch": 0.04250713963625432, "flos": 24432941024640.0, "grad_norm": 5.8289117592692365, "language_loss": 0.96679974, "learning_rate": 3.9983596490574875e-06, "loss": 1.05224371, "num_input_tokens_seen": 15056725, "router_z_loss_clip": 6.10546875, "router_z_loss_mlp": 0.74755859, "step": 707, "time_per_iteration": 2.707656145095825 }, { "auxiliary_loss_clip": 0.07175466, "auxiliary_loss_mlp": 0.01362336, "balance_loss_clip": 0.06573546, "balance_loss_mlp": 0.01285947, "epoch": 0.04256726288892229, "flos": 30374348883840.0, "grad_norm": 4.004513089254729, "language_loss": 0.84717977, "learning_rate": 3.998343840719776e-06, "loss": 0.93255776, "num_input_tokens_seen": 15077550, "router_z_loss_clip": 6.01953125, "router_z_loss_mlp": 0.76416016, "step": 708, "time_per_iteration": 2.7134242057800293 }, { "auxiliary_loss_clip": 0.07207081, "auxiliary_loss_mlp": 0.01370728, "balance_loss_clip": 0.0657872, "balance_loss_mlp": 0.01290238, "epoch": 0.04262738614159026, "flos": 16368248304000.0, "grad_norm": 335.80035326799117, "language_loss": 0.87784767, "learning_rate": 3.998327956604666e-06, "loss": 0.96362579, "num_input_tokens_seen": 15094955, "router_z_loss_clip": 6.29296875, "router_z_loss_mlp": 0.8046875, "step": 709, "time_per_iteration": 2.6286845207214355 }, { "auxiliary_loss_clip": 0.07206431, "auxiliary_loss_mlp": 0.01401239, "balance_loss_clip": 0.06574333, "balance_loss_mlp": 0.01312452, "epoch": 0.04268750939425823, "flos": 20418609692160.0, "grad_norm": 6.057123721866422, "language_loss": 0.90213031, "learning_rate": 3.99831199671276e-06, "loss": 0.98820698, "num_input_tokens_seen": 15113395, "router_z_loss_clip": 6.3203125, "router_z_loss_mlp": 0.88818359, "step": 710, "time_per_iteration": 2.6308538913726807 }, { "auxiliary_loss_clip": 0.07223485, "auxiliary_loss_mlp": 0.01386403, "balance_loss_clip": 0.06588341, "balance_loss_mlp": 0.013072, "epoch": 0.0427476326469262, "flos": 20309177859840.0, "grad_norm": 8.470829770726338, "language_loss": 0.87846339, "learning_rate": 3.998295961044662e-06, "loss": 0.9645623, "num_input_tokens_seen": 15132920, "router_z_loss_clip": 6.34375, "router_z_loss_mlp": 0.79199219, "step": 711, "time_per_iteration": 2.6699259281158447 }, { "auxiliary_loss_clip": 0.07217565, "auxiliary_loss_mlp": 0.01380838, "balance_loss_clip": 0.06590888, "balance_loss_mlp": 0.01298202, "epoch": 0.042807755899594166, "flos": 21656880529920.0, "grad_norm": 3.6944495335285072, "language_loss": 0.88986564, "learning_rate": 3.9982798496009804e-06, "loss": 0.97584969, "num_input_tokens_seen": 15153115, "router_z_loss_clip": 6.265625, "router_z_loss_mlp": 0.82617188, "step": 712, "time_per_iteration": 2.6324374675750732 }, { "auxiliary_loss_clip": 0.07249977, "auxiliary_loss_mlp": 0.01413545, "balance_loss_clip": 0.06592557, "balance_loss_mlp": 0.01326618, "epoch": 0.04286787915226214, "flos": 21441580663680.0, "grad_norm": 19.34542890832792, "language_loss": 0.9462533, "learning_rate": 3.998263662382328e-06, "loss": 1.03288853, "num_input_tokens_seen": 15172770, "router_z_loss_clip": 6.578125, "router_z_loss_mlp": 0.86962891, "step": 713, "time_per_iteration": 2.657372236251831 }, { "auxiliary_loss_clip": 0.06820129, "auxiliary_loss_mlp": 0.01353692, "balance_loss_clip": 0.06429406, "balance_loss_mlp": 0.01310968, "epoch": 0.04292800240493011, "flos": 66420256423680.0, "grad_norm": 0.9494160974161034, "language_loss": 0.64562583, "learning_rate": 3.9982473993893165e-06, "loss": 0.72736406, "num_input_tokens_seen": 15240055, "router_z_loss_clip": 3.90625, "router_z_loss_mlp": 0.42773438, "step": 714, "time_per_iteration": 3.343198776245117 }, { "auxiliary_loss_clip": 0.07197013, "auxiliary_loss_mlp": 0.0140494, "balance_loss_clip": 0.06572799, "balance_loss_mlp": 0.01317917, "epoch": 0.042988125657598075, "flos": 31658418777600.0, "grad_norm": 7.67230274987649, "language_loss": 0.77982134, "learning_rate": 3.998231060622563e-06, "loss": 0.86584079, "num_input_tokens_seen": 15261585, "router_z_loss_clip": 6.25, "router_z_loss_mlp": 0.87011719, "step": 715, "time_per_iteration": 2.7537152767181396 }, { "auxiliary_loss_clip": 0.07213086, "auxiliary_loss_mlp": 0.0139997, "balance_loss_clip": 0.06575428, "balance_loss_mlp": 0.0130775, "epoch": 0.04304824891026605, "flos": 33255690433920.0, "grad_norm": 5.897897905906545, "language_loss": 0.76060808, "learning_rate": 3.998214646082688e-06, "loss": 0.84673858, "num_input_tokens_seen": 15281160, "router_z_loss_clip": 6.38671875, "router_z_loss_mlp": 0.921875, "step": 716, "time_per_iteration": 2.7578506469726562 }, { "auxiliary_loss_clip": 0.06823374, "auxiliary_loss_mlp": 0.01322027, "balance_loss_clip": 0.06432808, "balance_loss_mlp": 0.01280661, "epoch": 0.04310837216293401, "flos": 64086996430080.0, "grad_norm": 0.9325816286992883, "language_loss": 0.66020185, "learning_rate": 3.998198155770314e-06, "loss": 0.74165589, "num_input_tokens_seen": 15344505, "router_z_loss_clip": 3.90625, "router_z_loss_mlp": 0.41381836, "step": 717, "time_per_iteration": 3.3362181186676025 }, { "auxiliary_loss_clip": 0.06808876, "auxiliary_loss_mlp": 0.01303503, "balance_loss_clip": 0.06421348, "balance_loss_mlp": 0.01265476, "epoch": 0.043168495415601985, "flos": 61361990599680.0, "grad_norm": 0.9961514622775219, "language_loss": 0.58829898, "learning_rate": 3.998181589686065e-06, "loss": 0.66942275, "num_input_tokens_seen": 15404050, "router_z_loss_clip": 3.875, "router_z_loss_mlp": 0.37963867, "step": 718, "time_per_iteration": 3.124319314956665 }, { "auxiliary_loss_clip": 0.07231366, "auxiliary_loss_mlp": 0.01414728, "balance_loss_clip": 0.06576821, "balance_loss_mlp": 0.01314068, "epoch": 0.04322861866826996, "flos": 20710539717120.0, "grad_norm": 6.90854826368016, "language_loss": 0.95492548, "learning_rate": 3.99816494783057e-06, "loss": 1.04138637, "num_input_tokens_seen": 15424190, "router_z_loss_clip": 6.546875, "router_z_loss_mlp": 1.00683594, "step": 719, "time_per_iteration": 2.6854662895202637 }, { "auxiliary_loss_clip": 0.07253273, "auxiliary_loss_mlp": 0.01418078, "balance_loss_clip": 0.06590989, "balance_loss_mlp": 0.01308072, "epoch": 0.04328874192093792, "flos": 30381308772480.0, "grad_norm": 3.2709432042170734, "language_loss": 0.69539535, "learning_rate": 3.99814823020446e-06, "loss": 0.7821089, "num_input_tokens_seen": 15446500, "router_z_loss_clip": 6.625, "router_z_loss_mlp": 1.10009766, "step": 720, "time_per_iteration": 2.729779005050659 }, { "auxiliary_loss_clip": 0.07239585, "auxiliary_loss_mlp": 0.01408532, "balance_loss_clip": 0.06587559, "balance_loss_mlp": 0.01297762, "epoch": 0.043348865173605894, "flos": 21951284250240.0, "grad_norm": 26.86432578519503, "language_loss": 0.80358607, "learning_rate": 3.9981314368083684e-06, "loss": 0.89006728, "num_input_tokens_seen": 15465830, "router_z_loss_clip": 6.5234375, "router_z_loss_mlp": 1.10742188, "step": 721, "time_per_iteration": 2.6972463130950928 }, { "auxiliary_loss_clip": 0.07242912, "auxiliary_loss_mlp": 0.01399921, "balance_loss_clip": 0.06589278, "balance_loss_mlp": 0.01307939, "epoch": 0.04340898842627386, "flos": 15268982590080.0, "grad_norm": 7.094631199749295, "language_loss": 0.91378009, "learning_rate": 3.998114567642933e-06, "loss": 1.0002085, "num_input_tokens_seen": 15479985, "router_z_loss_clip": 6.5390625, "router_z_loss_mlp": 0.91943359, "step": 722, "time_per_iteration": 2.6186699867248535 }, { "auxiliary_loss_clip": 0.0725567, "auxiliary_loss_mlp": 0.01418433, "balance_loss_clip": 0.06579642, "balance_loss_mlp": 0.01310286, "epoch": 0.04346911167894183, "flos": 27973011847680.0, "grad_norm": 14.063074635347425, "language_loss": 0.88646364, "learning_rate": 3.998097622708792e-06, "loss": 0.97320461, "num_input_tokens_seen": 15501545, "router_z_loss_clip": 6.765625, "router_z_loss_mlp": 1.08007812, "step": 723, "time_per_iteration": 2.7172467708587646 }, { "auxiliary_loss_clip": 0.07234006, "auxiliary_loss_mlp": 0.01393074, "balance_loss_clip": 0.06584681, "balance_loss_mlp": 0.01295179, "epoch": 0.0435292349316098, "flos": 29249954144640.0, "grad_norm": 4.937752141162674, "language_loss": 0.8588869, "learning_rate": 3.99808060200659e-06, "loss": 0.94515777, "num_input_tokens_seen": 15521725, "router_z_loss_clip": 6.5, "router_z_loss_mlp": 0.97949219, "step": 724, "time_per_iteration": 2.7269935607910156 }, { "auxiliary_loss_clip": 0.07204686, "auxiliary_loss_mlp": 0.0138342, "balance_loss_clip": 0.06558691, "balance_loss_mlp": 0.01287575, "epoch": 0.04358935818427777, "flos": 20564616631680.0, "grad_norm": 3.7588074921290553, "language_loss": 0.83413696, "learning_rate": 3.998063505536971e-06, "loss": 0.92001796, "num_input_tokens_seen": 15540910, "router_z_loss_clip": 6.46875, "router_z_loss_mlp": 0.95898438, "step": 725, "time_per_iteration": 2.6647160053253174 }, { "auxiliary_loss_clip": 0.07255159, "auxiliary_loss_mlp": 0.01410391, "balance_loss_clip": 0.06579678, "balance_loss_mlp": 0.01306632, "epoch": 0.04364948143694574, "flos": 14470116163200.0, "grad_norm": 6.548900102375731, "language_loss": 0.9214775, "learning_rate": 3.998046333300584e-06, "loss": 1.00813293, "num_input_tokens_seen": 15558640, "router_z_loss_clip": 6.75390625, "router_z_loss_mlp": 1.03662109, "step": 726, "time_per_iteration": 2.6802375316619873 }, { "auxiliary_loss_clip": 0.0683739, "auxiliary_loss_mlp": 0.01430608, "balance_loss_clip": 0.06452459, "balance_loss_mlp": 0.01387979, "epoch": 0.043709604689613706, "flos": 50083216565760.0, "grad_norm": 1.0878600025413527, "language_loss": 0.56122434, "learning_rate": 3.998029085298079e-06, "loss": 0.64390433, "num_input_tokens_seen": 15612975, "router_z_loss_clip": 3.84375, "router_z_loss_mlp": 0.42675781, "step": 727, "time_per_iteration": 3.397329092025757 }, { "auxiliary_loss_clip": 0.07230257, "auxiliary_loss_mlp": 0.01385252, "balance_loss_clip": 0.0656419, "balance_loss_mlp": 0.01297896, "epoch": 0.04376972794228168, "flos": 13996861902720.0, "grad_norm": 4.218442674724064, "language_loss": 0.85789305, "learning_rate": 3.998011761530112e-06, "loss": 0.94404817, "num_input_tokens_seen": 15631070, "router_z_loss_clip": 6.66015625, "router_z_loss_mlp": 0.87353516, "step": 728, "time_per_iteration": 4.226960897445679 }, { "auxiliary_loss_clip": 0.07188703, "auxiliary_loss_mlp": 0.01374397, "balance_loss_clip": 0.06556599, "balance_loss_mlp": 0.01296577, "epoch": 0.04382985119494965, "flos": 22015084734720.0, "grad_norm": 4.952058898660914, "language_loss": 0.79291785, "learning_rate": 3.997994361997338e-06, "loss": 0.87854886, "num_input_tokens_seen": 15647825, "router_z_loss_clip": 6.328125, "router_z_loss_mlp": 0.77832031, "step": 729, "time_per_iteration": 4.119134187698364 }, { "auxiliary_loss_clip": 0.07224233, "auxiliary_loss_mlp": 0.01379868, "balance_loss_clip": 0.06565178, "balance_loss_mlp": 0.01291462, "epoch": 0.043889974447617615, "flos": 24213322673280.0, "grad_norm": 4.7392447872693895, "language_loss": 0.9911207, "learning_rate": 3.997976886700417e-06, "loss": 1.07716179, "num_input_tokens_seen": 15668260, "router_z_loss_clip": 6.58984375, "router_z_loss_mlp": 0.88330078, "step": 730, "time_per_iteration": 4.139965772628784 }, { "auxiliary_loss_clip": 0.07246524, "auxiliary_loss_mlp": 0.01390131, "balance_loss_clip": 0.06566965, "balance_loss_mlp": 0.01297053, "epoch": 0.04395009770028559, "flos": 17280236142720.0, "grad_norm": 7.639114476493563, "language_loss": 0.91468781, "learning_rate": 3.997959335640013e-06, "loss": 1.00105429, "num_input_tokens_seen": 15685630, "router_z_loss_clip": 6.796875, "router_z_loss_mlp": 0.93017578, "step": 731, "time_per_iteration": 4.050705909729004 }, { "auxiliary_loss_clip": 0.07206252, "auxiliary_loss_mlp": 0.01375087, "balance_loss_clip": 0.06564952, "balance_loss_mlp": 0.01288684, "epoch": 0.04401022095295355, "flos": 12314784314880.0, "grad_norm": 6.963146488577029, "language_loss": 0.95658869, "learning_rate": 3.997941708816791e-06, "loss": 1.04240203, "num_input_tokens_seen": 15698645, "router_z_loss_clip": 6.41796875, "router_z_loss_mlp": 0.86425781, "step": 732, "time_per_iteration": 2.620358943939209 }, { "auxiliary_loss_clip": 0.07238307, "auxiliary_loss_mlp": 0.0138676, "balance_loss_clip": 0.06572638, "balance_loss_mlp": 0.01295064, "epoch": 0.044070344205621524, "flos": 20965978488960.0, "grad_norm": 4.9917858687328565, "language_loss": 0.89770919, "learning_rate": 3.997924006231419e-06, "loss": 0.98395979, "num_input_tokens_seen": 15716775, "router_z_loss_clip": 6.65625, "router_z_loss_mlp": 0.91748047, "step": 733, "time_per_iteration": 2.6386139392852783 }, { "auxiliary_loss_clip": 0.07233074, "auxiliary_loss_mlp": 0.01388628, "balance_loss_clip": 0.06569068, "balance_loss_mlp": 0.01295883, "epoch": 0.044130467458289496, "flos": 13850477619840.0, "grad_norm": 7.314170510411423, "language_loss": 0.940557, "learning_rate": 3.9979062278845685e-06, "loss": 1.02677393, "num_input_tokens_seen": 15733320, "router_z_loss_clip": 6.64453125, "router_z_loss_mlp": 0.92675781, "step": 734, "time_per_iteration": 2.606072187423706 }, { "auxiliary_loss_clip": 0.07199788, "auxiliary_loss_mlp": 0.01353689, "balance_loss_clip": 0.06563859, "balance_loss_mlp": 0.01277204, "epoch": 0.04419059071095746, "flos": 28662152952960.0, "grad_norm": 5.916269573580131, "language_loss": 0.81656837, "learning_rate": 3.9978883737769125e-06, "loss": 0.90210313, "num_input_tokens_seen": 15752705, "router_z_loss_clip": 6.35546875, "router_z_loss_mlp": 0.76416016, "step": 735, "time_per_iteration": 2.690378427505493 }, { "auxiliary_loss_clip": 0.07226487, "auxiliary_loss_mlp": 0.01376369, "balance_loss_clip": 0.0655812, "balance_loss_mlp": 0.01285866, "epoch": 0.04425071396362543, "flos": 28190743482240.0, "grad_norm": 5.348034588902011, "language_loss": 0.92441845, "learning_rate": 3.9978704439091305e-06, "loss": 1.01044703, "num_input_tokens_seen": 15772800, "router_z_loss_clip": 6.69140625, "router_z_loss_mlp": 0.90576172, "step": 736, "time_per_iteration": 2.6856133937835693 }, { "auxiliary_loss_clip": 0.07193038, "auxiliary_loss_mlp": 0.01367171, "balance_loss_clip": 0.06560061, "balance_loss_mlp": 0.0128177, "epoch": 0.0443108372162934, "flos": 23665031481600.0, "grad_norm": 2.531934570522977, "language_loss": 0.87643433, "learning_rate": 3.997852438281901e-06, "loss": 0.96203643, "num_input_tokens_seen": 15793665, "router_z_loss_clip": 6.328125, "router_z_loss_mlp": 0.85351562, "step": 737, "time_per_iteration": 2.6815457344055176 }, { "auxiliary_loss_clip": 0.07204877, "auxiliary_loss_mlp": 0.01367991, "balance_loss_clip": 0.06565012, "balance_loss_mlp": 0.01280253, "epoch": 0.04437096046896137, "flos": 33987486067200.0, "grad_norm": 32.23545054560804, "language_loss": 0.88030899, "learning_rate": 3.997834356895906e-06, "loss": 0.96603763, "num_input_tokens_seen": 15813175, "router_z_loss_clip": 6.3984375, "router_z_loss_mlp": 0.87792969, "step": 738, "time_per_iteration": 2.7730093002319336 }, { "auxiliary_loss_clip": 0.06792486, "auxiliary_loss_mlp": 0.01405381, "balance_loss_clip": 0.06411592, "balance_loss_mlp": 0.01365541, "epoch": 0.04443108372162934, "flos": 67416268308480.0, "grad_norm": 0.9499777465520359, "language_loss": 0.59362948, "learning_rate": 3.9978161997518324e-06, "loss": 0.6756081, "num_input_tokens_seen": 15872050, "router_z_loss_clip": 3.8125, "router_z_loss_mlp": 0.39819336, "step": 739, "time_per_iteration": 3.2219595909118652 }, { "auxiliary_loss_clip": 0.07232308, "auxiliary_loss_mlp": 0.01375364, "balance_loss_clip": 0.06571238, "balance_loss_mlp": 0.01278233, "epoch": 0.04449120697429731, "flos": 29760454344960.0, "grad_norm": 6.837574557307469, "language_loss": 0.94743592, "learning_rate": 3.997797966850369e-06, "loss": 1.03351271, "num_input_tokens_seen": 15891085, "router_z_loss_clip": 6.61328125, "router_z_loss_mlp": 0.97167969, "step": 740, "time_per_iteration": 2.7153775691986084 }, { "auxiliary_loss_clip": 0.0722376, "auxiliary_loss_mlp": 0.01375365, "balance_loss_clip": 0.06570661, "balance_loss_mlp": 0.01289296, "epoch": 0.04455133022696528, "flos": 36510958828800.0, "grad_norm": 5.62142850279292, "language_loss": 0.74805313, "learning_rate": 3.997779658192205e-06, "loss": 0.83404434, "num_input_tokens_seen": 15914225, "router_z_loss_clip": 6.54296875, "router_z_loss_mlp": 0.86083984, "step": 741, "time_per_iteration": 2.800685167312622 }, { "auxiliary_loss_clip": 0.07198022, "auxiliary_loss_mlp": 0.01369993, "balance_loss_clip": 0.0657227, "balance_loss_mlp": 0.01287262, "epoch": 0.044611453479633245, "flos": 28811220566400.0, "grad_norm": 10.559592530019541, "language_loss": 0.91324615, "learning_rate": 3.997761273778037e-06, "loss": 0.99892628, "num_input_tokens_seen": 15934540, "router_z_loss_clip": 6.2578125, "router_z_loss_mlp": 0.82714844, "step": 742, "time_per_iteration": 2.708967924118042 }, { "auxiliary_loss_clip": 0.07218881, "auxiliary_loss_mlp": 0.01360302, "balance_loss_clip": 0.06567729, "balance_loss_mlp": 0.01273804, "epoch": 0.04467157673230122, "flos": 20017122053760.0, "grad_norm": 5.3828407474983955, "language_loss": 0.87490702, "learning_rate": 3.997742813608561e-06, "loss": 0.96069884, "num_input_tokens_seen": 15952560, "router_z_loss_clip": 6.51953125, "router_z_loss_mlp": 0.86523438, "step": 743, "time_per_iteration": 2.644928455352783 }, { "auxiliary_loss_clip": 0.07207333, "auxiliary_loss_mlp": 0.01354265, "balance_loss_clip": 0.06561214, "balance_loss_mlp": 0.01266337, "epoch": 0.04473169998496919, "flos": 18010899745920.0, "grad_norm": 3.2765127984730875, "language_loss": 0.83039057, "learning_rate": 3.997724277684479e-06, "loss": 0.91600657, "num_input_tokens_seen": 15970620, "router_z_loss_clip": 6.46484375, "router_z_loss_mlp": 0.87939453, "step": 744, "time_per_iteration": 2.6487436294555664 }, { "auxiliary_loss_clip": 0.07180318, "auxiliary_loss_mlp": 0.01336164, "balance_loss_clip": 0.0654997, "balance_loss_mlp": 0.01261396, "epoch": 0.044791823237637154, "flos": 20638060335360.0, "grad_norm": 7.9601977440769955, "language_loss": 0.88623995, "learning_rate": 3.99770566600649e-06, "loss": 0.97140479, "num_input_tokens_seen": 15987325, "router_z_loss_clip": 6.30078125, "router_z_loss_mlp": 0.74804688, "step": 745, "time_per_iteration": 2.698974370956421 }, { "auxiliary_loss_clip": 0.07221337, "auxiliary_loss_mlp": 0.01346353, "balance_loss_clip": 0.06564759, "balance_loss_mlp": 0.0126076, "epoch": 0.04485194649030513, "flos": 31184284049280.0, "grad_norm": 3.6236360142266206, "language_loss": 0.72012842, "learning_rate": 3.997686978575302e-06, "loss": 0.80580527, "num_input_tokens_seen": 16008310, "router_z_loss_clip": 6.55859375, "router_z_loss_mlp": 0.85595703, "step": 746, "time_per_iteration": 2.7448737621307373 }, { "auxiliary_loss_clip": 0.07177748, "auxiliary_loss_mlp": 0.01347085, "balance_loss_clip": 0.06543265, "balance_loss_mlp": 0.0126464, "epoch": 0.04491206974297309, "flos": 26150922887040.0, "grad_norm": 3.038304497224793, "language_loss": 0.73197919, "learning_rate": 3.997668215391625e-06, "loss": 0.81722754, "num_input_tokens_seen": 16029620, "router_z_loss_clip": 6.34375, "router_z_loss_mlp": 0.82519531, "step": 747, "time_per_iteration": 2.712475299835205 }, { "auxiliary_loss_clip": 0.07189851, "auxiliary_loss_mlp": 0.01351028, "balance_loss_clip": 0.06547353, "balance_loss_mlp": 0.01268011, "epoch": 0.044972192995641064, "flos": 20673922682880.0, "grad_norm": 4.417736950098344, "language_loss": 0.69905424, "learning_rate": 3.997649376456168e-06, "loss": 0.78446305, "num_input_tokens_seen": 16049065, "router_z_loss_clip": 6.41796875, "router_z_loss_mlp": 0.83105469, "step": 748, "time_per_iteration": 2.74652361869812 }, { "auxiliary_loss_clip": 0.0720419, "auxiliary_loss_mlp": 0.01355763, "balance_loss_clip": 0.065401, "balance_loss_mlp": 0.01272317, "epoch": 0.045032316248309036, "flos": 16112306407680.0, "grad_norm": 4.239071520239307, "language_loss": 0.80252117, "learning_rate": 3.997630461769647e-06, "loss": 0.88812077, "num_input_tokens_seen": 16066765, "router_z_loss_clip": 6.64453125, "router_z_loss_mlp": 0.83447266, "step": 749, "time_per_iteration": 2.6767635345458984 }, { "auxiliary_loss_clip": 0.07186007, "auxiliary_loss_mlp": 0.01351822, "balance_loss_clip": 0.06540497, "balance_loss_mlp": 0.01271522, "epoch": 0.045092439500977, "flos": 17864725098240.0, "grad_norm": 4.414863715685797, "language_loss": 0.92637914, "learning_rate": 3.997611471332778e-06, "loss": 1.01175737, "num_input_tokens_seen": 16085980, "router_z_loss_clip": 6.45703125, "router_z_loss_mlp": 0.80273438, "step": 750, "time_per_iteration": 2.6098227500915527 }, { "auxiliary_loss_clip": 0.07190081, "auxiliary_loss_mlp": 0.01359045, "balance_loss_clip": 0.0654382, "balance_loss_mlp": 0.01270925, "epoch": 0.04515256275364497, "flos": 24469809621120.0, "grad_norm": 4.037947923916033, "language_loss": 0.78247011, "learning_rate": 3.9975924051462825e-06, "loss": 0.86796135, "num_input_tokens_seen": 16106260, "router_z_loss_clip": 6.4609375, "router_z_loss_mlp": 0.88134766, "step": 751, "time_per_iteration": 2.669125556945801 }, { "auxiliary_loss_clip": 0.07186107, "auxiliary_loss_mlp": 0.01356682, "balance_loss_clip": 0.06535654, "balance_loss_mlp": 0.01269611, "epoch": 0.04521268600631294, "flos": 20921563025280.0, "grad_norm": 6.591944026535536, "language_loss": 0.72475159, "learning_rate": 3.997573263210883e-06, "loss": 0.81017941, "num_input_tokens_seen": 16123475, "router_z_loss_clip": 6.5078125, "router_z_loss_mlp": 0.87109375, "step": 752, "time_per_iteration": 2.678579807281494 }, { "auxiliary_loss_clip": 0.07190315, "auxiliary_loss_mlp": 0.01373411, "balance_loss_clip": 0.06534429, "balance_loss_mlp": 0.01282716, "epoch": 0.04527280925898091, "flos": 13376552526720.0, "grad_norm": 6.320768719939874, "language_loss": 0.95766145, "learning_rate": 3.997554045527305e-06, "loss": 1.04329872, "num_input_tokens_seen": 16138335, "router_z_loss_clip": 6.56640625, "router_z_loss_mlp": 0.90625, "step": 753, "time_per_iteration": 2.612010955810547 }, { "auxiliary_loss_clip": 0.07205604, "auxiliary_loss_mlp": 0.0136962, "balance_loss_clip": 0.06544213, "balance_loss_mlp": 0.01285077, "epoch": 0.04533293251164888, "flos": 23260650877440.0, "grad_norm": 6.1468659988803, "language_loss": 0.93886566, "learning_rate": 3.997534752096277e-06, "loss": 1.02461791, "num_input_tokens_seen": 16157110, "router_z_loss_clip": 6.609375, "router_z_loss_mlp": 0.84570312, "step": 754, "time_per_iteration": 2.649549961090088 }, { "auxiliary_loss_clip": 0.07144003, "auxiliary_loss_mlp": 0.01380196, "balance_loss_clip": 0.06515512, "balance_loss_mlp": 0.01295605, "epoch": 0.04539305576431685, "flos": 12426899477760.0, "grad_norm": 5.832117296734392, "language_loss": 0.83787107, "learning_rate": 3.997515382918531e-06, "loss": 0.92311311, "num_input_tokens_seen": 16174155, "router_z_loss_clip": 6.28515625, "router_z_loss_mlp": 0.84619141, "step": 755, "time_per_iteration": 2.6113007068634033 }, { "auxiliary_loss_clip": 0.07167999, "auxiliary_loss_mlp": 0.01393177, "balance_loss_clip": 0.06518481, "balance_loss_mlp": 0.01302339, "epoch": 0.04545317901698482, "flos": 16076569841280.0, "grad_norm": 11.907646183288312, "language_loss": 0.82518649, "learning_rate": 3.9974959379948015e-06, "loss": 0.91079825, "num_input_tokens_seen": 16192240, "router_z_loss_clip": 6.49609375, "router_z_loss_mlp": 0.90771484, "step": 756, "time_per_iteration": 2.6199307441711426 }, { "auxiliary_loss_clip": 0.06788747, "auxiliary_loss_mlp": 0.01391475, "balance_loss_clip": 0.06404833, "balance_loss_mlp": 0.01340692, "epoch": 0.045513302269652785, "flos": 66418118144640.0, "grad_norm": 0.7909026524850269, "language_loss": 0.62978053, "learning_rate": 3.997476417325827e-06, "loss": 0.71158272, "num_input_tokens_seen": 16255775, "router_z_loss_clip": 3.84179688, "router_z_loss_mlp": 0.50805664, "step": 757, "time_per_iteration": 3.2828118801116943 }, { "auxiliary_loss_clip": 0.07137576, "auxiliary_loss_mlp": 0.01370657, "balance_loss_clip": 0.06510019, "balance_loss_mlp": 0.01285542, "epoch": 0.04557342552232076, "flos": 21477694573440.0, "grad_norm": 6.533423883189047, "language_loss": 0.87081361, "learning_rate": 3.997456820912346e-06, "loss": 0.9558959, "num_input_tokens_seen": 16277015, "router_z_loss_clip": 6.28125, "router_z_loss_mlp": 0.85107422, "step": 758, "time_per_iteration": 2.6664209365844727 }, { "auxiliary_loss_clip": 0.07130213, "auxiliary_loss_mlp": 0.01365758, "balance_loss_clip": 0.06518968, "balance_loss_mlp": 0.01285125, "epoch": 0.04563354877498873, "flos": 23739481434240.0, "grad_norm": 5.252898037833299, "language_loss": 0.91091406, "learning_rate": 3.997437148755101e-06, "loss": 0.99587369, "num_input_tokens_seen": 16296005, "router_z_loss_clip": 6.11328125, "router_z_loss_mlp": 0.80615234, "step": 759, "time_per_iteration": 2.683450222015381 }, { "auxiliary_loss_clip": 0.07157241, "auxiliary_loss_mlp": 0.01366071, "balance_loss_clip": 0.06521985, "balance_loss_mlp": 0.01279525, "epoch": 0.045693672027656694, "flos": 25742265724800.0, "grad_norm": 4.744342511376619, "language_loss": 0.77381551, "learning_rate": 3.9974174008548405e-06, "loss": 0.85904866, "num_input_tokens_seen": 16315300, "router_z_loss_clip": 6.3515625, "router_z_loss_mlp": 0.86621094, "step": 760, "time_per_iteration": 2.6832544803619385 }, { "auxiliary_loss_clip": 0.07122989, "auxiliary_loss_mlp": 0.01355349, "balance_loss_clip": 0.06509081, "balance_loss_mlp": 0.01285397, "epoch": 0.045753795280324666, "flos": 19725108174720.0, "grad_norm": 3.160760879931681, "language_loss": 0.86197484, "learning_rate": 3.9973975772123105e-06, "loss": 0.94675815, "num_input_tokens_seen": 16333820, "router_z_loss_clip": 6.14453125, "router_z_loss_mlp": 0.70019531, "step": 761, "time_per_iteration": 2.64084792137146 }, { "auxiliary_loss_clip": 0.0712342, "auxiliary_loss_mlp": 0.01340254, "balance_loss_clip": 0.06502891, "balance_loss_mlp": 0.01263245, "epoch": 0.04581391853299264, "flos": 23262076396800.0, "grad_norm": 3.0119598034073456, "language_loss": 0.82669699, "learning_rate": 3.997377677828266e-06, "loss": 0.9113338, "num_input_tokens_seen": 16355290, "router_z_loss_clip": 6.20703125, "router_z_loss_mlp": 0.77050781, "step": 762, "time_per_iteration": 2.7076919078826904 }, { "auxiliary_loss_clip": 0.06790853, "auxiliary_loss_mlp": 0.01345812, "balance_loss_clip": 0.06412761, "balance_loss_mlp": 0.01308548, "epoch": 0.0458740417856606, "flos": 64250711308800.0, "grad_norm": 0.964803012454844, "language_loss": 0.59286845, "learning_rate": 3.9973577027034585e-06, "loss": 0.67423511, "num_input_tokens_seen": 16415995, "router_z_loss_clip": 3.78125, "router_z_loss_mlp": 0.37231445, "step": 763, "time_per_iteration": 3.286306619644165 }, { "auxiliary_loss_clip": 0.07149484, "auxiliary_loss_mlp": 0.01343298, "balance_loss_clip": 0.06513429, "balance_loss_mlp": 0.01264573, "epoch": 0.045934165038328575, "flos": 20775220669440.0, "grad_norm": 4.490345804490523, "language_loss": 0.91284835, "learning_rate": 3.9973376518386475e-06, "loss": 0.99777621, "num_input_tokens_seen": 16433120, "router_z_loss_clip": 6.35546875, "router_z_loss_mlp": 0.78662109, "step": 764, "time_per_iteration": 2.639216423034668 }, { "auxiliary_loss_clip": 0.07157769, "auxiliary_loss_mlp": 0.01345261, "balance_loss_clip": 0.0651679, "balance_loss_mlp": 0.01265343, "epoch": 0.04599428829099654, "flos": 30270661056000.0, "grad_norm": 7.056388212037809, "language_loss": 0.89123255, "learning_rate": 3.997317525234592e-06, "loss": 0.97626281, "num_input_tokens_seen": 16453360, "router_z_loss_clip": 6.4140625, "router_z_loss_mlp": 0.79882812, "step": 765, "time_per_iteration": 2.7340688705444336 }, { "auxiliary_loss_clip": 0.071592, "auxiliary_loss_mlp": 0.01363057, "balance_loss_clip": 0.06512532, "balance_loss_mlp": 0.01276035, "epoch": 0.04605441154366451, "flos": 23045518719360.0, "grad_norm": 6.847606941352117, "language_loss": 0.9273175, "learning_rate": 3.997297322892056e-06, "loss": 1.01253998, "num_input_tokens_seen": 16471160, "router_z_loss_clip": 6.4609375, "router_z_loss_mlp": 0.86914062, "step": 766, "time_per_iteration": 2.635687828063965 }, { "auxiliary_loss_clip": 0.07144564, "auxiliary_loss_mlp": 0.01351812, "balance_loss_clip": 0.06500787, "balance_loss_mlp": 0.01273087, "epoch": 0.046114534796332485, "flos": 22023847486080.0, "grad_norm": 80.10273067853437, "language_loss": 0.87813628, "learning_rate": 3.997277044811806e-06, "loss": 0.96310008, "num_input_tokens_seen": 16488940, "router_z_loss_clip": 6.44140625, "router_z_loss_mlp": 0.78710938, "step": 767, "time_per_iteration": 4.080042362213135 }, { "auxiliary_loss_clip": 0.07147346, "auxiliary_loss_mlp": 0.0134669, "balance_loss_clip": 0.06513391, "balance_loss_mlp": 0.01265199, "epoch": 0.04617465804900045, "flos": 29870221593600.0, "grad_norm": 3.8001211640402075, "language_loss": 0.90334916, "learning_rate": 3.99725669099461e-06, "loss": 0.98828954, "num_input_tokens_seen": 16509505, "router_z_loss_clip": 6.33984375, "router_z_loss_mlp": 0.81494141, "step": 768, "time_per_iteration": 4.221670627593994 }, { "auxiliary_loss_clip": 0.07150711, "auxiliary_loss_mlp": 0.01355498, "balance_loss_clip": 0.06505459, "balance_loss_mlp": 0.01279919, "epoch": 0.04623478130166842, "flos": 25637194304640.0, "grad_norm": 4.107955283630799, "language_loss": 0.77902997, "learning_rate": 3.9972362614412395e-06, "loss": 0.86409199, "num_input_tokens_seen": 16528840, "router_z_loss_clip": 6.44921875, "router_z_loss_mlp": 0.75585938, "step": 769, "time_per_iteration": 2.6986286640167236 }, { "auxiliary_loss_clip": 0.07101085, "auxiliary_loss_mlp": 0.01358146, "balance_loss_clip": 0.06475995, "balance_loss_mlp": 0.01278944, "epoch": 0.04629490455433639, "flos": 20455352507520.0, "grad_norm": 3.6160787712947577, "language_loss": 0.8934325, "learning_rate": 3.997215756152471e-06, "loss": 0.97802478, "num_input_tokens_seen": 16548335, "router_z_loss_clip": 6.25, "router_z_loss_mlp": 0.79150391, "step": 770, "time_per_iteration": 4.186040639877319 }, { "auxiliary_loss_clip": 0.07126901, "auxiliary_loss_mlp": 0.01357528, "balance_loss_clip": 0.06484652, "balance_loss_mlp": 0.01281664, "epoch": 0.04635502780700436, "flos": 23155411749120.0, "grad_norm": 5.5967390603004645, "language_loss": 0.90718663, "learning_rate": 3.99719517512908e-06, "loss": 0.99203092, "num_input_tokens_seen": 16567725, "router_z_loss_clip": 6.41796875, "router_z_loss_mlp": 0.75878906, "step": 771, "time_per_iteration": 4.083797931671143 }, { "auxiliary_loss_clip": 0.07156885, "auxiliary_loss_mlp": 0.01367059, "balance_loss_clip": 0.06493522, "balance_loss_mlp": 0.01281896, "epoch": 0.04641515105967233, "flos": 23298274160640.0, "grad_norm": 3.6467913614343312, "language_loss": 0.8729201, "learning_rate": 3.997174518371848e-06, "loss": 0.95815951, "num_input_tokens_seen": 16588175, "router_z_loss_clip": 6.6328125, "router_z_loss_mlp": 0.8515625, "step": 772, "time_per_iteration": 2.659289836883545 }, { "auxiliary_loss_clip": 0.07083683, "auxiliary_loss_mlp": 0.01365167, "balance_loss_clip": 0.06469382, "balance_loss_mlp": 0.01292783, "epoch": 0.046475274312340296, "flos": 25121579005440.0, "grad_norm": 4.82618433621897, "language_loss": 0.77390647, "learning_rate": 3.997153785881557e-06, "loss": 0.85839498, "num_input_tokens_seen": 16607735, "router_z_loss_clip": 6.15234375, "router_z_loss_mlp": 0.72363281, "step": 773, "time_per_iteration": 2.652568817138672 }, { "auxiliary_loss_clip": 0.07075872, "auxiliary_loss_mlp": 0.01354855, "balance_loss_clip": 0.06468752, "balance_loss_mlp": 0.01278656, "epoch": 0.04653539756500827, "flos": 25271946357120.0, "grad_norm": 20.09294063203196, "language_loss": 0.81614679, "learning_rate": 3.997132977658996e-06, "loss": 0.9004541, "num_input_tokens_seen": 16627225, "router_z_loss_clip": 6.07421875, "router_z_loss_mlp": 0.76171875, "step": 774, "time_per_iteration": 2.6723434925079346 }, { "auxiliary_loss_clip": 0.0710728, "auxiliary_loss_mlp": 0.01366695, "balance_loss_clip": 0.06484248, "balance_loss_mlp": 0.01285823, "epoch": 0.046595520817676234, "flos": 35412238166400.0, "grad_norm": 12.592229136400514, "language_loss": 0.76727688, "learning_rate": 3.997112093704952e-06, "loss": 0.85201669, "num_input_tokens_seen": 16647785, "router_z_loss_clip": 6.234375, "router_z_loss_mlp": 0.80761719, "step": 775, "time_per_iteration": 2.754901885986328 }, { "auxiliary_loss_clip": 0.07098351, "auxiliary_loss_mlp": 0.01354798, "balance_loss_clip": 0.0648039, "balance_loss_mlp": 0.01279362, "epoch": 0.046655644070344206, "flos": 18118151372160.0, "grad_norm": 2.7891876339131465, "language_loss": 0.80077243, "learning_rate": 3.997091134020217e-06, "loss": 0.88530397, "num_input_tokens_seen": 16667555, "router_z_loss_clip": 6.1796875, "router_z_loss_mlp": 0.75390625, "step": 776, "time_per_iteration": 2.6401877403259277 }, { "auxiliary_loss_clip": 0.07084684, "auxiliary_loss_mlp": 0.01349315, "balance_loss_clip": 0.06483892, "balance_loss_mlp": 0.01274833, "epoch": 0.04671576732301218, "flos": 29212959767040.0, "grad_norm": 5.02912234126233, "language_loss": 0.75250071, "learning_rate": 3.997070098605585e-06, "loss": 0.83684075, "num_input_tokens_seen": 16686875, "router_z_loss_clip": 6.01171875, "router_z_loss_mlp": 0.74414062, "step": 777, "time_per_iteration": 2.7004332542419434 }, { "auxiliary_loss_clip": 0.07077673, "auxiliary_loss_mlp": 0.01354004, "balance_loss_clip": 0.06478713, "balance_loss_mlp": 0.01275755, "epoch": 0.04677589057568014, "flos": 30485541651840.0, "grad_norm": 4.609961974119921, "language_loss": 0.79389715, "learning_rate": 3.997048987461856e-06, "loss": 0.87821388, "num_input_tokens_seen": 16706420, "router_z_loss_clip": 5.99609375, "router_z_loss_mlp": 0.78222656, "step": 778, "time_per_iteration": 2.7039754390716553 }, { "auxiliary_loss_clip": 0.07085802, "auxiliary_loss_mlp": 0.01358818, "balance_loss_clip": 0.06476518, "balance_loss_mlp": 0.01285862, "epoch": 0.046836013828348115, "flos": 20563820017920.0, "grad_norm": 4.172397322975656, "language_loss": 0.83077013, "learning_rate": 3.997027800589829e-06, "loss": 0.91521633, "num_input_tokens_seen": 16726390, "router_z_loss_clip": 6.10546875, "router_z_loss_mlp": 0.72900391, "step": 779, "time_per_iteration": 2.6167049407958984 }, { "auxiliary_loss_clip": 0.07055575, "auxiliary_loss_mlp": 0.01349634, "balance_loss_clip": 0.06466093, "balance_loss_mlp": 0.01281876, "epoch": 0.04689613708101608, "flos": 25454444549760.0, "grad_norm": 2.589980381003352, "language_loss": 0.79144293, "learning_rate": 3.997006537990308e-06, "loss": 0.87549508, "num_input_tokens_seen": 16748965, "router_z_loss_clip": 5.89453125, "router_z_loss_mlp": 0.67822266, "step": 780, "time_per_iteration": 2.7178072929382324 }, { "auxiliary_loss_clip": 0.07056205, "auxiliary_loss_mlp": 0.01344452, "balance_loss_clip": 0.06464383, "balance_loss_mlp": 0.0127779, "epoch": 0.04695626033368405, "flos": 23007811582080.0, "grad_norm": 3.71557856895017, "language_loss": 0.78752637, "learning_rate": 3.996985199664099e-06, "loss": 0.87153292, "num_input_tokens_seen": 16768620, "router_z_loss_clip": 5.91796875, "router_z_loss_mlp": 0.66650391, "step": 781, "time_per_iteration": 2.6808021068573 }, { "auxiliary_loss_clip": 0.07103659, "auxiliary_loss_mlp": 0.01356229, "balance_loss_clip": 0.06468488, "balance_loss_mlp": 0.01275405, "epoch": 0.047016383586352024, "flos": 29141193144960.0, "grad_norm": 3.851503437348459, "language_loss": 0.77968454, "learning_rate": 3.99696378561201e-06, "loss": 0.8642835, "num_input_tokens_seen": 16789755, "router_z_loss_clip": 6.34765625, "router_z_loss_mlp": 0.80810547, "step": 782, "time_per_iteration": 2.7261157035827637 }, { "auxiliary_loss_clip": 0.07048626, "auxiliary_loss_mlp": 0.01344173, "balance_loss_clip": 0.06449573, "balance_loss_mlp": 0.01275842, "epoch": 0.04707650683901999, "flos": 14981706466560.0, "grad_norm": 6.408293368029827, "language_loss": 0.8388226, "learning_rate": 3.996942295834855e-06, "loss": 0.92275059, "num_input_tokens_seen": 16807585, "router_z_loss_clip": 5.9921875, "router_z_loss_mlp": 0.68310547, "step": 783, "time_per_iteration": 2.631540536880493 }, { "auxiliary_loss_clip": 0.07026248, "auxiliary_loss_mlp": 0.01349908, "balance_loss_clip": 0.06445228, "balance_loss_mlp": 0.01281052, "epoch": 0.04713663009168796, "flos": 21657257873280.0, "grad_norm": 5.561977666305616, "language_loss": 0.85501373, "learning_rate": 3.996920730333448e-06, "loss": 0.9387753, "num_input_tokens_seen": 16827220, "router_z_loss_clip": 5.8125, "router_z_loss_mlp": 0.68896484, "step": 784, "time_per_iteration": 2.6578471660614014 }, { "auxiliary_loss_clip": 0.07046221, "auxiliary_loss_mlp": 0.01343831, "balance_loss_clip": 0.0644875, "balance_loss_mlp": 0.01274499, "epoch": 0.04719675334435593, "flos": 21331939196160.0, "grad_norm": 3.362107156859812, "language_loss": 0.8358919, "learning_rate": 3.996899089108607e-06, "loss": 0.91979241, "num_input_tokens_seen": 16846230, "router_z_loss_clip": 5.984375, "router_z_loss_mlp": 0.69335938, "step": 785, "time_per_iteration": 2.6519343852996826 }, { "auxiliary_loss_clip": 0.07039312, "auxiliary_loss_mlp": 0.01361438, "balance_loss_clip": 0.06440622, "balance_loss_mlp": 0.01289912, "epoch": 0.0472568765970239, "flos": 17937204480000.0, "grad_norm": 6.780350329178303, "language_loss": 0.93449444, "learning_rate": 3.996877372161152e-06, "loss": 1.01850188, "num_input_tokens_seen": 16865325, "router_z_loss_clip": 5.9921875, "router_z_loss_mlp": 0.71582031, "step": 786, "time_per_iteration": 2.6395480632781982 }, { "auxiliary_loss_clip": 0.07084908, "auxiliary_loss_mlp": 0.01382903, "balance_loss_clip": 0.06449488, "balance_loss_mlp": 0.01299933, "epoch": 0.04731699984969187, "flos": 18083169492480.0, "grad_norm": 5.338376686519002, "language_loss": 0.81279731, "learning_rate": 3.9968555794919065e-06, "loss": 0.89747536, "num_input_tokens_seen": 16882930, "router_z_loss_clip": 6.3515625, "router_z_loss_mlp": 0.82958984, "step": 787, "time_per_iteration": 2.599910020828247 }, { "auxiliary_loss_clip": 0.07039601, "auxiliary_loss_mlp": 0.01338843, "balance_loss_clip": 0.06443816, "balance_loss_mlp": 0.01270226, "epoch": 0.047377123102359836, "flos": 23191735294080.0, "grad_norm": 24.371977018170398, "language_loss": 0.84700537, "learning_rate": 3.996833711101698e-06, "loss": 0.93078983, "num_input_tokens_seen": 16900710, "router_z_loss_clip": 5.953125, "router_z_loss_mlp": 0.68554688, "step": 788, "time_per_iteration": 2.712230920791626 }, { "auxiliary_loss_clip": 0.07033639, "auxiliary_loss_mlp": 0.0132685, "balance_loss_clip": 0.06448196, "balance_loss_mlp": 0.01260903, "epoch": 0.04743724635502781, "flos": 22754469162240.0, "grad_norm": 4.889184363623447, "language_loss": 0.87278831, "learning_rate": 3.996811766991355e-06, "loss": 0.95639324, "num_input_tokens_seen": 16919210, "router_z_loss_clip": 5.86328125, "router_z_loss_mlp": 0.66015625, "step": 789, "time_per_iteration": 2.6509737968444824 }, { "auxiliary_loss_clip": 0.07038186, "auxiliary_loss_mlp": 0.01333227, "balance_loss_clip": 0.06440315, "balance_loss_mlp": 0.01264801, "epoch": 0.04749736960769577, "flos": 17244499576320.0, "grad_norm": 7.319631610361179, "language_loss": 0.85541749, "learning_rate": 3.996789747161709e-06, "loss": 0.93913162, "num_input_tokens_seen": 16937125, "router_z_loss_clip": 5.984375, "router_z_loss_mlp": 0.68457031, "step": 790, "time_per_iteration": 2.6371359825134277 }, { "auxiliary_loss_clip": 0.07037228, "auxiliary_loss_mlp": 0.01325356, "balance_loss_clip": 0.06439266, "balance_loss_mlp": 0.01255738, "epoch": 0.047557492860363745, "flos": 40488798908160.0, "grad_norm": 6.244276173208921, "language_loss": 0.90958333, "learning_rate": 3.996767651613597e-06, "loss": 0.99320918, "num_input_tokens_seen": 16958610, "router_z_loss_clip": 5.98046875, "router_z_loss_mlp": 0.69628906, "step": 791, "time_per_iteration": 2.836726427078247 }, { "auxiliary_loss_clip": 0.07049613, "auxiliary_loss_mlp": 0.01329335, "balance_loss_clip": 0.06454255, "balance_loss_mlp": 0.0126067, "epoch": 0.04761761611303172, "flos": 18704023920000.0, "grad_norm": 4.242503618182929, "language_loss": 0.92719126, "learning_rate": 3.996745480347854e-06, "loss": 1.01098073, "num_input_tokens_seen": 16977300, "router_z_loss_clip": 5.94921875, "router_z_loss_mlp": 0.68652344, "step": 792, "time_per_iteration": 2.6954169273376465 }, { "auxiliary_loss_clip": 0.07033367, "auxiliary_loss_mlp": 0.01329411, "balance_loss_clip": 0.06446603, "balance_loss_mlp": 0.0126313, "epoch": 0.04767773936569968, "flos": 20928103643520.0, "grad_norm": 4.024292708986136, "language_loss": 0.76225007, "learning_rate": 3.996723233365324e-06, "loss": 0.84587789, "num_input_tokens_seen": 16994950, "router_z_loss_clip": 5.87109375, "router_z_loss_mlp": 0.66259766, "step": 793, "time_per_iteration": 2.6279451847076416 }, { "auxiliary_loss_clip": 0.07041553, "auxiliary_loss_mlp": 0.0133827, "balance_loss_clip": 0.06435324, "balance_loss_mlp": 0.01262167, "epoch": 0.047737862618367655, "flos": 23739481434240.0, "grad_norm": 4.500560866813839, "language_loss": 0.90210879, "learning_rate": 3.996700910666847e-06, "loss": 0.98590708, "num_input_tokens_seen": 17014760, "router_z_loss_clip": 6.0546875, "router_z_loss_mlp": 0.76025391, "step": 794, "time_per_iteration": 2.6950149536132812 }, { "auxiliary_loss_clip": 0.07040641, "auxiliary_loss_mlp": 0.01332201, "balance_loss_clip": 0.06442543, "balance_loss_mlp": 0.01259435, "epoch": 0.04779798587103562, "flos": 23702487056640.0, "grad_norm": 8.732471256629616, "language_loss": 0.74911976, "learning_rate": 3.996678512253272e-06, "loss": 0.83284819, "num_input_tokens_seen": 17032715, "router_z_loss_clip": 5.984375, "router_z_loss_mlp": 0.72753906, "step": 795, "time_per_iteration": 2.801392078399658 }, { "auxiliary_loss_clip": 0.07007521, "auxiliary_loss_mlp": 0.01331583, "balance_loss_clip": 0.06427439, "balance_loss_mlp": 0.01262347, "epoch": 0.04785810912370359, "flos": 23190058212480.0, "grad_norm": 4.387116346542859, "language_loss": 0.83166564, "learning_rate": 3.996656038125449e-06, "loss": 0.91505671, "num_input_tokens_seen": 17052215, "router_z_loss_clip": 5.80078125, "router_z_loss_mlp": 0.69287109, "step": 796, "time_per_iteration": 2.6902272701263428 }, { "auxiliary_loss_clip": 0.07028541, "auxiliary_loss_mlp": 0.0133014, "balance_loss_clip": 0.06436078, "balance_loss_mlp": 0.01259091, "epoch": 0.047918232376371564, "flos": 18046426677120.0, "grad_norm": 6.478965133074123, "language_loss": 0.86377782, "learning_rate": 3.996633488284228e-06, "loss": 0.94736469, "num_input_tokens_seen": 17069225, "router_z_loss_clip": 5.91796875, "router_z_loss_mlp": 0.71044922, "step": 797, "time_per_iteration": 2.624303102493286 }, { "auxiliary_loss_clip": 0.06889782, "auxiliary_loss_mlp": 0.0131429, "balance_loss_clip": 0.06516756, "balance_loss_mlp": 0.01261457, "epoch": 0.04797835562903953, "flos": 62461717511040.0, "grad_norm": 1.0624006098911367, "language_loss": 0.64796114, "learning_rate": 3.996610862730465e-06, "loss": 0.73000187, "num_input_tokens_seen": 17126680, "router_z_loss_clip": 3.72460938, "router_z_loss_mlp": 0.52929688, "step": 798, "time_per_iteration": 3.1654694080352783 }, { "auxiliary_loss_clip": 0.07064653, "auxiliary_loss_mlp": 0.01337083, "balance_loss_clip": 0.06452642, "balance_loss_mlp": 0.01261171, "epoch": 0.0480384788817075, "flos": 21513766556160.0, "grad_norm": 9.78418264487265, "language_loss": 0.94207525, "learning_rate": 3.996588161465018e-06, "loss": 1.02609253, "num_input_tokens_seen": 17144835, "router_z_loss_clip": 6.12109375, "router_z_loss_mlp": 0.75878906, "step": 799, "time_per_iteration": 2.6468873023986816 }, { "auxiliary_loss_clip": 0.07008617, "auxiliary_loss_mlp": 0.01353131, "balance_loss_clip": 0.06426462, "balance_loss_mlp": 0.01273976, "epoch": 0.048098602134375466, "flos": 21733301053440.0, "grad_norm": 4.675316756379063, "language_loss": 0.9027831, "learning_rate": 3.996565384488748e-06, "loss": 0.9864006, "num_input_tokens_seen": 17165030, "router_z_loss_clip": 5.81640625, "router_z_loss_mlp": 0.79150391, "step": 800, "time_per_iteration": 2.6547205448150635 }, { "auxiliary_loss_clip": 0.07036844, "auxiliary_loss_mlp": 0.01333347, "balance_loss_clip": 0.06441022, "balance_loss_mlp": 0.01260201, "epoch": 0.04815872538704344, "flos": 22937931676800.0, "grad_norm": 4.117829299144331, "language_loss": 0.87062037, "learning_rate": 3.996542531802518e-06, "loss": 0.95432234, "num_input_tokens_seen": 17184895, "router_z_loss_clip": 5.9609375, "router_z_loss_mlp": 0.73095703, "step": 801, "time_per_iteration": 2.638288736343384 }, { "auxiliary_loss_clip": 0.07033582, "auxiliary_loss_mlp": 0.01347014, "balance_loss_clip": 0.06427091, "balance_loss_mlp": 0.01267334, "epoch": 0.04821884863971141, "flos": 43183952686080.0, "grad_norm": 3.281410780354955, "language_loss": 0.834306, "learning_rate": 3.996519603407196e-06, "loss": 0.91811204, "num_input_tokens_seen": 17208225, "router_z_loss_clip": 6.06640625, "router_z_loss_mlp": 0.79638672, "step": 802, "time_per_iteration": 2.818291664123535 }, { "auxiliary_loss_clip": 0.07007331, "auxiliary_loss_mlp": 0.01323477, "balance_loss_clip": 0.06424223, "balance_loss_mlp": 0.01253239, "epoch": 0.048278971892379376, "flos": 18625171628160.0, "grad_norm": 10.084564925743097, "language_loss": 0.89353657, "learning_rate": 3.996496599303649e-06, "loss": 0.97684467, "num_input_tokens_seen": 17226305, "router_z_loss_clip": 5.8359375, "router_z_loss_mlp": 0.70263672, "step": 803, "time_per_iteration": 2.604703664779663 }, { "auxiliary_loss_clip": 0.07024553, "auxiliary_loss_mlp": 0.01331891, "balance_loss_clip": 0.0643099, "balance_loss_mlp": 0.0125636, "epoch": 0.04833909514504735, "flos": 20236279207680.0, "grad_norm": 3.2598992024925346, "language_loss": 0.8918699, "learning_rate": 3.996473519492753e-06, "loss": 0.97543436, "num_input_tokens_seen": 17244545, "router_z_loss_clip": 5.93359375, "router_z_loss_mlp": 0.75537109, "step": 804, "time_per_iteration": 2.637340784072876 }, { "auxiliary_loss_clip": 0.07020802, "auxiliary_loss_mlp": 0.01326832, "balance_loss_clip": 0.06429113, "balance_loss_mlp": 0.01259741, "epoch": 0.04839921839771532, "flos": 24652182032640.0, "grad_norm": 4.873016006911891, "language_loss": 0.89537036, "learning_rate": 3.99645036397538e-06, "loss": 0.97884673, "num_input_tokens_seen": 17265730, "router_z_loss_clip": 5.9140625, "router_z_loss_mlp": 0.67089844, "step": 805, "time_per_iteration": 2.683056116104126 }, { "auxiliary_loss_clip": 0.07028521, "auxiliary_loss_mlp": 0.01333189, "balance_loss_clip": 0.06433994, "balance_loss_mlp": 0.01265812, "epoch": 0.048459341650383285, "flos": 24834470590080.0, "grad_norm": 2.6705597216420958, "language_loss": 0.70765924, "learning_rate": 3.9964271327524085e-06, "loss": 0.79127634, "num_input_tokens_seen": 17284820, "router_z_loss_clip": 5.94921875, "router_z_loss_mlp": 0.67382812, "step": 806, "time_per_iteration": 2.659463882446289 }, { "auxiliary_loss_clip": 0.07000336, "auxiliary_loss_mlp": 0.01329225, "balance_loss_clip": 0.06435825, "balance_loss_mlp": 0.01265567, "epoch": 0.04851946490305126, "flos": 22169644790400.0, "grad_norm": 6.708583192858006, "language_loss": 0.79307151, "learning_rate": 3.9964038258247214e-06, "loss": 0.87636709, "num_input_tokens_seen": 17305085, "router_z_loss_clip": 5.6484375, "router_z_loss_mlp": 0.63623047, "step": 807, "time_per_iteration": 4.1534950733184814 }, { "auxiliary_loss_clip": 0.07008768, "auxiliary_loss_mlp": 0.01344666, "balance_loss_clip": 0.06435926, "balance_loss_mlp": 0.01274857, "epoch": 0.04857958815571922, "flos": 19798132608000.0, "grad_norm": 4.0829833826436746, "language_loss": 0.89613616, "learning_rate": 3.9963804431932005e-06, "loss": 0.97967046, "num_input_tokens_seen": 17322715, "router_z_loss_clip": 5.73046875, "router_z_loss_mlp": 0.69824219, "step": 808, "time_per_iteration": 4.082685708999634 }, { "auxiliary_loss_clip": 0.07054844, "auxiliary_loss_mlp": 0.01361942, "balance_loss_clip": 0.06441776, "balance_loss_mlp": 0.012887, "epoch": 0.048639711408387194, "flos": 18703981992960.0, "grad_norm": 4.581252969986566, "language_loss": 0.9353385, "learning_rate": 3.996356984858732e-06, "loss": 1.01950634, "num_input_tokens_seen": 17341455, "router_z_loss_clip": 6.125, "router_z_loss_mlp": 0.73193359, "step": 809, "time_per_iteration": 2.7621610164642334 }, { "auxiliary_loss_clip": 0.07027323, "auxiliary_loss_mlp": 0.01360074, "balance_loss_clip": 0.06441981, "balance_loss_mlp": 0.01288214, "epoch": 0.048699834661055166, "flos": 24870458718720.0, "grad_norm": 3.520012226043267, "language_loss": 0.88927138, "learning_rate": 3.996333450822208e-06, "loss": 0.97314537, "num_input_tokens_seen": 17360765, "router_z_loss_clip": 5.85546875, "router_z_loss_mlp": 0.71875, "step": 810, "time_per_iteration": 5.5484254360198975 }, { "auxiliary_loss_clip": 0.07065233, "auxiliary_loss_mlp": 0.01364877, "balance_loss_clip": 0.06445035, "balance_loss_mlp": 0.01296404, "epoch": 0.04875995791372313, "flos": 20710246227840.0, "grad_norm": 5.751273723525114, "language_loss": 0.82904005, "learning_rate": 3.99630984108452e-06, "loss": 0.91334116, "num_input_tokens_seen": 17380625, "router_z_loss_clip": 6.19921875, "router_z_loss_mlp": 0.68505859, "step": 811, "time_per_iteration": 2.661637306213379 }, { "auxiliary_loss_clip": 0.07031714, "auxiliary_loss_mlp": 0.01366358, "balance_loss_clip": 0.06444806, "balance_loss_mlp": 0.0129879, "epoch": 0.048820081166391104, "flos": 18594256671360.0, "grad_norm": 5.183417592242847, "language_loss": 0.77488858, "learning_rate": 3.9962861556465615e-06, "loss": 0.85886931, "num_input_tokens_seen": 17399355, "router_z_loss_clip": 5.875, "router_z_loss_mlp": 0.67529297, "step": 812, "time_per_iteration": 2.6417150497436523 }, { "auxiliary_loss_clip": 0.07012257, "auxiliary_loss_mlp": 0.01366382, "balance_loss_clip": 0.06437302, "balance_loss_mlp": 0.0130096, "epoch": 0.04888020441905907, "flos": 22713324007680.0, "grad_norm": 4.097555931817123, "language_loss": 0.92560816, "learning_rate": 3.996262394509233e-06, "loss": 1.00939453, "num_input_tokens_seen": 17418240, "router_z_loss_clip": 5.75, "router_z_loss_mlp": 0.65380859, "step": 813, "time_per_iteration": 2.6783809661865234 }, { "auxiliary_loss_clip": 0.07032309, "auxiliary_loss_mlp": 0.01381185, "balance_loss_clip": 0.06442818, "balance_loss_mlp": 0.01308802, "epoch": 0.04894032767172704, "flos": 22791044269440.0, "grad_norm": 4.573323240900139, "language_loss": 0.77757925, "learning_rate": 3.9962385576734335e-06, "loss": 0.86171418, "num_input_tokens_seen": 17436250, "router_z_loss_clip": 5.8984375, "router_z_loss_mlp": 0.72412109, "step": 814, "time_per_iteration": 2.657712697982788 }, { "auxiliary_loss_clip": 0.0704044, "auxiliary_loss_mlp": 0.01373499, "balance_loss_clip": 0.06446699, "balance_loss_mlp": 0.01301115, "epoch": 0.04900045092439501, "flos": 25522521592320.0, "grad_norm": 3.947685554432281, "language_loss": 0.86391771, "learning_rate": 3.9962146451400675e-06, "loss": 0.94805712, "num_input_tokens_seen": 17455750, "router_z_loss_clip": 5.9375, "router_z_loss_mlp": 0.72363281, "step": 815, "time_per_iteration": 2.662299394607544 }, { "auxiliary_loss_clip": 0.07066838, "auxiliary_loss_mlp": 0.01387383, "balance_loss_clip": 0.06450786, "balance_loss_mlp": 0.01308848, "epoch": 0.04906057417706298, "flos": 25965280166400.0, "grad_norm": 26.550005706417043, "language_loss": 0.94863713, "learning_rate": 3.996190656910043e-06, "loss": 1.03317928, "num_input_tokens_seen": 17474995, "router_z_loss_clip": 6.16015625, "router_z_loss_mlp": 0.78564453, "step": 816, "time_per_iteration": 2.6723761558532715 }, { "auxiliary_loss_clip": 0.07064529, "auxiliary_loss_mlp": 0.01380942, "balance_loss_clip": 0.0643966, "balance_loss_mlp": 0.01307652, "epoch": 0.04912069742973095, "flos": 18630580216320.0, "grad_norm": 6.578911584583829, "language_loss": 0.82915914, "learning_rate": 3.996166592984268e-06, "loss": 0.91361392, "num_input_tokens_seen": 17493395, "router_z_loss_clip": 6.24609375, "router_z_loss_mlp": 0.73339844, "step": 817, "time_per_iteration": 2.6711559295654297 }, { "auxiliary_loss_clip": 0.07014075, "auxiliary_loss_mlp": 0.01368431, "balance_loss_clip": 0.06429762, "balance_loss_mlp": 0.01299623, "epoch": 0.049180820682398915, "flos": 23707182885120.0, "grad_norm": 4.473006713316069, "language_loss": 0.87685072, "learning_rate": 3.996142453363656e-06, "loss": 0.96067572, "num_input_tokens_seen": 17514565, "router_z_loss_clip": 5.84765625, "router_z_loss_mlp": 0.68847656, "step": 818, "time_per_iteration": 2.6638214588165283 }, { "auxiliary_loss_clip": 0.07062039, "auxiliary_loss_mlp": 0.01390419, "balance_loss_clip": 0.06435136, "balance_loss_mlp": 0.013136, "epoch": 0.04924094393506689, "flos": 22427179914240.0, "grad_norm": 13.438373563161528, "language_loss": 0.80597007, "learning_rate": 3.996118238049124e-06, "loss": 0.8904947, "num_input_tokens_seen": 17534590, "router_z_loss_clip": 6.265625, "router_z_loss_mlp": 0.76806641, "step": 819, "time_per_iteration": 2.667180061340332 }, { "auxiliary_loss_clip": 0.07022672, "auxiliary_loss_mlp": 0.01379041, "balance_loss_clip": 0.06421575, "balance_loss_mlp": 0.01308374, "epoch": 0.04930106718773486, "flos": 15743033464320.0, "grad_norm": 4.289022918442267, "language_loss": 0.87369835, "learning_rate": 3.996093947041586e-06, "loss": 0.95771545, "num_input_tokens_seen": 17551900, "router_z_loss_clip": 6.015625, "router_z_loss_mlp": 0.70605469, "step": 820, "time_per_iteration": 2.6114985942840576 }, { "auxiliary_loss_clip": 0.07032701, "auxiliary_loss_mlp": 0.01367637, "balance_loss_clip": 0.06427422, "balance_loss_mlp": 0.01298257, "epoch": 0.049361190440402825, "flos": 26257922951040.0, "grad_norm": 11.063684561828058, "language_loss": 0.93132615, "learning_rate": 3.996069580341966e-06, "loss": 1.01532948, "num_input_tokens_seen": 17571485, "router_z_loss_clip": 6.05078125, "router_z_loss_mlp": 0.69433594, "step": 821, "time_per_iteration": 2.7055137157440186 }, { "auxiliary_loss_clip": 0.07039528, "auxiliary_loss_mlp": 0.01375461, "balance_loss_clip": 0.06423414, "balance_loss_mlp": 0.01303793, "epoch": 0.0494213136930708, "flos": 21258872835840.0, "grad_norm": 3.614680014650336, "language_loss": 0.91875458, "learning_rate": 3.996045137951188e-06, "loss": 1.00290442, "num_input_tokens_seen": 17591410, "router_z_loss_clip": 6.1640625, "router_z_loss_mlp": 0.71679688, "step": 822, "time_per_iteration": 2.661337375640869 }, { "auxiliary_loss_clip": 0.07020516, "auxiliary_loss_mlp": 0.01373245, "balance_loss_clip": 0.06425461, "balance_loss_mlp": 0.01300671, "epoch": 0.04948143694573876, "flos": 27973095701760.0, "grad_norm": 4.07493240689221, "language_loss": 0.69784033, "learning_rate": 3.996020619870178e-06, "loss": 0.78177792, "num_input_tokens_seen": 17612010, "router_z_loss_clip": 5.953125, "router_z_loss_mlp": 0.72509766, "step": 823, "time_per_iteration": 2.7176904678344727 }, { "auxiliary_loss_clip": 0.06863685, "auxiliary_loss_mlp": 0.01395488, "balance_loss_clip": 0.06498053, "balance_loss_mlp": 0.01366401, "epoch": 0.049541560198406734, "flos": 66197466345600.0, "grad_norm": 1.3367627053827353, "language_loss": 0.62853259, "learning_rate": 3.995996026099866e-06, "loss": 0.7111243, "num_input_tokens_seen": 17673430, "router_z_loss_clip": 3.65625, "router_z_loss_mlp": 0.2902832, "step": 824, "time_per_iteration": 3.319753646850586 }, { "auxiliary_loss_clip": 0.07036494, "auxiliary_loss_mlp": 0.01404709, "balance_loss_clip": 0.06422058, "balance_loss_mlp": 0.01329035, "epoch": 0.049601683451074706, "flos": 22899218290560.0, "grad_norm": 47.468334378200915, "language_loss": 0.93524337, "learning_rate": 3.995971356641185e-06, "loss": 1.01965547, "num_input_tokens_seen": 17689545, "router_z_loss_clip": 6.140625, "router_z_loss_mlp": 0.75683594, "step": 825, "time_per_iteration": 2.6812820434570312 }, { "auxiliary_loss_clip": 0.07016331, "auxiliary_loss_mlp": 0.01394763, "balance_loss_clip": 0.06427336, "balance_loss_mlp": 0.01325813, "epoch": 0.04966180670374267, "flos": 21439987436160.0, "grad_norm": 13.386590697135363, "language_loss": 0.71324062, "learning_rate": 3.9959466114950695e-06, "loss": 0.7973516, "num_input_tokens_seen": 17705965, "router_z_loss_clip": 5.88671875, "router_z_loss_mlp": 0.68994141, "step": 826, "time_per_iteration": 2.653604030609131 }, { "auxiliary_loss_clip": 0.07023457, "auxiliary_loss_mlp": 0.01393966, "balance_loss_clip": 0.06417996, "balance_loss_mlp": 0.01321868, "epoch": 0.04972192995641064, "flos": 23113218418560.0, "grad_norm": 7.687618828654154, "language_loss": 0.81418258, "learning_rate": 3.995921790662459e-06, "loss": 0.8983568, "num_input_tokens_seen": 17724580, "router_z_loss_clip": 6.0625, "router_z_loss_mlp": 0.72070312, "step": 827, "time_per_iteration": 2.6784040927886963 }, { "auxiliary_loss_clip": 0.07030068, "auxiliary_loss_mlp": 0.0137482, "balance_loss_clip": 0.06425111, "balance_loss_mlp": 0.01307109, "epoch": 0.04978205320907861, "flos": 40415648693760.0, "grad_norm": 9.76330658073976, "language_loss": 0.81490493, "learning_rate": 3.995896894144294e-06, "loss": 0.8989538, "num_input_tokens_seen": 17747755, "router_z_loss_clip": 6.05078125, "router_z_loss_mlp": 0.67724609, "step": 828, "time_per_iteration": 2.832648515701294 }, { "auxiliary_loss_clip": 0.06985921, "auxiliary_loss_mlp": 0.01362505, "balance_loss_clip": 0.06412103, "balance_loss_mlp": 0.01298752, "epoch": 0.04984217646174658, "flos": 25235580885120.0, "grad_norm": 17.40533750864247, "language_loss": 0.87119758, "learning_rate": 3.995871921941519e-06, "loss": 0.95468187, "num_input_tokens_seen": 17768550, "router_z_loss_clip": 5.7421875, "router_z_loss_mlp": 0.63720703, "step": 829, "time_per_iteration": 2.7499611377716064 }, { "auxiliary_loss_clip": 0.07017666, "auxiliary_loss_mlp": 0.01366085, "balance_loss_clip": 0.0642464, "balance_loss_mlp": 0.01293177, "epoch": 0.04990229971441455, "flos": 15964873948800.0, "grad_norm": 8.368954093824756, "language_loss": 0.78215969, "learning_rate": 3.99584687405508e-06, "loss": 0.8659972, "num_input_tokens_seen": 17786080, "router_z_loss_clip": 5.9375, "router_z_loss_mlp": 0.72851562, "step": 830, "time_per_iteration": 2.6304290294647217 }, { "auxiliary_loss_clip": 0.0700103, "auxiliary_loss_mlp": 0.01346309, "balance_loss_clip": 0.06415769, "balance_loss_mlp": 0.01278932, "epoch": 0.04996242296708252, "flos": 18410919937920.0, "grad_norm": 4.850209264279701, "language_loss": 0.81347758, "learning_rate": 3.995821750485929e-06, "loss": 0.89695096, "num_input_tokens_seen": 17803635, "router_z_loss_clip": 5.85546875, "router_z_loss_mlp": 0.67382812, "step": 831, "time_per_iteration": 2.623443126678467 }, { "auxiliary_loss_clip": 0.07021336, "auxiliary_loss_mlp": 0.01343945, "balance_loss_clip": 0.06416108, "balance_loss_mlp": 0.0127428, "epoch": 0.05002254621975049, "flos": 17863802703360.0, "grad_norm": 4.426655401148886, "language_loss": 0.95896137, "learning_rate": 3.995796551235016e-06, "loss": 1.04261422, "num_input_tokens_seen": 17822190, "router_z_loss_clip": 6.0546875, "router_z_loss_mlp": 0.69677734, "step": 832, "time_per_iteration": 2.6315298080444336 }, { "auxiliary_loss_clip": 0.06987635, "auxiliary_loss_mlp": 0.01348562, "balance_loss_clip": 0.06417282, "balance_loss_mlp": 0.01281423, "epoch": 0.050082669472418455, "flos": 45670682632320.0, "grad_norm": 8.44631253887852, "language_loss": 0.86097991, "learning_rate": 3.9957712763032974e-06, "loss": 0.9443419, "num_input_tokens_seen": 17846915, "router_z_loss_clip": 5.69921875, "router_z_loss_mlp": 0.67089844, "step": 833, "time_per_iteration": 2.8509373664855957 }, { "auxiliary_loss_clip": 0.06984498, "auxiliary_loss_mlp": 0.01337995, "balance_loss_clip": 0.06417027, "balance_loss_mlp": 0.01264753, "epoch": 0.05014279272508643, "flos": 37971237859200.0, "grad_norm": 15.995709608405926, "language_loss": 0.84567291, "learning_rate": 3.995745925691733e-06, "loss": 0.92889786, "num_input_tokens_seen": 17867270, "router_z_loss_clip": 5.6796875, "router_z_loss_mlp": 0.73242188, "step": 834, "time_per_iteration": 2.8414053916931152 }, { "auxiliary_loss_clip": 0.07003896, "auxiliary_loss_mlp": 0.01341923, "balance_loss_clip": 0.0641816, "balance_loss_mlp": 0.01273735, "epoch": 0.0502029159777544, "flos": 21002511669120.0, "grad_norm": 5.236493544131414, "language_loss": 0.95671034, "learning_rate": 3.995720499401282e-06, "loss": 1.04016852, "num_input_tokens_seen": 17884880, "router_z_loss_clip": 5.859375, "router_z_loss_mlp": 0.68164062, "step": 835, "time_per_iteration": 2.6943187713623047 }, { "auxiliary_loss_clip": 0.07003018, "auxiliary_loss_mlp": 0.01353196, "balance_loss_clip": 0.06416585, "balance_loss_mlp": 0.01284389, "epoch": 0.050263039230422364, "flos": 15893526597120.0, "grad_norm": 3.941260029917548, "language_loss": 0.81105316, "learning_rate": 3.995694997432911e-06, "loss": 0.89461535, "num_input_tokens_seen": 17903695, "router_z_loss_clip": 5.86328125, "router_z_loss_mlp": 0.68847656, "step": 836, "time_per_iteration": 2.6644322872161865 }, { "auxiliary_loss_clip": 0.06968692, "auxiliary_loss_mlp": 0.01354219, "balance_loss_clip": 0.06418016, "balance_loss_mlp": 0.01290323, "epoch": 0.050323162483090336, "flos": 23739565288320.0, "grad_norm": 8.116693154436529, "language_loss": 0.87882197, "learning_rate": 3.9956694197875855e-06, "loss": 0.96205103, "num_input_tokens_seen": 17920745, "router_z_loss_clip": 5.50390625, "router_z_loss_mlp": 0.63867188, "step": 837, "time_per_iteration": 2.6496055126190186 }, { "auxiliary_loss_clip": 0.06998191, "auxiliary_loss_mlp": 0.01374456, "balance_loss_clip": 0.06420138, "balance_loss_mlp": 0.01308081, "epoch": 0.0503832857357583, "flos": 20272393117440.0, "grad_norm": 5.663033306678846, "language_loss": 0.76431561, "learning_rate": 3.995643766466275e-06, "loss": 0.84804213, "num_input_tokens_seen": 17938220, "router_z_loss_clip": 5.78125, "router_z_loss_mlp": 0.6640625, "step": 838, "time_per_iteration": 2.6501388549804688 }, { "auxiliary_loss_clip": 0.06991281, "auxiliary_loss_mlp": 0.0137587, "balance_loss_clip": 0.06411404, "balance_loss_mlp": 0.01310591, "epoch": 0.05044340898842627, "flos": 17790736343040.0, "grad_norm": 3.824642711381824, "language_loss": 0.86403626, "learning_rate": 3.995618037469953e-06, "loss": 0.94770777, "num_input_tokens_seen": 17957325, "router_z_loss_clip": 5.80078125, "router_z_loss_mlp": 0.65234375, "step": 839, "time_per_iteration": 2.6324779987335205 }, { "auxiliary_loss_clip": 0.06978457, "auxiliary_loss_mlp": 0.01427958, "balance_loss_clip": 0.06420659, "balance_loss_mlp": 0.01360772, "epoch": 0.050503532241094246, "flos": 22973207045760.0, "grad_norm": 5.725203886157121, "language_loss": 0.88138366, "learning_rate": 3.995592232799595e-06, "loss": 0.96544778, "num_input_tokens_seen": 17975875, "router_z_loss_clip": 5.5859375, "router_z_loss_mlp": 0.671875, "step": 840, "time_per_iteration": 2.6764144897460938 }, { "auxiliary_loss_clip": 0.06991018, "auxiliary_loss_mlp": 0.01424309, "balance_loss_clip": 0.06432299, "balance_loss_mlp": 0.01356789, "epoch": 0.05056365549376221, "flos": 22782449226240.0, "grad_norm": 3.953284345081573, "language_loss": 0.97260153, "learning_rate": 3.99556635245618e-06, "loss": 1.05675483, "num_input_tokens_seen": 17994340, "router_z_loss_clip": 5.59765625, "router_z_loss_mlp": 0.67480469, "step": 841, "time_per_iteration": 2.751573085784912 }, { "auxiliary_loss_clip": 0.06994696, "auxiliary_loss_mlp": 0.01462031, "balance_loss_clip": 0.06428304, "balance_loss_mlp": 0.01396465, "epoch": 0.05062377874643018, "flos": 30924401011200.0, "grad_norm": 7.529707979136291, "language_loss": 0.80926263, "learning_rate": 3.995540396440688e-06, "loss": 0.89382982, "num_input_tokens_seen": 18015260, "router_z_loss_clip": 5.66015625, "router_z_loss_mlp": 0.65576172, "step": 842, "time_per_iteration": 2.729416847229004 }, { "auxiliary_loss_clip": 0.07012583, "auxiliary_loss_mlp": 0.01494147, "balance_loss_clip": 0.06425353, "balance_loss_mlp": 0.01416757, "epoch": 0.05068390199909815, "flos": 19653425406720.0, "grad_norm": 10.8788339409038, "language_loss": 0.81598854, "learning_rate": 3.995514364754105e-06, "loss": 0.90105587, "num_input_tokens_seen": 18033960, "router_z_loss_clip": 5.87109375, "router_z_loss_mlp": 0.7734375, "step": 843, "time_per_iteration": 2.6502270698547363 }, { "auxiliary_loss_clip": 0.07013413, "auxiliary_loss_mlp": 0.01490531, "balance_loss_clip": 0.06428012, "balance_loss_mlp": 0.01417813, "epoch": 0.05074402525176612, "flos": 37971279786240.0, "grad_norm": 7.20522895037358, "language_loss": 0.85907066, "learning_rate": 3.995488257397417e-06, "loss": 0.94411004, "num_input_tokens_seen": 18056700, "router_z_loss_clip": 5.8515625, "router_z_loss_mlp": 0.7265625, "step": 844, "time_per_iteration": 2.7819106578826904 }, { "auxiliary_loss_clip": 0.07012028, "auxiliary_loss_mlp": 0.01493142, "balance_loss_clip": 0.06434745, "balance_loss_mlp": 0.01424048, "epoch": 0.05080414850443409, "flos": 22061177280000.0, "grad_norm": 4.219025709319164, "language_loss": 0.78979045, "learning_rate": 3.995462074371614e-06, "loss": 0.87484217, "num_input_tokens_seen": 18075815, "router_z_loss_clip": 5.76953125, "router_z_loss_mlp": 0.69042969, "step": 845, "time_per_iteration": 2.616264820098877 }, { "auxiliary_loss_clip": 0.07007407, "auxiliary_loss_mlp": 0.01514586, "balance_loss_clip": 0.06434496, "balance_loss_mlp": 0.01441249, "epoch": 0.05086427175710206, "flos": 20231289889920.0, "grad_norm": 6.347719323565222, "language_loss": 0.91061014, "learning_rate": 3.99543581567769e-06, "loss": 0.99583006, "num_input_tokens_seen": 18095095, "router_z_loss_clip": 5.7265625, "router_z_loss_mlp": 0.73339844, "step": 846, "time_per_iteration": 4.05492639541626 }, { "auxiliary_loss_clip": 0.0698079, "auxiliary_loss_mlp": 0.01487232, "balance_loss_clip": 0.06424157, "balance_loss_mlp": 0.01421381, "epoch": 0.05092439500977003, "flos": 15164707783680.0, "grad_norm": 5.117325320829915, "language_loss": 0.90171295, "learning_rate": 3.9954094813166394e-06, "loss": 0.98639321, "num_input_tokens_seen": 18112675, "router_z_loss_clip": 5.5703125, "router_z_loss_mlp": 0.65771484, "step": 847, "time_per_iteration": 2.60140323638916 }, { "auxiliary_loss_clip": 0.06983387, "auxiliary_loss_mlp": 0.01510479, "balance_loss_clip": 0.06426337, "balance_loss_mlp": 0.01443102, "epoch": 0.050984518262437994, "flos": 22061806185600.0, "grad_norm": 5.982320865342955, "language_loss": 0.86449158, "learning_rate": 3.995383071289462e-06, "loss": 0.94943023, "num_input_tokens_seen": 18130745, "router_z_loss_clip": 5.5703125, "router_z_loss_mlp": 0.67382812, "step": 848, "time_per_iteration": 4.132046937942505 }, { "auxiliary_loss_clip": 0.06987946, "auxiliary_loss_mlp": 0.01512694, "balance_loss_clip": 0.06426945, "balance_loss_mlp": 0.0144503, "epoch": 0.05104464151510597, "flos": 30232911991680.0, "grad_norm": 11.448832422879937, "language_loss": 0.90354812, "learning_rate": 3.995356585597158e-06, "loss": 0.98855454, "num_input_tokens_seen": 18152410, "router_z_loss_clip": 5.6171875, "router_z_loss_mlp": 0.67724609, "step": 849, "time_per_iteration": 4.177002668380737 }, { "auxiliary_loss_clip": 0.06993163, "auxiliary_loss_mlp": 0.01475608, "balance_loss_clip": 0.06442861, "balance_loss_mlp": 0.01412666, "epoch": 0.05110476476777394, "flos": 18338817899520.0, "grad_norm": 3.703542136495143, "language_loss": 0.86514401, "learning_rate": 3.995330024240732e-06, "loss": 0.94983172, "num_input_tokens_seen": 18170870, "router_z_loss_clip": 5.5, "router_z_loss_mlp": 0.62890625, "step": 850, "time_per_iteration": 4.039630889892578 }, { "auxiliary_loss_clip": 0.06996657, "auxiliary_loss_mlp": 0.01500443, "balance_loss_clip": 0.06435632, "balance_loss_mlp": 0.01434163, "epoch": 0.051164888020441904, "flos": 38007938747520.0, "grad_norm": 5.222154969750691, "language_loss": 0.69063056, "learning_rate": 3.995303387221192e-06, "loss": 0.77560157, "num_input_tokens_seen": 18191555, "router_z_loss_clip": 5.609375, "router_z_loss_mlp": 0.66308594, "step": 851, "time_per_iteration": 2.7826218605041504 }, { "auxiliary_loss_clip": 0.07017241, "auxiliary_loss_mlp": 0.0150903, "balance_loss_clip": 0.06457026, "balance_loss_mlp": 0.01439125, "epoch": 0.051225011273109876, "flos": 23045183303040.0, "grad_norm": 6.38460410115259, "language_loss": 0.8625297, "learning_rate": 3.995276674539547e-06, "loss": 0.94779229, "num_input_tokens_seen": 18208620, "router_z_loss_clip": 5.60546875, "router_z_loss_mlp": 0.69921875, "step": 852, "time_per_iteration": 2.615828275680542 }, { "auxiliary_loss_clip": 0.07007915, "auxiliary_loss_mlp": 0.01526208, "balance_loss_clip": 0.06439061, "balance_loss_mlp": 0.01455922, "epoch": 0.05128513452577785, "flos": 18265709612160.0, "grad_norm": 6.347834722679273, "language_loss": 0.82639277, "learning_rate": 3.995249886196811e-06, "loss": 0.91173398, "num_input_tokens_seen": 18226370, "router_z_loss_clip": 5.69140625, "router_z_loss_mlp": 0.703125, "step": 853, "time_per_iteration": 2.6010348796844482 }, { "auxiliary_loss_clip": 0.07010502, "auxiliary_loss_mlp": 0.01526139, "balance_loss_clip": 0.06446202, "balance_loss_mlp": 0.01461623, "epoch": 0.05134525777844581, "flos": 27206360115840.0, "grad_norm": 4.5433524822042575, "language_loss": 0.79082942, "learning_rate": 3.995223022193999e-06, "loss": 0.87619579, "num_input_tokens_seen": 18247075, "router_z_loss_clip": 5.640625, "router_z_loss_mlp": 0.64550781, "step": 854, "time_per_iteration": 2.808309316635132 }, { "auxiliary_loss_clip": 0.07011996, "auxiliary_loss_mlp": 0.01503364, "balance_loss_clip": 0.06436715, "balance_loss_mlp": 0.01434413, "epoch": 0.051405381031113785, "flos": 28369132824960.0, "grad_norm": 3.8384168340964027, "language_loss": 0.84313297, "learning_rate": 3.99519608253213e-06, "loss": 0.92828655, "num_input_tokens_seen": 18265680, "router_z_loss_clip": 5.75390625, "router_z_loss_mlp": 0.68847656, "step": 855, "time_per_iteration": 2.6903862953186035 }, { "auxiliary_loss_clip": 0.06969179, "auxiliary_loss_mlp": 0.01419588, "balance_loss_clip": 0.06600188, "balance_loss_mlp": 0.01383611, "epoch": 0.05146550428378175, "flos": 65638049760000.0, "grad_norm": 0.9733071686338123, "language_loss": 0.65598023, "learning_rate": 3.995169067212227e-06, "loss": 0.73986799, "num_input_tokens_seen": 18327015, "router_z_loss_clip": 3.6875, "router_z_loss_mlp": 0.36035156, "step": 856, "time_per_iteration": 3.253328561782837 }, { "auxiliary_loss_clip": 0.06992525, "auxiliary_loss_mlp": 0.0149847, "balance_loss_clip": 0.06437615, "balance_loss_mlp": 0.01436434, "epoch": 0.05152562753644972, "flos": 22061470769280.0, "grad_norm": 15.843040588994413, "language_loss": 0.79320967, "learning_rate": 3.9951419762353116e-06, "loss": 0.87811959, "num_input_tokens_seen": 18345235, "router_z_loss_clip": 5.55078125, "router_z_loss_mlp": 0.61962891, "step": 857, "time_per_iteration": 2.6454732418060303 }, { "auxiliary_loss_clip": 0.07000342, "auxiliary_loss_mlp": 0.01523282, "balance_loss_clip": 0.06430548, "balance_loss_mlp": 0.01449992, "epoch": 0.051585750789117694, "flos": 18514523911680.0, "grad_norm": 7.53808853409064, "language_loss": 0.90981781, "learning_rate": 3.995114809602412e-06, "loss": 0.99505407, "num_input_tokens_seen": 18362350, "router_z_loss_clip": 5.69921875, "router_z_loss_mlp": 0.73242188, "step": 858, "time_per_iteration": 2.6115405559539795 }, { "auxiliary_loss_clip": 0.06994863, "auxiliary_loss_mlp": 0.01494326, "balance_loss_clip": 0.06425697, "balance_loss_mlp": 0.01422514, "epoch": 0.05164587404178566, "flos": 23736630395520.0, "grad_norm": 4.894550482476157, "language_loss": 0.79437906, "learning_rate": 3.9950875673145605e-06, "loss": 0.87927091, "num_input_tokens_seen": 18383390, "router_z_loss_clip": 5.6875, "router_z_loss_mlp": 0.71875, "step": 859, "time_per_iteration": 2.669271945953369 }, { "auxiliary_loss_clip": 0.06991632, "auxiliary_loss_mlp": 0.01496852, "balance_loss_clip": 0.06414089, "balance_loss_mlp": 0.01427472, "epoch": 0.05170599729445363, "flos": 16258397201280.0, "grad_norm": 4.54717783743079, "language_loss": 0.94322526, "learning_rate": 3.995060249372788e-06, "loss": 1.02811003, "num_input_tokens_seen": 18399220, "router_z_loss_clip": 5.77734375, "router_z_loss_mlp": 0.69335938, "step": 860, "time_per_iteration": 2.6032159328460693 }, { "auxiliary_loss_clip": 0.06988232, "auxiliary_loss_mlp": 0.01475183, "balance_loss_clip": 0.06436417, "balance_loss_mlp": 0.01411239, "epoch": 0.0517661205471216, "flos": 23992404583680.0, "grad_norm": 4.735894351497266, "language_loss": 0.84666151, "learning_rate": 3.99503285577813e-06, "loss": 0.93129563, "num_input_tokens_seen": 18419005, "router_z_loss_clip": 5.51953125, "router_z_loss_mlp": 0.63916016, "step": 861, "time_per_iteration": 2.6580357551574707 }, { "auxiliary_loss_clip": 0.06991492, "auxiliary_loss_mlp": 0.0147727, "balance_loss_clip": 0.06423034, "balance_loss_mlp": 0.01412611, "epoch": 0.05182624379978957, "flos": 29285313367680.0, "grad_norm": 4.486635911167205, "language_loss": 0.81280261, "learning_rate": 3.995005386531627e-06, "loss": 0.89749026, "num_input_tokens_seen": 18440550, "router_z_loss_clip": 5.6875, "router_z_loss_mlp": 0.64697266, "step": 862, "time_per_iteration": 2.698885202407837 }, { "auxiliary_loss_clip": 0.06969079, "auxiliary_loss_mlp": 0.01432295, "balance_loss_clip": 0.06414389, "balance_loss_mlp": 0.01372929, "epoch": 0.05188636705245754, "flos": 24177753815040.0, "grad_norm": 5.526348677914022, "language_loss": 0.91253364, "learning_rate": 3.9949778416343195e-06, "loss": 0.99654734, "num_input_tokens_seen": 18461950, "router_z_loss_clip": 5.54296875, "router_z_loss_mlp": 0.59423828, "step": 863, "time_per_iteration": 2.774623155593872 }, { "auxiliary_loss_clip": 0.06971329, "auxiliary_loss_mlp": 0.01426915, "balance_loss_clip": 0.06417412, "balance_loss_mlp": 0.01363496, "epoch": 0.051946490305125506, "flos": 26767961953920.0, "grad_norm": 8.492019408398518, "language_loss": 0.78659296, "learning_rate": 3.9949502210872525e-06, "loss": 0.87057543, "num_input_tokens_seen": 18480555, "router_z_loss_clip": 5.5390625, "router_z_loss_mlp": 0.63427734, "step": 864, "time_per_iteration": 2.679213047027588 }, { "auxiliary_loss_clip": 0.06985264, "auxiliary_loss_mlp": 0.01408613, "balance_loss_clip": 0.06413454, "balance_loss_mlp": 0.01341522, "epoch": 0.05200661355779348, "flos": 21508190259840.0, "grad_norm": 30.101793063022775, "language_loss": 0.82343817, "learning_rate": 3.994922524891474e-06, "loss": 0.907377, "num_input_tokens_seen": 18499645, "router_z_loss_clip": 5.71875, "router_z_loss_mlp": 0.67089844, "step": 865, "time_per_iteration": 2.7101173400878906 }, { "auxiliary_loss_clip": 0.06970219, "auxiliary_loss_mlp": 0.01406212, "balance_loss_clip": 0.06409559, "balance_loss_mlp": 0.01341457, "epoch": 0.05206673681046144, "flos": 18120457359360.0, "grad_norm": 10.144790037153854, "language_loss": 0.88737285, "learning_rate": 3.994894753048032e-06, "loss": 0.97113717, "num_input_tokens_seen": 18516810, "router_z_loss_clip": 5.609375, "router_z_loss_mlp": 0.64746094, "step": 866, "time_per_iteration": 2.5981605052948 }, { "auxiliary_loss_clip": 0.06949246, "auxiliary_loss_mlp": 0.01394564, "balance_loss_clip": 0.06405221, "balance_loss_mlp": 0.01333505, "epoch": 0.052126860063129415, "flos": 17528966588160.0, "grad_norm": 8.7132071335502, "language_loss": 0.90340543, "learning_rate": 3.9948669055579815e-06, "loss": 0.98684359, "num_input_tokens_seen": 18532510, "router_z_loss_clip": 5.4375, "router_z_loss_mlp": 0.6105957, "step": 867, "time_per_iteration": 2.629401445388794 }, { "auxiliary_loss_clip": 0.0694665, "auxiliary_loss_mlp": 0.01367601, "balance_loss_clip": 0.06405527, "balance_loss_mlp": 0.01306232, "epoch": 0.05218698331579739, "flos": 32606227036800.0, "grad_norm": 12.927185327843027, "language_loss": 0.65028739, "learning_rate": 3.9948389824223785e-06, "loss": 0.73342991, "num_input_tokens_seen": 18557380, "router_z_loss_clip": 5.41796875, "router_z_loss_mlp": 0.61376953, "step": 868, "time_per_iteration": 2.7288155555725098 }, { "auxiliary_loss_clip": 0.06970181, "auxiliary_loss_mlp": 0.01361389, "balance_loss_clip": 0.06399573, "balance_loss_mlp": 0.01291246, "epoch": 0.05224710656846535, "flos": 22133824369920.0, "grad_norm": 19.38173823308245, "language_loss": 0.86169767, "learning_rate": 3.994810983642281e-06, "loss": 0.9450134, "num_input_tokens_seen": 18575720, "router_z_loss_clip": 5.7109375, "router_z_loss_mlp": 0.70117188, "step": 869, "time_per_iteration": 2.6904990673065186 }, { "auxiliary_loss_clip": 0.06976946, "auxiliary_loss_mlp": 0.01356121, "balance_loss_clip": 0.06397468, "balance_loss_mlp": 0.01290365, "epoch": 0.052307229821133325, "flos": 11149789472640.0, "grad_norm": 5.430120444175442, "language_loss": 0.90660226, "learning_rate": 3.994782909218751e-06, "loss": 0.98993295, "num_input_tokens_seen": 18592185, "router_z_loss_clip": 5.80859375, "router_z_loss_mlp": 0.65722656, "step": 870, "time_per_iteration": 2.631592273712158 }, { "auxiliary_loss_clip": 0.06976657, "auxiliary_loss_mlp": 0.01351226, "balance_loss_clip": 0.06404389, "balance_loss_mlp": 0.01283706, "epoch": 0.05236735307380129, "flos": 19132862716800.0, "grad_norm": 6.1711786314441595, "language_loss": 0.83258122, "learning_rate": 3.994754759152854e-06, "loss": 0.91586006, "num_input_tokens_seen": 18609560, "router_z_loss_clip": 5.73046875, "router_z_loss_mlp": 0.67529297, "step": 871, "time_per_iteration": 2.652129650115967 }, { "auxiliary_loss_clip": 0.06939071, "auxiliary_loss_mlp": 0.01352857, "balance_loss_clip": 0.06396289, "balance_loss_mlp": 0.01291059, "epoch": 0.05242747632646926, "flos": 20967152446080.0, "grad_norm": 5.8512865584462626, "language_loss": 0.83531278, "learning_rate": 3.994726533445656e-06, "loss": 0.91823208, "num_input_tokens_seen": 18629405, "router_z_loss_clip": 5.42578125, "router_z_loss_mlp": 0.6171875, "step": 872, "time_per_iteration": 2.765220880508423 }, { "auxiliary_loss_clip": 0.06773912, "auxiliary_loss_mlp": 0.01429748, "balance_loss_clip": 0.06397347, "balance_loss_mlp": 0.0135498, "epoch": 0.052487599579137234, "flos": 65038005872640.0, "grad_norm": 0.8825720019176877, "language_loss": 0.61780709, "learning_rate": 3.9946982320982274e-06, "loss": 0.6998437, "num_input_tokens_seen": 18681480, "router_z_loss_clip": 3.765625, "router_z_loss_mlp": 0.74658203, "step": 873, "time_per_iteration": 3.1387808322906494 }, { "auxiliary_loss_clip": 0.06991202, "auxiliary_loss_mlp": 0.01335593, "balance_loss_clip": 0.06411646, "balance_loss_mlp": 0.01269504, "epoch": 0.0525477228318052, "flos": 23294584581120.0, "grad_norm": 3.7933126113421727, "language_loss": 0.91210377, "learning_rate": 3.994669855111643e-06, "loss": 0.99537176, "num_input_tokens_seen": 18700390, "router_z_loss_clip": 5.796875, "router_z_loss_mlp": 0.66113281, "step": 874, "time_per_iteration": 2.639739751815796 }, { "auxiliary_loss_clip": 0.06986094, "auxiliary_loss_mlp": 0.01331451, "balance_loss_clip": 0.06406635, "balance_loss_mlp": 0.01263597, "epoch": 0.05260784608447317, "flos": 32237834561280.0, "grad_norm": 3.050033879532819, "language_loss": 0.77646315, "learning_rate": 3.994641402486977e-06, "loss": 0.85963857, "num_input_tokens_seen": 18721280, "router_z_loss_clip": 5.79296875, "router_z_loss_mlp": 0.67871094, "step": 875, "time_per_iteration": 2.7695372104644775 }, { "auxiliary_loss_clip": 0.06990877, "auxiliary_loss_mlp": 0.01331706, "balance_loss_clip": 0.06415338, "balance_loss_mlp": 0.01263948, "epoch": 0.052667969337141136, "flos": 24470270818560.0, "grad_norm": 3.924035542608268, "language_loss": 0.94988197, "learning_rate": 3.99461287422531e-06, "loss": 1.03310776, "num_input_tokens_seen": 18741545, "router_z_loss_clip": 5.76171875, "router_z_loss_mlp": 0.67773438, "step": 876, "time_per_iteration": 2.6723616123199463 }, { "auxiliary_loss_clip": 0.06759, "auxiliary_loss_mlp": 0.01398088, "balance_loss_clip": 0.06380128, "balance_loss_mlp": 0.01338913, "epoch": 0.05272809258980911, "flos": 57804673034880.0, "grad_norm": 0.8679343004862465, "language_loss": 0.63118809, "learning_rate": 3.994584270327722e-06, "loss": 0.71275902, "num_input_tokens_seen": 18801400, "router_z_loss_clip": 3.78515625, "router_z_loss_mlp": 0.59033203, "step": 877, "time_per_iteration": 3.277306318283081 }, { "auxiliary_loss_clip": 0.07013722, "auxiliary_loss_mlp": 0.01327682, "balance_loss_clip": 0.06422149, "balance_loss_mlp": 0.01260829, "epoch": 0.05278821584247708, "flos": 17426578498560.0, "grad_norm": 7.776885813999353, "language_loss": 0.87702364, "learning_rate": 3.994555590795299e-06, "loss": 0.96043766, "num_input_tokens_seen": 18819670, "router_z_loss_clip": 5.9140625, "router_z_loss_mlp": 0.66894531, "step": 878, "time_per_iteration": 2.6235287189483643 }, { "auxiliary_loss_clip": 0.07027362, "auxiliary_loss_mlp": 0.0133185, "balance_loss_clip": 0.06424384, "balance_loss_mlp": 0.01262327, "epoch": 0.052848339095145046, "flos": 26143879144320.0, "grad_norm": 2.8725659890441824, "language_loss": 0.86160982, "learning_rate": 3.9945268356291275e-06, "loss": 0.94520187, "num_input_tokens_seen": 18840580, "router_z_loss_clip": 6.03515625, "router_z_loss_mlp": 0.6953125, "step": 879, "time_per_iteration": 2.713995933532715 }, { "auxiliary_loss_clip": 0.07003315, "auxiliary_loss_mlp": 0.01331262, "balance_loss_clip": 0.06431365, "balance_loss_mlp": 0.01262931, "epoch": 0.05290846234781302, "flos": 16477680136320.0, "grad_norm": 12.719371857306683, "language_loss": 0.87210822, "learning_rate": 3.9944980048302985e-06, "loss": 0.95545405, "num_input_tokens_seen": 18859295, "router_z_loss_clip": 5.7265625, "router_z_loss_mlp": 0.68310547, "step": 880, "time_per_iteration": 2.6463210582733154 }, { "auxiliary_loss_clip": 0.07038447, "auxiliary_loss_mlp": 0.0133543, "balance_loss_clip": 0.06437503, "balance_loss_mlp": 0.01258182, "epoch": 0.05296858560048098, "flos": 19871324749440.0, "grad_norm": 4.034727227987173, "language_loss": 0.91060352, "learning_rate": 3.994469098399906e-06, "loss": 0.99434233, "num_input_tokens_seen": 18877485, "router_z_loss_clip": 6.01171875, "router_z_loss_mlp": 0.77246094, "step": 881, "time_per_iteration": 2.629709482192993 }, { "auxiliary_loss_clip": 0.07035483, "auxiliary_loss_mlp": 0.01344281, "balance_loss_clip": 0.06438821, "balance_loss_mlp": 0.01265746, "epoch": 0.053028708853148955, "flos": 24395359668480.0, "grad_norm": 3.585170734433985, "language_loss": 0.9092831, "learning_rate": 3.994440116339046e-06, "loss": 0.99308079, "num_input_tokens_seen": 18898275, "router_z_loss_clip": 5.9609375, "router_z_loss_mlp": 0.78515625, "step": 882, "time_per_iteration": 2.656916618347168 }, { "auxiliary_loss_clip": 0.07056689, "auxiliary_loss_mlp": 0.01339295, "balance_loss_clip": 0.06448351, "balance_loss_mlp": 0.01261093, "epoch": 0.05308883210581693, "flos": 36402072048000.0, "grad_norm": 123.04432336545209, "language_loss": 0.72229356, "learning_rate": 3.994411058648816e-06, "loss": 0.80625337, "num_input_tokens_seen": 18920665, "router_z_loss_clip": 6.0859375, "router_z_loss_mlp": 0.78222656, "step": 883, "time_per_iteration": 2.7729275226593018 }, { "auxiliary_loss_clip": 0.07016774, "auxiliary_loss_mlp": 0.01331904, "balance_loss_clip": 0.06430573, "balance_loss_mlp": 0.01257517, "epoch": 0.05314895535848489, "flos": 22861427299200.0, "grad_norm": 4.282674763750217, "language_loss": 0.78297055, "learning_rate": 3.994381925330319e-06, "loss": 0.86645728, "num_input_tokens_seen": 18939835, "router_z_loss_clip": 5.859375, "router_z_loss_mlp": 0.74365234, "step": 884, "time_per_iteration": 2.656759023666382 }, { "auxiliary_loss_clip": 0.07004087, "auxiliary_loss_mlp": 0.01326042, "balance_loss_clip": 0.06438361, "balance_loss_mlp": 0.0125981, "epoch": 0.053209078611152864, "flos": 12865381493760.0, "grad_norm": 3.337582010487321, "language_loss": 0.88599998, "learning_rate": 3.994352716384659e-06, "loss": 0.96930122, "num_input_tokens_seen": 18958405, "router_z_loss_clip": 5.65625, "router_z_loss_mlp": 0.66259766, "step": 885, "time_per_iteration": 4.06245231628418 }, { "auxiliary_loss_clip": 0.07040052, "auxiliary_loss_mlp": 0.01331219, "balance_loss_clip": 0.06449106, "balance_loss_mlp": 0.01257309, "epoch": 0.05326920186382083, "flos": 12169112791680.0, "grad_norm": 5.788726102229348, "language_loss": 0.89611739, "learning_rate": 3.994323431812945e-06, "loss": 0.97983009, "num_input_tokens_seen": 18975445, "router_z_loss_clip": 5.91015625, "router_z_loss_mlp": 0.73925781, "step": 886, "time_per_iteration": 2.67983078956604 }, { "auxiliary_loss_clip": 0.07013536, "auxiliary_loss_mlp": 0.01332917, "balance_loss_clip": 0.06437603, "balance_loss_mlp": 0.01258625, "epoch": 0.0533293251164888, "flos": 22710011771520.0, "grad_norm": 3.780871294272927, "language_loss": 0.91903591, "learning_rate": 3.994294071616286e-06, "loss": 1.00250053, "num_input_tokens_seen": 18991930, "router_z_loss_clip": 5.76171875, "router_z_loss_mlp": 0.74316406, "step": 887, "time_per_iteration": 2.7848031520843506 }, { "auxiliary_loss_clip": 0.07037164, "auxiliary_loss_mlp": 0.01331507, "balance_loss_clip": 0.06452733, "balance_loss_mlp": 0.01256834, "epoch": 0.053389448369156774, "flos": 26947860670080.0, "grad_norm": 3.3884294240747432, "language_loss": 0.78062391, "learning_rate": 3.994264635795796e-06, "loss": 0.86431056, "num_input_tokens_seen": 19009790, "router_z_loss_clip": 5.8359375, "router_z_loss_mlp": 0.74658203, "step": 888, "time_per_iteration": 4.091126441955566 }, { "auxiliary_loss_clip": 0.07035064, "auxiliary_loss_mlp": 0.01325713, "balance_loss_clip": 0.06458724, "balance_loss_mlp": 0.01255284, "epoch": 0.05344957162182474, "flos": 25563331330560.0, "grad_norm": 5.930980487878811, "language_loss": 0.91476464, "learning_rate": 3.994235124352592e-06, "loss": 0.99837244, "num_input_tokens_seen": 19030170, "router_z_loss_clip": 5.7578125, "router_z_loss_mlp": 0.70410156, "step": 889, "time_per_iteration": 4.135676860809326 }, { "auxiliary_loss_clip": 0.07038309, "auxiliary_loss_mlp": 0.01325895, "balance_loss_clip": 0.06474581, "balance_loss_mlp": 0.01257708, "epoch": 0.05350969487449271, "flos": 19725779007360.0, "grad_norm": 3.2747584474589004, "language_loss": 0.92074233, "learning_rate": 3.994205537287791e-06, "loss": 1.00438452, "num_input_tokens_seen": 19048075, "router_z_loss_clip": 5.64453125, "router_z_loss_mlp": 0.68164062, "step": 890, "time_per_iteration": 4.031081914901733 }, { "auxiliary_loss_clip": 0.07057517, "auxiliary_loss_mlp": 0.01333952, "balance_loss_clip": 0.06471993, "balance_loss_mlp": 0.01264238, "epoch": 0.053569818127160676, "flos": 27023694215040.0, "grad_norm": 3.3715636361983092, "language_loss": 0.96893251, "learning_rate": 3.994175874602517e-06, "loss": 1.05284715, "num_input_tokens_seen": 19067465, "router_z_loss_clip": 5.859375, "router_z_loss_mlp": 0.69775391, "step": 891, "time_per_iteration": 2.685680389404297 }, { "auxiliary_loss_clip": 0.07037006, "auxiliary_loss_mlp": 0.01335873, "balance_loss_clip": 0.06467102, "balance_loss_mlp": 0.01261486, "epoch": 0.05362994137982865, "flos": 13193383501440.0, "grad_norm": 3.406725266811041, "language_loss": 0.73980451, "learning_rate": 3.994146136297893e-06, "loss": 0.8235333, "num_input_tokens_seen": 19085505, "router_z_loss_clip": 5.703125, "router_z_loss_mlp": 0.74462891, "step": 892, "time_per_iteration": 2.614439010620117 }, { "auxiliary_loss_clip": 0.0706319, "auxiliary_loss_mlp": 0.01339987, "balance_loss_clip": 0.06478012, "balance_loss_mlp": 0.01266172, "epoch": 0.05369006463249662, "flos": 28665590970240.0, "grad_norm": 3.68292973718263, "language_loss": 0.84681326, "learning_rate": 3.994116322375049e-06, "loss": 0.93084496, "num_input_tokens_seen": 19104360, "router_z_loss_clip": 5.84765625, "router_z_loss_mlp": 0.73828125, "step": 893, "time_per_iteration": 2.7020463943481445 }, { "auxiliary_loss_clip": 0.07071498, "auxiliary_loss_mlp": 0.01333424, "balance_loss_clip": 0.06486666, "balance_loss_mlp": 0.01264378, "epoch": 0.053750187885164585, "flos": 28920736252800.0, "grad_norm": 3.613105524231044, "language_loss": 0.84623915, "learning_rate": 3.994086432835114e-06, "loss": 0.93028837, "num_input_tokens_seen": 19124680, "router_z_loss_clip": 5.83984375, "router_z_loss_mlp": 0.69042969, "step": 894, "time_per_iteration": 2.7133023738861084 }, { "auxiliary_loss_clip": 0.07044563, "auxiliary_loss_mlp": 0.01324032, "balance_loss_clip": 0.06475604, "balance_loss_mlp": 0.01256083, "epoch": 0.05381031113783256, "flos": 15164246586240.0, "grad_norm": 4.830246662774341, "language_loss": 0.78217983, "learning_rate": 3.994056467679221e-06, "loss": 0.86586583, "num_input_tokens_seen": 19142895, "router_z_loss_clip": 5.6875, "router_z_loss_mlp": 0.6796875, "step": 895, "time_per_iteration": 2.6482059955596924 }, { "auxiliary_loss_clip": 0.07076214, "auxiliary_loss_mlp": 0.01330587, "balance_loss_clip": 0.06481346, "balance_loss_mlp": 0.0125496, "epoch": 0.05387043439050053, "flos": 21841684709760.0, "grad_norm": 3.5306683936609544, "language_loss": 0.89617562, "learning_rate": 3.9940264269085065e-06, "loss": 0.98024362, "num_input_tokens_seen": 19163125, "router_z_loss_clip": 5.953125, "router_z_loss_mlp": 0.75585938, "step": 896, "time_per_iteration": 2.6998372077941895 }, { "auxiliary_loss_clip": 0.07074694, "auxiliary_loss_mlp": 0.01333557, "balance_loss_clip": 0.06489147, "balance_loss_mlp": 0.01261888, "epoch": 0.053930557643168495, "flos": 17315888855040.0, "grad_norm": 3.508407806831503, "language_loss": 0.90716487, "learning_rate": 3.9939963105241115e-06, "loss": 0.99124742, "num_input_tokens_seen": 19179385, "router_z_loss_clip": 5.86328125, "router_z_loss_mlp": 0.71582031, "step": 897, "time_per_iteration": 2.6511428356170654 }, { "auxiliary_loss_clip": 0.07031922, "auxiliary_loss_mlp": 0.01328865, "balance_loss_clip": 0.06476996, "balance_loss_mlp": 0.01258293, "epoch": 0.05399068089583647, "flos": 17354350679040.0, "grad_norm": 3.239313764239217, "language_loss": 0.92410642, "learning_rate": 3.993966118527175e-06, "loss": 1.00771427, "num_input_tokens_seen": 19198725, "router_z_loss_clip": 5.546875, "router_z_loss_mlp": 0.70605469, "step": 898, "time_per_iteration": 2.650688409805298 }, { "auxiliary_loss_clip": 0.0706577, "auxiliary_loss_mlp": 0.01335157, "balance_loss_clip": 0.06467015, "balance_loss_mlp": 0.0125748, "epoch": 0.05405080414850443, "flos": 17491594867200.0, "grad_norm": 9.478271811911624, "language_loss": 0.95334655, "learning_rate": 3.993935850918845e-06, "loss": 1.0373559, "num_input_tokens_seen": 19212380, "router_z_loss_clip": 5.98828125, "router_z_loss_mlp": 0.77636719, "step": 899, "time_per_iteration": 2.607253313064575 }, { "auxiliary_loss_clip": 0.07027786, "auxiliary_loss_mlp": 0.01323293, "balance_loss_clip": 0.06471542, "balance_loss_mlp": 0.01254247, "epoch": 0.054110927401172404, "flos": 24503365981440.0, "grad_norm": 3.207976602082201, "language_loss": 0.77854413, "learning_rate": 3.9939055077002665e-06, "loss": 0.86205494, "num_input_tokens_seen": 19232235, "router_z_loss_clip": 5.5625, "router_z_loss_mlp": 0.69140625, "step": 900, "time_per_iteration": 2.6676182746887207 }, { "auxiliary_loss_clip": 0.07047489, "auxiliary_loss_mlp": 0.01321065, "balance_loss_clip": 0.06469654, "balance_loss_mlp": 0.01253449, "epoch": 0.054171050653840376, "flos": 22936715792640.0, "grad_norm": 5.922430068869974, "language_loss": 0.77758533, "learning_rate": 3.993875088872592e-06, "loss": 0.8612709, "num_input_tokens_seen": 19251460, "router_z_loss_clip": 5.76953125, "router_z_loss_mlp": 0.67529297, "step": 901, "time_per_iteration": 2.6460626125335693 }, { "auxiliary_loss_clip": 0.07001346, "auxiliary_loss_mlp": 0.01321724, "balance_loss_clip": 0.06452878, "balance_loss_mlp": 0.01258019, "epoch": 0.05423117390650834, "flos": 12938238218880.0, "grad_norm": 4.297565984115266, "language_loss": 0.87827295, "learning_rate": 3.9938445944369745e-06, "loss": 0.96150362, "num_input_tokens_seen": 19269060, "router_z_loss_clip": 5.484375, "router_z_loss_mlp": 0.63720703, "step": 902, "time_per_iteration": 2.614790916442871 }, { "auxiliary_loss_clip": 0.07037289, "auxiliary_loss_mlp": 0.01320683, "balance_loss_clip": 0.06461436, "balance_loss_mlp": 0.01254689, "epoch": 0.05429129715917631, "flos": 19907438659200.0, "grad_norm": 2.606392686792414, "language_loss": 0.89259619, "learning_rate": 3.993814024394569e-06, "loss": 0.9761759, "num_input_tokens_seen": 19288620, "router_z_loss_clip": 5.7578125, "router_z_loss_mlp": 0.65966797, "step": 903, "time_per_iteration": 2.6933581829071045 }, { "auxiliary_loss_clip": 0.07044423, "auxiliary_loss_mlp": 0.01329778, "balance_loss_clip": 0.06460882, "balance_loss_mlp": 0.01260732, "epoch": 0.05435142041184428, "flos": 16914065800320.0, "grad_norm": 3.7826191495898143, "language_loss": 0.78973627, "learning_rate": 3.993783378746537e-06, "loss": 0.87347829, "num_input_tokens_seen": 19306615, "router_z_loss_clip": 5.83203125, "router_z_loss_mlp": 0.69091797, "step": 904, "time_per_iteration": 2.635842800140381 }, { "auxiliary_loss_clip": 0.07026307, "auxiliary_loss_mlp": 0.01323718, "balance_loss_clip": 0.06437198, "balance_loss_mlp": 0.01254005, "epoch": 0.05441154366451225, "flos": 23954613592320.0, "grad_norm": 3.0370739575226677, "language_loss": 0.87755048, "learning_rate": 3.993752657494039e-06, "loss": 0.96105075, "num_input_tokens_seen": 19321680, "router_z_loss_clip": 5.890625, "router_z_loss_mlp": 0.69775391, "step": 905, "time_per_iteration": 2.630345582962036 }, { "auxiliary_loss_clip": 0.06987349, "auxiliary_loss_mlp": 0.01318128, "balance_loss_clip": 0.0644051, "balance_loss_mlp": 0.0125819, "epoch": 0.05447166691718022, "flos": 19981678976640.0, "grad_norm": 3.72998642108642, "language_loss": 0.76572859, "learning_rate": 3.993721860638241e-06, "loss": 0.84878337, "num_input_tokens_seen": 19339760, "router_z_loss_clip": 5.47265625, "router_z_loss_mlp": 0.59960938, "step": 906, "time_per_iteration": 2.657399892807007 }, { "auxiliary_loss_clip": 0.07010697, "auxiliary_loss_mlp": 0.01331821, "balance_loss_clip": 0.06443925, "balance_loss_mlp": 0.01265636, "epoch": 0.05453179016984819, "flos": 24943483152000.0, "grad_norm": 2.73202421304937, "language_loss": 0.8984735, "learning_rate": 3.993690988180309e-06, "loss": 0.98189867, "num_input_tokens_seen": 19359585, "router_z_loss_clip": 5.66796875, "router_z_loss_mlp": 0.66210938, "step": 907, "time_per_iteration": 2.70552659034729 }, { "auxiliary_loss_clip": 0.07009655, "auxiliary_loss_mlp": 0.01336487, "balance_loss_clip": 0.0644108, "balance_loss_mlp": 0.01266773, "epoch": 0.05459191342251616, "flos": 18121170119040.0, "grad_norm": 3.7247943925872202, "language_loss": 0.89606416, "learning_rate": 3.9936600401214165e-06, "loss": 0.97952551, "num_input_tokens_seen": 19378590, "router_z_loss_clip": 5.68359375, "router_z_loss_mlp": 0.69628906, "step": 908, "time_per_iteration": 2.624128818511963 }, { "auxiliary_loss_clip": 0.07028729, "auxiliary_loss_mlp": 0.01330329, "balance_loss_clip": 0.06449239, "balance_loss_mlp": 0.01265908, "epoch": 0.054652036675184125, "flos": 19214314485120.0, "grad_norm": 3.696802061528334, "language_loss": 0.92750657, "learning_rate": 3.9936290164627345e-06, "loss": 1.01109719, "num_input_tokens_seen": 19397910, "router_z_loss_clip": 5.79296875, "router_z_loss_mlp": 0.64404297, "step": 909, "time_per_iteration": 2.730168342590332 }, { "auxiliary_loss_clip": 0.07054898, "auxiliary_loss_mlp": 0.01343761, "balance_loss_clip": 0.06464791, "balance_loss_mlp": 0.01270901, "epoch": 0.0547121599278521, "flos": 16331253926400.0, "grad_norm": 3.824338952515333, "language_loss": 0.74160182, "learning_rate": 3.99359791720544e-06, "loss": 0.82558846, "num_input_tokens_seen": 19415950, "router_z_loss_clip": 5.8984375, "router_z_loss_mlp": 0.72851562, "step": 910, "time_per_iteration": 2.6440482139587402 }, { "auxiliary_loss_clip": 0.07023662, "auxiliary_loss_mlp": 0.01338221, "balance_loss_clip": 0.06449929, "balance_loss_mlp": 0.01272537, "epoch": 0.05477228318052007, "flos": 20345165988480.0, "grad_norm": 3.7642494282148635, "language_loss": 0.85761505, "learning_rate": 3.993566742350714e-06, "loss": 0.94123387, "num_input_tokens_seen": 19435275, "router_z_loss_clip": 5.73828125, "router_z_loss_mlp": 0.65698242, "step": 911, "time_per_iteration": 2.6990997791290283 }, { "auxiliary_loss_clip": 0.07051647, "auxiliary_loss_mlp": 0.01347684, "balance_loss_clip": 0.0646567, "balance_loss_mlp": 0.01274919, "epoch": 0.054832406433188034, "flos": 21978216138240.0, "grad_norm": 4.282609375581314, "language_loss": 0.79087842, "learning_rate": 3.993535491899736e-06, "loss": 0.87487173, "num_input_tokens_seen": 19452090, "router_z_loss_clip": 5.859375, "router_z_loss_mlp": 0.72753906, "step": 912, "time_per_iteration": 2.638399600982666 }, { "auxiliary_loss_clip": 0.07018465, "auxiliary_loss_mlp": 0.01330377, "balance_loss_clip": 0.06454241, "balance_loss_mlp": 0.01265145, "epoch": 0.054892529685856006, "flos": 16404487994880.0, "grad_norm": 7.421294005979243, "language_loss": 0.85555679, "learning_rate": 3.993504165853694e-06, "loss": 0.93904519, "num_input_tokens_seen": 19470865, "router_z_loss_clip": 5.64453125, "router_z_loss_mlp": 0.65185547, "step": 913, "time_per_iteration": 2.6384260654449463 }, { "auxiliary_loss_clip": 0.07035242, "auxiliary_loss_mlp": 0.01344245, "balance_loss_clip": 0.06462297, "balance_loss_mlp": 0.01276916, "epoch": 0.05495265293852397, "flos": 23918709317760.0, "grad_norm": 3.0760158636114627, "language_loss": 0.8558929, "learning_rate": 3.993472764213772e-06, "loss": 0.93968773, "num_input_tokens_seen": 19492145, "router_z_loss_clip": 5.71875, "router_z_loss_mlp": 0.67333984, "step": 914, "time_per_iteration": 2.6973021030426025 }, { "auxiliary_loss_clip": 0.07042215, "auxiliary_loss_mlp": 0.01352952, "balance_loss_clip": 0.06471132, "balance_loss_mlp": 0.01284001, "epoch": 0.055012776191191944, "flos": 23593767984000.0, "grad_norm": 4.522334705788564, "language_loss": 0.93402672, "learning_rate": 3.9934412869811655e-06, "loss": 1.01797831, "num_input_tokens_seen": 19511015, "router_z_loss_clip": 5.71875, "router_z_loss_mlp": 0.68945312, "step": 915, "time_per_iteration": 2.680291175842285 }, { "auxiliary_loss_clip": 0.07027879, "auxiliary_loss_mlp": 0.01346968, "balance_loss_clip": 0.06467623, "balance_loss_mlp": 0.01280879, "epoch": 0.055072899443859916, "flos": 17533997832960.0, "grad_norm": 3.194415924432816, "language_loss": 0.9225449, "learning_rate": 3.993409734157064e-06, "loss": 1.0062933, "num_input_tokens_seen": 19529040, "router_z_loss_clip": 5.6015625, "router_z_loss_mlp": 0.66113281, "step": 916, "time_per_iteration": 2.679859161376953 }, { "auxiliary_loss_clip": 0.07060275, "auxiliary_loss_mlp": 0.0135794, "balance_loss_clip": 0.06458248, "balance_loss_mlp": 0.01280359, "epoch": 0.05513302269652788, "flos": 21693246001920.0, "grad_norm": 3.9850852164858757, "language_loss": 0.82232028, "learning_rate": 3.993378105742666e-06, "loss": 0.90650243, "num_input_tokens_seen": 19549540, "router_z_loss_clip": 6.015625, "router_z_loss_mlp": 0.77636719, "step": 917, "time_per_iteration": 2.6715087890625 }, { "auxiliary_loss_clip": 0.07039833, "auxiliary_loss_mlp": 0.01348614, "balance_loss_clip": 0.0645812, "balance_loss_mlp": 0.01275611, "epoch": 0.05519314594919585, "flos": 21619257246720.0, "grad_norm": 19.85183901887281, "language_loss": 0.82756221, "learning_rate": 3.9933464017391705e-06, "loss": 0.91144669, "num_input_tokens_seen": 19567570, "router_z_loss_clip": 5.8125, "router_z_loss_mlp": 0.72998047, "step": 918, "time_per_iteration": 2.673781156539917 }, { "auxiliary_loss_clip": 0.07055612, "auxiliary_loss_mlp": 0.0135342, "balance_loss_clip": 0.06463441, "balance_loss_mlp": 0.01278462, "epoch": 0.05525326920186382, "flos": 21804983821440.0, "grad_norm": 5.136088475717337, "language_loss": 0.91050416, "learning_rate": 3.99331462214778e-06, "loss": 0.99459445, "num_input_tokens_seen": 19585330, "router_z_loss_clip": 5.92578125, "router_z_loss_mlp": 0.74951172, "step": 919, "time_per_iteration": 2.672171115875244 }, { "auxiliary_loss_clip": 0.07016006, "auxiliary_loss_mlp": 0.0132326, "balance_loss_clip": 0.06447516, "balance_loss_mlp": 0.01258792, "epoch": 0.05531339245453179, "flos": 28447272357120.0, "grad_norm": 4.400623515023514, "language_loss": 0.90684527, "learning_rate": 3.993282766969699e-06, "loss": 0.99023795, "num_input_tokens_seen": 19604970, "router_z_loss_clip": 5.6875, "router_z_loss_mlp": 0.64501953, "step": 920, "time_per_iteration": 2.7246055603027344 }, { "auxiliary_loss_clip": 0.07005861, "auxiliary_loss_mlp": 0.01327201, "balance_loss_clip": 0.0644387, "balance_loss_mlp": 0.01261827, "epoch": 0.05537351570719976, "flos": 37383688229760.0, "grad_norm": 4.716706188353569, "language_loss": 0.67978835, "learning_rate": 3.993250836206136e-06, "loss": 0.76311898, "num_input_tokens_seen": 19626235, "router_z_loss_clip": 5.62109375, "router_z_loss_mlp": 0.65332031, "step": 921, "time_per_iteration": 2.783457040786743 }, { "auxiliary_loss_clip": 0.0704613, "auxiliary_loss_mlp": 0.01338325, "balance_loss_clip": 0.06452499, "balance_loss_mlp": 0.01268945, "epoch": 0.05543363895986773, "flos": 20090733465600.0, "grad_norm": 7.93700522608525, "language_loss": 0.75091624, "learning_rate": 3.993218829858301e-06, "loss": 0.83476079, "num_input_tokens_seen": 19644305, "router_z_loss_clip": 5.9375, "router_z_loss_mlp": 0.69433594, "step": 922, "time_per_iteration": 2.6528234481811523 }, { "auxiliary_loss_clip": 0.07014701, "auxiliary_loss_mlp": 0.01327756, "balance_loss_clip": 0.06443498, "balance_loss_mlp": 0.01262191, "epoch": 0.0554937622125357, "flos": 24539773380480.0, "grad_norm": 8.978476512601588, "language_loss": 0.85553777, "learning_rate": 3.993186747927408e-06, "loss": 0.93896234, "num_input_tokens_seen": 19662130, "router_z_loss_clip": 5.71484375, "router_z_loss_mlp": 0.65625, "step": 923, "time_per_iteration": 2.656796455383301 }, { "auxiliary_loss_clip": 0.07031158, "auxiliary_loss_mlp": 0.01334699, "balance_loss_clip": 0.06449839, "balance_loss_mlp": 0.01267322, "epoch": 0.055553885465203665, "flos": 14325408961920.0, "grad_norm": 4.0458273636844675, "language_loss": 0.82161236, "learning_rate": 3.993154590414675e-06, "loss": 0.90527093, "num_input_tokens_seen": 19680715, "router_z_loss_clip": 5.80859375, "router_z_loss_mlp": 0.67382812, "step": 924, "time_per_iteration": 2.6386351585388184 }, { "auxiliary_loss_clip": 0.07001401, "auxiliary_loss_mlp": 0.01336544, "balance_loss_clip": 0.06436484, "balance_loss_mlp": 0.01270693, "epoch": 0.05561400871787164, "flos": 27388522892160.0, "grad_norm": 5.639887799263743, "language_loss": 1.04798317, "learning_rate": 3.993122357321319e-06, "loss": 1.13136268, "num_input_tokens_seen": 19700535, "router_z_loss_clip": 5.64453125, "router_z_loss_mlp": 0.65820312, "step": 925, "time_per_iteration": 4.224088668823242 }, { "auxiliary_loss_clip": 0.07004342, "auxiliary_loss_mlp": 0.01337244, "balance_loss_clip": 0.06437448, "balance_loss_mlp": 0.01270725, "epoch": 0.05567413197053961, "flos": 23227681495680.0, "grad_norm": 2.9574918445200957, "language_loss": 0.83379328, "learning_rate": 3.993090048648564e-06, "loss": 0.91720915, "num_input_tokens_seen": 19718825, "router_z_loss_clip": 5.66796875, "router_z_loss_mlp": 0.66503906, "step": 926, "time_per_iteration": 2.677828311920166 }, { "auxiliary_loss_clip": 0.07063979, "auxiliary_loss_mlp": 0.01347565, "balance_loss_clip": 0.06446829, "balance_loss_mlp": 0.01269364, "epoch": 0.055734255223207574, "flos": 25271988284160.0, "grad_norm": 7.8108263610906565, "language_loss": 0.77720487, "learning_rate": 3.993057664397634e-06, "loss": 0.86132026, "num_input_tokens_seen": 19739080, "router_z_loss_clip": 6.16796875, "router_z_loss_mlp": 0.78222656, "step": 927, "time_per_iteration": 4.145713806152344 }, { "auxiliary_loss_clip": 0.06809035, "auxiliary_loss_mlp": 0.01303092, "balance_loss_clip": 0.0644419, "balance_loss_mlp": 0.01267067, "epoch": 0.055794378475875546, "flos": 66524698938240.0, "grad_norm": 0.7889975169960105, "language_loss": 0.59785306, "learning_rate": 3.9930252045697585e-06, "loss": 0.67897427, "num_input_tokens_seen": 19802960, "router_z_loss_clip": 3.65429688, "router_z_loss_mlp": 0.36108398, "step": 928, "time_per_iteration": 3.3333184719085693 }, { "auxiliary_loss_clip": 0.06983252, "auxiliary_loss_mlp": 0.01339879, "balance_loss_clip": 0.06424144, "balance_loss_mlp": 0.01272073, "epoch": 0.05585450172854351, "flos": 25344635374080.0, "grad_norm": 7.473015168676801, "language_loss": 0.98719501, "learning_rate": 3.992992669166168e-06, "loss": 1.07042623, "num_input_tokens_seen": 19822765, "router_z_loss_clip": 5.6015625, "router_z_loss_mlp": 0.67773438, "step": 929, "time_per_iteration": 5.478257417678833 }, { "auxiliary_loss_clip": 0.06988022, "auxiliary_loss_mlp": 0.013292, "balance_loss_clip": 0.06424532, "balance_loss_mlp": 0.01264064, "epoch": 0.05591462498121148, "flos": 33920163711360.0, "grad_norm": 4.6208554021782895, "language_loss": 0.74273825, "learning_rate": 3.992960058188094e-06, "loss": 0.82591045, "num_input_tokens_seen": 19843590, "router_z_loss_clip": 5.640625, "router_z_loss_mlp": 0.65136719, "step": 930, "time_per_iteration": 2.7701520919799805 }, { "auxiliary_loss_clip": 0.06988774, "auxiliary_loss_mlp": 0.01342377, "balance_loss_clip": 0.06424791, "balance_loss_mlp": 0.01276765, "epoch": 0.055974748233879455, "flos": 17936617501440.0, "grad_norm": 7.218279681558707, "language_loss": 0.87794346, "learning_rate": 3.992927371636776e-06, "loss": 0.96125495, "num_input_tokens_seen": 19860230, "router_z_loss_clip": 5.63671875, "router_z_loss_mlp": 0.65673828, "step": 931, "time_per_iteration": 2.7697629928588867 }, { "auxiliary_loss_clip": 0.06986471, "auxiliary_loss_mlp": 0.01334449, "balance_loss_clip": 0.06418632, "balance_loss_mlp": 0.01269551, "epoch": 0.05603487148654742, "flos": 24028392712320.0, "grad_norm": 3.005511934395981, "language_loss": 0.86582875, "learning_rate": 3.9928946095134525e-06, "loss": 0.94903797, "num_input_tokens_seen": 19880795, "router_z_loss_clip": 5.6796875, "router_z_loss_mlp": 0.6484375, "step": 932, "time_per_iteration": 2.7142741680145264 }, { "auxiliary_loss_clip": 0.06981857, "auxiliary_loss_mlp": 0.01333595, "balance_loss_clip": 0.06418051, "balance_loss_mlp": 0.0126946, "epoch": 0.05609499473921539, "flos": 17312912035200.0, "grad_norm": 10.588987204936076, "language_loss": 0.78387892, "learning_rate": 3.992861771819365e-06, "loss": 0.86703342, "num_input_tokens_seen": 19897960, "router_z_loss_clip": 5.63671875, "router_z_loss_mlp": 0.64135742, "step": 933, "time_per_iteration": 2.6395018100738525 }, { "auxiliary_loss_clip": 0.06948254, "auxiliary_loss_mlp": 0.01319822, "balance_loss_clip": 0.06401913, "balance_loss_mlp": 0.01260933, "epoch": 0.05615511799188336, "flos": 21000834587520.0, "grad_norm": 8.413888756175096, "language_loss": 0.89410168, "learning_rate": 3.99282885855576e-06, "loss": 0.97678244, "num_input_tokens_seen": 19913315, "router_z_loss_clip": 5.4609375, "router_z_loss_mlp": 0.58886719, "step": 934, "time_per_iteration": 2.644928455352783 }, { "auxiliary_loss_clip": 0.06921855, "auxiliary_loss_mlp": 0.01323808, "balance_loss_clip": 0.06403126, "balance_loss_mlp": 0.01268877, "epoch": 0.05621524124455133, "flos": 17279062185600.0, "grad_norm": 6.077493065760864, "language_loss": 0.83233416, "learning_rate": 3.992795869723885e-06, "loss": 0.91479075, "num_input_tokens_seen": 19928790, "router_z_loss_clip": 5.19140625, "router_z_loss_mlp": 0.55029297, "step": 935, "time_per_iteration": 2.630181312561035 }, { "auxiliary_loss_clip": 0.06756425, "auxiliary_loss_mlp": 0.01294882, "balance_loss_clip": 0.06394777, "balance_loss_mlp": 0.01256616, "epoch": 0.0562753644972193, "flos": 58737597194880.0, "grad_norm": 0.7918854634392748, "language_loss": 0.69103956, "learning_rate": 3.99276280532499e-06, "loss": 0.77155262, "num_input_tokens_seen": 19988785, "router_z_loss_clip": 3.62304688, "router_z_loss_mlp": 0.38208008, "step": 936, "time_per_iteration": 3.211699962615967 }, { "auxiliary_loss_clip": 0.06965613, "auxiliary_loss_mlp": 0.01356063, "balance_loss_clip": 0.06407349, "balance_loss_mlp": 0.01285635, "epoch": 0.05633548774988727, "flos": 17462776262400.0, "grad_norm": 7.429718649473098, "language_loss": 0.79578388, "learning_rate": 3.992729665360331e-06, "loss": 0.87900066, "num_input_tokens_seen": 20007685, "router_z_loss_clip": 5.58984375, "router_z_loss_mlp": 0.70410156, "step": 937, "time_per_iteration": 2.6845245361328125 }, { "auxiliary_loss_clip": 0.06723402, "auxiliary_loss_mlp": 0.01311303, "balance_loss_clip": 0.06365992, "balance_loss_mlp": 0.01265384, "epoch": 0.05639561100255524, "flos": 70675939042560.0, "grad_norm": 0.8465343984020223, "language_loss": 0.64401186, "learning_rate": 3.992696449831162e-06, "loss": 0.72435898, "num_input_tokens_seen": 20072750, "router_z_loss_clip": 3.5703125, "router_z_loss_mlp": 0.45874023, "step": 938, "time_per_iteration": 3.2352135181427 }, { "auxiliary_loss_clip": 0.06980278, "auxiliary_loss_mlp": 0.01379658, "balance_loss_clip": 0.06411511, "balance_loss_mlp": 0.01308085, "epoch": 0.056455734255223204, "flos": 20492346885120.0, "grad_norm": 5.817380422883121, "language_loss": 0.82404077, "learning_rate": 3.992663158738745e-06, "loss": 0.9076401, "num_input_tokens_seen": 20089070, "router_z_loss_clip": 5.69140625, "router_z_loss_mlp": 0.71533203, "step": 939, "time_per_iteration": 2.66983699798584 }, { "auxiliary_loss_clip": 0.06950864, "auxiliary_loss_mlp": 0.01374438, "balance_loss_clip": 0.06412613, "balance_loss_mlp": 0.01308682, "epoch": 0.056515857507891176, "flos": 22059961395840.0, "grad_norm": 4.781785077239573, "language_loss": 0.76151478, "learning_rate": 3.992629792084341e-06, "loss": 0.84476781, "num_input_tokens_seen": 20108790, "router_z_loss_clip": 5.3828125, "router_z_loss_mlp": 0.65820312, "step": 940, "time_per_iteration": 2.761112689971924 }, { "auxiliary_loss_clip": 0.06935409, "auxiliary_loss_mlp": 0.01389487, "balance_loss_clip": 0.06405176, "balance_loss_mlp": 0.01319058, "epoch": 0.05657598076055915, "flos": 24032291927040.0, "grad_norm": 3.187925287835243, "language_loss": 0.73360771, "learning_rate": 3.992596349869216e-06, "loss": 0.81685662, "num_input_tokens_seen": 20128455, "router_z_loss_clip": 5.30078125, "router_z_loss_mlp": 0.70410156, "step": 941, "time_per_iteration": 2.6992452144622803 }, { "auxiliary_loss_clip": 0.06945658, "auxiliary_loss_mlp": 0.01378635, "balance_loss_clip": 0.06398071, "balance_loss_mlp": 0.01310066, "epoch": 0.05663610401322711, "flos": 20486057829120.0, "grad_norm": 4.608346490918788, "language_loss": 0.82308596, "learning_rate": 3.992562832094637e-06, "loss": 0.90632892, "num_input_tokens_seen": 20145775, "router_z_loss_clip": 5.48046875, "router_z_loss_mlp": 0.68652344, "step": 942, "time_per_iteration": 2.6976025104522705 }, { "auxiliary_loss_clip": 0.06938953, "auxiliary_loss_mlp": 0.0136955, "balance_loss_clip": 0.06401725, "balance_loss_mlp": 0.0130265, "epoch": 0.056696227265895086, "flos": 21075368394240.0, "grad_norm": 5.062266729241667, "language_loss": 0.91752386, "learning_rate": 3.9925292387618755e-06, "loss": 1.00060892, "num_input_tokens_seen": 20164315, "router_z_loss_clip": 5.37890625, "router_z_loss_mlp": 0.66992188, "step": 943, "time_per_iteration": 2.6652562618255615 }, { "auxiliary_loss_clip": 0.06934037, "auxiliary_loss_mlp": 0.01353365, "balance_loss_clip": 0.06398888, "balance_loss_mlp": 0.01290184, "epoch": 0.05675635051856306, "flos": 17827017960960.0, "grad_norm": 7.312947439782761, "language_loss": 0.78526902, "learning_rate": 3.992495569872206e-06, "loss": 0.86814302, "num_input_tokens_seen": 20182760, "router_z_loss_clip": 5.34765625, "router_z_loss_mlp": 0.63183594, "step": 944, "time_per_iteration": 2.638725996017456 }, { "auxiliary_loss_clip": 0.06927779, "auxiliary_loss_mlp": 0.01377735, "balance_loss_clip": 0.06396958, "balance_loss_mlp": 0.01312027, "epoch": 0.05681647377123102, "flos": 23122065024000.0, "grad_norm": 3.1381933589172792, "language_loss": 0.81710505, "learning_rate": 3.992461825426906e-06, "loss": 0.90016025, "num_input_tokens_seen": 20203830, "router_z_loss_clip": 5.3125, "router_z_loss_mlp": 0.65722656, "step": 945, "time_per_iteration": 2.6897506713867188 }, { "auxiliary_loss_clip": 0.06937259, "auxiliary_loss_mlp": 0.01380081, "balance_loss_clip": 0.06398278, "balance_loss_mlp": 0.01314707, "epoch": 0.056876597023898995, "flos": 16076024789760.0, "grad_norm": 4.7063355453076285, "language_loss": 0.85361469, "learning_rate": 3.992428005427252e-06, "loss": 0.93678814, "num_input_tokens_seen": 20220365, "router_z_loss_clip": 5.38671875, "router_z_loss_mlp": 0.65380859, "step": 946, "time_per_iteration": 2.610459566116333 }, { "auxiliary_loss_clip": 0.0695236, "auxiliary_loss_mlp": 0.01369939, "balance_loss_clip": 0.06396273, "balance_loss_mlp": 0.01299271, "epoch": 0.05693672027656696, "flos": 16841083294080.0, "grad_norm": 3.3682319921439903, "language_loss": 0.81748629, "learning_rate": 3.992394109874529e-06, "loss": 0.90070927, "num_input_tokens_seen": 20238640, "router_z_loss_clip": 5.55859375, "router_z_loss_mlp": 0.70654297, "step": 947, "time_per_iteration": 2.646082639694214 }, { "auxiliary_loss_clip": 0.06973866, "auxiliary_loss_mlp": 0.01360496, "balance_loss_clip": 0.06404343, "balance_loss_mlp": 0.0129207, "epoch": 0.05699684352923493, "flos": 21394104526080.0, "grad_norm": 6.071324417818573, "language_loss": 0.89129198, "learning_rate": 3.9923601387700225e-06, "loss": 0.9746356, "num_input_tokens_seen": 20251025, "router_z_loss_clip": 5.69140625, "router_z_loss_mlp": 0.68408203, "step": 948, "time_per_iteration": 2.635869026184082 }, { "auxiliary_loss_clip": 0.06943691, "auxiliary_loss_mlp": 0.01346834, "balance_loss_clip": 0.06407556, "balance_loss_mlp": 0.01280697, "epoch": 0.057056966781902904, "flos": 15565818078720.0, "grad_norm": 6.378741559708573, "language_loss": 0.89860737, "learning_rate": 3.992326092115019e-06, "loss": 0.98151267, "num_input_tokens_seen": 20269775, "router_z_loss_clip": 5.359375, "router_z_loss_mlp": 0.66113281, "step": 949, "time_per_iteration": 2.639361619949341 }, { "auxiliary_loss_clip": 0.06921871, "auxiliary_loss_mlp": 0.01327287, "balance_loss_clip": 0.06401557, "balance_loss_mlp": 0.01268302, "epoch": 0.05711709003457087, "flos": 19943971839360.0, "grad_norm": 7.3788660478010435, "language_loss": 0.81434751, "learning_rate": 3.992291969910811e-06, "loss": 0.89683902, "num_input_tokens_seen": 20287715, "router_z_loss_clip": 5.203125, "router_z_loss_mlp": 0.58935547, "step": 950, "time_per_iteration": 2.6154983043670654 }, { "auxiliary_loss_clip": 0.06953124, "auxiliary_loss_mlp": 0.01336247, "balance_loss_clip": 0.064138, "balance_loss_mlp": 0.01273066, "epoch": 0.05717721328723884, "flos": 30339953982720.0, "grad_norm": 9.590897730889148, "language_loss": 0.84578055, "learning_rate": 3.992257772158691e-06, "loss": 0.92867428, "num_input_tokens_seen": 20307070, "router_z_loss_clip": 5.3984375, "router_z_loss_mlp": 0.63183594, "step": 951, "time_per_iteration": 2.726933479309082 }, { "auxiliary_loss_clip": 0.06952639, "auxiliary_loss_mlp": 0.01334465, "balance_loss_clip": 0.06414816, "balance_loss_mlp": 0.01268757, "epoch": 0.05723733653990681, "flos": 23660251799040.0, "grad_norm": 5.708161262146418, "language_loss": 0.90079391, "learning_rate": 3.992223498859958e-06, "loss": 0.98366499, "num_input_tokens_seen": 20324945, "router_z_loss_clip": 5.375, "router_z_loss_mlp": 0.65673828, "step": 952, "time_per_iteration": 2.6571085453033447 }, { "auxiliary_loss_clip": 0.06987752, "auxiliary_loss_mlp": 0.01340453, "balance_loss_clip": 0.06424144, "balance_loss_mlp": 0.01266734, "epoch": 0.05729745979257478, "flos": 22062518945280.0, "grad_norm": 7.374440015167745, "language_loss": 0.82910091, "learning_rate": 3.9921891500159084e-06, "loss": 0.91238296, "num_input_tokens_seen": 20346135, "router_z_loss_clip": 5.64453125, "router_z_loss_mlp": 0.73730469, "step": 953, "time_per_iteration": 2.6719062328338623 }, { "auxiliary_loss_clip": 0.06969544, "auxiliary_loss_mlp": 0.01328636, "balance_loss_clip": 0.06425074, "balance_loss_mlp": 0.01263023, "epoch": 0.05735758304524275, "flos": 19609554994560.0, "grad_norm": 3.564774700439061, "language_loss": 0.89403319, "learning_rate": 3.992154725627848e-06, "loss": 0.97701502, "num_input_tokens_seen": 20364450, "router_z_loss_clip": 5.4453125, "router_z_loss_mlp": 0.65625, "step": 954, "time_per_iteration": 2.654792308807373 }, { "auxiliary_loss_clip": 0.06945822, "auxiliary_loss_mlp": 0.01329154, "balance_loss_clip": 0.06409524, "balance_loss_mlp": 0.01267928, "epoch": 0.057417706297910716, "flos": 19105050360960.0, "grad_norm": 5.45697953026124, "language_loss": 0.91789138, "learning_rate": 3.9921202256970804e-06, "loss": 1.00064123, "num_input_tokens_seen": 20383500, "router_z_loss_clip": 5.359375, "router_z_loss_mlp": 0.61279297, "step": 955, "time_per_iteration": 2.660323143005371 }, { "auxiliary_loss_clip": 0.06940375, "auxiliary_loss_mlp": 0.01324185, "balance_loss_clip": 0.06418206, "balance_loss_mlp": 0.01262339, "epoch": 0.05747782955057869, "flos": 16660136401920.0, "grad_norm": 5.56111547963972, "language_loss": 0.92997694, "learning_rate": 3.992085650224914e-06, "loss": 1.01262259, "num_input_tokens_seen": 20400295, "router_z_loss_clip": 5.22265625, "router_z_loss_mlp": 0.61914062, "step": 956, "time_per_iteration": 2.638274908065796 }, { "auxiliary_loss_clip": 0.06900108, "auxiliary_loss_mlp": 0.01321128, "balance_loss_clip": 0.06406718, "balance_loss_mlp": 0.01262095, "epoch": 0.05753795280324665, "flos": 14507362103040.0, "grad_norm": 4.436468044615737, "language_loss": 0.78555512, "learning_rate": 3.99205099921266e-06, "loss": 0.86776751, "num_input_tokens_seen": 20419085, "router_z_loss_clip": 4.93359375, "router_z_loss_mlp": 0.58984375, "step": 957, "time_per_iteration": 2.612305164337158 }, { "auxiliary_loss_clip": 0.06927843, "auxiliary_loss_mlp": 0.01332902, "balance_loss_clip": 0.06410155, "balance_loss_mlp": 0.01269053, "epoch": 0.057598076055914625, "flos": 18081995535360.0, "grad_norm": 5.373802138885738, "language_loss": 0.82134569, "learning_rate": 3.992016272661633e-06, "loss": 0.90395314, "num_input_tokens_seen": 20437465, "router_z_loss_clip": 5.171875, "router_z_loss_mlp": 0.63769531, "step": 958, "time_per_iteration": 2.642695426940918 }, { "auxiliary_loss_clip": 0.06912456, "auxiliary_loss_mlp": 0.01339068, "balance_loss_clip": 0.06404807, "balance_loss_mlp": 0.01273408, "epoch": 0.0576581993085826, "flos": 22130679841920.0, "grad_norm": 5.44627129318385, "language_loss": 0.90960246, "learning_rate": 3.99198147057315e-06, "loss": 0.9921177, "num_input_tokens_seen": 20456235, "router_z_loss_clip": 5.078125, "router_z_loss_mlp": 0.65625, "step": 959, "time_per_iteration": 2.6563334465026855 }, { "auxiliary_loss_clip": 0.06930724, "auxiliary_loss_mlp": 0.01330034, "balance_loss_clip": 0.06414665, "balance_loss_mlp": 0.01267187, "epoch": 0.05771832256125056, "flos": 33190003232640.0, "grad_norm": 15.719453894182188, "language_loss": 0.81831473, "learning_rate": 3.991946592948529e-06, "loss": 0.9009223, "num_input_tokens_seen": 20476825, "router_z_loss_clip": 5.1640625, "router_z_loss_mlp": 0.62841797, "step": 960, "time_per_iteration": 2.7721080780029297 }, { "auxiliary_loss_clip": 0.06937068, "auxiliary_loss_mlp": 0.0133126, "balance_loss_clip": 0.06413113, "balance_loss_mlp": 0.0126436, "epoch": 0.057778445813918534, "flos": 24176957201280.0, "grad_norm": 15.506717721243698, "language_loss": 0.95232409, "learning_rate": 3.991911639789094e-06, "loss": 1.03500736, "num_input_tokens_seen": 20496965, "router_z_loss_clip": 5.23828125, "router_z_loss_mlp": 0.66894531, "step": 961, "time_per_iteration": 2.71431827545166 }, { "auxiliary_loss_clip": 0.06937822, "auxiliary_loss_mlp": 0.01337946, "balance_loss_clip": 0.06406724, "balance_loss_mlp": 0.01266563, "epoch": 0.0578385690665865, "flos": 29650770950400.0, "grad_norm": 6.957864096167619, "language_loss": 0.71170616, "learning_rate": 3.991876611096169e-06, "loss": 0.79446387, "num_input_tokens_seen": 20518035, "router_z_loss_clip": 5.30859375, "router_z_loss_mlp": 0.71386719, "step": 962, "time_per_iteration": 2.7579345703125 }, { "auxiliary_loss_clip": 0.0693513, "auxiliary_loss_mlp": 0.01329687, "balance_loss_clip": 0.06412388, "balance_loss_mlp": 0.01267269, "epoch": 0.05789869231925447, "flos": 20891528536320.0, "grad_norm": 5.9413021670748565, "language_loss": 0.92082286, "learning_rate": 3.991841506871084e-06, "loss": 1.00347102, "num_input_tokens_seen": 20534740, "router_z_loss_clip": 5.23046875, "router_z_loss_mlp": 0.62402344, "step": 963, "time_per_iteration": 2.693134069442749 }, { "auxiliary_loss_clip": 0.06929669, "auxiliary_loss_mlp": 0.01335664, "balance_loss_clip": 0.06407388, "balance_loss_mlp": 0.01266475, "epoch": 0.057958815571922444, "flos": 26038262672640.0, "grad_norm": 5.587726189809887, "language_loss": 0.88751566, "learning_rate": 3.99180632711517e-06, "loss": 0.97016895, "num_input_tokens_seen": 20553485, "router_z_loss_clip": 5.21875, "router_z_loss_mlp": 0.69189453, "step": 964, "time_per_iteration": 4.075676441192627 }, { "auxiliary_loss_clip": 0.06934804, "auxiliary_loss_mlp": 0.01323391, "balance_loss_clip": 0.06415507, "balance_loss_mlp": 0.01257301, "epoch": 0.05801893882459041, "flos": 18083588762880.0, "grad_norm": 5.2652044361109684, "language_loss": 0.80115426, "learning_rate": 3.99177107182976e-06, "loss": 0.88373625, "num_input_tokens_seen": 20572155, "router_z_loss_clip": 5.19140625, "router_z_loss_mlp": 0.66113281, "step": 965, "time_per_iteration": 2.625941276550293 }, { "auxiliary_loss_clip": 0.06921771, "auxiliary_loss_mlp": 0.01320501, "balance_loss_clip": 0.0641055, "balance_loss_mlp": 0.01253696, "epoch": 0.05807906207725838, "flos": 17754664360320.0, "grad_norm": 11.642065461702026, "language_loss": 0.85210633, "learning_rate": 3.99173574101619e-06, "loss": 0.93452907, "num_input_tokens_seen": 20590395, "router_z_loss_clip": 5.11328125, "router_z_loss_mlp": 0.66796875, "step": 966, "time_per_iteration": 4.047506809234619 }, { "auxiliary_loss_clip": 0.06920336, "auxiliary_loss_mlp": 0.01317224, "balance_loss_clip": 0.06412185, "balance_loss_mlp": 0.01254711, "epoch": 0.058139185329926346, "flos": 18046133187840.0, "grad_norm": 2.8369137549921994, "language_loss": 0.79431653, "learning_rate": 3.9917003346758035e-06, "loss": 0.87669212, "num_input_tokens_seen": 20608435, "router_z_loss_clip": 5.08984375, "router_z_loss_mlp": 0.62548828, "step": 967, "time_per_iteration": 2.6023471355438232 }, { "auxiliary_loss_clip": 0.06809588, "auxiliary_loss_mlp": 0.01331179, "balance_loss_clip": 0.06457537, "balance_loss_mlp": 0.01289503, "epoch": 0.05819930858259432, "flos": 62381355845760.0, "grad_norm": 0.7980590419915288, "language_loss": 0.57548451, "learning_rate": 3.991664852809939e-06, "loss": 0.65689218, "num_input_tokens_seen": 20668575, "router_z_loss_clip": 3.52539062, "router_z_loss_mlp": 0.41699219, "step": 968, "time_per_iteration": 4.633905410766602 }, { "auxiliary_loss_clip": 0.06903881, "auxiliary_loss_mlp": 0.01322036, "balance_loss_clip": 0.06404178, "balance_loss_mlp": 0.0126329, "epoch": 0.05825943183526229, "flos": 19141373905920.0, "grad_norm": 3.6066058077991827, "language_loss": 0.84439236, "learning_rate": 3.991629295419945e-06, "loss": 0.92665154, "num_input_tokens_seen": 20687355, "router_z_loss_clip": 4.99609375, "router_z_loss_mlp": 0.58789062, "step": 969, "time_per_iteration": 4.161763906478882 }, { "auxiliary_loss_clip": 0.06913339, "auxiliary_loss_mlp": 0.01319042, "balance_loss_clip": 0.06398721, "balance_loss_mlp": 0.01255766, "epoch": 0.058319555087930255, "flos": 29030042304000.0, "grad_norm": 4.9997434636821, "language_loss": 0.81431055, "learning_rate": 3.991593662507167e-06, "loss": 0.89663434, "num_input_tokens_seen": 20705710, "router_z_loss_clip": 5.15625, "router_z_loss_mlp": 0.63232422, "step": 970, "time_per_iteration": 2.702735424041748 }, { "auxiliary_loss_clip": 0.06905666, "auxiliary_loss_mlp": 0.01313835, "balance_loss_clip": 0.06390124, "balance_loss_mlp": 0.01251179, "epoch": 0.05837967834059823, "flos": 18885977061120.0, "grad_norm": 6.92355730200372, "language_loss": 0.95081294, "learning_rate": 3.991557954072958e-06, "loss": 1.0330081, "num_input_tokens_seen": 20722405, "router_z_loss_clip": 5.16015625, "router_z_loss_mlp": 0.62646484, "step": 971, "time_per_iteration": 2.6801650524139404 }, { "auxiliary_loss_clip": 0.06915039, "auxiliary_loss_mlp": 0.01312143, "balance_loss_clip": 0.06397903, "balance_loss_mlp": 0.01250488, "epoch": 0.05843980159326619, "flos": 25710218737920.0, "grad_norm": 2.9898468593114753, "language_loss": 0.89060891, "learning_rate": 3.991522170118673e-06, "loss": 0.97288072, "num_input_tokens_seen": 20741480, "router_z_loss_clip": 5.16796875, "router_z_loss_mlp": 0.61669922, "step": 972, "time_per_iteration": 2.6918952465057373 }, { "auxiliary_loss_clip": 0.06891805, "auxiliary_loss_mlp": 0.01318108, "balance_loss_clip": 0.063945, "balance_loss_mlp": 0.01261365, "epoch": 0.058499924845934165, "flos": 25558425866880.0, "grad_norm": 2.955157856613525, "language_loss": 0.89338934, "learning_rate": 3.991486310645667e-06, "loss": 0.97548848, "num_input_tokens_seen": 20759685, "router_z_loss_clip": 4.97265625, "router_z_loss_mlp": 0.56835938, "step": 973, "time_per_iteration": 2.6686625480651855 }, { "auxiliary_loss_clip": 0.06892689, "auxiliary_loss_mlp": 0.01320578, "balance_loss_clip": 0.06386027, "balance_loss_mlp": 0.01260688, "epoch": 0.05856004809860214, "flos": 16441859715840.0, "grad_norm": 10.877612555358086, "language_loss": 0.77289438, "learning_rate": 3.991450375655301e-06, "loss": 0.85502696, "num_input_tokens_seen": 20778180, "router_z_loss_clip": 5.0703125, "router_z_loss_mlp": 0.59960938, "step": 974, "time_per_iteration": 2.611807346343994 }, { "auxiliary_loss_clip": 0.0689766, "auxiliary_loss_mlp": 0.01327552, "balance_loss_clip": 0.06397611, "balance_loss_mlp": 0.01271404, "epoch": 0.0586201713512701, "flos": 39468385486080.0, "grad_norm": 14.594850311591218, "language_loss": 0.78851676, "learning_rate": 3.991414365148936e-06, "loss": 0.8707689, "num_input_tokens_seen": 20802705, "router_z_loss_clip": 4.99609375, "router_z_loss_mlp": 0.56176758, "step": 975, "time_per_iteration": 2.8051445484161377 }, { "auxiliary_loss_clip": 0.06905077, "auxiliary_loss_mlp": 0.01322752, "balance_loss_clip": 0.063935, "balance_loss_mlp": 0.01259332, "epoch": 0.058680294603938074, "flos": 23371466302080.0, "grad_norm": 4.635245796884561, "language_loss": 0.78968799, "learning_rate": 3.99137827912794e-06, "loss": 0.8719663, "num_input_tokens_seen": 20822540, "router_z_loss_clip": 5.12109375, "router_z_loss_mlp": 0.63427734, "step": 976, "time_per_iteration": 2.632770538330078 }, { "auxiliary_loss_clip": 0.0689847, "auxiliary_loss_mlp": 0.01344449, "balance_loss_clip": 0.06396477, "balance_loss_mlp": 0.01286537, "epoch": 0.05874041785660604, "flos": 32239930913280.0, "grad_norm": 3.0907765810613026, "language_loss": 0.89206493, "learning_rate": 3.991342117593679e-06, "loss": 0.9744941, "num_input_tokens_seen": 20844175, "router_z_loss_clip": 5.0234375, "router_z_loss_mlp": 0.57885742, "step": 977, "time_per_iteration": 2.7186553478240967 }, { "auxiliary_loss_clip": 0.06910314, "auxiliary_loss_mlp": 0.01332746, "balance_loss_clip": 0.064007, "balance_loss_mlp": 0.01271902, "epoch": 0.05880054110927401, "flos": 22316657978880.0, "grad_norm": 5.474537451993443, "language_loss": 0.81364381, "learning_rate": 3.991305880547527e-06, "loss": 0.89607435, "num_input_tokens_seen": 20864730, "router_z_loss_clip": 5.09765625, "router_z_loss_mlp": 0.60839844, "step": 978, "time_per_iteration": 2.9032394886016846 }, { "auxiliary_loss_clip": 0.0690965, "auxiliary_loss_mlp": 0.01351056, "balance_loss_clip": 0.06400986, "balance_loss_mlp": 0.0128952, "epoch": 0.05886066436194198, "flos": 27387726278400.0, "grad_norm": 4.5331868994482605, "language_loss": 0.83286619, "learning_rate": 3.991269567990855e-06, "loss": 0.91547322, "num_input_tokens_seen": 20885200, "router_z_loss_clip": 5.08984375, "router_z_loss_mlp": 0.61499023, "step": 979, "time_per_iteration": 2.7694687843322754 }, { "auxiliary_loss_clip": 0.0674461, "auxiliary_loss_mlp": 0.01303122, "balance_loss_clip": 0.06401386, "balance_loss_mlp": 0.01270983, "epoch": 0.05892078761460995, "flos": 59601102647040.0, "grad_norm": 0.9119608431147183, "language_loss": 0.59312075, "learning_rate": 3.9912331799250415e-06, "loss": 0.67359805, "num_input_tokens_seen": 20940325, "router_z_loss_clip": 3.4375, "router_z_loss_mlp": 0.32128906, "step": 980, "time_per_iteration": 3.1904032230377197 }, { "auxiliary_loss_clip": 0.06888404, "auxiliary_loss_mlp": 0.01343427, "balance_loss_clip": 0.06397587, "balance_loss_mlp": 0.01289878, "epoch": 0.05898091086727792, "flos": 15419517649920.0, "grad_norm": 8.581731718147271, "language_loss": 0.90187299, "learning_rate": 3.9911967163514665e-06, "loss": 0.9841913, "num_input_tokens_seen": 20958220, "router_z_loss_clip": 4.91015625, "router_z_loss_mlp": 0.53540039, "step": 981, "time_per_iteration": 2.60420298576355 }, { "auxiliary_loss_clip": 0.06887044, "auxiliary_loss_mlp": 0.01323825, "balance_loss_clip": 0.06389254, "balance_loss_mlp": 0.01269728, "epoch": 0.059041034119945886, "flos": 23661383829120.0, "grad_norm": 13.845381124864431, "language_loss": 0.81972361, "learning_rate": 3.991160177271513e-06, "loss": 0.90183234, "num_input_tokens_seen": 20978920, "router_z_loss_clip": 4.9765625, "router_z_loss_mlp": 0.54077148, "step": 982, "time_per_iteration": 2.6513113975524902 }, { "auxiliary_loss_clip": 0.06908423, "auxiliary_loss_mlp": 0.01339768, "balance_loss_clip": 0.06387585, "balance_loss_mlp": 0.0127592, "epoch": 0.05910115737261386, "flos": 24761026886400.0, "grad_norm": 2.9369620098966074, "language_loss": 0.86491728, "learning_rate": 3.9911235626865654e-06, "loss": 0.94739914, "num_input_tokens_seen": 20999490, "router_z_loss_clip": 5.203125, "router_z_loss_mlp": 0.63867188, "step": 983, "time_per_iteration": 2.6679372787475586 }, { "auxiliary_loss_clip": 0.06874552, "auxiliary_loss_mlp": 0.01335025, "balance_loss_clip": 0.06377757, "balance_loss_mlp": 0.01277852, "epoch": 0.05916128062528183, "flos": 11733523741440.0, "grad_norm": 6.033543173545853, "language_loss": 0.87607598, "learning_rate": 3.9910868725980125e-06, "loss": 0.95817184, "num_input_tokens_seen": 21017865, "router_z_loss_clip": 4.97265625, "router_z_loss_mlp": 0.57226562, "step": 984, "time_per_iteration": 2.6773152351379395 }, { "auxiliary_loss_clip": 0.06885281, "auxiliary_loss_mlp": 0.0132944, "balance_loss_clip": 0.06382986, "balance_loss_mlp": 0.01272887, "epoch": 0.059221403877949795, "flos": 21908587795200.0, "grad_norm": 4.359244492460397, "language_loss": 0.79905891, "learning_rate": 3.9910501070072465e-06, "loss": 0.88120615, "num_input_tokens_seen": 21035900, "router_z_loss_clip": 5.0234375, "router_z_loss_mlp": 0.56542969, "step": 985, "time_per_iteration": 2.633265256881714 }, { "auxiliary_loss_clip": 0.06880912, "auxiliary_loss_mlp": 0.01337998, "balance_loss_clip": 0.06371763, "balance_loss_mlp": 0.01277345, "epoch": 0.05928152713061777, "flos": 20519614189440.0, "grad_norm": 6.806542668412807, "language_loss": 0.93205738, "learning_rate": 3.991013265915661e-06, "loss": 1.01424646, "num_input_tokens_seen": 21053235, "router_z_loss_clip": 5.0859375, "router_z_loss_mlp": 0.60742188, "step": 986, "time_per_iteration": 2.6528656482696533 }, { "auxiliary_loss_clip": 0.06889828, "auxiliary_loss_mlp": 0.01327307, "balance_loss_clip": 0.06374214, "balance_loss_mlp": 0.01266415, "epoch": 0.05934165038328574, "flos": 24501437337600.0, "grad_norm": 21.20670565587982, "language_loss": 0.79012221, "learning_rate": 3.9909763493246525e-06, "loss": 0.87229359, "num_input_tokens_seen": 21073090, "router_z_loss_clip": 5.15234375, "router_z_loss_mlp": 0.60888672, "step": 987, "time_per_iteration": 2.6602959632873535 }, { "auxiliary_loss_clip": 0.06909087, "auxiliary_loss_mlp": 0.01332736, "balance_loss_clip": 0.06378782, "balance_loss_mlp": 0.01269292, "epoch": 0.059401773635953704, "flos": 38737302612480.0, "grad_norm": 10.39466994946307, "language_loss": 0.7395463, "learning_rate": 3.990939357235621e-06, "loss": 0.82196462, "num_input_tokens_seen": 21094895, "router_z_loss_clip": 5.30078125, "router_z_loss_mlp": 0.63452148, "step": 988, "time_per_iteration": 2.7666804790496826 }, { "auxiliary_loss_clip": 0.06689397, "auxiliary_loss_mlp": 0.01287493, "balance_loss_clip": 0.06350094, "balance_loss_mlp": 0.01255258, "epoch": 0.059461896888621676, "flos": 58041244638720.0, "grad_norm": 0.9459992317084083, "language_loss": 0.71296698, "learning_rate": 3.99090228964997e-06, "loss": 0.79273593, "num_input_tokens_seen": 21147555, "router_z_loss_clip": 3.390625, "router_z_loss_mlp": 0.32226562, "step": 989, "time_per_iteration": 3.0741660594940186 }, { "auxiliary_loss_clip": 0.06903534, "auxiliary_loss_mlp": 0.01333459, "balance_loss_clip": 0.06375888, "balance_loss_mlp": 0.01267798, "epoch": 0.05952202014128964, "flos": 22134369421440.0, "grad_norm": 3.4108841907836536, "language_loss": 0.81156981, "learning_rate": 3.990865146569105e-06, "loss": 0.89393973, "num_input_tokens_seen": 21167845, "router_z_loss_clip": 5.28125, "router_z_loss_mlp": 0.65625, "step": 990, "time_per_iteration": 2.6413450241088867 }, { "auxiliary_loss_clip": 0.06886624, "auxiliary_loss_mlp": 0.0133518, "balance_loss_clip": 0.06378897, "balance_loss_mlp": 0.01275336, "epoch": 0.059582143393957614, "flos": 20451495219840.0, "grad_norm": 5.337262407138213, "language_loss": 0.88981426, "learning_rate": 3.990827927994434e-06, "loss": 0.97203231, "num_input_tokens_seen": 21185085, "router_z_loss_clip": 5.06640625, "router_z_loss_mlp": 0.59838867, "step": 991, "time_per_iteration": 2.6040194034576416 }, { "auxiliary_loss_clip": 0.0690359, "auxiliary_loss_mlp": 0.0132707, "balance_loss_clip": 0.06380047, "balance_loss_mlp": 0.01261886, "epoch": 0.059642266646625586, "flos": 20601149811840.0, "grad_norm": 4.559207200448543, "language_loss": 0.80205786, "learning_rate": 3.9907906339273674e-06, "loss": 0.88436437, "num_input_tokens_seen": 21204230, "router_z_loss_clip": 5.2421875, "router_z_loss_mlp": 0.65234375, "step": 992, "time_per_iteration": 2.5984697341918945 }, { "auxiliary_loss_clip": 0.06890898, "auxiliary_loss_mlp": 0.01324227, "balance_loss_clip": 0.06377406, "balance_loss_mlp": 0.01261618, "epoch": 0.05970238989929355, "flos": 19358434707840.0, "grad_norm": 8.600366626176184, "language_loss": 0.79561204, "learning_rate": 3.9907532643693215e-06, "loss": 0.87776327, "num_input_tokens_seen": 21222655, "router_z_loss_clip": 5.125, "router_z_loss_mlp": 0.62597656, "step": 993, "time_per_iteration": 2.616816282272339 }, { "auxiliary_loss_clip": 0.06879665, "auxiliary_loss_mlp": 0.01319203, "balance_loss_clip": 0.06374134, "balance_loss_mlp": 0.01260457, "epoch": 0.05976251315196152, "flos": 30272002721280.0, "grad_norm": 2.900821099881974, "language_loss": 0.82693768, "learning_rate": 3.990715819321712e-06, "loss": 0.90892631, "num_input_tokens_seen": 21242310, "router_z_loss_clip": 5.05859375, "router_z_loss_mlp": 0.58789062, "step": 994, "time_per_iteration": 2.7093870639801025 }, { "auxiliary_loss_clip": 0.06887421, "auxiliary_loss_mlp": 0.01318901, "balance_loss_clip": 0.06370816, "balance_loss_mlp": 0.01259296, "epoch": 0.05982263640462949, "flos": 23191819148160.0, "grad_norm": 4.801198312644185, "language_loss": 0.82671839, "learning_rate": 3.99067829878596e-06, "loss": 0.90878165, "num_input_tokens_seen": 21261410, "router_z_loss_clip": 5.16796875, "router_z_loss_mlp": 0.59619141, "step": 995, "time_per_iteration": 2.6292359828948975 }, { "auxiliary_loss_clip": 0.06890725, "auxiliary_loss_mlp": 0.01320861, "balance_loss_clip": 0.06374486, "balance_loss_mlp": 0.0125749, "epoch": 0.05988275965729746, "flos": 27857584448640.0, "grad_norm": 3.5723715069003665, "language_loss": 0.892946, "learning_rate": 3.990640702763487e-06, "loss": 0.97506183, "num_input_tokens_seen": 21280080, "router_z_loss_clip": 5.1640625, "router_z_loss_mlp": 0.63378906, "step": 996, "time_per_iteration": 2.7305662631988525 }, { "auxiliary_loss_clip": 0.06891064, "auxiliary_loss_mlp": 0.01320779, "balance_loss_clip": 0.0637079, "balance_loss_mlp": 0.01256454, "epoch": 0.05994288290996543, "flos": 24686744641920.0, "grad_norm": 6.81564313699873, "language_loss": 0.90695029, "learning_rate": 3.990603031255718e-06, "loss": 0.98906869, "num_input_tokens_seen": 21296765, "router_z_loss_clip": 5.20703125, "router_z_loss_mlp": 0.64355469, "step": 997, "time_per_iteration": 2.670194625854492 }, { "auxiliary_loss_clip": 0.06663455, "auxiliary_loss_mlp": 0.01280899, "balance_loss_clip": 0.06329013, "balance_loss_mlp": 0.01250429, "epoch": 0.0600030061626334, "flos": 69951187152000.0, "grad_norm": 1.0297252538850652, "language_loss": 0.75485182, "learning_rate": 3.990565284264083e-06, "loss": 0.83429539, "num_input_tokens_seen": 21363345, "router_z_loss_clip": 3.34375, "router_z_loss_mlp": 0.30419922, "step": 998, "time_per_iteration": 3.3193228244781494 }, { "auxiliary_loss_clip": 0.06866221, "auxiliary_loss_mlp": 0.0131522, "balance_loss_clip": 0.06371284, "balance_loss_mlp": 0.01259812, "epoch": 0.06006312941530137, "flos": 26547085791360.0, "grad_norm": 2.9471649339019486, "language_loss": 0.77874577, "learning_rate": 3.990527461790013e-06, "loss": 0.86056018, "num_input_tokens_seen": 21385290, "router_z_loss_clip": 4.94140625, "router_z_loss_mlp": 0.55419922, "step": 999, "time_per_iteration": 2.673881769180298 }, { "auxiliary_loss_clip": 0.06882791, "auxiliary_loss_mlp": 0.01317311, "balance_loss_clip": 0.06368452, "balance_loss_mlp": 0.01257039, "epoch": 0.060123252667969335, "flos": 27351276952320.0, "grad_norm": 2.647696542293234, "language_loss": 0.84437662, "learning_rate": 3.990489563834943e-06, "loss": 0.92637765, "num_input_tokens_seen": 21407625, "router_z_loss_clip": 5.14453125, "router_z_loss_mlp": 0.6027832, "step": 1000, "time_per_iteration": 2.6962852478027344 }, { "auxiliary_loss_clip": 0.06867585, "auxiliary_loss_mlp": 0.01321855, "balance_loss_clip": 0.06362192, "balance_loss_mlp": 0.0125989, "epoch": 0.06018337592063731, "flos": 27024113485440.0, "grad_norm": 5.113342075707075, "language_loss": 0.88219225, "learning_rate": 3.990451590400309e-06, "loss": 0.96408671, "num_input_tokens_seen": 21426835, "router_z_loss_clip": 5.0546875, "router_z_loss_mlp": 0.62036133, "step": 1001, "time_per_iteration": 2.7342276573181152 }, { "auxiliary_loss_clip": 0.06857201, "auxiliary_loss_mlp": 0.0131678, "balance_loss_clip": 0.0636451, "balance_loss_mlp": 0.01260751, "epoch": 0.06024349917330528, "flos": 25599990291840.0, "grad_norm": 6.574510548648764, "language_loss": 0.75855827, "learning_rate": 3.990413541487551e-06, "loss": 0.84029806, "num_input_tokens_seen": 21444920, "router_z_loss_clip": 4.93359375, "router_z_loss_mlp": 0.56005859, "step": 1002, "time_per_iteration": 2.706644058227539 }, { "auxiliary_loss_clip": 0.06870495, "auxiliary_loss_mlp": 0.01326654, "balance_loss_clip": 0.06365715, "balance_loss_mlp": 0.01264665, "epoch": 0.060303622425973244, "flos": 26139225242880.0, "grad_norm": 5.020169281149363, "language_loss": 0.79107821, "learning_rate": 3.990375417098112e-06, "loss": 0.87304968, "num_input_tokens_seen": 21463555, "router_z_loss_clip": 5.0546875, "router_z_loss_mlp": 0.62011719, "step": 1003, "time_per_iteration": 2.6606953144073486 }, { "auxiliary_loss_clip": 0.0686712, "auxiliary_loss_mlp": 0.01318006, "balance_loss_clip": 0.06356932, "balance_loss_mlp": 0.01257067, "epoch": 0.060363745678641216, "flos": 20383627812480.0, "grad_norm": 52.33729258762881, "language_loss": 0.72317886, "learning_rate": 3.990337217233437e-06, "loss": 0.80503011, "num_input_tokens_seen": 21481990, "router_z_loss_clip": 5.09765625, "router_z_loss_mlp": 0.60986328, "step": 1004, "time_per_iteration": 4.0808351039886475 }, { "auxiliary_loss_clip": 0.06884186, "auxiliary_loss_mlp": 0.01319799, "balance_loss_clip": 0.06372617, "balance_loss_mlp": 0.01258812, "epoch": 0.06042386893130918, "flos": 17754999776640.0, "grad_norm": 6.950633703329809, "language_loss": 0.86387587, "learning_rate": 3.990298941894976e-06, "loss": 0.9459157, "num_input_tokens_seen": 21500385, "router_z_loss_clip": 5.11328125, "router_z_loss_mlp": 0.60986328, "step": 1005, "time_per_iteration": 2.6184909343719482 }, { "auxiliary_loss_clip": 0.06674859, "auxiliary_loss_mlp": 0.01279765, "balance_loss_clip": 0.06342275, "balance_loss_mlp": 0.01251822, "epoch": 0.06048399218397715, "flos": 68559110945280.0, "grad_norm": 0.8716805347454917, "language_loss": 0.58911711, "learning_rate": 3.9902605910841794e-06, "loss": 0.66866338, "num_input_tokens_seen": 21561040, "router_z_loss_clip": 3.3203125, "router_z_loss_mlp": 0.27954102, "step": 1006, "time_per_iteration": 4.688751697540283 }, { "auxiliary_loss_clip": 0.06885125, "auxiliary_loss_mlp": 0.01317343, "balance_loss_clip": 0.06370768, "balance_loss_mlp": 0.01255974, "epoch": 0.060544115436645125, "flos": 23265262851840.0, "grad_norm": 7.618982490380967, "language_loss": 0.76937538, "learning_rate": 3.990222164802503e-06, "loss": 0.85140002, "num_input_tokens_seen": 21580655, "router_z_loss_clip": 5.1328125, "router_z_loss_mlp": 0.61328125, "step": 1007, "time_per_iteration": 2.620565414428711 }, { "auxiliary_loss_clip": 0.06877994, "auxiliary_loss_mlp": 0.01321669, "balance_loss_clip": 0.06372695, "balance_loss_mlp": 0.01261921, "epoch": 0.06060423868931309, "flos": 23885236811520.0, "grad_norm": 10.162693015713062, "language_loss": 0.82948869, "learning_rate": 3.9901836630514006e-06, "loss": 0.91148531, "num_input_tokens_seen": 21599650, "router_z_loss_clip": 5.0546875, "router_z_loss_mlp": 0.59716797, "step": 1008, "time_per_iteration": 4.153787612915039 }, { "auxiliary_loss_clip": 0.06850334, "auxiliary_loss_mlp": 0.01310348, "balance_loss_clip": 0.0636135, "balance_loss_mlp": 0.01255035, "epoch": 0.06066436194198106, "flos": 18733010232960.0, "grad_norm": 4.175121436681379, "language_loss": 0.80435932, "learning_rate": 3.990145085832335e-06, "loss": 0.88596618, "num_input_tokens_seen": 21617550, "router_z_loss_clip": 4.88671875, "router_z_loss_mlp": 0.55322266, "step": 1009, "time_per_iteration": 4.057676315307617 }, { "auxiliary_loss_clip": 0.06842222, "auxiliary_loss_mlp": 0.01311393, "balance_loss_clip": 0.06361406, "balance_loss_mlp": 0.01259656, "epoch": 0.06072448519464903, "flos": 24646689590400.0, "grad_norm": 8.430255230108985, "language_loss": 0.95037049, "learning_rate": 3.990106433146769e-06, "loss": 1.0319066, "num_input_tokens_seen": 21635865, "router_z_loss_clip": 4.80859375, "router_z_loss_mlp": 0.51757812, "step": 1010, "time_per_iteration": 2.6565613746643066 }, { "auxiliary_loss_clip": 0.06886148, "auxiliary_loss_mlp": 0.01323089, "balance_loss_clip": 0.06360517, "balance_loss_mlp": 0.0125452, "epoch": 0.060784608447317, "flos": 17383672408320.0, "grad_norm": 5.005053540862375, "language_loss": 0.75016463, "learning_rate": 3.9900677049961665e-06, "loss": 0.83225703, "num_input_tokens_seen": 21653945, "router_z_loss_clip": 5.26171875, "router_z_loss_mlp": 0.68554688, "step": 1011, "time_per_iteration": 2.627919912338257 }, { "auxiliary_loss_clip": 0.06849347, "auxiliary_loss_mlp": 0.01321108, "balance_loss_clip": 0.06361511, "balance_loss_mlp": 0.01260144, "epoch": 0.06084473169998497, "flos": 23698336279680.0, "grad_norm": 6.3469637171639794, "language_loss": 0.8885048, "learning_rate": 3.990028901381999e-06, "loss": 0.97020936, "num_input_tokens_seen": 21671230, "router_z_loss_clip": 4.87890625, "router_z_loss_mlp": 0.60961914, "step": 1012, "time_per_iteration": 2.630528450012207 }, { "auxiliary_loss_clip": 0.06852879, "auxiliary_loss_mlp": 0.01319191, "balance_loss_clip": 0.06353416, "balance_loss_mlp": 0.01259539, "epoch": 0.06090485495265294, "flos": 23552455121280.0, "grad_norm": 3.8367681335796577, "language_loss": 0.78544354, "learning_rate": 3.989990022305734e-06, "loss": 0.86716431, "num_input_tokens_seen": 21691155, "router_z_loss_clip": 4.98828125, "router_z_loss_mlp": 0.59643555, "step": 1013, "time_per_iteration": 2.647179126739502 }, { "auxiliary_loss_clip": 0.06856604, "auxiliary_loss_mlp": 0.01324648, "balance_loss_clip": 0.06354481, "balance_loss_mlp": 0.01259607, "epoch": 0.06096497820532091, "flos": 20345501404800.0, "grad_norm": 6.282388185016725, "language_loss": 0.8789115, "learning_rate": 3.98995106776885e-06, "loss": 0.96072406, "num_input_tokens_seen": 21707405, "router_z_loss_clip": 5.01953125, "router_z_loss_mlp": 0.65087891, "step": 1014, "time_per_iteration": 2.627119541168213 }, { "auxiliary_loss_clip": 0.06870042, "auxiliary_loss_mlp": 0.01324334, "balance_loss_clip": 0.06346966, "balance_loss_mlp": 0.01257815, "epoch": 0.061025101457988874, "flos": 26945638536960.0, "grad_norm": 6.866918037825997, "language_loss": 0.76548421, "learning_rate": 3.98991203777282e-06, "loss": 0.84742796, "num_input_tokens_seen": 21728090, "router_z_loss_clip": 5.23046875, "router_z_loss_mlp": 0.66552734, "step": 1015, "time_per_iteration": 2.685600996017456 }, { "auxiliary_loss_clip": 0.06839433, "auxiliary_loss_mlp": 0.01319665, "balance_loss_clip": 0.0636069, "balance_loss_mlp": 0.01264566, "epoch": 0.061085224710656846, "flos": 25382216730240.0, "grad_norm": 2.6202088997376807, "language_loss": 0.81206697, "learning_rate": 3.9898729323191275e-06, "loss": 0.89365792, "num_input_tokens_seen": 21747950, "router_z_loss_clip": 4.79296875, "router_z_loss_mlp": 0.55053711, "step": 1016, "time_per_iteration": 2.6733744144439697 }, { "auxiliary_loss_clip": 0.06849083, "auxiliary_loss_mlp": 0.01321655, "balance_loss_clip": 0.06358337, "balance_loss_mlp": 0.01262718, "epoch": 0.06114534796332482, "flos": 24831326062080.0, "grad_norm": 4.0372444487790045, "language_loss": 0.7777856, "learning_rate": 3.989833751409254e-06, "loss": 0.85949302, "num_input_tokens_seen": 21767900, "router_z_loss_clip": 4.90625, "router_z_loss_mlp": 0.58959961, "step": 1017, "time_per_iteration": 2.650970935821533 }, { "auxiliary_loss_clip": 0.06867677, "auxiliary_loss_mlp": 0.01318357, "balance_loss_clip": 0.06350835, "balance_loss_mlp": 0.01254747, "epoch": 0.061205471215992784, "flos": 20637724919040.0, "grad_norm": 4.274258725704856, "language_loss": 0.88345933, "learning_rate": 3.989794495044685e-06, "loss": 0.96531969, "num_input_tokens_seen": 21787375, "router_z_loss_clip": 5.17578125, "router_z_loss_mlp": 0.63623047, "step": 1018, "time_per_iteration": 2.68355655670166 }, { "auxiliary_loss_clip": 0.06852833, "auxiliary_loss_mlp": 0.01321412, "balance_loss_clip": 0.06361751, "balance_loss_mlp": 0.01259518, "epoch": 0.061265594468660756, "flos": 16513919827200.0, "grad_norm": 9.73299173720135, "language_loss": 0.80906922, "learning_rate": 3.989755163226909e-06, "loss": 0.89081168, "num_input_tokens_seen": 21806275, "router_z_loss_clip": 4.90625, "router_z_loss_mlp": 0.61914062, "step": 1019, "time_per_iteration": 2.5820364952087402 }, { "auxiliary_loss_clip": 0.06861682, "auxiliary_loss_mlp": 0.01316015, "balance_loss_clip": 0.06366628, "balance_loss_mlp": 0.01257221, "epoch": 0.06132571772132872, "flos": 26252765925120.0, "grad_norm": 14.025506196625736, "language_loss": 0.86520386, "learning_rate": 3.989715755957418e-06, "loss": 0.94698083, "num_input_tokens_seen": 21826430, "router_z_loss_clip": 4.953125, "router_z_loss_mlp": 0.58764648, "step": 1020, "time_per_iteration": 2.6812808513641357 }, { "auxiliary_loss_clip": 0.06865478, "auxiliary_loss_mlp": 0.01316132, "balance_loss_clip": 0.06372684, "balance_loss_mlp": 0.01257004, "epoch": 0.06138584097399669, "flos": 37423869062400.0, "grad_norm": 4.793166958812779, "language_loss": 0.8054809, "learning_rate": 3.989676273237705e-06, "loss": 0.88729703, "num_input_tokens_seen": 21847800, "router_z_loss_clip": 4.9296875, "router_z_loss_mlp": 0.59130859, "step": 1021, "time_per_iteration": 2.759984016418457 }, { "auxiliary_loss_clip": 0.06866466, "auxiliary_loss_mlp": 0.01318179, "balance_loss_clip": 0.06370437, "balance_loss_mlp": 0.01261054, "epoch": 0.061445964226664665, "flos": 17426410790400.0, "grad_norm": 11.091010148407296, "language_loss": 0.90623611, "learning_rate": 3.9896367150692705e-06, "loss": 0.98808253, "num_input_tokens_seen": 21863385, "router_z_loss_clip": 4.95703125, "router_z_loss_mlp": 0.5715332, "step": 1022, "time_per_iteration": 2.62958025932312 }, { "auxiliary_loss_clip": 0.06872213, "auxiliary_loss_mlp": 0.0132584, "balance_loss_clip": 0.06382899, "balance_loss_mlp": 0.01266951, "epoch": 0.06150608747933263, "flos": 22606365870720.0, "grad_norm": 3.5849589336157317, "language_loss": 0.8462556, "learning_rate": 3.989597081453611e-06, "loss": 0.92823613, "num_input_tokens_seen": 21881880, "router_z_loss_clip": 4.89453125, "router_z_loss_mlp": 0.58886719, "step": 1023, "time_per_iteration": 2.677443265914917 }, { "auxiliary_loss_clip": 0.06678022, "auxiliary_loss_mlp": 0.01275222, "balance_loss_clip": 0.06354097, "balance_loss_mlp": 0.01248495, "epoch": 0.0615662107320006, "flos": 56758097139840.0, "grad_norm": 0.8811283573075386, "language_loss": 0.64872718, "learning_rate": 3.989557372392231e-06, "loss": 0.72825968, "num_input_tokens_seen": 21940550, "router_z_loss_clip": 3.23828125, "router_z_loss_mlp": 0.26782227, "step": 1024, "time_per_iteration": 3.4110305309295654 }, { "auxiliary_loss_clip": 0.0690044, "auxiliary_loss_mlp": 0.01334672, "balance_loss_clip": 0.06389843, "balance_loss_mlp": 0.01266055, "epoch": 0.06162633398466857, "flos": 22571342064000.0, "grad_norm": 6.922523653367844, "language_loss": 0.91110694, "learning_rate": 3.989517587886636e-06, "loss": 0.99345803, "num_input_tokens_seen": 21958390, "router_z_loss_clip": 5.10546875, "router_z_loss_mlp": 0.68554688, "step": 1025, "time_per_iteration": 2.679743766784668 }, { "auxiliary_loss_clip": 0.0688673, "auxiliary_loss_mlp": 0.01328607, "balance_loss_clip": 0.06385951, "balance_loss_mlp": 0.01266403, "epoch": 0.06168645723733654, "flos": 25600158000000.0, "grad_norm": 4.958218017303581, "language_loss": 0.86516023, "learning_rate": 3.989477727938335e-06, "loss": 0.94731361, "num_input_tokens_seen": 21978625, "router_z_loss_clip": 5.0078125, "router_z_loss_mlp": 0.62182617, "step": 1026, "time_per_iteration": 2.713397264480591 }, { "auxiliary_loss_clip": 0.06892787, "auxiliary_loss_mlp": 0.01331471, "balance_loss_clip": 0.06382988, "balance_loss_mlp": 0.01262139, "epoch": 0.06174658049000451, "flos": 16003461553920.0, "grad_norm": 4.001085447830148, "language_loss": 0.83895862, "learning_rate": 3.989437792548839e-06, "loss": 0.92120117, "num_input_tokens_seen": 21996035, "router_z_loss_clip": 5.09375, "router_z_loss_mlp": 0.69287109, "step": 1027, "time_per_iteration": 2.6023991107940674 }, { "auxiliary_loss_clip": 0.06875727, "auxiliary_loss_mlp": 0.01329022, "balance_loss_clip": 0.06370571, "balance_loss_mlp": 0.01264029, "epoch": 0.06180670374267248, "flos": 11289842772480.0, "grad_norm": 8.379930298353582, "language_loss": 0.86376554, "learning_rate": 3.989397781719663e-06, "loss": 0.94581306, "num_input_tokens_seen": 22011625, "router_z_loss_clip": 5.0546875, "router_z_loss_mlp": 0.65039062, "step": 1028, "time_per_iteration": 2.5792932510375977 }, { "auxiliary_loss_clip": 0.06642863, "auxiliary_loss_mlp": 0.01274308, "balance_loss_clip": 0.06322081, "balance_loss_mlp": 0.01248392, "epoch": 0.06186682699534045, "flos": 65147647340160.0, "grad_norm": 5.612689283654183, "language_loss": 0.6053586, "learning_rate": 3.989357695452323e-06, "loss": 0.68453032, "num_input_tokens_seen": 22066035, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.25952148, "step": 1029, "time_per_iteration": 3.155982732772827 }, { "auxiliary_loss_clip": 0.06876817, "auxiliary_loss_mlp": 0.01320925, "balance_loss_clip": 0.06363089, "balance_loss_mlp": 0.01255407, "epoch": 0.061926950248008414, "flos": 21112111209600.0, "grad_norm": 5.422560120585665, "language_loss": 0.84847832, "learning_rate": 3.98931753374834e-06, "loss": 0.9304558, "num_input_tokens_seen": 22085015, "router_z_loss_clip": 5.140625, "router_z_loss_mlp": 0.65527344, "step": 1030, "time_per_iteration": 2.6272146701812744 }, { "auxiliary_loss_clip": 0.06853147, "auxiliary_loss_mlp": 0.01323348, "balance_loss_clip": 0.06358467, "balance_loss_mlp": 0.01263171, "epoch": 0.061987073500676386, "flos": 17754161235840.0, "grad_norm": 5.653793209467307, "language_loss": 0.83075261, "learning_rate": 3.989277296609237e-06, "loss": 0.91251755, "num_input_tokens_seen": 22102775, "router_z_loss_clip": 4.94921875, "router_z_loss_mlp": 0.60131836, "step": 1031, "time_per_iteration": 2.597930431365967 }, { "auxiliary_loss_clip": 0.06855875, "auxiliary_loss_mlp": 0.01319626, "balance_loss_clip": 0.06359681, "balance_loss_mlp": 0.01263526, "epoch": 0.06204719675334436, "flos": 21842858666880.0, "grad_norm": 6.928224508705342, "language_loss": 0.79891574, "learning_rate": 3.98923698403654e-06, "loss": 0.88067073, "num_input_tokens_seen": 22121680, "router_z_loss_clip": 4.96484375, "router_z_loss_mlp": 0.56079102, "step": 1032, "time_per_iteration": 2.620595932006836 }, { "auxiliary_loss_clip": 0.06853209, "auxiliary_loss_mlp": 0.0133271, "balance_loss_clip": 0.06358507, "balance_loss_mlp": 0.01271317, "epoch": 0.06210732000601232, "flos": 19359650592000.0, "grad_norm": 7.0469522188329465, "language_loss": 0.92051417, "learning_rate": 3.989196596031776e-06, "loss": 1.00237334, "num_input_tokens_seen": 22138155, "router_z_loss_clip": 4.9453125, "router_z_loss_mlp": 0.61303711, "step": 1033, "time_per_iteration": 2.584752321243286 }, { "auxiliary_loss_clip": 0.06854545, "auxiliary_loss_mlp": 0.01327415, "balance_loss_clip": 0.06362561, "balance_loss_mlp": 0.01269575, "epoch": 0.062167443258680295, "flos": 24755534444160.0, "grad_norm": 35.55259243773466, "language_loss": 0.86645699, "learning_rate": 3.989156132596479e-06, "loss": 0.94827658, "num_input_tokens_seen": 22157420, "router_z_loss_clip": 4.9296875, "router_z_loss_mlp": 0.57958984, "step": 1034, "time_per_iteration": 2.634554624557495 }, { "auxiliary_loss_clip": 0.06830575, "auxiliary_loss_mlp": 0.01325947, "balance_loss_clip": 0.06356403, "balance_loss_mlp": 0.01272184, "epoch": 0.06222756651134827, "flos": 34466903602560.0, "grad_norm": 4.690570408921569, "language_loss": 0.83220553, "learning_rate": 3.989115593732182e-06, "loss": 0.91377074, "num_input_tokens_seen": 22178620, "router_z_loss_clip": 4.734375, "router_z_loss_mlp": 0.53833008, "step": 1035, "time_per_iteration": 2.7182929515838623 }, { "auxiliary_loss_clip": 0.06838702, "auxiliary_loss_mlp": 0.01319094, "balance_loss_clip": 0.06351291, "balance_loss_mlp": 0.0126185, "epoch": 0.06228768976401623, "flos": 25673601703680.0, "grad_norm": 8.424988370037143, "language_loss": 0.80251509, "learning_rate": 3.989074979440421e-06, "loss": 0.88409305, "num_input_tokens_seen": 22197125, "router_z_loss_clip": 4.875, "router_z_loss_mlp": 0.57299805, "step": 1036, "time_per_iteration": 2.694938898086548 }, { "auxiliary_loss_clip": 0.06840621, "auxiliary_loss_mlp": 0.0132168, "balance_loss_clip": 0.06351113, "balance_loss_mlp": 0.01263268, "epoch": 0.062347813016684205, "flos": 25301687356800.0, "grad_norm": 5.747367460275316, "language_loss": 0.88192236, "learning_rate": 3.989034289722739e-06, "loss": 0.96354532, "num_input_tokens_seen": 22217575, "router_z_loss_clip": 4.890625, "router_z_loss_mlp": 0.58398438, "step": 1037, "time_per_iteration": 2.655034303665161 }, { "auxiliary_loss_clip": 0.06831633, "auxiliary_loss_mlp": 0.013261, "balance_loss_clip": 0.06348096, "balance_loss_mlp": 0.01269881, "epoch": 0.06240793626935217, "flos": 26914388163840.0, "grad_norm": 5.963612374518066, "language_loss": 0.82725751, "learning_rate": 3.988993524580676e-06, "loss": 0.90883476, "num_input_tokens_seen": 22236840, "router_z_loss_clip": 4.8359375, "router_z_loss_mlp": 0.56152344, "step": 1038, "time_per_iteration": 2.6616322994232178 }, { "auxiliary_loss_clip": 0.06830563, "auxiliary_loss_mlp": 0.01323214, "balance_loss_clip": 0.06352188, "balance_loss_mlp": 0.01267234, "epoch": 0.06246805952202014, "flos": 21622108285440.0, "grad_norm": 3.6679091568585602, "language_loss": 0.87513053, "learning_rate": 3.98895268401578e-06, "loss": 0.95666838, "num_input_tokens_seen": 22256465, "router_z_loss_clip": 4.78515625, "router_z_loss_mlp": 0.55957031, "step": 1039, "time_per_iteration": 2.642929792404175 }, { "auxiliary_loss_clip": 0.06838046, "auxiliary_loss_mlp": 0.01338702, "balance_loss_clip": 0.06352444, "balance_loss_mlp": 0.0128141, "epoch": 0.0625281827746881, "flos": 19316954136960.0, "grad_norm": 4.617489413203002, "language_loss": 0.83654046, "learning_rate": 3.9889117680296e-06, "loss": 0.91830802, "num_input_tokens_seen": 22274025, "router_z_loss_clip": 4.86328125, "router_z_loss_mlp": 0.57299805, "step": 1040, "time_per_iteration": 2.632106065750122 }, { "auxiliary_loss_clip": 0.06844001, "auxiliary_loss_mlp": 0.0134222, "balance_loss_clip": 0.06346402, "balance_loss_mlp": 0.01277799, "epoch": 0.06258830602735609, "flos": 27753183861120.0, "grad_norm": 6.541722401123852, "language_loss": 0.71650124, "learning_rate": 3.988870776623685e-06, "loss": 0.79836339, "num_input_tokens_seen": 22292245, "router_z_loss_clip": 4.9765625, "router_z_loss_mlp": 0.64453125, "step": 1041, "time_per_iteration": 2.65826153755188 }, { "auxiliary_loss_clip": 0.06846659, "auxiliary_loss_mlp": 0.01325125, "balance_loss_clip": 0.06352075, "balance_loss_mlp": 0.01265616, "epoch": 0.06264842928002405, "flos": 23229442431360.0, "grad_norm": 6.167061341933019, "language_loss": 0.83664328, "learning_rate": 3.9888297097995905e-06, "loss": 0.91836113, "num_input_tokens_seen": 22311455, "router_z_loss_clip": 4.94921875, "router_z_loss_mlp": 0.5949707, "step": 1042, "time_per_iteration": 2.6226415634155273 }, { "auxiliary_loss_clip": 0.06855999, "auxiliary_loss_mlp": 0.01340525, "balance_loss_clip": 0.06370223, "balance_loss_mlp": 0.0128607, "epoch": 0.06270855253269202, "flos": 38408671699200.0, "grad_norm": 5.264561877553824, "language_loss": 0.78436804, "learning_rate": 3.988788567558874e-06, "loss": 0.86633337, "num_input_tokens_seen": 22333750, "router_z_loss_clip": 4.8671875, "router_z_loss_mlp": 0.54418945, "step": 1043, "time_per_iteration": 2.7688052654266357 }, { "auxiliary_loss_clip": 0.06843244, "auxiliary_loss_mlp": 0.01351561, "balance_loss_clip": 0.06372165, "balance_loss_mlp": 0.01292719, "epoch": 0.06276867578535998, "flos": 22459771952640.0, "grad_norm": 5.559015267028873, "language_loss": 0.95225823, "learning_rate": 3.988747349903097e-06, "loss": 1.03420639, "num_input_tokens_seen": 22351940, "router_z_loss_clip": 4.70703125, "router_z_loss_mlp": 0.58837891, "step": 1044, "time_per_iteration": 4.083786249160767 }, { "auxiliary_loss_clip": 0.06844236, "auxiliary_loss_mlp": 0.01349026, "balance_loss_clip": 0.06369142, "balance_loss_mlp": 0.01291996, "epoch": 0.06282879903802796, "flos": 22937176990080.0, "grad_norm": 4.50212962545353, "language_loss": 0.87046742, "learning_rate": 3.988706056833821e-06, "loss": 0.95240009, "num_input_tokens_seen": 22372085, "router_z_loss_clip": 4.75, "router_z_loss_mlp": 0.5703125, "step": 1045, "time_per_iteration": 2.676539182662964 }, { "auxiliary_loss_clip": 0.06848145, "auxiliary_loss_mlp": 0.01363719, "balance_loss_clip": 0.0637561, "balance_loss_mlp": 0.01301444, "epoch": 0.06288892229069593, "flos": 34827036451200.0, "grad_norm": 2.5158371554957975, "language_loss": 0.80367607, "learning_rate": 3.9886646883526125e-06, "loss": 0.8857947, "num_input_tokens_seen": 22392020, "router_z_loss_clip": 4.7265625, "router_z_loss_mlp": 0.62255859, "step": 1046, "time_per_iteration": 4.134838581085205 }, { "auxiliary_loss_clip": 0.06876896, "auxiliary_loss_mlp": 0.01343815, "balance_loss_clip": 0.06392741, "balance_loss_mlp": 0.01287071, "epoch": 0.06294904554336389, "flos": 19433178149760.0, "grad_norm": 5.3952887026944145, "language_loss": 0.80237287, "learning_rate": 3.988623244461039e-06, "loss": 0.88458002, "num_input_tokens_seen": 22411180, "router_z_loss_clip": 4.83984375, "router_z_loss_mlp": 0.56811523, "step": 1047, "time_per_iteration": 2.7239632606506348 }, { "auxiliary_loss_clip": 0.06899157, "auxiliary_loss_mlp": 0.01334758, "balance_loss_clip": 0.06397451, "balance_loss_mlp": 0.01272817, "epoch": 0.06300916879603187, "flos": 40671464808960.0, "grad_norm": 10.012257567123674, "language_loss": 0.79500884, "learning_rate": 3.988581725160672e-06, "loss": 0.87734801, "num_input_tokens_seen": 22435105, "router_z_loss_clip": 5.00390625, "router_z_loss_mlp": 0.61865234, "step": 1048, "time_per_iteration": 5.467214345932007 }, { "auxiliary_loss_clip": 0.06892899, "auxiliary_loss_mlp": 0.01336403, "balance_loss_clip": 0.06402062, "balance_loss_mlp": 0.01272411, "epoch": 0.06306929204869983, "flos": 23810703004800.0, "grad_norm": 4.433899385468539, "language_loss": 0.80013376, "learning_rate": 3.988540130453087e-06, "loss": 0.88242674, "num_input_tokens_seen": 22452710, "router_z_loss_clip": 4.90625, "router_z_loss_mlp": 0.64013672, "step": 1049, "time_per_iteration": 2.648219347000122 }, { "auxiliary_loss_clip": 0.06873557, "auxiliary_loss_mlp": 0.01350254, "balance_loss_clip": 0.06390609, "balance_loss_mlp": 0.01288933, "epoch": 0.0631294153013678, "flos": 18921671700480.0, "grad_norm": 2.932422948076506, "language_loss": 0.84802842, "learning_rate": 3.988498460339862e-06, "loss": 0.9302665, "num_input_tokens_seen": 22470175, "router_z_loss_clip": 4.83203125, "router_z_loss_mlp": 0.61376953, "step": 1050, "time_per_iteration": 2.5898306369781494 }, { "auxiliary_loss_clip": 0.06856458, "auxiliary_loss_mlp": 0.0134494, "balance_loss_clip": 0.0639476, "balance_loss_mlp": 0.01283333, "epoch": 0.06318953855403578, "flos": 24287101793280.0, "grad_norm": 6.567707395205666, "language_loss": 0.8031314, "learning_rate": 3.988456714822575e-06, "loss": 0.88514543, "num_input_tokens_seen": 22490020, "router_z_loss_clip": 4.6171875, "router_z_loss_mlp": 0.61572266, "step": 1051, "time_per_iteration": 2.6332619190216064 }, { "auxiliary_loss_clip": 0.06906091, "auxiliary_loss_mlp": 0.01334576, "balance_loss_clip": 0.06422383, "balance_loss_mlp": 0.01267724, "epoch": 0.06324966180670374, "flos": 22535563570560.0, "grad_norm": 4.322275567324227, "language_loss": 0.83723575, "learning_rate": 3.98841489390281e-06, "loss": 0.91964245, "num_input_tokens_seen": 22509685, "router_z_loss_clip": 4.83984375, "router_z_loss_mlp": 0.66845703, "step": 1052, "time_per_iteration": 2.628657817840576 }, { "auxiliary_loss_clip": 0.06923645, "auxiliary_loss_mlp": 0.01345581, "balance_loss_clip": 0.06420612, "balance_loss_mlp": 0.0127315, "epoch": 0.06330978505937171, "flos": 15783465859200.0, "grad_norm": 5.538589870966703, "language_loss": 0.80150855, "learning_rate": 3.988372997582155e-06, "loss": 0.88420081, "num_input_tokens_seen": 22527905, "router_z_loss_clip": 5.03515625, "router_z_loss_mlp": 0.72460938, "step": 1053, "time_per_iteration": 2.5896615982055664 }, { "auxiliary_loss_clip": 0.06878138, "auxiliary_loss_mlp": 0.01370786, "balance_loss_clip": 0.06404915, "balance_loss_mlp": 0.01297734, "epoch": 0.06336990831203967, "flos": 21477610719360.0, "grad_norm": 4.430391541130351, "language_loss": 0.87106156, "learning_rate": 3.988331025862195e-06, "loss": 0.95355076, "num_input_tokens_seen": 22546335, "router_z_loss_clip": 4.7265625, "router_z_loss_mlp": 0.73144531, "step": 1054, "time_per_iteration": 2.608200788497925 }, { "auxiliary_loss_clip": 0.06908351, "auxiliary_loss_mlp": 0.01350817, "balance_loss_clip": 0.06427412, "balance_loss_mlp": 0.01281103, "epoch": 0.06343003156470765, "flos": 18484824839040.0, "grad_norm": 4.05678556018871, "language_loss": 0.88354146, "learning_rate": 3.9882889787445225e-06, "loss": 0.96613324, "num_input_tokens_seen": 22563885, "router_z_loss_clip": 4.80859375, "router_z_loss_mlp": 0.69677734, "step": 1055, "time_per_iteration": 2.588425636291504 }, { "auxiliary_loss_clip": 0.06923386, "auxiliary_loss_mlp": 0.01361975, "balance_loss_clip": 0.06410407, "balance_loss_mlp": 0.01280531, "epoch": 0.06349015481737562, "flos": 25161801765120.0, "grad_norm": 6.527599061635347, "language_loss": 0.84326565, "learning_rate": 3.988246856230734e-06, "loss": 0.92611927, "num_input_tokens_seen": 22583035, "router_z_loss_clip": 5.1328125, "router_z_loss_mlp": 0.81445312, "step": 1056, "time_per_iteration": 2.6324806213378906 }, { "auxiliary_loss_clip": 0.06934687, "auxiliary_loss_mlp": 0.01368432, "balance_loss_clip": 0.0642418, "balance_loss_mlp": 0.0128756, "epoch": 0.06355027807004358, "flos": 26879322430080.0, "grad_norm": 5.092600694152034, "language_loss": 0.84852326, "learning_rate": 3.988204658322426e-06, "loss": 0.93155444, "num_input_tokens_seen": 22605055, "router_z_loss_clip": 5.10546875, "router_z_loss_mlp": 0.80908203, "step": 1057, "time_per_iteration": 2.6797921657562256 }, { "auxiliary_loss_clip": 0.06893437, "auxiliary_loss_mlp": 0.01385827, "balance_loss_clip": 0.06425765, "balance_loss_mlp": 0.01313491, "epoch": 0.06361040132271156, "flos": 21402951131520.0, "grad_norm": 2.8779765736399225, "language_loss": 0.85811734, "learning_rate": 3.988162385021196e-06, "loss": 0.94090998, "num_input_tokens_seen": 22623760, "router_z_loss_clip": 4.67578125, "router_z_loss_mlp": 0.72265625, "step": 1058, "time_per_iteration": 2.6181015968322754 }, { "auxiliary_loss_clip": 0.06921411, "auxiliary_loss_mlp": 0.01374239, "balance_loss_clip": 0.06427316, "balance_loss_mlp": 0.01292939, "epoch": 0.06367052457537953, "flos": 25739959737600.0, "grad_norm": 4.3431375687549245, "language_loss": 0.89418149, "learning_rate": 3.988120036328651e-06, "loss": 0.97713804, "num_input_tokens_seen": 22643000, "router_z_loss_clip": 4.9453125, "router_z_loss_mlp": 0.81298828, "step": 1059, "time_per_iteration": 2.672983407974243 }, { "auxiliary_loss_clip": 0.06931035, "auxiliary_loss_mlp": 0.01354239, "balance_loss_clip": 0.06434439, "balance_loss_mlp": 0.01276133, "epoch": 0.0637306478280475, "flos": 17635840871040.0, "grad_norm": 6.4536690916390835, "language_loss": 0.94330621, "learning_rate": 3.988077612246394e-06, "loss": 1.02615893, "num_input_tokens_seen": 22660460, "router_z_loss_clip": 4.96875, "router_z_loss_mlp": 0.78027344, "step": 1060, "time_per_iteration": 2.621184825897217 }, { "auxiliary_loss_clip": 0.06933829, "auxiliary_loss_mlp": 0.0135732, "balance_loss_clip": 0.06441894, "balance_loss_mlp": 0.0128279, "epoch": 0.06379077108071547, "flos": 13667727864960.0, "grad_norm": 3.033038013340172, "language_loss": 0.90222359, "learning_rate": 3.988035112776035e-06, "loss": 0.98513508, "num_input_tokens_seen": 22679270, "router_z_loss_clip": 4.91796875, "router_z_loss_mlp": 0.74560547, "step": 1061, "time_per_iteration": 2.663595676422119 }, { "auxiliary_loss_clip": 0.06947447, "auxiliary_loss_mlp": 0.01341808, "balance_loss_clip": 0.06442681, "balance_loss_mlp": 0.01266897, "epoch": 0.06385089433338344, "flos": 28486950065280.0, "grad_norm": 3.158690622301979, "language_loss": 0.79541624, "learning_rate": 3.987992537919185e-06, "loss": 0.87830889, "num_input_tokens_seen": 22699330, "router_z_loss_clip": 5.05078125, "router_z_loss_mlp": 0.74951172, "step": 1062, "time_per_iteration": 2.710247755050659 }, { "auxiliary_loss_clip": 0.06946395, "auxiliary_loss_mlp": 0.01344742, "balance_loss_clip": 0.06447402, "balance_loss_mlp": 0.0127603, "epoch": 0.0639110175860514, "flos": 24317052428160.0, "grad_norm": 4.866128309069476, "language_loss": 0.89005435, "learning_rate": 3.987949887677459e-06, "loss": 0.97296566, "num_input_tokens_seen": 22717945, "router_z_loss_clip": 4.984375, "router_z_loss_mlp": 0.6875, "step": 1063, "time_per_iteration": 2.6471641063690186 }, { "auxiliary_loss_clip": 0.0694098, "auxiliary_loss_mlp": 0.01341496, "balance_loss_clip": 0.06438962, "balance_loss_mlp": 0.01267968, "epoch": 0.06397114083871938, "flos": 22097291189760.0, "grad_norm": 2.996188316543514, "language_loss": 0.82736695, "learning_rate": 3.9879071620524744e-06, "loss": 0.91019171, "num_input_tokens_seen": 22736790, "router_z_loss_clip": 5.01953125, "router_z_loss_mlp": 0.73486328, "step": 1064, "time_per_iteration": 2.6168856620788574 }, { "auxiliary_loss_clip": 0.06917861, "auxiliary_loss_mlp": 0.01342764, "balance_loss_clip": 0.06432741, "balance_loss_mlp": 0.01270905, "epoch": 0.06403126409138735, "flos": 19578849672960.0, "grad_norm": 3.447845066533728, "language_loss": 0.86825591, "learning_rate": 3.987864361045851e-06, "loss": 0.95086217, "num_input_tokens_seen": 22754745, "router_z_loss_clip": 4.84765625, "router_z_loss_mlp": 0.71875, "step": 1065, "time_per_iteration": 2.637864828109741 }, { "auxiliary_loss_clip": 0.06943334, "auxiliary_loss_mlp": 0.01336069, "balance_loss_clip": 0.06434952, "balance_loss_mlp": 0.01262827, "epoch": 0.06409138734405531, "flos": 40816968624000.0, "grad_norm": 2.2452861545128173, "language_loss": 0.70360494, "learning_rate": 3.987821484659211e-06, "loss": 0.78639901, "num_input_tokens_seen": 22776780, "router_z_loss_clip": 5.08203125, "router_z_loss_mlp": 0.73193359, "step": 1066, "time_per_iteration": 2.791109085083008 }, { "auxiliary_loss_clip": 0.06938846, "auxiliary_loss_mlp": 0.01327179, "balance_loss_clip": 0.06439488, "balance_loss_mlp": 0.01258419, "epoch": 0.06415151059672328, "flos": 20446631683200.0, "grad_norm": 2.6274512817835123, "language_loss": 0.92377681, "learning_rate": 3.987778532894181e-06, "loss": 1.00643706, "num_input_tokens_seen": 22793915, "router_z_loss_clip": 4.99609375, "router_z_loss_mlp": 0.68798828, "step": 1067, "time_per_iteration": 2.635678291320801 }, { "auxiliary_loss_clip": 0.06954654, "auxiliary_loss_mlp": 0.01319324, "balance_loss_clip": 0.06453405, "balance_loss_mlp": 0.01255523, "epoch": 0.06421163384939126, "flos": 18077006217600.0, "grad_norm": 2.3444810849210413, "language_loss": 0.8671453, "learning_rate": 3.987735505752391e-06, "loss": 0.94988513, "num_input_tokens_seen": 22812670, "router_z_loss_clip": 5.01953125, "router_z_loss_mlp": 0.63769531, "step": 1068, "time_per_iteration": 2.6518359184265137 }, { "auxiliary_loss_clip": 0.06947501, "auxiliary_loss_mlp": 0.01330284, "balance_loss_clip": 0.06457137, "balance_loss_mlp": 0.01268963, "epoch": 0.06427175710205922, "flos": 25126526396160.0, "grad_norm": 3.380835712643114, "language_loss": 0.92220068, "learning_rate": 3.987692403235471e-06, "loss": 1.00497842, "num_input_tokens_seen": 22832440, "router_z_loss_clip": 4.90625, "router_z_loss_mlp": 0.61328125, "step": 1069, "time_per_iteration": 2.6594398021698 }, { "auxiliary_loss_clip": 0.06985693, "auxiliary_loss_mlp": 0.01326912, "balance_loss_clip": 0.06462373, "balance_loss_mlp": 0.01259249, "epoch": 0.06433188035472719, "flos": 17385684906240.0, "grad_norm": 5.654601638389748, "language_loss": 0.98059136, "learning_rate": 3.987649225345056e-06, "loss": 1.06371737, "num_input_tokens_seen": 22845495, "router_z_loss_clip": 5.23828125, "router_z_loss_mlp": 0.67724609, "step": 1070, "time_per_iteration": 2.64224910736084 }, { "auxiliary_loss_clip": 0.06949505, "auxiliary_loss_mlp": 0.01324121, "balance_loss_clip": 0.06443483, "balance_loss_mlp": 0.01263277, "epoch": 0.06439200360739517, "flos": 23552371267200.0, "grad_norm": 6.5174217913242165, "language_loss": 0.89278609, "learning_rate": 3.987605972082782e-06, "loss": 0.97552234, "num_input_tokens_seen": 22865390, "router_z_loss_clip": 5.05859375, "router_z_loss_mlp": 0.60864258, "step": 1071, "time_per_iteration": 2.807978630065918 }, { "auxiliary_loss_clip": 0.0693385, "auxiliary_loss_mlp": 0.01330999, "balance_loss_clip": 0.06443986, "balance_loss_mlp": 0.01272038, "epoch": 0.06445212686006313, "flos": 21986014567680.0, "grad_norm": 2.165742045776895, "language_loss": 0.78714955, "learning_rate": 3.987562643450292e-06, "loss": 0.86979806, "num_input_tokens_seen": 22885495, "router_z_loss_clip": 4.90625, "router_z_loss_mlp": 0.58959961, "step": 1072, "time_per_iteration": 2.6764063835144043 }, { "auxiliary_loss_clip": 0.06952445, "auxiliary_loss_mlp": 0.01328542, "balance_loss_clip": 0.06444757, "balance_loss_mlp": 0.01265505, "epoch": 0.0645122501127311, "flos": 25928369642880.0, "grad_norm": 2.863344885527646, "language_loss": 0.84136605, "learning_rate": 3.987519239449226e-06, "loss": 0.92417586, "num_input_tokens_seen": 22904845, "router_z_loss_clip": 5.07421875, "router_z_loss_mlp": 0.63012695, "step": 1073, "time_per_iteration": 2.661863327026367 }, { "auxiliary_loss_clip": 0.06942394, "auxiliary_loss_mlp": 0.01335109, "balance_loss_clip": 0.06452467, "balance_loss_mlp": 0.01277268, "epoch": 0.06457237336539907, "flos": 25632498476160.0, "grad_norm": 6.424571561911483, "language_loss": 0.81898731, "learning_rate": 3.987475760081233e-06, "loss": 0.90176237, "num_input_tokens_seen": 22925940, "router_z_loss_clip": 4.90234375, "router_z_loss_mlp": 0.57861328, "step": 1074, "time_per_iteration": 2.6717326641082764 }, { "auxiliary_loss_clip": 0.06913738, "auxiliary_loss_mlp": 0.01330334, "balance_loss_clip": 0.06424763, "balance_loss_mlp": 0.01269776, "epoch": 0.06463249661806704, "flos": 19470088673280.0, "grad_norm": 3.252270541042323, "language_loss": 0.82492226, "learning_rate": 3.987432205347958e-06, "loss": 0.90736294, "num_input_tokens_seen": 22944375, "router_z_loss_clip": 4.88671875, "router_z_loss_mlp": 0.60498047, "step": 1075, "time_per_iteration": 2.633288621902466 }, { "auxiliary_loss_clip": 0.06921678, "auxiliary_loss_mlp": 0.01322185, "balance_loss_clip": 0.06425065, "balance_loss_mlp": 0.01263677, "epoch": 0.064692619870735, "flos": 24504833427840.0, "grad_norm": 9.436010018582554, "language_loss": 0.90293998, "learning_rate": 3.987388575251055e-06, "loss": 0.98537862, "num_input_tokens_seen": 22959145, "router_z_loss_clip": 4.96484375, "router_z_loss_mlp": 0.58520508, "step": 1076, "time_per_iteration": 2.656564950942993 }, { "auxiliary_loss_clip": 0.0690427, "auxiliary_loss_mlp": 0.01320227, "balance_loss_clip": 0.06415409, "balance_loss_mlp": 0.01260194, "epoch": 0.06475274312340297, "flos": 17024252319360.0, "grad_norm": 2.47726460824121, "language_loss": 0.8340475, "learning_rate": 3.98734486979218e-06, "loss": 0.91629249, "num_input_tokens_seen": 22978100, "router_z_loss_clip": 4.88671875, "router_z_loss_mlp": 0.60107422, "step": 1077, "time_per_iteration": 2.614377021789551 }, { "auxiliary_loss_clip": 0.06937259, "auxiliary_loss_mlp": 0.01336934, "balance_loss_clip": 0.06412908, "balance_loss_mlp": 0.01266028, "epoch": 0.06481286637607095, "flos": 24579409161600.0, "grad_norm": 3.8877549935807294, "language_loss": 0.94395125, "learning_rate": 3.987301088972986e-06, "loss": 1.02669334, "num_input_tokens_seen": 22997285, "router_z_loss_clip": 5.23828125, "router_z_loss_mlp": 0.70996094, "step": 1078, "time_per_iteration": 2.6475040912628174 }, { "auxiliary_loss_clip": 0.06945011, "auxiliary_loss_mlp": 0.01342806, "balance_loss_clip": 0.06408297, "balance_loss_mlp": 0.01269564, "epoch": 0.06487298962873891, "flos": 21111985428480.0, "grad_norm": 2.6216898618687448, "language_loss": 0.80980271, "learning_rate": 3.987257232795137e-06, "loss": 0.89268088, "num_input_tokens_seen": 23016285, "router_z_loss_clip": 5.37109375, "router_z_loss_mlp": 0.73193359, "step": 1079, "time_per_iteration": 2.6077752113342285 }, { "auxiliary_loss_clip": 0.06908356, "auxiliary_loss_mlp": 0.01327869, "balance_loss_clip": 0.06399774, "balance_loss_mlp": 0.01261541, "epoch": 0.06493311288140688, "flos": 24615103800960.0, "grad_norm": 3.356424836294019, "language_loss": 0.71939301, "learning_rate": 3.987213301260294e-06, "loss": 0.80175531, "num_input_tokens_seen": 23036420, "router_z_loss_clip": 5.08203125, "router_z_loss_mlp": 0.66308594, "step": 1080, "time_per_iteration": 2.6570446491241455 }, { "auxiliary_loss_clip": 0.06895899, "auxiliary_loss_mlp": 0.01326801, "balance_loss_clip": 0.06391886, "balance_loss_mlp": 0.01257612, "epoch": 0.06499323613407486, "flos": 25345054644480.0, "grad_norm": 2.775301512581838, "language_loss": 0.75537187, "learning_rate": 3.987169294370123e-06, "loss": 0.83759886, "num_input_tokens_seen": 23056945, "router_z_loss_clip": 5.03515625, "router_z_loss_mlp": 0.69238281, "step": 1081, "time_per_iteration": 2.654404401779175 }, { "auxiliary_loss_clip": 0.06882126, "auxiliary_loss_mlp": 0.01323392, "balance_loss_clip": 0.06389876, "balance_loss_mlp": 0.01260401, "epoch": 0.06505335938674282, "flos": 20381908803840.0, "grad_norm": 3.838150454628084, "language_loss": 0.86954296, "learning_rate": 3.987125212126294e-06, "loss": 0.95159817, "num_input_tokens_seen": 23074940, "router_z_loss_clip": 4.92578125, "router_z_loss_mlp": 0.62939453, "step": 1082, "time_per_iteration": 2.6167423725128174 }, { "auxiliary_loss_clip": 0.06925024, "auxiliary_loss_mlp": 0.0134949, "balance_loss_clip": 0.06391876, "balance_loss_mlp": 0.01277773, "epoch": 0.06511348263941079, "flos": 25344970790400.0, "grad_norm": 3.5078957599555487, "language_loss": 0.85978454, "learning_rate": 3.987081054530478e-06, "loss": 0.94252968, "num_input_tokens_seen": 23093420, "router_z_loss_clip": 5.3359375, "router_z_loss_mlp": 0.71728516, "step": 1083, "time_per_iteration": 4.048933506011963 }, { "auxiliary_loss_clip": 0.06887761, "auxiliary_loss_mlp": 0.01343596, "balance_loss_clip": 0.06377752, "balance_loss_mlp": 0.01274789, "epoch": 0.06517360589207877, "flos": 20337912610560.0, "grad_norm": 3.3581742181278003, "language_loss": 0.82668042, "learning_rate": 3.987036821584348e-06, "loss": 0.90899396, "num_input_tokens_seen": 23111550, "router_z_loss_clip": 5.1015625, "router_z_loss_mlp": 0.68798828, "step": 1084, "time_per_iteration": 2.62243390083313 }, { "auxiliary_loss_clip": 0.06884155, "auxiliary_loss_mlp": 0.01341639, "balance_loss_clip": 0.06370962, "balance_loss_mlp": 0.01269207, "epoch": 0.06523372914474673, "flos": 31688956391040.0, "grad_norm": 4.6199503613484145, "language_loss": 0.6879971, "learning_rate": 3.986992513289584e-06, "loss": 0.77025497, "num_input_tokens_seen": 23130335, "router_z_loss_clip": 5.125, "router_z_loss_mlp": 0.72460938, "step": 1085, "time_per_iteration": 4.094900131225586 }, { "auxiliary_loss_clip": 0.06872599, "auxiliary_loss_mlp": 0.013372, "balance_loss_clip": 0.06376751, "balance_loss_mlp": 0.01273304, "epoch": 0.0652938523974147, "flos": 20784612326400.0, "grad_norm": 2.174149113434693, "language_loss": 0.78946465, "learning_rate": 3.9869481296478645e-06, "loss": 0.8715626, "num_input_tokens_seen": 23152380, "router_z_loss_clip": 4.953125, "router_z_loss_mlp": 0.63867188, "step": 1086, "time_per_iteration": 2.661574125289917 }, { "auxiliary_loss_clip": 0.0688162, "auxiliary_loss_mlp": 0.0132609, "balance_loss_clip": 0.063787, "balance_loss_mlp": 0.01262575, "epoch": 0.06535397565008266, "flos": 16696627655040.0, "grad_norm": 3.1397223684841404, "language_loss": 0.87736571, "learning_rate": 3.986903670660872e-06, "loss": 0.95944273, "num_input_tokens_seen": 23171630, "router_z_loss_clip": 5.03125, "router_z_loss_mlp": 0.63525391, "step": 1087, "time_per_iteration": 2.6267261505126953 }, { "auxiliary_loss_clip": 0.06901286, "auxiliary_loss_mlp": 0.01326999, "balance_loss_clip": 0.06384677, "balance_loss_mlp": 0.01262436, "epoch": 0.06541409890275064, "flos": 26875171653120.0, "grad_norm": 3.2581224770912636, "language_loss": 0.80225849, "learning_rate": 3.9868591363302945e-06, "loss": 0.88454133, "num_input_tokens_seen": 23192520, "router_z_loss_clip": 5.16796875, "router_z_loss_mlp": 0.64599609, "step": 1088, "time_per_iteration": 5.7078564167022705 }, { "auxiliary_loss_clip": 0.06883702, "auxiliary_loss_mlp": 0.01328573, "balance_loss_clip": 0.06376368, "balance_loss_mlp": 0.01264581, "epoch": 0.06547422215541861, "flos": 20527831889280.0, "grad_norm": 2.278134310557921, "language_loss": 0.73107111, "learning_rate": 3.9868145266578186e-06, "loss": 0.81319392, "num_input_tokens_seen": 23210710, "router_z_loss_clip": 5.07421875, "router_z_loss_mlp": 0.63916016, "step": 1089, "time_per_iteration": 2.642862319946289 }, { "auxiliary_loss_clip": 0.06885035, "auxiliary_loss_mlp": 0.01319422, "balance_loss_clip": 0.06381325, "balance_loss_mlp": 0.01263728, "epoch": 0.06553434540808657, "flos": 22022925091200.0, "grad_norm": 9.83848658121989, "language_loss": 0.87785614, "learning_rate": 3.9867698416451366e-06, "loss": 0.95990074, "num_input_tokens_seen": 23230305, "router_z_loss_clip": 5.02734375, "router_z_loss_mlp": 0.55737305, "step": 1090, "time_per_iteration": 2.6325759887695312 }, { "auxiliary_loss_clip": 0.06894137, "auxiliary_loss_mlp": 0.01327288, "balance_loss_clip": 0.06387401, "balance_loss_mlp": 0.01267469, "epoch": 0.06559446866075455, "flos": 24615648852480.0, "grad_norm": 3.0953270692218684, "language_loss": 0.74385309, "learning_rate": 3.9867250812939434e-06, "loss": 0.82606733, "num_input_tokens_seen": 23249015, "router_z_loss_clip": 5.05859375, "router_z_loss_mlp": 0.59838867, "step": 1091, "time_per_iteration": 2.6618103981018066 }, { "auxiliary_loss_clip": 0.06902524, "auxiliary_loss_mlp": 0.01329691, "balance_loss_clip": 0.0639568, "balance_loss_mlp": 0.0126682, "epoch": 0.06565459191342252, "flos": 24280686956160.0, "grad_norm": 4.872786531648794, "language_loss": 0.8503592, "learning_rate": 3.986680245605936e-06, "loss": 0.93268132, "num_input_tokens_seen": 23265105, "router_z_loss_clip": 5.06640625, "router_z_loss_mlp": 0.62817383, "step": 1092, "time_per_iteration": 2.628657817840576 }, { "auxiliary_loss_clip": 0.06910273, "auxiliary_loss_mlp": 0.01317431, "balance_loss_clip": 0.06399979, "balance_loss_mlp": 0.01256205, "epoch": 0.06571471516609048, "flos": 24793493143680.0, "grad_norm": 3.9573129956526403, "language_loss": 0.73900837, "learning_rate": 3.986635334582814e-06, "loss": 0.82128543, "num_input_tokens_seen": 23283950, "router_z_loss_clip": 5.1015625, "router_z_loss_mlp": 0.61181641, "step": 1093, "time_per_iteration": 2.722295045852661 }, { "auxiliary_loss_clip": 0.06907663, "auxiliary_loss_mlp": 0.01327695, "balance_loss_clip": 0.06398495, "balance_loss_mlp": 0.01265134, "epoch": 0.06577483841875846, "flos": 26221347843840.0, "grad_norm": 2.4053510968662066, "language_loss": 0.90056098, "learning_rate": 3.986590348226282e-06, "loss": 0.98291457, "num_input_tokens_seen": 23305005, "router_z_loss_clip": 5.09375, "router_z_loss_mlp": 0.62548828, "step": 1094, "time_per_iteration": 2.6742162704467773 }, { "auxiliary_loss_clip": 0.06911839, "auxiliary_loss_mlp": 0.0132178, "balance_loss_clip": 0.06402736, "balance_loss_mlp": 0.01256405, "epoch": 0.06583496167142643, "flos": 25087519520640.0, "grad_norm": 3.7979886671504284, "language_loss": 0.83312082, "learning_rate": 3.986545286538044e-06, "loss": 0.91545707, "num_input_tokens_seen": 23323220, "router_z_loss_clip": 5.0859375, "router_z_loss_mlp": 0.65380859, "step": 1095, "time_per_iteration": 2.666663408279419 }, { "auxiliary_loss_clip": 0.06916954, "auxiliary_loss_mlp": 0.01319587, "balance_loss_clip": 0.06403794, "balance_loss_mlp": 0.01260578, "epoch": 0.06589508492409439, "flos": 25636900815360.0, "grad_norm": 5.338014421605042, "language_loss": 0.72403896, "learning_rate": 3.986500149519811e-06, "loss": 0.80640435, "num_input_tokens_seen": 23342235, "router_z_loss_clip": 5.125, "router_z_loss_mlp": 0.59033203, "step": 1096, "time_per_iteration": 2.6569652557373047 }, { "auxiliary_loss_clip": 0.06927451, "auxiliary_loss_mlp": 0.01320969, "balance_loss_clip": 0.06416817, "balance_loss_mlp": 0.01261698, "epoch": 0.06595520817676236, "flos": 23627701687680.0, "grad_norm": 17.7059113957259, "language_loss": 0.79668045, "learning_rate": 3.986454937173292e-06, "loss": 0.8791647, "num_input_tokens_seen": 23363680, "router_z_loss_clip": 5.109375, "router_z_loss_mlp": 0.59179688, "step": 1097, "time_per_iteration": 2.638474941253662 }, { "auxiliary_loss_clip": 0.06951246, "auxiliary_loss_mlp": 0.01332922, "balance_loss_clip": 0.06411075, "balance_loss_mlp": 0.01264687, "epoch": 0.06601533142943034, "flos": 33810019119360.0, "grad_norm": 6.7959479808492125, "language_loss": 0.80364764, "learning_rate": 3.986409649500203e-06, "loss": 0.88648927, "num_input_tokens_seen": 23385590, "router_z_loss_clip": 5.3984375, "router_z_loss_mlp": 0.68212891, "step": 1098, "time_per_iteration": 2.715017795562744 }, { "auxiliary_loss_clip": 0.06924744, "auxiliary_loss_mlp": 0.01331304, "balance_loss_clip": 0.06408165, "balance_loss_mlp": 0.01266168, "epoch": 0.0660754546820983, "flos": 20264175417600.0, "grad_norm": 2.37058899542206, "language_loss": 0.83990842, "learning_rate": 3.986364286502261e-06, "loss": 0.9224689, "num_input_tokens_seen": 23402945, "router_z_loss_clip": 5.17578125, "router_z_loss_mlp": 0.65136719, "step": 1099, "time_per_iteration": 2.6091742515563965 }, { "auxiliary_loss_clip": 0.06898868, "auxiliary_loss_mlp": 0.01312315, "balance_loss_clip": 0.06398935, "balance_loss_mlp": 0.01253664, "epoch": 0.06613557793476627, "flos": 19360195643520.0, "grad_norm": 2.5241668529484014, "language_loss": 0.86398578, "learning_rate": 3.986318848181186e-06, "loss": 0.94609761, "num_input_tokens_seen": 23421410, "router_z_loss_clip": 5.0, "router_z_loss_mlp": 0.58618164, "step": 1100, "time_per_iteration": 2.625481128692627 }, { "auxiliary_loss_clip": 0.06921602, "auxiliary_loss_mlp": 0.01335741, "balance_loss_clip": 0.06401949, "balance_loss_mlp": 0.01269461, "epoch": 0.06619570118743424, "flos": 13777788602880.0, "grad_norm": 14.600807967211853, "language_loss": 0.75291127, "learning_rate": 3.986273334538702e-06, "loss": 0.83548474, "num_input_tokens_seen": 23438870, "router_z_loss_clip": 5.1875, "router_z_loss_mlp": 0.66259766, "step": 1101, "time_per_iteration": 2.622021436691284 }, { "auxiliary_loss_clip": 0.06895723, "auxiliary_loss_mlp": 0.01320019, "balance_loss_clip": 0.06395142, "balance_loss_mlp": 0.01259699, "epoch": 0.06625582444010221, "flos": 17863593068160.0, "grad_norm": 3.4916740270552897, "language_loss": 0.89007831, "learning_rate": 3.986227745576533e-06, "loss": 0.97223568, "num_input_tokens_seen": 23456975, "router_z_loss_clip": 5.0078125, "router_z_loss_mlp": 0.60351562, "step": 1102, "time_per_iteration": 2.6042158603668213 }, { "auxiliary_loss_clip": 0.06920774, "auxiliary_loss_mlp": 0.01331912, "balance_loss_clip": 0.06402102, "balance_loss_mlp": 0.01265488, "epoch": 0.06631594769277017, "flos": 11843584479360.0, "grad_norm": 3.1179305034166034, "language_loss": 0.85211229, "learning_rate": 3.98618208129641e-06, "loss": 0.93463916, "num_input_tokens_seen": 23473440, "router_z_loss_clip": 5.1796875, "router_z_loss_mlp": 0.66455078, "step": 1103, "time_per_iteration": 2.5923678874969482 }, { "auxiliary_loss_clip": 0.06887359, "auxiliary_loss_mlp": 0.01326281, "balance_loss_clip": 0.06392767, "balance_loss_mlp": 0.01264102, "epoch": 0.06637607094543815, "flos": 19799683908480.0, "grad_norm": 7.938603272223706, "language_loss": 0.84505272, "learning_rate": 3.986136341700063e-06, "loss": 0.92718911, "num_input_tokens_seen": 23493880, "router_z_loss_clip": 4.94921875, "router_z_loss_mlp": 0.62182617, "step": 1104, "time_per_iteration": 2.633896589279175 }, { "auxiliary_loss_clip": 0.0687416, "auxiliary_loss_mlp": 0.01324751, "balance_loss_clip": 0.06384002, "balance_loss_mlp": 0.01267101, "epoch": 0.06643619419810612, "flos": 25493032154880.0, "grad_norm": 2.732702683871998, "language_loss": 0.82415807, "learning_rate": 3.986090526789227e-06, "loss": 0.90614724, "num_input_tokens_seen": 23514920, "router_z_loss_clip": 4.8984375, "router_z_loss_mlp": 0.57617188, "step": 1105, "time_per_iteration": 2.665058135986328 }, { "auxiliary_loss_clip": 0.06856485, "auxiliary_loss_mlp": 0.01323071, "balance_loss_clip": 0.063769, "balance_loss_mlp": 0.01268163, "epoch": 0.06649631745077408, "flos": 16952234135040.0, "grad_norm": 3.6139176532511175, "language_loss": 0.99075681, "learning_rate": 3.986044636565639e-06, "loss": 1.07255244, "num_input_tokens_seen": 23531635, "router_z_loss_clip": 4.796875, "router_z_loss_mlp": 0.54907227, "step": 1106, "time_per_iteration": 2.610689640045166 }, { "auxiliary_loss_clip": 0.0690133, "auxiliary_loss_mlp": 0.01342262, "balance_loss_clip": 0.06381699, "balance_loss_mlp": 0.01275457, "epoch": 0.06655644070344206, "flos": 17864431608960.0, "grad_norm": 2.5559539846689523, "language_loss": 0.84342331, "learning_rate": 3.985998671031039e-06, "loss": 0.92585921, "num_input_tokens_seen": 23551020, "router_z_loss_clip": 5.1953125, "router_z_loss_mlp": 0.66772461, "step": 1107, "time_per_iteration": 2.590402603149414 }, { "auxiliary_loss_clip": 0.06769942, "auxiliary_loss_mlp": 0.01356026, "balance_loss_clip": 0.06438436, "balance_loss_mlp": 0.0132248, "epoch": 0.06661656395611003, "flos": 61438033779840.0, "grad_norm": 0.8184463800805081, "language_loss": 0.56928402, "learning_rate": 3.9859526301871705e-06, "loss": 0.65054369, "num_input_tokens_seen": 23610675, "router_z_loss_clip": 3.3125, "router_z_loss_mlp": 0.33569336, "step": 1108, "time_per_iteration": 3.1976168155670166 }, { "auxiliary_loss_clip": 0.06895565, "auxiliary_loss_mlp": 0.01333151, "balance_loss_clip": 0.06378756, "balance_loss_mlp": 0.01270637, "epoch": 0.066676687208778, "flos": 20668304459520.0, "grad_norm": 4.089905026274518, "language_loss": 0.7453565, "learning_rate": 3.9859065140357795e-06, "loss": 0.82764363, "num_input_tokens_seen": 23628710, "router_z_loss_clip": 5.16796875, "router_z_loss_mlp": 0.62451172, "step": 1109, "time_per_iteration": 2.6029696464538574 }, { "auxiliary_loss_clip": 0.06871051, "auxiliary_loss_mlp": 0.01331771, "balance_loss_clip": 0.06365906, "balance_loss_mlp": 0.01268304, "epoch": 0.06673681046144596, "flos": 20929613016960.0, "grad_norm": 3.7188881184750686, "language_loss": 0.80508482, "learning_rate": 3.985860322578614e-06, "loss": 0.88711309, "num_input_tokens_seen": 23649160, "router_z_loss_clip": 5.05078125, "router_z_loss_mlp": 0.63427734, "step": 1110, "time_per_iteration": 2.665144920349121 }, { "auxiliary_loss_clip": 0.06875485, "auxiliary_loss_mlp": 0.01326616, "balance_loss_clip": 0.06366089, "balance_loss_mlp": 0.01264579, "epoch": 0.06679693371411394, "flos": 31073762113920.0, "grad_norm": 2.341021714346649, "language_loss": 0.74087751, "learning_rate": 3.985814055817427e-06, "loss": 0.82289851, "num_input_tokens_seen": 23671995, "router_z_loss_clip": 5.09375, "router_z_loss_mlp": 0.62060547, "step": 1111, "time_per_iteration": 2.7146060466766357 }, { "auxiliary_loss_clip": 0.06870697, "auxiliary_loss_mlp": 0.01328191, "balance_loss_clip": 0.06362369, "balance_loss_mlp": 0.01266489, "epoch": 0.0668570569667819, "flos": 21732630220800.0, "grad_norm": 3.2131363997788074, "language_loss": 0.8125906, "learning_rate": 3.985767713753971e-06, "loss": 0.89457953, "num_input_tokens_seen": 23690705, "router_z_loss_clip": 5.07421875, "router_z_loss_mlp": 0.61621094, "step": 1112, "time_per_iteration": 2.6147642135620117 }, { "auxiliary_loss_clip": 0.06851762, "auxiliary_loss_mlp": 0.01321728, "balance_loss_clip": 0.06355949, "balance_loss_mlp": 0.01258976, "epoch": 0.06691718021944987, "flos": 22753840256640.0, "grad_norm": 3.8978165384910617, "language_loss": 0.82427609, "learning_rate": 3.985721296390005e-06, "loss": 0.90601099, "num_input_tokens_seen": 23709990, "router_z_loss_clip": 4.9609375, "router_z_loss_mlp": 0.62744141, "step": 1113, "time_per_iteration": 2.6600024700164795 }, { "auxiliary_loss_clip": 0.06824666, "auxiliary_loss_mlp": 0.01314174, "balance_loss_clip": 0.06349224, "balance_loss_mlp": 0.0126053, "epoch": 0.06697730347211785, "flos": 16551333475200.0, "grad_norm": 2.8255048523487574, "language_loss": 0.85238278, "learning_rate": 3.985674803727289e-06, "loss": 0.93377125, "num_input_tokens_seen": 23728485, "router_z_loss_clip": 4.7578125, "router_z_loss_mlp": 0.53710938, "step": 1114, "time_per_iteration": 2.6163811683654785 }, { "auxiliary_loss_clip": 0.06730072, "auxiliary_loss_mlp": 0.01298796, "balance_loss_clip": 0.06402205, "balance_loss_mlp": 0.01264845, "epoch": 0.06703742672478581, "flos": 59801545612800.0, "grad_norm": 0.8011803045144809, "language_loss": 0.58187926, "learning_rate": 3.985628235767584e-06, "loss": 0.66216797, "num_input_tokens_seen": 23786650, "router_z_loss_clip": 3.28125, "router_z_loss_mlp": 0.33984375, "step": 1115, "time_per_iteration": 3.2278709411621094 }, { "auxiliary_loss_clip": 0.06841976, "auxiliary_loss_mlp": 0.01316391, "balance_loss_clip": 0.06348206, "balance_loss_mlp": 0.01256644, "epoch": 0.06709754997745378, "flos": 16805807925120.0, "grad_norm": 5.373631907903535, "language_loss": 0.93705082, "learning_rate": 3.985581592512658e-06, "loss": 1.01863456, "num_input_tokens_seen": 23802555, "router_z_loss_clip": 4.9375, "router_z_loss_mlp": 0.59790039, "step": 1116, "time_per_iteration": 2.5907366275787354 }, { "auxiliary_loss_clip": 0.06854642, "auxiliary_loss_mlp": 0.01322254, "balance_loss_clip": 0.06346735, "balance_loss_mlp": 0.01261123, "epoch": 0.06715767323012176, "flos": 22129883228160.0, "grad_norm": 3.9281028167235474, "language_loss": 0.89397568, "learning_rate": 3.985534873964279e-06, "loss": 0.97574472, "num_input_tokens_seen": 23822945, "router_z_loss_clip": 5.08203125, "router_z_loss_mlp": 0.61157227, "step": 1117, "time_per_iteration": 2.722459554672241 }, { "auxiliary_loss_clip": 0.06715775, "auxiliary_loss_mlp": 0.01280878, "balance_loss_clip": 0.06390567, "balance_loss_mlp": 0.01251577, "epoch": 0.06721779648278972, "flos": 66634522842240.0, "grad_norm": 0.8309690144411452, "language_loss": 0.59717494, "learning_rate": 3.985488080124218e-06, "loss": 0.67714143, "num_input_tokens_seen": 23874075, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.29223633, "step": 1118, "time_per_iteration": 3.2172603607177734 }, { "auxiliary_loss_clip": 0.06848867, "auxiliary_loss_mlp": 0.01320416, "balance_loss_clip": 0.06343876, "balance_loss_mlp": 0.01258284, "epoch": 0.06727791973545769, "flos": 22389011579520.0, "grad_norm": 3.82313765513172, "language_loss": 0.85961932, "learning_rate": 3.985441210994251e-06, "loss": 0.94131213, "num_input_tokens_seen": 23889720, "router_z_loss_clip": 5.046875, "router_z_loss_mlp": 0.62133789, "step": 1119, "time_per_iteration": 2.6166653633117676 }, { "auxiliary_loss_clip": 0.06816891, "auxiliary_loss_mlp": 0.01309856, "balance_loss_clip": 0.06336962, "balance_loss_mlp": 0.01255496, "epoch": 0.06733804298812565, "flos": 24287143720320.0, "grad_norm": 12.599764740941366, "language_loss": 0.86793315, "learning_rate": 3.9853942665761545e-06, "loss": 0.94920063, "num_input_tokens_seen": 23909385, "router_z_loss_clip": 4.796875, "router_z_loss_mlp": 0.54345703, "step": 1120, "time_per_iteration": 2.6280746459960938 }, { "auxiliary_loss_clip": 0.06826638, "auxiliary_loss_mlp": 0.01315898, "balance_loss_clip": 0.06340948, "balance_loss_mlp": 0.01259274, "epoch": 0.06739816624079363, "flos": 15922638691200.0, "grad_norm": 2.697857595011909, "language_loss": 0.80611122, "learning_rate": 3.985347246871708e-06, "loss": 0.88753653, "num_input_tokens_seen": 23926830, "router_z_loss_clip": 4.85546875, "router_z_loss_mlp": 0.56567383, "step": 1121, "time_per_iteration": 2.573779821395874 }, { "auxiliary_loss_clip": 0.0670975, "auxiliary_loss_mlp": 0.01284794, "balance_loss_clip": 0.06388508, "balance_loss_mlp": 0.01256017, "epoch": 0.0674582894934616, "flos": 71422031796480.0, "grad_norm": 0.736683268485418, "language_loss": 0.58405095, "learning_rate": 3.985300151882694e-06, "loss": 0.6639964, "num_input_tokens_seen": 23992640, "router_z_loss_clip": 3.21875, "router_z_loss_mlp": 0.28759766, "step": 1122, "time_per_iteration": 4.890187978744507 }, { "auxiliary_loss_clip": 0.06830083, "auxiliary_loss_mlp": 0.01319977, "balance_loss_clip": 0.06344315, "balance_loss_mlp": 0.01263662, "epoch": 0.06751841274612956, "flos": 25271988284160.0, "grad_norm": 4.831035747044008, "language_loss": 0.74046361, "learning_rate": 3.985252981610901e-06, "loss": 0.82196426, "num_input_tokens_seen": 24011135, "router_z_loss_clip": 4.85546875, "router_z_loss_mlp": 0.56347656, "step": 1123, "time_per_iteration": 2.6427416801452637 }, { "auxiliary_loss_clip": 0.06852065, "auxiliary_loss_mlp": 0.01330194, "balance_loss_clip": 0.0634281, "balance_loss_mlp": 0.01266536, "epoch": 0.06757853599879754, "flos": 23809067850240.0, "grad_norm": 6.283169913546201, "language_loss": 0.81228876, "learning_rate": 3.985205736058114e-06, "loss": 0.89411139, "num_input_tokens_seen": 24030695, "router_z_loss_clip": 5.09375, "router_z_loss_mlp": 0.63671875, "step": 1124, "time_per_iteration": 4.017887592315674 }, { "auxiliary_loss_clip": 0.06810746, "auxiliary_loss_mlp": 0.01319832, "balance_loss_clip": 0.06333528, "balance_loss_mlp": 0.01265186, "epoch": 0.0676386592514655, "flos": 21040260733440.0, "grad_norm": 4.7677507849652825, "language_loss": 0.74157143, "learning_rate": 3.985158415226128e-06, "loss": 0.82287723, "num_input_tokens_seen": 24050680, "router_z_loss_clip": 4.7734375, "router_z_loss_mlp": 0.54638672, "step": 1125, "time_per_iteration": 2.613260269165039 }, { "auxiliary_loss_clip": 0.06830087, "auxiliary_loss_mlp": 0.01328262, "balance_loss_clip": 0.06341281, "balance_loss_mlp": 0.01270708, "epoch": 0.06769878250413347, "flos": 25563331330560.0, "grad_norm": 8.90835044462011, "language_loss": 0.83906174, "learning_rate": 3.985111019116736e-06, "loss": 0.92064524, "num_input_tokens_seen": 24067205, "router_z_loss_clip": 4.890625, "router_z_loss_mlp": 0.57543945, "step": 1126, "time_per_iteration": 2.6647861003875732 }, { "auxiliary_loss_clip": 0.06697363, "auxiliary_loss_mlp": 0.01276992, "balance_loss_clip": 0.06377351, "balance_loss_mlp": 0.01247022, "epoch": 0.06775890575680145, "flos": 70676316385920.0, "grad_norm": 0.7696647068088143, "language_loss": 0.60035443, "learning_rate": 3.985063547731735e-06, "loss": 0.68009794, "num_input_tokens_seen": 24131320, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.29907227, "step": 1127, "time_per_iteration": 4.718736171722412 }, { "auxiliary_loss_clip": 0.06818643, "auxiliary_loss_mlp": 0.01328356, "balance_loss_clip": 0.06333777, "balance_loss_mlp": 0.01267082, "epoch": 0.06781902900946941, "flos": 24241051175040.0, "grad_norm": 3.088434625477809, "language_loss": 0.8316614, "learning_rate": 3.985016001072925e-06, "loss": 0.91313136, "num_input_tokens_seen": 24149930, "router_z_loss_clip": 4.84375, "router_z_loss_mlp": 0.61279297, "step": 1128, "time_per_iteration": 4.068151473999023 }, { "auxiliary_loss_clip": 0.06852579, "auxiliary_loss_mlp": 0.01338982, "balance_loss_clip": 0.0633589, "balance_loss_mlp": 0.01271677, "epoch": 0.06787915226213738, "flos": 22423825751040.0, "grad_norm": 6.155001555715284, "language_loss": 0.78524429, "learning_rate": 3.984968379142109e-06, "loss": 0.8671599, "num_input_tokens_seen": 24169590, "router_z_loss_clip": 5.1640625, "router_z_loss_mlp": 0.67260742, "step": 1129, "time_per_iteration": 2.6009809970855713 }, { "auxiliary_loss_clip": 0.06829156, "auxiliary_loss_mlp": 0.01326062, "balance_loss_clip": 0.06333808, "balance_loss_mlp": 0.01268985, "epoch": 0.06793927551480534, "flos": 37716092576640.0, "grad_norm": 2.948770394091891, "language_loss": 0.74888194, "learning_rate": 3.984920681941094e-06, "loss": 0.83043408, "num_input_tokens_seen": 24189965, "router_z_loss_clip": 4.94921875, "router_z_loss_mlp": 0.57128906, "step": 1130, "time_per_iteration": 2.793092966079712 }, { "auxiliary_loss_clip": 0.06811357, "auxiliary_loss_mlp": 0.01325836, "balance_loss_clip": 0.06329827, "balance_loss_mlp": 0.01267996, "epoch": 0.06799939876747332, "flos": 20637682992000.0, "grad_norm": 3.013115433126716, "language_loss": 0.82856643, "learning_rate": 3.984872909471688e-06, "loss": 0.9099384, "num_input_tokens_seen": 24208045, "router_z_loss_clip": 4.8125, "router_z_loss_mlp": 0.57910156, "step": 1131, "time_per_iteration": 2.6665101051330566 }, { "auxiliary_loss_clip": 0.06803429, "auxiliary_loss_mlp": 0.01319102, "balance_loss_clip": 0.06332237, "balance_loss_mlp": 0.0126355, "epoch": 0.06805952202014129, "flos": 14869759011840.0, "grad_norm": 3.3649392902510296, "language_loss": 0.82421893, "learning_rate": 3.984825061735701e-06, "loss": 0.90544426, "num_input_tokens_seen": 24223805, "router_z_loss_clip": 4.71484375, "router_z_loss_mlp": 0.55566406, "step": 1132, "time_per_iteration": 2.5645501613616943 }, { "auxiliary_loss_clip": 0.06818463, "auxiliary_loss_mlp": 0.01328652, "balance_loss_clip": 0.06338424, "balance_loss_mlp": 0.0127322, "epoch": 0.06811964527280925, "flos": 48920710147200.0, "grad_norm": 2.2181352293519128, "language_loss": 0.65825111, "learning_rate": 3.9847771387349495e-06, "loss": 0.73972225, "num_input_tokens_seen": 24249475, "router_z_loss_clip": 4.80078125, "router_z_loss_mlp": 0.55419922, "step": 1133, "time_per_iteration": 2.879998207092285 }, { "auxiliary_loss_clip": 0.06860361, "auxiliary_loss_mlp": 0.01335374, "balance_loss_clip": 0.06346437, "balance_loss_mlp": 0.01271526, "epoch": 0.06817976852547723, "flos": 15382649053440.0, "grad_norm": 3.9902542114360737, "language_loss": 0.77488816, "learning_rate": 3.9847291404712506e-06, "loss": 0.8568455, "num_input_tokens_seen": 24267980, "router_z_loss_clip": 5.13671875, "router_z_loss_mlp": 0.63916016, "step": 1134, "time_per_iteration": 2.634927749633789 }, { "auxiliary_loss_clip": 0.06814021, "auxiliary_loss_mlp": 0.01324886, "balance_loss_clip": 0.0633366, "balance_loss_mlp": 0.01271075, "epoch": 0.0682398917781452, "flos": 20161661546880.0, "grad_norm": 4.442775247175686, "language_loss": 0.89516115, "learning_rate": 3.984681066946423e-06, "loss": 0.97655028, "num_input_tokens_seen": 24286805, "router_z_loss_clip": 4.80078125, "router_z_loss_mlp": 0.53833008, "step": 1135, "time_per_iteration": 2.6078789234161377 }, { "auxiliary_loss_clip": 0.06828885, "auxiliary_loss_mlp": 0.01323692, "balance_loss_clip": 0.06336395, "balance_loss_mlp": 0.01264278, "epoch": 0.06830001503081316, "flos": 23447341774080.0, "grad_norm": 7.065414995397775, "language_loss": 0.8130306, "learning_rate": 3.984632918162291e-06, "loss": 0.8945564, "num_input_tokens_seen": 24305855, "router_z_loss_clip": 4.92578125, "router_z_loss_mlp": 0.59350586, "step": 1136, "time_per_iteration": 2.6461100578308105 }, { "auxiliary_loss_clip": 0.06831002, "auxiliary_loss_mlp": 0.01325558, "balance_loss_clip": 0.06344996, "balance_loss_mlp": 0.01266168, "epoch": 0.06836013828348114, "flos": 34358352238080.0, "grad_norm": 3.065914346648095, "language_loss": 0.87168711, "learning_rate": 3.984584694120679e-06, "loss": 0.95325267, "num_input_tokens_seen": 24326535, "router_z_loss_clip": 4.86328125, "router_z_loss_mlp": 0.59399414, "step": 1137, "time_per_iteration": 2.7953803539276123 }, { "auxiliary_loss_clip": 0.06824087, "auxiliary_loss_mlp": 0.01327699, "balance_loss_clip": 0.06346586, "balance_loss_mlp": 0.01273698, "epoch": 0.06842026153614911, "flos": 23155537530240.0, "grad_norm": 5.9280319979322025, "language_loss": 0.81393003, "learning_rate": 3.984536394823418e-06, "loss": 0.89544785, "num_input_tokens_seen": 24345810, "router_z_loss_clip": 4.7734375, "router_z_loss_mlp": 0.53930664, "step": 1138, "time_per_iteration": 2.603360891342163 }, { "auxiliary_loss_clip": 0.06844363, "auxiliary_loss_mlp": 0.01325073, "balance_loss_clip": 0.06355919, "balance_loss_mlp": 0.01266422, "epoch": 0.06848038478881707, "flos": 24616026195840.0, "grad_norm": 3.7767633139589387, "language_loss": 0.87646127, "learning_rate": 3.984488020272336e-06, "loss": 0.95815569, "num_input_tokens_seen": 24366095, "router_z_loss_clip": 4.8828125, "router_z_loss_mlp": 0.58642578, "step": 1139, "time_per_iteration": 2.7419023513793945 }, { "auxiliary_loss_clip": 0.06846154, "auxiliary_loss_mlp": 0.01317387, "balance_loss_clip": 0.06362832, "balance_loss_mlp": 0.01264601, "epoch": 0.06854050804148504, "flos": 40890663889920.0, "grad_norm": 5.784471136637954, "language_loss": 0.77257586, "learning_rate": 3.984439570469271e-06, "loss": 0.85421133, "num_input_tokens_seen": 24388665, "router_z_loss_clip": 4.828125, "router_z_loss_mlp": 0.52807617, "step": 1140, "time_per_iteration": 2.8028995990753174 }, { "auxiliary_loss_clip": 0.06852936, "auxiliary_loss_mlp": 0.01332919, "balance_loss_clip": 0.06364779, "balance_loss_mlp": 0.01272956, "epoch": 0.06860063129415302, "flos": 31694448833280.0, "grad_norm": 4.622340982562495, "language_loss": 0.71029729, "learning_rate": 3.9843910454160574e-06, "loss": 0.7921558, "num_input_tokens_seen": 24407705, "router_z_loss_clip": 4.8828125, "router_z_loss_mlp": 0.59960938, "step": 1141, "time_per_iteration": 2.7197511196136475 }, { "auxiliary_loss_clip": 0.0687833, "auxiliary_loss_mlp": 0.01338547, "balance_loss_clip": 0.06369672, "balance_loss_mlp": 0.01278322, "epoch": 0.06866075454682098, "flos": 26549265997440.0, "grad_norm": 3.1585089798031274, "language_loss": 0.81198102, "learning_rate": 3.984342445114538e-06, "loss": 0.89414978, "num_input_tokens_seen": 24428390, "router_z_loss_clip": 5.08203125, "router_z_loss_mlp": 0.60253906, "step": 1142, "time_per_iteration": 2.6639058589935303 }, { "auxiliary_loss_clip": 0.06839508, "auxiliary_loss_mlp": 0.01317368, "balance_loss_clip": 0.06370801, "balance_loss_mlp": 0.01265655, "epoch": 0.06872087779948895, "flos": 29797658357760.0, "grad_norm": 2.2887546462650534, "language_loss": 0.71336615, "learning_rate": 3.984293769566553e-06, "loss": 0.79493493, "num_input_tokens_seen": 24450810, "router_z_loss_clip": 4.6875, "router_z_loss_mlp": 0.5168457, "step": 1143, "time_per_iteration": 2.690455198287964 }, { "auxiliary_loss_clip": 0.06844655, "auxiliary_loss_mlp": 0.01323838, "balance_loss_clip": 0.06378265, "balance_loss_mlp": 0.0126998, "epoch": 0.06878100105215693, "flos": 26948070305280.0, "grad_norm": 4.977830054126713, "language_loss": 0.76814008, "learning_rate": 3.98424501877395e-06, "loss": 0.84982502, "num_input_tokens_seen": 24469965, "router_z_loss_clip": 4.6640625, "router_z_loss_mlp": 0.53808594, "step": 1144, "time_per_iteration": 2.6749818325042725 }, { "auxiliary_loss_clip": 0.0688947, "auxiliary_loss_mlp": 0.01324275, "balance_loss_clip": 0.06392536, "balance_loss_mlp": 0.01262763, "epoch": 0.06884112430482489, "flos": 10675361255040.0, "grad_norm": 17.58786933026608, "language_loss": 0.94273782, "learning_rate": 3.984196192738577e-06, "loss": 1.02487528, "num_input_tokens_seen": 24486370, "router_z_loss_clip": 4.96875, "router_z_loss_mlp": 0.61523438, "step": 1145, "time_per_iteration": 2.6025874614715576 }, { "auxiliary_loss_clip": 0.06902653, "auxiliary_loss_mlp": 0.01327328, "balance_loss_clip": 0.06395769, "balance_loss_mlp": 0.01265577, "epoch": 0.06890124755749286, "flos": 20199871808640.0, "grad_norm": 4.91814016501852, "language_loss": 0.8512277, "learning_rate": 3.984147291462285e-06, "loss": 0.93352753, "num_input_tokens_seen": 24503780, "router_z_loss_clip": 5.0703125, "router_z_loss_mlp": 0.61767578, "step": 1146, "time_per_iteration": 2.6212267875671387 }, { "auxiliary_loss_clip": 0.06845256, "auxiliary_loss_mlp": 0.013236, "balance_loss_clip": 0.0638338, "balance_loss_mlp": 0.01268549, "epoch": 0.06896137081016084, "flos": 20455520215680.0, "grad_norm": 3.0557225361544655, "language_loss": 0.87307316, "learning_rate": 3.98409831494693e-06, "loss": 0.95476174, "num_input_tokens_seen": 24522320, "router_z_loss_clip": 4.6171875, "router_z_loss_mlp": 0.55102539, "step": 1147, "time_per_iteration": 2.6277894973754883 }, { "auxiliary_loss_clip": 0.06882894, "auxiliary_loss_mlp": 0.01321759, "balance_loss_clip": 0.06413517, "balance_loss_mlp": 0.01266518, "epoch": 0.0690214940628288, "flos": 18374512538880.0, "grad_norm": 6.446677858997519, "language_loss": 0.87979251, "learning_rate": 3.984049263194367e-06, "loss": 0.96183908, "num_input_tokens_seen": 24540445, "router_z_loss_clip": 4.6875, "router_z_loss_mlp": 0.55224609, "step": 1148, "time_per_iteration": 2.6124143600463867 }, { "auxiliary_loss_clip": 0.06901125, "auxiliary_loss_mlp": 0.0131723, "balance_loss_clip": 0.0641467, "balance_loss_mlp": 0.01258603, "epoch": 0.06908161731549677, "flos": 20564239288320.0, "grad_norm": 3.673277638229188, "language_loss": 0.72245765, "learning_rate": 3.9840001362064575e-06, "loss": 0.80464119, "num_input_tokens_seen": 24557105, "router_z_loss_clip": 4.86328125, "router_z_loss_mlp": 0.58618164, "step": 1149, "time_per_iteration": 2.6486287117004395 }, { "auxiliary_loss_clip": 0.06908189, "auxiliary_loss_mlp": 0.01321201, "balance_loss_clip": 0.06422941, "balance_loss_mlp": 0.01264314, "epoch": 0.06914174056816474, "flos": 27571104938880.0, "grad_norm": 5.029366612785119, "language_loss": 0.86841375, "learning_rate": 3.983950933985064e-06, "loss": 0.95070755, "num_input_tokens_seen": 24578240, "router_z_loss_clip": 4.85546875, "router_z_loss_mlp": 0.56884766, "step": 1150, "time_per_iteration": 2.678577423095703 }, { "auxiliary_loss_clip": 0.06928146, "auxiliary_loss_mlp": 0.01321027, "balance_loss_clip": 0.06445625, "balance_loss_mlp": 0.01266954, "epoch": 0.06920186382083271, "flos": 15309331130880.0, "grad_norm": 5.783492975036766, "language_loss": 0.85974914, "learning_rate": 3.983901656532052e-06, "loss": 0.94224089, "num_input_tokens_seen": 24593585, "router_z_loss_clip": 4.828125, "router_z_loss_mlp": 0.54077148, "step": 1151, "time_per_iteration": 2.633819103240967 }, { "auxiliary_loss_clip": 0.06942916, "auxiliary_loss_mlp": 0.01320759, "balance_loss_clip": 0.06443185, "balance_loss_mlp": 0.01263109, "epoch": 0.06926198707350067, "flos": 25198125310080.0, "grad_norm": 4.237000711496794, "language_loss": 0.87598753, "learning_rate": 3.983852303849291e-06, "loss": 0.9586243, "num_input_tokens_seen": 24613110, "router_z_loss_clip": 4.9921875, "router_z_loss_mlp": 0.5769043, "step": 1152, "time_per_iteration": 2.7689571380615234 }, { "auxiliary_loss_clip": 0.06916767, "auxiliary_loss_mlp": 0.01318739, "balance_loss_clip": 0.06431305, "balance_loss_mlp": 0.01264284, "epoch": 0.06932211032616864, "flos": 13260328513920.0, "grad_norm": 3.9583826642915265, "language_loss": 0.92840642, "learning_rate": 3.983802875938651e-06, "loss": 1.0107615, "num_input_tokens_seen": 24628795, "router_z_loss_clip": 4.8515625, "router_z_loss_mlp": 0.54418945, "step": 1153, "time_per_iteration": 2.6416866779327393 }, { "auxiliary_loss_clip": 0.06913867, "auxiliary_loss_mlp": 0.01319521, "balance_loss_clip": 0.06430358, "balance_loss_mlp": 0.01266759, "epoch": 0.06938223357883662, "flos": 24834386736000.0, "grad_norm": 7.666945030328703, "language_loss": 0.83794808, "learning_rate": 3.983753372802008e-06, "loss": 0.92028195, "num_input_tokens_seen": 24645480, "router_z_loss_clip": 4.83984375, "router_z_loss_mlp": 0.52807617, "step": 1154, "time_per_iteration": 2.656444549560547 }, { "auxiliary_loss_clip": 0.06906221, "auxiliary_loss_mlp": 0.01324304, "balance_loss_clip": 0.06426887, "balance_loss_mlp": 0.01269658, "epoch": 0.06944235683150458, "flos": 27274730647680.0, "grad_norm": 3.792617185061299, "language_loss": 0.7734133, "learning_rate": 3.983703794441237e-06, "loss": 0.85571849, "num_input_tokens_seen": 24664630, "router_z_loss_clip": 4.7890625, "router_z_loss_mlp": 0.54711914, "step": 1155, "time_per_iteration": 2.7740423679351807 }, { "auxiliary_loss_clip": 0.0692126, "auxiliary_loss_mlp": 0.01323722, "balance_loss_clip": 0.06434212, "balance_loss_mlp": 0.01267789, "epoch": 0.06950248008417255, "flos": 25814493544320.0, "grad_norm": 4.634326169348915, "language_loss": 0.7215963, "learning_rate": 3.98365414085822e-06, "loss": 0.80404615, "num_input_tokens_seen": 24684210, "router_z_loss_clip": 4.86328125, "router_z_loss_mlp": 0.55908203, "step": 1156, "time_per_iteration": 2.713042736053467 }, { "auxiliary_loss_clip": 0.06917889, "auxiliary_loss_mlp": 0.01331104, "balance_loss_clip": 0.06421691, "balance_loss_mlp": 0.01269878, "epoch": 0.06956260333684053, "flos": 22277818811520.0, "grad_norm": 2.887061236750819, "language_loss": 0.76043659, "learning_rate": 3.98360441205484e-06, "loss": 0.8429265, "num_input_tokens_seen": 24702490, "router_z_loss_clip": 4.9609375, "router_z_loss_mlp": 0.61230469, "step": 1157, "time_per_iteration": 2.654886484146118 }, { "auxiliary_loss_clip": 0.06909375, "auxiliary_loss_mlp": 0.01327581, "balance_loss_clip": 0.06421179, "balance_loss_mlp": 0.01273794, "epoch": 0.0696227265895085, "flos": 29689442409600.0, "grad_norm": 5.992335092116119, "language_loss": 0.73579061, "learning_rate": 3.983554608032982e-06, "loss": 0.81816018, "num_input_tokens_seen": 24724340, "router_z_loss_clip": 4.8828125, "router_z_loss_mlp": 0.53808594, "step": 1158, "time_per_iteration": 2.6934680938720703 }, { "auxiliary_loss_clip": 0.0692484, "auxiliary_loss_mlp": 0.01324232, "balance_loss_clip": 0.06434871, "balance_loss_mlp": 0.01269753, "epoch": 0.06968284984217646, "flos": 25531158562560.0, "grad_norm": 3.0763795676102674, "language_loss": 0.82210267, "learning_rate": 3.983504728794533e-06, "loss": 0.90459347, "num_input_tokens_seen": 24745550, "router_z_loss_clip": 4.890625, "router_z_loss_mlp": 0.54492188, "step": 1159, "time_per_iteration": 2.685081958770752 }, { "auxiliary_loss_clip": 0.06899732, "auxiliary_loss_mlp": 0.01331899, "balance_loss_clip": 0.06414417, "balance_loss_mlp": 0.01270578, "epoch": 0.06974297309484444, "flos": 20703454047360.0, "grad_norm": 4.091874649676539, "language_loss": 0.84442407, "learning_rate": 3.983454774341387e-06, "loss": 0.92674041, "num_input_tokens_seen": 24762575, "router_z_loss_clip": 4.85546875, "router_z_loss_mlp": 0.61376953, "step": 1160, "time_per_iteration": 2.6008927822113037 }, { "auxiliary_loss_clip": 0.06901676, "auxiliary_loss_mlp": 0.01320458, "balance_loss_clip": 0.06423983, "balance_loss_mlp": 0.01264954, "epoch": 0.0698030963475124, "flos": 26512397400960.0, "grad_norm": 2.552240619739084, "language_loss": 0.78166449, "learning_rate": 3.983404744675437e-06, "loss": 0.86388588, "num_input_tokens_seen": 24782605, "router_z_loss_clip": 4.77734375, "router_z_loss_mlp": 0.55566406, "step": 1161, "time_per_iteration": 4.093398809432983 }, { "auxiliary_loss_clip": 0.06898399, "auxiliary_loss_mlp": 0.01323148, "balance_loss_clip": 0.06412846, "balance_loss_mlp": 0.01269003, "epoch": 0.06986321960018037, "flos": 23047279655040.0, "grad_norm": 2.1528566440313894, "language_loss": 0.84595072, "learning_rate": 3.9833546397985794e-06, "loss": 0.92816621, "num_input_tokens_seen": 24802910, "router_z_loss_clip": 4.84375, "router_z_loss_mlp": 0.54125977, "step": 1162, "time_per_iteration": 2.635493040084839 }, { "auxiliary_loss_clip": 0.0686815, "auxiliary_loss_mlp": 0.01321862, "balance_loss_clip": 0.06400825, "balance_loss_mlp": 0.0126848, "epoch": 0.06992334285284833, "flos": 28592356901760.0, "grad_norm": 7.317210725319534, "language_loss": 0.82033539, "learning_rate": 3.983304459712716e-06, "loss": 0.90223545, "num_input_tokens_seen": 24823305, "router_z_loss_clip": 4.66796875, "router_z_loss_mlp": 0.53369141, "step": 1163, "time_per_iteration": 2.8104028701782227 }, { "auxiliary_loss_clip": 0.06905609, "auxiliary_loss_mlp": 0.01323806, "balance_loss_clip": 0.0641627, "balance_loss_mlp": 0.01264631, "epoch": 0.06998346610551631, "flos": 20601694863360.0, "grad_norm": 3.529280919372945, "language_loss": 0.80540574, "learning_rate": 3.983254204419749e-06, "loss": 0.88769984, "num_input_tokens_seen": 24842155, "router_z_loss_clip": 4.89453125, "router_z_loss_mlp": 0.59106445, "step": 1164, "time_per_iteration": 4.081567049026489 }, { "auxiliary_loss_clip": 0.06892569, "auxiliary_loss_mlp": 0.01321028, "balance_loss_clip": 0.06401532, "balance_loss_mlp": 0.01263665, "epoch": 0.07004358935818428, "flos": 22535437789440.0, "grad_norm": 1.9452851162354474, "language_loss": 0.74774206, "learning_rate": 3.983203873921583e-06, "loss": 0.82987797, "num_input_tokens_seen": 24862080, "router_z_loss_clip": 4.9140625, "router_z_loss_mlp": 0.57421875, "step": 1165, "time_per_iteration": 2.645021438598633 }, { "auxiliary_loss_clip": 0.06877379, "auxiliary_loss_mlp": 0.01321727, "balance_loss_clip": 0.06401758, "balance_loss_mlp": 0.01268107, "epoch": 0.07010371261085224, "flos": 28957646776320.0, "grad_norm": 4.896871570857254, "language_loss": 0.83339489, "learning_rate": 3.983153468220128e-06, "loss": 0.91538596, "num_input_tokens_seen": 24886165, "router_z_loss_clip": 4.76171875, "router_z_loss_mlp": 0.53588867, "step": 1166, "time_per_iteration": 2.7016406059265137 }, { "auxiliary_loss_clip": 0.06879304, "auxiliary_loss_mlp": 0.01317898, "balance_loss_clip": 0.06393659, "balance_loss_mlp": 0.01264087, "epoch": 0.07016383586352022, "flos": 23665870022400.0, "grad_norm": 3.6236312953246768, "language_loss": 0.86401725, "learning_rate": 3.983102987317295e-06, "loss": 0.94598925, "num_input_tokens_seen": 24905775, "router_z_loss_clip": 4.859375, "router_z_loss_mlp": 0.53857422, "step": 1167, "time_per_iteration": 5.501924276351929 }, { "auxiliary_loss_clip": 0.06851912, "auxiliary_loss_mlp": 0.0131528, "balance_loss_clip": 0.06375459, "balance_loss_mlp": 0.01262542, "epoch": 0.07022395911618819, "flos": 19798258389120.0, "grad_norm": 3.2154934885381388, "language_loss": 0.92725998, "learning_rate": 3.983052431214997e-06, "loss": 1.00893188, "num_input_tokens_seen": 24924295, "router_z_loss_clip": 4.76171875, "router_z_loss_mlp": 0.52807617, "step": 1168, "time_per_iteration": 2.5987935066223145 }, { "auxiliary_loss_clip": 0.06884024, "auxiliary_loss_mlp": 0.0133314, "balance_loss_clip": 0.06376383, "balance_loss_mlp": 0.01269339, "epoch": 0.07028408236885615, "flos": 21695551989120.0, "grad_norm": 6.303472096748922, "language_loss": 0.91418755, "learning_rate": 3.983001799915153e-06, "loss": 0.99635917, "num_input_tokens_seen": 24943210, "router_z_loss_clip": 5.0703125, "router_z_loss_mlp": 0.63769531, "step": 1169, "time_per_iteration": 2.6322898864746094 }, { "auxiliary_loss_clip": 0.06867345, "auxiliary_loss_mlp": 0.01329821, "balance_loss_clip": 0.06372204, "balance_loss_mlp": 0.01272147, "epoch": 0.07034420562152413, "flos": 25637445866880.0, "grad_norm": 9.383009066511542, "language_loss": 0.86303312, "learning_rate": 3.982951093419681e-06, "loss": 0.94500482, "num_input_tokens_seen": 24960360, "router_z_loss_clip": 4.9453125, "router_z_loss_mlp": 0.57641602, "step": 1170, "time_per_iteration": 2.6394033432006836 }, { "auxiliary_loss_clip": 0.06836584, "auxiliary_loss_mlp": 0.01329306, "balance_loss_clip": 0.06372304, "balance_loss_mlp": 0.01278284, "epoch": 0.0704043288741921, "flos": 20816198115840.0, "grad_norm": 2.9425189733608232, "language_loss": 0.77598417, "learning_rate": 3.982900311730506e-06, "loss": 0.85764313, "num_input_tokens_seen": 24978290, "router_z_loss_clip": 4.640625, "router_z_loss_mlp": 0.51049805, "step": 1171, "time_per_iteration": 2.619486093521118 }, { "auxiliary_loss_clip": 0.06840634, "auxiliary_loss_mlp": 0.01313111, "balance_loss_clip": 0.06366189, "balance_loss_mlp": 0.01260063, "epoch": 0.07046445212686006, "flos": 25600241854080.0, "grad_norm": 6.977448070554172, "language_loss": 0.91686797, "learning_rate": 3.9828494548495514e-06, "loss": 0.99840546, "num_input_tokens_seen": 24997055, "router_z_loss_clip": 4.7421875, "router_z_loss_mlp": 0.53027344, "step": 1172, "time_per_iteration": 2.6406054496765137 }, { "auxiliary_loss_clip": 0.06871365, "auxiliary_loss_mlp": 0.01325018, "balance_loss_clip": 0.0637502, "balance_loss_mlp": 0.01265938, "epoch": 0.07052457537952803, "flos": 25564086017280.0, "grad_norm": 2.241964168121463, "language_loss": 0.83845216, "learning_rate": 3.982798522778748e-06, "loss": 0.920416, "num_input_tokens_seen": 25017490, "router_z_loss_clip": 4.9609375, "router_z_loss_mlp": 0.59033203, "step": 1173, "time_per_iteration": 2.6577799320220947 }, { "auxiliary_loss_clip": 0.06853268, "auxiliary_loss_mlp": 0.01323751, "balance_loss_clip": 0.06371342, "balance_loss_mlp": 0.01268962, "epoch": 0.070584698632196, "flos": 17974450419840.0, "grad_norm": 8.265892198652372, "language_loss": 0.84469044, "learning_rate": 3.9827475155200245e-06, "loss": 0.92646062, "num_input_tokens_seen": 25035660, "router_z_loss_clip": 4.8125, "router_z_loss_mlp": 0.54785156, "step": 1174, "time_per_iteration": 2.6084201335906982 }, { "auxiliary_loss_clip": 0.06856353, "auxiliary_loss_mlp": 0.01332619, "balance_loss_clip": 0.06376255, "balance_loss_mlp": 0.01278903, "epoch": 0.07064482188486397, "flos": 25377353193600.0, "grad_norm": 5.722530506177177, "language_loss": 0.87051809, "learning_rate": 3.982696433075317e-06, "loss": 0.95240784, "num_input_tokens_seen": 25054785, "router_z_loss_clip": 4.796875, "router_z_loss_mlp": 0.53710938, "step": 1175, "time_per_iteration": 2.652148723602295 }, { "auxiliary_loss_clip": 0.06867881, "auxiliary_loss_mlp": 0.01335702, "balance_loss_clip": 0.06384303, "balance_loss_mlp": 0.01281462, "epoch": 0.07070494513753194, "flos": 24906782263680.0, "grad_norm": 3.3256246445725224, "language_loss": 0.86090517, "learning_rate": 3.982645275446563e-06, "loss": 0.94294107, "num_input_tokens_seen": 25075180, "router_z_loss_clip": 4.8359375, "router_z_loss_mlp": 0.54223633, "step": 1176, "time_per_iteration": 2.6980369091033936 }, { "auxiliary_loss_clip": 0.06858423, "auxiliary_loss_mlp": 0.01332548, "balance_loss_clip": 0.06383165, "balance_loss_mlp": 0.01277545, "epoch": 0.07076506839019991, "flos": 22343715648000.0, "grad_norm": 3.181632814750577, "language_loss": 0.7633518, "learning_rate": 3.982594042635701e-06, "loss": 0.84526151, "num_input_tokens_seen": 25093035, "router_z_loss_clip": 4.74609375, "router_z_loss_mlp": 0.55029297, "step": 1177, "time_per_iteration": 2.6611247062683105 }, { "auxiliary_loss_clip": 0.06882346, "auxiliary_loss_mlp": 0.01332906, "balance_loss_clip": 0.06388259, "balance_loss_mlp": 0.01273539, "epoch": 0.07082519164286788, "flos": 18666694126080.0, "grad_norm": 3.8787742845430264, "language_loss": 0.8722254, "learning_rate": 3.982542734644673e-06, "loss": 0.95437789, "num_input_tokens_seen": 25112520, "router_z_loss_clip": 4.94140625, "router_z_loss_mlp": 0.59326172, "step": 1178, "time_per_iteration": 2.6391470432281494 }, { "auxiliary_loss_clip": 0.06753771, "auxiliary_loss_mlp": 0.01434408, "balance_loss_clip": 0.06439468, "balance_loss_mlp": 0.01403795, "epoch": 0.07088531489553584, "flos": 63674691615360.0, "grad_norm": 0.8482464371644913, "language_loss": 0.63441557, "learning_rate": 3.982491351475427e-06, "loss": 0.71629739, "num_input_tokens_seen": 25177760, "router_z_loss_clip": 3.140625, "router_z_loss_mlp": 0.30566406, "step": 1179, "time_per_iteration": 3.3645083904266357 }, { "auxiliary_loss_clip": 0.06882965, "auxiliary_loss_mlp": 0.01349842, "balance_loss_clip": 0.06385313, "balance_loss_mlp": 0.01290189, "epoch": 0.07094543814820382, "flos": 21577902456960.0, "grad_norm": 6.197207891665816, "language_loss": 0.87149632, "learning_rate": 3.98243989312991e-06, "loss": 0.9538244, "num_input_tokens_seen": 25195260, "router_z_loss_clip": 4.96875, "router_z_loss_mlp": 0.59619141, "step": 1180, "time_per_iteration": 2.6418955326080322 }, { "auxiliary_loss_clip": 0.06876922, "auxiliary_loss_mlp": 0.01334026, "balance_loss_clip": 0.06381802, "balance_loss_mlp": 0.01272467, "epoch": 0.07100556140087179, "flos": 22096326867840.0, "grad_norm": 3.0290555865461157, "language_loss": 0.9036032, "learning_rate": 3.982388359610074e-06, "loss": 0.98571265, "num_input_tokens_seen": 25212740, "router_z_loss_clip": 4.94921875, "router_z_loss_mlp": 0.61523438, "step": 1181, "time_per_iteration": 2.6138322353363037 }, { "auxiliary_loss_clip": 0.06865921, "auxiliary_loss_mlp": 0.01327606, "balance_loss_clip": 0.06392993, "balance_loss_mlp": 0.01273723, "epoch": 0.07106568465353975, "flos": 47933056471680.0, "grad_norm": 3.8822883456186266, "language_loss": 0.85964346, "learning_rate": 3.9823367509178725e-06, "loss": 0.94157869, "num_input_tokens_seen": 25236420, "router_z_loss_clip": 4.73046875, "router_z_loss_mlp": 0.5390625, "step": 1182, "time_per_iteration": 2.8353023529052734 }, { "auxiliary_loss_clip": 0.0688815, "auxiliary_loss_mlp": 0.01348918, "balance_loss_clip": 0.06392165, "balance_loss_mlp": 0.01290696, "epoch": 0.07112580790620772, "flos": 23447551409280.0, "grad_norm": 3.4789933372588937, "language_loss": 0.82617205, "learning_rate": 3.982285067055262e-06, "loss": 0.90854275, "num_input_tokens_seen": 25255120, "router_z_loss_clip": 4.96875, "router_z_loss_mlp": 0.58251953, "step": 1183, "time_per_iteration": 2.6314446926116943 }, { "auxiliary_loss_clip": 0.06898059, "auxiliary_loss_mlp": 0.013505, "balance_loss_clip": 0.06398439, "balance_loss_mlp": 0.01287081, "epoch": 0.0711859311588757, "flos": 31877030880000.0, "grad_norm": 5.235436518416188, "language_loss": 0.81538922, "learning_rate": 3.982233308024204e-06, "loss": 0.89787483, "num_input_tokens_seen": 25275150, "router_z_loss_clip": 5.0, "router_z_loss_mlp": 0.6340332, "step": 1184, "time_per_iteration": 2.7040364742279053 }, { "auxiliary_loss_clip": 0.06870694, "auxiliary_loss_mlp": 0.01351984, "balance_loss_clip": 0.06397763, "balance_loss_mlp": 0.01297076, "epoch": 0.07124605441154366, "flos": 19616514883200.0, "grad_norm": 4.742647039719274, "language_loss": 0.7951445, "learning_rate": 3.98218147382666e-06, "loss": 0.87737131, "num_input_tokens_seen": 25293680, "router_z_loss_clip": 4.7265625, "router_z_loss_mlp": 0.54931641, "step": 1185, "time_per_iteration": 2.6206746101379395 }, { "auxiliary_loss_clip": 0.06888313, "auxiliary_loss_mlp": 0.01368214, "balance_loss_clip": 0.06400357, "balance_loss_mlp": 0.01313688, "epoch": 0.07130617766421163, "flos": 14689776441600.0, "grad_norm": 11.47881569137389, "language_loss": 0.67963564, "learning_rate": 3.982129564464596e-06, "loss": 0.76220095, "num_input_tokens_seen": 25310050, "router_z_loss_clip": 4.8828125, "router_z_loss_mlp": 0.54516602, "step": 1186, "time_per_iteration": 2.5778346061706543 }, { "auxiliary_loss_clip": 0.0687844, "auxiliary_loss_mlp": 0.01368402, "balance_loss_clip": 0.06402063, "balance_loss_mlp": 0.01314591, "epoch": 0.07136630091687961, "flos": 26075131269120.0, "grad_norm": 2.94716626931024, "language_loss": 0.71699858, "learning_rate": 3.98207757993998e-06, "loss": 0.79946697, "num_input_tokens_seen": 25331020, "router_z_loss_clip": 4.75390625, "router_z_loss_mlp": 0.53808594, "step": 1187, "time_per_iteration": 2.6725175380706787 }, { "auxiliary_loss_clip": 0.06882382, "auxiliary_loss_mlp": 0.01388595, "balance_loss_clip": 0.06411749, "balance_loss_mlp": 0.01334497, "epoch": 0.07142642416954757, "flos": 15674621005440.0, "grad_norm": 14.608807378833697, "language_loss": 0.80367273, "learning_rate": 3.9820255202547845e-06, "loss": 0.88638258, "num_input_tokens_seen": 25347875, "router_z_loss_clip": 4.70703125, "router_z_loss_mlp": 0.54101562, "step": 1188, "time_per_iteration": 2.6342875957489014 }, { "auxiliary_loss_clip": 0.06892011, "auxiliary_loss_mlp": 0.01386527, "balance_loss_clip": 0.06412277, "balance_loss_mlp": 0.01326517, "epoch": 0.07148654742221554, "flos": 19761389792640.0, "grad_norm": 3.709729171820191, "language_loss": 0.87619191, "learning_rate": 3.981973385410981e-06, "loss": 0.95897728, "num_input_tokens_seen": 25366715, "router_z_loss_clip": 4.80078125, "router_z_loss_mlp": 0.60058594, "step": 1189, "time_per_iteration": 2.5988876819610596 }, { "auxiliary_loss_clip": 0.06898282, "auxiliary_loss_mlp": 0.01417272, "balance_loss_clip": 0.06424505, "balance_loss_mlp": 0.01360576, "epoch": 0.07154667067488352, "flos": 23477669752320.0, "grad_norm": 35.8986958457736, "language_loss": 0.79121357, "learning_rate": 3.9819211754105494e-06, "loss": 0.87436914, "num_input_tokens_seen": 25385450, "router_z_loss_clip": 4.7265625, "router_z_loss_mlp": 0.56665039, "step": 1190, "time_per_iteration": 2.644943952560425 }, { "auxiliary_loss_clip": 0.06924173, "auxiliary_loss_mlp": 0.01422717, "balance_loss_clip": 0.06431466, "balance_loss_mlp": 0.0135348, "epoch": 0.07160679392755148, "flos": 18338859826560.0, "grad_norm": 10.829201720765539, "language_loss": 0.78015763, "learning_rate": 3.981868890255468e-06, "loss": 0.86362654, "num_input_tokens_seen": 25403940, "router_z_loss_clip": 4.92578125, "router_z_loss_mlp": 0.69287109, "step": 1191, "time_per_iteration": 2.619168281555176 }, { "auxiliary_loss_clip": 0.06909736, "auxiliary_loss_mlp": 0.01418808, "balance_loss_clip": 0.06421021, "balance_loss_mlp": 0.01358726, "epoch": 0.07166691718021945, "flos": 17752484154240.0, "grad_norm": 3.776798925886064, "language_loss": 0.77002394, "learning_rate": 3.981816529947719e-06, "loss": 0.85330939, "num_input_tokens_seen": 25420410, "router_z_loss_clip": 4.8828125, "router_z_loss_mlp": 0.60058594, "step": 1192, "time_per_iteration": 2.6235485076904297 }, { "auxiliary_loss_clip": 0.06910318, "auxiliary_loss_mlp": 0.01424041, "balance_loss_clip": 0.06422938, "balance_loss_mlp": 0.01361099, "epoch": 0.07172704043288743, "flos": 22457885235840.0, "grad_norm": 4.002967356587833, "language_loss": 0.80233216, "learning_rate": 3.9817640944892896e-06, "loss": 0.88567579, "num_input_tokens_seen": 25439415, "router_z_loss_clip": 4.87109375, "router_z_loss_mlp": 0.63012695, "step": 1193, "time_per_iteration": 2.6469383239746094 }, { "auxiliary_loss_clip": 0.06913844, "auxiliary_loss_mlp": 0.01420261, "balance_loss_clip": 0.06437434, "balance_loss_mlp": 0.01360108, "epoch": 0.07178716368555539, "flos": 23228981233920.0, "grad_norm": 3.0013195184265467, "language_loss": 0.87965262, "learning_rate": 3.981711583882166e-06, "loss": 0.96299368, "num_input_tokens_seen": 25458715, "router_z_loss_clip": 4.76953125, "router_z_loss_mlp": 0.6015625, "step": 1194, "time_per_iteration": 2.6223418712615967 }, { "auxiliary_loss_clip": 0.06897493, "auxiliary_loss_mlp": 0.01422468, "balance_loss_clip": 0.06418943, "balance_loss_mlp": 0.01365868, "epoch": 0.07184728693822336, "flos": 25157064009600.0, "grad_norm": 2.89064205172596, "language_loss": 0.83118343, "learning_rate": 3.981658998128341e-06, "loss": 0.91438305, "num_input_tokens_seen": 25477985, "router_z_loss_clip": 4.78515625, "router_z_loss_mlp": 0.56591797, "step": 1195, "time_per_iteration": 2.664261817932129 }, { "auxiliary_loss_clip": 0.06906717, "auxiliary_loss_mlp": 0.01404394, "balance_loss_clip": 0.06432495, "balance_loss_mlp": 0.01347102, "epoch": 0.07190741019089132, "flos": 22717894055040.0, "grad_norm": 3.4978409826972054, "language_loss": 0.81433398, "learning_rate": 3.981606337229808e-06, "loss": 0.89744508, "num_input_tokens_seen": 25497110, "router_z_loss_clip": 4.7421875, "router_z_loss_mlp": 0.57299805, "step": 1196, "time_per_iteration": 2.6410884857177734 }, { "auxiliary_loss_clip": 0.06910779, "auxiliary_loss_mlp": 0.01410933, "balance_loss_clip": 0.06426223, "balance_loss_mlp": 0.01351662, "epoch": 0.0719675334435593, "flos": 29357247697920.0, "grad_norm": 3.4734924449202884, "language_loss": 0.73718917, "learning_rate": 3.9815536011885655e-06, "loss": 0.82040632, "num_input_tokens_seen": 25516555, "router_z_loss_clip": 4.83984375, "router_z_loss_mlp": 0.59277344, "step": 1197, "time_per_iteration": 2.676809310913086 }, { "auxiliary_loss_clip": 0.06878561, "auxiliary_loss_mlp": 0.01390033, "balance_loss_clip": 0.06409752, "balance_loss_mlp": 0.01338535, "epoch": 0.07202765669622727, "flos": 17645609871360.0, "grad_norm": 2.5834742678485583, "language_loss": 0.87388885, "learning_rate": 3.98150079000661e-06, "loss": 0.9565748, "num_input_tokens_seen": 25533895, "router_z_loss_clip": 4.68359375, "router_z_loss_mlp": 0.51489258, "step": 1198, "time_per_iteration": 2.5951528549194336 }, { "auxiliary_loss_clip": 0.06893395, "auxiliary_loss_mlp": 0.01370531, "balance_loss_clip": 0.06420478, "balance_loss_mlp": 0.0132001, "epoch": 0.07208777994889523, "flos": 21440448633600.0, "grad_norm": 2.7683045044489996, "language_loss": 0.85612297, "learning_rate": 3.981447903685947e-06, "loss": 0.93876219, "num_input_tokens_seen": 25554195, "router_z_loss_clip": 4.72265625, "router_z_loss_mlp": 0.50488281, "step": 1199, "time_per_iteration": 2.6243205070495605 }, { "auxiliary_loss_clip": 0.06886448, "auxiliary_loss_mlp": 0.01374455, "balance_loss_clip": 0.06407969, "balance_loss_mlp": 0.01320954, "epoch": 0.07214790320156321, "flos": 26947776816000.0, "grad_norm": 2.7107175527661327, "language_loss": 0.7864269, "learning_rate": 3.981394942228581e-06, "loss": 0.86903596, "num_input_tokens_seen": 25574155, "router_z_loss_clip": 4.77734375, "router_z_loss_mlp": 0.53540039, "step": 1200, "time_per_iteration": 2.6824848651885986 }, { "auxiliary_loss_clip": 0.0687978, "auxiliary_loss_mlp": 0.01362568, "balance_loss_clip": 0.06405678, "balance_loss_mlp": 0.01309949, "epoch": 0.07220802645423118, "flos": 23886997747200.0, "grad_norm": 5.164449098320939, "language_loss": 0.8438189, "learning_rate": 3.98134190563652e-06, "loss": 0.92624235, "num_input_tokens_seen": 25592735, "router_z_loss_clip": 4.73828125, "router_z_loss_mlp": 0.52587891, "step": 1201, "time_per_iteration": 4.078294038772583 }, { "auxiliary_loss_clip": 0.06890298, "auxiliary_loss_mlp": 0.01370488, "balance_loss_clip": 0.06402189, "balance_loss_mlp": 0.0131446, "epoch": 0.07226814970689914, "flos": 19249464072960.0, "grad_norm": 3.6590496671728236, "language_loss": 0.71281874, "learning_rate": 3.981288793911775e-06, "loss": 0.79542667, "num_input_tokens_seen": 25611510, "router_z_loss_clip": 4.87890625, "router_z_loss_mlp": 0.55957031, "step": 1202, "time_per_iteration": 2.626877546310425 }, { "auxiliary_loss_clip": 0.06871038, "auxiliary_loss_mlp": 0.01339206, "balance_loss_clip": 0.06393398, "balance_loss_mlp": 0.01284989, "epoch": 0.07232827295956712, "flos": 19178074794240.0, "grad_norm": 6.976443312082852, "language_loss": 0.88956237, "learning_rate": 3.98123560705636e-06, "loss": 0.97166479, "num_input_tokens_seen": 25629560, "router_z_loss_clip": 4.77734375, "router_z_loss_mlp": 0.54174805, "step": 1203, "time_per_iteration": 4.0787742137908936 }, { "auxiliary_loss_clip": 0.06874412, "auxiliary_loss_mlp": 0.01357009, "balance_loss_clip": 0.06389236, "balance_loss_mlp": 0.01297285, "epoch": 0.07238839621223508, "flos": 17645567944320.0, "grad_norm": 2.947767566626719, "language_loss": 0.81537277, "learning_rate": 3.981182345072293e-06, "loss": 0.89768696, "num_input_tokens_seen": 25648330, "router_z_loss_clip": 4.84765625, "router_z_loss_mlp": 0.59765625, "step": 1204, "time_per_iteration": 2.6058878898620605 }, { "auxiliary_loss_clip": 0.06871112, "auxiliary_loss_mlp": 0.01344359, "balance_loss_clip": 0.06394836, "balance_loss_mlp": 0.01287639, "epoch": 0.07244851946490305, "flos": 28299797971200.0, "grad_norm": 2.172885938522659, "language_loss": 0.84463763, "learning_rate": 3.981129007961593e-06, "loss": 0.92679238, "num_input_tokens_seen": 25669470, "router_z_loss_clip": 4.7578125, "router_z_loss_mlp": 0.56738281, "step": 1205, "time_per_iteration": 2.6729061603546143 }, { "auxiliary_loss_clip": 0.06871061, "auxiliary_loss_mlp": 0.01338105, "balance_loss_clip": 0.06389042, "balance_loss_mlp": 0.01282363, "epoch": 0.07250864271757101, "flos": 22571383991040.0, "grad_norm": 3.7809877563113616, "language_loss": 0.77643365, "learning_rate": 3.981075595726283e-06, "loss": 0.85852534, "num_input_tokens_seen": 25690470, "router_z_loss_clip": 4.8125, "router_z_loss_mlp": 0.55737305, "step": 1206, "time_per_iteration": 4.08908486366272 }, { "auxiliary_loss_clip": 0.06853853, "auxiliary_loss_mlp": 0.01325854, "balance_loss_clip": 0.06383866, "balance_loss_mlp": 0.01271232, "epoch": 0.072568765970239, "flos": 21768869911680.0, "grad_norm": 2.4812776574062245, "language_loss": 0.79747766, "learning_rate": 3.981022108368387e-06, "loss": 0.87927473, "num_input_tokens_seen": 25709205, "router_z_loss_clip": 4.6953125, "router_z_loss_mlp": 0.54638672, "step": 1207, "time_per_iteration": 4.012465715408325 }, { "auxiliary_loss_clip": 0.06846504, "auxiliary_loss_mlp": 0.01327267, "balance_loss_clip": 0.06375916, "balance_loss_mlp": 0.01275316, "epoch": 0.07262888922290696, "flos": 25526672369280.0, "grad_norm": 4.02036902085487, "language_loss": 0.81935424, "learning_rate": 3.9809685458899345e-06, "loss": 0.90109193, "num_input_tokens_seen": 25728485, "router_z_loss_clip": 4.69921875, "router_z_loss_mlp": 0.51928711, "step": 1208, "time_per_iteration": 2.6995739936828613 }, { "auxiliary_loss_clip": 0.06826167, "auxiliary_loss_mlp": 0.01328356, "balance_loss_clip": 0.06369339, "balance_loss_mlp": 0.0127662, "epoch": 0.07268901247557492, "flos": 21252080655360.0, "grad_norm": 3.182698756559801, "language_loss": 0.80772579, "learning_rate": 3.980914908292955e-06, "loss": 0.88927102, "num_input_tokens_seen": 25747730, "router_z_loss_clip": 4.5625, "router_z_loss_mlp": 0.51831055, "step": 1209, "time_per_iteration": 2.728100299835205 }, { "auxiliary_loss_clip": 0.06844154, "auxiliary_loss_mlp": 0.01331755, "balance_loss_clip": 0.06380506, "balance_loss_mlp": 0.01274868, "epoch": 0.0727491357282429, "flos": 25485611068800.0, "grad_norm": 3.27029233839345, "language_loss": 0.83587134, "learning_rate": 3.980861195579486e-06, "loss": 0.91763043, "num_input_tokens_seen": 25768050, "router_z_loss_clip": 4.63671875, "router_z_loss_mlp": 0.56982422, "step": 1210, "time_per_iteration": 2.643608570098877 }, { "auxiliary_loss_clip": 0.06830629, "auxiliary_loss_mlp": 0.01332824, "balance_loss_clip": 0.06369007, "balance_loss_mlp": 0.01278273, "epoch": 0.07280925898091087, "flos": 24469054934400.0, "grad_norm": 2.676707736329508, "language_loss": 0.86671269, "learning_rate": 3.98080740775156e-06, "loss": 0.94834721, "num_input_tokens_seen": 25787985, "router_z_loss_clip": 4.6171875, "router_z_loss_mlp": 0.54516602, "step": 1211, "time_per_iteration": 2.6407406330108643 }, { "auxiliary_loss_clip": 0.06835641, "auxiliary_loss_mlp": 0.01321076, "balance_loss_clip": 0.06366418, "balance_loss_mlp": 0.0126581, "epoch": 0.07286938223357883, "flos": 18292725354240.0, "grad_norm": 3.199024005911181, "language_loss": 0.93303943, "learning_rate": 3.98075354481122e-06, "loss": 1.01460648, "num_input_tokens_seen": 25803620, "router_z_loss_clip": 4.6875, "router_z_loss_mlp": 0.55249023, "step": 1212, "time_per_iteration": 2.5741117000579834 }, { "auxiliary_loss_clip": 0.06840718, "auxiliary_loss_mlp": 0.01318805, "balance_loss_clip": 0.06367832, "balance_loss_mlp": 0.01263731, "epoch": 0.07292950548624681, "flos": 21221123771520.0, "grad_norm": 3.2875295333094394, "language_loss": 0.7479192, "learning_rate": 3.9806996067605055e-06, "loss": 0.82951444, "num_input_tokens_seen": 25823315, "router_z_loss_clip": 4.7265625, "router_z_loss_mlp": 0.55126953, "step": 1213, "time_per_iteration": 2.6142115592956543 }, { "auxiliary_loss_clip": 0.06857519, "auxiliary_loss_mlp": 0.01313161, "balance_loss_clip": 0.06378648, "balance_loss_mlp": 0.01258777, "epoch": 0.07298962873891478, "flos": 24648492453120.0, "grad_norm": 2.5656928758625166, "language_loss": 0.86178505, "learning_rate": 3.980645593601465e-06, "loss": 0.94349194, "num_input_tokens_seen": 25842605, "router_z_loss_clip": 4.78515625, "router_z_loss_mlp": 0.54443359, "step": 1214, "time_per_iteration": 2.6660587787628174 }, { "auxiliary_loss_clip": 0.06842355, "auxiliary_loss_mlp": 0.01312925, "balance_loss_clip": 0.06372207, "balance_loss_mlp": 0.01261546, "epoch": 0.07304975199158274, "flos": 27060101614080.0, "grad_norm": 3.226654049323809, "language_loss": 0.86972737, "learning_rate": 3.980591505336144e-06, "loss": 0.95128012, "num_input_tokens_seen": 25863030, "router_z_loss_clip": 4.69921875, "router_z_loss_mlp": 0.5144043, "step": 1215, "time_per_iteration": 2.715561866760254 }, { "auxiliary_loss_clip": 0.0685658, "auxiliary_loss_mlp": 0.01323078, "balance_loss_clip": 0.06376137, "balance_loss_mlp": 0.01264904, "epoch": 0.07310987524425071, "flos": 33558353781120.0, "grad_norm": 2.4898176980612643, "language_loss": 0.8381207, "learning_rate": 3.980537341966595e-06, "loss": 0.91991735, "num_input_tokens_seen": 25888015, "router_z_loss_clip": 4.80078125, "router_z_loss_mlp": 0.58203125, "step": 1216, "time_per_iteration": 2.761395215988159 }, { "auxiliary_loss_clip": 0.06858741, "auxiliary_loss_mlp": 0.01312063, "balance_loss_clip": 0.06379574, "balance_loss_mlp": 0.01258061, "epoch": 0.07316999849691869, "flos": 28118473735680.0, "grad_norm": 3.217269239025195, "language_loss": 0.7803812, "learning_rate": 3.980483103494872e-06, "loss": 0.86208928, "num_input_tokens_seen": 25908660, "router_z_loss_clip": 4.7890625, "router_z_loss_mlp": 0.5402832, "step": 1217, "time_per_iteration": 2.6801400184631348 }, { "auxiliary_loss_clip": 0.06849602, "auxiliary_loss_mlp": 0.01313633, "balance_loss_clip": 0.06384354, "balance_loss_mlp": 0.01264066, "epoch": 0.07323012174958665, "flos": 14397888343680.0, "grad_norm": 39.13711675997337, "language_loss": 0.88241172, "learning_rate": 3.98042878992303e-06, "loss": 0.96404409, "num_input_tokens_seen": 25927215, "router_z_loss_clip": 4.6484375, "router_z_loss_mlp": 0.49560547, "step": 1218, "time_per_iteration": 2.597013235092163 }, { "auxiliary_loss_clip": 0.0686431, "auxiliary_loss_mlp": 0.01322597, "balance_loss_clip": 0.06393274, "balance_loss_mlp": 0.01271361, "epoch": 0.07329024500225462, "flos": 21622862972160.0, "grad_norm": 2.5964079725884974, "language_loss": 0.88994539, "learning_rate": 3.9803744012531305e-06, "loss": 0.97181451, "num_input_tokens_seen": 25945500, "router_z_loss_clip": 4.71484375, "router_z_loss_mlp": 0.51220703, "step": 1219, "time_per_iteration": 2.618020534515381 }, { "auxiliary_loss_clip": 0.06863548, "auxiliary_loss_mlp": 0.01324772, "balance_loss_clip": 0.06403221, "balance_loss_mlp": 0.0127325, "epoch": 0.0733503682549226, "flos": 13229078140800.0, "grad_norm": 3.5592379482998533, "language_loss": 0.86050498, "learning_rate": 3.980319937487235e-06, "loss": 0.94238818, "num_input_tokens_seen": 25963105, "router_z_loss_clip": 4.6015625, "router_z_loss_mlp": 0.515625, "step": 1220, "time_per_iteration": 2.644991636276245 }, { "auxiliary_loss_clip": 0.0688997, "auxiliary_loss_mlp": 0.01329115, "balance_loss_clip": 0.0641377, "balance_loss_mlp": 0.01275066, "epoch": 0.07341049150759056, "flos": 20893331399040.0, "grad_norm": 2.7339883922920136, "language_loss": 0.80559975, "learning_rate": 3.98026539862741e-06, "loss": 0.88779062, "num_input_tokens_seen": 25981690, "router_z_loss_clip": 4.76171875, "router_z_loss_mlp": 0.53955078, "step": 1221, "time_per_iteration": 2.6410973072052 }, { "auxiliary_loss_clip": 0.06897819, "auxiliary_loss_mlp": 0.0132867, "balance_loss_clip": 0.06428786, "balance_loss_mlp": 0.0127741, "epoch": 0.07347061476025853, "flos": 15418972598400.0, "grad_norm": 3.909443749392169, "language_loss": 0.947281, "learning_rate": 3.980210784675722e-06, "loss": 1.0295459, "num_input_tokens_seen": 25999890, "router_z_loss_clip": 4.69140625, "router_z_loss_mlp": 0.51293945, "step": 1222, "time_per_iteration": 2.748973846435547 }, { "auxiliary_loss_clip": 0.06902704, "auxiliary_loss_mlp": 0.01333696, "balance_loss_clip": 0.06432158, "balance_loss_mlp": 0.01284057, "epoch": 0.0735307380129265, "flos": 11113591708800.0, "grad_norm": 5.2883428469703695, "language_loss": 0.93389761, "learning_rate": 3.980156095634242e-06, "loss": 1.01626158, "num_input_tokens_seen": 26016445, "router_z_loss_clip": 4.70703125, "router_z_loss_mlp": 0.49633789, "step": 1223, "time_per_iteration": 2.587402105331421 }, { "auxiliary_loss_clip": 0.06915934, "auxiliary_loss_mlp": 0.0134861, "balance_loss_clip": 0.06440357, "balance_loss_mlp": 0.01296611, "epoch": 0.07359086126559447, "flos": 23739146017920.0, "grad_norm": 2.7885974938768787, "language_loss": 0.84026349, "learning_rate": 3.980101331505045e-06, "loss": 0.9229089, "num_input_tokens_seen": 26036080, "router_z_loss_clip": 4.75, "router_z_loss_mlp": 0.51977539, "step": 1224, "time_per_iteration": 2.6632871627807617 }, { "auxiliary_loss_clip": 0.06918092, "auxiliary_loss_mlp": 0.01351729, "balance_loss_clip": 0.06435004, "balance_loss_mlp": 0.01296917, "epoch": 0.07365098451826244, "flos": 20999115578880.0, "grad_norm": 37.75827382532671, "language_loss": 0.85657191, "learning_rate": 3.9800464922902076e-06, "loss": 0.93927014, "num_input_tokens_seen": 26055805, "router_z_loss_clip": 4.8203125, "router_z_loss_mlp": 0.54882812, "step": 1225, "time_per_iteration": 2.6112029552459717 }, { "auxiliary_loss_clip": 0.06904247, "auxiliary_loss_mlp": 0.01358642, "balance_loss_clip": 0.06438492, "balance_loss_mlp": 0.01307787, "epoch": 0.0737111077709304, "flos": 19938982521600.0, "grad_norm": 3.6822232064514937, "language_loss": 0.92601639, "learning_rate": 3.979991577991808e-06, "loss": 1.0086453, "num_input_tokens_seen": 26073905, "router_z_loss_clip": 4.66015625, "router_z_loss_mlp": 0.50830078, "step": 1226, "time_per_iteration": 2.621450424194336 }, { "auxiliary_loss_clip": 0.06934807, "auxiliary_loss_mlp": 0.01372354, "balance_loss_clip": 0.06441566, "balance_loss_mlp": 0.0131418, "epoch": 0.07377123102359838, "flos": 16587153895680.0, "grad_norm": 8.479040130691532, "language_loss": 0.7918098, "learning_rate": 3.97993658861193e-06, "loss": 0.87488139, "num_input_tokens_seen": 26091700, "router_z_loss_clip": 4.9296875, "router_z_loss_mlp": 0.58203125, "step": 1227, "time_per_iteration": 2.580733299255371 }, { "auxiliary_loss_clip": 0.06903081, "auxiliary_loss_mlp": 0.01361871, "balance_loss_clip": 0.06438152, "balance_loss_mlp": 0.01309729, "epoch": 0.07383135427626634, "flos": 28335911880960.0, "grad_norm": 5.028489392723981, "language_loss": 0.8814519, "learning_rate": 3.9798815241526575e-06, "loss": 0.96410137, "num_input_tokens_seen": 26114105, "router_z_loss_clip": 4.6484375, "router_z_loss_mlp": 0.52172852, "step": 1228, "time_per_iteration": 2.6761841773986816 }, { "auxiliary_loss_clip": 0.06905236, "auxiliary_loss_mlp": 0.01350787, "balance_loss_clip": 0.06427941, "balance_loss_mlp": 0.01296094, "epoch": 0.07389147752893431, "flos": 20053277890560.0, "grad_norm": 6.688310891903159, "language_loss": 0.81804132, "learning_rate": 3.97982638461608e-06, "loss": 0.90060157, "num_input_tokens_seen": 26131165, "router_z_loss_clip": 4.76953125, "router_z_loss_mlp": 0.54663086, "step": 1229, "time_per_iteration": 2.6139299869537354 }, { "auxiliary_loss_clip": 0.0690244, "auxiliary_loss_mlp": 0.01364285, "balance_loss_clip": 0.06420869, "balance_loss_mlp": 0.01305252, "epoch": 0.07395160078160229, "flos": 18120038088960.0, "grad_norm": 2.999801988090645, "language_loss": 0.81441516, "learning_rate": 3.979771170004287e-06, "loss": 0.89708239, "num_input_tokens_seen": 26150040, "router_z_loss_clip": 4.80859375, "router_z_loss_mlp": 0.59057617, "step": 1230, "time_per_iteration": 2.5894291400909424 }, { "auxiliary_loss_clip": 0.06888521, "auxiliary_loss_mlp": 0.01345647, "balance_loss_clip": 0.06417054, "balance_loss_mlp": 0.01290572, "epoch": 0.07401172403427025, "flos": 23593726056960.0, "grad_norm": 2.3840019358848097, "language_loss": 0.83529967, "learning_rate": 3.979715880319372e-06, "loss": 0.9176414, "num_input_tokens_seen": 26169380, "router_z_loss_clip": 4.70703125, "router_z_loss_mlp": 0.55053711, "step": 1231, "time_per_iteration": 2.651292562484741 }, { "auxiliary_loss_clip": 0.06892464, "auxiliary_loss_mlp": 0.0136052, "balance_loss_clip": 0.06413259, "balance_loss_mlp": 0.01310071, "epoch": 0.07407184728693822, "flos": 26367187075200.0, "grad_norm": 4.80133050432031, "language_loss": 0.97683096, "learning_rate": 3.979660515563434e-06, "loss": 1.05936074, "num_input_tokens_seen": 26189420, "router_z_loss_clip": 4.7890625, "router_z_loss_mlp": 0.50512695, "step": 1232, "time_per_iteration": 2.672973394393921 }, { "auxiliary_loss_clip": 0.06865875, "auxiliary_loss_mlp": 0.01352417, "balance_loss_clip": 0.06407259, "balance_loss_mlp": 0.01302563, "epoch": 0.0741319705396062, "flos": 22207016511360.0, "grad_norm": 3.1638674467011034, "language_loss": 0.83532524, "learning_rate": 3.979605075738569e-06, "loss": 0.91750824, "num_input_tokens_seen": 26209300, "router_z_loss_clip": 4.5859375, "router_z_loss_mlp": 0.49853516, "step": 1233, "time_per_iteration": 2.6146700382232666 }, { "auxiliary_loss_clip": 0.06891397, "auxiliary_loss_mlp": 0.01330096, "balance_loss_clip": 0.06409904, "balance_loss_mlp": 0.01272494, "epoch": 0.07419209379227416, "flos": 39209508696960.0, "grad_norm": 2.9680495635096693, "language_loss": 0.72376931, "learning_rate": 3.979549560846883e-06, "loss": 0.80598426, "num_input_tokens_seen": 26228110, "router_z_loss_clip": 4.8125, "router_z_loss_mlp": 0.57617188, "step": 1234, "time_per_iteration": 2.7493910789489746 }, { "auxiliary_loss_clip": 0.06871308, "auxiliary_loss_mlp": 0.01337661, "balance_loss_clip": 0.06399044, "balance_loss_mlp": 0.01282753, "epoch": 0.07425221704494213, "flos": 22787899741440.0, "grad_norm": 2.305794609101285, "language_loss": 0.79099047, "learning_rate": 3.979493970890478e-06, "loss": 0.87308019, "num_input_tokens_seen": 26247020, "router_z_loss_clip": 4.7265625, "router_z_loss_mlp": 0.54907227, "step": 1235, "time_per_iteration": 2.617532730102539 }, { "auxiliary_loss_clip": 0.06855091, "auxiliary_loss_mlp": 0.01339543, "balance_loss_clip": 0.06399906, "balance_loss_mlp": 0.01287759, "epoch": 0.0743123402976101, "flos": 22279495893120.0, "grad_norm": 2.8508038703551377, "language_loss": 0.8519761, "learning_rate": 3.979438305871464e-06, "loss": 0.93392247, "num_input_tokens_seen": 26265750, "router_z_loss_clip": 4.5546875, "router_z_loss_mlp": 0.51782227, "step": 1236, "time_per_iteration": 2.623521089553833 }, { "auxiliary_loss_clip": 0.06881636, "auxiliary_loss_mlp": 0.01351028, "balance_loss_clip": 0.06404039, "balance_loss_mlp": 0.01294665, "epoch": 0.07437246355027807, "flos": 29322768942720.0, "grad_norm": 12.393897463637147, "language_loss": 0.78690493, "learning_rate": 3.979382565791951e-06, "loss": 0.86923152, "num_input_tokens_seen": 26287905, "router_z_loss_clip": 4.7734375, "router_z_loss_mlp": 0.56396484, "step": 1237, "time_per_iteration": 2.684342861175537 }, { "auxiliary_loss_clip": 0.06871857, "auxiliary_loss_mlp": 0.01338946, "balance_loss_clip": 0.06401341, "balance_loss_mlp": 0.01287924, "epoch": 0.07443258680294604, "flos": 31953367549440.0, "grad_norm": 2.39046537743176, "language_loss": 0.79400146, "learning_rate": 3.979326750654053e-06, "loss": 0.87610948, "num_input_tokens_seen": 26311795, "router_z_loss_clip": 4.69921875, "router_z_loss_mlp": 0.50976562, "step": 1238, "time_per_iteration": 2.731171131134033 }, { "auxiliary_loss_clip": 0.06877753, "auxiliary_loss_mlp": 0.01344706, "balance_loss_clip": 0.06392928, "balance_loss_mlp": 0.01283576, "epoch": 0.074492710055614, "flos": 22682031707520.0, "grad_norm": 3.097621136482105, "language_loss": 0.878618, "learning_rate": 3.9792708604598854e-06, "loss": 0.96084261, "num_input_tokens_seen": 26330330, "router_z_loss_clip": 4.8515625, "router_z_loss_mlp": 0.61108398, "step": 1239, "time_per_iteration": 2.6062469482421875 }, { "auxiliary_loss_clip": 0.06866613, "auxiliary_loss_mlp": 0.01332479, "balance_loss_clip": 0.06390389, "balance_loss_mlp": 0.01274114, "epoch": 0.07455283330828198, "flos": 21290752114560.0, "grad_norm": 2.2340078647068187, "language_loss": 0.91112554, "learning_rate": 3.979214895211569e-06, "loss": 0.9931165, "num_input_tokens_seen": 26348865, "router_z_loss_clip": 4.76953125, "router_z_loss_mlp": 0.58374023, "step": 1240, "time_per_iteration": 2.644989013671875 }, { "auxiliary_loss_clip": 0.0686075, "auxiliary_loss_mlp": 0.01331402, "balance_loss_clip": 0.06393693, "balance_loss_mlp": 0.01277615, "epoch": 0.07461295656094995, "flos": 24395150033280.0, "grad_norm": 12.130644681989601, "language_loss": 0.90683806, "learning_rate": 3.979158854911225e-06, "loss": 0.98875964, "num_input_tokens_seen": 26368210, "router_z_loss_clip": 4.6640625, "router_z_loss_mlp": 0.53759766, "step": 1241, "time_per_iteration": 4.029593229293823 }, { "auxiliary_loss_clip": 0.06757005, "auxiliary_loss_mlp": 0.01340355, "balance_loss_clip": 0.06441899, "balance_loss_mlp": 0.01306595, "epoch": 0.07467307981361791, "flos": 62127971498880.0, "grad_norm": 0.8916634100808546, "language_loss": 0.6328128, "learning_rate": 3.979102739560979e-06, "loss": 0.71378642, "num_input_tokens_seen": 26424890, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.33789062, "step": 1242, "time_per_iteration": 3.2686424255371094 }, { "auxiliary_loss_clip": 0.0688021, "auxiliary_loss_mlp": 0.01346588, "balance_loss_clip": 0.06386064, "balance_loss_mlp": 0.01287388, "epoch": 0.07473320306628589, "flos": 24870039448320.0, "grad_norm": 4.374558267454034, "language_loss": 0.66650683, "learning_rate": 3.9790465491629595e-06, "loss": 0.74877483, "num_input_tokens_seen": 26446405, "router_z_loss_clip": 4.93359375, "router_z_loss_mlp": 0.59155273, "step": 1243, "time_per_iteration": 4.0655951499938965 }, { "auxiliary_loss_clip": 0.06840666, "auxiliary_loss_mlp": 0.01323525, "balance_loss_clip": 0.06374593, "balance_loss_mlp": 0.01271907, "epoch": 0.07479332631895386, "flos": 24903973152000.0, "grad_norm": 3.6857557122560825, "language_loss": 0.78323078, "learning_rate": 3.978990283719296e-06, "loss": 0.86487269, "num_input_tokens_seen": 26466070, "router_z_loss_clip": 4.66015625, "router_z_loss_mlp": 0.51635742, "step": 1244, "time_per_iteration": 2.6486032009124756 }, { "auxiliary_loss_clip": 0.06838006, "auxiliary_loss_mlp": 0.01323777, "balance_loss_clip": 0.06372724, "balance_loss_mlp": 0.01273709, "epoch": 0.07485344957162182, "flos": 17819932291200.0, "grad_norm": 3.704191566989751, "language_loss": 0.71219629, "learning_rate": 3.978933943232123e-06, "loss": 0.79381412, "num_input_tokens_seen": 26479350, "router_z_loss_clip": 4.64453125, "router_z_loss_mlp": 0.5012207, "step": 1245, "time_per_iteration": 2.563450336456299 }, { "auxiliary_loss_clip": 0.06841563, "auxiliary_loss_mlp": 0.01330736, "balance_loss_clip": 0.06375417, "balance_loss_mlp": 0.01278165, "epoch": 0.0749135728242898, "flos": 25017304199040.0, "grad_norm": 2.9999852572811068, "language_loss": 0.91148496, "learning_rate": 3.978877527703576e-06, "loss": 0.99320793, "num_input_tokens_seen": 26498255, "router_z_loss_clip": 4.6640625, "router_z_loss_mlp": 0.52636719, "step": 1246, "time_per_iteration": 4.090303659439087 }, { "auxiliary_loss_clip": 0.06869231, "auxiliary_loss_mlp": 0.01348942, "balance_loss_clip": 0.06375602, "balance_loss_mlp": 0.01287812, "epoch": 0.07497369607695777, "flos": 17827898428800.0, "grad_norm": 3.0654841225431078, "language_loss": 0.9104594, "learning_rate": 3.9788210371357945e-06, "loss": 0.99264115, "num_input_tokens_seen": 26515375, "router_z_loss_clip": 4.93359375, "router_z_loss_mlp": 0.61083984, "step": 1247, "time_per_iteration": 4.018019914627075 }, { "auxiliary_loss_clip": 0.06827988, "auxiliary_loss_mlp": 0.01321547, "balance_loss_clip": 0.06365719, "balance_loss_mlp": 0.01268594, "epoch": 0.07503381932962573, "flos": 15126287886720.0, "grad_norm": 3.1310796508178145, "language_loss": 0.66668546, "learning_rate": 3.978764471530921e-06, "loss": 0.74818087, "num_input_tokens_seen": 26533595, "router_z_loss_clip": 4.62109375, "router_z_loss_mlp": 0.52978516, "step": 1248, "time_per_iteration": 2.5906383991241455 }, { "auxiliary_loss_clip": 0.06820458, "auxiliary_loss_mlp": 0.01317213, "balance_loss_clip": 0.06369913, "balance_loss_mlp": 0.01267408, "epoch": 0.0750939425822937, "flos": 12820588686720.0, "grad_norm": 3.778144201539907, "language_loss": 0.75585759, "learning_rate": 3.978707830891102e-06, "loss": 0.83723426, "num_input_tokens_seen": 26549405, "router_z_loss_clip": 4.5, "router_z_loss_mlp": 0.49755859, "step": 1249, "time_per_iteration": 2.5766448974609375 }, { "auxiliary_loss_clip": 0.06850679, "auxiliary_loss_mlp": 0.01330332, "balance_loss_clip": 0.0637356, "balance_loss_mlp": 0.01276354, "epoch": 0.07515406583496168, "flos": 24213700016640.0, "grad_norm": 6.4942398471370275, "language_loss": 0.84685826, "learning_rate": 3.978651115218482e-06, "loss": 0.92866838, "num_input_tokens_seen": 26567200, "router_z_loss_clip": 4.76953125, "router_z_loss_mlp": 0.54003906, "step": 1250, "time_per_iteration": 2.6318576335906982 }, { "auxiliary_loss_clip": 0.06822018, "auxiliary_loss_mlp": 0.01315735, "balance_loss_clip": 0.06364814, "balance_loss_mlp": 0.01266788, "epoch": 0.07521418908762964, "flos": 26695482572160.0, "grad_norm": 5.8732439285289155, "language_loss": 0.6943388, "learning_rate": 3.978594324515215e-06, "loss": 0.77571636, "num_input_tokens_seen": 26586190, "router_z_loss_clip": 4.5703125, "router_z_loss_mlp": 0.48950195, "step": 1251, "time_per_iteration": 2.6408932209014893 }, { "auxiliary_loss_clip": 0.06722394, "auxiliary_loss_mlp": 0.01292988, "balance_loss_clip": 0.06406701, "balance_loss_mlp": 0.01260348, "epoch": 0.0752743123402976, "flos": 59115255546240.0, "grad_norm": 0.8747684237732218, "language_loss": 0.70389903, "learning_rate": 3.9785374587834515e-06, "loss": 0.78405285, "num_input_tokens_seen": 26650710, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.32641602, "step": 1252, "time_per_iteration": 3.2688472270965576 }, { "auxiliary_loss_clip": 0.06820567, "auxiliary_loss_mlp": 0.0132372, "balance_loss_clip": 0.06361295, "balance_loss_mlp": 0.01272627, "epoch": 0.07533443559296558, "flos": 23483749173120.0, "grad_norm": 2.664725558290417, "language_loss": 0.81882787, "learning_rate": 3.97848051802535e-06, "loss": 0.9002707, "num_input_tokens_seen": 26669000, "router_z_loss_clip": 4.58984375, "router_z_loss_mlp": 0.51074219, "step": 1253, "time_per_iteration": 2.6179168224334717 }, { "auxiliary_loss_clip": 0.0684301, "auxiliary_loss_mlp": 0.0131857, "balance_loss_clip": 0.06369831, "balance_loss_mlp": 0.01266666, "epoch": 0.07539455884563355, "flos": 20884149377280.0, "grad_norm": 12.169420348553928, "language_loss": 0.96765357, "learning_rate": 3.978423502243069e-06, "loss": 1.04926944, "num_input_tokens_seen": 26683075, "router_z_loss_clip": 4.7265625, "router_z_loss_mlp": 0.51928711, "step": 1254, "time_per_iteration": 2.68080735206604 }, { "auxiliary_loss_clip": 0.06818113, "auxiliary_loss_mlp": 0.01320637, "balance_loss_clip": 0.0636491, "balance_loss_mlp": 0.01266302, "epoch": 0.07545468209830151, "flos": 27680327136000.0, "grad_norm": 5.053747364667812, "language_loss": 0.90245646, "learning_rate": 3.97836641143877e-06, "loss": 0.98384392, "num_input_tokens_seen": 26701875, "router_z_loss_clip": 4.53125, "router_z_loss_mlp": 0.54418945, "step": 1255, "time_per_iteration": 2.717275381088257 }, { "auxiliary_loss_clip": 0.06813965, "auxiliary_loss_mlp": 0.01318782, "balance_loss_clip": 0.06361683, "balance_loss_mlp": 0.01267951, "epoch": 0.0755148053509695, "flos": 14142198009600.0, "grad_norm": 20.782480596280838, "language_loss": 0.81908655, "learning_rate": 3.978309245614618e-06, "loss": 0.90041399, "num_input_tokens_seen": 26719050, "router_z_loss_clip": 4.51953125, "router_z_loss_mlp": 0.50805664, "step": 1256, "time_per_iteration": 2.6101739406585693 }, { "auxiliary_loss_clip": 0.06698799, "auxiliary_loss_mlp": 0.0129631, "balance_loss_clip": 0.06385806, "balance_loss_mlp": 0.01263743, "epoch": 0.07557492860363746, "flos": 58251764822400.0, "grad_norm": 0.7513764531143458, "language_loss": 0.57992971, "learning_rate": 3.9782520047727825e-06, "loss": 0.65988082, "num_input_tokens_seen": 26780650, "router_z_loss_clip": 3.125, "router_z_loss_mlp": 0.32568359, "step": 1257, "time_per_iteration": 3.315246343612671 }, { "auxiliary_loss_clip": 0.06837741, "auxiliary_loss_mlp": 0.01331508, "balance_loss_clip": 0.06376496, "balance_loss_mlp": 0.01272714, "epoch": 0.07563505185630542, "flos": 24651259637760.0, "grad_norm": 3.885256586932677, "language_loss": 0.93120694, "learning_rate": 3.978194688915432e-06, "loss": 1.0128994, "num_input_tokens_seen": 26798725, "router_z_loss_clip": 4.61328125, "router_z_loss_mlp": 0.58813477, "step": 1258, "time_per_iteration": 2.6410725116729736 }, { "auxiliary_loss_clip": 0.06823822, "auxiliary_loss_mlp": 0.01308765, "balance_loss_clip": 0.06379073, "balance_loss_mlp": 0.01260032, "epoch": 0.07569517510897339, "flos": 15528362503680.0, "grad_norm": 5.262871995950865, "language_loss": 0.84340668, "learning_rate": 3.978137298044741e-06, "loss": 0.92473257, "num_input_tokens_seen": 26817005, "router_z_loss_clip": 4.4453125, "router_z_loss_mlp": 0.48803711, "step": 1259, "time_per_iteration": 2.594383478164673 }, { "auxiliary_loss_clip": 0.06824595, "auxiliary_loss_mlp": 0.01318044, "balance_loss_clip": 0.06371176, "balance_loss_mlp": 0.01269097, "epoch": 0.07575529836164137, "flos": 22934954856960.0, "grad_norm": 2.8726909310646556, "language_loss": 0.77796423, "learning_rate": 3.978079832162885e-06, "loss": 0.85939062, "num_input_tokens_seen": 26836655, "router_z_loss_clip": 4.53515625, "router_z_loss_mlp": 0.48974609, "step": 1260, "time_per_iteration": 2.6256487369537354 }, { "auxiliary_loss_clip": 0.06816028, "auxiliary_loss_mlp": 0.01316486, "balance_loss_clip": 0.06365525, "balance_loss_mlp": 0.01266132, "epoch": 0.07581542161430933, "flos": 19506537999360.0, "grad_norm": 2.765932363162835, "language_loss": 0.87274933, "learning_rate": 3.978022291272044e-06, "loss": 0.95407438, "num_input_tokens_seen": 26854925, "router_z_loss_clip": 4.5, "router_z_loss_mlp": 0.50415039, "step": 1261, "time_per_iteration": 2.599558115005493 }, { "auxiliary_loss_clip": 0.06812128, "auxiliary_loss_mlp": 0.01319054, "balance_loss_clip": 0.06359419, "balance_loss_mlp": 0.01269082, "epoch": 0.0758755448669773, "flos": 24980519456640.0, "grad_norm": 3.0410826393481596, "language_loss": 0.84475231, "learning_rate": 3.977964675374399e-06, "loss": 0.92606413, "num_input_tokens_seen": 26876170, "router_z_loss_clip": 4.52734375, "router_z_loss_mlp": 0.50024414, "step": 1262, "time_per_iteration": 2.6405603885650635 }, { "auxiliary_loss_clip": 0.06829001, "auxiliary_loss_mlp": 0.01333405, "balance_loss_clip": 0.06365935, "balance_loss_mlp": 0.0127628, "epoch": 0.07593566811964528, "flos": 22754678797440.0, "grad_norm": 18.261816433475932, "language_loss": 0.86480361, "learning_rate": 3.977906984472136e-06, "loss": 0.9464277, "num_input_tokens_seen": 26895005, "router_z_loss_clip": 4.62890625, "router_z_loss_mlp": 0.57080078, "step": 1263, "time_per_iteration": 2.612079620361328 }, { "auxiliary_loss_clip": 0.06820109, "auxiliary_loss_mlp": 0.0131788, "balance_loss_clip": 0.06363004, "balance_loss_mlp": 0.01267717, "epoch": 0.07599579137231324, "flos": 23119088204160.0, "grad_norm": 7.703308625634278, "language_loss": 0.78287488, "learning_rate": 3.977849218567442e-06, "loss": 0.86425477, "num_input_tokens_seen": 26913930, "router_z_loss_clip": 4.57421875, "router_z_loss_mlp": 0.50195312, "step": 1264, "time_per_iteration": 2.6182007789611816 }, { "auxiliary_loss_clip": 0.06827517, "auxiliary_loss_mlp": 0.01340481, "balance_loss_clip": 0.06360842, "balance_loss_mlp": 0.01288339, "epoch": 0.07605591462498121, "flos": 14507362103040.0, "grad_norm": 4.85834658557581, "language_loss": 0.83506459, "learning_rate": 3.977791377662507e-06, "loss": 0.91674459, "num_input_tokens_seen": 26931485, "router_z_loss_clip": 4.66796875, "router_z_loss_mlp": 0.5222168, "step": 1265, "time_per_iteration": 2.580470323562622 }, { "auxiliary_loss_clip": 0.06825687, "auxiliary_loss_mlp": 0.01342932, "balance_loss_clip": 0.0635899, "balance_loss_mlp": 0.01287213, "epoch": 0.07611603787764919, "flos": 23521037040000.0, "grad_norm": 5.900041599191952, "language_loss": 0.67511463, "learning_rate": 3.977733461759524e-06, "loss": 0.75680089, "num_input_tokens_seen": 26951670, "router_z_loss_clip": 4.66796875, "router_z_loss_mlp": 0.55737305, "step": 1266, "time_per_iteration": 2.635059356689453 }, { "auxiliary_loss_clip": 0.06830101, "auxiliary_loss_mlp": 0.0133168, "balance_loss_clip": 0.06361119, "balance_loss_mlp": 0.01276796, "epoch": 0.07617616113031715, "flos": 21513640775040.0, "grad_norm": 8.073566856404657, "language_loss": 0.82398748, "learning_rate": 3.977675470860691e-06, "loss": 0.90560532, "num_input_tokens_seen": 26970335, "router_z_loss_clip": 4.69140625, "router_z_loss_mlp": 0.54907227, "step": 1267, "time_per_iteration": 2.6212158203125 }, { "auxiliary_loss_clip": 0.06815384, "auxiliary_loss_mlp": 0.0131729, "balance_loss_clip": 0.06357354, "balance_loss_mlp": 0.01265911, "epoch": 0.07623628438298512, "flos": 14578164403200.0, "grad_norm": 4.321195598819173, "language_loss": 0.75522876, "learning_rate": 3.977617404968205e-06, "loss": 0.83655548, "num_input_tokens_seen": 26986025, "router_z_loss_clip": 4.57421875, "router_z_loss_mlp": 0.51367188, "step": 1268, "time_per_iteration": 2.5733296871185303 }, { "auxiliary_loss_clip": 0.06805046, "auxiliary_loss_mlp": 0.01319364, "balance_loss_clip": 0.06351054, "balance_loss_mlp": 0.01266149, "epoch": 0.07629640763565308, "flos": 14725638789120.0, "grad_norm": 5.024519113066485, "language_loss": 0.85737455, "learning_rate": 3.977559264084269e-06, "loss": 0.9386186, "num_input_tokens_seen": 27004045, "router_z_loss_clip": 4.5390625, "router_z_loss_mlp": 0.53222656, "step": 1269, "time_per_iteration": 2.6174402236938477 }, { "auxiliary_loss_clip": 0.06798578, "auxiliary_loss_mlp": 0.01333586, "balance_loss_clip": 0.0635134, "balance_loss_mlp": 0.01282255, "epoch": 0.07635653088832106, "flos": 14908220835840.0, "grad_norm": 10.724569756655544, "language_loss": 0.91167724, "learning_rate": 3.977501048211088e-06, "loss": 0.9929989, "num_input_tokens_seen": 27022070, "router_z_loss_clip": 4.47265625, "router_z_loss_mlp": 0.51342773, "step": 1270, "time_per_iteration": 2.5663654804229736 }, { "auxiliary_loss_clip": 0.06805307, "auxiliary_loss_mlp": 0.01321886, "balance_loss_clip": 0.06354983, "balance_loss_mlp": 0.01269243, "epoch": 0.07641665414098903, "flos": 26658865537920.0, "grad_norm": 4.549123149375347, "language_loss": 0.73493057, "learning_rate": 3.977442757350869e-06, "loss": 0.81620246, "num_input_tokens_seen": 27041755, "router_z_loss_clip": 4.50390625, "router_z_loss_mlp": 0.52685547, "step": 1271, "time_per_iteration": 2.6428263187408447 }, { "auxiliary_loss_clip": 0.0678401, "auxiliary_loss_mlp": 0.01325592, "balance_loss_clip": 0.06355461, "balance_loss_mlp": 0.01280054, "epoch": 0.07647677739365699, "flos": 25199970099840.0, "grad_norm": 5.4098006031448875, "language_loss": 0.84476131, "learning_rate": 3.977384391505823e-06, "loss": 0.92585731, "num_input_tokens_seen": 27061540, "router_z_loss_clip": 4.28125, "router_z_loss_mlp": 0.45581055, "step": 1272, "time_per_iteration": 2.647448778152466 }, { "auxiliary_loss_clip": 0.06800665, "auxiliary_loss_mlp": 0.01327486, "balance_loss_clip": 0.0635142, "balance_loss_mlp": 0.01275487, "epoch": 0.07653690064632497, "flos": 20564365069440.0, "grad_norm": 2.500686200295455, "language_loss": 0.82629162, "learning_rate": 3.977325950678162e-06, "loss": 0.90757316, "num_input_tokens_seen": 27081395, "router_z_loss_clip": 4.48828125, "router_z_loss_mlp": 0.52001953, "step": 1273, "time_per_iteration": 2.606720209121704 }, { "auxiliary_loss_clip": 0.06803629, "auxiliary_loss_mlp": 0.01331059, "balance_loss_clip": 0.06350712, "balance_loss_mlp": 0.01280562, "epoch": 0.07659702389899294, "flos": 22275219335040.0, "grad_norm": 3.5522515718545105, "language_loss": 0.83000225, "learning_rate": 3.977267434870103e-06, "loss": 0.91134912, "num_input_tokens_seen": 27101175, "router_z_loss_clip": 4.52734375, "router_z_loss_mlp": 0.50488281, "step": 1274, "time_per_iteration": 2.622249126434326 }, { "auxiliary_loss_clip": 0.06791196, "auxiliary_loss_mlp": 0.01324302, "balance_loss_clip": 0.06356397, "balance_loss_mlp": 0.01274258, "epoch": 0.0766571471516609, "flos": 32644563079680.0, "grad_norm": 4.3650585374054405, "language_loss": 0.7529552, "learning_rate": 3.977208844083865e-06, "loss": 0.83411014, "num_input_tokens_seen": 27124505, "router_z_loss_clip": 4.3515625, "router_z_loss_mlp": 0.50048828, "step": 1275, "time_per_iteration": 2.6974880695343018 }, { "auxiliary_loss_clip": 0.06814535, "auxiliary_loss_mlp": 0.01318768, "balance_loss_clip": 0.06354551, "balance_loss_mlp": 0.01265791, "epoch": 0.07671727040432888, "flos": 15272672169600.0, "grad_norm": 2.98672413491996, "language_loss": 0.82350194, "learning_rate": 3.9771501783216685e-06, "loss": 0.90483499, "num_input_tokens_seen": 27140960, "router_z_loss_clip": 4.6015625, "router_z_loss_mlp": 0.52978516, "step": 1276, "time_per_iteration": 2.603785991668701 }, { "auxiliary_loss_clip": 0.0680862, "auxiliary_loss_mlp": 0.01321677, "balance_loss_clip": 0.06363055, "balance_loss_mlp": 0.01273874, "epoch": 0.07677739365699685, "flos": 28191665877120.0, "grad_norm": 5.542167239027269, "language_loss": 0.61346543, "learning_rate": 3.97709143758574e-06, "loss": 0.69476843, "num_input_tokens_seen": 27160985, "router_z_loss_clip": 4.453125, "router_z_loss_mlp": 0.4777832, "step": 1277, "time_per_iteration": 2.655226230621338 }, { "auxiliary_loss_clip": 0.06807046, "auxiliary_loss_mlp": 0.01322192, "balance_loss_clip": 0.0635727, "balance_loss_mlp": 0.01269239, "epoch": 0.07683751690966481, "flos": 18301991230080.0, "grad_norm": 4.506898244116206, "language_loss": 0.76325059, "learning_rate": 3.977032621878305e-06, "loss": 0.84454298, "num_input_tokens_seen": 27178390, "router_z_loss_clip": 4.49609375, "router_z_loss_mlp": 0.5300293, "step": 1278, "time_per_iteration": 2.5822627544403076 }, { "auxiliary_loss_clip": 0.06792641, "auxiliary_loss_mlp": 0.01314838, "balance_loss_clip": 0.0635884, "balance_loss_mlp": 0.01266082, "epoch": 0.07689764016233278, "flos": 21987565868160.0, "grad_norm": 9.002473858228559, "language_loss": 0.90098226, "learning_rate": 3.976973731201596e-06, "loss": 0.98205703, "num_input_tokens_seen": 27197505, "router_z_loss_clip": 4.33984375, "router_z_loss_mlp": 0.48754883, "step": 1279, "time_per_iteration": 2.6031653881073 }, { "auxiliary_loss_clip": 0.06797595, "auxiliary_loss_mlp": 0.01310162, "balance_loss_clip": 0.06360811, "balance_loss_mlp": 0.01262979, "epoch": 0.07695776341500075, "flos": 22242417661440.0, "grad_norm": 3.08640123345723, "language_loss": 0.84783381, "learning_rate": 3.976914765557845e-06, "loss": 0.92891133, "num_input_tokens_seen": 27214260, "router_z_loss_clip": 4.36328125, "router_z_loss_mlp": 0.47167969, "step": 1280, "time_per_iteration": 2.6198878288269043 }, { "auxiliary_loss_clip": 0.06789574, "auxiliary_loss_mlp": 0.01316858, "balance_loss_clip": 0.06358397, "balance_loss_mlp": 0.01266028, "epoch": 0.07701788666766872, "flos": 16149300785280.0, "grad_norm": 2.8657976306766533, "language_loss": 0.78384274, "learning_rate": 3.9768557249492875e-06, "loss": 0.86490715, "num_input_tokens_seen": 27232525, "router_z_loss_clip": 4.3125, "router_z_loss_mlp": 0.50854492, "step": 1281, "time_per_iteration": 4.048556804656982 }, { "auxiliary_loss_clip": 0.06808408, "auxiliary_loss_mlp": 0.01317997, "balance_loss_clip": 0.06352665, "balance_loss_mlp": 0.01264972, "epoch": 0.07707800992033668, "flos": 19468998570240.0, "grad_norm": 2.9010023011982176, "language_loss": 0.77605438, "learning_rate": 3.9767966093781634e-06, "loss": 0.85731846, "num_input_tokens_seen": 27249800, "router_z_loss_clip": 4.5625, "router_z_loss_mlp": 0.52978516, "step": 1282, "time_per_iteration": 4.044853448867798 }, { "auxiliary_loss_clip": 0.06794888, "auxiliary_loss_mlp": 0.01316958, "balance_loss_clip": 0.0635736, "balance_loss_mlp": 0.01266938, "epoch": 0.07713813317300466, "flos": 18996415142400.0, "grad_norm": 6.294493966997313, "language_loss": 0.85473979, "learning_rate": 3.976737418846713e-06, "loss": 0.93585825, "num_input_tokens_seen": 27268895, "router_z_loss_clip": 4.37109375, "router_z_loss_mlp": 0.50024414, "step": 1283, "time_per_iteration": 2.604196310043335 }, { "auxiliary_loss_clip": 0.06800342, "auxiliary_loss_mlp": 0.0131597, "balance_loss_clip": 0.06356904, "balance_loss_mlp": 0.01266236, "epoch": 0.07719825642567263, "flos": 18119828453760.0, "grad_norm": 5.495742050802493, "language_loss": 0.76851392, "learning_rate": 3.976678153357181e-06, "loss": 0.84967709, "num_input_tokens_seen": 27288180, "router_z_loss_clip": 4.4296875, "router_z_loss_mlp": 0.49682617, "step": 1284, "time_per_iteration": 2.618973731994629 }, { "auxiliary_loss_clip": 0.06802978, "auxiliary_loss_mlp": 0.01324853, "balance_loss_clip": 0.06361112, "balance_loss_mlp": 0.01272687, "epoch": 0.0772583796783406, "flos": 42204307075200.0, "grad_norm": 5.633505368102679, "language_loss": 0.7779451, "learning_rate": 3.976618812911817e-06, "loss": 0.85922337, "num_input_tokens_seen": 27311815, "router_z_loss_clip": 4.41796875, "router_z_loss_mlp": 0.52197266, "step": 1285, "time_per_iteration": 2.7807929515838623 }, { "auxiliary_loss_clip": 0.06800807, "auxiliary_loss_mlp": 0.01313309, "balance_loss_clip": 0.06360544, "balance_loss_mlp": 0.01262287, "epoch": 0.07731850293100857, "flos": 24760565688960.0, "grad_norm": 2.3784395774495466, "language_loss": 0.86015058, "learning_rate": 3.9765593975128685e-06, "loss": 0.94129169, "num_input_tokens_seen": 27331890, "router_z_loss_clip": 4.39453125, "router_z_loss_mlp": 0.51049805, "step": 1286, "time_per_iteration": 5.303734540939331 }, { "auxiliary_loss_clip": 0.0683402, "auxiliary_loss_mlp": 0.01320763, "balance_loss_clip": 0.06366763, "balance_loss_mlp": 0.01264019, "epoch": 0.07737862618367654, "flos": 17571537262080.0, "grad_norm": 6.095036347698616, "language_loss": 0.79682153, "learning_rate": 3.97649990716259e-06, "loss": 0.87836933, "num_input_tokens_seen": 27348320, "router_z_loss_clip": 4.67578125, "router_z_loss_mlp": 0.56713867, "step": 1287, "time_per_iteration": 2.5932695865631104 }, { "auxiliary_loss_clip": 0.06803174, "auxiliary_loss_mlp": 0.01315795, "balance_loss_clip": 0.06366473, "balance_loss_mlp": 0.01267968, "epoch": 0.0774387494363445, "flos": 25633798214400.0, "grad_norm": 4.003280416076135, "language_loss": 0.86537921, "learning_rate": 3.976440341863237e-06, "loss": 0.94656885, "num_input_tokens_seen": 27367670, "router_z_loss_clip": 4.36328125, "router_z_loss_mlp": 0.47851562, "step": 1288, "time_per_iteration": 2.643289089202881 }, { "auxiliary_loss_clip": 0.06829764, "auxiliary_loss_mlp": 0.01321201, "balance_loss_clip": 0.06368472, "balance_loss_mlp": 0.01267796, "epoch": 0.07749887268901248, "flos": 12244778628480.0, "grad_norm": 2.935404585565529, "language_loss": 0.88425815, "learning_rate": 3.976380701617068e-06, "loss": 0.96576774, "num_input_tokens_seen": 27385485, "router_z_loss_clip": 4.609375, "router_z_loss_mlp": 0.53417969, "step": 1289, "time_per_iteration": 2.6043169498443604 }, { "auxiliary_loss_clip": 0.06802014, "auxiliary_loss_mlp": 0.01315709, "balance_loss_clip": 0.06361391, "balance_loss_mlp": 0.01264425, "epoch": 0.07755899594168045, "flos": 25088609623680.0, "grad_norm": 3.7818005309803024, "language_loss": 0.86951339, "learning_rate": 3.976320986426344e-06, "loss": 0.95069057, "num_input_tokens_seen": 27405110, "router_z_loss_clip": 4.41015625, "router_z_loss_mlp": 0.51342773, "step": 1290, "time_per_iteration": 2.6927967071533203 }, { "auxiliary_loss_clip": 0.06806204, "auxiliary_loss_mlp": 0.0131517, "balance_loss_clip": 0.06365865, "balance_loss_mlp": 0.01261836, "epoch": 0.07761911919434841, "flos": 14251629841920.0, "grad_norm": 7.749175606663422, "language_loss": 0.92974937, "learning_rate": 3.9762611962933315e-06, "loss": 1.01096308, "num_input_tokens_seen": 27422855, "router_z_loss_clip": 4.40234375, "router_z_loss_mlp": 0.53369141, "step": 1291, "time_per_iteration": 2.590052843093872 }, { "auxiliary_loss_clip": 0.06752348, "auxiliary_loss_mlp": 0.01290972, "balance_loss_clip": 0.06447075, "balance_loss_mlp": 0.01254614, "epoch": 0.07767924244701638, "flos": 67259639099520.0, "grad_norm": 0.8664117201101175, "language_loss": 0.65306276, "learning_rate": 3.9762013312202955e-06, "loss": 0.73349595, "num_input_tokens_seen": 27487190, "router_z_loss_clip": 3.05859375, "router_z_loss_mlp": 0.36376953, "step": 1292, "time_per_iteration": 3.327082633972168 }, { "auxiliary_loss_clip": 0.06816617, "auxiliary_loss_mlp": 0.01305985, "balance_loss_clip": 0.06366798, "balance_loss_mlp": 0.01254796, "epoch": 0.07773936569968436, "flos": 28558548979200.0, "grad_norm": 2.660365795639129, "language_loss": 0.89257479, "learning_rate": 3.9761413912095075e-06, "loss": 0.97380078, "num_input_tokens_seen": 27510465, "router_z_loss_clip": 4.49609375, "router_z_loss_mlp": 0.51220703, "step": 1293, "time_per_iteration": 2.6706478595733643 }, { "auxiliary_loss_clip": 0.06807794, "auxiliary_loss_mlp": 0.01317994, "balance_loss_clip": 0.06359866, "balance_loss_mlp": 0.01264422, "epoch": 0.07779948895235232, "flos": 27497619308160.0, "grad_norm": 3.38625060401486, "language_loss": 0.87392485, "learning_rate": 3.976081376263239e-06, "loss": 0.95518267, "num_input_tokens_seen": 27528645, "router_z_loss_clip": 4.4765625, "router_z_loss_mlp": 0.53540039, "step": 1294, "time_per_iteration": 2.676814079284668 }, { "auxiliary_loss_clip": 0.06811766, "auxiliary_loss_mlp": 0.01327348, "balance_loss_clip": 0.06357933, "balance_loss_mlp": 0.01271081, "epoch": 0.07785961220502029, "flos": 18229176432000.0, "grad_norm": 2.7635754940888533, "language_loss": 0.81890839, "learning_rate": 3.976021286383768e-06, "loss": 0.90029955, "num_input_tokens_seen": 27546165, "router_z_loss_clip": 4.53515625, "router_z_loss_mlp": 0.56323242, "step": 1295, "time_per_iteration": 2.577387571334839 }, { "auxiliary_loss_clip": 0.06792964, "auxiliary_loss_mlp": 0.01315305, "balance_loss_clip": 0.06355481, "balance_loss_mlp": 0.01261399, "epoch": 0.07791973545768827, "flos": 24615145728000.0, "grad_norm": 5.429063381285701, "language_loss": 0.89473361, "learning_rate": 3.975961121573371e-06, "loss": 0.97581631, "num_input_tokens_seen": 27566520, "router_z_loss_clip": 4.37109375, "router_z_loss_mlp": 0.54003906, "step": 1296, "time_per_iteration": 2.6761512756347656 }, { "auxiliary_loss_clip": 0.06811439, "auxiliary_loss_mlp": 0.01328017, "balance_loss_clip": 0.06355432, "balance_loss_mlp": 0.01270463, "epoch": 0.07797985871035623, "flos": 14287156773120.0, "grad_norm": 3.131113416507501, "language_loss": 0.98261821, "learning_rate": 3.9759008818343305e-06, "loss": 1.06401277, "num_input_tokens_seen": 27581960, "router_z_loss_clip": 4.5625, "router_z_loss_mlp": 0.57568359, "step": 1297, "time_per_iteration": 2.58783221244812 }, { "auxiliary_loss_clip": 0.06811233, "auxiliary_loss_mlp": 0.01324346, "balance_loss_clip": 0.06360362, "balance_loss_mlp": 0.01268866, "epoch": 0.0780399819630242, "flos": 26616965696640.0, "grad_norm": 4.8691543535926884, "language_loss": 0.78969622, "learning_rate": 3.97584056716893e-06, "loss": 0.87105197, "num_input_tokens_seen": 27601415, "router_z_loss_clip": 4.5078125, "router_z_loss_mlp": 0.55493164, "step": 1298, "time_per_iteration": 2.650791645050049 }, { "auxiliary_loss_clip": 0.06793238, "auxiliary_loss_mlp": 0.01317976, "balance_loss_clip": 0.06354129, "balance_loss_mlp": 0.01265428, "epoch": 0.07810010521569218, "flos": 21840846168960.0, "grad_norm": 2.78456618457118, "language_loss": 0.82862318, "learning_rate": 3.9757801775794575e-06, "loss": 0.90973532, "num_input_tokens_seen": 27621490, "router_z_loss_clip": 4.39453125, "router_z_loss_mlp": 0.52563477, "step": 1299, "time_per_iteration": 2.6147687435150146 }, { "auxiliary_loss_clip": 0.0677482, "auxiliary_loss_mlp": 0.01325984, "balance_loss_clip": 0.06346228, "balance_loss_mlp": 0.01273651, "epoch": 0.07816022846836014, "flos": 25088022645120.0, "grad_norm": 4.196158464271996, "language_loss": 0.88815135, "learning_rate": 3.975719713068202e-06, "loss": 0.96915936, "num_input_tokens_seen": 27640600, "router_z_loss_clip": 4.28125, "router_z_loss_mlp": 0.52246094, "step": 1300, "time_per_iteration": 2.6157398223876953 }, { "auxiliary_loss_clip": 0.0679385, "auxiliary_loss_mlp": 0.01328588, "balance_loss_clip": 0.06344642, "balance_loss_mlp": 0.01270604, "epoch": 0.0782203517210281, "flos": 40927197070080.0, "grad_norm": 2.9711940336127305, "language_loss": 0.74260759, "learning_rate": 3.975659173637458e-06, "loss": 0.82383192, "num_input_tokens_seen": 27663070, "router_z_loss_clip": 4.48828125, "router_z_loss_mlp": 0.58007812, "step": 1301, "time_per_iteration": 2.8502848148345947 }, { "auxiliary_loss_clip": 0.06803196, "auxiliary_loss_mlp": 0.01321962, "balance_loss_clip": 0.0636137, "balance_loss_mlp": 0.01267507, "epoch": 0.07828047497369607, "flos": 41181587665920.0, "grad_norm": 2.30743520153715, "language_loss": 0.72990763, "learning_rate": 3.97559855928952e-06, "loss": 0.81115913, "num_input_tokens_seen": 27686425, "router_z_loss_clip": 4.4140625, "router_z_loss_mlp": 0.54418945, "step": 1302, "time_per_iteration": 2.8619706630706787 }, { "auxiliary_loss_clip": 0.06778888, "auxiliary_loss_mlp": 0.01316327, "balance_loss_clip": 0.06341925, "balance_loss_mlp": 0.01263112, "epoch": 0.07834059822636405, "flos": 23513951370240.0, "grad_norm": 3.0291305995314564, "language_loss": 0.83647799, "learning_rate": 3.9755378700266864e-06, "loss": 0.91743016, "num_input_tokens_seen": 27704900, "router_z_loss_clip": 4.37109375, "router_z_loss_mlp": 0.5324707, "step": 1303, "time_per_iteration": 2.7767086029052734 }, { "auxiliary_loss_clip": 0.06787658, "auxiliary_loss_mlp": 0.01313691, "balance_loss_clip": 0.06343704, "balance_loss_mlp": 0.01262216, "epoch": 0.07840072147903202, "flos": 20200165297920.0, "grad_norm": 3.035416737917653, "language_loss": 0.76988822, "learning_rate": 3.9754771058512585e-06, "loss": 0.85090166, "num_input_tokens_seen": 27724890, "router_z_loss_clip": 4.44140625, "router_z_loss_mlp": 0.51513672, "step": 1304, "time_per_iteration": 2.623281717300415 }, { "auxiliary_loss_clip": 0.06799365, "auxiliary_loss_mlp": 0.01322801, "balance_loss_clip": 0.06351283, "balance_loss_mlp": 0.01268942, "epoch": 0.07846084473169998, "flos": 21367172638080.0, "grad_norm": 4.617520288962608, "language_loss": 0.78051078, "learning_rate": 3.975416266765542e-06, "loss": 0.86173248, "num_input_tokens_seen": 27743115, "router_z_loss_clip": 4.4765625, "router_z_loss_mlp": 0.5378418, "step": 1305, "time_per_iteration": 2.60632586479187 }, { "auxiliary_loss_clip": 0.06801295, "auxiliary_loss_mlp": 0.01310502, "balance_loss_clip": 0.06350474, "balance_loss_mlp": 0.01255999, "epoch": 0.07852096798436796, "flos": 25418037150720.0, "grad_norm": 2.409056190452998, "language_loss": 0.87821513, "learning_rate": 3.975355352771841e-06, "loss": 0.95933312, "num_input_tokens_seen": 27763570, "router_z_loss_clip": 4.51171875, "router_z_loss_mlp": 0.54589844, "step": 1306, "time_per_iteration": 2.6463375091552734 }, { "auxiliary_loss_clip": 0.06788158, "auxiliary_loss_mlp": 0.01307364, "balance_loss_clip": 0.06351565, "balance_loss_mlp": 0.0125775, "epoch": 0.07858109123703592, "flos": 24578360985600.0, "grad_norm": 2.9774712886282915, "language_loss": 0.9326756, "learning_rate": 3.975294363872468e-06, "loss": 1.01363087, "num_input_tokens_seen": 27780030, "router_z_loss_clip": 4.3671875, "router_z_loss_mlp": 0.49560547, "step": 1307, "time_per_iteration": 2.618227481842041 }, { "auxiliary_loss_clip": 0.06803893, "auxiliary_loss_mlp": 0.0130761, "balance_loss_clip": 0.06356704, "balance_loss_mlp": 0.01253131, "epoch": 0.07864121448970389, "flos": 20704250661120.0, "grad_norm": 2.70963642974971, "language_loss": 0.84173167, "learning_rate": 3.975233300069735e-06, "loss": 0.92284667, "num_input_tokens_seen": 27796225, "router_z_loss_clip": 4.46875, "router_z_loss_mlp": 0.54516602, "step": 1308, "time_per_iteration": 2.6380257606506348 }, { "auxiliary_loss_clip": 0.06785707, "auxiliary_loss_mlp": 0.01304102, "balance_loss_clip": 0.06352418, "balance_loss_mlp": 0.0125649, "epoch": 0.07870133774237187, "flos": 22973207045760.0, "grad_norm": 1.9549903231820054, "language_loss": 0.79146695, "learning_rate": 3.975172161365958e-06, "loss": 0.872365, "num_input_tokens_seen": 27815975, "router_z_loss_clip": 4.328125, "router_z_loss_mlp": 0.47607422, "step": 1309, "time_per_iteration": 2.640385866165161 }, { "auxiliary_loss_clip": 0.06817955, "auxiliary_loss_mlp": 0.01322367, "balance_loss_clip": 0.06367552, "balance_loss_mlp": 0.01268675, "epoch": 0.07876146099503983, "flos": 18848689194240.0, "grad_norm": 2.990663799021783, "language_loss": 0.82001823, "learning_rate": 3.975110947763453e-06, "loss": 0.90142143, "num_input_tokens_seen": 27832255, "router_z_loss_clip": 4.50390625, "router_z_loss_mlp": 0.53735352, "step": 1310, "time_per_iteration": 2.6170318126678467 }, { "auxiliary_loss_clip": 0.06797603, "auxiliary_loss_mlp": 0.01316315, "balance_loss_clip": 0.06369173, "balance_loss_mlp": 0.0126775, "epoch": 0.0788215842477078, "flos": 23812631648640.0, "grad_norm": 2.85675252439969, "language_loss": 0.75007832, "learning_rate": 3.9750496592645435e-06, "loss": 0.83121753, "num_input_tokens_seen": 27852180, "router_z_loss_clip": 4.27734375, "router_z_loss_mlp": 0.48583984, "step": 1311, "time_per_iteration": 2.6331772804260254 }, { "auxiliary_loss_clip": 0.06818242, "auxiliary_loss_mlp": 0.01327235, "balance_loss_clip": 0.06379943, "balance_loss_mlp": 0.01275141, "epoch": 0.07888170750037576, "flos": 21586329792000.0, "grad_norm": 3.3412554809063693, "language_loss": 0.87359881, "learning_rate": 3.974988295871553e-06, "loss": 0.95505363, "num_input_tokens_seen": 27871435, "router_z_loss_clip": 4.390625, "router_z_loss_mlp": 0.52172852, "step": 1312, "time_per_iteration": 2.6309688091278076 }, { "auxiliary_loss_clip": 0.0681145, "auxiliary_loss_mlp": 0.01331681, "balance_loss_clip": 0.06380966, "balance_loss_mlp": 0.0128457, "epoch": 0.07894183075304374, "flos": 19870947406080.0, "grad_norm": 2.437934703696723, "language_loss": 0.84412205, "learning_rate": 3.9749268575868085e-06, "loss": 0.92555344, "num_input_tokens_seen": 27890625, "router_z_loss_clip": 4.30078125, "router_z_loss_mlp": 0.47094727, "step": 1313, "time_per_iteration": 2.6494240760803223 }, { "auxiliary_loss_clip": 0.06857973, "auxiliary_loss_mlp": 0.01352109, "balance_loss_clip": 0.06394482, "balance_loss_mlp": 0.01292123, "epoch": 0.07900195400571171, "flos": 16148965368960.0, "grad_norm": 4.5881040820033565, "language_loss": 0.76193917, "learning_rate": 3.97486534441264e-06, "loss": 0.84403998, "num_input_tokens_seen": 27906530, "router_z_loss_clip": 4.63671875, "router_z_loss_mlp": 0.59960938, "step": 1314, "time_per_iteration": 2.5957956314086914 }, { "auxiliary_loss_clip": 0.06846425, "auxiliary_loss_mlp": 0.01323865, "balance_loss_clip": 0.06404707, "balance_loss_mlp": 0.01272271, "epoch": 0.07906207725837967, "flos": 23736840030720.0, "grad_norm": 2.7086900568299597, "language_loss": 0.81763363, "learning_rate": 3.974803756351379e-06, "loss": 0.89933658, "num_input_tokens_seen": 27926725, "router_z_loss_clip": 4.41796875, "router_z_loss_mlp": 0.51538086, "step": 1315, "time_per_iteration": 2.6237897872924805 }, { "auxiliary_loss_clip": 0.06849663, "auxiliary_loss_mlp": 0.0133935, "balance_loss_clip": 0.06399409, "balance_loss_mlp": 0.01282082, "epoch": 0.07912220051104765, "flos": 24322712578560.0, "grad_norm": 2.4162265847584887, "language_loss": 0.76300514, "learning_rate": 3.974742093405362e-06, "loss": 0.84489524, "num_input_tokens_seen": 27947875, "router_z_loss_clip": 4.49609375, "router_z_loss_mlp": 0.57275391, "step": 1316, "time_per_iteration": 2.6448686122894287 }, { "auxiliary_loss_clip": 0.06858169, "auxiliary_loss_mlp": 0.01340088, "balance_loss_clip": 0.06397662, "balance_loss_mlp": 0.01283607, "epoch": 0.07918232376371562, "flos": 18886018988160.0, "grad_norm": 3.11530380486918, "language_loss": 0.67863446, "learning_rate": 3.974680355576927e-06, "loss": 0.76061702, "num_input_tokens_seen": 27965040, "router_z_loss_clip": 4.60546875, "router_z_loss_mlp": 0.56518555, "step": 1317, "time_per_iteration": 2.597947597503662 }, { "auxiliary_loss_clip": 0.06888518, "auxiliary_loss_mlp": 0.01347913, "balance_loss_clip": 0.06414036, "balance_loss_mlp": 0.01286783, "epoch": 0.07924244701638358, "flos": 27382862741760.0, "grad_norm": 6.164190452269006, "language_loss": 0.75667953, "learning_rate": 3.974618542868415e-06, "loss": 0.83904386, "num_input_tokens_seen": 27985330, "router_z_loss_clip": 4.74609375, "router_z_loss_mlp": 0.61108398, "step": 1318, "time_per_iteration": 2.6694834232330322 }, { "auxiliary_loss_clip": 0.06844892, "auxiliary_loss_mlp": 0.01325031, "balance_loss_clip": 0.06401798, "balance_loss_mlp": 0.01274557, "epoch": 0.07930257026905156, "flos": 25127574572160.0, "grad_norm": 2.27443686862764, "language_loss": 0.92559683, "learning_rate": 3.97455665528217e-06, "loss": 1.00729609, "num_input_tokens_seen": 28007615, "router_z_loss_clip": 4.42578125, "router_z_loss_mlp": 0.50463867, "step": 1319, "time_per_iteration": 2.6797702312469482 }, { "auxiliary_loss_clip": 0.06850702, "auxiliary_loss_mlp": 0.01330091, "balance_loss_clip": 0.0641076, "balance_loss_mlp": 0.01281215, "epoch": 0.07936269352171953, "flos": 21840804241920.0, "grad_norm": 3.0044716742400497, "language_loss": 0.8188991, "learning_rate": 3.974494692820539e-06, "loss": 0.90070707, "num_input_tokens_seen": 28027765, "router_z_loss_clip": 4.39453125, "router_z_loss_mlp": 0.48876953, "step": 1320, "time_per_iteration": 4.146268606185913 }, { "auxiliary_loss_clip": 0.06863311, "auxiliary_loss_mlp": 0.01322685, "balance_loss_clip": 0.06407386, "balance_loss_mlp": 0.01267753, "epoch": 0.07942281677438749, "flos": 16944477632640.0, "grad_norm": 2.44095413462841, "language_loss": 0.71808481, "learning_rate": 3.974432655485872e-06, "loss": 0.79994476, "num_input_tokens_seen": 28044225, "router_z_loss_clip": 4.5546875, "router_z_loss_mlp": 0.54956055, "step": 1321, "time_per_iteration": 4.038928508758545 }, { "auxiliary_loss_clip": 0.06838338, "auxiliary_loss_mlp": 0.0132268, "balance_loss_clip": 0.06400378, "balance_loss_mlp": 0.01270872, "epoch": 0.07948294002705546, "flos": 18992515927680.0, "grad_norm": 2.974604571777948, "language_loss": 0.85867155, "learning_rate": 3.9743705432805195e-06, "loss": 0.94028175, "num_input_tokens_seen": 28062915, "router_z_loss_clip": 4.375, "router_z_loss_mlp": 0.51855469, "step": 1322, "time_per_iteration": 2.6364011764526367 }, { "auxiliary_loss_clip": 0.06833766, "auxiliary_loss_mlp": 0.01323136, "balance_loss_clip": 0.06393819, "balance_loss_mlp": 0.01273617, "epoch": 0.07954306327972344, "flos": 21659983130880.0, "grad_norm": 2.949180421594881, "language_loss": 0.92558193, "learning_rate": 3.974308356206838e-06, "loss": 1.00715101, "num_input_tokens_seen": 28082175, "router_z_loss_clip": 4.3984375, "router_z_loss_mlp": 0.49536133, "step": 1323, "time_per_iteration": 2.624887228012085 }, { "auxiliary_loss_clip": 0.06826771, "auxiliary_loss_mlp": 0.01317773, "balance_loss_clip": 0.06395385, "balance_loss_mlp": 0.01269612, "epoch": 0.0796031865323914, "flos": 23226717173760.0, "grad_norm": 6.085384820418891, "language_loss": 0.83568341, "learning_rate": 3.974246094267187e-06, "loss": 0.9171288, "num_input_tokens_seen": 28102645, "router_z_loss_clip": 4.3125, "router_z_loss_mlp": 0.48168945, "step": 1324, "time_per_iteration": 2.6597259044647217 }, { "auxiliary_loss_clip": 0.06829728, "auxiliary_loss_mlp": 0.01313043, "balance_loss_clip": 0.06393737, "balance_loss_mlp": 0.01261664, "epoch": 0.07966330978505937, "flos": 23301209053440.0, "grad_norm": 3.4768423034524427, "language_loss": 0.82323319, "learning_rate": 3.974183757463925e-06, "loss": 0.90466094, "num_input_tokens_seen": 28122805, "router_z_loss_clip": 4.359375, "router_z_loss_mlp": 0.51342773, "step": 1325, "time_per_iteration": 4.044478893280029 }, { "auxiliary_loss_clip": 0.06821624, "auxiliary_loss_mlp": 0.01317229, "balance_loss_clip": 0.06389525, "balance_loss_mlp": 0.01268067, "epoch": 0.07972343303772735, "flos": 18368768534400.0, "grad_norm": 7.582808767681401, "language_loss": 0.90660149, "learning_rate": 3.974121345799418e-06, "loss": 0.98799002, "num_input_tokens_seen": 28140530, "router_z_loss_clip": 4.3203125, "router_z_loss_mlp": 0.49194336, "step": 1326, "time_per_iteration": 4.050646066665649 }, { "auxiliary_loss_clip": 0.06820662, "auxiliary_loss_mlp": 0.01317244, "balance_loss_clip": 0.06390231, "balance_loss_mlp": 0.01269846, "epoch": 0.07978355629039531, "flos": 21768995692800.0, "grad_norm": 3.4003819844004566, "language_loss": 0.84863716, "learning_rate": 3.974058859276032e-06, "loss": 0.93001628, "num_input_tokens_seen": 28159640, "router_z_loss_clip": 4.30859375, "router_z_loss_mlp": 0.47460938, "step": 1327, "time_per_iteration": 2.6256065368652344 }, { "auxiliary_loss_clip": 0.06839729, "auxiliary_loss_mlp": 0.01318323, "balance_loss_clip": 0.06387426, "balance_loss_mlp": 0.01263225, "epoch": 0.07984367954306328, "flos": 18557178439680.0, "grad_norm": 5.442886130606657, "language_loss": 0.8236919, "learning_rate": 3.9739962978961354e-06, "loss": 0.90527242, "num_input_tokens_seen": 28177050, "router_z_loss_clip": 4.52734375, "router_z_loss_mlp": 0.55053711, "step": 1328, "time_per_iteration": 2.594224214553833 }, { "auxiliary_loss_clip": 0.06832618, "auxiliary_loss_mlp": 0.0131136, "balance_loss_clip": 0.0638874, "balance_loss_mlp": 0.01261006, "epoch": 0.07990380279573125, "flos": 16908741066240.0, "grad_norm": 4.380631160267627, "language_loss": 0.77226627, "learning_rate": 3.973933661662101e-06, "loss": 0.85370612, "num_input_tokens_seen": 28193245, "router_z_loss_clip": 4.4375, "router_z_loss_mlp": 0.50341797, "step": 1329, "time_per_iteration": 2.5845048427581787 }, { "auxiliary_loss_clip": 0.06830715, "auxiliary_loss_mlp": 0.0132089, "balance_loss_clip": 0.06394874, "balance_loss_mlp": 0.0127056, "epoch": 0.07996392604839922, "flos": 24105358287360.0, "grad_norm": 2.71002775454259, "language_loss": 0.83401871, "learning_rate": 3.973870950576305e-06, "loss": 0.91553473, "num_input_tokens_seen": 28213570, "router_z_loss_clip": 4.35546875, "router_z_loss_mlp": 0.50390625, "step": 1330, "time_per_iteration": 2.6356656551361084 }, { "auxiliary_loss_clip": 0.06845114, "auxiliary_loss_mlp": 0.01321473, "balance_loss_clip": 0.06394225, "balance_loss_mlp": 0.01267781, "epoch": 0.08002404930106718, "flos": 14283257558400.0, "grad_norm": 25.419426183394144, "language_loss": 0.9080537, "learning_rate": 3.9738081646411255e-06, "loss": 0.98971951, "num_input_tokens_seen": 28229980, "router_z_loss_clip": 4.50390625, "router_z_loss_mlp": 0.53662109, "step": 1331, "time_per_iteration": 2.589137554168701 }, { "auxiliary_loss_clip": 0.06855188, "auxiliary_loss_mlp": 0.01317093, "balance_loss_clip": 0.06401541, "balance_loss_mlp": 0.01264617, "epoch": 0.08008417255373516, "flos": 40415732547840.0, "grad_norm": 2.5691608574064775, "language_loss": 0.75797516, "learning_rate": 3.973745303858942e-06, "loss": 0.83969796, "num_input_tokens_seen": 28253840, "router_z_loss_clip": 4.53515625, "router_z_loss_mlp": 0.5246582, "step": 1332, "time_per_iteration": 2.7843849658966064 }, { "auxiliary_loss_clip": 0.06832892, "auxiliary_loss_mlp": 0.01317043, "balance_loss_clip": 0.06405588, "balance_loss_mlp": 0.01271839, "epoch": 0.08014429580640313, "flos": 18484866766080.0, "grad_norm": 2.4797693890810333, "language_loss": 0.83837235, "learning_rate": 3.973682368232138e-06, "loss": 0.91987169, "num_input_tokens_seen": 28271675, "router_z_loss_clip": 4.26953125, "router_z_loss_mlp": 0.45214844, "step": 1333, "time_per_iteration": 2.6305739879608154 }, { "auxiliary_loss_clip": 0.0683224, "auxiliary_loss_mlp": 0.01327241, "balance_loss_clip": 0.06401684, "balance_loss_mlp": 0.01278269, "epoch": 0.0802044190590711, "flos": 22059835614720.0, "grad_norm": 3.614420298855631, "language_loss": 0.78118378, "learning_rate": 3.9736193577631015e-06, "loss": 0.86277866, "num_input_tokens_seen": 28291850, "router_z_loss_clip": 4.30859375, "router_z_loss_mlp": 0.48950195, "step": 1334, "time_per_iteration": 2.6693274974823 }, { "auxiliary_loss_clip": 0.06820589, "auxiliary_loss_mlp": 0.01323807, "balance_loss_clip": 0.0639246, "balance_loss_mlp": 0.01277459, "epoch": 0.08026454231173906, "flos": 24579115672320.0, "grad_norm": 4.534276716112402, "language_loss": 0.81566334, "learning_rate": 3.973556272454221e-06, "loss": 0.89710724, "num_input_tokens_seen": 28310780, "router_z_loss_clip": 4.27929688, "router_z_loss_mlp": 0.46362305, "step": 1335, "time_per_iteration": 2.6687629222869873 }, { "auxiliary_loss_clip": 0.06715396, "auxiliary_loss_mlp": 0.01283268, "balance_loss_clip": 0.06410095, "balance_loss_mlp": 0.01259032, "epoch": 0.08032466556440704, "flos": 52597716940800.0, "grad_norm": 0.7165168522171163, "language_loss": 0.56094319, "learning_rate": 3.973493112307889e-06, "loss": 0.64092982, "num_input_tokens_seen": 28369985, "router_z_loss_clip": 3.05859375, "router_z_loss_mlp": 0.24206543, "step": 1336, "time_per_iteration": 3.2713184356689453 }, { "auxiliary_loss_clip": 0.06807633, "auxiliary_loss_mlp": 0.0131851, "balance_loss_clip": 0.06379829, "balance_loss_mlp": 0.01271256, "epoch": 0.080384788817075, "flos": 23849500245120.0, "grad_norm": 2.553582137307675, "language_loss": 0.70179701, "learning_rate": 3.9734298773265005e-06, "loss": 0.7830584, "num_input_tokens_seen": 28388670, "router_z_loss_clip": 4.27734375, "router_z_loss_mlp": 0.47216797, "step": 1337, "time_per_iteration": 2.616483211517334 }, { "auxiliary_loss_clip": 0.06812733, "auxiliary_loss_mlp": 0.0131909, "balance_loss_clip": 0.06387404, "balance_loss_mlp": 0.01271264, "epoch": 0.08044491206974297, "flos": 25307640996480.0, "grad_norm": 3.0576243017080165, "language_loss": 0.89164704, "learning_rate": 3.973366567512453e-06, "loss": 0.97296524, "num_input_tokens_seen": 28411845, "router_z_loss_clip": 4.25, "router_z_loss_mlp": 0.47851562, "step": 1338, "time_per_iteration": 2.694410800933838 }, { "auxiliary_loss_clip": 0.06818192, "auxiliary_loss_mlp": 0.01314467, "balance_loss_clip": 0.06383583, "balance_loss_mlp": 0.01267832, "epoch": 0.08050503532241095, "flos": 22382093617920.0, "grad_norm": 5.399785500524944, "language_loss": 0.8930319, "learning_rate": 3.973303182868147e-06, "loss": 0.9743585, "num_input_tokens_seen": 28427875, "router_z_loss_clip": 4.34375, "router_z_loss_mlp": 0.46630859, "step": 1339, "time_per_iteration": 2.5886189937591553 }, { "auxiliary_loss_clip": 0.06803965, "auxiliary_loss_mlp": 0.01322074, "balance_loss_clip": 0.06387182, "balance_loss_mlp": 0.01279469, "epoch": 0.08056515857507891, "flos": 18375351079680.0, "grad_norm": 3.069940665664252, "language_loss": 0.91347075, "learning_rate": 3.973239723395988e-06, "loss": 0.99473119, "num_input_tokens_seen": 28446615, "router_z_loss_clip": 4.16601562, "router_z_loss_mlp": 0.42626953, "step": 1340, "time_per_iteration": 2.602062702178955 }, { "auxiliary_loss_clip": 0.0670572, "auxiliary_loss_mlp": 0.01284136, "balance_loss_clip": 0.06402789, "balance_loss_mlp": 0.01259245, "epoch": 0.08062528182774688, "flos": 51364938545280.0, "grad_norm": 0.8769421297221061, "language_loss": 0.65333545, "learning_rate": 3.97317618909838e-06, "loss": 0.73323399, "num_input_tokens_seen": 28505290, "router_z_loss_clip": 3.03125, "router_z_loss_mlp": 0.2487793, "step": 1341, "time_per_iteration": 3.19069242477417 }, { "auxiliary_loss_clip": 0.06831988, "auxiliary_loss_mlp": 0.01328634, "balance_loss_clip": 0.06392418, "balance_loss_mlp": 0.01277255, "epoch": 0.08068540508041486, "flos": 17604925914240.0, "grad_norm": 3.182461433726278, "language_loss": 0.90732449, "learning_rate": 3.973112579977733e-06, "loss": 0.9889307, "num_input_tokens_seen": 28522735, "router_z_loss_clip": 4.3984375, "router_z_loss_mlp": 0.51416016, "step": 1342, "time_per_iteration": 2.583099603652954 }, { "auxiliary_loss_clip": 0.06822106, "auxiliary_loss_mlp": 0.013427, "balance_loss_clip": 0.06395915, "balance_loss_mlp": 0.01293824, "epoch": 0.08074552833308282, "flos": 10565761714560.0, "grad_norm": 3.3225218750953465, "language_loss": 0.78267854, "learning_rate": 3.973048896036459e-06, "loss": 0.8643266, "num_input_tokens_seen": 28539460, "router_z_loss_clip": 4.265625, "router_z_loss_mlp": 0.48876953, "step": 1343, "time_per_iteration": 2.57952618598938 }, { "auxiliary_loss_clip": 0.06674941, "auxiliary_loss_mlp": 0.01298163, "balance_loss_clip": 0.06376521, "balance_loss_mlp": 0.01267836, "epoch": 0.08080565158575079, "flos": 60859624245120.0, "grad_norm": 0.7705640741052968, "language_loss": 0.57620555, "learning_rate": 3.972985137276974e-06, "loss": 0.65593654, "num_input_tokens_seen": 28599855, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.30273438, "step": 1344, "time_per_iteration": 3.1767115592956543 }, { "auxiliary_loss_clip": 0.0681165, "auxiliary_loss_mlp": 0.0132853, "balance_loss_clip": 0.06383555, "balance_loss_mlp": 0.01280775, "epoch": 0.08086577483841875, "flos": 18338188993920.0, "grad_norm": 3.201717534593794, "language_loss": 0.89504194, "learning_rate": 3.972921303701695e-06, "loss": 0.97644377, "num_input_tokens_seen": 28617585, "router_z_loss_clip": 4.28125, "router_z_loss_mlp": 0.47753906, "step": 1345, "time_per_iteration": 2.637646436691284 }, { "auxiliary_loss_clip": 0.06793234, "auxiliary_loss_mlp": 0.01340758, "balance_loss_clip": 0.06379046, "balance_loss_mlp": 0.01295601, "epoch": 0.08092589809108673, "flos": 21550048174080.0, "grad_norm": 2.466271519580033, "language_loss": 0.89533085, "learning_rate": 3.972857395313042e-06, "loss": 0.97667074, "num_input_tokens_seen": 28636355, "router_z_loss_clip": 4.14257812, "router_z_loss_mlp": 0.45141602, "step": 1346, "time_per_iteration": 2.6263370513916016 }, { "auxiliary_loss_clip": 0.06786372, "auxiliary_loss_mlp": 0.01334312, "balance_loss_clip": 0.06371249, "balance_loss_mlp": 0.01286366, "epoch": 0.0809860213437547, "flos": 22134662910720.0, "grad_norm": 2.8397684802280976, "language_loss": 0.94483101, "learning_rate": 3.972793412113439e-06, "loss": 1.02603781, "num_input_tokens_seen": 28656260, "router_z_loss_clip": 4.15234375, "router_z_loss_mlp": 0.47949219, "step": 1347, "time_per_iteration": 2.695847511291504 }, { "auxiliary_loss_clip": 0.06787719, "auxiliary_loss_mlp": 0.01333932, "balance_loss_clip": 0.06371319, "balance_loss_mlp": 0.01285557, "epoch": 0.08104614459642266, "flos": 21731875534080.0, "grad_norm": 3.085103494486459, "language_loss": 0.91226625, "learning_rate": 3.972729354105312e-06, "loss": 0.99348271, "num_input_tokens_seen": 28675865, "router_z_loss_clip": 4.1640625, "router_z_loss_mlp": 0.48364258, "step": 1348, "time_per_iteration": 2.677760124206543 }, { "auxiliary_loss_clip": 0.06791787, "auxiliary_loss_mlp": 0.01344909, "balance_loss_clip": 0.06376396, "balance_loss_mlp": 0.01297488, "epoch": 0.08110626784909064, "flos": 23958764369280.0, "grad_norm": 9.854538701333915, "language_loss": 0.77657855, "learning_rate": 3.97266522129109e-06, "loss": 0.85794556, "num_input_tokens_seen": 28696255, "router_z_loss_clip": 4.1484375, "router_z_loss_mlp": 0.47412109, "step": 1349, "time_per_iteration": 2.6929068565368652 }, { "auxiliary_loss_clip": 0.06799743, "auxiliary_loss_mlp": 0.01318099, "balance_loss_clip": 0.06374045, "balance_loss_mlp": 0.01266958, "epoch": 0.0811663911017586, "flos": 19031648584320.0, "grad_norm": 2.1674256085784407, "language_loss": 0.90093899, "learning_rate": 3.972601013673205e-06, "loss": 0.98211741, "num_input_tokens_seen": 28713905, "router_z_loss_clip": 4.26171875, "router_z_loss_mlp": 0.51171875, "step": 1350, "time_per_iteration": 2.704737901687622 }, { "auxiliary_loss_clip": 0.0678364, "auxiliary_loss_mlp": 0.01320365, "balance_loss_clip": 0.06369821, "balance_loss_mlp": 0.01272682, "epoch": 0.08122651435442657, "flos": 15346744778880.0, "grad_norm": 12.40077004841414, "language_loss": 0.84035999, "learning_rate": 3.972536731254092e-06, "loss": 0.92140007, "num_input_tokens_seen": 28732075, "router_z_loss_clip": 4.13867188, "router_z_loss_mlp": 0.47705078, "step": 1351, "time_per_iteration": 2.61253023147583 }, { "auxiliary_loss_clip": 0.06790473, "auxiliary_loss_mlp": 0.01317537, "balance_loss_clip": 0.06371726, "balance_loss_mlp": 0.01269043, "epoch": 0.08128663760709455, "flos": 23228226547200.0, "grad_norm": 2.43618836651162, "language_loss": 0.77113891, "learning_rate": 3.972472374036189e-06, "loss": 0.85221899, "num_input_tokens_seen": 28751150, "router_z_loss_clip": 4.1875, "router_z_loss_mlp": 0.48486328, "step": 1352, "time_per_iteration": 2.636263847351074 }, { "auxiliary_loss_clip": 0.06788935, "auxiliary_loss_mlp": 0.0131244, "balance_loss_clip": 0.06367129, "balance_loss_mlp": 0.01263087, "epoch": 0.08134676085976252, "flos": 22972158869760.0, "grad_norm": 3.8026771527838763, "language_loss": 0.84317464, "learning_rate": 3.972407942021935e-06, "loss": 0.92418838, "num_input_tokens_seen": 28773360, "router_z_loss_clip": 4.22265625, "router_z_loss_mlp": 0.49316406, "step": 1353, "time_per_iteration": 2.6695218086242676 }, { "auxiliary_loss_clip": 0.06649142, "auxiliary_loss_mlp": 0.01304123, "balance_loss_clip": 0.06358597, "balance_loss_mlp": 0.01277563, "epoch": 0.08140688411243048, "flos": 64338592642560.0, "grad_norm": 0.8280446838290786, "language_loss": 0.59788895, "learning_rate": 3.972343435213775e-06, "loss": 0.67742163, "num_input_tokens_seen": 28833390, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.26611328, "step": 1354, "time_per_iteration": 3.250324010848999 }, { "auxiliary_loss_clip": 0.06782897, "auxiliary_loss_mlp": 0.01303221, "balance_loss_clip": 0.06369577, "balance_loss_mlp": 0.01257302, "epoch": 0.08146700736509845, "flos": 22498401484800.0, "grad_norm": 5.393923005520776, "language_loss": 0.84419119, "learning_rate": 3.972278853614154e-06, "loss": 0.9250524, "num_input_tokens_seen": 28852430, "router_z_loss_clip": 4.1328125, "router_z_loss_mlp": 0.45947266, "step": 1355, "time_per_iteration": 2.624955892562866 }, { "auxiliary_loss_clip": 0.06786052, "auxiliary_loss_mlp": 0.01303189, "balance_loss_clip": 0.06369187, "balance_loss_mlp": 0.01255767, "epoch": 0.08152713061776642, "flos": 20453885061120.0, "grad_norm": 2.502646923100622, "language_loss": 0.73636764, "learning_rate": 3.972214197225521e-06, "loss": 0.81726003, "num_input_tokens_seen": 28870685, "router_z_loss_clip": 4.171875, "router_z_loss_mlp": 0.47412109, "step": 1356, "time_per_iteration": 2.6265954971313477 }, { "auxiliary_loss_clip": 0.06801346, "auxiliary_loss_mlp": 0.01307143, "balance_loss_clip": 0.06369761, "balance_loss_mlp": 0.01257456, "epoch": 0.08158725387043439, "flos": 23556983241600.0, "grad_norm": 3.9201975003617493, "language_loss": 0.72184783, "learning_rate": 3.972149466050329e-06, "loss": 0.80293274, "num_input_tokens_seen": 28889860, "router_z_loss_clip": 4.31640625, "router_z_loss_mlp": 0.49658203, "step": 1357, "time_per_iteration": 2.6566643714904785 }, { "auxiliary_loss_clip": 0.06807837, "auxiliary_loss_mlp": 0.01304274, "balance_loss_clip": 0.06379502, "balance_loss_mlp": 0.0125485, "epoch": 0.08164737712310235, "flos": 22023763632000.0, "grad_norm": 3.2811819391136448, "language_loss": 0.8567394, "learning_rate": 3.97208466009103e-06, "loss": 0.93786055, "num_input_tokens_seen": 28905865, "router_z_loss_clip": 4.27734375, "router_z_loss_mlp": 0.49438477, "step": 1358, "time_per_iteration": 2.611875057220459 }, { "auxiliary_loss_clip": 0.0680777, "auxiliary_loss_mlp": 0.01309468, "balance_loss_clip": 0.06376061, "balance_loss_mlp": 0.0126064, "epoch": 0.08170750037577033, "flos": 23374568903040.0, "grad_norm": 2.656503679499957, "language_loss": 1.04580355, "learning_rate": 3.972019779350084e-06, "loss": 1.12697589, "num_input_tokens_seen": 28925250, "router_z_loss_clip": 4.3203125, "router_z_loss_mlp": 0.48803711, "step": 1359, "time_per_iteration": 2.637932300567627 }, { "auxiliary_loss_clip": 0.06823578, "auxiliary_loss_mlp": 0.01314678, "balance_loss_clip": 0.06384859, "balance_loss_mlp": 0.01262941, "epoch": 0.0817676236284383, "flos": 28404743610240.0, "grad_norm": 6.067142582734608, "language_loss": 0.86466098, "learning_rate": 3.971954823829951e-06, "loss": 0.94604355, "num_input_tokens_seen": 28943445, "router_z_loss_clip": 4.38867188, "router_z_loss_mlp": 0.51733398, "step": 1360, "time_per_iteration": 4.077237844467163 }, { "auxiliary_loss_clip": 0.0681985, "auxiliary_loss_mlp": 0.01314399, "balance_loss_clip": 0.06382175, "balance_loss_mlp": 0.01264593, "epoch": 0.08182774688110626, "flos": 19215027244800.0, "grad_norm": 3.8831206054853147, "language_loss": 0.7412684, "learning_rate": 3.971889793533093e-06, "loss": 0.82261086, "num_input_tokens_seen": 28962695, "router_z_loss_clip": 4.375, "router_z_loss_mlp": 0.49804688, "step": 1361, "time_per_iteration": 4.015737771987915 }, { "auxiliary_loss_clip": 0.06808663, "auxiliary_loss_mlp": 0.01311046, "balance_loss_clip": 0.06382647, "balance_loss_mlp": 0.01266676, "epoch": 0.08188787013377424, "flos": 22790750780160.0, "grad_norm": 4.382825712564127, "language_loss": 0.78255737, "learning_rate": 3.971824688461976e-06, "loss": 0.86375439, "num_input_tokens_seen": 28982120, "router_z_loss_clip": 4.26171875, "router_z_loss_mlp": 0.44384766, "step": 1362, "time_per_iteration": 2.6487903594970703 }, { "auxiliary_loss_clip": 0.06806561, "auxiliary_loss_mlp": 0.01314057, "balance_loss_clip": 0.06380424, "balance_loss_mlp": 0.01267089, "epoch": 0.08194799338644221, "flos": 16473026234880.0, "grad_norm": 5.013685511376402, "language_loss": 0.7429502, "learning_rate": 3.971759508619069e-06, "loss": 0.8241564, "num_input_tokens_seen": 28998100, "router_z_loss_clip": 4.25976562, "router_z_loss_mlp": 0.4699707, "step": 1363, "time_per_iteration": 2.5814743041992188 }, { "auxiliary_loss_clip": 0.06815648, "auxiliary_loss_mlp": 0.01312633, "balance_loss_clip": 0.06383276, "balance_loss_mlp": 0.01262542, "epoch": 0.08200811663911017, "flos": 23920218691200.0, "grad_norm": 2.7475244758144215, "language_loss": 0.793661, "learning_rate": 3.971694254006844e-06, "loss": 0.87494373, "num_input_tokens_seen": 29017095, "router_z_loss_clip": 4.3203125, "router_z_loss_mlp": 0.5012207, "step": 1364, "time_per_iteration": 2.6444108486175537 }, { "auxiliary_loss_clip": 0.06812383, "auxiliary_loss_mlp": 0.01316843, "balance_loss_clip": 0.06378247, "balance_loss_mlp": 0.01268063, "epoch": 0.08206823989177814, "flos": 17902641870720.0, "grad_norm": 2.1979076093885763, "language_loss": 0.83498645, "learning_rate": 3.971628924627776e-06, "loss": 0.91627872, "num_input_tokens_seen": 29037240, "router_z_loss_clip": 4.34179688, "router_z_loss_mlp": 0.48852539, "step": 1365, "time_per_iteration": 4.065819025039673 }, { "auxiliary_loss_clip": 0.06794491, "auxiliary_loss_mlp": 0.01312058, "balance_loss_clip": 0.06371957, "balance_loss_mlp": 0.0126757, "epoch": 0.08212836314444612, "flos": 22094272442880.0, "grad_norm": 3.753823041364841, "language_loss": 0.83336103, "learning_rate": 3.97156352048434e-06, "loss": 0.91442651, "num_input_tokens_seen": 29056250, "router_z_loss_clip": 4.2265625, "router_z_loss_mlp": 0.4453125, "step": 1366, "time_per_iteration": 4.010633945465088 }, { "auxiliary_loss_clip": 0.06819817, "auxiliary_loss_mlp": 0.01330823, "balance_loss_clip": 0.06386814, "balance_loss_mlp": 0.01282448, "epoch": 0.08218848639711408, "flos": 17602326437760.0, "grad_norm": 2.3631074481455543, "language_loss": 0.83658195, "learning_rate": 3.97149804157902e-06, "loss": 0.91808838, "num_input_tokens_seen": 29073380, "router_z_loss_clip": 4.328125, "router_z_loss_mlp": 0.48388672, "step": 1367, "time_per_iteration": 2.603330612182617 }, { "auxiliary_loss_clip": 0.06819797, "auxiliary_loss_mlp": 0.01330981, "balance_loss_clip": 0.06375226, "balance_loss_mlp": 0.01281366, "epoch": 0.08224860964978205, "flos": 17863551141120.0, "grad_norm": 3.1241443568005822, "language_loss": 0.85783124, "learning_rate": 3.9714324879142946e-06, "loss": 0.93933892, "num_input_tokens_seen": 29091330, "router_z_loss_clip": 4.4453125, "router_z_loss_mlp": 0.49658203, "step": 1368, "time_per_iteration": 2.5958266258239746 }, { "auxiliary_loss_clip": 0.06781819, "auxiliary_loss_mlp": 0.01321693, "balance_loss_clip": 0.06368346, "balance_loss_mlp": 0.01276584, "epoch": 0.08230873290245003, "flos": 25234406928000.0, "grad_norm": 3.397298532549121, "language_loss": 0.82471359, "learning_rate": 3.971366859492653e-06, "loss": 0.90574867, "num_input_tokens_seen": 29110375, "router_z_loss_clip": 4.1328125, "router_z_loss_mlp": 0.45043945, "step": 1369, "time_per_iteration": 2.676534414291382 }, { "auxiliary_loss_clip": 0.06790002, "auxiliary_loss_mlp": 0.0132823, "balance_loss_clip": 0.06363969, "balance_loss_mlp": 0.01282358, "epoch": 0.08236885615511799, "flos": 31768144099200.0, "grad_norm": 3.49227460765547, "language_loss": 0.77147663, "learning_rate": 3.971301156316582e-06, "loss": 0.85265893, "num_input_tokens_seen": 29129395, "router_z_loss_clip": 4.26171875, "router_z_loss_mlp": 0.45874023, "step": 1370, "time_per_iteration": 2.6993091106414795 }, { "auxiliary_loss_clip": 0.06801488, "auxiliary_loss_mlp": 0.01320529, "balance_loss_clip": 0.06368399, "balance_loss_mlp": 0.01271009, "epoch": 0.08242897940778596, "flos": 23192615761920.0, "grad_norm": 2.588205091719104, "language_loss": 0.76232147, "learning_rate": 3.971235378388573e-06, "loss": 0.84354162, "num_input_tokens_seen": 29148650, "router_z_loss_clip": 4.328125, "router_z_loss_mlp": 0.49536133, "step": 1371, "time_per_iteration": 2.642458915710449 }, { "auxiliary_loss_clip": 0.06786287, "auxiliary_loss_mlp": 0.01322759, "balance_loss_clip": 0.06360248, "balance_loss_mlp": 0.01275958, "epoch": 0.08248910266045394, "flos": 34499327932800.0, "grad_norm": 3.46396095752924, "language_loss": 0.72532398, "learning_rate": 3.971169525711122e-06, "loss": 0.80641448, "num_input_tokens_seen": 29170785, "router_z_loss_clip": 4.25585938, "router_z_loss_mlp": 0.46801758, "step": 1372, "time_per_iteration": 2.7164242267608643 }, { "auxiliary_loss_clip": 0.06809958, "auxiliary_loss_mlp": 0.01312753, "balance_loss_clip": 0.06367619, "balance_loss_mlp": 0.01258441, "epoch": 0.0825492259131219, "flos": 13440059521920.0, "grad_norm": 3.783098435401879, "language_loss": 0.90802908, "learning_rate": 3.9711035982867246e-06, "loss": 0.98925614, "num_input_tokens_seen": 29185210, "router_z_loss_clip": 4.42578125, "router_z_loss_mlp": 0.54345703, "step": 1373, "time_per_iteration": 2.587191104888916 }, { "auxiliary_loss_clip": 0.06801145, "auxiliary_loss_mlp": 0.01316271, "balance_loss_clip": 0.06369428, "balance_loss_mlp": 0.01267467, "epoch": 0.08260934916578987, "flos": 25819608643200.0, "grad_norm": 14.32389690587468, "language_loss": 0.84578776, "learning_rate": 3.971037596117882e-06, "loss": 0.92696196, "num_input_tokens_seen": 29205210, "router_z_loss_clip": 4.3203125, "router_z_loss_mlp": 0.48852539, "step": 1374, "time_per_iteration": 2.6687963008880615 }, { "auxiliary_loss_clip": 0.0666659, "auxiliary_loss_mlp": 0.01288498, "balance_loss_clip": 0.06379743, "balance_loss_mlp": 0.01265777, "epoch": 0.08266947241845783, "flos": 63478609061760.0, "grad_norm": 0.812524615545664, "language_loss": 0.60640192, "learning_rate": 3.970971519207095e-06, "loss": 0.68595278, "num_input_tokens_seen": 29265350, "router_z_loss_clip": 2.87304688, "router_z_loss_mlp": 0.22705078, "step": 1375, "time_per_iteration": 3.21915864944458 }, { "auxiliary_loss_clip": 0.06666552, "auxiliary_loss_mlp": 0.01291942, "balance_loss_clip": 0.06379029, "balance_loss_mlp": 0.01269245, "epoch": 0.08272959567112581, "flos": 70013855606400.0, "grad_norm": 0.9206897445973669, "language_loss": 0.6219722, "learning_rate": 3.970905367556871e-06, "loss": 0.70155716, "num_input_tokens_seen": 29321475, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.22680664, "step": 1376, "time_per_iteration": 3.1745214462280273 }, { "auxiliary_loss_clip": 0.06782143, "auxiliary_loss_mlp": 0.01312463, "balance_loss_clip": 0.06360729, "balance_loss_mlp": 0.01265256, "epoch": 0.08278971892379378, "flos": 20419574014080.0, "grad_norm": 4.338962934527115, "language_loss": 0.84359241, "learning_rate": 3.970839141169718e-06, "loss": 0.92453843, "num_input_tokens_seen": 29341405, "router_z_loss_clip": 4.21484375, "router_z_loss_mlp": 0.47192383, "step": 1377, "time_per_iteration": 2.7606139183044434 }, { "auxiliary_loss_clip": 0.06774974, "auxiliary_loss_mlp": 0.01306834, "balance_loss_clip": 0.06358546, "balance_loss_mlp": 0.01257815, "epoch": 0.08284984217646174, "flos": 26257461753600.0, "grad_norm": 2.7790543155815053, "language_loss": 0.85870367, "learning_rate": 3.970772840048147e-06, "loss": 0.93952179, "num_input_tokens_seen": 29361955, "router_z_loss_clip": 4.16796875, "router_z_loss_mlp": 0.48999023, "step": 1378, "time_per_iteration": 2.693389415740967 }, { "auxiliary_loss_clip": 0.06790127, "auxiliary_loss_mlp": 0.01309582, "balance_loss_clip": 0.06363343, "balance_loss_mlp": 0.01260778, "epoch": 0.08290996542912972, "flos": 27201370798080.0, "grad_norm": 5.04701915476586, "language_loss": 0.89570999, "learning_rate": 3.970706464194672e-06, "loss": 0.97670704, "num_input_tokens_seen": 29382395, "router_z_loss_clip": 4.26367188, "router_z_loss_mlp": 0.48852539, "step": 1379, "time_per_iteration": 2.712203025817871 }, { "auxiliary_loss_clip": 0.06780834, "auxiliary_loss_mlp": 0.01302933, "balance_loss_clip": 0.06363746, "balance_loss_mlp": 0.01257967, "epoch": 0.08297008868179769, "flos": 38627367655680.0, "grad_norm": 3.1523568534515487, "language_loss": 0.8042016, "learning_rate": 3.970640013611812e-06, "loss": 0.88503933, "num_input_tokens_seen": 29404460, "router_z_loss_clip": 4.16796875, "router_z_loss_mlp": 0.44970703, "step": 1380, "time_per_iteration": 2.7713801860809326 }, { "auxiliary_loss_clip": 0.0678175, "auxiliary_loss_mlp": 0.01309654, "balance_loss_clip": 0.06368162, "balance_loss_mlp": 0.01261183, "epoch": 0.08303021193446565, "flos": 19980924289920.0, "grad_norm": 4.758868496653788, "language_loss": 0.88349688, "learning_rate": 3.970573488302083e-06, "loss": 0.9644109, "num_input_tokens_seen": 29422675, "router_z_loss_clip": 4.13671875, "router_z_loss_mlp": 0.48413086, "step": 1381, "time_per_iteration": 2.6007914543151855 }, { "auxiliary_loss_clip": 0.068161, "auxiliary_loss_mlp": 0.01312173, "balance_loss_clip": 0.06378001, "balance_loss_mlp": 0.01262105, "epoch": 0.08309033518713363, "flos": 13667769792000.0, "grad_norm": 3.944718050803525, "language_loss": 0.91056073, "learning_rate": 3.970506888268011e-06, "loss": 0.99184346, "num_input_tokens_seen": 29439840, "router_z_loss_clip": 4.3828125, "router_z_loss_mlp": 0.50097656, "step": 1382, "time_per_iteration": 2.5953328609466553 }, { "auxiliary_loss_clip": 0.06799466, "auxiliary_loss_mlp": 0.01313212, "balance_loss_clip": 0.06373025, "balance_loss_mlp": 0.01263621, "epoch": 0.0831504584398016, "flos": 17974492346880.0, "grad_norm": 2.9898421623775215, "language_loss": 0.78803343, "learning_rate": 3.970440213512121e-06, "loss": 0.86916023, "num_input_tokens_seen": 29457360, "router_z_loss_clip": 4.265625, "router_z_loss_mlp": 0.49609375, "step": 1383, "time_per_iteration": 2.6213021278381348 }, { "auxiliary_loss_clip": 0.06798691, "auxiliary_loss_mlp": 0.01310299, "balance_loss_clip": 0.0637487, "balance_loss_mlp": 0.0126252, "epoch": 0.08321058169246956, "flos": 22607959098240.0, "grad_norm": 4.714327655114364, "language_loss": 0.85144502, "learning_rate": 3.97037346403694e-06, "loss": 0.93253493, "num_input_tokens_seen": 29477040, "router_z_loss_clip": 4.2421875, "router_z_loss_mlp": 0.47827148, "step": 1384, "time_per_iteration": 2.673546314239502 }, { "auxiliary_loss_clip": 0.06837188, "auxiliary_loss_mlp": 0.01317228, "balance_loss_clip": 0.06385852, "balance_loss_mlp": 0.01262344, "epoch": 0.08327070494513754, "flos": 22855976784000.0, "grad_norm": 3.484184388863046, "language_loss": 0.86550218, "learning_rate": 3.970306639845e-06, "loss": 0.94704628, "num_input_tokens_seen": 29492010, "router_z_loss_clip": 4.51367188, "router_z_loss_mlp": 0.54858398, "step": 1385, "time_per_iteration": 2.6665520668029785 }, { "auxiliary_loss_clip": 0.06816041, "auxiliary_loss_mlp": 0.01308586, "balance_loss_clip": 0.06377316, "balance_loss_mlp": 0.01256038, "epoch": 0.0833308281978055, "flos": 22789451041920.0, "grad_norm": 6.097153059124893, "language_loss": 0.71590078, "learning_rate": 3.970239740938835e-06, "loss": 0.79714704, "num_input_tokens_seen": 29511850, "router_z_loss_clip": 4.390625, "router_z_loss_mlp": 0.52539062, "step": 1386, "time_per_iteration": 2.629229784011841 }, { "auxiliary_loss_clip": 0.06800511, "auxiliary_loss_mlp": 0.01308201, "balance_loss_clip": 0.06373021, "balance_loss_mlp": 0.01259183, "epoch": 0.08339095145047347, "flos": 20818713738240.0, "grad_norm": 3.2289156378264905, "language_loss": 0.84164453, "learning_rate": 3.97017276732098e-06, "loss": 0.92273164, "num_input_tokens_seen": 29531415, "router_z_loss_clip": 4.2734375, "router_z_loss_mlp": 0.49047852, "step": 1387, "time_per_iteration": 2.614027261734009 }, { "auxiliary_loss_clip": 0.06814528, "auxiliary_loss_mlp": 0.01307421, "balance_loss_clip": 0.06379831, "balance_loss_mlp": 0.01254659, "epoch": 0.08345107470314143, "flos": 18521274165120.0, "grad_norm": 3.2122733400710795, "language_loss": 0.79672682, "learning_rate": 3.970105718993978e-06, "loss": 0.87794632, "num_input_tokens_seen": 29549525, "router_z_loss_clip": 4.34375, "router_z_loss_mlp": 0.52734375, "step": 1388, "time_per_iteration": 2.596893548965454 }, { "auxiliary_loss_clip": 0.06807838, "auxiliary_loss_mlp": 0.0131394, "balance_loss_clip": 0.06388763, "balance_loss_mlp": 0.01263419, "epoch": 0.08351119795580941, "flos": 18813623460480.0, "grad_norm": 3.8290772092105465, "language_loss": 0.81710732, "learning_rate": 3.970038595960369e-06, "loss": 0.89832509, "num_input_tokens_seen": 29568705, "router_z_loss_clip": 4.1953125, "router_z_loss_mlp": 0.50537109, "step": 1389, "time_per_iteration": 2.605808734893799 }, { "auxiliary_loss_clip": 0.06830563, "auxiliary_loss_mlp": 0.0132061, "balance_loss_clip": 0.06390934, "balance_loss_mlp": 0.01266656, "epoch": 0.08357132120847738, "flos": 18447662753280.0, "grad_norm": 3.754215189444604, "language_loss": 0.89016169, "learning_rate": 3.969971398222699e-06, "loss": 0.97167337, "num_input_tokens_seen": 29585855, "router_z_loss_clip": 4.3984375, "router_z_loss_mlp": 0.53979492, "step": 1390, "time_per_iteration": 2.600830554962158 }, { "auxiliary_loss_clip": 0.06817445, "auxiliary_loss_mlp": 0.01326677, "balance_loss_clip": 0.06391303, "balance_loss_mlp": 0.01275346, "epoch": 0.08363144446114534, "flos": 25929585527040.0, "grad_norm": 2.0459034063949173, "language_loss": 0.88455319, "learning_rate": 3.969904125783517e-06, "loss": 0.96599436, "num_input_tokens_seen": 29607280, "router_z_loss_clip": 4.265625, "router_z_loss_mlp": 0.51367188, "step": 1391, "time_per_iteration": 2.7147068977355957 }, { "auxiliary_loss_clip": 0.06856215, "auxiliary_loss_mlp": 0.01324856, "balance_loss_clip": 0.06403765, "balance_loss_mlp": 0.01268207, "epoch": 0.08369156771381332, "flos": 18047223290880.0, "grad_norm": 6.662773268850137, "language_loss": 0.90752006, "learning_rate": 3.969836778645371e-06, "loss": 0.98933083, "num_input_tokens_seen": 29624130, "router_z_loss_clip": 4.515625, "router_z_loss_mlp": 0.56665039, "step": 1392, "time_per_iteration": 2.6092424392700195 }, { "auxiliary_loss_clip": 0.06837721, "auxiliary_loss_mlp": 0.01332149, "balance_loss_clip": 0.06412439, "balance_loss_mlp": 0.01281223, "epoch": 0.08375169096648129, "flos": 22681822072320.0, "grad_norm": 10.000027001150544, "language_loss": 0.82539105, "learning_rate": 3.969769356810819e-06, "loss": 0.90708977, "num_input_tokens_seen": 29643210, "router_z_loss_clip": 4.25, "router_z_loss_mlp": 0.5090332, "step": 1393, "time_per_iteration": 2.69884991645813 }, { "auxiliary_loss_clip": 0.06827004, "auxiliary_loss_mlp": 0.01340855, "balance_loss_clip": 0.0641011, "balance_loss_mlp": 0.01290024, "epoch": 0.08381181421914925, "flos": 26110238929920.0, "grad_norm": 2.1530089483442563, "language_loss": 0.86808723, "learning_rate": 3.969701860282415e-06, "loss": 0.94976574, "num_input_tokens_seen": 29663920, "router_z_loss_clip": 4.16601562, "router_z_loss_mlp": 0.50756836, "step": 1394, "time_per_iteration": 2.796704053878784 }, { "auxiliary_loss_clip": 0.06844091, "auxiliary_loss_mlp": 0.01328061, "balance_loss_clip": 0.06419879, "balance_loss_mlp": 0.01278756, "epoch": 0.08387193747181723, "flos": 20635796275200.0, "grad_norm": 15.858206316424011, "language_loss": 0.84419626, "learning_rate": 3.969634289062719e-06, "loss": 0.9259178, "num_input_tokens_seen": 29683825, "router_z_loss_clip": 4.2421875, "router_z_loss_mlp": 0.4934082, "step": 1395, "time_per_iteration": 2.7117879390716553 }, { "auxiliary_loss_clip": 0.06862187, "auxiliary_loss_mlp": 0.01351346, "balance_loss_clip": 0.06425214, "balance_loss_mlp": 0.01296248, "epoch": 0.0839320607244852, "flos": 13448193367680.0, "grad_norm": 3.4520913021521875, "language_loss": 0.85134453, "learning_rate": 3.969566643154293e-06, "loss": 0.93347979, "num_input_tokens_seen": 29698775, "router_z_loss_clip": 4.3671875, "router_z_loss_mlp": 0.55102539, "step": 1396, "time_per_iteration": 2.689019203186035 }, { "auxiliary_loss_clip": 0.06847882, "auxiliary_loss_mlp": 0.01332932, "balance_loss_clip": 0.06426952, "balance_loss_mlp": 0.01282816, "epoch": 0.08399218397715316, "flos": 23484000735360.0, "grad_norm": 3.220931169600806, "language_loss": 0.78643376, "learning_rate": 3.969498922559703e-06, "loss": 0.86824191, "num_input_tokens_seen": 29719430, "router_z_loss_clip": 4.20703125, "router_z_loss_mlp": 0.50146484, "step": 1397, "time_per_iteration": 2.838592052459717 }, { "auxiliary_loss_clip": 0.06845379, "auxiliary_loss_mlp": 0.01339567, "balance_loss_clip": 0.06424718, "balance_loss_mlp": 0.01287854, "epoch": 0.08405230722982113, "flos": 25927698810240.0, "grad_norm": 3.8700164603076423, "language_loss": 0.79865229, "learning_rate": 3.969431127281516e-06, "loss": 0.88050175, "num_input_tokens_seen": 29739685, "router_z_loss_clip": 4.203125, "router_z_loss_mlp": 0.5168457, "step": 1398, "time_per_iteration": 2.717039108276367 }, { "auxiliary_loss_clip": 0.06843504, "auxiliary_loss_mlp": 0.0133697, "balance_loss_clip": 0.06432187, "balance_loss_mlp": 0.01289573, "epoch": 0.0841124304824891, "flos": 17973192608640.0, "grad_norm": 3.658615116982572, "language_loss": 0.97116327, "learning_rate": 3.969363257322304e-06, "loss": 1.05296791, "num_input_tokens_seen": 29756165, "router_z_loss_clip": 4.11523438, "router_z_loss_mlp": 0.47387695, "step": 1399, "time_per_iteration": 4.044884443283081 }, { "auxiliary_loss_clip": 0.06863065, "auxiliary_loss_mlp": 0.0133894, "balance_loss_clip": 0.06415625, "balance_loss_mlp": 0.01284462, "epoch": 0.08417255373515707, "flos": 25636733107200.0, "grad_norm": 2.8377833584488528, "language_loss": 0.84039676, "learning_rate": 3.96929531268464e-06, "loss": 0.92241687, "num_input_tokens_seen": 29776425, "router_z_loss_clip": 4.48046875, "router_z_loss_mlp": 0.54492188, "step": 1400, "time_per_iteration": 4.110914707183838 }, { "auxiliary_loss_clip": 0.06840687, "auxiliary_loss_mlp": 0.01327857, "balance_loss_clip": 0.06411987, "balance_loss_mlp": 0.01277264, "epoch": 0.08423267698782504, "flos": 26256874775040.0, "grad_norm": 3.7603584461432975, "language_loss": 0.87724417, "learning_rate": 3.969227293371099e-06, "loss": 0.95892954, "num_input_tokens_seen": 29796440, "router_z_loss_clip": 4.28515625, "router_z_loss_mlp": 0.50634766, "step": 1401, "time_per_iteration": 2.7151927947998047 }, { "auxiliary_loss_clip": 0.06833786, "auxiliary_loss_mlp": 0.01336513, "balance_loss_clip": 0.06411898, "balance_loss_mlp": 0.01288043, "epoch": 0.08429280024049302, "flos": 20125757272320.0, "grad_norm": 2.7264121687011014, "language_loss": 0.89651525, "learning_rate": 3.969159199384263e-06, "loss": 0.9782182, "num_input_tokens_seen": 29814755, "router_z_loss_clip": 4.21289062, "router_z_loss_mlp": 0.48486328, "step": 1402, "time_per_iteration": 2.631727695465088 }, { "auxiliary_loss_clip": 0.06827483, "auxiliary_loss_mlp": 0.01320622, "balance_loss_clip": 0.06409387, "balance_loss_mlp": 0.01274178, "epoch": 0.08435292349316098, "flos": 42934593335040.0, "grad_norm": 3.2902986947089152, "language_loss": 0.91102475, "learning_rate": 3.9690910307267125e-06, "loss": 0.99250579, "num_input_tokens_seen": 29834785, "router_z_loss_clip": 4.17773438, "router_z_loss_mlp": 0.46435547, "step": 1403, "time_per_iteration": 2.819493532180786 }, { "auxiliary_loss_clip": 0.06822287, "auxiliary_loss_mlp": 0.01327252, "balance_loss_clip": 0.06390239, "balance_loss_mlp": 0.01277494, "epoch": 0.08441304674582895, "flos": 22863984848640.0, "grad_norm": 3.415753201324661, "language_loss": 0.81896323, "learning_rate": 3.969022787401033e-06, "loss": 0.90045863, "num_input_tokens_seen": 29854695, "router_z_loss_clip": 4.32421875, "router_z_loss_mlp": 0.49731445, "step": 1404, "time_per_iteration": 4.083405256271362 }, { "auxiliary_loss_clip": 0.06842888, "auxiliary_loss_mlp": 0.01322709, "balance_loss_clip": 0.06401537, "balance_loss_mlp": 0.01274214, "epoch": 0.08447316999849692, "flos": 18703436941440.0, "grad_norm": 5.454101462896606, "language_loss": 0.85605884, "learning_rate": 3.968954469409811e-06, "loss": 0.93771482, "num_input_tokens_seen": 29872180, "router_z_loss_clip": 4.41796875, "router_z_loss_mlp": 0.48486328, "step": 1405, "time_per_iteration": 3.994919776916504 }, { "auxiliary_loss_clip": 0.06819507, "auxiliary_loss_mlp": 0.01313325, "balance_loss_clip": 0.06399807, "balance_loss_mlp": 0.01267429, "epoch": 0.08453329325116489, "flos": 25491061584000.0, "grad_norm": 3.7058050315766935, "language_loss": 0.81874073, "learning_rate": 3.968886076755639e-06, "loss": 0.90006906, "num_input_tokens_seen": 29893205, "router_z_loss_clip": 4.20117188, "router_z_loss_mlp": 0.45922852, "step": 1406, "time_per_iteration": 2.6766669750213623 }, { "auxiliary_loss_clip": 0.06822647, "auxiliary_loss_mlp": 0.01321282, "balance_loss_clip": 0.06396304, "balance_loss_mlp": 0.01271214, "epoch": 0.08459341650383286, "flos": 20925839583360.0, "grad_norm": 10.439145172425087, "language_loss": 0.81429958, "learning_rate": 3.96881760944111e-06, "loss": 0.8957389, "num_input_tokens_seen": 29911970, "router_z_loss_clip": 4.26757812, "router_z_loss_mlp": 0.50073242, "step": 1407, "time_per_iteration": 2.6134438514709473 }, { "auxiliary_loss_clip": 0.06819937, "auxiliary_loss_mlp": 0.01313194, "balance_loss_clip": 0.06393565, "balance_loss_mlp": 0.01264461, "epoch": 0.08465353975650082, "flos": 13048215102720.0, "grad_norm": 4.032208634033973, "language_loss": 0.93168592, "learning_rate": 3.968749067468819e-06, "loss": 1.01301718, "num_input_tokens_seen": 29929925, "router_z_loss_clip": 4.26953125, "router_z_loss_mlp": 0.48754883, "step": 1408, "time_per_iteration": 2.613210916519165 }, { "auxiliary_loss_clip": 0.06753281, "auxiliary_loss_mlp": 0.01290439, "balance_loss_clip": 0.0647053, "balance_loss_mlp": 0.01268838, "epoch": 0.0847136630091688, "flos": 60896912112000.0, "grad_norm": 0.8850865043161644, "language_loss": 0.62198818, "learning_rate": 3.968680450841368e-06, "loss": 0.70242536, "num_input_tokens_seen": 29985950, "router_z_loss_clip": 2.828125, "router_z_loss_mlp": 0.21618652, "step": 1409, "time_per_iteration": 3.2772274017333984 }, { "auxiliary_loss_clip": 0.0680154, "auxiliary_loss_mlp": 0.01308393, "balance_loss_clip": 0.06389298, "balance_loss_mlp": 0.01262164, "epoch": 0.08477378626183676, "flos": 22051743696000.0, "grad_norm": 5.435559659134362, "language_loss": 0.88542092, "learning_rate": 3.968611759561355e-06, "loss": 0.96652025, "num_input_tokens_seen": 30004330, "router_z_loss_clip": 4.1171875, "router_z_loss_mlp": 0.46191406, "step": 1410, "time_per_iteration": 2.622310161590576 }, { "auxiliary_loss_clip": 0.06805363, "auxiliary_loss_mlp": 0.01307432, "balance_loss_clip": 0.06378859, "balance_loss_mlp": 0.01259199, "epoch": 0.08483390951450473, "flos": 16695537552000.0, "grad_norm": 3.882565952569812, "language_loss": 0.75618672, "learning_rate": 3.968542993631388e-06, "loss": 0.83731461, "num_input_tokens_seen": 30022555, "router_z_loss_clip": 4.26953125, "router_z_loss_mlp": 0.48266602, "step": 1411, "time_per_iteration": 2.6063525676727295 }, { "auxiliary_loss_clip": 0.06753913, "auxiliary_loss_mlp": 0.01309647, "balance_loss_clip": 0.0646983, "balance_loss_mlp": 0.01284923, "epoch": 0.08489403276717271, "flos": 51604430313600.0, "grad_norm": 0.9446460825263643, "language_loss": 0.57076931, "learning_rate": 3.968474153054073e-06, "loss": 0.65140486, "num_input_tokens_seen": 30077220, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.24731445, "step": 1412, "time_per_iteration": 3.1715779304504395 }, { "auxiliary_loss_clip": 0.06796849, "auxiliary_loss_mlp": 0.01304622, "balance_loss_clip": 0.06374292, "balance_loss_mlp": 0.01257725, "epoch": 0.08495415601984067, "flos": 17098031439360.0, "grad_norm": 6.1003185342553214, "language_loss": 0.91702032, "learning_rate": 3.96840523783202e-06, "loss": 0.99803507, "num_input_tokens_seen": 30094600, "router_z_loss_clip": 4.22460938, "router_z_loss_mlp": 0.46923828, "step": 1413, "time_per_iteration": 2.6105144023895264 }, { "auxiliary_loss_clip": 0.06783018, "auxiliary_loss_mlp": 0.01305441, "balance_loss_clip": 0.06369048, "balance_loss_mlp": 0.01258091, "epoch": 0.08501427927250864, "flos": 23155034405760.0, "grad_norm": 10.17857001022725, "language_loss": 0.90435195, "learning_rate": 3.968336247967844e-06, "loss": 0.98523653, "num_input_tokens_seen": 30114475, "router_z_loss_clip": 4.14453125, "router_z_loss_mlp": 0.47314453, "step": 1414, "time_per_iteration": 2.627969264984131 }, { "auxiliary_loss_clip": 0.0679217, "auxiliary_loss_mlp": 0.01302544, "balance_loss_clip": 0.06369887, "balance_loss_mlp": 0.0125641, "epoch": 0.08507440252517662, "flos": 19069649210880.0, "grad_norm": 4.1997621613109555, "language_loss": 0.78762352, "learning_rate": 3.96826718346416e-06, "loss": 0.86857063, "num_input_tokens_seen": 30133350, "router_z_loss_clip": 4.21875, "router_z_loss_mlp": 0.46142578, "step": 1415, "time_per_iteration": 2.5945403575897217 }, { "auxiliary_loss_clip": 0.0677311, "auxiliary_loss_mlp": 0.01308116, "balance_loss_clip": 0.06362577, "balance_loss_mlp": 0.01258501, "epoch": 0.08513452577784458, "flos": 60195249550080.0, "grad_norm": 3.735941323494289, "language_loss": 0.72296226, "learning_rate": 3.968198044323587e-06, "loss": 0.80377454, "num_input_tokens_seen": 30159005, "router_z_loss_clip": 4.10351562, "router_z_loss_mlp": 0.49584961, "step": 1416, "time_per_iteration": 2.9714183807373047 }, { "auxiliary_loss_clip": 0.06807504, "auxiliary_loss_mlp": 0.01306375, "balance_loss_clip": 0.06363998, "balance_loss_mlp": 0.0125645, "epoch": 0.08519464903051255, "flos": 27315917729280.0, "grad_norm": 8.995517615923605, "language_loss": 0.76934201, "learning_rate": 3.968128830548748e-06, "loss": 0.85048079, "num_input_tokens_seen": 30179450, "router_z_loss_clip": 4.4296875, "router_z_loss_mlp": 0.49902344, "step": 1417, "time_per_iteration": 2.6542632579803467 }, { "auxiliary_loss_clip": 0.06795413, "auxiliary_loss_mlp": 0.0130964, "balance_loss_clip": 0.06365018, "balance_loss_mlp": 0.01260478, "epoch": 0.08525477228318051, "flos": 20272644679680.0, "grad_norm": 3.2689436723518503, "language_loss": 0.83625793, "learning_rate": 3.968059542142265e-06, "loss": 0.91730845, "num_input_tokens_seen": 30197235, "router_z_loss_clip": 4.30078125, "router_z_loss_mlp": 0.49169922, "step": 1418, "time_per_iteration": 2.599815845489502 }, { "auxiliary_loss_clip": 0.06684311, "auxiliary_loss_mlp": 0.01282055, "balance_loss_clip": 0.06403776, "balance_loss_mlp": 0.01259489, "epoch": 0.08531489553584849, "flos": 67633580672640.0, "grad_norm": 0.8389511495812875, "language_loss": 0.5665769, "learning_rate": 3.9679901791067685e-06, "loss": 0.64624059, "num_input_tokens_seen": 30257410, "router_z_loss_clip": 2.81054688, "router_z_loss_mlp": 0.22558594, "step": 1419, "time_per_iteration": 3.1720848083496094 }, { "auxiliary_loss_clip": 0.06802432, "auxiliary_loss_mlp": 0.01308527, "balance_loss_clip": 0.06368974, "balance_loss_mlp": 0.0126182, "epoch": 0.08537501878851646, "flos": 27534362123520.0, "grad_norm": 3.728861337044891, "language_loss": 0.72044092, "learning_rate": 3.967920741444886e-06, "loss": 0.80155051, "num_input_tokens_seen": 30277865, "router_z_loss_clip": 4.33789062, "router_z_loss_mlp": 0.46704102, "step": 1420, "time_per_iteration": 2.7777280807495117 }, { "auxiliary_loss_clip": 0.06779231, "auxiliary_loss_mlp": 0.01309625, "balance_loss_clip": 0.06355698, "balance_loss_mlp": 0.01260106, "epoch": 0.08543514204118442, "flos": 22790918488320.0, "grad_norm": 5.686485740744795, "language_loss": 0.89614189, "learning_rate": 3.967851229159252e-06, "loss": 0.9770304, "num_input_tokens_seen": 30298545, "router_z_loss_clip": 4.23828125, "router_z_loss_mlp": 0.49560547, "step": 1421, "time_per_iteration": 2.641845941543579 }, { "auxiliary_loss_clip": 0.06679884, "auxiliary_loss_mlp": 0.01285856, "balance_loss_clip": 0.06402113, "balance_loss_mlp": 0.01263409, "epoch": 0.0854952652938524, "flos": 61010872064640.0, "grad_norm": 0.7790313220915479, "language_loss": 0.63634527, "learning_rate": 3.967781642252502e-06, "loss": 0.7160027, "num_input_tokens_seen": 30361725, "router_z_loss_clip": 2.78125, "router_z_loss_mlp": 0.2244873, "step": 1422, "time_per_iteration": 3.321504592895508 }, { "auxiliary_loss_clip": 0.06775406, "auxiliary_loss_mlp": 0.01307621, "balance_loss_clip": 0.06354707, "balance_loss_mlp": 0.01260295, "epoch": 0.08555538854652037, "flos": 28045575083520.0, "grad_norm": 3.1839200366338716, "language_loss": 0.85007787, "learning_rate": 3.967711980727276e-06, "loss": 0.9309082, "num_input_tokens_seen": 30382180, "router_z_loss_clip": 4.20507812, "router_z_loss_mlp": 0.47314453, "step": 1423, "time_per_iteration": 2.6888504028320312 }, { "auxiliary_loss_clip": 0.06787091, "auxiliary_loss_mlp": 0.0130356, "balance_loss_clip": 0.06360577, "balance_loss_mlp": 0.01257593, "epoch": 0.08561551179918833, "flos": 23515293035520.0, "grad_norm": 3.661713159826461, "language_loss": 0.76975715, "learning_rate": 3.967642244586213e-06, "loss": 0.85066372, "num_input_tokens_seen": 30402980, "router_z_loss_clip": 4.26367188, "router_z_loss_mlp": 0.45922852, "step": 1424, "time_per_iteration": 2.6553776264190674 }, { "auxiliary_loss_clip": 0.06787936, "auxiliary_loss_mlp": 0.01311613, "balance_loss_clip": 0.06362848, "balance_loss_mlp": 0.01262832, "epoch": 0.08567563505185631, "flos": 17932005527040.0, "grad_norm": 3.964847469922369, "language_loss": 0.77265346, "learning_rate": 3.96757243383196e-06, "loss": 0.85364902, "num_input_tokens_seen": 30420800, "router_z_loss_clip": 4.25390625, "router_z_loss_mlp": 0.48803711, "step": 1425, "time_per_iteration": 2.5866661071777344 }, { "auxiliary_loss_clip": 0.06788293, "auxiliary_loss_mlp": 0.01306052, "balance_loss_clip": 0.06364158, "balance_loss_mlp": 0.01256961, "epoch": 0.08573575830452428, "flos": 19725695153280.0, "grad_norm": 7.313140628684579, "language_loss": 0.95725781, "learning_rate": 3.9675025484671624e-06, "loss": 1.03820133, "num_input_tokens_seen": 30439620, "router_z_loss_clip": 4.24609375, "router_z_loss_mlp": 0.4909668, "step": 1426, "time_per_iteration": 2.6988563537597656 }, { "auxiliary_loss_clip": 0.06798804, "auxiliary_loss_mlp": 0.01309896, "balance_loss_clip": 0.06361062, "balance_loss_mlp": 0.01256276, "epoch": 0.08579588155719224, "flos": 17937414115200.0, "grad_norm": 2.8522403682599555, "language_loss": 0.7792474, "learning_rate": 3.967432588494471e-06, "loss": 0.8603344, "num_input_tokens_seen": 30457300, "router_z_loss_clip": 4.375, "router_z_loss_mlp": 0.53637695, "step": 1427, "time_per_iteration": 2.579298257827759 }, { "auxiliary_loss_clip": 0.06783529, "auxiliary_loss_mlp": 0.01307542, "balance_loss_clip": 0.06356809, "balance_loss_mlp": 0.01260144, "epoch": 0.08585600480986022, "flos": 16038694995840.0, "grad_norm": 3.7392522469779346, "language_loss": 0.84247589, "learning_rate": 3.96736255391654e-06, "loss": 0.92338669, "num_input_tokens_seen": 30471580, "router_z_loss_clip": 4.26953125, "router_z_loss_mlp": 0.47363281, "step": 1428, "time_per_iteration": 2.604942798614502 }, { "auxiliary_loss_clip": 0.06804124, "auxiliary_loss_mlp": 0.01314314, "balance_loss_clip": 0.06362091, "balance_loss_mlp": 0.01261337, "epoch": 0.08591612806252819, "flos": 28664920137600.0, "grad_norm": 2.6774600997985893, "language_loss": 0.8255406, "learning_rate": 3.967292444736023e-06, "loss": 0.90672499, "num_input_tokens_seen": 30492720, "router_z_loss_clip": 4.421875, "router_z_loss_mlp": 0.5300293, "step": 1429, "time_per_iteration": 2.685960531234741 }, { "auxiliary_loss_clip": 0.06780307, "auxiliary_loss_mlp": 0.01321471, "balance_loss_clip": 0.06359187, "balance_loss_mlp": 0.01274026, "epoch": 0.08597625131519615, "flos": 20965349583360.0, "grad_norm": 5.008558105820878, "language_loss": 0.89343977, "learning_rate": 3.967222260955578e-06, "loss": 0.97445762, "num_input_tokens_seen": 30509535, "router_z_loss_clip": 4.20703125, "router_z_loss_mlp": 0.47485352, "step": 1430, "time_per_iteration": 2.601663589477539 }, { "auxiliary_loss_clip": 0.06789441, "auxiliary_loss_mlp": 0.01329354, "balance_loss_clip": 0.06368481, "balance_loss_mlp": 0.01282433, "epoch": 0.08603637456786412, "flos": 23262747229440.0, "grad_norm": 3.75188759081604, "language_loss": 0.83522362, "learning_rate": 3.96715200257787e-06, "loss": 0.91641164, "num_input_tokens_seen": 30529490, "router_z_loss_clip": 4.21289062, "router_z_loss_mlp": 0.46923828, "step": 1431, "time_per_iteration": 2.6098172664642334 }, { "auxiliary_loss_clip": 0.06789772, "auxiliary_loss_mlp": 0.01328084, "balance_loss_clip": 0.06366286, "balance_loss_mlp": 0.01277777, "epoch": 0.0860964978205321, "flos": 28701704880000.0, "grad_norm": 2.9109548457539196, "language_loss": 0.7862255, "learning_rate": 3.967081669605559e-06, "loss": 0.86740404, "num_input_tokens_seen": 30550205, "router_z_loss_clip": 4.23046875, "router_z_loss_mlp": 0.50317383, "step": 1432, "time_per_iteration": 2.668522596359253 }, { "auxiliary_loss_clip": 0.06796686, "auxiliary_loss_mlp": 0.01329939, "balance_loss_clip": 0.06373464, "balance_loss_mlp": 0.01279967, "epoch": 0.08615662107320006, "flos": 19324542931200.0, "grad_norm": 2.099653175325114, "language_loss": 0.7513473, "learning_rate": 3.967011262041315e-06, "loss": 0.83261359, "num_input_tokens_seen": 30568830, "router_z_loss_clip": 4.234375, "router_z_loss_mlp": 0.49951172, "step": 1433, "time_per_iteration": 2.5949244499206543 }, { "auxiliary_loss_clip": 0.06818057, "auxiliary_loss_mlp": 0.01338457, "balance_loss_clip": 0.06378327, "balance_loss_mlp": 0.01286387, "epoch": 0.08621674432586802, "flos": 15857161125120.0, "grad_norm": 6.930561325999175, "language_loss": 0.8858453, "learning_rate": 3.9669407798878065e-06, "loss": 0.96741045, "num_input_tokens_seen": 30585730, "router_z_loss_clip": 4.3984375, "router_z_loss_mlp": 0.52099609, "step": 1434, "time_per_iteration": 2.5999741554260254 }, { "auxiliary_loss_clip": 0.06822381, "auxiliary_loss_mlp": 0.01330698, "balance_loss_clip": 0.06386378, "balance_loss_mlp": 0.01279676, "epoch": 0.086276867578536, "flos": 14105874464640.0, "grad_norm": 3.7383352175690683, "language_loss": 0.80415475, "learning_rate": 3.966870223147707e-06, "loss": 0.8856855, "num_input_tokens_seen": 30603180, "router_z_loss_clip": 4.35546875, "router_z_loss_mlp": 0.51049805, "step": 1435, "time_per_iteration": 2.6069695949554443 }, { "auxiliary_loss_clip": 0.06730518, "auxiliary_loss_mlp": 0.01295953, "balance_loss_clip": 0.06452174, "balance_loss_mlp": 0.01275771, "epoch": 0.08633699083120397, "flos": 70206500142720.0, "grad_norm": 0.8786380382551621, "language_loss": 0.58218193, "learning_rate": 3.96679959182369e-06, "loss": 0.66244662, "num_input_tokens_seen": 30668895, "router_z_loss_clip": 2.78125, "router_z_loss_mlp": 0.20178223, "step": 1436, "time_per_iteration": 3.4863064289093018 }, { "auxiliary_loss_clip": 0.06824484, "auxiliary_loss_mlp": 0.01326799, "balance_loss_clip": 0.063841, "balance_loss_mlp": 0.01277852, "epoch": 0.08639711408387193, "flos": 30306565330560.0, "grad_norm": 3.1449545749618277, "language_loss": 0.71293771, "learning_rate": 3.966728885918437e-06, "loss": 0.79445052, "num_input_tokens_seen": 30688955, "router_z_loss_clip": 4.41015625, "router_z_loss_mlp": 0.48925781, "step": 1437, "time_per_iteration": 2.904301166534424 }, { "auxiliary_loss_clip": 0.0681647, "auxiliary_loss_mlp": 0.01322068, "balance_loss_clip": 0.06377296, "balance_loss_mlp": 0.01270307, "epoch": 0.08645723733653991, "flos": 20303014584960.0, "grad_norm": 2.67906232103736, "language_loss": 0.7435168, "learning_rate": 3.966658105434627e-06, "loss": 0.82490218, "num_input_tokens_seen": 30706095, "router_z_loss_clip": 4.39257812, "router_z_loss_mlp": 0.51757812, "step": 1438, "time_per_iteration": 2.6329944133758545 }, { "auxiliary_loss_clip": 0.06812289, "auxiliary_loss_mlp": 0.01332142, "balance_loss_clip": 0.06380104, "balance_loss_mlp": 0.0128031, "epoch": 0.08651736058920788, "flos": 32898911748480.0, "grad_norm": 2.7026604867089445, "language_loss": 0.66206241, "learning_rate": 3.966587250374945e-06, "loss": 0.74350679, "num_input_tokens_seen": 30729025, "router_z_loss_clip": 4.328125, "router_z_loss_mlp": 0.51831055, "step": 1439, "time_per_iteration": 4.24591064453125 }, { "auxiliary_loss_clip": 0.06807527, "auxiliary_loss_mlp": 0.01327042, "balance_loss_clip": 0.06368051, "balance_loss_mlp": 0.01274638, "epoch": 0.08657748384187584, "flos": 22643863372800.0, "grad_norm": 2.898051669629323, "language_loss": 0.89149815, "learning_rate": 3.966516320742077e-06, "loss": 0.97284389, "num_input_tokens_seen": 30746155, "router_z_loss_clip": 4.3984375, "router_z_loss_mlp": 0.5246582, "step": 1440, "time_per_iteration": 4.074322700500488 }, { "auxiliary_loss_clip": 0.06830169, "auxiliary_loss_mlp": 0.01321808, "balance_loss_clip": 0.06371678, "balance_loss_mlp": 0.01263085, "epoch": 0.08663760709454381, "flos": 23664947627520.0, "grad_norm": 6.974689578122001, "language_loss": 0.85977066, "learning_rate": 3.9664453165387124e-06, "loss": 0.94129038, "num_input_tokens_seen": 30761410, "router_z_loss_clip": 4.578125, "router_z_loss_mlp": 0.5871582, "step": 1441, "time_per_iteration": 2.7113914489746094 }, { "auxiliary_loss_clip": 0.0667437, "auxiliary_loss_mlp": 0.01285908, "balance_loss_clip": 0.06399728, "balance_loss_mlp": 0.01265595, "epoch": 0.08669773034721179, "flos": 62703823484160.0, "grad_norm": 0.8051299255816123, "language_loss": 0.60534424, "learning_rate": 3.966374237767545e-06, "loss": 0.68494701, "num_input_tokens_seen": 30823010, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.203125, "step": 1442, "time_per_iteration": 3.4483470916748047 }, { "auxiliary_loss_clip": 0.06816222, "auxiliary_loss_mlp": 0.01314749, "balance_loss_clip": 0.06368002, "balance_loss_mlp": 0.01261081, "epoch": 0.08675785359987975, "flos": 20673713047680.0, "grad_norm": 8.891470864710914, "language_loss": 0.81020159, "learning_rate": 3.96630308443127e-06, "loss": 0.89151126, "num_input_tokens_seen": 30841980, "router_z_loss_clip": 4.48046875, "router_z_loss_mlp": 0.53637695, "step": 1443, "time_per_iteration": 2.6420979499816895 }, { "auxiliary_loss_clip": 0.06802171, "auxiliary_loss_mlp": 0.01313668, "balance_loss_clip": 0.063651, "balance_loss_mlp": 0.01263219, "epoch": 0.08681797685254772, "flos": 26948070305280.0, "grad_norm": 4.4906010275860195, "language_loss": 0.83866274, "learning_rate": 3.966231856532584e-06, "loss": 0.91982114, "num_input_tokens_seen": 30863280, "router_z_loss_clip": 4.37109375, "router_z_loss_mlp": 0.50415039, "step": 1444, "time_per_iteration": 5.449459791183472 }, { "auxiliary_loss_clip": 0.06823663, "auxiliary_loss_mlp": 0.01314349, "balance_loss_clip": 0.06368483, "balance_loss_mlp": 0.01258917, "epoch": 0.0868781001052157, "flos": 17718676231680.0, "grad_norm": 4.720760377744133, "language_loss": 0.89807439, "learning_rate": 3.966160554074189e-06, "loss": 0.97945452, "num_input_tokens_seen": 30881710, "router_z_loss_clip": 4.546875, "router_z_loss_mlp": 0.5546875, "step": 1445, "time_per_iteration": 2.595707654953003 }, { "auxiliary_loss_clip": 0.06801216, "auxiliary_loss_mlp": 0.0130964, "balance_loss_clip": 0.06362469, "balance_loss_mlp": 0.01258881, "epoch": 0.08693822335788366, "flos": 19901820435840.0, "grad_norm": 2.2347444353149917, "language_loss": 0.83396721, "learning_rate": 3.96608917705879e-06, "loss": 0.91507572, "num_input_tokens_seen": 30900225, "router_z_loss_clip": 4.38867188, "router_z_loss_mlp": 0.50732422, "step": 1446, "time_per_iteration": 2.607862949371338 }, { "auxiliary_loss_clip": 0.0662092, "auxiliary_loss_mlp": 0.01297839, "balance_loss_clip": 0.06344432, "balance_loss_mlp": 0.0127135, "epoch": 0.08699834661055163, "flos": 67040957871360.0, "grad_norm": 0.8884965523378081, "language_loss": 0.54803967, "learning_rate": 3.966017725489091e-06, "loss": 0.62722719, "num_input_tokens_seen": 30959580, "router_z_loss_clip": 2.76171875, "router_z_loss_mlp": 0.26538086, "step": 1447, "time_per_iteration": 3.2976834774017334 }, { "auxiliary_loss_clip": 0.06772137, "auxiliary_loss_mlp": 0.01316496, "balance_loss_clip": 0.06343305, "balance_loss_mlp": 0.01266952, "epoch": 0.0870584698632196, "flos": 13485648942720.0, "grad_norm": 5.085250171807781, "language_loss": 0.86377358, "learning_rate": 3.965946199367804e-06, "loss": 0.94465983, "num_input_tokens_seen": 30976775, "router_z_loss_clip": 4.29101562, "router_z_loss_mlp": 0.49560547, "step": 1448, "time_per_iteration": 2.592886209487915 }, { "auxiliary_loss_clip": 0.06800195, "auxiliary_loss_mlp": 0.01312003, "balance_loss_clip": 0.06355714, "balance_loss_mlp": 0.0125955, "epoch": 0.08711859311588757, "flos": 16112516042880.0, "grad_norm": 15.602070631189664, "language_loss": 0.81638777, "learning_rate": 3.965874598697638e-06, "loss": 0.89750981, "num_input_tokens_seen": 30990495, "router_z_loss_clip": 4.44140625, "router_z_loss_mlp": 0.5246582, "step": 1449, "time_per_iteration": 2.569042444229126 }, { "auxiliary_loss_clip": 0.06785519, "auxiliary_loss_mlp": 0.01302803, "balance_loss_clip": 0.06359512, "balance_loss_mlp": 0.01254595, "epoch": 0.08717871636855554, "flos": 38481528424320.0, "grad_norm": 3.16953458620529, "language_loss": 0.72344548, "learning_rate": 3.965802923481313e-06, "loss": 0.80432868, "num_input_tokens_seen": 31014080, "router_z_loss_clip": 4.2578125, "router_z_loss_mlp": 0.48242188, "step": 1450, "time_per_iteration": 2.7617950439453125 }, { "auxiliary_loss_clip": 0.06792489, "auxiliary_loss_mlp": 0.01317577, "balance_loss_clip": 0.06353538, "balance_loss_mlp": 0.01264958, "epoch": 0.0872388396212235, "flos": 17605932163200.0, "grad_norm": 2.155092317598916, "language_loss": 0.8504976, "learning_rate": 3.965731173721542e-06, "loss": 0.93159825, "num_input_tokens_seen": 31031210, "router_z_loss_clip": 4.390625, "router_z_loss_mlp": 0.52661133, "step": 1451, "time_per_iteration": 2.5896897315979004 }, { "auxiliary_loss_clip": 0.06777459, "auxiliary_loss_mlp": 0.01302273, "balance_loss_clip": 0.06355521, "balance_loss_mlp": 0.01256234, "epoch": 0.08729896287389148, "flos": 25265489592960.0, "grad_norm": 2.598922646367125, "language_loss": 0.75825775, "learning_rate": 3.965659349421049e-06, "loss": 0.83905512, "num_input_tokens_seen": 31049710, "router_z_loss_clip": 4.21679688, "router_z_loss_mlp": 0.46044922, "step": 1452, "time_per_iteration": 2.6516807079315186 }, { "auxiliary_loss_clip": 0.0680016, "auxiliary_loss_mlp": 0.01314095, "balance_loss_clip": 0.06361344, "balance_loss_mlp": 0.01260784, "epoch": 0.08735908612655945, "flos": 15637836263040.0, "grad_norm": 7.008566945828881, "language_loss": 0.82309401, "learning_rate": 3.965587450582556e-06, "loss": 0.90423667, "num_input_tokens_seen": 31066160, "router_z_loss_clip": 4.390625, "router_z_loss_mlp": 0.53393555, "step": 1453, "time_per_iteration": 2.588876247406006 }, { "auxiliary_loss_clip": 0.06778792, "auxiliary_loss_mlp": 0.01302113, "balance_loss_clip": 0.06357466, "balance_loss_mlp": 0.01254143, "epoch": 0.08741920937922741, "flos": 20345920675200.0, "grad_norm": 2.8441225104728454, "language_loss": 0.72064328, "learning_rate": 3.96551547720879e-06, "loss": 0.80145228, "num_input_tokens_seen": 31085270, "router_z_loss_clip": 4.20703125, "router_z_loss_mlp": 0.47973633, "step": 1454, "time_per_iteration": 2.599233627319336 }, { "auxiliary_loss_clip": 0.06619409, "auxiliary_loss_mlp": 0.01280782, "balance_loss_clip": 0.0634511, "balance_loss_mlp": 0.01257298, "epoch": 0.08747933263189539, "flos": 62841052944000.0, "grad_norm": 0.7502949413852654, "language_loss": 0.58233279, "learning_rate": 3.96544342930248e-06, "loss": 0.66133469, "num_input_tokens_seen": 31148445, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.23461914, "step": 1455, "time_per_iteration": 3.22696590423584 }, { "auxiliary_loss_clip": 0.0679945, "auxiliary_loss_mlp": 0.01308339, "balance_loss_clip": 0.06362885, "balance_loss_mlp": 0.01255696, "epoch": 0.08753945588456336, "flos": 33044122074240.0, "grad_norm": 2.808317525764006, "language_loss": 0.78873956, "learning_rate": 3.965371306866359e-06, "loss": 0.86981738, "num_input_tokens_seen": 31168770, "router_z_loss_clip": 4.3671875, "router_z_loss_mlp": 0.52661133, "step": 1456, "time_per_iteration": 2.7126662731170654 }, { "auxiliary_loss_clip": 0.0681128, "auxiliary_loss_mlp": 0.01323719, "balance_loss_clip": 0.06371626, "balance_loss_mlp": 0.01271934, "epoch": 0.08759957913723132, "flos": 35554807088640.0, "grad_norm": 2.851469493978354, "language_loss": 0.74415958, "learning_rate": 3.96529910990316e-06, "loss": 0.82550955, "num_input_tokens_seen": 31189270, "router_z_loss_clip": 4.39453125, "router_z_loss_mlp": 0.51831055, "step": 1457, "time_per_iteration": 2.738939046859741 }, { "auxiliary_loss_clip": 0.06792164, "auxiliary_loss_mlp": 0.01306552, "balance_loss_clip": 0.06369937, "balance_loss_mlp": 0.01259035, "epoch": 0.0876597023898993, "flos": 23917283798400.0, "grad_norm": 2.1604321499183583, "language_loss": 0.88100147, "learning_rate": 3.965226838415622e-06, "loss": 0.96198857, "num_input_tokens_seen": 31210385, "router_z_loss_clip": 4.21875, "router_z_loss_mlp": 0.4753418, "step": 1458, "time_per_iteration": 2.6562001705169678 }, { "auxiliary_loss_clip": 0.0680811, "auxiliary_loss_mlp": 0.0130386, "balance_loss_clip": 0.06374275, "balance_loss_mlp": 0.01255389, "epoch": 0.08771982564256726, "flos": 18119912307840.0, "grad_norm": 2.2032297603599185, "language_loss": 0.82088172, "learning_rate": 3.965154492406486e-06, "loss": 0.90200144, "num_input_tokens_seen": 31229745, "router_z_loss_clip": 4.33398438, "router_z_loss_mlp": 0.48486328, "step": 1459, "time_per_iteration": 2.6167657375335693 }, { "auxiliary_loss_clip": 0.06822355, "auxiliary_loss_mlp": 0.01324676, "balance_loss_clip": 0.06371908, "balance_loss_mlp": 0.01269219, "epoch": 0.08777994889523523, "flos": 17717711909760.0, "grad_norm": 3.0651394723013845, "language_loss": 0.8607083, "learning_rate": 3.9650820718784945e-06, "loss": 0.94217855, "num_input_tokens_seen": 31248280, "router_z_loss_clip": 4.50976562, "router_z_loss_mlp": 0.55541992, "step": 1460, "time_per_iteration": 2.596806287765503 }, { "auxiliary_loss_clip": 0.06799135, "auxiliary_loss_mlp": 0.01316195, "balance_loss_clip": 0.06369847, "balance_loss_mlp": 0.01266318, "epoch": 0.0878400721479032, "flos": 12824320193280.0, "grad_norm": 4.163941924280463, "language_loss": 0.82240736, "learning_rate": 3.965009576834394e-06, "loss": 0.90356064, "num_input_tokens_seen": 31262190, "router_z_loss_clip": 4.29296875, "router_z_loss_mlp": 0.4987793, "step": 1461, "time_per_iteration": 2.5555269718170166 }, { "auxiliary_loss_clip": 0.06807175, "auxiliary_loss_mlp": 0.0131397, "balance_loss_clip": 0.06375993, "balance_loss_mlp": 0.01265166, "epoch": 0.08790019540057117, "flos": 26399359843200.0, "grad_norm": 2.007998059459802, "language_loss": 0.772919, "learning_rate": 3.964937007276932e-06, "loss": 0.85413051, "num_input_tokens_seen": 31283690, "router_z_loss_clip": 4.30859375, "router_z_loss_mlp": 0.48852539, "step": 1462, "time_per_iteration": 2.6547815799713135 }, { "auxiliary_loss_clip": 0.06817535, "auxiliary_loss_mlp": 0.0132395, "balance_loss_clip": 0.06368262, "balance_loss_mlp": 0.01268494, "epoch": 0.08796031865323914, "flos": 19139822605440.0, "grad_norm": 2.4449035993741446, "language_loss": 0.75898486, "learning_rate": 3.9648643632088634e-06, "loss": 0.84039968, "num_input_tokens_seen": 31302505, "router_z_loss_clip": 4.48828125, "router_z_loss_mlp": 0.55395508, "step": 1463, "time_per_iteration": 2.5973987579345703 }, { "auxiliary_loss_clip": 0.06804995, "auxiliary_loss_mlp": 0.01324892, "balance_loss_clip": 0.06356519, "balance_loss_mlp": 0.01270032, "epoch": 0.0880204419059071, "flos": 26070896638080.0, "grad_norm": 2.8294229107286144, "language_loss": 0.85376167, "learning_rate": 3.964791644632941e-06, "loss": 0.9350605, "num_input_tokens_seen": 31323070, "router_z_loss_clip": 4.484375, "router_z_loss_mlp": 0.54833984, "step": 1464, "time_per_iteration": 2.662491798400879 }, { "auxiliary_loss_clip": 0.06797796, "auxiliary_loss_mlp": 0.0133178, "balance_loss_clip": 0.06360292, "balance_loss_mlp": 0.01279948, "epoch": 0.08808056515857508, "flos": 22383602991360.0, "grad_norm": 4.978852171215123, "language_loss": 0.79747725, "learning_rate": 3.964718851551923e-06, "loss": 0.87877297, "num_input_tokens_seen": 31341880, "router_z_loss_clip": 4.3671875, "router_z_loss_mlp": 0.51855469, "step": 1465, "time_per_iteration": 2.6317458152770996 }, { "auxiliary_loss_clip": 0.06785473, "auxiliary_loss_mlp": 0.01336255, "balance_loss_clip": 0.06358767, "balance_loss_mlp": 0.01288047, "epoch": 0.08814068841124305, "flos": 23191986856320.0, "grad_norm": 2.5494653040056225, "language_loss": 0.87321806, "learning_rate": 3.9646459839685675e-06, "loss": 0.95443535, "num_input_tokens_seen": 31361995, "router_z_loss_clip": 4.26171875, "router_z_loss_mlp": 0.48242188, "step": 1466, "time_per_iteration": 2.612027168273926 }, { "auxiliary_loss_clip": 0.06785223, "auxiliary_loss_mlp": 0.0131714, "balance_loss_clip": 0.063551, "balance_loss_mlp": 0.012665, "epoch": 0.08820081166391101, "flos": 25162262962560.0, "grad_norm": 2.6019448276389436, "language_loss": 0.8492769, "learning_rate": 3.964573041885641e-06, "loss": 0.93030053, "num_input_tokens_seen": 31381515, "router_z_loss_clip": 4.30078125, "router_z_loss_mlp": 0.5065918, "step": 1467, "time_per_iteration": 2.6669795513153076 }, { "auxiliary_loss_clip": 0.06783903, "auxiliary_loss_mlp": 0.0132421, "balance_loss_clip": 0.06358944, "balance_loss_mlp": 0.01277027, "epoch": 0.08826093491657899, "flos": 22237386416640.0, "grad_norm": 5.266612196824942, "language_loss": 0.77261138, "learning_rate": 3.964500025305907e-06, "loss": 0.85369247, "num_input_tokens_seen": 31400345, "router_z_loss_clip": 4.25390625, "router_z_loss_mlp": 0.47192383, "step": 1468, "time_per_iteration": 2.6310665607452393 }, { "auxiliary_loss_clip": 0.06770585, "auxiliary_loss_mlp": 0.01310887, "balance_loss_clip": 0.06353131, "balance_loss_mlp": 0.01266756, "epoch": 0.08832105816924696, "flos": 22133279318400.0, "grad_norm": 2.412619651877539, "language_loss": 0.81608665, "learning_rate": 3.9644269342321355e-06, "loss": 0.89690137, "num_input_tokens_seen": 31419620, "router_z_loss_clip": 4.1796875, "router_z_loss_mlp": 0.44189453, "step": 1469, "time_per_iteration": 2.6198227405548096 }, { "auxiliary_loss_clip": 0.0677833, "auxiliary_loss_mlp": 0.01321099, "balance_loss_clip": 0.0634903, "balance_loss_mlp": 0.01271914, "epoch": 0.08838118142191492, "flos": 17572250021760.0, "grad_norm": 3.0739260634291874, "language_loss": 0.79933554, "learning_rate": 3.9643537686670974e-06, "loss": 0.88032985, "num_input_tokens_seen": 31437970, "router_z_loss_clip": 4.28515625, "router_z_loss_mlp": 0.49194336, "step": 1470, "time_per_iteration": 2.5906734466552734 }, { "auxiliary_loss_clip": 0.06773072, "auxiliary_loss_mlp": 0.01315923, "balance_loss_clip": 0.06348158, "balance_loss_mlp": 0.01268144, "epoch": 0.0884413046745829, "flos": 20783480296320.0, "grad_norm": 2.2266973023230454, "language_loss": 0.86105907, "learning_rate": 3.964280528613569e-06, "loss": 0.94194901, "num_input_tokens_seen": 31457040, "router_z_loss_clip": 4.2421875, "router_z_loss_mlp": 0.47802734, "step": 1471, "time_per_iteration": 2.6188974380493164 }, { "auxiliary_loss_clip": 0.06748024, "auxiliary_loss_mlp": 0.01299281, "balance_loss_clip": 0.0634297, "balance_loss_mlp": 0.01258178, "epoch": 0.08850142792725087, "flos": 22131686090880.0, "grad_norm": 2.3201119286592595, "language_loss": 0.85331887, "learning_rate": 3.964207214074324e-06, "loss": 0.93379194, "num_input_tokens_seen": 31477520, "router_z_loss_clip": 4.05078125, "router_z_loss_mlp": 0.41088867, "step": 1472, "time_per_iteration": 2.6367952823638916 }, { "auxiliary_loss_clip": 0.06763007, "auxiliary_loss_mlp": 0.01307803, "balance_loss_clip": 0.06340346, "balance_loss_mlp": 0.01260692, "epoch": 0.08856155117991883, "flos": 22425251270400.0, "grad_norm": 4.948050263105423, "language_loss": 0.85226464, "learning_rate": 3.964133825052146e-06, "loss": 0.93297279, "num_input_tokens_seen": 31495575, "router_z_loss_clip": 4.22265625, "router_z_loss_mlp": 0.47094727, "step": 1473, "time_per_iteration": 2.6179826259613037 }, { "auxiliary_loss_clip": 0.06764394, "auxiliary_loss_mlp": 0.01312419, "balance_loss_clip": 0.06342938, "balance_loss_mlp": 0.01266476, "epoch": 0.0886216744325868, "flos": 29945132743680.0, "grad_norm": 3.7270897812108212, "language_loss": 0.79609668, "learning_rate": 3.964060361549816e-06, "loss": 0.87686491, "num_input_tokens_seen": 31520020, "router_z_loss_clip": 4.2109375, "router_z_loss_mlp": 0.45947266, "step": 1474, "time_per_iteration": 2.689943552017212 }, { "auxiliary_loss_clip": 0.06749355, "auxiliary_loss_mlp": 0.01300591, "balance_loss_clip": 0.06339665, "balance_loss_mlp": 0.01255697, "epoch": 0.08868179768525478, "flos": 23988798858240.0, "grad_norm": 15.993555736128792, "language_loss": 0.80468416, "learning_rate": 3.963986823570121e-06, "loss": 0.88518357, "num_input_tokens_seen": 31539265, "router_z_loss_clip": 4.09570312, "router_z_loss_mlp": 0.44921875, "step": 1475, "time_per_iteration": 2.636167287826538 }, { "auxiliary_loss_clip": 0.06773366, "auxiliary_loss_mlp": 0.01309594, "balance_loss_clip": 0.06350081, "balance_loss_mlp": 0.01260813, "epoch": 0.08874192093792274, "flos": 43187264922240.0, "grad_norm": 3.1551085902615794, "language_loss": 0.75511563, "learning_rate": 3.963913211115848e-06, "loss": 0.83594525, "num_input_tokens_seen": 31563425, "router_z_loss_clip": 4.23046875, "router_z_loss_mlp": 0.48779297, "step": 1476, "time_per_iteration": 2.7931618690490723 }, { "auxiliary_loss_clip": 0.06762721, "auxiliary_loss_mlp": 0.01308429, "balance_loss_clip": 0.06349295, "balance_loss_mlp": 0.01260746, "epoch": 0.0888020441905907, "flos": 32860491851520.0, "grad_norm": 2.0679332069912535, "language_loss": 0.76432288, "learning_rate": 3.9638395241897895e-06, "loss": 0.84503436, "num_input_tokens_seen": 31584525, "router_z_loss_clip": 4.1328125, "router_z_loss_mlp": 0.47729492, "step": 1477, "time_per_iteration": 2.7005348205566406 }, { "auxiliary_loss_clip": 0.06770893, "auxiliary_loss_mlp": 0.01314097, "balance_loss_clip": 0.06348299, "balance_loss_mlp": 0.01266223, "epoch": 0.08886216744325869, "flos": 23156124508800.0, "grad_norm": 4.303616014587494, "language_loss": 0.88110113, "learning_rate": 3.963765762794739e-06, "loss": 0.96195102, "num_input_tokens_seen": 31603325, "router_z_loss_clip": 4.2265625, "router_z_loss_mlp": 0.47924805, "step": 1478, "time_per_iteration": 4.033190965652466 }, { "auxiliary_loss_clip": 0.06766347, "auxiliary_loss_mlp": 0.01320818, "balance_loss_clip": 0.0634589, "balance_loss_mlp": 0.01273825, "epoch": 0.08892229069592665, "flos": 23338371139200.0, "grad_norm": 3.8437408444378307, "language_loss": 0.78944385, "learning_rate": 3.963691926933495e-06, "loss": 0.87031555, "num_input_tokens_seen": 31624820, "router_z_loss_clip": 4.203125, "router_z_loss_mlp": 0.47021484, "step": 1479, "time_per_iteration": 2.67948579788208 }, { "auxiliary_loss_clip": 0.06766198, "auxiliary_loss_mlp": 0.01312218, "balance_loss_clip": 0.06346431, "balance_loss_mlp": 0.01263008, "epoch": 0.08898241394859462, "flos": 26221012427520.0, "grad_norm": 4.625080227735358, "language_loss": 0.80326724, "learning_rate": 3.9636180166088555e-06, "loss": 0.88405144, "num_input_tokens_seen": 31646080, "router_z_loss_clip": 4.1953125, "router_z_loss_mlp": 0.4921875, "step": 1480, "time_per_iteration": 4.091894865036011 }, { "auxiliary_loss_clip": 0.06792906, "auxiliary_loss_mlp": 0.01326353, "balance_loss_clip": 0.06353664, "balance_loss_mlp": 0.01271421, "epoch": 0.0890425372012626, "flos": 23557444439040.0, "grad_norm": 2.659498252065791, "language_loss": 0.68905997, "learning_rate": 3.963544031823624e-06, "loss": 0.77025259, "num_input_tokens_seen": 31665770, "router_z_loss_clip": 4.390625, "router_z_loss_mlp": 0.54858398, "step": 1481, "time_per_iteration": 2.676466464996338 }, { "auxiliary_loss_clip": 0.06767891, "auxiliary_loss_mlp": 0.01313282, "balance_loss_clip": 0.06355503, "balance_loss_mlp": 0.01266719, "epoch": 0.08910266045393056, "flos": 23009446736640.0, "grad_norm": 3.5138255685097275, "language_loss": 0.9877128, "learning_rate": 3.9634699725806065e-06, "loss": 1.0685246, "num_input_tokens_seen": 31683805, "router_z_loss_clip": 4.11914062, "router_z_loss_mlp": 0.46582031, "step": 1482, "time_per_iteration": 2.631448984146118 }, { "auxiliary_loss_clip": 0.06790081, "auxiliary_loss_mlp": 0.01311377, "balance_loss_clip": 0.0635926, "balance_loss_mlp": 0.01260665, "epoch": 0.08916278370659853, "flos": 31943766257280.0, "grad_norm": 2.25723885687945, "language_loss": 0.80514455, "learning_rate": 3.96339583888261e-06, "loss": 0.88615912, "num_input_tokens_seen": 31704630, "router_z_loss_clip": 4.30859375, "router_z_loss_mlp": 0.5065918, "step": 1483, "time_per_iteration": 4.0816895961761475 }, { "auxiliary_loss_clip": 0.06790867, "auxiliary_loss_mlp": 0.01319406, "balance_loss_clip": 0.06367749, "balance_loss_mlp": 0.01269099, "epoch": 0.08922290695926649, "flos": 17536219966080.0, "grad_norm": 24.297997132824342, "language_loss": 0.86870408, "learning_rate": 3.963321630732448e-06, "loss": 0.94980681, "num_input_tokens_seen": 31723255, "router_z_loss_clip": 4.234375, "router_z_loss_mlp": 0.50341797, "step": 1484, "time_per_iteration": 4.0772528648376465 }, { "auxiliary_loss_clip": 0.06804685, "auxiliary_loss_mlp": 0.01314136, "balance_loss_clip": 0.06368738, "balance_loss_mlp": 0.01262757, "epoch": 0.08928303021193447, "flos": 32133392046720.0, "grad_norm": 2.448355469523272, "language_loss": 0.82088012, "learning_rate": 3.963247348132932e-06, "loss": 0.90206838, "num_input_tokens_seen": 31747045, "router_z_loss_clip": 4.359375, "router_z_loss_mlp": 0.51367188, "step": 1485, "time_per_iteration": 2.8105499744415283 }, { "auxiliary_loss_clip": 0.06775875, "auxiliary_loss_mlp": 0.01306238, "balance_loss_clip": 0.06359791, "balance_loss_mlp": 0.01258363, "epoch": 0.08934315346460243, "flos": 22131392601600.0, "grad_norm": 2.0751519266753182, "language_loss": 0.84927589, "learning_rate": 3.96317299108688e-06, "loss": 0.9300971, "num_input_tokens_seen": 31766615, "router_z_loss_clip": 4.15820312, "router_z_loss_mlp": 0.47851562, "step": 1486, "time_per_iteration": 2.6617515087127686 }, { "auxiliary_loss_clip": 0.06772967, "auxiliary_loss_mlp": 0.01305309, "balance_loss_clip": 0.0635952, "balance_loss_mlp": 0.01260128, "epoch": 0.0894032767172704, "flos": 22572264458880.0, "grad_norm": 3.4128212452807785, "language_loss": 0.78395385, "learning_rate": 3.963098559597111e-06, "loss": 0.86473656, "num_input_tokens_seen": 31785855, "router_z_loss_clip": 4.13671875, "router_z_loss_mlp": 0.45166016, "step": 1487, "time_per_iteration": 2.802234172821045 }, { "auxiliary_loss_clip": 0.0677326, "auxiliary_loss_mlp": 0.01315507, "balance_loss_clip": 0.06359835, "balance_loss_mlp": 0.01267705, "epoch": 0.08946339996993838, "flos": 20199578319360.0, "grad_norm": 3.6050002466355178, "language_loss": 0.84769845, "learning_rate": 3.963024053666449e-06, "loss": 0.92858613, "num_input_tokens_seen": 31804210, "router_z_loss_clip": 4.12890625, "router_z_loss_mlp": 0.47851562, "step": 1488, "time_per_iteration": 2.678072690963745 }, { "auxiliary_loss_clip": 0.067668, "auxiliary_loss_mlp": 0.01315191, "balance_loss_clip": 0.06351823, "balance_loss_mlp": 0.01267794, "epoch": 0.08952352322260634, "flos": 48371035363200.0, "grad_norm": 12.521256074777119, "language_loss": 0.73708111, "learning_rate": 3.962949473297718e-06, "loss": 0.81790102, "num_input_tokens_seen": 31826150, "router_z_loss_clip": 4.1484375, "router_z_loss_mlp": 0.47387695, "step": 1489, "time_per_iteration": 2.9185996055603027 }, { "auxiliary_loss_clip": 0.06776004, "auxiliary_loss_mlp": 0.01319076, "balance_loss_clip": 0.06359866, "balance_loss_mlp": 0.01272441, "epoch": 0.08958364647527431, "flos": 31800736137600.0, "grad_norm": 5.986626207518481, "language_loss": 0.91433257, "learning_rate": 3.962874818493745e-06, "loss": 0.99528331, "num_input_tokens_seen": 31848060, "router_z_loss_clip": 4.16601562, "router_z_loss_mlp": 0.46655273, "step": 1490, "time_per_iteration": 2.698244571685791 }, { "auxiliary_loss_clip": 0.06781483, "auxiliary_loss_mlp": 0.01313602, "balance_loss_clip": 0.06357394, "balance_loss_mlp": 0.01267444, "epoch": 0.08964376972794229, "flos": 23374988173440.0, "grad_norm": 2.639779843277184, "language_loss": 0.75436866, "learning_rate": 3.9628000892573635e-06, "loss": 0.83531952, "num_input_tokens_seen": 31870040, "router_z_loss_clip": 4.2421875, "router_z_loss_mlp": 0.46142578, "step": 1491, "time_per_iteration": 2.6961865425109863 }, { "auxiliary_loss_clip": 0.06762505, "auxiliary_loss_mlp": 0.01317017, "balance_loss_clip": 0.06353435, "balance_loss_mlp": 0.01273744, "epoch": 0.08970389298061025, "flos": 23301502542720.0, "grad_norm": 2.1171226778484136, "language_loss": 0.7858451, "learning_rate": 3.9627252855914055e-06, "loss": 0.86664033, "num_input_tokens_seen": 31890400, "router_z_loss_clip": 4.08984375, "router_z_loss_mlp": 0.43261719, "step": 1492, "time_per_iteration": 2.6220152378082275 }, { "auxiliary_loss_clip": 0.06774241, "auxiliary_loss_mlp": 0.01320844, "balance_loss_clip": 0.06359488, "balance_loss_mlp": 0.01271562, "epoch": 0.08976401623327822, "flos": 33769419016320.0, "grad_norm": 3.2061538622422585, "language_loss": 0.72599494, "learning_rate": 3.962650407498707e-06, "loss": 0.80694586, "num_input_tokens_seen": 31913435, "router_z_loss_clip": 4.140625, "router_z_loss_mlp": 0.49291992, "step": 1493, "time_per_iteration": 2.7099030017852783 }, { "auxiliary_loss_clip": 0.06782784, "auxiliary_loss_mlp": 0.01325234, "balance_loss_clip": 0.06365973, "balance_loss_mlp": 0.01275118, "epoch": 0.08982413948594618, "flos": 23917535360640.0, "grad_norm": 5.4176032396697265, "language_loss": 0.88482964, "learning_rate": 3.962575454982109e-06, "loss": 0.96590984, "num_input_tokens_seen": 31932435, "router_z_loss_clip": 4.171875, "router_z_loss_mlp": 0.50073242, "step": 1494, "time_per_iteration": 2.764981508255005 }, { "auxiliary_loss_clip": 0.0676786, "auxiliary_loss_mlp": 0.01334044, "balance_loss_clip": 0.06360207, "balance_loss_mlp": 0.01287767, "epoch": 0.08988426273861416, "flos": 16843305427200.0, "grad_norm": 1.9880204816631677, "language_loss": 0.84779924, "learning_rate": 3.962500428044454e-06, "loss": 0.92881835, "num_input_tokens_seen": 31950125, "router_z_loss_clip": 4.07226562, "router_z_loss_mlp": 0.46264648, "step": 1495, "time_per_iteration": 2.6117618083953857 }, { "auxiliary_loss_clip": 0.06798033, "auxiliary_loss_mlp": 0.01315636, "balance_loss_clip": 0.06374112, "balance_loss_mlp": 0.01267547, "epoch": 0.08994438599128213, "flos": 14798621295360.0, "grad_norm": 2.322084562100984, "language_loss": 0.72177118, "learning_rate": 3.962425326688585e-06, "loss": 0.80290782, "num_input_tokens_seen": 31968050, "router_z_loss_clip": 4.234375, "router_z_loss_mlp": 0.48071289, "step": 1496, "time_per_iteration": 2.606762409210205 }, { "auxiliary_loss_clip": 0.06782134, "auxiliary_loss_mlp": 0.01322476, "balance_loss_clip": 0.06370303, "balance_loss_mlp": 0.01275961, "epoch": 0.09000450924395009, "flos": 17390087245440.0, "grad_norm": 2.2211046651596997, "language_loss": 0.81628186, "learning_rate": 3.962350150917351e-06, "loss": 0.89732802, "num_input_tokens_seen": 31985675, "router_z_loss_clip": 4.12109375, "router_z_loss_mlp": 0.46557617, "step": 1497, "time_per_iteration": 2.6064584255218506 }, { "auxiliary_loss_clip": 0.06781872, "auxiliary_loss_mlp": 0.0132207, "balance_loss_clip": 0.06356199, "balance_loss_mlp": 0.01270834, "epoch": 0.09006463249661807, "flos": 24287269501440.0, "grad_norm": 4.0126852153210955, "language_loss": 0.85451889, "learning_rate": 3.9622749007336035e-06, "loss": 0.9355582, "num_input_tokens_seen": 32005180, "router_z_loss_clip": 4.24804688, "router_z_loss_mlp": 0.51245117, "step": 1498, "time_per_iteration": 2.645742654800415 }, { "auxiliary_loss_clip": 0.06780759, "auxiliary_loss_mlp": 0.01321449, "balance_loss_clip": 0.06356981, "balance_loss_mlp": 0.01273742, "epoch": 0.09012475574928604, "flos": 13666931251200.0, "grad_norm": 3.6140671825432142, "language_loss": 0.80773538, "learning_rate": 3.962199576140195e-06, "loss": 0.88875741, "num_input_tokens_seen": 32022970, "router_z_loss_clip": 4.23828125, "router_z_loss_mlp": 0.47753906, "step": 1499, "time_per_iteration": 2.609973192214966 }, { "auxiliary_loss_clip": 0.06765951, "auxiliary_loss_mlp": 0.01318474, "balance_loss_clip": 0.06356155, "balance_loss_mlp": 0.01273914, "epoch": 0.090184879001954, "flos": 23333884945920.0, "grad_norm": 2.3290762856119347, "language_loss": 0.94045836, "learning_rate": 3.962124177139981e-06, "loss": 1.02130258, "num_input_tokens_seen": 32043055, "router_z_loss_clip": 4.1015625, "router_z_loss_mlp": 0.44580078, "step": 1500, "time_per_iteration": 2.6323494911193848 }, { "auxiliary_loss_clip": 0.06783759, "auxiliary_loss_mlp": 0.01323068, "balance_loss_clip": 0.06351535, "balance_loss_mlp": 0.01268494, "epoch": 0.09024500225462198, "flos": 23009320955520.0, "grad_norm": 6.503997839234503, "language_loss": 0.77435005, "learning_rate": 3.962048703735822e-06, "loss": 0.85541832, "num_input_tokens_seen": 32061900, "router_z_loss_clip": 4.32421875, "router_z_loss_mlp": 0.54589844, "step": 1501, "time_per_iteration": 2.626603603363037 }, { "auxiliary_loss_clip": 0.06625285, "auxiliary_loss_mlp": 0.01264803, "balance_loss_clip": 0.06351708, "balance_loss_mlp": 0.01247518, "epoch": 0.09030512550728995, "flos": 62208626653440.0, "grad_norm": 0.7116420757012678, "language_loss": 0.58242762, "learning_rate": 3.96197315593058e-06, "loss": 0.66132849, "num_input_tokens_seen": 32122745, "router_z_loss_clip": 2.734375, "router_z_loss_mlp": 0.1730957, "step": 1502, "time_per_iteration": 3.2500340938568115 }, { "auxiliary_loss_clip": 0.06770475, "auxiliary_loss_mlp": 0.01311741, "balance_loss_clip": 0.06351499, "balance_loss_mlp": 0.01264439, "epoch": 0.09036524875995791, "flos": 38809907775360.0, "grad_norm": 3.1478478342957272, "language_loss": 0.72011483, "learning_rate": 3.961897533727119e-06, "loss": 0.80093706, "num_input_tokens_seen": 32145125, "router_z_loss_clip": 4.19335938, "router_z_loss_mlp": 0.47265625, "step": 1503, "time_per_iteration": 2.7582075595855713 }, { "auxiliary_loss_clip": 0.06785157, "auxiliary_loss_mlp": 0.01305302, "balance_loss_clip": 0.06359943, "balance_loss_mlp": 0.01257118, "epoch": 0.09042537201262588, "flos": 21696642092160.0, "grad_norm": 2.2474010063390217, "language_loss": 0.87638944, "learning_rate": 3.961821837128306e-06, "loss": 0.95729405, "num_input_tokens_seen": 32166255, "router_z_loss_clip": 4.25390625, "router_z_loss_mlp": 0.48193359, "step": 1504, "time_per_iteration": 2.66056752204895 }, { "auxiliary_loss_clip": 0.06798239, "auxiliary_loss_mlp": 0.01317027, "balance_loss_clip": 0.06359342, "balance_loss_mlp": 0.01262453, "epoch": 0.09048549526529386, "flos": 22272536004480.0, "grad_norm": 2.593245939592984, "language_loss": 0.74766958, "learning_rate": 3.961746066137014e-06, "loss": 0.82882226, "num_input_tokens_seen": 32184010, "router_z_loss_clip": 4.38671875, "router_z_loss_mlp": 0.5456543, "step": 1505, "time_per_iteration": 2.598337173461914 }, { "auxiliary_loss_clip": 0.06776021, "auxiliary_loss_mlp": 0.01306443, "balance_loss_clip": 0.0635321, "balance_loss_mlp": 0.01256732, "epoch": 0.09054561851796182, "flos": 14616165029760.0, "grad_norm": 2.7402201206920687, "language_loss": 0.82701087, "learning_rate": 3.961670220756114e-06, "loss": 0.90783548, "num_input_tokens_seen": 32201635, "router_z_loss_clip": 4.22851562, "router_z_loss_mlp": 0.49731445, "step": 1506, "time_per_iteration": 2.6001081466674805 }, { "auxiliary_loss_clip": 0.06768911, "auxiliary_loss_mlp": 0.01302286, "balance_loss_clip": 0.06360223, "balance_loss_mlp": 0.01257559, "epoch": 0.09060574177062979, "flos": 27643542393600.0, "grad_norm": 5.664164793437662, "language_loss": 0.77781105, "learning_rate": 3.961594300988482e-06, "loss": 0.85852301, "num_input_tokens_seen": 32221940, "router_z_loss_clip": 4.08203125, "router_z_loss_mlp": 0.44726562, "step": 1507, "time_per_iteration": 2.6567282676696777 }, { "auxiliary_loss_clip": 0.06604032, "auxiliary_loss_mlp": 0.01268135, "balance_loss_clip": 0.06332941, "balance_loss_mlp": 0.01250147, "epoch": 0.09066586502329776, "flos": 66104637621120.0, "grad_norm": 0.7090707471582752, "language_loss": 0.5750767, "learning_rate": 3.961518306836998e-06, "loss": 0.65379834, "num_input_tokens_seen": 32276495, "router_z_loss_clip": 2.71289062, "router_z_loss_mlp": 0.18017578, "step": 1508, "time_per_iteration": 3.1143457889556885 }, { "auxiliary_loss_clip": 0.06777391, "auxiliary_loss_mlp": 0.01303158, "balance_loss_clip": 0.06362008, "balance_loss_mlp": 0.01257095, "epoch": 0.09072598827596573, "flos": 18922426387200.0, "grad_norm": 2.0696820872259667, "language_loss": 0.86254716, "learning_rate": 3.961442238304543e-06, "loss": 0.94335258, "num_input_tokens_seen": 32294130, "router_z_loss_clip": 4.15625, "router_z_loss_mlp": 0.46020508, "step": 1509, "time_per_iteration": 2.6231539249420166 }, { "auxiliary_loss_clip": 0.06814557, "auxiliary_loss_mlp": 0.01318145, "balance_loss_clip": 0.06376374, "balance_loss_mlp": 0.01266408, "epoch": 0.0907861115286337, "flos": 24827804190720.0, "grad_norm": 5.230911640798544, "language_loss": 0.86095524, "learning_rate": 3.961366095394002e-06, "loss": 0.9422822, "num_input_tokens_seen": 32313555, "router_z_loss_clip": 4.37109375, "router_z_loss_mlp": 0.51757812, "step": 1510, "time_per_iteration": 2.6835644245147705 }, { "auxiliary_loss_clip": 0.06810378, "auxiliary_loss_mlp": 0.0130495, "balance_loss_clip": 0.06383757, "balance_loss_mlp": 0.01256289, "epoch": 0.09084623478130167, "flos": 21659270371200.0, "grad_norm": 2.2252257661694923, "language_loss": 0.89346397, "learning_rate": 3.961289878108262e-06, "loss": 0.97461724, "num_input_tokens_seen": 32331430, "router_z_loss_clip": 4.25976562, "router_z_loss_mlp": 0.48657227, "step": 1511, "time_per_iteration": 2.6757876873016357 }, { "auxiliary_loss_clip": 0.06783278, "auxiliary_loss_mlp": 0.01299259, "balance_loss_clip": 0.06369987, "balance_loss_mlp": 0.01253221, "epoch": 0.09090635803396964, "flos": 27647148119040.0, "grad_norm": 1.7960557462633309, "language_loss": 0.86326444, "learning_rate": 3.9612135864502135e-06, "loss": 0.94408983, "num_input_tokens_seen": 32353705, "router_z_loss_clip": 4.13671875, "router_z_loss_mlp": 0.46044922, "step": 1512, "time_per_iteration": 2.6887314319610596 }, { "auxiliary_loss_clip": 0.0678344, "auxiliary_loss_mlp": 0.01302062, "balance_loss_clip": 0.0637619, "balance_loss_mlp": 0.01257836, "epoch": 0.0909664812866376, "flos": 17673757643520.0, "grad_norm": 2.830204921081167, "language_loss": 0.88127124, "learning_rate": 3.961137220422749e-06, "loss": 0.96212626, "num_input_tokens_seen": 32370520, "router_z_loss_clip": 4.0703125, "router_z_loss_mlp": 0.44213867, "step": 1513, "time_per_iteration": 2.5991251468658447 }, { "auxiliary_loss_clip": 0.06797829, "auxiliary_loss_mlp": 0.01304663, "balance_loss_clip": 0.06388189, "balance_loss_mlp": 0.01261461, "epoch": 0.09102660453930557, "flos": 23958261244800.0, "grad_norm": 2.4585627569912507, "language_loss": 0.88242239, "learning_rate": 3.961060780028764e-06, "loss": 0.96344733, "num_input_tokens_seen": 32389105, "router_z_loss_clip": 4.09765625, "router_z_loss_mlp": 0.43164062, "step": 1514, "time_per_iteration": 2.653118133544922 }, { "auxiliary_loss_clip": 0.0679114, "auxiliary_loss_mlp": 0.01307868, "balance_loss_clip": 0.06383909, "balance_loss_mlp": 0.0126481, "epoch": 0.09108672779197355, "flos": 25820195621760.0, "grad_norm": 2.5879367863847005, "language_loss": 0.90751028, "learning_rate": 3.960984265271159e-06, "loss": 0.98850048, "num_input_tokens_seen": 32408065, "router_z_loss_clip": 4.0703125, "router_z_loss_mlp": 0.43066406, "step": 1515, "time_per_iteration": 2.693336009979248 }, { "auxiliary_loss_clip": 0.06798001, "auxiliary_loss_mlp": 0.01306341, "balance_loss_clip": 0.06388596, "balance_loss_mlp": 0.01261828, "epoch": 0.09114685104464151, "flos": 29646620173440.0, "grad_norm": 3.8117199058821667, "language_loss": 0.87334132, "learning_rate": 3.9609076761528335e-06, "loss": 0.95438474, "num_input_tokens_seen": 32427225, "router_z_loss_clip": 4.09375, "router_z_loss_mlp": 0.44506836, "step": 1516, "time_per_iteration": 2.714082956314087 }, { "auxiliary_loss_clip": 0.06800516, "auxiliary_loss_mlp": 0.01307779, "balance_loss_clip": 0.06390116, "balance_loss_mlp": 0.01263743, "epoch": 0.09120697429730948, "flos": 33738084789120.0, "grad_norm": 1.7349369454492423, "language_loss": 0.82733643, "learning_rate": 3.960831012676692e-06, "loss": 0.90841937, "num_input_tokens_seen": 32450510, "router_z_loss_clip": 4.10742188, "router_z_loss_mlp": 0.44042969, "step": 1517, "time_per_iteration": 2.7248404026031494 }, { "auxiliary_loss_clip": 0.06794877, "auxiliary_loss_mlp": 0.01311525, "balance_loss_clip": 0.06382877, "balance_loss_mlp": 0.01266965, "epoch": 0.09126709754997746, "flos": 18406559525760.0, "grad_norm": 1.92052642608398, "language_loss": 0.79218632, "learning_rate": 3.960754274845642e-06, "loss": 0.87325037, "num_input_tokens_seen": 32468425, "router_z_loss_clip": 4.1171875, "router_z_loss_mlp": 0.44555664, "step": 1518, "time_per_iteration": 4.105165719985962 }, { "auxiliary_loss_clip": 0.06791796, "auxiliary_loss_mlp": 0.01313018, "balance_loss_clip": 0.0638089, "balance_loss_mlp": 0.01267385, "epoch": 0.09132722080264542, "flos": 22098674782080.0, "grad_norm": 3.227700959204142, "language_loss": 0.88439995, "learning_rate": 3.960677462662594e-06, "loss": 0.96544802, "num_input_tokens_seen": 32487510, "router_z_loss_clip": 4.11523438, "router_z_loss_mlp": 0.45629883, "step": 1519, "time_per_iteration": 2.63222074508667 }, { "auxiliary_loss_clip": 0.06803968, "auxiliary_loss_mlp": 0.01309285, "balance_loss_clip": 0.06386667, "balance_loss_mlp": 0.01262985, "epoch": 0.09138734405531339, "flos": 21039547973760.0, "grad_norm": 3.2462290504446396, "language_loss": 0.74944466, "learning_rate": 3.96060057613046e-06, "loss": 0.83057714, "num_input_tokens_seen": 32507250, "router_z_loss_clip": 4.17578125, "router_z_loss_mlp": 0.46289062, "step": 1520, "time_per_iteration": 4.033954858779907 }, { "auxiliary_loss_clip": 0.0680588, "auxiliary_loss_mlp": 0.01315662, "balance_loss_clip": 0.06393393, "balance_loss_mlp": 0.01268002, "epoch": 0.09144746730798137, "flos": 20090104560000.0, "grad_norm": 3.8967735635054073, "language_loss": 0.87269294, "learning_rate": 3.960523615252156e-06, "loss": 0.95390832, "num_input_tokens_seen": 32526045, "router_z_loss_clip": 4.12109375, "router_z_loss_mlp": 0.47705078, "step": 1521, "time_per_iteration": 2.60439133644104 }, { "auxiliary_loss_clip": 0.06810939, "auxiliary_loss_mlp": 0.0134272, "balance_loss_clip": 0.06390049, "balance_loss_mlp": 0.01294655, "epoch": 0.09150759056064933, "flos": 22783874745600.0, "grad_norm": 2.204273202595362, "language_loss": 0.85719287, "learning_rate": 3.960446580030599e-06, "loss": 0.93872947, "num_input_tokens_seen": 32546575, "router_z_loss_clip": 4.21289062, "router_z_loss_mlp": 0.48071289, "step": 1522, "time_per_iteration": 4.18422794342041 }, { "auxiliary_loss_clip": 0.06777243, "auxiliary_loss_mlp": 0.01318532, "balance_loss_clip": 0.06378606, "balance_loss_mlp": 0.01274448, "epoch": 0.0915677138133173, "flos": 27571733844480.0, "grad_norm": 3.9273681911157103, "language_loss": 0.82702088, "learning_rate": 3.960369470468711e-06, "loss": 0.90797859, "num_input_tokens_seen": 32568795, "router_z_loss_clip": 3.98828125, "router_z_loss_mlp": 0.44091797, "step": 1523, "time_per_iteration": 4.143735408782959 }, { "auxiliary_loss_clip": 0.06797744, "auxiliary_loss_mlp": 0.01318097, "balance_loss_clip": 0.06380507, "balance_loss_mlp": 0.01269459, "epoch": 0.09162783706598528, "flos": 17680340188800.0, "grad_norm": 3.8337168816664384, "language_loss": 0.75863516, "learning_rate": 3.960292286569418e-06, "loss": 0.83979356, "num_input_tokens_seen": 32587010, "router_z_loss_clip": 4.16796875, "router_z_loss_mlp": 0.48657227, "step": 1524, "time_per_iteration": 2.6305859088897705 }, { "auxiliary_loss_clip": 0.06783342, "auxiliary_loss_mlp": 0.01308548, "balance_loss_clip": 0.06377824, "balance_loss_mlp": 0.01263463, "epoch": 0.09168796031865324, "flos": 18484028225280.0, "grad_norm": 4.1284856935361125, "language_loss": 0.87747633, "learning_rate": 3.960215028335644e-06, "loss": 0.95839524, "num_input_tokens_seen": 32602375, "router_z_loss_clip": 4.05273438, "router_z_loss_mlp": 0.45092773, "step": 1525, "time_per_iteration": 2.596228837966919 }, { "auxiliary_loss_clip": 0.06788874, "auxiliary_loss_mlp": 0.01324494, "balance_loss_clip": 0.06381489, "balance_loss_mlp": 0.01277192, "epoch": 0.0917480835713212, "flos": 29395290251520.0, "grad_norm": 2.696353712386686, "language_loss": 0.76699162, "learning_rate": 3.96013769577032e-06, "loss": 0.84812534, "num_input_tokens_seen": 32621460, "router_z_loss_clip": 4.07226562, "router_z_loss_mlp": 0.47314453, "step": 1526, "time_per_iteration": 2.6822009086608887 }, { "auxiliary_loss_clip": 0.06785876, "auxiliary_loss_mlp": 0.01311558, "balance_loss_clip": 0.06390624, "balance_loss_mlp": 0.01267808, "epoch": 0.09180820682398917, "flos": 19835504328960.0, "grad_norm": 2.9373801561328388, "language_loss": 0.78969884, "learning_rate": 3.960060288876378e-06, "loss": 0.87067318, "num_input_tokens_seen": 32640440, "router_z_loss_clip": 3.9453125, "router_z_loss_mlp": 0.43774414, "step": 1527, "time_per_iteration": 2.617539644241333 }, { "auxiliary_loss_clip": 0.06784845, "auxiliary_loss_mlp": 0.01319259, "balance_loss_clip": 0.06381918, "balance_loss_mlp": 0.01273483, "epoch": 0.09186833007665715, "flos": 23848619777280.0, "grad_norm": 2.815830392775537, "language_loss": 0.8069821, "learning_rate": 3.959982807656753e-06, "loss": 0.88802314, "num_input_tokens_seen": 32660020, "router_z_loss_clip": 4.02539062, "router_z_loss_mlp": 0.45751953, "step": 1528, "time_per_iteration": 2.6729676723480225 }, { "auxiliary_loss_clip": 0.06802326, "auxiliary_loss_mlp": 0.01317843, "balance_loss_clip": 0.06387784, "balance_loss_mlp": 0.01267227, "epoch": 0.09192845332932512, "flos": 12937693167360.0, "grad_norm": 4.606799516421051, "language_loss": 0.78807193, "learning_rate": 3.959905252114384e-06, "loss": 0.8692736, "num_input_tokens_seen": 32678170, "router_z_loss_clip": 4.14453125, "router_z_loss_mlp": 0.50634766, "step": 1529, "time_per_iteration": 2.593777656555176 }, { "auxiliary_loss_clip": 0.06795435, "auxiliary_loss_mlp": 0.01318828, "balance_loss_clip": 0.06391811, "balance_loss_mlp": 0.01271335, "epoch": 0.09198857658199308, "flos": 24574503697920.0, "grad_norm": 2.344798949701115, "language_loss": 0.84624648, "learning_rate": 3.959827622252211e-06, "loss": 0.92738914, "num_input_tokens_seen": 32697540, "router_z_loss_clip": 4.03125, "router_z_loss_mlp": 0.47509766, "step": 1530, "time_per_iteration": 2.645503282546997 }, { "auxiliary_loss_clip": 0.06787922, "auxiliary_loss_mlp": 0.01301558, "balance_loss_clip": 0.06394622, "balance_loss_mlp": 0.01256545, "epoch": 0.09204869983466106, "flos": 20273231658240.0, "grad_norm": 2.2728222808713454, "language_loss": 0.85708934, "learning_rate": 3.959749918073179e-06, "loss": 0.93798411, "num_input_tokens_seen": 32716805, "router_z_loss_clip": 3.9375, "router_z_loss_mlp": 0.45019531, "step": 1531, "time_per_iteration": 2.708223342895508 }, { "auxiliary_loss_clip": 0.06799568, "auxiliary_loss_mlp": 0.01321813, "balance_loss_clip": 0.06403757, "balance_loss_mlp": 0.01273867, "epoch": 0.09210882308732903, "flos": 20891780098560.0, "grad_norm": 2.217348120137983, "language_loss": 0.82410157, "learning_rate": 3.959672139580233e-06, "loss": 0.9053154, "num_input_tokens_seen": 32736385, "router_z_loss_clip": 3.95507812, "router_z_loss_mlp": 0.47924805, "step": 1532, "time_per_iteration": 2.6651251316070557 }, { "auxiliary_loss_clip": 0.06806297, "auxiliary_loss_mlp": 0.01317283, "balance_loss_clip": 0.06410284, "balance_loss_mlp": 0.01272007, "epoch": 0.09216894633999699, "flos": 30964246427520.0, "grad_norm": 3.367281722387012, "language_loss": 0.85799646, "learning_rate": 3.9595942867763235e-06, "loss": 0.93923223, "num_input_tokens_seen": 32757140, "router_z_loss_clip": 3.96289062, "router_z_loss_mlp": 0.45263672, "step": 1533, "time_per_iteration": 2.7785091400146484 }, { "auxiliary_loss_clip": 0.06811766, "auxiliary_loss_mlp": 0.01316769, "balance_loss_clip": 0.06414944, "balance_loss_mlp": 0.01270302, "epoch": 0.09222906959266497, "flos": 13156556832000.0, "grad_norm": 2.513558632657781, "language_loss": 0.92118102, "learning_rate": 3.959516359664402e-06, "loss": 1.00246644, "num_input_tokens_seen": 32774860, "router_z_loss_clip": 3.96679688, "router_z_loss_mlp": 0.46508789, "step": 1534, "time_per_iteration": 2.6522908210754395 }, { "auxiliary_loss_clip": 0.06814555, "auxiliary_loss_mlp": 0.01311757, "balance_loss_clip": 0.06415081, "balance_loss_mlp": 0.01263477, "epoch": 0.09228919284533293, "flos": 26001603711360.0, "grad_norm": 3.4485613754988926, "language_loss": 0.77278602, "learning_rate": 3.959438358247424e-06, "loss": 0.85404909, "num_input_tokens_seen": 32795250, "router_z_loss_clip": 3.9921875, "router_z_loss_mlp": 0.48266602, "step": 1535, "time_per_iteration": 2.650662899017334 }, { "auxiliary_loss_clip": 0.06804935, "auxiliary_loss_mlp": 0.01306177, "balance_loss_clip": 0.06419115, "balance_loss_mlp": 0.01261903, "epoch": 0.0923493160980009, "flos": 18666694126080.0, "grad_norm": 2.65227079358171, "language_loss": 0.83081681, "learning_rate": 3.959360282528346e-06, "loss": 0.91192794, "num_input_tokens_seen": 32813805, "router_z_loss_clip": 3.85351562, "router_z_loss_mlp": 0.44311523, "step": 1536, "time_per_iteration": 2.6015799045562744 }, { "auxiliary_loss_clip": 0.06811757, "auxiliary_loss_mlp": 0.01304998, "balance_loss_clip": 0.06430276, "balance_loss_mlp": 0.01264372, "epoch": 0.09240943935066886, "flos": 21146673818880.0, "grad_norm": 2.231407614769426, "language_loss": 0.91411829, "learning_rate": 3.959282132510131e-06, "loss": 0.99528587, "num_input_tokens_seen": 32830960, "router_z_loss_clip": 3.81445312, "router_z_loss_mlp": 0.40625, "step": 1537, "time_per_iteration": 2.6088790893554688 }, { "auxiliary_loss_clip": 0.06806786, "auxiliary_loss_mlp": 0.01318363, "balance_loss_clip": 0.06415476, "balance_loss_mlp": 0.01272563, "epoch": 0.09246956260333684, "flos": 20598298773120.0, "grad_norm": 2.965754035670092, "language_loss": 0.82266855, "learning_rate": 3.959203908195741e-06, "loss": 0.90392005, "num_input_tokens_seen": 32848275, "router_z_loss_clip": 3.91601562, "router_z_loss_mlp": 0.45800781, "step": 1538, "time_per_iteration": 2.6509573459625244 }, { "auxiliary_loss_clip": 0.06782147, "auxiliary_loss_mlp": 0.01297412, "balance_loss_clip": 0.06511211, "balance_loss_mlp": 0.0127388, "epoch": 0.09252968585600481, "flos": 67580052312960.0, "grad_norm": 0.742770008458353, "language_loss": 0.57466853, "learning_rate": 3.959125609588142e-06, "loss": 0.65546411, "num_input_tokens_seen": 32917730, "router_z_loss_clip": 2.70898438, "router_z_loss_mlp": 0.23498535, "step": 1539, "time_per_iteration": 3.355079412460327 }, { "auxiliary_loss_clip": 0.06800942, "auxiliary_loss_mlp": 0.01315029, "balance_loss_clip": 0.06409881, "balance_loss_mlp": 0.0126868, "epoch": 0.09258980910867277, "flos": 17389542193920.0, "grad_norm": 3.9743563501486108, "language_loss": 0.70704925, "learning_rate": 3.959047236690304e-06, "loss": 0.78820896, "num_input_tokens_seen": 32934910, "router_z_loss_clip": 3.90625, "router_z_loss_mlp": 0.46313477, "step": 1540, "time_per_iteration": 2.607234001159668 }, { "auxiliary_loss_clip": 0.0680051, "auxiliary_loss_mlp": 0.01320761, "balance_loss_clip": 0.06416254, "balance_loss_mlp": 0.01277679, "epoch": 0.09264993236134075, "flos": 19872205217280.0, "grad_norm": 2.4995019645010133, "language_loss": 0.85446852, "learning_rate": 3.958968789505198e-06, "loss": 0.93568122, "num_input_tokens_seen": 32953840, "router_z_loss_clip": 3.8359375, "router_z_loss_mlp": 0.4309082, "step": 1541, "time_per_iteration": 2.6040658950805664 }, { "auxiliary_loss_clip": 0.06708808, "auxiliary_loss_mlp": 0.01307424, "balance_loss_clip": 0.06440151, "balance_loss_mlp": 0.01285358, "epoch": 0.09271005561400872, "flos": 62301455377920.0, "grad_norm": 0.8871401509793009, "language_loss": 0.62247491, "learning_rate": 3.9588902680358e-06, "loss": 0.7026372, "num_input_tokens_seen": 33011410, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.22094727, "step": 1542, "time_per_iteration": 3.254204750061035 }, { "auxiliary_loss_clip": 0.06768715, "auxiliary_loss_mlp": 0.01314533, "balance_loss_clip": 0.0637852, "balance_loss_mlp": 0.01271594, "epoch": 0.09277017886667668, "flos": 23336358641280.0, "grad_norm": 2.2837865446841694, "language_loss": 0.84089226, "learning_rate": 3.958811672285086e-06, "loss": 0.92172468, "num_input_tokens_seen": 33031675, "router_z_loss_clip": 3.90039062, "router_z_loss_mlp": 0.42944336, "step": 1543, "time_per_iteration": 2.6350626945495605 }, { "auxiliary_loss_clip": 0.06756574, "auxiliary_loss_mlp": 0.01317438, "balance_loss_clip": 0.06374413, "balance_loss_mlp": 0.01272139, "epoch": 0.09283030211934466, "flos": 54757088513280.0, "grad_norm": 2.133293231301749, "language_loss": 0.74024165, "learning_rate": 3.958733002256038e-06, "loss": 0.82098174, "num_input_tokens_seen": 33056355, "router_z_loss_clip": 3.82421875, "router_z_loss_mlp": 0.45263672, "step": 1544, "time_per_iteration": 2.8956406116485596 }, { "auxiliary_loss_clip": 0.06772668, "auxiliary_loss_mlp": 0.01310226, "balance_loss_clip": 0.0637433, "balance_loss_mlp": 0.01266404, "epoch": 0.09289042537201263, "flos": 30342385751040.0, "grad_norm": 2.7511497529440745, "language_loss": 0.79011381, "learning_rate": 3.958654257951637e-06, "loss": 0.87094277, "num_input_tokens_seen": 33079520, "router_z_loss_clip": 3.984375, "router_z_loss_mlp": 0.43847656, "step": 1545, "time_per_iteration": 2.7013938426971436 }, { "auxiliary_loss_clip": 0.06750963, "auxiliary_loss_mlp": 0.01321837, "balance_loss_clip": 0.0636421, "balance_loss_mlp": 0.0127897, "epoch": 0.09295054862468059, "flos": 17752274519040.0, "grad_norm": 4.952089517396567, "language_loss": 0.76905489, "learning_rate": 3.9585754393748706e-06, "loss": 0.84978294, "num_input_tokens_seen": 33096135, "router_z_loss_clip": 3.8671875, "router_z_loss_mlp": 0.4284668, "step": 1546, "time_per_iteration": 2.5994997024536133 }, { "auxiliary_loss_clip": 0.06752537, "auxiliary_loss_mlp": 0.0131737, "balance_loss_clip": 0.06359969, "balance_loss_mlp": 0.01272381, "epoch": 0.09301067187734856, "flos": 23664528357120.0, "grad_norm": 2.2306706853924, "language_loss": 0.85463792, "learning_rate": 3.9584965465287275e-06, "loss": 0.93533695, "num_input_tokens_seen": 33115245, "router_z_loss_clip": 3.92382812, "router_z_loss_mlp": 0.44995117, "step": 1547, "time_per_iteration": 2.6246957778930664 }, { "auxiliary_loss_clip": 0.06758925, "auxiliary_loss_mlp": 0.01313229, "balance_loss_clip": 0.06359994, "balance_loss_mlp": 0.01267071, "epoch": 0.09307079513001654, "flos": 27535242591360.0, "grad_norm": 3.901756214211823, "language_loss": 0.69953144, "learning_rate": 3.958417579416199e-06, "loss": 0.78025293, "num_input_tokens_seen": 33136640, "router_z_loss_clip": 3.9921875, "router_z_loss_mlp": 0.46166992, "step": 1548, "time_per_iteration": 2.6540093421936035 }, { "auxiliary_loss_clip": 0.0675448, "auxiliary_loss_mlp": 0.01315691, "balance_loss_clip": 0.06356379, "balance_loss_mlp": 0.01269318, "epoch": 0.0931309183826845, "flos": 20632945236480.0, "grad_norm": 2.3585480356257653, "language_loss": 0.85757577, "learning_rate": 3.9583385380402795e-06, "loss": 0.93827748, "num_input_tokens_seen": 33155060, "router_z_loss_clip": 3.97851562, "router_z_loss_mlp": 0.46386719, "step": 1549, "time_per_iteration": 2.6040759086608887 }, { "auxiliary_loss_clip": 0.06742711, "auxiliary_loss_mlp": 0.0132405, "balance_loss_clip": 0.06354286, "balance_loss_mlp": 0.01279346, "epoch": 0.09319104163535247, "flos": 29028239441280.0, "grad_norm": 2.0277581887520837, "language_loss": 0.77898222, "learning_rate": 3.958259422403966e-06, "loss": 0.85964984, "num_input_tokens_seen": 33175420, "router_z_loss_clip": 3.88671875, "router_z_loss_mlp": 0.44677734, "step": 1550, "time_per_iteration": 2.6782095432281494 }, { "auxiliary_loss_clip": 0.06753023, "auxiliary_loss_mlp": 0.01311876, "balance_loss_clip": 0.06360143, "balance_loss_mlp": 0.01267435, "epoch": 0.09325116488802045, "flos": 25308605318400.0, "grad_norm": 2.6814896018279253, "language_loss": 0.85380828, "learning_rate": 3.95818023251026e-06, "loss": 0.9344573, "num_input_tokens_seen": 33194120, "router_z_loss_clip": 3.92578125, "router_z_loss_mlp": 0.44384766, "step": 1551, "time_per_iteration": 2.6498703956604004 }, { "auxiliary_loss_clip": 0.06651377, "auxiliary_loss_mlp": 0.01272962, "balance_loss_clip": 0.06376474, "balance_loss_mlp": 0.01251242, "epoch": 0.09331128814068841, "flos": 61556144509440.0, "grad_norm": 0.7250844221835586, "language_loss": 0.61744189, "learning_rate": 3.958100968362163e-06, "loss": 0.69668525, "num_input_tokens_seen": 33261080, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.21740723, "step": 1552, "time_per_iteration": 3.3768808841705322 }, { "auxiliary_loss_clip": 0.06645823, "auxiliary_loss_mlp": 0.01279798, "balance_loss_clip": 0.06372906, "balance_loss_mlp": 0.01259318, "epoch": 0.09337141139335638, "flos": 53312810883840.0, "grad_norm": 0.8010536157787661, "language_loss": 0.59145486, "learning_rate": 3.958021629962681e-06, "loss": 0.6707111, "num_input_tokens_seen": 33330235, "router_z_loss_clip": 2.72070312, "router_z_loss_mlp": 0.20483398, "step": 1553, "time_per_iteration": 3.3755106925964355 }, { "auxiliary_loss_clip": 0.06773467, "auxiliary_loss_mlp": 0.01303372, "balance_loss_clip": 0.06369022, "balance_loss_mlp": 0.01259217, "epoch": 0.09343153464602436, "flos": 23483539537920.0, "grad_norm": 4.479843221023589, "language_loss": 0.89316463, "learning_rate": 3.957942217314823e-06, "loss": 0.97393298, "num_input_tokens_seen": 33349035, "router_z_loss_clip": 4.05078125, "router_z_loss_mlp": 0.44116211, "step": 1554, "time_per_iteration": 2.6912035942077637 }, { "auxiliary_loss_clip": 0.06755567, "auxiliary_loss_mlp": 0.01313188, "balance_loss_clip": 0.06369317, "balance_loss_mlp": 0.01271274, "epoch": 0.09349165789869232, "flos": 19359399029760.0, "grad_norm": 3.1760283658474298, "language_loss": 0.83453882, "learning_rate": 3.957862730421599e-06, "loss": 0.91522634, "num_input_tokens_seen": 33368060, "router_z_loss_clip": 3.86132812, "router_z_loss_mlp": 0.41918945, "step": 1555, "time_per_iteration": 2.6163687705993652 }, { "auxiliary_loss_clip": 0.06639385, "auxiliary_loss_mlp": 0.01276224, "balance_loss_clip": 0.06369699, "balance_loss_mlp": 0.0125609, "epoch": 0.09355178115136029, "flos": 67520626968960.0, "grad_norm": 0.8595968669345828, "language_loss": 0.5986408, "learning_rate": 3.957783169286024e-06, "loss": 0.6777969, "num_input_tokens_seen": 33430825, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.20129395, "step": 1556, "time_per_iteration": 3.257835865020752 }, { "auxiliary_loss_clip": 0.06763105, "auxiliary_loss_mlp": 0.01307313, "balance_loss_clip": 0.06374504, "balance_loss_mlp": 0.0126745, "epoch": 0.09361190440402825, "flos": 37350676920960.0, "grad_norm": 6.837303428895899, "language_loss": 0.85856962, "learning_rate": 3.9577035339111155e-06, "loss": 0.93927383, "num_input_tokens_seen": 33454855, "router_z_loss_clip": 3.88476562, "router_z_loss_mlp": 0.39892578, "step": 1557, "time_per_iteration": 2.7551822662353516 }, { "auxiliary_loss_clip": 0.06771216, "auxiliary_loss_mlp": 0.01306079, "balance_loss_clip": 0.06380416, "balance_loss_mlp": 0.0126531, "epoch": 0.09367202765669623, "flos": 24906614555520.0, "grad_norm": 2.986135904892164, "language_loss": 0.79538667, "learning_rate": 3.957623824299893e-06, "loss": 0.87615955, "num_input_tokens_seen": 33476000, "router_z_loss_clip": 3.90820312, "router_z_loss_mlp": 0.40795898, "step": 1558, "time_per_iteration": 4.09453821182251 }, { "auxiliary_loss_clip": 0.06777401, "auxiliary_loss_mlp": 0.01304282, "balance_loss_clip": 0.06380872, "balance_loss_mlp": 0.01263178, "epoch": 0.0937321509093642, "flos": 15710986477440.0, "grad_norm": 2.9966713735428554, "language_loss": 0.81451976, "learning_rate": 3.957544040455379e-06, "loss": 0.89533657, "num_input_tokens_seen": 33493845, "router_z_loss_clip": 3.96289062, "router_z_loss_mlp": 0.41088867, "step": 1559, "time_per_iteration": 4.064835548400879 }, { "auxiliary_loss_clip": 0.06756769, "auxiliary_loss_mlp": 0.01298231, "balance_loss_clip": 0.06364279, "balance_loss_mlp": 0.01256531, "epoch": 0.09379227416203216, "flos": 20489663554560.0, "grad_norm": 3.6523938600927934, "language_loss": 0.77689308, "learning_rate": 3.957464182380599e-06, "loss": 0.85744309, "num_input_tokens_seen": 33510850, "router_z_loss_clip": 3.93359375, "router_z_loss_mlp": 0.41699219, "step": 1560, "time_per_iteration": 2.5986931324005127 }, { "auxiliary_loss_clip": 0.0677727, "auxiliary_loss_mlp": 0.01308596, "balance_loss_clip": 0.06379139, "balance_loss_mlp": 0.01265919, "epoch": 0.09385239741470014, "flos": 24359329612800.0, "grad_norm": 2.425750054789628, "language_loss": 0.82646525, "learning_rate": 3.95738425007858e-06, "loss": 0.90732396, "num_input_tokens_seen": 33530430, "router_z_loss_clip": 3.98046875, "router_z_loss_mlp": 0.42675781, "step": 1561, "time_per_iteration": 4.126224040985107 }, { "auxiliary_loss_clip": 0.06779194, "auxiliary_loss_mlp": 0.01311124, "balance_loss_clip": 0.06378566, "balance_loss_mlp": 0.01270164, "epoch": 0.0939125206673681, "flos": 33299812408320.0, "grad_norm": 6.537172984440492, "language_loss": 0.63813555, "learning_rate": 3.957304243552354e-06, "loss": 0.71903872, "num_input_tokens_seen": 33551975, "router_z_loss_clip": 4.00390625, "router_z_loss_mlp": 0.40991211, "step": 1562, "time_per_iteration": 4.185566425323486 }, { "auxiliary_loss_clip": 0.06746057, "auxiliary_loss_mlp": 0.01314606, "balance_loss_clip": 0.06364264, "balance_loss_mlp": 0.01274671, "epoch": 0.09397264392003607, "flos": 19250973446400.0, "grad_norm": 20.296103379801263, "language_loss": 0.86690772, "learning_rate": 3.957224162804956e-06, "loss": 0.94751436, "num_input_tokens_seen": 33569850, "router_z_loss_clip": 3.81835938, "router_z_loss_mlp": 0.39916992, "step": 1563, "time_per_iteration": 2.6276252269744873 }, { "auxiliary_loss_clip": 0.0676408, "auxiliary_loss_mlp": 0.01299671, "balance_loss_clip": 0.06376237, "balance_loss_mlp": 0.01262716, "epoch": 0.09403276717270405, "flos": 19323997879680.0, "grad_norm": 2.99375328357717, "language_loss": 0.78054428, "learning_rate": 3.9571440078394205e-06, "loss": 0.86118174, "num_input_tokens_seen": 33590510, "router_z_loss_clip": 3.88085938, "router_z_loss_mlp": 0.36938477, "step": 1564, "time_per_iteration": 2.6109578609466553 }, { "auxiliary_loss_clip": 0.06773363, "auxiliary_loss_mlp": 0.0131183, "balance_loss_clip": 0.06379582, "balance_loss_mlp": 0.0127354, "epoch": 0.09409289042537201, "flos": 23589701061120.0, "grad_norm": 5.2703012759236065, "language_loss": 0.81633496, "learning_rate": 3.9570637786587895e-06, "loss": 0.89718688, "num_input_tokens_seen": 33608810, "router_z_loss_clip": 3.94140625, "router_z_loss_mlp": 0.38305664, "step": 1565, "time_per_iteration": 2.6401426792144775 }, { "auxiliary_loss_clip": 0.06792289, "auxiliary_loss_mlp": 0.01309647, "balance_loss_clip": 0.06391626, "balance_loss_mlp": 0.01266612, "epoch": 0.09415301367803998, "flos": 20083689722880.0, "grad_norm": 3.326301375820198, "language_loss": 0.78240979, "learning_rate": 3.956983475266103e-06, "loss": 0.86342919, "num_input_tokens_seen": 33627265, "router_z_loss_clip": 4.0078125, "router_z_loss_mlp": 0.43066406, "step": 1566, "time_per_iteration": 2.694084644317627 }, { "auxiliary_loss_clip": 0.06775223, "auxiliary_loss_mlp": 0.01314953, "balance_loss_clip": 0.06385452, "balance_loss_mlp": 0.01275519, "epoch": 0.09421313693070796, "flos": 21067234548480.0, "grad_norm": 2.893684102958201, "language_loss": 0.7970981, "learning_rate": 3.956903097664407e-06, "loss": 0.87799978, "num_input_tokens_seen": 33644810, "router_z_loss_clip": 3.89648438, "router_z_loss_mlp": 0.39428711, "step": 1567, "time_per_iteration": 2.6816234588623047 }, { "auxiliary_loss_clip": 0.06782618, "auxiliary_loss_mlp": 0.01317794, "balance_loss_clip": 0.06389236, "balance_loss_mlp": 0.01277335, "epoch": 0.09427326018337592, "flos": 24323006067840.0, "grad_norm": 2.145330871694343, "language_loss": 0.84104002, "learning_rate": 3.956822645856749e-06, "loss": 0.92204416, "num_input_tokens_seen": 33665665, "router_z_loss_clip": 3.93945312, "router_z_loss_mlp": 0.40454102, "step": 1568, "time_per_iteration": 2.703965663909912 }, { "auxiliary_loss_clip": 0.06795053, "auxiliary_loss_mlp": 0.01306909, "balance_loss_clip": 0.06392466, "balance_loss_mlp": 0.0126571, "epoch": 0.09433338343604389, "flos": 20269667859840.0, "grad_norm": 3.7720062467564848, "language_loss": 0.77958316, "learning_rate": 3.9567421198461814e-06, "loss": 0.86060274, "num_input_tokens_seen": 33684760, "router_z_loss_clip": 4.02734375, "router_z_loss_mlp": 0.41162109, "step": 1569, "time_per_iteration": 2.6603755950927734 }, { "auxiliary_loss_clip": 0.06784353, "auxiliary_loss_mlp": 0.01300174, "balance_loss_clip": 0.06391923, "balance_loss_mlp": 0.01263911, "epoch": 0.09439350668871185, "flos": 12746683785600.0, "grad_norm": 2.5897618270600278, "language_loss": 0.87125528, "learning_rate": 3.956661519635756e-06, "loss": 0.95210046, "num_input_tokens_seen": 33700750, "router_z_loss_clip": 3.91992188, "router_z_loss_mlp": 0.36254883, "step": 1570, "time_per_iteration": 2.618032217025757 }, { "auxiliary_loss_clip": 0.06795709, "auxiliary_loss_mlp": 0.0130691, "balance_loss_clip": 0.06392537, "balance_loss_mlp": 0.01268024, "epoch": 0.09445362994137983, "flos": 25970101776000.0, "grad_norm": 1.8645025371209143, "language_loss": 0.7803719, "learning_rate": 3.95658084522853e-06, "loss": 0.8613981, "num_input_tokens_seen": 33724430, "router_z_loss_clip": 4.02734375, "router_z_loss_mlp": 0.38842773, "step": 1571, "time_per_iteration": 2.7616801261901855 }, { "auxiliary_loss_clip": 0.06771859, "auxiliary_loss_mlp": 0.01301899, "balance_loss_clip": 0.06394264, "balance_loss_mlp": 0.01266161, "epoch": 0.0945137531940478, "flos": 19720831616640.0, "grad_norm": 4.015016250908715, "language_loss": 0.81328589, "learning_rate": 3.956500096627561e-06, "loss": 0.89402348, "num_input_tokens_seen": 33743455, "router_z_loss_clip": 3.77929688, "router_z_loss_mlp": 0.35766602, "step": 1572, "time_per_iteration": 2.632453680038452 }, { "auxiliary_loss_clip": 0.06801708, "auxiliary_loss_mlp": 0.01313017, "balance_loss_clip": 0.06398404, "balance_loss_mlp": 0.01271127, "epoch": 0.09457387644671576, "flos": 23622796224000.0, "grad_norm": 6.648319096391224, "language_loss": 0.8906976, "learning_rate": 3.956419273835913e-06, "loss": 0.97184485, "num_input_tokens_seen": 33763435, "router_z_loss_clip": 4.03125, "router_z_loss_mlp": 0.41821289, "step": 1573, "time_per_iteration": 2.6650760173797607 }, { "auxiliary_loss_clip": 0.0681649, "auxiliary_loss_mlp": 0.01314731, "balance_loss_clip": 0.06413237, "balance_loss_mlp": 0.01272077, "epoch": 0.09463399969938374, "flos": 26914681653120.0, "grad_norm": 3.5888644937096474, "language_loss": 0.82766992, "learning_rate": 3.95633837685665e-06, "loss": 0.90898216, "num_input_tokens_seen": 33784325, "router_z_loss_clip": 4.02929688, "router_z_loss_mlp": 0.42675781, "step": 1574, "time_per_iteration": 2.693629741668701 }, { "auxiliary_loss_clip": 0.06812151, "auxiliary_loss_mlp": 0.0130223, "balance_loss_clip": 0.06411262, "balance_loss_mlp": 0.01261437, "epoch": 0.0946941229520517, "flos": 23666331219840.0, "grad_norm": 2.360495470654502, "language_loss": 0.82620764, "learning_rate": 3.95625740569284e-06, "loss": 0.90735149, "num_input_tokens_seen": 33802510, "router_z_loss_clip": 4.01171875, "router_z_loss_mlp": 0.40771484, "step": 1575, "time_per_iteration": 2.676100254058838 }, { "auxiliary_loss_clip": 0.06807074, "auxiliary_loss_mlp": 0.01301523, "balance_loss_clip": 0.06405923, "balance_loss_mlp": 0.01259705, "epoch": 0.09475424620471967, "flos": 24140927145600.0, "grad_norm": 3.2237272147249136, "language_loss": 0.88702792, "learning_rate": 3.956176360347553e-06, "loss": 0.9681139, "num_input_tokens_seen": 33819980, "router_z_loss_clip": 4.00976562, "router_z_loss_mlp": 0.41821289, "step": 1576, "time_per_iteration": 2.6696910858154297 }, { "auxiliary_loss_clip": 0.06631337, "auxiliary_loss_mlp": 0.01274651, "balance_loss_clip": 0.06364952, "balance_loss_mlp": 0.01254624, "epoch": 0.09481436945738765, "flos": 68446283022720.0, "grad_norm": 0.9734151685362951, "language_loss": 0.65956485, "learning_rate": 3.956095240823862e-06, "loss": 0.73862475, "num_input_tokens_seen": 33878925, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.20019531, "step": 1577, "time_per_iteration": 3.25164532661438 }, { "auxiliary_loss_clip": 0.0682106, "auxiliary_loss_mlp": 0.0130047, "balance_loss_clip": 0.06422111, "balance_loss_mlp": 0.01262919, "epoch": 0.09487449271005562, "flos": 16659633277440.0, "grad_norm": 3.0434309086935607, "language_loss": 0.81474173, "learning_rate": 3.956014047124844e-06, "loss": 0.89595699, "num_input_tokens_seen": 33897600, "router_z_loss_clip": 3.9921875, "router_z_loss_mlp": 0.37573242, "step": 1578, "time_per_iteration": 2.609311819076538 }, { "auxiliary_loss_clip": 0.06827201, "auxiliary_loss_mlp": 0.01302064, "balance_loss_clip": 0.06432835, "balance_loss_mlp": 0.01260436, "epoch": 0.09493461596272358, "flos": 24281860913280.0, "grad_norm": 2.1104103425503, "language_loss": 0.79487371, "learning_rate": 3.955932779253578e-06, "loss": 0.8761664, "num_input_tokens_seen": 33917365, "router_z_loss_clip": 3.94335938, "router_z_loss_mlp": 0.41674805, "step": 1579, "time_per_iteration": 2.835291862487793 }, { "auxiliary_loss_clip": 0.06832175, "auxiliary_loss_mlp": 0.01303333, "balance_loss_clip": 0.06430145, "balance_loss_mlp": 0.01262825, "epoch": 0.09499473921539155, "flos": 21876373100160.0, "grad_norm": 7.088617778692598, "language_loss": 0.75339806, "learning_rate": 3.955851437213144e-06, "loss": 0.83475316, "num_input_tokens_seen": 33936680, "router_z_loss_clip": 4.01953125, "router_z_loss_mlp": 0.40478516, "step": 1580, "time_per_iteration": 2.7197537422180176 }, { "auxiliary_loss_clip": 0.06816164, "auxiliary_loss_mlp": 0.01302079, "balance_loss_clip": 0.06422467, "balance_loss_mlp": 0.01261715, "epoch": 0.09505486246805953, "flos": 33555544669440.0, "grad_norm": 2.332208366060225, "language_loss": 0.7901389, "learning_rate": 3.955770021006627e-06, "loss": 0.87132132, "num_input_tokens_seen": 33960685, "router_z_loss_clip": 3.93554688, "router_z_loss_mlp": 0.40380859, "step": 1581, "time_per_iteration": 2.7321808338165283 }, { "auxiliary_loss_clip": 0.06815472, "auxiliary_loss_mlp": 0.0130298, "balance_loss_clip": 0.06417145, "balance_loss_mlp": 0.01262139, "epoch": 0.09511498572072749, "flos": 21221752677120.0, "grad_norm": 2.9018832239983947, "language_loss": 0.89103514, "learning_rate": 3.955688530637116e-06, "loss": 0.97221959, "num_input_tokens_seen": 33980015, "router_z_loss_clip": 3.98046875, "router_z_loss_mlp": 0.40820312, "step": 1582, "time_per_iteration": 2.6304686069488525 }, { "auxiliary_loss_clip": 0.06831427, "auxiliary_loss_mlp": 0.01300784, "balance_loss_clip": 0.0643124, "balance_loss_mlp": 0.01259562, "epoch": 0.09517510897339546, "flos": 14616542373120.0, "grad_norm": 6.557243180887431, "language_loss": 0.68935877, "learning_rate": 3.955606966107699e-06, "loss": 0.7706809, "num_input_tokens_seen": 33997705, "router_z_loss_clip": 4.0, "router_z_loss_mlp": 0.41235352, "step": 1583, "time_per_iteration": 2.59810471534729 }, { "auxiliary_loss_clip": 0.06843662, "auxiliary_loss_mlp": 0.01310277, "balance_loss_clip": 0.06442066, "balance_loss_mlp": 0.01267767, "epoch": 0.09523523222606343, "flos": 27824531212800.0, "grad_norm": 2.749642437770565, "language_loss": 0.72138506, "learning_rate": 3.95552532742147e-06, "loss": 0.80292445, "num_input_tokens_seen": 34017465, "router_z_loss_clip": 4.015625, "router_z_loss_mlp": 0.42529297, "step": 1584, "time_per_iteration": 2.6751868724823 }, { "auxiliary_loss_clip": 0.06839274, "auxiliary_loss_mlp": 0.01308924, "balance_loss_clip": 0.06446652, "balance_loss_mlp": 0.01269728, "epoch": 0.0952953554787314, "flos": 20712887631360.0, "grad_norm": 1.865287967842826, "language_loss": 0.81961453, "learning_rate": 3.955443614581525e-06, "loss": 0.90109658, "num_input_tokens_seen": 34038550, "router_z_loss_clip": 3.92578125, "router_z_loss_mlp": 0.3918457, "step": 1585, "time_per_iteration": 2.6614761352539062 }, { "auxiliary_loss_clip": 0.06857733, "auxiliary_loss_mlp": 0.01305807, "balance_loss_clip": 0.06441283, "balance_loss_mlp": 0.01261628, "epoch": 0.09535547873139937, "flos": 24794080122240.0, "grad_norm": 2.6469747669459007, "language_loss": 0.74548328, "learning_rate": 3.955361827590961e-06, "loss": 0.82711869, "num_input_tokens_seen": 34058665, "router_z_loss_clip": 4.16796875, "router_z_loss_mlp": 0.44165039, "step": 1586, "time_per_iteration": 2.652515172958374 }, { "auxiliary_loss_clip": 0.06658298, "auxiliary_loss_mlp": 0.01287671, "balance_loss_clip": 0.06393337, "balance_loss_mlp": 0.01265152, "epoch": 0.09541560198406734, "flos": 71930114956800.0, "grad_norm": 0.796136059270385, "language_loss": 0.55050206, "learning_rate": 3.955279966452883e-06, "loss": 0.62996173, "num_input_tokens_seen": 34109655, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.22509766, "step": 1587, "time_per_iteration": 3.0630173683166504 }, { "auxiliary_loss_clip": 0.06855502, "auxiliary_loss_mlp": 0.01311672, "balance_loss_clip": 0.06447985, "balance_loss_mlp": 0.01267183, "epoch": 0.09547572523673531, "flos": 28989609909120.0, "grad_norm": 2.3745928464982873, "language_loss": 0.82586581, "learning_rate": 3.955198031170391e-06, "loss": 0.90753758, "num_input_tokens_seen": 34131115, "router_z_loss_clip": 4.07226562, "router_z_loss_mlp": 0.44506836, "step": 1588, "time_per_iteration": 2.8014349937438965 }, { "auxiliary_loss_clip": 0.06846365, "auxiliary_loss_mlp": 0.0131207, "balance_loss_clip": 0.06442844, "balance_loss_mlp": 0.0126987, "epoch": 0.09553584848940327, "flos": 24140759437440.0, "grad_norm": 2.3246572293413847, "language_loss": 0.83013839, "learning_rate": 3.955116021746594e-06, "loss": 0.91172278, "num_input_tokens_seen": 34151925, "router_z_loss_clip": 4.03125, "router_z_loss_mlp": 0.421875, "step": 1589, "time_per_iteration": 2.681642770767212 }, { "auxiliary_loss_clip": 0.06826401, "auxiliary_loss_mlp": 0.01314812, "balance_loss_clip": 0.06428735, "balance_loss_mlp": 0.01273518, "epoch": 0.09559597174207124, "flos": 42861401193600.0, "grad_norm": 1.964128813509078, "language_loss": 0.6674878, "learning_rate": 3.955033938184601e-06, "loss": 0.74889994, "num_input_tokens_seen": 34175395, "router_z_loss_clip": 3.97851562, "router_z_loss_mlp": 0.4128418, "step": 1590, "time_per_iteration": 2.836643934249878 }, { "auxiliary_loss_clip": 0.06823058, "auxiliary_loss_mlp": 0.01305966, "balance_loss_clip": 0.06423153, "balance_loss_mlp": 0.01266246, "epoch": 0.09565609499473922, "flos": 32678999907840.0, "grad_norm": 2.9655371030614215, "language_loss": 0.84791631, "learning_rate": 3.954951780487526e-06, "loss": 0.92920649, "num_input_tokens_seen": 34197760, "router_z_loss_clip": 3.99414062, "router_z_loss_mlp": 0.3972168, "step": 1591, "time_per_iteration": 2.7709174156188965 }, { "auxiliary_loss_clip": 0.06818701, "auxiliary_loss_mlp": 0.01311114, "balance_loss_clip": 0.06409694, "balance_loss_mlp": 0.01269033, "epoch": 0.09571621824740718, "flos": 18484279787520.0, "grad_norm": 2.9592714665429036, "language_loss": 0.76731241, "learning_rate": 3.9548695486584835e-06, "loss": 0.84861052, "num_input_tokens_seen": 34215330, "router_z_loss_clip": 4.09375, "router_z_loss_mlp": 0.4206543, "step": 1592, "time_per_iteration": 2.611149311065674 }, { "auxiliary_loss_clip": 0.06788595, "auxiliary_loss_mlp": 0.01320736, "balance_loss_clip": 0.06390189, "balance_loss_mlp": 0.01277821, "epoch": 0.09577634150007515, "flos": 29395164470400.0, "grad_norm": 2.4203874735249404, "language_loss": 0.75760221, "learning_rate": 3.954787242700592e-06, "loss": 0.83869553, "num_input_tokens_seen": 34237745, "router_z_loss_clip": 3.98046875, "router_z_loss_mlp": 0.42919922, "step": 1593, "time_per_iteration": 2.768343210220337 }, { "auxiliary_loss_clip": 0.06779286, "auxiliary_loss_mlp": 0.01312167, "balance_loss_clip": 0.06384113, "balance_loss_mlp": 0.0127166, "epoch": 0.09583646475274313, "flos": 22754511089280.0, "grad_norm": 4.156594638976538, "language_loss": 0.70707798, "learning_rate": 3.954704862616971e-06, "loss": 0.7879926, "num_input_tokens_seen": 34256565, "router_z_loss_clip": 3.94921875, "router_z_loss_mlp": 0.40527344, "step": 1594, "time_per_iteration": 2.699131965637207 }, { "auxiliary_loss_clip": 0.06777552, "auxiliary_loss_mlp": 0.01305651, "balance_loss_clip": 0.06379361, "balance_loss_mlp": 0.01267099, "epoch": 0.0958965880054111, "flos": 23224495040640.0, "grad_norm": 3.4362793643290086, "language_loss": 0.8371923, "learning_rate": 3.954622408410747e-06, "loss": 0.9180243, "num_input_tokens_seen": 34275970, "router_z_loss_clip": 3.98046875, "router_z_loss_mlp": 0.38549805, "step": 1595, "time_per_iteration": 2.649012327194214 }, { "auxiliary_loss_clip": 0.06794346, "auxiliary_loss_mlp": 0.01312131, "balance_loss_clip": 0.06381249, "balance_loss_mlp": 0.01271242, "epoch": 0.09595671125807906, "flos": 21330807166080.0, "grad_norm": 2.3454480168618024, "language_loss": 0.87317681, "learning_rate": 3.954539880085045e-06, "loss": 0.95424163, "num_input_tokens_seen": 34295490, "router_z_loss_clip": 4.12304688, "router_z_loss_mlp": 0.40869141, "step": 1596, "time_per_iteration": 2.6437904834747314 }, { "auxiliary_loss_clip": 0.06781958, "auxiliary_loss_mlp": 0.01308996, "balance_loss_clip": 0.06385041, "balance_loss_mlp": 0.01268847, "epoch": 0.09601683451074704, "flos": 39612841125120.0, "grad_norm": 3.0171444732123085, "language_loss": 0.69853055, "learning_rate": 3.9544572776429945e-06, "loss": 0.77944005, "num_input_tokens_seen": 34319990, "router_z_loss_clip": 3.97070312, "router_z_loss_mlp": 0.40161133, "step": 1597, "time_per_iteration": 4.293586015701294 }, { "auxiliary_loss_clip": 0.06805645, "auxiliary_loss_mlp": 0.0130963, "balance_loss_clip": 0.06392217, "balance_loss_mlp": 0.01268097, "epoch": 0.096076957763415, "flos": 23739523361280.0, "grad_norm": 8.996156771748588, "language_loss": 0.76319027, "learning_rate": 3.954374601087729e-06, "loss": 0.84434301, "num_input_tokens_seen": 34339225, "router_z_loss_clip": 4.13671875, "router_z_loss_mlp": 0.4152832, "step": 1598, "time_per_iteration": 4.126522541046143 }, { "auxiliary_loss_clip": 0.0680766, "auxiliary_loss_mlp": 0.01317555, "balance_loss_clip": 0.06395619, "balance_loss_mlp": 0.01274115, "epoch": 0.09613708101608297, "flos": 34686689662080.0, "grad_norm": 2.599333904795329, "language_loss": 0.70867896, "learning_rate": 3.954291850422382e-06, "loss": 0.78993106, "num_input_tokens_seen": 34361020, "router_z_loss_clip": 4.12109375, "router_z_loss_mlp": 0.43432617, "step": 1599, "time_per_iteration": 2.7556023597717285 }, { "auxiliary_loss_clip": 0.06798576, "auxiliary_loss_mlp": 0.0131625, "balance_loss_clip": 0.06400183, "balance_loss_mlp": 0.01277078, "epoch": 0.09619720426875093, "flos": 20746192429440.0, "grad_norm": 5.7692755076620745, "language_loss": 0.86410445, "learning_rate": 3.954209025650093e-06, "loss": 0.94525278, "num_input_tokens_seen": 34378630, "router_z_loss_clip": 3.9765625, "router_z_loss_mlp": 0.3918457, "step": 1600, "time_per_iteration": 2.642096519470215 }, { "auxiliary_loss_clip": 0.06809561, "auxiliary_loss_mlp": 0.01316928, "balance_loss_clip": 0.06401065, "balance_loss_mlp": 0.01275109, "epoch": 0.09625732752141891, "flos": 13047795832320.0, "grad_norm": 3.563092833950158, "language_loss": 0.82019019, "learning_rate": 3.954126126774001e-06, "loss": 0.90145504, "num_input_tokens_seen": 34397110, "router_z_loss_clip": 4.08789062, "router_z_loss_mlp": 0.41821289, "step": 1601, "time_per_iteration": 4.061777591705322 }, { "auxiliary_loss_clip": 0.06828745, "auxiliary_loss_mlp": 0.01324397, "balance_loss_clip": 0.06407447, "balance_loss_mlp": 0.01277929, "epoch": 0.09631745077408688, "flos": 22280250579840.0, "grad_norm": 6.957110915167849, "language_loss": 0.84670901, "learning_rate": 3.954043153797251e-06, "loss": 0.92824042, "num_input_tokens_seen": 34414165, "router_z_loss_clip": 4.2109375, "router_z_loss_mlp": 0.46435547, "step": 1602, "time_per_iteration": 4.063988924026489 }, { "auxiliary_loss_clip": 0.0681641, "auxiliary_loss_mlp": 0.01312935, "balance_loss_clip": 0.06410914, "balance_loss_mlp": 0.01269495, "epoch": 0.09637757402675484, "flos": 24761236521600.0, "grad_norm": 5.496684077673654, "language_loss": 0.64964414, "learning_rate": 3.953960106722989e-06, "loss": 0.7309376, "num_input_tokens_seen": 34434445, "router_z_loss_clip": 4.05664062, "router_z_loss_mlp": 0.43432617, "step": 1603, "time_per_iteration": 2.651362657546997 }, { "auxiliary_loss_clip": 0.0685363, "auxiliary_loss_mlp": 0.01316142, "balance_loss_clip": 0.06430502, "balance_loss_mlp": 0.01268935, "epoch": 0.09643769727942282, "flos": 22531873991040.0, "grad_norm": 4.601922818252253, "language_loss": 0.73691726, "learning_rate": 3.953876985554364e-06, "loss": 0.81861496, "num_input_tokens_seen": 34453095, "router_z_loss_clip": 4.234375, "router_z_loss_mlp": 0.47216797, "step": 1604, "time_per_iteration": 2.6685450077056885 }, { "auxiliary_loss_clip": 0.06825727, "auxiliary_loss_mlp": 0.01314242, "balance_loss_clip": 0.06427594, "balance_loss_mlp": 0.01274498, "epoch": 0.09649782053209079, "flos": 30929138766720.0, "grad_norm": 3.1421023300502395, "language_loss": 0.80832821, "learning_rate": 3.953793790294527e-06, "loss": 0.88972789, "num_input_tokens_seen": 34473680, "router_z_loss_clip": 3.984375, "router_z_loss_mlp": 0.39746094, "step": 1605, "time_per_iteration": 2.710249900817871 }, { "auxiliary_loss_clip": 0.06849471, "auxiliary_loss_mlp": 0.01309271, "balance_loss_clip": 0.06429809, "balance_loss_mlp": 0.01264615, "epoch": 0.09655794378475875, "flos": 25344635374080.0, "grad_norm": 4.343199688284644, "language_loss": 0.77394378, "learning_rate": 3.953710520946634e-06, "loss": 0.85553122, "num_input_tokens_seen": 34492610, "router_z_loss_clip": 4.19921875, "router_z_loss_mlp": 0.44677734, "step": 1606, "time_per_iteration": 2.656527042388916 }, { "auxiliary_loss_clip": 0.06837443, "auxiliary_loss_mlp": 0.01313014, "balance_loss_clip": 0.06430595, "balance_loss_mlp": 0.01269145, "epoch": 0.09661806703742673, "flos": 22352604180480.0, "grad_norm": 3.869756349466742, "language_loss": 0.77301174, "learning_rate": 3.953627177513843e-06, "loss": 0.85451633, "num_input_tokens_seen": 34511855, "router_z_loss_clip": 4.07226562, "router_z_loss_mlp": 0.4387207, "step": 1607, "time_per_iteration": 2.6257846355438232 }, { "auxiliary_loss_clip": 0.06847326, "auxiliary_loss_mlp": 0.01316018, "balance_loss_clip": 0.0643334, "balance_loss_mlp": 0.012742, "epoch": 0.0966781902900947, "flos": 17463405168000.0, "grad_norm": 9.26945566775857, "language_loss": 0.89126092, "learning_rate": 3.953543759999312e-06, "loss": 0.97289437, "num_input_tokens_seen": 34528905, "router_z_loss_clip": 4.13867188, "router_z_loss_mlp": 0.41821289, "step": 1608, "time_per_iteration": 2.596721649169922 }, { "auxiliary_loss_clip": 0.06850153, "auxiliary_loss_mlp": 0.01306925, "balance_loss_clip": 0.06436494, "balance_loss_mlp": 0.01262746, "epoch": 0.09673831354276266, "flos": 36912991518720.0, "grad_norm": 5.682396970736513, "language_loss": 0.73299819, "learning_rate": 3.953460268406207e-06, "loss": 0.814569, "num_input_tokens_seen": 34548480, "router_z_loss_clip": 4.13085938, "router_z_loss_mlp": 0.44189453, "step": 1609, "time_per_iteration": 2.7790322303771973 }, { "auxiliary_loss_clip": 0.0682722, "auxiliary_loss_mlp": 0.01320046, "balance_loss_clip": 0.06419265, "balance_loss_mlp": 0.01274723, "epoch": 0.09679843679543064, "flos": 20707185553920.0, "grad_norm": 8.600455308183669, "language_loss": 0.86406654, "learning_rate": 3.953376702737693e-06, "loss": 0.94553924, "num_input_tokens_seen": 34565410, "router_z_loss_clip": 4.08007812, "router_z_loss_mlp": 0.453125, "step": 1610, "time_per_iteration": 2.626142740249634 }, { "auxiliary_loss_clip": 0.06829825, "auxiliary_loss_mlp": 0.01328921, "balance_loss_clip": 0.06420224, "balance_loss_mlp": 0.01284265, "epoch": 0.0968585600480986, "flos": 23521288602240.0, "grad_norm": 5.704504730971905, "language_loss": 0.67875975, "learning_rate": 3.953293062996939e-06, "loss": 0.76034719, "num_input_tokens_seen": 34584840, "router_z_loss_clip": 4.09375, "router_z_loss_mlp": 0.4465332, "step": 1611, "time_per_iteration": 2.659867763519287 }, { "auxiliary_loss_clip": 0.06827842, "auxiliary_loss_mlp": 0.01322567, "balance_loss_clip": 0.06419422, "balance_loss_mlp": 0.01278508, "epoch": 0.09691868330076657, "flos": 20127350499840.0, "grad_norm": 2.999796163243482, "language_loss": 0.83071721, "learning_rate": 3.953209349187115e-06, "loss": 0.91222131, "num_input_tokens_seen": 34603360, "router_z_loss_clip": 4.08203125, "router_z_loss_mlp": 0.44042969, "step": 1612, "time_per_iteration": 2.6919703483581543 }, { "auxiliary_loss_clip": 0.06835426, "auxiliary_loss_mlp": 0.01324186, "balance_loss_clip": 0.06428889, "balance_loss_mlp": 0.0127903, "epoch": 0.09697880655343454, "flos": 16550243372160.0, "grad_norm": 17.993983114642468, "language_loss": 0.82540518, "learning_rate": 3.953125561311398e-06, "loss": 0.90700126, "num_input_tokens_seen": 34620760, "router_z_loss_clip": 4.06640625, "router_z_loss_mlp": 0.45141602, "step": 1613, "time_per_iteration": 2.6237611770629883 }, { "auxiliary_loss_clip": 0.06814279, "auxiliary_loss_mlp": 0.01327385, "balance_loss_clip": 0.06416509, "balance_loss_mlp": 0.01283802, "epoch": 0.09703892980610251, "flos": 26111370960000.0, "grad_norm": 3.491961523702346, "language_loss": 0.8642323, "learning_rate": 3.953041699372964e-06, "loss": 0.94564897, "num_input_tokens_seen": 34640695, "router_z_loss_clip": 3.97460938, "router_z_loss_mlp": 0.43603516, "step": 1614, "time_per_iteration": 2.6981637477874756 }, { "auxiliary_loss_clip": 0.06638677, "auxiliary_loss_mlp": 0.01352991, "balance_loss_clip": 0.06380041, "balance_loss_mlp": 0.01331736, "epoch": 0.09709905305877048, "flos": 60463712903040.0, "grad_norm": 0.7181909781309744, "language_loss": 0.54655242, "learning_rate": 3.952957763374992e-06, "loss": 0.62646908, "num_input_tokens_seen": 34702395, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.21276855, "step": 1615, "time_per_iteration": 3.1880884170532227 }, { "auxiliary_loss_clip": 0.06623524, "auxiliary_loss_mlp": 0.01305245, "balance_loss_clip": 0.06364653, "balance_loss_mlp": 0.01287471, "epoch": 0.09715917631143844, "flos": 57660510885120.0, "grad_norm": 0.7705617908072572, "language_loss": 0.58064634, "learning_rate": 3.952873753320666e-06, "loss": 0.65993404, "num_input_tokens_seen": 34768910, "router_z_loss_clip": 2.59375, "router_z_loss_mlp": 0.17810059, "step": 1616, "time_per_iteration": 3.3982138633728027 }, { "auxiliary_loss_clip": 0.06814089, "auxiliary_loss_mlp": 0.01323326, "balance_loss_clip": 0.06401956, "balance_loss_mlp": 0.01275213, "epoch": 0.09721929956410642, "flos": 20564448923520.0, "grad_norm": 2.2891690183467888, "language_loss": 0.70450777, "learning_rate": 3.952789669213172e-06, "loss": 0.78588194, "num_input_tokens_seen": 34787680, "router_z_loss_clip": 4.11914062, "router_z_loss_mlp": 0.48022461, "step": 1617, "time_per_iteration": 2.6285903453826904 }, { "auxiliary_loss_clip": 0.06820536, "auxiliary_loss_mlp": 0.01317214, "balance_loss_clip": 0.06405352, "balance_loss_mlp": 0.01272201, "epoch": 0.09727942281677439, "flos": 27351696222720.0, "grad_norm": 2.561311566303171, "language_loss": 0.81778812, "learning_rate": 3.952705511055698e-06, "loss": 0.89916563, "num_input_tokens_seen": 34808330, "router_z_loss_clip": 4.15234375, "router_z_loss_mlp": 0.45019531, "step": 1618, "time_per_iteration": 2.6741485595703125 }, { "auxiliary_loss_clip": 0.06796389, "auxiliary_loss_mlp": 0.01299003, "balance_loss_clip": 0.06401727, "balance_loss_mlp": 0.01258376, "epoch": 0.09733954606944235, "flos": 24906991898880.0, "grad_norm": 1.9059323689445884, "language_loss": 0.94299763, "learning_rate": 3.952621278851435e-06, "loss": 1.02395153, "num_input_tokens_seen": 34830020, "router_z_loss_clip": 3.94726562, "router_z_loss_mlp": 0.40600586, "step": 1619, "time_per_iteration": 2.6609883308410645 }, { "auxiliary_loss_clip": 0.06812394, "auxiliary_loss_mlp": 0.01300772, "balance_loss_clip": 0.06402659, "balance_loss_mlp": 0.01256545, "epoch": 0.09739966932211033, "flos": 31511992567680.0, "grad_norm": 3.8636492727289258, "language_loss": 0.900231, "learning_rate": 3.9525369726035784e-06, "loss": 0.98136264, "num_input_tokens_seen": 34850330, "router_z_loss_clip": 4.09765625, "router_z_loss_mlp": 0.44213867, "step": 1620, "time_per_iteration": 2.75185489654541 }, { "auxiliary_loss_clip": 0.06813627, "auxiliary_loss_mlp": 0.01307046, "balance_loss_clip": 0.06410681, "balance_loss_mlp": 0.01261937, "epoch": 0.0974597925747783, "flos": 23885614154880.0, "grad_norm": 2.5209676988432617, "language_loss": 0.78672957, "learning_rate": 3.952452592315324e-06, "loss": 0.86793631, "num_input_tokens_seen": 34871640, "router_z_loss_clip": 4.02734375, "router_z_loss_mlp": 0.45117188, "step": 1621, "time_per_iteration": 2.645829916000366 }, { "auxiliary_loss_clip": 0.06825703, "auxiliary_loss_mlp": 0.01309645, "balance_loss_clip": 0.06418963, "balance_loss_mlp": 0.01265824, "epoch": 0.09751991582744626, "flos": 17025300495360.0, "grad_norm": 2.328195483337783, "language_loss": 0.79668486, "learning_rate": 3.952368137989871e-06, "loss": 0.87803841, "num_input_tokens_seen": 34888100, "router_z_loss_clip": 4.06445312, "router_z_loss_mlp": 0.43847656, "step": 1622, "time_per_iteration": 2.5973925590515137 }, { "auxiliary_loss_clip": 0.0684754, "auxiliary_loss_mlp": 0.01306825, "balance_loss_clip": 0.06425382, "balance_loss_mlp": 0.01260334, "epoch": 0.09758003908011423, "flos": 28410403760640.0, "grad_norm": 2.3640345400223914, "language_loss": 0.86742455, "learning_rate": 3.9522836096304225e-06, "loss": 0.94896817, "num_input_tokens_seen": 34910485, "router_z_loss_clip": 4.2265625, "router_z_loss_mlp": 0.46557617, "step": 1623, "time_per_iteration": 2.691011428833008 }, { "auxiliary_loss_clip": 0.06837116, "auxiliary_loss_mlp": 0.01312829, "balance_loss_clip": 0.06422335, "balance_loss_mlp": 0.01266719, "epoch": 0.09764016233278221, "flos": 18149150183040.0, "grad_norm": 2.787232429455711, "language_loss": 0.82036459, "learning_rate": 3.952199007240184e-06, "loss": 0.90186405, "num_input_tokens_seen": 34928615, "router_z_loss_clip": 4.14453125, "router_z_loss_mlp": 0.46069336, "step": 1624, "time_per_iteration": 2.6071035861968994 }, { "auxiliary_loss_clip": 0.06826637, "auxiliary_loss_mlp": 0.01315424, "balance_loss_clip": 0.06427745, "balance_loss_mlp": 0.01273319, "epoch": 0.09770028558545017, "flos": 15270869306880.0, "grad_norm": 2.777659842544731, "language_loss": 0.87779939, "learning_rate": 3.952114330822364e-06, "loss": 0.95921993, "num_input_tokens_seen": 34946045, "router_z_loss_clip": 3.98632812, "router_z_loss_mlp": 0.42089844, "step": 1625, "time_per_iteration": 2.680159091949463 }, { "auxiliary_loss_clip": 0.06858696, "auxiliary_loss_mlp": 0.01333544, "balance_loss_clip": 0.06428944, "balance_loss_mlp": 0.0128524, "epoch": 0.09776040883811814, "flos": 23478382512000.0, "grad_norm": 3.447848371826157, "language_loss": 0.86472219, "learning_rate": 3.952029580380172e-06, "loss": 0.9466446, "num_input_tokens_seen": 34962865, "router_z_loss_clip": 4.29882812, "router_z_loss_mlp": 0.48291016, "step": 1626, "time_per_iteration": 2.6568892002105713 }, { "auxiliary_loss_clip": 0.06859336, "auxiliary_loss_mlp": 0.01319146, "balance_loss_clip": 0.06437381, "balance_loss_mlp": 0.01275539, "epoch": 0.09782053209078612, "flos": 24506510509440.0, "grad_norm": 2.2107743959948207, "language_loss": 0.85048008, "learning_rate": 3.9519447559168234e-06, "loss": 0.93226492, "num_input_tokens_seen": 34983505, "router_z_loss_clip": 4.22070312, "router_z_loss_mlp": 0.43652344, "step": 1627, "time_per_iteration": 2.650768756866455 }, { "auxiliary_loss_clip": 0.06839998, "auxiliary_loss_mlp": 0.01335418, "balance_loss_clip": 0.06430433, "balance_loss_mlp": 0.01290977, "epoch": 0.09788065534345408, "flos": 21586623281280.0, "grad_norm": 2.370322684178263, "language_loss": 0.86169255, "learning_rate": 3.951859857435534e-06, "loss": 0.94344676, "num_input_tokens_seen": 35001825, "router_z_loss_clip": 4.08789062, "router_z_loss_mlp": 0.44433594, "step": 1628, "time_per_iteration": 2.6319398880004883 }, { "auxiliary_loss_clip": 0.06829481, "auxiliary_loss_mlp": 0.01329187, "balance_loss_clip": 0.06425454, "balance_loss_mlp": 0.01285461, "epoch": 0.09794077859612205, "flos": 23849332536960.0, "grad_norm": 2.220378455003258, "language_loss": 0.77031827, "learning_rate": 3.951774884939523e-06, "loss": 0.85190499, "num_input_tokens_seen": 35023075, "router_z_loss_clip": 4.03320312, "router_z_loss_mlp": 0.43676758, "step": 1629, "time_per_iteration": 2.6397199630737305 }, { "auxiliary_loss_clip": 0.06841273, "auxiliary_loss_mlp": 0.01325852, "balance_loss_clip": 0.06430403, "balance_loss_mlp": 0.01281792, "epoch": 0.09800090184879003, "flos": 23666708563200.0, "grad_norm": 1.8512199508772271, "language_loss": 0.79164016, "learning_rate": 3.951689838432013e-06, "loss": 0.87331146, "num_input_tokens_seen": 35043480, "router_z_loss_clip": 4.11132812, "router_z_loss_mlp": 0.44067383, "step": 1630, "time_per_iteration": 2.638368844985962 }, { "auxiliary_loss_clip": 0.06839906, "auxiliary_loss_mlp": 0.01322302, "balance_loss_clip": 0.06425111, "balance_loss_mlp": 0.01278766, "epoch": 0.09806102510145799, "flos": 17061456332160.0, "grad_norm": 2.522054948766387, "language_loss": 0.88036811, "learning_rate": 3.951604717916228e-06, "loss": 0.96199024, "num_input_tokens_seen": 35061490, "router_z_loss_clip": 4.14648438, "router_z_loss_mlp": 0.43530273, "step": 1631, "time_per_iteration": 2.5925827026367188 }, { "auxiliary_loss_clip": 0.06838663, "auxiliary_loss_mlp": 0.01318525, "balance_loss_clip": 0.0643258, "balance_loss_mlp": 0.01274179, "epoch": 0.09812114835412596, "flos": 23885278738560.0, "grad_norm": 3.1592293929527315, "language_loss": 0.83908844, "learning_rate": 3.9515195233953975e-06, "loss": 0.92066038, "num_input_tokens_seen": 35079670, "router_z_loss_clip": 4.06054688, "router_z_loss_mlp": 0.44384766, "step": 1632, "time_per_iteration": 2.654395580291748 }, { "auxiliary_loss_clip": 0.06824605, "auxiliary_loss_mlp": 0.01316385, "balance_loss_clip": 0.06418946, "balance_loss_mlp": 0.01272277, "epoch": 0.09818127160679392, "flos": 20601862571520.0, "grad_norm": 2.1118722550860207, "language_loss": 0.79685259, "learning_rate": 3.951434254872751e-06, "loss": 0.87826246, "num_input_tokens_seen": 35099205, "router_z_loss_clip": 4.05859375, "router_z_loss_mlp": 0.44091797, "step": 1633, "time_per_iteration": 2.6291708946228027 }, { "auxiliary_loss_clip": 0.06828479, "auxiliary_loss_mlp": 0.01320463, "balance_loss_clip": 0.06417617, "balance_loss_mlp": 0.01275116, "epoch": 0.0982413948594619, "flos": 15492835572480.0, "grad_norm": 3.392276606222135, "language_loss": 0.74619663, "learning_rate": 3.951348912351521e-06, "loss": 0.82768607, "num_input_tokens_seen": 35115270, "router_z_loss_clip": 4.10546875, "router_z_loss_mlp": 0.45385742, "step": 1634, "time_per_iteration": 2.5908143520355225 }, { "auxiliary_loss_clip": 0.06843372, "auxiliary_loss_mlp": 0.0132176, "balance_loss_clip": 0.06421532, "balance_loss_mlp": 0.01278368, "epoch": 0.09830151811212987, "flos": 24214999754880.0, "grad_norm": 5.0928555461542375, "language_loss": 0.75814009, "learning_rate": 3.951263495834947e-06, "loss": 0.8397913, "num_input_tokens_seen": 35134065, "router_z_loss_clip": 4.21289062, "router_z_loss_mlp": 0.43383789, "step": 1635, "time_per_iteration": 2.6418635845184326 }, { "auxiliary_loss_clip": 0.06832182, "auxiliary_loss_mlp": 0.0132607, "balance_loss_clip": 0.06414011, "balance_loss_mlp": 0.01280556, "epoch": 0.09836164136479783, "flos": 20600814395520.0, "grad_norm": 2.227540419972846, "language_loss": 0.79420936, "learning_rate": 3.951178005326264e-06, "loss": 0.87579191, "num_input_tokens_seen": 35154870, "router_z_loss_clip": 4.1875, "router_z_loss_mlp": 0.45507812, "step": 1636, "time_per_iteration": 2.693150281906128 }, { "auxiliary_loss_clip": 0.06825246, "auxiliary_loss_mlp": 0.01311418, "balance_loss_clip": 0.06404217, "balance_loss_mlp": 0.01266524, "epoch": 0.09842176461746581, "flos": 19939653354240.0, "grad_norm": 2.2759078596903897, "language_loss": 0.71527505, "learning_rate": 3.951092440828715e-06, "loss": 0.79664171, "num_input_tokens_seen": 35171850, "router_z_loss_clip": 4.21289062, "router_z_loss_mlp": 0.44897461, "step": 1637, "time_per_iteration": 4.1530961990356445 }, { "auxiliary_loss_clip": 0.06805379, "auxiliary_loss_mlp": 0.01313305, "balance_loss_clip": 0.06399655, "balance_loss_mlp": 0.012712, "epoch": 0.09848188787013377, "flos": 21220956063360.0, "grad_norm": 3.969115847180107, "language_loss": 0.79366612, "learning_rate": 3.951006802345545e-06, "loss": 0.87485301, "num_input_tokens_seen": 35188795, "router_z_loss_clip": 4.0546875, "router_z_loss_mlp": 0.42089844, "step": 1638, "time_per_iteration": 4.00510573387146 }, { "auxiliary_loss_clip": 0.06786624, "auxiliary_loss_mlp": 0.0130182, "balance_loss_clip": 0.06392949, "balance_loss_mlp": 0.01260907, "epoch": 0.09854201112280174, "flos": 30162109691520.0, "grad_norm": 1.6707637414245335, "language_loss": 0.7417376, "learning_rate": 3.950921089880003e-06, "loss": 0.82262206, "num_input_tokens_seen": 35212100, "router_z_loss_clip": 3.9375, "router_z_loss_mlp": 0.40917969, "step": 1639, "time_per_iteration": 2.6964805126190186 }, { "auxiliary_loss_clip": 0.06808764, "auxiliary_loss_mlp": 0.01305448, "balance_loss_clip": 0.06396259, "balance_loss_mlp": 0.01262294, "epoch": 0.09860213437546972, "flos": 21801671585280.0, "grad_norm": 2.2948939842841654, "language_loss": 0.89731163, "learning_rate": 3.950835303435337e-06, "loss": 0.97845376, "num_input_tokens_seen": 35230390, "router_z_loss_clip": 4.12695312, "router_z_loss_mlp": 0.43139648, "step": 1640, "time_per_iteration": 4.067364454269409 }, { "auxiliary_loss_clip": 0.06802482, "auxiliary_loss_mlp": 0.01304858, "balance_loss_clip": 0.06394708, "balance_loss_mlp": 0.01260488, "epoch": 0.09866225762813768, "flos": 21842062053120.0, "grad_norm": 2.0825769157762832, "language_loss": 0.8226456, "learning_rate": 3.950749443014801e-06, "loss": 0.90371901, "num_input_tokens_seen": 35250405, "router_z_loss_clip": 4.07421875, "router_z_loss_mlp": 0.44384766, "step": 1641, "time_per_iteration": 3.98569917678833 }, { "auxiliary_loss_clip": 0.06800507, "auxiliary_loss_mlp": 0.01307113, "balance_loss_clip": 0.06388289, "balance_loss_mlp": 0.01262124, "epoch": 0.09872238088080565, "flos": 17605093622400.0, "grad_norm": 5.878249466894718, "language_loss": 0.88381696, "learning_rate": 3.95066350862165e-06, "loss": 0.9648931, "num_input_tokens_seen": 35262820, "router_z_loss_clip": 4.12304688, "router_z_loss_mlp": 0.44995117, "step": 1642, "time_per_iteration": 2.6030068397521973 }, { "auxiliary_loss_clip": 0.06787306, "auxiliary_loss_mlp": 0.01308054, "balance_loss_clip": 0.0639085, "balance_loss_mlp": 0.01267022, "epoch": 0.09878250413347361, "flos": 27643500466560.0, "grad_norm": 1.7994724566928781, "language_loss": 0.81797612, "learning_rate": 3.950577500259144e-06, "loss": 0.89892972, "num_input_tokens_seen": 35284490, "router_z_loss_clip": 3.96484375, "router_z_loss_mlp": 0.41015625, "step": 1643, "time_per_iteration": 2.676265239715576 }, { "auxiliary_loss_clip": 0.06788822, "auxiliary_loss_mlp": 0.01309576, "balance_loss_clip": 0.06386857, "balance_loss_mlp": 0.01266636, "epoch": 0.0988426273861416, "flos": 16550285299200.0, "grad_norm": 2.7112897526167266, "language_loss": 0.84404564, "learning_rate": 3.950491417930543e-06, "loss": 0.92502958, "num_input_tokens_seen": 35302815, "router_z_loss_clip": 4.01953125, "router_z_loss_mlp": 0.42944336, "step": 1644, "time_per_iteration": 2.5965194702148438 }, { "auxiliary_loss_clip": 0.06767458, "auxiliary_loss_mlp": 0.01306396, "balance_loss_clip": 0.06382136, "balance_loss_mlp": 0.01265293, "epoch": 0.09890275063880956, "flos": 21221668823040.0, "grad_norm": 2.5877252095725347, "language_loss": 0.70087314, "learning_rate": 3.9504052616391124e-06, "loss": 0.78161168, "num_input_tokens_seen": 35321175, "router_z_loss_clip": 3.8515625, "router_z_loss_mlp": 0.41088867, "step": 1645, "time_per_iteration": 2.6166796684265137 }, { "auxiliary_loss_clip": 0.06640589, "auxiliary_loss_mlp": 0.01332463, "balance_loss_clip": 0.06384113, "balance_loss_mlp": 0.01310183, "epoch": 0.09896287389147752, "flos": 59398255111680.0, "grad_norm": 0.8711436767627322, "language_loss": 0.60752082, "learning_rate": 3.950319031388119e-06, "loss": 0.68725133, "num_input_tokens_seen": 35381740, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.22314453, "step": 1646, "time_per_iteration": 3.1558802127838135 }, { "auxiliary_loss_clip": 0.06780185, "auxiliary_loss_mlp": 0.01307205, "balance_loss_clip": 0.06374516, "balance_loss_mlp": 0.01261, "epoch": 0.0990229971441455, "flos": 29650351680000.0, "grad_norm": 2.655533429830348, "language_loss": 0.74625409, "learning_rate": 3.950232727180833e-06, "loss": 0.82712799, "num_input_tokens_seen": 35403760, "router_z_loss_clip": 4.0625, "router_z_loss_mlp": 0.46191406, "step": 1647, "time_per_iteration": 2.6785542964935303 }, { "auxiliary_loss_clip": 0.06781325, "auxiliary_loss_mlp": 0.01313325, "balance_loss_clip": 0.06377213, "balance_loss_mlp": 0.01269694, "epoch": 0.09908312039681347, "flos": 21841265439360.0, "grad_norm": 2.088656430177847, "language_loss": 0.86456299, "learning_rate": 3.950146349020525e-06, "loss": 0.94550943, "num_input_tokens_seen": 35424050, "router_z_loss_clip": 4.04296875, "router_z_loss_mlp": 0.43652344, "step": 1648, "time_per_iteration": 2.6117892265319824 }, { "auxiliary_loss_clip": 0.06597417, "auxiliary_loss_mlp": 0.01330424, "balance_loss_clip": 0.06341484, "balance_loss_mlp": 0.01306355, "epoch": 0.09914324364948143, "flos": 57584425777920.0, "grad_norm": 0.7282940893549259, "language_loss": 0.55670488, "learning_rate": 3.950059896910473e-06, "loss": 0.63598329, "num_input_tokens_seen": 35481690, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.24047852, "step": 1649, "time_per_iteration": 3.1555604934692383 }, { "auxiliary_loss_clip": 0.06760127, "auxiliary_loss_mlp": 0.01306528, "balance_loss_clip": 0.06365506, "balance_loss_mlp": 0.01264423, "epoch": 0.09920336690214941, "flos": 34131270873600.0, "grad_norm": 3.8503042973712325, "language_loss": 0.91195536, "learning_rate": 3.949973370853954e-06, "loss": 0.99262178, "num_input_tokens_seen": 35498635, "router_z_loss_clip": 3.9453125, "router_z_loss_mlp": 0.42114258, "step": 1650, "time_per_iteration": 2.703866481781006 }, { "auxiliary_loss_clip": 0.06581616, "auxiliary_loss_mlp": 0.01286866, "balance_loss_clip": 0.06327701, "balance_loss_mlp": 0.01263131, "epoch": 0.09926349015481738, "flos": 71239910947200.0, "grad_norm": 0.7870252581643468, "language_loss": 0.63582277, "learning_rate": 3.94988677085425e-06, "loss": 0.71450764, "num_input_tokens_seen": 35565720, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.23718262, "step": 1651, "time_per_iteration": 3.368925094604492 }, { "auxiliary_loss_clip": 0.06761239, "auxiliary_loss_mlp": 0.01317729, "balance_loss_clip": 0.06366159, "balance_loss_mlp": 0.01274885, "epoch": 0.09932361340748534, "flos": 23155369822080.0, "grad_norm": 2.007747252492298, "language_loss": 0.89783341, "learning_rate": 3.949800096914643e-06, "loss": 0.97862315, "num_input_tokens_seen": 35586000, "router_z_loss_clip": 3.95507812, "router_z_loss_mlp": 0.42822266, "step": 1652, "time_per_iteration": 2.6213316917419434 }, { "auxiliary_loss_clip": 0.06763043, "auxiliary_loss_mlp": 0.01334191, "balance_loss_clip": 0.06368564, "balance_loss_mlp": 0.01291085, "epoch": 0.09938373666015332, "flos": 19834791569280.0, "grad_norm": 2.207758981621569, "language_loss": 0.84100169, "learning_rate": 3.949713349038422e-06, "loss": 0.921974, "num_input_tokens_seen": 35604355, "router_z_loss_clip": 3.93945312, "router_z_loss_mlp": 0.43115234, "step": 1653, "time_per_iteration": 2.5970795154571533 }, { "auxiliary_loss_clip": 0.06775633, "auxiliary_loss_mlp": 0.01334104, "balance_loss_clip": 0.06368305, "balance_loss_mlp": 0.01288065, "epoch": 0.09944385991282129, "flos": 22097165408640.0, "grad_norm": 2.2183247092979377, "language_loss": 0.8190884, "learning_rate": 3.949626527228875e-06, "loss": 0.90018582, "num_input_tokens_seen": 35625495, "router_z_loss_clip": 4.06640625, "router_z_loss_mlp": 0.46044922, "step": 1654, "time_per_iteration": 2.653975248336792 }, { "auxiliary_loss_clip": 0.06744026, "auxiliary_loss_mlp": 0.01318454, "balance_loss_clip": 0.0636694, "balance_loss_mlp": 0.01277899, "epoch": 0.09950398316548925, "flos": 19835043131520.0, "grad_norm": 1.8269483074970134, "language_loss": 0.827191, "learning_rate": 3.949539631489295e-06, "loss": 0.90781581, "num_input_tokens_seen": 35645030, "router_z_loss_clip": 3.76953125, "router_z_loss_mlp": 0.40551758, "step": 1655, "time_per_iteration": 2.592196464538574 }, { "auxiliary_loss_clip": 0.06766345, "auxiliary_loss_mlp": 0.01329413, "balance_loss_clip": 0.06372343, "balance_loss_mlp": 0.01288476, "epoch": 0.09956410641815722, "flos": 25009715404800.0, "grad_norm": 1.7582635929162527, "language_loss": 0.82310939, "learning_rate": 3.9494526618229765e-06, "loss": 0.90406698, "num_input_tokens_seen": 35664305, "router_z_loss_clip": 3.9375, "router_z_loss_mlp": 0.40917969, "step": 1656, "time_per_iteration": 2.6689748764038086 }, { "auxiliary_loss_clip": 0.0675211, "auxiliary_loss_mlp": 0.01316947, "balance_loss_clip": 0.06367099, "balance_loss_mlp": 0.01277203, "epoch": 0.0996242296708252, "flos": 19323746317440.0, "grad_norm": 2.5280076636536135, "language_loss": 0.89985466, "learning_rate": 3.949365618233217e-06, "loss": 0.98054522, "num_input_tokens_seen": 35684060, "router_z_loss_clip": 3.85742188, "router_z_loss_mlp": 0.39697266, "step": 1657, "time_per_iteration": 2.611522912979126 }, { "auxiliary_loss_clip": 0.06785768, "auxiliary_loss_mlp": 0.01339264, "balance_loss_clip": 0.06372987, "balance_loss_mlp": 0.01291605, "epoch": 0.09968435292349316, "flos": 21878050181760.0, "grad_norm": 2.6436697182400697, "language_loss": 0.86565757, "learning_rate": 3.9492785007233195e-06, "loss": 0.94690788, "num_input_tokens_seen": 35703250, "router_z_loss_clip": 4.125, "router_z_loss_mlp": 0.47631836, "step": 1658, "time_per_iteration": 2.616729974746704 }, { "auxiliary_loss_clip": 0.06570406, "auxiliary_loss_mlp": 0.01316401, "balance_loss_clip": 0.06318223, "balance_loss_mlp": 0.01291129, "epoch": 0.09974447617616113, "flos": 65401912154880.0, "grad_norm": 0.8890782328562097, "language_loss": 0.60353184, "learning_rate": 3.949191309296585e-06, "loss": 0.68239999, "num_input_tokens_seen": 35762165, "router_z_loss_clip": 2.52539062, "router_z_loss_mlp": 0.25280762, "step": 1659, "time_per_iteration": 3.264832019805908 }, { "auxiliary_loss_clip": 0.06774598, "auxiliary_loss_mlp": 0.01327395, "balance_loss_clip": 0.06382218, "balance_loss_mlp": 0.01287412, "epoch": 0.0998045994288291, "flos": 23666624709120.0, "grad_norm": 1.8357087370640222, "language_loss": 0.86479384, "learning_rate": 3.949104043956321e-06, "loss": 0.94581378, "num_input_tokens_seen": 35781520, "router_z_loss_clip": 3.921875, "router_z_loss_mlp": 0.3996582, "step": 1660, "time_per_iteration": 2.631796360015869 }, { "auxiliary_loss_clip": 0.06774916, "auxiliary_loss_mlp": 0.01321336, "balance_loss_clip": 0.06384875, "balance_loss_mlp": 0.01278564, "epoch": 0.09986472268149707, "flos": 19615802123520.0, "grad_norm": 2.9656127139216384, "language_loss": 0.80874622, "learning_rate": 3.949016704705836e-06, "loss": 0.88970876, "num_input_tokens_seen": 35799565, "router_z_loss_clip": 3.89257812, "router_z_loss_mlp": 0.42749023, "step": 1661, "time_per_iteration": 2.609998941421509 }, { "auxiliary_loss_clip": 0.06827177, "auxiliary_loss_mlp": 0.01323749, "balance_loss_clip": 0.06417167, "balance_loss_mlp": 0.01279022, "epoch": 0.09992484593416504, "flos": 26220467376000.0, "grad_norm": 2.726256406134661, "language_loss": 0.85366565, "learning_rate": 3.948929291548443e-06, "loss": 0.93517494, "num_input_tokens_seen": 35821085, "router_z_loss_clip": 4.09570312, "router_z_loss_mlp": 0.44677734, "step": 1662, "time_per_iteration": 2.6621854305267334 }, { "auxiliary_loss_clip": 0.06800206, "auxiliary_loss_mlp": 0.01318423, "balance_loss_clip": 0.06406052, "balance_loss_mlp": 0.01275913, "epoch": 0.09998496918683301, "flos": 17499393296640.0, "grad_norm": 2.2855890573193496, "language_loss": 0.9087252, "learning_rate": 3.9488418044874546e-06, "loss": 0.9899115, "num_input_tokens_seen": 35839840, "router_z_loss_clip": 3.9375, "router_z_loss_mlp": 0.42504883, "step": 1663, "time_per_iteration": 2.590015172958374 }, { "auxiliary_loss_clip": 0.0680726, "auxiliary_loss_mlp": 0.01320459, "balance_loss_clip": 0.06400399, "balance_loss_mlp": 0.01277925, "epoch": 0.10004509243950098, "flos": 22791715102080.0, "grad_norm": 2.994391993365425, "language_loss": 0.71805179, "learning_rate": 3.948754243526191e-06, "loss": 0.79932898, "num_input_tokens_seen": 35861545, "router_z_loss_clip": 4.0703125, "router_z_loss_mlp": 0.42529297, "step": 1664, "time_per_iteration": 2.698751211166382 }, { "auxiliary_loss_clip": 0.06796186, "auxiliary_loss_mlp": 0.01314314, "balance_loss_clip": 0.06396335, "balance_loss_mlp": 0.01271541, "epoch": 0.10010521569216894, "flos": 16258984179840.0, "grad_norm": 3.1917096283694746, "language_loss": 0.80124289, "learning_rate": 3.94866660866797e-06, "loss": 0.88234782, "num_input_tokens_seen": 35878295, "router_z_loss_clip": 4.0, "router_z_loss_mlp": 0.42773438, "step": 1665, "time_per_iteration": 2.5983753204345703 }, { "auxiliary_loss_clip": 0.06799965, "auxiliary_loss_mlp": 0.01309313, "balance_loss_clip": 0.06406134, "balance_loss_mlp": 0.01268973, "epoch": 0.10016533894483691, "flos": 23409047658240.0, "grad_norm": 2.813916889455134, "language_loss": 0.71466374, "learning_rate": 3.9485788999161165e-06, "loss": 0.79575652, "num_input_tokens_seen": 35898990, "router_z_loss_clip": 3.9375, "router_z_loss_mlp": 0.40332031, "step": 1666, "time_per_iteration": 2.6242268085479736 }, { "auxiliary_loss_clip": 0.06826035, "auxiliary_loss_mlp": 0.01329318, "balance_loss_clip": 0.06420899, "balance_loss_mlp": 0.01281968, "epoch": 0.10022546219750489, "flos": 19360195643520.0, "grad_norm": 2.1476894148849834, "language_loss": 0.80402625, "learning_rate": 3.948491117273956e-06, "loss": 0.88557982, "num_input_tokens_seen": 35916225, "router_z_loss_clip": 4.04882812, "router_z_loss_mlp": 0.47338867, "step": 1667, "time_per_iteration": 2.5803945064544678 }, { "auxiliary_loss_clip": 0.06811778, "auxiliary_loss_mlp": 0.01307353, "balance_loss_clip": 0.06412067, "balance_loss_mlp": 0.01265964, "epoch": 0.10028558545017285, "flos": 27092525944320.0, "grad_norm": 3.8786747607057377, "language_loss": 0.79558617, "learning_rate": 3.948403260744817e-06, "loss": 0.87677747, "num_input_tokens_seen": 35934630, "router_z_loss_clip": 3.99609375, "router_z_loss_mlp": 0.4140625, "step": 1668, "time_per_iteration": 2.644613265991211 }, { "auxiliary_loss_clip": 0.06807578, "auxiliary_loss_mlp": 0.01303096, "balance_loss_clip": 0.06405379, "balance_loss_mlp": 0.01261182, "epoch": 0.10034570870284082, "flos": 25854003544320.0, "grad_norm": 2.1610360739190453, "language_loss": 0.79690278, "learning_rate": 3.948315330332031e-06, "loss": 0.87800956, "num_input_tokens_seen": 35953855, "router_z_loss_clip": 4.03125, "router_z_loss_mlp": 0.41894531, "step": 1669, "time_per_iteration": 2.722670078277588 }, { "auxiliary_loss_clip": 0.06808427, "auxiliary_loss_mlp": 0.0131318, "balance_loss_clip": 0.06400635, "balance_loss_mlp": 0.01268643, "epoch": 0.1004058319555088, "flos": 26256707066880.0, "grad_norm": 3.918083874972199, "language_loss": 0.87150764, "learning_rate": 3.948227326038933e-06, "loss": 0.95272368, "num_input_tokens_seen": 35974555, "router_z_loss_clip": 4.08007812, "router_z_loss_mlp": 0.4453125, "step": 1670, "time_per_iteration": 2.6505398750305176 }, { "auxiliary_loss_clip": 0.06790937, "auxiliary_loss_mlp": 0.01296899, "balance_loss_clip": 0.06410235, "balance_loss_mlp": 0.01258442, "epoch": 0.10046595520817676, "flos": 25381545897600.0, "grad_norm": 2.4965623386036886, "language_loss": 0.78454256, "learning_rate": 3.9481392478688586e-06, "loss": 0.86542094, "num_input_tokens_seen": 35996830, "router_z_loss_clip": 3.8046875, "router_z_loss_mlp": 0.38452148, "step": 1671, "time_per_iteration": 2.738497018814087 }, { "auxiliary_loss_clip": 0.06604688, "auxiliary_loss_mlp": 0.0127085, "balance_loss_clip": 0.06347454, "balance_loss_mlp": 0.01252397, "epoch": 0.10052607846084473, "flos": 67479146398080.0, "grad_norm": 0.7640845810491056, "language_loss": 0.60862678, "learning_rate": 3.948051095825149e-06, "loss": 0.68738216, "num_input_tokens_seen": 36054465, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.18444824, "step": 1672, "time_per_iteration": 3.2610771656036377 }, { "auxiliary_loss_clip": 0.06800318, "auxiliary_loss_mlp": 0.01298944, "balance_loss_clip": 0.06405104, "balance_loss_mlp": 0.01257554, "epoch": 0.10058620171351271, "flos": 21366795294720.0, "grad_norm": 2.3438603180950635, "language_loss": 0.78103471, "learning_rate": 3.947962869911147e-06, "loss": 0.86202729, "num_input_tokens_seen": 36073480, "router_z_loss_clip": 3.953125, "router_z_loss_mlp": 0.41357422, "step": 1673, "time_per_iteration": 2.607046604156494 }, { "auxiliary_loss_clip": 0.06795797, "auxiliary_loss_mlp": 0.01297539, "balance_loss_clip": 0.06399722, "balance_loss_mlp": 0.01256984, "epoch": 0.10064632496618067, "flos": 16805724071040.0, "grad_norm": 2.625093324127513, "language_loss": 0.75179297, "learning_rate": 3.947874570130197e-06, "loss": 0.83272636, "num_input_tokens_seen": 36091830, "router_z_loss_clip": 3.96484375, "router_z_loss_mlp": 0.40551758, "step": 1674, "time_per_iteration": 2.6084465980529785 }, { "auxiliary_loss_clip": 0.06787049, "auxiliary_loss_mlp": 0.01297864, "balance_loss_clip": 0.06390038, "balance_loss_mlp": 0.01257405, "epoch": 0.10070644821884864, "flos": 23631433194240.0, "grad_norm": 2.4748092942206172, "language_loss": 0.81397063, "learning_rate": 3.947786196485649e-06, "loss": 0.89481974, "num_input_tokens_seen": 36111400, "router_z_loss_clip": 3.97070312, "router_z_loss_mlp": 0.40454102, "step": 1675, "time_per_iteration": 2.631908416748047 }, { "auxiliary_loss_clip": 0.06784555, "auxiliary_loss_mlp": 0.01301305, "balance_loss_clip": 0.06385794, "balance_loss_mlp": 0.01258438, "epoch": 0.1007665714715166, "flos": 24469516131840.0, "grad_norm": 2.1762693186839557, "language_loss": 0.83092719, "learning_rate": 3.947697748980853e-06, "loss": 0.91178578, "num_input_tokens_seen": 36129345, "router_z_loss_clip": 3.98828125, "router_z_loss_mlp": 0.42871094, "step": 1676, "time_per_iteration": 4.066492795944214 }, { "auxiliary_loss_clip": 0.06783256, "auxiliary_loss_mlp": 0.01299294, "balance_loss_clip": 0.06388523, "balance_loss_mlp": 0.01257714, "epoch": 0.10082669472418458, "flos": 16804550113920.0, "grad_norm": 2.8170817100199192, "language_loss": 0.87586164, "learning_rate": 3.947609227619163e-06, "loss": 0.95668721, "num_input_tokens_seen": 36146255, "router_z_loss_clip": 3.9453125, "router_z_loss_mlp": 0.41577148, "step": 1677, "time_per_iteration": 2.5997416973114014 }, { "auxiliary_loss_clip": 0.06777111, "auxiliary_loss_mlp": 0.01309065, "balance_loss_clip": 0.06381422, "balance_loss_mlp": 0.01264958, "epoch": 0.10088681797685255, "flos": 13558673376000.0, "grad_norm": 2.315339255670269, "language_loss": 0.88096803, "learning_rate": 3.947520632403936e-06, "loss": 0.96182978, "num_input_tokens_seen": 36164050, "router_z_loss_clip": 3.94921875, "router_z_loss_mlp": 0.44116211, "step": 1678, "time_per_iteration": 3.9891040325164795 }, { "auxiliary_loss_clip": 0.06772361, "auxiliary_loss_mlp": 0.01306895, "balance_loss_clip": 0.06380506, "balance_loss_mlp": 0.01267532, "epoch": 0.10094694122952051, "flos": 25272868752000.0, "grad_norm": 3.141181194929972, "language_loss": 0.91844684, "learning_rate": 3.947431963338532e-06, "loss": 0.99923944, "num_input_tokens_seen": 36183530, "router_z_loss_clip": 3.91796875, "router_z_loss_mlp": 0.39355469, "step": 1679, "time_per_iteration": 2.6430015563964844 }, { "auxiliary_loss_clip": 0.06582121, "auxiliary_loss_mlp": 0.01281054, "balance_loss_clip": 0.06327081, "balance_loss_mlp": 0.01263769, "epoch": 0.10100706448218849, "flos": 69875521315200.0, "grad_norm": 0.764731272765972, "language_loss": 0.53112793, "learning_rate": 3.947343220426312e-06, "loss": 0.60975975, "num_input_tokens_seen": 36248550, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.1730957, "step": 1680, "time_per_iteration": 4.703542709350586 }, { "auxiliary_loss_clip": 0.06778012, "auxiliary_loss_mlp": 0.01300627, "balance_loss_clip": 0.06388174, "balance_loss_mlp": 0.01260119, "epoch": 0.10106718773485646, "flos": 20012677787520.0, "grad_norm": 2.528616478308878, "language_loss": 0.78135496, "learning_rate": 3.947254403670641e-06, "loss": 0.86214137, "num_input_tokens_seen": 36266065, "router_z_loss_clip": 3.90625, "router_z_loss_mlp": 0.4050293, "step": 1681, "time_per_iteration": 3.953171968460083 }, { "auxiliary_loss_clip": 0.06789708, "auxiliary_loss_mlp": 0.01313993, "balance_loss_clip": 0.06386667, "balance_loss_mlp": 0.0126948, "epoch": 0.10112731098752442, "flos": 13484852328960.0, "grad_norm": 2.4618180652491333, "language_loss": 0.95165372, "learning_rate": 3.947165513074889e-06, "loss": 1.03269076, "num_input_tokens_seen": 36280960, "router_z_loss_clip": 4.02929688, "router_z_loss_mlp": 0.4453125, "step": 1682, "time_per_iteration": 2.581505298614502 }, { "auxiliary_loss_clip": 0.06788251, "auxiliary_loss_mlp": 0.01318516, "balance_loss_clip": 0.06392267, "balance_loss_mlp": 0.0127529, "epoch": 0.1011874342401924, "flos": 18521944997760.0, "grad_norm": 1.9444937178952226, "language_loss": 0.89061642, "learning_rate": 3.947076548642425e-06, "loss": 0.9716841, "num_input_tokens_seen": 36299010, "router_z_loss_clip": 3.95898438, "router_z_loss_mlp": 0.43237305, "step": 1683, "time_per_iteration": 2.6159780025482178 }, { "auxiliary_loss_clip": 0.06772564, "auxiliary_loss_mlp": 0.01318472, "balance_loss_clip": 0.06381905, "balance_loss_mlp": 0.01276391, "epoch": 0.10124755749286037, "flos": 20708904562560.0, "grad_norm": 3.4851565030597116, "language_loss": 0.7656526, "learning_rate": 3.946987510376624e-06, "loss": 0.84656298, "num_input_tokens_seen": 36318400, "router_z_loss_clip": 3.90625, "router_z_loss_mlp": 0.4206543, "step": 1684, "time_per_iteration": 2.6281511783599854 }, { "auxiliary_loss_clip": 0.06590798, "auxiliary_loss_mlp": 0.01288813, "balance_loss_clip": 0.06339097, "balance_loss_mlp": 0.01271134, "epoch": 0.10130768074552833, "flos": 56130100387200.0, "grad_norm": 0.7325212754553229, "language_loss": 0.61082077, "learning_rate": 3.9468983982808615e-06, "loss": 0.68961686, "num_input_tokens_seen": 36381815, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.17712402, "step": 1685, "time_per_iteration": 3.3406574726104736 }, { "auxiliary_loss_clip": 0.06781903, "auxiliary_loss_mlp": 0.01315628, "balance_loss_clip": 0.06387345, "balance_loss_mlp": 0.01272451, "epoch": 0.1013678039981963, "flos": 33410921322240.0, "grad_norm": 2.807153539270987, "language_loss": 0.62366712, "learning_rate": 3.946809212358516e-06, "loss": 0.70464242, "num_input_tokens_seen": 36404320, "router_z_loss_clip": 3.9453125, "router_z_loss_mlp": 0.43237305, "step": 1686, "time_per_iteration": 2.70238995552063 }, { "auxiliary_loss_clip": 0.06771023, "auxiliary_loss_mlp": 0.01320728, "balance_loss_clip": 0.06386469, "balance_loss_mlp": 0.0127972, "epoch": 0.10142792725086427, "flos": 31913480206080.0, "grad_norm": 3.918069704384211, "language_loss": 0.8275941, "learning_rate": 3.946719952612972e-06, "loss": 0.90851164, "num_input_tokens_seen": 36427510, "router_z_loss_clip": 3.84765625, "router_z_loss_mlp": 0.41015625, "step": 1687, "time_per_iteration": 2.708700180053711 }, { "auxiliary_loss_clip": 0.06788588, "auxiliary_loss_mlp": 0.01315335, "balance_loss_clip": 0.06387962, "balance_loss_mlp": 0.01271442, "epoch": 0.10148805050353224, "flos": 28483512048000.0, "grad_norm": 1.8303327681905437, "language_loss": 0.73186976, "learning_rate": 3.94663061904761e-06, "loss": 0.81290901, "num_input_tokens_seen": 36448230, "router_z_loss_clip": 4.00195312, "router_z_loss_mlp": 0.43847656, "step": 1688, "time_per_iteration": 2.67338490486145 }, { "auxiliary_loss_clip": 0.06764233, "auxiliary_loss_mlp": 0.01318729, "balance_loss_clip": 0.06374675, "balance_loss_mlp": 0.01274455, "epoch": 0.1015481737562002, "flos": 25154799949440.0, "grad_norm": 2.2539641561035197, "language_loss": 0.88593805, "learning_rate": 3.94654121166582e-06, "loss": 0.96676767, "num_input_tokens_seen": 36464395, "router_z_loss_clip": 3.89453125, "router_z_loss_mlp": 0.44262695, "step": 1689, "time_per_iteration": 2.710644245147705 }, { "auxiliary_loss_clip": 0.06768821, "auxiliary_loss_mlp": 0.01299807, "balance_loss_clip": 0.0637335, "balance_loss_mlp": 0.01259276, "epoch": 0.10160829700886818, "flos": 30890593088640.0, "grad_norm": 2.5175062584583006, "language_loss": 0.90375525, "learning_rate": 3.946451730470993e-06, "loss": 0.98444158, "num_input_tokens_seen": 36486475, "router_z_loss_clip": 3.95703125, "router_z_loss_mlp": 0.40527344, "step": 1690, "time_per_iteration": 2.691603183746338 }, { "auxiliary_loss_clip": 0.0677006, "auxiliary_loss_mlp": 0.01315, "balance_loss_clip": 0.06377738, "balance_loss_mlp": 0.01271083, "epoch": 0.10166842026153615, "flos": 20418190421760.0, "grad_norm": 2.3598610491488468, "language_loss": 0.8499521, "learning_rate": 3.946362175466521e-06, "loss": 0.9308027, "num_input_tokens_seen": 36505310, "router_z_loss_clip": 3.91992188, "router_z_loss_mlp": 0.43896484, "step": 1691, "time_per_iteration": 2.6045167446136475 }, { "auxiliary_loss_clip": 0.0675917, "auxiliary_loss_mlp": 0.01303824, "balance_loss_clip": 0.06357136, "balance_loss_mlp": 0.01261027, "epoch": 0.10172854351420411, "flos": 33485832472320.0, "grad_norm": 1.9051953608382466, "language_loss": 0.68136227, "learning_rate": 3.946272546655801e-06, "loss": 0.76199228, "num_input_tokens_seen": 36529820, "router_z_loss_clip": 4.01367188, "router_z_loss_mlp": 0.42773438, "step": 1692, "time_per_iteration": 2.7391531467437744 }, { "auxiliary_loss_clip": 0.06751983, "auxiliary_loss_mlp": 0.01299868, "balance_loss_clip": 0.06359828, "balance_loss_mlp": 0.01257668, "epoch": 0.1017886667668721, "flos": 23557109022720.0, "grad_norm": 1.9654760886175358, "language_loss": 0.78582251, "learning_rate": 3.94618284404223e-06, "loss": 0.86634099, "num_input_tokens_seen": 36549000, "router_z_loss_clip": 3.92382812, "router_z_loss_mlp": 0.421875, "step": 1693, "time_per_iteration": 2.622077703475952 }, { "auxiliary_loss_clip": 0.067615, "auxiliary_loss_mlp": 0.01298765, "balance_loss_clip": 0.06361839, "balance_loss_mlp": 0.01256374, "epoch": 0.10184879001954006, "flos": 23303011916160.0, "grad_norm": 2.2166948388710717, "language_loss": 0.88360226, "learning_rate": 3.9460930676292105e-06, "loss": 0.96420497, "num_input_tokens_seen": 36567515, "router_z_loss_clip": 3.9921875, "router_z_loss_mlp": 0.42358398, "step": 1694, "time_per_iteration": 2.6147615909576416 }, { "auxiliary_loss_clip": 0.06772526, "auxiliary_loss_mlp": 0.01301096, "balance_loss_clip": 0.06369168, "balance_loss_mlp": 0.01258538, "epoch": 0.10190891327220802, "flos": 18339069461760.0, "grad_norm": 2.8411031759471475, "language_loss": 0.81101739, "learning_rate": 3.946003217420147e-06, "loss": 0.89175361, "num_input_tokens_seen": 36586190, "router_z_loss_clip": 4.04101562, "router_z_loss_mlp": 0.42553711, "step": 1695, "time_per_iteration": 2.591867208480835 }, { "auxiliary_loss_clip": 0.06764834, "auxiliary_loss_mlp": 0.01300028, "balance_loss_clip": 0.06369587, "balance_loss_mlp": 0.01257733, "epoch": 0.10196903652487599, "flos": 26472006933120.0, "grad_norm": 1.998551065219649, "language_loss": 0.87789923, "learning_rate": 3.945913293418447e-06, "loss": 0.95854783, "num_input_tokens_seen": 36607495, "router_z_loss_clip": 3.9453125, "router_z_loss_mlp": 0.42285156, "step": 1696, "time_per_iteration": 2.665383815765381 }, { "auxiliary_loss_clip": 0.06744932, "auxiliary_loss_mlp": 0.01300162, "balance_loss_clip": 0.06355016, "balance_loss_mlp": 0.01260084, "epoch": 0.10202915977754397, "flos": 21875618413440.0, "grad_norm": 2.26917792770745, "language_loss": 0.8355934, "learning_rate": 3.945823295627519e-06, "loss": 0.91604435, "num_input_tokens_seen": 36628555, "router_z_loss_clip": 3.8984375, "router_z_loss_mlp": 0.40087891, "step": 1697, "time_per_iteration": 2.6518120765686035 }, { "auxiliary_loss_clip": 0.06761591, "auxiliary_loss_mlp": 0.01295843, "balance_loss_clip": 0.06363, "balance_loss_mlp": 0.01253214, "epoch": 0.10208928303021193, "flos": 22316322562560.0, "grad_norm": 2.43101523257584, "language_loss": 0.82581925, "learning_rate": 3.9457332240507775e-06, "loss": 0.90639359, "num_input_tokens_seen": 36646250, "router_z_loss_clip": 3.98242188, "router_z_loss_mlp": 0.42651367, "step": 1698, "time_per_iteration": 2.611130475997925 }, { "auxiliary_loss_clip": 0.06752223, "auxiliary_loss_mlp": 0.01297173, "balance_loss_clip": 0.0635695, "balance_loss_mlp": 0.01256332, "epoch": 0.1021494062828799, "flos": 22131811872000.0, "grad_norm": 2.852771289298235, "language_loss": 0.7780683, "learning_rate": 3.945643078691637e-06, "loss": 0.85856229, "num_input_tokens_seen": 36666675, "router_z_loss_clip": 3.95117188, "router_z_loss_mlp": 0.40844727, "step": 1699, "time_per_iteration": 2.626645565032959 }, { "auxiliary_loss_clip": 0.06742118, "auxiliary_loss_mlp": 0.01302172, "balance_loss_clip": 0.06348678, "balance_loss_mlp": 0.01260449, "epoch": 0.10220952953554788, "flos": 19652922282240.0, "grad_norm": 1.7758513597931402, "language_loss": 0.80882525, "learning_rate": 3.945552859553516e-06, "loss": 0.88926816, "num_input_tokens_seen": 36685225, "router_z_loss_clip": 3.9375, "router_z_loss_mlp": 0.41723633, "step": 1700, "time_per_iteration": 2.603602170944214 }, { "auxiliary_loss_clip": 0.06751487, "auxiliary_loss_mlp": 0.01304956, "balance_loss_clip": 0.06352808, "balance_loss_mlp": 0.01265355, "epoch": 0.10226965278821584, "flos": 29794765392000.0, "grad_norm": 2.5793341241191645, "language_loss": 0.78734672, "learning_rate": 3.945462566639836e-06, "loss": 0.86791116, "num_input_tokens_seen": 36705985, "router_z_loss_clip": 3.984375, "router_z_loss_mlp": 0.39599609, "step": 1701, "time_per_iteration": 2.681716203689575 }, { "auxiliary_loss_clip": 0.06768816, "auxiliary_loss_mlp": 0.01307022, "balance_loss_clip": 0.06358479, "balance_loss_mlp": 0.01262747, "epoch": 0.10232977604088381, "flos": 27024239266560.0, "grad_norm": 3.18986337634024, "language_loss": 0.79673725, "learning_rate": 3.945372199954019e-06, "loss": 0.87749565, "num_input_tokens_seen": 36725815, "router_z_loss_clip": 4.10351562, "router_z_loss_mlp": 0.44213867, "step": 1702, "time_per_iteration": 2.726588249206543 }, { "auxiliary_loss_clip": 0.06728837, "auxiliary_loss_mlp": 0.01294723, "balance_loss_clip": 0.06336316, "balance_loss_mlp": 0.01256266, "epoch": 0.10238989929355179, "flos": 20783857639680.0, "grad_norm": 2.6941817496802996, "language_loss": 0.96137637, "learning_rate": 3.945281759499494e-06, "loss": 1.04161203, "num_input_tokens_seen": 36742345, "router_z_loss_clip": 3.92578125, "router_z_loss_mlp": 0.38476562, "step": 1703, "time_per_iteration": 2.7147741317749023 }, { "auxiliary_loss_clip": 0.06593141, "auxiliary_loss_mlp": 0.01275666, "balance_loss_clip": 0.06346206, "balance_loss_mlp": 0.01258416, "epoch": 0.10245002254621975, "flos": 57716471013120.0, "grad_norm": 0.8480756525085669, "language_loss": 0.55249149, "learning_rate": 3.94519124527969e-06, "loss": 0.63117957, "num_input_tokens_seen": 36798775, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.17272949, "step": 1704, "time_per_iteration": 3.175872564315796 }, { "auxiliary_loss_clip": 0.06731471, "auxiliary_loss_mlp": 0.01308819, "balance_loss_clip": 0.06336983, "balance_loss_mlp": 0.01269766, "epoch": 0.10251014579888772, "flos": 16805724071040.0, "grad_norm": 2.959763275417659, "language_loss": 0.86458576, "learning_rate": 3.945100657298039e-06, "loss": 0.94498867, "num_input_tokens_seen": 36816295, "router_z_loss_clip": 3.9453125, "router_z_loss_mlp": 0.39086914, "step": 1705, "time_per_iteration": 2.618105411529541 }, { "auxiliary_loss_clip": 0.06591776, "auxiliary_loss_mlp": 0.01283374, "balance_loss_clip": 0.06345932, "balance_loss_mlp": 0.0126399, "epoch": 0.1025702690515557, "flos": 68584533459840.0, "grad_norm": 0.7511073683660529, "language_loss": 0.60580182, "learning_rate": 3.9450099955579765e-06, "loss": 0.68455333, "num_input_tokens_seen": 36882030, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.19360352, "step": 1706, "time_per_iteration": 3.2861266136169434 }, { "auxiliary_loss_clip": 0.06726408, "auxiliary_loss_mlp": 0.01296122, "balance_loss_clip": 0.06334072, "balance_loss_mlp": 0.01255257, "epoch": 0.10263039230422366, "flos": 14871939217920.0, "grad_norm": 2.3771088361516517, "language_loss": 0.88444376, "learning_rate": 3.94491926006294e-06, "loss": 0.96466905, "num_input_tokens_seen": 36899245, "router_z_loss_clip": 3.92382812, "router_z_loss_mlp": 0.40893555, "step": 1707, "time_per_iteration": 2.5968525409698486 }, { "auxiliary_loss_clip": 0.06707543, "auxiliary_loss_mlp": 0.01298485, "balance_loss_clip": 0.06327088, "balance_loss_mlp": 0.01259575, "epoch": 0.10269051555689163, "flos": 25344593447040.0, "grad_norm": 1.689267589132795, "language_loss": 0.74579132, "learning_rate": 3.944828450816369e-06, "loss": 0.82585156, "num_input_tokens_seen": 36920950, "router_z_loss_clip": 3.79882812, "router_z_loss_mlp": 0.38891602, "step": 1708, "time_per_iteration": 2.672722816467285 }, { "auxiliary_loss_clip": 0.06708644, "auxiliary_loss_mlp": 0.01299276, "balance_loss_clip": 0.06324098, "balance_loss_mlp": 0.0125834, "epoch": 0.10275063880955959, "flos": 21075116832000.0, "grad_norm": 2.010179308693606, "language_loss": 0.92401564, "learning_rate": 3.944737567821709e-06, "loss": 1.00409484, "num_input_tokens_seen": 36938900, "router_z_loss_clip": 3.84375, "router_z_loss_mlp": 0.40942383, "step": 1709, "time_per_iteration": 2.59494686126709 }, { "auxiliary_loss_clip": 0.06715196, "auxiliary_loss_mlp": 0.01294746, "balance_loss_clip": 0.06326044, "balance_loss_mlp": 0.01255789, "epoch": 0.10281076206222757, "flos": 30373636124160.0, "grad_norm": 2.169589913492783, "language_loss": 0.89608252, "learning_rate": 3.944646611082406e-06, "loss": 0.97618198, "num_input_tokens_seen": 36957010, "router_z_loss_clip": 3.89453125, "router_z_loss_mlp": 0.38964844, "step": 1710, "time_per_iteration": 2.6663694381713867 }, { "auxiliary_loss_clip": 0.06700775, "auxiliary_loss_mlp": 0.01297501, "balance_loss_clip": 0.06318964, "balance_loss_mlp": 0.0125778, "epoch": 0.10287088531489554, "flos": 22424748145920.0, "grad_norm": 3.500246664901109, "language_loss": 0.80915207, "learning_rate": 3.944555580601908e-06, "loss": 0.88913482, "num_input_tokens_seen": 36977690, "router_z_loss_clip": 3.81835938, "router_z_loss_mlp": 0.3972168, "step": 1711, "time_per_iteration": 2.628270149230957 }, { "auxiliary_loss_clip": 0.0672567, "auxiliary_loss_mlp": 0.01304868, "balance_loss_clip": 0.06335302, "balance_loss_mlp": 0.01264146, "epoch": 0.1029310085675635, "flos": 25122501400320.0, "grad_norm": 1.9090188589396564, "language_loss": 0.74775374, "learning_rate": 3.944464476383668e-06, "loss": 0.82805908, "num_input_tokens_seen": 36997300, "router_z_loss_clip": 3.8984375, "router_z_loss_mlp": 0.40722656, "step": 1712, "time_per_iteration": 2.620079755783081 }, { "auxiliary_loss_clip": 0.06699392, "auxiliary_loss_mlp": 0.01300434, "balance_loss_clip": 0.06329162, "balance_loss_mlp": 0.01261262, "epoch": 0.10299113182023148, "flos": 19871869800960.0, "grad_norm": 2.1031690550625943, "language_loss": 0.87874138, "learning_rate": 3.94437329843114e-06, "loss": 0.95873964, "num_input_tokens_seen": 37016110, "router_z_loss_clip": 3.70117188, "router_z_loss_mlp": 0.3918457, "step": 1713, "time_per_iteration": 2.599639654159546 }, { "auxiliary_loss_clip": 0.06702211, "auxiliary_loss_mlp": 0.01302337, "balance_loss_clip": 0.06327008, "balance_loss_mlp": 0.01263618, "epoch": 0.10305125507289944, "flos": 20453633498880.0, "grad_norm": 1.7088254345561649, "language_loss": 0.73634529, "learning_rate": 3.944282046747782e-06, "loss": 0.81639087, "num_input_tokens_seen": 37036405, "router_z_loss_clip": 3.75195312, "router_z_loss_mlp": 0.38745117, "step": 1714, "time_per_iteration": 2.62599778175354 }, { "auxiliary_loss_clip": 0.06740281, "auxiliary_loss_mlp": 0.01315126, "balance_loss_clip": 0.06347235, "balance_loss_mlp": 0.0127233, "epoch": 0.10311137832556741, "flos": 26258090659200.0, "grad_norm": 2.38818416832329, "language_loss": 0.92387938, "learning_rate": 3.944190721337053e-06, "loss": 1.00443339, "num_input_tokens_seen": 37057580, "router_z_loss_clip": 3.92578125, "router_z_loss_mlp": 0.4284668, "step": 1715, "time_per_iteration": 4.162328004837036 }, { "auxiliary_loss_clip": 0.06710421, "auxiliary_loss_mlp": 0.01303007, "balance_loss_clip": 0.06330691, "balance_loss_mlp": 0.0126529, "epoch": 0.10317150157823539, "flos": 35307711797760.0, "grad_norm": 3.6436550626364834, "language_loss": 0.77461016, "learning_rate": 3.944099322202418e-06, "loss": 0.85474443, "num_input_tokens_seen": 37079120, "router_z_loss_clip": 3.796875, "router_z_loss_mlp": 0.37670898, "step": 1716, "time_per_iteration": 2.7176599502563477 }, { "auxiliary_loss_clip": 0.06708028, "auxiliary_loss_mlp": 0.01302187, "balance_loss_clip": 0.06327547, "balance_loss_mlp": 0.01262228, "epoch": 0.10323162483090335, "flos": 25747171188480.0, "grad_norm": 2.1507338686689366, "language_loss": 0.86576855, "learning_rate": 3.944007849347342e-06, "loss": 0.94587076, "num_input_tokens_seen": 37099710, "router_z_loss_clip": 3.80664062, "router_z_loss_mlp": 0.3996582, "step": 1717, "time_per_iteration": 4.173694372177124 }, { "auxiliary_loss_clip": 0.06722271, "auxiliary_loss_mlp": 0.01310151, "balance_loss_clip": 0.0632742, "balance_loss_mlp": 0.01270621, "epoch": 0.10329174808357132, "flos": 16295475432960.0, "grad_norm": 2.3219205658567046, "language_loss": 0.84122527, "learning_rate": 3.943916302775292e-06, "loss": 0.92154944, "num_input_tokens_seen": 37117775, "router_z_loss_clip": 3.95117188, "router_z_loss_mlp": 0.39501953, "step": 1718, "time_per_iteration": 2.590820074081421 }, { "auxiliary_loss_clip": 0.06697292, "auxiliary_loss_mlp": 0.01309377, "balance_loss_clip": 0.06321198, "balance_loss_mlp": 0.01271349, "epoch": 0.10335187133623928, "flos": 36696475768320.0, "grad_norm": 1.9119233071786863, "language_loss": 0.74383771, "learning_rate": 3.943824682489742e-06, "loss": 0.8239044, "num_input_tokens_seen": 37140280, "router_z_loss_clip": 3.75976562, "router_z_loss_mlp": 0.38012695, "step": 1719, "time_per_iteration": 4.262228488922119 }, { "auxiliary_loss_clip": 0.06696761, "auxiliary_loss_mlp": 0.01297765, "balance_loss_clip": 0.06318565, "balance_loss_mlp": 0.01260596, "epoch": 0.10341199458890726, "flos": 14980909852800.0, "grad_norm": 1.91699203253682, "language_loss": 0.93790376, "learning_rate": 3.9437329884941665e-06, "loss": 1.01784897, "num_input_tokens_seen": 37158350, "router_z_loss_clip": 3.77734375, "router_z_loss_mlp": 0.37158203, "step": 1720, "time_per_iteration": 3.9832537174224854 }, { "auxiliary_loss_clip": 0.06705417, "auxiliary_loss_mlp": 0.01297193, "balance_loss_clip": 0.06320945, "balance_loss_mlp": 0.01257544, "epoch": 0.10347211784157523, "flos": 21037745111040.0, "grad_norm": 2.4392741622353333, "language_loss": 0.81599873, "learning_rate": 3.943641220792039e-06, "loss": 0.89602482, "num_input_tokens_seen": 37177120, "router_z_loss_clip": 3.84960938, "router_z_loss_mlp": 0.39697266, "step": 1721, "time_per_iteration": 2.7174367904663086 }, { "auxiliary_loss_clip": 0.06731169, "auxiliary_loss_mlp": 0.01299255, "balance_loss_clip": 0.06334744, "balance_loss_mlp": 0.01257746, "epoch": 0.1035322410942432, "flos": 19798216462080.0, "grad_norm": 2.1559840014336125, "language_loss": 0.82064557, "learning_rate": 3.9435493793868434e-06, "loss": 0.90094984, "num_input_tokens_seen": 37195895, "router_z_loss_clip": 3.96484375, "router_z_loss_mlp": 0.4152832, "step": 1722, "time_per_iteration": 2.6043319702148438 }, { "auxiliary_loss_clip": 0.06545238, "auxiliary_loss_mlp": 0.01323616, "balance_loss_clip": 0.06302489, "balance_loss_mlp": 0.01305127, "epoch": 0.10359236434691117, "flos": 52716037305600.0, "grad_norm": 0.9103626329735425, "language_loss": 0.66849035, "learning_rate": 3.943457464282059e-06, "loss": 0.74717885, "num_input_tokens_seen": 37247270, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.18469238, "step": 1723, "time_per_iteration": 2.980790853500366 }, { "auxiliary_loss_clip": 0.06728095, "auxiliary_loss_mlp": 0.01305725, "balance_loss_clip": 0.06336408, "balance_loss_mlp": 0.01265575, "epoch": 0.10365248759957914, "flos": 18411255354240.0, "grad_norm": 3.032745549003655, "language_loss": 0.79659927, "learning_rate": 3.9433654754811745e-06, "loss": 0.87693745, "num_input_tokens_seen": 37265595, "router_z_loss_clip": 3.9140625, "router_z_loss_mlp": 0.40136719, "step": 1724, "time_per_iteration": 2.596210241317749 }, { "auxiliary_loss_clip": 0.06727262, "auxiliary_loss_mlp": 0.01301852, "balance_loss_clip": 0.06335428, "balance_loss_mlp": 0.01262036, "epoch": 0.1037126108522471, "flos": 47563615820160.0, "grad_norm": 1.9647436925467943, "language_loss": 0.76589561, "learning_rate": 3.943273412987676e-06, "loss": 0.84618676, "num_input_tokens_seen": 37286660, "router_z_loss_clip": 3.91992188, "router_z_loss_mlp": 0.39794922, "step": 1725, "time_per_iteration": 2.8270158767700195 }, { "auxiliary_loss_clip": 0.06715703, "auxiliary_loss_mlp": 0.01300919, "balance_loss_clip": 0.06334748, "balance_loss_mlp": 0.01261508, "epoch": 0.10377273410491508, "flos": 22822671985920.0, "grad_norm": 2.421920451866757, "language_loss": 0.76082361, "learning_rate": 3.943181276805054e-06, "loss": 0.84098983, "num_input_tokens_seen": 37304915, "router_z_loss_clip": 3.80859375, "router_z_loss_mlp": 0.39379883, "step": 1726, "time_per_iteration": 2.657620906829834 }, { "auxiliary_loss_clip": 0.067459, "auxiliary_loss_mlp": 0.0130819, "balance_loss_clip": 0.06346358, "balance_loss_mlp": 0.01265489, "epoch": 0.10383285735758305, "flos": 26145556225920.0, "grad_norm": 2.77011934906034, "language_loss": 0.75943089, "learning_rate": 3.9430890669368035e-06, "loss": 0.83997178, "num_input_tokens_seen": 37325265, "router_z_loss_clip": 3.99414062, "router_z_loss_mlp": 0.42675781, "step": 1727, "time_per_iteration": 2.6774017810821533 }, { "auxiliary_loss_clip": 0.06736755, "auxiliary_loss_mlp": 0.01300237, "balance_loss_clip": 0.0635007, "balance_loss_mlp": 0.01260469, "epoch": 0.10389298061025101, "flos": 17097402533760.0, "grad_norm": 2.9052753042149453, "language_loss": 0.86583781, "learning_rate": 3.942996783386422e-06, "loss": 0.94620776, "num_input_tokens_seen": 37341650, "router_z_loss_clip": 3.8671875, "router_z_loss_mlp": 0.39746094, "step": 1728, "time_per_iteration": 2.6014554500579834 }, { "auxiliary_loss_clip": 0.06732737, "auxiliary_loss_mlp": 0.01299217, "balance_loss_clip": 0.06342444, "balance_loss_mlp": 0.01259878, "epoch": 0.10395310386291898, "flos": 20782683682560.0, "grad_norm": 3.0177447981286583, "language_loss": 0.71546459, "learning_rate": 3.942904426157406e-06, "loss": 0.79578418, "num_input_tokens_seen": 37360270, "router_z_loss_clip": 3.90234375, "router_z_loss_mlp": 0.39331055, "step": 1729, "time_per_iteration": 2.642432451248169 }, { "auxiliary_loss_clip": 0.06752016, "auxiliary_loss_mlp": 0.01301208, "balance_loss_clip": 0.06357242, "balance_loss_mlp": 0.01255503, "epoch": 0.10401322711558696, "flos": 12825032952960.0, "grad_norm": 3.2747114240098, "language_loss": 0.83668637, "learning_rate": 3.9428119952532605e-06, "loss": 0.91721857, "num_input_tokens_seen": 37375225, "router_z_loss_clip": 3.94921875, "router_z_loss_mlp": 0.45703125, "step": 1730, "time_per_iteration": 2.582914352416992 }, { "auxiliary_loss_clip": 0.06734428, "auxiliary_loss_mlp": 0.01303637, "balance_loss_clip": 0.06352323, "balance_loss_mlp": 0.01264155, "epoch": 0.10407335036825492, "flos": 23191274096640.0, "grad_norm": 3.90829516170375, "language_loss": 0.78041697, "learning_rate": 3.942719490677489e-06, "loss": 0.86079764, "num_input_tokens_seen": 37395165, "router_z_loss_clip": 3.82226562, "router_z_loss_mlp": 0.39501953, "step": 1731, "time_per_iteration": 2.6326560974121094 }, { "auxiliary_loss_clip": 0.06744785, "auxiliary_loss_mlp": 0.0130787, "balance_loss_clip": 0.06365629, "balance_loss_mlp": 0.01268912, "epoch": 0.10413347362092289, "flos": 26111370960000.0, "grad_norm": 2.5290904091418467, "language_loss": 0.840523, "learning_rate": 3.9426269124336e-06, "loss": 0.92104948, "num_input_tokens_seen": 37414845, "router_z_loss_clip": 3.79492188, "router_z_loss_mlp": 0.38989258, "step": 1732, "time_per_iteration": 2.6632378101348877 }, { "auxiliary_loss_clip": 0.06739879, "auxiliary_loss_mlp": 0.01300221, "balance_loss_clip": 0.06358413, "balance_loss_mlp": 0.01259404, "epoch": 0.10419359687359087, "flos": 12646014704640.0, "grad_norm": 2.1669102083186718, "language_loss": 0.85552847, "learning_rate": 3.942534260525104e-06, "loss": 0.93592948, "num_input_tokens_seen": 37432490, "router_z_loss_clip": 3.80859375, "router_z_loss_mlp": 0.40844727, "step": 1733, "time_per_iteration": 2.5915281772613525 }, { "auxiliary_loss_clip": 0.06758474, "auxiliary_loss_mlp": 0.01300294, "balance_loss_clip": 0.0636512, "balance_loss_mlp": 0.0125764, "epoch": 0.10425372012625883, "flos": 12129099667200.0, "grad_norm": 2.6505117443633512, "language_loss": 0.78477192, "learning_rate": 3.942441534955514e-06, "loss": 0.8653596, "num_input_tokens_seen": 37449435, "router_z_loss_clip": 3.93359375, "router_z_loss_mlp": 0.42651367, "step": 1734, "time_per_iteration": 2.591111660003662 }, { "auxiliary_loss_clip": 0.06726675, "auxiliary_loss_mlp": 0.01297915, "balance_loss_clip": 0.06347746, "balance_loss_mlp": 0.01260578, "epoch": 0.1043138433789268, "flos": 25344551520000.0, "grad_norm": 2.0325786559960988, "language_loss": 0.76812565, "learning_rate": 3.9423487357283465e-06, "loss": 0.84837151, "num_input_tokens_seen": 37469105, "router_z_loss_clip": 3.7890625, "router_z_loss_mlp": 0.37353516, "step": 1735, "time_per_iteration": 2.6401844024658203 }, { "auxiliary_loss_clip": 0.06736632, "auxiliary_loss_mlp": 0.01308944, "balance_loss_clip": 0.06345089, "balance_loss_mlp": 0.01267245, "epoch": 0.10437396663159478, "flos": 29174539870080.0, "grad_norm": 3.016228548256704, "language_loss": 0.80422491, "learning_rate": 3.94225586284712e-06, "loss": 0.88468069, "num_input_tokens_seen": 37490540, "router_z_loss_clip": 3.9140625, "router_z_loss_mlp": 0.41723633, "step": 1736, "time_per_iteration": 2.6762027740478516 }, { "auxiliary_loss_clip": 0.06727256, "auxiliary_loss_mlp": 0.01301325, "balance_loss_clip": 0.06343007, "balance_loss_mlp": 0.01261247, "epoch": 0.10443408988426274, "flos": 25087687228800.0, "grad_norm": 2.971276534611853, "language_loss": 0.72562718, "learning_rate": 3.942162916315356e-06, "loss": 0.80591303, "num_input_tokens_seen": 37511905, "router_z_loss_clip": 3.84375, "router_z_loss_mlp": 0.40087891, "step": 1737, "time_per_iteration": 2.657329559326172 }, { "auxiliary_loss_clip": 0.06738716, "auxiliary_loss_mlp": 0.01300615, "balance_loss_clip": 0.06343226, "balance_loss_mlp": 0.01257509, "epoch": 0.1044942131369307, "flos": 26766746069760.0, "grad_norm": 2.209989727494516, "language_loss": 0.83258551, "learning_rate": 3.942069896136581e-06, "loss": 0.91297889, "num_input_tokens_seen": 37533635, "router_z_loss_clip": 3.95117188, "router_z_loss_mlp": 0.43066406, "step": 1738, "time_per_iteration": 2.6589818000793457 }, { "auxiliary_loss_clip": 0.06743355, "auxiliary_loss_mlp": 0.01301258, "balance_loss_clip": 0.0634867, "balance_loss_mlp": 0.01258963, "epoch": 0.10455433638959867, "flos": 18448543221120.0, "grad_norm": 2.062071271407505, "language_loss": 0.7715342, "learning_rate": 3.9419768023143196e-06, "loss": 0.85198039, "num_input_tokens_seen": 37552035, "router_z_loss_clip": 3.94726562, "router_z_loss_mlp": 0.42285156, "step": 1739, "time_per_iteration": 2.5998666286468506 }, { "auxiliary_loss_clip": 0.06729727, "auxiliary_loss_mlp": 0.01307649, "balance_loss_clip": 0.06344253, "balance_loss_mlp": 0.01266736, "epoch": 0.10461445964226665, "flos": 23225207800320.0, "grad_norm": 1.814158405768875, "language_loss": 0.78509504, "learning_rate": 3.941883634852104e-06, "loss": 0.86546874, "num_input_tokens_seen": 37571540, "router_z_loss_clip": 3.84960938, "router_z_loss_mlp": 0.40917969, "step": 1740, "time_per_iteration": 2.6358137130737305 }, { "auxiliary_loss_clip": 0.06733131, "auxiliary_loss_mlp": 0.0130659, "balance_loss_clip": 0.06354977, "balance_loss_mlp": 0.01267775, "epoch": 0.10467458289493461, "flos": 24350860350720.0, "grad_norm": 2.204202700527243, "language_loss": 0.87477261, "learning_rate": 3.941790393753467e-06, "loss": 0.95516986, "num_input_tokens_seen": 37588265, "router_z_loss_clip": 3.78125, "router_z_loss_mlp": 0.38793945, "step": 1741, "time_per_iteration": 2.6549882888793945 }, { "auxiliary_loss_clip": 0.06739981, "auxiliary_loss_mlp": 0.01304851, "balance_loss_clip": 0.06342392, "balance_loss_mlp": 0.01261054, "epoch": 0.10473470614760258, "flos": 21294315912960.0, "grad_norm": 4.652013166266206, "language_loss": 0.77741516, "learning_rate": 3.941697079021942e-06, "loss": 0.85786343, "num_input_tokens_seen": 37606860, "router_z_loss_clip": 3.97460938, "router_z_loss_mlp": 0.43798828, "step": 1742, "time_per_iteration": 2.6286184787750244 }, { "auxiliary_loss_clip": 0.06723545, "auxiliary_loss_mlp": 0.01300127, "balance_loss_clip": 0.06339721, "balance_loss_mlp": 0.01260478, "epoch": 0.10479482940027056, "flos": 21693287928960.0, "grad_norm": 5.791872291296476, "language_loss": 0.88905656, "learning_rate": 3.94160369066107e-06, "loss": 0.96929324, "num_input_tokens_seen": 37625210, "router_z_loss_clip": 3.84179688, "router_z_loss_mlp": 0.39672852, "step": 1743, "time_per_iteration": 2.6487653255462646 }, { "auxiliary_loss_clip": 0.06715135, "auxiliary_loss_mlp": 0.01304099, "balance_loss_clip": 0.06340219, "balance_loss_mlp": 0.01263377, "epoch": 0.10485495265293852, "flos": 21579076414080.0, "grad_norm": 2.0702684058766634, "language_loss": 0.77192998, "learning_rate": 3.941510228674391e-06, "loss": 0.85212231, "num_input_tokens_seen": 37644110, "router_z_loss_clip": 3.74804688, "router_z_loss_mlp": 0.40722656, "step": 1744, "time_per_iteration": 2.658606767654419 }, { "auxiliary_loss_clip": 0.06720918, "auxiliary_loss_mlp": 0.01307856, "balance_loss_clip": 0.06343406, "balance_loss_mlp": 0.01270162, "epoch": 0.10491507590560649, "flos": 37971070151040.0, "grad_norm": 2.2059695633823924, "language_loss": 0.82304239, "learning_rate": 3.941416693065451e-06, "loss": 0.90333021, "num_input_tokens_seen": 37665800, "router_z_loss_clip": 3.7734375, "router_z_loss_mlp": 0.37719727, "step": 1745, "time_per_iteration": 2.7681896686553955 }, { "auxiliary_loss_clip": 0.06728493, "auxiliary_loss_mlp": 0.01299736, "balance_loss_clip": 0.06341316, "balance_loss_mlp": 0.01259276, "epoch": 0.10497519915827447, "flos": 26403552547200.0, "grad_norm": 3.1323613368788545, "language_loss": 0.84577364, "learning_rate": 3.941323083837794e-06, "loss": 0.92605591, "num_input_tokens_seen": 37685095, "router_z_loss_clip": 3.87109375, "router_z_loss_mlp": 0.40478516, "step": 1746, "time_per_iteration": 2.650648832321167 }, { "auxiliary_loss_clip": 0.0672038, "auxiliary_loss_mlp": 0.01300024, "balance_loss_clip": 0.06342249, "balance_loss_mlp": 0.01262783, "epoch": 0.10503532241094243, "flos": 40671842152320.0, "grad_norm": 1.909028137191986, "language_loss": 0.72250944, "learning_rate": 3.941229400994971e-06, "loss": 0.80271351, "num_input_tokens_seen": 37707445, "router_z_loss_clip": 3.78320312, "router_z_loss_mlp": 0.37231445, "step": 1747, "time_per_iteration": 2.7806317806243896 }, { "auxiliary_loss_clip": 0.06755256, "auxiliary_loss_mlp": 0.01302637, "balance_loss_clip": 0.06359273, "balance_loss_mlp": 0.01260199, "epoch": 0.1050954456636104, "flos": 29797239087360.0, "grad_norm": 5.310113250334217, "language_loss": 0.86294389, "learning_rate": 3.941135644540535e-06, "loss": 0.94352281, "num_input_tokens_seen": 37728325, "router_z_loss_clip": 3.96289062, "router_z_loss_mlp": 0.42456055, "step": 1748, "time_per_iteration": 2.682865858078003 }, { "auxiliary_loss_clip": 0.06742712, "auxiliary_loss_mlp": 0.01303236, "balance_loss_clip": 0.06356352, "balance_loss_mlp": 0.01263849, "epoch": 0.10515556891627838, "flos": 23955116716800.0, "grad_norm": 1.8155039760606204, "language_loss": 0.73649222, "learning_rate": 3.941041814478041e-06, "loss": 0.81695175, "num_input_tokens_seen": 37748910, "router_z_loss_clip": 3.8671875, "router_z_loss_mlp": 0.39404297, "step": 1749, "time_per_iteration": 2.6402781009674072 }, { "auxiliary_loss_clip": 0.06725884, "auxiliary_loss_mlp": 0.01303545, "balance_loss_clip": 0.06354854, "balance_loss_mlp": 0.0126628, "epoch": 0.10521569216894634, "flos": 18265458049920.0, "grad_norm": 2.6145466433554323, "language_loss": 0.83571011, "learning_rate": 3.940947910811047e-06, "loss": 0.91600442, "num_input_tokens_seen": 37765745, "router_z_loss_clip": 3.70898438, "router_z_loss_mlp": 0.37280273, "step": 1750, "time_per_iteration": 2.595276355743408 }, { "auxiliary_loss_clip": 0.06743371, "auxiliary_loss_mlp": 0.01304127, "balance_loss_clip": 0.06352091, "balance_loss_mlp": 0.01265431, "epoch": 0.10527581542161431, "flos": 15636033400320.0, "grad_norm": 2.4933756749099243, "language_loss": 0.94849771, "learning_rate": 3.940853933543114e-06, "loss": 1.02897263, "num_input_tokens_seen": 37780520, "router_z_loss_clip": 3.91796875, "router_z_loss_mlp": 0.38720703, "step": 1751, "time_per_iteration": 2.645843982696533 }, { "auxiliary_loss_clip": 0.06723708, "auxiliary_loss_mlp": 0.01302623, "balance_loss_clip": 0.06348504, "balance_loss_mlp": 0.01268672, "epoch": 0.10533593867428227, "flos": 18302494354560.0, "grad_norm": 2.3988187875400477, "language_loss": 0.80251527, "learning_rate": 3.940759882677805e-06, "loss": 0.88277858, "num_input_tokens_seen": 37799515, "router_z_loss_clip": 3.75390625, "router_z_loss_mlp": 0.33984375, "step": 1752, "time_per_iteration": 2.624795913696289 }, { "auxiliary_loss_clip": 0.06725404, "auxiliary_loss_mlp": 0.01304269, "balance_loss_clip": 0.06350703, "balance_loss_mlp": 0.01267052, "epoch": 0.10539606192695025, "flos": 29030922771840.0, "grad_norm": 2.1934453669686773, "language_loss": 0.77173191, "learning_rate": 3.940665758218686e-06, "loss": 0.85202861, "num_input_tokens_seen": 37818695, "router_z_loss_clip": 3.7421875, "router_z_loss_mlp": 0.37207031, "step": 1753, "time_per_iteration": 2.695124864578247 }, { "auxiliary_loss_clip": 0.06739736, "auxiliary_loss_mlp": 0.01309088, "balance_loss_clip": 0.06351628, "balance_loss_mlp": 0.01267532, "epoch": 0.10545618517961822, "flos": 19974593306880.0, "grad_norm": 4.635218751975694, "language_loss": 0.85707641, "learning_rate": 3.940571560169328e-06, "loss": 0.93756461, "num_input_tokens_seen": 37837860, "router_z_loss_clip": 3.88085938, "router_z_loss_mlp": 0.4152832, "step": 1754, "time_per_iteration": 2.6279032230377197 }, { "auxiliary_loss_clip": 0.06736176, "auxiliary_loss_mlp": 0.01304751, "balance_loss_clip": 0.06351094, "balance_loss_mlp": 0.01263766, "epoch": 0.10551630843228618, "flos": 16148923441920.0, "grad_norm": 2.580981612451451, "language_loss": 0.7097339, "learning_rate": 3.940477288533302e-06, "loss": 0.79014313, "num_input_tokens_seen": 37856260, "router_z_loss_clip": 3.84960938, "router_z_loss_mlp": 0.40966797, "step": 1755, "time_per_iteration": 4.032710075378418 }, { "auxiliary_loss_clip": 0.06737705, "auxiliary_loss_mlp": 0.01316248, "balance_loss_clip": 0.06346476, "balance_loss_mlp": 0.01274572, "epoch": 0.10557643168495416, "flos": 23446754795520.0, "grad_norm": 2.6615526556253326, "language_loss": 0.79370111, "learning_rate": 3.940382943314182e-06, "loss": 0.87424064, "num_input_tokens_seen": 37876960, "router_z_loss_clip": 3.9140625, "router_z_loss_mlp": 0.41699219, "step": 1756, "time_per_iteration": 2.642373561859131 }, { "auxiliary_loss_clip": 0.06735237, "auxiliary_loss_mlp": 0.01310759, "balance_loss_clip": 0.0634992, "balance_loss_mlp": 0.01275545, "epoch": 0.10563655493762213, "flos": 21805528872960.0, "grad_norm": 1.7565310882467051, "language_loss": 0.81513107, "learning_rate": 3.940288524515547e-06, "loss": 0.89559102, "num_input_tokens_seen": 37897070, "router_z_loss_clip": 3.85546875, "router_z_loss_mlp": 0.35229492, "step": 1757, "time_per_iteration": 4.081970930099487 }, { "auxiliary_loss_clip": 0.06741115, "auxiliary_loss_mlp": 0.01308512, "balance_loss_clip": 0.06349169, "balance_loss_mlp": 0.01267408, "epoch": 0.10569667819029009, "flos": 53813347176960.0, "grad_norm": 1.8967470292942805, "language_loss": 0.799685, "learning_rate": 3.940194032140976e-06, "loss": 0.88018131, "num_input_tokens_seen": 37923635, "router_z_loss_clip": 3.91796875, "router_z_loss_mlp": 0.41088867, "step": 1758, "time_per_iteration": 2.9698007106781006 }, { "auxiliary_loss_clip": 0.06739447, "auxiliary_loss_mlp": 0.01309146, "balance_loss_clip": 0.06352532, "balance_loss_mlp": 0.01272644, "epoch": 0.10575680144295807, "flos": 22931432985600.0, "grad_norm": 2.5784566176180257, "language_loss": 0.93426132, "learning_rate": 3.940099466194054e-06, "loss": 1.01474726, "num_input_tokens_seen": 37942650, "router_z_loss_clip": 3.86523438, "router_z_loss_mlp": 0.36499023, "step": 1759, "time_per_iteration": 4.075969219207764 }, { "auxiliary_loss_clip": 0.06730944, "auxiliary_loss_mlp": 0.01295979, "balance_loss_clip": 0.06343548, "balance_loss_mlp": 0.01258333, "epoch": 0.10581692469562604, "flos": 14141820666240.0, "grad_norm": 2.6269816828973536, "language_loss": 0.79004514, "learning_rate": 3.940004826678365e-06, "loss": 0.87031436, "num_input_tokens_seen": 37960660, "router_z_loss_clip": 3.87109375, "router_z_loss_mlp": 0.37646484, "step": 1760, "time_per_iteration": 4.120116233825684 }, { "auxiliary_loss_clip": 0.06750239, "auxiliary_loss_mlp": 0.01306221, "balance_loss_clip": 0.06352186, "balance_loss_mlp": 0.01263735, "epoch": 0.105877047948294, "flos": 25965909072000.0, "grad_norm": 2.4355725414614375, "language_loss": 0.90527141, "learning_rate": 3.939910113597498e-06, "loss": 0.98583603, "num_input_tokens_seen": 37978625, "router_z_loss_clip": 3.97851562, "router_z_loss_mlp": 0.42480469, "step": 1761, "time_per_iteration": 2.6786856651306152 }, { "auxiliary_loss_clip": 0.06732333, "auxiliary_loss_mlp": 0.01303828, "balance_loss_clip": 0.06347519, "balance_loss_mlp": 0.01266325, "epoch": 0.10593717120096197, "flos": 30672693745920.0, "grad_norm": 2.0719530393530383, "language_loss": 0.79342449, "learning_rate": 3.9398153269550464e-06, "loss": 0.87378609, "num_input_tokens_seen": 38000005, "router_z_loss_clip": 3.84570312, "router_z_loss_mlp": 0.375, "step": 1762, "time_per_iteration": 2.75628924369812 }, { "auxiliary_loss_clip": 0.06562629, "auxiliary_loss_mlp": 0.01297007, "balance_loss_clip": 0.06323512, "balance_loss_mlp": 0.01279722, "epoch": 0.10599729445362994, "flos": 66459347153280.0, "grad_norm": 0.7459232242709756, "language_loss": 0.6061008, "learning_rate": 3.939720466754602e-06, "loss": 0.68469715, "num_input_tokens_seen": 38066165, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.1730957, "step": 1763, "time_per_iteration": 3.46354341506958 }, { "auxiliary_loss_clip": 0.0673538, "auxiliary_loss_mlp": 0.01308782, "balance_loss_clip": 0.06352193, "balance_loss_mlp": 0.01270968, "epoch": 0.10605741770629791, "flos": 23954445884160.0, "grad_norm": 1.6897857875443758, "language_loss": 0.81847203, "learning_rate": 3.939625532999763e-06, "loss": 0.89891362, "num_input_tokens_seen": 38086150, "router_z_loss_clip": 3.83007812, "router_z_loss_mlp": 0.37817383, "step": 1764, "time_per_iteration": 2.658155918121338 }, { "auxiliary_loss_clip": 0.06721058, "auxiliary_loss_mlp": 0.01298292, "balance_loss_clip": 0.0634248, "balance_loss_mlp": 0.01261456, "epoch": 0.10611754095896588, "flos": 19393039244160.0, "grad_norm": 2.103217339909875, "language_loss": 0.81598264, "learning_rate": 3.9395305256941314e-06, "loss": 0.89617616, "num_input_tokens_seen": 38104205, "router_z_loss_clip": 3.78710938, "router_z_loss_mlp": 0.36816406, "step": 1765, "time_per_iteration": 2.6301581859588623 }, { "auxiliary_loss_clip": 0.06714988, "auxiliary_loss_mlp": 0.01296487, "balance_loss_clip": 0.06333689, "balance_loss_mlp": 0.01258864, "epoch": 0.10617766421163385, "flos": 22244472086400.0, "grad_norm": 2.275296226776247, "language_loss": 0.78686655, "learning_rate": 3.939435444841306e-06, "loss": 0.86698133, "num_input_tokens_seen": 38122005, "router_z_loss_clip": 3.8125, "router_z_loss_mlp": 0.37573242, "step": 1766, "time_per_iteration": 2.608264207839966 }, { "auxiliary_loss_clip": 0.06716, "auxiliary_loss_mlp": 0.01296782, "balance_loss_clip": 0.06344299, "balance_loss_mlp": 0.01261305, "epoch": 0.10623778746430182, "flos": 28412248550400.0, "grad_norm": 2.057819859788809, "language_loss": 0.78866148, "learning_rate": 3.939340290444895e-06, "loss": 0.86878932, "num_input_tokens_seen": 38143365, "router_z_loss_clip": 3.71679688, "router_z_loss_mlp": 0.35473633, "step": 1767, "time_per_iteration": 2.6701128482818604 }, { "auxiliary_loss_clip": 0.06551116, "auxiliary_loss_mlp": 0.01271318, "balance_loss_clip": 0.06311435, "balance_loss_mlp": 0.01255273, "epoch": 0.10629791071696978, "flos": 64254778231680.0, "grad_norm": 0.6514396491987577, "language_loss": 0.57731628, "learning_rate": 3.939245062508506e-06, "loss": 0.65554059, "num_input_tokens_seen": 38210035, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.16040039, "step": 1768, "time_per_iteration": 3.3235738277435303 }, { "auxiliary_loss_clip": 0.06716575, "auxiliary_loss_mlp": 0.01297818, "balance_loss_clip": 0.06338175, "balance_loss_mlp": 0.01261626, "epoch": 0.10635803396963776, "flos": 22754217600000.0, "grad_norm": 2.092595723600137, "language_loss": 0.878528, "learning_rate": 3.939149761035749e-06, "loss": 0.95867193, "num_input_tokens_seen": 38231230, "router_z_loss_clip": 3.78515625, "router_z_loss_mlp": 0.36206055, "step": 1769, "time_per_iteration": 2.6863794326782227 }, { "auxiliary_loss_clip": 0.06729269, "auxiliary_loss_mlp": 0.01297247, "balance_loss_clip": 0.0634488, "balance_loss_mlp": 0.01259339, "epoch": 0.10641815722230573, "flos": 31403818546560.0, "grad_norm": 3.4146942128678877, "language_loss": 0.62738085, "learning_rate": 3.9390543860302395e-06, "loss": 0.70764601, "num_input_tokens_seen": 38253890, "router_z_loss_clip": 3.84765625, "router_z_loss_mlp": 0.37915039, "step": 1770, "time_per_iteration": 2.7087926864624023 }, { "auxiliary_loss_clip": 0.06541073, "auxiliary_loss_mlp": 0.01273235, "balance_loss_clip": 0.06302737, "balance_loss_mlp": 0.01257572, "epoch": 0.1064782804749737, "flos": 58567230645120.0, "grad_norm": 0.8622734743774665, "language_loss": 0.57138908, "learning_rate": 3.9389589374955925e-06, "loss": 0.6495322, "num_input_tokens_seen": 38304290, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.15649414, "step": 1771, "time_per_iteration": 3.1408112049102783 }, { "auxiliary_loss_clip": 0.06721154, "auxiliary_loss_mlp": 0.01300575, "balance_loss_clip": 0.06344942, "balance_loss_mlp": 0.01263835, "epoch": 0.10653840372764166, "flos": 23994626716800.0, "grad_norm": 2.6519251987599253, "language_loss": 0.90256768, "learning_rate": 3.938863415435429e-06, "loss": 0.98278493, "num_input_tokens_seen": 38324725, "router_z_loss_clip": 3.76953125, "router_z_loss_mlp": 0.36767578, "step": 1772, "time_per_iteration": 2.6599767208099365 }, { "auxiliary_loss_clip": 0.06737404, "auxiliary_loss_mlp": 0.01303717, "balance_loss_clip": 0.06343309, "balance_loss_mlp": 0.01264426, "epoch": 0.10659852698030964, "flos": 18300272221440.0, "grad_norm": 3.9417703062433445, "language_loss": 0.78368533, "learning_rate": 3.93876781985337e-06, "loss": 0.86409652, "num_input_tokens_seen": 38340735, "router_z_loss_clip": 3.94335938, "router_z_loss_mlp": 0.39306641, "step": 1773, "time_per_iteration": 2.598918914794922 }, { "auxiliary_loss_clip": 0.06721453, "auxiliary_loss_mlp": 0.01298559, "balance_loss_clip": 0.06338908, "balance_loss_mlp": 0.01260889, "epoch": 0.1066586502329776, "flos": 32168751269760.0, "grad_norm": 1.9692839040464514, "language_loss": 0.84534752, "learning_rate": 3.938672150753041e-06, "loss": 0.92554766, "num_input_tokens_seen": 38361315, "router_z_loss_clip": 3.82617188, "router_z_loss_mlp": 0.37670898, "step": 1774, "time_per_iteration": 2.701111316680908 }, { "auxiliary_loss_clip": 0.06737497, "auxiliary_loss_mlp": 0.01299497, "balance_loss_clip": 0.06351019, "balance_loss_mlp": 0.01263353, "epoch": 0.10671877348564557, "flos": 17790904051200.0, "grad_norm": 2.6565720394320187, "language_loss": 0.78360683, "learning_rate": 3.9385764081380704e-06, "loss": 0.86397678, "num_input_tokens_seen": 38377425, "router_z_loss_clip": 3.86523438, "router_z_loss_mlp": 0.36132812, "step": 1775, "time_per_iteration": 2.5798158645629883 }, { "auxiliary_loss_clip": 0.06523601, "auxiliary_loss_mlp": 0.01281479, "balance_loss_clip": 0.06284779, "balance_loss_mlp": 0.01266543, "epoch": 0.10677889673831355, "flos": 63531074517120.0, "grad_norm": 0.7969480084035764, "language_loss": 0.57521546, "learning_rate": 3.9384805920120876e-06, "loss": 0.65326625, "num_input_tokens_seen": 38440275, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.14904785, "step": 1776, "time_per_iteration": 3.268854856491089 }, { "auxiliary_loss_clip": 0.06708598, "auxiliary_loss_mlp": 0.01295415, "balance_loss_clip": 0.06333056, "balance_loss_mlp": 0.01258603, "epoch": 0.10683901999098151, "flos": 22024182902400.0, "grad_norm": 1.6623325040119958, "language_loss": 0.84596694, "learning_rate": 3.938384702378727e-06, "loss": 0.92600703, "num_input_tokens_seen": 38461820, "router_z_loss_clip": 3.75, "router_z_loss_mlp": 0.36816406, "step": 1777, "time_per_iteration": 2.6743314266204834 }, { "auxiliary_loss_clip": 0.06701466, "auxiliary_loss_mlp": 0.01291656, "balance_loss_clip": 0.06333312, "balance_loss_mlp": 0.01257371, "epoch": 0.10689914324364948, "flos": 25049435040000.0, "grad_norm": 3.115669438492136, "language_loss": 0.8902365, "learning_rate": 3.938288739241625e-06, "loss": 0.97016764, "num_input_tokens_seen": 38482235, "router_z_loss_clip": 3.68164062, "router_z_loss_mlp": 0.34277344, "step": 1778, "time_per_iteration": 2.6666784286499023 }, { "auxiliary_loss_clip": 0.06709429, "auxiliary_loss_mlp": 0.0129802, "balance_loss_clip": 0.06336947, "balance_loss_mlp": 0.01263331, "epoch": 0.10695926649631746, "flos": 16440643831680.0, "grad_norm": 3.8081319044526634, "language_loss": 0.8551929, "learning_rate": 3.938192702604417e-06, "loss": 0.93526733, "num_input_tokens_seen": 38500690, "router_z_loss_clip": 3.7265625, "router_z_loss_mlp": 0.34692383, "step": 1779, "time_per_iteration": 2.5929481983184814 }, { "auxiliary_loss_clip": 0.06716636, "auxiliary_loss_mlp": 0.01293606, "balance_loss_clip": 0.06338841, "balance_loss_mlp": 0.01259155, "epoch": 0.10701938974898542, "flos": 16984281121920.0, "grad_norm": 2.382305622957927, "language_loss": 0.68545926, "learning_rate": 3.9380965924707495e-06, "loss": 0.7655617, "num_input_tokens_seen": 38518405, "router_z_loss_clip": 3.77539062, "router_z_loss_mlp": 0.34472656, "step": 1780, "time_per_iteration": 2.60901141166687 }, { "auxiliary_loss_clip": 0.0671557, "auxiliary_loss_mlp": 0.01303952, "balance_loss_clip": 0.06337985, "balance_loss_mlp": 0.01266807, "epoch": 0.10707951300165339, "flos": 15893568524160.0, "grad_norm": 2.3384238911938864, "language_loss": 0.93337232, "learning_rate": 3.938000408844265e-06, "loss": 1.01356757, "num_input_tokens_seen": 38535060, "router_z_loss_clip": 3.77734375, "router_z_loss_mlp": 0.37109375, "step": 1781, "time_per_iteration": 2.5681169033050537 }, { "auxiliary_loss_clip": 0.06729555, "auxiliary_loss_mlp": 0.01310368, "balance_loss_clip": 0.06346232, "balance_loss_mlp": 0.01275726, "epoch": 0.10713963625432135, "flos": 14252510309760.0, "grad_norm": 2.907192993436027, "language_loss": 0.80746579, "learning_rate": 3.9379041517286105e-06, "loss": 0.88786495, "num_input_tokens_seen": 38552855, "router_z_loss_clip": 3.83007812, "router_z_loss_mlp": 0.34619141, "step": 1782, "time_per_iteration": 2.5992612838745117 }, { "auxiliary_loss_clip": 0.06735106, "auxiliary_loss_mlp": 0.01318538, "balance_loss_clip": 0.06346156, "balance_loss_mlp": 0.01282083, "epoch": 0.10719975950698933, "flos": 16761224753280.0, "grad_norm": 2.2233701866618567, "language_loss": 0.80776727, "learning_rate": 3.937807821127436e-06, "loss": 0.8883037, "num_input_tokens_seen": 38570075, "router_z_loss_clip": 3.89453125, "router_z_loss_mlp": 0.36450195, "step": 1783, "time_per_iteration": 2.587249755859375 }, { "auxiliary_loss_clip": 0.06731443, "auxiliary_loss_mlp": 0.0132027, "balance_loss_clip": 0.06339222, "balance_loss_mlp": 0.0128167, "epoch": 0.1072598827596573, "flos": 22717181295360.0, "grad_norm": 2.283914657496892, "language_loss": 0.88092601, "learning_rate": 3.937711417044395e-06, "loss": 0.96144307, "num_input_tokens_seen": 38587970, "router_z_loss_clip": 3.91992188, "router_z_loss_mlp": 0.38598633, "step": 1784, "time_per_iteration": 2.6136913299560547 }, { "auxiliary_loss_clip": 0.06729417, "auxiliary_loss_mlp": 0.01308185, "balance_loss_clip": 0.06345326, "balance_loss_mlp": 0.01270944, "epoch": 0.10732000601232526, "flos": 23264969362560.0, "grad_norm": 2.690251574725644, "language_loss": 1.02568042, "learning_rate": 3.937614939483143e-06, "loss": 1.10605645, "num_input_tokens_seen": 38605840, "router_z_loss_clip": 3.83789062, "router_z_loss_mlp": 0.37255859, "step": 1785, "time_per_iteration": 2.60931134223938 }, { "auxiliary_loss_clip": 0.06712453, "auxiliary_loss_mlp": 0.01301832, "balance_loss_clip": 0.06343086, "balance_loss_mlp": 0.01268525, "epoch": 0.10738012926499324, "flos": 24213951578880.0, "grad_norm": 1.797687720180478, "language_loss": 0.86529815, "learning_rate": 3.937518388447339e-06, "loss": 0.94544107, "num_input_tokens_seen": 38627070, "router_z_loss_clip": 3.69335938, "router_z_loss_mlp": 0.33276367, "step": 1786, "time_per_iteration": 2.6608426570892334 }, { "auxiliary_loss_clip": 0.06741363, "auxiliary_loss_mlp": 0.01305165, "balance_loss_clip": 0.06351133, "balance_loss_mlp": 0.01266517, "epoch": 0.1074402525176612, "flos": 20929361454720.0, "grad_norm": 1.9713556423862393, "language_loss": 0.80826217, "learning_rate": 3.937421763940642e-06, "loss": 0.88872743, "num_input_tokens_seen": 38645840, "router_z_loss_clip": 3.90039062, "router_z_loss_mlp": 0.38671875, "step": 1787, "time_per_iteration": 2.617178201675415 }, { "auxiliary_loss_clip": 0.06741038, "auxiliary_loss_mlp": 0.01304961, "balance_loss_clip": 0.06349903, "balance_loss_mlp": 0.01267457, "epoch": 0.10750037577032917, "flos": 16952695332480.0, "grad_norm": 1.9032256688607494, "language_loss": 0.84057653, "learning_rate": 3.937325065966719e-06, "loss": 0.9210366, "num_input_tokens_seen": 38664770, "router_z_loss_clip": 3.91015625, "router_z_loss_mlp": 0.37475586, "step": 1788, "time_per_iteration": 2.6503140926361084 }, { "auxiliary_loss_clip": 0.0673246, "auxiliary_loss_mlp": 0.01307263, "balance_loss_clip": 0.06347752, "balance_loss_mlp": 0.01272121, "epoch": 0.10756049902299715, "flos": 20272770460800.0, "grad_norm": 2.619485370866872, "language_loss": 0.79958236, "learning_rate": 3.9372282945292335e-06, "loss": 0.87997961, "num_input_tokens_seen": 38683865, "router_z_loss_clip": 3.84765625, "router_z_loss_mlp": 0.35131836, "step": 1789, "time_per_iteration": 2.618931293487549 }, { "auxiliary_loss_clip": 0.06740378, "auxiliary_loss_mlp": 0.0130442, "balance_loss_clip": 0.06351598, "balance_loss_mlp": 0.01266392, "epoch": 0.10762062227566511, "flos": 23593264859520.0, "grad_norm": 3.154985969398953, "language_loss": 0.76667225, "learning_rate": 3.937131449631859e-06, "loss": 0.84712023, "num_input_tokens_seen": 38702485, "router_z_loss_clip": 3.88476562, "router_z_loss_mlp": 0.38037109, "step": 1790, "time_per_iteration": 2.6352031230926514 }, { "auxiliary_loss_clip": 0.06751899, "auxiliary_loss_mlp": 0.01317348, "balance_loss_clip": 0.06358258, "balance_loss_mlp": 0.01279177, "epoch": 0.10768074552833308, "flos": 24316549303680.0, "grad_norm": 3.681652761867378, "language_loss": 0.80224228, "learning_rate": 3.9370345312782645e-06, "loss": 0.88293475, "num_input_tokens_seen": 38722475, "router_z_loss_clip": 3.9375, "router_z_loss_mlp": 0.38208008, "step": 1791, "time_per_iteration": 2.680741310119629 }, { "auxiliary_loss_clip": 0.06731535, "auxiliary_loss_mlp": 0.01303524, "balance_loss_clip": 0.06358186, "balance_loss_mlp": 0.01269072, "epoch": 0.10774086878100106, "flos": 25306760528640.0, "grad_norm": 1.6672640509818657, "language_loss": 0.7236166, "learning_rate": 3.936937539472126e-06, "loss": 0.80396724, "num_input_tokens_seen": 38743285, "router_z_loss_clip": 3.72851562, "router_z_loss_mlp": 0.34472656, "step": 1792, "time_per_iteration": 2.7442760467529297 }, { "auxiliary_loss_clip": 0.06730826, "auxiliary_loss_mlp": 0.01309135, "balance_loss_clip": 0.06344867, "balance_loss_mlp": 0.01270249, "epoch": 0.10780099203366902, "flos": 22060506447360.0, "grad_norm": 2.5194918140543323, "language_loss": 0.77820432, "learning_rate": 3.9368404742171236e-06, "loss": 0.85860395, "num_input_tokens_seen": 38763035, "router_z_loss_clip": 3.85742188, "router_z_loss_mlp": 0.38891602, "step": 1793, "time_per_iteration": 2.6273720264434814 }, { "auxiliary_loss_clip": 0.06724426, "auxiliary_loss_mlp": 0.01300358, "balance_loss_clip": 0.06351026, "balance_loss_mlp": 0.01266789, "epoch": 0.10786111528633699, "flos": 22754091818880.0, "grad_norm": 1.797785472540021, "language_loss": 0.86205715, "learning_rate": 3.936743335516936e-06, "loss": 0.94230497, "num_input_tokens_seen": 38784900, "router_z_loss_clip": 3.734375, "router_z_loss_mlp": 0.33544922, "step": 1794, "time_per_iteration": 4.106438636779785 }, { "auxiliary_loss_clip": 0.06761616, "auxiliary_loss_mlp": 0.01312359, "balance_loss_clip": 0.06368966, "balance_loss_mlp": 0.01273568, "epoch": 0.10792123853900495, "flos": 20857510978560.0, "grad_norm": 1.7851230532600766, "language_loss": 0.76862454, "learning_rate": 3.936646123375246e-06, "loss": 0.84936434, "num_input_tokens_seen": 38804695, "router_z_loss_clip": 3.92578125, "router_z_loss_mlp": 0.38818359, "step": 1795, "time_per_iteration": 2.6224448680877686 }, { "auxiliary_loss_clip": 0.06751359, "auxiliary_loss_mlp": 0.01307273, "balance_loss_clip": 0.06360252, "balance_loss_mlp": 0.01268864, "epoch": 0.10798136179167293, "flos": 17754454725120.0, "grad_norm": 2.9570164030421195, "language_loss": 0.83630598, "learning_rate": 3.936548837795741e-06, "loss": 0.91689229, "num_input_tokens_seen": 38822395, "router_z_loss_clip": 3.9140625, "router_z_loss_mlp": 0.3840332, "step": 1796, "time_per_iteration": 4.072604179382324 }, { "auxiliary_loss_clip": 0.06750999, "auxiliary_loss_mlp": 0.01321353, "balance_loss_clip": 0.06355894, "balance_loss_mlp": 0.01281657, "epoch": 0.1080414850443409, "flos": 13594745358720.0, "grad_norm": 2.4163076240846615, "language_loss": 0.75671899, "learning_rate": 3.936451478782111e-06, "loss": 0.83744252, "num_input_tokens_seen": 38839865, "router_z_loss_clip": 3.94921875, "router_z_loss_mlp": 0.39697266, "step": 1797, "time_per_iteration": 2.590670585632324 }, { "auxiliary_loss_clip": 0.06724604, "auxiliary_loss_mlp": 0.01306044, "balance_loss_clip": 0.06349407, "balance_loss_mlp": 0.01271497, "epoch": 0.10810160829700886, "flos": 16259026106880.0, "grad_norm": 1.9695962343724365, "language_loss": 0.83530939, "learning_rate": 3.936354046338046e-06, "loss": 0.91561592, "num_input_tokens_seen": 38857300, "router_z_loss_clip": 3.75585938, "router_z_loss_mlp": 0.34570312, "step": 1798, "time_per_iteration": 4.033038854598999 }, { "auxiliary_loss_clip": 0.06727793, "auxiliary_loss_mlp": 0.01296423, "balance_loss_clip": 0.06353678, "balance_loss_mlp": 0.0125942, "epoch": 0.10816173154967684, "flos": 15163282264320.0, "grad_norm": 2.2450778676710974, "language_loss": 0.86987847, "learning_rate": 3.936256540467242e-06, "loss": 0.95012057, "num_input_tokens_seen": 38874960, "router_z_loss_clip": 3.73828125, "router_z_loss_mlp": 0.36987305, "step": 1799, "time_per_iteration": 2.571202516555786 }, { "auxiliary_loss_clip": 0.06720261, "auxiliary_loss_mlp": 0.01305818, "balance_loss_clip": 0.0634516, "balance_loss_mlp": 0.01270175, "epoch": 0.10822185480234481, "flos": 17791113686400.0, "grad_norm": 1.7846230150664704, "language_loss": 0.78576899, "learning_rate": 3.9361589611733955e-06, "loss": 0.86602974, "num_input_tokens_seen": 38893610, "router_z_loss_clip": 3.75390625, "router_z_loss_mlp": 0.35620117, "step": 1800, "time_per_iteration": 4.059294700622559 }, { "auxiliary_loss_clip": 0.0671755, "auxiliary_loss_mlp": 0.01292671, "balance_loss_clip": 0.06340902, "balance_loss_mlp": 0.01258673, "epoch": 0.10828197805501277, "flos": 25563708673920.0, "grad_norm": 1.7141607863459776, "language_loss": 0.73848468, "learning_rate": 3.9360613084602075e-06, "loss": 0.81858689, "num_input_tokens_seen": 38913485, "router_z_loss_clip": 3.765625, "router_z_loss_mlp": 0.34033203, "step": 1801, "time_per_iteration": 2.652409315109253 }, { "auxiliary_loss_clip": 0.06738233, "auxiliary_loss_mlp": 0.01291396, "balance_loss_clip": 0.06353469, "balance_loss_mlp": 0.01256826, "epoch": 0.10834210130768075, "flos": 28991748188160.0, "grad_norm": 2.140248510662861, "language_loss": 0.67250025, "learning_rate": 3.935963582331381e-06, "loss": 0.75279659, "num_input_tokens_seen": 38935650, "router_z_loss_clip": 3.84570312, "router_z_loss_mlp": 0.34570312, "step": 1802, "time_per_iteration": 2.6728196144104004 }, { "auxiliary_loss_clip": 0.06723827, "auxiliary_loss_mlp": 0.0129431, "balance_loss_clip": 0.06349164, "balance_loss_mlp": 0.01260407, "epoch": 0.10840222456034872, "flos": 20270045203200.0, "grad_norm": 1.8623837207804126, "language_loss": 0.82907271, "learning_rate": 3.935865782790621e-06, "loss": 0.90925407, "num_input_tokens_seen": 38954130, "router_z_loss_clip": 3.7421875, "router_z_loss_mlp": 0.33911133, "step": 1803, "time_per_iteration": 2.604793071746826 }, { "auxiliary_loss_clip": 0.0672362, "auxiliary_loss_mlp": 0.01299798, "balance_loss_clip": 0.0634575, "balance_loss_mlp": 0.01262938, "epoch": 0.10846234781301668, "flos": 19868851054080.0, "grad_norm": 1.8599634751244307, "language_loss": 0.9230057, "learning_rate": 3.9357679098416365e-06, "loss": 1.00323987, "num_input_tokens_seen": 38972905, "router_z_loss_clip": 3.77929688, "router_z_loss_mlp": 0.3684082, "step": 1804, "time_per_iteration": 2.6131608486175537 }, { "auxiliary_loss_clip": 0.06729435, "auxiliary_loss_mlp": 0.01298588, "balance_loss_clip": 0.06349289, "balance_loss_mlp": 0.01261394, "epoch": 0.10852247106568465, "flos": 26476283491200.0, "grad_norm": 3.049331818785039, "language_loss": 0.77840412, "learning_rate": 3.935669963488139e-06, "loss": 0.8586843, "num_input_tokens_seen": 38993255, "router_z_loss_clip": 3.80078125, "router_z_loss_mlp": 0.37158203, "step": 1805, "time_per_iteration": 2.740501880645752 }, { "auxiliary_loss_clip": 0.06710729, "auxiliary_loss_mlp": 0.01301999, "balance_loss_clip": 0.06334636, "balance_loss_mlp": 0.01268573, "epoch": 0.10858259431835263, "flos": 30089420674560.0, "grad_norm": 1.703080310298981, "language_loss": 0.87424093, "learning_rate": 3.935571943733843e-06, "loss": 0.95436823, "num_input_tokens_seen": 39012610, "router_z_loss_clip": 3.75976562, "router_z_loss_mlp": 0.33422852, "step": 1806, "time_per_iteration": 2.71536922454834 }, { "auxiliary_loss_clip": 0.06715487, "auxiliary_loss_mlp": 0.01297066, "balance_loss_clip": 0.06344931, "balance_loss_mlp": 0.01263664, "epoch": 0.10864271757102059, "flos": 19069313794560.0, "grad_norm": 3.280240511136423, "language_loss": 0.83128846, "learning_rate": 3.9354738505824635e-06, "loss": 0.91141397, "num_input_tokens_seen": 39030120, "router_z_loss_clip": 3.70507812, "router_z_loss_mlp": 0.33422852, "step": 1807, "time_per_iteration": 2.5976665019989014 }, { "auxiliary_loss_clip": 0.06719753, "auxiliary_loss_mlp": 0.0129601, "balance_loss_clip": 0.06348457, "balance_loss_mlp": 0.01264109, "epoch": 0.10870284082368856, "flos": 24721558813440.0, "grad_norm": 3.4217098167347237, "language_loss": 0.80489111, "learning_rate": 3.9353756840377225e-06, "loss": 0.88504875, "num_input_tokens_seen": 39049875, "router_z_loss_clip": 3.7109375, "router_z_loss_mlp": 0.3190918, "step": 1808, "time_per_iteration": 2.6432385444641113 }, { "auxiliary_loss_clip": 0.06711018, "auxiliary_loss_mlp": 0.01303706, "balance_loss_clip": 0.06335071, "balance_loss_mlp": 0.01269565, "epoch": 0.10876296407635654, "flos": 20633322579840.0, "grad_norm": 1.9506056457312462, "language_loss": 0.8051936, "learning_rate": 3.935277444103342e-06, "loss": 0.88534081, "num_input_tokens_seen": 39068935, "router_z_loss_clip": 3.76171875, "router_z_loss_mlp": 0.34130859, "step": 1809, "time_per_iteration": 2.73478102684021 }, { "auxiliary_loss_clip": 0.06713926, "auxiliary_loss_mlp": 0.01307846, "balance_loss_clip": 0.06344826, "balance_loss_mlp": 0.0127275, "epoch": 0.1088230873290245, "flos": 21586245937920.0, "grad_norm": 2.588108519794563, "language_loss": 0.86601579, "learning_rate": 3.935179130783046e-06, "loss": 0.94623345, "num_input_tokens_seen": 39087370, "router_z_loss_clip": 3.6875, "router_z_loss_mlp": 0.35083008, "step": 1810, "time_per_iteration": 2.6036012172698975 }, { "auxiliary_loss_clip": 0.06742188, "auxiliary_loss_mlp": 0.01298815, "balance_loss_clip": 0.06357391, "balance_loss_mlp": 0.01264339, "epoch": 0.10888321058169247, "flos": 26476283491200.0, "grad_norm": 2.3987893874823016, "language_loss": 0.65197039, "learning_rate": 3.935080744080564e-06, "loss": 0.73238039, "num_input_tokens_seen": 39106635, "router_z_loss_clip": 3.84179688, "router_z_loss_mlp": 0.3449707, "step": 1811, "time_per_iteration": 2.662487745285034 }, { "auxiliary_loss_clip": 0.06716074, "auxiliary_loss_mlp": 0.01304391, "balance_loss_clip": 0.06338754, "balance_loss_mlp": 0.01268509, "epoch": 0.10894333383436045, "flos": 25855722552960.0, "grad_norm": 3.0323492443980546, "language_loss": 0.7637887, "learning_rate": 3.934982283999626e-06, "loss": 0.84399343, "num_input_tokens_seen": 39126335, "router_z_loss_clip": 3.7734375, "router_z_loss_mlp": 0.35888672, "step": 1812, "time_per_iteration": 2.6510910987854004 }, { "auxiliary_loss_clip": 0.06707129, "auxiliary_loss_mlp": 0.01295897, "balance_loss_clip": 0.06339261, "balance_loss_mlp": 0.01263353, "epoch": 0.10900345708702841, "flos": 19543238887680.0, "grad_norm": 1.7001665327457591, "language_loss": 0.74304169, "learning_rate": 3.934883750543966e-06, "loss": 0.82307196, "num_input_tokens_seen": 39144820, "router_z_loss_clip": 3.6796875, "router_z_loss_mlp": 0.32519531, "step": 1813, "time_per_iteration": 2.6019182205200195 }, { "auxiliary_loss_clip": 0.06700865, "auxiliary_loss_mlp": 0.0129173, "balance_loss_clip": 0.06336875, "balance_loss_mlp": 0.0126083, "epoch": 0.10906358033969638, "flos": 23630091528960.0, "grad_norm": 4.179214397304707, "language_loss": 0.83690947, "learning_rate": 3.93478514371732e-06, "loss": 0.91683543, "num_input_tokens_seen": 39165945, "router_z_loss_clip": 3.64453125, "router_z_loss_mlp": 0.30883789, "step": 1814, "time_per_iteration": 2.6160478591918945 }, { "auxiliary_loss_clip": 0.06711717, "auxiliary_loss_mlp": 0.01297599, "balance_loss_clip": 0.06335256, "balance_loss_mlp": 0.01262218, "epoch": 0.10912370359236434, "flos": 21221039917440.0, "grad_norm": 2.716884255884509, "language_loss": 0.85472077, "learning_rate": 3.934686463523429e-06, "loss": 0.93481392, "num_input_tokens_seen": 39183520, "router_z_loss_clip": 3.765625, "router_z_loss_mlp": 0.35400391, "step": 1815, "time_per_iteration": 2.6159420013427734 }, { "auxiliary_loss_clip": 0.06693321, "auxiliary_loss_mlp": 0.01296268, "balance_loss_clip": 0.06331614, "balance_loss_mlp": 0.01263581, "epoch": 0.10918382684503232, "flos": 13558296032640.0, "grad_norm": 3.089519068830128, "language_loss": 0.74386048, "learning_rate": 3.9345877099660315e-06, "loss": 0.82375634, "num_input_tokens_seen": 39201190, "router_z_loss_clip": 3.61914062, "router_z_loss_mlp": 0.3269043, "step": 1816, "time_per_iteration": 2.5948195457458496 }, { "auxiliary_loss_clip": 0.06717807, "auxiliary_loss_mlp": 0.0129396, "balance_loss_clip": 0.06339249, "balance_loss_mlp": 0.01257243, "epoch": 0.10924395009770028, "flos": 27971712109440.0, "grad_norm": 3.9778737345224995, "language_loss": 0.74357486, "learning_rate": 3.9344888830488744e-06, "loss": 0.8236925, "num_input_tokens_seen": 39221210, "router_z_loss_clip": 3.79101562, "router_z_loss_mlp": 0.36743164, "step": 1817, "time_per_iteration": 2.6781673431396484 }, { "auxiliary_loss_clip": 0.06703123, "auxiliary_loss_mlp": 0.01289954, "balance_loss_clip": 0.06340259, "balance_loss_mlp": 0.01257457, "epoch": 0.10930407335036825, "flos": 25600912686720.0, "grad_norm": 1.874732216161101, "language_loss": 0.68504643, "learning_rate": 3.934389982775706e-06, "loss": 0.76497722, "num_input_tokens_seen": 39242025, "router_z_loss_clip": 3.63085938, "router_z_loss_mlp": 0.32519531, "step": 1818, "time_per_iteration": 2.660086154937744 }, { "auxiliary_loss_clip": 0.06706676, "auxiliary_loss_mlp": 0.01303264, "balance_loss_clip": 0.06335007, "balance_loss_mlp": 0.01267239, "epoch": 0.10936419660303623, "flos": 18412177749120.0, "grad_norm": 2.707331243644028, "language_loss": 0.74520171, "learning_rate": 3.934291009150275e-06, "loss": 0.82530111, "num_input_tokens_seen": 39259870, "router_z_loss_clip": 3.71289062, "router_z_loss_mlp": 0.35986328, "step": 1819, "time_per_iteration": 2.5727698802948 }, { "auxiliary_loss_clip": 0.06692385, "auxiliary_loss_mlp": 0.0129336, "balance_loss_clip": 0.0633123, "balance_loss_mlp": 0.01259934, "epoch": 0.1094243198557042, "flos": 23846523425280.0, "grad_norm": 3.968584185069686, "language_loss": 0.7555775, "learning_rate": 3.934191962176335e-06, "loss": 0.83543491, "num_input_tokens_seen": 39278500, "router_z_loss_clip": 3.609375, "router_z_loss_mlp": 0.33447266, "step": 1820, "time_per_iteration": 2.732038974761963 }, { "auxiliary_loss_clip": 0.06698038, "auxiliary_loss_mlp": 0.01296334, "balance_loss_clip": 0.06333117, "balance_loss_mlp": 0.01261644, "epoch": 0.10948444310837216, "flos": 14648589360000.0, "grad_norm": 2.6803683246736485, "language_loss": 0.8368001, "learning_rate": 3.934092841857642e-06, "loss": 0.91674387, "num_input_tokens_seen": 39294800, "router_z_loss_clip": 3.65234375, "router_z_loss_mlp": 0.34692383, "step": 1821, "time_per_iteration": 2.5677993297576904 }, { "auxiliary_loss_clip": 0.06702963, "auxiliary_loss_mlp": 0.01294061, "balance_loss_clip": 0.06341104, "balance_loss_mlp": 0.01261278, "epoch": 0.10954456636104014, "flos": 27826250221440.0, "grad_norm": 2.624568086134168, "language_loss": 0.78448993, "learning_rate": 3.933993648197955e-06, "loss": 0.86446017, "num_input_tokens_seen": 39314625, "router_z_loss_clip": 3.61914062, "router_z_loss_mlp": 0.32788086, "step": 1822, "time_per_iteration": 2.6299421787261963 }, { "auxiliary_loss_clip": 0.06694746, "auxiliary_loss_mlp": 0.01294151, "balance_loss_clip": 0.06332302, "balance_loss_mlp": 0.01260248, "epoch": 0.1096046896137081, "flos": 33629491497600.0, "grad_norm": 2.2131431505491754, "language_loss": 0.80686355, "learning_rate": 3.933894381201034e-06, "loss": 0.88675249, "num_input_tokens_seen": 39336465, "router_z_loss_clip": 3.625, "router_z_loss_mlp": 0.33862305, "step": 1823, "time_per_iteration": 2.697221517562866 }, { "auxiliary_loss_clip": 0.06698006, "auxiliary_loss_mlp": 0.01295794, "balance_loss_clip": 0.06337363, "balance_loss_mlp": 0.01263846, "epoch": 0.10966481286637607, "flos": 26987370670080.0, "grad_norm": 1.5271265976632737, "language_loss": 0.81279266, "learning_rate": 3.933795040870645e-06, "loss": 0.89273059, "num_input_tokens_seen": 39357930, "router_z_loss_clip": 3.60546875, "router_z_loss_mlp": 0.31933594, "step": 1824, "time_per_iteration": 2.642705202102661 }, { "auxiliary_loss_clip": 0.06697571, "auxiliary_loss_mlp": 0.01293774, "balance_loss_clip": 0.06336005, "balance_loss_mlp": 0.01260848, "epoch": 0.10972493611904403, "flos": 23042751534720.0, "grad_norm": 2.616496265559467, "language_loss": 0.88967931, "learning_rate": 3.933695627210554e-06, "loss": 0.96959269, "num_input_tokens_seen": 39376380, "router_z_loss_clip": 3.61523438, "router_z_loss_mlp": 0.32910156, "step": 1825, "time_per_iteration": 2.5974128246307373 }, { "auxiliary_loss_clip": 0.06694004, "auxiliary_loss_mlp": 0.01296841, "balance_loss_clip": 0.06332415, "balance_loss_mlp": 0.01263606, "epoch": 0.10978505937171201, "flos": 38113261729920.0, "grad_norm": 2.9349940228628157, "language_loss": 0.77303469, "learning_rate": 3.933596140224532e-06, "loss": 0.85294318, "num_input_tokens_seen": 39399935, "router_z_loss_clip": 3.61328125, "router_z_loss_mlp": 0.33203125, "step": 1826, "time_per_iteration": 2.7509207725524902 }, { "auxiliary_loss_clip": 0.06582361, "auxiliary_loss_mlp": 0.01263845, "balance_loss_clip": 0.06346957, "balance_loss_mlp": 0.01247704, "epoch": 0.10984518262437998, "flos": 59867987500800.0, "grad_norm": 0.8146927720448127, "language_loss": 0.55144185, "learning_rate": 3.93349657991635e-06, "loss": 0.62990391, "num_input_tokens_seen": 39460685, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.16137695, "step": 1827, "time_per_iteration": 3.233736991882324 }, { "auxiliary_loss_clip": 0.06581853, "auxiliary_loss_mlp": 0.01265389, "balance_loss_clip": 0.06346662, "balance_loss_mlp": 0.01249892, "epoch": 0.10990530587704794, "flos": 66741088907520.0, "grad_norm": 0.735083580159245, "language_loss": 0.55375302, "learning_rate": 3.933396946289784e-06, "loss": 0.63222539, "num_input_tokens_seen": 39524765, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.15478516, "step": 1828, "time_per_iteration": 3.246952533721924 }, { "auxiliary_loss_clip": 0.06695381, "auxiliary_loss_mlp": 0.01292769, "balance_loss_clip": 0.06329372, "balance_loss_mlp": 0.01258651, "epoch": 0.10996542912971592, "flos": 25454234914560.0, "grad_norm": 4.883261181555761, "language_loss": 0.8623122, "learning_rate": 3.933297239348612e-06, "loss": 0.94219375, "num_input_tokens_seen": 39543640, "router_z_loss_clip": 3.65820312, "router_z_loss_mlp": 0.34082031, "step": 1829, "time_per_iteration": 2.721956729888916 }, { "auxiliary_loss_clip": 0.06697147, "auxiliary_loss_mlp": 0.01304163, "balance_loss_clip": 0.06332457, "balance_loss_mlp": 0.0127138, "epoch": 0.11002555238238389, "flos": 44028282752640.0, "grad_norm": 3.274118930398833, "language_loss": 0.9058876, "learning_rate": 3.933197459096614e-06, "loss": 0.9859007, "num_input_tokens_seen": 39567525, "router_z_loss_clip": 3.64453125, "router_z_loss_mlp": 0.32788086, "step": 1830, "time_per_iteration": 2.8400607109069824 }, { "auxiliary_loss_clip": 0.0654056, "auxiliary_loss_mlp": 0.01264781, "balance_loss_clip": 0.06306783, "balance_loss_mlp": 0.01249296, "epoch": 0.11008567563505185, "flos": 54085248547200.0, "grad_norm": 0.6707125760510162, "language_loss": 0.55652738, "learning_rate": 3.9330976055375756e-06, "loss": 0.63458079, "num_input_tokens_seen": 39628470, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.15466309, "step": 1831, "time_per_iteration": 3.247241258621216 }, { "auxiliary_loss_clip": 0.06692874, "auxiliary_loss_mlp": 0.01297568, "balance_loss_clip": 0.0632375, "balance_loss_mlp": 0.0126109, "epoch": 0.11014579888771983, "flos": 24249981634560.0, "grad_norm": 3.2062546916986268, "language_loss": 0.92152119, "learning_rate": 3.932997678675282e-06, "loss": 1.0014255, "num_input_tokens_seen": 39646670, "router_z_loss_clip": 3.6875, "router_z_loss_mlp": 0.36499023, "step": 1832, "time_per_iteration": 2.647019863128662 }, { "auxiliary_loss_clip": 0.06527835, "auxiliary_loss_mlp": 0.01263599, "balance_loss_clip": 0.06294599, "balance_loss_mlp": 0.01248901, "epoch": 0.1102059221403878, "flos": 57763653661440.0, "grad_norm": 0.7049587597980651, "language_loss": 0.59766996, "learning_rate": 3.932897678513523e-06, "loss": 0.67558432, "num_input_tokens_seen": 39712915, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.14660645, "step": 1833, "time_per_iteration": 3.249866247177124 }, { "auxiliary_loss_clip": 0.06685147, "auxiliary_loss_mlp": 0.01292406, "balance_loss_clip": 0.0631419, "balance_loss_mlp": 0.01257287, "epoch": 0.11026604539305576, "flos": 16800818607360.0, "grad_norm": 3.840800919023438, "language_loss": 0.80642807, "learning_rate": 3.93279760505609e-06, "loss": 0.88620365, "num_input_tokens_seen": 39730650, "router_z_loss_clip": 3.70898438, "router_z_loss_mlp": 0.35131836, "step": 1834, "time_per_iteration": 4.032536268234253 }, { "auxiliary_loss_clip": 0.06679696, "auxiliary_loss_mlp": 0.01300359, "balance_loss_clip": 0.06315024, "balance_loss_mlp": 0.0126574, "epoch": 0.11032616864572373, "flos": 23994920206080.0, "grad_norm": 4.364036022394268, "language_loss": 0.9218905, "learning_rate": 3.932697458306779e-06, "loss": 1.00169098, "num_input_tokens_seen": 39751065, "router_z_loss_clip": 3.6484375, "router_z_loss_mlp": 0.34594727, "step": 1835, "time_per_iteration": 2.622229814529419 }, { "auxiliary_loss_clip": 0.06673124, "auxiliary_loss_mlp": 0.01290764, "balance_loss_clip": 0.0631308, "balance_loss_mlp": 0.01257958, "epoch": 0.1103862918983917, "flos": 19689329681280.0, "grad_norm": 2.963575092745852, "language_loss": 0.66380441, "learning_rate": 3.932597238269386e-06, "loss": 0.74344337, "num_input_tokens_seen": 39769245, "router_z_loss_clip": 3.59765625, "router_z_loss_mlp": 0.32836914, "step": 1836, "time_per_iteration": 4.0383079051971436 }, { "auxiliary_loss_clip": 0.06666906, "auxiliary_loss_mlp": 0.01290235, "balance_loss_clip": 0.06312954, "balance_loss_mlp": 0.01258001, "epoch": 0.11044641515105967, "flos": 32169086686080.0, "grad_norm": 3.1629166497522236, "language_loss": 0.74198592, "learning_rate": 3.932496944947711e-06, "loss": 0.8215574, "num_input_tokens_seen": 39790830, "router_z_loss_clip": 3.5390625, "router_z_loss_mlp": 0.32202148, "step": 1837, "time_per_iteration": 2.685263156890869 }, { "auxiliary_loss_clip": 0.06676803, "auxiliary_loss_mlp": 0.01290881, "balance_loss_clip": 0.0630995, "balance_loss_mlp": 0.01258719, "epoch": 0.11050653840372764, "flos": 16694573230080.0, "grad_norm": 3.0287220284217438, "language_loss": 0.79513532, "learning_rate": 3.93239657834556e-06, "loss": 0.87481225, "num_input_tokens_seen": 39809475, "router_z_loss_clip": 3.67382812, "router_z_loss_mlp": 0.3215332, "step": 1838, "time_per_iteration": 4.074202060699463 }, { "auxiliary_loss_clip": 0.0666853, "auxiliary_loss_mlp": 0.01301277, "balance_loss_clip": 0.06310084, "balance_loss_mlp": 0.0126654, "epoch": 0.11056666165639562, "flos": 21214205809920.0, "grad_norm": 2.5219923638292006, "language_loss": 0.73011059, "learning_rate": 3.932296138466736e-06, "loss": 0.80980867, "num_input_tokens_seen": 39826355, "router_z_loss_clip": 3.58203125, "router_z_loss_mlp": 0.34716797, "step": 1839, "time_per_iteration": 3.9914519786834717 }, { "auxiliary_loss_clip": 0.06676085, "auxiliary_loss_mlp": 0.01302149, "balance_loss_clip": 0.06311172, "balance_loss_mlp": 0.0126722, "epoch": 0.11062678490906358, "flos": 19170444072960.0, "grad_norm": 2.679695941975857, "language_loss": 0.80542421, "learning_rate": 3.93219562531505e-06, "loss": 0.88520658, "num_input_tokens_seen": 39845335, "router_z_loss_clip": 3.65039062, "router_z_loss_mlp": 0.34936523, "step": 1840, "time_per_iteration": 2.6028892993927 }, { "auxiliary_loss_clip": 0.06662393, "auxiliary_loss_mlp": 0.01295684, "balance_loss_clip": 0.06307924, "balance_loss_mlp": 0.01263759, "epoch": 0.11068690816173155, "flos": 24901457529600.0, "grad_norm": 1.7614537781407815, "language_loss": 0.89710593, "learning_rate": 3.932095038894311e-06, "loss": 0.97668672, "num_input_tokens_seen": 39865065, "router_z_loss_clip": 3.54296875, "router_z_loss_mlp": 0.3190918, "step": 1841, "time_per_iteration": 2.627708673477173 }, { "auxiliary_loss_clip": 0.06669509, "auxiliary_loss_mlp": 0.01304693, "balance_loss_clip": 0.06309649, "balance_loss_mlp": 0.0127017, "epoch": 0.11074703141439952, "flos": 16478015552640.0, "grad_norm": 2.3325794268073947, "language_loss": 0.92256629, "learning_rate": 3.931994379208334e-06, "loss": 1.00230837, "num_input_tokens_seen": 39882780, "router_z_loss_clip": 3.59375, "router_z_loss_mlp": 0.34521484, "step": 1842, "time_per_iteration": 2.5783631801605225 }, { "auxiliary_loss_clip": 0.06665114, "auxiliary_loss_mlp": 0.01298685, "balance_loss_clip": 0.06307659, "balance_loss_mlp": 0.0126533, "epoch": 0.11080715466706749, "flos": 19178535991680.0, "grad_norm": 2.5207621997698535, "language_loss": 0.87384152, "learning_rate": 3.931893646260937e-06, "loss": 0.95347953, "num_input_tokens_seen": 39900295, "router_z_loss_clip": 3.57421875, "router_z_loss_mlp": 0.33398438, "step": 1843, "time_per_iteration": 2.563303232192993 }, { "auxiliary_loss_clip": 0.0667585, "auxiliary_loss_mlp": 0.01305077, "balance_loss_clip": 0.0631239, "balance_loss_mlp": 0.01269052, "epoch": 0.11086727791973545, "flos": 27711325946880.0, "grad_norm": 2.2288483826510648, "language_loss": 0.75724268, "learning_rate": 3.931792840055941e-06, "loss": 0.83705187, "num_input_tokens_seen": 39922075, "router_z_loss_clip": 3.63671875, "router_z_loss_mlp": 0.3605957, "step": 1844, "time_per_iteration": 2.651308059692383 }, { "auxiliary_loss_clip": 0.0667518, "auxiliary_loss_mlp": 0.01309115, "balance_loss_clip": 0.06311113, "balance_loss_mlp": 0.01272327, "epoch": 0.11092740117240343, "flos": 18520854894720.0, "grad_norm": 3.0526250765803633, "language_loss": 0.76525688, "learning_rate": 3.931691960597165e-06, "loss": 0.84509981, "num_input_tokens_seen": 39940115, "router_z_loss_clip": 3.640625, "router_z_loss_mlp": 0.36791992, "step": 1845, "time_per_iteration": 2.583050012588501 }, { "auxiliary_loss_clip": 0.06658874, "auxiliary_loss_mlp": 0.01305592, "balance_loss_clip": 0.06304693, "balance_loss_mlp": 0.01272118, "epoch": 0.1109875244250714, "flos": 20528796211200.0, "grad_norm": 3.496532557076185, "language_loss": 0.77423459, "learning_rate": 3.9315910078884375e-06, "loss": 0.85387927, "num_input_tokens_seen": 39959920, "router_z_loss_clip": 3.54492188, "router_z_loss_mlp": 0.33447266, "step": 1846, "time_per_iteration": 2.6024203300476074 }, { "auxiliary_loss_clip": 0.06680156, "auxiliary_loss_mlp": 0.01299461, "balance_loss_clip": 0.06314759, "balance_loss_mlp": 0.01263389, "epoch": 0.11104764767773936, "flos": 14103484623360.0, "grad_norm": 3.8295271082834823, "language_loss": 0.87804818, "learning_rate": 3.931489981933584e-06, "loss": 0.95784432, "num_input_tokens_seen": 39974755, "router_z_loss_clip": 3.65820312, "router_z_loss_mlp": 0.36083984, "step": 1847, "time_per_iteration": 2.5874574184417725 }, { "auxiliary_loss_clip": 0.06675105, "auxiliary_loss_mlp": 0.01304735, "balance_loss_clip": 0.06312139, "balance_loss_mlp": 0.01266326, "epoch": 0.11110777093040733, "flos": 20600730541440.0, "grad_norm": 2.294305268891443, "language_loss": 0.78850681, "learning_rate": 3.931388882736438e-06, "loss": 0.86830521, "num_input_tokens_seen": 39993355, "router_z_loss_clip": 3.6328125, "router_z_loss_mlp": 0.3840332, "step": 1848, "time_per_iteration": 2.610058069229126 }, { "auxiliary_loss_clip": 0.06674077, "auxiliary_loss_mlp": 0.01307289, "balance_loss_clip": 0.06321533, "balance_loss_mlp": 0.01272718, "epoch": 0.11116789418307531, "flos": 21876247319040.0, "grad_norm": 5.019662655261397, "language_loss": 0.79178512, "learning_rate": 3.931287710300832e-06, "loss": 0.87159878, "num_input_tokens_seen": 40012410, "router_z_loss_clip": 3.5234375, "router_z_loss_mlp": 0.34521484, "step": 1849, "time_per_iteration": 2.5954344272613525 }, { "auxiliary_loss_clip": 0.06686635, "auxiliary_loss_mlp": 0.01308182, "balance_loss_clip": 0.06322825, "balance_loss_mlp": 0.01271108, "epoch": 0.11122801743574327, "flos": 15528488284800.0, "grad_norm": 3.013152016628719, "language_loss": 0.73660517, "learning_rate": 3.931186464630601e-06, "loss": 0.81655329, "num_input_tokens_seen": 40029315, "router_z_loss_clip": 3.63867188, "router_z_loss_mlp": 0.37060547, "step": 1850, "time_per_iteration": 2.599748134613037 }, { "auxiliary_loss_clip": 0.06684881, "auxiliary_loss_mlp": 0.01305538, "balance_loss_clip": 0.06320906, "balance_loss_mlp": 0.01270395, "epoch": 0.11128814068841124, "flos": 14397511000320.0, "grad_norm": 3.694947675801697, "language_loss": 0.82632327, "learning_rate": 3.931085145729588e-06, "loss": 0.90622747, "num_input_tokens_seen": 40045765, "router_z_loss_clip": 3.63671875, "router_z_loss_mlp": 0.3515625, "step": 1851, "time_per_iteration": 2.573387861251831 }, { "auxiliary_loss_clip": 0.06678903, "auxiliary_loss_mlp": 0.01295484, "balance_loss_clip": 0.06321475, "balance_loss_mlp": 0.01262701, "epoch": 0.11134826394107922, "flos": 16659465569280.0, "grad_norm": 3.789484516329885, "language_loss": 0.9020946, "learning_rate": 3.930983753601631e-06, "loss": 0.98183846, "num_input_tokens_seen": 40061660, "router_z_loss_clip": 3.578125, "router_z_loss_mlp": 0.32763672, "step": 1852, "time_per_iteration": 2.609722375869751 }, { "auxiliary_loss_clip": 0.0668074, "auxiliary_loss_mlp": 0.01299534, "balance_loss_clip": 0.06324693, "balance_loss_mlp": 0.01262316, "epoch": 0.11140838719374718, "flos": 16696627655040.0, "grad_norm": 5.559784410024488, "language_loss": 0.73855948, "learning_rate": 3.930882288250578e-06, "loss": 0.81836224, "num_input_tokens_seen": 40080180, "router_z_loss_clip": 3.55664062, "router_z_loss_mlp": 0.37182617, "step": 1853, "time_per_iteration": 2.5936899185180664 }, { "auxiliary_loss_clip": 0.06560332, "auxiliary_loss_mlp": 0.01383272, "balance_loss_clip": 0.06328955, "balance_loss_mlp": 0.01365712, "epoch": 0.11146851044641515, "flos": 60994101248640.0, "grad_norm": 0.760751684298083, "language_loss": 0.53837717, "learning_rate": 3.930780749680273e-06, "loss": 0.61781323, "num_input_tokens_seen": 40138910, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.17590332, "step": 1854, "time_per_iteration": 3.189054012298584 }, { "auxiliary_loss_clip": 0.06725724, "auxiliary_loss_mlp": 0.01302225, "balance_loss_clip": 0.06342278, "balance_loss_mlp": 0.012626, "epoch": 0.11152863369908313, "flos": 22199301936000.0, "grad_norm": 2.1628180478892856, "language_loss": 0.86473268, "learning_rate": 3.9306791378945705e-06, "loss": 0.94501215, "num_input_tokens_seen": 40157745, "router_z_loss_clip": 3.83203125, "router_z_loss_mlp": 0.39599609, "step": 1855, "time_per_iteration": 2.710020065307617 }, { "auxiliary_loss_clip": 0.06698984, "auxiliary_loss_mlp": 0.01295745, "balance_loss_clip": 0.06334175, "balance_loss_mlp": 0.01259696, "epoch": 0.11158875695175109, "flos": 19543742012160.0, "grad_norm": 2.280169983036742, "language_loss": 0.83080161, "learning_rate": 3.9305774528973205e-06, "loss": 0.9107489, "num_input_tokens_seen": 40175375, "router_z_loss_clip": 3.64453125, "router_z_loss_mlp": 0.36035156, "step": 1856, "time_per_iteration": 2.7718665599823 }, { "auxiliary_loss_clip": 0.0668416, "auxiliary_loss_mlp": 0.01291681, "balance_loss_clip": 0.06330565, "balance_loss_mlp": 0.01260042, "epoch": 0.11164888020441906, "flos": 25448994034560.0, "grad_norm": 1.9899284533729353, "language_loss": 0.84308171, "learning_rate": 3.93047569469238e-06, "loss": 0.92284012, "num_input_tokens_seen": 40195715, "router_z_loss_clip": 3.5390625, "router_z_loss_mlp": 0.31640625, "step": 1857, "time_per_iteration": 2.6773715019226074 }, { "auxiliary_loss_clip": 0.0669369, "auxiliary_loss_mlp": 0.01299624, "balance_loss_clip": 0.0632579, "balance_loss_mlp": 0.01262669, "epoch": 0.11170900345708702, "flos": 15638171679360.0, "grad_norm": 4.004205228890528, "language_loss": 0.85606116, "learning_rate": 3.930373863283608e-06, "loss": 0.93599427, "num_input_tokens_seen": 40213975, "router_z_loss_clip": 3.68164062, "router_z_loss_mlp": 0.36962891, "step": 1858, "time_per_iteration": 2.63914155960083 }, { "auxiliary_loss_clip": 0.06689073, "auxiliary_loss_mlp": 0.01291899, "balance_loss_clip": 0.06327327, "balance_loss_mlp": 0.01257424, "epoch": 0.111769126709755, "flos": 23046105697920.0, "grad_norm": 2.285369596870992, "language_loss": 0.9381876, "learning_rate": 3.930271958674866e-06, "loss": 1.01799726, "num_input_tokens_seen": 40233905, "router_z_loss_clip": 3.6171875, "router_z_loss_mlp": 0.34448242, "step": 1859, "time_per_iteration": 2.621091365814209 }, { "auxiliary_loss_clip": 0.06697309, "auxiliary_loss_mlp": 0.01299086, "balance_loss_clip": 0.06324266, "balance_loss_mlp": 0.01260009, "epoch": 0.11182924996242297, "flos": 20857091708160.0, "grad_norm": 2.696354977257055, "language_loss": 0.8367632, "learning_rate": 3.930169980870018e-06, "loss": 0.91672719, "num_input_tokens_seen": 40252810, "router_z_loss_clip": 3.73046875, "router_z_loss_mlp": 0.390625, "step": 1860, "time_per_iteration": 2.62812876701355 }, { "auxiliary_loss_clip": 0.06697439, "auxiliary_loss_mlp": 0.01298718, "balance_loss_clip": 0.06342375, "balance_loss_mlp": 0.0126522, "epoch": 0.11188937321509093, "flos": 17460763764480.0, "grad_norm": 1.8292427358358572, "language_loss": 0.7669518, "learning_rate": 3.930067929872931e-06, "loss": 0.84691334, "num_input_tokens_seen": 40272000, "router_z_loss_clip": 3.54882812, "router_z_loss_mlp": 0.33496094, "step": 1861, "time_per_iteration": 2.605734348297119 }, { "auxiliary_loss_clip": 0.06679627, "auxiliary_loss_mlp": 0.01293226, "balance_loss_clip": 0.06326197, "balance_loss_mlp": 0.01259895, "epoch": 0.11194949646775891, "flos": 24102507248640.0, "grad_norm": 2.083839532519778, "language_loss": 0.9022817, "learning_rate": 3.929965805687474e-06, "loss": 0.98201025, "num_input_tokens_seen": 40290660, "router_z_loss_clip": 3.53320312, "router_z_loss_mlp": 0.33349609, "step": 1862, "time_per_iteration": 2.6270318031311035 }, { "auxiliary_loss_clip": 0.06682491, "auxiliary_loss_mlp": 0.01298067, "balance_loss_clip": 0.06322115, "balance_loss_mlp": 0.0126414, "epoch": 0.11200961972042688, "flos": 25160627808000.0, "grad_norm": 2.1924480125041352, "language_loss": 0.88175023, "learning_rate": 3.92986360831752e-06, "loss": 0.96155584, "num_input_tokens_seen": 40307820, "router_z_loss_clip": 3.6015625, "router_z_loss_mlp": 0.33911133, "step": 1863, "time_per_iteration": 2.618379831314087 }, { "auxiliary_loss_clip": 0.06689321, "auxiliary_loss_mlp": 0.01295557, "balance_loss_clip": 0.06335796, "balance_loss_mlp": 0.01259937, "epoch": 0.11206974297309484, "flos": 21294735183360.0, "grad_norm": 2.2082745704943094, "language_loss": 0.65419668, "learning_rate": 3.929761337766945e-06, "loss": 0.73404545, "num_input_tokens_seen": 40327430, "router_z_loss_clip": 3.53320312, "router_z_loss_mlp": 0.35620117, "step": 1864, "time_per_iteration": 2.6115169525146484 }, { "auxiliary_loss_clip": 0.06697607, "auxiliary_loss_mlp": 0.01299959, "balance_loss_clip": 0.06337062, "balance_loss_mlp": 0.01266652, "epoch": 0.11212986622576282, "flos": 18921881335680.0, "grad_norm": 2.1528154085406057, "language_loss": 0.75858945, "learning_rate": 3.929658994039627e-06, "loss": 0.83856511, "num_input_tokens_seen": 40344545, "router_z_loss_clip": 3.60351562, "router_z_loss_mlp": 0.33325195, "step": 1865, "time_per_iteration": 2.5943052768707275 }, { "auxiliary_loss_clip": 0.06707559, "auxiliary_loss_mlp": 0.0130784, "balance_loss_clip": 0.06338301, "balance_loss_mlp": 0.01268883, "epoch": 0.11218998947843078, "flos": 22061344988160.0, "grad_norm": 2.5654098677631794, "language_loss": 0.86796927, "learning_rate": 3.929556577139446e-06, "loss": 0.94812334, "num_input_tokens_seen": 40362300, "router_z_loss_clip": 3.69726562, "router_z_loss_mlp": 0.38964844, "step": 1866, "time_per_iteration": 2.6080706119537354 }, { "auxiliary_loss_clip": 0.06695735, "auxiliary_loss_mlp": 0.01298442, "balance_loss_clip": 0.06334243, "balance_loss_mlp": 0.01262036, "epoch": 0.11225011273109875, "flos": 24578612547840.0, "grad_norm": 2.128970478515064, "language_loss": 0.82326984, "learning_rate": 3.929454087070286e-06, "loss": 0.90321159, "num_input_tokens_seen": 40384720, "router_z_loss_clip": 3.61523438, "router_z_loss_mlp": 0.36401367, "step": 1867, "time_per_iteration": 2.6828408241271973 }, { "auxiliary_loss_clip": 0.0669997, "auxiliary_loss_mlp": 0.01297169, "balance_loss_clip": 0.06341106, "balance_loss_mlp": 0.01263171, "epoch": 0.11231023598376672, "flos": 28446140327040.0, "grad_norm": 4.522382805548379, "language_loss": 0.87932336, "learning_rate": 3.929351523836035e-06, "loss": 0.95929468, "num_input_tokens_seen": 40404000, "router_z_loss_clip": 3.58398438, "router_z_loss_mlp": 0.34008789, "step": 1868, "time_per_iteration": 2.669480085372925 }, { "auxiliary_loss_clip": 0.06688584, "auxiliary_loss_mlp": 0.01288869, "balance_loss_clip": 0.06337132, "balance_loss_mlp": 0.01257326, "epoch": 0.1123703592364347, "flos": 14431318922880.0, "grad_norm": 4.0397435767444145, "language_loss": 0.69824159, "learning_rate": 3.9292488874405795e-06, "loss": 0.77801609, "num_input_tokens_seen": 40418665, "router_z_loss_clip": 3.51171875, "router_z_loss_mlp": 0.31494141, "step": 1869, "time_per_iteration": 2.558607578277588 }, { "auxiliary_loss_clip": 0.06709035, "auxiliary_loss_mlp": 0.01307575, "balance_loss_clip": 0.06343918, "balance_loss_mlp": 0.01270525, "epoch": 0.11243048248910266, "flos": 22242753077760.0, "grad_norm": 1.7926393234789486, "language_loss": 0.7903595, "learning_rate": 3.929146177887814e-06, "loss": 0.8705256, "num_input_tokens_seen": 40437870, "router_z_loss_clip": 3.6484375, "router_z_loss_mlp": 0.37060547, "step": 1870, "time_per_iteration": 2.6431384086608887 }, { "auxiliary_loss_clip": 0.0671682, "auxiliary_loss_mlp": 0.0129603, "balance_loss_clip": 0.06341697, "balance_loss_mlp": 0.01259099, "epoch": 0.11249060574177062, "flos": 18589435061760.0, "grad_norm": 2.525784838887822, "language_loss": 0.76996738, "learning_rate": 3.929043395181631e-06, "loss": 0.85009587, "num_input_tokens_seen": 40455570, "router_z_loss_clip": 3.75, "router_z_loss_mlp": 0.36914062, "step": 1871, "time_per_iteration": 2.5829198360443115 }, { "auxiliary_loss_clip": 0.06704703, "auxiliary_loss_mlp": 0.0129756, "balance_loss_clip": 0.06341989, "balance_loss_mlp": 0.01261821, "epoch": 0.1125507289944386, "flos": 22863146307840.0, "grad_norm": 2.042073501082535, "language_loss": 0.83773804, "learning_rate": 3.928940539325929e-06, "loss": 0.91776061, "num_input_tokens_seen": 40473600, "router_z_loss_clip": 3.625, "router_z_loss_mlp": 0.35766602, "step": 1872, "time_per_iteration": 2.6185102462768555 }, { "auxiliary_loss_clip": 0.06701942, "auxiliary_loss_mlp": 0.01305675, "balance_loss_clip": 0.06336774, "balance_loss_mlp": 0.01268816, "epoch": 0.11261085224710657, "flos": 19681447397760.0, "grad_norm": 2.3883608118786452, "language_loss": 0.84623766, "learning_rate": 3.9288376103246095e-06, "loss": 0.92631388, "num_input_tokens_seen": 40490025, "router_z_loss_clip": 3.6484375, "router_z_loss_mlp": 0.36865234, "step": 1873, "time_per_iteration": 4.0070812702178955 }, { "auxiliary_loss_clip": 0.06704355, "auxiliary_loss_mlp": 0.01292499, "balance_loss_clip": 0.06330845, "balance_loss_mlp": 0.01255378, "epoch": 0.11267097549977453, "flos": 26069680753920.0, "grad_norm": 1.963504979046527, "language_loss": 0.92591405, "learning_rate": 3.928734608181575e-06, "loss": 1.00588262, "num_input_tokens_seen": 40511580, "router_z_loss_clip": 3.734375, "router_z_loss_mlp": 0.37109375, "step": 1874, "time_per_iteration": 2.6709036827087402 }, { "auxiliary_loss_clip": 0.06692813, "auxiliary_loss_mlp": 0.01298053, "balance_loss_clip": 0.0633601, "balance_loss_mlp": 0.01260955, "epoch": 0.11273109875244251, "flos": 21074194437120.0, "grad_norm": 1.7019266247715938, "language_loss": 0.76221412, "learning_rate": 3.928631532900729e-06, "loss": 0.84212279, "num_input_tokens_seen": 40530155, "router_z_loss_clip": 3.56835938, "router_z_loss_mlp": 0.37084961, "step": 1875, "time_per_iteration": 4.096051454544067 }, { "auxiliary_loss_clip": 0.06671818, "auxiliary_loss_mlp": 0.01293174, "balance_loss_clip": 0.06319894, "balance_loss_mlp": 0.01260559, "epoch": 0.11279122200511048, "flos": 27096299377920.0, "grad_norm": 2.8724354562849843, "language_loss": 0.73217654, "learning_rate": 3.928528384485984e-06, "loss": 0.81182647, "num_input_tokens_seen": 40549500, "router_z_loss_clip": 3.51757812, "router_z_loss_mlp": 0.32617188, "step": 1876, "time_per_iteration": 2.6567742824554443 }, { "auxiliary_loss_clip": 0.06687267, "auxiliary_loss_mlp": 0.01292154, "balance_loss_clip": 0.06332663, "balance_loss_mlp": 0.01258728, "epoch": 0.11285134525777844, "flos": 20193163482240.0, "grad_norm": 2.2423823448427713, "language_loss": 0.78266239, "learning_rate": 3.9284251629412475e-06, "loss": 0.86245662, "num_input_tokens_seen": 40567475, "router_z_loss_clip": 3.54492188, "router_z_loss_mlp": 0.33422852, "step": 1877, "time_per_iteration": 4.021712303161621 }, { "auxiliary_loss_clip": 0.06679913, "auxiliary_loss_mlp": 0.01295914, "balance_loss_clip": 0.06323195, "balance_loss_mlp": 0.01258768, "epoch": 0.11291146851044641, "flos": 12463348803840.0, "grad_norm": 2.7593160071388763, "language_loss": 0.89898968, "learning_rate": 3.928321868270436e-06, "loss": 0.9787479, "num_input_tokens_seen": 40583280, "router_z_loss_clip": 3.5625, "router_z_loss_mlp": 0.37084961, "step": 1878, "time_per_iteration": 2.606019973754883 }, { "auxiliary_loss_clip": 0.0666862, "auxiliary_loss_mlp": 0.01295888, "balance_loss_clip": 0.06311584, "balance_loss_mlp": 0.01261722, "epoch": 0.11297159176311439, "flos": 23849164828800.0, "grad_norm": 2.3238600552100284, "language_loss": 0.82916653, "learning_rate": 3.928218500477466e-06, "loss": 0.90881169, "num_input_tokens_seen": 40603080, "router_z_loss_clip": 3.56445312, "router_z_loss_mlp": 0.34155273, "step": 1879, "time_per_iteration": 4.08495306968689 }, { "auxiliary_loss_clip": 0.06682587, "auxiliary_loss_mlp": 0.01292356, "balance_loss_clip": 0.06320714, "balance_loss_mlp": 0.01256307, "epoch": 0.11303171501578235, "flos": 29937585876480.0, "grad_norm": 3.1002113631093873, "language_loss": 0.71561062, "learning_rate": 3.928115059566259e-06, "loss": 0.79536009, "num_input_tokens_seen": 40623255, "router_z_loss_clip": 3.6171875, "router_z_loss_mlp": 0.36035156, "step": 1880, "time_per_iteration": 2.709467649459839 }, { "auxiliary_loss_clip": 0.06670148, "auxiliary_loss_mlp": 0.01288942, "balance_loss_clip": 0.06322654, "balance_loss_mlp": 0.01256302, "epoch": 0.11309183826845032, "flos": 16186169381760.0, "grad_norm": 1.8976778972001516, "language_loss": 0.73761874, "learning_rate": 3.928011545540734e-06, "loss": 0.8172096, "num_input_tokens_seen": 40641570, "router_z_loss_clip": 3.47851562, "router_z_loss_mlp": 0.32592773, "step": 1881, "time_per_iteration": 2.6311357021331787 }, { "auxiliary_loss_clip": 0.06680716, "auxiliary_loss_mlp": 0.01296766, "balance_loss_clip": 0.06314579, "balance_loss_mlp": 0.01259668, "epoch": 0.1131519615211183, "flos": 12025537620480.0, "grad_norm": 2.5717367291942916, "language_loss": 0.75514632, "learning_rate": 3.927907958404819e-06, "loss": 0.83492118, "num_input_tokens_seen": 40658775, "router_z_loss_clip": 3.6640625, "router_z_loss_mlp": 0.37084961, "step": 1882, "time_per_iteration": 2.6315016746520996 }, { "auxiliary_loss_clip": 0.06680475, "auxiliary_loss_mlp": 0.01296228, "balance_loss_clip": 0.06317085, "balance_loss_mlp": 0.01261729, "epoch": 0.11321208477378626, "flos": 26257335972480.0, "grad_norm": 2.256182273369387, "language_loss": 0.80541563, "learning_rate": 3.92780429816244e-06, "loss": 0.88518262, "num_input_tokens_seen": 40679555, "router_z_loss_clip": 3.6328125, "router_z_loss_mlp": 0.34521484, "step": 1883, "time_per_iteration": 2.63373064994812 }, { "auxiliary_loss_clip": 0.06667344, "auxiliary_loss_mlp": 0.01291964, "balance_loss_clip": 0.06308279, "balance_loss_mlp": 0.0125837, "epoch": 0.11327220802645423, "flos": 13631530101120.0, "grad_norm": 2.249330463874184, "language_loss": 0.78754115, "learning_rate": 3.927700564817529e-06, "loss": 0.86713427, "num_input_tokens_seen": 40697295, "router_z_loss_clip": 3.59375, "router_z_loss_mlp": 0.3359375, "step": 1884, "time_per_iteration": 2.5995190143585205 }, { "auxiliary_loss_clip": 0.06584897, "auxiliary_loss_mlp": 0.0128245, "balance_loss_clip": 0.06357123, "balance_loss_mlp": 0.01267358, "epoch": 0.1133323312791222, "flos": 57210582787200.0, "grad_norm": 0.7644713923982304, "language_loss": 0.55427796, "learning_rate": 3.927596758374019e-06, "loss": 0.63295138, "num_input_tokens_seen": 40758095, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.15063477, "step": 1885, "time_per_iteration": 3.1334331035614014 }, { "auxiliary_loss_clip": 0.06657837, "auxiliary_loss_mlp": 0.01295374, "balance_loss_clip": 0.06307373, "balance_loss_mlp": 0.01261686, "epoch": 0.11339245453179017, "flos": 24358407217920.0, "grad_norm": 2.080940802655039, "language_loss": 0.91937298, "learning_rate": 3.927492878835848e-06, "loss": 0.99890506, "num_input_tokens_seen": 40777140, "router_z_loss_clip": 3.50390625, "router_z_loss_mlp": 0.33666992, "step": 1886, "time_per_iteration": 2.633795738220215 }, { "auxiliary_loss_clip": 0.06665167, "auxiliary_loss_mlp": 0.0129579, "balance_loss_clip": 0.06313114, "balance_loss_mlp": 0.0126594, "epoch": 0.11345257778445814, "flos": 22676665046400.0, "grad_norm": 2.764457916463333, "language_loss": 0.87074465, "learning_rate": 3.927388926206953e-06, "loss": 0.95035422, "num_input_tokens_seen": 40797505, "router_z_loss_clip": 3.515625, "router_z_loss_mlp": 0.29858398, "step": 1887, "time_per_iteration": 2.618793487548828 }, { "auxiliary_loss_clip": 0.06657237, "auxiliary_loss_mlp": 0.01289137, "balance_loss_clip": 0.06305318, "balance_loss_mlp": 0.01256855, "epoch": 0.11351270103712612, "flos": 20993245793280.0, "grad_norm": 4.689649698099584, "language_loss": 0.78185391, "learning_rate": 3.927284900491277e-06, "loss": 0.86131775, "num_input_tokens_seen": 40812970, "router_z_loss_clip": 3.51953125, "router_z_loss_mlp": 0.32275391, "step": 1888, "time_per_iteration": 2.6333365440368652 }, { "auxiliary_loss_clip": 0.06671902, "auxiliary_loss_mlp": 0.01297865, "balance_loss_clip": 0.06310825, "balance_loss_mlp": 0.01262507, "epoch": 0.11357282428979408, "flos": 37358014152960.0, "grad_norm": 2.805803529962428, "language_loss": 0.69533169, "learning_rate": 3.927180801692764e-06, "loss": 0.77502942, "num_input_tokens_seen": 40837745, "router_z_loss_clip": 3.61132812, "router_z_loss_mlp": 0.35375977, "step": 1889, "time_per_iteration": 2.7670116424560547 }, { "auxiliary_loss_clip": 0.06664692, "auxiliary_loss_mlp": 0.01301842, "balance_loss_clip": 0.06309208, "balance_loss_mlp": 0.01268749, "epoch": 0.11363294754246205, "flos": 21762580855680.0, "grad_norm": 1.9782666285415855, "language_loss": 0.8522985, "learning_rate": 3.927076629815362e-06, "loss": 0.9319638, "num_input_tokens_seen": 40856490, "router_z_loss_clip": 3.55664062, "router_z_loss_mlp": 0.33105469, "step": 1890, "time_per_iteration": 2.6155149936676025 }, { "auxiliary_loss_clip": 0.06663813, "auxiliary_loss_mlp": 0.01299309, "balance_loss_clip": 0.06311534, "balance_loss_mlp": 0.01266932, "epoch": 0.11369307079513001, "flos": 22608252587520.0, "grad_norm": 2.348806934972491, "language_loss": 0.66017115, "learning_rate": 3.926972384863022e-06, "loss": 0.73980236, "num_input_tokens_seen": 40874070, "router_z_loss_clip": 3.51953125, "router_z_loss_mlp": 0.32373047, "step": 1891, "time_per_iteration": 2.616682767868042 }, { "auxiliary_loss_clip": 0.06680353, "auxiliary_loss_mlp": 0.01303948, "balance_loss_clip": 0.06314366, "balance_loss_mlp": 0.01269926, "epoch": 0.11375319404779799, "flos": 21950655344640.0, "grad_norm": 2.240980979121344, "language_loss": 0.89498806, "learning_rate": 3.9268680668396956e-06, "loss": 0.97483099, "num_input_tokens_seen": 40892425, "router_z_loss_clip": 3.65625, "router_z_loss_mlp": 0.34033203, "step": 1892, "time_per_iteration": 2.604227066040039 }, { "auxiliary_loss_clip": 0.06685763, "auxiliary_loss_mlp": 0.01303337, "balance_loss_clip": 0.06319462, "balance_loss_mlp": 0.01269911, "epoch": 0.11381331730046595, "flos": 26402588225280.0, "grad_norm": 2.5293448536088468, "language_loss": 0.74263728, "learning_rate": 3.926763675749339e-06, "loss": 0.82252824, "num_input_tokens_seen": 40912190, "router_z_loss_clip": 3.6640625, "router_z_loss_mlp": 0.33398438, "step": 1893, "time_per_iteration": 2.6589958667755127 }, { "auxiliary_loss_clip": 0.06672771, "auxiliary_loss_mlp": 0.01300807, "balance_loss_clip": 0.06317091, "balance_loss_mlp": 0.01266975, "epoch": 0.11387344055313392, "flos": 23811373837440.0, "grad_norm": 3.061346748059557, "language_loss": 0.8083666, "learning_rate": 3.92665921159591e-06, "loss": 0.88810241, "num_input_tokens_seen": 40928395, "router_z_loss_clip": 3.55664062, "router_z_loss_mlp": 0.33837891, "step": 1894, "time_per_iteration": 2.652711868286133 }, { "auxiliary_loss_clip": 0.06678164, "auxiliary_loss_mlp": 0.01305478, "balance_loss_clip": 0.06315798, "balance_loss_mlp": 0.01271503, "epoch": 0.1139335638058019, "flos": 34529865546240.0, "grad_norm": 4.199509874767895, "language_loss": 0.81443357, "learning_rate": 3.926554674383371e-06, "loss": 0.89427, "num_input_tokens_seen": 40946555, "router_z_loss_clip": 3.61914062, "router_z_loss_mlp": 0.33984375, "step": 1895, "time_per_iteration": 2.679929494857788 }, { "auxiliary_loss_clip": 0.06554746, "auxiliary_loss_mlp": 0.01263622, "balance_loss_clip": 0.06329311, "balance_loss_mlp": 0.01250151, "epoch": 0.11399368705846986, "flos": 70609790643840.0, "grad_norm": 0.7721730636723292, "language_loss": 0.6328578, "learning_rate": 3.926450064115686e-06, "loss": 0.71104145, "num_input_tokens_seen": 41004910, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.13500977, "step": 1896, "time_per_iteration": 3.3143398761749268 }, { "auxiliary_loss_clip": 0.06673442, "auxiliary_loss_mlp": 0.01296952, "balance_loss_clip": 0.06317714, "balance_loss_mlp": 0.01263597, "epoch": 0.11405381031113783, "flos": 21330597530880.0, "grad_norm": 1.8003774627472444, "language_loss": 0.8627857, "learning_rate": 3.926345380796821e-06, "loss": 0.94248962, "num_input_tokens_seen": 41026385, "router_z_loss_clip": 3.5546875, "router_z_loss_mlp": 0.33325195, "step": 1897, "time_per_iteration": 2.626641273498535 }, { "auxiliary_loss_clip": 0.06677877, "auxiliary_loss_mlp": 0.01292857, "balance_loss_clip": 0.06317862, "balance_loss_mlp": 0.01260718, "epoch": 0.11411393356380581, "flos": 19725820934400.0, "grad_norm": 3.2343595030755554, "language_loss": 0.81335056, "learning_rate": 3.9262406244307465e-06, "loss": 0.89305788, "num_input_tokens_seen": 41045315, "router_z_loss_clip": 3.59765625, "router_z_loss_mlp": 0.3215332, "step": 1898, "time_per_iteration": 2.6260228157043457 }, { "auxiliary_loss_clip": 0.06678617, "auxiliary_loss_mlp": 0.01290897, "balance_loss_clip": 0.06316775, "balance_loss_mlp": 0.01256183, "epoch": 0.11417405681647377, "flos": 17536261893120.0, "grad_norm": 2.2972488472662307, "language_loss": 0.74581569, "learning_rate": 3.926135795021435e-06, "loss": 0.82551086, "num_input_tokens_seen": 41063390, "router_z_loss_clip": 3.61914062, "router_z_loss_mlp": 0.34716797, "step": 1899, "time_per_iteration": 2.572300672531128 }, { "auxiliary_loss_clip": 0.06546521, "auxiliary_loss_mlp": 0.01276019, "balance_loss_clip": 0.06319572, "balance_loss_mlp": 0.01261857, "epoch": 0.11423418006914174, "flos": 59694168205440.0, "grad_norm": 0.8624229324870447, "language_loss": 0.63627207, "learning_rate": 3.92603089257286e-06, "loss": 0.71449745, "num_input_tokens_seen": 41124180, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.14172363, "step": 1900, "time_per_iteration": 3.205031394958496 }, { "auxiliary_loss_clip": 0.06657197, "auxiliary_loss_mlp": 0.0128721, "balance_loss_clip": 0.06309847, "balance_loss_mlp": 0.01256573, "epoch": 0.1142943033218097, "flos": 22969223976960.0, "grad_norm": 1.7817822660293858, "language_loss": 0.79792953, "learning_rate": 3.925925917089001e-06, "loss": 0.87737358, "num_input_tokens_seen": 41143485, "router_z_loss_clip": 3.47851562, "router_z_loss_mlp": 0.30615234, "step": 1901, "time_per_iteration": 2.5918800830841064 }, { "auxiliary_loss_clip": 0.06662698, "auxiliary_loss_mlp": 0.01295587, "balance_loss_clip": 0.06313328, "balance_loss_mlp": 0.01262709, "epoch": 0.11435442657447768, "flos": 18261558835200.0, "grad_norm": 1.9154047488197188, "language_loss": 0.85501117, "learning_rate": 3.925820868573839e-06, "loss": 0.93459404, "num_input_tokens_seen": 41161695, "router_z_loss_clip": 3.49023438, "router_z_loss_mlp": 0.32910156, "step": 1902, "time_per_iteration": 2.725536823272705 }, { "auxiliary_loss_clip": 0.06670421, "auxiliary_loss_mlp": 0.0128976, "balance_loss_clip": 0.0631306, "balance_loss_mlp": 0.0125848, "epoch": 0.11441454982714565, "flos": 24068070420480.0, "grad_norm": 1.7102944055891551, "language_loss": 0.79143399, "learning_rate": 3.925715747031356e-06, "loss": 0.87103581, "num_input_tokens_seen": 41181715, "router_z_loss_clip": 3.57226562, "router_z_loss_mlp": 0.31274414, "step": 1903, "time_per_iteration": 2.601917266845703 }, { "auxiliary_loss_clip": 0.06659578, "auxiliary_loss_mlp": 0.0128968, "balance_loss_clip": 0.06310965, "balance_loss_mlp": 0.01259878, "epoch": 0.11447467307981361, "flos": 25344719228160.0, "grad_norm": 1.9130338919765855, "language_loss": 0.77407873, "learning_rate": 3.925610552465539e-06, "loss": 0.8535713, "num_input_tokens_seen": 41201770, "router_z_loss_clip": 3.484375, "router_z_loss_mlp": 0.2980957, "step": 1904, "time_per_iteration": 2.640028715133667 }, { "auxiliary_loss_clip": 0.06658151, "auxiliary_loss_mlp": 0.0129771, "balance_loss_clip": 0.06309384, "balance_loss_mlp": 0.01262925, "epoch": 0.11453479633248159, "flos": 21732546366720.0, "grad_norm": 2.3653342409742573, "language_loss": 0.93336338, "learning_rate": 3.9255052848803764e-06, "loss": 1.01292205, "num_input_tokens_seen": 41220590, "router_z_loss_clip": 3.484375, "router_z_loss_mlp": 0.34790039, "step": 1905, "time_per_iteration": 2.5818891525268555 }, { "auxiliary_loss_clip": 0.06682967, "auxiliary_loss_mlp": 0.01293684, "balance_loss_clip": 0.06314029, "balance_loss_mlp": 0.01259685, "epoch": 0.11459491958514956, "flos": 12974771399040.0, "grad_norm": 3.338824541976454, "language_loss": 0.78051317, "learning_rate": 3.925399944279861e-06, "loss": 0.86027968, "num_input_tokens_seen": 41237250, "router_z_loss_clip": 3.68164062, "router_z_loss_mlp": 0.33984375, "step": 1906, "time_per_iteration": 2.680459499359131 }, { "auxiliary_loss_clip": 0.06663641, "auxiliary_loss_mlp": 0.01294932, "balance_loss_clip": 0.06314653, "balance_loss_mlp": 0.01261386, "epoch": 0.11465504283781752, "flos": 22717935982080.0, "grad_norm": 2.6397938675258543, "language_loss": 0.84726858, "learning_rate": 3.925294530667986e-06, "loss": 0.92685425, "num_input_tokens_seen": 41256680, "router_z_loss_clip": 3.48632812, "router_z_loss_mlp": 0.33569336, "step": 1907, "time_per_iteration": 2.6264963150024414 }, { "auxiliary_loss_clip": 0.06674697, "auxiliary_loss_mlp": 0.01299696, "balance_loss_clip": 0.06321891, "balance_loss_mlp": 0.01265435, "epoch": 0.1147151660904855, "flos": 23404142194560.0, "grad_norm": 2.2579380040853096, "language_loss": 0.86191094, "learning_rate": 3.92518904404875e-06, "loss": 0.9416548, "num_input_tokens_seen": 41270955, "router_z_loss_clip": 3.5234375, "router_z_loss_mlp": 0.3425293, "step": 1908, "time_per_iteration": 2.5953009128570557 }, { "auxiliary_loss_clip": 0.06582385, "auxiliary_loss_mlp": 0.01272712, "balance_loss_clip": 0.06354444, "balance_loss_mlp": 0.01259909, "epoch": 0.11477528934315347, "flos": 63028639036800.0, "grad_norm": 0.8826469866415451, "language_loss": 0.60865176, "learning_rate": 3.925083484426153e-06, "loss": 0.68720275, "num_input_tokens_seen": 41319180, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.12817383, "step": 1909, "time_per_iteration": 2.9700307846069336 }, { "auxiliary_loss_clip": 0.06666069, "auxiliary_loss_mlp": 0.01300916, "balance_loss_clip": 0.06314292, "balance_loss_mlp": 0.01267656, "epoch": 0.11483541259582143, "flos": 16331086218240.0, "grad_norm": 13.418082416290645, "language_loss": 0.80721229, "learning_rate": 3.924977851804197e-06, "loss": 0.88688207, "num_input_tokens_seen": 41337480, "router_z_loss_clip": 3.515625, "router_z_loss_mlp": 0.33251953, "step": 1910, "time_per_iteration": 2.6018080711364746 }, { "auxiliary_loss_clip": 0.06672288, "auxiliary_loss_mlp": 0.01295681, "balance_loss_clip": 0.0631741, "balance_loss_mlp": 0.01261873, "epoch": 0.1148955358484894, "flos": 21586916770560.0, "grad_norm": 2.5453374723382853, "language_loss": 0.78526646, "learning_rate": 3.9248721461868875e-06, "loss": 0.86494613, "num_input_tokens_seen": 41354650, "router_z_loss_clip": 3.54882812, "router_z_loss_mlp": 0.33789062, "step": 1911, "time_per_iteration": 2.5862114429473877 }, { "auxiliary_loss_clip": 0.06657763, "auxiliary_loss_mlp": 0.0130318, "balance_loss_clip": 0.06318952, "balance_loss_mlp": 0.01271423, "epoch": 0.11495565910115738, "flos": 27681249530880.0, "grad_norm": 1.6630952465230024, "language_loss": 0.80810463, "learning_rate": 3.9247663675782336e-06, "loss": 0.88771403, "num_input_tokens_seen": 41376935, "router_z_loss_clip": 3.38671875, "router_z_loss_mlp": 0.31738281, "step": 1912, "time_per_iteration": 2.657280683517456 }, { "auxiliary_loss_clip": 0.06667089, "auxiliary_loss_mlp": 0.01297495, "balance_loss_clip": 0.06318444, "balance_loss_mlp": 0.01265546, "epoch": 0.11501578235382534, "flos": 20638815022080.0, "grad_norm": 2.555961080194125, "language_loss": 0.79567146, "learning_rate": 3.924660515982246e-06, "loss": 0.87531728, "num_input_tokens_seen": 41396105, "router_z_loss_clip": 3.484375, "router_z_loss_mlp": 0.3190918, "step": 1913, "time_per_iteration": 3.9935431480407715 }, { "auxiliary_loss_clip": 0.06667054, "auxiliary_loss_mlp": 0.01295911, "balance_loss_clip": 0.06317306, "balance_loss_mlp": 0.01263271, "epoch": 0.1150759056064933, "flos": 19835252766720.0, "grad_norm": 3.0938457332384997, "language_loss": 0.72313976, "learning_rate": 3.924554591402939e-06, "loss": 0.80276942, "num_input_tokens_seen": 41415600, "router_z_loss_clip": 3.5, "router_z_loss_mlp": 0.32617188, "step": 1914, "time_per_iteration": 2.620129108428955 }, { "auxiliary_loss_clip": 0.06579944, "auxiliary_loss_mlp": 0.01283036, "balance_loss_clip": 0.06350713, "balance_loss_mlp": 0.01269529, "epoch": 0.11513602885916129, "flos": 70068543194880.0, "grad_norm": 0.7399782525144987, "language_loss": 0.6110236, "learning_rate": 3.92444859384433e-06, "loss": 0.6896534, "num_input_tokens_seen": 41478760, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.13537598, "step": 1915, "time_per_iteration": 4.730472087860107 }, { "auxiliary_loss_clip": 0.06667674, "auxiliary_loss_mlp": 0.01300912, "balance_loss_clip": 0.06319308, "balance_loss_mlp": 0.01268749, "epoch": 0.11519615211182925, "flos": 15747100387200.0, "grad_norm": 2.1497154368664173, "language_loss": 0.9483459, "learning_rate": 3.924342523310436e-06, "loss": 1.02803171, "num_input_tokens_seen": 41495720, "router_z_loss_clip": 3.48242188, "router_z_loss_mlp": 0.32177734, "step": 1916, "time_per_iteration": 2.609708309173584 }, { "auxiliary_loss_clip": 0.06669152, "auxiliary_loss_mlp": 0.01296576, "balance_loss_clip": 0.06316996, "balance_loss_mlp": 0.01262243, "epoch": 0.11525627536449722, "flos": 20673880755840.0, "grad_norm": 1.8513132231203941, "language_loss": 0.73861837, "learning_rate": 3.9242363798052806e-06, "loss": 0.81827569, "num_input_tokens_seen": 41513585, "router_z_loss_clip": 3.51953125, "router_z_loss_mlp": 0.34326172, "step": 1917, "time_per_iteration": 4.071179389953613 }, { "auxiliary_loss_clip": 0.06652237, "auxiliary_loss_mlp": 0.01298412, "balance_loss_clip": 0.06312522, "balance_loss_mlp": 0.01266321, "epoch": 0.1153163986171652, "flos": 20309555203200.0, "grad_norm": 2.538841133496323, "language_loss": 0.76346982, "learning_rate": 3.92413016333289e-06, "loss": 0.84297639, "num_input_tokens_seen": 41533390, "router_z_loss_clip": 3.39453125, "router_z_loss_mlp": 0.32080078, "step": 1918, "time_per_iteration": 2.638519763946533 }, { "auxiliary_loss_clip": 0.06659237, "auxiliary_loss_mlp": 0.01295123, "balance_loss_clip": 0.06308634, "balance_loss_mlp": 0.01263413, "epoch": 0.11537652186983316, "flos": 17645064819840.0, "grad_norm": 2.3561677011704467, "language_loss": 0.8827911, "learning_rate": 3.92402387389729e-06, "loss": 0.96233469, "num_input_tokens_seen": 41551015, "router_z_loss_clip": 3.5078125, "router_z_loss_mlp": 0.31713867, "step": 1919, "time_per_iteration": 4.009744167327881 }, { "auxiliary_loss_clip": 0.06652683, "auxiliary_loss_mlp": 0.01297311, "balance_loss_clip": 0.06310599, "balance_loss_mlp": 0.01266984, "epoch": 0.11543664512250112, "flos": 21075787664640.0, "grad_norm": 2.6735800084227335, "language_loss": 0.87543523, "learning_rate": 3.923917511502512e-06, "loss": 0.95493519, "num_input_tokens_seen": 41568055, "router_z_loss_clip": 3.41992188, "router_z_loss_mlp": 0.30322266, "step": 1920, "time_per_iteration": 2.661965847015381 }, { "auxiliary_loss_clip": 0.06644671, "auxiliary_loss_mlp": 0.01296146, "balance_loss_clip": 0.06307144, "balance_loss_mlp": 0.01265223, "epoch": 0.11549676837516909, "flos": 22754175672960.0, "grad_norm": 1.9968392196187934, "language_loss": 0.81708211, "learning_rate": 3.923811076152589e-06, "loss": 0.89649034, "num_input_tokens_seen": 41587435, "router_z_loss_clip": 3.375, "router_z_loss_mlp": 0.30932617, "step": 1921, "time_per_iteration": 2.655803918838501 }, { "auxiliary_loss_clip": 0.06670271, "auxiliary_loss_mlp": 0.01297503, "balance_loss_clip": 0.06313515, "balance_loss_mlp": 0.01264458, "epoch": 0.11555689162783707, "flos": 19174510995840.0, "grad_norm": 3.3068793624710544, "language_loss": 0.79778731, "learning_rate": 3.923704567851557e-06, "loss": 0.87746507, "num_input_tokens_seen": 41604975, "router_z_loss_clip": 3.57226562, "router_z_loss_mlp": 0.33056641, "step": 1922, "time_per_iteration": 2.621760845184326 }, { "auxiliary_loss_clip": 0.06659418, "auxiliary_loss_mlp": 0.01301112, "balance_loss_clip": 0.06312384, "balance_loss_mlp": 0.01270332, "epoch": 0.11561701488050503, "flos": 24579031818240.0, "grad_norm": 2.122514704949017, "language_loss": 0.86054385, "learning_rate": 3.923597986603456e-06, "loss": 0.94014913, "num_input_tokens_seen": 41626155, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.30786133, "step": 1923, "time_per_iteration": 2.683898687362671 }, { "auxiliary_loss_clip": 0.06653705, "auxiliary_loss_mlp": 0.01286807, "balance_loss_clip": 0.06309462, "balance_loss_mlp": 0.01257458, "epoch": 0.115677138133173, "flos": 17098283001600.0, "grad_norm": 1.957579581730531, "language_loss": 0.82567763, "learning_rate": 3.9234913324123264e-06, "loss": 0.90508276, "num_input_tokens_seen": 41644805, "router_z_loss_clip": 3.44335938, "router_z_loss_mlp": 0.29370117, "step": 1924, "time_per_iteration": 2.6235733032226562 }, { "auxiliary_loss_clip": 0.06560338, "auxiliary_loss_mlp": 0.01273935, "balance_loss_clip": 0.06333525, "balance_loss_mlp": 0.01262372, "epoch": 0.11573726138584098, "flos": 62724032317440.0, "grad_norm": 0.8040426070599355, "language_loss": 0.60984486, "learning_rate": 3.923384605282212e-06, "loss": 0.6881876, "num_input_tokens_seen": 41709345, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.11547852, "step": 1925, "time_per_iteration": 3.260688304901123 }, { "auxiliary_loss_clip": 0.06662598, "auxiliary_loss_mlp": 0.01303409, "balance_loss_clip": 0.06318849, "balance_loss_mlp": 0.012722, "epoch": 0.11579738463850894, "flos": 22607665608960.0, "grad_norm": 1.7579374326416841, "language_loss": 0.76574051, "learning_rate": 3.923277805217161e-06, "loss": 0.84540057, "num_input_tokens_seen": 41730210, "router_z_loss_clip": 3.44140625, "router_z_loss_mlp": 0.31225586, "step": 1926, "time_per_iteration": 2.6425979137420654 }, { "auxiliary_loss_clip": 0.06667764, "auxiliary_loss_mlp": 0.01289494, "balance_loss_clip": 0.06310713, "balance_loss_mlp": 0.01254542, "epoch": 0.11585750789117691, "flos": 21732630220800.0, "grad_norm": 3.1302074472870243, "language_loss": 0.74479765, "learning_rate": 3.923170932221222e-06, "loss": 0.82437027, "num_input_tokens_seen": 41750270, "router_z_loss_clip": 3.57421875, "router_z_loss_mlp": 0.34936523, "step": 1927, "time_per_iteration": 2.6240248680114746 }, { "auxiliary_loss_clip": 0.06663428, "auxiliary_loss_mlp": 0.01294992, "balance_loss_clip": 0.06314497, "balance_loss_mlp": 0.01261542, "epoch": 0.11591763114384489, "flos": 26294917328640.0, "grad_norm": 2.9169288919842677, "language_loss": 0.88299632, "learning_rate": 3.92306398629845e-06, "loss": 0.96258056, "num_input_tokens_seen": 41772975, "router_z_loss_clip": 3.49023438, "router_z_loss_mlp": 0.3347168, "step": 1928, "time_per_iteration": 2.6960220336914062 }, { "auxiliary_loss_clip": 0.06667801, "auxiliary_loss_mlp": 0.01294283, "balance_loss_clip": 0.0631565, "balance_loss_mlp": 0.01261501, "epoch": 0.11597775439651285, "flos": 23006721479040.0, "grad_norm": 1.7919188192311781, "language_loss": 0.78701556, "learning_rate": 3.922956967452898e-06, "loss": 0.8666364, "num_input_tokens_seen": 41791765, "router_z_loss_clip": 3.52148438, "router_z_loss_mlp": 0.32788086, "step": 1929, "time_per_iteration": 2.5912296772003174 }, { "auxiliary_loss_clip": 0.06665514, "auxiliary_loss_mlp": 0.01290465, "balance_loss_clip": 0.06323067, "balance_loss_mlp": 0.01258875, "epoch": 0.11603787764918082, "flos": 31949845678080.0, "grad_norm": 2.105300082895556, "language_loss": 0.78904676, "learning_rate": 3.922849875688626e-06, "loss": 0.86860651, "num_input_tokens_seen": 41815615, "router_z_loss_clip": 3.421875, "router_z_loss_mlp": 0.31616211, "step": 1930, "time_per_iteration": 2.703596830368042 }, { "auxiliary_loss_clip": 0.06660958, "auxiliary_loss_mlp": 0.01286843, "balance_loss_clip": 0.06319465, "balance_loss_mlp": 0.01256612, "epoch": 0.1160980009018488, "flos": 22277944592640.0, "grad_norm": 1.9554567128070628, "language_loss": 0.7329163, "learning_rate": 3.922742711009693e-06, "loss": 0.81239426, "num_input_tokens_seen": 41834810, "router_z_loss_clip": 3.41601562, "router_z_loss_mlp": 0.30249023, "step": 1931, "time_per_iteration": 2.603456735610962 }, { "auxiliary_loss_clip": 0.06667371, "auxiliary_loss_mlp": 0.01291651, "balance_loss_clip": 0.06322567, "balance_loss_mlp": 0.01259989, "epoch": 0.11615812415451676, "flos": 22790205728640.0, "grad_norm": 1.686760519968094, "language_loss": 0.83407509, "learning_rate": 3.922635473420164e-06, "loss": 0.91366529, "num_input_tokens_seen": 41854975, "router_z_loss_clip": 3.44921875, "router_z_loss_mlp": 0.31689453, "step": 1932, "time_per_iteration": 2.6439192295074463 }, { "auxiliary_loss_clip": 0.06551108, "auxiliary_loss_mlp": 0.0128464, "balance_loss_clip": 0.06325561, "balance_loss_mlp": 0.01272749, "epoch": 0.11621824740718473, "flos": 67165483438080.0, "grad_norm": 0.7699476260896971, "language_loss": 0.61285573, "learning_rate": 3.922528162924105e-06, "loss": 0.69121319, "num_input_tokens_seen": 41911105, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.11883545, "step": 1933, "time_per_iteration": 3.0986878871917725 }, { "auxiliary_loss_clip": 0.06656185, "auxiliary_loss_mlp": 0.012926, "balance_loss_clip": 0.06308815, "balance_loss_mlp": 0.01261307, "epoch": 0.11627837065985269, "flos": 20382160366080.0, "grad_norm": 2.954329665979815, "language_loss": 0.88217968, "learning_rate": 3.922420779525586e-06, "loss": 0.96166754, "num_input_tokens_seen": 41931750, "router_z_loss_clip": 3.47460938, "router_z_loss_mlp": 0.31286621, "step": 1934, "time_per_iteration": 2.614062547683716 }, { "auxiliary_loss_clip": 0.0667129, "auxiliary_loss_mlp": 0.01300219, "balance_loss_clip": 0.06314799, "balance_loss_mlp": 0.01265624, "epoch": 0.11633849391252067, "flos": 21732252877440.0, "grad_norm": 3.2885031118075, "language_loss": 0.68833745, "learning_rate": 3.9223133232286776e-06, "loss": 0.76805258, "num_input_tokens_seen": 41949400, "router_z_loss_clip": 3.5625, "router_z_loss_mlp": 0.34619141, "step": 1935, "time_per_iteration": 2.602053165435791 }, { "auxiliary_loss_clip": 0.06658518, "auxiliary_loss_mlp": 0.01297045, "balance_loss_clip": 0.06312076, "balance_loss_mlp": 0.01266336, "epoch": 0.11639861716518864, "flos": 18811023984000.0, "grad_norm": 2.2326287090554486, "language_loss": 0.77001047, "learning_rate": 3.922205794037456e-06, "loss": 0.84956616, "num_input_tokens_seen": 41968100, "router_z_loss_clip": 3.46289062, "router_z_loss_mlp": 0.30712891, "step": 1936, "time_per_iteration": 2.5709667205810547 }, { "auxiliary_loss_clip": 0.06660575, "auxiliary_loss_mlp": 0.01298907, "balance_loss_clip": 0.06312922, "balance_loss_mlp": 0.01266172, "epoch": 0.1164587404178566, "flos": 21221333406720.0, "grad_norm": 1.8909247357880565, "language_loss": 0.85711944, "learning_rate": 3.922098191955998e-06, "loss": 0.93671417, "num_input_tokens_seen": 41986375, "router_z_loss_clip": 3.47460938, "router_z_loss_mlp": 0.32739258, "step": 1937, "time_per_iteration": 2.5864510536193848 }, { "auxiliary_loss_clip": 0.06655846, "auxiliary_loss_mlp": 0.01300426, "balance_loss_clip": 0.0631671, "balance_loss_mlp": 0.01270624, "epoch": 0.11651886367052458, "flos": 27826040586240.0, "grad_norm": 2.923564951669825, "language_loss": 0.77645111, "learning_rate": 3.921990516988384e-06, "loss": 0.85601389, "num_input_tokens_seen": 42006055, "router_z_loss_clip": 3.390625, "router_z_loss_mlp": 0.2980957, "step": 1938, "time_per_iteration": 2.6376659870147705 }, { "auxiliary_loss_clip": 0.06668936, "auxiliary_loss_mlp": 0.01300287, "balance_loss_clip": 0.06321251, "balance_loss_mlp": 0.01269352, "epoch": 0.11657898692319255, "flos": 22895570638080.0, "grad_norm": 5.054082860883598, "language_loss": 0.80892301, "learning_rate": 3.921882769138696e-06, "loss": 0.88861525, "num_input_tokens_seen": 42024995, "router_z_loss_clip": 3.4765625, "router_z_loss_mlp": 0.30944824, "step": 1939, "time_per_iteration": 2.6128103733062744 }, { "auxiliary_loss_clip": 0.06668065, "auxiliary_loss_mlp": 0.01306836, "balance_loss_clip": 0.06325598, "balance_loss_mlp": 0.01278202, "epoch": 0.11663911017586051, "flos": 24322712578560.0, "grad_norm": 2.523986699951151, "language_loss": 0.87833107, "learning_rate": 3.9217749484110215e-06, "loss": 0.95808005, "num_input_tokens_seen": 42042640, "router_z_loss_clip": 3.421875, "router_z_loss_mlp": 0.28601074, "step": 1940, "time_per_iteration": 2.6430504322052 }, { "auxiliary_loss_clip": 0.06661282, "auxiliary_loss_mlp": 0.01297394, "balance_loss_clip": 0.06327221, "balance_loss_mlp": 0.01268331, "epoch": 0.11669923342852849, "flos": 42350020525440.0, "grad_norm": 1.8089309290833089, "language_loss": 0.77219993, "learning_rate": 3.921667054809449e-06, "loss": 0.85178661, "num_input_tokens_seen": 42067005, "router_z_loss_clip": 3.34179688, "router_z_loss_mlp": 0.29089355, "step": 1941, "time_per_iteration": 2.7708358764648438 }, { "auxiliary_loss_clip": 0.06670938, "auxiliary_loss_mlp": 0.01319577, "balance_loss_clip": 0.06329534, "balance_loss_mlp": 0.01290276, "epoch": 0.11675935668119646, "flos": 14646660716160.0, "grad_norm": 2.2574483501751983, "language_loss": 0.89964455, "learning_rate": 3.921559088338068e-06, "loss": 0.97954971, "num_input_tokens_seen": 42082295, "router_z_loss_clip": 3.4140625, "router_z_loss_mlp": 0.29296875, "step": 1942, "time_per_iteration": 2.5657546520233154 }, { "auxiliary_loss_clip": 0.06663041, "auxiliary_loss_mlp": 0.01311866, "balance_loss_clip": 0.06323043, "balance_loss_mlp": 0.01282946, "epoch": 0.11681947993386442, "flos": 35125213605120.0, "grad_norm": 1.8104114347299471, "language_loss": 0.69362277, "learning_rate": 3.921451049000975e-06, "loss": 0.77337182, "num_input_tokens_seen": 42105295, "router_z_loss_clip": 3.40039062, "router_z_loss_mlp": 0.2890625, "step": 1943, "time_per_iteration": 2.7447032928466797 }, { "auxiliary_loss_clip": 0.06656276, "auxiliary_loss_mlp": 0.01298604, "balance_loss_clip": 0.06318842, "balance_loss_mlp": 0.01267229, "epoch": 0.11687960318653239, "flos": 38992531749120.0, "grad_norm": 2.9310505785274974, "language_loss": 0.71238691, "learning_rate": 3.921342936802265e-06, "loss": 0.79193568, "num_input_tokens_seen": 42125520, "router_z_loss_clip": 3.37109375, "router_z_loss_mlp": 0.31396484, "step": 1944, "time_per_iteration": 2.7777347564697266 }, { "auxiliary_loss_clip": 0.0666577, "auxiliary_loss_mlp": 0.0129079, "balance_loss_clip": 0.06329659, "balance_loss_mlp": 0.01263134, "epoch": 0.11693972643920036, "flos": 26002190689920.0, "grad_norm": 1.9999769232611049, "language_loss": 0.84392244, "learning_rate": 3.921234751746038e-06, "loss": 0.92348802, "num_input_tokens_seen": 42146335, "router_z_loss_clip": 3.36328125, "router_z_loss_mlp": 0.27685547, "step": 1945, "time_per_iteration": 2.7090139389038086 }, { "auxiliary_loss_clip": 0.06670263, "auxiliary_loss_mlp": 0.01289529, "balance_loss_clip": 0.06332859, "balance_loss_mlp": 0.01261658, "epoch": 0.11699984969186833, "flos": 27279552257280.0, "grad_norm": 2.327814942646584, "language_loss": 0.78000045, "learning_rate": 3.9211264938363975e-06, "loss": 0.8595984, "num_input_tokens_seen": 42165320, "router_z_loss_clip": 3.375, "router_z_loss_mlp": 0.27880859, "step": 1946, "time_per_iteration": 2.772411584854126 }, { "auxiliary_loss_clip": 0.0666942, "auxiliary_loss_mlp": 0.01289031, "balance_loss_clip": 0.06330158, "balance_loss_mlp": 0.01259646, "epoch": 0.1170599729445363, "flos": 15273217221120.0, "grad_norm": 2.3004186244370617, "language_loss": 0.70638961, "learning_rate": 3.921018163077448e-06, "loss": 0.78597414, "num_input_tokens_seen": 42182955, "router_z_loss_clip": 3.39257812, "router_z_loss_mlp": 0.29406738, "step": 1947, "time_per_iteration": 2.6184568405151367 }, { "auxiliary_loss_clip": 0.06672447, "auxiliary_loss_mlp": 0.01296738, "balance_loss_clip": 0.06327086, "balance_loss_mlp": 0.01264694, "epoch": 0.11712009619720427, "flos": 17170007696640.0, "grad_norm": 1.7249341069757549, "language_loss": 0.86615121, "learning_rate": 3.920909759473295e-06, "loss": 0.94584298, "num_input_tokens_seen": 42200760, "router_z_loss_clip": 3.453125, "router_z_loss_mlp": 0.32080078, "step": 1948, "time_per_iteration": 2.674774646759033 }, { "auxiliary_loss_clip": 0.06548826, "auxiliary_loss_mlp": 0.01299465, "balance_loss_clip": 0.06317288, "balance_loss_mlp": 0.0128653, "epoch": 0.11718021944987224, "flos": 70961076887040.0, "grad_norm": 0.8128837287683558, "language_loss": 0.65257621, "learning_rate": 3.920801283028054e-06, "loss": 0.73105913, "num_input_tokens_seen": 42265745, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.1295166, "step": 1949, "time_per_iteration": 3.45168399810791 }, { "auxiliary_loss_clip": 0.06662367, "auxiliary_loss_mlp": 0.01289772, "balance_loss_clip": 0.06328895, "balance_loss_mlp": 0.01262115, "epoch": 0.1172403427025402, "flos": 27460750711680.0, "grad_norm": 1.6362224024380025, "language_loss": 0.73293662, "learning_rate": 3.920692733745835e-06, "loss": 0.81245798, "num_input_tokens_seen": 42286245, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.27685547, "step": 1950, "time_per_iteration": 2.747299909591675 }, { "auxiliary_loss_clip": 0.0666894, "auxiliary_loss_mlp": 0.01287999, "balance_loss_clip": 0.06323231, "balance_loss_mlp": 0.01258173, "epoch": 0.11730046595520818, "flos": 15674075953920.0, "grad_norm": 2.014491399463593, "language_loss": 0.77301168, "learning_rate": 3.920584111630755e-06, "loss": 0.85258114, "num_input_tokens_seen": 42302710, "router_z_loss_clip": 3.4609375, "router_z_loss_mlp": 0.29821777, "step": 1951, "time_per_iteration": 2.7623751163482666 }, { "auxiliary_loss_clip": 0.06658565, "auxiliary_loss_mlp": 0.01285172, "balance_loss_clip": 0.06313654, "balance_loss_mlp": 0.01255369, "epoch": 0.11736058920787615, "flos": 25637320085760.0, "grad_norm": 2.0399510497097304, "language_loss": 0.77926004, "learning_rate": 3.9204754166869325e-06, "loss": 0.85869741, "num_input_tokens_seen": 42324115, "router_z_loss_clip": 3.44921875, "router_z_loss_mlp": 0.29833984, "step": 1952, "time_per_iteration": 4.121968030929565 }, { "auxiliary_loss_clip": 0.0666765, "auxiliary_loss_mlp": 0.01292196, "balance_loss_clip": 0.06320512, "balance_loss_mlp": 0.01261297, "epoch": 0.11742071246054411, "flos": 21440742122880.0, "grad_norm": 2.110852154787703, "language_loss": 0.73418462, "learning_rate": 3.920366648918491e-06, "loss": 0.81378305, "num_input_tokens_seen": 42342505, "router_z_loss_clip": 3.47070312, "router_z_loss_mlp": 0.30932617, "step": 1953, "time_per_iteration": 2.688979148864746 }, { "auxiliary_loss_clip": 0.06678875, "auxiliary_loss_mlp": 0.01295942, "balance_loss_clip": 0.06324254, "balance_loss_mlp": 0.01262277, "epoch": 0.11748083571321208, "flos": 16003377699840.0, "grad_norm": 2.629359200834231, "language_loss": 0.81258243, "learning_rate": 3.920257808329552e-06, "loss": 0.89233059, "num_input_tokens_seen": 42360525, "router_z_loss_clip": 3.546875, "router_z_loss_mlp": 0.33642578, "step": 1954, "time_per_iteration": 4.076863050460815 }, { "auxiliary_loss_clip": 0.06664394, "auxiliary_loss_mlp": 0.01293493, "balance_loss_clip": 0.06315328, "balance_loss_mlp": 0.01258326, "epoch": 0.11754095896588006, "flos": 16185582403200.0, "grad_norm": 1.9767796632889651, "language_loss": 0.86900485, "learning_rate": 3.920148894924246e-06, "loss": 0.94858366, "num_input_tokens_seen": 42377045, "router_z_loss_clip": 3.4921875, "router_z_loss_mlp": 0.35131836, "step": 1955, "time_per_iteration": 2.5734145641326904 }, { "auxiliary_loss_clip": 0.06662707, "auxiliary_loss_mlp": 0.01290445, "balance_loss_clip": 0.06316449, "balance_loss_mlp": 0.01258759, "epoch": 0.11760108221854802, "flos": 13266701424000.0, "grad_norm": 2.3158222709727787, "language_loss": 0.79224956, "learning_rate": 3.920039908706701e-06, "loss": 0.87178105, "num_input_tokens_seen": 42393960, "router_z_loss_clip": 3.46484375, "router_z_loss_mlp": 0.31665039, "step": 1956, "time_per_iteration": 4.0664122104644775 }, { "auxiliary_loss_clip": 0.06654395, "auxiliary_loss_mlp": 0.0129344, "balance_loss_clip": 0.06315206, "balance_loss_mlp": 0.01262923, "epoch": 0.11766120547121599, "flos": 24505294625280.0, "grad_norm": 4.085151581701321, "language_loss": 0.82418418, "learning_rate": 3.91993084968105e-06, "loss": 0.9036625, "num_input_tokens_seen": 42413160, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.30493164, "step": 1957, "time_per_iteration": 2.653226137161255 }, { "auxiliary_loss_clip": 0.06663565, "auxiliary_loss_mlp": 0.01299142, "balance_loss_clip": 0.06316181, "balance_loss_mlp": 0.01266097, "epoch": 0.11772132872388397, "flos": 17789562385920.0, "grad_norm": 2.6745926339974146, "language_loss": 0.79669452, "learning_rate": 3.919821717851428e-06, "loss": 0.87632161, "num_input_tokens_seen": 42432590, "router_z_loss_clip": 3.47070312, "router_z_loss_mlp": 0.33032227, "step": 1958, "time_per_iteration": 4.050562143325806 }, { "auxiliary_loss_clip": 0.06672384, "auxiliary_loss_mlp": 0.01298447, "balance_loss_clip": 0.06322809, "balance_loss_mlp": 0.01263972, "epoch": 0.11778145197655193, "flos": 13220776586880.0, "grad_norm": 1.9715676135672744, "language_loss": 0.78538543, "learning_rate": 3.919712513221976e-06, "loss": 0.86509377, "num_input_tokens_seen": 42450135, "router_z_loss_clip": 3.49804688, "router_z_loss_mlp": 0.3449707, "step": 1959, "time_per_iteration": 2.569636344909668 }, { "auxiliary_loss_clip": 0.06669998, "auxiliary_loss_mlp": 0.01290547, "balance_loss_clip": 0.06323093, "balance_loss_mlp": 0.01259004, "epoch": 0.1178415752292199, "flos": 20236446915840.0, "grad_norm": 2.4982621807307663, "language_loss": 0.71612489, "learning_rate": 3.919603235796832e-06, "loss": 0.79573035, "num_input_tokens_seen": 42470050, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.31494141, "step": 1960, "time_per_iteration": 2.66517972946167 }, { "auxiliary_loss_clip": 0.06678104, "auxiliary_loss_mlp": 0.01297364, "balance_loss_clip": 0.06325482, "balance_loss_mlp": 0.01262722, "epoch": 0.11790169848188788, "flos": 13044777085440.0, "grad_norm": 2.8762968968906026, "language_loss": 0.83000976, "learning_rate": 3.9194938855801406e-06, "loss": 0.90976441, "num_input_tokens_seen": 42484335, "router_z_loss_clip": 3.52539062, "router_z_loss_mlp": 0.34643555, "step": 1961, "time_per_iteration": 2.6083531379699707 }, { "auxiliary_loss_clip": 0.06655965, "auxiliary_loss_mlp": 0.01294027, "balance_loss_clip": 0.06319997, "balance_loss_mlp": 0.0126351, "epoch": 0.11796182173455584, "flos": 22271026631040.0, "grad_norm": 1.865938373186966, "language_loss": 0.93145943, "learning_rate": 3.919384462576049e-06, "loss": 1.01095939, "num_input_tokens_seen": 42502720, "router_z_loss_clip": 3.36132812, "router_z_loss_mlp": 0.30517578, "step": 1962, "time_per_iteration": 2.6784181594848633 }, { "auxiliary_loss_clip": 0.06664149, "auxiliary_loss_mlp": 0.0129715, "balance_loss_clip": 0.06317696, "balance_loss_mlp": 0.0126494, "epoch": 0.1180219449872238, "flos": 10639750469760.0, "grad_norm": 2.173752322547373, "language_loss": 0.89760542, "learning_rate": 3.919274966788707e-06, "loss": 0.97721845, "num_input_tokens_seen": 42519460, "router_z_loss_clip": 3.46679688, "router_z_loss_mlp": 0.32177734, "step": 1963, "time_per_iteration": 2.643177032470703 }, { "auxiliary_loss_clip": 0.06676782, "auxiliary_loss_mlp": 0.01299697, "balance_loss_clip": 0.06322226, "balance_loss_mlp": 0.01265698, "epoch": 0.11808206823989177, "flos": 20929906506240.0, "grad_norm": 2.2781769741241087, "language_loss": 0.85024601, "learning_rate": 3.919165398222265e-06, "loss": 0.9300108, "num_input_tokens_seen": 42539420, "router_z_loss_clip": 3.54296875, "router_z_loss_mlp": 0.34008789, "step": 1964, "time_per_iteration": 2.7101893424987793 }, { "auxiliary_loss_clip": 0.06671144, "auxiliary_loss_mlp": 0.01292122, "balance_loss_clip": 0.06328788, "balance_loss_mlp": 0.01260532, "epoch": 0.11814219149255975, "flos": 20784151128960.0, "grad_norm": 2.29837604242661, "language_loss": 0.84130979, "learning_rate": 3.919055756880879e-06, "loss": 0.92094243, "num_input_tokens_seen": 42558225, "router_z_loss_clip": 3.421875, "router_z_loss_mlp": 0.31591797, "step": 1965, "time_per_iteration": 2.624079704284668 }, { "auxiliary_loss_clip": 0.06671554, "auxiliary_loss_mlp": 0.01298898, "balance_loss_clip": 0.06319286, "balance_loss_mlp": 0.01261633, "epoch": 0.11820231474522772, "flos": 48770594357760.0, "grad_norm": 1.9774220467813268, "language_loss": 0.75649196, "learning_rate": 3.918946042768707e-06, "loss": 0.83619648, "num_input_tokens_seen": 42580790, "router_z_loss_clip": 3.5234375, "router_z_loss_mlp": 0.37255859, "step": 1966, "time_per_iteration": 2.9230074882507324 }, { "auxiliary_loss_clip": 0.06671228, "auxiliary_loss_mlp": 0.01298607, "balance_loss_clip": 0.06324338, "balance_loss_mlp": 0.01264919, "epoch": 0.11826243799789568, "flos": 16696166457600.0, "grad_norm": 2.356966514769353, "language_loss": 0.74954194, "learning_rate": 3.918836255889908e-06, "loss": 0.82924032, "num_input_tokens_seen": 42597355, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.3371582, "step": 1967, "time_per_iteration": 2.704519271850586 }, { "auxiliary_loss_clip": 0.0667581, "auxiliary_loss_mlp": 0.0129802, "balance_loss_clip": 0.06321435, "balance_loss_mlp": 0.01264164, "epoch": 0.11832256125056366, "flos": 16915533246720.0, "grad_norm": 2.184959350684013, "language_loss": 0.90192902, "learning_rate": 3.9187263962486456e-06, "loss": 0.98166728, "num_input_tokens_seen": 42616060, "router_z_loss_clip": 3.54296875, "router_z_loss_mlp": 0.33886719, "step": 1968, "time_per_iteration": 2.628969430923462 }, { "auxiliary_loss_clip": 0.06678899, "auxiliary_loss_mlp": 0.01289004, "balance_loss_clip": 0.06331887, "balance_loss_mlp": 0.01259034, "epoch": 0.11838268450323162, "flos": 22827032398080.0, "grad_norm": 2.0686097785429483, "language_loss": 0.68483168, "learning_rate": 3.918616463849087e-06, "loss": 0.76451063, "num_input_tokens_seen": 42636285, "router_z_loss_clip": 3.46679688, "router_z_loss_mlp": 0.29980469, "step": 1969, "time_per_iteration": 2.651386260986328 }, { "auxiliary_loss_clip": 0.06669046, "auxiliary_loss_mlp": 0.01300492, "balance_loss_clip": 0.06322227, "balance_loss_mlp": 0.01266994, "epoch": 0.11844280775589959, "flos": 33554035296000.0, "grad_norm": 1.8987461412168165, "language_loss": 0.82406372, "learning_rate": 3.918506458695399e-06, "loss": 0.90375906, "num_input_tokens_seen": 42658320, "router_z_loss_clip": 3.46484375, "router_z_loss_mlp": 0.33520508, "step": 1970, "time_per_iteration": 2.767200469970703 }, { "auxiliary_loss_clip": 0.06550057, "auxiliary_loss_mlp": 0.01264661, "balance_loss_clip": 0.06325336, "balance_loss_mlp": 0.01252698, "epoch": 0.11850293100856757, "flos": 66371522474880.0, "grad_norm": 0.784734355836642, "language_loss": 0.66316283, "learning_rate": 3.918396380791754e-06, "loss": 0.74131, "num_input_tokens_seen": 42721500, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.11956787, "step": 1971, "time_per_iteration": 3.2080862522125244 }, { "auxiliary_loss_clip": 0.06692907, "auxiliary_loss_mlp": 0.01292006, "balance_loss_clip": 0.06339626, "balance_loss_mlp": 0.01258532, "epoch": 0.11856305426123553, "flos": 24687960526080.0, "grad_norm": 1.9898344806634025, "language_loss": 0.81438309, "learning_rate": 3.918286230142327e-06, "loss": 0.89423215, "num_input_tokens_seen": 42739825, "router_z_loss_clip": 3.53515625, "router_z_loss_mlp": 0.33496094, "step": 1972, "time_per_iteration": 2.681095838546753 }, { "auxiliary_loss_clip": 0.06671526, "auxiliary_loss_mlp": 0.01296047, "balance_loss_clip": 0.06327984, "balance_loss_mlp": 0.01263789, "epoch": 0.1186231775139035, "flos": 24287017939200.0, "grad_norm": 2.663653158390204, "language_loss": 0.74380159, "learning_rate": 3.918176006751292e-06, "loss": 0.82347733, "num_input_tokens_seen": 42758695, "router_z_loss_clip": 3.43359375, "router_z_loss_mlp": 0.32250977, "step": 1973, "time_per_iteration": 2.6248931884765625 }, { "auxiliary_loss_clip": 0.06673607, "auxiliary_loss_mlp": 0.01290344, "balance_loss_clip": 0.06328759, "balance_loss_mlp": 0.0125594, "epoch": 0.11868330076657148, "flos": 21763042053120.0, "grad_norm": 1.7266152138468815, "language_loss": 0.73322082, "learning_rate": 3.918065710622832e-06, "loss": 0.81286037, "num_input_tokens_seen": 42778510, "router_z_loss_clip": 3.44921875, "router_z_loss_mlp": 0.34399414, "step": 1974, "time_per_iteration": 2.629833459854126 }, { "auxiliary_loss_clip": 0.06675911, "auxiliary_loss_mlp": 0.01298991, "balance_loss_clip": 0.06325506, "balance_loss_mlp": 0.01265374, "epoch": 0.11874342401923944, "flos": 17197568490240.0, "grad_norm": 2.587089120832187, "language_loss": 0.79621851, "learning_rate": 3.917955341761128e-06, "loss": 0.8759675, "num_input_tokens_seen": 42793995, "router_z_loss_clip": 3.50390625, "router_z_loss_mlp": 0.33666992, "step": 1975, "time_per_iteration": 2.583198070526123 }, { "auxiliary_loss_clip": 0.06664468, "auxiliary_loss_mlp": 0.01290157, "balance_loss_clip": 0.06324387, "balance_loss_mlp": 0.0125865, "epoch": 0.11880354727190741, "flos": 15234629616000.0, "grad_norm": 2.18505094483664, "language_loss": 0.76893008, "learning_rate": 3.917844900170364e-06, "loss": 0.84847641, "num_input_tokens_seen": 42809000, "router_z_loss_clip": 3.40039062, "router_z_loss_mlp": 0.31481934, "step": 1976, "time_per_iteration": 2.604430675506592 }, { "auxiliary_loss_clip": 0.06681673, "auxiliary_loss_mlp": 0.01296559, "balance_loss_clip": 0.06330037, "balance_loss_mlp": 0.01262585, "epoch": 0.11886367052457537, "flos": 27317343248640.0, "grad_norm": 1.6089698553512903, "language_loss": 0.76356822, "learning_rate": 3.91773438585473e-06, "loss": 0.84335053, "num_input_tokens_seen": 42831585, "router_z_loss_clip": 3.51757812, "router_z_loss_mlp": 0.33984375, "step": 1977, "time_per_iteration": 2.6653900146484375 }, { "auxiliary_loss_clip": 0.06686049, "auxiliary_loss_mlp": 0.01291745, "balance_loss_clip": 0.06326656, "balance_loss_mlp": 0.01257437, "epoch": 0.11892379377724335, "flos": 21804648405120.0, "grad_norm": 3.7079965114577482, "language_loss": 0.75955701, "learning_rate": 3.9176237988184165e-06, "loss": 0.83933496, "num_input_tokens_seen": 42848420, "router_z_loss_clip": 3.59375, "router_z_loss_mlp": 0.34326172, "step": 1978, "time_per_iteration": 2.6067562103271484 }, { "auxiliary_loss_clip": 0.0666989, "auxiliary_loss_mlp": 0.01289454, "balance_loss_clip": 0.06322923, "balance_loss_mlp": 0.01257005, "epoch": 0.11898391702991132, "flos": 13996191070080.0, "grad_norm": 1.7459529819780002, "language_loss": 0.74619722, "learning_rate": 3.917513139065616e-06, "loss": 0.82579064, "num_input_tokens_seen": 42866645, "router_z_loss_clip": 3.47265625, "router_z_loss_mlp": 0.32495117, "step": 1979, "time_per_iteration": 2.585822343826294 }, { "auxiliary_loss_clip": 0.06673482, "auxiliary_loss_mlp": 0.01292077, "balance_loss_clip": 0.06326729, "balance_loss_mlp": 0.01259509, "epoch": 0.11904404028257928, "flos": 32242907733120.0, "grad_norm": 1.6880834964565588, "language_loss": 0.99836838, "learning_rate": 3.917402406600525e-06, "loss": 1.07802391, "num_input_tokens_seen": 42888515, "router_z_loss_clip": 3.46679688, "router_z_loss_mlp": 0.32568359, "step": 1980, "time_per_iteration": 2.6958765983581543 }, { "auxiliary_loss_clip": 0.06670927, "auxiliary_loss_mlp": 0.01290378, "balance_loss_clip": 0.06321254, "balance_loss_mlp": 0.01257167, "epoch": 0.11910416353524726, "flos": 23592971370240.0, "grad_norm": 1.7744681697219922, "language_loss": 0.87364393, "learning_rate": 3.917291601427342e-06, "loss": 0.95325696, "num_input_tokens_seen": 42909035, "router_z_loss_clip": 3.49609375, "router_z_loss_mlp": 0.33203125, "step": 1981, "time_per_iteration": 2.6188480854034424 }, { "auxiliary_loss_clip": 0.06668454, "auxiliary_loss_mlp": 0.01295252, "balance_loss_clip": 0.06320257, "balance_loss_mlp": 0.0126123, "epoch": 0.11916428678791523, "flos": 25339268712960.0, "grad_norm": 1.829027695625723, "language_loss": 0.87002993, "learning_rate": 3.91718072355027e-06, "loss": 0.94966698, "num_input_tokens_seen": 42927555, "router_z_loss_clip": 3.47851562, "router_z_loss_mlp": 0.34033203, "step": 1982, "time_per_iteration": 2.6864326000213623 }, { "auxiliary_loss_clip": 0.06661372, "auxiliary_loss_mlp": 0.01291519, "balance_loss_clip": 0.06316379, "balance_loss_mlp": 0.01260239, "epoch": 0.11922441004058319, "flos": 19793939904000.0, "grad_norm": 1.8657623537869674, "language_loss": 0.86596388, "learning_rate": 3.917069772973513e-06, "loss": 0.94549274, "num_input_tokens_seen": 42945300, "router_z_loss_clip": 3.44921875, "router_z_loss_mlp": 0.31311035, "step": 1983, "time_per_iteration": 2.654564619064331 }, { "auxiliary_loss_clip": 0.06688195, "auxiliary_loss_mlp": 0.01290285, "balance_loss_clip": 0.06327978, "balance_loss_mlp": 0.01256454, "epoch": 0.11928453329325117, "flos": 21541578912000.0, "grad_norm": 2.625433363490397, "language_loss": 0.77867281, "learning_rate": 3.916958749701277e-06, "loss": 0.85845757, "num_input_tokens_seen": 42961295, "router_z_loss_clip": 3.6015625, "router_z_loss_mlp": 0.33813477, "step": 1984, "time_per_iteration": 2.651768922805786 }, { "auxiliary_loss_clip": 0.06681857, "auxiliary_loss_mlp": 0.01297806, "balance_loss_clip": 0.06328661, "balance_loss_mlp": 0.01263474, "epoch": 0.11934465654591914, "flos": 20821522849920.0, "grad_norm": 2.215151767951213, "language_loss": 0.84551775, "learning_rate": 3.9168476537377745e-06, "loss": 0.92531443, "num_input_tokens_seen": 42980330, "router_z_loss_clip": 3.53125, "router_z_loss_mlp": 0.34301758, "step": 1985, "time_per_iteration": 2.620133876800537 }, { "auxiliary_loss_clip": 0.06664218, "auxiliary_loss_mlp": 0.01291439, "balance_loss_clip": 0.06319501, "balance_loss_mlp": 0.01259896, "epoch": 0.1194047797985871, "flos": 19066169266560.0, "grad_norm": 1.946619412474834, "language_loss": 0.75739169, "learning_rate": 3.916736485087216e-06, "loss": 0.83694828, "num_input_tokens_seen": 42996125, "router_z_loss_clip": 3.4453125, "router_z_loss_mlp": 0.31542969, "step": 1986, "time_per_iteration": 2.58420729637146 }, { "auxiliary_loss_clip": 0.06679335, "auxiliary_loss_mlp": 0.01296551, "balance_loss_clip": 0.06330016, "balance_loss_mlp": 0.01264388, "epoch": 0.11946490305125507, "flos": 27196842677760.0, "grad_norm": 2.048986932565308, "language_loss": 0.73895997, "learning_rate": 3.916625243753819e-06, "loss": 0.81871879, "num_input_tokens_seen": 43014180, "router_z_loss_clip": 3.49023438, "router_z_loss_mlp": 0.3215332, "step": 1987, "time_per_iteration": 2.6242306232452393 }, { "auxiliary_loss_clip": 0.06681274, "auxiliary_loss_mlp": 0.01296887, "balance_loss_clip": 0.06328075, "balance_loss_mlp": 0.0126048, "epoch": 0.11952502630392305, "flos": 21146925381120.0, "grad_norm": 2.027231224932873, "language_loss": 0.73175812, "learning_rate": 3.916513929741799e-06, "loss": 0.81153977, "num_input_tokens_seen": 43032120, "router_z_loss_clip": 3.53320312, "router_z_loss_mlp": 0.36425781, "step": 1988, "time_per_iteration": 2.6603357791900635 }, { "auxiliary_loss_clip": 0.06676263, "auxiliary_loss_mlp": 0.01300083, "balance_loss_clip": 0.06330296, "balance_loss_mlp": 0.01265775, "epoch": 0.11958514955659101, "flos": 22130260571520.0, "grad_norm": 2.2677006398543926, "language_loss": 0.82254291, "learning_rate": 3.91640254305538e-06, "loss": 0.90230638, "num_input_tokens_seen": 43052215, "router_z_loss_clip": 3.4609375, "router_z_loss_mlp": 0.34326172, "step": 1989, "time_per_iteration": 2.6058130264282227 }, { "auxiliary_loss_clip": 0.0668063, "auxiliary_loss_mlp": 0.013012, "balance_loss_clip": 0.06327389, "balance_loss_mlp": 0.01265818, "epoch": 0.11964527280925898, "flos": 17427333185280.0, "grad_norm": 3.5452026408787853, "language_loss": 0.77502626, "learning_rate": 3.916291083698784e-06, "loss": 0.85484457, "num_input_tokens_seen": 43069720, "router_z_loss_clip": 3.53125, "router_z_loss_mlp": 0.35400391, "step": 1990, "time_per_iteration": 2.6167078018188477 }, { "auxiliary_loss_clip": 0.06529604, "auxiliary_loss_mlp": 0.01268248, "balance_loss_clip": 0.06303058, "balance_loss_mlp": 0.01256524, "epoch": 0.11970539606192696, "flos": 70698804007680.0, "grad_norm": 0.8414873291481804, "language_loss": 0.55348802, "learning_rate": 3.916179551676238e-06, "loss": 0.63146651, "num_input_tokens_seen": 43123130, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.11712646, "step": 1991, "time_per_iteration": 3.2240688800811768 }, { "auxiliary_loss_clip": 0.06668018, "auxiliary_loss_mlp": 0.01291523, "balance_loss_clip": 0.06329835, "balance_loss_mlp": 0.01262149, "epoch": 0.11976551931459492, "flos": 21221375333760.0, "grad_norm": 2.2552688300825636, "language_loss": 0.78781033, "learning_rate": 3.916067946991971e-06, "loss": 0.86740571, "num_input_tokens_seen": 43140015, "router_z_loss_clip": 3.38085938, "router_z_loss_mlp": 0.29418945, "step": 1992, "time_per_iteration": 4.115568161010742 }, { "auxiliary_loss_clip": 0.06685202, "auxiliary_loss_mlp": 0.0129266, "balance_loss_clip": 0.06333624, "balance_loss_mlp": 0.01259543, "epoch": 0.11982564256726289, "flos": 25995566217600.0, "grad_norm": 2.976162129343215, "language_loss": 0.7998234, "learning_rate": 3.915956269650216e-06, "loss": 0.87960196, "num_input_tokens_seen": 43160105, "router_z_loss_clip": 3.515625, "router_z_loss_mlp": 0.33105469, "step": 1993, "time_per_iteration": 4.149216890335083 }, { "auxiliary_loss_clip": 0.06671647, "auxiliary_loss_mlp": 0.01296317, "balance_loss_clip": 0.06328364, "balance_loss_mlp": 0.01265442, "epoch": 0.11988576581993086, "flos": 21656964384000.0, "grad_norm": 2.797609405656881, "language_loss": 0.83724558, "learning_rate": 3.915844519655208e-06, "loss": 0.91692525, "num_input_tokens_seen": 43179835, "router_z_loss_clip": 3.43164062, "router_z_loss_mlp": 0.30847168, "step": 1994, "time_per_iteration": 2.7715189456939697 }, { "auxiliary_loss_clip": 0.06664415, "auxiliary_loss_mlp": 0.01290649, "balance_loss_clip": 0.06323615, "balance_loss_mlp": 0.01258999, "epoch": 0.11994588907259883, "flos": 17863048016640.0, "grad_norm": 2.419014028224383, "language_loss": 0.90825856, "learning_rate": 3.915732697011183e-06, "loss": 0.98780918, "num_input_tokens_seen": 43197210, "router_z_loss_clip": 3.40429688, "router_z_loss_mlp": 0.31652832, "step": 1995, "time_per_iteration": 2.701319932937622 }, { "auxiliary_loss_clip": 0.0667762, "auxiliary_loss_mlp": 0.01291326, "balance_loss_clip": 0.06329444, "balance_loss_mlp": 0.0125871, "epoch": 0.1200060123252668, "flos": 24469725767040.0, "grad_norm": 2.2905087479505335, "language_loss": 0.75889766, "learning_rate": 3.9156208017223825e-06, "loss": 0.83858711, "num_input_tokens_seen": 43215050, "router_z_loss_clip": 3.48046875, "router_z_loss_mlp": 0.32592773, "step": 1996, "time_per_iteration": 4.09159255027771 }, { "auxiliary_loss_clip": 0.06663934, "auxiliary_loss_mlp": 0.01295397, "balance_loss_clip": 0.06325096, "balance_loss_mlp": 0.01264021, "epoch": 0.12006613557793476, "flos": 18737831842560.0, "grad_norm": 1.866605430313135, "language_loss": 0.88864857, "learning_rate": 3.915508833793048e-06, "loss": 0.96824187, "num_input_tokens_seen": 43233900, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.31396484, "step": 1997, "time_per_iteration": 2.6010241508483887 }, { "auxiliary_loss_clip": 0.06665299, "auxiliary_loss_mlp": 0.01288032, "balance_loss_clip": 0.06324785, "balance_loss_mlp": 0.01257491, "epoch": 0.12012625883060274, "flos": 22273374545280.0, "grad_norm": 2.0636582815899995, "language_loss": 0.80143958, "learning_rate": 3.915396793227428e-06, "loss": 0.88097286, "num_input_tokens_seen": 43252105, "router_z_loss_clip": 3.40820312, "router_z_loss_mlp": 0.30517578, "step": 1998, "time_per_iteration": 4.068720102310181 }, { "auxiliary_loss_clip": 0.06646979, "auxiliary_loss_mlp": 0.01288392, "balance_loss_clip": 0.06313373, "balance_loss_mlp": 0.01257874, "epoch": 0.1201863820832707, "flos": 21764761061760.0, "grad_norm": 2.1242383901881117, "language_loss": 0.74449384, "learning_rate": 3.915284680029769e-06, "loss": 0.82384759, "num_input_tokens_seen": 43270315, "router_z_loss_clip": 3.33398438, "router_z_loss_mlp": 0.30517578, "step": 1999, "time_per_iteration": 2.611339807510376 }, { "auxiliary_loss_clip": 0.06664731, "auxiliary_loss_mlp": 0.01292093, "balance_loss_clip": 0.06317364, "balance_loss_mlp": 0.01261742, "epoch": 0.12024650533593867, "flos": 21914415653760.0, "grad_norm": 3.0193426529622722, "language_loss": 0.76335311, "learning_rate": 3.915172494204323e-06, "loss": 0.84292138, "num_input_tokens_seen": 43289935, "router_z_loss_clip": 3.47265625, "router_z_loss_mlp": 0.3034668, "step": 2000, "time_per_iteration": 2.6348862648010254 }, { "auxiliary_loss_clip": 0.06661794, "auxiliary_loss_mlp": 0.01286741, "balance_loss_clip": 0.06317423, "balance_loss_mlp": 0.01255675, "epoch": 0.12030662858860665, "flos": 21695635843200.0, "grad_norm": 1.7247365612951768, "language_loss": 0.86480206, "learning_rate": 3.915060235755344e-06, "loss": 0.94428742, "num_input_tokens_seen": 43309325, "router_z_loss_clip": 3.44335938, "router_z_loss_mlp": 0.31054688, "step": 2001, "time_per_iteration": 2.6052653789520264 }, { "auxiliary_loss_clip": 0.0666251, "auxiliary_loss_mlp": 0.01293735, "balance_loss_clip": 0.06320523, "balance_loss_mlp": 0.0126286, "epoch": 0.12036675184127461, "flos": 12938280145920.0, "grad_norm": 3.2999908929247845, "language_loss": 0.76396286, "learning_rate": 3.91494790468709e-06, "loss": 0.84352535, "num_input_tokens_seen": 43327010, "router_z_loss_clip": 3.41601562, "router_z_loss_mlp": 0.30883789, "step": 2002, "time_per_iteration": 2.5901994705200195 }, { "auxiliary_loss_clip": 0.066781, "auxiliary_loss_mlp": 0.0128888, "balance_loss_clip": 0.06324723, "balance_loss_mlp": 0.01257886, "epoch": 0.12042687509394258, "flos": 20857469051520.0, "grad_norm": 2.4074435649237085, "language_loss": 0.80273557, "learning_rate": 3.9148355010038185e-06, "loss": 0.88240534, "num_input_tokens_seen": 43345650, "router_z_loss_clip": 3.53515625, "router_z_loss_mlp": 0.31005859, "step": 2003, "time_per_iteration": 2.596489429473877 }, { "auxiliary_loss_clip": 0.06671172, "auxiliary_loss_mlp": 0.01291988, "balance_loss_clip": 0.06328445, "balance_loss_mlp": 0.01262043, "epoch": 0.12048699834661056, "flos": 23885320665600.0, "grad_norm": 1.8276528225808473, "language_loss": 0.73013288, "learning_rate": 3.914723024709793e-06, "loss": 0.80976444, "num_input_tokens_seen": 43365555, "router_z_loss_clip": 3.42578125, "router_z_loss_mlp": 0.29931641, "step": 2004, "time_per_iteration": 2.6301188468933105 }, { "auxiliary_loss_clip": 0.06672995, "auxiliary_loss_mlp": 0.01289306, "balance_loss_clip": 0.06323993, "balance_loss_mlp": 0.01257501, "epoch": 0.12054712159927852, "flos": 19762605676800.0, "grad_norm": 2.0025857022610296, "language_loss": 0.79908019, "learning_rate": 3.914610475809279e-06, "loss": 0.87870324, "num_input_tokens_seen": 43384990, "router_z_loss_clip": 3.4921875, "router_z_loss_mlp": 0.31811523, "step": 2005, "time_per_iteration": 2.6069774627685547 }, { "auxiliary_loss_clip": 0.06530743, "auxiliary_loss_mlp": 0.01281264, "balance_loss_clip": 0.06309827, "balance_loss_mlp": 0.01271012, "epoch": 0.12060724485194649, "flos": 51688999411200.0, "grad_norm": 0.8919646990823881, "language_loss": 0.58214134, "learning_rate": 3.914497854306543e-06, "loss": 0.66026145, "num_input_tokens_seen": 43436335, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.10253906, "step": 2006, "time_per_iteration": 3.030663013458252 }, { "auxiliary_loss_clip": 0.06652389, "auxiliary_loss_mlp": 0.01287027, "balance_loss_clip": 0.06315408, "balance_loss_mlp": 0.01256294, "epoch": 0.12066736810461445, "flos": 18996582850560.0, "grad_norm": 1.8753830220403427, "language_loss": 0.77871305, "learning_rate": 3.9143851602058575e-06, "loss": 0.85810721, "num_input_tokens_seen": 43456495, "router_z_loss_clip": 3.36914062, "router_z_loss_mlp": 0.30725098, "step": 2007, "time_per_iteration": 2.644420623779297 }, { "auxiliary_loss_clip": 0.06660695, "auxiliary_loss_mlp": 0.01291719, "balance_loss_clip": 0.0631538, "balance_loss_mlp": 0.01259437, "epoch": 0.12072749135728243, "flos": 16477554355200.0, "grad_norm": 4.3577174774792935, "language_loss": 0.85235661, "learning_rate": 3.914272393511494e-06, "loss": 0.93188083, "num_input_tokens_seen": 43473085, "router_z_loss_clip": 3.45117188, "router_z_loss_mlp": 0.32275391, "step": 2008, "time_per_iteration": 2.6049115657806396 }, { "auxiliary_loss_clip": 0.06665313, "auxiliary_loss_mlp": 0.01287415, "balance_loss_clip": 0.06320223, "balance_loss_mlp": 0.01258328, "epoch": 0.1207876146099504, "flos": 18082917930240.0, "grad_norm": 2.0149803657976353, "language_loss": 0.85010189, "learning_rate": 3.91415955422773e-06, "loss": 0.92962915, "num_input_tokens_seen": 43491135, "router_z_loss_clip": 3.45117188, "router_z_loss_mlp": 0.29101562, "step": 2009, "time_per_iteration": 2.6175761222839355 }, { "auxiliary_loss_clip": 0.06656942, "auxiliary_loss_mlp": 0.01292996, "balance_loss_clip": 0.06319468, "balance_loss_mlp": 0.01262526, "epoch": 0.12084773786261836, "flos": 21878008254720.0, "grad_norm": 2.1523582304301088, "language_loss": 0.85558176, "learning_rate": 3.914046642358844e-06, "loss": 0.93508112, "num_input_tokens_seen": 43510440, "router_z_loss_clip": 3.37304688, "router_z_loss_mlp": 0.30493164, "step": 2010, "time_per_iteration": 2.6464717388153076 }, { "auxiliary_loss_clip": 0.06670299, "auxiliary_loss_mlp": 0.01299259, "balance_loss_clip": 0.06323658, "balance_loss_mlp": 0.01266167, "epoch": 0.12090786111528634, "flos": 18338985607680.0, "grad_norm": 1.912663180725063, "language_loss": 0.8536253, "learning_rate": 3.9139336579091174e-06, "loss": 0.93332088, "num_input_tokens_seen": 43530145, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.33056641, "step": 2011, "time_per_iteration": 2.6002657413482666 }, { "auxiliary_loss_clip": 0.06676108, "auxiliary_loss_mlp": 0.01289377, "balance_loss_clip": 0.06327154, "balance_loss_mlp": 0.01257977, "epoch": 0.1209679843679543, "flos": 21112236990720.0, "grad_norm": 3.4063221265044232, "language_loss": 0.9780165, "learning_rate": 3.913820600882834e-06, "loss": 1.05767131, "num_input_tokens_seen": 43549315, "router_z_loss_clip": 3.49414062, "router_z_loss_mlp": 0.31420898, "step": 2012, "time_per_iteration": 2.630295515060425 }, { "auxiliary_loss_clip": 0.06653053, "auxiliary_loss_mlp": 0.0128872, "balance_loss_clip": 0.06316875, "balance_loss_mlp": 0.01258393, "epoch": 0.12102810762062227, "flos": 29248612479360.0, "grad_norm": 1.9854513399041707, "language_loss": 0.81873858, "learning_rate": 3.913707471284283e-06, "loss": 0.8981564, "num_input_tokens_seen": 43569240, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.30322266, "step": 2013, "time_per_iteration": 2.658069133758545 }, { "auxiliary_loss_clip": 0.0667921, "auxiliary_loss_mlp": 0.01293511, "balance_loss_clip": 0.06327724, "balance_loss_mlp": 0.0126142, "epoch": 0.12108823087329025, "flos": 17936407866240.0, "grad_norm": 3.0399817808400202, "language_loss": 0.78518796, "learning_rate": 3.9135942691177515e-06, "loss": 0.86491519, "num_input_tokens_seen": 43587710, "router_z_loss_clip": 3.50976562, "router_z_loss_mlp": 0.32055664, "step": 2014, "time_per_iteration": 2.69826602935791 }, { "auxiliary_loss_clip": 0.06668429, "auxiliary_loss_mlp": 0.01294162, "balance_loss_clip": 0.06327156, "balance_loss_mlp": 0.0126355, "epoch": 0.12114835412595822, "flos": 22098549000960.0, "grad_norm": 3.32259324955648, "language_loss": 0.88428509, "learning_rate": 3.913480994387535e-06, "loss": 0.96391094, "num_input_tokens_seen": 43606000, "router_z_loss_clip": 3.41015625, "router_z_loss_mlp": 0.30566406, "step": 2015, "time_per_iteration": 2.6266419887542725 }, { "auxiliary_loss_clip": 0.06665312, "auxiliary_loss_mlp": 0.01297892, "balance_loss_clip": 0.06326847, "balance_loss_mlp": 0.0126735, "epoch": 0.12120847737862618, "flos": 20418567765120.0, "grad_norm": 2.7188567638636125, "language_loss": 0.71118283, "learning_rate": 3.913367647097926e-06, "loss": 0.79081488, "num_input_tokens_seen": 43624815, "router_z_loss_clip": 3.38085938, "router_z_loss_mlp": 0.30541992, "step": 2016, "time_per_iteration": 2.6465635299682617 }, { "auxiliary_loss_clip": 0.06681353, "auxiliary_loss_mlp": 0.01300361, "balance_loss_clip": 0.06332939, "balance_loss_mlp": 0.01265743, "epoch": 0.12126860063129415, "flos": 22315484021760.0, "grad_norm": 2.4584805865110173, "language_loss": 0.82206166, "learning_rate": 3.913254227253225e-06, "loss": 0.90187877, "num_input_tokens_seen": 43643960, "router_z_loss_clip": 3.48632812, "router_z_loss_mlp": 0.34619141, "step": 2017, "time_per_iteration": 2.591280460357666 }, { "auxiliary_loss_clip": 0.0668739, "auxiliary_loss_mlp": 0.01300237, "balance_loss_clip": 0.06338536, "balance_loss_mlp": 0.0126898, "epoch": 0.12132872388396213, "flos": 13704428753280.0, "grad_norm": 3.3616696475697783, "language_loss": 0.70661056, "learning_rate": 3.913140734857731e-06, "loss": 0.7864868, "num_input_tokens_seen": 43662650, "router_z_loss_clip": 3.48632812, "router_z_loss_mlp": 0.3125, "step": 2018, "time_per_iteration": 2.614570140838623 }, { "auxiliary_loss_clip": 0.06672039, "auxiliary_loss_mlp": 0.01290628, "balance_loss_clip": 0.06332742, "balance_loss_mlp": 0.0126098, "epoch": 0.12138884713663009, "flos": 26473851722880.0, "grad_norm": 1.9267426743399785, "language_loss": 0.73371083, "learning_rate": 3.91302716991575e-06, "loss": 0.8133375, "num_input_tokens_seen": 43684205, "router_z_loss_clip": 3.39257812, "router_z_loss_mlp": 0.29650879, "step": 2019, "time_per_iteration": 2.6509287357330322 }, { "auxiliary_loss_clip": 0.06675591, "auxiliary_loss_mlp": 0.01300396, "balance_loss_clip": 0.06329313, "balance_loss_mlp": 0.01270928, "epoch": 0.12144897038929806, "flos": 26148952316160.0, "grad_norm": 1.944630818416307, "language_loss": 0.93376637, "learning_rate": 3.912913532431586e-06, "loss": 1.0135262, "num_input_tokens_seen": 43706320, "router_z_loss_clip": 3.46484375, "router_z_loss_mlp": 0.29492188, "step": 2020, "time_per_iteration": 2.6440539360046387 }, { "auxiliary_loss_clip": 0.0667241, "auxiliary_loss_mlp": 0.01298477, "balance_loss_clip": 0.06331992, "balance_loss_mlp": 0.01267364, "epoch": 0.12150909364196603, "flos": 24724451779200.0, "grad_norm": 2.706908048548949, "language_loss": 0.79275727, "learning_rate": 3.912799822409549e-06, "loss": 0.87246615, "num_input_tokens_seen": 43724805, "router_z_loss_clip": 3.40039062, "router_z_loss_mlp": 0.31103516, "step": 2021, "time_per_iteration": 2.6395931243896484 }, { "auxiliary_loss_clip": 0.06672388, "auxiliary_loss_mlp": 0.01301162, "balance_loss_clip": 0.06336221, "balance_loss_mlp": 0.01271276, "epoch": 0.121569216894634, "flos": 25193177919360.0, "grad_norm": 2.309400634060028, "language_loss": 0.82288653, "learning_rate": 3.912686039853952e-06, "loss": 0.90262198, "num_input_tokens_seen": 43742320, "router_z_loss_clip": 3.36132812, "router_z_loss_mlp": 0.29870605, "step": 2022, "time_per_iteration": 2.661166191101074 }, { "auxiliary_loss_clip": 0.06692475, "auxiliary_loss_mlp": 0.01295488, "balance_loss_clip": 0.0634577, "balance_loss_mlp": 0.01263754, "epoch": 0.12162934014730196, "flos": 13449241543680.0, "grad_norm": 2.149794629327988, "language_loss": 0.86572969, "learning_rate": 3.912572184769108e-06, "loss": 0.94560933, "num_input_tokens_seen": 43760665, "router_z_loss_clip": 3.46484375, "router_z_loss_mlp": 0.31738281, "step": 2023, "time_per_iteration": 2.5834767818450928 }, { "auxiliary_loss_clip": 0.0667965, "auxiliary_loss_mlp": 0.01297176, "balance_loss_clip": 0.0633366, "balance_loss_mlp": 0.01264394, "epoch": 0.12168946339996994, "flos": 16951772937600.0, "grad_norm": 3.0991522527298243, "language_loss": 0.87941372, "learning_rate": 3.912458257159335e-06, "loss": 0.95918202, "num_input_tokens_seen": 43779020, "router_z_loss_clip": 3.4609375, "router_z_loss_mlp": 0.32763672, "step": 2024, "time_per_iteration": 2.59972882270813 }, { "auxiliary_loss_clip": 0.06672604, "auxiliary_loss_mlp": 0.0129592, "balance_loss_clip": 0.06331176, "balance_loss_mlp": 0.01265843, "epoch": 0.12174958665263791, "flos": 29828699095680.0, "grad_norm": 2.052398430340737, "language_loss": 0.73275286, "learning_rate": 3.912344257028954e-06, "loss": 0.81243807, "num_input_tokens_seen": 43798850, "router_z_loss_clip": 3.4140625, "router_z_loss_mlp": 0.30090332, "step": 2025, "time_per_iteration": 2.6736183166503906 }, { "auxiliary_loss_clip": 0.06670412, "auxiliary_loss_mlp": 0.01293829, "balance_loss_clip": 0.06329495, "balance_loss_mlp": 0.01261488, "epoch": 0.12180970990530587, "flos": 24648366672000.0, "grad_norm": 2.2464787866506803, "language_loss": 0.76857626, "learning_rate": 3.912230184382286e-06, "loss": 0.84821868, "num_input_tokens_seen": 43820130, "router_z_loss_clip": 3.40820312, "router_z_loss_mlp": 0.3236084, "step": 2026, "time_per_iteration": 2.6678597927093506 }, { "auxiliary_loss_clip": 0.06671368, "auxiliary_loss_mlp": 0.01290776, "balance_loss_clip": 0.0633187, "balance_loss_mlp": 0.0126021, "epoch": 0.12186983315797385, "flos": 20527915743360.0, "grad_norm": 2.249147395527317, "language_loss": 0.8978951, "learning_rate": 3.912116039223659e-06, "loss": 0.97751659, "num_input_tokens_seen": 43838485, "router_z_loss_clip": 3.39453125, "router_z_loss_mlp": 0.30541992, "step": 2027, "time_per_iteration": 2.618662118911743 }, { "auxiliary_loss_clip": 0.06667203, "auxiliary_loss_mlp": 0.01290937, "balance_loss_clip": 0.06332479, "balance_loss_mlp": 0.01260801, "epoch": 0.12192995641064182, "flos": 27825705169920.0, "grad_norm": 2.0484507865229338, "language_loss": 0.7680217, "learning_rate": 3.912001821557399e-06, "loss": 0.84760308, "num_input_tokens_seen": 43859080, "router_z_loss_clip": 3.34375, "router_z_loss_mlp": 0.30126953, "step": 2028, "time_per_iteration": 2.714759111404419 }, { "auxiliary_loss_clip": 0.06673577, "auxiliary_loss_mlp": 0.01294258, "balance_loss_clip": 0.06333582, "balance_loss_mlp": 0.01264551, "epoch": 0.12199007966330978, "flos": 22023512069760.0, "grad_norm": 2.2327665035655104, "language_loss": 0.78570402, "learning_rate": 3.911887531387839e-06, "loss": 0.86538243, "num_input_tokens_seen": 43879030, "router_z_loss_clip": 3.39648438, "router_z_loss_mlp": 0.296875, "step": 2029, "time_per_iteration": 2.623775005340576 }, { "auxiliary_loss_clip": 0.0667366, "auxiliary_loss_mlp": 0.01287723, "balance_loss_clip": 0.06335774, "balance_loss_mlp": 0.01258636, "epoch": 0.12205020291597775, "flos": 23302005667200.0, "grad_norm": 1.7064313357588097, "language_loss": 0.80840766, "learning_rate": 3.911773168719313e-06, "loss": 0.88802147, "num_input_tokens_seen": 43898505, "router_z_loss_clip": 3.37890625, "router_z_loss_mlp": 0.29077148, "step": 2030, "time_per_iteration": 2.651017665863037 }, { "auxiliary_loss_clip": 0.06661309, "auxiliary_loss_mlp": 0.01291563, "balance_loss_clip": 0.06324676, "balance_loss_mlp": 0.01259519, "epoch": 0.12211032616864573, "flos": 26038849651200.0, "grad_norm": 2.6559404920734315, "language_loss": 0.75797635, "learning_rate": 3.911658733556155e-06, "loss": 0.8375051, "num_input_tokens_seen": 43917945, "router_z_loss_clip": 3.3671875, "router_z_loss_mlp": 0.32043457, "step": 2031, "time_per_iteration": 4.024958372116089 }, { "auxiliary_loss_clip": 0.06660353, "auxiliary_loss_mlp": 0.01291979, "balance_loss_clip": 0.06325457, "balance_loss_mlp": 0.0126139, "epoch": 0.12217044942131369, "flos": 20416932610560.0, "grad_norm": 2.6048658222113823, "language_loss": 0.76757002, "learning_rate": 3.911544225902707e-06, "loss": 0.84709334, "num_input_tokens_seen": 43937385, "router_z_loss_clip": 3.34960938, "router_z_loss_mlp": 0.3059082, "step": 2032, "time_per_iteration": 2.6739118099212646 }, { "auxiliary_loss_clip": 0.06648312, "auxiliary_loss_mlp": 0.01295727, "balance_loss_clip": 0.06320114, "balance_loss_mlp": 0.01264565, "epoch": 0.12223057267398166, "flos": 22863817140480.0, "grad_norm": 1.7387431788498307, "language_loss": 0.90320414, "learning_rate": 3.911429645763311e-06, "loss": 0.9826445, "num_input_tokens_seen": 43958130, "router_z_loss_clip": 3.28125, "router_z_loss_mlp": 0.3112793, "step": 2033, "time_per_iteration": 4.078023195266724 }, { "auxiliary_loss_clip": 0.06679579, "auxiliary_loss_mlp": 0.01294231, "balance_loss_clip": 0.063292, "balance_loss_mlp": 0.01261568, "epoch": 0.12229069592664964, "flos": 20053739088000.0, "grad_norm": 2.325355590099053, "language_loss": 0.67676264, "learning_rate": 3.911314993142311e-06, "loss": 0.75650078, "num_input_tokens_seen": 43976800, "router_z_loss_clip": 3.50195312, "router_z_loss_mlp": 0.32641602, "step": 2034, "time_per_iteration": 2.620124101638794 }, { "auxiliary_loss_clip": 0.06653701, "auxiliary_loss_mlp": 0.01293889, "balance_loss_clip": 0.06319837, "balance_loss_mlp": 0.01263729, "epoch": 0.1223508191793176, "flos": 22280963339520.0, "grad_norm": 1.774716977618959, "language_loss": 0.77978754, "learning_rate": 3.911200268044055e-06, "loss": 0.85926342, "num_input_tokens_seen": 43996620, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.30126953, "step": 2035, "time_per_iteration": 2.6203556060791016 }, { "auxiliary_loss_clip": 0.06673524, "auxiliary_loss_mlp": 0.01290324, "balance_loss_clip": 0.06324498, "balance_loss_mlp": 0.01258352, "epoch": 0.12241094243198557, "flos": 21292009925760.0, "grad_norm": 2.0636425086854735, "language_loss": 0.72482699, "learning_rate": 3.911085470472892e-06, "loss": 0.80446547, "num_input_tokens_seen": 44016175, "router_z_loss_clip": 3.49023438, "router_z_loss_mlp": 0.31982422, "step": 2036, "time_per_iteration": 4.135621547698975 }, { "auxiliary_loss_clip": 0.06664376, "auxiliary_loss_mlp": 0.01294227, "balance_loss_clip": 0.0632482, "balance_loss_mlp": 0.01262756, "epoch": 0.12247106568465355, "flos": 17387823185280.0, "grad_norm": 2.520646577018679, "language_loss": 0.84355676, "learning_rate": 3.910970600433178e-06, "loss": 0.92314285, "num_input_tokens_seen": 44035060, "router_z_loss_clip": 3.39453125, "router_z_loss_mlp": 0.31445312, "step": 2037, "time_per_iteration": 4.03408670425415 }, { "auxiliary_loss_clip": 0.0666662, "auxiliary_loss_mlp": 0.01303312, "balance_loss_clip": 0.06326273, "balance_loss_mlp": 0.01272532, "epoch": 0.12253118893732151, "flos": 27051548497920.0, "grad_norm": 3.5956101132940894, "language_loss": 0.81913072, "learning_rate": 3.910855657929267e-06, "loss": 0.89883006, "num_input_tokens_seen": 44053330, "router_z_loss_clip": 3.40234375, "router_z_loss_mlp": 0.30761719, "step": 2038, "time_per_iteration": 2.6685240268707275 }, { "auxiliary_loss_clip": 0.06557406, "auxiliary_loss_mlp": 0.01287558, "balance_loss_clip": 0.06342089, "balance_loss_mlp": 0.01275173, "epoch": 0.12259131218998948, "flos": 53878055328000.0, "grad_norm": 0.8129907030591378, "language_loss": 0.58555347, "learning_rate": 3.910740642965518e-06, "loss": 0.66400313, "num_input_tokens_seen": 44107575, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.12365723, "step": 2039, "time_per_iteration": 3.0810346603393555 }, { "auxiliary_loss_clip": 0.0666442, "auxiliary_loss_mlp": 0.01293663, "balance_loss_clip": 0.06325404, "balance_loss_mlp": 0.01262144, "epoch": 0.12265143544265744, "flos": 17897233282560.0, "grad_norm": 2.2942530933466747, "language_loss": 0.81562138, "learning_rate": 3.910625555546292e-06, "loss": 0.89520216, "num_input_tokens_seen": 44126075, "router_z_loss_clip": 3.38671875, "router_z_loss_mlp": 0.31518555, "step": 2040, "time_per_iteration": 2.697298288345337 }, { "auxiliary_loss_clip": 0.06654456, "auxiliary_loss_mlp": 0.01291009, "balance_loss_clip": 0.06320883, "balance_loss_mlp": 0.01261159, "epoch": 0.12271155869532542, "flos": 21806577048960.0, "grad_norm": 1.7469902859086581, "language_loss": 0.83938897, "learning_rate": 3.910510395675953e-06, "loss": 0.91884357, "num_input_tokens_seen": 44145605, "router_z_loss_clip": 3.33398438, "router_z_loss_mlp": 0.29858398, "step": 2041, "time_per_iteration": 2.612274169921875 }, { "auxiliary_loss_clip": 0.06672604, "auxiliary_loss_mlp": 0.01300915, "balance_loss_clip": 0.0632796, "balance_loss_mlp": 0.01268037, "epoch": 0.12277168194799339, "flos": 19834917350400.0, "grad_norm": 1.9532972986052892, "language_loss": 0.68475246, "learning_rate": 3.9103951633588694e-06, "loss": 0.76448768, "num_input_tokens_seen": 44164770, "router_z_loss_clip": 3.44726562, "router_z_loss_mlp": 0.32885742, "step": 2042, "time_per_iteration": 2.6499156951904297 }, { "auxiliary_loss_clip": 0.06657621, "auxiliary_loss_mlp": 0.0130227, "balance_loss_clip": 0.06322321, "balance_loss_mlp": 0.01269582, "epoch": 0.12283180520066135, "flos": 23227597641600.0, "grad_norm": 12.398879426596979, "language_loss": 0.82756287, "learning_rate": 3.910279858599409e-06, "loss": 0.90716171, "num_input_tokens_seen": 44184025, "router_z_loss_clip": 3.3515625, "router_z_loss_mlp": 0.32641602, "step": 2043, "time_per_iteration": 2.605039596557617 }, { "auxiliary_loss_clip": 0.06671244, "auxiliary_loss_mlp": 0.01294587, "balance_loss_clip": 0.06328465, "balance_loss_mlp": 0.01263378, "epoch": 0.12289192845332933, "flos": 18594466306560.0, "grad_norm": 2.1132451310192066, "language_loss": 0.8087815, "learning_rate": 3.910164481401946e-06, "loss": 0.88843983, "num_input_tokens_seen": 44202950, "router_z_loss_clip": 3.43164062, "router_z_loss_mlp": 0.31225586, "step": 2044, "time_per_iteration": 2.6182942390441895 }, { "auxiliary_loss_clip": 0.06667601, "auxiliary_loss_mlp": 0.01290248, "balance_loss_clip": 0.0632637, "balance_loss_mlp": 0.01258586, "epoch": 0.1229520517059973, "flos": 25775612449920.0, "grad_norm": 2.051590293614858, "language_loss": 0.78619283, "learning_rate": 3.910049031770853e-06, "loss": 0.86577129, "num_input_tokens_seen": 44221115, "router_z_loss_clip": 3.41601562, "router_z_loss_mlp": 0.31640625, "step": 2045, "time_per_iteration": 2.618065595626831 }, { "auxiliary_loss_clip": 0.06675293, "auxiliary_loss_mlp": 0.01290805, "balance_loss_clip": 0.06331405, "balance_loss_mlp": 0.01260407, "epoch": 0.12301217495866526, "flos": 20893541034240.0, "grad_norm": 2.0241757314581124, "language_loss": 0.69345963, "learning_rate": 3.90993350971051e-06, "loss": 0.77312064, "num_input_tokens_seen": 44240575, "router_z_loss_clip": 3.4375, "router_z_loss_mlp": 0.30371094, "step": 2046, "time_per_iteration": 2.6308672428131104 }, { "auxiliary_loss_clip": 0.0665939, "auxiliary_loss_mlp": 0.01293888, "balance_loss_clip": 0.06321438, "balance_loss_mlp": 0.01261749, "epoch": 0.12307229821133324, "flos": 22384735021440.0, "grad_norm": 2.300219377300525, "language_loss": 0.73829556, "learning_rate": 3.909817915225297e-06, "loss": 0.8178283, "num_input_tokens_seen": 44257145, "router_z_loss_clip": 3.37890625, "router_z_loss_mlp": 0.32141113, "step": 2047, "time_per_iteration": 2.615429162979126 }, { "auxiliary_loss_clip": 0.06665099, "auxiliary_loss_mlp": 0.0129169, "balance_loss_clip": 0.06324428, "balance_loss_mlp": 0.01259432, "epoch": 0.1231324214640012, "flos": 23374065778560.0, "grad_norm": 1.7609471075138148, "language_loss": 0.77640486, "learning_rate": 3.909702248319597e-06, "loss": 0.85597277, "num_input_tokens_seen": 44278035, "router_z_loss_clip": 3.40820312, "router_z_loss_mlp": 0.32299805, "step": 2048, "time_per_iteration": 2.680250406265259 }, { "auxiliary_loss_clip": 0.06657434, "auxiliary_loss_mlp": 0.01290948, "balance_loss_clip": 0.06322372, "balance_loss_mlp": 0.01261956, "epoch": 0.12319254471666917, "flos": 23773624773120.0, "grad_norm": 2.249590019502558, "language_loss": 0.86700356, "learning_rate": 3.909586508997797e-06, "loss": 0.94648737, "num_input_tokens_seen": 44296980, "router_z_loss_clip": 3.34765625, "router_z_loss_mlp": 0.2902832, "step": 2049, "time_per_iteration": 2.627065896987915 }, { "auxiliary_loss_clip": 0.06671897, "auxiliary_loss_mlp": 0.0128727, "balance_loss_clip": 0.06325816, "balance_loss_mlp": 0.01257467, "epoch": 0.12325266796933713, "flos": 23556899387520.0, "grad_norm": 1.7043717242685072, "language_loss": 0.76777178, "learning_rate": 3.909470697264285e-06, "loss": 0.84736347, "num_input_tokens_seen": 44318005, "router_z_loss_clip": 3.4609375, "router_z_loss_mlp": 0.29748535, "step": 2050, "time_per_iteration": 2.6526830196380615 }, { "auxiliary_loss_clip": 0.06663968, "auxiliary_loss_mlp": 0.01293063, "balance_loss_clip": 0.06322484, "balance_loss_mlp": 0.01261329, "epoch": 0.12331279122200511, "flos": 24430593110400.0, "grad_norm": 2.873895614511558, "language_loss": 0.82618481, "learning_rate": 3.909354813123452e-06, "loss": 0.9057551, "num_input_tokens_seen": 44335260, "router_z_loss_clip": 3.41210938, "router_z_loss_mlp": 0.31738281, "step": 2051, "time_per_iteration": 2.6293227672576904 }, { "auxiliary_loss_clip": 0.06662316, "auxiliary_loss_mlp": 0.01298494, "balance_loss_clip": 0.06323639, "balance_loss_mlp": 0.01268728, "epoch": 0.12337291447467308, "flos": 25491438927360.0, "grad_norm": 5.665138182682157, "language_loss": 0.80914605, "learning_rate": 3.909238856579693e-06, "loss": 0.88875413, "num_input_tokens_seen": 44355315, "router_z_loss_clip": 3.38476562, "router_z_loss_mlp": 0.29772949, "step": 2052, "time_per_iteration": 2.674184799194336 }, { "auxiliary_loss_clip": 0.06665488, "auxiliary_loss_mlp": 0.01294637, "balance_loss_clip": 0.0632115, "balance_loss_mlp": 0.01263094, "epoch": 0.12343303772734104, "flos": 23556731679360.0, "grad_norm": 3.8940436104755256, "language_loss": 0.75741881, "learning_rate": 3.909122827637406e-06, "loss": 0.83702004, "num_input_tokens_seen": 44373020, "router_z_loss_clip": 3.44140625, "router_z_loss_mlp": 0.31542969, "step": 2053, "time_per_iteration": 2.6848433017730713 }, { "auxiliary_loss_clip": 0.06660753, "auxiliary_loss_mlp": 0.01304874, "balance_loss_clip": 0.06316479, "balance_loss_mlp": 0.0127407, "epoch": 0.12349316098000902, "flos": 47567724670080.0, "grad_norm": 2.78305089732421, "language_loss": 0.75962061, "learning_rate": 3.909006726300991e-06, "loss": 0.83927691, "num_input_tokens_seen": 44397525, "router_z_loss_clip": 3.44726562, "router_z_loss_mlp": 0.30834961, "step": 2054, "time_per_iteration": 2.8707709312438965 }, { "auxiliary_loss_clip": 0.06659834, "auxiliary_loss_mlp": 0.01298535, "balance_loss_clip": 0.06324986, "balance_loss_mlp": 0.01269126, "epoch": 0.12355328423267699, "flos": 25052956911360.0, "grad_norm": 1.7233327287906415, "language_loss": 0.86490452, "learning_rate": 3.908890552574849e-06, "loss": 0.94448823, "num_input_tokens_seen": 44415890, "router_z_loss_clip": 3.34765625, "router_z_loss_mlp": 0.29431152, "step": 2055, "time_per_iteration": 2.616908073425293 }, { "auxiliary_loss_clip": 0.06670651, "auxiliary_loss_mlp": 0.0131359, "balance_loss_clip": 0.06332459, "balance_loss_mlp": 0.01284479, "epoch": 0.12361340748534495, "flos": 27716524899840.0, "grad_norm": 2.0853292974632507, "language_loss": 0.7935729, "learning_rate": 3.908774306463384e-06, "loss": 0.87341523, "num_input_tokens_seen": 44436625, "router_z_loss_clip": 3.37890625, "router_z_loss_mlp": 0.29125977, "step": 2056, "time_per_iteration": 2.672306537628174 }, { "auxiliary_loss_clip": 0.06670897, "auxiliary_loss_mlp": 0.01308309, "balance_loss_clip": 0.06326777, "balance_loss_mlp": 0.01277767, "epoch": 0.12367353073801293, "flos": 26147778359040.0, "grad_norm": 2.348004529836474, "language_loss": 0.84133333, "learning_rate": 3.908657987971009e-06, "loss": 0.92112541, "num_input_tokens_seen": 44455265, "router_z_loss_clip": 3.44140625, "router_z_loss_mlp": 0.30541992, "step": 2057, "time_per_iteration": 2.6532130241394043 }, { "auxiliary_loss_clip": 0.06676157, "auxiliary_loss_mlp": 0.01307325, "balance_loss_clip": 0.06337131, "balance_loss_mlp": 0.01276855, "epoch": 0.1237336539906809, "flos": 25163143430400.0, "grad_norm": 1.9731404358992073, "language_loss": 0.79133415, "learning_rate": 3.90854159710213e-06, "loss": 0.87116897, "num_input_tokens_seen": 44475815, "router_z_loss_clip": 3.38671875, "router_z_loss_mlp": 0.30444336, "step": 2058, "time_per_iteration": 2.6647064685821533 }, { "auxiliary_loss_clip": 0.06679498, "auxiliary_loss_mlp": 0.01305504, "balance_loss_clip": 0.06336415, "balance_loss_mlp": 0.0127439, "epoch": 0.12379377724334886, "flos": 15310001963520.0, "grad_norm": 2.2324195521824888, "language_loss": 0.85207236, "learning_rate": 3.9084251338611624e-06, "loss": 0.93192232, "num_input_tokens_seen": 44494045, "router_z_loss_clip": 3.42773438, "router_z_loss_mlp": 0.31103516, "step": 2059, "time_per_iteration": 2.6154305934906006 }, { "auxiliary_loss_clip": 0.06677826, "auxiliary_loss_mlp": 0.01314618, "balance_loss_clip": 0.0633397, "balance_loss_mlp": 0.01284339, "epoch": 0.12385390049601683, "flos": 21321792852480.0, "grad_norm": 2.7163804903951863, "language_loss": 0.82990843, "learning_rate": 3.908308598252523e-06, "loss": 0.90983289, "num_input_tokens_seen": 44509120, "router_z_loss_clip": 3.4375, "router_z_loss_mlp": 0.30297852, "step": 2060, "time_per_iteration": 2.6452767848968506 }, { "auxiliary_loss_clip": 0.06672956, "auxiliary_loss_mlp": 0.01302103, "balance_loss_clip": 0.06332318, "balance_loss_mlp": 0.0127056, "epoch": 0.1239140237486848, "flos": 15120711590400.0, "grad_norm": 2.3048122286111403, "language_loss": 0.87237865, "learning_rate": 3.9081919902806306e-06, "loss": 0.95212924, "num_input_tokens_seen": 44525780, "router_z_loss_clip": 3.40429688, "router_z_loss_mlp": 0.31518555, "step": 2061, "time_per_iteration": 2.6737120151519775 }, { "auxiliary_loss_clip": 0.06666769, "auxiliary_loss_mlp": 0.01309824, "balance_loss_clip": 0.06333002, "balance_loss_mlp": 0.01281655, "epoch": 0.12397414700135277, "flos": 21982534623360.0, "grad_norm": 1.8813596240459118, "language_loss": 0.85875571, "learning_rate": 3.908075309949906e-06, "loss": 0.93852162, "num_input_tokens_seen": 44543125, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.28161621, "step": 2062, "time_per_iteration": 2.5977354049682617 }, { "auxiliary_loss_clip": 0.0666952, "auxiliary_loss_mlp": 0.01302284, "balance_loss_clip": 0.06332973, "balance_loss_mlp": 0.01273614, "epoch": 0.12403427025402074, "flos": 13404909934080.0, "grad_norm": 21.86418325819906, "language_loss": 0.79789519, "learning_rate": 3.907958557264774e-06, "loss": 0.87761325, "num_input_tokens_seen": 44560275, "router_z_loss_clip": 3.36914062, "router_z_loss_mlp": 0.28674316, "step": 2063, "time_per_iteration": 2.5667166709899902 }, { "auxiliary_loss_clip": 0.06672988, "auxiliary_loss_mlp": 0.01294207, "balance_loss_clip": 0.06334599, "balance_loss_mlp": 0.01263642, "epoch": 0.12409439350668872, "flos": 15309750401280.0, "grad_norm": 2.4694399702179495, "language_loss": 0.81177163, "learning_rate": 3.907841732229663e-06, "loss": 0.89144349, "num_input_tokens_seen": 44577640, "router_z_loss_clip": 3.3828125, "router_z_loss_mlp": 0.30541992, "step": 2064, "time_per_iteration": 2.5881853103637695 }, { "auxiliary_loss_clip": 0.06689031, "auxiliary_loss_mlp": 0.01308503, "balance_loss_clip": 0.0634339, "balance_loss_mlp": 0.01278558, "epoch": 0.12415451675935668, "flos": 25016339877120.0, "grad_norm": 2.8780260445873727, "language_loss": 0.93810511, "learning_rate": 3.907724834849002e-06, "loss": 1.01808047, "num_input_tokens_seen": 44594860, "router_z_loss_clip": 3.453125, "router_z_loss_mlp": 0.29931641, "step": 2065, "time_per_iteration": 2.6281514167785645 }, { "auxiliary_loss_clip": 0.06679468, "auxiliary_loss_mlp": 0.01294032, "balance_loss_clip": 0.06333914, "balance_loss_mlp": 0.01263992, "epoch": 0.12421464001202465, "flos": 23666457000960.0, "grad_norm": 2.525251267050076, "language_loss": 0.82282221, "learning_rate": 3.907607865127225e-06, "loss": 0.90255725, "num_input_tokens_seen": 44614780, "router_z_loss_clip": 3.453125, "router_z_loss_mlp": 0.30053711, "step": 2066, "time_per_iteration": 2.6580257415771484 }, { "auxiliary_loss_clip": 0.06562649, "auxiliary_loss_mlp": 0.01282805, "balance_loss_clip": 0.0635229, "balance_loss_mlp": 0.01268643, "epoch": 0.12427476326469263, "flos": 65753686794240.0, "grad_norm": 0.8630506337266264, "language_loss": 0.63623631, "learning_rate": 3.907490823068766e-06, "loss": 0.7146908, "num_input_tokens_seen": 44671240, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.14172363, "step": 2067, "time_per_iteration": 3.17289137840271 }, { "auxiliary_loss_clip": 0.06677484, "auxiliary_loss_mlp": 0.01291872, "balance_loss_clip": 0.06332251, "balance_loss_mlp": 0.01260925, "epoch": 0.12433488651736059, "flos": 24542372856960.0, "grad_norm": 1.9074590304232677, "language_loss": 0.94798112, "learning_rate": 3.907373708678063e-06, "loss": 1.02767467, "num_input_tokens_seen": 44691050, "router_z_loss_clip": 3.453125, "router_z_loss_mlp": 0.30944824, "step": 2068, "time_per_iteration": 2.63907527923584 }, { "auxiliary_loss_clip": 0.06673917, "auxiliary_loss_mlp": 0.0129291, "balance_loss_clip": 0.06335518, "balance_loss_mlp": 0.01264419, "epoch": 0.12439500977002856, "flos": 21037828965120.0, "grad_norm": 2.419202963936644, "language_loss": 0.83215094, "learning_rate": 3.9072565219595596e-06, "loss": 0.91181922, "num_input_tokens_seen": 44709850, "router_z_loss_clip": 3.38671875, "router_z_loss_mlp": 0.28491211, "step": 2069, "time_per_iteration": 2.6045193672180176 }, { "auxiliary_loss_clip": 0.06682322, "auxiliary_loss_mlp": 0.0128485, "balance_loss_clip": 0.06341006, "balance_loss_mlp": 0.01254666, "epoch": 0.12445513302269653, "flos": 26837380661760.0, "grad_norm": 3.5874655342371478, "language_loss": 0.78793967, "learning_rate": 3.907139262917696e-06, "loss": 0.86761135, "num_input_tokens_seen": 44731475, "router_z_loss_clip": 3.41015625, "router_z_loss_mlp": 0.30151367, "step": 2070, "time_per_iteration": 2.660435676574707 }, { "auxiliary_loss_clip": 0.06673655, "auxiliary_loss_mlp": 0.01292175, "balance_loss_clip": 0.06328382, "balance_loss_mlp": 0.01263422, "epoch": 0.1245152562753645, "flos": 18374764101120.0, "grad_norm": 2.7562083533110306, "language_loss": 0.82218945, "learning_rate": 3.907021931556922e-06, "loss": 0.90184772, "num_input_tokens_seen": 44749685, "router_z_loss_clip": 3.45117188, "router_z_loss_mlp": 0.28759766, "step": 2071, "time_per_iteration": 4.062412977218628 }, { "auxiliary_loss_clip": 0.066572, "auxiliary_loss_mlp": 0.01293014, "balance_loss_clip": 0.06326189, "balance_loss_mlp": 0.01263307, "epoch": 0.12457537952803246, "flos": 33116098331520.0, "grad_norm": 1.6632449968401417, "language_loss": 0.78748655, "learning_rate": 3.906904527881684e-06, "loss": 0.86698872, "num_input_tokens_seen": 44772165, "router_z_loss_clip": 3.3125, "router_z_loss_mlp": 0.296875, "step": 2072, "time_per_iteration": 2.745945453643799 }, { "auxiliary_loss_clip": 0.06678903, "auxiliary_loss_mlp": 0.01285164, "balance_loss_clip": 0.06337182, "balance_loss_mlp": 0.01255004, "epoch": 0.12463550278070043, "flos": 22276267511040.0, "grad_norm": 14.032019780631058, "language_loss": 0.76740259, "learning_rate": 3.9067870518964355e-06, "loss": 0.84704328, "num_input_tokens_seen": 44790580, "router_z_loss_clip": 3.41796875, "router_z_loss_mlp": 0.30126953, "step": 2073, "time_per_iteration": 4.141234397888184 }, { "auxiliary_loss_clip": 0.06657363, "auxiliary_loss_mlp": 0.01290798, "balance_loss_clip": 0.06323462, "balance_loss_mlp": 0.01260829, "epoch": 0.12469562603336841, "flos": 14683445458560.0, "grad_norm": 2.394580323919386, "language_loss": 0.91221523, "learning_rate": 3.906669503605631e-06, "loss": 0.99169677, "num_input_tokens_seen": 44806730, "router_z_loss_clip": 3.34375, "router_z_loss_mlp": 0.29980469, "step": 2074, "time_per_iteration": 2.6412739753723145 }, { "auxiliary_loss_clip": 0.06673712, "auxiliary_loss_mlp": 0.01290841, "balance_loss_clip": 0.06326747, "balance_loss_mlp": 0.01259179, "epoch": 0.12475574928603637, "flos": 24651720835200.0, "grad_norm": 3.5752064801678665, "language_loss": 0.84904438, "learning_rate": 3.906551883013728e-06, "loss": 0.92868996, "num_input_tokens_seen": 44825550, "router_z_loss_clip": 3.47265625, "router_z_loss_mlp": 0.31640625, "step": 2075, "time_per_iteration": 4.092012166976929 }, { "auxiliary_loss_clip": 0.0665894, "auxiliary_loss_mlp": 0.01293621, "balance_loss_clip": 0.06321204, "balance_loss_mlp": 0.01262221, "epoch": 0.12481587253870434, "flos": 21769540744320.0, "grad_norm": 2.1999003878078587, "language_loss": 0.73537886, "learning_rate": 3.9064341901251865e-06, "loss": 0.81490445, "num_input_tokens_seen": 44844155, "router_z_loss_clip": 3.37695312, "router_z_loss_mlp": 0.31396484, "step": 2076, "time_per_iteration": 2.6131999492645264 }, { "auxiliary_loss_clip": 0.0664769, "auxiliary_loss_mlp": 0.01288259, "balance_loss_clip": 0.06313661, "balance_loss_mlp": 0.01259363, "epoch": 0.12487599579137232, "flos": 21438687697920.0, "grad_norm": 2.8716489621120376, "language_loss": 0.77097225, "learning_rate": 3.906316424944469e-06, "loss": 0.85033178, "num_input_tokens_seen": 44863780, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.28857422, "step": 2077, "time_per_iteration": 4.109433889389038 }, { "auxiliary_loss_clip": 0.06656761, "auxiliary_loss_mlp": 0.01287265, "balance_loss_clip": 0.06316017, "balance_loss_mlp": 0.0125527, "epoch": 0.12493611904404028, "flos": 16113228802560.0, "grad_norm": 3.2078745161812043, "language_loss": 0.84237158, "learning_rate": 3.906198587476043e-06, "loss": 0.92181188, "num_input_tokens_seen": 44881480, "router_z_loss_clip": 3.40234375, "router_z_loss_mlp": 0.3203125, "step": 2078, "time_per_iteration": 2.58109188079834 }, { "auxiliary_loss_clip": 0.06661537, "auxiliary_loss_mlp": 0.01296484, "balance_loss_clip": 0.06320031, "balance_loss_mlp": 0.01265204, "epoch": 0.12499624229670825, "flos": 21586749062400.0, "grad_norm": 1.9822948631805142, "language_loss": 0.76725483, "learning_rate": 3.906080677724374e-06, "loss": 0.84683502, "num_input_tokens_seen": 44900390, "router_z_loss_clip": 3.41601562, "router_z_loss_mlp": 0.31274414, "step": 2079, "time_per_iteration": 2.622265100479126 }, { "auxiliary_loss_clip": 0.06661409, "auxiliary_loss_mlp": 0.01288424, "balance_loss_clip": 0.06317792, "balance_loss_mlp": 0.01257525, "epoch": 0.1250563655493762, "flos": 25705522909440.0, "grad_norm": 2.9914305785726993, "language_loss": 0.85063589, "learning_rate": 3.905962695693935e-06, "loss": 0.93013418, "num_input_tokens_seen": 44920375, "router_z_loss_clip": 3.4296875, "router_z_loss_mlp": 0.30883789, "step": 2080, "time_per_iteration": 2.65181303024292 }, { "auxiliary_loss_clip": 0.06646614, "auxiliary_loss_mlp": 0.01297474, "balance_loss_clip": 0.06308747, "balance_loss_mlp": 0.01265645, "epoch": 0.12511648880204418, "flos": 16915113976320.0, "grad_norm": 2.292861257949496, "language_loss": 0.86006379, "learning_rate": 3.9058446413892e-06, "loss": 0.93950462, "num_input_tokens_seen": 44938415, "router_z_loss_clip": 3.37695312, "router_z_loss_mlp": 0.31835938, "step": 2081, "time_per_iteration": 2.6822636127471924 }, { "auxiliary_loss_clip": 0.06650865, "auxiliary_loss_mlp": 0.01288873, "balance_loss_clip": 0.06315771, "balance_loss_mlp": 0.01259405, "epoch": 0.12517661205471217, "flos": 17573423978880.0, "grad_norm": 2.131400355232845, "language_loss": 0.77744424, "learning_rate": 3.905726514814646e-06, "loss": 0.85684168, "num_input_tokens_seen": 44957135, "router_z_loss_clip": 3.34765625, "router_z_loss_mlp": 0.29467773, "step": 2082, "time_per_iteration": 2.594223976135254 }, { "auxiliary_loss_clip": 0.0666978, "auxiliary_loss_mlp": 0.0128982, "balance_loss_clip": 0.06316972, "balance_loss_mlp": 0.01258921, "epoch": 0.12523673530738014, "flos": 16039240047360.0, "grad_norm": 2.6910799152275175, "language_loss": 0.80066377, "learning_rate": 3.9056083159747495e-06, "loss": 0.88025975, "num_input_tokens_seen": 44974480, "router_z_loss_clip": 3.52734375, "router_z_loss_mlp": 0.30859375, "step": 2083, "time_per_iteration": 2.591735363006592 }, { "auxiliary_loss_clip": 0.06658069, "auxiliary_loss_mlp": 0.01296744, "balance_loss_clip": 0.06315754, "balance_loss_mlp": 0.01263819, "epoch": 0.1252968585600481, "flos": 18813833095680.0, "grad_norm": 4.129819351321878, "language_loss": 0.91886491, "learning_rate": 3.9054900448739966e-06, "loss": 0.99841303, "num_input_tokens_seen": 44990310, "router_z_loss_clip": 3.421875, "router_z_loss_mlp": 0.3293457, "step": 2084, "time_per_iteration": 2.59188175201416 }, { "auxiliary_loss_clip": 0.06652532, "auxiliary_loss_mlp": 0.01289759, "balance_loss_clip": 0.06311814, "balance_loss_mlp": 0.01259075, "epoch": 0.12535698181271607, "flos": 27278923351680.0, "grad_norm": 2.1078298256225962, "language_loss": 0.81001443, "learning_rate": 3.905371701516869e-06, "loss": 0.88943732, "num_input_tokens_seen": 45010720, "router_z_loss_clip": 3.40625, "router_z_loss_mlp": 0.30712891, "step": 2085, "time_per_iteration": 2.6891255378723145 }, { "auxiliary_loss_clip": 0.06653389, "auxiliary_loss_mlp": 0.01298384, "balance_loss_clip": 0.06318642, "balance_loss_mlp": 0.01268105, "epoch": 0.12541710506538403, "flos": 22060590301440.0, "grad_norm": 2.136642403302242, "language_loss": 0.8921231, "learning_rate": 3.905253285907856e-06, "loss": 0.97164083, "num_input_tokens_seen": 45030360, "router_z_loss_clip": 3.34765625, "router_z_loss_mlp": 0.3026123, "step": 2086, "time_per_iteration": 2.6509063243865967 }, { "auxiliary_loss_clip": 0.0664153, "auxiliary_loss_mlp": 0.01292869, "balance_loss_clip": 0.06312476, "balance_loss_mlp": 0.01264807, "epoch": 0.125477228318052, "flos": 12607888296960.0, "grad_norm": 19.248810811174366, "language_loss": 0.88246512, "learning_rate": 3.905134798051447e-06, "loss": 0.96180916, "num_input_tokens_seen": 45045085, "router_z_loss_clip": 3.28710938, "router_z_loss_mlp": 0.28076172, "step": 2087, "time_per_iteration": 2.6890814304351807 }, { "auxiliary_loss_clip": 0.06652989, "auxiliary_loss_mlp": 0.01295312, "balance_loss_clip": 0.06315287, "balance_loss_mlp": 0.01264318, "epoch": 0.12553735157071996, "flos": 23885362592640.0, "grad_norm": 2.072271963585369, "language_loss": 0.75456077, "learning_rate": 3.905016237952136e-06, "loss": 0.8340438, "num_input_tokens_seen": 45065145, "router_z_loss_clip": 3.37890625, "router_z_loss_mlp": 0.30969238, "step": 2088, "time_per_iteration": 2.7857837677001953 }, { "auxiliary_loss_clip": 0.06535433, "auxiliary_loss_mlp": 0.01293274, "balance_loss_clip": 0.06325129, "balance_loss_mlp": 0.01278742, "epoch": 0.12559747482338796, "flos": 69940998881280.0, "grad_norm": 0.778031372329776, "language_loss": 0.61757219, "learning_rate": 3.904897605614418e-06, "loss": 0.69585931, "num_input_tokens_seen": 45126230, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.14489746, "step": 2089, "time_per_iteration": 3.194706439971924 }, { "auxiliary_loss_clip": 0.06643566, "auxiliary_loss_mlp": 0.01298813, "balance_loss_clip": 0.06310321, "balance_loss_mlp": 0.01268295, "epoch": 0.12565759807605592, "flos": 24286389033600.0, "grad_norm": 2.360516283268605, "language_loss": 0.79818106, "learning_rate": 3.904778901042793e-06, "loss": 0.87760484, "num_input_tokens_seen": 45145545, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.30493164, "step": 2090, "time_per_iteration": 2.61397647857666 }, { "auxiliary_loss_clip": 0.06513915, "auxiliary_loss_mlp": 0.01278216, "balance_loss_clip": 0.06305914, "balance_loss_mlp": 0.01264376, "epoch": 0.12571772132872389, "flos": 56468011904640.0, "grad_norm": 0.7351776046467878, "language_loss": 0.59009266, "learning_rate": 3.90466012424176e-06, "loss": 0.66801393, "num_input_tokens_seen": 45206845, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.13879395, "step": 2091, "time_per_iteration": 3.13883376121521 }, { "auxiliary_loss_clip": 0.06647807, "auxiliary_loss_mlp": 0.01293029, "balance_loss_clip": 0.06305975, "balance_loss_mlp": 0.01263036, "epoch": 0.12577784458139185, "flos": 41255576421120.0, "grad_norm": 2.06700640795302, "language_loss": 0.65169811, "learning_rate": 3.904541275215825e-06, "loss": 0.7311064, "num_input_tokens_seen": 45228495, "router_z_loss_clip": 3.41796875, "router_z_loss_mlp": 0.30004883, "step": 2092, "time_per_iteration": 2.871838331222534 }, { "auxiliary_loss_clip": 0.06662323, "auxiliary_loss_mlp": 0.01302839, "balance_loss_clip": 0.06314332, "balance_loss_mlp": 0.01269055, "epoch": 0.12583796783405982, "flos": 19761599427840.0, "grad_norm": 4.835900851845552, "language_loss": 0.82148147, "learning_rate": 3.904422353969493e-06, "loss": 0.90113312, "num_input_tokens_seen": 45245720, "router_z_loss_clip": 3.48242188, "router_z_loss_mlp": 0.33789062, "step": 2093, "time_per_iteration": 2.6254448890686035 }, { "auxiliary_loss_clip": 0.06649391, "auxiliary_loss_mlp": 0.01288546, "balance_loss_clip": 0.06318879, "balance_loss_mlp": 0.01257957, "epoch": 0.12589809108672778, "flos": 22608797639040.0, "grad_norm": 1.966851615217355, "language_loss": 0.76820326, "learning_rate": 3.904303360507276e-06, "loss": 0.84758258, "num_input_tokens_seen": 45265650, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.3059082, "step": 2094, "time_per_iteration": 2.655712127685547 }, { "auxiliary_loss_clip": 0.06649688, "auxiliary_loss_mlp": 0.01296624, "balance_loss_clip": 0.0631924, "balance_loss_mlp": 0.0126489, "epoch": 0.12595821433939577, "flos": 45233248792320.0, "grad_norm": 1.985770194120831, "language_loss": 0.78086627, "learning_rate": 3.9041842948336835e-06, "loss": 0.86032939, "num_input_tokens_seen": 45287790, "router_z_loss_clip": 3.30273438, "router_z_loss_mlp": 0.31713867, "step": 2095, "time_per_iteration": 2.8264968395233154 }, { "auxiliary_loss_clip": 0.06671008, "auxiliary_loss_mlp": 0.01297715, "balance_loss_clip": 0.06328534, "balance_loss_mlp": 0.01267269, "epoch": 0.12601833759206374, "flos": 14325115472640.0, "grad_norm": 2.457008080715345, "language_loss": 0.84614813, "learning_rate": 3.904065156953232e-06, "loss": 0.92583531, "num_input_tokens_seen": 45305720, "router_z_loss_clip": 3.42578125, "router_z_loss_mlp": 0.30395508, "step": 2096, "time_per_iteration": 2.6071906089782715 }, { "auxiliary_loss_clip": 0.06679587, "auxiliary_loss_mlp": 0.01301813, "balance_loss_clip": 0.06334113, "balance_loss_mlp": 0.01268602, "epoch": 0.1260784608447317, "flos": 21294651329280.0, "grad_norm": 1.9008982853435556, "language_loss": 0.76869702, "learning_rate": 3.903945946870439e-06, "loss": 0.84851104, "num_input_tokens_seen": 45325290, "router_z_loss_clip": 3.45703125, "router_z_loss_mlp": 0.33203125, "step": 2097, "time_per_iteration": 2.6648738384246826 }, { "auxiliary_loss_clip": 0.06678839, "auxiliary_loss_mlp": 0.01293535, "balance_loss_clip": 0.06338479, "balance_loss_mlp": 0.01262851, "epoch": 0.12613858409739967, "flos": 26258719564800.0, "grad_norm": 31.472642392855207, "language_loss": 0.88515007, "learning_rate": 3.9038266645898246e-06, "loss": 0.96487379, "num_input_tokens_seen": 45344465, "router_z_loss_clip": 3.40625, "router_z_loss_mlp": 0.30664062, "step": 2098, "time_per_iteration": 2.6721866130828857 }, { "auxiliary_loss_clip": 0.06682683, "auxiliary_loss_mlp": 0.01301901, "balance_loss_clip": 0.06334361, "balance_loss_mlp": 0.01267164, "epoch": 0.12619870735006763, "flos": 21586413646080.0, "grad_norm": 2.290155517590768, "language_loss": 0.70834613, "learning_rate": 3.903707310115912e-06, "loss": 0.78819191, "num_input_tokens_seen": 45362465, "router_z_loss_clip": 3.48242188, "router_z_loss_mlp": 0.34741211, "step": 2099, "time_per_iteration": 2.633538246154785 }, { "auxiliary_loss_clip": 0.06679571, "auxiliary_loss_mlp": 0.01303515, "balance_loss_clip": 0.06336117, "balance_loss_mlp": 0.01268539, "epoch": 0.1262588306027356, "flos": 23373646508160.0, "grad_norm": 2.6750232912949894, "language_loss": 0.83354473, "learning_rate": 3.903587883453228e-06, "loss": 0.91337562, "num_input_tokens_seen": 45382700, "router_z_loss_clip": 3.4375, "router_z_loss_mlp": 0.34985352, "step": 2100, "time_per_iteration": 2.673985004425049 }, { "auxiliary_loss_clip": 0.06685178, "auxiliary_loss_mlp": 0.01304203, "balance_loss_clip": 0.06345223, "balance_loss_mlp": 0.01271611, "epoch": 0.12631895385540357, "flos": 23955619841280.0, "grad_norm": 1.9198788877326518, "language_loss": 0.81996632, "learning_rate": 3.903468384606302e-06, "loss": 0.89986008, "num_input_tokens_seen": 45401005, "router_z_loss_clip": 3.39453125, "router_z_loss_mlp": 0.32568359, "step": 2101, "time_per_iteration": 2.639629364013672 }, { "auxiliary_loss_clip": 0.06520818, "auxiliary_loss_mlp": 0.01280477, "balance_loss_clip": 0.06312232, "balance_loss_mlp": 0.01268538, "epoch": 0.12637907710807156, "flos": 70301760635520.0, "grad_norm": 0.7019850446380927, "language_loss": 0.57068926, "learning_rate": 3.903348813579662e-06, "loss": 0.6487022, "num_input_tokens_seen": 45466555, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.11932373, "step": 2102, "time_per_iteration": 3.2846505641937256 }, { "auxiliary_loss_clip": 0.06682087, "auxiliary_loss_mlp": 0.01298199, "balance_loss_clip": 0.06337696, "balance_loss_mlp": 0.01268182, "epoch": 0.12643920036073952, "flos": 18920833159680.0, "grad_norm": 3.3427997717556734, "language_loss": 0.95250022, "learning_rate": 3.903229170377845e-06, "loss": 1.03230309, "num_input_tokens_seen": 45485165, "router_z_loss_clip": 3.44726562, "router_z_loss_mlp": 0.30029297, "step": 2103, "time_per_iteration": 2.6296322345733643 }, { "auxiliary_loss_clip": 0.06663994, "auxiliary_loss_mlp": 0.01289838, "balance_loss_clip": 0.06335546, "balance_loss_mlp": 0.01260703, "epoch": 0.1264993236134075, "flos": 27789926676480.0, "grad_norm": 5.227248889192276, "language_loss": 0.79533768, "learning_rate": 3.903109455005387e-06, "loss": 0.87487602, "num_input_tokens_seen": 45504630, "router_z_loss_clip": 3.28320312, "router_z_loss_mlp": 0.29150391, "step": 2104, "time_per_iteration": 2.753303050994873 }, { "auxiliary_loss_clip": 0.06678744, "auxiliary_loss_mlp": 0.01295306, "balance_loss_clip": 0.06337819, "balance_loss_mlp": 0.01264193, "epoch": 0.12655944686607545, "flos": 24761739646080.0, "grad_norm": 1.9154708187216285, "language_loss": 0.82089412, "learning_rate": 3.902989667466828e-06, "loss": 0.90063465, "num_input_tokens_seen": 45524885, "router_z_loss_clip": 3.41210938, "router_z_loss_mlp": 0.31103516, "step": 2105, "time_per_iteration": 2.6564178466796875 }, { "auxiliary_loss_clip": 0.06693116, "auxiliary_loss_mlp": 0.01298133, "balance_loss_clip": 0.06338367, "balance_loss_mlp": 0.01262466, "epoch": 0.12661957011874342, "flos": 24139753188480.0, "grad_norm": 2.2200056037552383, "language_loss": 0.84206927, "learning_rate": 3.90286980776671e-06, "loss": 0.92198181, "num_input_tokens_seen": 45545000, "router_z_loss_clip": 3.546875, "router_z_loss_mlp": 0.35693359, "step": 2106, "time_per_iteration": 2.627692222595215 }, { "auxiliary_loss_clip": 0.06666955, "auxiliary_loss_mlp": 0.01295287, "balance_loss_clip": 0.06328609, "balance_loss_mlp": 0.01263005, "epoch": 0.12667969337141138, "flos": 24576180779520.0, "grad_norm": 1.8710051191430703, "language_loss": 0.74740863, "learning_rate": 3.902749875909578e-06, "loss": 0.82703102, "num_input_tokens_seen": 45564210, "router_z_loss_clip": 3.38085938, "router_z_loss_mlp": 0.32275391, "step": 2107, "time_per_iteration": 2.6802444458007812 }, { "auxiliary_loss_clip": 0.06669559, "auxiliary_loss_mlp": 0.01287283, "balance_loss_clip": 0.06334509, "balance_loss_mlp": 0.01257338, "epoch": 0.12673981662407935, "flos": 22967546895360.0, "grad_norm": 2.869119824458429, "language_loss": 0.80273247, "learning_rate": 3.90262987189998e-06, "loss": 0.88230091, "num_input_tokens_seen": 45583030, "router_z_loss_clip": 3.3515625, "router_z_loss_mlp": 0.29956055, "step": 2108, "time_per_iteration": 2.6619622707366943 }, { "auxiliary_loss_clip": 0.06680745, "auxiliary_loss_mlp": 0.01298047, "balance_loss_clip": 0.0633916, "balance_loss_mlp": 0.01267028, "epoch": 0.12679993987674734, "flos": 17280613486080.0, "grad_norm": 1.9222241253405326, "language_loss": 0.77197909, "learning_rate": 3.902509795742467e-06, "loss": 0.851767, "num_input_tokens_seen": 45602265, "router_z_loss_clip": 3.41015625, "router_z_loss_mlp": 0.31054688, "step": 2109, "time_per_iteration": 4.038797378540039 }, { "auxiliary_loss_clip": 0.0665843, "auxiliary_loss_mlp": 0.01289866, "balance_loss_clip": 0.06323051, "balance_loss_mlp": 0.01257656, "epoch": 0.1268600631294153, "flos": 17280865048320.0, "grad_norm": 1.803474804492637, "language_loss": 0.83678102, "learning_rate": 3.902389647441592e-06, "loss": 0.91626394, "num_input_tokens_seen": 45620595, "router_z_loss_clip": 3.35351562, "router_z_loss_mlp": 0.32202148, "step": 2110, "time_per_iteration": 2.6062796115875244 }, { "auxiliary_loss_clip": 0.06686662, "auxiliary_loss_mlp": 0.01295827, "balance_loss_clip": 0.06338375, "balance_loss_mlp": 0.01262257, "epoch": 0.12692018638208327, "flos": 24067902712320.0, "grad_norm": 1.8029197853705012, "language_loss": 0.79765153, "learning_rate": 3.90226942700191e-06, "loss": 0.87747639, "num_input_tokens_seen": 45641140, "router_z_loss_clip": 3.47851562, "router_z_loss_mlp": 0.3359375, "step": 2111, "time_per_iteration": 2.6506264209747314 }, { "auxiliary_loss_clip": 0.06701725, "auxiliary_loss_mlp": 0.01298172, "balance_loss_clip": 0.06343186, "balance_loss_mlp": 0.01262314, "epoch": 0.12698030963475124, "flos": 31839952648320.0, "grad_norm": 2.7140379984822323, "language_loss": 0.78057611, "learning_rate": 3.902149134427982e-06, "loss": 0.86057508, "num_input_tokens_seen": 45662315, "router_z_loss_clip": 3.58789062, "router_z_loss_mlp": 0.35864258, "step": 2112, "time_per_iteration": 4.141291379928589 }, { "auxiliary_loss_clip": 0.06670395, "auxiliary_loss_mlp": 0.01295765, "balance_loss_clip": 0.06335577, "balance_loss_mlp": 0.01265629, "epoch": 0.1270404328874192, "flos": 25194058387200.0, "grad_norm": 2.232896423665493, "language_loss": 0.86234361, "learning_rate": 3.902028769724367e-06, "loss": 0.94200522, "num_input_tokens_seen": 45680335, "router_z_loss_clip": 3.34765625, "router_z_loss_mlp": 0.30102539, "step": 2113, "time_per_iteration": 2.624316692352295 }, { "auxiliary_loss_clip": 0.06677905, "auxiliary_loss_mlp": 0.01298343, "balance_loss_clip": 0.06333854, "balance_loss_mlp": 0.01265584, "epoch": 0.12710055614008717, "flos": 16002790721280.0, "grad_norm": 2.0768388493209433, "language_loss": 0.74801755, "learning_rate": 3.9019083328956315e-06, "loss": 0.82778001, "num_input_tokens_seen": 45696240, "router_z_loss_clip": 3.43945312, "router_z_loss_mlp": 0.32739258, "step": 2114, "time_per_iteration": 2.5561461448669434 }, { "auxiliary_loss_clip": 0.06690836, "auxiliary_loss_mlp": 0.01297035, "balance_loss_clip": 0.06354278, "balance_loss_mlp": 0.0126647, "epoch": 0.12716067939275516, "flos": 15091012517760.0, "grad_norm": 1.9704170229968612, "language_loss": 0.84086752, "learning_rate": 3.901787823946341e-06, "loss": 0.92074621, "num_input_tokens_seen": 45713695, "router_z_loss_clip": 3.36523438, "router_z_loss_mlp": 0.30541992, "step": 2115, "time_per_iteration": 3.9964022636413574 }, { "auxiliary_loss_clip": 0.06683823, "auxiliary_loss_mlp": 0.01294699, "balance_loss_clip": 0.06344505, "balance_loss_mlp": 0.01264825, "epoch": 0.12722080264542313, "flos": 28374373704960.0, "grad_norm": 1.8342414712755848, "language_loss": 0.88361776, "learning_rate": 3.901667242881065e-06, "loss": 0.96340293, "num_input_tokens_seen": 45736655, "router_z_loss_clip": 3.39648438, "router_z_loss_mlp": 0.29846191, "step": 2116, "time_per_iteration": 4.018234968185425 }, { "auxiliary_loss_clip": 0.06672248, "auxiliary_loss_mlp": 0.01307314, "balance_loss_clip": 0.06344011, "balance_loss_mlp": 0.01277988, "epoch": 0.1272809258980911, "flos": 32388159985920.0, "grad_norm": 2.082069309770579, "language_loss": 0.72105676, "learning_rate": 3.9015465897043775e-06, "loss": 0.80085236, "num_input_tokens_seen": 45758195, "router_z_loss_clip": 3.28125, "router_z_loss_mlp": 0.29345703, "step": 2117, "time_per_iteration": 2.6880388259887695 }, { "auxiliary_loss_clip": 0.06678931, "auxiliary_loss_mlp": 0.01303131, "balance_loss_clip": 0.06339373, "balance_loss_mlp": 0.01270873, "epoch": 0.12734104915075906, "flos": 16039952807040.0, "grad_norm": 2.1529615493366907, "language_loss": 0.87739605, "learning_rate": 3.901425864420852e-06, "loss": 0.95721668, "num_input_tokens_seen": 45774280, "router_z_loss_clip": 3.40234375, "router_z_loss_mlp": 0.32250977, "step": 2118, "time_per_iteration": 2.6234524250030518 }, { "auxiliary_loss_clip": 0.06668986, "auxiliary_loss_mlp": 0.01321088, "balance_loss_clip": 0.06336488, "balance_loss_mlp": 0.01290809, "epoch": 0.12740117240342702, "flos": 18266296590720.0, "grad_norm": 2.129499238846293, "language_loss": 0.88283175, "learning_rate": 3.901305067035068e-06, "loss": 0.96273249, "num_input_tokens_seen": 45792760, "router_z_loss_clip": 3.32617188, "router_z_loss_mlp": 0.30249023, "step": 2119, "time_per_iteration": 2.6117024421691895 }, { "auxiliary_loss_clip": 0.06678066, "auxiliary_loss_mlp": 0.01309373, "balance_loss_clip": 0.06341145, "balance_loss_mlp": 0.01277353, "epoch": 0.127461295656095, "flos": 12125242379520.0, "grad_norm": 5.027625092826215, "language_loss": 0.8875978, "learning_rate": 3.901184197551605e-06, "loss": 0.96747226, "num_input_tokens_seen": 45804300, "router_z_loss_clip": 3.36914062, "router_z_loss_mlp": 0.31982422, "step": 2120, "time_per_iteration": 2.5373382568359375 }, { "auxiliary_loss_clip": 0.06677809, "auxiliary_loss_mlp": 0.01320674, "balance_loss_clip": 0.0634141, "balance_loss_mlp": 0.01288511, "epoch": 0.12752141890876295, "flos": 23155831019520.0, "grad_norm": 2.0668971034047714, "language_loss": 0.7731241, "learning_rate": 3.901063255975046e-06, "loss": 0.85310888, "num_input_tokens_seen": 45823780, "router_z_loss_clip": 3.36132812, "router_z_loss_mlp": 0.32128906, "step": 2121, "time_per_iteration": 2.6229841709136963 }, { "auxiliary_loss_clip": 0.0667927, "auxiliary_loss_mlp": 0.01305265, "balance_loss_clip": 0.06342435, "balance_loss_mlp": 0.01272482, "epoch": 0.12758154216143094, "flos": 21622359847680.0, "grad_norm": 3.3188538243109353, "language_loss": 0.84650183, "learning_rate": 3.900942242309978e-06, "loss": 0.92634714, "num_input_tokens_seen": 45840495, "router_z_loss_clip": 3.36914062, "router_z_loss_mlp": 0.32763672, "step": 2122, "time_per_iteration": 2.6155877113342285 }, { "auxiliary_loss_clip": 0.06672647, "auxiliary_loss_mlp": 0.01305826, "balance_loss_clip": 0.0633715, "balance_loss_mlp": 0.01272448, "epoch": 0.1276416654140989, "flos": 15930395193600.0, "grad_norm": 1.8309341113622823, "language_loss": 0.803231, "learning_rate": 3.90082115656099e-06, "loss": 0.88301575, "num_input_tokens_seen": 45857735, "router_z_loss_clip": 3.35546875, "router_z_loss_mlp": 0.33374023, "step": 2123, "time_per_iteration": 2.60456919670105 }, { "auxiliary_loss_clip": 0.06673528, "auxiliary_loss_mlp": 0.01305403, "balance_loss_clip": 0.06341042, "balance_loss_mlp": 0.01274671, "epoch": 0.12770178866676687, "flos": 22389263141760.0, "grad_norm": 2.051042761927722, "language_loss": 0.8060075, "learning_rate": 3.900699998732673e-06, "loss": 0.88579684, "num_input_tokens_seen": 45876485, "router_z_loss_clip": 3.32226562, "router_z_loss_mlp": 0.30761719, "step": 2124, "time_per_iteration": 2.625894784927368 }, { "auxiliary_loss_clip": 0.06670982, "auxiliary_loss_mlp": 0.013, "balance_loss_clip": 0.06334305, "balance_loss_mlp": 0.01269792, "epoch": 0.12776191191943484, "flos": 21658851100800.0, "grad_norm": 2.0944223927778793, "language_loss": 0.77075112, "learning_rate": 3.900578768829623e-06, "loss": 0.85046089, "num_input_tokens_seen": 45894645, "router_z_loss_clip": 3.36523438, "router_z_loss_mlp": 0.30175781, "step": 2125, "time_per_iteration": 2.6787307262420654 }, { "auxiliary_loss_clip": 0.06668903, "auxiliary_loss_mlp": 0.01309592, "balance_loss_clip": 0.06335377, "balance_loss_mlp": 0.01276642, "epoch": 0.1278220351721028, "flos": 25742056089600.0, "grad_norm": 2.3638891532569613, "language_loss": 0.79440832, "learning_rate": 3.900457466856434e-06, "loss": 0.87419325, "num_input_tokens_seen": 45913755, "router_z_loss_clip": 3.33398438, "router_z_loss_mlp": 0.3293457, "step": 2126, "time_per_iteration": 2.712763786315918 }, { "auxiliary_loss_clip": 0.06670581, "auxiliary_loss_mlp": 0.01309767, "balance_loss_clip": 0.06335605, "balance_loss_mlp": 0.01277795, "epoch": 0.12788215842477077, "flos": 41252515747200.0, "grad_norm": 1.6033035200423598, "language_loss": 0.70196438, "learning_rate": 3.9003360928177085e-06, "loss": 0.78176785, "num_input_tokens_seen": 45936095, "router_z_loss_clip": 3.34570312, "router_z_loss_mlp": 0.31933594, "step": 2127, "time_per_iteration": 2.8305816650390625 }, { "auxiliary_loss_clip": 0.06523642, "auxiliary_loss_mlp": 0.01308757, "balance_loss_clip": 0.0631796, "balance_loss_mlp": 0.01297981, "epoch": 0.12794228167743876, "flos": 70899079265280.0, "grad_norm": 0.8188913599150125, "language_loss": 0.62593043, "learning_rate": 3.900214646718047e-06, "loss": 0.70425451, "num_input_tokens_seen": 46004655, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.10791016, "step": 2128, "time_per_iteration": 3.25937557220459 }, { "auxiliary_loss_clip": 0.06672983, "auxiliary_loss_mlp": 0.01296788, "balance_loss_clip": 0.06331964, "balance_loss_mlp": 0.01264315, "epoch": 0.12800240493010673, "flos": 16295307724800.0, "grad_norm": 4.268304080961306, "language_loss": 0.78482497, "learning_rate": 3.900093128562056e-06, "loss": 0.8645227, "num_input_tokens_seen": 46023610, "router_z_loss_clip": 3.40234375, "router_z_loss_mlp": 0.32470703, "step": 2129, "time_per_iteration": 2.597813367843628 }, { "auxiliary_loss_clip": 0.06672934, "auxiliary_loss_mlp": 0.01296014, "balance_loss_clip": 0.06327202, "balance_loss_mlp": 0.01260823, "epoch": 0.1280625281827747, "flos": 20637850700160.0, "grad_norm": 2.3185532178053845, "language_loss": 0.80627823, "learning_rate": 3.899971538354343e-06, "loss": 0.88596773, "num_input_tokens_seen": 46041725, "router_z_loss_clip": 3.45703125, "router_z_loss_mlp": 0.35205078, "step": 2130, "time_per_iteration": 2.6111629009246826 }, { "auxiliary_loss_clip": 0.06659792, "auxiliary_loss_mlp": 0.01299702, "balance_loss_clip": 0.06323011, "balance_loss_mlp": 0.01268135, "epoch": 0.12812265143544266, "flos": 22644869621760.0, "grad_norm": 1.9033527375227983, "language_loss": 0.73045647, "learning_rate": 3.899849876099518e-06, "loss": 0.81005144, "num_input_tokens_seen": 46061095, "router_z_loss_clip": 3.36914062, "router_z_loss_mlp": 0.31567383, "step": 2131, "time_per_iteration": 2.646266222000122 }, { "auxiliary_loss_clip": 0.06650864, "auxiliary_loss_mlp": 0.0129527, "balance_loss_clip": 0.06316892, "balance_loss_mlp": 0.01263822, "epoch": 0.12818277468811062, "flos": 34723306696320.0, "grad_norm": 2.135002769228079, "language_loss": 0.74037182, "learning_rate": 3.899728141802197e-06, "loss": 0.81983316, "num_input_tokens_seen": 46082670, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.31445312, "step": 2132, "time_per_iteration": 2.7210400104522705 }, { "auxiliary_loss_clip": 0.06647925, "auxiliary_loss_mlp": 0.01299565, "balance_loss_clip": 0.06324285, "balance_loss_mlp": 0.01268451, "epoch": 0.1282428979407786, "flos": 23118752787840.0, "grad_norm": 2.0350845880191115, "language_loss": 0.83406377, "learning_rate": 3.8996063354669935e-06, "loss": 0.91353869, "num_input_tokens_seen": 46102410, "router_z_loss_clip": 3.23632812, "router_z_loss_mlp": 0.3112793, "step": 2133, "time_per_iteration": 2.608858346939087 }, { "auxiliary_loss_clip": 0.06662954, "auxiliary_loss_mlp": 0.01299047, "balance_loss_clip": 0.06322567, "balance_loss_mlp": 0.01266813, "epoch": 0.12830302119344655, "flos": 20892786347520.0, "grad_norm": 3.948406229009712, "language_loss": 0.81827044, "learning_rate": 3.899484457098528e-06, "loss": 0.89789045, "num_input_tokens_seen": 46121145, "router_z_loss_clip": 3.40820312, "router_z_loss_mlp": 0.32177734, "step": 2134, "time_per_iteration": 2.67045521736145 }, { "auxiliary_loss_clip": 0.06653412, "auxiliary_loss_mlp": 0.01289876, "balance_loss_clip": 0.06319513, "balance_loss_mlp": 0.01258882, "epoch": 0.12836314444611455, "flos": 21404208942720.0, "grad_norm": 1.8303491813269563, "language_loss": 0.83875299, "learning_rate": 3.899362506701421e-06, "loss": 0.91818583, "num_input_tokens_seen": 46140740, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.30981445, "step": 2135, "time_per_iteration": 2.592489242553711 }, { "auxiliary_loss_clip": 0.06645668, "auxiliary_loss_mlp": 0.0129149, "balance_loss_clip": 0.06317086, "balance_loss_mlp": 0.01260281, "epoch": 0.1284232676987825, "flos": 13667560156800.0, "grad_norm": 2.6950426732140396, "language_loss": 0.78326613, "learning_rate": 3.899240484280298e-06, "loss": 0.8626377, "num_input_tokens_seen": 46156805, "router_z_loss_clip": 3.28710938, "router_z_loss_mlp": 0.31201172, "step": 2136, "time_per_iteration": 2.5711612701416016 }, { "auxiliary_loss_clip": 0.06518874, "auxiliary_loss_mlp": 0.01263711, "balance_loss_clip": 0.06312662, "balance_loss_mlp": 0.01251474, "epoch": 0.12848339095145048, "flos": 60012904337280.0, "grad_norm": 0.8753820320679084, "language_loss": 0.59393358, "learning_rate": 3.899118389839785e-06, "loss": 0.67175937, "num_input_tokens_seen": 46222085, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.12231445, "step": 2137, "time_per_iteration": 3.329263210296631 }, { "auxiliary_loss_clip": 0.06652823, "auxiliary_loss_mlp": 0.0129689, "balance_loss_clip": 0.0631768, "balance_loss_mlp": 0.0126611, "epoch": 0.12854351420411844, "flos": 13886507675520.0, "grad_norm": 5.92829234514399, "language_loss": 0.83670002, "learning_rate": 3.898996223384512e-06, "loss": 0.91619718, "num_input_tokens_seen": 46239970, "router_z_loss_clip": 3.3515625, "router_z_loss_mlp": 0.30786133, "step": 2138, "time_per_iteration": 2.595745086669922 }, { "auxiliary_loss_clip": 0.06658295, "auxiliary_loss_mlp": 0.01303314, "balance_loss_clip": 0.0631889, "balance_loss_mlp": 0.01269744, "epoch": 0.1286036374567864, "flos": 22644534205440.0, "grad_norm": 2.4931486433246204, "language_loss": 0.79455954, "learning_rate": 3.898873984919113e-06, "loss": 0.87417561, "num_input_tokens_seen": 46257740, "router_z_loss_clip": 3.39453125, "router_z_loss_mlp": 0.33569336, "step": 2139, "time_per_iteration": 2.5964252948760986 }, { "auxiliary_loss_clip": 0.06649633, "auxiliary_loss_mlp": 0.01295164, "balance_loss_clip": 0.06318066, "balance_loss_mlp": 0.0126498, "epoch": 0.12866376070945437, "flos": 16330121896320.0, "grad_norm": 3.289160728923572, "language_loss": 0.86036539, "learning_rate": 3.8987516744482215e-06, "loss": 0.93981338, "num_input_tokens_seen": 46275445, "router_z_loss_clip": 3.31640625, "router_z_loss_mlp": 0.30114746, "step": 2140, "time_per_iteration": 2.605328321456909 }, { "auxiliary_loss_clip": 0.06646602, "auxiliary_loss_mlp": 0.01300357, "balance_loss_clip": 0.06318555, "balance_loss_mlp": 0.01271556, "epoch": 0.12872388396212234, "flos": 11879321045760.0, "grad_norm": 3.6446260640106978, "language_loss": 0.87346601, "learning_rate": 3.898629291976476e-06, "loss": 0.95293558, "num_input_tokens_seen": 46291710, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.28808594, "step": 2141, "time_per_iteration": 2.578706741333008 }, { "auxiliary_loss_clip": 0.06659333, "auxiliary_loss_mlp": 0.01298754, "balance_loss_clip": 0.06323049, "balance_loss_mlp": 0.01268236, "epoch": 0.12878400721479033, "flos": 28374331777920.0, "grad_norm": 2.3405745486193563, "language_loss": 0.69754994, "learning_rate": 3.898506837508518e-06, "loss": 0.77713084, "num_input_tokens_seen": 46311335, "router_z_loss_clip": 3.36132812, "router_z_loss_mlp": 0.30493164, "step": 2142, "time_per_iteration": 2.6795918941497803 }, { "auxiliary_loss_clip": 0.06658895, "auxiliary_loss_mlp": 0.01307209, "balance_loss_clip": 0.06317982, "balance_loss_mlp": 0.01274903, "epoch": 0.1288441304674583, "flos": 25892842711680.0, "grad_norm": 2.6167144757266216, "language_loss": 0.84035921, "learning_rate": 3.89838431104899e-06, "loss": 0.92002034, "num_input_tokens_seen": 46330985, "router_z_loss_clip": 3.40820312, "router_z_loss_mlp": 0.32324219, "step": 2143, "time_per_iteration": 2.63285756111145 }, { "auxiliary_loss_clip": 0.06656098, "auxiliary_loss_mlp": 0.01302265, "balance_loss_clip": 0.0631617, "balance_loss_mlp": 0.01268767, "epoch": 0.12890425372012626, "flos": 20820097330560.0, "grad_norm": 2.1319701708809022, "language_loss": 0.82419014, "learning_rate": 3.898261712602539e-06, "loss": 0.90377384, "num_input_tokens_seen": 46351295, "router_z_loss_clip": 3.40234375, "router_z_loss_mlp": 0.33496094, "step": 2144, "time_per_iteration": 2.6219964027404785 }, { "auxiliary_loss_clip": 0.06642935, "auxiliary_loss_mlp": 0.01304433, "balance_loss_clip": 0.06311032, "balance_loss_mlp": 0.0127146, "epoch": 0.12896437697279423, "flos": 22572599875200.0, "grad_norm": 2.387836882476009, "language_loss": 0.8024416, "learning_rate": 3.898139042173813e-06, "loss": 0.88191533, "num_input_tokens_seen": 46368600, "router_z_loss_clip": 3.32421875, "router_z_loss_mlp": 0.3293457, "step": 2145, "time_per_iteration": 2.6193313598632812 }, { "auxiliary_loss_clip": 0.06649332, "auxiliary_loss_mlp": 0.01294807, "balance_loss_clip": 0.06313621, "balance_loss_mlp": 0.01261881, "epoch": 0.1290245002254622, "flos": 17499561004800.0, "grad_norm": 2.5069935660808476, "language_loss": 0.83618438, "learning_rate": 3.898016299767465e-06, "loss": 0.91562581, "num_input_tokens_seen": 46387370, "router_z_loss_clip": 3.35742188, "router_z_loss_mlp": 0.32910156, "step": 2146, "time_per_iteration": 2.5969059467315674 }, { "auxiliary_loss_clip": 0.06651069, "auxiliary_loss_mlp": 0.01294883, "balance_loss_clip": 0.06319144, "balance_loss_mlp": 0.01261957, "epoch": 0.12908462347813016, "flos": 36324142151040.0, "grad_norm": 2.297012901381127, "language_loss": 0.72035474, "learning_rate": 3.897893485388149e-06, "loss": 0.79981422, "num_input_tokens_seen": 46409570, "router_z_loss_clip": 3.31835938, "router_z_loss_mlp": 0.3293457, "step": 2147, "time_per_iteration": 2.7285234928131104 }, { "auxiliary_loss_clip": 0.06649996, "auxiliary_loss_mlp": 0.01291063, "balance_loss_clip": 0.06311303, "balance_loss_mlp": 0.01260689, "epoch": 0.12914474673079815, "flos": 22535312008320.0, "grad_norm": 2.4540855006686595, "language_loss": 0.72907352, "learning_rate": 3.897770599040521e-06, "loss": 0.80848408, "num_input_tokens_seen": 46429320, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.3034668, "step": 2148, "time_per_iteration": 2.594552516937256 }, { "auxiliary_loss_clip": 0.06651918, "auxiliary_loss_mlp": 0.0129802, "balance_loss_clip": 0.06319931, "balance_loss_mlp": 0.01267621, "epoch": 0.12920486998346611, "flos": 21478533114240.0, "grad_norm": 2.5789939222108775, "language_loss": 0.80140853, "learning_rate": 3.897647640729242e-06, "loss": 0.88090789, "num_input_tokens_seen": 46450155, "router_z_loss_clip": 3.31835938, "router_z_loss_mlp": 0.30419922, "step": 2149, "time_per_iteration": 4.05189323425293 }, { "auxiliary_loss_clip": 0.06654231, "auxiliary_loss_mlp": 0.01290368, "balance_loss_clip": 0.06319247, "balance_loss_mlp": 0.01258443, "epoch": 0.12926499323613408, "flos": 27316001583360.0, "grad_norm": 1.9673961392337926, "language_loss": 0.78031766, "learning_rate": 3.897524610458975e-06, "loss": 0.85976362, "num_input_tokens_seen": 46470280, "router_z_loss_clip": 3.34765625, "router_z_loss_mlp": 0.3190918, "step": 2150, "time_per_iteration": 2.6420886516571045 }, { "auxiliary_loss_clip": 0.06659093, "auxiliary_loss_mlp": 0.01308242, "balance_loss_clip": 0.06318897, "balance_loss_mlp": 0.01276771, "epoch": 0.12932511648880204, "flos": 22097710460160.0, "grad_norm": 2.9932410580833873, "language_loss": 0.72125357, "learning_rate": 3.8974015082343835e-06, "loss": 0.80092692, "num_input_tokens_seen": 46487605, "router_z_loss_clip": 3.3984375, "router_z_loss_mlp": 0.31445312, "step": 2151, "time_per_iteration": 4.044545412063599 }, { "auxiliary_loss_clip": 0.06653269, "auxiliary_loss_mlp": 0.01286979, "balance_loss_clip": 0.06316134, "balance_loss_mlp": 0.01256128, "epoch": 0.12938523974147, "flos": 20308968224640.0, "grad_norm": 2.267396905935732, "language_loss": 0.8516506, "learning_rate": 3.897278334060137e-06, "loss": 0.9310531, "num_input_tokens_seen": 46505100, "router_z_loss_clip": 3.37695312, "router_z_loss_mlp": 0.30859375, "step": 2152, "time_per_iteration": 2.6111819744110107 }, { "auxiliary_loss_clip": 0.06649873, "auxiliary_loss_mlp": 0.01289444, "balance_loss_clip": 0.06315651, "balance_loss_mlp": 0.01258521, "epoch": 0.12944536299413797, "flos": 19505992947840.0, "grad_norm": 1.6795880458464898, "language_loss": 0.79620683, "learning_rate": 3.897155087940906e-06, "loss": 0.87559998, "num_input_tokens_seen": 46524020, "router_z_loss_clip": 3.34570312, "router_z_loss_mlp": 0.30908203, "step": 2153, "time_per_iteration": 2.6573259830474854 }, { "auxiliary_loss_clip": 0.06651285, "auxiliary_loss_mlp": 0.01285105, "balance_loss_clip": 0.06317265, "balance_loss_mlp": 0.01255136, "epoch": 0.12950548624680594, "flos": 27715099380480.0, "grad_norm": 1.8012048578351454, "language_loss": 0.81124055, "learning_rate": 3.897031769881364e-06, "loss": 0.89060444, "num_input_tokens_seen": 46544640, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.29980469, "step": 2154, "time_per_iteration": 2.6540896892547607 }, { "auxiliary_loss_clip": 0.06648971, "auxiliary_loss_mlp": 0.01290988, "balance_loss_clip": 0.06317852, "balance_loss_mlp": 0.01261233, "epoch": 0.12956560949947393, "flos": 17571369553920.0, "grad_norm": 2.3388601297716964, "language_loss": 0.85037011, "learning_rate": 3.896908379886188e-06, "loss": 0.92976964, "num_input_tokens_seen": 46561395, "router_z_loss_clip": 3.31054688, "router_z_loss_mlp": 0.29736328, "step": 2155, "time_per_iteration": 3.9949562549591064 }, { "auxiliary_loss_clip": 0.06658676, "auxiliary_loss_mlp": 0.01293, "balance_loss_clip": 0.06317672, "balance_loss_mlp": 0.01262316, "epoch": 0.1296257327521419, "flos": 20746989043200.0, "grad_norm": 2.5226383349856696, "language_loss": 0.78072512, "learning_rate": 3.896784917960055e-06, "loss": 0.86024189, "num_input_tokens_seen": 46579395, "router_z_loss_clip": 3.41210938, "router_z_loss_mlp": 0.30712891, "step": 2156, "time_per_iteration": 4.035828113555908 }, { "auxiliary_loss_clip": 0.06650443, "auxiliary_loss_mlp": 0.01291361, "balance_loss_clip": 0.06319518, "balance_loss_mlp": 0.01260724, "epoch": 0.12968585600480986, "flos": 16400756488320.0, "grad_norm": 1.9768200523335515, "language_loss": 0.87217999, "learning_rate": 3.896661384107648e-06, "loss": 0.95159805, "num_input_tokens_seen": 46597090, "router_z_loss_clip": 3.3125, "router_z_loss_mlp": 0.30664062, "step": 2157, "time_per_iteration": 2.5658509731292725 }, { "auxiliary_loss_clip": 0.06654897, "auxiliary_loss_mlp": 0.012911, "balance_loss_clip": 0.06315389, "balance_loss_mlp": 0.01259438, "epoch": 0.12974597925747783, "flos": 28337043911040.0, "grad_norm": 2.130120930373601, "language_loss": 0.81588864, "learning_rate": 3.896537778333651e-06, "loss": 0.89534867, "num_input_tokens_seen": 46617355, "router_z_loss_clip": 3.39257812, "router_z_loss_mlp": 0.31689453, "step": 2158, "time_per_iteration": 2.6392898559570312 }, { "auxiliary_loss_clip": 0.06651846, "auxiliary_loss_mlp": 0.01303523, "balance_loss_clip": 0.06311579, "balance_loss_mlp": 0.01274245, "epoch": 0.1298061025101458, "flos": 9687036746880.0, "grad_norm": 2.488761284360694, "language_loss": 0.76446968, "learning_rate": 3.896414100642752e-06, "loss": 0.84402335, "num_input_tokens_seen": 46633130, "router_z_loss_clip": 3.40429688, "router_z_loss_mlp": 0.29272461, "step": 2159, "time_per_iteration": 2.5698201656341553 }, { "auxiliary_loss_clip": 0.06636173, "auxiliary_loss_mlp": 0.01291796, "balance_loss_clip": 0.06307851, "balance_loss_mlp": 0.0126185, "epoch": 0.12986622576281376, "flos": 27716986097280.0, "grad_norm": 2.1572198243017056, "language_loss": 0.83316523, "learning_rate": 3.89629035103964e-06, "loss": 0.91244489, "num_input_tokens_seen": 46650575, "router_z_loss_clip": 3.28320312, "router_z_loss_mlp": 0.29907227, "step": 2160, "time_per_iteration": 2.652229070663452 }, { "auxiliary_loss_clip": 0.06632384, "auxiliary_loss_mlp": 0.01288463, "balance_loss_clip": 0.06308892, "balance_loss_mlp": 0.01257516, "epoch": 0.12992634901548175, "flos": 18807963310080.0, "grad_norm": 1.6046276823301038, "language_loss": 0.83440906, "learning_rate": 3.896166529529008e-06, "loss": 0.91361749, "num_input_tokens_seen": 46668780, "router_z_loss_clip": 3.23828125, "router_z_loss_mlp": 0.30908203, "step": 2161, "time_per_iteration": 2.6028802394866943 }, { "auxiliary_loss_clip": 0.06645763, "auxiliary_loss_mlp": 0.01287798, "balance_loss_clip": 0.06315318, "balance_loss_mlp": 0.01256566, "epoch": 0.12998647226814972, "flos": 29134442891520.0, "grad_norm": 2.2993417025877174, "language_loss": 0.83235848, "learning_rate": 3.896042636115551e-06, "loss": 0.91169405, "num_input_tokens_seen": 46687550, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.3125, "step": 2162, "time_per_iteration": 2.6676836013793945 }, { "auxiliary_loss_clip": 0.06653346, "auxiliary_loss_mlp": 0.0128713, "balance_loss_clip": 0.06314065, "balance_loss_mlp": 0.01257137, "epoch": 0.13004659552081768, "flos": 19579855921920.0, "grad_norm": 2.572412515601701, "language_loss": 0.74275589, "learning_rate": 3.895918670803968e-06, "loss": 0.8221606, "num_input_tokens_seen": 46706730, "router_z_loss_clip": 3.39453125, "router_z_loss_mlp": 0.29980469, "step": 2163, "time_per_iteration": 2.5764896869659424 }, { "auxiliary_loss_clip": 0.06652612, "auxiliary_loss_mlp": 0.01289725, "balance_loss_clip": 0.06315242, "balance_loss_mlp": 0.01258897, "epoch": 0.13010671877348565, "flos": 22497059819520.0, "grad_norm": 2.327685359134474, "language_loss": 0.82429326, "learning_rate": 3.895794633598958e-06, "loss": 0.90371668, "num_input_tokens_seen": 46724250, "router_z_loss_clip": 3.37109375, "router_z_loss_mlp": 0.30834961, "step": 2164, "time_per_iteration": 2.6097941398620605 }, { "auxiliary_loss_clip": 0.06650669, "auxiliary_loss_mlp": 0.0128647, "balance_loss_clip": 0.06313838, "balance_loss_mlp": 0.01255904, "epoch": 0.1301668420261536, "flos": 23884985249280.0, "grad_norm": 2.3061499427220173, "language_loss": 0.72988331, "learning_rate": 3.8956705245052256e-06, "loss": 0.80925471, "num_input_tokens_seen": 46744105, "router_z_loss_clip": 3.3671875, "router_z_loss_mlp": 0.3059082, "step": 2165, "time_per_iteration": 2.615105390548706 }, { "auxiliary_loss_clip": 0.06665786, "auxiliary_loss_mlp": 0.01287182, "balance_loss_clip": 0.06323749, "balance_loss_mlp": 0.01254447, "epoch": 0.13022696527882158, "flos": 23156963049600.0, "grad_norm": 1.9057031538503388, "language_loss": 0.76001763, "learning_rate": 3.8955463435274765e-06, "loss": 0.83954728, "num_input_tokens_seen": 46764250, "router_z_loss_clip": 3.41992188, "router_z_loss_mlp": 0.32727051, "step": 2166, "time_per_iteration": 2.603456497192383 }, { "auxiliary_loss_clip": 0.06653428, "auxiliary_loss_mlp": 0.01287832, "balance_loss_clip": 0.06316885, "balance_loss_mlp": 0.01257649, "epoch": 0.13028708853148954, "flos": 26916149099520.0, "grad_norm": 1.628773584023069, "language_loss": 0.84166449, "learning_rate": 3.895422090670421e-06, "loss": 0.92107707, "num_input_tokens_seen": 46786865, "router_z_loss_clip": 3.36914062, "router_z_loss_mlp": 0.30200195, "step": 2167, "time_per_iteration": 2.6379201412200928 }, { "auxiliary_loss_clip": 0.06645852, "auxiliary_loss_mlp": 0.01292249, "balance_loss_clip": 0.06316822, "balance_loss_mlp": 0.01260444, "epoch": 0.13034721178415754, "flos": 21257824659840.0, "grad_norm": 3.0565096756930044, "language_loss": 0.84729427, "learning_rate": 3.89529776593877e-06, "loss": 0.92667526, "num_input_tokens_seen": 46807030, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.31762695, "step": 2168, "time_per_iteration": 2.616518259048462 }, { "auxiliary_loss_clip": 0.06655989, "auxiliary_loss_mlp": 0.01291634, "balance_loss_clip": 0.06319959, "balance_loss_mlp": 0.01261021, "epoch": 0.1304073350368255, "flos": 18772646014080.0, "grad_norm": 2.7419379061455738, "language_loss": 0.81108958, "learning_rate": 3.8951733693372375e-06, "loss": 0.89056581, "num_input_tokens_seen": 46826280, "router_z_loss_clip": 3.36328125, "router_z_loss_mlp": 0.30603027, "step": 2169, "time_per_iteration": 2.590477705001831 }, { "auxiliary_loss_clip": 0.06660919, "auxiliary_loss_mlp": 0.01287046, "balance_loss_clip": 0.06324854, "balance_loss_mlp": 0.01254693, "epoch": 0.13046745828949347, "flos": 28371941936640.0, "grad_norm": 2.860836375964924, "language_loss": 0.69902045, "learning_rate": 3.8950489008705406e-06, "loss": 0.77850008, "num_input_tokens_seen": 46846505, "router_z_loss_clip": 3.359375, "router_z_loss_mlp": 0.32348633, "step": 2170, "time_per_iteration": 2.681915760040283 }, { "auxiliary_loss_clip": 0.06652927, "auxiliary_loss_mlp": 0.01286232, "balance_loss_clip": 0.06321403, "balance_loss_mlp": 0.01257764, "epoch": 0.13052758154216143, "flos": 29612518761600.0, "grad_norm": 2.369797862936806, "language_loss": 0.68171978, "learning_rate": 3.8949243605434e-06, "loss": 0.76111138, "num_input_tokens_seen": 46867380, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.28515625, "step": 2171, "time_per_iteration": 2.651721239089966 }, { "auxiliary_loss_clip": 0.06661142, "auxiliary_loss_mlp": 0.0129286, "balance_loss_clip": 0.06325538, "balance_loss_mlp": 0.01261126, "epoch": 0.1305877047948294, "flos": 19396938458880.0, "grad_norm": 2.032261677327073, "language_loss": 0.73807323, "learning_rate": 3.894799748360537e-06, "loss": 0.8176133, "num_input_tokens_seen": 46886810, "router_z_loss_clip": 3.359375, "router_z_loss_mlp": 0.31689453, "step": 2172, "time_per_iteration": 2.5868470668792725 }, { "auxiliary_loss_clip": 0.06656741, "auxiliary_loss_mlp": 0.01287014, "balance_loss_clip": 0.06329897, "balance_loss_mlp": 0.01258081, "epoch": 0.13064782804749736, "flos": 16879209701760.0, "grad_norm": 3.663593604334122, "language_loss": 0.76919717, "learning_rate": 3.894675064326678e-06, "loss": 0.84863466, "num_input_tokens_seen": 46905620, "router_z_loss_clip": 3.265625, "router_z_loss_mlp": 0.28918457, "step": 2173, "time_per_iteration": 2.5722827911376953 }, { "auxiliary_loss_clip": 0.06668784, "auxiliary_loss_mlp": 0.0129077, "balance_loss_clip": 0.06329011, "balance_loss_mlp": 0.0125875, "epoch": 0.13070795130016533, "flos": 24506049312000.0, "grad_norm": 2.3125439553548492, "language_loss": 0.72354364, "learning_rate": 3.894550308446551e-06, "loss": 0.80313921, "num_input_tokens_seen": 46925120, "router_z_loss_clip": 3.39648438, "router_z_loss_mlp": 0.32006836, "step": 2174, "time_per_iteration": 2.615844488143921 }, { "auxiliary_loss_clip": 0.06556276, "auxiliary_loss_mlp": 0.01302143, "balance_loss_clip": 0.06352093, "balance_loss_mlp": 0.01288363, "epoch": 0.13076807455283332, "flos": 71075288401920.0, "grad_norm": 0.7751974409814449, "language_loss": 0.58759427, "learning_rate": 3.894425480724886e-06, "loss": 0.66617846, "num_input_tokens_seen": 46988195, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.13818359, "step": 2175, "time_per_iteration": 3.366513729095459 }, { "auxiliary_loss_clip": 0.06667797, "auxiliary_loss_mlp": 0.01296329, "balance_loss_clip": 0.06334398, "balance_loss_mlp": 0.012645, "epoch": 0.13082819780550128, "flos": 20270380619520.0, "grad_norm": 2.264354779228682, "language_loss": 0.80949211, "learning_rate": 3.894300581166417e-06, "loss": 0.88913345, "num_input_tokens_seen": 47004720, "router_z_loss_clip": 3.33398438, "router_z_loss_mlp": 0.31835938, "step": 2176, "time_per_iteration": 2.613854169845581 }, { "auxiliary_loss_clip": 0.06666317, "auxiliary_loss_mlp": 0.01288461, "balance_loss_clip": 0.06333156, "balance_loss_mlp": 0.01257348, "epoch": 0.13088832105816925, "flos": 34211884101120.0, "grad_norm": 1.8907092853578351, "language_loss": 0.75849044, "learning_rate": 3.894175609775881e-06, "loss": 0.83803821, "num_input_tokens_seen": 47024255, "router_z_loss_clip": 3.33007812, "router_z_loss_mlp": 0.31103516, "step": 2177, "time_per_iteration": 2.698428153991699 }, { "auxiliary_loss_clip": 0.06671502, "auxiliary_loss_mlp": 0.01292116, "balance_loss_clip": 0.06340569, "balance_loss_mlp": 0.01259095, "epoch": 0.13094844431083721, "flos": 17900797080960.0, "grad_norm": 2.2441479307022143, "language_loss": 0.83320332, "learning_rate": 3.894050566558015e-06, "loss": 0.91283941, "num_input_tokens_seen": 47042465, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.33007812, "step": 2178, "time_per_iteration": 2.563239336013794 }, { "auxiliary_loss_clip": 0.06668338, "auxiliary_loss_mlp": 0.01288204, "balance_loss_clip": 0.06335298, "balance_loss_mlp": 0.01256447, "epoch": 0.13100856756350518, "flos": 17317062812160.0, "grad_norm": 2.7989121668418537, "language_loss": 0.7586987, "learning_rate": 3.893925451517562e-06, "loss": 0.83826417, "num_input_tokens_seen": 47060370, "router_z_loss_clip": 3.33007812, "router_z_loss_mlp": 0.31762695, "step": 2179, "time_per_iteration": 2.5634965896606445 }, { "auxiliary_loss_clip": 0.06658857, "auxiliary_loss_mlp": 0.01293576, "balance_loss_clip": 0.06336895, "balance_loss_mlp": 0.01263559, "epoch": 0.13106869081617314, "flos": 22207142292480.0, "grad_norm": 2.3171849716236532, "language_loss": 0.85446107, "learning_rate": 3.893800264659266e-06, "loss": 0.93398547, "num_input_tokens_seen": 47081415, "router_z_loss_clip": 3.22070312, "router_z_loss_mlp": 0.30029297, "step": 2180, "time_per_iteration": 2.6069674491882324 }, { "auxiliary_loss_clip": 0.0666893, "auxiliary_loss_mlp": 0.01290331, "balance_loss_clip": 0.06340741, "balance_loss_mlp": 0.01260791, "epoch": 0.13112881406884114, "flos": 21769708452480.0, "grad_norm": 1.8517972798377353, "language_loss": 0.9080494, "learning_rate": 3.8936750059878746e-06, "loss": 0.98764199, "num_input_tokens_seen": 47099860, "router_z_loss_clip": 3.28125, "router_z_loss_mlp": 0.29541016, "step": 2181, "time_per_iteration": 2.6869521141052246 }, { "auxiliary_loss_clip": 0.06667092, "auxiliary_loss_mlp": 0.01290944, "balance_loss_clip": 0.06334066, "balance_loss_mlp": 0.01260069, "epoch": 0.1311889373215091, "flos": 23337784160640.0, "grad_norm": 2.4341169995842473, "language_loss": 0.71065688, "learning_rate": 3.893549675508137e-06, "loss": 0.79023725, "num_input_tokens_seen": 47118540, "router_z_loss_clip": 3.33007812, "router_z_loss_mlp": 0.30908203, "step": 2182, "time_per_iteration": 2.6479387283325195 }, { "auxiliary_loss_clip": 0.0668165, "auxiliary_loss_mlp": 0.0129408, "balance_loss_clip": 0.06342609, "balance_loss_mlp": 0.01262513, "epoch": 0.13124906057417707, "flos": 21473250307200.0, "grad_norm": 2.7520340617667234, "language_loss": 0.79232091, "learning_rate": 3.893424273224806e-06, "loss": 0.87207818, "num_input_tokens_seen": 47136710, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.31567383, "step": 2183, "time_per_iteration": 2.590818166732788 }, { "auxiliary_loss_clip": 0.06653181, "auxiliary_loss_mlp": 0.01292163, "balance_loss_clip": 0.06325489, "balance_loss_mlp": 0.01262313, "epoch": 0.13130918382684503, "flos": 23261531345280.0, "grad_norm": 17.987800682674642, "language_loss": 0.86405122, "learning_rate": 3.893298799142636e-06, "loss": 0.94350469, "num_input_tokens_seen": 47157155, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.29833984, "step": 2184, "time_per_iteration": 2.6124908924102783 }, { "auxiliary_loss_clip": 0.06668615, "auxiliary_loss_mlp": 0.01291724, "balance_loss_clip": 0.06334969, "balance_loss_mlp": 0.01261135, "epoch": 0.131369307079513, "flos": 20856588583680.0, "grad_norm": 2.46259137543141, "language_loss": 0.83327305, "learning_rate": 3.893173253266387e-06, "loss": 0.91287649, "num_input_tokens_seen": 47176820, "router_z_loss_clip": 3.33203125, "router_z_loss_mlp": 0.30615234, "step": 2185, "time_per_iteration": 2.6830925941467285 }, { "auxiliary_loss_clip": 0.06650031, "auxiliary_loss_mlp": 0.01306128, "balance_loss_clip": 0.06318167, "balance_loss_mlp": 0.01274704, "epoch": 0.13142943033218096, "flos": 17864138119680.0, "grad_norm": 2.3841801376102456, "language_loss": 0.74378645, "learning_rate": 3.893047635600818e-06, "loss": 0.82334805, "num_input_tokens_seen": 47195855, "router_z_loss_clip": 3.31640625, "router_z_loss_mlp": 0.31396484, "step": 2186, "time_per_iteration": 2.5751583576202393 }, { "auxiliary_loss_clip": 0.06658599, "auxiliary_loss_mlp": 0.01314605, "balance_loss_clip": 0.06329136, "balance_loss_mlp": 0.01280845, "epoch": 0.13148955358484893, "flos": 21002343960960.0, "grad_norm": 3.860999227871164, "language_loss": 0.81613976, "learning_rate": 3.892921946150693e-06, "loss": 0.89587176, "num_input_tokens_seen": 47214535, "router_z_loss_clip": 3.29101562, "router_z_loss_mlp": 0.33740234, "step": 2187, "time_per_iteration": 2.6038947105407715 }, { "auxiliary_loss_clip": 0.06509157, "auxiliary_loss_mlp": 0.01324163, "balance_loss_clip": 0.06311134, "balance_loss_mlp": 0.01311444, "epoch": 0.13154967683751692, "flos": 70192035313920.0, "grad_norm": 0.8073319514706689, "language_loss": 0.58757174, "learning_rate": 3.892796184920778e-06, "loss": 0.665905, "num_input_tokens_seen": 47270300, "router_z_loss_clip": 1.984375, "router_z_loss_mlp": 0.12731934, "step": 2188, "time_per_iteration": 4.650205850601196 }, { "auxiliary_loss_clip": 0.06644315, "auxiliary_loss_mlp": 0.01325369, "balance_loss_clip": 0.06320122, "balance_loss_mlp": 0.01293111, "epoch": 0.1316098000901849, "flos": 20382411928320.0, "grad_norm": 1.8858126069832104, "language_loss": 0.75209635, "learning_rate": 3.892670351915842e-06, "loss": 0.83179319, "num_input_tokens_seen": 47290720, "router_z_loss_clip": 3.24414062, "router_z_loss_mlp": 0.32275391, "step": 2189, "time_per_iteration": 2.615147829055786 }, { "auxiliary_loss_clip": 0.06645817, "auxiliary_loss_mlp": 0.01318459, "balance_loss_clip": 0.06316431, "balance_loss_mlp": 0.01284651, "epoch": 0.13166992334285285, "flos": 23227723422720.0, "grad_norm": 19.793486076609863, "language_loss": 0.72940445, "learning_rate": 3.892544447140657e-06, "loss": 0.80904716, "num_input_tokens_seen": 47311820, "router_z_loss_clip": 3.29492188, "router_z_loss_mlp": 0.33813477, "step": 2190, "time_per_iteration": 2.626457452774048 }, { "auxiliary_loss_clip": 0.06643681, "auxiliary_loss_mlp": 0.01302974, "balance_loss_clip": 0.06315855, "balance_loss_mlp": 0.01272004, "epoch": 0.13173004659552082, "flos": 23337616452480.0, "grad_norm": 2.3905119592107127, "language_loss": 0.75542271, "learning_rate": 3.892418470599996e-06, "loss": 0.83488923, "num_input_tokens_seen": 47331605, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.30932617, "step": 2191, "time_per_iteration": 4.029491662979126 }, { "auxiliary_loss_clip": 0.06655753, "auxiliary_loss_mlp": 0.01300481, "balance_loss_clip": 0.06319224, "balance_loss_mlp": 0.01265576, "epoch": 0.13179016984818878, "flos": 21257866586880.0, "grad_norm": 1.934307134090181, "language_loss": 0.79828274, "learning_rate": 3.892292422298637e-06, "loss": 0.87784505, "num_input_tokens_seen": 47350455, "router_z_loss_clip": 3.36132812, "router_z_loss_mlp": 0.34887695, "step": 2192, "time_per_iteration": 2.581678867340088 }, { "auxiliary_loss_clip": 0.06652547, "auxiliary_loss_mlp": 0.01311371, "balance_loss_clip": 0.0632004, "balance_loss_mlp": 0.01278303, "epoch": 0.13185029310085675, "flos": 17783357184000.0, "grad_norm": 1.8863607305911154, "language_loss": 0.86354637, "learning_rate": 3.892166302241361e-06, "loss": 0.94318551, "num_input_tokens_seen": 47368225, "router_z_loss_clip": 3.328125, "router_z_loss_mlp": 0.33056641, "step": 2193, "time_per_iteration": 2.5970799922943115 }, { "auxiliary_loss_clip": 0.06494874, "auxiliary_loss_mlp": 0.0128281, "balance_loss_clip": 0.06297364, "balance_loss_mlp": 0.01270901, "epoch": 0.1319104163535247, "flos": 69872586422400.0, "grad_norm": 0.7284369206085407, "language_loss": 0.54015988, "learning_rate": 3.8920401104329475e-06, "loss": 0.61793673, "num_input_tokens_seen": 47427125, "router_z_loss_clip": 1.97265625, "router_z_loss_mlp": 0.11901855, "step": 2194, "time_per_iteration": 4.541178464889526 }, { "auxiliary_loss_clip": 0.06641771, "auxiliary_loss_mlp": 0.01296825, "balance_loss_clip": 0.06313748, "balance_loss_mlp": 0.0126521, "epoch": 0.1319705396061927, "flos": 25200305516160.0, "grad_norm": 1.833048275320002, "language_loss": 0.73704731, "learning_rate": 3.891913846878185e-06, "loss": 0.81643331, "num_input_tokens_seen": 47450275, "router_z_loss_clip": 3.27734375, "router_z_loss_mlp": 0.31616211, "step": 2195, "time_per_iteration": 2.6483042240142822 }, { "auxiliary_loss_clip": 0.06654826, "auxiliary_loss_mlp": 0.01291462, "balance_loss_clip": 0.06314928, "balance_loss_mlp": 0.01259371, "epoch": 0.13203066285886067, "flos": 20746695553920.0, "grad_norm": 1.9241262945783473, "language_loss": 0.79202163, "learning_rate": 3.891787511581859e-06, "loss": 0.87148452, "num_input_tokens_seen": 47469155, "router_z_loss_clip": 3.3984375, "router_z_loss_mlp": 0.32104492, "step": 2196, "time_per_iteration": 3.969031572341919 }, { "auxiliary_loss_clip": 0.06653547, "auxiliary_loss_mlp": 0.01291449, "balance_loss_clip": 0.06313545, "balance_loss_mlp": 0.01260693, "epoch": 0.13209078611152864, "flos": 22060925717760.0, "grad_norm": 2.452069031433634, "language_loss": 0.75967753, "learning_rate": 3.89166110454876e-06, "loss": 0.83912754, "num_input_tokens_seen": 47488405, "router_z_loss_clip": 3.39648438, "router_z_loss_mlp": 0.30737305, "step": 2197, "time_per_iteration": 2.5934393405914307 }, { "auxiliary_loss_clip": 0.06651574, "auxiliary_loss_mlp": 0.01290326, "balance_loss_clip": 0.0631271, "balance_loss_mlp": 0.01257591, "epoch": 0.1321509093641966, "flos": 16289731428480.0, "grad_norm": 2.326892305224954, "language_loss": 0.81433266, "learning_rate": 3.891534625783685e-06, "loss": 0.89375168, "num_input_tokens_seen": 47505650, "router_z_loss_clip": 3.38671875, "router_z_loss_mlp": 0.32739258, "step": 2198, "time_per_iteration": 2.568772315979004 }, { "auxiliary_loss_clip": 0.06647719, "auxiliary_loss_mlp": 0.01289548, "balance_loss_clip": 0.06316163, "balance_loss_mlp": 0.01258053, "epoch": 0.13221103261686457, "flos": 16988725388160.0, "grad_norm": 2.9007259858247276, "language_loss": 0.84236705, "learning_rate": 3.891408075291425e-06, "loss": 0.9217397, "num_input_tokens_seen": 47521540, "router_z_loss_clip": 3.31640625, "router_z_loss_mlp": 0.31494141, "step": 2199, "time_per_iteration": 2.5964837074279785 }, { "auxiliary_loss_clip": 0.0664039, "auxiliary_loss_mlp": 0.01289431, "balance_loss_clip": 0.06312728, "balance_loss_mlp": 0.01256792, "epoch": 0.13227115586953253, "flos": 34240996195200.0, "grad_norm": 2.5359288692859816, "language_loss": 0.70744622, "learning_rate": 3.8912814530767826e-06, "loss": 0.78674442, "num_input_tokens_seen": 47543625, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.32641602, "step": 2200, "time_per_iteration": 2.720669746398926 }, { "auxiliary_loss_clip": 0.06635808, "auxiliary_loss_mlp": 0.01294619, "balance_loss_clip": 0.06313781, "balance_loss_mlp": 0.01265222, "epoch": 0.13233127912220052, "flos": 20711000914560.0, "grad_norm": 3.250592141035469, "language_loss": 0.85879028, "learning_rate": 3.891154759144557e-06, "loss": 0.93809462, "num_input_tokens_seen": 47563740, "router_z_loss_clip": 3.22265625, "router_z_loss_mlp": 0.29418945, "step": 2201, "time_per_iteration": 2.615795135498047 }, { "auxiliary_loss_clip": 0.06648582, "auxiliary_loss_mlp": 0.01290469, "balance_loss_clip": 0.06313038, "balance_loss_mlp": 0.01258139, "epoch": 0.1323914023748685, "flos": 25810971672960.0, "grad_norm": 1.9616246342223536, "language_loss": 0.88221478, "learning_rate": 3.891027993499554e-06, "loss": 0.96160519, "num_input_tokens_seen": 47582655, "router_z_loss_clip": 3.35546875, "router_z_loss_mlp": 0.32299805, "step": 2202, "time_per_iteration": 2.6523184776306152 }, { "auxiliary_loss_clip": 0.06648468, "auxiliary_loss_mlp": 0.01289442, "balance_loss_clip": 0.0631742, "balance_loss_mlp": 0.01257887, "epoch": 0.13245152562753645, "flos": 21257908513920.0, "grad_norm": 2.0536841024399535, "language_loss": 0.73689562, "learning_rate": 3.89090115614658e-06, "loss": 0.81627476, "num_input_tokens_seen": 47600875, "router_z_loss_clip": 3.3125, "router_z_loss_mlp": 0.31555176, "step": 2203, "time_per_iteration": 2.607921600341797 }, { "auxiliary_loss_clip": 0.06647663, "auxiliary_loss_mlp": 0.0128955, "balance_loss_clip": 0.06316672, "balance_loss_mlp": 0.01258937, "epoch": 0.13251164888020442, "flos": 26617552675200.0, "grad_norm": 4.017000026142092, "language_loss": 0.74743706, "learning_rate": 3.890774247090444e-06, "loss": 0.82680917, "num_input_tokens_seen": 47619250, "router_z_loss_clip": 3.30859375, "router_z_loss_mlp": 0.3059082, "step": 2204, "time_per_iteration": 2.6475489139556885 }, { "auxiliary_loss_clip": 0.0664769, "auxiliary_loss_mlp": 0.01295828, "balance_loss_clip": 0.06318369, "balance_loss_mlp": 0.01263856, "epoch": 0.13257177213287238, "flos": 29834485027200.0, "grad_norm": 4.44008087609111, "language_loss": 0.79476535, "learning_rate": 3.89064726633596e-06, "loss": 0.87420052, "num_input_tokens_seen": 47639445, "router_z_loss_clip": 3.29492188, "router_z_loss_mlp": 0.31982422, "step": 2205, "time_per_iteration": 2.661792039871216 }, { "auxiliary_loss_clip": 0.06639123, "auxiliary_loss_mlp": 0.01298715, "balance_loss_clip": 0.06312895, "balance_loss_mlp": 0.01266791, "epoch": 0.13263189538554035, "flos": 21294902891520.0, "grad_norm": 2.0972629502371656, "language_loss": 0.80850184, "learning_rate": 3.890520213887941e-06, "loss": 0.88788015, "num_input_tokens_seen": 47658740, "router_z_loss_clip": 3.26367188, "router_z_loss_mlp": 0.31933594, "step": 2206, "time_per_iteration": 2.6411712169647217 }, { "auxiliary_loss_clip": 0.06651797, "auxiliary_loss_mlp": 0.01294488, "balance_loss_clip": 0.06321478, "balance_loss_mlp": 0.01264114, "epoch": 0.13269201863820831, "flos": 16879880534400.0, "grad_norm": 2.558005402896444, "language_loss": 0.75766361, "learning_rate": 3.890393089751208e-06, "loss": 0.83712643, "num_input_tokens_seen": 47676880, "router_z_loss_clip": 3.29882812, "router_z_loss_mlp": 0.30371094, "step": 2207, "time_per_iteration": 2.565460205078125 }, { "auxiliary_loss_clip": 0.06637227, "auxiliary_loss_mlp": 0.01285778, "balance_loss_clip": 0.06312844, "balance_loss_mlp": 0.01256739, "epoch": 0.1327521418908763, "flos": 23775679198080.0, "grad_norm": 1.964870859377886, "language_loss": 0.84994465, "learning_rate": 3.890265893930578e-06, "loss": 0.92917472, "num_input_tokens_seen": 47696635, "router_z_loss_clip": 3.24609375, "router_z_loss_mlp": 0.29052734, "step": 2208, "time_per_iteration": 2.598996877670288 }, { "auxiliary_loss_clip": 0.06630406, "auxiliary_loss_mlp": 0.01298945, "balance_loss_clip": 0.06313878, "balance_loss_mlp": 0.01270585, "epoch": 0.13281226514354427, "flos": 26512858598400.0, "grad_norm": 1.7402235803976283, "language_loss": 0.86518002, "learning_rate": 3.890138626430876e-06, "loss": 0.94447351, "num_input_tokens_seen": 47717760, "router_z_loss_clip": 3.1640625, "router_z_loss_mlp": 0.28356934, "step": 2209, "time_per_iteration": 2.635894775390625 }, { "auxiliary_loss_clip": 0.06656629, "auxiliary_loss_mlp": 0.0129361, "balance_loss_clip": 0.06327051, "balance_loss_mlp": 0.01264166, "epoch": 0.13287238839621224, "flos": 24505671968640.0, "grad_norm": 2.4444301943860065, "language_loss": 0.83248889, "learning_rate": 3.890011287256929e-06, "loss": 0.91199124, "num_input_tokens_seen": 47737685, "router_z_loss_clip": 3.29882812, "router_z_loss_mlp": 0.29443359, "step": 2210, "time_per_iteration": 2.647632360458374 }, { "auxiliary_loss_clip": 0.06525309, "auxiliary_loss_mlp": 0.01275898, "balance_loss_clip": 0.06326912, "balance_loss_mlp": 0.01263655, "epoch": 0.1329325116488802, "flos": 67713984264960.0, "grad_norm": 0.7512008201688825, "language_loss": 0.58055949, "learning_rate": 3.889883876413563e-06, "loss": 0.65857154, "num_input_tokens_seen": 47802415, "router_z_loss_clip": 1.984375, "router_z_loss_mlp": 0.12225342, "step": 2211, "time_per_iteration": 3.3171684741973877 }, { "auxiliary_loss_clip": 0.06523456, "auxiliary_loss_mlp": 0.01265396, "balance_loss_clip": 0.06325866, "balance_loss_mlp": 0.0125183, "epoch": 0.13299263490154817, "flos": 72283440896640.0, "grad_norm": 0.7663717133975608, "language_loss": 0.54966056, "learning_rate": 3.889756393905611e-06, "loss": 0.62754905, "num_input_tokens_seen": 47871485, "router_z_loss_clip": 1.97363281, "router_z_loss_mlp": 0.13598633, "step": 2212, "time_per_iteration": 3.2891740798950195 }, { "auxiliary_loss_clip": 0.06655736, "auxiliary_loss_mlp": 0.01291579, "balance_loss_clip": 0.06323216, "balance_loss_mlp": 0.01260656, "epoch": 0.13305275815421613, "flos": 17937078698880.0, "grad_norm": 2.292594494132916, "language_loss": 0.7610147, "learning_rate": 3.889628839737908e-06, "loss": 0.84048778, "num_input_tokens_seen": 47888315, "router_z_loss_clip": 3.32421875, "router_z_loss_mlp": 0.30883789, "step": 2213, "time_per_iteration": 2.5859363079071045 }, { "auxiliary_loss_clip": 0.06640226, "auxiliary_loss_mlp": 0.01295203, "balance_loss_clip": 0.06318632, "balance_loss_mlp": 0.01265329, "epoch": 0.13311288140688413, "flos": 22346566686720.0, "grad_norm": 1.842079994884463, "language_loss": 0.80360681, "learning_rate": 3.889501213915291e-06, "loss": 0.88296115, "num_input_tokens_seen": 47906600, "router_z_loss_clip": 3.21679688, "router_z_loss_mlp": 0.29858398, "step": 2214, "time_per_iteration": 2.678612470626831 }, { "auxiliary_loss_clip": 0.06652616, "auxiliary_loss_mlp": 0.01285904, "balance_loss_clip": 0.06325643, "balance_loss_mlp": 0.01254671, "epoch": 0.1331730046595521, "flos": 31877030880000.0, "grad_norm": 2.527617953229609, "language_loss": 0.70108044, "learning_rate": 3.889373516442597e-06, "loss": 0.7804656, "num_input_tokens_seen": 47927630, "router_z_loss_clip": 3.26757812, "router_z_loss_mlp": 0.3125, "step": 2215, "time_per_iteration": 2.692718505859375 }, { "auxiliary_loss_clip": 0.06656729, "auxiliary_loss_mlp": 0.01289186, "balance_loss_clip": 0.06326686, "balance_loss_mlp": 0.01257834, "epoch": 0.13323312791222006, "flos": 22573438416000.0, "grad_norm": 2.0789927604770626, "language_loss": 0.82134366, "learning_rate": 3.889245747324671e-06, "loss": 0.90080285, "num_input_tokens_seen": 47947935, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.31347656, "step": 2216, "time_per_iteration": 2.6349222660064697 }, { "auxiliary_loss_clip": 0.06659051, "auxiliary_loss_mlp": 0.01291142, "balance_loss_clip": 0.06328358, "balance_loss_mlp": 0.01260386, "epoch": 0.13329325116488802, "flos": 15090635174400.0, "grad_norm": 2.418821269122473, "language_loss": 0.87998152, "learning_rate": 3.889117906566356e-06, "loss": 0.95948344, "num_input_tokens_seen": 47965515, "router_z_loss_clip": 3.30664062, "router_z_loss_mlp": 0.30786133, "step": 2217, "time_per_iteration": 2.589515447616577 }, { "auxiliary_loss_clip": 0.06645657, "auxiliary_loss_mlp": 0.01293425, "balance_loss_clip": 0.0632371, "balance_loss_mlp": 0.01263169, "epoch": 0.133353374417556, "flos": 27461002273920.0, "grad_norm": 2.6949038600749575, "language_loss": 0.74712747, "learning_rate": 3.888989994172501e-06, "loss": 0.8265183, "num_input_tokens_seen": 47985675, "router_z_loss_clip": 3.21875, "router_z_loss_mlp": 0.30249023, "step": 2218, "time_per_iteration": 2.630105495452881 }, { "auxiliary_loss_clip": 0.06663436, "auxiliary_loss_mlp": 0.0129199, "balance_loss_clip": 0.0633011, "balance_loss_mlp": 0.01259828, "epoch": 0.13341349767022395, "flos": 24101081729280.0, "grad_norm": 1.802325757265264, "language_loss": 0.88444072, "learning_rate": 3.8888620101479565e-06, "loss": 0.96399498, "num_input_tokens_seen": 48004985, "router_z_loss_clip": 3.33398438, "router_z_loss_mlp": 0.32177734, "step": 2219, "time_per_iteration": 2.6141817569732666 }, { "auxiliary_loss_clip": 0.06643952, "auxiliary_loss_mlp": 0.01288585, "balance_loss_clip": 0.0632049, "balance_loss_mlp": 0.01258234, "epoch": 0.13347362092289192, "flos": 24140088604800.0, "grad_norm": 1.5203493140027595, "language_loss": 0.77850664, "learning_rate": 3.888733954497574e-06, "loss": 0.85783195, "num_input_tokens_seen": 48024965, "router_z_loss_clip": 3.234375, "router_z_loss_mlp": 0.30371094, "step": 2220, "time_per_iteration": 2.6075220108032227 }, { "auxiliary_loss_clip": 0.06651217, "auxiliary_loss_mlp": 0.01299184, "balance_loss_clip": 0.06327267, "balance_loss_mlp": 0.01269406, "epoch": 0.1335337441755599, "flos": 18441499478400.0, "grad_norm": 2.498998280384867, "language_loss": 0.80458468, "learning_rate": 3.888605827226212e-06, "loss": 0.8840887, "num_input_tokens_seen": 48040890, "router_z_loss_clip": 3.2421875, "router_z_loss_mlp": 0.29785156, "step": 2221, "time_per_iteration": 2.566829204559326 }, { "auxiliary_loss_clip": 0.06515221, "auxiliary_loss_mlp": 0.01279324, "balance_loss_clip": 0.06321517, "balance_loss_mlp": 0.01268333, "epoch": 0.13359386742822787, "flos": 50627608542720.0, "grad_norm": 0.9618706701732972, "language_loss": 0.69213259, "learning_rate": 3.8884776283387275e-06, "loss": 0.77007806, "num_input_tokens_seen": 48091855, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.10998535, "step": 2222, "time_per_iteration": 3.0535356998443604 }, { "auxiliary_loss_clip": 0.06649515, "auxiliary_loss_mlp": 0.01290591, "balance_loss_clip": 0.06329323, "balance_loss_mlp": 0.01262899, "epoch": 0.13365399068089584, "flos": 22784294016000.0, "grad_norm": 2.0282729343061288, "language_loss": 0.68629527, "learning_rate": 3.888349357839982e-06, "loss": 0.76569629, "num_input_tokens_seen": 48111350, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.2767334, "step": 2223, "time_per_iteration": 2.622174024581909 }, { "auxiliary_loss_clip": 0.06656332, "auxiliary_loss_mlp": 0.01293201, "balance_loss_clip": 0.06331104, "balance_loss_mlp": 0.01262027, "epoch": 0.1337141139335638, "flos": 12536540945280.0, "grad_norm": 1.9898587003403299, "language_loss": 0.83710206, "learning_rate": 3.88822101573484e-06, "loss": 0.91659737, "num_input_tokens_seen": 48129840, "router_z_loss_clip": 3.25195312, "router_z_loss_mlp": 0.31188965, "step": 2224, "time_per_iteration": 2.570305109024048 }, { "auxiliary_loss_clip": 0.06669085, "auxiliary_loss_mlp": 0.01288859, "balance_loss_clip": 0.06333357, "balance_loss_mlp": 0.01255528, "epoch": 0.13377423718623177, "flos": 23045560646400.0, "grad_norm": 1.9806770131408802, "language_loss": 0.67848361, "learning_rate": 3.888092602028167e-06, "loss": 0.75806302, "num_input_tokens_seen": 48149240, "router_z_loss_clip": 3.35546875, "router_z_loss_mlp": 0.33325195, "step": 2225, "time_per_iteration": 2.598600387573242 }, { "auxiliary_loss_clip": 0.06656678, "auxiliary_loss_mlp": 0.01288664, "balance_loss_clip": 0.06328507, "balance_loss_mlp": 0.01260674, "epoch": 0.13383436043889974, "flos": 16221905948160.0, "grad_norm": 2.9262555121127622, "language_loss": 0.9095937, "learning_rate": 3.887964116724835e-06, "loss": 0.98904717, "num_input_tokens_seen": 48166330, "router_z_loss_clip": 3.28125, "router_z_loss_mlp": 0.2800293, "step": 2226, "time_per_iteration": 2.5659286975860596 }, { "auxiliary_loss_clip": 0.06660965, "auxiliary_loss_mlp": 0.01301921, "balance_loss_clip": 0.06331907, "balance_loss_mlp": 0.01271904, "epoch": 0.1338944836915677, "flos": 24286514814720.0, "grad_norm": 2.5792690778587177, "language_loss": 0.75151515, "learning_rate": 3.887835559829712e-06, "loss": 0.83114398, "num_input_tokens_seen": 48187600, "router_z_loss_clip": 3.29296875, "router_z_loss_mlp": 0.3001709, "step": 2227, "time_per_iteration": 2.6514875888824463 }, { "auxiliary_loss_clip": 0.06655754, "auxiliary_loss_mlp": 0.0129035, "balance_loss_clip": 0.06326726, "balance_loss_mlp": 0.01259665, "epoch": 0.1339546069442357, "flos": 17603793884160.0, "grad_norm": 2.008106705457057, "language_loss": 0.85768372, "learning_rate": 3.8877069313476764e-06, "loss": 0.93714476, "num_input_tokens_seen": 48204400, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.30664062, "step": 2228, "time_per_iteration": 4.053513288497925 }, { "auxiliary_loss_clip": 0.06650669, "auxiliary_loss_mlp": 0.01283077, "balance_loss_clip": 0.06327419, "balance_loss_mlp": 0.01253847, "epoch": 0.13401473019690366, "flos": 18996163580160.0, "grad_norm": 1.882271935958489, "language_loss": 0.82166672, "learning_rate": 3.8875782312836054e-06, "loss": 0.9010042, "num_input_tokens_seen": 48222180, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.29248047, "step": 2229, "time_per_iteration": 2.586491823196411 }, { "auxiliary_loss_clip": 0.06653672, "auxiliary_loss_mlp": 0.01289847, "balance_loss_clip": 0.06325512, "balance_loss_mlp": 0.01259353, "epoch": 0.13407485344957162, "flos": 26951214833280.0, "grad_norm": 1.9863945289881773, "language_loss": 0.75670981, "learning_rate": 3.887449459642378e-06, "loss": 0.83614498, "num_input_tokens_seen": 48243245, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.30480957, "step": 2230, "time_per_iteration": 2.6443538665771484 }, { "auxiliary_loss_clip": 0.06650474, "auxiliary_loss_mlp": 0.01291257, "balance_loss_clip": 0.06323431, "balance_loss_mlp": 0.01262146, "epoch": 0.1341349767022396, "flos": 20345585258880.0, "grad_norm": 14.311586695984527, "language_loss": 0.81207496, "learning_rate": 3.8873206164288785e-06, "loss": 0.89149225, "num_input_tokens_seen": 48262600, "router_z_loss_clip": 3.27148438, "router_z_loss_mlp": 0.29101562, "step": 2231, "time_per_iteration": 4.034775972366333 }, { "auxiliary_loss_clip": 0.06666023, "auxiliary_loss_mlp": 0.01299518, "balance_loss_clip": 0.06327891, "balance_loss_mlp": 0.01264518, "epoch": 0.13419509995490755, "flos": 29869802323200.0, "grad_norm": 1.5522224084064, "language_loss": 0.74027741, "learning_rate": 3.887191701647992e-06, "loss": 0.81993282, "num_input_tokens_seen": 48285075, "router_z_loss_clip": 3.38085938, "router_z_loss_mlp": 0.35009766, "step": 2232, "time_per_iteration": 2.666081428527832 }, { "auxiliary_loss_clip": 0.0665703, "auxiliary_loss_mlp": 0.01289462, "balance_loss_clip": 0.06322, "balance_loss_mlp": 0.01257132, "epoch": 0.13425522320757552, "flos": 26950250511360.0, "grad_norm": 4.207471718344417, "language_loss": 0.6718455, "learning_rate": 3.8870627153046066e-06, "loss": 0.75131047, "num_input_tokens_seen": 48301285, "router_z_loss_clip": 3.35351562, "router_z_loss_mlp": 0.32299805, "step": 2233, "time_per_iteration": 2.6304879188537598 }, { "auxiliary_loss_clip": 0.06649952, "auxiliary_loss_mlp": 0.01292953, "balance_loss_clip": 0.06323561, "balance_loss_mlp": 0.01261839, "epoch": 0.1343153464602435, "flos": 15782501537280.0, "grad_norm": 2.4429140458464627, "language_loss": 0.83186913, "learning_rate": 3.886933657403615e-06, "loss": 0.91129816, "num_input_tokens_seen": 48317835, "router_z_loss_clip": 3.265625, "router_z_loss_mlp": 0.31079102, "step": 2234, "time_per_iteration": 4.007932424545288 }, { "auxiliary_loss_clip": 0.06648239, "auxiliary_loss_mlp": 0.01287063, "balance_loss_clip": 0.06324121, "balance_loss_mlp": 0.01256426, "epoch": 0.13437546971291148, "flos": 24321370913280.0, "grad_norm": 1.7977769741320488, "language_loss": 0.82982957, "learning_rate": 3.886804527949909e-06, "loss": 0.90918267, "num_input_tokens_seen": 48335670, "router_z_loss_clip": 3.23632812, "router_z_loss_mlp": 0.30578613, "step": 2235, "time_per_iteration": 4.003185749053955 }, { "auxiliary_loss_clip": 0.06648313, "auxiliary_loss_mlp": 0.01288546, "balance_loss_clip": 0.06325349, "balance_loss_mlp": 0.0125686, "epoch": 0.13443559296557944, "flos": 26657817361920.0, "grad_norm": 1.528468574247237, "language_loss": 0.87696999, "learning_rate": 3.8866753269483864e-06, "loss": 0.95633858, "num_input_tokens_seen": 48357805, "router_z_loss_clip": 3.22851562, "router_z_loss_mlp": 0.31665039, "step": 2236, "time_per_iteration": 2.647139310836792 }, { "auxiliary_loss_clip": 0.06646527, "auxiliary_loss_mlp": 0.01292953, "balance_loss_clip": 0.06320441, "balance_loss_mlp": 0.01260647, "epoch": 0.1344957162182474, "flos": 21802216636800.0, "grad_norm": 2.7378792483624, "language_loss": 0.78522968, "learning_rate": 3.886546054403946e-06, "loss": 0.86462444, "num_input_tokens_seen": 48377845, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.32299805, "step": 2237, "time_per_iteration": 2.5754201412200928 }, { "auxiliary_loss_clip": 0.06652346, "auxiliary_loss_mlp": 0.01291628, "balance_loss_clip": 0.06323533, "balance_loss_mlp": 0.01259108, "epoch": 0.13455583947091537, "flos": 19871785946880.0, "grad_norm": 4.120527149203498, "language_loss": 0.80260015, "learning_rate": 3.886416710321491e-06, "loss": 0.8820399, "num_input_tokens_seen": 48394735, "router_z_loss_clip": 3.28515625, "router_z_loss_mlp": 0.32543945, "step": 2238, "time_per_iteration": 2.567777156829834 }, { "auxiliary_loss_clip": 0.06646476, "auxiliary_loss_mlp": 0.01290332, "balance_loss_clip": 0.06324197, "balance_loss_mlp": 0.01258741, "epoch": 0.13461596272358334, "flos": 30854730741120.0, "grad_norm": 1.9574326113073213, "language_loss": 0.69581604, "learning_rate": 3.886287294705924e-06, "loss": 0.77518415, "num_input_tokens_seen": 48414200, "router_z_loss_clip": 3.22070312, "router_z_loss_mlp": 0.31542969, "step": 2239, "time_per_iteration": 2.661813259124756 }, { "auxiliary_loss_clip": 0.06646799, "auxiliary_loss_mlp": 0.01294626, "balance_loss_clip": 0.06319153, "balance_loss_mlp": 0.01262201, "epoch": 0.1346760859762513, "flos": 12499253078400.0, "grad_norm": 3.7265425012853277, "language_loss": 0.8304466, "learning_rate": 3.8861578075621555e-06, "loss": 0.90986079, "num_input_tokens_seen": 48431065, "router_z_loss_clip": 3.27734375, "router_z_loss_mlp": 0.32421875, "step": 2240, "time_per_iteration": 2.5665884017944336 }, { "auxiliary_loss_clip": 0.06652196, "auxiliary_loss_mlp": 0.01292123, "balance_loss_clip": 0.06322171, "balance_loss_mlp": 0.0125946, "epoch": 0.1347362092289193, "flos": 21842607104640.0, "grad_norm": 1.9097066803396625, "language_loss": 0.7909137, "learning_rate": 3.886028248895093e-06, "loss": 0.87035692, "num_input_tokens_seen": 48450335, "router_z_loss_clip": 3.30664062, "router_z_loss_mlp": 0.32666016, "step": 2241, "time_per_iteration": 2.5948221683502197 }, { "auxiliary_loss_clip": 0.06635761, "auxiliary_loss_mlp": 0.01289763, "balance_loss_clip": 0.06322226, "balance_loss_mlp": 0.01260843, "epoch": 0.13479633248158726, "flos": 23515502670720.0, "grad_norm": 1.734559299230357, "language_loss": 0.84356666, "learning_rate": 3.88589861870965e-06, "loss": 0.92282188, "num_input_tokens_seen": 48468555, "router_z_loss_clip": 3.13671875, "router_z_loss_mlp": 0.28918457, "step": 2242, "time_per_iteration": 2.6031153202056885 }, { "auxiliary_loss_clip": 0.06658329, "auxiliary_loss_mlp": 0.0129164, "balance_loss_clip": 0.06330433, "balance_loss_mlp": 0.01259477, "epoch": 0.13485645573425523, "flos": 29350874787840.0, "grad_norm": 4.122981587735446, "language_loss": 0.6566422, "learning_rate": 3.885768917010744e-06, "loss": 0.73614192, "num_input_tokens_seen": 48488515, "router_z_loss_clip": 3.27734375, "router_z_loss_mlp": 0.32177734, "step": 2243, "time_per_iteration": 2.6463465690612793 }, { "auxiliary_loss_clip": 0.06642441, "auxiliary_loss_mlp": 0.01289482, "balance_loss_clip": 0.06324495, "balance_loss_mlp": 0.01261468, "epoch": 0.1349165789869232, "flos": 28044484980480.0, "grad_norm": 1.4390014345944864, "language_loss": 0.7362079, "learning_rate": 3.8856391438032895e-06, "loss": 0.8155272, "num_input_tokens_seen": 48510515, "router_z_loss_clip": 3.18164062, "router_z_loss_mlp": 0.2800293, "step": 2244, "time_per_iteration": 2.6334080696105957 }, { "auxiliary_loss_clip": 0.06648639, "auxiliary_loss_mlp": 0.01288477, "balance_loss_clip": 0.06329806, "balance_loss_mlp": 0.01258675, "epoch": 0.13497670223959116, "flos": 22859834071680.0, "grad_norm": 1.73255097279916, "language_loss": 0.87628615, "learning_rate": 3.88550929909221e-06, "loss": 0.9556573, "num_input_tokens_seen": 48529940, "router_z_loss_clip": 3.18554688, "router_z_loss_mlp": 0.29833984, "step": 2245, "time_per_iteration": 2.5933241844177246 }, { "auxiliary_loss_clip": 0.06642, "auxiliary_loss_mlp": 0.01292267, "balance_loss_clip": 0.06329978, "balance_loss_mlp": 0.0126113, "epoch": 0.13503682549225912, "flos": 16509517488000.0, "grad_norm": 2.1037440819676725, "language_loss": 0.80085242, "learning_rate": 3.88537938288243e-06, "loss": 0.88019502, "num_input_tokens_seen": 48548190, "router_z_loss_clip": 3.11914062, "router_z_loss_mlp": 0.3112793, "step": 2246, "time_per_iteration": 2.613226890563965 }, { "auxiliary_loss_clip": 0.06527317, "auxiliary_loss_mlp": 0.01283854, "balance_loss_clip": 0.06331921, "balance_loss_mlp": 0.0127247, "epoch": 0.1350969487449271, "flos": 70775979217920.0, "grad_norm": 0.7364069366078652, "language_loss": 0.60637558, "learning_rate": 3.885249395178874e-06, "loss": 0.68448728, "num_input_tokens_seen": 48613165, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 0.11364746, "step": 2247, "time_per_iteration": 3.3123950958251953 }, { "auxiliary_loss_clip": 0.06665818, "auxiliary_loss_mlp": 0.01290849, "balance_loss_clip": 0.0633022, "balance_loss_mlp": 0.01258686, "epoch": 0.13515707199759508, "flos": 23082680805120.0, "grad_norm": 2.0852008402819777, "language_loss": 0.82341707, "learning_rate": 3.885119335986473e-06, "loss": 0.90298378, "num_input_tokens_seen": 48631705, "router_z_loss_clip": 3.35351562, "router_z_loss_mlp": 0.3215332, "step": 2248, "time_per_iteration": 2.594421863555908 }, { "auxiliary_loss_clip": 0.0664115, "auxiliary_loss_mlp": 0.01285391, "balance_loss_clip": 0.06327441, "balance_loss_mlp": 0.01257258, "epoch": 0.13521719525026304, "flos": 23193244667520.0, "grad_norm": 1.7951710594480137, "language_loss": 0.78589427, "learning_rate": 3.884989205310157e-06, "loss": 0.86515969, "num_input_tokens_seen": 48649740, "router_z_loss_clip": 3.13867188, "router_z_loss_mlp": 0.28112793, "step": 2249, "time_per_iteration": 2.6076085567474365 }, { "auxiliary_loss_clip": 0.06639341, "auxiliary_loss_mlp": 0.01289126, "balance_loss_clip": 0.06323837, "balance_loss_mlp": 0.01261279, "epoch": 0.135277318502931, "flos": 24797937409920.0, "grad_norm": 1.753491468905162, "language_loss": 0.85785007, "learning_rate": 3.884859003154862e-06, "loss": 0.93713474, "num_input_tokens_seen": 48671565, "router_z_loss_clip": 3.15820312, "router_z_loss_mlp": 0.27880859, "step": 2250, "time_per_iteration": 2.6749954223632812 }, { "auxiliary_loss_clip": 0.0664289, "auxiliary_loss_mlp": 0.0129394, "balance_loss_clip": 0.06319918, "balance_loss_mlp": 0.01261921, "epoch": 0.13533744175559898, "flos": 21915044559360.0, "grad_norm": 2.876152720190089, "language_loss": 0.83123744, "learning_rate": 3.884728729525524e-06, "loss": 0.91060573, "num_input_tokens_seen": 48690425, "router_z_loss_clip": 3.2265625, "router_z_loss_mlp": 0.32006836, "step": 2251, "time_per_iteration": 2.584428071975708 }, { "auxiliary_loss_clip": 0.06645809, "auxiliary_loss_mlp": 0.01289259, "balance_loss_clip": 0.06326133, "balance_loss_mlp": 0.0125824, "epoch": 0.13539756500826694, "flos": 21217434192000.0, "grad_norm": 1.658475094418693, "language_loss": 0.87071097, "learning_rate": 3.884598384427084e-06, "loss": 0.95006162, "num_input_tokens_seen": 48707505, "router_z_loss_clip": 3.19335938, "router_z_loss_mlp": 0.31005859, "step": 2252, "time_per_iteration": 2.591170310974121 }, { "auxiliary_loss_clip": 0.06510928, "auxiliary_loss_mlp": 0.01279007, "balance_loss_clip": 0.06317531, "balance_loss_mlp": 0.01266145, "epoch": 0.1354576882609349, "flos": 63260835500160.0, "grad_norm": 0.757696103849647, "language_loss": 0.61325628, "learning_rate": 3.884467967864485e-06, "loss": 0.69115567, "num_input_tokens_seen": 48775895, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.12878418, "step": 2253, "time_per_iteration": 3.3056299686431885 }, { "auxiliary_loss_clip": 0.06642744, "auxiliary_loss_mlp": 0.01296596, "balance_loss_clip": 0.06320517, "balance_loss_mlp": 0.01266365, "epoch": 0.1355178115136029, "flos": 25489971480960.0, "grad_norm": 1.778158475028912, "language_loss": 0.90718108, "learning_rate": 3.884337479842671e-06, "loss": 0.98657447, "num_input_tokens_seen": 48798370, "router_z_loss_clip": 3.21875, "router_z_loss_mlp": 0.30224609, "step": 2254, "time_per_iteration": 2.6434669494628906 }, { "auxiliary_loss_clip": 0.06641755, "auxiliary_loss_mlp": 0.01292564, "balance_loss_clip": 0.06316711, "balance_loss_mlp": 0.01261617, "epoch": 0.13557793476627086, "flos": 21623491877760.0, "grad_norm": 2.6932045979428, "language_loss": 0.85940826, "learning_rate": 3.884206920366591e-06, "loss": 0.93875146, "num_input_tokens_seen": 48817955, "router_z_loss_clip": 3.24804688, "router_z_loss_mlp": 0.30981445, "step": 2255, "time_per_iteration": 2.5847268104553223 }, { "auxiliary_loss_clip": 0.06637843, "auxiliary_loss_mlp": 0.0128864, "balance_loss_clip": 0.06317864, "balance_loss_mlp": 0.01259362, "epoch": 0.13563805801893883, "flos": 24933839932800.0, "grad_norm": 3.059304311497172, "language_loss": 0.75765556, "learning_rate": 3.884076289441196e-06, "loss": 0.83692038, "num_input_tokens_seen": 48836330, "router_z_loss_clip": 3.19726562, "router_z_loss_mlp": 0.29272461, "step": 2256, "time_per_iteration": 2.600383996963501 }, { "auxiliary_loss_clip": 0.06637068, "auxiliary_loss_mlp": 0.01291905, "balance_loss_clip": 0.06311259, "balance_loss_mlp": 0.01259885, "epoch": 0.1356981812716068, "flos": 14754415466880.0, "grad_norm": 2.3350532106588857, "language_loss": 0.84197336, "learning_rate": 3.88394558707144e-06, "loss": 0.92126304, "num_input_tokens_seen": 48851890, "router_z_loss_clip": 3.25976562, "router_z_loss_mlp": 0.3203125, "step": 2257, "time_per_iteration": 2.556800365447998 }, { "auxiliary_loss_clip": 0.0664893, "auxiliary_loss_mlp": 0.01290951, "balance_loss_clip": 0.06318957, "balance_loss_mlp": 0.01260075, "epoch": 0.13575830452427476, "flos": 11113256292480.0, "grad_norm": 2.029825330296972, "language_loss": 0.82873547, "learning_rate": 3.883814813262277e-06, "loss": 0.90813434, "num_input_tokens_seen": 48865510, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.30834961, "step": 2258, "time_per_iteration": 2.5393433570861816 }, { "auxiliary_loss_clip": 0.06642655, "auxiliary_loss_mlp": 0.01303373, "balance_loss_clip": 0.06315137, "balance_loss_mlp": 0.01271306, "epoch": 0.13581842777694272, "flos": 17964849127680.0, "grad_norm": 3.3123719929102036, "language_loss": 0.84256232, "learning_rate": 3.883683968018669e-06, "loss": 0.92202258, "num_input_tokens_seen": 48882360, "router_z_loss_clip": 3.27539062, "router_z_loss_mlp": 0.32055664, "step": 2259, "time_per_iteration": 2.557461738586426 }, { "auxiliary_loss_clip": 0.06632648, "auxiliary_loss_mlp": 0.01289815, "balance_loss_clip": 0.06314713, "balance_loss_mlp": 0.01261443, "epoch": 0.1358785510296107, "flos": 22863817140480.0, "grad_norm": 3.07588208106043, "language_loss": 0.75251687, "learning_rate": 3.8835530513455755e-06, "loss": 0.83174145, "num_input_tokens_seen": 48902700, "router_z_loss_clip": 3.17578125, "router_z_loss_mlp": 0.28417969, "step": 2260, "time_per_iteration": 2.5768818855285645 }, { "auxiliary_loss_clip": 0.06631185, "auxiliary_loss_mlp": 0.01289292, "balance_loss_clip": 0.06313552, "balance_loss_mlp": 0.01260396, "epoch": 0.13593867428227868, "flos": 25746542282880.0, "grad_norm": 2.715474295668406, "language_loss": 0.76659864, "learning_rate": 3.883422063247961e-06, "loss": 0.84580338, "num_input_tokens_seen": 48922525, "router_z_loss_clip": 3.1796875, "router_z_loss_mlp": 0.28930664, "step": 2261, "time_per_iteration": 2.605717658996582 }, { "auxiliary_loss_clip": 0.06635377, "auxiliary_loss_mlp": 0.01286924, "balance_loss_clip": 0.06312954, "balance_loss_mlp": 0.0125674, "epoch": 0.13599879753494665, "flos": 31257350409600.0, "grad_norm": 1.9737993949688484, "language_loss": 0.65221834, "learning_rate": 3.883291003730794e-06, "loss": 0.73144138, "num_input_tokens_seen": 48942510, "router_z_loss_clip": 3.22265625, "router_z_loss_mlp": 0.30200195, "step": 2262, "time_per_iteration": 2.6659975051879883 }, { "auxiliary_loss_clip": 0.06638278, "auxiliary_loss_mlp": 0.01299581, "balance_loss_clip": 0.06313223, "balance_loss_mlp": 0.01268587, "epoch": 0.1360589207876146, "flos": 23921853845760.0, "grad_norm": 7.3617431828503905, "language_loss": 0.83463883, "learning_rate": 3.883159872799043e-06, "loss": 0.91401744, "num_input_tokens_seen": 48962625, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.31030273, "step": 2263, "time_per_iteration": 2.61606502532959 }, { "auxiliary_loss_clip": 0.06641826, "auxiliary_loss_mlp": 0.01291451, "balance_loss_clip": 0.06313907, "balance_loss_mlp": 0.01259264, "epoch": 0.13611904404028258, "flos": 19980295384320.0, "grad_norm": 1.8358332131269897, "language_loss": 0.89243436, "learning_rate": 3.8830286704576815e-06, "loss": 0.97176713, "num_input_tokens_seen": 48982525, "router_z_loss_clip": 3.28320312, "router_z_loss_mlp": 0.32177734, "step": 2264, "time_per_iteration": 2.6116437911987305 }, { "auxiliary_loss_clip": 0.0664786, "auxiliary_loss_mlp": 0.01298852, "balance_loss_clip": 0.06316487, "balance_loss_mlp": 0.01266665, "epoch": 0.13617916729295054, "flos": 15345990092160.0, "grad_norm": 3.262835771207304, "language_loss": 0.72658336, "learning_rate": 3.882897396711683e-06, "loss": 0.80605048, "num_input_tokens_seen": 48997605, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.32177734, "step": 2265, "time_per_iteration": 2.5578320026397705 }, { "auxiliary_loss_clip": 0.06631643, "auxiliary_loss_mlp": 0.01292115, "balance_loss_clip": 0.06310781, "balance_loss_mlp": 0.01263553, "epoch": 0.1362392905456185, "flos": 27458402797440.0, "grad_norm": 2.0681540546780504, "language_loss": 0.6776244, "learning_rate": 3.882766051566027e-06, "loss": 0.75686198, "num_input_tokens_seen": 49018535, "router_z_loss_clip": 3.21289062, "router_z_loss_mlp": 0.28588867, "step": 2266, "time_per_iteration": 2.63922119140625 }, { "auxiliary_loss_clip": 0.06633092, "auxiliary_loss_mlp": 0.01287653, "balance_loss_clip": 0.06312242, "balance_loss_mlp": 0.01257231, "epoch": 0.1362994137982865, "flos": 25015920606720.0, "grad_norm": 2.5450218435706944, "language_loss": 0.77274287, "learning_rate": 3.882634635025694e-06, "loss": 0.85195029, "num_input_tokens_seen": 49038865, "router_z_loss_clip": 3.20703125, "router_z_loss_mlp": 0.30407715, "step": 2267, "time_per_iteration": 4.0458807945251465 }, { "auxiliary_loss_clip": 0.06637824, "auxiliary_loss_mlp": 0.01289443, "balance_loss_clip": 0.06313182, "balance_loss_mlp": 0.01259831, "epoch": 0.13635953705095447, "flos": 20309261713920.0, "grad_norm": 2.2302823295121983, "language_loss": 0.83258009, "learning_rate": 3.882503147095667e-06, "loss": 0.91185272, "num_input_tokens_seen": 49058010, "router_z_loss_clip": 3.24609375, "router_z_loss_mlp": 0.29614258, "step": 2268, "time_per_iteration": 2.573268175125122 }, { "auxiliary_loss_clip": 0.06634745, "auxiliary_loss_mlp": 0.01300635, "balance_loss_clip": 0.06314349, "balance_loss_mlp": 0.01268162, "epoch": 0.13641966030362243, "flos": 31366530679680.0, "grad_norm": 2.913392327152937, "language_loss": 0.77285707, "learning_rate": 3.882371587780931e-06, "loss": 0.85221088, "num_input_tokens_seen": 49080330, "router_z_loss_clip": 3.19921875, "router_z_loss_mlp": 0.32446289, "step": 2269, "time_per_iteration": 2.647632598876953 }, { "auxiliary_loss_clip": 0.06634189, "auxiliary_loss_mlp": 0.01294789, "balance_loss_clip": 0.06313358, "balance_loss_mlp": 0.01264557, "epoch": 0.1364797835562904, "flos": 20483122936320.0, "grad_norm": 2.2439969945416833, "language_loss": 0.82317245, "learning_rate": 3.882239957086477e-06, "loss": 0.90246218, "num_input_tokens_seen": 49097035, "router_z_loss_clip": 3.20898438, "router_z_loss_mlp": 0.30224609, "step": 2270, "time_per_iteration": 4.002497434616089 }, { "auxiliary_loss_clip": 0.06641596, "auxiliary_loss_mlp": 0.01298338, "balance_loss_clip": 0.06312804, "balance_loss_mlp": 0.01265961, "epoch": 0.13653990680895836, "flos": 13083280836480.0, "grad_norm": 2.2709323799763865, "language_loss": 0.75793099, "learning_rate": 3.882108255017295e-06, "loss": 0.83733034, "num_input_tokens_seen": 49113945, "router_z_loss_clip": 3.28710938, "router_z_loss_mlp": 0.32348633, "step": 2271, "time_per_iteration": 2.583298921585083 }, { "auxiliary_loss_clip": 0.06635663, "auxiliary_loss_mlp": 0.01298928, "balance_loss_clip": 0.06310488, "balance_loss_mlp": 0.01268458, "epoch": 0.13660003006162633, "flos": 16952443770240.0, "grad_norm": 3.1313468212341373, "language_loss": 0.81648731, "learning_rate": 3.881976481578379e-06, "loss": 0.89583319, "num_input_tokens_seen": 49132855, "router_z_loss_clip": 3.25195312, "router_z_loss_mlp": 0.3046875, "step": 2272, "time_per_iteration": 2.5717668533325195 }, { "auxiliary_loss_clip": 0.0651096, "auxiliary_loss_mlp": 0.01279157, "balance_loss_clip": 0.06317343, "balance_loss_mlp": 0.01267254, "epoch": 0.1366601533142943, "flos": 68703105386880.0, "grad_norm": 0.6780800600233977, "language_loss": 0.60558271, "learning_rate": 3.8818446367747255e-06, "loss": 0.68348396, "num_input_tokens_seen": 49198310, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.11895752, "step": 2273, "time_per_iteration": 3.3098249435424805 }, { "auxiliary_loss_clip": 0.06635391, "auxiliary_loss_mlp": 0.01298318, "balance_loss_clip": 0.06313962, "balance_loss_mlp": 0.01267824, "epoch": 0.13672027656696228, "flos": 19250176832640.0, "grad_norm": 1.6086963976374056, "language_loss": 0.78959727, "learning_rate": 3.881712720611336e-06, "loss": 0.86893439, "num_input_tokens_seen": 49217250, "router_z_loss_clip": 3.2109375, "router_z_loss_mlp": 0.30493164, "step": 2274, "time_per_iteration": 4.211851119995117 }, { "auxiliary_loss_clip": 0.06637919, "auxiliary_loss_mlp": 0.01306627, "balance_loss_clip": 0.06315631, "balance_loss_mlp": 0.01274703, "epoch": 0.13678039981963025, "flos": 24541785878400.0, "grad_norm": 2.1230457439227277, "language_loss": 0.79809368, "learning_rate": 3.881580733093211e-06, "loss": 0.87753916, "num_input_tokens_seen": 49236615, "router_z_loss_clip": 3.22460938, "router_z_loss_mlp": 0.31933594, "step": 2275, "time_per_iteration": 3.947599172592163 }, { "auxiliary_loss_clip": 0.06640088, "auxiliary_loss_mlp": 0.01291153, "balance_loss_clip": 0.06312578, "balance_loss_mlp": 0.01258776, "epoch": 0.13684052307229821, "flos": 15674788713600.0, "grad_norm": 2.9504128078751615, "language_loss": 0.8223139, "learning_rate": 3.881448674225356e-06, "loss": 0.90162635, "num_input_tokens_seen": 49253935, "router_z_loss_clip": 3.26953125, "router_z_loss_mlp": 0.32348633, "step": 2276, "time_per_iteration": 2.581557512283325 }, { "auxiliary_loss_clip": 0.06656739, "auxiliary_loss_mlp": 0.01303651, "balance_loss_clip": 0.06316226, "balance_loss_mlp": 0.01268961, "epoch": 0.13690064632496618, "flos": 28371983863680.0, "grad_norm": 2.6411485131107546, "language_loss": 0.71475494, "learning_rate": 3.881316544012779e-06, "loss": 0.79435885, "num_input_tokens_seen": 49273605, "router_z_loss_clip": 3.40039062, "router_z_loss_mlp": 0.34716797, "step": 2277, "time_per_iteration": 2.639893054962158 }, { "auxiliary_loss_clip": 0.06655806, "auxiliary_loss_mlp": 0.01290611, "balance_loss_clip": 0.06319788, "balance_loss_mlp": 0.01260236, "epoch": 0.13696076957763414, "flos": 23411605207680.0, "grad_norm": 2.2019314016793325, "language_loss": 0.82029128, "learning_rate": 3.88118434246049e-06, "loss": 0.89975542, "num_input_tokens_seen": 49291785, "router_z_loss_clip": 3.359375, "router_z_loss_mlp": 0.30371094, "step": 2278, "time_per_iteration": 2.6235666275024414 }, { "auxiliary_loss_clip": 0.06646387, "auxiliary_loss_mlp": 0.0129381, "balance_loss_clip": 0.06318481, "balance_loss_mlp": 0.01261146, "epoch": 0.1370208928303021, "flos": 37205760084480.0, "grad_norm": 9.099475636627526, "language_loss": 0.75904274, "learning_rate": 3.881052069573502e-06, "loss": 0.83844471, "num_input_tokens_seen": 49311405, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.3269043, "step": 2279, "time_per_iteration": 2.7320058345794678 }, { "auxiliary_loss_clip": 0.06652343, "auxiliary_loss_mlp": 0.01296209, "balance_loss_clip": 0.06322006, "balance_loss_mlp": 0.01266598, "epoch": 0.13708101608297008, "flos": 26983052184960.0, "grad_norm": 1.7756431228054845, "language_loss": 0.77621996, "learning_rate": 3.880919725356831e-06, "loss": 0.8557055, "num_input_tokens_seen": 49331835, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.29602051, "step": 2280, "time_per_iteration": 2.6814723014831543 }, { "auxiliary_loss_clip": 0.06641738, "auxiliary_loss_mlp": 0.01288361, "balance_loss_clip": 0.06320938, "balance_loss_mlp": 0.01257581, "epoch": 0.13714113933563807, "flos": 32564243341440.0, "grad_norm": 2.275395401641284, "language_loss": 0.8020314, "learning_rate": 3.880787309815496e-06, "loss": 0.8813324, "num_input_tokens_seen": 49352290, "router_z_loss_clip": 3.2109375, "router_z_loss_mlp": 0.30786133, "step": 2281, "time_per_iteration": 2.7622010707855225 }, { "auxiliary_loss_clip": 0.06657283, "auxiliary_loss_mlp": 0.01299392, "balance_loss_clip": 0.06322965, "balance_loss_mlp": 0.01265704, "epoch": 0.13720126258830603, "flos": 16105807716480.0, "grad_norm": 1.8243836396853508, "language_loss": 0.84715909, "learning_rate": 3.880654822954518e-06, "loss": 0.9267258, "num_input_tokens_seen": 49370285, "router_z_loss_clip": 3.34570312, "router_z_loss_mlp": 0.33691406, "step": 2282, "time_per_iteration": 2.56280779838562 }, { "auxiliary_loss_clip": 0.06639207, "auxiliary_loss_mlp": 0.01287658, "balance_loss_clip": 0.06321704, "balance_loss_mlp": 0.01256962, "epoch": 0.137261385840974, "flos": 18959630400000.0, "grad_norm": 2.0044692079960846, "language_loss": 0.73708737, "learning_rate": 3.8805222647789195e-06, "loss": 0.81635594, "num_input_tokens_seen": 49389610, "router_z_loss_clip": 3.17382812, "router_z_loss_mlp": 0.3067627, "step": 2283, "time_per_iteration": 2.570735454559326 }, { "auxiliary_loss_clip": 0.06633626, "auxiliary_loss_mlp": 0.01289804, "balance_loss_clip": 0.06317855, "balance_loss_mlp": 0.01257832, "epoch": 0.13732150909364196, "flos": 23302173375360.0, "grad_norm": 6.241019585322374, "language_loss": 0.85530084, "learning_rate": 3.880389635293729e-06, "loss": 0.93453515, "num_input_tokens_seen": 49408390, "router_z_loss_clip": 3.15820312, "router_z_loss_mlp": 0.31958008, "step": 2284, "time_per_iteration": 2.595885992050171 }, { "auxiliary_loss_clip": 0.06659424, "auxiliary_loss_mlp": 0.01288313, "balance_loss_clip": 0.06323364, "balance_loss_mlp": 0.01255149, "epoch": 0.13738163234630993, "flos": 29358966706560.0, "grad_norm": 1.9006482258357398, "language_loss": 0.76470804, "learning_rate": 3.880256934503974e-06, "loss": 0.84418535, "num_input_tokens_seen": 49427725, "router_z_loss_clip": 3.35546875, "router_z_loss_mlp": 0.33154297, "step": 2285, "time_per_iteration": 2.6503543853759766 }, { "auxiliary_loss_clip": 0.06634455, "auxiliary_loss_mlp": 0.01293642, "balance_loss_clip": 0.06313635, "balance_loss_mlp": 0.01263172, "epoch": 0.1374417555989779, "flos": 26658572048640.0, "grad_norm": 1.7554125028473317, "language_loss": 0.76392376, "learning_rate": 3.880124162414689e-06, "loss": 0.84320474, "num_input_tokens_seen": 49449000, "router_z_loss_clip": 3.2109375, "router_z_loss_mlp": 0.30493164, "step": 2286, "time_per_iteration": 2.612583637237549 }, { "auxiliary_loss_clip": 0.0664337, "auxiliary_loss_mlp": 0.01291822, "balance_loss_clip": 0.06317099, "balance_loss_mlp": 0.01258372, "epoch": 0.1375018788516459, "flos": 28411074593280.0, "grad_norm": 2.1257724890775394, "language_loss": 0.87414926, "learning_rate": 3.879991319030908e-06, "loss": 0.95350122, "num_input_tokens_seen": 49468360, "router_z_loss_clip": 3.25976562, "router_z_loss_mlp": 0.33447266, "step": 2287, "time_per_iteration": 2.6290180683135986 }, { "auxiliary_loss_clip": 0.0664247, "auxiliary_loss_mlp": 0.01296533, "balance_loss_clip": 0.06318147, "balance_loss_mlp": 0.01266134, "epoch": 0.13756200210431385, "flos": 37422695105280.0, "grad_norm": 1.7532421543216845, "language_loss": 0.6959011, "learning_rate": 3.879858404357666e-06, "loss": 0.77529109, "num_input_tokens_seen": 49493450, "router_z_loss_clip": 3.24609375, "router_z_loss_mlp": 0.30371094, "step": 2288, "time_per_iteration": 2.742271661758423 }, { "auxiliary_loss_clip": 0.06640653, "auxiliary_loss_mlp": 0.01293158, "balance_loss_clip": 0.06313635, "balance_loss_mlp": 0.01261496, "epoch": 0.13762212535698182, "flos": 22717642492800.0, "grad_norm": 2.4505549152795925, "language_loss": 0.87590146, "learning_rate": 3.879725418400005e-06, "loss": 0.95523959, "num_input_tokens_seen": 49511220, "router_z_loss_clip": 3.26953125, "router_z_loss_mlp": 0.31689453, "step": 2289, "time_per_iteration": 2.6007354259490967 }, { "auxiliary_loss_clip": 0.06630298, "auxiliary_loss_mlp": 0.01295009, "balance_loss_clip": 0.06313723, "balance_loss_mlp": 0.01264826, "epoch": 0.13768224860964978, "flos": 23959057858560.0, "grad_norm": 1.8995774774106968, "language_loss": 0.75199801, "learning_rate": 3.879592361162969e-06, "loss": 0.83125114, "num_input_tokens_seen": 49529820, "router_z_loss_clip": 3.1640625, "router_z_loss_mlp": 0.30175781, "step": 2290, "time_per_iteration": 2.6102135181427 }, { "auxiliary_loss_clip": 0.06535356, "auxiliary_loss_mlp": 0.01264504, "balance_loss_clip": 0.06342818, "balance_loss_mlp": 0.01252785, "epoch": 0.13774237186231775, "flos": 63612568212480.0, "grad_norm": 0.6894016634798978, "language_loss": 0.5162124, "learning_rate": 3.8794592326516015e-06, "loss": 0.59421098, "num_input_tokens_seen": 49595325, "router_z_loss_clip": 1.92285156, "router_z_loss_mlp": 0.11706543, "step": 2291, "time_per_iteration": 3.286865711212158 }, { "auxiliary_loss_clip": 0.06640623, "auxiliary_loss_mlp": 0.01296112, "balance_loss_clip": 0.0631856, "balance_loss_mlp": 0.01265284, "epoch": 0.1378024951149857, "flos": 24286263252480.0, "grad_norm": 2.180297577303965, "language_loss": 0.72101736, "learning_rate": 3.879326032870952e-06, "loss": 0.8003847, "num_input_tokens_seen": 49615850, "router_z_loss_clip": 3.21679688, "router_z_loss_mlp": 0.30810547, "step": 2292, "time_per_iteration": 2.643129348754883 }, { "auxiliary_loss_clip": 0.06638347, "auxiliary_loss_mlp": 0.01295862, "balance_loss_clip": 0.06313488, "balance_loss_mlp": 0.01263437, "epoch": 0.13786261836765368, "flos": 14025722434560.0, "grad_norm": 2.6759138593175504, "language_loss": 0.81311595, "learning_rate": 3.879192761826071e-06, "loss": 0.89245802, "num_input_tokens_seen": 49631860, "router_z_loss_clip": 3.24609375, "router_z_loss_mlp": 0.32446289, "step": 2293, "time_per_iteration": 2.7327139377593994 }, { "auxiliary_loss_clip": 0.0663797, "auxiliary_loss_mlp": 0.01291018, "balance_loss_clip": 0.06312536, "balance_loss_mlp": 0.01260834, "epoch": 0.13792274162032167, "flos": 28886592913920.0, "grad_norm": 4.142243887390777, "language_loss": 0.7967577, "learning_rate": 3.879059419522011e-06, "loss": 0.87604761, "num_input_tokens_seen": 49652145, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.30175781, "step": 2294, "time_per_iteration": 2.6932642459869385 }, { "auxiliary_loss_clip": 0.0663304, "auxiliary_loss_mlp": 0.01300185, "balance_loss_clip": 0.06316404, "balance_loss_mlp": 0.01271217, "epoch": 0.13798286487298964, "flos": 21147344651520.0, "grad_norm": 2.037623312044562, "language_loss": 0.81314808, "learning_rate": 3.878926005963831e-06, "loss": 0.89248031, "num_input_tokens_seen": 49669880, "router_z_loss_clip": 3.16796875, "router_z_loss_mlp": 0.28979492, "step": 2295, "time_per_iteration": 2.6054439544677734 }, { "auxiliary_loss_clip": 0.06636374, "auxiliary_loss_mlp": 0.0128817, "balance_loss_clip": 0.06313908, "balance_loss_mlp": 0.01258892, "epoch": 0.1380429881256576, "flos": 22493286385920.0, "grad_norm": 2.5737052842908112, "language_loss": 0.79166341, "learning_rate": 3.878792521156588e-06, "loss": 0.87090886, "num_input_tokens_seen": 49687255, "router_z_loss_clip": 3.2265625, "router_z_loss_mlp": 0.29272461, "step": 2296, "time_per_iteration": 2.619140863418579 }, { "auxiliary_loss_clip": 0.06628993, "auxiliary_loss_mlp": 0.01289633, "balance_loss_clip": 0.06310926, "balance_loss_mlp": 0.01260212, "epoch": 0.13810311137832557, "flos": 21399429260160.0, "grad_norm": 2.851651623968021, "language_loss": 0.79410422, "learning_rate": 3.8786589651053446e-06, "loss": 0.87329048, "num_input_tokens_seen": 49706650, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.29418945, "step": 2297, "time_per_iteration": 2.6009418964385986 }, { "auxiliary_loss_clip": 0.06626092, "auxiliary_loss_mlp": 0.01294043, "balance_loss_clip": 0.06308778, "balance_loss_mlp": 0.01263645, "epoch": 0.13816323463099353, "flos": 25996195123200.0, "grad_norm": 2.298667652135635, "language_loss": 0.69980407, "learning_rate": 3.878525337815164e-06, "loss": 0.77900541, "num_input_tokens_seen": 49725715, "router_z_loss_clip": 3.171875, "router_z_loss_mlp": 0.30395508, "step": 2298, "time_per_iteration": 2.6135764122009277 }, { "auxiliary_loss_clip": 0.06641927, "auxiliary_loss_mlp": 0.01295043, "balance_loss_clip": 0.06320815, "balance_loss_mlp": 0.01265122, "epoch": 0.1382233578836615, "flos": 19250260686720.0, "grad_norm": 2.018428929764807, "language_loss": 0.88277024, "learning_rate": 3.878391639291116e-06, "loss": 0.9621399, "num_input_tokens_seen": 49744710, "router_z_loss_clip": 3.21289062, "router_z_loss_mlp": 0.29931641, "step": 2299, "time_per_iteration": 2.566169023513794 }, { "auxiliary_loss_clip": 0.06646445, "auxiliary_loss_mlp": 0.01293801, "balance_loss_clip": 0.06321067, "balance_loss_mlp": 0.01261901, "epoch": 0.1382834811363295, "flos": 25673392068480.0, "grad_norm": 1.7362537080860665, "language_loss": 0.77143472, "learning_rate": 3.878257869538267e-06, "loss": 0.85083717, "num_input_tokens_seen": 49764300, "router_z_loss_clip": 3.25390625, "router_z_loss_mlp": 0.31884766, "step": 2300, "time_per_iteration": 2.6392014026641846 }, { "auxiliary_loss_clip": 0.06630307, "auxiliary_loss_mlp": 0.0129214, "balance_loss_clip": 0.06313756, "balance_loss_mlp": 0.01262027, "epoch": 0.13834360438899745, "flos": 19788992513280.0, "grad_norm": 2.693910071229929, "language_loss": 0.84121746, "learning_rate": 3.878124028561692e-06, "loss": 0.92044187, "num_input_tokens_seen": 49778380, "router_z_loss_clip": 3.171875, "router_z_loss_mlp": 0.30126953, "step": 2301, "time_per_iteration": 2.558757781982422 }, { "auxiliary_loss_clip": 0.06628647, "auxiliary_loss_mlp": 0.01298363, "balance_loss_clip": 0.06311902, "balance_loss_mlp": 0.01269395, "epoch": 0.13840372764166542, "flos": 26659200954240.0, "grad_norm": 2.1787677292046106, "language_loss": 0.87197256, "learning_rate": 3.877990116366466e-06, "loss": 0.95124263, "num_input_tokens_seen": 49797460, "router_z_loss_clip": 3.16601562, "router_z_loss_mlp": 0.28955078, "step": 2302, "time_per_iteration": 2.627093553543091 }, { "auxiliary_loss_clip": 0.06527168, "auxiliary_loss_mlp": 0.01267371, "balance_loss_clip": 0.06333651, "balance_loss_mlp": 0.01256439, "epoch": 0.13846385089433338, "flos": 70532321944320.0, "grad_norm": 0.7257235909460257, "language_loss": 0.65486395, "learning_rate": 3.877856132957667e-06, "loss": 0.73280942, "num_input_tokens_seen": 49868005, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.10943604, "step": 2303, "time_per_iteration": 3.365152359008789 }, { "auxiliary_loss_clip": 0.06623267, "auxiliary_loss_mlp": 0.01290913, "balance_loss_clip": 0.06313284, "balance_loss_mlp": 0.01262708, "epoch": 0.13852397414700135, "flos": 17354644168320.0, "grad_norm": 1.9195422353839127, "language_loss": 0.79109794, "learning_rate": 3.877722078340374e-06, "loss": 0.87023973, "num_input_tokens_seen": 49885825, "router_z_loss_clip": 3.09960938, "router_z_loss_mlp": 0.28186035, "step": 2304, "time_per_iteration": 2.559267044067383 }, { "auxiliary_loss_clip": 0.06629011, "auxiliary_loss_mlp": 0.01294539, "balance_loss_clip": 0.0631142, "balance_loss_mlp": 0.01266096, "epoch": 0.13858409739966931, "flos": 21550257809280.0, "grad_norm": 1.8096723173866252, "language_loss": 0.78757071, "learning_rate": 3.877587952519672e-06, "loss": 0.86680621, "num_input_tokens_seen": 49905975, "router_z_loss_clip": 3.17578125, "router_z_loss_mlp": 0.28417969, "step": 2305, "time_per_iteration": 2.667126417160034 }, { "auxiliary_loss_clip": 0.06614989, "auxiliary_loss_mlp": 0.01292095, "balance_loss_clip": 0.06308585, "balance_loss_mlp": 0.01264415, "epoch": 0.13864422065233728, "flos": 21586329792000.0, "grad_norm": 2.005213924601552, "language_loss": 0.88849187, "learning_rate": 3.877453755500647e-06, "loss": 0.96756274, "num_input_tokens_seen": 49925800, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.27685547, "step": 2306, "time_per_iteration": 2.6051061153411865 }, { "auxiliary_loss_clip": 0.0650812, "auxiliary_loss_mlp": 0.01261639, "balance_loss_clip": 0.06315988, "balance_loss_mlp": 0.012519, "epoch": 0.13870434390500527, "flos": 53384927650560.0, "grad_norm": 0.845513561745338, "language_loss": 0.58612549, "learning_rate": 3.877319487288387e-06, "loss": 0.66382313, "num_input_tokens_seen": 49977620, "router_z_loss_clip": 1.921875, "router_z_loss_mlp": 0.09729004, "step": 2307, "time_per_iteration": 4.7395710945129395 }, { "auxiliary_loss_clip": 0.06631291, "auxiliary_loss_mlp": 0.01295685, "balance_loss_clip": 0.06308831, "balance_loss_mlp": 0.01266335, "epoch": 0.13876446715767324, "flos": 22572641802240.0, "grad_norm": 2.0993609808629548, "language_loss": 0.80348885, "learning_rate": 3.877185147887984e-06, "loss": 0.88275862, "num_input_tokens_seen": 49996650, "router_z_loss_clip": 3.22460938, "router_z_loss_mlp": 0.29345703, "step": 2308, "time_per_iteration": 2.593931198120117 }, { "auxiliary_loss_clip": 0.06614208, "auxiliary_loss_mlp": 0.01296196, "balance_loss_clip": 0.06303508, "balance_loss_mlp": 0.01265893, "epoch": 0.1388245904103412, "flos": 20711671747200.0, "grad_norm": 2.1471525067223998, "language_loss": 0.79673272, "learning_rate": 3.877050737304533e-06, "loss": 0.87583673, "num_input_tokens_seen": 50015640, "router_z_loss_clip": 3.109375, "router_z_loss_mlp": 0.30297852, "step": 2309, "time_per_iteration": 2.5786383152008057 }, { "auxiliary_loss_clip": 0.06623891, "auxiliary_loss_mlp": 0.01290882, "balance_loss_clip": 0.06302572, "balance_loss_mlp": 0.01261842, "epoch": 0.13888471366300917, "flos": 20560382000640.0, "grad_norm": 2.438895695474165, "language_loss": 0.6946345, "learning_rate": 3.876916255543129e-06, "loss": 0.77378219, "num_input_tokens_seen": 50033500, "router_z_loss_clip": 3.2109375, "router_z_loss_mlp": 0.2902832, "step": 2310, "time_per_iteration": 3.9968554973602295 }, { "auxiliary_loss_clip": 0.06619713, "auxiliary_loss_mlp": 0.01296027, "balance_loss_clip": 0.06306162, "balance_loss_mlp": 0.01266368, "epoch": 0.13894483691567713, "flos": 13842008357760.0, "grad_norm": 2.46984386327342, "language_loss": 0.8482908, "learning_rate": 3.8767817026088725e-06, "loss": 0.92744827, "num_input_tokens_seen": 50050075, "router_z_loss_clip": 3.13671875, "router_z_loss_mlp": 0.29663086, "step": 2311, "time_per_iteration": 2.557309627532959 }, { "auxiliary_loss_clip": 0.06625531, "auxiliary_loss_mlp": 0.01291265, "balance_loss_clip": 0.06305869, "balance_loss_mlp": 0.01261463, "epoch": 0.1390049601683451, "flos": 28037567018880.0, "grad_norm": 2.1264529830633565, "language_loss": 0.83343637, "learning_rate": 3.876647078506866e-06, "loss": 0.91260433, "num_input_tokens_seen": 50070080, "router_z_loss_clip": 3.19726562, "router_z_loss_mlp": 0.2980957, "step": 2312, "time_per_iteration": 2.617638111114502 }, { "auxiliary_loss_clip": 0.06625237, "auxiliary_loss_mlp": 0.012896, "balance_loss_clip": 0.06305325, "balance_loss_mlp": 0.01259869, "epoch": 0.13906508342101306, "flos": 26763475760640.0, "grad_norm": 1.8253434610684443, "language_loss": 0.88131434, "learning_rate": 3.876512383242215e-06, "loss": 0.96046269, "num_input_tokens_seen": 50090040, "router_z_loss_clip": 3.20117188, "router_z_loss_mlp": 0.29736328, "step": 2313, "time_per_iteration": 2.611941337585449 }, { "auxiliary_loss_clip": 0.06622252, "auxiliary_loss_mlp": 0.01289025, "balance_loss_clip": 0.06306234, "balance_loss_mlp": 0.01259771, "epoch": 0.13912520667368106, "flos": 24541995513600.0, "grad_norm": 2.373944457193903, "language_loss": 0.8144865, "learning_rate": 3.876377616820024e-06, "loss": 0.89359927, "num_input_tokens_seen": 50110595, "router_z_loss_clip": 3.16015625, "router_z_loss_mlp": 0.29248047, "step": 2314, "time_per_iteration": 5.454007863998413 }, { "auxiliary_loss_clip": 0.06612702, "auxiliary_loss_mlp": 0.01296616, "balance_loss_clip": 0.06300454, "balance_loss_mlp": 0.01267385, "epoch": 0.13918532992634902, "flos": 19388007999360.0, "grad_norm": 3.937985163934756, "language_loss": 0.86462736, "learning_rate": 3.876242779245409e-06, "loss": 0.94372046, "num_input_tokens_seen": 50125430, "router_z_loss_clip": 3.125, "router_z_loss_mlp": 0.29223633, "step": 2315, "time_per_iteration": 2.5443289279937744 }, { "auxiliary_loss_clip": 0.06615126, "auxiliary_loss_mlp": 0.01286446, "balance_loss_clip": 0.06302479, "balance_loss_mlp": 0.01257812, "epoch": 0.139245453179017, "flos": 21330010552320.0, "grad_norm": 2.2798865120442433, "language_loss": 0.79412866, "learning_rate": 3.876107870523477e-06, "loss": 0.87314439, "num_input_tokens_seen": 50144120, "router_z_loss_clip": 3.12890625, "router_z_loss_mlp": 0.28613281, "step": 2316, "time_per_iteration": 2.5711560249328613 }, { "auxiliary_loss_clip": 0.06615543, "auxiliary_loss_mlp": 0.01289425, "balance_loss_clip": 0.06302175, "balance_loss_mlp": 0.01260565, "epoch": 0.13930557643168495, "flos": 19506747634560.0, "grad_norm": 1.6893778571345108, "language_loss": 0.77942312, "learning_rate": 3.875972890659349e-06, "loss": 0.85847282, "num_input_tokens_seen": 50162500, "router_z_loss_clip": 3.1328125, "router_z_loss_mlp": 0.28845215, "step": 2317, "time_per_iteration": 2.5790538787841797 }, { "auxiliary_loss_clip": 0.06624401, "auxiliary_loss_mlp": 0.01286827, "balance_loss_clip": 0.06305093, "balance_loss_mlp": 0.01257907, "epoch": 0.13936569968435292, "flos": 25417869442560.0, "grad_norm": 2.100776623691209, "language_loss": 0.81819069, "learning_rate": 3.875837839658139e-06, "loss": 0.89730299, "num_input_tokens_seen": 50182415, "router_z_loss_clip": 3.19140625, "router_z_loss_mlp": 0.28955078, "step": 2318, "time_per_iteration": 2.6414830684661865 }, { "auxiliary_loss_clip": 0.06514758, "auxiliary_loss_mlp": 0.01264539, "balance_loss_clip": 0.06325078, "balance_loss_mlp": 0.01254716, "epoch": 0.13942582293702088, "flos": 70793211231360.0, "grad_norm": 0.8236692678269992, "language_loss": 0.59268427, "learning_rate": 3.87570271752497e-06, "loss": 0.67047727, "num_input_tokens_seen": 50245160, "router_z_loss_clip": 1.89648438, "router_z_loss_mlp": 0.09814453, "step": 2319, "time_per_iteration": 3.3297441005706787 }, { "auxiliary_loss_clip": 0.06627646, "auxiliary_loss_mlp": 0.01298806, "balance_loss_clip": 0.06310232, "balance_loss_mlp": 0.01269028, "epoch": 0.13948594618968888, "flos": 35599725676800.0, "grad_norm": 3.5932282260314437, "language_loss": 0.66497266, "learning_rate": 3.875567524264967e-06, "loss": 0.74423718, "num_input_tokens_seen": 50268215, "router_z_loss_clip": 3.171875, "router_z_loss_mlp": 0.29785156, "step": 2320, "time_per_iteration": 2.808304786682129 }, { "auxiliary_loss_clip": 0.06616521, "auxiliary_loss_mlp": 0.01287328, "balance_loss_clip": 0.0630998, "balance_loss_mlp": 0.01259099, "epoch": 0.13954606944235684, "flos": 21111482304000.0, "grad_norm": 1.5623200838718154, "language_loss": 0.71989262, "learning_rate": 3.875432259883256e-06, "loss": 0.79893112, "num_input_tokens_seen": 50288575, "router_z_loss_clip": 3.06640625, "router_z_loss_mlp": 0.2824707, "step": 2321, "time_per_iteration": 2.6180925369262695 }, { "auxiliary_loss_clip": 0.06624351, "auxiliary_loss_mlp": 0.01296225, "balance_loss_clip": 0.06311399, "balance_loss_mlp": 0.012669, "epoch": 0.1396061926950248, "flos": 25051154048640.0, "grad_norm": 4.340587212206564, "language_loss": 0.87141341, "learning_rate": 3.875296924384965e-06, "loss": 0.9506191, "num_input_tokens_seen": 50308735, "router_z_loss_clip": 3.13085938, "router_z_loss_mlp": 0.29296875, "step": 2322, "time_per_iteration": 2.6213951110839844 }, { "auxiliary_loss_clip": 0.06604984, "auxiliary_loss_mlp": 0.01288051, "balance_loss_clip": 0.06305783, "balance_loss_mlp": 0.01260418, "epoch": 0.13966631594769277, "flos": 37643193924480.0, "grad_norm": 1.636446738289497, "language_loss": 0.68044829, "learning_rate": 3.875161517775226e-06, "loss": 0.75937867, "num_input_tokens_seen": 50331025, "router_z_loss_clip": 2.99023438, "router_z_loss_mlp": 0.27636719, "step": 2323, "time_per_iteration": 2.749052047729492 }, { "auxiliary_loss_clip": 0.0663865, "auxiliary_loss_mlp": 0.01286727, "balance_loss_clip": 0.06317831, "balance_loss_mlp": 0.01259118, "epoch": 0.13972643920036074, "flos": 16696627655040.0, "grad_norm": 2.892813837390436, "language_loss": 0.90952963, "learning_rate": 3.875026040059175e-06, "loss": 0.98878336, "num_input_tokens_seen": 50349725, "router_z_loss_clip": 3.20703125, "router_z_loss_mlp": 0.27624512, "step": 2324, "time_per_iteration": 2.5816195011138916 }, { "auxiliary_loss_clip": 0.06628054, "auxiliary_loss_mlp": 0.01283583, "balance_loss_clip": 0.06314127, "balance_loss_mlp": 0.01257118, "epoch": 0.1397865624530287, "flos": 23337742233600.0, "grad_norm": 7.042290016397892, "language_loss": 0.73093688, "learning_rate": 3.8748904912419485e-06, "loss": 0.81005323, "num_input_tokens_seen": 50367965, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.26489258, "step": 2325, "time_per_iteration": 2.5923280715942383 }, { "auxiliary_loss_clip": 0.06633748, "auxiliary_loss_mlp": 0.01293467, "balance_loss_clip": 0.06320151, "balance_loss_mlp": 0.01266776, "epoch": 0.13984668570569667, "flos": 22784000526720.0, "grad_norm": 2.071319614304471, "language_loss": 0.82788438, "learning_rate": 3.874754871328688e-06, "loss": 0.90715659, "num_input_tokens_seen": 50385605, "router_z_loss_clip": 3.13085938, "router_z_loss_mlp": 0.26696777, "step": 2326, "time_per_iteration": 2.6218478679656982 }, { "auxiliary_loss_clip": 0.06615911, "auxiliary_loss_mlp": 0.01286342, "balance_loss_clip": 0.06313764, "balance_loss_mlp": 0.01258436, "epoch": 0.13990680895836466, "flos": 19470759505920.0, "grad_norm": 1.796301502642996, "language_loss": 0.90362573, "learning_rate": 3.874619180324534e-06, "loss": 0.98264819, "num_input_tokens_seen": 50403985, "router_z_loss_clip": 3.02148438, "router_z_loss_mlp": 0.2791748, "step": 2327, "time_per_iteration": 2.5812952518463135 }, { "auxiliary_loss_clip": 0.06629534, "auxiliary_loss_mlp": 0.01289726, "balance_loss_clip": 0.06319246, "balance_loss_mlp": 0.01259757, "epoch": 0.13996693221103262, "flos": 20309555203200.0, "grad_norm": 2.1777996564745354, "language_loss": 0.86941373, "learning_rate": 3.874483418234632e-06, "loss": 0.94860625, "num_input_tokens_seen": 50421590, "router_z_loss_clip": 3.10546875, "router_z_loss_mlp": 0.30004883, "step": 2328, "time_per_iteration": 2.560528039932251 }, { "auxiliary_loss_clip": 0.06636021, "auxiliary_loss_mlp": 0.01292963, "balance_loss_clip": 0.06323667, "balance_loss_mlp": 0.01263184, "epoch": 0.1400270554637006, "flos": 26625434958720.0, "grad_norm": 7.864756253569306, "language_loss": 0.75382876, "learning_rate": 3.874347585064131e-06, "loss": 0.83311856, "num_input_tokens_seen": 50443945, "router_z_loss_clip": 3.12304688, "router_z_loss_mlp": 0.2980957, "step": 2329, "time_per_iteration": 2.632369041442871 }, { "auxiliary_loss_clip": 0.06632884, "auxiliary_loss_mlp": 0.01288374, "balance_loss_clip": 0.06322619, "balance_loss_mlp": 0.01259835, "epoch": 0.14008717871636855, "flos": 19397651218560.0, "grad_norm": 2.2057496507539356, "language_loss": 0.78519773, "learning_rate": 3.874211680818183e-06, "loss": 0.86441028, "num_input_tokens_seen": 50462065, "router_z_loss_clip": 3.10351562, "router_z_loss_mlp": 0.28515625, "step": 2330, "time_per_iteration": 2.567037343978882 }, { "auxiliary_loss_clip": 0.06630664, "auxiliary_loss_mlp": 0.01286552, "balance_loss_clip": 0.06321794, "balance_loss_mlp": 0.01259253, "epoch": 0.14014730196903652, "flos": 15309624620160.0, "grad_norm": 4.47634564124555, "language_loss": 0.73212135, "learning_rate": 3.87407570550194e-06, "loss": 0.81129348, "num_input_tokens_seen": 50479565, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.27294922, "step": 2331, "time_per_iteration": 2.5528464317321777 }, { "auxiliary_loss_clip": 0.06618339, "auxiliary_loss_mlp": 0.01291006, "balance_loss_clip": 0.06320776, "balance_loss_mlp": 0.01262706, "epoch": 0.14020742522170448, "flos": 14945047505280.0, "grad_norm": 2.1425400818487295, "language_loss": 0.74175417, "learning_rate": 3.873939659120557e-06, "loss": 0.82084763, "num_input_tokens_seen": 50497305, "router_z_loss_clip": 2.97265625, "router_z_loss_mlp": 0.28320312, "step": 2332, "time_per_iteration": 2.5735912322998047 }, { "auxiliary_loss_clip": 0.06533103, "auxiliary_loss_mlp": 0.01293947, "balance_loss_clip": 0.0634401, "balance_loss_mlp": 0.01284434, "epoch": 0.14026754847437245, "flos": 48839956410240.0, "grad_norm": 0.8137201528824098, "language_loss": 0.56065142, "learning_rate": 3.873803541679196e-06, "loss": 0.63892192, "num_input_tokens_seen": 50549735, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.0949707, "step": 2333, "time_per_iteration": 3.0604465007781982 }, { "auxiliary_loss_clip": 0.06634796, "auxiliary_loss_mlp": 0.01292053, "balance_loss_clip": 0.0632502, "balance_loss_mlp": 0.012633, "epoch": 0.14032767172704044, "flos": 25779972862080.0, "grad_norm": 2.848470589999661, "language_loss": 0.83784837, "learning_rate": 3.873667353183016e-06, "loss": 0.91711688, "num_input_tokens_seen": 50570100, "router_z_loss_clip": 3.09960938, "router_z_loss_mlp": 0.28735352, "step": 2334, "time_per_iteration": 2.6547484397888184 }, { "auxiliary_loss_clip": 0.06637342, "auxiliary_loss_mlp": 0.01283462, "balance_loss_clip": 0.06322505, "balance_loss_mlp": 0.01255471, "epoch": 0.1403877949797084, "flos": 21222884707200.0, "grad_norm": 1.7913856376301829, "language_loss": 0.8169418, "learning_rate": 3.8735310936371825e-06, "loss": 0.89614987, "num_input_tokens_seen": 50589185, "router_z_loss_clip": 3.15234375, "router_z_loss_mlp": 0.2800293, "step": 2335, "time_per_iteration": 2.6130177974700928 }, { "auxiliary_loss_clip": 0.06658404, "auxiliary_loss_mlp": 0.01294184, "balance_loss_clip": 0.06339075, "balance_loss_mlp": 0.01262165, "epoch": 0.14044791823237637, "flos": 22754678797440.0, "grad_norm": 1.7452040382072127, "language_loss": 0.82855004, "learning_rate": 3.873394763046862e-06, "loss": 0.90807593, "num_input_tokens_seen": 50609645, "router_z_loss_clip": 3.19140625, "router_z_loss_mlp": 0.31982422, "step": 2336, "time_per_iteration": 2.6077589988708496 }, { "auxiliary_loss_clip": 0.06638287, "auxiliary_loss_mlp": 0.01290305, "balance_loss_clip": 0.06330252, "balance_loss_mlp": 0.01260241, "epoch": 0.14050804148504434, "flos": 22970775277440.0, "grad_norm": 1.9664673656674663, "language_loss": 0.81802392, "learning_rate": 3.873258361417225e-06, "loss": 0.89730984, "num_input_tokens_seen": 50628385, "router_z_loss_clip": 3.08398438, "router_z_loss_mlp": 0.30029297, "step": 2337, "time_per_iteration": 2.614114284515381 }, { "auxiliary_loss_clip": 0.06646544, "auxiliary_loss_mlp": 0.0128879, "balance_loss_clip": 0.06329951, "balance_loss_mlp": 0.01260073, "epoch": 0.1405681647377123, "flos": 22206890730240.0, "grad_norm": 1.8569222962169998, "language_loss": 0.80096775, "learning_rate": 3.873121888753442e-06, "loss": 0.88032115, "num_input_tokens_seen": 50647260, "router_z_loss_clip": 3.16796875, "router_z_loss_mlp": 0.28710938, "step": 2338, "time_per_iteration": 2.6481027603149414 }, { "auxiliary_loss_clip": 0.06627528, "auxiliary_loss_mlp": 0.01292457, "balance_loss_clip": 0.06314635, "balance_loss_mlp": 0.01261462, "epoch": 0.14062828799038027, "flos": 23739607215360.0, "grad_norm": 4.5417223975638255, "language_loss": 0.82207668, "learning_rate": 3.87298534506069e-06, "loss": 0.90127647, "num_input_tokens_seen": 50666130, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.30981445, "step": 2339, "time_per_iteration": 2.600585460662842 }, { "auxiliary_loss_clip": 0.06626246, "auxiliary_loss_mlp": 0.01282962, "balance_loss_clip": 0.06320694, "balance_loss_mlp": 0.01254256, "epoch": 0.14068841124304826, "flos": 39211856611200.0, "grad_norm": 2.2880034158019313, "language_loss": 0.67047691, "learning_rate": 3.872848730344146e-06, "loss": 0.749569, "num_input_tokens_seen": 50687440, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.28710938, "step": 2340, "time_per_iteration": 2.8082752227783203 }, { "auxiliary_loss_clip": 0.06627908, "auxiliary_loss_mlp": 0.01290863, "balance_loss_clip": 0.06321965, "balance_loss_mlp": 0.01262562, "epoch": 0.14074853449571623, "flos": 20198278581120.0, "grad_norm": 2.862344986025437, "language_loss": 0.79904425, "learning_rate": 3.87271204460899e-06, "loss": 0.87823188, "num_input_tokens_seen": 50704030, "router_z_loss_clip": 3.0625, "router_z_loss_mlp": 0.28295898, "step": 2341, "time_per_iteration": 2.568960189819336 }, { "auxiliary_loss_clip": 0.06629483, "auxiliary_loss_mlp": 0.01287607, "balance_loss_clip": 0.06323874, "balance_loss_mlp": 0.012579, "epoch": 0.1408086577483842, "flos": 18411800405760.0, "grad_norm": 1.9768302308711836, "language_loss": 0.81785768, "learning_rate": 3.8725752878604066e-06, "loss": 0.89702857, "num_input_tokens_seen": 50723305, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.29711914, "step": 2342, "time_per_iteration": 2.5819427967071533 }, { "auxiliary_loss_clip": 0.06627341, "auxiliary_loss_mlp": 0.01282186, "balance_loss_clip": 0.06327035, "balance_loss_mlp": 0.012559, "epoch": 0.14086878100105216, "flos": 25271569013760.0, "grad_norm": 2.081921562660608, "language_loss": 0.78615391, "learning_rate": 3.87243846010358e-06, "loss": 0.86524922, "num_input_tokens_seen": 50743270, "router_z_loss_clip": 3.00585938, "router_z_loss_mlp": 0.26330566, "step": 2343, "time_per_iteration": 2.623281717300415 }, { "auxiliary_loss_clip": 0.06529941, "auxiliary_loss_mlp": 0.0127189, "balance_loss_clip": 0.06341338, "balance_loss_mlp": 0.01262168, "epoch": 0.14092890425372012, "flos": 65997553703040.0, "grad_norm": 0.8072413760396987, "language_loss": 0.61641532, "learning_rate": 3.872301561343699e-06, "loss": 0.69443369, "num_input_tokens_seen": 50802710, "router_z_loss_clip": 1.88964844, "router_z_loss_mlp": 0.09710693, "step": 2344, "time_per_iteration": 3.134500741958618 }, { "auxiliary_loss_clip": 0.06623751, "auxiliary_loss_mlp": 0.01288118, "balance_loss_clip": 0.06318251, "balance_loss_mlp": 0.01260533, "epoch": 0.1409890275063881, "flos": 23701564661760.0, "grad_norm": 1.794185616831027, "language_loss": 0.65711713, "learning_rate": 3.872164591585956e-06, "loss": 0.7362358, "num_input_tokens_seen": 50822625, "router_z_loss_clip": 3.0546875, "router_z_loss_mlp": 0.27587891, "step": 2345, "time_per_iteration": 2.6262340545654297 }, { "auxiliary_loss_clip": 0.06640497, "auxiliary_loss_mlp": 0.01285405, "balance_loss_clip": 0.06319802, "balance_loss_mlp": 0.0125565, "epoch": 0.14104915075905605, "flos": 23629923820800.0, "grad_norm": 2.1977760894246328, "language_loss": 0.75037247, "learning_rate": 3.8720275508355435e-06, "loss": 0.82963151, "num_input_tokens_seen": 50842330, "router_z_loss_clip": 3.20898438, "router_z_loss_mlp": 0.29736328, "step": 2346, "time_per_iteration": 2.6191298961639404 }, { "auxiliary_loss_clip": 0.06631111, "auxiliary_loss_mlp": 0.01287242, "balance_loss_clip": 0.06323205, "balance_loss_mlp": 0.01256724, "epoch": 0.14110927401172405, "flos": 20601485228160.0, "grad_norm": 2.1442818530360106, "language_loss": 0.78323603, "learning_rate": 3.8718904390976585e-06, "loss": 0.86241961, "num_input_tokens_seen": 50861035, "router_z_loss_clip": 3.08007812, "router_z_loss_mlp": 0.30505371, "step": 2347, "time_per_iteration": 4.035961866378784 }, { "auxiliary_loss_clip": 0.06622925, "auxiliary_loss_mlp": 0.0128359, "balance_loss_clip": 0.06316812, "balance_loss_mlp": 0.01255742, "epoch": 0.141169397264392, "flos": 28555530232320.0, "grad_norm": 1.912242712578431, "language_loss": 0.77859038, "learning_rate": 3.8717532563775e-06, "loss": 0.85765547, "num_input_tokens_seen": 50880105, "router_z_loss_clip": 3.05859375, "router_z_loss_mlp": 0.27844238, "step": 2348, "time_per_iteration": 2.793919801712036 }, { "auxiliary_loss_clip": 0.06631707, "auxiliary_loss_mlp": 0.01291314, "balance_loss_clip": 0.06326723, "balance_loss_mlp": 0.01263872, "epoch": 0.14122952051705998, "flos": 17097947585280.0, "grad_norm": 1.8435267821214465, "language_loss": 0.87861168, "learning_rate": 3.871616002680272e-06, "loss": 0.95784187, "num_input_tokens_seen": 50897720, "router_z_loss_clip": 3.05078125, "router_z_loss_mlp": 0.2746582, "step": 2349, "time_per_iteration": 4.029081583023071 }, { "auxiliary_loss_clip": 0.06621853, "auxiliary_loss_mlp": 0.01289568, "balance_loss_clip": 0.06320713, "balance_loss_mlp": 0.01259586, "epoch": 0.14128964376972794, "flos": 28953915269760.0, "grad_norm": 1.6548291429398556, "language_loss": 0.89889908, "learning_rate": 3.871478678011177e-06, "loss": 0.97801328, "num_input_tokens_seen": 50918385, "router_z_loss_clip": 3.0078125, "router_z_loss_mlp": 0.29968262, "step": 2350, "time_per_iteration": 2.6556153297424316 }, { "auxiliary_loss_clip": 0.06627364, "auxiliary_loss_mlp": 0.01286402, "balance_loss_clip": 0.06319305, "balance_loss_mlp": 0.01258174, "epoch": 0.1413497670223959, "flos": 18995828163840.0, "grad_norm": 1.9067163351311454, "language_loss": 0.81632346, "learning_rate": 3.871341282375423e-06, "loss": 0.89546114, "num_input_tokens_seen": 50938270, "router_z_loss_clip": 3.08203125, "router_z_loss_mlp": 0.28222656, "step": 2351, "time_per_iteration": 2.5817818641662598 }, { "auxiliary_loss_clip": 0.06625715, "auxiliary_loss_mlp": 0.01280442, "balance_loss_clip": 0.06316902, "balance_loss_mlp": 0.01253405, "epoch": 0.14140989027506387, "flos": 29870053885440.0, "grad_norm": 2.467739281112864, "language_loss": 0.85193241, "learning_rate": 3.871203815778219e-06, "loss": 0.93099391, "num_input_tokens_seen": 50958155, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.26989746, "step": 2352, "time_per_iteration": 2.6379127502441406 }, { "auxiliary_loss_clip": 0.06516664, "auxiliary_loss_mlp": 0.01291348, "balance_loss_clip": 0.06328386, "balance_loss_mlp": 0.01280798, "epoch": 0.14147001352773186, "flos": 62098901331840.0, "grad_norm": 0.8938732431203316, "language_loss": 0.61960411, "learning_rate": 3.87106627822478e-06, "loss": 0.69768417, "num_input_tokens_seen": 51020705, "router_z_loss_clip": 1.8828125, "router_z_loss_mlp": 0.10559082, "step": 2353, "time_per_iteration": 6.022890090942383 }, { "auxiliary_loss_clip": 0.06619617, "auxiliary_loss_mlp": 0.01290992, "balance_loss_clip": 0.0631777, "balance_loss_mlp": 0.01263121, "epoch": 0.14153013678039983, "flos": 22023973267200.0, "grad_norm": 2.2616611632596904, "language_loss": 0.87810457, "learning_rate": 3.8709286697203196e-06, "loss": 0.95721066, "num_input_tokens_seen": 51039995, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.27880859, "step": 2354, "time_per_iteration": 2.689209222793579 }, { "auxiliary_loss_clip": 0.06621894, "auxiliary_loss_mlp": 0.0128883, "balance_loss_clip": 0.06316005, "balance_loss_mlp": 0.01259791, "epoch": 0.1415902600330678, "flos": 19726365985920.0, "grad_norm": 1.8258498303306698, "language_loss": 0.75684965, "learning_rate": 3.870790990270057e-06, "loss": 0.83595693, "num_input_tokens_seen": 51059075, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.2902832, "step": 2355, "time_per_iteration": 2.6054351329803467 }, { "auxiliary_loss_clip": 0.06523017, "auxiliary_loss_mlp": 0.01275886, "balance_loss_clip": 0.06336153, "balance_loss_mlp": 0.01265306, "epoch": 0.14165038328573576, "flos": 65919330316800.0, "grad_norm": 0.6606889041552486, "language_loss": 0.51798892, "learning_rate": 3.870653239879212e-06, "loss": 0.59597802, "num_input_tokens_seen": 51120380, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.105896, "step": 2356, "time_per_iteration": 3.1415841579437256 }, { "auxiliary_loss_clip": 0.06623605, "auxiliary_loss_mlp": 0.01282972, "balance_loss_clip": 0.06317122, "balance_loss_mlp": 0.01254243, "epoch": 0.14171050653840372, "flos": 12135011379840.0, "grad_norm": 2.1765200603227415, "language_loss": 0.71760404, "learning_rate": 3.8705154185530095e-06, "loss": 0.79666984, "num_input_tokens_seen": 51136950, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.28735352, "step": 2357, "time_per_iteration": 2.6125292778015137 }, { "auxiliary_loss_clip": 0.06620441, "auxiliary_loss_mlp": 0.01284881, "balance_loss_clip": 0.06310157, "balance_loss_mlp": 0.01256462, "epoch": 0.1417706297910717, "flos": 20418735473280.0, "grad_norm": 2.119170594125461, "language_loss": 0.83471036, "learning_rate": 3.870377526296674e-06, "loss": 0.91376364, "num_input_tokens_seen": 51155175, "router_z_loss_clip": 3.1015625, "router_z_loss_mlp": 0.28466797, "step": 2358, "time_per_iteration": 2.5898125171661377 }, { "auxiliary_loss_clip": 0.06628573, "auxiliary_loss_mlp": 0.01290954, "balance_loss_clip": 0.06315868, "balance_loss_mlp": 0.0125953, "epoch": 0.14183075304373965, "flos": 22386831373440.0, "grad_norm": 2.188916045766554, "language_loss": 0.72436833, "learning_rate": 3.870239563115436e-06, "loss": 0.80356359, "num_input_tokens_seen": 51174500, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.31396484, "step": 2359, "time_per_iteration": 2.620107889175415 }, { "auxiliary_loss_clip": 0.06623169, "auxiliary_loss_mlp": 0.01287226, "balance_loss_clip": 0.06311983, "balance_loss_mlp": 0.01257352, "epoch": 0.14189087629640765, "flos": 21587503749120.0, "grad_norm": 2.281643245894803, "language_loss": 0.76985788, "learning_rate": 3.870101529014526e-06, "loss": 0.84896183, "num_input_tokens_seen": 51194270, "router_z_loss_clip": 3.11132812, "router_z_loss_mlp": 0.29882812, "step": 2360, "time_per_iteration": 2.60552978515625 }, { "auxiliary_loss_clip": 0.06605987, "auxiliary_loss_mlp": 0.01283814, "balance_loss_clip": 0.06303318, "balance_loss_mlp": 0.01255919, "epoch": 0.1419509995490756, "flos": 20014312942080.0, "grad_norm": 2.2294666992134315, "language_loss": 0.82775712, "learning_rate": 3.869963423999178e-06, "loss": 0.90665519, "num_input_tokens_seen": 51211850, "router_z_loss_clip": 3.02734375, "router_z_loss_mlp": 0.27905273, "step": 2361, "time_per_iteration": 2.5981693267822266 }, { "auxiliary_loss_clip": 0.06613867, "auxiliary_loss_mlp": 0.01286514, "balance_loss_clip": 0.06309792, "balance_loss_mlp": 0.01257403, "epoch": 0.14201112280174358, "flos": 31949552188800.0, "grad_norm": 2.0979966425277206, "language_loss": 0.76078814, "learning_rate": 3.86982524807463e-06, "loss": 0.83979195, "num_input_tokens_seen": 51233545, "router_z_loss_clip": 3.04101562, "router_z_loss_mlp": 0.29125977, "step": 2362, "time_per_iteration": 2.692680835723877 }, { "auxiliary_loss_clip": 0.06610263, "auxiliary_loss_mlp": 0.01286644, "balance_loss_clip": 0.06311732, "balance_loss_mlp": 0.01259178, "epoch": 0.14207124605441154, "flos": 41473811180160.0, "grad_norm": 1.6249592610950359, "language_loss": 0.74539495, "learning_rate": 3.869687001246122e-06, "loss": 0.82436401, "num_input_tokens_seen": 51257615, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.2746582, "step": 2363, "time_per_iteration": 2.821244955062866 }, { "auxiliary_loss_clip": 0.0661366, "auxiliary_loss_mlp": 0.01290258, "balance_loss_clip": 0.06308872, "balance_loss_mlp": 0.0126179, "epoch": 0.1421313693070795, "flos": 31913186716800.0, "grad_norm": 2.4958289114614773, "language_loss": 0.74392349, "learning_rate": 3.8695486835188946e-06, "loss": 0.82296264, "num_input_tokens_seen": 51279645, "router_z_loss_clip": 3.05078125, "router_z_loss_mlp": 0.28491211, "step": 2364, "time_per_iteration": 2.747854709625244 }, { "auxiliary_loss_clip": 0.06605902, "auxiliary_loss_mlp": 0.01287106, "balance_loss_clip": 0.06309938, "balance_loss_mlp": 0.01259533, "epoch": 0.14219149255974747, "flos": 26878609670400.0, "grad_norm": 2.6025335121900195, "language_loss": 0.91536272, "learning_rate": 3.869410294898195e-06, "loss": 0.99429274, "num_input_tokens_seen": 51299775, "router_z_loss_clip": 2.95898438, "router_z_loss_mlp": 0.27563477, "step": 2365, "time_per_iteration": 2.6236953735351562 }, { "auxiliary_loss_clip": 0.06614055, "auxiliary_loss_mlp": 0.01291363, "balance_loss_clip": 0.06308778, "balance_loss_mlp": 0.01262276, "epoch": 0.14225161581241544, "flos": 27461882741760.0, "grad_norm": 1.89731986684481, "language_loss": 0.66740435, "learning_rate": 3.869271835389268e-06, "loss": 0.74645853, "num_input_tokens_seen": 51319430, "router_z_loss_clip": 3.04882812, "router_z_loss_mlp": 0.29064941, "step": 2366, "time_per_iteration": 2.7431375980377197 }, { "auxiliary_loss_clip": 0.06611389, "auxiliary_loss_mlp": 0.01292268, "balance_loss_clip": 0.06312396, "balance_loss_mlp": 0.01265124, "epoch": 0.14231173906508343, "flos": 10566055203840.0, "grad_norm": 2.2260620218274805, "language_loss": 0.82441288, "learning_rate": 3.8691333049973665e-06, "loss": 0.90344948, "num_input_tokens_seen": 51336045, "router_z_loss_clip": 2.98632812, "router_z_loss_mlp": 0.27148438, "step": 2367, "time_per_iteration": 2.648030996322632 }, { "auxiliary_loss_clip": 0.06619353, "auxiliary_loss_mlp": 0.0128447, "balance_loss_clip": 0.0631291, "balance_loss_mlp": 0.01255931, "epoch": 0.1423718623177514, "flos": 28367539597440.0, "grad_norm": 1.7867836293404455, "language_loss": 0.83717012, "learning_rate": 3.868994703727742e-06, "loss": 0.91620833, "num_input_tokens_seen": 51357030, "router_z_loss_clip": 3.06835938, "router_z_loss_mlp": 0.28503418, "step": 2368, "time_per_iteration": 2.6981911659240723 }, { "auxiliary_loss_clip": 0.06620209, "auxiliary_loss_mlp": 0.01295873, "balance_loss_clip": 0.06313119, "balance_loss_mlp": 0.01266095, "epoch": 0.14243198557041936, "flos": 19360279497600.0, "grad_norm": 2.7187095752901964, "language_loss": 0.88522977, "learning_rate": 3.868856031585652e-06, "loss": 0.96439064, "num_input_tokens_seen": 51374890, "router_z_loss_clip": 3.07226562, "router_z_loss_mlp": 0.29785156, "step": 2369, "time_per_iteration": 2.6070384979248047 }, { "auxiliary_loss_clip": 0.06627437, "auxiliary_loss_mlp": 0.01290094, "balance_loss_clip": 0.06316016, "balance_loss_mlp": 0.01260768, "epoch": 0.14249210882308733, "flos": 28814952072960.0, "grad_norm": 1.5455268527588326, "language_loss": 0.76822388, "learning_rate": 3.868717288576354e-06, "loss": 0.84739918, "num_input_tokens_seen": 51398100, "router_z_loss_clip": 3.11328125, "router_z_loss_mlp": 0.29296875, "step": 2370, "time_per_iteration": 2.672473192214966 }, { "auxiliary_loss_clip": 0.06618766, "auxiliary_loss_mlp": 0.01303502, "balance_loss_clip": 0.06318148, "balance_loss_mlp": 0.01274677, "epoch": 0.1425522320757553, "flos": 21841433147520.0, "grad_norm": 1.7011402241794695, "language_loss": 0.83499879, "learning_rate": 3.868578474705109e-06, "loss": 0.91422147, "num_input_tokens_seen": 51418745, "router_z_loss_clip": 3.00390625, "router_z_loss_mlp": 0.28808594, "step": 2371, "time_per_iteration": 2.647955894470215 }, { "auxiliary_loss_clip": 0.06626646, "auxiliary_loss_mlp": 0.01291134, "balance_loss_clip": 0.06318572, "balance_loss_mlp": 0.01262428, "epoch": 0.14261235532842326, "flos": 17317230520320.0, "grad_norm": 2.1220732485710876, "language_loss": 0.83985734, "learning_rate": 3.868439589977181e-06, "loss": 0.9190352, "num_input_tokens_seen": 51437455, "router_z_loss_clip": 3.078125, "router_z_loss_mlp": 0.28735352, "step": 2372, "time_per_iteration": 2.5621936321258545 }, { "auxiliary_loss_clip": 0.066202, "auxiliary_loss_mlp": 0.01301259, "balance_loss_clip": 0.06314029, "balance_loss_mlp": 0.01272243, "epoch": 0.14267247858109125, "flos": 18812659138560.0, "grad_norm": 4.325317256428279, "language_loss": 0.85771072, "learning_rate": 3.868300634397836e-06, "loss": 0.93692529, "num_input_tokens_seen": 51455710, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.29016113, "step": 2373, "time_per_iteration": 2.5607993602752686 }, { "auxiliary_loss_clip": 0.06617084, "auxiliary_loss_mlp": 0.01294888, "balance_loss_clip": 0.06313992, "balance_loss_mlp": 0.0126666, "epoch": 0.14273260183375922, "flos": 11362783351680.0, "grad_norm": 2.42140427229718, "language_loss": 0.8738938, "learning_rate": 3.8681616079723445e-06, "loss": 0.95301348, "num_input_tokens_seen": 51471270, "router_z_loss_clip": 3.03710938, "router_z_loss_mlp": 0.28198242, "step": 2374, "time_per_iteration": 2.5559117794036865 }, { "auxiliary_loss_clip": 0.06633274, "auxiliary_loss_mlp": 0.01298387, "balance_loss_clip": 0.06322244, "balance_loss_mlp": 0.01268156, "epoch": 0.14279272508642718, "flos": 27575800767360.0, "grad_norm": 1.6977244541123546, "language_loss": 0.80050802, "learning_rate": 3.868022510705977e-06, "loss": 0.87982464, "num_input_tokens_seen": 51492705, "router_z_loss_clip": 3.11132812, "router_z_loss_mlp": 0.30273438, "step": 2375, "time_per_iteration": 2.647430658340454 }, { "auxiliary_loss_clip": 0.06628229, "auxiliary_loss_mlp": 0.01301765, "balance_loss_clip": 0.06320103, "balance_loss_mlp": 0.01271009, "epoch": 0.14285284833909515, "flos": 16258019857920.0, "grad_norm": 2.2315034435539607, "language_loss": 0.78006876, "learning_rate": 3.867883342604009e-06, "loss": 0.85936874, "num_input_tokens_seen": 51510780, "router_z_loss_clip": 3.08007812, "router_z_loss_mlp": 0.30761719, "step": 2376, "time_per_iteration": 2.5522685050964355 }, { "auxiliary_loss_clip": 0.06625591, "auxiliary_loss_mlp": 0.01289403, "balance_loss_clip": 0.0631832, "balance_loss_mlp": 0.012614, "epoch": 0.1429129715917631, "flos": 19761725208960.0, "grad_norm": 1.92155450838772, "language_loss": 0.94439381, "learning_rate": 3.867744103671717e-06, "loss": 1.02354372, "num_input_tokens_seen": 51531400, "router_z_loss_clip": 3.07226562, "router_z_loss_mlp": 0.28027344, "step": 2377, "time_per_iteration": 2.6066181659698486 }, { "auxiliary_loss_clip": 0.06628527, "auxiliary_loss_mlp": 0.01302001, "balance_loss_clip": 0.06321198, "balance_loss_mlp": 0.01270792, "epoch": 0.14297309484443108, "flos": 21142606896000.0, "grad_norm": 1.8432938508083039, "language_loss": 0.92140758, "learning_rate": 3.867604793914382e-06, "loss": 1.00071287, "num_input_tokens_seen": 51548215, "router_z_loss_clip": 3.08007812, "router_z_loss_mlp": 0.31201172, "step": 2378, "time_per_iteration": 2.5862956047058105 }, { "auxiliary_loss_clip": 0.0662895, "auxiliary_loss_mlp": 0.01303178, "balance_loss_clip": 0.06321508, "balance_loss_mlp": 0.01273829, "epoch": 0.14303321809709904, "flos": 23593432567680.0, "grad_norm": 2.2903839476625674, "language_loss": 0.75404656, "learning_rate": 3.8674654133372864e-06, "loss": 0.83336782, "num_input_tokens_seen": 51566820, "router_z_loss_clip": 3.07226562, "router_z_loss_mlp": 0.29321289, "step": 2379, "time_per_iteration": 2.6060972213745117 }, { "auxiliary_loss_clip": 0.06616461, "auxiliary_loss_mlp": 0.01286006, "balance_loss_clip": 0.06312446, "balance_loss_mlp": 0.0125742, "epoch": 0.14309334134976703, "flos": 15893778159360.0, "grad_norm": 1.79645632132124, "language_loss": 0.79625404, "learning_rate": 3.867325961945714e-06, "loss": 0.87527871, "num_input_tokens_seen": 51585075, "router_z_loss_clip": 3.0390625, "router_z_loss_mlp": 0.28552246, "step": 2380, "time_per_iteration": 2.587057590484619 }, { "auxiliary_loss_clip": 0.06635334, "auxiliary_loss_mlp": 0.01292837, "balance_loss_clip": 0.06324492, "balance_loss_mlp": 0.01263357, "epoch": 0.143153464602435, "flos": 16331086218240.0, "grad_norm": 2.9596322907656463, "language_loss": 0.89666271, "learning_rate": 3.867186439744955e-06, "loss": 0.9759444, "num_input_tokens_seen": 51603185, "router_z_loss_clip": 3.11132812, "router_z_loss_mlp": 0.29504395, "step": 2381, "time_per_iteration": 2.5927491188049316 }, { "auxiliary_loss_clip": 0.06617668, "auxiliary_loss_mlp": 0.01285658, "balance_loss_clip": 0.06312943, "balance_loss_mlp": 0.01257751, "epoch": 0.14321358785510296, "flos": 17097737950080.0, "grad_norm": 2.7584088063824685, "language_loss": 0.77624708, "learning_rate": 3.867046846740299e-06, "loss": 0.8552804, "num_input_tokens_seen": 51620880, "router_z_loss_clip": 3.04882812, "router_z_loss_mlp": 0.27941895, "step": 2382, "time_per_iteration": 2.5824480056762695 }, { "auxiliary_loss_clip": 0.06619269, "auxiliary_loss_mlp": 0.01286294, "balance_loss_clip": 0.06312849, "balance_loss_mlp": 0.01257756, "epoch": 0.14327371110777093, "flos": 26330108843520.0, "grad_norm": 2.442770855802001, "language_loss": 0.78289378, "learning_rate": 3.866907182937039e-06, "loss": 0.86194944, "num_input_tokens_seen": 51640170, "router_z_loss_clip": 3.06054688, "router_z_loss_mlp": 0.28552246, "step": 2383, "time_per_iteration": 2.6402127742767334 }, { "auxiliary_loss_clip": 0.06624854, "auxiliary_loss_mlp": 0.01288386, "balance_loss_clip": 0.06314556, "balance_loss_mlp": 0.01259347, "epoch": 0.1433338343604389, "flos": 18082163243520.0, "grad_norm": 2.2910508715416578, "language_loss": 0.89168108, "learning_rate": 3.866767448340471e-06, "loss": 0.97081351, "num_input_tokens_seen": 51656580, "router_z_loss_clip": 3.1015625, "router_z_loss_mlp": 0.29052734, "step": 2384, "time_per_iteration": 2.5642945766448975 }, { "auxiliary_loss_clip": 0.06632537, "auxiliary_loss_mlp": 0.01286276, "balance_loss_clip": 0.06318326, "balance_loss_mlp": 0.01257213, "epoch": 0.14339395761310686, "flos": 15528110941440.0, "grad_norm": 2.3712025267008756, "language_loss": 0.80844009, "learning_rate": 3.866627642955895e-06, "loss": 0.88762826, "num_input_tokens_seen": 51674645, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.2902832, "step": 2385, "time_per_iteration": 2.5720767974853516 }, { "auxiliary_loss_clip": 0.06627949, "auxiliary_loss_mlp": 0.01290318, "balance_loss_clip": 0.06316341, "balance_loss_mlp": 0.01263115, "epoch": 0.14345408086577485, "flos": 28556368773120.0, "grad_norm": 1.7830210663594275, "language_loss": 0.76067573, "learning_rate": 3.866487766788612e-06, "loss": 0.83985841, "num_input_tokens_seen": 51695770, "router_z_loss_clip": 3.11914062, "router_z_loss_mlp": 0.2722168, "step": 2386, "time_per_iteration": 4.074405193328857 }, { "auxiliary_loss_clip": 0.06620689, "auxiliary_loss_mlp": 0.01283842, "balance_loss_clip": 0.06316052, "balance_loss_mlp": 0.01256996, "epoch": 0.14351420411844282, "flos": 20236279207680.0, "grad_norm": 2.655469187080145, "language_loss": 0.79360378, "learning_rate": 3.866347819843925e-06, "loss": 0.87264907, "num_input_tokens_seen": 51714165, "router_z_loss_clip": 3.04492188, "router_z_loss_mlp": 0.26843262, "step": 2387, "time_per_iteration": 2.6629223823547363 }, { "auxiliary_loss_clip": 0.06619377, "auxiliary_loss_mlp": 0.01286573, "balance_loss_clip": 0.06313196, "balance_loss_mlp": 0.01257462, "epoch": 0.14357432737111078, "flos": 19871157041280.0, "grad_norm": 2.271731076634031, "language_loss": 0.8418681, "learning_rate": 3.866207802127143e-06, "loss": 0.92092764, "num_input_tokens_seen": 51734440, "router_z_loss_clip": 3.06054688, "router_z_loss_mlp": 0.29077148, "step": 2388, "time_per_iteration": 2.6023788452148438 }, { "auxiliary_loss_clip": 0.06631047, "auxiliary_loss_mlp": 0.01285645, "balance_loss_clip": 0.06320864, "balance_loss_mlp": 0.0125787, "epoch": 0.14363445062377875, "flos": 28264354894080.0, "grad_norm": 2.4985964072296745, "language_loss": 0.83018243, "learning_rate": 3.866067713643573e-06, "loss": 0.90934944, "num_input_tokens_seen": 51753730, "router_z_loss_clip": 3.1015625, "router_z_loss_mlp": 0.27807617, "step": 2389, "time_per_iteration": 4.065879583358765 }, { "auxiliary_loss_clip": 0.0663128, "auxiliary_loss_mlp": 0.01287408, "balance_loss_clip": 0.06317842, "balance_loss_mlp": 0.01258655, "epoch": 0.1436945738764467, "flos": 18192517470720.0, "grad_norm": 2.054621814545176, "language_loss": 0.83916593, "learning_rate": 3.8659275543985285e-06, "loss": 0.91835284, "num_input_tokens_seen": 51771195, "router_z_loss_clip": 3.13476562, "router_z_loss_mlp": 0.28759766, "step": 2390, "time_per_iteration": 2.567599058151245 }, { "auxiliary_loss_clip": 0.06631414, "auxiliary_loss_mlp": 0.01293387, "balance_loss_clip": 0.06325203, "balance_loss_mlp": 0.01265194, "epoch": 0.14375469712911468, "flos": 27315246896640.0, "grad_norm": 3.747277826729068, "language_loss": 0.75853693, "learning_rate": 3.865787324397324e-06, "loss": 0.83778501, "num_input_tokens_seen": 51792290, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.28210449, "step": 2391, "time_per_iteration": 2.6642255783081055 }, { "auxiliary_loss_clip": 0.06548672, "auxiliary_loss_mlp": 0.01315738, "balance_loss_clip": 0.06359644, "balance_loss_mlp": 0.01304485, "epoch": 0.14381482038178264, "flos": 56908757980800.0, "grad_norm": 0.8504442326694696, "language_loss": 0.6177569, "learning_rate": 3.865647023645277e-06, "loss": 0.696401, "num_input_tokens_seen": 51843675, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.11242676, "step": 2392, "time_per_iteration": 3.059433937072754 }, { "auxiliary_loss_clip": 0.06648397, "auxiliary_loss_mlp": 0.01304068, "balance_loss_clip": 0.06331053, "balance_loss_mlp": 0.01274957, "epoch": 0.14387494363445064, "flos": 14287282554240.0, "grad_norm": 2.375969706474392, "language_loss": 0.77953762, "learning_rate": 3.865506652147709e-06, "loss": 0.85906219, "num_input_tokens_seen": 51860285, "router_z_loss_clip": 3.17382812, "router_z_loss_mlp": 0.29101562, "step": 2393, "time_per_iteration": 5.480144739151001 }, { "auxiliary_loss_clip": 0.06638768, "auxiliary_loss_mlp": 0.01293456, "balance_loss_clip": 0.06328851, "balance_loss_mlp": 0.0126574, "epoch": 0.1439350668871186, "flos": 26768884348800.0, "grad_norm": 2.006938560039621, "language_loss": 0.77320492, "learning_rate": 3.865366209909941e-06, "loss": 0.85252714, "num_input_tokens_seen": 51880105, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.27685547, "step": 2394, "time_per_iteration": 2.65118408203125 }, { "auxiliary_loss_clip": 0.06635658, "auxiliary_loss_mlp": 0.01283536, "balance_loss_clip": 0.06330845, "balance_loss_mlp": 0.01255927, "epoch": 0.14399519013978657, "flos": 40709926632960.0, "grad_norm": 1.751237613298438, "language_loss": 0.86930156, "learning_rate": 3.8652256969372994e-06, "loss": 0.94849348, "num_input_tokens_seen": 51905175, "router_z_loss_clip": 3.046875, "router_z_loss_mlp": 0.27636719, "step": 2395, "time_per_iteration": 2.7666845321655273 }, { "auxiliary_loss_clip": 0.06631589, "auxiliary_loss_mlp": 0.01286267, "balance_loss_clip": 0.06327799, "balance_loss_mlp": 0.01259016, "epoch": 0.14405531339245453, "flos": 20563652309760.0, "grad_norm": 2.4409761931448752, "language_loss": 0.83365035, "learning_rate": 3.865085113235113e-06, "loss": 0.91282892, "num_input_tokens_seen": 51924490, "router_z_loss_clip": 3.03710938, "router_z_loss_mlp": 0.27294922, "step": 2396, "time_per_iteration": 2.5954229831695557 }, { "auxiliary_loss_clip": 0.06631335, "auxiliary_loss_mlp": 0.01283225, "balance_loss_clip": 0.06328983, "balance_loss_mlp": 0.01255902, "epoch": 0.1441154366451225, "flos": 19578975454080.0, "grad_norm": 3.2827981205417074, "language_loss": 0.84095848, "learning_rate": 3.864944458808712e-06, "loss": 0.92010409, "num_input_tokens_seen": 51940490, "router_z_loss_clip": 3.02539062, "router_z_loss_mlp": 0.27319336, "step": 2397, "time_per_iteration": 2.6073825359344482 }, { "auxiliary_loss_clip": 0.06626462, "auxiliary_loss_mlp": 0.01283344, "balance_loss_clip": 0.06320775, "balance_loss_mlp": 0.01257488, "epoch": 0.14417555989779046, "flos": 18521735362560.0, "grad_norm": 4.788811772163622, "language_loss": 0.81007946, "learning_rate": 3.86480373366343e-06, "loss": 0.88917744, "num_input_tokens_seen": 51957910, "router_z_loss_clip": 3.05859375, "router_z_loss_mlp": 0.25854492, "step": 2398, "time_per_iteration": 2.5632195472717285 }, { "auxiliary_loss_clip": 0.06625049, "auxiliary_loss_mlp": 0.01289649, "balance_loss_clip": 0.06320351, "balance_loss_mlp": 0.01261563, "epoch": 0.14423568315045843, "flos": 26038933505280.0, "grad_norm": 2.1756347825024505, "language_loss": 0.65657938, "learning_rate": 3.864662937804603e-06, "loss": 0.73572636, "num_input_tokens_seen": 51978010, "router_z_loss_clip": 3.04492188, "router_z_loss_mlp": 0.28100586, "step": 2399, "time_per_iteration": 2.6356699466705322 }, { "auxiliary_loss_clip": 0.06629928, "auxiliary_loss_mlp": 0.01283415, "balance_loss_clip": 0.06324457, "balance_loss_mlp": 0.01256938, "epoch": 0.14429580640312642, "flos": 21295238307840.0, "grad_norm": 1.949921408984985, "language_loss": 0.8341313, "learning_rate": 3.864522071237571e-06, "loss": 0.91326475, "num_input_tokens_seen": 51998515, "router_z_loss_clip": 3.05859375, "router_z_loss_mlp": 0.26525879, "step": 2400, "time_per_iteration": 2.5946872234344482 }, { "auxiliary_loss_clip": 0.06632758, "auxiliary_loss_mlp": 0.01293569, "balance_loss_clip": 0.06325201, "balance_loss_mlp": 0.01264136, "epoch": 0.14435592965579438, "flos": 25634636755200.0, "grad_norm": 1.8786927590247477, "language_loss": 0.76261079, "learning_rate": 3.864381133967676e-06, "loss": 0.84187412, "num_input_tokens_seen": 52019270, "router_z_loss_clip": 3.07617188, "router_z_loss_mlp": 0.29394531, "step": 2401, "time_per_iteration": 2.6546132564544678 }, { "auxiliary_loss_clip": 0.06621733, "auxiliary_loss_mlp": 0.012816, "balance_loss_clip": 0.06320705, "balance_loss_mlp": 0.01255469, "epoch": 0.14441605290846235, "flos": 22971488037120.0, "grad_norm": 1.5605289939917422, "language_loss": 0.81874019, "learning_rate": 3.86424012600026e-06, "loss": 0.8977735, "num_input_tokens_seen": 52039315, "router_z_loss_clip": 3.00976562, "router_z_loss_mlp": 0.26159668, "step": 2402, "time_per_iteration": 2.622436761856079 }, { "auxiliary_loss_clip": 0.06630217, "auxiliary_loss_mlp": 0.01288103, "balance_loss_clip": 0.06325455, "balance_loss_mlp": 0.01260947, "epoch": 0.14447617616113032, "flos": 17353386357120.0, "grad_norm": 2.447894294066606, "language_loss": 0.84922701, "learning_rate": 3.864099047340673e-06, "loss": 0.92841017, "num_input_tokens_seen": 52056555, "router_z_loss_clip": 3.05078125, "router_z_loss_mlp": 0.27172852, "step": 2403, "time_per_iteration": 2.6012954711914062 }, { "auxiliary_loss_clip": 0.06635939, "auxiliary_loss_mlp": 0.01286318, "balance_loss_clip": 0.06333338, "balance_loss_mlp": 0.01258125, "epoch": 0.14453629941379828, "flos": 24066896463360.0, "grad_norm": 2.0457585471438193, "language_loss": 0.7147063, "learning_rate": 3.863957897994262e-06, "loss": 0.79392886, "num_input_tokens_seen": 52075800, "router_z_loss_clip": 3.02539062, "router_z_loss_mlp": 0.28186035, "step": 2404, "time_per_iteration": 2.629450798034668 }, { "auxiliary_loss_clip": 0.06625305, "auxiliary_loss_mlp": 0.01286974, "balance_loss_clip": 0.06324972, "balance_loss_mlp": 0.01259174, "epoch": 0.14459642266646625, "flos": 14434924648320.0, "grad_norm": 2.7396218899762643, "language_loss": 0.73844749, "learning_rate": 3.863816677966381e-06, "loss": 0.81757033, "num_input_tokens_seen": 52092585, "router_z_loss_clip": 2.99804688, "router_z_loss_mlp": 0.27807617, "step": 2405, "time_per_iteration": 2.5810999870300293 }, { "auxiliary_loss_clip": 0.06624916, "auxiliary_loss_mlp": 0.01294369, "balance_loss_clip": 0.06323808, "balance_loss_mlp": 0.01267118, "epoch": 0.14465654591913424, "flos": 9871337802240.0, "grad_norm": 2.084455224605974, "language_loss": 0.74219376, "learning_rate": 3.863675387262386e-06, "loss": 0.82138658, "num_input_tokens_seen": 52108990, "router_z_loss_clip": 3.01171875, "router_z_loss_mlp": 0.27270508, "step": 2406, "time_per_iteration": 2.5579636096954346 }, { "auxiliary_loss_clip": 0.06633939, "auxiliary_loss_mlp": 0.01291411, "balance_loss_clip": 0.0633127, "balance_loss_mlp": 0.01264088, "epoch": 0.1447166691718022, "flos": 24979890551040.0, "grad_norm": 4.6559119230411214, "language_loss": 0.769732, "learning_rate": 3.8635340258876325e-06, "loss": 0.84898543, "num_input_tokens_seen": 52125385, "router_z_loss_clip": 3.02539062, "router_z_loss_mlp": 0.27294922, "step": 2407, "time_per_iteration": 2.627394914627075 }, { "auxiliary_loss_clip": 0.06622772, "auxiliary_loss_mlp": 0.01290284, "balance_loss_clip": 0.06324601, "balance_loss_mlp": 0.01263867, "epoch": 0.14477679242447017, "flos": 21914457580800.0, "grad_norm": 1.6108911573581073, "language_loss": 0.80138743, "learning_rate": 3.8633925938474826e-06, "loss": 0.88051796, "num_input_tokens_seen": 52144985, "router_z_loss_clip": 2.97851562, "router_z_loss_mlp": 0.26416016, "step": 2408, "time_per_iteration": 2.612398386001587 }, { "auxiliary_loss_clip": 0.06633321, "auxiliary_loss_mlp": 0.01292013, "balance_loss_clip": 0.0633056, "balance_loss_mlp": 0.01262688, "epoch": 0.14483691567713813, "flos": 20747030970240.0, "grad_norm": 2.6170275779029217, "language_loss": 0.83594483, "learning_rate": 3.863251091147299e-06, "loss": 0.91519815, "num_input_tokens_seen": 52163885, "router_z_loss_clip": 3.03320312, "router_z_loss_mlp": 0.29296875, "step": 2409, "time_per_iteration": 2.593832492828369 }, { "auxiliary_loss_clip": 0.0663624, "auxiliary_loss_mlp": 0.01292951, "balance_loss_clip": 0.06324767, "balance_loss_mlp": 0.0126626, "epoch": 0.1448970389298061, "flos": 35416388943360.0, "grad_norm": 3.3639908597303987, "language_loss": 0.76062793, "learning_rate": 3.863109517792446e-06, "loss": 0.83991987, "num_input_tokens_seen": 52184325, "router_z_loss_clip": 3.1171875, "router_z_loss_mlp": 0.26708984, "step": 2410, "time_per_iteration": 2.705941915512085 }, { "auxiliary_loss_clip": 0.06626114, "auxiliary_loss_mlp": 0.01292124, "balance_loss_clip": 0.06325512, "balance_loss_mlp": 0.01266077, "epoch": 0.14495716218247406, "flos": 15419853066240.0, "grad_norm": 1.7961414997124963, "language_loss": 0.82266629, "learning_rate": 3.8629678737882945e-06, "loss": 0.90184861, "num_input_tokens_seen": 52202740, "router_z_loss_clip": 3.00585938, "router_z_loss_mlp": 0.26049805, "step": 2411, "time_per_iteration": 2.5773556232452393 }, { "auxiliary_loss_clip": 0.06619175, "auxiliary_loss_mlp": 0.0129448, "balance_loss_clip": 0.06316753, "balance_loss_mlp": 0.01267336, "epoch": 0.14501728543514203, "flos": 33701677390080.0, "grad_norm": 2.3196165161840145, "language_loss": 0.71320486, "learning_rate": 3.862826159140214e-06, "loss": 0.79234147, "num_input_tokens_seen": 52223100, "router_z_loss_clip": 3.02539062, "router_z_loss_mlp": 0.2713623, "step": 2412, "time_per_iteration": 2.704249858856201 }, { "auxiliary_loss_clip": 0.06619076, "auxiliary_loss_mlp": 0.01292351, "balance_loss_clip": 0.0632138, "balance_loss_mlp": 0.01266685, "epoch": 0.14507740868781002, "flos": 15601512718080.0, "grad_norm": 2.9693498952995725, "language_loss": 0.78264928, "learning_rate": 3.862684373853579e-06, "loss": 0.86176354, "num_input_tokens_seen": 52239690, "router_z_loss_clip": 2.97851562, "router_z_loss_mlp": 0.2565918, "step": 2413, "time_per_iteration": 2.562924385070801 }, { "auxiliary_loss_clip": 0.06573121, "auxiliary_loss_mlp": 0.0127601, "balance_loss_clip": 0.06382166, "balance_loss_mlp": 0.01263988, "epoch": 0.145137531940478, "flos": 66695247924480.0, "grad_norm": 0.8909904856959326, "language_loss": 0.58803552, "learning_rate": 3.8625425179337656e-06, "loss": 0.66652685, "num_input_tokens_seen": 52296705, "router_z_loss_clip": 1.90625, "router_z_loss_mlp": 0.12017822, "step": 2414, "time_per_iteration": 3.2238924503326416 }, { "auxiliary_loss_clip": 0.06564604, "auxiliary_loss_mlp": 0.01267202, "balance_loss_clip": 0.06374325, "balance_loss_mlp": 0.01255645, "epoch": 0.14519765519314595, "flos": 67542806373120.0, "grad_norm": 0.8174649895230109, "language_loss": 0.62043023, "learning_rate": 3.862400591386154e-06, "loss": 0.69874835, "num_input_tokens_seen": 52361830, "router_z_loss_clip": 1.90625, "router_z_loss_mlp": 0.11541748, "step": 2415, "time_per_iteration": 3.2569305896759033 }, { "auxiliary_loss_clip": 0.06617674, "auxiliary_loss_mlp": 0.01290056, "balance_loss_clip": 0.06317365, "balance_loss_mlp": 0.01263758, "epoch": 0.14525777844581392, "flos": 17204151035520.0, "grad_norm": 11.312073058548444, "language_loss": 0.73022175, "learning_rate": 3.8622585942161245e-06, "loss": 0.80929905, "num_input_tokens_seen": 52379420, "router_z_loss_clip": 2.99804688, "router_z_loss_mlp": 0.26281738, "step": 2416, "time_per_iteration": 2.5738091468811035 }, { "auxiliary_loss_clip": 0.0651779, "auxiliary_loss_mlp": 0.01263433, "balance_loss_clip": 0.06326576, "balance_loss_mlp": 0.01252555, "epoch": 0.14531790169848188, "flos": 65425349370240.0, "grad_norm": 0.6785380096518067, "language_loss": 0.60367751, "learning_rate": 3.8621165264290635e-06, "loss": 0.68148971, "num_input_tokens_seen": 52446290, "router_z_loss_clip": 1.90722656, "router_z_loss_mlp": 0.10894775, "step": 2417, "time_per_iteration": 3.2635796070098877 }, { "auxiliary_loss_clip": 0.06625329, "auxiliary_loss_mlp": 0.01296621, "balance_loss_clip": 0.06316277, "balance_loss_mlp": 0.01269978, "epoch": 0.14537802495114985, "flos": 32570783959680.0, "grad_norm": 7.959958549270525, "language_loss": 0.79859388, "learning_rate": 3.861974388030356e-06, "loss": 0.87781334, "num_input_tokens_seen": 52467295, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.26660156, "step": 2418, "time_per_iteration": 2.699692726135254 }, { "auxiliary_loss_clip": 0.06615001, "auxiliary_loss_mlp": 0.01290838, "balance_loss_clip": 0.06318241, "balance_loss_mlp": 0.01265125, "epoch": 0.1454381482038178, "flos": 20232338065920.0, "grad_norm": 2.363362452662923, "language_loss": 0.72490239, "learning_rate": 3.861832179025394e-06, "loss": 0.8039608, "num_input_tokens_seen": 52487295, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.25732422, "step": 2419, "time_per_iteration": 2.598541259765625 }, { "auxiliary_loss_clip": 0.06619254, "auxiliary_loss_mlp": 0.01295369, "balance_loss_clip": 0.0631675, "balance_loss_mlp": 0.01268785, "epoch": 0.1454982714564858, "flos": 22899721415040.0, "grad_norm": 2.7555894292682006, "language_loss": 0.9120661, "learning_rate": 3.861689899419569e-06, "loss": 0.99121237, "num_input_tokens_seen": 52504220, "router_z_loss_clip": 3.02929688, "router_z_loss_mlp": 0.265625, "step": 2420, "time_per_iteration": 2.666790246963501 }, { "auxiliary_loss_clip": 0.06615137, "auxiliary_loss_mlp": 0.012911, "balance_loss_clip": 0.06314351, "balance_loss_mlp": 0.01265839, "epoch": 0.14555839470915377, "flos": 20236027645440.0, "grad_norm": 1.9319476935179771, "language_loss": 0.83878934, "learning_rate": 3.861547549218276e-06, "loss": 0.91785175, "num_input_tokens_seen": 52521900, "router_z_loss_clip": 3.0078125, "router_z_loss_mlp": 0.25219727, "step": 2421, "time_per_iteration": 2.5808117389678955 }, { "auxiliary_loss_clip": 0.06619306, "auxiliary_loss_mlp": 0.01293623, "balance_loss_clip": 0.06313141, "balance_loss_mlp": 0.01267552, "epoch": 0.14561851796182174, "flos": 22242753077760.0, "grad_norm": 1.6315366120437957, "language_loss": 0.82819211, "learning_rate": 3.861405128426914e-06, "loss": 0.90732145, "num_input_tokens_seen": 52540495, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.26098633, "step": 2422, "time_per_iteration": 2.6057937145233154 }, { "auxiliary_loss_clip": 0.06501481, "auxiliary_loss_mlp": 0.01308123, "balance_loss_clip": 0.06314002, "balance_loss_mlp": 0.01297061, "epoch": 0.1456786412144897, "flos": 52655758692480.0, "grad_norm": 0.8835533411168587, "language_loss": 0.6331948, "learning_rate": 3.861262637050883e-06, "loss": 0.71129078, "num_input_tokens_seen": 52603305, "router_z_loss_clip": 1.875, "router_z_loss_mlp": 0.11083984, "step": 2423, "time_per_iteration": 3.206160545349121 }, { "auxiliary_loss_clip": 0.06612821, "auxiliary_loss_mlp": 0.01293982, "balance_loss_clip": 0.06313649, "balance_loss_mlp": 0.01268841, "epoch": 0.14573876446715767, "flos": 23228352328320.0, "grad_norm": 2.02988051539655, "language_loss": 0.83173096, "learning_rate": 3.861120075095585e-06, "loss": 0.91079903, "num_input_tokens_seen": 52623435, "router_z_loss_clip": 2.99023438, "router_z_loss_mlp": 0.25158691, "step": 2424, "time_per_iteration": 2.659306764602661 }, { "auxiliary_loss_clip": 0.06612056, "auxiliary_loss_mlp": 0.01290558, "balance_loss_clip": 0.06313705, "balance_loss_mlp": 0.01264237, "epoch": 0.14579888771982563, "flos": 18120331578240.0, "grad_norm": 2.5743197915335974, "language_loss": 0.79109621, "learning_rate": 3.860977442566429e-06, "loss": 0.87012231, "num_input_tokens_seen": 52642255, "router_z_loss_clip": 2.98632812, "router_z_loss_mlp": 0.26342773, "step": 2425, "time_per_iteration": 3.9990618228912354 }, { "auxiliary_loss_clip": 0.06616093, "auxiliary_loss_mlp": 0.01293865, "balance_loss_clip": 0.0631013, "balance_loss_mlp": 0.01267735, "epoch": 0.14585901097249362, "flos": 23007476165760.0, "grad_norm": 2.317285296476293, "language_loss": 0.84407902, "learning_rate": 3.860834739468821e-06, "loss": 0.92317867, "num_input_tokens_seen": 52658700, "router_z_loss_clip": 3.05859375, "router_z_loss_mlp": 0.26159668, "step": 2426, "time_per_iteration": 2.6008524894714355 }, { "auxiliary_loss_clip": 0.06618112, "auxiliary_loss_mlp": 0.01303064, "balance_loss_clip": 0.06312896, "balance_loss_mlp": 0.01275825, "epoch": 0.1459191342251616, "flos": 21915212267520.0, "grad_norm": 2.4857765070536613, "language_loss": 0.88035125, "learning_rate": 3.860691965808173e-06, "loss": 0.95956302, "num_input_tokens_seen": 52678140, "router_z_loss_clip": 3.04882812, "router_z_loss_mlp": 0.27209473, "step": 2427, "time_per_iteration": 2.606297492980957 }, { "auxiliary_loss_clip": 0.06616133, "auxiliary_loss_mlp": 0.01296751, "balance_loss_clip": 0.063076, "balance_loss_mlp": 0.01267926, "epoch": 0.14597925747782955, "flos": 14980742144640.0, "grad_norm": 2.6608480470461835, "language_loss": 0.68678498, "learning_rate": 3.8605491215899e-06, "loss": 0.76591372, "num_input_tokens_seen": 52696825, "router_z_loss_clip": 3.0859375, "router_z_loss_mlp": 0.28808594, "step": 2428, "time_per_iteration": 2.570387363433838 }, { "auxiliary_loss_clip": 0.06612208, "auxiliary_loss_mlp": 0.0129632, "balance_loss_clip": 0.06307065, "balance_loss_mlp": 0.01269164, "epoch": 0.14603938073049752, "flos": 21075200686080.0, "grad_norm": 2.040637728046451, "language_loss": 0.84395814, "learning_rate": 3.860406206819417e-06, "loss": 0.92304343, "num_input_tokens_seen": 52715125, "router_z_loss_clip": 3.05273438, "router_z_loss_mlp": 0.27197266, "step": 2429, "time_per_iteration": 4.0550994873046875 }, { "auxiliary_loss_clip": 0.06600729, "auxiliary_loss_mlp": 0.01298777, "balance_loss_clip": 0.06303641, "balance_loss_mlp": 0.01272539, "epoch": 0.14609950398316549, "flos": 19870863552000.0, "grad_norm": 1.8208232316822253, "language_loss": 0.79777008, "learning_rate": 3.860263221502145e-06, "loss": 0.87676513, "num_input_tokens_seen": 52734015, "router_z_loss_clip": 2.97265625, "router_z_loss_mlp": 0.26257324, "step": 2430, "time_per_iteration": 2.577716827392578 }, { "auxiliary_loss_clip": 0.06620528, "auxiliary_loss_mlp": 0.01298773, "balance_loss_clip": 0.06314421, "balance_loss_mlp": 0.01271605, "epoch": 0.14615962723583345, "flos": 22425377051520.0, "grad_norm": 5.766367316503805, "language_loss": 0.84360051, "learning_rate": 3.860120165643504e-06, "loss": 0.92279351, "num_input_tokens_seen": 52753025, "router_z_loss_clip": 3.06054688, "router_z_loss_mlp": 0.27148438, "step": 2431, "time_per_iteration": 2.6038317680358887 }, { "auxiliary_loss_clip": 0.0662015, "auxiliary_loss_mlp": 0.01296486, "balance_loss_clip": 0.06305872, "balance_loss_mlp": 0.0126809, "epoch": 0.14621975048850142, "flos": 22352813815680.0, "grad_norm": 2.609401262949467, "language_loss": 0.79298973, "learning_rate": 3.859977039248921e-06, "loss": 0.87215608, "num_input_tokens_seen": 52773420, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.28393555, "step": 2432, "time_per_iteration": 4.170436143875122 }, { "auxiliary_loss_clip": 0.06618159, "auxiliary_loss_mlp": 0.01291568, "balance_loss_clip": 0.0631116, "balance_loss_mlp": 0.01263316, "epoch": 0.1462798737411694, "flos": 24396030501120.0, "grad_norm": 2.3641672527929156, "language_loss": 0.81327516, "learning_rate": 3.859833842323822e-06, "loss": 0.89237249, "num_input_tokens_seen": 52792870, "router_z_loss_clip": 3.07226562, "router_z_loss_mlp": 0.2824707, "step": 2433, "time_per_iteration": 4.097440958023071 }, { "auxiliary_loss_clip": 0.06603768, "auxiliary_loss_mlp": 0.01291421, "balance_loss_clip": 0.06305823, "balance_loss_mlp": 0.01265576, "epoch": 0.14633999699383737, "flos": 19250679957120.0, "grad_norm": 1.9644247957462486, "language_loss": 0.7868101, "learning_rate": 3.859690574873638e-06, "loss": 0.865762, "num_input_tokens_seen": 52811615, "router_z_loss_clip": 2.97851562, "router_z_loss_mlp": 0.25854492, "step": 2434, "time_per_iteration": 2.563028573989868 }, { "auxiliary_loss_clip": 0.06518564, "auxiliary_loss_mlp": 0.01260166, "balance_loss_clip": 0.06336281, "balance_loss_mlp": 0.01250868, "epoch": 0.14640012024650534, "flos": 62679658780800.0, "grad_norm": 0.8264520825603087, "language_loss": 0.58297759, "learning_rate": 3.8595472369038e-06, "loss": 0.66076493, "num_input_tokens_seen": 52873230, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.09289551, "step": 2435, "time_per_iteration": 3.209561824798584 }, { "auxiliary_loss_clip": 0.06604543, "auxiliary_loss_mlp": 0.01290984, "balance_loss_clip": 0.06306066, "balance_loss_mlp": 0.01262874, "epoch": 0.1464602434991733, "flos": 12281144100480.0, "grad_norm": 2.1614509323681, "language_loss": 0.89384437, "learning_rate": 3.859403828419744e-06, "loss": 0.97279966, "num_input_tokens_seen": 52889325, "router_z_loss_clip": 2.98632812, "router_z_loss_mlp": 0.28125, "step": 2436, "time_per_iteration": 2.562387466430664 }, { "auxiliary_loss_clip": 0.06621374, "auxiliary_loss_mlp": 0.0128764, "balance_loss_clip": 0.06312045, "balance_loss_mlp": 0.0126114, "epoch": 0.14652036675184127, "flos": 20928480986880.0, "grad_norm": 1.95481639595091, "language_loss": 0.75329602, "learning_rate": 3.85926034942691e-06, "loss": 0.83238614, "num_input_tokens_seen": 52909705, "router_z_loss_clip": 3.09179688, "router_z_loss_mlp": 0.26501465, "step": 2437, "time_per_iteration": 2.6051061153411865 }, { "auxiliary_loss_clip": 0.06621069, "auxiliary_loss_mlp": 0.01292497, "balance_loss_clip": 0.06314135, "balance_loss_mlp": 0.01265055, "epoch": 0.14658049000450923, "flos": 27710151989760.0, "grad_norm": 2.157118474993867, "language_loss": 0.74514592, "learning_rate": 3.859116799930736e-06, "loss": 0.82428157, "num_input_tokens_seen": 52930300, "router_z_loss_clip": 3.06640625, "router_z_loss_mlp": 0.27453613, "step": 2438, "time_per_iteration": 2.6322929859161377 }, { "auxiliary_loss_clip": 0.06612237, "auxiliary_loss_mlp": 0.01290644, "balance_loss_clip": 0.06309271, "balance_loss_mlp": 0.01263595, "epoch": 0.14664061325717723, "flos": 24943483152000.0, "grad_norm": 1.9419829289259585, "language_loss": 0.75486737, "learning_rate": 3.858973179936668e-06, "loss": 0.83389622, "num_input_tokens_seen": 52949955, "router_z_loss_clip": 3.02929688, "router_z_loss_mlp": 0.27050781, "step": 2439, "time_per_iteration": 2.6128647327423096 }, { "auxiliary_loss_clip": 0.06614884, "auxiliary_loss_mlp": 0.01288903, "balance_loss_clip": 0.06312023, "balance_loss_mlp": 0.01262164, "epoch": 0.1467007365098452, "flos": 40307306964480.0, "grad_norm": 2.1530892621826774, "language_loss": 0.75419325, "learning_rate": 3.85882948945015e-06, "loss": 0.83323109, "num_input_tokens_seen": 52972905, "router_z_loss_clip": 3.03125, "router_z_loss_mlp": 0.26757812, "step": 2440, "time_per_iteration": 2.7500603199005127 }, { "auxiliary_loss_clip": 0.06607741, "auxiliary_loss_mlp": 0.01285254, "balance_loss_clip": 0.06311009, "balance_loss_mlp": 0.01261615, "epoch": 0.14676085976251316, "flos": 26548175894400.0, "grad_norm": 1.6638348527813922, "language_loss": 0.83537364, "learning_rate": 3.85868572847663e-06, "loss": 0.9143036, "num_input_tokens_seen": 52994850, "router_z_loss_clip": 2.96679688, "router_z_loss_mlp": 0.23620605, "step": 2441, "time_per_iteration": 2.6314055919647217 }, { "auxiliary_loss_clip": 0.0663666, "auxiliary_loss_mlp": 0.01295221, "balance_loss_clip": 0.0632076, "balance_loss_mlp": 0.01266706, "epoch": 0.14682098301518112, "flos": 23556857460480.0, "grad_norm": 2.3057186688867124, "language_loss": 0.73418248, "learning_rate": 3.858541897021563e-06, "loss": 0.8135013, "num_input_tokens_seen": 53014740, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.28503418, "step": 2442, "time_per_iteration": 2.5996439456939697 }, { "auxiliary_loss_clip": 0.0663258, "auxiliary_loss_mlp": 0.01292192, "balance_loss_clip": 0.06319716, "balance_loss_mlp": 0.01264476, "epoch": 0.1468811062678491, "flos": 11655048792960.0, "grad_norm": 6.754123855342651, "language_loss": 0.82870024, "learning_rate": 3.8583979950904e-06, "loss": 0.90794802, "num_input_tokens_seen": 53029780, "router_z_loss_clip": 3.12890625, "router_z_loss_mlp": 0.27722168, "step": 2443, "time_per_iteration": 2.5732295513153076 }, { "auxiliary_loss_clip": 0.06621885, "auxiliary_loss_mlp": 0.01289789, "balance_loss_clip": 0.06315422, "balance_loss_mlp": 0.01263134, "epoch": 0.14694122952051705, "flos": 23009237101440.0, "grad_norm": 1.716612681604004, "language_loss": 0.8372072, "learning_rate": 3.858254022688599e-06, "loss": 0.91632402, "num_input_tokens_seen": 53048620, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.26672363, "step": 2444, "time_per_iteration": 2.5859906673431396 }, { "auxiliary_loss_clip": 0.06627093, "auxiliary_loss_mlp": 0.0128943, "balance_loss_clip": 0.06320448, "balance_loss_mlp": 0.01262274, "epoch": 0.14700135277318502, "flos": 26509797924480.0, "grad_norm": 1.6033497773919092, "language_loss": 0.71906739, "learning_rate": 3.85810997982162e-06, "loss": 0.79823256, "num_input_tokens_seen": 53070055, "router_z_loss_clip": 3.06640625, "router_z_loss_mlp": 0.2713623, "step": 2445, "time_per_iteration": 2.6423544883728027 }, { "auxiliary_loss_clip": 0.06503738, "auxiliary_loss_mlp": 0.01265859, "balance_loss_clip": 0.06320799, "balance_loss_mlp": 0.01256615, "epoch": 0.147061476025853, "flos": 59467841527680.0, "grad_norm": 0.815019981260895, "language_loss": 0.62776315, "learning_rate": 3.857965866494923e-06, "loss": 0.70545912, "num_input_tokens_seen": 53126945, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.09234619, "step": 2446, "time_per_iteration": 3.0988593101501465 }, { "auxiliary_loss_clip": 0.06638723, "auxiliary_loss_mlp": 0.01308143, "balance_loss_clip": 0.06328332, "balance_loss_mlp": 0.01280856, "epoch": 0.14712159927852098, "flos": 28338637138560.0, "grad_norm": 1.6218988050504903, "language_loss": 0.75881028, "learning_rate": 3.857821682713975e-06, "loss": 0.83827901, "num_input_tokens_seen": 53149130, "router_z_loss_clip": 3.1015625, "router_z_loss_mlp": 0.27307129, "step": 2447, "time_per_iteration": 2.666799306869507 }, { "auxiliary_loss_clip": 0.06630026, "auxiliary_loss_mlp": 0.01297248, "balance_loss_clip": 0.06324963, "balance_loss_mlp": 0.01270259, "epoch": 0.14718172253118894, "flos": 27097263699840.0, "grad_norm": 2.3722061835510133, "language_loss": 0.8652221, "learning_rate": 3.857677428484242e-06, "loss": 0.94449484, "num_input_tokens_seen": 53167120, "router_z_loss_clip": 3.04882812, "router_z_loss_mlp": 0.27026367, "step": 2448, "time_per_iteration": 2.6344106197357178 }, { "auxiliary_loss_clip": 0.06500575, "auxiliary_loss_mlp": 0.01262559, "balance_loss_clip": 0.06318511, "balance_loss_mlp": 0.01252582, "epoch": 0.1472418457838569, "flos": 66725827464960.0, "grad_norm": 0.728902821691753, "language_loss": 0.56773651, "learning_rate": 3.857533103811195e-06, "loss": 0.64536786, "num_input_tokens_seen": 53227945, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.09973145, "step": 2449, "time_per_iteration": 3.1622180938720703 }, { "auxiliary_loss_clip": 0.06613135, "auxiliary_loss_mlp": 0.01286853, "balance_loss_clip": 0.063123, "balance_loss_mlp": 0.01260186, "epoch": 0.14730196903652487, "flos": 19579730140800.0, "grad_norm": 2.122213408946895, "language_loss": 0.86108178, "learning_rate": 3.857388708700307e-06, "loss": 0.94008166, "num_input_tokens_seen": 53244615, "router_z_loss_clip": 3.00585938, "router_z_loss_mlp": 0.26635742, "step": 2450, "time_per_iteration": 2.584256887435913 }, { "auxiliary_loss_clip": 0.06627408, "auxiliary_loss_mlp": 0.01293162, "balance_loss_clip": 0.06317165, "balance_loss_mlp": 0.01265803, "epoch": 0.14736209228919284, "flos": 16076611768320.0, "grad_norm": 9.319281148996653, "language_loss": 0.76637107, "learning_rate": 3.857244243157052e-06, "loss": 0.84557676, "num_input_tokens_seen": 53262205, "router_z_loss_clip": 3.10546875, "router_z_loss_mlp": 0.27331543, "step": 2451, "time_per_iteration": 2.5788676738739014 }, { "auxiliary_loss_clip": 0.06612815, "auxiliary_loss_mlp": 0.01285215, "balance_loss_clip": 0.06318128, "balance_loss_mlp": 0.01258775, "epoch": 0.1474222155418608, "flos": 23046147624960.0, "grad_norm": 1.621858446303659, "language_loss": 0.82905799, "learning_rate": 3.85709970718691e-06, "loss": 0.90803826, "num_input_tokens_seen": 53282445, "router_z_loss_clip": 2.94726562, "router_z_loss_mlp": 0.2644043, "step": 2452, "time_per_iteration": 2.607292413711548 }, { "auxiliary_loss_clip": 0.06614914, "auxiliary_loss_mlp": 0.01277924, "balance_loss_clip": 0.06313531, "balance_loss_mlp": 0.01251925, "epoch": 0.1474823387945288, "flos": 17024210392320.0, "grad_norm": 3.1774024212077316, "language_loss": 0.75115883, "learning_rate": 3.856955100795361e-06, "loss": 0.83008718, "num_input_tokens_seen": 53299060, "router_z_loss_clip": 3.01171875, "router_z_loss_mlp": 0.26000977, "step": 2453, "time_per_iteration": 2.5651986598968506 }, { "auxiliary_loss_clip": 0.06626463, "auxiliary_loss_mlp": 0.01284781, "balance_loss_clip": 0.06315137, "balance_loss_mlp": 0.01257887, "epoch": 0.14754246204719676, "flos": 17900880935040.0, "grad_norm": 1.9113746123558089, "language_loss": 0.7675122, "learning_rate": 3.856810423987889e-06, "loss": 0.84662461, "num_input_tokens_seen": 53315970, "router_z_loss_clip": 3.11328125, "router_z_loss_mlp": 0.26904297, "step": 2454, "time_per_iteration": 2.5587871074676514 }, { "auxiliary_loss_clip": 0.06612409, "auxiliary_loss_mlp": 0.01282623, "balance_loss_clip": 0.06309909, "balance_loss_mlp": 0.01256528, "epoch": 0.14760258529986472, "flos": 13084161304320.0, "grad_norm": 2.0509946513952815, "language_loss": 0.84619546, "learning_rate": 3.856665676769979e-06, "loss": 0.92514575, "num_input_tokens_seen": 53332940, "router_z_loss_clip": 3.0234375, "router_z_loss_mlp": 0.26086426, "step": 2455, "time_per_iteration": 2.559802532196045 }, { "auxiliary_loss_clip": 0.06625605, "auxiliary_loss_mlp": 0.01282731, "balance_loss_clip": 0.06312753, "balance_loss_mlp": 0.0125753, "epoch": 0.1476627085525327, "flos": 30813627513600.0, "grad_norm": 2.015181025171273, "language_loss": 0.85254562, "learning_rate": 3.85652085914712e-06, "loss": 0.93162894, "num_input_tokens_seen": 53353295, "router_z_loss_clip": 3.1328125, "router_z_loss_mlp": 0.25195312, "step": 2456, "time_per_iteration": 2.6405742168426514 }, { "auxiliary_loss_clip": 0.0660251, "auxiliary_loss_mlp": 0.01282931, "balance_loss_clip": 0.06307548, "balance_loss_mlp": 0.01256323, "epoch": 0.14772283180520066, "flos": 21695887405440.0, "grad_norm": 1.8300388331320618, "language_loss": 0.85395437, "learning_rate": 3.856375971124805e-06, "loss": 0.93280882, "num_input_tokens_seen": 53373410, "router_z_loss_clip": 2.94921875, "router_z_loss_mlp": 0.26574707, "step": 2457, "time_per_iteration": 2.597926378250122 }, { "auxiliary_loss_clip": 0.06595472, "auxiliary_loss_mlp": 0.01286624, "balance_loss_clip": 0.06303298, "balance_loss_mlp": 0.0125892, "epoch": 0.14778295505786862, "flos": 18776335593600.0, "grad_norm": 6.724909369912658, "language_loss": 0.76564109, "learning_rate": 3.856231012708527e-06, "loss": 0.84446204, "num_input_tokens_seen": 53391430, "router_z_loss_clip": 2.91992188, "router_z_loss_mlp": 0.27722168, "step": 2458, "time_per_iteration": 2.5698001384735107 }, { "auxiliary_loss_clip": 0.06627193, "auxiliary_loss_mlp": 0.01288305, "balance_loss_clip": 0.06312182, "balance_loss_mlp": 0.01260398, "epoch": 0.1478430783105366, "flos": 22900224539520.0, "grad_norm": 2.299836795586772, "language_loss": 0.84225619, "learning_rate": 3.856085983903782e-06, "loss": 0.92141116, "num_input_tokens_seen": 53409960, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.27893066, "step": 2459, "time_per_iteration": 2.590435028076172 }, { "auxiliary_loss_clip": 0.06606895, "auxiliary_loss_mlp": 0.0128102, "balance_loss_clip": 0.06310716, "balance_loss_mlp": 0.01255807, "epoch": 0.14790320156320458, "flos": 15090635174400.0, "grad_norm": 2.2173265793391823, "language_loss": 0.76503688, "learning_rate": 3.855940884716071e-06, "loss": 0.84391606, "num_input_tokens_seen": 53426160, "router_z_loss_clip": 2.96484375, "router_z_loss_mlp": 0.25195312, "step": 2460, "time_per_iteration": 2.647670030593872 }, { "auxiliary_loss_clip": 0.06613672, "auxiliary_loss_mlp": 0.01285126, "balance_loss_clip": 0.06304128, "balance_loss_mlp": 0.01258161, "epoch": 0.14796332481587254, "flos": 26511894276480.0, "grad_norm": 1.5542639432411243, "language_loss": 0.82118464, "learning_rate": 3.855795715150896e-06, "loss": 0.90017259, "num_input_tokens_seen": 53448530, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.26940918, "step": 2461, "time_per_iteration": 2.634885787963867 }, { "auxiliary_loss_clip": 0.06612928, "auxiliary_loss_mlp": 0.01294059, "balance_loss_clip": 0.06310219, "balance_loss_mlp": 0.01265378, "epoch": 0.1480234480685405, "flos": 17568392734080.0, "grad_norm": 3.174597274260731, "language_loss": 0.67269433, "learning_rate": 3.855650475213761e-06, "loss": 0.7517643, "num_input_tokens_seen": 53465915, "router_z_loss_clip": 3.02539062, "router_z_loss_mlp": 0.28710938, "step": 2462, "time_per_iteration": 2.5367038249969482 }, { "auxiliary_loss_clip": 0.06614989, "auxiliary_loss_mlp": 0.01294455, "balance_loss_clip": 0.06309801, "balance_loss_mlp": 0.01267681, "epoch": 0.14808357132120847, "flos": 53594693147520.0, "grad_norm": 1.6760619645641088, "language_loss": 0.68116009, "learning_rate": 3.8555051649101745e-06, "loss": 0.7602545, "num_input_tokens_seen": 53496055, "router_z_loss_clip": 3.05273438, "router_z_loss_mlp": 0.26806641, "step": 2463, "time_per_iteration": 2.885603904724121 }, { "auxiliary_loss_clip": 0.06613472, "auxiliary_loss_mlp": 0.01289429, "balance_loss_clip": 0.06307065, "balance_loss_mlp": 0.01261772, "epoch": 0.14814369457387644, "flos": 19835420474880.0, "grad_norm": 1.7970882239091133, "language_loss": 0.77211404, "learning_rate": 3.855359784245646e-06, "loss": 0.85114306, "num_input_tokens_seen": 53513790, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.2767334, "step": 2464, "time_per_iteration": 3.9542620182037354 }, { "auxiliary_loss_clip": 0.06601919, "auxiliary_loss_mlp": 0.01290963, "balance_loss_clip": 0.06308855, "balance_loss_mlp": 0.01266346, "epoch": 0.1482038178265444, "flos": 23921769991680.0, "grad_norm": 3.0039329036904845, "language_loss": 0.80569428, "learning_rate": 3.855214333225688e-06, "loss": 0.88462311, "num_input_tokens_seen": 53533410, "router_z_loss_clip": 2.9296875, "router_z_loss_mlp": 0.24609375, "step": 2465, "time_per_iteration": 2.616785764694214 }, { "auxiliary_loss_clip": 0.06621528, "auxiliary_loss_mlp": 0.01290491, "balance_loss_clip": 0.06312159, "balance_loss_mlp": 0.0126101, "epoch": 0.1482639410792124, "flos": 24177376471680.0, "grad_norm": 1.8799244484802993, "language_loss": 0.77151048, "learning_rate": 3.855068811855817e-06, "loss": 0.85063064, "num_input_tokens_seen": 53554775, "router_z_loss_clip": 3.09375, "router_z_loss_mlp": 0.29516602, "step": 2466, "time_per_iteration": 2.6242728233337402 }, { "auxiliary_loss_clip": 0.06500272, "auxiliary_loss_mlp": 0.01308111, "balance_loss_clip": 0.06320722, "balance_loss_mlp": 0.01298128, "epoch": 0.14832406433188036, "flos": 66209205916800.0, "grad_norm": 0.7735005485569161, "language_loss": 0.60238034, "learning_rate": 3.854923220141551e-06, "loss": 0.68046415, "num_input_tokens_seen": 53609675, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.09979248, "step": 2467, "time_per_iteration": 3.2313191890716553 }, { "auxiliary_loss_clip": 0.06612825, "auxiliary_loss_mlp": 0.01294072, "balance_loss_clip": 0.06312412, "balance_loss_mlp": 0.01265557, "epoch": 0.14838418758454833, "flos": 25418372567040.0, "grad_norm": 2.111307933871342, "language_loss": 0.89099252, "learning_rate": 3.85477755808841e-06, "loss": 0.97006154, "num_input_tokens_seen": 53626950, "router_z_loss_clip": 3.00585938, "router_z_loss_mlp": 0.28491211, "step": 2468, "time_per_iteration": 4.0723161697387695 }, { "auxiliary_loss_clip": 0.06623815, "auxiliary_loss_mlp": 0.01297674, "balance_loss_clip": 0.06312951, "balance_loss_mlp": 0.01267824, "epoch": 0.1484443108372163, "flos": 23295800465280.0, "grad_norm": 2.0967309177248135, "language_loss": 0.77599746, "learning_rate": 3.854631825701919e-06, "loss": 0.85521227, "num_input_tokens_seen": 53644200, "router_z_loss_clip": 3.11132812, "router_z_loss_mlp": 0.29858398, "step": 2469, "time_per_iteration": 2.5994715690612793 }, { "auxiliary_loss_clip": 0.06616829, "auxiliary_loss_mlp": 0.01288151, "balance_loss_clip": 0.0631614, "balance_loss_mlp": 0.01262461, "epoch": 0.14850443408988426, "flos": 14652949772160.0, "grad_norm": 2.4293368275628744, "language_loss": 0.76506519, "learning_rate": 3.854486022987603e-06, "loss": 0.84411502, "num_input_tokens_seen": 53659650, "router_z_loss_clip": 3.00976562, "router_z_loss_mlp": 0.25695801, "step": 2470, "time_per_iteration": 2.552640199661255 }, { "auxiliary_loss_clip": 0.06612031, "auxiliary_loss_mlp": 0.01291705, "balance_loss_clip": 0.0631754, "balance_loss_mlp": 0.01265455, "epoch": 0.14856455734255222, "flos": 23554761108480.0, "grad_norm": 1.841433039581974, "language_loss": 0.73205495, "learning_rate": 3.8543401499509905e-06, "loss": 0.81109226, "num_input_tokens_seen": 53680275, "router_z_loss_clip": 2.94140625, "router_z_loss_mlp": 0.26220703, "step": 2471, "time_per_iteration": 2.630610704421997 }, { "auxiliary_loss_clip": 0.06618859, "auxiliary_loss_mlp": 0.01282884, "balance_loss_clip": 0.06311336, "balance_loss_mlp": 0.01255848, "epoch": 0.1486246805952202, "flos": 18083127565440.0, "grad_norm": 2.521245837160973, "language_loss": 0.90399003, "learning_rate": 3.854194206597615e-06, "loss": 0.98300743, "num_input_tokens_seen": 53698270, "router_z_loss_clip": 3.07617188, "router_z_loss_mlp": 0.2701416, "step": 2472, "time_per_iteration": 5.3475611209869385 }, { "auxiliary_loss_clip": 0.06610876, "auxiliary_loss_mlp": 0.01284323, "balance_loss_clip": 0.0631116, "balance_loss_mlp": 0.01257679, "epoch": 0.14868480384788818, "flos": 19359566737920.0, "grad_norm": 2.4390412575145164, "language_loss": 0.81720155, "learning_rate": 3.854048192933008e-06, "loss": 0.89615357, "num_input_tokens_seen": 53716845, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.26635742, "step": 2473, "time_per_iteration": 2.5644073486328125 }, { "auxiliary_loss_clip": 0.06621817, "auxiliary_loss_mlp": 0.01292534, "balance_loss_clip": 0.06320642, "balance_loss_mlp": 0.01266117, "epoch": 0.14874492710055615, "flos": 22206723022080.0, "grad_norm": 2.4159844222137132, "language_loss": 0.79197407, "learning_rate": 3.853902108962709e-06, "loss": 0.87111759, "num_input_tokens_seen": 53734970, "router_z_loss_clip": 3.01171875, "router_z_loss_mlp": 0.26379395, "step": 2474, "time_per_iteration": 2.5716023445129395 }, { "auxiliary_loss_clip": 0.0662141, "auxiliary_loss_mlp": 0.01282795, "balance_loss_clip": 0.06310292, "balance_loss_mlp": 0.01256676, "epoch": 0.1488050503532241, "flos": 21109427879040.0, "grad_norm": 2.2463682351192404, "language_loss": 0.82777905, "learning_rate": 3.853755954692255e-06, "loss": 0.90682113, "num_input_tokens_seen": 53753415, "router_z_loss_clip": 3.11132812, "router_z_loss_mlp": 0.2611084, "step": 2475, "time_per_iteration": 2.599461317062378 }, { "auxiliary_loss_clip": 0.06618808, "auxiliary_loss_mlp": 0.01284199, "balance_loss_clip": 0.06320474, "balance_loss_mlp": 0.01258676, "epoch": 0.14886517360589208, "flos": 12791476592640.0, "grad_norm": 1.90343891399597, "language_loss": 0.81468928, "learning_rate": 3.85360973012719e-06, "loss": 0.89371932, "num_input_tokens_seen": 53770305, "router_z_loss_clip": 2.98828125, "router_z_loss_mlp": 0.25561523, "step": 2476, "time_per_iteration": 2.5603742599487305 }, { "auxiliary_loss_clip": 0.06609592, "auxiliary_loss_mlp": 0.01280845, "balance_loss_clip": 0.06319873, "balance_loss_mlp": 0.01256431, "epoch": 0.14892529685856004, "flos": 29030503501440.0, "grad_norm": 1.7887943782194917, "language_loss": 0.78815502, "learning_rate": 3.853463435273058e-06, "loss": 0.86705941, "num_input_tokens_seen": 53788895, "router_z_loss_clip": 2.90429688, "router_z_loss_mlp": 0.24414062, "step": 2477, "time_per_iteration": 2.6560657024383545 }, { "auxiliary_loss_clip": 0.0649665, "auxiliary_loss_mlp": 0.01266751, "balance_loss_clip": 0.06318527, "balance_loss_mlp": 0.01256576, "epoch": 0.148985420111228, "flos": 61944215495040.0, "grad_norm": 0.7729857244156197, "language_loss": 0.60017025, "learning_rate": 3.853317070135407e-06, "loss": 0.67780423, "num_input_tokens_seen": 53850260, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.10174561, "step": 2478, "time_per_iteration": 3.2383460998535156 }, { "auxiliary_loss_clip": 0.06619629, "auxiliary_loss_mlp": 0.01284435, "balance_loss_clip": 0.06319613, "balance_loss_mlp": 0.01259342, "epoch": 0.149045543363896, "flos": 23921937699840.0, "grad_norm": 2.4961349850905212, "language_loss": 0.7196604, "learning_rate": 3.853170634719787e-06, "loss": 0.79870105, "num_input_tokens_seen": 53867520, "router_z_loss_clip": 3.0, "router_z_loss_mlp": 0.25109863, "step": 2479, "time_per_iteration": 2.6687169075012207 }, { "auxiliary_loss_clip": 0.06611249, "auxiliary_loss_mlp": 0.0128429, "balance_loss_clip": 0.06311296, "balance_loss_mlp": 0.01259256, "epoch": 0.14910566661656396, "flos": 23660293726080.0, "grad_norm": 1.6379112681216084, "language_loss": 0.81937945, "learning_rate": 3.853024129031751e-06, "loss": 0.89833486, "num_input_tokens_seen": 53886620, "router_z_loss_clip": 3.0, "router_z_loss_mlp": 0.25048828, "step": 2480, "time_per_iteration": 2.6154608726501465 }, { "auxiliary_loss_clip": 0.06612669, "auxiliary_loss_mlp": 0.01295324, "balance_loss_clip": 0.06307863, "balance_loss_mlp": 0.01270159, "epoch": 0.14916578986923193, "flos": 20520452730240.0, "grad_norm": 1.997062239457256, "language_loss": 0.85434616, "learning_rate": 3.852877553076854e-06, "loss": 0.93342602, "num_input_tokens_seen": 53902230, "router_z_loss_clip": 3.046875, "router_z_loss_mlp": 0.25146484, "step": 2481, "time_per_iteration": 2.5826804637908936 }, { "auxiliary_loss_clip": 0.06612612, "auxiliary_loss_mlp": 0.01287446, "balance_loss_clip": 0.06310129, "balance_loss_mlp": 0.01261172, "epoch": 0.1492259131218999, "flos": 22498359557760.0, "grad_norm": 2.5352273687190197, "language_loss": 0.78458112, "learning_rate": 3.8527309068606546e-06, "loss": 0.86358172, "num_input_tokens_seen": 53919475, "router_z_loss_clip": 3.0234375, "router_z_loss_mlp": 0.26306152, "step": 2482, "time_per_iteration": 2.5866291522979736 }, { "auxiliary_loss_clip": 0.06618811, "auxiliary_loss_mlp": 0.01286074, "balance_loss_clip": 0.06308958, "balance_loss_mlp": 0.01260695, "epoch": 0.14928603637456786, "flos": 23192657688960.0, "grad_norm": 2.572685541564373, "language_loss": 0.81102502, "learning_rate": 3.852584190388713e-06, "loss": 0.8900739, "num_input_tokens_seen": 53939150, "router_z_loss_clip": 3.09960938, "router_z_loss_mlp": 0.25390625, "step": 2483, "time_per_iteration": 2.613994598388672 }, { "auxiliary_loss_clip": 0.06595196, "auxiliary_loss_mlp": 0.01289388, "balance_loss_clip": 0.06304857, "balance_loss_mlp": 0.01265605, "epoch": 0.14934615962723582, "flos": 21659731568640.0, "grad_norm": 1.7608563615545023, "language_loss": 0.71482587, "learning_rate": 3.852437403666595e-06, "loss": 0.79367173, "num_input_tokens_seen": 53958735, "router_z_loss_clip": 2.90429688, "router_z_loss_mlp": 0.23791504, "step": 2484, "time_per_iteration": 2.5848000049591064 }, { "auxiliary_loss_clip": 0.06606701, "auxiliary_loss_mlp": 0.01297305, "balance_loss_clip": 0.06306116, "balance_loss_mlp": 0.01270149, "epoch": 0.1494062828799038, "flos": 27016356983040.0, "grad_norm": 1.9646853362780605, "language_loss": 0.85987997, "learning_rate": 3.852290546699863e-06, "loss": 0.93892002, "num_input_tokens_seen": 53975065, "router_z_loss_clip": 3.00585938, "router_z_loss_mlp": 0.27172852, "step": 2485, "time_per_iteration": 2.6558597087860107 }, { "auxiliary_loss_clip": 0.06605119, "auxiliary_loss_mlp": 0.01284949, "balance_loss_clip": 0.06303811, "balance_loss_mlp": 0.01259617, "epoch": 0.14946640613257178, "flos": 21221291479680.0, "grad_norm": 2.551026544159898, "language_loss": 0.85915017, "learning_rate": 3.8521436194940894e-06, "loss": 0.93805081, "num_input_tokens_seen": 53993330, "router_z_loss_clip": 3.01171875, "router_z_loss_mlp": 0.25354004, "step": 2486, "time_per_iteration": 2.583022356033325 }, { "auxiliary_loss_clip": 0.06590807, "auxiliary_loss_mlp": 0.0128912, "balance_loss_clip": 0.0630283, "balance_loss_mlp": 0.01265588, "epoch": 0.14952652938523975, "flos": 13375965548160.0, "grad_norm": 2.439779367438925, "language_loss": 0.75327069, "learning_rate": 3.851996622054842e-06, "loss": 0.83206999, "num_input_tokens_seen": 54010515, "router_z_loss_clip": 2.87890625, "router_z_loss_mlp": 0.23547363, "step": 2487, "time_per_iteration": 2.6049013137817383 }, { "auxiliary_loss_clip": 0.06597006, "auxiliary_loss_mlp": 0.01292655, "balance_loss_clip": 0.06298991, "balance_loss_mlp": 0.01267168, "epoch": 0.1495866526379077, "flos": 35526491608320.0, "grad_norm": 2.0832114310453846, "language_loss": 0.73014927, "learning_rate": 3.8518495543877e-06, "loss": 0.80904591, "num_input_tokens_seen": 54031315, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.25476074, "step": 2488, "time_per_iteration": 2.696970224380493 }, { "auxiliary_loss_clip": 0.066026, "auxiliary_loss_mlp": 0.01305324, "balance_loss_clip": 0.06299467, "balance_loss_mlp": 0.01279062, "epoch": 0.14964677589057568, "flos": 17637392171520.0, "grad_norm": 3.182439512025129, "language_loss": 0.71500075, "learning_rate": 3.851702416498235e-06, "loss": 0.79408002, "num_input_tokens_seen": 54045965, "router_z_loss_clip": 3.03320312, "router_z_loss_mlp": 0.2623291, "step": 2489, "time_per_iteration": 2.5289976596832275 }, { "auxiliary_loss_clip": 0.06606641, "auxiliary_loss_mlp": 0.01302624, "balance_loss_clip": 0.06302458, "balance_loss_mlp": 0.01274526, "epoch": 0.14970689914324364, "flos": 20190102808320.0, "grad_norm": 6.865347331794833, "language_loss": 0.82587075, "learning_rate": 3.8515552083920295e-06, "loss": 0.90496337, "num_input_tokens_seen": 54059960, "router_z_loss_clip": 3.04296875, "router_z_loss_mlp": 0.28100586, "step": 2490, "time_per_iteration": 2.532212972640991 }, { "auxiliary_loss_clip": 0.0660749, "auxiliary_loss_mlp": 0.01294088, "balance_loss_clip": 0.06303991, "balance_loss_mlp": 0.01268268, "epoch": 0.1497670223959116, "flos": 37237136238720.0, "grad_norm": 1.7786512546313842, "language_loss": 0.81458724, "learning_rate": 3.851407930074666e-06, "loss": 0.89360303, "num_input_tokens_seen": 54079330, "router_z_loss_clip": 3.03320312, "router_z_loss_mlp": 0.25817871, "step": 2491, "time_per_iteration": 2.712315082550049 }, { "auxiliary_loss_clip": 0.06605379, "auxiliary_loss_mlp": 0.01291967, "balance_loss_clip": 0.06301905, "balance_loss_mlp": 0.0126561, "epoch": 0.1498271456485796, "flos": 24461675775360.0, "grad_norm": 3.164586858455185, "language_loss": 0.92099947, "learning_rate": 3.851260581551727e-06, "loss": 0.99997288, "num_input_tokens_seen": 54097555, "router_z_loss_clip": 3.0390625, "router_z_loss_mlp": 0.26367188, "step": 2492, "time_per_iteration": 2.608463764190674 }, { "auxiliary_loss_clip": 0.06605618, "auxiliary_loss_mlp": 0.01295428, "balance_loss_clip": 0.0630588, "balance_loss_mlp": 0.01267986, "epoch": 0.14988726890124757, "flos": 16259235742080.0, "grad_norm": 4.32694154917685, "language_loss": 0.81527483, "learning_rate": 3.851113162828802e-06, "loss": 0.89428532, "num_input_tokens_seen": 54115600, "router_z_loss_clip": 3.00195312, "router_z_loss_mlp": 0.27429199, "step": 2493, "time_per_iteration": 2.5761873722076416 }, { "auxiliary_loss_clip": 0.0660115, "auxiliary_loss_mlp": 0.01293967, "balance_loss_clip": 0.06298319, "balance_loss_mlp": 0.01265918, "epoch": 0.14994739215391553, "flos": 20672622944640.0, "grad_norm": 2.299802461365796, "language_loss": 0.81131268, "learning_rate": 3.85096567391148e-06, "loss": 0.89026386, "num_input_tokens_seen": 54135220, "router_z_loss_clip": 3.02539062, "router_z_loss_mlp": 0.28039551, "step": 2494, "time_per_iteration": 2.5692503452301025 }, { "auxiliary_loss_clip": 0.06610721, "auxiliary_loss_mlp": 0.01284258, "balance_loss_clip": 0.0631149, "balance_loss_mlp": 0.01256804, "epoch": 0.1500075154065835, "flos": 70666855603200.0, "grad_norm": 2.3930753599275825, "language_loss": 0.66820425, "learning_rate": 3.850818114805354e-06, "loss": 0.74715406, "num_input_tokens_seen": 54161065, "router_z_loss_clip": 2.99414062, "router_z_loss_mlp": 0.27490234, "step": 2495, "time_per_iteration": 2.9933786392211914 }, { "auxiliary_loss_clip": 0.06503314, "auxiliary_loss_mlp": 0.01269967, "balance_loss_clip": 0.06328183, "balance_loss_mlp": 0.01260764, "epoch": 0.15006763865925146, "flos": 68029827431040.0, "grad_norm": 0.8663720383015487, "language_loss": 0.5946548, "learning_rate": 3.850670485516019e-06, "loss": 0.6723876, "num_input_tokens_seen": 54225095, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.09197998, "step": 2496, "time_per_iteration": 3.2043511867523193 }, { "auxiliary_loss_clip": 0.06617353, "auxiliary_loss_mlp": 0.01291756, "balance_loss_clip": 0.06310008, "balance_loss_mlp": 0.01262586, "epoch": 0.15012776191191943, "flos": 18922216752000.0, "grad_norm": 2.3675844439737026, "language_loss": 0.6633693, "learning_rate": 3.850522786049075e-06, "loss": 0.74246043, "num_input_tokens_seen": 54243750, "router_z_loss_clip": 3.07226562, "router_z_loss_mlp": 0.29187012, "step": 2497, "time_per_iteration": 2.5878705978393555 }, { "auxiliary_loss_clip": 0.06608002, "auxiliary_loss_mlp": 0.01288877, "balance_loss_clip": 0.06307665, "balance_loss_mlp": 0.01264404, "epoch": 0.1501878851645874, "flos": 23708985747840.0, "grad_norm": 1.7223099375214759, "language_loss": 0.75996488, "learning_rate": 3.850375016410121e-06, "loss": 0.83893371, "num_input_tokens_seen": 54266185, "router_z_loss_clip": 3.00585938, "router_z_loss_mlp": 0.24462891, "step": 2498, "time_per_iteration": 2.71097731590271 }, { "auxiliary_loss_clip": 0.06620348, "auxiliary_loss_mlp": 0.01291379, "balance_loss_clip": 0.06310785, "balance_loss_mlp": 0.01263341, "epoch": 0.15024800841725539, "flos": 20418777400320.0, "grad_norm": 2.3603132420134867, "language_loss": 0.73086417, "learning_rate": 3.850227176604761e-06, "loss": 0.80998147, "num_input_tokens_seen": 54283940, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.28027344, "step": 2499, "time_per_iteration": 2.591294765472412 }, { "auxiliary_loss_clip": 0.06609441, "auxiliary_loss_mlp": 0.01288608, "balance_loss_clip": 0.0630732, "balance_loss_mlp": 0.01260785, "epoch": 0.15030813166992335, "flos": 31838904472320.0, "grad_norm": 1.9724379876356521, "language_loss": 0.73123294, "learning_rate": 3.850079266638601e-06, "loss": 0.81021345, "num_input_tokens_seen": 54304830, "router_z_loss_clip": 3.02148438, "router_z_loss_mlp": 0.27832031, "step": 2500, "time_per_iteration": 2.715202808380127 }, { "auxiliary_loss_clip": 0.06613097, "auxiliary_loss_mlp": 0.01290848, "balance_loss_clip": 0.06313007, "balance_loss_mlp": 0.01264085, "epoch": 0.15036825492259132, "flos": 35665664440320.0, "grad_norm": 1.886206316470411, "language_loss": 0.65747327, "learning_rate": 3.849931286517249e-06, "loss": 0.73651272, "num_input_tokens_seen": 54325595, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.26733398, "step": 2501, "time_per_iteration": 2.705343246459961 }, { "auxiliary_loss_clip": 0.06612681, "auxiliary_loss_mlp": 0.0128487, "balance_loss_clip": 0.06311826, "balance_loss_mlp": 0.01255854, "epoch": 0.15042837817525928, "flos": 18843238679040.0, "grad_norm": 6.508840123769626, "language_loss": 0.84734476, "learning_rate": 3.849783236246318e-06, "loss": 0.92632031, "num_input_tokens_seen": 54342180, "router_z_loss_clip": 3.01171875, "router_z_loss_mlp": 0.29003906, "step": 2502, "time_per_iteration": 2.593310594558716 }, { "auxiliary_loss_clip": 0.06607031, "auxiliary_loss_mlp": 0.0128825, "balance_loss_clip": 0.06309739, "balance_loss_mlp": 0.01262668, "epoch": 0.15048850142792725, "flos": 19541436024960.0, "grad_norm": 2.627783961550482, "language_loss": 0.78347075, "learning_rate": 3.849635115831421e-06, "loss": 0.86242354, "num_input_tokens_seen": 54360255, "router_z_loss_clip": 2.97070312, "router_z_loss_mlp": 0.25585938, "step": 2503, "time_per_iteration": 2.5834269523620605 }, { "auxiliary_loss_clip": 0.06603687, "auxiliary_loss_mlp": 0.01283324, "balance_loss_clip": 0.06306908, "balance_loss_mlp": 0.01259256, "epoch": 0.1505486246805952, "flos": 22024015194240.0, "grad_norm": 2.211793615361082, "language_loss": 0.86544859, "learning_rate": 3.849486925278176e-06, "loss": 0.94431865, "num_input_tokens_seen": 54378260, "router_z_loss_clip": 2.97070312, "router_z_loss_mlp": 0.24072266, "step": 2504, "time_per_iteration": 4.064289808273315 }, { "auxiliary_loss_clip": 0.0659824, "auxiliary_loss_mlp": 0.01295578, "balance_loss_clip": 0.06303933, "balance_loss_mlp": 0.01271319, "epoch": 0.15060874793326318, "flos": 20749840081920.0, "grad_norm": 1.6629846492306422, "language_loss": 0.83719456, "learning_rate": 3.8493386645922e-06, "loss": 0.91613269, "num_input_tokens_seen": 54399745, "router_z_loss_clip": 2.9453125, "router_z_loss_mlp": 0.24279785, "step": 2505, "time_per_iteration": 2.6120595932006836 }, { "auxiliary_loss_clip": 0.06599384, "auxiliary_loss_mlp": 0.01300355, "balance_loss_clip": 0.06302282, "balance_loss_mlp": 0.01274415, "epoch": 0.15066887118593117, "flos": 16477470501120.0, "grad_norm": 6.276885726266476, "language_loss": 0.76873982, "learning_rate": 3.849190333779117e-06, "loss": 0.84773719, "num_input_tokens_seen": 54417105, "router_z_loss_clip": 2.97265625, "router_z_loss_mlp": 0.25952148, "step": 2506, "time_per_iteration": 2.600741147994995 }, { "auxiliary_loss_clip": 0.0662045, "auxiliary_loss_mlp": 0.01295548, "balance_loss_clip": 0.06310435, "balance_loss_mlp": 0.01269596, "epoch": 0.15072899443859913, "flos": 19864490641920.0, "grad_norm": 3.0396646600451107, "language_loss": 0.78138447, "learning_rate": 3.849041932844552e-06, "loss": 0.86054444, "num_input_tokens_seen": 54433920, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.25939941, "step": 2507, "time_per_iteration": 2.661181926727295 }, { "auxiliary_loss_clip": 0.06600948, "auxiliary_loss_mlp": 0.01286641, "balance_loss_clip": 0.06307893, "balance_loss_mlp": 0.0126256, "epoch": 0.1507891176912671, "flos": 20782348266240.0, "grad_norm": 3.3171973076003747, "language_loss": 0.69681203, "learning_rate": 3.848893461794131e-06, "loss": 0.77568793, "num_input_tokens_seen": 54451540, "router_z_loss_clip": 2.9296875, "router_z_loss_mlp": 0.2409668, "step": 2508, "time_per_iteration": 4.127498626708984 }, { "auxiliary_loss_clip": 0.06606746, "auxiliary_loss_mlp": 0.01298031, "balance_loss_clip": 0.06302519, "balance_loss_mlp": 0.01271876, "epoch": 0.15084924094393506, "flos": 23593390640640.0, "grad_norm": 3.203892146843804, "language_loss": 0.79166663, "learning_rate": 3.8487449206334845e-06, "loss": 0.87071443, "num_input_tokens_seen": 54470800, "router_z_loss_clip": 3.0390625, "router_z_loss_mlp": 0.26147461, "step": 2509, "time_per_iteration": 2.609705686569214 }, { "auxiliary_loss_clip": 0.06625839, "auxiliary_loss_mlp": 0.01301505, "balance_loss_clip": 0.06309884, "balance_loss_mlp": 0.01273109, "epoch": 0.15090936419660303, "flos": 18916430820480.0, "grad_norm": 2.845891617829668, "language_loss": 0.82756555, "learning_rate": 3.848596309368246e-06, "loss": 0.90683895, "num_input_tokens_seen": 54486525, "router_z_loss_clip": 3.15820312, "router_z_loss_mlp": 0.28417969, "step": 2510, "time_per_iteration": 2.5862367153167725 }, { "auxiliary_loss_clip": 0.0661762, "auxiliary_loss_mlp": 0.01298722, "balance_loss_clip": 0.06309114, "balance_loss_mlp": 0.01270506, "epoch": 0.150969487449271, "flos": 17933514900480.0, "grad_norm": 3.0224407060895704, "language_loss": 0.74916613, "learning_rate": 3.8484476280040495e-06, "loss": 0.82832956, "num_input_tokens_seen": 54503795, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.28234863, "step": 2511, "time_per_iteration": 2.572169780731201 }, { "auxiliary_loss_clip": 0.06605422, "auxiliary_loss_mlp": 0.01294237, "balance_loss_clip": 0.06306832, "balance_loss_mlp": 0.01268082, "epoch": 0.151029610701939, "flos": 24249897780480.0, "grad_norm": 3.433598926275837, "language_loss": 0.70557827, "learning_rate": 3.848298876546534e-06, "loss": 0.78457487, "num_input_tokens_seen": 54523025, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.26171875, "step": 2512, "time_per_iteration": 5.447293519973755 }, { "auxiliary_loss_clip": 0.06607978, "auxiliary_loss_mlp": 0.01292572, "balance_loss_clip": 0.06307654, "balance_loss_mlp": 0.01266799, "epoch": 0.15108973395460695, "flos": 30270199858560.0, "grad_norm": 3.86706171177843, "language_loss": 0.75359702, "learning_rate": 3.84815005500134e-06, "loss": 0.8326025, "num_input_tokens_seen": 54545025, "router_z_loss_clip": 3.00390625, "router_z_loss_mlp": 0.25756836, "step": 2513, "time_per_iteration": 2.6425740718841553 }, { "auxiliary_loss_clip": 0.06494533, "auxiliary_loss_mlp": 0.01261311, "balance_loss_clip": 0.06317423, "balance_loss_mlp": 0.0125249, "epoch": 0.15114985720727492, "flos": 60456711087360.0, "grad_norm": 0.8422236095875046, "language_loss": 0.64509964, "learning_rate": 3.84800116337411e-06, "loss": 0.7226581, "num_input_tokens_seen": 54604545, "router_z_loss_clip": 1.77050781, "router_z_loss_mlp": 0.08837891, "step": 2514, "time_per_iteration": 3.190636157989502 }, { "auxiliary_loss_clip": 0.06606548, "auxiliary_loss_mlp": 0.01290829, "balance_loss_clip": 0.06308034, "balance_loss_mlp": 0.01264662, "epoch": 0.15120998045994288, "flos": 20527915743360.0, "grad_norm": 2.5736292265077134, "language_loss": 0.74240136, "learning_rate": 3.8478522016704916e-06, "loss": 0.82137513, "num_input_tokens_seen": 54620590, "router_z_loss_clip": 2.98632812, "router_z_loss_mlp": 0.2611084, "step": 2515, "time_per_iteration": 2.592106580734253 }, { "auxiliary_loss_clip": 0.0660974, "auxiliary_loss_mlp": 0.01292169, "balance_loss_clip": 0.06313001, "balance_loss_mlp": 0.01266396, "epoch": 0.15127010371261085, "flos": 21185303351040.0, "grad_norm": 1.7723271532143297, "language_loss": 0.78280842, "learning_rate": 3.8477031698961325e-06, "loss": 0.86182749, "num_input_tokens_seen": 54640410, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.25793457, "step": 2516, "time_per_iteration": 2.6392548084259033 }, { "auxiliary_loss_clip": 0.06491962, "auxiliary_loss_mlp": 0.01266003, "balance_loss_clip": 0.06315169, "balance_loss_mlp": 0.01256931, "epoch": 0.1513302269652788, "flos": 65339537189760.0, "grad_norm": 0.737993484315905, "language_loss": 0.5466885, "learning_rate": 3.8475540680566835e-06, "loss": 0.62426817, "num_input_tokens_seen": 54701430, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.09082031, "step": 2517, "time_per_iteration": 3.2253119945526123 }, { "auxiliary_loss_clip": 0.06608815, "auxiliary_loss_mlp": 0.01298055, "balance_loss_clip": 0.06310357, "balance_loss_mlp": 0.01271054, "epoch": 0.15139035021794678, "flos": 19141918957440.0, "grad_norm": 1.923837055247291, "language_loss": 0.79718876, "learning_rate": 3.8474048961577995e-06, "loss": 0.87625748, "num_input_tokens_seen": 54720845, "router_z_loss_clip": 2.98632812, "router_z_loss_mlp": 0.2701416, "step": 2518, "time_per_iteration": 2.61926007270813 }, { "auxiliary_loss_clip": 0.0662035, "auxiliary_loss_mlp": 0.01300162, "balance_loss_clip": 0.063131, "balance_loss_mlp": 0.01273233, "epoch": 0.15145047347061477, "flos": 26585841104640.0, "grad_norm": 2.053817277199586, "language_loss": 0.71308547, "learning_rate": 3.847255654205137e-06, "loss": 0.79229057, "num_input_tokens_seen": 54740495, "router_z_loss_clip": 3.07226562, "router_z_loss_mlp": 0.26916504, "step": 2519, "time_per_iteration": 2.6403841972351074 }, { "auxiliary_loss_clip": 0.0660717, "auxiliary_loss_mlp": 0.01290816, "balance_loss_clip": 0.06307891, "balance_loss_mlp": 0.01264101, "epoch": 0.15151059672328274, "flos": 20309177859840.0, "grad_norm": 30.166981574223833, "language_loss": 0.79762936, "learning_rate": 3.847106342204354e-06, "loss": 0.87660921, "num_input_tokens_seen": 54758415, "router_z_loss_clip": 2.99414062, "router_z_loss_mlp": 0.26733398, "step": 2520, "time_per_iteration": 2.616215944290161 }, { "auxiliary_loss_clip": 0.0660826, "auxiliary_loss_mlp": 0.01297183, "balance_loss_clip": 0.06306899, "balance_loss_mlp": 0.01269765, "epoch": 0.1515707199759507, "flos": 27234591742080.0, "grad_norm": 3.1503973116903645, "language_loss": 0.75527251, "learning_rate": 3.846956960161114e-06, "loss": 0.83432692, "num_input_tokens_seen": 54779355, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.27392578, "step": 2521, "time_per_iteration": 2.633646249771118 }, { "auxiliary_loss_clip": 0.06615866, "auxiliary_loss_mlp": 0.01297314, "balance_loss_clip": 0.06313347, "balance_loss_mlp": 0.01270778, "epoch": 0.15163084322861867, "flos": 23594229181440.0, "grad_norm": 2.5147055420419644, "language_loss": 0.83173269, "learning_rate": 3.84680750808108e-06, "loss": 0.91086447, "num_input_tokens_seen": 54799465, "router_z_loss_clip": 3.02734375, "router_z_loss_mlp": 0.26538086, "step": 2522, "time_per_iteration": 2.66507625579834 }, { "auxiliary_loss_clip": 0.06483574, "auxiliary_loss_mlp": 0.01293442, "balance_loss_clip": 0.0630988, "balance_loss_mlp": 0.01283822, "epoch": 0.15169096648128663, "flos": 66908786855040.0, "grad_norm": 0.7965363673197495, "language_loss": 0.58005989, "learning_rate": 3.846657985969922e-06, "loss": 0.65783, "num_input_tokens_seen": 54857665, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.09606934, "step": 2523, "time_per_iteration": 3.177516222000122 }, { "auxiliary_loss_clip": 0.06602359, "auxiliary_loss_mlp": 0.01297522, "balance_loss_clip": 0.06308782, "balance_loss_mlp": 0.01270295, "epoch": 0.1517510897339546, "flos": 29103024810240.0, "grad_norm": 2.090145560631929, "language_loss": 0.75474232, "learning_rate": 3.8465083938333066e-06, "loss": 0.83374113, "num_input_tokens_seen": 54879895, "router_z_loss_clip": 2.93554688, "router_z_loss_mlp": 0.27246094, "step": 2524, "time_per_iteration": 2.7121479511260986 }, { "auxiliary_loss_clip": 0.06608734, "auxiliary_loss_mlp": 0.01292152, "balance_loss_clip": 0.06308055, "balance_loss_mlp": 0.01264305, "epoch": 0.1518112129866226, "flos": 18412597019520.0, "grad_norm": 1.6582539928777569, "language_loss": 0.75768781, "learning_rate": 3.8463587316769085e-06, "loss": 0.83669668, "num_input_tokens_seen": 54898245, "router_z_loss_clip": 3.00390625, "router_z_loss_mlp": 0.27868652, "step": 2525, "time_per_iteration": 2.6479623317718506 }, { "auxiliary_loss_clip": 0.06614053, "auxiliary_loss_mlp": 0.01292347, "balance_loss_clip": 0.06305401, "balance_loss_mlp": 0.01263546, "epoch": 0.15187133623929056, "flos": 19431165651840.0, "grad_norm": 1.8723443926660492, "language_loss": 0.80555564, "learning_rate": 3.846208999506402e-06, "loss": 0.88461959, "num_input_tokens_seen": 54917060, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.2878418, "step": 2526, "time_per_iteration": 2.657947063446045 }, { "auxiliary_loss_clip": 0.06596982, "auxiliary_loss_mlp": 0.01288594, "balance_loss_clip": 0.06303021, "balance_loss_mlp": 0.01262881, "epoch": 0.15193145949195852, "flos": 17571914605440.0, "grad_norm": 2.1406465546286486, "language_loss": 0.85730231, "learning_rate": 3.846059197327466e-06, "loss": 0.93615818, "num_input_tokens_seen": 54936365, "router_z_loss_clip": 2.93945312, "router_z_loss_mlp": 0.25720215, "step": 2527, "time_per_iteration": 2.5813958644866943 }, { "auxiliary_loss_clip": 0.06607256, "auxiliary_loss_mlp": 0.01297824, "balance_loss_clip": 0.0630881, "balance_loss_mlp": 0.0127043, "epoch": 0.15199158274462649, "flos": 36185472443520.0, "grad_norm": 1.9459650080272557, "language_loss": 0.69524384, "learning_rate": 3.845909325145779e-06, "loss": 0.77429461, "num_input_tokens_seen": 54961365, "router_z_loss_clip": 2.98242188, "router_z_loss_mlp": 0.27416992, "step": 2528, "time_per_iteration": 2.7741472721099854 }, { "auxiliary_loss_clip": 0.06604479, "auxiliary_loss_mlp": 0.01309272, "balance_loss_clip": 0.06310427, "balance_loss_mlp": 0.01280722, "epoch": 0.15205170599729445, "flos": 23080416744960.0, "grad_norm": 1.8731603704973934, "language_loss": 0.88031405, "learning_rate": 3.845759382967026e-06, "loss": 0.95945156, "num_input_tokens_seen": 54980750, "router_z_loss_clip": 2.94140625, "router_z_loss_mlp": 0.28552246, "step": 2529, "time_per_iteration": 2.6027984619140625 }, { "auxiliary_loss_clip": 0.06605279, "auxiliary_loss_mlp": 0.0129279, "balance_loss_clip": 0.06308101, "balance_loss_mlp": 0.01265979, "epoch": 0.15211182924996242, "flos": 21914876851200.0, "grad_norm": 2.050960818044353, "language_loss": 0.84224641, "learning_rate": 3.845609370796893e-06, "loss": 0.9212271, "num_input_tokens_seen": 54999675, "router_z_loss_clip": 2.97265625, "router_z_loss_mlp": 0.26831055, "step": 2530, "time_per_iteration": 2.637221574783325 }, { "auxiliary_loss_clip": 0.06606491, "auxiliary_loss_mlp": 0.01303814, "balance_loss_clip": 0.06308407, "balance_loss_mlp": 0.01276408, "epoch": 0.15217195250263038, "flos": 13886675383680.0, "grad_norm": 2.1721642923664652, "language_loss": 0.81727386, "learning_rate": 3.845459288641066e-06, "loss": 0.89637685, "num_input_tokens_seen": 55018295, "router_z_loss_clip": 2.97851562, "router_z_loss_mlp": 0.27429199, "step": 2531, "time_per_iteration": 2.5692903995513916 }, { "auxiliary_loss_clip": 0.06601556, "auxiliary_loss_mlp": 0.01295961, "balance_loss_clip": 0.06306115, "balance_loss_mlp": 0.01268329, "epoch": 0.15223207575529837, "flos": 24542247075840.0, "grad_norm": 2.189689075964524, "language_loss": 0.79782724, "learning_rate": 3.8453091365052394e-06, "loss": 0.87680238, "num_input_tokens_seen": 55037975, "router_z_loss_clip": 2.95507812, "router_z_loss_mlp": 0.27575684, "step": 2532, "time_per_iteration": 2.6717190742492676 }, { "auxiliary_loss_clip": 0.0660826, "auxiliary_loss_mlp": 0.0130772, "balance_loss_clip": 0.06314317, "balance_loss_mlp": 0.01278538, "epoch": 0.15229219900796634, "flos": 25563876382080.0, "grad_norm": 2.0435980643401286, "language_loss": 0.88579077, "learning_rate": 3.845158914395105e-06, "loss": 0.96495056, "num_input_tokens_seen": 55057135, "router_z_loss_clip": 2.93945312, "router_z_loss_mlp": 0.29223633, "step": 2533, "time_per_iteration": 2.6267738342285156 }, { "auxiliary_loss_clip": 0.06612068, "auxiliary_loss_mlp": 0.01304016, "balance_loss_clip": 0.06311779, "balance_loss_mlp": 0.01273355, "epoch": 0.1523523222606343, "flos": 18222761594880.0, "grad_norm": 2.416707069652284, "language_loss": 0.80010903, "learning_rate": 3.84500862231636e-06, "loss": 0.87926984, "num_input_tokens_seen": 55075525, "router_z_loss_clip": 3.0078125, "router_z_loss_mlp": 0.30664062, "step": 2534, "time_per_iteration": 2.6062283515930176 }, { "auxiliary_loss_clip": 0.06634057, "auxiliary_loss_mlp": 0.01296118, "balance_loss_clip": 0.06323276, "balance_loss_mlp": 0.01265553, "epoch": 0.15241244551330227, "flos": 13264940488320.0, "grad_norm": 2.504028354946347, "language_loss": 0.77870369, "learning_rate": 3.844858260274702e-06, "loss": 0.8580054, "num_input_tokens_seen": 55090845, "router_z_loss_clip": 3.109375, "router_z_loss_mlp": 0.30566406, "step": 2535, "time_per_iteration": 2.5675017833709717 }, { "auxiliary_loss_clip": 0.06628579, "auxiliary_loss_mlp": 0.01303465, "balance_loss_clip": 0.06318083, "balance_loss_mlp": 0.01271422, "epoch": 0.15247256876597023, "flos": 19721083178880.0, "grad_norm": 2.190788929649691, "language_loss": 0.79129606, "learning_rate": 3.844707828275835e-06, "loss": 0.87061656, "num_input_tokens_seen": 55108750, "router_z_loss_clip": 3.10351562, "router_z_loss_mlp": 0.32055664, "step": 2536, "time_per_iteration": 2.6326189041137695 }, { "auxiliary_loss_clip": 0.06615944, "auxiliary_loss_mlp": 0.01305276, "balance_loss_clip": 0.06316701, "balance_loss_mlp": 0.01275188, "epoch": 0.1525326920186382, "flos": 20382076512000.0, "grad_norm": 2.4310558066413734, "language_loss": 0.76409996, "learning_rate": 3.844557326325461e-06, "loss": 0.8433122, "num_input_tokens_seen": 55126750, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.30102539, "step": 2537, "time_per_iteration": 2.595167636871338 }, { "auxiliary_loss_clip": 0.06636497, "auxiliary_loss_mlp": 0.0129245, "balance_loss_clip": 0.06329127, "balance_loss_mlp": 0.01261361, "epoch": 0.15259281527130616, "flos": 13595122702080.0, "grad_norm": 2.2744687053544803, "language_loss": 0.78706133, "learning_rate": 3.8444067544292896e-06, "loss": 0.86635077, "num_input_tokens_seen": 55144690, "router_z_loss_clip": 3.07226562, "router_z_loss_mlp": 0.31079102, "step": 2538, "time_per_iteration": 2.5884246826171875 }, { "auxiliary_loss_clip": 0.06610388, "auxiliary_loss_mlp": 0.01310526, "balance_loss_clip": 0.06314623, "balance_loss_mlp": 0.0128101, "epoch": 0.15265293852397416, "flos": 22867590574080.0, "grad_norm": 1.6397445511935549, "language_loss": 0.90253615, "learning_rate": 3.844256112593029e-06, "loss": 0.98174524, "num_input_tokens_seen": 55166055, "router_z_loss_clip": 2.95898438, "router_z_loss_mlp": 0.29516602, "step": 2539, "time_per_iteration": 2.63265323638916 }, { "auxiliary_loss_clip": 0.06629737, "auxiliary_loss_mlp": 0.01292393, "balance_loss_clip": 0.06323546, "balance_loss_mlp": 0.01263068, "epoch": 0.15271306177664212, "flos": 29245174462080.0, "grad_norm": 2.0975140505019914, "language_loss": 0.93930042, "learning_rate": 3.844105400822391e-06, "loss": 1.01852179, "num_input_tokens_seen": 55186285, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.29296875, "step": 2540, "time_per_iteration": 2.6512339115142822 }, { "auxiliary_loss_clip": 0.06619726, "auxiliary_loss_mlp": 0.01311653, "balance_loss_clip": 0.06320129, "balance_loss_mlp": 0.01282137, "epoch": 0.1527731850293101, "flos": 31253912392320.0, "grad_norm": 1.6383225467523845, "language_loss": 0.76534855, "learning_rate": 3.843954619123092e-06, "loss": 0.84466231, "num_input_tokens_seen": 55207915, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.29541016, "step": 2541, "time_per_iteration": 2.697366952896118 }, { "auxiliary_loss_clip": 0.06616975, "auxiliary_loss_mlp": 0.01294504, "balance_loss_clip": 0.06318195, "balance_loss_mlp": 0.01263844, "epoch": 0.15283330828197805, "flos": 22388550382080.0, "grad_norm": 3.4304124960624454, "language_loss": 0.82669508, "learning_rate": 3.84380376750085e-06, "loss": 0.90580988, "num_input_tokens_seen": 55227860, "router_z_loss_clip": 2.9921875, "router_z_loss_mlp": 0.30664062, "step": 2542, "time_per_iteration": 2.65317702293396 }, { "auxiliary_loss_clip": 0.06628412, "auxiliary_loss_mlp": 0.01297116, "balance_loss_clip": 0.06320496, "balance_loss_mlp": 0.01265633, "epoch": 0.15289343153464602, "flos": 25527175493760.0, "grad_norm": 2.2127107506079695, "language_loss": 0.7887978, "learning_rate": 3.843652845961383e-06, "loss": 0.86805308, "num_input_tokens_seen": 55247330, "router_z_loss_clip": 3.07617188, "router_z_loss_mlp": 0.31506348, "step": 2543, "time_per_iteration": 2.6449692249298096 }, { "auxiliary_loss_clip": 0.06614979, "auxiliary_loss_mlp": 0.01300962, "balance_loss_clip": 0.06313193, "balance_loss_mlp": 0.01269252, "epoch": 0.15295355478731398, "flos": 22716468535680.0, "grad_norm": 2.0432453783329163, "language_loss": 0.8763842, "learning_rate": 3.843501854510416e-06, "loss": 0.95554364, "num_input_tokens_seen": 55266195, "router_z_loss_clip": 3.01757812, "router_z_loss_mlp": 0.3170166, "step": 2544, "time_per_iteration": 3.988982677459717 }, { "auxiliary_loss_clip": 0.06637387, "auxiliary_loss_mlp": 0.01294305, "balance_loss_clip": 0.06321561, "balance_loss_mlp": 0.01261451, "epoch": 0.15301367803998198, "flos": 23257548276480.0, "grad_norm": 2.340930459552637, "language_loss": 0.84168667, "learning_rate": 3.843350793153673e-06, "loss": 0.92100352, "num_input_tokens_seen": 55283305, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.32861328, "step": 2545, "time_per_iteration": 2.599148988723755 }, { "auxiliary_loss_clip": 0.06625731, "auxiliary_loss_mlp": 0.01289556, "balance_loss_clip": 0.06321938, "balance_loss_mlp": 0.01259324, "epoch": 0.15307380129264994, "flos": 25893597398400.0, "grad_norm": 2.0464885645802675, "language_loss": 0.71671623, "learning_rate": 3.843199661896884e-06, "loss": 0.79586911, "num_input_tokens_seen": 55303035, "router_z_loss_clip": 3.04101562, "router_z_loss_mlp": 0.30212402, "step": 2546, "time_per_iteration": 2.6555771827697754 }, { "auxiliary_loss_clip": 0.0662304, "auxiliary_loss_mlp": 0.01291243, "balance_loss_clip": 0.06313829, "balance_loss_mlp": 0.01258437, "epoch": 0.1531339245453179, "flos": 46983780766080.0, "grad_norm": 1.762623302195713, "language_loss": 0.78475881, "learning_rate": 3.843048460745779e-06, "loss": 0.86390167, "num_input_tokens_seen": 55327570, "router_z_loss_clip": 3.09570312, "router_z_loss_mlp": 0.328125, "step": 2547, "time_per_iteration": 4.2404749393463135 }, { "auxiliary_loss_clip": 0.06631334, "auxiliary_loss_mlp": 0.01296225, "balance_loss_clip": 0.06316938, "balance_loss_mlp": 0.01263108, "epoch": 0.15319404779798587, "flos": 35890817160960.0, "grad_norm": 2.817108794311352, "language_loss": 0.75434214, "learning_rate": 3.842897189706092e-06, "loss": 0.83361769, "num_input_tokens_seen": 55351090, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.33105469, "step": 2548, "time_per_iteration": 2.7341978549957275 }, { "auxiliary_loss_clip": 0.06618114, "auxiliary_loss_mlp": 0.01292878, "balance_loss_clip": 0.06315062, "balance_loss_mlp": 0.01263803, "epoch": 0.15325417105065384, "flos": 25671463424640.0, "grad_norm": 1.3610238287947989, "language_loss": 0.81819195, "learning_rate": 3.842745848783558e-06, "loss": 0.89730191, "num_input_tokens_seen": 55371050, "router_z_loss_clip": 3.03125, "router_z_loss_mlp": 0.29089355, "step": 2549, "time_per_iteration": 2.651493549346924 }, { "auxiliary_loss_clip": 0.06614317, "auxiliary_loss_mlp": 0.01288878, "balance_loss_clip": 0.06307559, "balance_loss_mlp": 0.01258575, "epoch": 0.1533142943033218, "flos": 18776838718080.0, "grad_norm": 1.9890564976553657, "language_loss": 0.76023573, "learning_rate": 3.842594437983917e-06, "loss": 0.83926767, "num_input_tokens_seen": 55390375, "router_z_loss_clip": 3.06640625, "router_z_loss_mlp": 0.30297852, "step": 2550, "time_per_iteration": 2.5688512325286865 }, { "auxiliary_loss_clip": 0.06624287, "auxiliary_loss_mlp": 0.01291793, "balance_loss_clip": 0.06311043, "balance_loss_mlp": 0.01260274, "epoch": 0.15337441755598977, "flos": 23113218418560.0, "grad_norm": 2.1789379844374053, "language_loss": 0.77826226, "learning_rate": 3.8424429573129115e-06, "loss": 0.85742307, "num_input_tokens_seen": 55408890, "router_z_loss_clip": 3.13085938, "router_z_loss_mlp": 0.31518555, "step": 2551, "time_per_iteration": 5.588515758514404 }, { "auxiliary_loss_clip": 0.06494932, "auxiliary_loss_mlp": 0.01280903, "balance_loss_clip": 0.06321778, "balance_loss_mlp": 0.0127071, "epoch": 0.15343454080865776, "flos": 59881278372480.0, "grad_norm": 0.9323189158498145, "language_loss": 0.56683791, "learning_rate": 3.842291406776283e-06, "loss": 0.64459622, "num_input_tokens_seen": 55463815, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.10192871, "step": 2552, "time_per_iteration": 3.2671830654144287 }, { "auxiliary_loss_clip": 0.06631321, "auxiliary_loss_mlp": 0.01298469, "balance_loss_clip": 0.06318095, "balance_loss_mlp": 0.0126645, "epoch": 0.15349466406132573, "flos": 11915644590720.0, "grad_norm": 3.1547958197080215, "language_loss": 0.89164442, "learning_rate": 3.84213978637978e-06, "loss": 0.97094232, "num_input_tokens_seen": 55481050, "router_z_loss_clip": 3.13671875, "router_z_loss_mlp": 0.32006836, "step": 2553, "time_per_iteration": 2.6941583156585693 }, { "auxiliary_loss_clip": 0.06625405, "auxiliary_loss_mlp": 0.01297677, "balance_loss_clip": 0.0631056, "balance_loss_mlp": 0.0126623, "epoch": 0.1535547873139937, "flos": 24103681205760.0, "grad_norm": 1.6150764234396426, "language_loss": 0.79216129, "learning_rate": 3.841988096129152e-06, "loss": 0.87139213, "num_input_tokens_seen": 55500050, "router_z_loss_clip": 3.15039062, "router_z_loss_mlp": 0.31469727, "step": 2554, "time_per_iteration": 2.6324498653411865 }, { "auxiliary_loss_clip": 0.0662127, "auxiliary_loss_mlp": 0.01293901, "balance_loss_clip": 0.06311573, "balance_loss_mlp": 0.01262096, "epoch": 0.15361491056666166, "flos": 17572208094720.0, "grad_norm": 2.2511153696773567, "language_loss": 0.7881493, "learning_rate": 3.841836336030151e-06, "loss": 0.86730099, "num_input_tokens_seen": 55518125, "router_z_loss_clip": 3.09570312, "router_z_loss_mlp": 0.31835938, "step": 2555, "time_per_iteration": 2.629599094390869 }, { "auxiliary_loss_clip": 0.0660154, "auxiliary_loss_mlp": 0.01288217, "balance_loss_clip": 0.06303895, "balance_loss_mlp": 0.01259905, "epoch": 0.15367503381932962, "flos": 25053040765440.0, "grad_norm": 1.4831625053552775, "language_loss": 0.77845746, "learning_rate": 3.8416845060885305e-06, "loss": 0.857355, "num_input_tokens_seen": 55540960, "router_z_loss_clip": 2.97851562, "router_z_loss_mlp": 0.28308105, "step": 2556, "time_per_iteration": 2.6309401988983154 }, { "auxiliary_loss_clip": 0.06604153, "auxiliary_loss_mlp": 0.01287301, "balance_loss_clip": 0.06306747, "balance_loss_mlp": 0.01257904, "epoch": 0.15373515707199759, "flos": 21513808483200.0, "grad_norm": 1.8340721487897182, "language_loss": 0.90670311, "learning_rate": 3.84153260631005e-06, "loss": 0.98561764, "num_input_tokens_seen": 55559210, "router_z_loss_clip": 2.97851562, "router_z_loss_mlp": 0.29382324, "step": 2557, "time_per_iteration": 2.6205475330352783 }, { "auxiliary_loss_clip": 0.06614143, "auxiliary_loss_mlp": 0.01291874, "balance_loss_clip": 0.06308581, "balance_loss_mlp": 0.01261619, "epoch": 0.15379528032466555, "flos": 26001897200640.0, "grad_norm": 1.8930775204589747, "language_loss": 0.70910573, "learning_rate": 3.841380636700468e-06, "loss": 0.78816593, "num_input_tokens_seen": 55578925, "router_z_loss_clip": 3.0546875, "router_z_loss_mlp": 0.30273438, "step": 2558, "time_per_iteration": 2.6315982341766357 }, { "auxiliary_loss_clip": 0.06616564, "auxiliary_loss_mlp": 0.01287207, "balance_loss_clip": 0.06311535, "balance_loss_mlp": 0.01257536, "epoch": 0.15385540357733354, "flos": 19282685016960.0, "grad_norm": 1.9208355728711235, "language_loss": 0.92617512, "learning_rate": 3.841228597265548e-06, "loss": 1.00521278, "num_input_tokens_seen": 55597255, "router_z_loss_clip": 3.05273438, "router_z_loss_mlp": 0.29675293, "step": 2559, "time_per_iteration": 2.605739116668701 }, { "auxiliary_loss_clip": 0.06618339, "auxiliary_loss_mlp": 0.01292973, "balance_loss_clip": 0.06312785, "balance_loss_mlp": 0.01261288, "epoch": 0.1539155268300015, "flos": 28556788043520.0, "grad_norm": 2.4260150916982446, "language_loss": 0.6467098, "learning_rate": 3.841076488011055e-06, "loss": 0.72582293, "num_input_tokens_seen": 55619515, "router_z_loss_clip": 3.0546875, "router_z_loss_mlp": 0.31689453, "step": 2560, "time_per_iteration": 2.6519405841827393 }, { "auxiliary_loss_clip": 0.06618251, "auxiliary_loss_mlp": 0.01292277, "balance_loss_clip": 0.06308191, "balance_loss_mlp": 0.01259995, "epoch": 0.15397565008266947, "flos": 23554257984000.0, "grad_norm": 1.5986505223879934, "language_loss": 0.88807344, "learning_rate": 3.8409243089427574e-06, "loss": 0.96717864, "num_input_tokens_seen": 55640050, "router_z_loss_clip": 3.1015625, "router_z_loss_mlp": 0.32275391, "step": 2561, "time_per_iteration": 2.6416916847229004 }, { "auxiliary_loss_clip": 0.06601766, "auxiliary_loss_mlp": 0.0128715, "balance_loss_clip": 0.06308782, "balance_loss_mlp": 0.0125916, "epoch": 0.15403577333533744, "flos": 17135696649600.0, "grad_norm": 1.9680233121240596, "language_loss": 0.83824265, "learning_rate": 3.840772060066425e-06, "loss": 0.91713178, "num_input_tokens_seen": 55658695, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.2800293, "step": 2562, "time_per_iteration": 2.6299245357513428 }, { "auxiliary_loss_clip": 0.06629212, "auxiliary_loss_mlp": 0.0129221, "balance_loss_clip": 0.06314519, "balance_loss_mlp": 0.01258403, "epoch": 0.1540958965880054, "flos": 17900252029440.0, "grad_norm": 2.2953300958283194, "language_loss": 0.75858402, "learning_rate": 3.840619741387832e-06, "loss": 0.83779824, "num_input_tokens_seen": 55676340, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.33789062, "step": 2563, "time_per_iteration": 2.577042579650879 }, { "auxiliary_loss_clip": 0.06622527, "auxiliary_loss_mlp": 0.01291333, "balance_loss_clip": 0.06311286, "balance_loss_mlp": 0.01259742, "epoch": 0.15415601984067337, "flos": 32169296321280.0, "grad_norm": 3.642756524312086, "language_loss": 0.77586448, "learning_rate": 3.8404673529127534e-06, "loss": 0.85500312, "num_input_tokens_seen": 55698890, "router_z_loss_clip": 3.11328125, "router_z_loss_mlp": 0.31591797, "step": 2564, "time_per_iteration": 2.6610729694366455 }, { "auxiliary_loss_clip": 0.06604235, "auxiliary_loss_mlp": 0.01283991, "balance_loss_clip": 0.06304228, "balance_loss_mlp": 0.01254844, "epoch": 0.15421614309334136, "flos": 24031243751040.0, "grad_norm": 3.118926128068363, "language_loss": 0.72184128, "learning_rate": 3.840314894646969e-06, "loss": 0.80072355, "num_input_tokens_seen": 55718535, "router_z_loss_clip": 2.99804688, "router_z_loss_mlp": 0.2911377, "step": 2565, "time_per_iteration": 2.590589761734009 }, { "auxiliary_loss_clip": 0.06610316, "auxiliary_loss_mlp": 0.01290033, "balance_loss_clip": 0.06307696, "balance_loss_mlp": 0.01259635, "epoch": 0.15427626634600933, "flos": 24392676337920.0, "grad_norm": 2.757069308674896, "language_loss": 0.73001659, "learning_rate": 3.840162366596259e-06, "loss": 0.8090201, "num_input_tokens_seen": 55738970, "router_z_loss_clip": 3.02929688, "router_z_loss_mlp": 0.30419922, "step": 2566, "time_per_iteration": 2.6215245723724365 }, { "auxiliary_loss_clip": 0.06607586, "auxiliary_loss_mlp": 0.01285529, "balance_loss_clip": 0.06313589, "balance_loss_mlp": 0.01257967, "epoch": 0.1543363895986773, "flos": 23338287285120.0, "grad_norm": 1.7135088487425119, "language_loss": 0.86240155, "learning_rate": 3.840009768766408e-06, "loss": 0.9413327, "num_input_tokens_seen": 55759585, "router_z_loss_clip": 2.93945312, "router_z_loss_mlp": 0.2755127, "step": 2567, "time_per_iteration": 2.596306562423706 }, { "auxiliary_loss_clip": 0.06613518, "auxiliary_loss_mlp": 0.0128925, "balance_loss_clip": 0.06315524, "balance_loss_mlp": 0.01261582, "epoch": 0.15439651285134526, "flos": 24280225758720.0, "grad_norm": 1.9617016257142286, "language_loss": 0.79351652, "learning_rate": 3.839857101163202e-06, "loss": 0.87254417, "num_input_tokens_seen": 55779250, "router_z_loss_clip": 2.98046875, "router_z_loss_mlp": 0.27636719, "step": 2568, "time_per_iteration": 2.6086530685424805 }, { "auxiliary_loss_clip": 0.06609654, "auxiliary_loss_mlp": 0.01289859, "balance_loss_clip": 0.06311747, "balance_loss_mlp": 0.01261761, "epoch": 0.15445663610401322, "flos": 22462832626560.0, "grad_norm": 4.721540766439947, "language_loss": 0.70803976, "learning_rate": 3.83970436379243e-06, "loss": 0.78703487, "num_input_tokens_seen": 55800470, "router_z_loss_clip": 2.98242188, "router_z_loss_mlp": 0.28149414, "step": 2569, "time_per_iteration": 2.6025655269622803 }, { "auxiliary_loss_clip": 0.06609397, "auxiliary_loss_mlp": 0.01302787, "balance_loss_clip": 0.06312976, "balance_loss_mlp": 0.01272293, "epoch": 0.1545167593566812, "flos": 22055223640320.0, "grad_norm": 1.698697084973463, "language_loss": 0.77690601, "learning_rate": 3.839551556659884e-06, "loss": 0.85602784, "num_input_tokens_seen": 55817795, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.30517578, "step": 2570, "time_per_iteration": 2.5800044536590576 }, { "auxiliary_loss_clip": 0.06606238, "auxiliary_loss_mlp": 0.01291793, "balance_loss_clip": 0.06309968, "balance_loss_mlp": 0.01263517, "epoch": 0.15457688260934915, "flos": 19324375223040.0, "grad_norm": 2.595288612271702, "language_loss": 0.78327322, "learning_rate": 3.839398679771359e-06, "loss": 0.86225355, "num_input_tokens_seen": 55836125, "router_z_loss_clip": 2.96484375, "router_z_loss_mlp": 0.28344727, "step": 2571, "time_per_iteration": 2.5805015563964844 }, { "auxiliary_loss_clip": 0.06619109, "auxiliary_loss_mlp": 0.01289512, "balance_loss_clip": 0.06319947, "balance_loss_mlp": 0.01260998, "epoch": 0.15463700586201715, "flos": 24140843291520.0, "grad_norm": 1.9251490831493632, "language_loss": 0.83319438, "learning_rate": 3.839245733132652e-06, "loss": 0.91228062, "num_input_tokens_seen": 55855280, "router_z_loss_clip": 2.99414062, "router_z_loss_mlp": 0.28515625, "step": 2572, "time_per_iteration": 2.6159746646881104 }, { "auxiliary_loss_clip": 0.06620476, "auxiliary_loss_mlp": 0.01291251, "balance_loss_clip": 0.06318189, "balance_loss_mlp": 0.01262295, "epoch": 0.1546971291146851, "flos": 22427808819840.0, "grad_norm": 1.6060795800231986, "language_loss": 0.9157483, "learning_rate": 3.839092716749563e-06, "loss": 0.99486554, "num_input_tokens_seen": 55875695, "router_z_loss_clip": 3.02539062, "router_z_loss_mlp": 0.28991699, "step": 2573, "time_per_iteration": 2.659757614135742 }, { "auxiliary_loss_clip": 0.0662847, "auxiliary_loss_mlp": 0.01291659, "balance_loss_clip": 0.06326868, "balance_loss_mlp": 0.01262929, "epoch": 0.15475725236735308, "flos": 17536010330880.0, "grad_norm": 1.624869634336126, "language_loss": 0.71351713, "learning_rate": 3.838939630627893e-06, "loss": 0.79271841, "num_input_tokens_seen": 55894575, "router_z_loss_clip": 3.01367188, "router_z_loss_mlp": 0.28747559, "step": 2574, "time_per_iteration": 2.5563197135925293 }, { "auxiliary_loss_clip": 0.06617411, "auxiliary_loss_mlp": 0.01292132, "balance_loss_clip": 0.06318954, "balance_loss_mlp": 0.0126413, "epoch": 0.15481737562002104, "flos": 22567778265600.0, "grad_norm": 1.5746702871711757, "language_loss": 0.83215183, "learning_rate": 3.838786474773448e-06, "loss": 0.91124725, "num_input_tokens_seen": 55912855, "router_z_loss_clip": 2.98242188, "router_z_loss_mlp": 0.2800293, "step": 2575, "time_per_iteration": 2.599748373031616 }, { "auxiliary_loss_clip": 0.06617352, "auxiliary_loss_mlp": 0.01286776, "balance_loss_clip": 0.06318173, "balance_loss_mlp": 0.01259787, "epoch": 0.154877498872689, "flos": 24907620804480.0, "grad_norm": 1.8765277568431995, "language_loss": 0.85578275, "learning_rate": 3.838633249192036e-06, "loss": 0.93482399, "num_input_tokens_seen": 55932375, "router_z_loss_clip": 2.9921875, "router_z_loss_mlp": 0.2701416, "step": 2576, "time_per_iteration": 2.6400909423828125 }, { "auxiliary_loss_clip": 0.0661969, "auxiliary_loss_mlp": 0.01296713, "balance_loss_clip": 0.06321131, "balance_loss_mlp": 0.01270224, "epoch": 0.15493762212535697, "flos": 28155048842880.0, "grad_norm": 1.7922705865184712, "language_loss": 0.82765424, "learning_rate": 3.838479953889465e-06, "loss": 0.90681827, "num_input_tokens_seen": 55953970, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.26513672, "step": 2577, "time_per_iteration": 2.643472909927368 }, { "auxiliary_loss_clip": 0.06607416, "auxiliary_loss_mlp": 0.01296142, "balance_loss_clip": 0.06307444, "balance_loss_mlp": 0.01268259, "epoch": 0.15499774537802496, "flos": 25418162931840.0, "grad_norm": 2.3908305752114267, "language_loss": 0.77065587, "learning_rate": 3.8383265888715525e-06, "loss": 0.84969151, "num_input_tokens_seen": 55973120, "router_z_loss_clip": 3.00195312, "router_z_loss_mlp": 0.27868652, "step": 2578, "time_per_iteration": 2.6193366050720215 }, { "auxiliary_loss_clip": 0.06628486, "auxiliary_loss_mlp": 0.01285627, "balance_loss_clip": 0.0632163, "balance_loss_mlp": 0.01256517, "epoch": 0.15505786863069293, "flos": 22098213584640.0, "grad_norm": 1.923105551105221, "language_loss": 0.83889723, "learning_rate": 3.83817315414411e-06, "loss": 0.91803843, "num_input_tokens_seen": 55993260, "router_z_loss_clip": 3.06640625, "router_z_loss_mlp": 0.29125977, "step": 2579, "time_per_iteration": 2.6148979663848877 }, { "auxiliary_loss_clip": 0.06607211, "auxiliary_loss_mlp": 0.01288645, "balance_loss_clip": 0.06306189, "balance_loss_mlp": 0.01258891, "epoch": 0.1551179918833609, "flos": 18923223000960.0, "grad_norm": 2.0295897807918273, "language_loss": 0.81469512, "learning_rate": 3.838019649712958e-06, "loss": 0.89365363, "num_input_tokens_seen": 56012130, "router_z_loss_clip": 3.00976562, "router_z_loss_mlp": 0.29772949, "step": 2580, "time_per_iteration": 2.5842509269714355 }, { "auxiliary_loss_clip": 0.06476809, "auxiliary_loss_mlp": 0.01260819, "balance_loss_clip": 0.0630457, "balance_loss_mlp": 0.01251235, "epoch": 0.15517811513602886, "flos": 66259281530880.0, "grad_norm": 0.8176812862551616, "language_loss": 0.58951712, "learning_rate": 3.8378660755839166e-06, "loss": 0.66689348, "num_input_tokens_seen": 56079045, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.09570312, "step": 2581, "time_per_iteration": 3.3203601837158203 }, { "auxiliary_loss_clip": 0.06607162, "auxiliary_loss_mlp": 0.01284086, "balance_loss_clip": 0.06302726, "balance_loss_mlp": 0.01256716, "epoch": 0.15523823838869683, "flos": 24027344536320.0, "grad_norm": 2.1965171579332656, "language_loss": 0.85806131, "learning_rate": 3.8377124317628095e-06, "loss": 0.93697375, "num_input_tokens_seen": 56098745, "router_z_loss_clip": 3.04296875, "router_z_loss_mlp": 0.27392578, "step": 2582, "time_per_iteration": 2.6113739013671875 }, { "auxiliary_loss_clip": 0.06615459, "auxiliary_loss_mlp": 0.01289136, "balance_loss_clip": 0.06309454, "balance_loss_mlp": 0.01258166, "epoch": 0.1552983616413648, "flos": 20491256782080.0, "grad_norm": 1.9524764164692456, "language_loss": 0.79259545, "learning_rate": 3.8375587182554625e-06, "loss": 0.8716414, "num_input_tokens_seen": 56117655, "router_z_loss_clip": 3.0625, "router_z_loss_mlp": 0.30957031, "step": 2583, "time_per_iteration": 4.066518545150757 }, { "auxiliary_loss_clip": 0.06601977, "auxiliary_loss_mlp": 0.01286319, "balance_loss_clip": 0.06300838, "balance_loss_mlp": 0.01257709, "epoch": 0.15535848489403276, "flos": 32131798819200.0, "grad_norm": 1.997036030323782, "language_loss": 0.77037442, "learning_rate": 3.837404935067705e-06, "loss": 0.84925735, "num_input_tokens_seen": 56141960, "router_z_loss_clip": 3.01367188, "router_z_loss_mlp": 0.28613281, "step": 2584, "time_per_iteration": 2.708723545074463 }, { "auxiliary_loss_clip": 0.06606479, "auxiliary_loss_mlp": 0.01284767, "balance_loss_clip": 0.06303644, "balance_loss_mlp": 0.0125649, "epoch": 0.15541860814670075, "flos": 19104379528320.0, "grad_norm": 1.910031721693465, "language_loss": 0.76853824, "learning_rate": 3.837251082205368e-06, "loss": 0.84745073, "num_input_tokens_seen": 56161430, "router_z_loss_clip": 3.02734375, "router_z_loss_mlp": 0.28283691, "step": 2585, "time_per_iteration": 2.630079507827759 }, { "auxiliary_loss_clip": 0.06591938, "auxiliary_loss_mlp": 0.01282491, "balance_loss_clip": 0.06297951, "balance_loss_mlp": 0.01253618, "epoch": 0.1554787313993687, "flos": 19178158648320.0, "grad_norm": 2.4971840348336, "language_loss": 0.62490731, "learning_rate": 3.837097159674286e-06, "loss": 0.70365167, "num_input_tokens_seen": 56179390, "router_z_loss_clip": 2.94140625, "router_z_loss_mlp": 0.28857422, "step": 2586, "time_per_iteration": 2.5668046474456787 }, { "auxiliary_loss_clip": 0.06609427, "auxiliary_loss_mlp": 0.01284316, "balance_loss_clip": 0.06308058, "balance_loss_mlp": 0.01255456, "epoch": 0.15553885465203668, "flos": 16149384639360.0, "grad_norm": 1.694625738769222, "language_loss": 0.82126498, "learning_rate": 3.836943167480296e-06, "loss": 0.90020245, "num_input_tokens_seen": 56198020, "router_z_loss_clip": 3.01757812, "router_z_loss_mlp": 0.28881836, "step": 2587, "time_per_iteration": 4.053009510040283 }, { "auxiliary_loss_clip": 0.06612436, "auxiliary_loss_mlp": 0.01291481, "balance_loss_clip": 0.06307032, "balance_loss_mlp": 0.01260535, "epoch": 0.15559897790470464, "flos": 25344803082240.0, "grad_norm": 1.9308607715982444, "language_loss": 0.89902598, "learning_rate": 3.836789105629236e-06, "loss": 0.97806513, "num_input_tokens_seen": 56218165, "router_z_loss_clip": 3.0546875, "router_z_loss_mlp": 0.30957031, "step": 2588, "time_per_iteration": 2.6123545169830322 }, { "auxiliary_loss_clip": 0.06600653, "auxiliary_loss_mlp": 0.01289223, "balance_loss_clip": 0.06305009, "balance_loss_mlp": 0.01260661, "epoch": 0.1556591011573726, "flos": 23155453676160.0, "grad_norm": 13.25056664159663, "language_loss": 0.65842021, "learning_rate": 3.83663497412695e-06, "loss": 0.73731899, "num_input_tokens_seen": 56237160, "router_z_loss_clip": 2.953125, "router_z_loss_mlp": 0.28564453, "step": 2589, "time_per_iteration": 2.592397451400757 }, { "auxiliary_loss_clip": 0.06601107, "auxiliary_loss_mlp": 0.01289664, "balance_loss_clip": 0.06303181, "balance_loss_mlp": 0.01261244, "epoch": 0.15571922441004057, "flos": 25377353193600.0, "grad_norm": 2.075074516255277, "language_loss": 0.83965498, "learning_rate": 3.836480772979281e-06, "loss": 0.91856271, "num_input_tokens_seen": 56257610, "router_z_loss_clip": 2.97851562, "router_z_loss_mlp": 0.28417969, "step": 2590, "time_per_iteration": 4.096376180648804 }, { "auxiliary_loss_clip": 0.06613502, "auxiliary_loss_mlp": 0.01285646, "balance_loss_clip": 0.06312703, "balance_loss_mlp": 0.01258228, "epoch": 0.15577934766270854, "flos": 14506565489280.0, "grad_norm": 2.1520897326080966, "language_loss": 0.81088769, "learning_rate": 3.836326502192077e-06, "loss": 0.88987917, "num_input_tokens_seen": 56275215, "router_z_loss_clip": 3.00976562, "router_z_loss_mlp": 0.27416992, "step": 2591, "time_per_iteration": 4.04630446434021 }, { "auxiliary_loss_clip": 0.06606337, "auxiliary_loss_mlp": 0.01290523, "balance_loss_clip": 0.06311424, "balance_loss_mlp": 0.01263654, "epoch": 0.15583947091537653, "flos": 37423575573120.0, "grad_norm": 2.5126534480198472, "language_loss": 0.65953201, "learning_rate": 3.836172161771189e-06, "loss": 0.7385006, "num_input_tokens_seen": 56297130, "router_z_loss_clip": 2.95117188, "router_z_loss_mlp": 0.26867676, "step": 2592, "time_per_iteration": 2.7298388481140137 }, { "auxiliary_loss_clip": 0.06616622, "auxiliary_loss_mlp": 0.01288004, "balance_loss_clip": 0.06315108, "balance_loss_mlp": 0.01258559, "epoch": 0.1558995941680445, "flos": 21841097731200.0, "grad_norm": 2.688123943229366, "language_loss": 0.83136523, "learning_rate": 3.836017751722467e-06, "loss": 0.91041148, "num_input_tokens_seen": 56314995, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.29431152, "step": 2593, "time_per_iteration": 2.584350347518921 }, { "auxiliary_loss_clip": 0.06611862, "auxiliary_loss_mlp": 0.0128583, "balance_loss_clip": 0.06319325, "balance_loss_mlp": 0.01256838, "epoch": 0.15595971742071246, "flos": 19798845367680.0, "grad_norm": 2.154484277291066, "language_loss": 0.74194026, "learning_rate": 3.8358632720517695e-06, "loss": 0.82091713, "num_input_tokens_seen": 56334005, "router_z_loss_clip": 2.92773438, "router_z_loss_mlp": 0.28967285, "step": 2594, "time_per_iteration": 2.5702009201049805 }, { "auxiliary_loss_clip": 0.06598504, "auxiliary_loss_mlp": 0.01286893, "balance_loss_clip": 0.06309351, "balance_loss_mlp": 0.01258855, "epoch": 0.15601984067338043, "flos": 26729038932480.0, "grad_norm": 2.111665855261263, "language_loss": 0.83203411, "learning_rate": 3.835708722764952e-06, "loss": 0.91088808, "num_input_tokens_seen": 56353795, "router_z_loss_clip": 2.89453125, "router_z_loss_mlp": 0.28051758, "step": 2595, "time_per_iteration": 2.609630823135376 }, { "auxiliary_loss_clip": 0.06610011, "auxiliary_loss_mlp": 0.01286373, "balance_loss_clip": 0.06311941, "balance_loss_mlp": 0.0125843, "epoch": 0.1560799639260484, "flos": 18375183371520.0, "grad_norm": 2.596774846987677, "language_loss": 0.87947893, "learning_rate": 3.835554103867876e-06, "loss": 0.95844275, "num_input_tokens_seen": 56373195, "router_z_loss_clip": 2.98242188, "router_z_loss_mlp": 0.27954102, "step": 2596, "time_per_iteration": 2.567917823791504 }, { "auxiliary_loss_clip": 0.06600852, "auxiliary_loss_mlp": 0.01288282, "balance_loss_clip": 0.06310678, "balance_loss_mlp": 0.01261067, "epoch": 0.15614008717871636, "flos": 22605149986560.0, "grad_norm": 1.6801767301139348, "language_loss": 0.69294024, "learning_rate": 3.835399415366404e-06, "loss": 0.77183151, "num_input_tokens_seen": 56391525, "router_z_loss_clip": 2.90039062, "router_z_loss_mlp": 0.27185059, "step": 2597, "time_per_iteration": 2.5745768547058105 }, { "auxiliary_loss_clip": 0.06602786, "auxiliary_loss_mlp": 0.01285382, "balance_loss_clip": 0.0631384, "balance_loss_mlp": 0.01259096, "epoch": 0.15620021043138435, "flos": 22753379059200.0, "grad_norm": 1.681528841517423, "language_loss": 0.8052848, "learning_rate": 3.8352446572664035e-06, "loss": 0.88416648, "num_input_tokens_seen": 56410715, "router_z_loss_clip": 2.88476562, "router_z_loss_mlp": 0.26245117, "step": 2598, "time_per_iteration": 2.588578701019287 }, { "auxiliary_loss_clip": 0.06595258, "auxiliary_loss_mlp": 0.01286193, "balance_loss_clip": 0.06305841, "balance_loss_mlp": 0.01259598, "epoch": 0.15626033368405232, "flos": 13119897870720.0, "grad_norm": 2.8308698390467035, "language_loss": 0.83298475, "learning_rate": 3.8350898295737405e-06, "loss": 0.91179919, "num_input_tokens_seen": 56429170, "router_z_loss_clip": 2.89257812, "router_z_loss_mlp": 0.265625, "step": 2599, "time_per_iteration": 2.6035964488983154 }, { "auxiliary_loss_clip": 0.0661836, "auxiliary_loss_mlp": 0.01292576, "balance_loss_clip": 0.06314251, "balance_loss_mlp": 0.01263393, "epoch": 0.15632045693672028, "flos": 16477931698560.0, "grad_norm": 2.2460856194015895, "language_loss": 0.8231473, "learning_rate": 3.834934932294287e-06, "loss": 0.90225661, "num_input_tokens_seen": 56445685, "router_z_loss_clip": 3.0390625, "router_z_loss_mlp": 0.29138184, "step": 2600, "time_per_iteration": 2.586421012878418 }, { "auxiliary_loss_clip": 0.06612265, "auxiliary_loss_mlp": 0.01284988, "balance_loss_clip": 0.06314948, "balance_loss_mlp": 0.01257582, "epoch": 0.15638058018938825, "flos": 20856672437760.0, "grad_norm": 2.193610868988509, "language_loss": 0.8878442, "learning_rate": 3.834779965433917e-06, "loss": 0.96681678, "num_input_tokens_seen": 56465900, "router_z_loss_clip": 2.9765625, "router_z_loss_mlp": 0.27404785, "step": 2601, "time_per_iteration": 2.599134922027588 }, { "auxiliary_loss_clip": 0.06612127, "auxiliary_loss_mlp": 0.01290211, "balance_loss_clip": 0.06309108, "balance_loss_mlp": 0.01261291, "epoch": 0.1564407034420562, "flos": 21878762941440.0, "grad_norm": 2.0482921160199483, "language_loss": 0.79324281, "learning_rate": 3.834624928998508e-06, "loss": 0.87226623, "num_input_tokens_seen": 56485020, "router_z_loss_clip": 3.02929688, "router_z_loss_mlp": 0.2890625, "step": 2602, "time_per_iteration": 2.597504138946533 }, { "auxiliary_loss_clip": 0.06599724, "auxiliary_loss_mlp": 0.0128451, "balance_loss_clip": 0.06303525, "balance_loss_mlp": 0.01256579, "epoch": 0.15650082669472418, "flos": 21840888096000.0, "grad_norm": 1.9397148618112643, "language_loss": 0.74677926, "learning_rate": 3.8344698229939376e-06, "loss": 0.82562155, "num_input_tokens_seen": 56505205, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 0.27929688, "step": 2603, "time_per_iteration": 2.601565361022949 }, { "auxiliary_loss_clip": 0.06601485, "auxiliary_loss_mlp": 0.0128634, "balance_loss_clip": 0.06306989, "balance_loss_mlp": 0.01259279, "epoch": 0.15656094994739214, "flos": 13804343147520.0, "grad_norm": 3.758669867458869, "language_loss": 0.88674307, "learning_rate": 3.8343146474260865e-06, "loss": 0.96562135, "num_input_tokens_seen": 56521495, "router_z_loss_clip": 2.94726562, "router_z_loss_mlp": 0.27050781, "step": 2604, "time_per_iteration": 2.587618350982666 }, { "auxiliary_loss_clip": 0.06608459, "auxiliary_loss_mlp": 0.01281721, "balance_loss_clip": 0.06307901, "balance_loss_mlp": 0.01255817, "epoch": 0.15662107320006013, "flos": 27315582312960.0, "grad_norm": 2.168002343486017, "language_loss": 0.86519945, "learning_rate": 3.834159402300841e-06, "loss": 0.94410127, "num_input_tokens_seen": 56540665, "router_z_loss_clip": 3.00976562, "router_z_loss_mlp": 0.25915527, "step": 2605, "time_per_iteration": 2.6343142986297607 }, { "auxiliary_loss_clip": 0.06615131, "auxiliary_loss_mlp": 0.01285234, "balance_loss_clip": 0.06308685, "balance_loss_mlp": 0.01256933, "epoch": 0.1566811964527281, "flos": 26691876846720.0, "grad_norm": 2.9532310924219454, "language_loss": 0.7450707, "learning_rate": 3.834004087624087e-06, "loss": 0.82407439, "num_input_tokens_seen": 56560805, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.28271484, "step": 2606, "time_per_iteration": 2.658179759979248 }, { "auxiliary_loss_clip": 0.06596589, "auxiliary_loss_mlp": 0.0129174, "balance_loss_clip": 0.06304587, "balance_loss_mlp": 0.01264668, "epoch": 0.15674131970539606, "flos": 16108323338880.0, "grad_norm": 2.10121338898336, "language_loss": 0.78179371, "learning_rate": 3.8338487034017145e-06, "loss": 0.86067694, "num_input_tokens_seen": 56576335, "router_z_loss_clip": 2.91796875, "router_z_loss_mlp": 0.27087402, "step": 2607, "time_per_iteration": 2.5696003437042236 }, { "auxiliary_loss_clip": 0.06600057, "auxiliary_loss_mlp": 0.01288439, "balance_loss_clip": 0.06308938, "balance_loss_mlp": 0.01261939, "epoch": 0.15680144295806403, "flos": 19175349536640.0, "grad_norm": 2.0266481004499237, "language_loss": 0.82862294, "learning_rate": 3.833693249639615e-06, "loss": 0.90750796, "num_input_tokens_seen": 56595880, "router_z_loss_clip": 2.9140625, "router_z_loss_mlp": 0.26501465, "step": 2608, "time_per_iteration": 2.5881803035736084 }, { "auxiliary_loss_clip": 0.06615664, "auxiliary_loss_mlp": 0.01292118, "balance_loss_clip": 0.06313608, "balance_loss_mlp": 0.01261934, "epoch": 0.156861566210732, "flos": 20819678060160.0, "grad_norm": 1.7582826250066406, "language_loss": 0.73453629, "learning_rate": 3.833537726343684e-06, "loss": 0.81361413, "num_input_tokens_seen": 56615130, "router_z_loss_clip": 3.01953125, "router_z_loss_mlp": 0.30175781, "step": 2609, "time_per_iteration": 2.573194742202759 }, { "auxiliary_loss_clip": 0.0661141, "auxiliary_loss_mlp": 0.01288218, "balance_loss_clip": 0.06310086, "balance_loss_mlp": 0.01260824, "epoch": 0.15692168946339996, "flos": 20054158358400.0, "grad_norm": 1.9684913819470338, "language_loss": 0.73417795, "learning_rate": 3.833382133519818e-06, "loss": 0.81317425, "num_input_tokens_seen": 56634005, "router_z_loss_clip": 3.01367188, "router_z_loss_mlp": 0.27404785, "step": 2610, "time_per_iteration": 2.5797505378723145 }, { "auxiliary_loss_clip": 0.06606954, "auxiliary_loss_mlp": 0.01288021, "balance_loss_clip": 0.06304321, "balance_loss_mlp": 0.01258862, "epoch": 0.15698181271606793, "flos": 21404502432000.0, "grad_norm": 2.139093504429537, "language_loss": 0.73635983, "learning_rate": 3.833226471173919e-06, "loss": 0.81530958, "num_input_tokens_seen": 56653480, "router_z_loss_clip": 3.02734375, "router_z_loss_mlp": 0.29162598, "step": 2611, "time_per_iteration": 2.578779458999634 }, { "auxiliary_loss_clip": 0.06598929, "auxiliary_loss_mlp": 0.01288969, "balance_loss_clip": 0.06305359, "balance_loss_mlp": 0.01262481, "epoch": 0.15704193596873592, "flos": 20851347703680.0, "grad_norm": 2.0563188604071407, "language_loss": 0.7152831, "learning_rate": 3.833070739311887e-06, "loss": 0.79416209, "num_input_tokens_seen": 56672270, "router_z_loss_clip": 2.93554688, "router_z_loss_mlp": 0.26464844, "step": 2612, "time_per_iteration": 2.602477788925171 }, { "auxiliary_loss_clip": 0.06605162, "auxiliary_loss_mlp": 0.01284714, "balance_loss_clip": 0.06307212, "balance_loss_mlp": 0.01258465, "epoch": 0.15710205922140388, "flos": 21769456890240.0, "grad_norm": 1.8613188171854886, "language_loss": 0.77016759, "learning_rate": 3.83291493793963e-06, "loss": 0.84906638, "num_input_tokens_seen": 56691510, "router_z_loss_clip": 2.97851562, "router_z_loss_mlp": 0.2623291, "step": 2613, "time_per_iteration": 2.608349323272705 }, { "auxiliary_loss_clip": 0.0660708, "auxiliary_loss_mlp": 0.01289553, "balance_loss_clip": 0.06307974, "balance_loss_mlp": 0.01262993, "epoch": 0.15716218247407185, "flos": 25014453160320.0, "grad_norm": 2.6151515527903375, "language_loss": 0.67192137, "learning_rate": 3.832759067063055e-06, "loss": 0.75088769, "num_input_tokens_seen": 56712230, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.265625, "step": 2614, "time_per_iteration": 2.6356890201568604 }, { "auxiliary_loss_clip": 0.06614471, "auxiliary_loss_mlp": 0.01288984, "balance_loss_clip": 0.0631128, "balance_loss_mlp": 0.01261804, "epoch": 0.1572223057267398, "flos": 20197691602560.0, "grad_norm": 2.05500197143393, "language_loss": 0.76132077, "learning_rate": 3.832603126688072e-06, "loss": 0.8403554, "num_input_tokens_seen": 56727490, "router_z_loss_clip": 3.03125, "router_z_loss_mlp": 0.27197266, "step": 2615, "time_per_iteration": 2.574708938598633 }, { "auxiliary_loss_clip": 0.0659243, "auxiliary_loss_mlp": 0.01287058, "balance_loss_clip": 0.06304538, "balance_loss_mlp": 0.01261606, "epoch": 0.15728242897940778, "flos": 20965810780800.0, "grad_norm": 2.2144020003925795, "language_loss": 0.74080521, "learning_rate": 3.832447116820594e-06, "loss": 0.81960011, "num_input_tokens_seen": 56747385, "router_z_loss_clip": 2.88085938, "router_z_loss_mlp": 0.25415039, "step": 2616, "time_per_iteration": 2.6108505725860596 }, { "auxiliary_loss_clip": 0.06601946, "auxiliary_loss_mlp": 0.01282421, "balance_loss_clip": 0.0630438, "balance_loss_mlp": 0.01256553, "epoch": 0.15734255223207574, "flos": 23044764032640.0, "grad_norm": 3.637948852941385, "language_loss": 0.72630799, "learning_rate": 3.832291037466539e-06, "loss": 0.8051517, "num_input_tokens_seen": 56768055, "router_z_loss_clip": 2.9765625, "router_z_loss_mlp": 0.25878906, "step": 2617, "time_per_iteration": 2.6034765243530273 }, { "auxiliary_loss_clip": 0.06599738, "auxiliary_loss_mlp": 0.01284584, "balance_loss_clip": 0.06308369, "balance_loss_mlp": 0.01257738, "epoch": 0.15740267548474374, "flos": 20556357004800.0, "grad_norm": 2.168529641923418, "language_loss": 0.75329578, "learning_rate": 3.8321348886318235e-06, "loss": 0.83213902, "num_input_tokens_seen": 56785110, "router_z_loss_clip": 2.9140625, "router_z_loss_mlp": 0.26843262, "step": 2618, "time_per_iteration": 2.576533555984497 }, { "auxiliary_loss_clip": 0.06614328, "auxiliary_loss_mlp": 0.01284951, "balance_loss_clip": 0.06308326, "balance_loss_mlp": 0.01256782, "epoch": 0.1574627987374117, "flos": 22672262707200.0, "grad_norm": 3.5306780179969395, "language_loss": 0.79201108, "learning_rate": 3.8319786703223695e-06, "loss": 0.87100387, "num_input_tokens_seen": 56804975, "router_z_loss_clip": 3.05859375, "router_z_loss_mlp": 0.28186035, "step": 2619, "time_per_iteration": 2.597569227218628 }, { "auxiliary_loss_clip": 0.06600776, "auxiliary_loss_mlp": 0.01281865, "balance_loss_clip": 0.06308819, "balance_loss_mlp": 0.01256259, "epoch": 0.15752292199007967, "flos": 16806352976640.0, "grad_norm": 2.596832886276353, "language_loss": 0.77486944, "learning_rate": 3.831822382544101e-06, "loss": 0.85369593, "num_input_tokens_seen": 56822470, "router_z_loss_clip": 2.91796875, "router_z_loss_mlp": 0.25622559, "step": 2620, "time_per_iteration": 2.5771632194519043 }, { "auxiliary_loss_clip": 0.06608981, "auxiliary_loss_mlp": 0.01291029, "balance_loss_clip": 0.06312708, "balance_loss_mlp": 0.01264076, "epoch": 0.15758304524274763, "flos": 29833856121600.0, "grad_norm": 2.419629495884251, "language_loss": 0.72516483, "learning_rate": 3.831666025302944e-06, "loss": 0.80416495, "num_input_tokens_seen": 56842100, "router_z_loss_clip": 2.96679688, "router_z_loss_mlp": 0.26953125, "step": 2621, "time_per_iteration": 2.6913509368896484 }, { "auxiliary_loss_clip": 0.06607518, "auxiliary_loss_mlp": 0.01284948, "balance_loss_clip": 0.06308022, "balance_loss_mlp": 0.01257244, "epoch": 0.1576431684954156, "flos": 53589116851200.0, "grad_norm": 30.46721674945661, "language_loss": 0.73422027, "learning_rate": 3.831509598604828e-06, "loss": 0.81314492, "num_input_tokens_seen": 56865920, "router_z_loss_clip": 2.99804688, "router_z_loss_mlp": 0.27734375, "step": 2622, "time_per_iteration": 2.8835372924804688 }, { "auxiliary_loss_clip": 0.06589037, "auxiliary_loss_mlp": 0.01284443, "balance_loss_clip": 0.06299748, "balance_loss_mlp": 0.0125898, "epoch": 0.15770329174808356, "flos": 20819887695360.0, "grad_norm": 2.0142478774850763, "language_loss": 0.88264894, "learning_rate": 3.831353102455684e-06, "loss": 0.9613837, "num_input_tokens_seen": 56885265, "router_z_loss_clip": 2.89257812, "router_z_loss_mlp": 0.25476074, "step": 2623, "time_per_iteration": 4.024603843688965 }, { "auxiliary_loss_clip": 0.0659555, "auxiliary_loss_mlp": 0.01282865, "balance_loss_clip": 0.06305721, "balance_loss_mlp": 0.01255924, "epoch": 0.15776341500075153, "flos": 24981148362240.0, "grad_norm": 1.7654900658550492, "language_loss": 0.82261729, "learning_rate": 3.831196536861448e-06, "loss": 0.90140146, "num_input_tokens_seen": 56906710, "router_z_loss_clip": 2.90429688, "router_z_loss_mlp": 0.26928711, "step": 2624, "time_per_iteration": 2.62631893157959 }, { "auxiliary_loss_clip": 0.06605396, "auxiliary_loss_mlp": 0.01292276, "balance_loss_clip": 0.06304979, "balance_loss_mlp": 0.01264559, "epoch": 0.15782353825341952, "flos": 21914331799680.0, "grad_norm": 2.3444008718957097, "language_loss": 0.81506079, "learning_rate": 3.831039901828054e-06, "loss": 0.89403754, "num_input_tokens_seen": 56924275, "router_z_loss_clip": 3.00390625, "router_z_loss_mlp": 0.27722168, "step": 2625, "time_per_iteration": 2.57684326171875 }, { "auxiliary_loss_clip": 0.06600584, "auxiliary_loss_mlp": 0.01291737, "balance_loss_clip": 0.06305356, "balance_loss_mlp": 0.01265022, "epoch": 0.15788366150608749, "flos": 26184395393280.0, "grad_norm": 2.6177821981311356, "language_loss": 0.80802536, "learning_rate": 3.830883197361445e-06, "loss": 0.88694859, "num_input_tokens_seen": 56941525, "router_z_loss_clip": 2.953125, "router_z_loss_mlp": 0.26721191, "step": 2626, "time_per_iteration": 4.021886110305786 }, { "auxiliary_loss_clip": 0.06597294, "auxiliary_loss_mlp": 0.012914, "balance_loss_clip": 0.06306295, "balance_loss_mlp": 0.01264781, "epoch": 0.15794378475875545, "flos": 27717321513600.0, "grad_norm": 1.671946479754039, "language_loss": 0.74176764, "learning_rate": 3.830726423467561e-06, "loss": 0.82065457, "num_input_tokens_seen": 56962145, "router_z_loss_clip": 2.91210938, "router_z_loss_mlp": 0.26611328, "step": 2627, "time_per_iteration": 2.6622495651245117 }, { "auxiliary_loss_clip": 0.06597044, "auxiliary_loss_mlp": 0.01294725, "balance_loss_clip": 0.06301828, "balance_loss_mlp": 0.01266591, "epoch": 0.15800390801142342, "flos": 12135011379840.0, "grad_norm": 3.499265524666764, "language_loss": 0.86503458, "learning_rate": 3.830569580152348e-06, "loss": 0.94395226, "num_input_tokens_seen": 56977505, "router_z_loss_clip": 2.95117188, "router_z_loss_mlp": 0.28100586, "step": 2628, "time_per_iteration": 2.6143949031829834 }, { "auxiliary_loss_clip": 0.06589276, "auxiliary_loss_mlp": 0.01280716, "balance_loss_clip": 0.06298569, "balance_loss_mlp": 0.01254454, "epoch": 0.15806403126409138, "flos": 20711084768640.0, "grad_norm": 1.9997755171928104, "language_loss": 0.7784816, "learning_rate": 3.830412667421752e-06, "loss": 0.85718149, "num_input_tokens_seen": 56996770, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.26269531, "step": 2629, "time_per_iteration": 2.595682144165039 }, { "auxiliary_loss_clip": 0.06598566, "auxiliary_loss_mlp": 0.01286265, "balance_loss_clip": 0.0630328, "balance_loss_mlp": 0.01258537, "epoch": 0.15812415451675935, "flos": 17827479158400.0, "grad_norm": 2.7835221120808766, "language_loss": 0.7499122, "learning_rate": 3.8302556852817245e-06, "loss": 0.8287605, "num_input_tokens_seen": 57014970, "router_z_loss_clip": 2.94921875, "router_z_loss_mlp": 0.27746582, "step": 2630, "time_per_iteration": 4.025224447250366 }, { "auxiliary_loss_clip": 0.06601042, "auxiliary_loss_mlp": 0.01288643, "balance_loss_clip": 0.06304196, "balance_loss_mlp": 0.01260534, "epoch": 0.15818427776942734, "flos": 20090230341120.0, "grad_norm": 2.2890088693368043, "language_loss": 0.85086977, "learning_rate": 3.8300986337382184e-06, "loss": 0.9297666, "num_input_tokens_seen": 57034045, "router_z_loss_clip": 2.97070312, "router_z_loss_mlp": 0.28125, "step": 2631, "time_per_iteration": 4.016499280929565 }, { "auxiliary_loss_clip": 0.06599826, "auxiliary_loss_mlp": 0.0128481, "balance_loss_clip": 0.06305894, "balance_loss_mlp": 0.01258012, "epoch": 0.1582444010220953, "flos": 21221249552640.0, "grad_norm": 1.9372099099909872, "language_loss": 0.79732352, "learning_rate": 3.8299415127971895e-06, "loss": 0.87616986, "num_input_tokens_seen": 57053695, "router_z_loss_clip": 2.94140625, "router_z_loss_mlp": 0.26855469, "step": 2632, "time_per_iteration": 2.573352098464966 }, { "auxiliary_loss_clip": 0.06607036, "auxiliary_loss_mlp": 0.0128427, "balance_loss_clip": 0.0631061, "balance_loss_mlp": 0.01258151, "epoch": 0.15830452427476327, "flos": 17864138119680.0, "grad_norm": 2.9133543038059786, "language_loss": 0.83861303, "learning_rate": 3.829784322464594e-06, "loss": 0.91752607, "num_input_tokens_seen": 57071290, "router_z_loss_clip": 2.96679688, "router_z_loss_mlp": 0.2611084, "step": 2633, "time_per_iteration": 2.579097032546997 }, { "auxiliary_loss_clip": 0.06607216, "auxiliary_loss_mlp": 0.01287507, "balance_loss_clip": 0.06310122, "balance_loss_mlp": 0.01261782, "epoch": 0.15836464752743123, "flos": 24541827805440.0, "grad_norm": 1.637233030086304, "language_loss": 0.77652597, "learning_rate": 3.829627062746394e-06, "loss": 0.85547316, "num_input_tokens_seen": 57091465, "router_z_loss_clip": 2.97070312, "router_z_loss_mlp": 0.25756836, "step": 2634, "time_per_iteration": 2.6106808185577393 }, { "auxiliary_loss_clip": 0.06607287, "auxiliary_loss_mlp": 0.01288889, "balance_loss_clip": 0.06309836, "balance_loss_mlp": 0.01261828, "epoch": 0.1584247707800992, "flos": 20127057010560.0, "grad_norm": 2.4087404625288493, "language_loss": 0.90018535, "learning_rate": 3.829469733648552e-06, "loss": 0.9791472, "num_input_tokens_seen": 57110075, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.27075195, "step": 2635, "time_per_iteration": 2.5701229572296143 }, { "auxiliary_loss_clip": 0.06607021, "auxiliary_loss_mlp": 0.01284173, "balance_loss_clip": 0.06307556, "balance_loss_mlp": 0.0125747, "epoch": 0.15848489403276717, "flos": 20382202293120.0, "grad_norm": 5.018179728789676, "language_loss": 0.76388437, "learning_rate": 3.829312335177034e-06, "loss": 0.84279633, "num_input_tokens_seen": 57128945, "router_z_loss_clip": 2.9921875, "router_z_loss_mlp": 0.26733398, "step": 2636, "time_per_iteration": 2.572906494140625 }, { "auxiliary_loss_clip": 0.06609698, "auxiliary_loss_mlp": 0.01288687, "balance_loss_clip": 0.06308319, "balance_loss_mlp": 0.01262068, "epoch": 0.15854501728543513, "flos": 39356018760960.0, "grad_norm": 2.9673464576147506, "language_loss": 0.73551261, "learning_rate": 3.82915486733781e-06, "loss": 0.81449652, "num_input_tokens_seen": 57152385, "router_z_loss_clip": 3.01367188, "router_z_loss_mlp": 0.26623535, "step": 2637, "time_per_iteration": 2.7447075843811035 }, { "auxiliary_loss_clip": 0.06597561, "auxiliary_loss_mlp": 0.01289165, "balance_loss_clip": 0.06306161, "balance_loss_mlp": 0.01264406, "epoch": 0.15860514053810312, "flos": 24871297259520.0, "grad_norm": 1.9462294074133928, "language_loss": 0.7898885, "learning_rate": 3.82899733013685e-06, "loss": 0.86875576, "num_input_tokens_seen": 57172620, "router_z_loss_clip": 2.91601562, "router_z_loss_mlp": 0.24731445, "step": 2638, "time_per_iteration": 2.6130805015563965 }, { "auxiliary_loss_clip": 0.06609805, "auxiliary_loss_mlp": 0.01282973, "balance_loss_clip": 0.06315001, "balance_loss_mlp": 0.01257677, "epoch": 0.1586652637907711, "flos": 26184982371840.0, "grad_norm": 5.654329397060169, "language_loss": 0.7632665, "learning_rate": 3.828839723580128e-06, "loss": 0.84219432, "num_input_tokens_seen": 57194680, "router_z_loss_clip": 2.94921875, "router_z_loss_mlp": 0.25305176, "step": 2639, "time_per_iteration": 2.637460231781006 }, { "auxiliary_loss_clip": 0.06615762, "auxiliary_loss_mlp": 0.01295171, "balance_loss_clip": 0.06319334, "balance_loss_mlp": 0.01268242, "epoch": 0.15872538704343905, "flos": 19798174535040.0, "grad_norm": 2.5205081692071545, "language_loss": 0.81972277, "learning_rate": 3.82868204767362e-06, "loss": 0.89883214, "num_input_tokens_seen": 57214675, "router_z_loss_clip": 2.96679688, "router_z_loss_mlp": 0.26940918, "step": 2640, "time_per_iteration": 2.602304220199585 }, { "auxiliary_loss_clip": 0.0660692, "auxiliary_loss_mlp": 0.01289673, "balance_loss_clip": 0.0631547, "balance_loss_mlp": 0.01262743, "epoch": 0.15878551029610702, "flos": 28482883142400.0, "grad_norm": 1.5701778245567146, "language_loss": 0.67516017, "learning_rate": 3.828524302423306e-06, "loss": 0.75412613, "num_input_tokens_seen": 57235830, "router_z_loss_clip": 2.91210938, "router_z_loss_mlp": 0.26928711, "step": 2641, "time_per_iteration": 2.6732048988342285 }, { "auxiliary_loss_clip": 0.06612749, "auxiliary_loss_mlp": 0.01289302, "balance_loss_clip": 0.06309782, "balance_loss_mlp": 0.01263636, "epoch": 0.15884563354877498, "flos": 24213532308480.0, "grad_norm": 2.788685270924063, "language_loss": 0.76549804, "learning_rate": 3.828366487835167e-06, "loss": 0.84451854, "num_input_tokens_seen": 57255970, "router_z_loss_clip": 3.02734375, "router_z_loss_mlp": 0.25646973, "step": 2642, "time_per_iteration": 2.622061014175415 }, { "auxiliary_loss_clip": 0.06604865, "auxiliary_loss_mlp": 0.01288872, "balance_loss_clip": 0.06314359, "balance_loss_mlp": 0.01263122, "epoch": 0.15890575680144295, "flos": 23956332600960.0, "grad_norm": 2.1894033299813485, "language_loss": 0.71357572, "learning_rate": 3.828208603915186e-06, "loss": 0.79251313, "num_input_tokens_seen": 57274435, "router_z_loss_clip": 2.90429688, "router_z_loss_mlp": 0.25756836, "step": 2643, "time_per_iteration": 2.6045260429382324 }, { "auxiliary_loss_clip": 0.06602301, "auxiliary_loss_mlp": 0.01284096, "balance_loss_clip": 0.06313421, "balance_loss_mlp": 0.01258204, "epoch": 0.15896588005411091, "flos": 21221375333760.0, "grad_norm": 2.027542131911063, "language_loss": 0.79203773, "learning_rate": 3.828050650669353e-06, "loss": 0.8709017, "num_input_tokens_seen": 57293115, "router_z_loss_clip": 2.890625, "router_z_loss_mlp": 0.2590332, "step": 2644, "time_per_iteration": 2.60225772857666 }, { "auxiliary_loss_clip": 0.06604774, "auxiliary_loss_mlp": 0.01295922, "balance_loss_clip": 0.0631158, "balance_loss_mlp": 0.01269362, "epoch": 0.1590260033067789, "flos": 24359203831680.0, "grad_norm": 1.922700392437205, "language_loss": 0.83090079, "learning_rate": 3.827892628103657e-06, "loss": 0.90990782, "num_input_tokens_seen": 57312565, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.26574707, "step": 2645, "time_per_iteration": 2.618866443634033 }, { "auxiliary_loss_clip": 0.06603596, "auxiliary_loss_mlp": 0.01295228, "balance_loss_clip": 0.06307568, "balance_loss_mlp": 0.01267846, "epoch": 0.15908612655944687, "flos": 32056719960960.0, "grad_norm": 2.635641739458814, "language_loss": 0.70324302, "learning_rate": 3.827734536224087e-06, "loss": 0.78223127, "num_input_tokens_seen": 57333360, "router_z_loss_clip": 2.95703125, "router_z_loss_mlp": 0.27380371, "step": 2646, "time_per_iteration": 2.6730308532714844 }, { "auxiliary_loss_clip": 0.06603046, "auxiliary_loss_mlp": 0.01292236, "balance_loss_clip": 0.06314952, "balance_loss_mlp": 0.01265831, "epoch": 0.15914624981211484, "flos": 17791155613440.0, "grad_norm": 2.3120345001476297, "language_loss": 0.63063896, "learning_rate": 3.827576375036642e-06, "loss": 0.70959175, "num_input_tokens_seen": 57350575, "router_z_loss_clip": 2.8828125, "router_z_loss_mlp": 0.26416016, "step": 2647, "time_per_iteration": 2.6658055782318115 }, { "auxiliary_loss_clip": 0.0660402, "auxiliary_loss_mlp": 0.01287348, "balance_loss_clip": 0.06314114, "balance_loss_mlp": 0.01260347, "epoch": 0.1592063730647828, "flos": 17718298888320.0, "grad_norm": 2.7505665052043544, "language_loss": 0.9035877, "learning_rate": 3.827418144547318e-06, "loss": 0.98250133, "num_input_tokens_seen": 57367570, "router_z_loss_clip": 2.8984375, "router_z_loss_mlp": 0.27001953, "step": 2648, "time_per_iteration": 2.6719377040863037 }, { "auxiliary_loss_clip": 0.06593539, "auxiliary_loss_mlp": 0.01289495, "balance_loss_clip": 0.06309247, "balance_loss_mlp": 0.01265558, "epoch": 0.15926649631745077, "flos": 18808927632000.0, "grad_norm": 1.8970872364868876, "language_loss": 0.92999208, "learning_rate": 3.827259844762114e-06, "loss": 1.00882244, "num_input_tokens_seen": 57383980, "router_z_loss_clip": 2.84765625, "router_z_loss_mlp": 0.23925781, "step": 2649, "time_per_iteration": 2.6212291717529297 }, { "auxiliary_loss_clip": 0.06615176, "auxiliary_loss_mlp": 0.01297932, "balance_loss_clip": 0.06305825, "balance_loss_mlp": 0.01270657, "epoch": 0.15932661957011873, "flos": 17571956532480.0, "grad_norm": 5.984966220086332, "language_loss": 0.71932703, "learning_rate": 3.827101475687033e-06, "loss": 0.7984581, "num_input_tokens_seen": 57400840, "router_z_loss_clip": 3.09375, "router_z_loss_mlp": 0.27319336, "step": 2650, "time_per_iteration": 2.5435845851898193 }, { "auxiliary_loss_clip": 0.06597764, "auxiliary_loss_mlp": 0.01285984, "balance_loss_clip": 0.06311691, "balance_loss_mlp": 0.01260604, "epoch": 0.15938674282278673, "flos": 13339432368000.0, "grad_norm": 2.0532649061067905, "language_loss": 0.72240865, "learning_rate": 3.826943037328082e-06, "loss": 0.80124611, "num_input_tokens_seen": 57419230, "router_z_loss_clip": 2.86328125, "router_z_loss_mlp": 0.25390625, "step": 2651, "time_per_iteration": 2.5733819007873535 }, { "auxiliary_loss_clip": 0.06598484, "auxiliary_loss_mlp": 0.01284771, "balance_loss_clip": 0.06304282, "balance_loss_mlp": 0.01258748, "epoch": 0.1594468660754547, "flos": 22494879613440.0, "grad_norm": 2.3790696802370186, "language_loss": 0.80672061, "learning_rate": 3.8267845296912674e-06, "loss": 0.88555318, "num_input_tokens_seen": 57439315, "router_z_loss_clip": 2.94726562, "router_z_loss_mlp": 0.26025391, "step": 2652, "time_per_iteration": 2.597522735595703 }, { "auxiliary_loss_clip": 0.06602211, "auxiliary_loss_mlp": 0.0127968, "balance_loss_clip": 0.06317028, "balance_loss_mlp": 0.01255206, "epoch": 0.15950698932812266, "flos": 15011782882560.0, "grad_norm": 2.903591254465774, "language_loss": 0.71566945, "learning_rate": 3.826625952782601e-06, "loss": 0.79448837, "num_input_tokens_seen": 57454635, "router_z_loss_clip": 2.8515625, "router_z_loss_mlp": 0.24487305, "step": 2653, "time_per_iteration": 2.567106246948242 }, { "auxiliary_loss_clip": 0.0660688, "auxiliary_loss_mlp": 0.01280421, "balance_loss_clip": 0.0631595, "balance_loss_mlp": 0.01256484, "epoch": 0.15956711258079062, "flos": 30163074013440.0, "grad_norm": 2.5698968064794703, "language_loss": 0.77850169, "learning_rate": 3.826467306608095e-06, "loss": 0.85737467, "num_input_tokens_seen": 57476805, "router_z_loss_clip": 2.90820312, "router_z_loss_mlp": 0.23950195, "step": 2654, "time_per_iteration": 2.6600024700164795 }, { "auxiliary_loss_clip": 0.06600739, "auxiliary_loss_mlp": 0.01289953, "balance_loss_clip": 0.06313183, "balance_loss_mlp": 0.01265992, "epoch": 0.1596272358334586, "flos": 21039044849280.0, "grad_norm": 1.9221175390168828, "language_loss": 0.82836902, "learning_rate": 3.826308591173765e-06, "loss": 0.90727592, "num_input_tokens_seen": 57496400, "router_z_loss_clip": 2.87695312, "router_z_loss_mlp": 0.23962402, "step": 2655, "time_per_iteration": 2.584836721420288 }, { "auxiliary_loss_clip": 0.06606472, "auxiliary_loss_mlp": 0.01287668, "balance_loss_clip": 0.06313139, "balance_loss_mlp": 0.01263039, "epoch": 0.15968735908612655, "flos": 15273426856320.0, "grad_norm": 1.9993118013960707, "language_loss": 0.74100256, "learning_rate": 3.826149806485631e-06, "loss": 0.81994396, "num_input_tokens_seen": 57513700, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.24645996, "step": 2656, "time_per_iteration": 2.568310499191284 }, { "auxiliary_loss_clip": 0.06600446, "auxiliary_loss_mlp": 0.01292246, "balance_loss_clip": 0.06319536, "balance_loss_mlp": 0.0126689, "epoch": 0.15974748233879452, "flos": 52677338647680.0, "grad_norm": 2.512148183831747, "language_loss": 0.78296876, "learning_rate": 3.825990952549713e-06, "loss": 0.86189568, "num_input_tokens_seen": 57536180, "router_z_loss_clip": 2.80664062, "router_z_loss_mlp": 0.25390625, "step": 2657, "time_per_iteration": 2.8613502979278564 }, { "auxiliary_loss_clip": 0.06609382, "auxiliary_loss_mlp": 0.01291869, "balance_loss_clip": 0.06319862, "balance_loss_mlp": 0.01266441, "epoch": 0.1598076055914625, "flos": 18739047726720.0, "grad_norm": 1.906714602916198, "language_loss": 0.75165701, "learning_rate": 3.825832029372035e-06, "loss": 0.83066952, "num_input_tokens_seen": 57555025, "router_z_loss_clip": 2.89257812, "router_z_loss_mlp": 0.25439453, "step": 2658, "time_per_iteration": 2.573993682861328 }, { "auxiliary_loss_clip": 0.06613521, "auxiliary_loss_mlp": 0.01293085, "balance_loss_clip": 0.0632284, "balance_loss_mlp": 0.01266215, "epoch": 0.15986772884413047, "flos": 34357681405440.0, "grad_norm": 1.9097663522269102, "language_loss": 0.7618072, "learning_rate": 3.825673036958624e-06, "loss": 0.84087336, "num_input_tokens_seen": 57577660, "router_z_loss_clip": 2.90429688, "router_z_loss_mlp": 0.26904297, "step": 2659, "time_per_iteration": 2.7070558071136475 }, { "auxiliary_loss_clip": 0.06606516, "auxiliary_loss_mlp": 0.01292931, "balance_loss_clip": 0.06313382, "balance_loss_mlp": 0.01267193, "epoch": 0.15992785209679844, "flos": 22061596550400.0, "grad_norm": 2.41166897856134, "language_loss": 0.91508836, "learning_rate": 3.825513975315508e-06, "loss": 0.99408281, "num_input_tokens_seen": 57596335, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.25732422, "step": 2660, "time_per_iteration": 2.602963447570801 }, { "auxiliary_loss_clip": 0.06608314, "auxiliary_loss_mlp": 0.01287385, "balance_loss_clip": 0.06316063, "balance_loss_mlp": 0.01261719, "epoch": 0.1599879753494664, "flos": 33073946928000.0, "grad_norm": 2.160976401922214, "language_loss": 0.78768754, "learning_rate": 3.82535484444872e-06, "loss": 0.8666445, "num_input_tokens_seen": 57616830, "router_z_loss_clip": 2.921875, "router_z_loss_mlp": 0.25671387, "step": 2661, "time_per_iteration": 2.7185232639312744 }, { "auxiliary_loss_clip": 0.0660533, "auxiliary_loss_mlp": 0.01290195, "balance_loss_clip": 0.0631523, "balance_loss_mlp": 0.01265078, "epoch": 0.16004809860213437, "flos": 28045533156480.0, "grad_norm": 2.096950988828906, "language_loss": 0.75260818, "learning_rate": 3.825195644364292e-06, "loss": 0.83156341, "num_input_tokens_seen": 57635515, "router_z_loss_clip": 2.90429688, "router_z_loss_mlp": 0.25134277, "step": 2662, "time_per_iteration": 2.68851900100708 }, { "auxiliary_loss_clip": 0.06612583, "auxiliary_loss_mlp": 0.01285972, "balance_loss_clip": 0.06319761, "balance_loss_mlp": 0.01261201, "epoch": 0.16010822185480234, "flos": 22786096878720.0, "grad_norm": 2.2103170448162475, "language_loss": 0.83222181, "learning_rate": 3.825036375068263e-06, "loss": 0.91120744, "num_input_tokens_seen": 57654250, "router_z_loss_clip": 2.92773438, "router_z_loss_mlp": 0.24768066, "step": 2663, "time_per_iteration": 3.992859125137329 }, { "auxiliary_loss_clip": 0.06611072, "auxiliary_loss_mlp": 0.0128294, "balance_loss_clip": 0.06320186, "balance_loss_mlp": 0.01258729, "epoch": 0.16016834510747033, "flos": 20090188414080.0, "grad_norm": 2.3188297371187905, "language_loss": 0.80646616, "learning_rate": 3.824877036566672e-06, "loss": 0.88540626, "num_input_tokens_seen": 57672645, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.2421875, "step": 2664, "time_per_iteration": 2.5853071212768555 }, { "auxiliary_loss_clip": 0.06613179, "auxiliary_loss_mlp": 0.01280751, "balance_loss_clip": 0.06324189, "balance_loss_mlp": 0.01255562, "epoch": 0.1602284683601383, "flos": 21179391638400.0, "grad_norm": 2.4968402813376676, "language_loss": 0.95183104, "learning_rate": 3.824717628865561e-06, "loss": 1.0307703, "num_input_tokens_seen": 57691055, "router_z_loss_clip": 2.890625, "router_z_loss_mlp": 0.2520752, "step": 2665, "time_per_iteration": 2.618492364883423 }, { "auxiliary_loss_clip": 0.0660919, "auxiliary_loss_mlp": 0.01281346, "balance_loss_clip": 0.06317276, "balance_loss_mlp": 0.01256193, "epoch": 0.16028859161280626, "flos": 14652823991040.0, "grad_norm": 2.0410611562047793, "language_loss": 0.85735929, "learning_rate": 3.824558151970974e-06, "loss": 0.93626463, "num_input_tokens_seen": 57707235, "router_z_loss_clip": 2.91992188, "router_z_loss_mlp": 0.25158691, "step": 2666, "time_per_iteration": 4.029683351516724 }, { "auxiliary_loss_clip": 0.06603447, "auxiliary_loss_mlp": 0.01282182, "balance_loss_clip": 0.06312859, "balance_loss_mlp": 0.01258793, "epoch": 0.16034871486547422, "flos": 20995677561600.0, "grad_norm": 2.14324319819121, "language_loss": 0.82283229, "learning_rate": 3.8243986058889595e-06, "loss": 0.90168858, "num_input_tokens_seen": 57724190, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.23400879, "step": 2667, "time_per_iteration": 2.7296810150146484 }, { "auxiliary_loss_clip": 0.06606573, "auxiliary_loss_mlp": 0.01284864, "balance_loss_clip": 0.06318572, "balance_loss_mlp": 0.01259758, "epoch": 0.1604088381181422, "flos": 21404167015680.0, "grad_norm": 2.554905867440798, "language_loss": 0.74526972, "learning_rate": 3.824238990625567e-06, "loss": 0.82418406, "num_input_tokens_seen": 57743620, "router_z_loss_clip": 2.87695312, "router_z_loss_mlp": 0.25097656, "step": 2668, "time_per_iteration": 2.6244282722473145 }, { "auxiliary_loss_clip": 0.066148, "auxiliary_loss_mlp": 0.01282708, "balance_loss_clip": 0.06327011, "balance_loss_mlp": 0.01257746, "epoch": 0.16046896137081015, "flos": 23883601656960.0, "grad_norm": 1.9695057388212966, "language_loss": 0.78236306, "learning_rate": 3.824079306186848e-06, "loss": 0.86133814, "num_input_tokens_seen": 57764810, "router_z_loss_clip": 2.87890625, "router_z_loss_mlp": 0.24951172, "step": 2669, "time_per_iteration": 2.6182546615600586 }, { "auxiliary_loss_clip": 0.06587729, "auxiliary_loss_mlp": 0.01285131, "balance_loss_clip": 0.06422175, "balance_loss_mlp": 0.01275588, "epoch": 0.16052908462347812, "flos": 59823907453440.0, "grad_norm": 0.7765244253756174, "language_loss": 0.55413258, "learning_rate": 3.823919552578861e-06, "loss": 0.63286114, "num_input_tokens_seen": 57824390, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.09527588, "step": 2670, "time_per_iteration": 6.029687404632568 }, { "auxiliary_loss_clip": 0.06616767, "auxiliary_loss_mlp": 0.01296858, "balance_loss_clip": 0.06321438, "balance_loss_mlp": 0.01271859, "epoch": 0.1605892078761461, "flos": 18302494354560.0, "grad_norm": 2.3146584032539756, "language_loss": 0.78887844, "learning_rate": 3.82375972980766e-06, "loss": 0.86801463, "num_input_tokens_seen": 57843665, "router_z_loss_clip": 2.953125, "router_z_loss_mlp": 0.25024414, "step": 2671, "time_per_iteration": 2.615837812423706 }, { "auxiliary_loss_clip": 0.06606413, "auxiliary_loss_mlp": 0.01287485, "balance_loss_clip": 0.06313349, "balance_loss_mlp": 0.01262463, "epoch": 0.16064933112881408, "flos": 32168918977920.0, "grad_norm": 2.5267067550306694, "language_loss": 0.65657914, "learning_rate": 3.8235998378793086e-06, "loss": 0.7355181, "num_input_tokens_seen": 57863305, "router_z_loss_clip": 2.9296875, "router_z_loss_mlp": 0.25012207, "step": 2672, "time_per_iteration": 2.6602303981781006 }, { "auxiliary_loss_clip": 0.06615108, "auxiliary_loss_mlp": 0.01294021, "balance_loss_clip": 0.0631806, "balance_loss_mlp": 0.01266257, "epoch": 0.16070945438148204, "flos": 19834959277440.0, "grad_norm": 2.2663105340238623, "language_loss": 0.86904883, "learning_rate": 3.8234398767998675e-06, "loss": 0.94814008, "num_input_tokens_seen": 57883025, "router_z_loss_clip": 2.97070312, "router_z_loss_mlp": 0.27770996, "step": 2673, "time_per_iteration": 2.62134051322937 }, { "auxiliary_loss_clip": 0.06609727, "auxiliary_loss_mlp": 0.01294562, "balance_loss_clip": 0.06316403, "balance_loss_mlp": 0.0126898, "epoch": 0.16076957763415, "flos": 18918569099520.0, "grad_norm": 2.2642260877383267, "language_loss": 0.73137546, "learning_rate": 3.823279846575403e-06, "loss": 0.81041837, "num_input_tokens_seen": 57901430, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.25598145, "step": 2674, "time_per_iteration": 2.586122989654541 }, { "auxiliary_loss_clip": 0.06599432, "auxiliary_loss_mlp": 0.01296039, "balance_loss_clip": 0.06309329, "balance_loss_mlp": 0.01269824, "epoch": 0.16082970088681797, "flos": 16770071358720.0, "grad_norm": 1.7021220935017196, "language_loss": 0.85203063, "learning_rate": 3.823119747211986e-06, "loss": 0.93098533, "num_input_tokens_seen": 57919550, "router_z_loss_clip": 2.90429688, "router_z_loss_mlp": 0.26208496, "step": 2675, "time_per_iteration": 2.5880439281463623 }, { "auxiliary_loss_clip": 0.06602027, "auxiliary_loss_mlp": 0.01300732, "balance_loss_clip": 0.06310204, "balance_loss_mlp": 0.0127478, "epoch": 0.16088982413948594, "flos": 35158560330240.0, "grad_norm": 2.9009509496581485, "language_loss": 0.83026576, "learning_rate": 3.822959578715685e-06, "loss": 0.90929335, "num_input_tokens_seen": 57939890, "router_z_loss_clip": 2.91992188, "router_z_loss_mlp": 0.25952148, "step": 2676, "time_per_iteration": 2.703472852706909 }, { "auxiliary_loss_clip": 0.06594877, "auxiliary_loss_mlp": 0.01286426, "balance_loss_clip": 0.06308959, "balance_loss_mlp": 0.01261881, "epoch": 0.1609499473921539, "flos": 18631125267840.0, "grad_norm": 2.077222370338648, "language_loss": 0.7417165, "learning_rate": 3.822799341092573e-06, "loss": 0.82052952, "num_input_tokens_seen": 57957410, "router_z_loss_clip": 2.859375, "router_z_loss_mlp": 0.24536133, "step": 2677, "time_per_iteration": 2.5770697593688965 }, { "auxiliary_loss_clip": 0.0659379, "auxiliary_loss_mlp": 0.01285389, "balance_loss_clip": 0.06304213, "balance_loss_mlp": 0.01261094, "epoch": 0.1610100706448219, "flos": 33154057031040.0, "grad_norm": 1.839087718987541, "language_loss": 0.77220982, "learning_rate": 3.822639034348728e-06, "loss": 0.85100162, "num_input_tokens_seen": 57977900, "router_z_loss_clip": 2.8984375, "router_z_loss_mlp": 0.24304199, "step": 2678, "time_per_iteration": 2.7260477542877197 }, { "auxiliary_loss_clip": 0.0659876, "auxiliary_loss_mlp": 0.01290526, "balance_loss_clip": 0.06305566, "balance_loss_mlp": 0.01265063, "epoch": 0.16107019389748986, "flos": 34685054507520.0, "grad_norm": 3.167366441860106, "language_loss": 0.71062309, "learning_rate": 3.822478658490228e-06, "loss": 0.78951597, "num_input_tokens_seen": 57998210, "router_z_loss_clip": 2.92773438, "router_z_loss_mlp": 0.25476074, "step": 2679, "time_per_iteration": 2.7236526012420654 }, { "auxiliary_loss_clip": 0.06494494, "auxiliary_loss_mlp": 0.01268273, "balance_loss_clip": 0.06329551, "balance_loss_mlp": 0.01259106, "epoch": 0.16113031715015783, "flos": 65730920411520.0, "grad_norm": 0.7580674261617011, "language_loss": 0.51771116, "learning_rate": 3.822318213523154e-06, "loss": 0.59533888, "num_input_tokens_seen": 58059420, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.0916748, "step": 2680, "time_per_iteration": 3.2725491523742676 }, { "auxiliary_loss_clip": 0.06597667, "auxiliary_loss_mlp": 0.01294976, "balance_loss_clip": 0.06301491, "balance_loss_mlp": 0.01266962, "epoch": 0.1611904404028258, "flos": 20816156188800.0, "grad_norm": 1.7445843429394678, "language_loss": 0.81152344, "learning_rate": 3.8221576994535925e-06, "loss": 0.89044988, "num_input_tokens_seen": 58078370, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 0.2800293, "step": 2681, "time_per_iteration": 2.6089513301849365 }, { "auxiliary_loss_clip": 0.0658503, "auxiliary_loss_mlp": 0.01288063, "balance_loss_clip": 0.06295653, "balance_loss_mlp": 0.01263566, "epoch": 0.16125056365549376, "flos": 27020172343680.0, "grad_norm": 2.1518584573498236, "language_loss": 0.70406246, "learning_rate": 3.821997116287627e-06, "loss": 0.7827934, "num_input_tokens_seen": 58097395, "router_z_loss_clip": 2.890625, "router_z_loss_mlp": 0.24487305, "step": 2682, "time_per_iteration": 2.6434500217437744 }, { "auxiliary_loss_clip": 0.06590522, "auxiliary_loss_mlp": 0.01292689, "balance_loss_clip": 0.06296895, "balance_loss_mlp": 0.01267226, "epoch": 0.16131068690816172, "flos": 19281762622080.0, "grad_norm": 2.2359697645143655, "language_loss": 0.88021636, "learning_rate": 3.821836464031348e-06, "loss": 0.95904851, "num_input_tokens_seen": 58115630, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.25476074, "step": 2683, "time_per_iteration": 2.577204704284668 }, { "auxiliary_loss_clip": 0.0659593, "auxiliary_loss_mlp": 0.01291464, "balance_loss_clip": 0.06304008, "balance_loss_mlp": 0.01267241, "epoch": 0.16137081016082971, "flos": 35347137943680.0, "grad_norm": 2.3783535642568108, "language_loss": 0.75241375, "learning_rate": 3.821675742690849e-06, "loss": 0.83128768, "num_input_tokens_seen": 58138655, "router_z_loss_clip": 2.91992188, "router_z_loss_mlp": 0.24243164, "step": 2684, "time_per_iteration": 2.706479787826538 }, { "auxiliary_loss_clip": 0.0659652, "auxiliary_loss_mlp": 0.01281478, "balance_loss_clip": 0.06298411, "balance_loss_mlp": 0.01255204, "epoch": 0.16143093341349768, "flos": 34242924839040.0, "grad_norm": 2.2046646919334614, "language_loss": 0.70677799, "learning_rate": 3.821514952272223e-06, "loss": 0.78555793, "num_input_tokens_seen": 58157440, "router_z_loss_clip": 2.98242188, "router_z_loss_mlp": 0.26293945, "step": 2685, "time_per_iteration": 2.695157289505005 }, { "auxiliary_loss_clip": 0.06584306, "auxiliary_loss_mlp": 0.01281802, "balance_loss_clip": 0.06297034, "balance_loss_mlp": 0.01258044, "epoch": 0.16149105666616564, "flos": 28006400499840.0, "grad_norm": 2.1386980717720947, "language_loss": 0.72488177, "learning_rate": 3.821354092781567e-06, "loss": 0.80354285, "num_input_tokens_seen": 58176660, "router_z_loss_clip": 2.86914062, "router_z_loss_mlp": 0.2376709, "step": 2686, "time_per_iteration": 2.619473695755005 }, { "auxiliary_loss_clip": 0.06593218, "auxiliary_loss_mlp": 0.01285079, "balance_loss_clip": 0.06296895, "balance_loss_mlp": 0.01258269, "epoch": 0.1615511799188336, "flos": 19427434145280.0, "grad_norm": 1.8548123192723087, "language_loss": 0.82646024, "learning_rate": 3.821193164224981e-06, "loss": 0.90524322, "num_input_tokens_seen": 58195085, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 0.26806641, "step": 2687, "time_per_iteration": 2.5848939418792725 }, { "auxiliary_loss_clip": 0.06603031, "auxiliary_loss_mlp": 0.012961, "balance_loss_clip": 0.06298052, "balance_loss_mlp": 0.01269063, "epoch": 0.16161130317150157, "flos": 22861217664000.0, "grad_norm": 2.0591387643477064, "language_loss": 0.72633898, "learning_rate": 3.821032166608568e-06, "loss": 0.80533028, "num_input_tokens_seen": 58213540, "router_z_loss_clip": 3.05273438, "router_z_loss_mlp": 0.2701416, "step": 2688, "time_per_iteration": 2.5879268646240234 }, { "auxiliary_loss_clip": 0.06592157, "auxiliary_loss_mlp": 0.01280414, "balance_loss_clip": 0.06298617, "balance_loss_mlp": 0.01255178, "epoch": 0.16167142642416954, "flos": 26118833973120.0, "grad_norm": 1.8782568302248794, "language_loss": 0.76473528, "learning_rate": 3.8208710999384325e-06, "loss": 0.84346104, "num_input_tokens_seen": 58236995, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.25219727, "step": 2689, "time_per_iteration": 2.718583106994629 }, { "auxiliary_loss_clip": 0.06587997, "auxiliary_loss_mlp": 0.01285422, "balance_loss_clip": 0.06296047, "balance_loss_mlp": 0.0126009, "epoch": 0.1617315496768375, "flos": 22785551827200.0, "grad_norm": 2.2027652294492284, "language_loss": 0.88254648, "learning_rate": 3.820709964220683e-06, "loss": 0.96128064, "num_input_tokens_seen": 58257230, "router_z_loss_clip": 2.91992188, "router_z_loss_mlp": 0.25354004, "step": 2690, "time_per_iteration": 2.6265504360198975 }, { "auxiliary_loss_clip": 0.06588322, "auxiliary_loss_mlp": 0.01285882, "balance_loss_clip": 0.06296124, "balance_loss_mlp": 0.01261302, "epoch": 0.1617916729295055, "flos": 22023721704960.0, "grad_norm": 1.6246668340543624, "language_loss": 0.88968742, "learning_rate": 3.8205487594614284e-06, "loss": 0.96842951, "num_input_tokens_seen": 58277080, "router_z_loss_clip": 2.921875, "router_z_loss_mlp": 0.24584961, "step": 2691, "time_per_iteration": 2.6203246116638184 }, { "auxiliary_loss_clip": 0.06598006, "auxiliary_loss_mlp": 0.0128757, "balance_loss_clip": 0.06291765, "balance_loss_mlp": 0.01259938, "epoch": 0.16185179618217346, "flos": 23444574589440.0, "grad_norm": 3.3682253531400876, "language_loss": 0.83233762, "learning_rate": 3.820387485666784e-06, "loss": 0.91119337, "num_input_tokens_seen": 58294815, "router_z_loss_clip": 3.06640625, "router_z_loss_mlp": 0.27636719, "step": 2692, "time_per_iteration": 2.6385879516601562 }, { "auxiliary_loss_clip": 0.06603174, "auxiliary_loss_mlp": 0.01286046, "balance_loss_clip": 0.06298657, "balance_loss_mlp": 0.01260118, "epoch": 0.16191191943484143, "flos": 25673182433280.0, "grad_norm": 2.4181288654683035, "language_loss": 0.81991434, "learning_rate": 3.820226142842862e-06, "loss": 0.89880657, "num_input_tokens_seen": 58313215, "router_z_loss_clip": 3.04882812, "router_z_loss_mlp": 0.25939941, "step": 2693, "time_per_iteration": 2.5938735008239746 }, { "auxiliary_loss_clip": 0.06579369, "auxiliary_loss_mlp": 0.01284933, "balance_loss_clip": 0.06295624, "balance_loss_mlp": 0.01261365, "epoch": 0.1619720426875094, "flos": 23484126516480.0, "grad_norm": 1.4372315628686285, "language_loss": 0.8495087, "learning_rate": 3.820064730995783e-06, "loss": 0.92815173, "num_input_tokens_seen": 58333215, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.23583984, "step": 2694, "time_per_iteration": 2.669663906097412 }, { "auxiliary_loss_clip": 0.06594041, "auxiliary_loss_mlp": 0.01286658, "balance_loss_clip": 0.06293199, "balance_loss_mlp": 0.01260814, "epoch": 0.16203216594017736, "flos": 24140465948160.0, "grad_norm": 3.025261963702485, "language_loss": 0.70360291, "learning_rate": 3.819903250131667e-06, "loss": 0.78240991, "num_input_tokens_seen": 58351160, "router_z_loss_clip": 3.0078125, "router_z_loss_mlp": 0.25842285, "step": 2695, "time_per_iteration": 2.5945165157318115 }, { "auxiliary_loss_clip": 0.06594622, "auxiliary_loss_mlp": 0.01282518, "balance_loss_clip": 0.06295438, "balance_loss_mlp": 0.01257091, "epoch": 0.16209228919284532, "flos": 22346566686720.0, "grad_norm": 2.1928584669737208, "language_loss": 0.8406381, "learning_rate": 3.819741700256637e-06, "loss": 0.91940951, "num_input_tokens_seen": 58368505, "router_z_loss_clip": 2.99414062, "router_z_loss_mlp": 0.2545166, "step": 2696, "time_per_iteration": 2.5870494842529297 }, { "auxiliary_loss_clip": 0.066076, "auxiliary_loss_mlp": 0.01289027, "balance_loss_clip": 0.06297978, "balance_loss_mlp": 0.01262086, "epoch": 0.1621524124455133, "flos": 15820586017920.0, "grad_norm": 2.7881185089610643, "language_loss": 0.8997916, "learning_rate": 3.8195800813768194e-06, "loss": 0.97875786, "num_input_tokens_seen": 58385085, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.26977539, "step": 2697, "time_per_iteration": 2.574023723602295 }, { "auxiliary_loss_clip": 0.06588366, "auxiliary_loss_mlp": 0.01281007, "balance_loss_clip": 0.06301074, "balance_loss_mlp": 0.01256724, "epoch": 0.16221253569818128, "flos": 30193905116160.0, "grad_norm": 1.8044348408680628, "language_loss": 0.81378847, "learning_rate": 3.819418393498343e-06, "loss": 0.89248222, "num_input_tokens_seen": 58406985, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.24267578, "step": 2698, "time_per_iteration": 2.640111207962036 }, { "auxiliary_loss_clip": 0.06596252, "auxiliary_loss_mlp": 0.01286733, "balance_loss_clip": 0.06305863, "balance_loss_mlp": 0.0126189, "epoch": 0.16227265895084925, "flos": 24612546251520.0, "grad_norm": 1.5773986595336373, "language_loss": 0.78230441, "learning_rate": 3.819256636627339e-06, "loss": 0.86113429, "num_input_tokens_seen": 58426205, "router_z_loss_clip": 2.90429688, "router_z_loss_mlp": 0.24829102, "step": 2699, "time_per_iteration": 2.5916168689727783 }, { "auxiliary_loss_clip": 0.06591481, "auxiliary_loss_mlp": 0.01287355, "balance_loss_clip": 0.06300542, "balance_loss_mlp": 0.01263727, "epoch": 0.1623327822035172, "flos": 19579436651520.0, "grad_norm": 1.9536843756834508, "language_loss": 0.86723238, "learning_rate": 3.81909481076994e-06, "loss": 0.94602072, "num_input_tokens_seen": 58443830, "router_z_loss_clip": 2.91210938, "router_z_loss_mlp": 0.2364502, "step": 2700, "time_per_iteration": 2.583972454071045 }, { "auxiliary_loss_clip": 0.06595836, "auxiliary_loss_mlp": 0.01286551, "balance_loss_clip": 0.06304829, "balance_loss_mlp": 0.01262304, "epoch": 0.16239290545618518, "flos": 26475612658560.0, "grad_norm": 2.3222665948289167, "language_loss": 0.81401825, "learning_rate": 3.818932915932284e-06, "loss": 0.89284217, "num_input_tokens_seen": 58464405, "router_z_loss_clip": 2.91210938, "router_z_loss_mlp": 0.24243164, "step": 2701, "time_per_iteration": 2.6302552223205566 }, { "auxiliary_loss_clip": 0.06600548, "auxiliary_loss_mlp": 0.01286681, "balance_loss_clip": 0.06304196, "balance_loss_mlp": 0.01260383, "epoch": 0.16245302870885314, "flos": 15857454614400.0, "grad_norm": 1.8821456157406729, "language_loss": 0.74298483, "learning_rate": 3.818770952120511e-06, "loss": 0.82185709, "num_input_tokens_seen": 58483295, "router_z_loss_clip": 2.9609375, "router_z_loss_mlp": 0.26293945, "step": 2702, "time_per_iteration": 3.9845192432403564 }, { "auxiliary_loss_clip": 0.06604616, "auxiliary_loss_mlp": 0.01293288, "balance_loss_clip": 0.06305587, "balance_loss_mlp": 0.01267479, "epoch": 0.1625131519615211, "flos": 14761710771840.0, "grad_norm": 2.1140365954301195, "language_loss": 0.73851514, "learning_rate": 3.81860891934076e-06, "loss": 0.81749421, "num_input_tokens_seen": 58501205, "router_z_loss_clip": 2.9921875, "router_z_loss_mlp": 0.25830078, "step": 2703, "time_per_iteration": 2.5665102005004883 }, { "auxiliary_loss_clip": 0.06608394, "auxiliary_loss_mlp": 0.012992, "balance_loss_clip": 0.06310092, "balance_loss_mlp": 0.01273009, "epoch": 0.1625732752141891, "flos": 28228073276160.0, "grad_norm": 2.012538672412157, "language_loss": 0.70840418, "learning_rate": 3.818446817599176e-06, "loss": 0.78748012, "num_input_tokens_seen": 58522315, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.26220703, "step": 2704, "time_per_iteration": 2.640143632888794 }, { "auxiliary_loss_clip": 0.06542097, "auxiliary_loss_mlp": 0.01261331, "balance_loss_clip": 0.06378339, "balance_loss_mlp": 0.01253213, "epoch": 0.16263339846685707, "flos": 67347268871040.0, "grad_norm": 0.7794995114963142, "language_loss": 0.53342807, "learning_rate": 3.818284646901907e-06, "loss": 0.61146235, "num_input_tokens_seen": 58586695, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.08117676, "step": 2705, "time_per_iteration": 4.696590900421143 }, { "auxiliary_loss_clip": 0.06606038, "auxiliary_loss_mlp": 0.01286293, "balance_loss_clip": 0.06308051, "balance_loss_mlp": 0.01261247, "epoch": 0.16269352171952503, "flos": 14324360785920.0, "grad_norm": 2.3904775875947792, "language_loss": 0.76588118, "learning_rate": 3.818122407255102e-06, "loss": 0.84480447, "num_input_tokens_seen": 58602435, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.25061035, "step": 2706, "time_per_iteration": 2.5731327533721924 }, { "auxiliary_loss_clip": 0.06608212, "auxiliary_loss_mlp": 0.01288076, "balance_loss_clip": 0.06314749, "balance_loss_mlp": 0.01264747, "epoch": 0.162753644972193, "flos": 28367916940800.0, "grad_norm": 4.3587430447117, "language_loss": 0.7327956, "learning_rate": 3.817960098664914e-06, "loss": 0.81175852, "num_input_tokens_seen": 58621275, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.23339844, "step": 2707, "time_per_iteration": 2.663604259490967 }, { "auxiliary_loss_clip": 0.06607334, "auxiliary_loss_mlp": 0.01283796, "balance_loss_clip": 0.06311669, "balance_loss_mlp": 0.01258833, "epoch": 0.16281376822486096, "flos": 19943971839360.0, "grad_norm": 2.6880086391659783, "language_loss": 0.83838141, "learning_rate": 3.817797721137495e-06, "loss": 0.91729271, "num_input_tokens_seen": 58637550, "router_z_loss_clip": 2.95703125, "router_z_loss_mlp": 0.24975586, "step": 2708, "time_per_iteration": 2.573350429534912 }, { "auxiliary_loss_clip": 0.06609033, "auxiliary_loss_mlp": 0.01285745, "balance_loss_clip": 0.06308259, "balance_loss_mlp": 0.01258804, "epoch": 0.16287389147752893, "flos": 21258118149120.0, "grad_norm": 2.9348550223005923, "language_loss": 0.86517495, "learning_rate": 3.817635274679006e-06, "loss": 0.94412279, "num_input_tokens_seen": 58654135, "router_z_loss_clip": 3.0078125, "router_z_loss_mlp": 0.26928711, "step": 2709, "time_per_iteration": 3.9454338550567627 }, { "auxiliary_loss_clip": 0.06601147, "auxiliary_loss_mlp": 0.01285444, "balance_loss_clip": 0.06305508, "balance_loss_mlp": 0.01261399, "epoch": 0.1629340147301969, "flos": 19250679957120.0, "grad_norm": 4.719940417678647, "language_loss": 0.92617929, "learning_rate": 3.817472759295605e-06, "loss": 1.00504518, "num_input_tokens_seen": 58674320, "router_z_loss_clip": 2.95898438, "router_z_loss_mlp": 0.24072266, "step": 2710, "time_per_iteration": 4.045980215072632 }, { "auxiliary_loss_clip": 0.066084, "auxiliary_loss_mlp": 0.01287837, "balance_loss_clip": 0.06314912, "balance_loss_mlp": 0.01261695, "epoch": 0.16299413798286488, "flos": 21255896016000.0, "grad_norm": 2.2372466906940454, "language_loss": 0.82625377, "learning_rate": 3.817310174993453e-06, "loss": 0.90521616, "num_input_tokens_seen": 58691000, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.26159668, "step": 2711, "time_per_iteration": 2.5890142917633057 }, { "auxiliary_loss_clip": 0.06610202, "auxiliary_loss_mlp": 0.01285321, "balance_loss_clip": 0.06305216, "balance_loss_mlp": 0.01258833, "epoch": 0.16305426123553285, "flos": 18776545228800.0, "grad_norm": 2.6140501514424677, "language_loss": 0.82106149, "learning_rate": 3.817147521778719e-06, "loss": 0.90001678, "num_input_tokens_seen": 58710230, "router_z_loss_clip": 3.05078125, "router_z_loss_mlp": 0.26489258, "step": 2712, "time_per_iteration": 2.5711746215820312 }, { "auxiliary_loss_clip": 0.06620587, "auxiliary_loss_mlp": 0.01295494, "balance_loss_clip": 0.06318417, "balance_loss_mlp": 0.01269102, "epoch": 0.16311438448820081, "flos": 22093643537280.0, "grad_norm": 2.1291681047667095, "language_loss": 0.777982, "learning_rate": 3.816984799657568e-06, "loss": 0.85714287, "num_input_tokens_seen": 58728610, "router_z_loss_clip": 3.0234375, "router_z_loss_mlp": 0.26403809, "step": 2713, "time_per_iteration": 2.6952600479125977 }, { "auxiliary_loss_clip": 0.06604794, "auxiliary_loss_mlp": 0.01292344, "balance_loss_clip": 0.06319276, "balance_loss_mlp": 0.01265248, "epoch": 0.16317450774086878, "flos": 16472565037440.0, "grad_norm": 2.2416644490754765, "language_loss": 0.80519021, "learning_rate": 3.8168220086361715e-06, "loss": 0.88416165, "num_input_tokens_seen": 58744385, "router_z_loss_clip": 2.85742188, "router_z_loss_mlp": 0.27087402, "step": 2714, "time_per_iteration": 2.5742337703704834 }, { "auxiliary_loss_clip": 0.06611259, "auxiliary_loss_mlp": 0.01294877, "balance_loss_clip": 0.06319658, "balance_loss_mlp": 0.01270367, "epoch": 0.16323463099353674, "flos": 24359832737280.0, "grad_norm": 1.6659796261764193, "language_loss": 0.78427529, "learning_rate": 3.816659148720702e-06, "loss": 0.86333668, "num_input_tokens_seen": 58763905, "router_z_loss_clip": 2.91992188, "router_z_loss_mlp": 0.24499512, "step": 2715, "time_per_iteration": 2.6201252937316895 }, { "auxiliary_loss_clip": 0.06609513, "auxiliary_loss_mlp": 0.01300276, "balance_loss_clip": 0.06317706, "balance_loss_mlp": 0.01274467, "epoch": 0.1632947542462047, "flos": 24907872366720.0, "grad_norm": 2.0663571179504627, "language_loss": 0.82737768, "learning_rate": 3.816496219917336e-06, "loss": 0.90647554, "num_input_tokens_seen": 58785580, "router_z_loss_clip": 2.91796875, "router_z_loss_mlp": 0.25830078, "step": 2716, "time_per_iteration": 2.6022303104400635 }, { "auxiliary_loss_clip": 0.06618381, "auxiliary_loss_mlp": 0.01305383, "balance_loss_clip": 0.0632153, "balance_loss_mlp": 0.01278143, "epoch": 0.1633548774988727, "flos": 24907285388160.0, "grad_norm": 1.9873058347338448, "language_loss": 0.87421596, "learning_rate": 3.816333222232251e-06, "loss": 0.95345354, "num_input_tokens_seen": 58806075, "router_z_loss_clip": 2.97070312, "router_z_loss_mlp": 0.27246094, "step": 2717, "time_per_iteration": 2.6200110912323 }, { "auxiliary_loss_clip": 0.06612625, "auxiliary_loss_mlp": 0.01297008, "balance_loss_clip": 0.06322896, "balance_loss_mlp": 0.01271652, "epoch": 0.16341500075154067, "flos": 30449008471680.0, "grad_norm": 2.0725608404104925, "language_loss": 0.77299559, "learning_rate": 3.816170155671629e-06, "loss": 0.85209191, "num_input_tokens_seen": 58827405, "router_z_loss_clip": 2.90039062, "router_z_loss_mlp": 0.25341797, "step": 2718, "time_per_iteration": 2.683641195297241 }, { "auxiliary_loss_clip": 0.06611897, "auxiliary_loss_mlp": 0.0129251, "balance_loss_clip": 0.06315339, "balance_loss_mlp": 0.0126781, "epoch": 0.16347512400420863, "flos": 22791253904640.0, "grad_norm": 2.1037044695550358, "language_loss": 0.75234449, "learning_rate": 3.816007020241652e-06, "loss": 0.83138859, "num_input_tokens_seen": 58847205, "router_z_loss_clip": 2.96484375, "router_z_loss_mlp": 0.24707031, "step": 2719, "time_per_iteration": 2.619210720062256 }, { "auxiliary_loss_clip": 0.06607968, "auxiliary_loss_mlp": 0.01293701, "balance_loss_clip": 0.06315035, "balance_loss_mlp": 0.01267595, "epoch": 0.1635352472568766, "flos": 22639083690240.0, "grad_norm": 1.8007287176405442, "language_loss": 0.73257118, "learning_rate": 3.815843815948507e-06, "loss": 0.81158787, "num_input_tokens_seen": 58866865, "router_z_loss_clip": 2.92578125, "router_z_loss_mlp": 0.26123047, "step": 2720, "time_per_iteration": 2.6206960678100586 }, { "auxiliary_loss_clip": 0.06609845, "auxiliary_loss_mlp": 0.01293826, "balance_loss_clip": 0.06322145, "balance_loss_mlp": 0.01267433, "epoch": 0.16359537050954456, "flos": 15528362503680.0, "grad_norm": 14.432526492210625, "language_loss": 0.7673502, "learning_rate": 3.8156805427983824e-06, "loss": 0.84638691, "num_input_tokens_seen": 58885200, "router_z_loss_clip": 2.87890625, "router_z_loss_mlp": 0.26367188, "step": 2721, "time_per_iteration": 2.587341785430908 }, { "auxiliary_loss_clip": 0.06614177, "auxiliary_loss_mlp": 0.01287189, "balance_loss_clip": 0.06314304, "balance_loss_mlp": 0.01260295, "epoch": 0.16365549376221253, "flos": 22096578430080.0, "grad_norm": 1.98511285813964, "language_loss": 0.80421853, "learning_rate": 3.8155172007974695e-06, "loss": 0.88323224, "num_input_tokens_seen": 58906385, "router_z_loss_clip": 2.99414062, "router_z_loss_mlp": 0.26904297, "step": 2722, "time_per_iteration": 2.6274313926696777 }, { "auxiliary_loss_clip": 0.06624432, "auxiliary_loss_mlp": 0.01292029, "balance_loss_clip": 0.06317495, "balance_loss_mlp": 0.01262978, "epoch": 0.1637156170148805, "flos": 24067148025600.0, "grad_norm": 2.035956218546174, "language_loss": 0.85432696, "learning_rate": 3.8153537899519624e-06, "loss": 0.93349159, "num_input_tokens_seen": 58925040, "router_z_loss_clip": 3.07226562, "router_z_loss_mlp": 0.29052734, "step": 2723, "time_per_iteration": 2.616931438446045 }, { "auxiliary_loss_clip": 0.06597339, "auxiliary_loss_mlp": 0.0128621, "balance_loss_clip": 0.0631131, "balance_loss_mlp": 0.01261331, "epoch": 0.1637757402675485, "flos": 26692212263040.0, "grad_norm": 1.7196679152239358, "language_loss": 0.71624935, "learning_rate": 3.815190310268058e-06, "loss": 0.79508483, "num_input_tokens_seen": 58944790, "router_z_loss_clip": 2.86132812, "router_z_loss_mlp": 0.2487793, "step": 2724, "time_per_iteration": 2.628941059112549 }, { "auxiliary_loss_clip": 0.06607274, "auxiliary_loss_mlp": 0.01293694, "balance_loss_clip": 0.06322981, "balance_loss_mlp": 0.0127015, "epoch": 0.16383586352021645, "flos": 16112432188800.0, "grad_norm": 2.075155788489511, "language_loss": 0.71629345, "learning_rate": 3.815026761751955e-06, "loss": 0.79530311, "num_input_tokens_seen": 58962500, "router_z_loss_clip": 2.84570312, "router_z_loss_mlp": 0.23547363, "step": 2725, "time_per_iteration": 2.5858798027038574 }, { "auxiliary_loss_clip": 0.06602125, "auxiliary_loss_mlp": 0.0128694, "balance_loss_clip": 0.06313856, "balance_loss_mlp": 0.01262025, "epoch": 0.16389598677288442, "flos": 19171031051520.0, "grad_norm": 1.8491736609977811, "language_loss": 0.89056909, "learning_rate": 3.814863144409855e-06, "loss": 0.96945977, "num_input_tokens_seen": 58980355, "router_z_loss_clip": 2.88085938, "router_z_loss_mlp": 0.24938965, "step": 2726, "time_per_iteration": 2.585641860961914 }, { "auxiliary_loss_clip": 0.06601219, "auxiliary_loss_mlp": 0.01285149, "balance_loss_clip": 0.0630772, "balance_loss_mlp": 0.01259638, "epoch": 0.16395611002555238, "flos": 21513431139840.0, "grad_norm": 2.057433508169315, "language_loss": 0.74715078, "learning_rate": 3.814699458247963e-06, "loss": 0.82601452, "num_input_tokens_seen": 58999505, "router_z_loss_clip": 2.93554688, "router_z_loss_mlp": 0.25512695, "step": 2727, "time_per_iteration": 2.6056270599365234 }, { "auxiliary_loss_clip": 0.06595918, "auxiliary_loss_mlp": 0.01286948, "balance_loss_clip": 0.06307799, "balance_loss_mlp": 0.01264441, "epoch": 0.16401623327822035, "flos": 21477401084160.0, "grad_norm": 1.6156509696775279, "language_loss": 0.8362059, "learning_rate": 3.8145357032724855e-06, "loss": 0.91503453, "num_input_tokens_seen": 59017930, "router_z_loss_clip": 2.8828125, "router_z_loss_mlp": 0.22497559, "step": 2728, "time_per_iteration": 2.686333179473877 }, { "auxiliary_loss_clip": 0.06608763, "auxiliary_loss_mlp": 0.01287836, "balance_loss_clip": 0.0631096, "balance_loss_mlp": 0.01263184, "epoch": 0.1640763565308883, "flos": 13631362392960.0, "grad_norm": 2.159095193685412, "language_loss": 0.85378039, "learning_rate": 3.814371879489633e-06, "loss": 0.93274641, "num_input_tokens_seen": 59035130, "router_z_loss_clip": 2.98046875, "router_z_loss_mlp": 0.24658203, "step": 2729, "time_per_iteration": 2.5730342864990234 }, { "auxiliary_loss_clip": 0.06604773, "auxiliary_loss_mlp": 0.01291634, "balance_loss_clip": 0.06310131, "balance_loss_mlp": 0.01266266, "epoch": 0.16413647978355628, "flos": 15457057079040.0, "grad_norm": 2.728747793715554, "language_loss": 0.73181844, "learning_rate": 3.814207986905616e-06, "loss": 0.81078249, "num_input_tokens_seen": 59053080, "router_z_loss_clip": 2.953125, "router_z_loss_mlp": 0.25390625, "step": 2730, "time_per_iteration": 2.562091588973999 }, { "auxiliary_loss_clip": 0.06612486, "auxiliary_loss_mlp": 0.0129791, "balance_loss_clip": 0.06313783, "balance_loss_mlp": 0.01271815, "epoch": 0.16419660303622427, "flos": 45889043172480.0, "grad_norm": 2.1935102896358507, "language_loss": 0.75327623, "learning_rate": 3.814044025526651e-06, "loss": 0.83238024, "num_input_tokens_seen": 59075610, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.26098633, "step": 2731, "time_per_iteration": 2.8388257026672363 }, { "auxiliary_loss_clip": 0.06604335, "auxiliary_loss_mlp": 0.01299179, "balance_loss_clip": 0.0630808, "balance_loss_mlp": 0.01273251, "epoch": 0.16425672628889224, "flos": 18958791859200.0, "grad_norm": 3.2092894144901245, "language_loss": 0.79703677, "learning_rate": 3.8138799953589548e-06, "loss": 0.87607193, "num_input_tokens_seen": 59094555, "router_z_loss_clip": 2.96484375, "router_z_loss_mlp": 0.25915527, "step": 2732, "time_per_iteration": 2.5881400108337402 }, { "auxiliary_loss_clip": 0.06608977, "auxiliary_loss_mlp": 0.01298607, "balance_loss_clip": 0.06311108, "balance_loss_mlp": 0.01273072, "epoch": 0.1643168495415602, "flos": 24319316488320.0, "grad_norm": 2.075711671867396, "language_loss": 0.70122778, "learning_rate": 3.8137158964087473e-06, "loss": 0.78030366, "num_input_tokens_seen": 59113515, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.25537109, "step": 2733, "time_per_iteration": 2.678619623184204 }, { "auxiliary_loss_clip": 0.0659823, "auxiliary_loss_mlp": 0.01293642, "balance_loss_clip": 0.0630572, "balance_loss_mlp": 0.01267476, "epoch": 0.16437697279422817, "flos": 26434970628480.0, "grad_norm": 2.11105811000347, "language_loss": 0.81880128, "learning_rate": 3.8135517286822508e-06, "loss": 0.89771998, "num_input_tokens_seen": 59133275, "router_z_loss_clip": 2.92773438, "router_z_loss_mlp": 0.26171875, "step": 2734, "time_per_iteration": 2.657433032989502 }, { "auxiliary_loss_clip": 0.06600637, "auxiliary_loss_mlp": 0.01291426, "balance_loss_clip": 0.0630694, "balance_loss_mlp": 0.01265807, "epoch": 0.16443709604689613, "flos": 34540808503680.0, "grad_norm": 2.2127548749442942, "language_loss": 0.82813942, "learning_rate": 3.8133874921856914e-06, "loss": 0.90706003, "num_input_tokens_seen": 59154095, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.25622559, "step": 2735, "time_per_iteration": 2.7093727588653564 }, { "auxiliary_loss_clip": 0.06589248, "auxiliary_loss_mlp": 0.01285881, "balance_loss_clip": 0.06302258, "balance_loss_mlp": 0.01262814, "epoch": 0.1644972192995641, "flos": 23264717800320.0, "grad_norm": 3.570442932340297, "language_loss": 0.79392266, "learning_rate": 3.813223186925296e-06, "loss": 0.87267399, "num_input_tokens_seen": 59173795, "router_z_loss_clip": 2.86914062, "router_z_loss_mlp": 0.23071289, "step": 2736, "time_per_iteration": 2.65553617477417 }, { "auxiliary_loss_clip": 0.06597127, "auxiliary_loss_mlp": 0.01294996, "balance_loss_clip": 0.06305688, "balance_loss_mlp": 0.0126995, "epoch": 0.1645573425522321, "flos": 26986825618560.0, "grad_norm": 1.754637324242373, "language_loss": 0.82250041, "learning_rate": 3.8130588129072964e-06, "loss": 0.90142167, "num_input_tokens_seen": 59191610, "router_z_loss_clip": 2.91796875, "router_z_loss_mlp": 0.25073242, "step": 2737, "time_per_iteration": 2.6837198734283447 }, { "auxiliary_loss_clip": 0.06603885, "auxiliary_loss_mlp": 0.01285336, "balance_loss_clip": 0.06310879, "balance_loss_mlp": 0.01262234, "epoch": 0.16461746580490005, "flos": 28739495871360.0, "grad_norm": 2.0370256294070623, "language_loss": 0.88484359, "learning_rate": 3.8128943701379246e-06, "loss": 0.96373582, "num_input_tokens_seen": 59213000, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.2310791, "step": 2738, "time_per_iteration": 2.6647865772247314 }, { "auxiliary_loss_clip": 0.06596494, "auxiliary_loss_mlp": 0.01287708, "balance_loss_clip": 0.06303519, "balance_loss_mlp": 0.01263032, "epoch": 0.16467758905756802, "flos": 24936062065920.0, "grad_norm": 1.8183512682209517, "language_loss": 0.72860682, "learning_rate": 3.8127298586234167e-06, "loss": 0.80744886, "num_input_tokens_seen": 59232340, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.24694824, "step": 2739, "time_per_iteration": 2.6202497482299805 }, { "auxiliary_loss_clip": 0.06590893, "auxiliary_loss_mlp": 0.01280912, "balance_loss_clip": 0.06304143, "balance_loss_mlp": 0.01257058, "epoch": 0.16473771231023598, "flos": 24833380487040.0, "grad_norm": 1.7469274986059515, "language_loss": 0.82168651, "learning_rate": 3.8125652783700104e-06, "loss": 0.90040457, "num_input_tokens_seen": 59253950, "router_z_loss_clip": 2.8671875, "router_z_loss_mlp": 0.23840332, "step": 2740, "time_per_iteration": 2.6261162757873535 }, { "auxiliary_loss_clip": 0.06601524, "auxiliary_loss_mlp": 0.01294268, "balance_loss_clip": 0.06303169, "balance_loss_mlp": 0.01269258, "epoch": 0.16479783556290395, "flos": 39905609690880.0, "grad_norm": 2.032008844960574, "language_loss": 0.69779432, "learning_rate": 3.8124006293839475e-06, "loss": 0.77675223, "num_input_tokens_seen": 59275545, "router_z_loss_clip": 2.98632812, "router_z_loss_mlp": 0.25012207, "step": 2741, "time_per_iteration": 2.7564237117767334 }, { "auxiliary_loss_clip": 0.06592128, "auxiliary_loss_mlp": 0.01287773, "balance_loss_clip": 0.06300766, "balance_loss_mlp": 0.01263108, "epoch": 0.16485795881557191, "flos": 19902449341440.0, "grad_norm": 1.786066487112363, "language_loss": 0.80755699, "learning_rate": 3.812235911671472e-06, "loss": 0.88635594, "num_input_tokens_seen": 59293480, "router_z_loss_clip": 2.91601562, "router_z_loss_mlp": 0.24658203, "step": 2742, "time_per_iteration": 4.171692371368408 }, { "auxiliary_loss_clip": 0.06590544, "auxiliary_loss_mlp": 0.01283179, "balance_loss_clip": 0.06300201, "balance_loss_mlp": 0.01259301, "epoch": 0.16491808206823988, "flos": 20562017155200.0, "grad_norm": 1.8036156316196643, "language_loss": 0.85088408, "learning_rate": 3.8120711252388274e-06, "loss": 0.92962134, "num_input_tokens_seen": 59313435, "router_z_loss_clip": 2.90429688, "router_z_loss_mlp": 0.23901367, "step": 2743, "time_per_iteration": 2.590350866317749 }, { "auxiliary_loss_clip": 0.065938, "auxiliary_loss_mlp": 0.01281793, "balance_loss_clip": 0.0630413, "balance_loss_mlp": 0.01257474, "epoch": 0.16497820532090787, "flos": 23806803790080.0, "grad_norm": 4.019513765073351, "language_loss": 0.86295557, "learning_rate": 3.811906270092265e-06, "loss": 0.94171149, "num_input_tokens_seen": 59331535, "router_z_loss_clip": 2.89257812, "router_z_loss_mlp": 0.24316406, "step": 2744, "time_per_iteration": 4.055168867111206 }, { "auxiliary_loss_clip": 0.06585832, "auxiliary_loss_mlp": 0.01279452, "balance_loss_clip": 0.06305298, "balance_loss_mlp": 0.01256504, "epoch": 0.16503832857357584, "flos": 25489510283520.0, "grad_norm": 2.2022289207981878, "language_loss": 0.83713174, "learning_rate": 3.811741346238036e-06, "loss": 0.9157846, "num_input_tokens_seen": 59350680, "router_z_loss_clip": 2.80664062, "router_z_loss_mlp": 0.22961426, "step": 2745, "time_per_iteration": 2.646596670150757 }, { "auxiliary_loss_clip": 0.06594153, "auxiliary_loss_mlp": 0.01285666, "balance_loss_clip": 0.06307253, "balance_loss_mlp": 0.01261562, "epoch": 0.1650984518262438, "flos": 17681849562240.0, "grad_norm": 1.9348043249857754, "language_loss": 0.77654201, "learning_rate": 3.8115763536823923e-06, "loss": 0.85534012, "num_input_tokens_seen": 59367020, "router_z_loss_clip": 2.87109375, "router_z_loss_mlp": 0.24121094, "step": 2746, "time_per_iteration": 2.5621542930603027 }, { "auxiliary_loss_clip": 0.06595436, "auxiliary_loss_mlp": 0.01283429, "balance_loss_clip": 0.06308801, "balance_loss_mlp": 0.01259384, "epoch": 0.16515857507891177, "flos": 18704401263360.0, "grad_norm": 1.6495754583509734, "language_loss": 0.81227016, "learning_rate": 3.811411292431592e-06, "loss": 0.8910588, "num_input_tokens_seen": 59386075, "router_z_loss_clip": 2.8671875, "router_z_loss_mlp": 0.2401123, "step": 2747, "time_per_iteration": 2.5652058124542236 }, { "auxiliary_loss_clip": 0.06604638, "auxiliary_loss_mlp": 0.01283842, "balance_loss_clip": 0.0631256, "balance_loss_mlp": 0.01260883, "epoch": 0.16521869833157973, "flos": 15015472462080.0, "grad_norm": 3.187735889372738, "language_loss": 0.70816416, "learning_rate": 3.8112461624918945e-06, "loss": 0.78704894, "num_input_tokens_seen": 59402690, "router_z_loss_clip": 2.921875, "router_z_loss_mlp": 0.22973633, "step": 2748, "time_per_iteration": 2.594172477722168 }, { "auxiliary_loss_clip": 0.0659425, "auxiliary_loss_mlp": 0.01282892, "balance_loss_clip": 0.06307982, "balance_loss_mlp": 0.01259777, "epoch": 0.1652788215842477, "flos": 22126654846080.0, "grad_norm": 2.9945758700922402, "language_loss": 0.88715076, "learning_rate": 3.811080963869561e-06, "loss": 0.96592224, "num_input_tokens_seen": 59421130, "router_z_loss_clip": 2.86523438, "router_z_loss_mlp": 0.23120117, "step": 2749, "time_per_iteration": 5.440465927124023 }, { "auxiliary_loss_clip": 0.06609833, "auxiliary_loss_mlp": 0.01284152, "balance_loss_clip": 0.06315511, "balance_loss_mlp": 0.0125963, "epoch": 0.16533894483691566, "flos": 18339027534720.0, "grad_norm": 2.0862970879628135, "language_loss": 0.79517901, "learning_rate": 3.8109156965708557e-06, "loss": 0.8741188, "num_input_tokens_seen": 59438970, "router_z_loss_clip": 2.94335938, "router_z_loss_mlp": 0.24511719, "step": 2750, "time_per_iteration": 2.5797481536865234 }, { "auxiliary_loss_clip": 0.06601331, "auxiliary_loss_mlp": 0.01282223, "balance_loss_clip": 0.06312278, "balance_loss_mlp": 0.01257249, "epoch": 0.16539906808958366, "flos": 22388592309120.0, "grad_norm": 2.3535746543888965, "language_loss": 0.96025252, "learning_rate": 3.8107503606020455e-06, "loss": 1.03908801, "num_input_tokens_seen": 59458510, "router_z_loss_clip": 2.88671875, "router_z_loss_mlp": 0.24975586, "step": 2751, "time_per_iteration": 2.5994503498077393 }, { "auxiliary_loss_clip": 0.06595852, "auxiliary_loss_mlp": 0.01283438, "balance_loss_clip": 0.0631168, "balance_loss_mlp": 0.01259143, "epoch": 0.16545919134225162, "flos": 22717726346880.0, "grad_norm": 1.972728342160194, "language_loss": 0.71451247, "learning_rate": 3.8105849559693997e-06, "loss": 0.7933054, "num_input_tokens_seen": 59477110, "router_z_loss_clip": 2.84570312, "router_z_loss_mlp": 0.24291992, "step": 2752, "time_per_iteration": 2.6012086868286133 }, { "auxiliary_loss_clip": 0.06569115, "auxiliary_loss_mlp": 0.01266985, "balance_loss_clip": 0.06404319, "balance_loss_mlp": 0.01258462, "epoch": 0.1655193145949196, "flos": 67822493702400.0, "grad_norm": 0.7391245245332396, "language_loss": 0.53943425, "learning_rate": 3.810419482679192e-06, "loss": 0.61779529, "num_input_tokens_seen": 59541155, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.08532715, "step": 2753, "time_per_iteration": 3.336597204208374 }, { "auxiliary_loss_clip": 0.06604259, "auxiliary_loss_mlp": 0.01282812, "balance_loss_clip": 0.0631845, "balance_loss_mlp": 0.01257551, "epoch": 0.16557943784758755, "flos": 24287353355520.0, "grad_norm": 1.8320636528337808, "language_loss": 0.75923306, "learning_rate": 3.8102539407376954e-06, "loss": 0.83810371, "num_input_tokens_seen": 59561155, "router_z_loss_clip": 2.85742188, "router_z_loss_mlp": 0.25256348, "step": 2754, "time_per_iteration": 2.6564741134643555 }, { "auxiliary_loss_clip": 0.06613949, "auxiliary_loss_mlp": 0.01286367, "balance_loss_clip": 0.0631547, "balance_loss_mlp": 0.0126113, "epoch": 0.16563956110025552, "flos": 20089727216640.0, "grad_norm": 3.1105947242297907, "language_loss": 0.87996626, "learning_rate": 3.810088330151188e-06, "loss": 0.95896935, "num_input_tokens_seen": 59580460, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.25219727, "step": 2755, "time_per_iteration": 2.6149191856384277 }, { "auxiliary_loss_clip": 0.06598168, "auxiliary_loss_mlp": 0.01279764, "balance_loss_clip": 0.06313087, "balance_loss_mlp": 0.01255373, "epoch": 0.16569968435292348, "flos": 28041382379520.0, "grad_norm": 1.83935160373803, "language_loss": 0.73770916, "learning_rate": 3.80992265092595e-06, "loss": 0.81648844, "num_input_tokens_seen": 59600025, "router_z_loss_clip": 2.8515625, "router_z_loss_mlp": 0.24401855, "step": 2756, "time_per_iteration": 2.6559159755706787 }, { "auxiliary_loss_clip": 0.06589829, "auxiliary_loss_mlp": 0.01281922, "balance_loss_clip": 0.06309052, "balance_loss_mlp": 0.01257758, "epoch": 0.16575980760559147, "flos": 26257461753600.0, "grad_norm": 1.7719772406223628, "language_loss": 0.75945288, "learning_rate": 3.8097569030682636e-06, "loss": 0.83817041, "num_input_tokens_seen": 59620600, "router_z_loss_clip": 2.80859375, "router_z_loss_mlp": 0.24145508, "step": 2757, "time_per_iteration": 2.6621878147125244 }, { "auxiliary_loss_clip": 0.06593063, "auxiliary_loss_mlp": 0.01288807, "balance_loss_clip": 0.06309265, "balance_loss_mlp": 0.01264047, "epoch": 0.16581993085825944, "flos": 26951382541440.0, "grad_norm": 1.8281670159542527, "language_loss": 0.85665452, "learning_rate": 3.8095910865844137e-06, "loss": 0.9354732, "num_input_tokens_seen": 59641385, "router_z_loss_clip": 2.83984375, "router_z_loss_mlp": 0.24731445, "step": 2758, "time_per_iteration": 2.6509227752685547 }, { "auxiliary_loss_clip": 0.06596737, "auxiliary_loss_mlp": 0.01282537, "balance_loss_clip": 0.06308722, "balance_loss_mlp": 0.01258981, "epoch": 0.1658800541109274, "flos": 21660192766080.0, "grad_norm": 2.0618859000201026, "language_loss": 0.80137968, "learning_rate": 3.809425201480689e-06, "loss": 0.88017243, "num_input_tokens_seen": 59659865, "router_z_loss_clip": 2.88085938, "router_z_loss_mlp": 0.23571777, "step": 2759, "time_per_iteration": 2.607492208480835 }, { "auxiliary_loss_clip": 0.06594959, "auxiliary_loss_mlp": 0.01285035, "balance_loss_clip": 0.06305306, "balance_loss_mlp": 0.01259858, "epoch": 0.16594017736359537, "flos": 16441063102080.0, "grad_norm": 3.0065897447004915, "language_loss": 0.77002472, "learning_rate": 3.8092592477633793e-06, "loss": 0.84882462, "num_input_tokens_seen": 59678780, "router_z_loss_clip": 2.90039062, "router_z_loss_mlp": 0.25183105, "step": 2760, "time_per_iteration": 2.5904173851013184 }, { "auxiliary_loss_clip": 0.06601678, "auxiliary_loss_mlp": 0.01283511, "balance_loss_clip": 0.06309515, "balance_loss_mlp": 0.01259311, "epoch": 0.16600030061626334, "flos": 22643779518720.0, "grad_norm": 1.8977410739877718, "language_loss": 0.74223989, "learning_rate": 3.8090932254387774e-06, "loss": 0.82109177, "num_input_tokens_seen": 59698795, "router_z_loss_clip": 2.921875, "router_z_loss_mlp": 0.2421875, "step": 2761, "time_per_iteration": 2.6018998622894287 }, { "auxiliary_loss_clip": 0.06595834, "auxiliary_loss_mlp": 0.0128462, "balance_loss_clip": 0.06310543, "balance_loss_mlp": 0.01260278, "epoch": 0.1660604238689313, "flos": 26403887963520.0, "grad_norm": 1.8691271099816091, "language_loss": 0.89216167, "learning_rate": 3.8089271345131788e-06, "loss": 0.97096622, "num_input_tokens_seen": 59718795, "router_z_loss_clip": 2.85351562, "router_z_loss_mlp": 0.2434082, "step": 2762, "time_per_iteration": 2.628852605819702 }, { "auxiliary_loss_clip": 0.06596511, "auxiliary_loss_mlp": 0.0128286, "balance_loss_clip": 0.06304266, "balance_loss_mlp": 0.01257373, "epoch": 0.16612054712159927, "flos": 23046776530560.0, "grad_norm": 1.758423296752262, "language_loss": 0.89164734, "learning_rate": 3.8087609749928822e-06, "loss": 0.97044098, "num_input_tokens_seen": 59737555, "router_z_loss_clip": 2.921875, "router_z_loss_mlp": 0.25524902, "step": 2763, "time_per_iteration": 2.5864124298095703 }, { "auxiliary_loss_clip": 0.06533614, "auxiliary_loss_mlp": 0.01258244, "balance_loss_clip": 0.06368529, "balance_loss_mlp": 0.01249601, "epoch": 0.16618067037426726, "flos": 59261388266880.0, "grad_norm": 0.7628135939357781, "language_loss": 0.59705412, "learning_rate": 3.8085947468841885e-06, "loss": 0.67497277, "num_input_tokens_seen": 59800915, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.08654785, "step": 2764, "time_per_iteration": 3.2265610694885254 }, { "auxiliary_loss_clip": 0.06592819, "auxiliary_loss_mlp": 0.01284596, "balance_loss_clip": 0.06302822, "balance_loss_mlp": 0.01260134, "epoch": 0.16624079362693522, "flos": 27206192407680.0, "grad_norm": 2.6095147763846698, "language_loss": 0.82588208, "learning_rate": 3.808428450193401e-06, "loss": 0.90465623, "num_input_tokens_seen": 59822910, "router_z_loss_clip": 2.8984375, "router_z_loss_mlp": 0.24462891, "step": 2765, "time_per_iteration": 2.6394457817077637 }, { "auxiliary_loss_clip": 0.066031, "auxiliary_loss_mlp": 0.01289646, "balance_loss_clip": 0.06303521, "balance_loss_mlp": 0.01264338, "epoch": 0.1663009168796032, "flos": 10929542215680.0, "grad_norm": 2.4830665638485234, "language_loss": 0.70541412, "learning_rate": 3.8082620849268244e-06, "loss": 0.78434157, "num_input_tokens_seen": 59838805, "router_z_loss_clip": 2.99804688, "router_z_loss_mlp": 0.25317383, "step": 2766, "time_per_iteration": 2.5755722522735596 }, { "auxiliary_loss_clip": 0.06585571, "auxiliary_loss_mlp": 0.01281125, "balance_loss_clip": 0.06300395, "balance_loss_mlp": 0.01257009, "epoch": 0.16636104013227115, "flos": 17900168175360.0, "grad_norm": 2.6032352424272975, "language_loss": 0.89822239, "learning_rate": 3.808095651090769e-06, "loss": 0.97688937, "num_input_tokens_seen": 59855345, "router_z_loss_clip": 2.84960938, "router_z_loss_mlp": 0.24145508, "step": 2767, "time_per_iteration": 2.5545692443847656 }, { "auxiliary_loss_clip": 0.0650973, "auxiliary_loss_mlp": 0.01260795, "balance_loss_clip": 0.0634639, "balance_loss_mlp": 0.01253094, "epoch": 0.16642116338493912, "flos": 66748342285440.0, "grad_norm": 0.6233624746347975, "language_loss": 0.52868712, "learning_rate": 3.8079291486915447e-06, "loss": 0.60639238, "num_input_tokens_seen": 59917710, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.0769043, "step": 2768, "time_per_iteration": 3.323945999145508 }, { "auxiliary_loss_clip": 0.06592688, "auxiliary_loss_mlp": 0.01277967, "balance_loss_clip": 0.06297106, "balance_loss_mlp": 0.01253875, "epoch": 0.16648128663760708, "flos": 19032067854720.0, "grad_norm": 2.9092102952515644, "language_loss": 0.86253029, "learning_rate": 3.8077625777354667e-06, "loss": 0.94123685, "num_input_tokens_seen": 59935105, "router_z_loss_clip": 2.95703125, "router_z_loss_mlp": 0.2409668, "step": 2769, "time_per_iteration": 2.589024066925049 }, { "auxiliary_loss_clip": 0.06490245, "auxiliary_loss_mlp": 0.0125784, "balance_loss_clip": 0.06325944, "balance_loss_mlp": 0.01250128, "epoch": 0.16654140989027508, "flos": 70154370103680.0, "grad_norm": 0.7890611959468584, "language_loss": 0.57382113, "learning_rate": 3.80759593822885e-06, "loss": 0.65130198, "num_input_tokens_seen": 59984085, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.07702637, "step": 2770, "time_per_iteration": 3.092777967453003 }, { "auxiliary_loss_clip": 0.06478504, "auxiliary_loss_mlp": 0.01259168, "balance_loss_clip": 0.06315337, "balance_loss_mlp": 0.01251038, "epoch": 0.16660153314294304, "flos": 70290398407680.0, "grad_norm": 0.8357051981389256, "language_loss": 0.56273657, "learning_rate": 3.807429230178015e-06, "loss": 0.64011329, "num_input_tokens_seen": 60043470, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.08129883, "step": 2771, "time_per_iteration": 3.074455976486206 }, { "auxiliary_loss_clip": 0.06587251, "auxiliary_loss_mlp": 0.01281643, "balance_loss_clip": 0.06301891, "balance_loss_mlp": 0.01257074, "epoch": 0.166661656395611, "flos": 23081590702080.0, "grad_norm": 2.997883097914838, "language_loss": 0.71184409, "learning_rate": 3.8072624535892817e-06, "loss": 0.79053307, "num_input_tokens_seen": 60063045, "router_z_loss_clip": 2.8515625, "router_z_loss_mlp": 0.24560547, "step": 2772, "time_per_iteration": 2.6015679836273193 }, { "auxiliary_loss_clip": 0.06575358, "auxiliary_loss_mlp": 0.01277421, "balance_loss_clip": 0.06294251, "balance_loss_mlp": 0.01253627, "epoch": 0.16672177964827897, "flos": 28373912507520.0, "grad_norm": 2.1469337720442923, "language_loss": 0.87351608, "learning_rate": 3.807095608468975e-06, "loss": 0.95204389, "num_input_tokens_seen": 60081945, "router_z_loss_clip": 2.81640625, "router_z_loss_mlp": 0.23791504, "step": 2773, "time_per_iteration": 2.6469244956970215 }, { "auxiliary_loss_clip": 0.06586014, "auxiliary_loss_mlp": 0.01279377, "balance_loss_clip": 0.06298675, "balance_loss_mlp": 0.01255321, "epoch": 0.16678190290094694, "flos": 19095700631040.0, "grad_norm": 3.829206503629051, "language_loss": 0.82982534, "learning_rate": 3.8069286948234224e-06, "loss": 0.90847921, "num_input_tokens_seen": 60096820, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.24047852, "step": 2774, "time_per_iteration": 2.5553364753723145 }, { "auxiliary_loss_clip": 0.06592643, "auxiliary_loss_mlp": 0.01278543, "balance_loss_clip": 0.06302781, "balance_loss_mlp": 0.01254022, "epoch": 0.1668420261536149, "flos": 21805612727040.0, "grad_norm": 3.3978030957264242, "language_loss": 0.84093374, "learning_rate": 3.806761712658952e-06, "loss": 0.91964561, "num_input_tokens_seen": 60116140, "router_z_loss_clip": 2.90234375, "router_z_loss_mlp": 0.24523926, "step": 2775, "time_per_iteration": 2.599358558654785 }, { "auxiliary_loss_clip": 0.06591177, "auxiliary_loss_mlp": 0.01285737, "balance_loss_clip": 0.06303197, "balance_loss_mlp": 0.01262002, "epoch": 0.16690214940628287, "flos": 19068559107840.0, "grad_norm": 2.2657755363385554, "language_loss": 0.81553781, "learning_rate": 3.806594661981897e-06, "loss": 0.89430696, "num_input_tokens_seen": 60134235, "router_z_loss_clip": 2.8828125, "router_z_loss_mlp": 0.23730469, "step": 2776, "time_per_iteration": 2.5721664428710938 }, { "auxiliary_loss_clip": 0.06582209, "auxiliary_loss_mlp": 0.01277724, "balance_loss_clip": 0.0630286, "balance_loss_mlp": 0.01254383, "epoch": 0.16696227265895086, "flos": 18594550160640.0, "grad_norm": 2.2044979395121005, "language_loss": 0.81095469, "learning_rate": 3.8064275427985906e-06, "loss": 0.88955402, "num_input_tokens_seen": 60153275, "router_z_loss_clip": 2.79296875, "router_z_loss_mlp": 0.2331543, "step": 2777, "time_per_iteration": 2.5955896377563477 }, { "auxiliary_loss_clip": 0.06592335, "auxiliary_loss_mlp": 0.01276782, "balance_loss_clip": 0.06303791, "balance_loss_mlp": 0.01253345, "epoch": 0.16702239591161883, "flos": 23300747856000.0, "grad_norm": 2.213288865579561, "language_loss": 0.85971951, "learning_rate": 3.806260355115371e-06, "loss": 0.93841064, "num_input_tokens_seen": 60173215, "router_z_loss_clip": 2.88671875, "router_z_loss_mlp": 0.234375, "step": 2778, "time_per_iteration": 2.607053279876709 }, { "auxiliary_loss_clip": 0.06596272, "auxiliary_loss_mlp": 0.01282843, "balance_loss_clip": 0.06307624, "balance_loss_mlp": 0.012585, "epoch": 0.1670825191642868, "flos": 24432521754240.0, "grad_norm": 1.8940318827229372, "language_loss": 0.74743557, "learning_rate": 3.8060930989385778e-06, "loss": 0.82622671, "num_input_tokens_seen": 60190515, "router_z_loss_clip": 2.890625, "router_z_loss_mlp": 0.24353027, "step": 2779, "time_per_iteration": 2.6115641593933105 }, { "auxiliary_loss_clip": 0.065971, "auxiliary_loss_mlp": 0.01287602, "balance_loss_clip": 0.06308439, "balance_loss_mlp": 0.01262735, "epoch": 0.16714264241695476, "flos": 26804830550400.0, "grad_norm": 4.8211301277129595, "language_loss": 0.66799539, "learning_rate": 3.805925774274554e-06, "loss": 0.74684238, "num_input_tokens_seen": 60211655, "router_z_loss_clip": 2.88671875, "router_z_loss_mlp": 0.24890137, "step": 2780, "time_per_iteration": 2.651808023452759 }, { "auxiliary_loss_clip": 0.06598936, "auxiliary_loss_mlp": 0.01288497, "balance_loss_clip": 0.06312377, "balance_loss_mlp": 0.01262891, "epoch": 0.16720276566962272, "flos": 21841768563840.0, "grad_norm": 2.7007574514655963, "language_loss": 0.7932933, "learning_rate": 3.805758381129643e-06, "loss": 0.87216759, "num_input_tokens_seen": 60230860, "router_z_loss_clip": 2.86328125, "router_z_loss_mlp": 0.2557373, "step": 2781, "time_per_iteration": 4.026163816452026 }, { "auxiliary_loss_clip": 0.06599694, "auxiliary_loss_mlp": 0.01281524, "balance_loss_clip": 0.06310578, "balance_loss_mlp": 0.01257098, "epoch": 0.1672628889222907, "flos": 21476814105600.0, "grad_norm": 2.9908887936542703, "language_loss": 0.7602421, "learning_rate": 3.805590919510193e-06, "loss": 0.83905429, "num_input_tokens_seen": 60250535, "router_z_loss_clip": 2.890625, "router_z_loss_mlp": 0.24438477, "step": 2782, "time_per_iteration": 2.5767109394073486 }, { "auxiliary_loss_clip": 0.06618329, "auxiliary_loss_mlp": 0.01287586, "balance_loss_clip": 0.06318615, "balance_loss_mlp": 0.01262016, "epoch": 0.16732301217495865, "flos": 30781915943040.0, "grad_norm": 2.125069162099584, "language_loss": 0.6860708, "learning_rate": 3.8054233894225547e-06, "loss": 0.76513004, "num_input_tokens_seen": 60269530, "router_z_loss_clip": 3.00195312, "router_z_loss_mlp": 0.25610352, "step": 2783, "time_per_iteration": 4.116421461105347 }, { "auxiliary_loss_clip": 0.06607824, "auxiliary_loss_mlp": 0.01281946, "balance_loss_clip": 0.06320405, "balance_loss_mlp": 0.01257687, "epoch": 0.16738313542762664, "flos": 23480940061440.0, "grad_norm": 1.6962010226359645, "language_loss": 0.71101862, "learning_rate": 3.805255790873081e-06, "loss": 0.78991628, "num_input_tokens_seen": 60289900, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.24279785, "step": 2784, "time_per_iteration": 2.612215518951416 }, { "auxiliary_loss_clip": 0.06603765, "auxiliary_loss_mlp": 0.01288457, "balance_loss_clip": 0.06312717, "balance_loss_mlp": 0.01263757, "epoch": 0.1674432586802946, "flos": 29796861744000.0, "grad_norm": 2.1302991183408, "language_loss": 0.62005299, "learning_rate": 3.805088123868126e-06, "loss": 0.69897521, "num_input_tokens_seen": 60310025, "router_z_loss_clip": 2.9140625, "router_z_loss_mlp": 0.24731445, "step": 2785, "time_per_iteration": 2.6331019401550293 }, { "auxiliary_loss_clip": 0.06515244, "auxiliary_loss_mlp": 0.01305497, "balance_loss_clip": 0.06356005, "balance_loss_mlp": 0.01297964, "epoch": 0.16750338193296258, "flos": 66157228857600.0, "grad_norm": 0.7598187269601181, "language_loss": 0.58385062, "learning_rate": 3.8049203884140492e-06, "loss": 0.662058, "num_input_tokens_seen": 60377800, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.07519531, "step": 2786, "time_per_iteration": 3.2999796867370605 }, { "auxiliary_loss_clip": 0.0659813, "auxiliary_loss_mlp": 0.01282415, "balance_loss_clip": 0.06307257, "balance_loss_mlp": 0.01257869, "epoch": 0.16756350518563054, "flos": 25702881505920.0, "grad_norm": 1.9079917028904732, "language_loss": 0.76845211, "learning_rate": 3.80475258451721e-06, "loss": 0.84725761, "num_input_tokens_seen": 60398215, "router_z_loss_clip": 2.90820312, "router_z_loss_mlp": 0.2454834, "step": 2787, "time_per_iteration": 2.730437755584717 }, { "auxiliary_loss_clip": 0.06593737, "auxiliary_loss_mlp": 0.01284301, "balance_loss_clip": 0.06303587, "balance_loss_mlp": 0.01259697, "epoch": 0.1676236284382985, "flos": 23841911450880.0, "grad_norm": 1.9076788215149112, "language_loss": 0.78176892, "learning_rate": 3.804584712183972e-06, "loss": 0.86054933, "num_input_tokens_seen": 60416910, "router_z_loss_clip": 2.90039062, "router_z_loss_mlp": 0.24633789, "step": 2788, "time_per_iteration": 4.152398347854614 }, { "auxiliary_loss_clip": 0.06477497, "auxiliary_loss_mlp": 0.01269238, "balance_loss_clip": 0.06319068, "balance_loss_mlp": 0.01261906, "epoch": 0.16768375169096647, "flos": 59891313663360.0, "grad_norm": 0.83411476391176, "language_loss": 0.59369695, "learning_rate": 3.8044167714207013e-06, "loss": 0.67116427, "num_input_tokens_seen": 60468660, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.07312012, "step": 2789, "time_per_iteration": 4.4736151695251465 }, { "auxiliary_loss_clip": 0.06583238, "auxiliary_loss_mlp": 0.01284196, "balance_loss_clip": 0.06296631, "balance_loss_mlp": 0.01260259, "epoch": 0.16774387494363446, "flos": 38444785608960.0, "grad_norm": 2.1607529159331866, "language_loss": 0.70418978, "learning_rate": 3.804248762233765e-06, "loss": 0.78286409, "num_input_tokens_seen": 60492370, "router_z_loss_clip": 2.86914062, "router_z_loss_mlp": 0.23950195, "step": 2790, "time_per_iteration": 2.7256968021392822 }, { "auxiliary_loss_clip": 0.06586705, "auxiliary_loss_mlp": 0.01282732, "balance_loss_clip": 0.06299667, "balance_loss_mlp": 0.01258294, "epoch": 0.16780399819630243, "flos": 22644156862080.0, "grad_norm": 2.9342201481049415, "language_loss": 0.80038536, "learning_rate": 3.8040806846295356e-06, "loss": 0.87907976, "num_input_tokens_seen": 60512655, "router_z_loss_clip": 2.87109375, "router_z_loss_mlp": 0.24438477, "step": 2791, "time_per_iteration": 2.619274377822876 }, { "auxiliary_loss_clip": 0.06590283, "auxiliary_loss_mlp": 0.01283898, "balance_loss_clip": 0.06303135, "balance_loss_mlp": 0.01259126, "epoch": 0.1678641214489704, "flos": 32900001851520.0, "grad_norm": 2.170377718388457, "language_loss": 0.7216121, "learning_rate": 3.8039125386143853e-06, "loss": 0.80035388, "num_input_tokens_seen": 60533090, "router_z_loss_clip": 2.87695312, "router_z_loss_mlp": 0.24768066, "step": 2792, "time_per_iteration": 2.6903083324432373 }, { "auxiliary_loss_clip": 0.06592578, "auxiliary_loss_mlp": 0.01285906, "balance_loss_clip": 0.06305111, "balance_loss_mlp": 0.01261409, "epoch": 0.16792424470163836, "flos": 19981133925120.0, "grad_norm": 2.4778298001185033, "language_loss": 0.72496432, "learning_rate": 3.803744324194691e-06, "loss": 0.8037492, "num_input_tokens_seen": 60553190, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.24475098, "step": 2793, "time_per_iteration": 2.584862470626831 }, { "auxiliary_loss_clip": 0.0659109, "auxiliary_loss_mlp": 0.01281157, "balance_loss_clip": 0.06303991, "balance_loss_mlp": 0.01257721, "epoch": 0.16798436795430632, "flos": 19726114423680.0, "grad_norm": 2.106713441055543, "language_loss": 0.78007013, "learning_rate": 3.803576041376831e-06, "loss": 0.85879254, "num_input_tokens_seen": 60571995, "router_z_loss_clip": 2.87304688, "router_z_loss_mlp": 0.23425293, "step": 2794, "time_per_iteration": 2.5948705673217773 }, { "auxiliary_loss_clip": 0.0658558, "auxiliary_loss_mlp": 0.01281211, "balance_loss_clip": 0.06299847, "balance_loss_mlp": 0.01256987, "epoch": 0.1680444912069743, "flos": 28111346138880.0, "grad_norm": 2.4980034923864074, "language_loss": 0.7205267, "learning_rate": 3.803407690167187e-06, "loss": 0.79919457, "num_input_tokens_seen": 60591275, "router_z_loss_clip": 2.85742188, "router_z_loss_mlp": 0.24206543, "step": 2795, "time_per_iteration": 2.6171820163726807 }, { "auxiliary_loss_clip": 0.06583866, "auxiliary_loss_mlp": 0.01282689, "balance_loss_clip": 0.06299572, "balance_loss_mlp": 0.01260111, "epoch": 0.16810461445964225, "flos": 18080695797120.0, "grad_norm": 1.9124950068817004, "language_loss": 0.84891903, "learning_rate": 3.803239270572142e-06, "loss": 0.92758465, "num_input_tokens_seen": 60609235, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.22570801, "step": 2796, "time_per_iteration": 2.5479891300201416 }, { "auxiliary_loss_clip": 0.06599239, "auxiliary_loss_mlp": 0.01284898, "balance_loss_clip": 0.06305602, "balance_loss_mlp": 0.01259303, "epoch": 0.16816473771231025, "flos": 23885488373760.0, "grad_norm": 1.9057206872358285, "language_loss": 0.81853652, "learning_rate": 3.8030707825980838e-06, "loss": 0.89737785, "num_input_tokens_seen": 60629880, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.25585938, "step": 2797, "time_per_iteration": 2.608950614929199 }, { "auxiliary_loss_clip": 0.06581121, "auxiliary_loss_mlp": 0.01285687, "balance_loss_clip": 0.06305031, "balance_loss_mlp": 0.01264289, "epoch": 0.1682248609649782, "flos": 22790163801600.0, "grad_norm": 1.3968917855891816, "language_loss": 0.75649953, "learning_rate": 3.802902226251401e-06, "loss": 0.83516765, "num_input_tokens_seen": 60651175, "router_z_loss_clip": 2.76367188, "router_z_loss_mlp": 0.21398926, "step": 2798, "time_per_iteration": 2.6254189014434814 }, { "auxiliary_loss_clip": 0.06591804, "auxiliary_loss_mlp": 0.01286637, "balance_loss_clip": 0.06309825, "balance_loss_mlp": 0.01263952, "epoch": 0.16828498421764618, "flos": 20711545966080.0, "grad_norm": 1.69773162864149, "language_loss": 0.80631369, "learning_rate": 3.8027336015384845e-06, "loss": 0.88509816, "num_input_tokens_seen": 60670210, "router_z_loss_clip": 2.81835938, "router_z_loss_mlp": 0.22680664, "step": 2799, "time_per_iteration": 2.5779192447662354 }, { "auxiliary_loss_clip": 0.06603397, "auxiliary_loss_mlp": 0.01285188, "balance_loss_clip": 0.06319581, "balance_loss_mlp": 0.01262395, "epoch": 0.16834510747031414, "flos": 29427714581760.0, "grad_norm": 2.256707312133866, "language_loss": 0.71208614, "learning_rate": 3.8025649084657296e-06, "loss": 0.79097199, "num_input_tokens_seen": 60690895, "router_z_loss_clip": 2.84179688, "router_z_loss_mlp": 0.22802734, "step": 2800, "time_per_iteration": 2.6470723152160645 }, { "auxiliary_loss_clip": 0.06586856, "auxiliary_loss_mlp": 0.01282559, "balance_loss_clip": 0.06305306, "balance_loss_mlp": 0.01257227, "epoch": 0.1684052307229821, "flos": 18150407994240.0, "grad_norm": 2.1154680746675627, "language_loss": 0.84259951, "learning_rate": 3.8023961470395326e-06, "loss": 0.92129362, "num_input_tokens_seen": 60708280, "router_z_loss_clip": 2.81054688, "router_z_loss_mlp": 0.25366211, "step": 2801, "time_per_iteration": 2.5618093013763428 }, { "auxiliary_loss_clip": 0.0658918, "auxiliary_loss_mlp": 0.01281425, "balance_loss_clip": 0.06305625, "balance_loss_mlp": 0.01257416, "epoch": 0.16846535397565007, "flos": 16579439320320.0, "grad_norm": 3.1868807590689916, "language_loss": 0.83712494, "learning_rate": 3.8022273172662933e-06, "loss": 0.91583097, "num_input_tokens_seen": 60724150, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.2401123, "step": 2802, "time_per_iteration": 2.621237277984619 }, { "auxiliary_loss_clip": 0.06588113, "auxiliary_loss_mlp": 0.0127955, "balance_loss_clip": 0.06303762, "balance_loss_mlp": 0.01255196, "epoch": 0.16852547722831807, "flos": 30416667995520.0, "grad_norm": 1.783692503383837, "language_loss": 0.81591618, "learning_rate": 3.802058419152413e-06, "loss": 0.89459276, "num_input_tokens_seen": 60746485, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.24328613, "step": 2803, "time_per_iteration": 2.665057897567749 }, { "auxiliary_loss_clip": 0.06585726, "auxiliary_loss_mlp": 0.01282061, "balance_loss_clip": 0.06305395, "balance_loss_mlp": 0.01258267, "epoch": 0.16858560048098603, "flos": 33515279982720.0, "grad_norm": 2.5032946500125934, "language_loss": 0.77377582, "learning_rate": 3.801889452704297e-06, "loss": 0.85245365, "num_input_tokens_seen": 60762875, "router_z_loss_clip": 2.8046875, "router_z_loss_mlp": 0.23803711, "step": 2804, "time_per_iteration": 2.651336908340454 }, { "auxiliary_loss_clip": 0.06454368, "auxiliary_loss_mlp": 0.01277363, "balance_loss_clip": 0.06296547, "balance_loss_mlp": 0.01269877, "epoch": 0.168645723733654, "flos": 67390845793920.0, "grad_norm": 0.8006220708999812, "language_loss": 0.55464768, "learning_rate": 3.8017204179283526e-06, "loss": 0.63196504, "num_input_tokens_seen": 60825510, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.07470703, "step": 2805, "time_per_iteration": 3.152416229248047 }, { "auxiliary_loss_clip": 0.06583509, "auxiliary_loss_mlp": 0.01280192, "balance_loss_clip": 0.06308034, "balance_loss_mlp": 0.01257495, "epoch": 0.16870584698632196, "flos": 21331016801280.0, "grad_norm": 1.9497393294779608, "language_loss": 0.73864007, "learning_rate": 3.8015513148309892e-06, "loss": 0.81727713, "num_input_tokens_seen": 60844440, "router_z_loss_clip": 2.75390625, "router_z_loss_mlp": 0.22692871, "step": 2806, "time_per_iteration": 2.578082323074341 }, { "auxiliary_loss_clip": 0.06580403, "auxiliary_loss_mlp": 0.01281295, "balance_loss_clip": 0.06302486, "balance_loss_mlp": 0.01258621, "epoch": 0.16876597023898993, "flos": 20747030970240.0, "grad_norm": 2.0145454617199157, "language_loss": 0.70736009, "learning_rate": 3.80138214341862e-06, "loss": 0.78597707, "num_input_tokens_seen": 60863210, "router_z_loss_clip": 2.78125, "router_z_loss_mlp": 0.22692871, "step": 2807, "time_per_iteration": 2.6292715072631836 }, { "auxiliary_loss_clip": 0.06582195, "auxiliary_loss_mlp": 0.01280573, "balance_loss_clip": 0.06299719, "balance_loss_mlp": 0.01256183, "epoch": 0.1688260934916579, "flos": 20309806765440.0, "grad_norm": 5.83597846774363, "language_loss": 0.71288335, "learning_rate": 3.8012129036976587e-06, "loss": 0.79151106, "num_input_tokens_seen": 60882510, "router_z_loss_clip": 2.82617188, "router_z_loss_mlp": 0.24389648, "step": 2808, "time_per_iteration": 2.5833864212036133 }, { "auxiliary_loss_clip": 0.06593136, "auxiliary_loss_mlp": 0.01280096, "balance_loss_clip": 0.06303813, "balance_loss_mlp": 0.01255623, "epoch": 0.16888621674432586, "flos": 20347136559360.0, "grad_norm": 2.5497062401667, "language_loss": 0.80280995, "learning_rate": 3.8010435956745236e-06, "loss": 0.88154233, "num_input_tokens_seen": 60901105, "router_z_loss_clip": 2.89453125, "router_z_loss_mlp": 0.24462891, "step": 2809, "time_per_iteration": 2.6217727661132812 }, { "auxiliary_loss_clip": 0.06592437, "auxiliary_loss_mlp": 0.01278287, "balance_loss_clip": 0.06303529, "balance_loss_mlp": 0.01254266, "epoch": 0.16894633999699385, "flos": 16248963617280.0, "grad_norm": 2.4192016604539157, "language_loss": 0.88697726, "learning_rate": 3.8008742193556358e-06, "loss": 0.96568447, "num_input_tokens_seen": 60915340, "router_z_loss_clip": 2.88671875, "router_z_loss_mlp": 0.2401123, "step": 2810, "time_per_iteration": 2.5493710041046143 }, { "auxiliary_loss_clip": 0.06593819, "auxiliary_loss_mlp": 0.01290007, "balance_loss_clip": 0.06306688, "balance_loss_mlp": 0.01263841, "epoch": 0.16900646324966181, "flos": 19616347175040.0, "grad_norm": 3.999441705090632, "language_loss": 0.92835152, "learning_rate": 3.800704774747416e-06, "loss": 1.00718987, "num_input_tokens_seen": 60933735, "router_z_loss_clip": 2.87109375, "router_z_loss_mlp": 0.26171875, "step": 2811, "time_per_iteration": 2.5667202472686768 }, { "auxiliary_loss_clip": 0.0658356, "auxiliary_loss_mlp": 0.0128752, "balance_loss_clip": 0.06297669, "balance_loss_mlp": 0.01263368, "epoch": 0.16906658650232978, "flos": 22024644099840.0, "grad_norm": 2.453174648731671, "language_loss": 0.79939735, "learning_rate": 3.800535261856291e-06, "loss": 0.87810814, "num_input_tokens_seen": 60953105, "router_z_loss_clip": 2.859375, "router_z_loss_mlp": 0.24157715, "step": 2812, "time_per_iteration": 2.582925319671631 }, { "auxiliary_loss_clip": 0.0658536, "auxiliary_loss_mlp": 0.01281853, "balance_loss_clip": 0.06301364, "balance_loss_mlp": 0.01257951, "epoch": 0.16912670975499774, "flos": 11768212131840.0, "grad_norm": 2.7440820175950082, "language_loss": 0.76046646, "learning_rate": 3.8003656806886887e-06, "loss": 0.83913863, "num_input_tokens_seen": 60969150, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.23913574, "step": 2813, "time_per_iteration": 2.559337854385376 }, { "auxiliary_loss_clip": 0.06592312, "auxiliary_loss_mlp": 0.01281781, "balance_loss_clip": 0.06304032, "balance_loss_mlp": 0.01259191, "epoch": 0.1691868330076657, "flos": 17166443898240.0, "grad_norm": 4.217232217765603, "language_loss": 0.69955575, "learning_rate": 3.8001960312510396e-06, "loss": 0.77829671, "num_input_tokens_seen": 60982825, "router_z_loss_clip": 2.8828125, "router_z_loss_mlp": 0.22583008, "step": 2814, "time_per_iteration": 2.5470924377441406 }, { "auxiliary_loss_clip": 0.06588061, "auxiliary_loss_mlp": 0.01290177, "balance_loss_clip": 0.06302725, "balance_loss_mlp": 0.01265322, "epoch": 0.16924695626033368, "flos": 22422693720960.0, "grad_norm": 2.672448820325804, "language_loss": 0.63002443, "learning_rate": 3.800026313549776e-06, "loss": 0.70880687, "num_input_tokens_seen": 61000875, "router_z_loss_clip": 2.85742188, "router_z_loss_mlp": 0.24829102, "step": 2815, "time_per_iteration": 2.6028995513916016 }, { "auxiliary_loss_clip": 0.06579466, "auxiliary_loss_mlp": 0.01284336, "balance_loss_clip": 0.06302023, "balance_loss_mlp": 0.01260518, "epoch": 0.16930707951300164, "flos": 25746835772160.0, "grad_norm": 2.8214835491827484, "language_loss": 0.82436299, "learning_rate": 3.7998565275913342e-06, "loss": 0.90300101, "num_input_tokens_seen": 61021940, "router_z_loss_clip": 2.77539062, "router_z_loss_mlp": 0.23815918, "step": 2816, "time_per_iteration": 2.598306655883789 }, { "auxiliary_loss_clip": 0.06589007, "auxiliary_loss_mlp": 0.01280709, "balance_loss_clip": 0.06308231, "balance_loss_mlp": 0.01255544, "epoch": 0.16936720276566963, "flos": 22753588694400.0, "grad_norm": 4.108004476748167, "language_loss": 0.87842584, "learning_rate": 3.799686673382153e-06, "loss": 0.95712304, "num_input_tokens_seen": 61040285, "router_z_loss_clip": 2.8046875, "router_z_loss_mlp": 0.25170898, "step": 2817, "time_per_iteration": 2.604064702987671 }, { "auxiliary_loss_clip": 0.0658396, "auxiliary_loss_mlp": 0.01284551, "balance_loss_clip": 0.06305184, "balance_loss_mlp": 0.01259565, "epoch": 0.1694273260183376, "flos": 19580191338240.0, "grad_norm": 1.7473765526970888, "language_loss": 0.82669318, "learning_rate": 3.799516750928672e-06, "loss": 0.90537822, "num_input_tokens_seen": 61059020, "router_z_loss_clip": 2.7890625, "router_z_loss_mlp": 0.25012207, "step": 2818, "time_per_iteration": 2.55311918258667 }, { "auxiliary_loss_clip": 0.06593108, "auxiliary_loss_mlp": 0.01290718, "balance_loss_clip": 0.06306435, "balance_loss_mlp": 0.01266804, "epoch": 0.16948744927100556, "flos": 12462636044160.0, "grad_norm": 3.3350207222613406, "language_loss": 0.81380916, "learning_rate": 3.799346760237336e-06, "loss": 0.89264739, "num_input_tokens_seen": 61074245, "router_z_loss_clip": 2.8671875, "router_z_loss_mlp": 0.23937988, "step": 2819, "time_per_iteration": 2.575705051422119 }, { "auxiliary_loss_clip": 0.06473879, "auxiliary_loss_mlp": 0.01276301, "balance_loss_clip": 0.06315587, "balance_loss_mlp": 0.01269095, "epoch": 0.16954757252367353, "flos": 71309470164480.0, "grad_norm": 0.9011445241793777, "language_loss": 0.60235721, "learning_rate": 3.7991767013145902e-06, "loss": 0.67985898, "num_input_tokens_seen": 61127080, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.07183838, "step": 2820, "time_per_iteration": 3.0858023166656494 }, { "auxiliary_loss_clip": 0.0659028, "auxiliary_loss_mlp": 0.01282246, "balance_loss_clip": 0.06305803, "balance_loss_mlp": 0.01257522, "epoch": 0.1696076957763415, "flos": 29614237770240.0, "grad_norm": 2.086009009741077, "language_loss": 0.797032, "learning_rate": 3.7990065741668844e-06, "loss": 0.87575728, "num_input_tokens_seen": 61146955, "router_z_loss_clip": 2.84570312, "router_z_loss_mlp": 0.24731445, "step": 2821, "time_per_iteration": 4.080414056777954 }, { "auxiliary_loss_clip": 0.06591958, "auxiliary_loss_mlp": 0.01287931, "balance_loss_clip": 0.06307516, "balance_loss_mlp": 0.01261717, "epoch": 0.16966781902900946, "flos": 24395359668480.0, "grad_norm": 1.8200395007398544, "language_loss": 0.79536641, "learning_rate": 3.7988363788006685e-06, "loss": 0.8741653, "num_input_tokens_seen": 61166605, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.26220703, "step": 2822, "time_per_iteration": 2.6595821380615234 }, { "auxiliary_loss_clip": 0.06578845, "auxiliary_loss_mlp": 0.01282412, "balance_loss_clip": 0.06300627, "balance_loss_mlp": 0.0125739, "epoch": 0.16972794228167745, "flos": 23045392938240.0, "grad_norm": 2.0006584602676574, "language_loss": 0.75348401, "learning_rate": 3.7986661152223967e-06, "loss": 0.83209658, "num_input_tokens_seen": 61186535, "router_z_loss_clip": 2.78320312, "router_z_loss_mlp": 0.25048828, "step": 2823, "time_per_iteration": 4.068921804428101 }, { "auxiliary_loss_clip": 0.06593618, "auxiliary_loss_mlp": 0.01284607, "balance_loss_clip": 0.06309824, "balance_loss_mlp": 0.01259227, "epoch": 0.16978806553434542, "flos": 35237915746560.0, "grad_norm": 1.9046600250877848, "language_loss": 0.61001635, "learning_rate": 3.7984957834385257e-06, "loss": 0.68879855, "num_input_tokens_seen": 61208965, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.25390625, "step": 2824, "time_per_iteration": 2.7564797401428223 }, { "auxiliary_loss_clip": 0.06590492, "auxiliary_loss_mlp": 0.01284325, "balance_loss_clip": 0.0630842, "balance_loss_mlp": 0.01258123, "epoch": 0.16984818878701338, "flos": 32022366986880.0, "grad_norm": 1.7591645610543418, "language_loss": 0.73364192, "learning_rate": 3.7983253834555144e-06, "loss": 0.81239009, "num_input_tokens_seen": 61230670, "router_z_loss_clip": 2.8203125, "router_z_loss_mlp": 0.26208496, "step": 2825, "time_per_iteration": 2.6733593940734863 }, { "auxiliary_loss_clip": 0.06603675, "auxiliary_loss_mlp": 0.01287662, "balance_loss_clip": 0.06307179, "balance_loss_mlp": 0.01258503, "epoch": 0.16990831203968135, "flos": 22824936046080.0, "grad_norm": 2.195813533731434, "language_loss": 0.86089766, "learning_rate": 3.7981549152798245e-06, "loss": 0.93981099, "num_input_tokens_seen": 61249510, "router_z_loss_clip": 2.96679688, "router_z_loss_mlp": 0.29150391, "step": 2826, "time_per_iteration": 2.588055372238159 }, { "auxiliary_loss_clip": 0.06596129, "auxiliary_loss_mlp": 0.01284008, "balance_loss_clip": 0.06304398, "balance_loss_mlp": 0.01256113, "epoch": 0.1699684352923493, "flos": 23046315333120.0, "grad_norm": 2.3188931194143847, "language_loss": 0.83067, "learning_rate": 3.7979843789179196e-06, "loss": 0.90947139, "num_input_tokens_seen": 61269440, "router_z_loss_clip": 2.9140625, "router_z_loss_mlp": 0.2791748, "step": 2827, "time_per_iteration": 2.5947251319885254 }, { "auxiliary_loss_clip": 0.06597918, "auxiliary_loss_mlp": 0.01288015, "balance_loss_clip": 0.06307127, "balance_loss_mlp": 0.01260048, "epoch": 0.17002855854501728, "flos": 21440532487680.0, "grad_norm": 1.8000008367473657, "language_loss": 0.7436074, "learning_rate": 3.797813774376267e-06, "loss": 0.82246673, "num_input_tokens_seen": 61288195, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.2800293, "step": 2828, "time_per_iteration": 5.464878797531128 }, { "auxiliary_loss_clip": 0.06471334, "auxiliary_loss_mlp": 0.01267675, "balance_loss_clip": 0.06312086, "balance_loss_mlp": 0.01260773, "epoch": 0.17008868179768524, "flos": 71473966928640.0, "grad_norm": 0.7605342478261499, "language_loss": 0.56452358, "learning_rate": 3.797643101661336e-06, "loss": 0.64191365, "num_input_tokens_seen": 61350850, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.06921387, "step": 2829, "time_per_iteration": 3.2566096782684326 }, { "auxiliary_loss_clip": 0.06587525, "auxiliary_loss_mlp": 0.01285003, "balance_loss_clip": 0.0630295, "balance_loss_mlp": 0.01258908, "epoch": 0.17014880505035324, "flos": 24907327315200.0, "grad_norm": 1.904122567172159, "language_loss": 0.84346819, "learning_rate": 3.7974723607795983e-06, "loss": 0.92219353, "num_input_tokens_seen": 61370765, "router_z_loss_clip": 2.84570312, "router_z_loss_mlp": 0.26098633, "step": 2830, "time_per_iteration": 2.6125986576080322 }, { "auxiliary_loss_clip": 0.06592658, "auxiliary_loss_mlp": 0.01286861, "balance_loss_clip": 0.06304396, "balance_loss_mlp": 0.01259121, "epoch": 0.1702089283030212, "flos": 29870263520640.0, "grad_norm": 2.1923280294715566, "language_loss": 0.79392099, "learning_rate": 3.797301551737529e-06, "loss": 0.87271619, "num_input_tokens_seen": 61388935, "router_z_loss_clip": 2.8828125, "router_z_loss_mlp": 0.27758789, "step": 2831, "time_per_iteration": 2.653477907180786 }, { "auxiliary_loss_clip": 0.06592255, "auxiliary_loss_mlp": 0.01289703, "balance_loss_clip": 0.06303892, "balance_loss_mlp": 0.01263322, "epoch": 0.17026905155568917, "flos": 17749171918080.0, "grad_norm": 1.9557636954344795, "language_loss": 0.79930627, "learning_rate": 3.7971306745416044e-06, "loss": 0.87812579, "num_input_tokens_seen": 61407350, "router_z_loss_clip": 2.8828125, "router_z_loss_mlp": 0.26391602, "step": 2832, "time_per_iteration": 2.5538837909698486 }, { "auxiliary_loss_clip": 0.0658903, "auxiliary_loss_mlp": 0.01291836, "balance_loss_clip": 0.06302816, "balance_loss_mlp": 0.01264108, "epoch": 0.17032917480835713, "flos": 23155327895040.0, "grad_norm": 1.8308264741215106, "language_loss": 0.89647031, "learning_rate": 3.7969597291983046e-06, "loss": 0.97527897, "num_input_tokens_seen": 61429010, "router_z_loss_clip": 2.86914062, "router_z_loss_mlp": 0.27734375, "step": 2833, "time_per_iteration": 2.700129508972168 }, { "auxiliary_loss_clip": 0.06586251, "auxiliary_loss_mlp": 0.01293744, "balance_loss_clip": 0.06302163, "balance_loss_mlp": 0.01267184, "epoch": 0.1703892980610251, "flos": 39211940465280.0, "grad_norm": 3.276452676017043, "language_loss": 0.73648953, "learning_rate": 3.7967887157141115e-06, "loss": 0.8152895, "num_input_tokens_seen": 61450040, "router_z_loss_clip": 2.83789062, "router_z_loss_mlp": 0.26599121, "step": 2834, "time_per_iteration": 2.757235288619995 }, { "auxiliary_loss_clip": 0.06600027, "auxiliary_loss_mlp": 0.01290769, "balance_loss_clip": 0.06309012, "balance_loss_mlp": 0.01263709, "epoch": 0.17044942131369306, "flos": 23045728354560.0, "grad_norm": 1.9608134320685748, "language_loss": 0.87151068, "learning_rate": 3.7966176340955106e-06, "loss": 0.95041859, "num_input_tokens_seen": 61468585, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.27075195, "step": 2835, "time_per_iteration": 2.598161220550537 }, { "auxiliary_loss_clip": 0.06597374, "auxiliary_loss_mlp": 0.01296068, "balance_loss_clip": 0.06302858, "balance_loss_mlp": 0.01267553, "epoch": 0.17050954456636103, "flos": 17060533937280.0, "grad_norm": 2.9217604769258787, "language_loss": 0.75327957, "learning_rate": 3.796446484348989e-06, "loss": 0.83221406, "num_input_tokens_seen": 61486330, "router_z_loss_clip": 2.94726562, "router_z_loss_mlp": 0.28515625, "step": 2836, "time_per_iteration": 2.580458402633667 }, { "auxiliary_loss_clip": 0.06595731, "auxiliary_loss_mlp": 0.0129021, "balance_loss_clip": 0.06302872, "balance_loss_mlp": 0.01261707, "epoch": 0.17056966781902902, "flos": 16842634594560.0, "grad_norm": 2.351293448797915, "language_loss": 0.80834883, "learning_rate": 3.796275266481036e-06, "loss": 0.88720822, "num_input_tokens_seen": 61503950, "router_z_loss_clip": 2.9296875, "router_z_loss_mlp": 0.28503418, "step": 2837, "time_per_iteration": 2.564781427383423 }, { "auxiliary_loss_clip": 0.06584309, "auxiliary_loss_mlp": 0.01293208, "balance_loss_clip": 0.06306651, "balance_loss_mlp": 0.01267078, "epoch": 0.17062979107169698, "flos": 17718340815360.0, "grad_norm": 1.7971365019442012, "language_loss": 0.84110022, "learning_rate": 3.7961039804981456e-06, "loss": 0.91987532, "num_input_tokens_seen": 61523550, "router_z_loss_clip": 2.78125, "router_z_loss_mlp": 0.26123047, "step": 2838, "time_per_iteration": 2.578479290008545 }, { "auxiliary_loss_clip": 0.0658372, "auxiliary_loss_mlp": 0.01288209, "balance_loss_clip": 0.06300814, "balance_loss_mlp": 0.01262925, "epoch": 0.17068991432436495, "flos": 22531035450240.0, "grad_norm": 3.1392277840806244, "language_loss": 0.93965626, "learning_rate": 3.795932626406812e-06, "loss": 1.01837552, "num_input_tokens_seen": 61542720, "router_z_loss_clip": 2.83007812, "router_z_loss_mlp": 0.25292969, "step": 2839, "time_per_iteration": 2.6199238300323486 }, { "auxiliary_loss_clip": 0.0659101, "auxiliary_loss_mlp": 0.01289122, "balance_loss_clip": 0.06303986, "balance_loss_mlp": 0.01260286, "epoch": 0.17075003757703291, "flos": 25889698183680.0, "grad_norm": 2.4216319050313615, "language_loss": 0.84631699, "learning_rate": 3.7957612042135336e-06, "loss": 0.92511833, "num_input_tokens_seen": 61563040, "router_z_loss_clip": 2.87109375, "router_z_loss_mlp": 0.28833008, "step": 2840, "time_per_iteration": 2.6135177612304688 }, { "auxiliary_loss_clip": 0.06595434, "auxiliary_loss_mlp": 0.01295184, "balance_loss_clip": 0.06310986, "balance_loss_mlp": 0.01267432, "epoch": 0.17081016082970088, "flos": 20126931229440.0, "grad_norm": 1.8824174064321713, "language_loss": 0.76919818, "learning_rate": 3.79558971392481e-06, "loss": 0.84810436, "num_input_tokens_seen": 61581890, "router_z_loss_clip": 2.84179688, "router_z_loss_mlp": 0.27783203, "step": 2841, "time_per_iteration": 2.6078665256500244 }, { "auxiliary_loss_clip": 0.0659371, "auxiliary_loss_mlp": 0.01287825, "balance_loss_clip": 0.06308879, "balance_loss_mlp": 0.0126142, "epoch": 0.17087028408236885, "flos": 24943441224960.0, "grad_norm": 1.8418891231863028, "language_loss": 0.77185446, "learning_rate": 3.7954181555471443e-06, "loss": 0.85066986, "num_input_tokens_seen": 61602095, "router_z_loss_clip": 2.84960938, "router_z_loss_mlp": 0.26367188, "step": 2842, "time_per_iteration": 2.6307713985443115 }, { "auxiliary_loss_clip": 0.06583681, "auxiliary_loss_mlp": 0.01287605, "balance_loss_clip": 0.06309272, "balance_loss_mlp": 0.01260235, "epoch": 0.17093040733503684, "flos": 19063108592640.0, "grad_norm": 1.8417589579241467, "language_loss": 0.86583626, "learning_rate": 3.795246529087043e-06, "loss": 0.94454914, "num_input_tokens_seen": 61620400, "router_z_loss_clip": 2.74414062, "router_z_loss_mlp": 0.2734375, "step": 2843, "time_per_iteration": 2.5830729007720947 }, { "auxiliary_loss_clip": 0.06592656, "auxiliary_loss_mlp": 0.01285573, "balance_loss_clip": 0.06310514, "balance_loss_mlp": 0.01260956, "epoch": 0.1709905305877048, "flos": 13083993596160.0, "grad_norm": 1.7742479593312366, "language_loss": 0.69365013, "learning_rate": 3.7950748345510126e-06, "loss": 0.77243245, "num_input_tokens_seen": 61637680, "router_z_loss_clip": 2.82226562, "router_z_loss_mlp": 0.24633789, "step": 2844, "time_per_iteration": 2.570035934448242 }, { "auxiliary_loss_clip": 0.06593182, "auxiliary_loss_mlp": 0.01286052, "balance_loss_clip": 0.06308553, "balance_loss_mlp": 0.01260958, "epoch": 0.17105065384037277, "flos": 19215530369280.0, "grad_norm": 2.085395554923278, "language_loss": 0.78957653, "learning_rate": 3.7949030719455646e-06, "loss": 0.86836892, "num_input_tokens_seen": 61655630, "router_z_loss_clip": 2.84960938, "router_z_loss_mlp": 0.25073242, "step": 2845, "time_per_iteration": 2.5686326026916504 }, { "auxiliary_loss_clip": 0.06602698, "auxiliary_loss_mlp": 0.0128286, "balance_loss_clip": 0.06319099, "balance_loss_mlp": 0.01257492, "epoch": 0.17111077709304073, "flos": 18521106456960.0, "grad_norm": 3.2891964390332977, "language_loss": 0.7849201, "learning_rate": 3.7947312412772127e-06, "loss": 0.86377567, "num_input_tokens_seen": 61673475, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.25378418, "step": 2846, "time_per_iteration": 2.5692508220672607 }, { "auxiliary_loss_clip": 0.06591799, "auxiliary_loss_mlp": 0.01287306, "balance_loss_clip": 0.06311939, "balance_loss_mlp": 0.01263047, "epoch": 0.1711709003457087, "flos": 25089699726720.0, "grad_norm": 2.3808861011894185, "language_loss": 0.80757928, "learning_rate": 3.794559342552472e-06, "loss": 0.8863703, "num_input_tokens_seen": 61693370, "router_z_loss_clip": 2.79882812, "router_z_loss_mlp": 0.24267578, "step": 2847, "time_per_iteration": 2.6548376083374023 }, { "auxiliary_loss_clip": 0.06601509, "auxiliary_loss_mlp": 0.01286992, "balance_loss_clip": 0.06314921, "balance_loss_mlp": 0.0126197, "epoch": 0.17123102359837666, "flos": 17572124240640.0, "grad_norm": 2.6889241167024434, "language_loss": 0.87510145, "learning_rate": 3.7943873757778614e-06, "loss": 0.95398641, "num_input_tokens_seen": 61710820, "router_z_loss_clip": 2.86523438, "router_z_loss_mlp": 0.25012207, "step": 2848, "time_per_iteration": 2.5596494674682617 }, { "auxiliary_loss_clip": 0.06596913, "auxiliary_loss_mlp": 0.01301204, "balance_loss_clip": 0.06315379, "balance_loss_mlp": 0.01275336, "epoch": 0.17129114685104463, "flos": 26180244616320.0, "grad_norm": 1.9434292672730502, "language_loss": 0.75315857, "learning_rate": 3.794215340959902e-06, "loss": 0.83213973, "num_input_tokens_seen": 61729855, "router_z_loss_clip": 2.81835938, "router_z_loss_mlp": 0.25891113, "step": 2849, "time_per_iteration": 2.62538743019104 }, { "auxiliary_loss_clip": 0.06478406, "auxiliary_loss_mlp": 0.01261991, "balance_loss_clip": 0.06319973, "balance_loss_mlp": 0.01254273, "epoch": 0.17135127010371262, "flos": 69290696943360.0, "grad_norm": 0.755439137315207, "language_loss": 0.57323897, "learning_rate": 3.7940432381051163e-06, "loss": 0.65064287, "num_input_tokens_seen": 61790290, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.0770874, "step": 2850, "time_per_iteration": 3.2022218704223633 }, { "auxiliary_loss_clip": 0.06592154, "auxiliary_loss_mlp": 0.0127929, "balance_loss_clip": 0.06316843, "balance_loss_mlp": 0.01255615, "epoch": 0.1714113933563806, "flos": 23556857460480.0, "grad_norm": 2.32533624173931, "language_loss": 0.81668454, "learning_rate": 3.793871067220031e-06, "loss": 0.89539897, "num_input_tokens_seen": 61809265, "router_z_loss_clip": 2.75390625, "router_z_loss_mlp": 0.23681641, "step": 2851, "time_per_iteration": 2.6135449409484863 }, { "auxiliary_loss_clip": 0.06598148, "auxiliary_loss_mlp": 0.01285296, "balance_loss_clip": 0.06321833, "balance_loss_mlp": 0.01260882, "epoch": 0.17147151660904855, "flos": 21148854024960.0, "grad_norm": 2.0825579907770964, "language_loss": 0.94405293, "learning_rate": 3.7936988283111764e-06, "loss": 1.02288735, "num_input_tokens_seen": 61828980, "router_z_loss_clip": 2.76367188, "router_z_loss_mlp": 0.24401855, "step": 2852, "time_per_iteration": 2.5816879272460938 }, { "auxiliary_loss_clip": 0.06605224, "auxiliary_loss_mlp": 0.01289029, "balance_loss_clip": 0.06318798, "balance_loss_mlp": 0.01261575, "epoch": 0.17153163986171652, "flos": 18630873705600.0, "grad_norm": 2.059392063318093, "language_loss": 0.70439374, "learning_rate": 3.7935265213850817e-06, "loss": 0.78333634, "num_input_tokens_seen": 61847915, "router_z_loss_clip": 2.859375, "router_z_loss_mlp": 0.27429199, "step": 2853, "time_per_iteration": 2.5954501628875732 }, { "auxiliary_loss_clip": 0.06597768, "auxiliary_loss_mlp": 0.01286667, "balance_loss_clip": 0.06313429, "balance_loss_mlp": 0.01260644, "epoch": 0.17159176311438448, "flos": 18229134504960.0, "grad_norm": 2.510119008251969, "language_loss": 0.6756041, "learning_rate": 3.7933541464482815e-06, "loss": 0.75444847, "num_input_tokens_seen": 61865570, "router_z_loss_clip": 2.84570312, "router_z_loss_mlp": 0.26025391, "step": 2854, "time_per_iteration": 2.681424140930176 }, { "auxiliary_loss_clip": 0.06591872, "auxiliary_loss_mlp": 0.01289858, "balance_loss_clip": 0.06314783, "balance_loss_mlp": 0.01266755, "epoch": 0.17165188636705245, "flos": 20744976545280.0, "grad_norm": 1.7207807563813524, "language_loss": 0.89614582, "learning_rate": 3.7931817035073124e-06, "loss": 0.97496313, "num_input_tokens_seen": 61883340, "router_z_loss_clip": 2.7734375, "router_z_loss_mlp": 0.2310791, "step": 2855, "time_per_iteration": 2.6082212924957275 }, { "auxiliary_loss_clip": 0.06603999, "auxiliary_loss_mlp": 0.01297427, "balance_loss_clip": 0.06322894, "balance_loss_mlp": 0.01270522, "epoch": 0.17171200961972044, "flos": 24906824190720.0, "grad_norm": 2.546944321699345, "language_loss": 0.83930707, "learning_rate": 3.7930091925687134e-06, "loss": 0.91832137, "num_input_tokens_seen": 61900610, "router_z_loss_clip": 2.80859375, "router_z_loss_mlp": 0.26916504, "step": 2856, "time_per_iteration": 2.6457669734954834 }, { "auxiliary_loss_clip": 0.06604557, "auxiliary_loss_mlp": 0.01294894, "balance_loss_clip": 0.06322478, "balance_loss_mlp": 0.01269765, "epoch": 0.1717721328723884, "flos": 20163464409600.0, "grad_norm": 2.1163599835392883, "language_loss": 0.87039745, "learning_rate": 3.792836613639026e-06, "loss": 0.94939202, "num_input_tokens_seen": 61916795, "router_z_loss_clip": 2.8203125, "router_z_loss_mlp": 0.25146484, "step": 2857, "time_per_iteration": 2.58998441696167 }, { "auxiliary_loss_clip": 0.06592583, "auxiliary_loss_mlp": 0.01288866, "balance_loss_clip": 0.06310833, "balance_loss_mlp": 0.01262365, "epoch": 0.17183225612505637, "flos": 23367357452160.0, "grad_norm": 2.1218238817244393, "language_loss": 0.78645504, "learning_rate": 3.7926639667247947e-06, "loss": 0.86526954, "num_input_tokens_seen": 61936665, "router_z_loss_clip": 2.82226562, "router_z_loss_mlp": 0.26513672, "step": 2858, "time_per_iteration": 2.6184048652648926 }, { "auxiliary_loss_clip": 0.06629676, "auxiliary_loss_mlp": 0.01295132, "balance_loss_clip": 0.0633216, "balance_loss_mlp": 0.01268143, "epoch": 0.17189237937772434, "flos": 18120163870080.0, "grad_norm": 2.010867139626068, "language_loss": 0.7784903, "learning_rate": 3.7924912518325663e-06, "loss": 0.85773838, "num_input_tokens_seen": 61954415, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.26989746, "step": 2859, "time_per_iteration": 2.619129180908203 }, { "auxiliary_loss_clip": 0.066029, "auxiliary_loss_mlp": 0.01290879, "balance_loss_clip": 0.06322081, "balance_loss_mlp": 0.01266334, "epoch": 0.1719525026303923, "flos": 23265137070720.0, "grad_norm": 1.7886845145097834, "language_loss": 0.77765048, "learning_rate": 3.7923184689688902e-06, "loss": 0.8565883, "num_input_tokens_seen": 61973940, "router_z_loss_clip": 2.8046875, "router_z_loss_mlp": 0.2454834, "step": 2860, "time_per_iteration": 4.049073934555054 }, { "auxiliary_loss_clip": 0.06606647, "auxiliary_loss_mlp": 0.01298875, "balance_loss_clip": 0.06322468, "balance_loss_mlp": 0.01273376, "epoch": 0.17201262588306027, "flos": 20816156188800.0, "grad_norm": 1.921902413233944, "language_loss": 0.82175934, "learning_rate": 3.792145618140317e-06, "loss": 0.90081453, "num_input_tokens_seen": 61991845, "router_z_loss_clip": 2.84765625, "router_z_loss_mlp": 0.25512695, "step": 2861, "time_per_iteration": 2.585855484008789 }, { "auxiliary_loss_clip": 0.06607929, "auxiliary_loss_mlp": 0.01288531, "balance_loss_clip": 0.06323941, "balance_loss_mlp": 0.01263366, "epoch": 0.17207274913572823, "flos": 20382076512000.0, "grad_norm": 2.3295983901470514, "language_loss": 0.86871421, "learning_rate": 3.7919726993534038e-06, "loss": 0.94767874, "num_input_tokens_seen": 62009395, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.25170898, "step": 2862, "time_per_iteration": 2.614417314529419 }, { "auxiliary_loss_clip": 0.06599614, "auxiliary_loss_mlp": 0.01285998, "balance_loss_clip": 0.06322961, "balance_loss_mlp": 0.01262156, "epoch": 0.17213287238839622, "flos": 26805082112640.0, "grad_norm": 1.8258078165687752, "language_loss": 0.78731954, "learning_rate": 3.7917997126147054e-06, "loss": 0.86617565, "num_input_tokens_seen": 62029005, "router_z_loss_clip": 2.76757812, "router_z_loss_mlp": 0.23864746, "step": 2863, "time_per_iteration": 4.02463173866272 }, { "auxiliary_loss_clip": 0.06606402, "auxiliary_loss_mlp": 0.01296207, "balance_loss_clip": 0.06326783, "balance_loss_mlp": 0.01270839, "epoch": 0.1721929956410642, "flos": 26037927256320.0, "grad_norm": 2.0496431089080303, "language_loss": 0.73182213, "learning_rate": 3.7916266579307823e-06, "loss": 0.81084824, "num_input_tokens_seen": 62048730, "router_z_loss_clip": 2.796875, "router_z_loss_mlp": 0.25366211, "step": 2864, "time_per_iteration": 2.6643948554992676 }, { "auxiliary_loss_clip": 0.0661484, "auxiliary_loss_mlp": 0.01302064, "balance_loss_clip": 0.06331193, "balance_loss_mlp": 0.01275993, "epoch": 0.17225311889373215, "flos": 22279621674240.0, "grad_norm": 1.6415840991639497, "language_loss": 0.73911881, "learning_rate": 3.7914535353081973e-06, "loss": 0.81828785, "num_input_tokens_seen": 62069000, "router_z_loss_clip": 2.83789062, "router_z_loss_mlp": 0.26074219, "step": 2865, "time_per_iteration": 2.6128196716308594 }, { "auxiliary_loss_clip": 0.06605476, "auxiliary_loss_mlp": 0.01297207, "balance_loss_clip": 0.06324138, "balance_loss_mlp": 0.01270779, "epoch": 0.17231324214640012, "flos": 21294106277760.0, "grad_norm": 2.122060687099998, "language_loss": 0.78704321, "learning_rate": 3.7912803447535145e-06, "loss": 0.86607003, "num_input_tokens_seen": 62086750, "router_z_loss_clip": 2.81640625, "router_z_loss_mlp": 0.26452637, "step": 2866, "time_per_iteration": 2.60732102394104 }, { "auxiliary_loss_clip": 0.06611149, "auxiliary_loss_mlp": 0.01290639, "balance_loss_clip": 0.06324057, "balance_loss_mlp": 0.01263304, "epoch": 0.17237336539906808, "flos": 19686520569600.0, "grad_norm": 1.806009391543746, "language_loss": 0.80598807, "learning_rate": 3.7911070862733016e-06, "loss": 0.88500589, "num_input_tokens_seen": 62106240, "router_z_loss_clip": 2.87109375, "router_z_loss_mlp": 0.27319336, "step": 2867, "time_per_iteration": 3.993446111679077 }, { "auxiliary_loss_clip": 0.06597306, "auxiliary_loss_mlp": 0.01293882, "balance_loss_clip": 0.06315336, "balance_loss_mlp": 0.012683, "epoch": 0.17243348865173605, "flos": 17535339498240.0, "grad_norm": 2.5488773731397036, "language_loss": 0.80093145, "learning_rate": 3.7909337598741276e-06, "loss": 0.87984335, "num_input_tokens_seen": 62124895, "router_z_loss_clip": 2.8203125, "router_z_loss_mlp": 0.25598145, "step": 2868, "time_per_iteration": 4.069528102874756 }, { "auxiliary_loss_clip": 0.06605339, "auxiliary_loss_mlp": 0.01295413, "balance_loss_clip": 0.06317406, "balance_loss_mlp": 0.01269581, "epoch": 0.17249361190440402, "flos": 18265751539200.0, "grad_norm": 5.618390597871782, "language_loss": 0.84339893, "learning_rate": 3.7907603655625674e-06, "loss": 0.92240644, "num_input_tokens_seen": 62143510, "router_z_loss_clip": 2.88085938, "router_z_loss_mlp": 0.25854492, "step": 2869, "time_per_iteration": 2.5833122730255127 }, { "auxiliary_loss_clip": 0.06598997, "auxiliary_loss_mlp": 0.01284736, "balance_loss_clip": 0.06315819, "balance_loss_mlp": 0.01258987, "epoch": 0.172553735157072, "flos": 21180020544000.0, "grad_norm": 2.0558660471286836, "language_loss": 0.781708, "learning_rate": 3.7905869033451932e-06, "loss": 0.86054534, "num_input_tokens_seen": 62162285, "router_z_loss_clip": 2.83203125, "router_z_loss_mlp": 0.25769043, "step": 2870, "time_per_iteration": 2.5832550525665283 }, { "auxiliary_loss_clip": 0.06582072, "auxiliary_loss_mlp": 0.01284867, "balance_loss_clip": 0.06307869, "balance_loss_mlp": 0.01262265, "epoch": 0.17261385840973997, "flos": 22279831309440.0, "grad_norm": 1.7211076514879724, "language_loss": 0.77941501, "learning_rate": 3.7904133732285857e-06, "loss": 0.85808444, "num_input_tokens_seen": 62180970, "router_z_loss_clip": 2.73828125, "router_z_loss_mlp": 0.22583008, "step": 2871, "time_per_iteration": 2.585320472717285 }, { "auxiliary_loss_clip": 0.06594387, "auxiliary_loss_mlp": 0.01282978, "balance_loss_clip": 0.06311061, "balance_loss_mlp": 0.0125761, "epoch": 0.17267398166240794, "flos": 27928680238080.0, "grad_norm": 2.2348708092540903, "language_loss": 0.74958283, "learning_rate": 3.7902397752193228e-06, "loss": 0.8283565, "num_input_tokens_seen": 62198965, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.25341797, "step": 2872, "time_per_iteration": 2.6437840461730957 }, { "auxiliary_loss_clip": 0.06579047, "auxiliary_loss_mlp": 0.01284708, "balance_loss_clip": 0.06303184, "balance_loss_mlp": 0.01261712, "epoch": 0.1727341049150759, "flos": 21951661593600.0, "grad_norm": 2.0604222830437875, "language_loss": 0.83399528, "learning_rate": 3.790066109323988e-06, "loss": 0.91263282, "num_input_tokens_seen": 62219890, "router_z_loss_clip": 2.7578125, "router_z_loss_mlp": 0.23010254, "step": 2873, "time_per_iteration": 2.626477003097534 }, { "auxiliary_loss_clip": 0.06586813, "auxiliary_loss_mlp": 0.01284863, "balance_loss_clip": 0.06308744, "balance_loss_mlp": 0.01259185, "epoch": 0.17279422816774387, "flos": 18112742784000.0, "grad_norm": 3.0246499142679713, "language_loss": 0.75208211, "learning_rate": 3.7898923755491678e-06, "loss": 0.83079886, "num_input_tokens_seen": 62237140, "router_z_loss_clip": 2.77929688, "router_z_loss_mlp": 0.25671387, "step": 2874, "time_per_iteration": 2.587806224822998 }, { "auxiliary_loss_clip": 0.06595424, "auxiliary_loss_mlp": 0.01284467, "balance_loss_clip": 0.06311328, "balance_loss_mlp": 0.0125786, "epoch": 0.17285435142041183, "flos": 21841936272000.0, "grad_norm": 2.403683918263097, "language_loss": 0.81564546, "learning_rate": 3.7897185739014487e-06, "loss": 0.89444435, "num_input_tokens_seen": 62255405, "router_z_loss_clip": 2.83789062, "router_z_loss_mlp": 0.26611328, "step": 2875, "time_per_iteration": 2.586859941482544 }, { "auxiliary_loss_clip": 0.06591827, "auxiliary_loss_mlp": 0.01285464, "balance_loss_clip": 0.06309834, "balance_loss_mlp": 0.01258213, "epoch": 0.17291447467307983, "flos": 18374219049600.0, "grad_norm": 3.1932664874079753, "language_loss": 0.89004451, "learning_rate": 3.7895447043874217e-06, "loss": 0.96881747, "num_input_tokens_seen": 62271280, "router_z_loss_clip": 2.82421875, "router_z_loss_mlp": 0.27246094, "step": 2876, "time_per_iteration": 2.5397756099700928 }, { "auxiliary_loss_clip": 0.06586965, "auxiliary_loss_mlp": 0.01286624, "balance_loss_clip": 0.06310652, "balance_loss_mlp": 0.01262329, "epoch": 0.1729745979257478, "flos": 18630580216320.0, "grad_norm": 2.844921905085775, "language_loss": 0.8537541, "learning_rate": 3.789370767013681e-06, "loss": 0.93248999, "num_input_tokens_seen": 62289140, "router_z_loss_clip": 2.76367188, "router_z_loss_mlp": 0.24291992, "step": 2877, "time_per_iteration": 2.5549545288085938 }, { "auxiliary_loss_clip": 0.06590667, "auxiliary_loss_mlp": 0.0128142, "balance_loss_clip": 0.06309815, "balance_loss_mlp": 0.01256291, "epoch": 0.17303472117841576, "flos": 23004122002560.0, "grad_norm": 2.1713199105657557, "language_loss": 0.79948634, "learning_rate": 3.7891967617868204e-06, "loss": 0.87820721, "num_input_tokens_seen": 62307490, "router_z_loss_clip": 2.80859375, "router_z_loss_mlp": 0.25146484, "step": 2878, "time_per_iteration": 2.5992984771728516 }, { "auxiliary_loss_clip": 0.06583587, "auxiliary_loss_mlp": 0.01280223, "balance_loss_clip": 0.06305645, "balance_loss_mlp": 0.01255034, "epoch": 0.17309484443108372, "flos": 25671169935360.0, "grad_norm": 1.8455822480941126, "language_loss": 0.7154249, "learning_rate": 3.78902268871344e-06, "loss": 0.79406297, "num_input_tokens_seen": 62328570, "router_z_loss_clip": 2.78320312, "router_z_loss_mlp": 0.25195312, "step": 2879, "time_per_iteration": 2.6216084957122803 }, { "auxiliary_loss_clip": 0.06585556, "auxiliary_loss_mlp": 0.01282071, "balance_loss_clip": 0.06304365, "balance_loss_mlp": 0.01256453, "epoch": 0.1731549676837517, "flos": 13557960616320.0, "grad_norm": 2.443336669643163, "language_loss": 0.84388685, "learning_rate": 3.78884854780014e-06, "loss": 0.9225632, "num_input_tokens_seen": 62345735, "router_z_loss_clip": 2.81445312, "router_z_loss_mlp": 0.25610352, "step": 2880, "time_per_iteration": 2.556252956390381 }, { "auxiliary_loss_clip": 0.06595, "auxiliary_loss_mlp": 0.01284027, "balance_loss_clip": 0.06311958, "balance_loss_mlp": 0.01258707, "epoch": 0.17321509093641965, "flos": 22863733286400.0, "grad_norm": 5.005606900754332, "language_loss": 0.82406116, "learning_rate": 3.7886743390535236e-06, "loss": 0.9028514, "num_input_tokens_seen": 62365525, "router_z_loss_clip": 2.83007812, "router_z_loss_mlp": 0.25317383, "step": 2881, "time_per_iteration": 2.7062482833862305 }, { "auxiliary_loss_clip": 0.06591585, "auxiliary_loss_mlp": 0.0128477, "balance_loss_clip": 0.06312501, "balance_loss_mlp": 0.01258723, "epoch": 0.17327521418908762, "flos": 24359665029120.0, "grad_norm": 1.724912140095326, "language_loss": 0.77739704, "learning_rate": 3.788500062480197e-06, "loss": 0.85616064, "num_input_tokens_seen": 62385160, "router_z_loss_clip": 2.79101562, "router_z_loss_mlp": 0.26049805, "step": 2882, "time_per_iteration": 2.7033581733703613 }, { "auxiliary_loss_clip": 0.06584686, "auxiliary_loss_mlp": 0.01283828, "balance_loss_clip": 0.06310899, "balance_loss_mlp": 0.01261047, "epoch": 0.1733353374417556, "flos": 33113373073920.0, "grad_norm": 1.8746805443169459, "language_loss": 0.76901275, "learning_rate": 3.788325718086769e-06, "loss": 0.84769791, "num_input_tokens_seen": 62405280, "router_z_loss_clip": 2.73632812, "router_z_loss_mlp": 0.2277832, "step": 2883, "time_per_iteration": 2.707132577896118 }, { "auxiliary_loss_clip": 0.06580157, "auxiliary_loss_mlp": 0.01279333, "balance_loss_clip": 0.0630409, "balance_loss_mlp": 0.01256076, "epoch": 0.17339546069442358, "flos": 24395778938880.0, "grad_norm": 1.9503699286433562, "language_loss": 0.86630893, "learning_rate": 3.7881513058798503e-06, "loss": 0.94490379, "num_input_tokens_seen": 62423665, "router_z_loss_clip": 2.7578125, "router_z_loss_mlp": 0.23266602, "step": 2884, "time_per_iteration": 2.612090587615967 }, { "auxiliary_loss_clip": 0.06590074, "auxiliary_loss_mlp": 0.01283258, "balance_loss_clip": 0.06311445, "balance_loss_mlp": 0.012595, "epoch": 0.17345558394709154, "flos": 27461589252480.0, "grad_norm": 1.7808949687172908, "language_loss": 0.7507984, "learning_rate": 3.787976825866055e-06, "loss": 0.82953173, "num_input_tokens_seen": 62445170, "router_z_loss_clip": 2.7890625, "router_z_loss_mlp": 0.23754883, "step": 2885, "time_per_iteration": 2.6505095958709717 }, { "auxiliary_loss_clip": 0.06576861, "auxiliary_loss_mlp": 0.01283883, "balance_loss_clip": 0.06305946, "balance_loss_mlp": 0.01259934, "epoch": 0.1735157071997595, "flos": 24689260264320.0, "grad_norm": 1.9907298355473484, "language_loss": 0.71639907, "learning_rate": 3.7878022780519998e-06, "loss": 0.79500651, "num_input_tokens_seen": 62466135, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.23950195, "step": 2886, "time_per_iteration": 2.66995906829834 }, { "auxiliary_loss_clip": 0.06585108, "auxiliary_loss_mlp": 0.01280728, "balance_loss_clip": 0.06307538, "balance_loss_mlp": 0.01257637, "epoch": 0.17357583045242747, "flos": 21695300426880.0, "grad_norm": 3.0238034227961323, "language_loss": 0.69925594, "learning_rate": 3.7876276624443024e-06, "loss": 0.77791429, "num_input_tokens_seen": 62483910, "router_z_loss_clip": 2.77539062, "router_z_loss_mlp": 0.23083496, "step": 2887, "time_per_iteration": 2.615978956222534 }, { "auxiliary_loss_clip": 0.06580704, "auxiliary_loss_mlp": 0.01286144, "balance_loss_clip": 0.06303519, "balance_loss_mlp": 0.01263518, "epoch": 0.17363595370509544, "flos": 15380846190720.0, "grad_norm": 2.024832582823287, "language_loss": 0.85644293, "learning_rate": 3.787452979049585e-06, "loss": 0.9351114, "num_input_tokens_seen": 62501530, "router_z_loss_clip": 2.77148438, "router_z_loss_mlp": 0.22619629, "step": 2888, "time_per_iteration": 2.58100962638855 }, { "auxiliary_loss_clip": 0.06586316, "auxiliary_loss_mlp": 0.01287471, "balance_loss_clip": 0.06309387, "balance_loss_mlp": 0.01263463, "epoch": 0.1736960769577634, "flos": 23447719117440.0, "grad_norm": 2.0027808025835374, "language_loss": 0.80020881, "learning_rate": 3.7872782278744718e-06, "loss": 0.87894666, "num_input_tokens_seen": 62521295, "router_z_loss_clip": 2.77539062, "router_z_loss_mlp": 0.2401123, "step": 2889, "time_per_iteration": 2.6174864768981934 }, { "auxiliary_loss_clip": 0.06574136, "auxiliary_loss_mlp": 0.01283552, "balance_loss_clip": 0.06305757, "balance_loss_mlp": 0.01259471, "epoch": 0.1737562002104314, "flos": 18593711619840.0, "grad_norm": 2.210435393313451, "language_loss": 0.84789312, "learning_rate": 3.7871034089255883e-06, "loss": 0.92646992, "num_input_tokens_seen": 62539615, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.24072266, "step": 2890, "time_per_iteration": 2.5838615894317627 }, { "auxiliary_loss_clip": 0.06575635, "auxiliary_loss_mlp": 0.01282805, "balance_loss_clip": 0.06299696, "balance_loss_mlp": 0.01258606, "epoch": 0.17381632346309936, "flos": 16003629262080.0, "grad_norm": 1.9222155630441886, "language_loss": 0.828565, "learning_rate": 3.7869285222095653e-06, "loss": 0.90714943, "num_input_tokens_seen": 62556820, "router_z_loss_clip": 2.76171875, "router_z_loss_mlp": 0.24206543, "step": 2891, "time_per_iteration": 2.539716958999634 }, { "auxiliary_loss_clip": 0.06580164, "auxiliary_loss_mlp": 0.01286808, "balance_loss_clip": 0.06299774, "balance_loss_mlp": 0.0126299, "epoch": 0.17387644671576732, "flos": 13374749664000.0, "grad_norm": 2.339951579671826, "language_loss": 0.82283539, "learning_rate": 3.7867535677330334e-06, "loss": 0.90150511, "num_input_tokens_seen": 62572450, "router_z_loss_clip": 2.80664062, "router_z_loss_mlp": 0.23791504, "step": 2892, "time_per_iteration": 2.578162908554077 }, { "auxiliary_loss_clip": 0.06589254, "auxiliary_loss_mlp": 0.01288169, "balance_loss_clip": 0.06308119, "balance_loss_mlp": 0.01262468, "epoch": 0.1739365699684353, "flos": 26622877409280.0, "grad_norm": 2.072532833581574, "language_loss": 0.75458276, "learning_rate": 3.786578545502627e-06, "loss": 0.83335698, "num_input_tokens_seen": 62592580, "router_z_loss_clip": 2.81054688, "router_z_loss_mlp": 0.25708008, "step": 2893, "time_per_iteration": 2.64274001121521 }, { "auxiliary_loss_clip": 0.06578118, "auxiliary_loss_mlp": 0.01282252, "balance_loss_clip": 0.06299041, "balance_loss_mlp": 0.01257195, "epoch": 0.17399669322110325, "flos": 23374736611200.0, "grad_norm": 1.9174452659561334, "language_loss": 0.83278084, "learning_rate": 3.7864034555249828e-06, "loss": 0.91138458, "num_input_tokens_seen": 62611220, "router_z_loss_clip": 2.79492188, "router_z_loss_mlp": 0.25073242, "step": 2894, "time_per_iteration": 2.6083853244781494 }, { "auxiliary_loss_clip": 0.06582405, "auxiliary_loss_mlp": 0.01289912, "balance_loss_clip": 0.06302546, "balance_loss_mlp": 0.01264401, "epoch": 0.17405681647377122, "flos": 22060590301440.0, "grad_norm": 2.299298537591756, "language_loss": 0.75210178, "learning_rate": 3.786228297806741e-06, "loss": 0.83082491, "num_input_tokens_seen": 62629185, "router_z_loss_clip": 2.79882812, "router_z_loss_mlp": 0.25500488, "step": 2895, "time_per_iteration": 2.5894687175750732 }, { "auxiliary_loss_clip": 0.06479482, "auxiliary_loss_mlp": 0.01260012, "balance_loss_clip": 0.06318798, "balance_loss_mlp": 0.01252604, "epoch": 0.1741169397264392, "flos": 61476537530880.0, "grad_norm": 0.8194364787159105, "language_loss": 0.62699997, "learning_rate": 3.7860530723545435e-06, "loss": 0.70439494, "num_input_tokens_seen": 62691895, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.07391357, "step": 2896, "time_per_iteration": 3.260850191116333 }, { "auxiliary_loss_clip": 0.06579271, "auxiliary_loss_mlp": 0.01286922, "balance_loss_clip": 0.063018, "balance_loss_mlp": 0.01262484, "epoch": 0.17417706297910718, "flos": 27025245515520.0, "grad_norm": 3.0505280435407545, "language_loss": 0.7659421, "learning_rate": 3.785877779175034e-06, "loss": 0.84460402, "num_input_tokens_seen": 62713790, "router_z_loss_clip": 2.7734375, "router_z_loss_mlp": 0.24462891, "step": 2897, "time_per_iteration": 2.6435751914978027 }, { "auxiliary_loss_clip": 0.06568733, "auxiliary_loss_mlp": 0.01281183, "balance_loss_clip": 0.06295142, "balance_loss_mlp": 0.01256828, "epoch": 0.17423718623177514, "flos": 33516957064320.0, "grad_norm": 2.459569230452225, "language_loss": 0.6993742, "learning_rate": 3.7857024182748606e-06, "loss": 0.7778734, "num_input_tokens_seen": 62736285, "router_z_loss_clip": 2.734375, "router_z_loss_mlp": 0.24377441, "step": 2898, "time_per_iteration": 2.700194835662842 }, { "auxiliary_loss_clip": 0.06590258, "auxiliary_loss_mlp": 0.01283524, "balance_loss_clip": 0.06303807, "balance_loss_mlp": 0.01260099, "epoch": 0.1742973094844431, "flos": 27205982772480.0, "grad_norm": 15.175144578976923, "language_loss": 0.78010726, "learning_rate": 3.7855269896606717e-06, "loss": 0.85884511, "num_input_tokens_seen": 62756240, "router_z_loss_clip": 2.86914062, "router_z_loss_mlp": 0.23413086, "step": 2899, "time_per_iteration": 2.6369526386260986 }, { "auxiliary_loss_clip": 0.06569845, "auxiliary_loss_mlp": 0.01288329, "balance_loss_clip": 0.06300715, "balance_loss_mlp": 0.01265262, "epoch": 0.17435743273711107, "flos": 22717307076480.0, "grad_norm": 2.142098753798532, "language_loss": 0.73256433, "learning_rate": 3.785351493339121e-06, "loss": 0.81114608, "num_input_tokens_seen": 62775910, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.23083496, "step": 2900, "time_per_iteration": 4.084202766418457 }, { "auxiliary_loss_clip": 0.06571554, "auxiliary_loss_mlp": 0.01283908, "balance_loss_clip": 0.06297609, "balance_loss_mlp": 0.01260269, "epoch": 0.17441755598977904, "flos": 41656141664640.0, "grad_norm": 1.5265132139643411, "language_loss": 0.70525002, "learning_rate": 3.785175929316863e-06, "loss": 0.7838046, "num_input_tokens_seen": 62799385, "router_z_loss_clip": 2.73828125, "router_z_loss_mlp": 0.23632812, "step": 2901, "time_per_iteration": 2.859034299850464 }, { "auxiliary_loss_clip": 0.06579003, "auxiliary_loss_mlp": 0.01285018, "balance_loss_clip": 0.06301558, "balance_loss_mlp": 0.01261343, "epoch": 0.174477679242447, "flos": 26294372277120.0, "grad_norm": 1.8091174226316524, "language_loss": 0.76879811, "learning_rate": 3.7850002976005543e-06, "loss": 0.84743828, "num_input_tokens_seen": 62819380, "router_z_loss_clip": 2.77539062, "router_z_loss_mlp": 0.23693848, "step": 2902, "time_per_iteration": 4.103576421737671 }, { "auxiliary_loss_clip": 0.06572604, "auxiliary_loss_mlp": 0.01285969, "balance_loss_clip": 0.06296843, "balance_loss_mlp": 0.01262783, "epoch": 0.174537802495115, "flos": 17864221973760.0, "grad_norm": 2.111189004180583, "language_loss": 0.82377291, "learning_rate": 3.7848245981968558e-06, "loss": 0.90235865, "num_input_tokens_seen": 62836205, "router_z_loss_clip": 2.7578125, "router_z_loss_mlp": 0.23181152, "step": 2903, "time_per_iteration": 2.546605348587036 }, { "auxiliary_loss_clip": 0.06568789, "auxiliary_loss_mlp": 0.0128969, "balance_loss_clip": 0.06297918, "balance_loss_mlp": 0.01265419, "epoch": 0.17459792574778296, "flos": 16945441954560.0, "grad_norm": 2.2042275371940008, "language_loss": 0.7485109, "learning_rate": 3.784648831112429e-06, "loss": 0.82709575, "num_input_tokens_seen": 62854045, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.24255371, "step": 2904, "time_per_iteration": 2.5724611282348633 }, { "auxiliary_loss_clip": 0.06567978, "auxiliary_loss_mlp": 0.01284559, "balance_loss_clip": 0.06294218, "balance_loss_mlp": 0.01260074, "epoch": 0.17465804900045093, "flos": 25527049712640.0, "grad_norm": 1.9279299410816209, "language_loss": 0.64269578, "learning_rate": 3.7844729963539406e-06, "loss": 0.72122109, "num_input_tokens_seen": 62873075, "router_z_loss_clip": 2.7421875, "router_z_loss_mlp": 0.24499512, "step": 2905, "time_per_iteration": 2.6045501232147217 }, { "auxiliary_loss_clip": 0.06580234, "auxiliary_loss_mlp": 0.01281963, "balance_loss_clip": 0.06294703, "balance_loss_mlp": 0.01257883, "epoch": 0.1747181722531189, "flos": 24135853973760.0, "grad_norm": 1.9554133926792991, "language_loss": 0.80369329, "learning_rate": 3.7842970939280566e-06, "loss": 0.88231528, "num_input_tokens_seen": 62892675, "router_z_loss_clip": 2.85351562, "router_z_loss_mlp": 0.24072266, "step": 2906, "time_per_iteration": 2.5938329696655273 }, { "auxiliary_loss_clip": 0.06574237, "auxiliary_loss_mlp": 0.01285228, "balance_loss_clip": 0.06294893, "balance_loss_mlp": 0.01260146, "epoch": 0.17477829550578686, "flos": 17754580506240.0, "grad_norm": 1.99324990425338, "language_loss": 0.81757271, "learning_rate": 3.784121123841449e-06, "loss": 0.8961674, "num_input_tokens_seen": 62910675, "router_z_loss_clip": 2.79492188, "router_z_loss_mlp": 0.25073242, "step": 2907, "time_per_iteration": 5.442713975906372 }, { "auxiliary_loss_clip": 0.06577054, "auxiliary_loss_mlp": 0.01289765, "balance_loss_clip": 0.06301472, "balance_loss_mlp": 0.0126578, "epoch": 0.17483841875845482, "flos": 15382732907520.0, "grad_norm": 3.8321920052126703, "language_loss": 0.82207847, "learning_rate": 3.7839450861007886e-06, "loss": 0.9007467, "num_input_tokens_seen": 62928130, "router_z_loss_clip": 2.75390625, "router_z_loss_mlp": 0.23986816, "step": 2908, "time_per_iteration": 2.5508573055267334 }, { "auxiliary_loss_clip": 0.06575532, "auxiliary_loss_mlp": 0.0128798, "balance_loss_clip": 0.06298365, "balance_loss_mlp": 0.01263637, "epoch": 0.17489854201112282, "flos": 17168624104320.0, "grad_norm": 2.471875271907969, "language_loss": 0.8119514, "learning_rate": 3.7837689807127518e-06, "loss": 0.8905865, "num_input_tokens_seen": 62944290, "router_z_loss_clip": 2.7734375, "router_z_loss_mlp": 0.24365234, "step": 2909, "time_per_iteration": 2.549501895904541 }, { "auxiliary_loss_clip": 0.06574887, "auxiliary_loss_mlp": 0.01287461, "balance_loss_clip": 0.06296254, "balance_loss_mlp": 0.01262237, "epoch": 0.17495866526379078, "flos": 19761347865600.0, "grad_norm": 2.0286867336205567, "language_loss": 0.77382338, "learning_rate": 3.783592807684017e-06, "loss": 0.85244691, "num_input_tokens_seen": 62963505, "router_z_loss_clip": 2.7890625, "router_z_loss_mlp": 0.2520752, "step": 2910, "time_per_iteration": 2.5694198608398438 }, { "auxiliary_loss_clip": 0.06573681, "auxiliary_loss_mlp": 0.01286311, "balance_loss_clip": 0.06296194, "balance_loss_mlp": 0.01260156, "epoch": 0.17501878851645875, "flos": 28518535854720.0, "grad_norm": 1.8942751862880665, "language_loss": 0.87793922, "learning_rate": 3.7834165670212645e-06, "loss": 0.95653915, "num_input_tokens_seen": 62985020, "router_z_loss_clip": 2.7734375, "router_z_loss_mlp": 0.26159668, "step": 2911, "time_per_iteration": 2.671816825866699 }, { "auxiliary_loss_clip": 0.06569624, "auxiliary_loss_mlp": 0.01284251, "balance_loss_clip": 0.06294904, "balance_loss_mlp": 0.01259754, "epoch": 0.1750789117691267, "flos": 17936994844800.0, "grad_norm": 2.347350709369304, "language_loss": 0.90544116, "learning_rate": 3.7832402587311764e-06, "loss": 0.98397994, "num_input_tokens_seen": 63001745, "router_z_loss_clip": 2.74804688, "router_z_loss_mlp": 0.24499512, "step": 2912, "time_per_iteration": 2.577449321746826 }, { "auxiliary_loss_clip": 0.06577341, "auxiliary_loss_mlp": 0.01290943, "balance_loss_clip": 0.06296869, "balance_loss_mlp": 0.01263716, "epoch": 0.17513903502179468, "flos": 18265248414720.0, "grad_norm": 2.084734605573609, "language_loss": 0.74024457, "learning_rate": 3.783063882820439e-06, "loss": 0.81892741, "num_input_tokens_seen": 63019750, "router_z_loss_clip": 2.80078125, "router_z_loss_mlp": 0.27172852, "step": 2913, "time_per_iteration": 2.677562952041626 }, { "auxiliary_loss_clip": 0.06570071, "auxiliary_loss_mlp": 0.01282696, "balance_loss_clip": 0.06295758, "balance_loss_mlp": 0.01258509, "epoch": 0.17519915827446264, "flos": 20711084768640.0, "grad_norm": 2.2942963999504298, "language_loss": 0.70279396, "learning_rate": 3.782887439295741e-06, "loss": 0.78132159, "num_input_tokens_seen": 63039500, "router_z_loss_clip": 2.74414062, "router_z_loss_mlp": 0.24206543, "step": 2914, "time_per_iteration": 2.586487293243408 }, { "auxiliary_loss_clip": 0.06569919, "auxiliary_loss_mlp": 0.01286671, "balance_loss_clip": 0.06297816, "balance_loss_mlp": 0.01261863, "epoch": 0.1752592815271306, "flos": 20529928241280.0, "grad_norm": 1.6927577921102055, "language_loss": 0.93989015, "learning_rate": 3.782710928163772e-06, "loss": 1.0184561, "num_input_tokens_seen": 63059785, "router_z_loss_clip": 2.72460938, "router_z_loss_mlp": 0.2479248, "step": 2915, "time_per_iteration": 2.5940020084381104 }, { "auxiliary_loss_clip": 0.06576179, "auxiliary_loss_mlp": 0.01282124, "balance_loss_clip": 0.06303805, "balance_loss_mlp": 0.01256935, "epoch": 0.1753194047797986, "flos": 21805696581120.0, "grad_norm": 1.631076786389171, "language_loss": 0.81719792, "learning_rate": 3.782534349431226e-06, "loss": 0.89578092, "num_input_tokens_seen": 63079385, "router_z_loss_clip": 2.72265625, "router_z_loss_mlp": 0.25158691, "step": 2916, "time_per_iteration": 2.5917229652404785 }, { "auxiliary_loss_clip": 0.06571977, "auxiliary_loss_mlp": 0.01283786, "balance_loss_clip": 0.06295365, "balance_loss_mlp": 0.0125911, "epoch": 0.17537952803246656, "flos": 20674719296640.0, "grad_norm": 1.6458246758296187, "language_loss": 0.74556196, "learning_rate": 3.782357703104799e-06, "loss": 0.82411957, "num_input_tokens_seen": 63098970, "router_z_loss_clip": 2.765625, "router_z_loss_mlp": 0.24682617, "step": 2917, "time_per_iteration": 2.5874979496002197 }, { "auxiliary_loss_clip": 0.06573032, "auxiliary_loss_mlp": 0.01282336, "balance_loss_clip": 0.06301966, "balance_loss_mlp": 0.01258994, "epoch": 0.17543965128513453, "flos": 23301837959040.0, "grad_norm": 2.0538848683887947, "language_loss": 0.77829349, "learning_rate": 3.7821809891911897e-06, "loss": 0.85684717, "num_input_tokens_seen": 63118750, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.23339844, "step": 2918, "time_per_iteration": 2.5996530055999756 }, { "auxiliary_loss_clip": 0.06579714, "auxiliary_loss_mlp": 0.01289025, "balance_loss_clip": 0.06297953, "balance_loss_mlp": 0.01264134, "epoch": 0.1754997745378025, "flos": 29103234445440.0, "grad_norm": 2.390619467718924, "language_loss": 0.75149322, "learning_rate": 3.782004207697098e-06, "loss": 0.83018064, "num_input_tokens_seen": 63136865, "router_z_loss_clip": 2.81640625, "router_z_loss_mlp": 0.24890137, "step": 2919, "time_per_iteration": 2.664829730987549 }, { "auxiliary_loss_clip": 0.06578292, "auxiliary_loss_mlp": 0.01285792, "balance_loss_clip": 0.06296217, "balance_loss_mlp": 0.01261259, "epoch": 0.17555989779047046, "flos": 30379547836800.0, "grad_norm": 2.034067919261868, "language_loss": 0.74871165, "learning_rate": 3.781827358629228e-06, "loss": 0.82735252, "num_input_tokens_seen": 63158325, "router_z_loss_clip": 2.8203125, "router_z_loss_mlp": 0.24499512, "step": 2920, "time_per_iteration": 2.6630120277404785 }, { "auxiliary_loss_clip": 0.0656738, "auxiliary_loss_mlp": 0.01285682, "balance_loss_clip": 0.06298083, "balance_loss_mlp": 0.0126283, "epoch": 0.17562002104313842, "flos": 23293284842880.0, "grad_norm": 2.338395255793507, "language_loss": 0.80313551, "learning_rate": 3.7816504419942873e-06, "loss": 0.88166618, "num_input_tokens_seen": 63173115, "router_z_loss_clip": 2.6953125, "router_z_loss_mlp": 0.22839355, "step": 2921, "time_per_iteration": 2.5504560470581055 }, { "auxiliary_loss_clip": 0.06576529, "auxiliary_loss_mlp": 0.01281321, "balance_loss_clip": 0.06296987, "balance_loss_mlp": 0.01256919, "epoch": 0.1756801442958064, "flos": 24797434285440.0, "grad_norm": 3.097675019854769, "language_loss": 0.88358212, "learning_rate": 3.7814734577989823e-06, "loss": 0.96216059, "num_input_tokens_seen": 63192880, "router_z_loss_clip": 2.796875, "router_z_loss_mlp": 0.24401855, "step": 2922, "time_per_iteration": 2.6048707962036133 }, { "auxiliary_loss_clip": 0.06574564, "auxiliary_loss_mlp": 0.01281257, "balance_loss_clip": 0.06295121, "balance_loss_mlp": 0.0125639, "epoch": 0.17574026754847438, "flos": 25778086145280.0, "grad_norm": 2.2948828096653364, "language_loss": 0.62676954, "learning_rate": 3.7812964060500253e-06, "loss": 0.70532775, "num_input_tokens_seen": 63214395, "router_z_loss_clip": 2.79882812, "router_z_loss_mlp": 0.24890137, "step": 2923, "time_per_iteration": 2.6018636226654053 }, { "auxiliary_loss_clip": 0.06581612, "auxiliary_loss_mlp": 0.01285034, "balance_loss_clip": 0.06301795, "balance_loss_mlp": 0.0126068, "epoch": 0.17580039080114235, "flos": 17462273137920.0, "grad_norm": 3.6597539625814757, "language_loss": 0.81851065, "learning_rate": 3.78111928675413e-06, "loss": 0.8971771, "num_input_tokens_seen": 63231020, "router_z_loss_clip": 2.79882812, "router_z_loss_mlp": 0.24353027, "step": 2924, "time_per_iteration": 2.5633602142333984 }, { "auxiliary_loss_clip": 0.06575157, "auxiliary_loss_mlp": 0.01285588, "balance_loss_clip": 0.06292467, "balance_loss_mlp": 0.01258575, "epoch": 0.1758605140538103, "flos": 14869633230720.0, "grad_norm": 2.4202063355465597, "language_loss": 0.71891695, "learning_rate": 3.7809420999180126e-06, "loss": 0.79752445, "num_input_tokens_seen": 63246245, "router_z_loss_clip": 2.82617188, "router_z_loss_mlp": 0.2701416, "step": 2925, "time_per_iteration": 2.539473295211792 }, { "auxiliary_loss_clip": 0.06565076, "auxiliary_loss_mlp": 0.01277475, "balance_loss_clip": 0.06296335, "balance_loss_mlp": 0.01254574, "epoch": 0.17592063730647828, "flos": 23011165745280.0, "grad_norm": 1.6123628638498926, "language_loss": 0.72161448, "learning_rate": 3.7807648455483934e-06, "loss": 0.80003989, "num_input_tokens_seen": 63267790, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.22912598, "step": 2926, "time_per_iteration": 2.646449327468872 }, { "auxiliary_loss_clip": 0.06581371, "auxiliary_loss_mlp": 0.01281972, "balance_loss_clip": 0.06297526, "balance_loss_mlp": 0.01255996, "epoch": 0.17598076055914624, "flos": 20747911438080.0, "grad_norm": 2.1563872156476287, "language_loss": 0.86622727, "learning_rate": 3.7805875236519918e-06, "loss": 0.9448607, "num_input_tokens_seen": 63286830, "router_z_loss_clip": 2.84179688, "router_z_loss_mlp": 0.2598877, "step": 2927, "time_per_iteration": 2.6115870475769043 }, { "auxiliary_loss_clip": 0.06569106, "auxiliary_loss_mlp": 0.01280486, "balance_loss_clip": 0.06297803, "balance_loss_mlp": 0.01257467, "epoch": 0.1760408838118142, "flos": 34100607479040.0, "grad_norm": 1.7639174950942322, "language_loss": 0.72188282, "learning_rate": 3.7804101342355336e-06, "loss": 0.80037868, "num_input_tokens_seen": 63308870, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.23034668, "step": 2928, "time_per_iteration": 2.789346218109131 }, { "auxiliary_loss_clip": 0.0656768, "auxiliary_loss_mlp": 0.01287679, "balance_loss_clip": 0.06298967, "balance_loss_mlp": 0.01263718, "epoch": 0.1761010070644822, "flos": 24174902776320.0, "grad_norm": 2.6486743751260096, "language_loss": 0.84223914, "learning_rate": 3.780232677305744e-06, "loss": 0.9207927, "num_input_tokens_seen": 63329005, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.23974609, "step": 2929, "time_per_iteration": 2.6732053756713867 }, { "auxiliary_loss_clip": 0.06572673, "auxiliary_loss_mlp": 0.01289233, "balance_loss_clip": 0.06296633, "balance_loss_mlp": 0.0126489, "epoch": 0.17616113031715017, "flos": 26583660898560.0, "grad_norm": 1.6910290863703223, "language_loss": 0.79969978, "learning_rate": 3.7800551528693535e-06, "loss": 0.87831885, "num_input_tokens_seen": 63349390, "router_z_loss_clip": 2.75976562, "router_z_loss_mlp": 0.24365234, "step": 2930, "time_per_iteration": 2.6168062686920166 }, { "auxiliary_loss_clip": 0.06577934, "auxiliary_loss_mlp": 0.01291553, "balance_loss_clip": 0.06303322, "balance_loss_mlp": 0.01266149, "epoch": 0.17622125356981813, "flos": 25673853265920.0, "grad_norm": 1.949425121289626, "language_loss": 0.77595741, "learning_rate": 3.7798775609330927e-06, "loss": 0.85465229, "num_input_tokens_seen": 63368835, "router_z_loss_clip": 2.74414062, "router_z_loss_mlp": 0.25427246, "step": 2931, "time_per_iteration": 2.630659818649292 }, { "auxiliary_loss_clip": 0.06567451, "auxiliary_loss_mlp": 0.01290527, "balance_loss_clip": 0.06295471, "balance_loss_mlp": 0.0126734, "epoch": 0.1762813768224861, "flos": 16514129462400.0, "grad_norm": 2.7442511141626733, "language_loss": 0.76240015, "learning_rate": 3.779699901503696e-06, "loss": 0.84097993, "num_input_tokens_seen": 63385220, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.23193359, "step": 2932, "time_per_iteration": 2.597299098968506 }, { "auxiliary_loss_clip": 0.06583011, "auxiliary_loss_mlp": 0.01295002, "balance_loss_clip": 0.06300768, "balance_loss_mlp": 0.01269038, "epoch": 0.17634150007515406, "flos": 11215518600960.0, "grad_norm": 2.07166026418737, "language_loss": 0.90111673, "learning_rate": 3.7795221745879016e-06, "loss": 0.97989678, "num_input_tokens_seen": 63400865, "router_z_loss_clip": 2.82421875, "router_z_loss_mlp": 0.25952148, "step": 2933, "time_per_iteration": 2.568341016769409 }, { "auxiliary_loss_clip": 0.06565055, "auxiliary_loss_mlp": 0.01297647, "balance_loss_clip": 0.0629684, "balance_loss_mlp": 0.01274747, "epoch": 0.17640162332782203, "flos": 23666750490240.0, "grad_norm": 1.682976978172233, "language_loss": 0.88891697, "learning_rate": 3.779344380192448e-06, "loss": 0.96754396, "num_input_tokens_seen": 63421390, "router_z_loss_clip": 2.68164062, "router_z_loss_mlp": 0.22924805, "step": 2934, "time_per_iteration": 2.647782564163208 }, { "auxiliary_loss_clip": 0.06565426, "auxiliary_loss_mlp": 0.01288781, "balance_loss_clip": 0.06297104, "balance_loss_mlp": 0.01265297, "epoch": 0.17646174658049, "flos": 53808819056640.0, "grad_norm": 1.5734447033981056, "language_loss": 0.71286559, "learning_rate": 3.779166518324077e-06, "loss": 0.79140759, "num_input_tokens_seen": 63444715, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.23498535, "step": 2935, "time_per_iteration": 2.892477512359619 }, { "auxiliary_loss_clip": 0.06583835, "auxiliary_loss_mlp": 0.01296259, "balance_loss_clip": 0.0630125, "balance_loss_mlp": 0.01270474, "epoch": 0.17652186983315798, "flos": 24250820175360.0, "grad_norm": 2.5767013933769927, "language_loss": 0.70913076, "learning_rate": 3.7789885889895325e-06, "loss": 0.78793168, "num_input_tokens_seen": 63465525, "router_z_loss_clip": 2.82617188, "router_z_loss_mlp": 0.25805664, "step": 2936, "time_per_iteration": 2.61706280708313 }, { "auxiliary_loss_clip": 0.06573154, "auxiliary_loss_mlp": 0.01290085, "balance_loss_clip": 0.06298952, "balance_loss_mlp": 0.01266851, "epoch": 0.17658199308582595, "flos": 27461715033600.0, "grad_norm": 2.069004536504155, "language_loss": 0.72421145, "learning_rate": 3.7788105921955634e-06, "loss": 0.80284381, "num_input_tokens_seen": 63485815, "router_z_loss_clip": 2.74023438, "router_z_loss_mlp": 0.2322998, "step": 2937, "time_per_iteration": 2.6745684146881104 }, { "auxiliary_loss_clip": 0.06581512, "auxiliary_loss_mlp": 0.01287352, "balance_loss_clip": 0.06301185, "balance_loss_mlp": 0.01262401, "epoch": 0.17664211633849392, "flos": 22425167416320.0, "grad_norm": 2.171382087741982, "language_loss": 0.76306599, "learning_rate": 3.7786325279489184e-06, "loss": 0.84175467, "num_input_tokens_seen": 63503905, "router_z_loss_clip": 2.8046875, "router_z_loss_mlp": 0.24963379, "step": 2938, "time_per_iteration": 2.6119019985198975 }, { "auxiliary_loss_clip": 0.06577297, "auxiliary_loss_mlp": 0.01297955, "balance_loss_clip": 0.06298657, "balance_loss_mlp": 0.01273899, "epoch": 0.17670223959116188, "flos": 24721642667520.0, "grad_norm": 2.3618718299508714, "language_loss": 0.71528327, "learning_rate": 3.7784543962563495e-06, "loss": 0.79403585, "num_input_tokens_seen": 63521985, "router_z_loss_clip": 2.78710938, "router_z_loss_mlp": 0.24060059, "step": 2939, "time_per_iteration": 4.059311389923096 }, { "auxiliary_loss_clip": 0.06567534, "auxiliary_loss_mlp": 0.01286607, "balance_loss_clip": 0.0629691, "balance_loss_mlp": 0.01263397, "epoch": 0.17676236284382985, "flos": 22533383364480.0, "grad_norm": 2.6276779856146115, "language_loss": 0.75055271, "learning_rate": 3.7782761971246115e-06, "loss": 0.82909411, "num_input_tokens_seen": 63539830, "router_z_loss_clip": 2.70507812, "router_z_loss_mlp": 0.23205566, "step": 2940, "time_per_iteration": 2.5992467403411865 }, { "auxiliary_loss_clip": 0.0657551, "auxiliary_loss_mlp": 0.01285631, "balance_loss_clip": 0.06300494, "balance_loss_mlp": 0.01262123, "epoch": 0.1768224860964978, "flos": 12389988954240.0, "grad_norm": 2.3029487743841517, "language_loss": 0.87074387, "learning_rate": 3.7780979305604616e-06, "loss": 0.94935536, "num_input_tokens_seen": 63555495, "router_z_loss_clip": 2.74609375, "router_z_loss_mlp": 0.23522949, "step": 2941, "time_per_iteration": 2.5594239234924316 }, { "auxiliary_loss_clip": 0.06575219, "auxiliary_loss_mlp": 0.01295671, "balance_loss_clip": 0.0630125, "balance_loss_mlp": 0.01271912, "epoch": 0.1768826093491658, "flos": 24360335861760.0, "grad_norm": 3.9127247284169733, "language_loss": 0.77792335, "learning_rate": 3.7779195965706607e-06, "loss": 0.85663217, "num_input_tokens_seen": 63575290, "router_z_loss_clip": 2.7421875, "router_z_loss_mlp": 0.2376709, "step": 2942, "time_per_iteration": 4.037717342376709 }, { "auxiliary_loss_clip": 0.06576265, "auxiliary_loss_mlp": 0.01291324, "balance_loss_clip": 0.06298118, "balance_loss_mlp": 0.01266863, "epoch": 0.17694273260183377, "flos": 23593893765120.0, "grad_norm": 1.9529061665268215, "language_loss": 0.8100217, "learning_rate": 3.77774119516197e-06, "loss": 0.88869762, "num_input_tokens_seen": 63594670, "router_z_loss_clip": 2.77929688, "router_z_loss_mlp": 0.24462891, "step": 2943, "time_per_iteration": 2.602827310562134 }, { "auxiliary_loss_clip": 0.06579427, "auxiliary_loss_mlp": 0.01286639, "balance_loss_clip": 0.06300879, "balance_loss_mlp": 0.01262344, "epoch": 0.17700285585450173, "flos": 26768297370240.0, "grad_norm": 2.0334611212065306, "language_loss": 0.81809789, "learning_rate": 3.777562726341155e-06, "loss": 0.89675856, "num_input_tokens_seen": 63614780, "router_z_loss_clip": 2.78125, "router_z_loss_mlp": 0.24279785, "step": 2944, "time_per_iteration": 2.6301403045654297 }, { "auxiliary_loss_clip": 0.06576176, "auxiliary_loss_mlp": 0.01287361, "balance_loss_clip": 0.06304395, "balance_loss_mlp": 0.0126371, "epoch": 0.1770629791071697, "flos": 42785986919040.0, "grad_norm": 1.9254907499147162, "language_loss": 0.74267095, "learning_rate": 3.7773841901149835e-06, "loss": 0.82130635, "num_input_tokens_seen": 63637190, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.23669434, "step": 2945, "time_per_iteration": 2.7553200721740723 }, { "auxiliary_loss_clip": 0.06569788, "auxiliary_loss_mlp": 0.0128407, "balance_loss_clip": 0.06298173, "balance_loss_mlp": 0.01260025, "epoch": 0.17712310235983766, "flos": 17350954588800.0, "grad_norm": 3.064561289678324, "language_loss": 0.78074199, "learning_rate": 3.7772055864902256e-06, "loss": 0.85928053, "num_input_tokens_seen": 63652140, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.24035645, "step": 2946, "time_per_iteration": 2.5196805000305176 }, { "auxiliary_loss_clip": 0.06570028, "auxiliary_loss_mlp": 0.01286111, "balance_loss_clip": 0.06298494, "balance_loss_mlp": 0.01263688, "epoch": 0.17718322561250563, "flos": 23885278738560.0, "grad_norm": 2.250877461914314, "language_loss": 0.7700265, "learning_rate": 3.7770269154736535e-06, "loss": 0.84858787, "num_input_tokens_seen": 63671700, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.22436523, "step": 2947, "time_per_iteration": 5.567399501800537 }, { "auxiliary_loss_clip": 0.06571156, "auxiliary_loss_mlp": 0.01282646, "balance_loss_clip": 0.06297424, "balance_loss_mlp": 0.01258851, "epoch": 0.1772433488651736, "flos": 36475306116480.0, "grad_norm": 1.957946887588338, "language_loss": 0.74042368, "learning_rate": 3.7768481770720424e-06, "loss": 0.81896162, "num_input_tokens_seen": 63691685, "router_z_loss_clip": 2.73632812, "router_z_loss_mlp": 0.23815918, "step": 2948, "time_per_iteration": 2.7135331630706787 }, { "auxiliary_loss_clip": 0.06571531, "auxiliary_loss_mlp": 0.01278044, "balance_loss_clip": 0.06303978, "balance_loss_mlp": 0.01255132, "epoch": 0.1773034721178416, "flos": 26691457576320.0, "grad_norm": 2.051296731068589, "language_loss": 0.82766765, "learning_rate": 3.776669371292171e-06, "loss": 0.90616345, "num_input_tokens_seen": 63711720, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.22900391, "step": 2949, "time_per_iteration": 2.6329972743988037 }, { "auxiliary_loss_clip": 0.06489453, "auxiliary_loss_mlp": 0.01281996, "balance_loss_clip": 0.06338011, "balance_loss_mlp": 0.01274659, "epoch": 0.17736359537050955, "flos": 57136007053440.0, "grad_norm": 0.740635256420754, "language_loss": 0.65050024, "learning_rate": 3.7764904981408186e-06, "loss": 0.72821474, "num_input_tokens_seen": 63776280, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.07318115, "step": 2950, "time_per_iteration": 3.2704784870147705 }, { "auxiliary_loss_clip": 0.0657433, "auxiliary_loss_mlp": 0.01279227, "balance_loss_clip": 0.06303596, "balance_loss_mlp": 0.0125554, "epoch": 0.17742371862317752, "flos": 27205479648000.0, "grad_norm": 1.7163062086705037, "language_loss": 0.84785342, "learning_rate": 3.7763115576247686e-06, "loss": 0.92638898, "num_input_tokens_seen": 63797535, "router_z_loss_clip": 2.70898438, "router_z_loss_mlp": 0.23693848, "step": 2951, "time_per_iteration": 2.65769624710083 }, { "auxiliary_loss_clip": 0.06581852, "auxiliary_loss_mlp": 0.01277336, "balance_loss_clip": 0.06305576, "balance_loss_mlp": 0.01254865, "epoch": 0.17748384187584548, "flos": 20966020416000.0, "grad_norm": 2.4218511770262636, "language_loss": 0.81625873, "learning_rate": 3.776132549750806e-06, "loss": 0.89485061, "num_input_tokens_seen": 63817045, "router_z_loss_clip": 2.76367188, "router_z_loss_mlp": 0.22473145, "step": 2952, "time_per_iteration": 2.6270592212677 }, { "auxiliary_loss_clip": 0.06579398, "auxiliary_loss_mlp": 0.01280551, "balance_loss_clip": 0.06308106, "balance_loss_mlp": 0.01256769, "epoch": 0.17754396512851345, "flos": 25017052636800.0, "grad_norm": 2.1526553193371156, "language_loss": 0.79706568, "learning_rate": 3.7759534745257194e-06, "loss": 0.87566519, "num_input_tokens_seen": 63837665, "router_z_loss_clip": 2.71289062, "router_z_loss_mlp": 0.23754883, "step": 2953, "time_per_iteration": 2.6308398246765137 }, { "auxiliary_loss_clip": 0.06581727, "auxiliary_loss_mlp": 0.01284325, "balance_loss_clip": 0.06310253, "balance_loss_mlp": 0.01260995, "epoch": 0.1776040883811814, "flos": 32059780634880.0, "grad_norm": 1.8933516377768465, "language_loss": 0.88497895, "learning_rate": 3.7757743319562994e-06, "loss": 0.96363944, "num_input_tokens_seen": 63858455, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.23327637, "step": 2954, "time_per_iteration": 2.7091305255889893 }, { "auxiliary_loss_clip": 0.06574754, "auxiliary_loss_mlp": 0.01284341, "balance_loss_clip": 0.06304131, "balance_loss_mlp": 0.01260344, "epoch": 0.17766421163384938, "flos": 21579579538560.0, "grad_norm": 3.3213888275665515, "language_loss": 0.85634947, "learning_rate": 3.7755951220493386e-06, "loss": 0.93494034, "num_input_tokens_seen": 63876935, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.23974609, "step": 2955, "time_per_iteration": 2.6035308837890625 }, { "auxiliary_loss_clip": 0.06576174, "auxiliary_loss_mlp": 0.01291753, "balance_loss_clip": 0.06308063, "balance_loss_mlp": 0.01267696, "epoch": 0.17772433488651737, "flos": 22425922103040.0, "grad_norm": 1.8205598522970292, "language_loss": 0.71869564, "learning_rate": 3.7754158448116327e-06, "loss": 0.79737484, "num_input_tokens_seen": 63896815, "router_z_loss_clip": 2.68164062, "router_z_loss_mlp": 0.24035645, "step": 2956, "time_per_iteration": 2.606555700302124 }, { "auxiliary_loss_clip": 0.06567657, "auxiliary_loss_mlp": 0.01284611, "balance_loss_clip": 0.06301963, "balance_loss_mlp": 0.01262009, "epoch": 0.17778445813918534, "flos": 25636481544960.0, "grad_norm": 2.108303420884734, "language_loss": 0.83950496, "learning_rate": 3.7752365002499795e-06, "loss": 0.91802764, "num_input_tokens_seen": 63916140, "router_z_loss_clip": 2.65820312, "router_z_loss_mlp": 0.22607422, "step": 2957, "time_per_iteration": 2.60801100730896 }, { "auxiliary_loss_clip": 0.06579109, "auxiliary_loss_mlp": 0.01281636, "balance_loss_clip": 0.06309652, "balance_loss_mlp": 0.01259272, "epoch": 0.1778445813918533, "flos": 25635810712320.0, "grad_norm": 1.6259642218482109, "language_loss": 0.75628734, "learning_rate": 3.7750570883711807e-06, "loss": 0.83489478, "num_input_tokens_seen": 63935220, "router_z_loss_clip": 2.6953125, "router_z_loss_mlp": 0.22363281, "step": 2958, "time_per_iteration": 2.69297456741333 }, { "auxiliary_loss_clip": 0.06578213, "auxiliary_loss_mlp": 0.01282846, "balance_loss_clip": 0.06305465, "balance_loss_mlp": 0.01260458, "epoch": 0.17790470464452127, "flos": 22351975274880.0, "grad_norm": 3.1789206413442765, "language_loss": 0.81843531, "learning_rate": 3.7748776091820397e-06, "loss": 0.89704585, "num_input_tokens_seen": 63954550, "router_z_loss_clip": 2.72851562, "router_z_loss_mlp": 0.22375488, "step": 2959, "time_per_iteration": 2.5962655544281006 }, { "auxiliary_loss_clip": 0.06576964, "auxiliary_loss_mlp": 0.01284976, "balance_loss_clip": 0.06300794, "balance_loss_mlp": 0.01261301, "epoch": 0.17796482789718923, "flos": 18771052786560.0, "grad_norm": 2.3036421268237053, "language_loss": 0.52296329, "learning_rate": 3.774698062689362e-06, "loss": 0.60158271, "num_input_tokens_seen": 63972425, "router_z_loss_clip": 2.76171875, "router_z_loss_mlp": 0.23681641, "step": 2960, "time_per_iteration": 2.5785250663757324 }, { "auxiliary_loss_clip": 0.06569673, "auxiliary_loss_mlp": 0.01286028, "balance_loss_clip": 0.06298809, "balance_loss_mlp": 0.01263664, "epoch": 0.1780249511498572, "flos": 23447719117440.0, "grad_norm": 3.8180820953819965, "language_loss": 0.89396507, "learning_rate": 3.7745184488999548e-06, "loss": 0.97252214, "num_input_tokens_seen": 63992165, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.22363281, "step": 2961, "time_per_iteration": 2.60394287109375 }, { "auxiliary_loss_clip": 0.06574726, "auxiliary_loss_mlp": 0.01287483, "balance_loss_clip": 0.06300126, "balance_loss_mlp": 0.01262938, "epoch": 0.1780850744025252, "flos": 23374149632640.0, "grad_norm": 3.3035254993272174, "language_loss": 0.7974183, "learning_rate": 3.774338767820631e-06, "loss": 0.87604046, "num_input_tokens_seen": 64013470, "router_z_loss_clip": 2.74804688, "router_z_loss_mlp": 0.24572754, "step": 2962, "time_per_iteration": 2.648967742919922 }, { "auxiliary_loss_clip": 0.06575057, "auxiliary_loss_mlp": 0.01284718, "balance_loss_clip": 0.06301236, "balance_loss_mlp": 0.01260805, "epoch": 0.17814519765519315, "flos": 13777117770240.0, "grad_norm": 2.175605753905275, "language_loss": 0.75074184, "learning_rate": 3.774159019458203e-06, "loss": 0.82933956, "num_input_tokens_seen": 64030975, "router_z_loss_clip": 2.73632812, "router_z_loss_mlp": 0.23925781, "step": 2963, "time_per_iteration": 2.6056294441223145 }, { "auxiliary_loss_clip": 0.06582996, "auxiliary_loss_mlp": 0.0128472, "balance_loss_clip": 0.06305082, "balance_loss_mlp": 0.01260282, "epoch": 0.17820532090786112, "flos": 21982073425920.0, "grad_norm": 1.775734359176491, "language_loss": 0.79670954, "learning_rate": 3.7739792038194877e-06, "loss": 0.87538666, "num_input_tokens_seen": 64050075, "router_z_loss_clip": 2.77539062, "router_z_loss_mlp": 0.24475098, "step": 2964, "time_per_iteration": 2.6530025005340576 }, { "auxiliary_loss_clip": 0.06580613, "auxiliary_loss_mlp": 0.01289246, "balance_loss_clip": 0.06306624, "balance_loss_mlp": 0.01265476, "epoch": 0.17826544416052909, "flos": 24797727774720.0, "grad_norm": 2.3422026559641917, "language_loss": 0.81630987, "learning_rate": 3.7737993209113027e-06, "loss": 0.89500844, "num_input_tokens_seen": 64071920, "router_z_loss_clip": 2.7421875, "router_z_loss_mlp": 0.23779297, "step": 2965, "time_per_iteration": 2.6157188415527344 }, { "auxiliary_loss_clip": 0.06573331, "auxiliary_loss_mlp": 0.01279155, "balance_loss_clip": 0.06303273, "balance_loss_mlp": 0.01256291, "epoch": 0.17832556741319705, "flos": 13884411323520.0, "grad_norm": 2.9294415831930354, "language_loss": 0.96070975, "learning_rate": 3.7736193707404698e-06, "loss": 1.03923464, "num_input_tokens_seen": 64086835, "router_z_loss_clip": 2.703125, "router_z_loss_mlp": 0.22888184, "step": 2966, "time_per_iteration": 2.5554311275482178 }, { "auxiliary_loss_clip": 0.06584379, "auxiliary_loss_mlp": 0.0128919, "balance_loss_clip": 0.06309429, "balance_loss_mlp": 0.01264919, "epoch": 0.17838569066586502, "flos": 36649502755200.0, "grad_norm": 2.297392190547122, "language_loss": 0.73790348, "learning_rate": 3.7734393533138127e-06, "loss": 0.81663918, "num_input_tokens_seen": 64107360, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.24279785, "step": 2967, "time_per_iteration": 2.7282259464263916 }, { "auxiliary_loss_clip": 0.06574322, "auxiliary_loss_mlp": 0.01295161, "balance_loss_clip": 0.06306927, "balance_loss_mlp": 0.01272285, "epoch": 0.17844581391853298, "flos": 18732087838080.0, "grad_norm": 2.0453021909899887, "language_loss": 0.7741797, "learning_rate": 3.773259268638157e-06, "loss": 0.85287446, "num_input_tokens_seen": 64124690, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.22875977, "step": 2968, "time_per_iteration": 2.571967601776123 }, { "auxiliary_loss_clip": 0.0657907, "auxiliary_loss_mlp": 0.01284471, "balance_loss_clip": 0.06310016, "balance_loss_mlp": 0.0126157, "epoch": 0.17850593717120097, "flos": 27385168728960.0, "grad_norm": 1.9721947659504337, "language_loss": 0.76625562, "learning_rate": 3.7730791167203333e-06, "loss": 0.84489101, "num_input_tokens_seen": 64146315, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.22912598, "step": 2969, "time_per_iteration": 2.755000352859497 }, { "auxiliary_loss_clip": 0.06527292, "auxiliary_loss_mlp": 0.0126637, "balance_loss_clip": 0.06370288, "balance_loss_mlp": 0.0125886, "epoch": 0.17856606042386894, "flos": 67014696816000.0, "grad_norm": 0.8276999652024172, "language_loss": 0.69072402, "learning_rate": 3.772898897567171e-06, "loss": 0.76866066, "num_input_tokens_seen": 64210875, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.07495117, "step": 2970, "time_per_iteration": 3.3062524795532227 }, { "auxiliary_loss_clip": 0.06596911, "auxiliary_loss_mlp": 0.01285213, "balance_loss_clip": 0.06316687, "balance_loss_mlp": 0.01260966, "epoch": 0.1786261836765369, "flos": 36986015952000.0, "grad_norm": 2.1022032327731544, "language_loss": 0.67647475, "learning_rate": 3.772718611185505e-06, "loss": 0.75529599, "num_input_tokens_seen": 64230740, "router_z_loss_clip": 2.80664062, "router_z_loss_mlp": 0.24206543, "step": 2971, "time_per_iteration": 2.7516863346099854 }, { "auxiliary_loss_clip": 0.06579806, "auxiliary_loss_mlp": 0.01285476, "balance_loss_clip": 0.06304751, "balance_loss_mlp": 0.01259929, "epoch": 0.17868630692920487, "flos": 24832122675840.0, "grad_norm": 2.0353780679148135, "language_loss": 0.90618682, "learning_rate": 3.7725382575821717e-06, "loss": 0.98483968, "num_input_tokens_seen": 64252300, "router_z_loss_clip": 2.75195312, "router_z_loss_mlp": 0.25500488, "step": 2972, "time_per_iteration": 2.6505086421966553 }, { "auxiliary_loss_clip": 0.06575884, "auxiliary_loss_mlp": 0.01287555, "balance_loss_clip": 0.06302503, "balance_loss_mlp": 0.01263797, "epoch": 0.17874643018187283, "flos": 16987509504000.0, "grad_norm": 3.175863156600302, "language_loss": 0.88652301, "learning_rate": 3.77235783676401e-06, "loss": 0.96515745, "num_input_tokens_seen": 64270105, "router_z_loss_clip": 2.734375, "router_z_loss_mlp": 0.23779297, "step": 2973, "time_per_iteration": 2.6099398136138916 }, { "auxiliary_loss_clip": 0.0658465, "auxiliary_loss_mlp": 0.01282112, "balance_loss_clip": 0.06309888, "balance_loss_mlp": 0.01258187, "epoch": 0.1788065534345408, "flos": 21038499797760.0, "grad_norm": 2.2340936597999623, "language_loss": 0.76924169, "learning_rate": 3.7721773487378615e-06, "loss": 0.84790933, "num_input_tokens_seen": 64287250, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.23925781, "step": 2974, "time_per_iteration": 2.6151134967803955 }, { "auxiliary_loss_clip": 0.06574824, "auxiliary_loss_mlp": 0.01281279, "balance_loss_clip": 0.06302232, "balance_loss_mlp": 0.01257986, "epoch": 0.17886667668720876, "flos": 23994500935680.0, "grad_norm": 4.1847972678751075, "language_loss": 0.75827211, "learning_rate": 3.7719967935105705e-06, "loss": 0.83683312, "num_input_tokens_seen": 64307140, "router_z_loss_clip": 2.72460938, "router_z_loss_mlp": 0.23278809, "step": 2975, "time_per_iteration": 2.693406581878662 }, { "auxiliary_loss_clip": 0.06567906, "auxiliary_loss_mlp": 0.01276656, "balance_loss_clip": 0.06299624, "balance_loss_mlp": 0.01254829, "epoch": 0.17892679993987676, "flos": 25746626136960.0, "grad_norm": 2.2602586131642517, "language_loss": 0.73908722, "learning_rate": 3.7718161710889833e-06, "loss": 0.8175329, "num_input_tokens_seen": 64328760, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.21813965, "step": 2976, "time_per_iteration": 2.832336902618408 }, { "auxiliary_loss_clip": 0.06568113, "auxiliary_loss_mlp": 0.01279569, "balance_loss_clip": 0.06303799, "balance_loss_mlp": 0.01259304, "epoch": 0.17898692319254472, "flos": 25706277596160.0, "grad_norm": 1.4696873097877703, "language_loss": 0.77822483, "learning_rate": 3.7716354814799495e-06, "loss": 0.85670167, "num_input_tokens_seen": 64348800, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.20263672, "step": 2977, "time_per_iteration": 2.663849115371704 }, { "auxiliary_loss_clip": 0.06574525, "auxiliary_loss_mlp": 0.01279818, "balance_loss_clip": 0.06305519, "balance_loss_mlp": 0.0125718, "epoch": 0.1790470464452127, "flos": 19323830171520.0, "grad_norm": 2.2358935503791573, "language_loss": 0.80635166, "learning_rate": 3.7714547246903203e-06, "loss": 0.88489503, "num_input_tokens_seen": 64367955, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.22631836, "step": 2978, "time_per_iteration": 2.675710439682007 }, { "auxiliary_loss_clip": 0.06578527, "auxiliary_loss_mlp": 0.01285899, "balance_loss_clip": 0.06304687, "balance_loss_mlp": 0.01260984, "epoch": 0.17910716969788065, "flos": 30052048953600.0, "grad_norm": 1.5225935285624657, "language_loss": 0.768704, "learning_rate": 3.7712739007269508e-06, "loss": 0.84734827, "num_input_tokens_seen": 64389805, "router_z_loss_clip": 2.74023438, "router_z_loss_mlp": 0.24926758, "step": 2979, "time_per_iteration": 4.19325590133667 }, { "auxiliary_loss_clip": 0.0657348, "auxiliary_loss_mlp": 0.01280371, "balance_loss_clip": 0.06306391, "balance_loss_mlp": 0.01257304, "epoch": 0.17916729295054862, "flos": 19433848982400.0, "grad_norm": 1.6690999417210601, "language_loss": 0.69625783, "learning_rate": 3.7710930095966976e-06, "loss": 0.77479637, "num_input_tokens_seen": 64408220, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.23071289, "step": 2980, "time_per_iteration": 2.5804026126861572 }, { "auxiliary_loss_clip": 0.06583263, "auxiliary_loss_mlp": 0.01280139, "balance_loss_clip": 0.06308515, "balance_loss_mlp": 0.01255272, "epoch": 0.17922741620321658, "flos": 14616877789440.0, "grad_norm": 2.1330636782549752, "language_loss": 0.7145319, "learning_rate": 3.7709120513064196e-06, "loss": 0.79316592, "num_input_tokens_seen": 64426380, "router_z_loss_clip": 2.74804688, "router_z_loss_mlp": 0.24853516, "step": 2981, "time_per_iteration": 4.080436944961548 }, { "auxiliary_loss_clip": 0.06588735, "auxiliary_loss_mlp": 0.01287742, "balance_loss_clip": 0.06312168, "balance_loss_mlp": 0.01264377, "epoch": 0.17928753945588458, "flos": 17171013945600.0, "grad_norm": 2.4885530527598605, "language_loss": 0.82570159, "learning_rate": 3.7707310258629796e-06, "loss": 0.90446633, "num_input_tokens_seen": 64444355, "router_z_loss_clip": 2.76953125, "router_z_loss_mlp": 0.23352051, "step": 2982, "time_per_iteration": 2.563359022140503 }, { "auxiliary_loss_clip": 0.0657592, "auxiliary_loss_mlp": 0.01285452, "balance_loss_clip": 0.06306229, "balance_loss_mlp": 0.0126273, "epoch": 0.17934766270855254, "flos": 31403860473600.0, "grad_norm": 1.5252308279523172, "language_loss": 0.8394537, "learning_rate": 3.7705499332732413e-06, "loss": 0.9180674, "num_input_tokens_seen": 64467800, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.22729492, "step": 2983, "time_per_iteration": 2.6672022342681885 }, { "auxiliary_loss_clip": 0.06588089, "auxiliary_loss_mlp": 0.012868, "balance_loss_clip": 0.06308962, "balance_loss_mlp": 0.01262183, "epoch": 0.1794077859612205, "flos": 20820558528000.0, "grad_norm": 2.1067514571847554, "language_loss": 0.85950226, "learning_rate": 3.7703687735440718e-06, "loss": 0.93825114, "num_input_tokens_seen": 64487230, "router_z_loss_clip": 2.7890625, "router_z_loss_mlp": 0.24645996, "step": 2984, "time_per_iteration": 2.5791690349578857 }, { "auxiliary_loss_clip": 0.06584004, "auxiliary_loss_mlp": 0.01288882, "balance_loss_clip": 0.06308208, "balance_loss_mlp": 0.01264539, "epoch": 0.17946790921388847, "flos": 28994096102400.0, "grad_norm": 1.6571622320493422, "language_loss": 0.89952695, "learning_rate": 3.7701875466823416e-06, "loss": 0.97825575, "num_input_tokens_seen": 64509165, "router_z_loss_clip": 2.75585938, "router_z_loss_mlp": 0.24353027, "step": 2985, "time_per_iteration": 2.6539669036865234 }, { "auxiliary_loss_clip": 0.06575144, "auxiliary_loss_mlp": 0.01277564, "balance_loss_clip": 0.06309445, "balance_loss_mlp": 0.01257025, "epoch": 0.17952803246655644, "flos": 20743131755520.0, "grad_norm": 2.3677488379943283, "language_loss": 0.7057445, "learning_rate": 3.770006252694922e-06, "loss": 0.78427154, "num_input_tokens_seen": 64527940, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.20532227, "step": 2986, "time_per_iteration": 4.035936117172241 }, { "auxiliary_loss_clip": 0.0658123, "auxiliary_loss_mlp": 0.01280936, "balance_loss_clip": 0.06310678, "balance_loss_mlp": 0.01257535, "epoch": 0.1795881557192244, "flos": 28263390572160.0, "grad_norm": 3.673334676616046, "language_loss": 0.78734505, "learning_rate": 3.769824891588688e-06, "loss": 0.86596668, "num_input_tokens_seen": 64545230, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.23400879, "step": 2987, "time_per_iteration": 4.091249227523804 }, { "auxiliary_loss_clip": 0.06580971, "auxiliary_loss_mlp": 0.01282877, "balance_loss_clip": 0.06305602, "balance_loss_mlp": 0.01259333, "epoch": 0.17964827897189237, "flos": 18558016980480.0, "grad_norm": 1.8238561519494636, "language_loss": 0.78957534, "learning_rate": 3.7696434633705164e-06, "loss": 0.86821383, "num_input_tokens_seen": 64563820, "router_z_loss_clip": 2.75585938, "router_z_loss_mlp": 0.23535156, "step": 2988, "time_per_iteration": 2.574251413345337 }, { "auxiliary_loss_clip": 0.06496195, "auxiliary_loss_mlp": 0.01264256, "balance_loss_clip": 0.0633928, "balance_loss_mlp": 0.01256871, "epoch": 0.17970840222456036, "flos": 58182052625280.0, "grad_norm": 0.7381711512914264, "language_loss": 0.62587917, "learning_rate": 3.7694619680472875e-06, "loss": 0.7034837, "num_input_tokens_seen": 64621315, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.07366943, "step": 2989, "time_per_iteration": 3.1414754390716553 }, { "auxiliary_loss_clip": 0.06581208, "auxiliary_loss_mlp": 0.01279846, "balance_loss_clip": 0.06311718, "balance_loss_mlp": 0.01256552, "epoch": 0.17976852547722832, "flos": 20306662237440.0, "grad_norm": 2.0456126038121205, "language_loss": 0.71636361, "learning_rate": 3.7692804056258837e-06, "loss": 0.79497421, "num_input_tokens_seen": 64639885, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.23303223, "step": 2990, "time_per_iteration": 2.6325509548187256 }, { "auxiliary_loss_clip": 0.065872, "auxiliary_loss_mlp": 0.01278759, "balance_loss_clip": 0.06310942, "balance_loss_mlp": 0.01255298, "epoch": 0.1798286487298963, "flos": 39677564004480.0, "grad_norm": 2.7132005259355507, "language_loss": 0.69825327, "learning_rate": 3.7690987761131893e-06, "loss": 0.77691281, "num_input_tokens_seen": 64661220, "router_z_loss_clip": 2.76757812, "router_z_loss_mlp": 0.23449707, "step": 2991, "time_per_iteration": 2.768507719039917 }, { "auxiliary_loss_clip": 0.06581175, "auxiliary_loss_mlp": 0.01280471, "balance_loss_clip": 0.06309318, "balance_loss_mlp": 0.01257834, "epoch": 0.17988877198256426, "flos": 25527385128960.0, "grad_norm": 1.4796670948239967, "language_loss": 0.83643234, "learning_rate": 3.7689170795160924e-06, "loss": 0.91504884, "num_input_tokens_seen": 64682530, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.2265625, "step": 2992, "time_per_iteration": 2.6347596645355225 }, { "auxiliary_loss_clip": 0.06566273, "auxiliary_loss_mlp": 0.01283486, "balance_loss_clip": 0.06304817, "balance_loss_mlp": 0.01261576, "epoch": 0.17994889523523222, "flos": 18813539606400.0, "grad_norm": 2.043664479724167, "language_loss": 0.8271448, "learning_rate": 3.7687353158414822e-06, "loss": 0.90564239, "num_input_tokens_seen": 64701025, "router_z_loss_clip": 2.61523438, "router_z_loss_mlp": 0.21899414, "step": 2993, "time_per_iteration": 2.585724115371704 }, { "auxiliary_loss_clip": 0.06577885, "auxiliary_loss_mlp": 0.0128451, "balance_loss_clip": 0.06307471, "balance_loss_mlp": 0.01261061, "epoch": 0.18000901848790019, "flos": 21110601836160.0, "grad_norm": 1.7872150760741086, "language_loss": 0.79010266, "learning_rate": 3.7685534850962517e-06, "loss": 0.86872661, "num_input_tokens_seen": 64719570, "router_z_loss_clip": 2.70507812, "router_z_loss_mlp": 0.23474121, "step": 2994, "time_per_iteration": 2.6895878314971924 }, { "auxiliary_loss_clip": 0.06577295, "auxiliary_loss_mlp": 0.01290422, "balance_loss_clip": 0.0630691, "balance_loss_mlp": 0.0126689, "epoch": 0.18006914174056818, "flos": 19652586865920.0, "grad_norm": 6.626385640725532, "language_loss": 0.81618321, "learning_rate": 3.768371587287296e-06, "loss": 0.89486039, "num_input_tokens_seen": 64738110, "router_z_loss_clip": 2.703125, "router_z_loss_mlp": 0.23522949, "step": 2995, "time_per_iteration": 2.5990352630615234 }, { "auxiliary_loss_clip": 0.06573625, "auxiliary_loss_mlp": 0.01275811, "balance_loss_clip": 0.06304122, "balance_loss_mlp": 0.01254341, "epoch": 0.18012926499323614, "flos": 19505909093760.0, "grad_norm": 1.5194215279731556, "language_loss": 0.84832329, "learning_rate": 3.768189622421512e-06, "loss": 0.92681766, "num_input_tokens_seen": 64756345, "router_z_loss_clip": 2.6953125, "router_z_loss_mlp": 0.21459961, "step": 2996, "time_per_iteration": 2.5847666263580322 }, { "auxiliary_loss_clip": 0.0655952, "auxiliary_loss_mlp": 0.01282104, "balance_loss_clip": 0.06296531, "balance_loss_mlp": 0.01260372, "epoch": 0.1801893882459041, "flos": 19470759505920.0, "grad_norm": 1.9192473866297421, "language_loss": 0.88365835, "learning_rate": 3.7680075905058006e-06, "loss": 0.96207458, "num_input_tokens_seen": 64776375, "router_z_loss_clip": 2.63085938, "router_z_loss_mlp": 0.21740723, "step": 2997, "time_per_iteration": 2.5866732597351074 }, { "auxiliary_loss_clip": 0.06581239, "auxiliary_loss_mlp": 0.01286827, "balance_loss_clip": 0.06304204, "balance_loss_mlp": 0.01262866, "epoch": 0.18024951149857207, "flos": 26877938837760.0, "grad_norm": 2.0956697047218316, "language_loss": 0.86586189, "learning_rate": 3.7678254915470643e-06, "loss": 0.94454259, "num_input_tokens_seen": 64796210, "router_z_loss_clip": 2.77148438, "router_z_loss_mlp": 0.23974609, "step": 2998, "time_per_iteration": 2.645350933074951 }, { "auxiliary_loss_clip": 0.06566925, "auxiliary_loss_mlp": 0.0129446, "balance_loss_clip": 0.06301016, "balance_loss_mlp": 0.01271179, "epoch": 0.18030963475124004, "flos": 30234421365120.0, "grad_norm": 1.7829395654789846, "language_loss": 0.84979331, "learning_rate": 3.7676433255522084e-06, "loss": 0.92840719, "num_input_tokens_seen": 64818590, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.23266602, "step": 2999, "time_per_iteration": 2.6678271293640137 }, { "auxiliary_loss_clip": 0.06572828, "auxiliary_loss_mlp": 0.01280671, "balance_loss_clip": 0.0630395, "balance_loss_mlp": 0.01257067, "epoch": 0.180369758003908, "flos": 22313681159040.0, "grad_norm": 2.0550690609779734, "language_loss": 0.75803101, "learning_rate": 3.76746109252814e-06, "loss": 0.83656597, "num_input_tokens_seen": 64838350, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.23620605, "step": 3000, "time_per_iteration": 2.5923664569854736 }, { "auxiliary_loss_clip": 0.06561126, "auxiliary_loss_mlp": 0.012846, "balance_loss_clip": 0.06294817, "balance_loss_mlp": 0.0126381, "epoch": 0.18042988125657597, "flos": 23738726747520.0, "grad_norm": 2.3153313007948513, "language_loss": 0.71853566, "learning_rate": 3.76727879248177e-06, "loss": 0.7969929, "num_input_tokens_seen": 64858065, "router_z_loss_clip": 2.6640625, "router_z_loss_mlp": 0.20800781, "step": 3001, "time_per_iteration": 2.7149789333343506 }, { "auxiliary_loss_clip": 0.06570506, "auxiliary_loss_mlp": 0.01282952, "balance_loss_clip": 0.06296468, "balance_loss_mlp": 0.0125942, "epoch": 0.18049000450924396, "flos": 24099781991040.0, "grad_norm": 2.063047496167109, "language_loss": 0.88412905, "learning_rate": 3.767096425420011e-06, "loss": 0.96266365, "num_input_tokens_seen": 64877305, "router_z_loss_clip": 2.74023438, "router_z_loss_mlp": 0.23522949, "step": 3002, "time_per_iteration": 2.609341621398926 }, { "auxiliary_loss_clip": 0.06567153, "auxiliary_loss_mlp": 0.01285404, "balance_loss_clip": 0.06297585, "balance_loss_mlp": 0.01263624, "epoch": 0.18055012776191193, "flos": 22169602863360.0, "grad_norm": 2.6312857614372995, "language_loss": 0.82054502, "learning_rate": 3.7669139913497788e-06, "loss": 0.89907062, "num_input_tokens_seen": 64896955, "router_z_loss_clip": 2.69921875, "router_z_loss_mlp": 0.21789551, "step": 3003, "time_per_iteration": 2.6802446842193604 }, { "auxiliary_loss_clip": 0.06567004, "auxiliary_loss_mlp": 0.01291371, "balance_loss_clip": 0.06296987, "balance_loss_mlp": 0.01269127, "epoch": 0.1806102510145799, "flos": 28921155523200.0, "grad_norm": 2.1179289084603137, "language_loss": 0.68005776, "learning_rate": 3.7667314902779907e-06, "loss": 0.75864154, "num_input_tokens_seen": 64917080, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.22229004, "step": 3004, "time_per_iteration": 2.6267902851104736 }, { "auxiliary_loss_clip": 0.06567127, "auxiliary_loss_mlp": 0.01281515, "balance_loss_clip": 0.06298201, "balance_loss_mlp": 0.01258901, "epoch": 0.18067037426724786, "flos": 19031648584320.0, "grad_norm": 1.8568874529241761, "language_loss": 0.85612327, "learning_rate": 3.7665489222115677e-06, "loss": 0.93460965, "num_input_tokens_seen": 64935215, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.22607422, "step": 3005, "time_per_iteration": 2.5543386936187744 }, { "auxiliary_loss_clip": 0.06561237, "auxiliary_loss_mlp": 0.01276986, "balance_loss_clip": 0.06296539, "balance_loss_mlp": 0.01255373, "epoch": 0.18073049751991582, "flos": 27460960346880.0, "grad_norm": 2.146866530974439, "language_loss": 0.83872485, "learning_rate": 3.766366287157432e-06, "loss": 0.91710705, "num_input_tokens_seen": 64956275, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.21594238, "step": 3006, "time_per_iteration": 2.6294777393341064 }, { "auxiliary_loss_clip": 0.06565307, "auxiliary_loss_mlp": 0.01281831, "balance_loss_clip": 0.063005, "balance_loss_mlp": 0.01259026, "epoch": 0.1807906207725838, "flos": 28736309416320.0, "grad_norm": 1.8914498366933552, "language_loss": 0.7812562, "learning_rate": 3.7661835851225103e-06, "loss": 0.85972756, "num_input_tokens_seen": 64979390, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.22790527, "step": 3007, "time_per_iteration": 2.7258126735687256 }, { "auxiliary_loss_clip": 0.06459042, "auxiliary_loss_mlp": 0.01258218, "balance_loss_clip": 0.06307605, "balance_loss_mlp": 0.01251453, "epoch": 0.18085074402525175, "flos": 64488861411840.0, "grad_norm": 0.7851415903060679, "language_loss": 0.57001245, "learning_rate": 3.7660008161137294e-06, "loss": 0.64718503, "num_input_tokens_seen": 65043135, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.06781006, "step": 3008, "time_per_iteration": 3.320929765701294 }, { "auxiliary_loss_clip": 0.06569356, "auxiliary_loss_mlp": 0.01283839, "balance_loss_clip": 0.06297991, "balance_loss_mlp": 0.01259878, "epoch": 0.18091086727791975, "flos": 23483665319040.0, "grad_norm": 2.0965795823142606, "language_loss": 0.68222213, "learning_rate": 3.765817980138021e-06, "loss": 0.76075411, "num_input_tokens_seen": 65062845, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.23974609, "step": 3009, "time_per_iteration": 2.590083360671997 }, { "auxiliary_loss_clip": 0.06560355, "auxiliary_loss_mlp": 0.01282295, "balance_loss_clip": 0.06294376, "balance_loss_mlp": 0.01260266, "epoch": 0.1809709905305877, "flos": 24177334544640.0, "grad_norm": 2.38759829828509, "language_loss": 0.7641325, "learning_rate": 3.7656350772023177e-06, "loss": 0.84255904, "num_input_tokens_seen": 65082110, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.22045898, "step": 3010, "time_per_iteration": 2.600771903991699 }, { "auxiliary_loss_clip": 0.06550412, "auxiliary_loss_mlp": 0.01276831, "balance_loss_clip": 0.06293486, "balance_loss_mlp": 0.01255409, "epoch": 0.18103111378325568, "flos": 21657006311040.0, "grad_norm": 1.6185888742255483, "language_loss": 0.67803186, "learning_rate": 3.7654521073135553e-06, "loss": 0.75630426, "num_input_tokens_seen": 65101985, "router_z_loss_clip": 2.5703125, "router_z_loss_mlp": 0.21411133, "step": 3011, "time_per_iteration": 2.5872552394866943 }, { "auxiliary_loss_clip": 0.0656146, "auxiliary_loss_mlp": 0.01281717, "balance_loss_clip": 0.0630061, "balance_loss_mlp": 0.01259842, "epoch": 0.18109123703592364, "flos": 53698632537600.0, "grad_norm": 1.5117774604802758, "language_loss": 0.72273612, "learning_rate": 3.7652690704786723e-06, "loss": 0.80116785, "num_input_tokens_seen": 65129295, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.21875, "step": 3012, "time_per_iteration": 2.882783889770508 }, { "auxiliary_loss_clip": 0.06554094, "auxiliary_loss_mlp": 0.01280052, "balance_loss_clip": 0.06294497, "balance_loss_mlp": 0.01255984, "epoch": 0.1811513602885916, "flos": 35854325907840.0, "grad_norm": 6.357790195776853, "language_loss": 0.63651943, "learning_rate": 3.765085966704609e-06, "loss": 0.71486092, "num_input_tokens_seen": 65150625, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.24084473, "step": 3013, "time_per_iteration": 2.689950942993164 }, { "auxiliary_loss_clip": 0.06555605, "auxiliary_loss_mlp": 0.012801, "balance_loss_clip": 0.06292212, "balance_loss_mlp": 0.01258308, "epoch": 0.18121148354125957, "flos": 23739355653120.0, "grad_norm": 1.571133025260885, "language_loss": 0.77109551, "learning_rate": 3.764902795998309e-06, "loss": 0.8494525, "num_input_tokens_seen": 65170880, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.21801758, "step": 3014, "time_per_iteration": 2.5919554233551025 }, { "auxiliary_loss_clip": 0.06570922, "auxiliary_loss_mlp": 0.01290026, "balance_loss_clip": 0.06298085, "balance_loss_mlp": 0.01265314, "epoch": 0.18127160679392756, "flos": 28735470875520.0, "grad_norm": 2.1858883384222243, "language_loss": 0.66570294, "learning_rate": 3.7647195583667184e-06, "loss": 0.74431241, "num_input_tokens_seen": 65192530, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.24731445, "step": 3015, "time_per_iteration": 2.644361972808838 }, { "auxiliary_loss_clip": 0.06554876, "auxiliary_loss_mlp": 0.0127787, "balance_loss_clip": 0.06293118, "balance_loss_mlp": 0.01256091, "epoch": 0.18133173004659553, "flos": 20491256782080.0, "grad_norm": 2.0171718628943633, "language_loss": 0.78979778, "learning_rate": 3.764536253816785e-06, "loss": 0.86812526, "num_input_tokens_seen": 65211675, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.21777344, "step": 3016, "time_per_iteration": 2.593520402908325 }, { "auxiliary_loss_clip": 0.06563821, "auxiliary_loss_mlp": 0.01286087, "balance_loss_clip": 0.0629105, "balance_loss_mlp": 0.01262507, "epoch": 0.1813918532992635, "flos": 22857905427840.0, "grad_norm": 1.6611001644772394, "language_loss": 0.84095114, "learning_rate": 3.7643528823554602e-06, "loss": 0.91945022, "num_input_tokens_seen": 65231185, "router_z_loss_clip": 2.73046875, "router_z_loss_mlp": 0.23571777, "step": 3017, "time_per_iteration": 2.613632917404175 }, { "auxiliary_loss_clip": 0.06550705, "auxiliary_loss_mlp": 0.01283847, "balance_loss_clip": 0.06292675, "balance_loss_mlp": 0.01262329, "epoch": 0.18145197655193146, "flos": 36074028113280.0, "grad_norm": 2.2146872089629452, "language_loss": 0.68173236, "learning_rate": 3.764169443989697e-06, "loss": 0.76007783, "num_input_tokens_seen": 65251645, "router_z_loss_clip": 2.58203125, "router_z_loss_mlp": 0.21520996, "step": 3018, "time_per_iteration": 4.163667917251587 }, { "auxiliary_loss_clip": 0.06560764, "auxiliary_loss_mlp": 0.01280441, "balance_loss_clip": 0.06290927, "balance_loss_mlp": 0.01258351, "epoch": 0.18151209980459942, "flos": 24030698699520.0, "grad_norm": 1.9864543989280576, "language_loss": 0.76331818, "learning_rate": 3.7639859387264518e-06, "loss": 0.84173024, "num_input_tokens_seen": 65271125, "router_z_loss_clip": 2.69921875, "router_z_loss_mlp": 0.2208252, "step": 3019, "time_per_iteration": 2.61763596534729 }, { "auxiliary_loss_clip": 0.06556346, "auxiliary_loss_mlp": 0.01283227, "balance_loss_clip": 0.06287208, "balance_loss_mlp": 0.01258968, "epoch": 0.1815722230572674, "flos": 23958470880000.0, "grad_norm": 2.6846295205751463, "language_loss": 0.82038939, "learning_rate": 3.7638023665726834e-06, "loss": 0.89878511, "num_input_tokens_seen": 65290600, "router_z_loss_clip": 2.69726562, "router_z_loss_mlp": 0.24255371, "step": 3020, "time_per_iteration": 4.026724338531494 }, { "auxiliary_loss_clip": 0.06558514, "auxiliary_loss_mlp": 0.01282445, "balance_loss_clip": 0.06291334, "balance_loss_mlp": 0.0125995, "epoch": 0.18163234630993536, "flos": 24392885973120.0, "grad_norm": 1.9941355964341623, "language_loss": 0.78870964, "learning_rate": 3.763618727535352e-06, "loss": 0.86711931, "num_input_tokens_seen": 65311040, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.22509766, "step": 3021, "time_per_iteration": 2.7121315002441406 }, { "auxiliary_loss_clip": 0.06550196, "auxiliary_loss_mlp": 0.01276758, "balance_loss_clip": 0.06290603, "balance_loss_mlp": 0.01255669, "epoch": 0.18169246956260335, "flos": 24688295942400.0, "grad_norm": 1.5683592777151634, "language_loss": 0.8570112, "learning_rate": 3.763435021621422e-06, "loss": 0.93528068, "num_input_tokens_seen": 65332115, "router_z_loss_clip": 2.59375, "router_z_loss_mlp": 0.2109375, "step": 3022, "time_per_iteration": 2.771974563598633 }, { "auxiliary_loss_clip": 0.06558332, "auxiliary_loss_mlp": 0.01281072, "balance_loss_clip": 0.06290826, "balance_loss_mlp": 0.01259292, "epoch": 0.1817525928152713, "flos": 24250149342720.0, "grad_norm": 2.5605807812150068, "language_loss": 0.70000815, "learning_rate": 3.763251248837859e-06, "loss": 0.77840221, "num_input_tokens_seen": 65352210, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.21789551, "step": 3023, "time_per_iteration": 2.643718719482422 }, { "auxiliary_loss_clip": 0.06552249, "auxiliary_loss_mlp": 0.01277255, "balance_loss_clip": 0.06287287, "balance_loss_mlp": 0.0125594, "epoch": 0.18181271606793928, "flos": 16477680136320.0, "grad_norm": 1.675876969904522, "language_loss": 0.74959821, "learning_rate": 3.7630674091916317e-06, "loss": 0.82789326, "num_input_tokens_seen": 65370600, "router_z_loss_clip": 2.6484375, "router_z_loss_mlp": 0.21325684, "step": 3024, "time_per_iteration": 2.5572316646575928 }, { "auxiliary_loss_clip": 0.06555603, "auxiliary_loss_mlp": 0.01275835, "balance_loss_clip": 0.0629197, "balance_loss_mlp": 0.01254294, "epoch": 0.18187283932060724, "flos": 18585787409280.0, "grad_norm": 2.022102144856858, "language_loss": 0.89395899, "learning_rate": 3.7628835026897123e-06, "loss": 0.97227335, "num_input_tokens_seen": 65387270, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.21533203, "step": 3025, "time_per_iteration": 4.0263707637786865 }, { "auxiliary_loss_clip": 0.06550944, "auxiliary_loss_mlp": 0.01277742, "balance_loss_clip": 0.06288683, "balance_loss_mlp": 0.0125576, "epoch": 0.1819329625732752, "flos": 20273105877120.0, "grad_norm": 1.7078804238190308, "language_loss": 0.7970314, "learning_rate": 3.7626995293390735e-06, "loss": 0.87531823, "num_input_tokens_seen": 65406550, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.21984863, "step": 3026, "time_per_iteration": 2.5575664043426514 }, { "auxiliary_loss_clip": 0.06559742, "auxiliary_loss_mlp": 0.01280088, "balance_loss_clip": 0.06293423, "balance_loss_mlp": 0.01258106, "epoch": 0.18199308582594317, "flos": 25921242046080.0, "grad_norm": 1.7132226033944622, "language_loss": 0.76440555, "learning_rate": 3.762515489146692e-06, "loss": 0.84280384, "num_input_tokens_seen": 65425955, "router_z_loss_clip": 2.6640625, "router_z_loss_mlp": 0.21972656, "step": 3027, "time_per_iteration": 4.000532627105713 }, { "auxiliary_loss_clip": 0.06559108, "auxiliary_loss_mlp": 0.01286567, "balance_loss_clip": 0.06290536, "balance_loss_mlp": 0.01264335, "epoch": 0.18205320907861114, "flos": 15382942542720.0, "grad_norm": 2.300434315787094, "language_loss": 0.85910809, "learning_rate": 3.762331382119546e-06, "loss": 0.93756485, "num_input_tokens_seen": 65442820, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.22253418, "step": 3028, "time_per_iteration": 2.545525074005127 }, { "auxiliary_loss_clip": 0.06563008, "auxiliary_loss_mlp": 0.01283462, "balance_loss_clip": 0.06295959, "balance_loss_mlp": 0.0125937, "epoch": 0.18211333233127913, "flos": 25630485978240.0, "grad_norm": 6.262494557110479, "language_loss": 0.83846939, "learning_rate": 3.7621472082646183e-06, "loss": 0.91693407, "num_input_tokens_seen": 65461825, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.2409668, "step": 3029, "time_per_iteration": 2.620073080062866 }, { "auxiliary_loss_clip": 0.06559475, "auxiliary_loss_mlp": 0.01286023, "balance_loss_clip": 0.06292637, "balance_loss_mlp": 0.01261681, "epoch": 0.1821734555839471, "flos": 14981329123200.0, "grad_norm": 2.1517247367139225, "language_loss": 0.78724837, "learning_rate": 3.761962967588891e-06, "loss": 0.86570334, "num_input_tokens_seen": 65479480, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.2434082, "step": 3030, "time_per_iteration": 2.5678727626800537 }, { "auxiliary_loss_clip": 0.0656284, "auxiliary_loss_mlp": 0.01282506, "balance_loss_clip": 0.06294208, "balance_loss_mlp": 0.01259558, "epoch": 0.18223357883661506, "flos": 20200291079040.0, "grad_norm": 2.035932459583794, "language_loss": 0.86146092, "learning_rate": 3.761778660099352e-06, "loss": 0.93991435, "num_input_tokens_seen": 65497775, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.22961426, "step": 3031, "time_per_iteration": 2.5752203464508057 }, { "auxiliary_loss_clip": 0.06558241, "auxiliary_loss_mlp": 0.01281383, "balance_loss_clip": 0.06291313, "balance_loss_mlp": 0.01259628, "epoch": 0.18229370208928303, "flos": 15237438727680.0, "grad_norm": 2.1114412998941017, "language_loss": 0.80979735, "learning_rate": 3.76159428580299e-06, "loss": 0.88819361, "num_input_tokens_seen": 65516505, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.2175293, "step": 3032, "time_per_iteration": 2.566174030303955 }, { "auxiliary_loss_clip": 0.06567264, "auxiliary_loss_mlp": 0.01282807, "balance_loss_clip": 0.06293347, "balance_loss_mlp": 0.01259752, "epoch": 0.182353825341951, "flos": 23847026549760.0, "grad_norm": 2.0236448727057557, "language_loss": 0.81601685, "learning_rate": 3.761409844706795e-06, "loss": 0.89451754, "num_input_tokens_seen": 65536160, "router_z_loss_clip": 2.74023438, "router_z_loss_mlp": 0.23046875, "step": 3033, "time_per_iteration": 2.5869858264923096 }, { "auxiliary_loss_clip": 0.06450877, "auxiliary_loss_mlp": 0.01270753, "balance_loss_clip": 0.06300801, "balance_loss_mlp": 0.01264381, "epoch": 0.18241394859461896, "flos": 61208017522560.0, "grad_norm": 0.8504300113825612, "language_loss": 0.63340849, "learning_rate": 3.7612253368177625e-06, "loss": 0.71062481, "num_input_tokens_seen": 65589375, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.06375122, "step": 3034, "time_per_iteration": 3.1018402576446533 }, { "auxiliary_loss_clip": 0.06556351, "auxiliary_loss_mlp": 0.01279799, "balance_loss_clip": 0.06293222, "balance_loss_mlp": 0.01257996, "epoch": 0.18247407184728695, "flos": 18476439431040.0, "grad_norm": 2.8955509106843444, "language_loss": 0.80286646, "learning_rate": 3.7610407621428893e-06, "loss": 0.88122785, "num_input_tokens_seen": 65606720, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.21801758, "step": 3035, "time_per_iteration": 2.560429334640503 }, { "auxiliary_loss_clip": 0.06552492, "auxiliary_loss_mlp": 0.01281515, "balance_loss_clip": 0.06291486, "balance_loss_mlp": 0.01260379, "epoch": 0.18253419509995492, "flos": 21801042679680.0, "grad_norm": 2.840558433956585, "language_loss": 0.85213482, "learning_rate": 3.7608561206891735e-06, "loss": 0.930475, "num_input_tokens_seen": 65625495, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 0.21142578, "step": 3036, "time_per_iteration": 2.599622964859009 }, { "auxiliary_loss_clip": 0.06550538, "auxiliary_loss_mlp": 0.01284148, "balance_loss_clip": 0.06292109, "balance_loss_mlp": 0.01263477, "epoch": 0.18259431835262288, "flos": 20154743585280.0, "grad_norm": 2.027024854888995, "language_loss": 0.80324984, "learning_rate": 3.760671412463617e-06, "loss": 0.88159674, "num_input_tokens_seen": 65643515, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.20666504, "step": 3037, "time_per_iteration": 2.5619537830352783 }, { "auxiliary_loss_clip": 0.06564653, "auxiliary_loss_mlp": 0.01286227, "balance_loss_clip": 0.06298555, "balance_loss_mlp": 0.0126341, "epoch": 0.18265444160529085, "flos": 16987132160640.0, "grad_norm": 9.558982361390127, "language_loss": 0.81076211, "learning_rate": 3.7604866374732246e-06, "loss": 0.8892709, "num_input_tokens_seen": 65658155, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.22802734, "step": 3038, "time_per_iteration": 2.5335614681243896 }, { "auxiliary_loss_clip": 0.06556495, "auxiliary_loss_mlp": 0.01285909, "balance_loss_clip": 0.0629587, "balance_loss_mlp": 0.01264225, "epoch": 0.1827145648579588, "flos": 34431879795840.0, "grad_norm": 2.714980670612514, "language_loss": 0.68247974, "learning_rate": 3.7603017957250023e-06, "loss": 0.76090378, "num_input_tokens_seen": 65679310, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.21679688, "step": 3039, "time_per_iteration": 2.6802639961242676 }, { "auxiliary_loss_clip": 0.06560489, "auxiliary_loss_mlp": 0.01278017, "balance_loss_clip": 0.06294721, "balance_loss_mlp": 0.01255748, "epoch": 0.18277468811062678, "flos": 53298905834880.0, "grad_norm": 1.8625903363306242, "language_loss": 0.74793363, "learning_rate": 3.7601168872259593e-06, "loss": 0.82631862, "num_input_tokens_seen": 65705235, "router_z_loss_clip": 2.65820312, "router_z_loss_mlp": 0.22290039, "step": 3040, "time_per_iteration": 2.8798465728759766 }, { "auxiliary_loss_clip": 0.06551532, "auxiliary_loss_mlp": 0.01285159, "balance_loss_clip": 0.06290005, "balance_loss_mlp": 0.01263737, "epoch": 0.18283481136329474, "flos": 31658879975040.0, "grad_norm": 1.8738758653841736, "language_loss": 0.61304259, "learning_rate": 3.7599319119831075e-06, "loss": 0.69140947, "num_input_tokens_seen": 65727575, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.21435547, "step": 3041, "time_per_iteration": 2.8356761932373047 }, { "auxiliary_loss_clip": 0.06557202, "auxiliary_loss_mlp": 0.01279172, "balance_loss_clip": 0.06290968, "balance_loss_mlp": 0.01256641, "epoch": 0.18289493461596273, "flos": 53148957753600.0, "grad_norm": 1.8115616455747157, "language_loss": 0.60873133, "learning_rate": 3.7597468700034616e-06, "loss": 0.68709505, "num_input_tokens_seen": 65751370, "router_z_loss_clip": 2.6640625, "router_z_loss_mlp": 0.2253418, "step": 3042, "time_per_iteration": 2.8683786392211914 }, { "auxiliary_loss_clip": 0.06561162, "auxiliary_loss_mlp": 0.01281994, "balance_loss_clip": 0.0630011, "balance_loss_mlp": 0.01260358, "epoch": 0.1829550578686307, "flos": 25595797587840.0, "grad_norm": 2.3922731104491546, "language_loss": 0.87981689, "learning_rate": 3.7595617612940374e-06, "loss": 0.9582485, "num_input_tokens_seen": 65771040, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.21618652, "step": 3043, "time_per_iteration": 2.620074510574341 }, { "auxiliary_loss_clip": 0.06564254, "auxiliary_loss_mlp": 0.01281055, "balance_loss_clip": 0.06295581, "balance_loss_mlp": 0.01259049, "epoch": 0.18301518112129866, "flos": 22608001025280.0, "grad_norm": 1.8923144383462571, "language_loss": 0.71200752, "learning_rate": 3.7593765858618552e-06, "loss": 0.79046065, "num_input_tokens_seen": 65789345, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.22021484, "step": 3044, "time_per_iteration": 2.584308385848999 }, { "auxiliary_loss_clip": 0.06563708, "auxiliary_loss_mlp": 0.01279286, "balance_loss_clip": 0.06292644, "balance_loss_mlp": 0.01256171, "epoch": 0.18307530437396663, "flos": 34029176273280.0, "grad_norm": 1.8582777868682785, "language_loss": 0.65120864, "learning_rate": 3.7591913437139365e-06, "loss": 0.72963858, "num_input_tokens_seen": 65810990, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.23132324, "step": 3045, "time_per_iteration": 2.7298026084899902 }, { "auxiliary_loss_clip": 0.06558979, "auxiliary_loss_mlp": 0.01282278, "balance_loss_clip": 0.0629494, "balance_loss_mlp": 0.01261487, "epoch": 0.1831354276266346, "flos": 21284756547840.0, "grad_norm": 3.371395686037364, "language_loss": 0.80150473, "learning_rate": 3.7590060348573066e-06, "loss": 0.87991726, "num_input_tokens_seen": 65827230, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.20812988, "step": 3046, "time_per_iteration": 2.582831621170044 }, { "auxiliary_loss_clip": 0.0656763, "auxiliary_loss_mlp": 0.01291748, "balance_loss_clip": 0.06296637, "balance_loss_mlp": 0.01269146, "epoch": 0.18319555087930256, "flos": 21039338338560.0, "grad_norm": 2.422755478593393, "language_loss": 0.80085003, "learning_rate": 3.7588206592989903e-06, "loss": 0.87944382, "num_input_tokens_seen": 65845900, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.22583008, "step": 3047, "time_per_iteration": 2.5872011184692383 }, { "auxiliary_loss_clip": 0.06557032, "auxiliary_loss_mlp": 0.01279511, "balance_loss_clip": 0.0629735, "balance_loss_mlp": 0.01258733, "epoch": 0.18325567413197055, "flos": 34390944276480.0, "grad_norm": 1.6188060006952487, "language_loss": 0.81793106, "learning_rate": 3.7586352170460194e-06, "loss": 0.8962965, "num_input_tokens_seen": 65868730, "router_z_loss_clip": 2.59375, "router_z_loss_mlp": 0.20776367, "step": 3048, "time_per_iteration": 2.746086597442627 }, { "auxiliary_loss_clip": 0.06556903, "auxiliary_loss_mlp": 0.01281234, "balance_loss_clip": 0.06292225, "balance_loss_mlp": 0.012598, "epoch": 0.18331579738463852, "flos": 20564742412800.0, "grad_norm": 1.9455042841202328, "language_loss": 0.87142164, "learning_rate": 3.758449708105424e-06, "loss": 0.94980299, "num_input_tokens_seen": 65888420, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.21435547, "step": 3049, "time_per_iteration": 2.555647373199463 }, { "auxiliary_loss_clip": 0.06572159, "auxiliary_loss_mlp": 0.01286034, "balance_loss_clip": 0.06296804, "balance_loss_mlp": 0.01263337, "epoch": 0.18337592063730648, "flos": 19613663844480.0, "grad_norm": 3.217404444960485, "language_loss": 0.78235185, "learning_rate": 3.75826413248424e-06, "loss": 0.86093378, "num_input_tokens_seen": 65905840, "router_z_loss_clip": 2.75390625, "router_z_loss_mlp": 0.22705078, "step": 3050, "time_per_iteration": 2.5559730529785156 }, { "auxiliary_loss_clip": 0.06556386, "auxiliary_loss_mlp": 0.01278068, "balance_loss_clip": 0.06290183, "balance_loss_mlp": 0.01257504, "epoch": 0.18343604388997445, "flos": 20857301343360.0, "grad_norm": 2.1659263119642227, "language_loss": 1.00205207, "learning_rate": 3.7580784901895035e-06, "loss": 1.08039665, "num_input_tokens_seen": 65922845, "router_z_loss_clip": 2.6640625, "router_z_loss_mlp": 0.20568848, "step": 3051, "time_per_iteration": 2.6007583141326904 }, { "auxiliary_loss_clip": 0.06554615, "auxiliary_loss_mlp": 0.01284443, "balance_loss_clip": 0.06292006, "balance_loss_mlp": 0.01262806, "epoch": 0.1834961671426424, "flos": 24402109921920.0, "grad_norm": 2.031311349940212, "language_loss": 0.86606133, "learning_rate": 3.7578927812282542e-06, "loss": 0.94445187, "num_input_tokens_seen": 65945555, "router_z_loss_clip": 2.63085938, "router_z_loss_mlp": 0.21643066, "step": 3052, "time_per_iteration": 2.6367642879486084 }, { "auxiliary_loss_clip": 0.06551608, "auxiliary_loss_mlp": 0.01277789, "balance_loss_clip": 0.06289356, "balance_loss_mlp": 0.01256761, "epoch": 0.18355629039531038, "flos": 21257992368000.0, "grad_norm": 2.107676181598376, "language_loss": 0.73817629, "learning_rate": 3.7577070056075356e-06, "loss": 0.81647027, "num_input_tokens_seen": 65963965, "router_z_loss_clip": 2.62304688, "router_z_loss_mlp": 0.21032715, "step": 3053, "time_per_iteration": 2.571791410446167 }, { "auxiliary_loss_clip": 0.06563292, "auxiliary_loss_mlp": 0.01286431, "balance_loss_clip": 0.06292282, "balance_loss_mlp": 0.01264174, "epoch": 0.18361641364797834, "flos": 28663830034560.0, "grad_norm": 1.70886668979281, "language_loss": 0.628492, "learning_rate": 3.7575211633343902e-06, "loss": 0.70698923, "num_input_tokens_seen": 65985965, "router_z_loss_clip": 2.70898438, "router_z_loss_mlp": 0.22265625, "step": 3054, "time_per_iteration": 2.621251106262207 }, { "auxiliary_loss_clip": 0.06560157, "auxiliary_loss_mlp": 0.01277862, "balance_loss_clip": 0.06291135, "balance_loss_mlp": 0.0125532, "epoch": 0.18367653690064634, "flos": 20924414064000.0, "grad_norm": 1.8645737696834306, "language_loss": 0.79245305, "learning_rate": 3.7573352544158663e-06, "loss": 0.87083328, "num_input_tokens_seen": 66005645, "router_z_loss_clip": 2.69335938, "router_z_loss_mlp": 0.22521973, "step": 3055, "time_per_iteration": 2.5870935916900635 }, { "auxiliary_loss_clip": 0.06549063, "auxiliary_loss_mlp": 0.01276805, "balance_loss_clip": 0.06287945, "balance_loss_mlp": 0.01256122, "epoch": 0.1837366601533143, "flos": 28772884523520.0, "grad_norm": 17.199377815410962, "language_loss": 0.70506108, "learning_rate": 3.757149278859014e-06, "loss": 0.78331971, "num_input_tokens_seen": 66025675, "router_z_loss_clip": 2.61523438, "router_z_loss_mlp": 0.20690918, "step": 3056, "time_per_iteration": 2.606657028198242 }, { "auxiliary_loss_clip": 0.06560816, "auxiliary_loss_mlp": 0.01278923, "balance_loss_clip": 0.0629185, "balance_loss_mlp": 0.01258324, "epoch": 0.18379678340598227, "flos": 21257782732800.0, "grad_norm": 1.7379223568451916, "language_loss": 0.81172109, "learning_rate": 3.7569632366708842e-06, "loss": 0.89011848, "num_input_tokens_seen": 66046125, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.20593262, "step": 3057, "time_per_iteration": 3.995802164077759 }, { "auxiliary_loss_clip": 0.0656729, "auxiliary_loss_mlp": 0.01286991, "balance_loss_clip": 0.06290267, "balance_loss_mlp": 0.01263411, "epoch": 0.18385690665865023, "flos": 20455981413120.0, "grad_norm": 2.526922417769403, "language_loss": 0.83173186, "learning_rate": 3.756777127858533e-06, "loss": 0.91027462, "num_input_tokens_seen": 66064375, "router_z_loss_clip": 2.765625, "router_z_loss_mlp": 0.23596191, "step": 3058, "time_per_iteration": 2.5589208602905273 }, { "auxiliary_loss_clip": 0.0655975, "auxiliary_loss_mlp": 0.01286214, "balance_loss_clip": 0.06291498, "balance_loss_mlp": 0.01262969, "epoch": 0.1839170299113182, "flos": 26147736432000.0, "grad_norm": 2.448297716292587, "language_loss": 0.86267102, "learning_rate": 3.756590952429017e-06, "loss": 0.94113064, "num_input_tokens_seen": 66084590, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.23254395, "step": 3059, "time_per_iteration": 2.583343505859375 }, { "auxiliary_loss_clip": 0.06557044, "auxiliary_loss_mlp": 0.012775, "balance_loss_clip": 0.06291164, "balance_loss_mlp": 0.01256818, "epoch": 0.18397715316398616, "flos": 31765921966080.0, "grad_norm": 1.6437624947545906, "language_loss": 0.7346741, "learning_rate": 3.756404710389396e-06, "loss": 0.81301957, "num_input_tokens_seen": 66107105, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.20690918, "step": 3060, "time_per_iteration": 4.121454954147339 }, { "auxiliary_loss_clip": 0.06562789, "auxiliary_loss_mlp": 0.01281624, "balance_loss_clip": 0.06291692, "balance_loss_mlp": 0.01259439, "epoch": 0.18403727641665413, "flos": 24619548067200.0, "grad_norm": 1.7163930382011248, "language_loss": 0.73082983, "learning_rate": 3.7562184017467323e-06, "loss": 0.80927396, "num_input_tokens_seen": 66129295, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.22180176, "step": 3061, "time_per_iteration": 2.6234312057495117 }, { "auxiliary_loss_clip": 0.06561273, "auxiliary_loss_mlp": 0.01286774, "balance_loss_clip": 0.06293832, "balance_loss_mlp": 0.01265697, "epoch": 0.18409739966932212, "flos": 23446503233280.0, "grad_norm": 1.8957931029351287, "language_loss": 0.82040632, "learning_rate": 3.7560320265080906e-06, "loss": 0.8988868, "num_input_tokens_seen": 66146910, "router_z_loss_clip": 2.67382812, "router_z_loss_mlp": 0.21081543, "step": 3062, "time_per_iteration": 2.595551013946533 }, { "auxiliary_loss_clip": 0.06567098, "auxiliary_loss_mlp": 0.01280743, "balance_loss_clip": 0.06291941, "balance_loss_mlp": 0.01258939, "epoch": 0.18415752292199009, "flos": 21878637160320.0, "grad_norm": 3.3410771594634663, "language_loss": 0.73912948, "learning_rate": 3.7558455846805383e-06, "loss": 0.81760788, "num_input_tokens_seen": 66165370, "router_z_loss_clip": 2.75195312, "router_z_loss_mlp": 0.21789551, "step": 3063, "time_per_iteration": 2.5677857398986816 }, { "auxiliary_loss_clip": 0.06559767, "auxiliary_loss_mlp": 0.01281543, "balance_loss_clip": 0.06292219, "balance_loss_mlp": 0.01259656, "epoch": 0.18421764617465805, "flos": 25417701734400.0, "grad_norm": 1.7818096137814972, "language_loss": 0.66240573, "learning_rate": 3.7556590762711463e-06, "loss": 0.74081886, "num_input_tokens_seen": 66186210, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.21875, "step": 3064, "time_per_iteration": 2.5959877967834473 }, { "auxiliary_loss_clip": 0.06565136, "auxiliary_loss_mlp": 0.01277349, "balance_loss_clip": 0.06296474, "balance_loss_mlp": 0.01255498, "epoch": 0.18427776942732602, "flos": 27205395793920.0, "grad_norm": 1.936416977002335, "language_loss": 0.68699926, "learning_rate": 3.7554725012869853e-06, "loss": 0.76542413, "num_input_tokens_seen": 66204800, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.21838379, "step": 3065, "time_per_iteration": 4.000540018081665 }, { "auxiliary_loss_clip": 0.06567807, "auxiliary_loss_mlp": 0.01279801, "balance_loss_clip": 0.0629541, "balance_loss_mlp": 0.01256149, "epoch": 0.18433789267999398, "flos": 27859303457280.0, "grad_norm": 2.2700271580640154, "language_loss": 0.73662722, "learning_rate": 3.7552858597351318e-06, "loss": 0.81510329, "num_input_tokens_seen": 66222195, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.23657227, "step": 3066, "time_per_iteration": 2.6361887454986572 }, { "auxiliary_loss_clip": 0.06562449, "auxiliary_loss_mlp": 0.01281477, "balance_loss_clip": 0.06292394, "balance_loss_mlp": 0.01258946, "epoch": 0.18439801593266195, "flos": 17862502965120.0, "grad_norm": 2.106134251916698, "language_loss": 0.82964337, "learning_rate": 3.7550991516226622e-06, "loss": 0.9080826, "num_input_tokens_seen": 66239505, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.22546387, "step": 3067, "time_per_iteration": 3.9745683670043945 }, { "auxiliary_loss_clip": 0.06454746, "auxiliary_loss_mlp": 0.01257759, "balance_loss_clip": 0.06302468, "balance_loss_mlp": 0.01251844, "epoch": 0.18445813918532994, "flos": 56408236416000.0, "grad_norm": 0.7877012611370661, "language_loss": 0.59651202, "learning_rate": 3.754912376956657e-06, "loss": 0.67363703, "num_input_tokens_seen": 66295695, "router_z_loss_clip": 1.52539062, "router_z_loss_mlp": 0.05911255, "step": 3068, "time_per_iteration": 3.158179998397827 }, { "auxiliary_loss_clip": 0.06558856, "auxiliary_loss_mlp": 0.01286378, "balance_loss_clip": 0.0629551, "balance_loss_mlp": 0.01263609, "epoch": 0.1845182624379979, "flos": 20963085523200.0, "grad_norm": 2.063201326185846, "language_loss": 0.76836985, "learning_rate": 3.7547255357441987e-06, "loss": 0.8468222, "num_input_tokens_seen": 66315315, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.2277832, "step": 3069, "time_per_iteration": 2.5919296741485596 }, { "auxiliary_loss_clip": 0.06566764, "auxiliary_loss_mlp": 0.01286358, "balance_loss_clip": 0.06298262, "balance_loss_mlp": 0.01263494, "epoch": 0.18457838569066587, "flos": 20491382563200.0, "grad_norm": 1.8724013246688795, "language_loss": 0.85693324, "learning_rate": 3.7545386279923718e-06, "loss": 0.93546444, "num_input_tokens_seen": 66333675, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.2286377, "step": 3070, "time_per_iteration": 2.5901472568511963 }, { "auxiliary_loss_clip": 0.06565585, "auxiliary_loss_mlp": 0.01278541, "balance_loss_clip": 0.06294007, "balance_loss_mlp": 0.01255486, "epoch": 0.18463850894333383, "flos": 25017094563840.0, "grad_norm": 2.706645471039013, "language_loss": 0.78835541, "learning_rate": 3.754351653708265e-06, "loss": 0.86679673, "num_input_tokens_seen": 66354075, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.23034668, "step": 3071, "time_per_iteration": 2.5964162349700928 }, { "auxiliary_loss_clip": 0.06567757, "auxiliary_loss_mlp": 0.01283952, "balance_loss_clip": 0.06295228, "balance_loss_mlp": 0.01260349, "epoch": 0.1846986321960018, "flos": 16806311049600.0, "grad_norm": 2.1322996842609956, "language_loss": 0.78028429, "learning_rate": 3.7541646128989674e-06, "loss": 0.85880136, "num_input_tokens_seen": 66372520, "router_z_loss_clip": 2.72070312, "router_z_loss_mlp": 0.23608398, "step": 3072, "time_per_iteration": 2.547231435775757 }, { "auxiliary_loss_clip": 0.06569453, "auxiliary_loss_mlp": 0.01281208, "balance_loss_clip": 0.06296159, "balance_loss_mlp": 0.01257271, "epoch": 0.18475875544866976, "flos": 20820726236160.0, "grad_norm": 1.9727362984791863, "language_loss": 0.86961615, "learning_rate": 3.7539775055715715e-06, "loss": 0.94812274, "num_input_tokens_seen": 66390745, "router_z_loss_clip": 2.73242188, "router_z_loss_mlp": 0.23925781, "step": 3073, "time_per_iteration": 2.5606017112731934 }, { "auxiliary_loss_clip": 0.06571247, "auxiliary_loss_mlp": 0.01284013, "balance_loss_clip": 0.06296413, "balance_loss_mlp": 0.01261434, "epoch": 0.18481887870133773, "flos": 22608001025280.0, "grad_norm": 2.2159980990677597, "language_loss": 0.92650092, "learning_rate": 3.7537903317331732e-06, "loss": 1.00505352, "num_input_tokens_seen": 66410525, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.22583008, "step": 3074, "time_per_iteration": 2.569260358810425 }, { "auxiliary_loss_clip": 0.06557702, "auxiliary_loss_mlp": 0.01283665, "balance_loss_clip": 0.0628906, "balance_loss_mlp": 0.01258631, "epoch": 0.18487900195400572, "flos": 29466218332800.0, "grad_norm": 1.839067906386594, "language_loss": 0.65010464, "learning_rate": 3.75360309139087e-06, "loss": 0.72851831, "num_input_tokens_seen": 66432535, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.25036621, "step": 3075, "time_per_iteration": 2.63143253326416 }, { "auxiliary_loss_clip": 0.06561191, "auxiliary_loss_mlp": 0.01287345, "balance_loss_clip": 0.06296839, "balance_loss_mlp": 0.01264135, "epoch": 0.1849391252066737, "flos": 20634622318080.0, "grad_norm": 2.1102309008150386, "language_loss": 0.73353571, "learning_rate": 3.753415784551761e-06, "loss": 0.81202108, "num_input_tokens_seen": 66450620, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.23205566, "step": 3076, "time_per_iteration": 2.568328380584717 }, { "auxiliary_loss_clip": 0.06565771, "auxiliary_loss_mlp": 0.01282059, "balance_loss_clip": 0.06291091, "balance_loss_mlp": 0.01260113, "epoch": 0.18499924845934165, "flos": 14433750691200.0, "grad_norm": 2.3453234763671387, "language_loss": 0.81632322, "learning_rate": 3.7532284112229507e-06, "loss": 0.8948015, "num_input_tokens_seen": 66467865, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.21948242, "step": 3077, "time_per_iteration": 2.547475814819336 }, { "auxiliary_loss_clip": 0.06552808, "auxiliary_loss_mlp": 0.01279833, "balance_loss_clip": 0.06291958, "balance_loss_mlp": 0.01257088, "epoch": 0.18505937171200962, "flos": 23733611648640.0, "grad_norm": 1.9860643061568128, "language_loss": 0.79175949, "learning_rate": 3.7530409714115424e-06, "loss": 0.87008584, "num_input_tokens_seen": 66486245, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.22729492, "step": 3078, "time_per_iteration": 2.635568141937256 }, { "auxiliary_loss_clip": 0.06557982, "auxiliary_loss_mlp": 0.01281154, "balance_loss_clip": 0.06288693, "balance_loss_mlp": 0.01258588, "epoch": 0.18511949496467758, "flos": 25964525479680.0, "grad_norm": 2.6051778899987443, "language_loss": 0.78121287, "learning_rate": 3.7528534651246453e-06, "loss": 0.85960424, "num_input_tokens_seen": 66506510, "router_z_loss_clip": 2.69335938, "router_z_loss_mlp": 0.22570801, "step": 3079, "time_per_iteration": 2.5965464115142822 }, { "auxiliary_loss_clip": 0.06558189, "auxiliary_loss_mlp": 0.01284721, "balance_loss_clip": 0.06293534, "balance_loss_mlp": 0.01261463, "epoch": 0.18517961821734555, "flos": 42423506156160.0, "grad_norm": 1.651707155058429, "language_loss": 0.82573342, "learning_rate": 3.752665892369369e-06, "loss": 0.90416259, "num_input_tokens_seen": 66530960, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.23242188, "step": 3080, "time_per_iteration": 2.7619755268096924 }, { "auxiliary_loss_clip": 0.06570546, "auxiliary_loss_mlp": 0.01278402, "balance_loss_clip": 0.06297151, "balance_loss_mlp": 0.01253619, "epoch": 0.18523974147001354, "flos": 24104435892480.0, "grad_norm": 2.1457310107971144, "language_loss": 0.75250912, "learning_rate": 3.7524782531528266e-06, "loss": 0.8309986, "num_input_tokens_seen": 66550275, "router_z_loss_clip": 2.73242188, "router_z_loss_mlp": 0.2479248, "step": 3081, "time_per_iteration": 2.599451780319214 }, { "auxiliary_loss_clip": 0.06559093, "auxiliary_loss_mlp": 0.01288663, "balance_loss_clip": 0.06289418, "balance_loss_mlp": 0.01263677, "epoch": 0.1852998647226815, "flos": 27381688784640.0, "grad_norm": 2.3954661689850676, "language_loss": 0.72398198, "learning_rate": 3.7522905474821334e-06, "loss": 0.8024596, "num_input_tokens_seen": 66569040, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.25012207, "step": 3082, "time_per_iteration": 2.62174916267395 }, { "auxiliary_loss_clip": 0.0656904, "auxiliary_loss_mlp": 0.01283236, "balance_loss_clip": 0.06292188, "balance_loss_mlp": 0.01258417, "epoch": 0.18535998797534947, "flos": 18338650191360.0, "grad_norm": 2.3831404770402216, "language_loss": 0.70724034, "learning_rate": 3.752102775364407e-06, "loss": 0.78576308, "num_input_tokens_seen": 66587775, "router_z_loss_clip": 2.76953125, "router_z_loss_mlp": 0.24816895, "step": 3083, "time_per_iteration": 2.560497283935547 }, { "auxiliary_loss_clip": 0.06553149, "auxiliary_loss_mlp": 0.01278106, "balance_loss_clip": 0.0628906, "balance_loss_mlp": 0.01254383, "epoch": 0.18542011122801744, "flos": 37853881816320.0, "grad_norm": 2.238027878625801, "language_loss": 0.69484138, "learning_rate": 3.751914936806767e-06, "loss": 0.77315402, "num_input_tokens_seen": 66610800, "router_z_loss_clip": 2.64453125, "router_z_loss_mlp": 0.23730469, "step": 3084, "time_per_iteration": 2.7251832485198975 }, { "auxiliary_loss_clip": 0.06552036, "auxiliary_loss_mlp": 0.01278667, "balance_loss_clip": 0.06289829, "balance_loss_mlp": 0.01256935, "epoch": 0.1854802344806854, "flos": 25192171670400.0, "grad_norm": 3.4857012520684667, "language_loss": 0.78244442, "learning_rate": 3.7517270318163377e-06, "loss": 0.86075139, "num_input_tokens_seen": 66630960, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.21728516, "step": 3085, "time_per_iteration": 2.6047935485839844 }, { "auxiliary_loss_clip": 0.06560589, "auxiliary_loss_mlp": 0.01281343, "balance_loss_clip": 0.06292744, "balance_loss_mlp": 0.01257239, "epoch": 0.18554035773335337, "flos": 26691541430400.0, "grad_norm": 1.823292111985676, "language_loss": 0.74518895, "learning_rate": 3.751539060400244e-06, "loss": 0.82360822, "num_input_tokens_seen": 66650585, "router_z_loss_clip": 2.68164062, "router_z_loss_mlp": 0.2409668, "step": 3086, "time_per_iteration": 2.5879721641540527 }, { "auxiliary_loss_clip": 0.06565833, "auxiliary_loss_mlp": 0.01282029, "balance_loss_clip": 0.06297665, "balance_loss_mlp": 0.01257126, "epoch": 0.18560048098602133, "flos": 22353568502400.0, "grad_norm": 2.6203228006885513, "language_loss": 0.70087814, "learning_rate": 3.7513510225656132e-06, "loss": 0.77935672, "num_input_tokens_seen": 66670045, "router_z_loss_clip": 2.6796875, "router_z_loss_mlp": 0.24938965, "step": 3087, "time_per_iteration": 2.5602662563323975 }, { "auxiliary_loss_clip": 0.06560057, "auxiliary_loss_mlp": 0.01282809, "balance_loss_clip": 0.06292567, "balance_loss_mlp": 0.01258001, "epoch": 0.18566060423868933, "flos": 17754245089920.0, "grad_norm": 2.0421874574402645, "language_loss": 0.73134744, "learning_rate": 3.7511629183195764e-06, "loss": 0.80977607, "num_input_tokens_seen": 66688790, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.24829102, "step": 3088, "time_per_iteration": 2.5415878295898438 }, { "auxiliary_loss_clip": 0.065574, "auxiliary_loss_mlp": 0.01283354, "balance_loss_clip": 0.06291871, "balance_loss_mlp": 0.01261598, "epoch": 0.1857207274913573, "flos": 24683558186880.0, "grad_norm": 8.283774439534799, "language_loss": 0.92225575, "learning_rate": 3.7509747476692663e-06, "loss": 1.00066328, "num_input_tokens_seen": 66708090, "router_z_loss_clip": 2.65234375, "router_z_loss_mlp": 0.21740723, "step": 3089, "time_per_iteration": 2.66800594329834 }, { "auxiliary_loss_clip": 0.06558194, "auxiliary_loss_mlp": 0.01283431, "balance_loss_clip": 0.06293039, "balance_loss_mlp": 0.01261604, "epoch": 0.18578085074402526, "flos": 28155426186240.0, "grad_norm": 3.067088270163474, "language_loss": 0.58625102, "learning_rate": 3.7507865106218176e-06, "loss": 0.66466725, "num_input_tokens_seen": 66727320, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.21838379, "step": 3090, "time_per_iteration": 2.6374762058258057 }, { "auxiliary_loss_clip": 0.06560756, "auxiliary_loss_mlp": 0.01287169, "balance_loss_clip": 0.06298556, "balance_loss_mlp": 0.0126409, "epoch": 0.18584097399669322, "flos": 23958764369280.0, "grad_norm": 1.9045809375312135, "language_loss": 0.82354259, "learning_rate": 3.7505982071843695e-06, "loss": 0.90202188, "num_input_tokens_seen": 66747505, "router_z_loss_clip": 2.625, "router_z_loss_mlp": 0.2310791, "step": 3091, "time_per_iteration": 2.594632625579834 }, { "auxiliary_loss_clip": 0.06561984, "auxiliary_loss_mlp": 0.01284922, "balance_loss_clip": 0.06292604, "balance_loss_mlp": 0.01261605, "epoch": 0.18590109724936119, "flos": 17207379417600.0, "grad_norm": 2.3461327022205176, "language_loss": 0.85405672, "learning_rate": 3.7504098373640617e-06, "loss": 0.93252575, "num_input_tokens_seen": 66766425, "router_z_loss_clip": 2.69726562, "router_z_loss_mlp": 0.2331543, "step": 3092, "time_per_iteration": 2.5433249473571777 }, { "auxiliary_loss_clip": 0.06566819, "auxiliary_loss_mlp": 0.01294575, "balance_loss_clip": 0.06294323, "balance_loss_mlp": 0.01270578, "epoch": 0.18596122050202915, "flos": 17239761820800.0, "grad_norm": 2.7214348389446474, "language_loss": 0.93916994, "learning_rate": 3.750221401168038e-06, "loss": 1.01778388, "num_input_tokens_seen": 66781130, "router_z_loss_clip": 2.72460938, "router_z_loss_mlp": 0.23999023, "step": 3093, "time_per_iteration": 2.5305089950561523 }, { "auxiliary_loss_clip": 0.06561021, "auxiliary_loss_mlp": 0.01283053, "balance_loss_clip": 0.06293846, "balance_loss_mlp": 0.01260356, "epoch": 0.18602134375469712, "flos": 19025862652800.0, "grad_norm": 1.8102358694743903, "language_loss": 0.77774411, "learning_rate": 3.750032898603443e-06, "loss": 0.85618484, "num_input_tokens_seen": 66797535, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.22680664, "step": 3094, "time_per_iteration": 2.628169536590576 }, { "auxiliary_loss_clip": 0.06554732, "auxiliary_loss_mlp": 0.01285264, "balance_loss_clip": 0.06288824, "balance_loss_mlp": 0.01261256, "epoch": 0.1860814670073651, "flos": 50961285429120.0, "grad_norm": 1.68785225407661, "language_loss": 0.70635337, "learning_rate": 3.749844329677425e-06, "loss": 0.78475332, "num_input_tokens_seen": 66821720, "router_z_loss_clip": 2.65820312, "router_z_loss_mlp": 0.23999023, "step": 3095, "time_per_iteration": 2.849398612976074 }, { "auxiliary_loss_clip": 0.06563748, "auxiliary_loss_mlp": 0.01290697, "balance_loss_clip": 0.06290467, "balance_loss_mlp": 0.01265341, "epoch": 0.18614159026003307, "flos": 19397064240000.0, "grad_norm": 2.256629587715542, "language_loss": 0.81116551, "learning_rate": 3.749655694397135e-06, "loss": 0.88970995, "num_input_tokens_seen": 66839060, "router_z_loss_clip": 2.73242188, "router_z_loss_mlp": 0.25390625, "step": 3096, "time_per_iteration": 2.5835916996002197 }, { "auxiliary_loss_clip": 0.06560127, "auxiliary_loss_mlp": 0.01288154, "balance_loss_clip": 0.06290288, "balance_loss_mlp": 0.01263907, "epoch": 0.18620171351270104, "flos": 21805235383680.0, "grad_norm": 2.2034469885623222, "language_loss": 0.75860298, "learning_rate": 3.7494669927697255e-06, "loss": 0.83708578, "num_input_tokens_seen": 66857760, "router_z_loss_clip": 2.69921875, "router_z_loss_mlp": 0.24255371, "step": 3097, "time_per_iteration": 3.9847328662872314 }, { "auxiliary_loss_clip": 0.06555636, "auxiliary_loss_mlp": 0.01278413, "balance_loss_clip": 0.0629536, "balance_loss_mlp": 0.01255966, "epoch": 0.186261836765369, "flos": 16368499866240.0, "grad_norm": 3.470305586816219, "language_loss": 0.66528046, "learning_rate": 3.749278224802352e-06, "loss": 0.74362099, "num_input_tokens_seen": 66876460, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.22436523, "step": 3098, "time_per_iteration": 2.545896530151367 }, { "auxiliary_loss_clip": 0.06560551, "auxiliary_loss_mlp": 0.0128408, "balance_loss_clip": 0.0629134, "balance_loss_mlp": 0.0125895, "epoch": 0.18632196001803697, "flos": 23377168379520.0, "grad_norm": 1.8282472414572155, "language_loss": 0.70448875, "learning_rate": 3.7490893905021733e-06, "loss": 0.78293502, "num_input_tokens_seen": 66897960, "router_z_loss_clip": 2.6953125, "router_z_loss_mlp": 0.25146484, "step": 3099, "time_per_iteration": 2.575425148010254 }, { "auxiliary_loss_clip": 0.06555377, "auxiliary_loss_mlp": 0.01289557, "balance_loss_clip": 0.06288858, "balance_loss_mlp": 0.0126568, "epoch": 0.18638208327070493, "flos": 22498569192960.0, "grad_norm": 1.5419437708433534, "language_loss": 0.72196913, "learning_rate": 3.7489004898763494e-06, "loss": 0.8004185, "num_input_tokens_seen": 66917675, "router_z_loss_clip": 2.6640625, "router_z_loss_mlp": 0.2388916, "step": 3100, "time_per_iteration": 3.9815714359283447 }, { "auxiliary_loss_clip": 0.06562316, "auxiliary_loss_mlp": 0.01293057, "balance_loss_clip": 0.06290765, "balance_loss_mlp": 0.01269215, "epoch": 0.18644220652337293, "flos": 29172317736960.0, "grad_norm": 2.535960017846674, "language_loss": 0.80971348, "learning_rate": 3.7487115229320444e-06, "loss": 0.88826722, "num_input_tokens_seen": 66936000, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.23864746, "step": 3101, "time_per_iteration": 2.611849308013916 }, { "auxiliary_loss_clip": 0.06556149, "auxiliary_loss_mlp": 0.01284792, "balance_loss_clip": 0.0629319, "balance_loss_mlp": 0.0126399, "epoch": 0.1865023297760409, "flos": 24250736321280.0, "grad_norm": 1.8593172708034158, "language_loss": 0.77509266, "learning_rate": 3.7485224896764222e-06, "loss": 0.85350209, "num_input_tokens_seen": 66955700, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.20788574, "step": 3102, "time_per_iteration": 2.585897207260132 }, { "auxiliary_loss_clip": 0.06558263, "auxiliary_loss_mlp": 0.0128432, "balance_loss_clip": 0.06289326, "balance_loss_mlp": 0.0126161, "epoch": 0.18656245302870886, "flos": 19133617403520.0, "grad_norm": 3.4958302489672426, "language_loss": 0.77357364, "learning_rate": 3.7483333901166525e-06, "loss": 0.8519994, "num_input_tokens_seen": 66972815, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.22717285, "step": 3103, "time_per_iteration": 2.533132553100586 }, { "auxiliary_loss_clip": 0.06562337, "auxiliary_loss_mlp": 0.01277453, "balance_loss_clip": 0.0629338, "balance_loss_mlp": 0.01254457, "epoch": 0.18662257628137682, "flos": 17791994154240.0, "grad_norm": 2.037263899395108, "language_loss": 0.79891914, "learning_rate": 3.7481442242599054e-06, "loss": 0.87731707, "num_input_tokens_seen": 66992280, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.2298584, "step": 3104, "time_per_iteration": 2.543550491333008 }, { "auxiliary_loss_clip": 0.0656213, "auxiliary_loss_mlp": 0.01288773, "balance_loss_clip": 0.06294823, "balance_loss_mlp": 0.012666, "epoch": 0.1866826995340448, "flos": 24031201824000.0, "grad_norm": 1.987207628691066, "language_loss": 0.85713547, "learning_rate": 3.747954992113354e-06, "loss": 0.93564451, "num_input_tokens_seen": 67012220, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.22167969, "step": 3105, "time_per_iteration": 4.0152623653411865 }, { "auxiliary_loss_clip": 0.06575066, "auxiliary_loss_mlp": 0.01280949, "balance_loss_clip": 0.06298874, "balance_loss_mlp": 0.01258371, "epoch": 0.18674282278671275, "flos": 26148533045760.0, "grad_norm": 2.0649585064566427, "language_loss": 0.87877464, "learning_rate": 3.7477656936841742e-06, "loss": 0.95733476, "num_input_tokens_seen": 67032030, "router_z_loss_clip": 2.76367188, "router_z_loss_mlp": 0.22583008, "step": 3106, "time_per_iteration": 4.114020824432373 }, { "auxiliary_loss_clip": 0.06571257, "auxiliary_loss_mlp": 0.01277426, "balance_loss_clip": 0.06296742, "balance_loss_mlp": 0.01255896, "epoch": 0.18680294603938072, "flos": 19206893399040.0, "grad_norm": 3.175186959819465, "language_loss": 0.78748417, "learning_rate": 3.7475763289795445e-06, "loss": 0.86597097, "num_input_tokens_seen": 67048920, "router_z_loss_clip": 2.74414062, "router_z_loss_mlp": 0.21533203, "step": 3107, "time_per_iteration": 2.5600945949554443 }, { "auxiliary_loss_clip": 0.06570213, "auxiliary_loss_mlp": 0.01292015, "balance_loss_clip": 0.06294572, "balance_loss_mlp": 0.01267541, "epoch": 0.1868630692920487, "flos": 28551840652800.0, "grad_norm": 2.601539308668114, "language_loss": 0.75392568, "learning_rate": 3.7473868980066446e-06, "loss": 0.8325479, "num_input_tokens_seen": 67068645, "router_z_loss_clip": 2.75585938, "router_z_loss_mlp": 0.24487305, "step": 3108, "time_per_iteration": 2.6044113636016846 }, { "auxiliary_loss_clip": 0.06570502, "auxiliary_loss_mlp": 0.01283704, "balance_loss_clip": 0.06300136, "balance_loss_mlp": 0.01260708, "epoch": 0.18692319254471668, "flos": 17243702962560.0, "grad_norm": 1.7209464630789715, "language_loss": 0.75707453, "learning_rate": 3.747197400772658e-06, "loss": 0.83561659, "num_input_tokens_seen": 67087075, "router_z_loss_clip": 2.703125, "router_z_loss_mlp": 0.23010254, "step": 3109, "time_per_iteration": 2.566704511642456 }, { "auxiliary_loss_clip": 0.06561249, "auxiliary_loss_mlp": 0.01284119, "balance_loss_clip": 0.06293292, "balance_loss_mlp": 0.01259597, "epoch": 0.18698331579738464, "flos": 23191861075200.0, "grad_norm": 1.6069352928892566, "language_loss": 0.85241866, "learning_rate": 3.747007837284772e-06, "loss": 0.93087232, "num_input_tokens_seen": 67108040, "router_z_loss_clip": 2.6796875, "router_z_loss_mlp": 0.24511719, "step": 3110, "time_per_iteration": 2.591439962387085 }, { "auxiliary_loss_clip": 0.06561273, "auxiliary_loss_mlp": 0.01292493, "balance_loss_clip": 0.06296884, "balance_loss_mlp": 0.01269748, "epoch": 0.1870434390500526, "flos": 25523192424960.0, "grad_norm": 1.9336853513861396, "language_loss": 0.8527379, "learning_rate": 3.7468182075501737e-06, "loss": 0.93127555, "num_input_tokens_seen": 67127605, "router_z_loss_clip": 2.64453125, "router_z_loss_mlp": 0.22741699, "step": 3111, "time_per_iteration": 2.6025898456573486 }, { "auxiliary_loss_clip": 0.06557271, "auxiliary_loss_mlp": 0.01290596, "balance_loss_clip": 0.06291113, "balance_loss_mlp": 0.01267208, "epoch": 0.18710356230272057, "flos": 19506999196800.0, "grad_norm": 2.008238930866278, "language_loss": 0.77660972, "learning_rate": 3.7466285115760536e-06, "loss": 0.85508835, "num_input_tokens_seen": 67145785, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.23376465, "step": 3112, "time_per_iteration": 2.586298942565918 }, { "auxiliary_loss_clip": 0.06556826, "auxiliary_loss_mlp": 0.0128125, "balance_loss_clip": 0.0629088, "balance_loss_mlp": 0.01259876, "epoch": 0.18716368555538854, "flos": 26768129662080.0, "grad_norm": 2.4590934674248697, "language_loss": 0.65935189, "learning_rate": 3.7464387493696046e-06, "loss": 0.73773265, "num_input_tokens_seen": 67165930, "router_z_loss_clip": 2.65820312, "router_z_loss_mlp": 0.21362305, "step": 3113, "time_per_iteration": 2.591568946838379 }, { "auxiliary_loss_clip": 0.06567281, "auxiliary_loss_mlp": 0.01281481, "balance_loss_clip": 0.06295586, "balance_loss_mlp": 0.01258938, "epoch": 0.1872238088080565, "flos": 25196490155520.0, "grad_norm": 3.2934884391537986, "language_loss": 0.81875104, "learning_rate": 3.746248920938024e-06, "loss": 0.89723861, "num_input_tokens_seen": 67185830, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.22570801, "step": 3114, "time_per_iteration": 2.599579334259033 }, { "auxiliary_loss_clip": 0.06565085, "auxiliary_loss_mlp": 0.01292252, "balance_loss_clip": 0.0629497, "balance_loss_mlp": 0.01268446, "epoch": 0.1872839320607245, "flos": 24141220634880.0, "grad_norm": 2.5230982787270797, "language_loss": 0.5791105, "learning_rate": 3.74605902628851e-06, "loss": 0.65768385, "num_input_tokens_seen": 67206930, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.23803711, "step": 3115, "time_per_iteration": 2.614021062850952 }, { "auxiliary_loss_clip": 0.06559339, "auxiliary_loss_mlp": 0.01283663, "balance_loss_clip": 0.06294023, "balance_loss_mlp": 0.01262623, "epoch": 0.18734405531339246, "flos": 21179349711360.0, "grad_norm": 1.9865041918584405, "language_loss": 0.71934485, "learning_rate": 3.745869065428261e-06, "loss": 0.79777491, "num_input_tokens_seen": 67226290, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.21044922, "step": 3116, "time_per_iteration": 2.687678575515747 }, { "auxiliary_loss_clip": 0.06554516, "auxiliary_loss_mlp": 0.01282601, "balance_loss_clip": 0.06292449, "balance_loss_mlp": 0.01261763, "epoch": 0.18740417856606043, "flos": 17243325619200.0, "grad_norm": 2.567379725333533, "language_loss": 0.7961368, "learning_rate": 3.7456790383644833e-06, "loss": 0.8745079, "num_input_tokens_seen": 67244410, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.20861816, "step": 3117, "time_per_iteration": 2.5427491664886475 }, { "auxiliary_loss_clip": 0.06554545, "auxiliary_loss_mlp": 0.01295661, "balance_loss_clip": 0.0629508, "balance_loss_mlp": 0.01273297, "epoch": 0.1874643018187284, "flos": 32565626933760.0, "grad_norm": 2.0509058460965948, "language_loss": 0.84781718, "learning_rate": 3.745488945104381e-06, "loss": 0.92631924, "num_input_tokens_seen": 67264470, "router_z_loss_clip": 2.59375, "router_z_loss_mlp": 0.22363281, "step": 3118, "time_per_iteration": 2.655946731567383 }, { "auxiliary_loss_clip": 0.06553064, "auxiliary_loss_mlp": 0.01284261, "balance_loss_clip": 0.06287703, "balance_loss_mlp": 0.01261349, "epoch": 0.18752442507139636, "flos": 23264843581440.0, "grad_norm": 2.116898730778344, "language_loss": 0.77273417, "learning_rate": 3.7452987856551636e-06, "loss": 0.85110742, "num_input_tokens_seen": 67284315, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.22912598, "step": 3119, "time_per_iteration": 2.5638411045074463 }, { "auxiliary_loss_clip": 0.06562437, "auxiliary_loss_mlp": 0.01278235, "balance_loss_clip": 0.06294557, "balance_loss_mlp": 0.0125636, "epoch": 0.18758454832406432, "flos": 21767150903040.0, "grad_norm": 2.128469822072834, "language_loss": 0.82762182, "learning_rate": 3.7451085600240406e-06, "loss": 0.90602851, "num_input_tokens_seen": 67302780, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.21875, "step": 3120, "time_per_iteration": 2.5614137649536133 }, { "auxiliary_loss_clip": 0.06547903, "auxiliary_loss_mlp": 0.01283027, "balance_loss_clip": 0.06288967, "balance_loss_mlp": 0.01261295, "epoch": 0.1876446715767323, "flos": 29577956152320.0, "grad_norm": 1.980164222611683, "language_loss": 0.8597616, "learning_rate": 3.7449182682182263e-06, "loss": 0.93807083, "num_input_tokens_seen": 67323405, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.21740723, "step": 3121, "time_per_iteration": 2.6356570720672607 }, { "auxiliary_loss_clip": 0.06552959, "auxiliary_loss_mlp": 0.01280083, "balance_loss_clip": 0.0629174, "balance_loss_mlp": 0.01258435, "epoch": 0.18770479482940028, "flos": 30348465171840.0, "grad_norm": 3.856061755934454, "language_loss": 0.70956582, "learning_rate": 3.744727910244937e-06, "loss": 0.78789628, "num_input_tokens_seen": 67345800, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.21643066, "step": 3122, "time_per_iteration": 2.645657777786255 }, { "auxiliary_loss_clip": 0.06554797, "auxiliary_loss_mlp": 0.01277845, "balance_loss_clip": 0.06291892, "balance_loss_mlp": 0.01255731, "epoch": 0.18776491808206824, "flos": 14470619287680.0, "grad_norm": 2.2521387648679116, "language_loss": 0.72409528, "learning_rate": 3.7445374861113905e-06, "loss": 0.80242169, "num_input_tokens_seen": 67363575, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.22119141, "step": 3123, "time_per_iteration": 2.5234899520874023 }, { "auxiliary_loss_clip": 0.06553608, "auxiliary_loss_mlp": 0.01276266, "balance_loss_clip": 0.06292704, "balance_loss_mlp": 0.01255393, "epoch": 0.1878250413347362, "flos": 24505420406400.0, "grad_norm": 1.8586898673111412, "language_loss": 0.74919778, "learning_rate": 3.7443469958248066e-06, "loss": 0.82749653, "num_input_tokens_seen": 67381765, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.20874023, "step": 3124, "time_per_iteration": 2.579131603240967 }, { "auxiliary_loss_clip": 0.06553487, "auxiliary_loss_mlp": 0.01279367, "balance_loss_clip": 0.06290945, "balance_loss_mlp": 0.01256396, "epoch": 0.18788516458740417, "flos": 39795632807040.0, "grad_norm": 3.0575316482737205, "language_loss": 0.817186, "learning_rate": 3.7441564393924106e-06, "loss": 0.89551461, "num_input_tokens_seen": 67405000, "router_z_loss_clip": 2.625, "router_z_loss_mlp": 0.22973633, "step": 3125, "time_per_iteration": 2.714900016784668 }, { "auxiliary_loss_clip": 0.06456469, "auxiliary_loss_mlp": 0.01259245, "balance_loss_clip": 0.06304465, "balance_loss_mlp": 0.01252337, "epoch": 0.18794528784007214, "flos": 64717844221440.0, "grad_norm": 0.9044900555941209, "language_loss": 0.63625228, "learning_rate": 3.7439658168214273e-06, "loss": 0.71340942, "num_input_tokens_seen": 67467140, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.0692749, "step": 3126, "time_per_iteration": 3.22729229927063 }, { "auxiliary_loss_clip": 0.06557314, "auxiliary_loss_mlp": 0.01283364, "balance_loss_clip": 0.06300624, "balance_loss_mlp": 0.01261966, "epoch": 0.1880054110927401, "flos": 28629728622720.0, "grad_norm": 2.363505215685626, "language_loss": 0.81883967, "learning_rate": 3.7437751281190857e-06, "loss": 0.89724648, "num_input_tokens_seen": 67487980, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.21398926, "step": 3127, "time_per_iteration": 2.6192307472229004 }, { "auxiliary_loss_clip": 0.06455354, "auxiliary_loss_mlp": 0.01262323, "balance_loss_clip": 0.06303865, "balance_loss_mlp": 0.01255802, "epoch": 0.1880655343454081, "flos": 64508959192320.0, "grad_norm": 0.7375551186555714, "language_loss": 0.61976981, "learning_rate": 3.7435843732926164e-06, "loss": 0.69694662, "num_input_tokens_seen": 67552500, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.06530762, "step": 3128, "time_per_iteration": 3.294023275375366 }, { "auxiliary_loss_clip": 0.0655987, "auxiliary_loss_mlp": 0.01280851, "balance_loss_clip": 0.06294271, "balance_loss_mlp": 0.01259774, "epoch": 0.18812565759807606, "flos": 32132679287040.0, "grad_norm": 2.1788062313660492, "language_loss": 0.72258389, "learning_rate": 3.7433935523492536e-06, "loss": 0.80099106, "num_input_tokens_seen": 67573295, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.2109375, "step": 3129, "time_per_iteration": 2.6478915214538574 }, { "auxiliary_loss_clip": 0.06557339, "auxiliary_loss_mlp": 0.01282178, "balance_loss_clip": 0.06292832, "balance_loss_mlp": 0.01259564, "epoch": 0.18818578085074403, "flos": 20629674927360.0, "grad_norm": 2.363528313784909, "language_loss": 0.86036986, "learning_rate": 3.7432026652962314e-06, "loss": 0.93876505, "num_input_tokens_seen": 67590010, "router_z_loss_clip": 2.6484375, "router_z_loss_mlp": 0.22619629, "step": 3130, "time_per_iteration": 2.5758726596832275 }, { "auxiliary_loss_clip": 0.06560846, "auxiliary_loss_mlp": 0.01277707, "balance_loss_clip": 0.06292815, "balance_loss_mlp": 0.01256035, "epoch": 0.188245904103412, "flos": 28848131089920.0, "grad_norm": 2.3887102173010177, "language_loss": 0.771909, "learning_rate": 3.7430117121407897e-06, "loss": 0.85029459, "num_input_tokens_seen": 67611110, "router_z_loss_clip": 2.68164062, "router_z_loss_mlp": 0.21679688, "step": 3131, "time_per_iteration": 2.6141395568847656 }, { "auxiliary_loss_clip": 0.06564063, "auxiliary_loss_mlp": 0.01280888, "balance_loss_clip": 0.06302249, "balance_loss_mlp": 0.01257237, "epoch": 0.18830602735607996, "flos": 29427379165440.0, "grad_norm": 1.7507325085080927, "language_loss": 0.82261133, "learning_rate": 3.74282069289017e-06, "loss": 0.90106088, "num_input_tokens_seen": 67631990, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.23657227, "step": 3132, "time_per_iteration": 2.632276773452759 }, { "auxiliary_loss_clip": 0.06572425, "auxiliary_loss_mlp": 0.01285038, "balance_loss_clip": 0.06304249, "balance_loss_mlp": 0.01260529, "epoch": 0.18836615060874792, "flos": 28879884587520.0, "grad_norm": 2.557168546737583, "language_loss": 0.80555874, "learning_rate": 3.742629607551614e-06, "loss": 0.8841334, "num_input_tokens_seen": 67650490, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.24499512, "step": 3133, "time_per_iteration": 2.6269218921661377 }, { "auxiliary_loss_clip": 0.06567606, "auxiliary_loss_mlp": 0.01278393, "balance_loss_clip": 0.06300623, "balance_loss_mlp": 0.01256351, "epoch": 0.18842627386141592, "flos": 22608294514560.0, "grad_norm": 1.6958608722950383, "language_loss": 0.83186269, "learning_rate": 3.7424384561323698e-06, "loss": 0.91032267, "num_input_tokens_seen": 67668860, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.22045898, "step": 3134, "time_per_iteration": 2.575277090072632 }, { "auxiliary_loss_clip": 0.0656025, "auxiliary_loss_mlp": 0.01282322, "balance_loss_clip": 0.06297721, "balance_loss_mlp": 0.01259505, "epoch": 0.18848639711408388, "flos": 24580834680960.0, "grad_norm": 1.6230630508001345, "language_loss": 0.83605814, "learning_rate": 3.742247238639684e-06, "loss": 0.9144839, "num_input_tokens_seen": 67690220, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.22827148, "step": 3135, "time_per_iteration": 2.613309621810913 }, { "auxiliary_loss_clip": 0.06563347, "auxiliary_loss_mlp": 0.01280197, "balance_loss_clip": 0.06300263, "balance_loss_mlp": 0.01257702, "epoch": 0.18854652036675185, "flos": 34175350920960.0, "grad_norm": 1.8566842005286497, "language_loss": 0.791098, "learning_rate": 3.7420559550808083e-06, "loss": 0.86953342, "num_input_tokens_seen": 67709820, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.22497559, "step": 3136, "time_per_iteration": 4.116787910461426 }, { "auxiliary_loss_clip": 0.06565725, "auxiliary_loss_mlp": 0.0128152, "balance_loss_clip": 0.06299069, "balance_loss_mlp": 0.01258405, "epoch": 0.1886066436194198, "flos": 24205985441280.0, "grad_norm": 6.7007554125978945, "language_loss": 0.82221532, "learning_rate": 3.741864605462996e-06, "loss": 0.90068781, "num_input_tokens_seen": 67729490, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.23120117, "step": 3137, "time_per_iteration": 2.5911996364593506 }, { "auxiliary_loss_clip": 0.06569691, "auxiliary_loss_mlp": 0.01278201, "balance_loss_clip": 0.06302986, "balance_loss_mlp": 0.01256481, "epoch": 0.18866676687208778, "flos": 21257405389440.0, "grad_norm": 1.5369871235085917, "language_loss": 0.81866789, "learning_rate": 3.741673189793504e-06, "loss": 0.89714682, "num_input_tokens_seen": 67749665, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.21728516, "step": 3138, "time_per_iteration": 2.57149600982666 }, { "auxiliary_loss_clip": 0.06570037, "auxiliary_loss_mlp": 0.0127773, "balance_loss_clip": 0.06302545, "balance_loss_mlp": 0.01255294, "epoch": 0.18872689012475574, "flos": 37318294517760.0, "grad_norm": 3.4141206302478215, "language_loss": 0.63665867, "learning_rate": 3.7414817080795896e-06, "loss": 0.71513635, "num_input_tokens_seen": 67776230, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.22460938, "step": 3139, "time_per_iteration": 4.200807809829712 }, { "auxiliary_loss_clip": 0.06575046, "auxiliary_loss_mlp": 0.01286427, "balance_loss_clip": 0.06306463, "balance_loss_mlp": 0.01263134, "epoch": 0.1887870133774237, "flos": 21658641465600.0, "grad_norm": 2.0000477450066945, "language_loss": 0.71548373, "learning_rate": 3.741290160328514e-06, "loss": 0.7940985, "num_input_tokens_seen": 67795080, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.23303223, "step": 3140, "time_per_iteration": 2.5600693225860596 }, { "auxiliary_loss_clip": 0.06580178, "auxiliary_loss_mlp": 0.01278404, "balance_loss_clip": 0.06312801, "balance_loss_mlp": 0.01254824, "epoch": 0.1888471366300917, "flos": 15930143631360.0, "grad_norm": 2.249518703605882, "language_loss": 0.88053322, "learning_rate": 3.7410985465475412e-06, "loss": 0.95911908, "num_input_tokens_seen": 67813110, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.23583984, "step": 3141, "time_per_iteration": 2.5488758087158203 }, { "auxiliary_loss_clip": 0.06575929, "auxiliary_loss_mlp": 0.01280935, "balance_loss_clip": 0.06304078, "balance_loss_mlp": 0.01256688, "epoch": 0.18890725988275966, "flos": 18557933126400.0, "grad_norm": 1.9436364456133823, "language_loss": 0.77409005, "learning_rate": 3.7409068667439378e-06, "loss": 0.85265875, "num_input_tokens_seen": 67831070, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.24243164, "step": 3142, "time_per_iteration": 2.53222393989563 }, { "auxiliary_loss_clip": 0.06568392, "auxiliary_loss_mlp": 0.01280014, "balance_loss_clip": 0.06304529, "balance_loss_mlp": 0.01259164, "epoch": 0.18896738313542763, "flos": 28848550360320.0, "grad_norm": 1.6145988697237987, "language_loss": 0.79999912, "learning_rate": 3.740715120924971e-06, "loss": 0.87848318, "num_input_tokens_seen": 67852170, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.20849609, "step": 3143, "time_per_iteration": 2.6227293014526367 }, { "auxiliary_loss_clip": 0.06567247, "auxiliary_loss_mlp": 0.01286689, "balance_loss_clip": 0.06300146, "balance_loss_mlp": 0.01263217, "epoch": 0.1890275063880956, "flos": 22418249454720.0, "grad_norm": 1.9115329245731831, "language_loss": 0.72258025, "learning_rate": 3.740523309097912e-06, "loss": 0.80111957, "num_input_tokens_seen": 67869945, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.23486328, "step": 3144, "time_per_iteration": 4.07940411567688 }, { "auxiliary_loss_clip": 0.06570509, "auxiliary_loss_mlp": 0.01283511, "balance_loss_clip": 0.0630118, "balance_loss_mlp": 0.01260384, "epoch": 0.18908762964076356, "flos": 24250862102400.0, "grad_norm": 2.5032783382186454, "language_loss": 0.74396068, "learning_rate": 3.7403314312700356e-06, "loss": 0.82250082, "num_input_tokens_seen": 67890240, "router_z_loss_clip": 2.69335938, "router_z_loss_mlp": 0.23132324, "step": 3145, "time_per_iteration": 2.632801055908203 }, { "auxiliary_loss_clip": 0.06561744, "auxiliary_loss_mlp": 0.01282141, "balance_loss_clip": 0.06299555, "balance_loss_mlp": 0.0126035, "epoch": 0.18914775289343153, "flos": 16988599607040.0, "grad_norm": 5.1816114602130865, "language_loss": 0.77532065, "learning_rate": 3.740139487448616e-06, "loss": 0.85375941, "num_input_tokens_seen": 67907825, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.21789551, "step": 3146, "time_per_iteration": 3.9270663261413574 }, { "auxiliary_loss_clip": 0.06568474, "auxiliary_loss_mlp": 0.01288962, "balance_loss_clip": 0.0630566, "balance_loss_mlp": 0.012663, "epoch": 0.1892078761460995, "flos": 21550257809280.0, "grad_norm": 3.2141632432649923, "language_loss": 0.79363048, "learning_rate": 3.7399474776409326e-06, "loss": 0.87220484, "num_input_tokens_seen": 67926670, "router_z_loss_clip": 2.63085938, "router_z_loss_mlp": 0.22668457, "step": 3147, "time_per_iteration": 2.5516302585601807 }, { "auxiliary_loss_clip": 0.06565282, "auxiliary_loss_mlp": 0.01285168, "balance_loss_clip": 0.06300612, "balance_loss_mlp": 0.01263007, "epoch": 0.18926799939876748, "flos": 23007979290240.0, "grad_norm": 3.3731402502544943, "language_loss": 0.68100959, "learning_rate": 3.739755401854267e-06, "loss": 0.75951409, "num_input_tokens_seen": 67943645, "router_z_loss_clip": 2.6484375, "router_z_loss_mlp": 0.22167969, "step": 3148, "time_per_iteration": 2.5608787536621094 }, { "auxiliary_loss_clip": 0.06563473, "auxiliary_loss_mlp": 0.01282753, "balance_loss_clip": 0.06296356, "balance_loss_mlp": 0.01261128, "epoch": 0.18932812265143545, "flos": 22279537820160.0, "grad_norm": 2.367470867602643, "language_loss": 0.77613235, "learning_rate": 3.739563260095902e-06, "loss": 0.85459459, "num_input_tokens_seen": 67962345, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.21630859, "step": 3149, "time_per_iteration": 2.5470128059387207 }, { "auxiliary_loss_clip": 0.06559218, "auxiliary_loss_mlp": 0.01282477, "balance_loss_clip": 0.06299353, "balance_loss_mlp": 0.01260829, "epoch": 0.1893882459041034, "flos": 18630328654080.0, "grad_norm": 2.835359701600925, "language_loss": 0.81505525, "learning_rate": 3.7393710523731245e-06, "loss": 0.89347219, "num_input_tokens_seen": 67979760, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.21643066, "step": 3150, "time_per_iteration": 2.530758857727051 }, { "auxiliary_loss_clip": 0.06566418, "auxiliary_loss_mlp": 0.01280868, "balance_loss_clip": 0.06300581, "balance_loss_mlp": 0.01259327, "epoch": 0.18944836915677138, "flos": 22899553706880.0, "grad_norm": 2.5982642424583475, "language_loss": 0.85780621, "learning_rate": 3.7391787786932215e-06, "loss": 0.93627906, "num_input_tokens_seen": 67996895, "router_z_loss_clip": 2.65820312, "router_z_loss_mlp": 0.21533203, "step": 3151, "time_per_iteration": 2.5516517162323 }, { "auxiliary_loss_clip": 0.06569106, "auxiliary_loss_mlp": 0.01284658, "balance_loss_clip": 0.06303114, "balance_loss_mlp": 0.01262485, "epoch": 0.18950849240943934, "flos": 26803698520320.0, "grad_norm": 1.7399254835080507, "language_loss": 0.74921858, "learning_rate": 3.7389864390634857e-06, "loss": 0.82775629, "num_input_tokens_seen": 68018365, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.22180176, "step": 3152, "time_per_iteration": 2.624476194381714 }, { "auxiliary_loss_clip": 0.06559573, "auxiliary_loss_mlp": 0.01283746, "balance_loss_clip": 0.0629357, "balance_loss_mlp": 0.0126031, "epoch": 0.1895686156621073, "flos": 24977919980160.0, "grad_norm": 2.195281290428237, "language_loss": 0.76102835, "learning_rate": 3.738794033491209e-06, "loss": 0.83946157, "num_input_tokens_seen": 68037985, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.23449707, "step": 3153, "time_per_iteration": 2.591895818710327 }, { "auxiliary_loss_clip": 0.06559088, "auxiliary_loss_mlp": 0.01284782, "balance_loss_clip": 0.06292748, "balance_loss_mlp": 0.01261667, "epoch": 0.1896287389147753, "flos": 21950990760960.0, "grad_norm": 2.336055272719015, "language_loss": 0.79728687, "learning_rate": 3.7386015619836887e-06, "loss": 0.87572551, "num_input_tokens_seen": 68057975, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.23120117, "step": 3154, "time_per_iteration": 2.5740058422088623 }, { "auxiliary_loss_clip": 0.06576705, "auxiliary_loss_mlp": 0.01290113, "balance_loss_clip": 0.06305413, "balance_loss_mlp": 0.01265974, "epoch": 0.18968886216744327, "flos": 18183628938240.0, "grad_norm": 3.360103130041063, "language_loss": 0.73749465, "learning_rate": 3.738409024548223e-06, "loss": 0.81616282, "num_input_tokens_seen": 68074175, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.24157715, "step": 3155, "time_per_iteration": 2.545229434967041 }, { "auxiliary_loss_clip": 0.06561871, "auxiliary_loss_mlp": 0.01290325, "balance_loss_clip": 0.06298369, "balance_loss_mlp": 0.01267294, "epoch": 0.18974898542011123, "flos": 20418735473280.0, "grad_norm": 1.771698057636437, "language_loss": 0.74103349, "learning_rate": 3.7382164211921136e-06, "loss": 0.8195554, "num_input_tokens_seen": 68095230, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.23046875, "step": 3156, "time_per_iteration": 2.598987340927124 }, { "auxiliary_loss_clip": 0.06564815, "auxiliary_loss_mlp": 0.01290735, "balance_loss_clip": 0.06296512, "balance_loss_mlp": 0.01268884, "epoch": 0.1898091086727792, "flos": 23991356407680.0, "grad_norm": 1.9548589183413405, "language_loss": 0.69059646, "learning_rate": 3.7380237519226623e-06, "loss": 0.76915205, "num_input_tokens_seen": 68113805, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.21850586, "step": 3157, "time_per_iteration": 2.5953800678253174 }, { "auxiliary_loss_clip": 0.06563977, "auxiliary_loss_mlp": 0.01290376, "balance_loss_clip": 0.06297615, "balance_loss_mlp": 0.01266558, "epoch": 0.18986923192544716, "flos": 27644590569600.0, "grad_norm": 2.014331595300552, "language_loss": 0.80437362, "learning_rate": 3.737831016747176e-06, "loss": 0.88291705, "num_input_tokens_seen": 68133190, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.23815918, "step": 3158, "time_per_iteration": 2.614509105682373 }, { "auxiliary_loss_clip": 0.06579784, "auxiliary_loss_mlp": 0.0129171, "balance_loss_clip": 0.06305975, "balance_loss_mlp": 0.01267904, "epoch": 0.18992935517811513, "flos": 25491271219200.0, "grad_norm": 1.9963000634054422, "language_loss": 0.72796714, "learning_rate": 3.737638215672964e-06, "loss": 0.80668211, "num_input_tokens_seen": 68152330, "router_z_loss_clip": 2.73632812, "router_z_loss_mlp": 0.23815918, "step": 3159, "time_per_iteration": 2.597355365753174 }, { "auxiliary_loss_clip": 0.06575465, "auxiliary_loss_mlp": 0.01287936, "balance_loss_clip": 0.06305251, "balance_loss_mlp": 0.01264309, "epoch": 0.1899894784307831, "flos": 17426578498560.0, "grad_norm": 5.50636530134779, "language_loss": 0.85842425, "learning_rate": 3.7374453487073366e-06, "loss": 0.93705827, "num_input_tokens_seen": 68170185, "router_z_loss_clip": 2.70507812, "router_z_loss_mlp": 0.23632812, "step": 3160, "time_per_iteration": 2.5393388271331787 }, { "auxiliary_loss_clip": 0.06559652, "auxiliary_loss_mlp": 0.01290031, "balance_loss_clip": 0.0629966, "balance_loss_mlp": 0.01268621, "epoch": 0.19004960168345109, "flos": 27499925295360.0, "grad_norm": 2.014597282051303, "language_loss": 0.73995191, "learning_rate": 3.7372524158576074e-06, "loss": 0.81844878, "num_input_tokens_seen": 68191665, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.21386719, "step": 3161, "time_per_iteration": 2.60854434967041 }, { "auxiliary_loss_clip": 0.06560433, "auxiliary_loss_mlp": 0.01286513, "balance_loss_clip": 0.06297013, "balance_loss_mlp": 0.01263243, "epoch": 0.19010972493611905, "flos": 38663858908800.0, "grad_norm": 1.7063304670835666, "language_loss": 0.81614822, "learning_rate": 3.7370594171310926e-06, "loss": 0.89461762, "num_input_tokens_seen": 68214635, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.23278809, "step": 3162, "time_per_iteration": 2.7618346214294434 }, { "auxiliary_loss_clip": 0.06564572, "auxiliary_loss_mlp": 0.01287151, "balance_loss_clip": 0.06297353, "balance_loss_mlp": 0.01262975, "epoch": 0.19016984818878702, "flos": 19250763811200.0, "grad_norm": 2.1447627440846255, "language_loss": 0.76669592, "learning_rate": 3.73686635253511e-06, "loss": 0.84521317, "num_input_tokens_seen": 68232150, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.24169922, "step": 3163, "time_per_iteration": 2.5524415969848633 }, { "auxiliary_loss_clip": 0.06564389, "auxiliary_loss_mlp": 0.01287109, "balance_loss_clip": 0.06302056, "balance_loss_mlp": 0.012649, "epoch": 0.19022997144145498, "flos": 37605947984640.0, "grad_norm": 1.795520822152396, "language_loss": 0.75134832, "learning_rate": 3.736673222076982e-06, "loss": 0.82986331, "num_input_tokens_seen": 68253370, "router_z_loss_clip": 2.62304688, "router_z_loss_mlp": 0.22216797, "step": 3164, "time_per_iteration": 2.808396339416504 }, { "auxiliary_loss_clip": 0.06563777, "auxiliary_loss_mlp": 0.01281695, "balance_loss_clip": 0.06301472, "balance_loss_mlp": 0.01258414, "epoch": 0.19029009469412295, "flos": 61543874615040.0, "grad_norm": 3.420940923040235, "language_loss": 0.67529964, "learning_rate": 3.7364800257640313e-06, "loss": 0.75375438, "num_input_tokens_seen": 68278895, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.23278809, "step": 3165, "time_per_iteration": 2.937565803527832 }, { "auxiliary_loss_clip": 0.0655838, "auxiliary_loss_mlp": 0.01287298, "balance_loss_clip": 0.06295228, "balance_loss_mlp": 0.0126398, "epoch": 0.1903502179467909, "flos": 13960077160320.0, "grad_norm": 3.2843872774346825, "language_loss": 0.75129896, "learning_rate": 3.7362867636035835e-06, "loss": 0.82975572, "num_input_tokens_seen": 68294880, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.23303223, "step": 3166, "time_per_iteration": 2.5051655769348145 }, { "auxiliary_loss_clip": 0.06446188, "auxiliary_loss_mlp": 0.01280719, "balance_loss_clip": 0.06299216, "balance_loss_mlp": 0.01275056, "epoch": 0.1904103411994589, "flos": 66920484499200.0, "grad_norm": 0.7555639916156172, "language_loss": 0.50387698, "learning_rate": 3.736093435602968e-06, "loss": 0.581146, "num_input_tokens_seen": 68359665, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.05657959, "step": 3167, "time_per_iteration": 3.212897539138794 }, { "auxiliary_loss_clip": 0.06557639, "auxiliary_loss_mlp": 0.01288524, "balance_loss_clip": 0.06298111, "balance_loss_mlp": 0.0126529, "epoch": 0.19047046445212687, "flos": 21915296121600.0, "grad_norm": 1.698464443615914, "language_loss": 0.7485106, "learning_rate": 3.7359000417695156e-06, "loss": 0.82697213, "num_input_tokens_seen": 68378950, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.2322998, "step": 3168, "time_per_iteration": 2.5648458003997803 }, { "auxiliary_loss_clip": 0.0642972, "auxiliary_loss_mlp": 0.01278571, "balance_loss_clip": 0.06282866, "balance_loss_mlp": 0.01272566, "epoch": 0.19053058770479483, "flos": 59271549338880.0, "grad_norm": 0.8410329311115898, "language_loss": 0.60146296, "learning_rate": 3.73570658211056e-06, "loss": 0.67854583, "num_input_tokens_seen": 68434235, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.06002808, "step": 3169, "time_per_iteration": 3.0954527854919434 }, { "auxiliary_loss_clip": 0.06571373, "auxiliary_loss_mlp": 0.01286083, "balance_loss_clip": 0.06301917, "balance_loss_mlp": 0.01262873, "epoch": 0.1905907109574628, "flos": 23958093536640.0, "grad_norm": 1.9995379949401926, "language_loss": 0.78773737, "learning_rate": 3.735513056633436e-06, "loss": 0.86631191, "num_input_tokens_seen": 68453830, "router_z_loss_clip": 2.69335938, "router_z_loss_mlp": 0.23217773, "step": 3170, "time_per_iteration": 2.5916740894317627 }, { "auxiliary_loss_clip": 0.06563796, "auxiliary_loss_mlp": 0.01283368, "balance_loss_clip": 0.06301726, "balance_loss_mlp": 0.01260576, "epoch": 0.19065083421013077, "flos": 20818378321920.0, "grad_norm": 1.72560348826007, "language_loss": 0.79053074, "learning_rate": 3.7353194653454834e-06, "loss": 0.8690024, "num_input_tokens_seen": 68473005, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.22802734, "step": 3171, "time_per_iteration": 2.5773825645446777 }, { "auxiliary_loss_clip": 0.06579502, "auxiliary_loss_mlp": 0.01283958, "balance_loss_clip": 0.06306317, "balance_loss_mlp": 0.01259258, "epoch": 0.19071095746279873, "flos": 31293003121920.0, "grad_norm": 2.6170502201877404, "language_loss": 0.79806745, "learning_rate": 3.7351258082540426e-06, "loss": 0.87670201, "num_input_tokens_seen": 68493470, "router_z_loss_clip": 2.73828125, "router_z_loss_mlp": 0.24719238, "step": 3172, "time_per_iteration": 2.6508913040161133 }, { "auxiliary_loss_clip": 0.06559053, "auxiliary_loss_mlp": 0.012832, "balance_loss_clip": 0.06297396, "balance_loss_mlp": 0.01260395, "epoch": 0.1907710807154667, "flos": 14361397090560.0, "grad_norm": 1.5857411665190633, "language_loss": 0.81405979, "learning_rate": 3.7349320853664576e-06, "loss": 0.89248234, "num_input_tokens_seen": 68511290, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.22802734, "step": 3173, "time_per_iteration": 2.5940113067626953 }, { "auxiliary_loss_clip": 0.06571985, "auxiliary_loss_mlp": 0.01288304, "balance_loss_clip": 0.06302775, "balance_loss_mlp": 0.01263103, "epoch": 0.1908312039681347, "flos": 26914388163840.0, "grad_norm": 1.9448781118678329, "language_loss": 0.79351616, "learning_rate": 3.7347382966900735e-06, "loss": 0.87211895, "num_input_tokens_seen": 68532575, "router_z_loss_clip": 2.69335938, "router_z_loss_mlp": 0.25195312, "step": 3174, "time_per_iteration": 2.636056661605835 }, { "auxiliary_loss_clip": 0.06569053, "auxiliary_loss_mlp": 0.01284858, "balance_loss_clip": 0.06300195, "balance_loss_mlp": 0.01260241, "epoch": 0.19089132722080265, "flos": 14498767059840.0, "grad_norm": 1.6945987877068591, "language_loss": 0.81592268, "learning_rate": 3.7345444422322395e-06, "loss": 0.89446181, "num_input_tokens_seen": 68548760, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.24621582, "step": 3175, "time_per_iteration": 3.9698843955993652 }, { "auxiliary_loss_clip": 0.06569412, "auxiliary_loss_mlp": 0.0128394, "balance_loss_clip": 0.06304078, "balance_loss_mlp": 0.01259049, "epoch": 0.19095145047347062, "flos": 13957771173120.0, "grad_norm": 9.051018195443977, "language_loss": 0.87021673, "learning_rate": 3.7343505220003067e-06, "loss": 0.94875026, "num_input_tokens_seen": 68563100, "router_z_loss_clip": 2.65234375, "router_z_loss_mlp": 0.2487793, "step": 3176, "time_per_iteration": 2.526215076446533 }, { "auxiliary_loss_clip": 0.06573889, "auxiliary_loss_mlp": 0.01285828, "balance_loss_clip": 0.06300858, "balance_loss_mlp": 0.01257087, "epoch": 0.19101157372613858, "flos": 25308940734720.0, "grad_norm": 1.9899140708719187, "language_loss": 0.82765734, "learning_rate": 3.7341565360016285e-06, "loss": 0.90625459, "num_input_tokens_seen": 68581650, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.28759766, "step": 3177, "time_per_iteration": 2.612494468688965 }, { "auxiliary_loss_clip": 0.06565228, "auxiliary_loss_mlp": 0.01283526, "balance_loss_clip": 0.06301549, "balance_loss_mlp": 0.01259279, "epoch": 0.19107169697880655, "flos": 20564448923520.0, "grad_norm": 2.453185938258763, "language_loss": 0.75949985, "learning_rate": 3.73396248424356e-06, "loss": 0.83798736, "num_input_tokens_seen": 68600360, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.24267578, "step": 3178, "time_per_iteration": 3.9797072410583496 }, { "auxiliary_loss_clip": 0.06569405, "auxiliary_loss_mlp": 0.012778, "balance_loss_clip": 0.06301382, "balance_loss_mlp": 0.01254578, "epoch": 0.19113182023147451, "flos": 22169644790400.0, "grad_norm": 2.3106624874256547, "language_loss": 0.82163501, "learning_rate": 3.7337683667334606e-06, "loss": 0.90010703, "num_input_tokens_seen": 68617885, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.23217773, "step": 3179, "time_per_iteration": 2.5642664432525635 }, { "auxiliary_loss_clip": 0.06563543, "auxiliary_loss_mlp": 0.0128683, "balance_loss_clip": 0.06299855, "balance_loss_mlp": 0.01263191, "epoch": 0.19119194348414248, "flos": 18586667877120.0, "grad_norm": 2.57694463344588, "language_loss": 0.80713159, "learning_rate": 3.733574183478691e-06, "loss": 0.88563538, "num_input_tokens_seen": 68634550, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.23657227, "step": 3180, "time_per_iteration": 2.5548110008239746 }, { "auxiliary_loss_clip": 0.06565818, "auxiliary_loss_mlp": 0.01290396, "balance_loss_clip": 0.06301222, "balance_loss_mlp": 0.01265994, "epoch": 0.19125206673681047, "flos": 19032738687360.0, "grad_norm": 3.3961947538900783, "language_loss": 0.80107945, "learning_rate": 3.733379934486615e-06, "loss": 0.87964153, "num_input_tokens_seen": 68651895, "router_z_loss_clip": 2.6484375, "router_z_loss_mlp": 0.24414062, "step": 3181, "time_per_iteration": 2.5618672370910645 }, { "auxiliary_loss_clip": 0.06565354, "auxiliary_loss_mlp": 0.01283382, "balance_loss_clip": 0.06297246, "balance_loss_mlp": 0.01258169, "epoch": 0.19131218998947844, "flos": 21696725946240.0, "grad_norm": 1.8678171481169017, "language_loss": 0.74450707, "learning_rate": 3.7331856197645973e-06, "loss": 0.82299441, "num_input_tokens_seen": 68671500, "router_z_loss_clip": 2.68164062, "router_z_loss_mlp": 0.25183105, "step": 3182, "time_per_iteration": 2.5864486694335938 }, { "auxiliary_loss_clip": 0.06569409, "auxiliary_loss_mlp": 0.01291052, "balance_loss_clip": 0.06303977, "balance_loss_mlp": 0.01267127, "epoch": 0.1913723132421464, "flos": 18448459367040.0, "grad_norm": 1.8613816600925224, "language_loss": 0.65973079, "learning_rate": 3.7329912393200084e-06, "loss": 0.73833537, "num_input_tokens_seen": 68690570, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.23925781, "step": 3183, "time_per_iteration": 2.681401014328003 }, { "auxiliary_loss_clip": 0.06574817, "auxiliary_loss_mlp": 0.01291575, "balance_loss_clip": 0.06303791, "balance_loss_mlp": 0.01263263, "epoch": 0.19143243649481437, "flos": 27167101678080.0, "grad_norm": 1.5841511989268366, "language_loss": 0.73428369, "learning_rate": 3.7327967931602173e-06, "loss": 0.81294751, "num_input_tokens_seen": 68709735, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.28344727, "step": 3184, "time_per_iteration": 4.0972349643707275 }, { "auxiliary_loss_clip": 0.06562617, "auxiliary_loss_mlp": 0.01285202, "balance_loss_clip": 0.06294708, "balance_loss_mlp": 0.01259179, "epoch": 0.19149255974748233, "flos": 21724244812800.0, "grad_norm": 1.969532904322405, "language_loss": 0.88727802, "learning_rate": 3.732602281292598e-06, "loss": 0.96575624, "num_input_tokens_seen": 68727565, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.26049805, "step": 3185, "time_per_iteration": 2.562516689300537 }, { "auxiliary_loss_clip": 0.06564388, "auxiliary_loss_mlp": 0.0128268, "balance_loss_clip": 0.06297157, "balance_loss_mlp": 0.0125643, "epoch": 0.1915526830001503, "flos": 22969433612160.0, "grad_norm": 2.31320146072041, "language_loss": 0.73233867, "learning_rate": 3.7324077037245267e-06, "loss": 0.81080937, "num_input_tokens_seen": 68748110, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.26208496, "step": 3186, "time_per_iteration": 4.000355958938599 }, { "auxiliary_loss_clip": 0.06567053, "auxiliary_loss_mlp": 0.01286324, "balance_loss_clip": 0.06297332, "balance_loss_mlp": 0.01259406, "epoch": 0.1916128062528183, "flos": 26147946067200.0, "grad_norm": 1.8827382453773054, "language_loss": 0.84356236, "learning_rate": 3.7322130604633825e-06, "loss": 0.92209613, "num_input_tokens_seen": 68769765, "router_z_loss_clip": 2.69726562, "router_z_loss_mlp": 0.26965332, "step": 3187, "time_per_iteration": 2.5988521575927734 }, { "auxiliary_loss_clip": 0.06425679, "auxiliary_loss_mlp": 0.01289595, "balance_loss_clip": 0.06278239, "balance_loss_mlp": 0.01283074, "epoch": 0.19167292950548626, "flos": 54943513119360.0, "grad_norm": 0.844062105575567, "language_loss": 0.55909473, "learning_rate": 3.732018351516544e-06, "loss": 0.63624752, "num_input_tokens_seen": 68826815, "router_z_loss_clip": 1.46972656, "router_z_loss_mlp": 0.06530762, "step": 3188, "time_per_iteration": 3.2010395526885986 }, { "auxiliary_loss_clip": 0.06571436, "auxiliary_loss_mlp": 0.0129337, "balance_loss_clip": 0.06303748, "balance_loss_mlp": 0.01267787, "epoch": 0.19173305275815422, "flos": 29943497589120.0, "grad_norm": 1.7774347513487803, "language_loss": 0.70720649, "learning_rate": 3.731823576891397e-06, "loss": 0.78585458, "num_input_tokens_seen": 68847585, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.25610352, "step": 3189, "time_per_iteration": 2.639324426651001 }, { "auxiliary_loss_clip": 0.06552472, "auxiliary_loss_mlp": 0.01277537, "balance_loss_clip": 0.06293275, "balance_loss_mlp": 0.01254458, "epoch": 0.1917931760108222, "flos": 24759140169600.0, "grad_norm": 3.3569859108796125, "language_loss": 0.74797952, "learning_rate": 3.7316287365953266e-06, "loss": 0.82627964, "num_input_tokens_seen": 68866620, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.23083496, "step": 3190, "time_per_iteration": 2.619107961654663 }, { "auxiliary_loss_clip": 0.06559356, "auxiliary_loss_mlp": 0.01285226, "balance_loss_clip": 0.06296682, "balance_loss_mlp": 0.01260323, "epoch": 0.19185329926349015, "flos": 18849527735040.0, "grad_norm": 1.9903628428631517, "language_loss": 0.84833604, "learning_rate": 3.73143383063572e-06, "loss": 0.92678189, "num_input_tokens_seen": 68885515, "router_z_loss_clip": 2.62304688, "router_z_loss_mlp": 0.24914551, "step": 3191, "time_per_iteration": 2.5204226970672607 }, { "auxiliary_loss_clip": 0.06550826, "auxiliary_loss_mlp": 0.0128461, "balance_loss_clip": 0.06293717, "balance_loss_mlp": 0.01260529, "epoch": 0.19191342251615812, "flos": 22092721142400.0, "grad_norm": 1.7274412022461219, "language_loss": 0.90124381, "learning_rate": 3.73123885901997e-06, "loss": 0.97959816, "num_input_tokens_seen": 68903225, "router_z_loss_clip": 2.57226562, "router_z_loss_mlp": 0.24072266, "step": 3192, "time_per_iteration": 2.555752992630005 }, { "auxiliary_loss_clip": 0.06572013, "auxiliary_loss_mlp": 0.01282205, "balance_loss_clip": 0.06301364, "balance_loss_mlp": 0.01256516, "epoch": 0.19197354576882608, "flos": 22205465210880.0, "grad_norm": 1.80232061365757, "language_loss": 0.75833052, "learning_rate": 3.7310438217554687e-06, "loss": 0.8368727, "num_input_tokens_seen": 68922860, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.25708008, "step": 3193, "time_per_iteration": 2.5766279697418213 }, { "auxiliary_loss_clip": 0.06568294, "auxiliary_loss_mlp": 0.01283815, "balance_loss_clip": 0.06297608, "balance_loss_mlp": 0.01259198, "epoch": 0.19203366902149407, "flos": 24902505705600.0, "grad_norm": 1.9478503275109964, "language_loss": 0.75451797, "learning_rate": 3.730848718849612e-06, "loss": 0.83303905, "num_input_tokens_seen": 68943000, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.24609375, "step": 3194, "time_per_iteration": 2.604844093322754 }, { "auxiliary_loss_clip": 0.06414267, "auxiliary_loss_mlp": 0.01258887, "balance_loss_clip": 0.06267606, "balance_loss_mlp": 0.0125258, "epoch": 0.19209379227416204, "flos": 68435256211200.0, "grad_norm": 0.7333299732203802, "language_loss": 0.68230605, "learning_rate": 3.7306535503097985e-06, "loss": 0.75903755, "num_input_tokens_seen": 69000255, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.06307983, "step": 3195, "time_per_iteration": 3.1192173957824707 }, { "auxiliary_loss_clip": 0.06553577, "auxiliary_loss_mlp": 0.01282795, "balance_loss_clip": 0.06290012, "balance_loss_mlp": 0.01259037, "epoch": 0.19215391552683, "flos": 22061848112640.0, "grad_norm": 3.5966024590550307, "language_loss": 0.73115736, "learning_rate": 3.730458316143429e-06, "loss": 0.80952114, "num_input_tokens_seen": 69019665, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.23754883, "step": 3196, "time_per_iteration": 2.576105833053589 }, { "auxiliary_loss_clip": 0.06562181, "auxiliary_loss_mlp": 0.01288942, "balance_loss_clip": 0.06296592, "balance_loss_mlp": 0.01264289, "epoch": 0.19221403877949797, "flos": 20309177859840.0, "grad_norm": 3.887281308315821, "language_loss": 0.84270287, "learning_rate": 3.7302630163579068e-06, "loss": 0.9212141, "num_input_tokens_seen": 69039055, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.2467041, "step": 3197, "time_per_iteration": 2.5723042488098145 }, { "auxiliary_loss_clip": 0.06560457, "auxiliary_loss_mlp": 0.01284816, "balance_loss_clip": 0.06292455, "balance_loss_mlp": 0.01258805, "epoch": 0.19227416203216594, "flos": 23192028783360.0, "grad_norm": 2.1049759719214283, "language_loss": 0.81363505, "learning_rate": 3.7300676509606373e-06, "loss": 0.89208776, "num_input_tokens_seen": 69056370, "router_z_loss_clip": 2.68164062, "router_z_loss_mlp": 0.26037598, "step": 3198, "time_per_iteration": 2.591064929962158 }, { "auxiliary_loss_clip": 0.0656453, "auxiliary_loss_mlp": 0.01288506, "balance_loss_clip": 0.06293168, "balance_loss_mlp": 0.01263317, "epoch": 0.1923342852848339, "flos": 25783872076800.0, "grad_norm": 1.9865175017813772, "language_loss": 0.79774582, "learning_rate": 3.729872219959029e-06, "loss": 0.87627614, "num_input_tokens_seen": 69075915, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.25183105, "step": 3199, "time_per_iteration": 2.6155216693878174 }, { "auxiliary_loss_clip": 0.06557593, "auxiliary_loss_mlp": 0.01284716, "balance_loss_clip": 0.06292385, "balance_loss_mlp": 0.01260325, "epoch": 0.19239440853750187, "flos": 17133977640960.0, "grad_norm": 2.266376779358397, "language_loss": 0.84777647, "learning_rate": 3.7296767233604934e-06, "loss": 0.92619956, "num_input_tokens_seen": 69094145, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.24365234, "step": 3200, "time_per_iteration": 2.5348246097564697 }, { "auxiliary_loss_clip": 0.06553052, "auxiliary_loss_mlp": 0.01281411, "balance_loss_clip": 0.06289976, "balance_loss_mlp": 0.01257116, "epoch": 0.19245453179016986, "flos": 16440601904640.0, "grad_norm": 1.9179136579906386, "language_loss": 0.80122739, "learning_rate": 3.729481161172443e-06, "loss": 0.87957203, "num_input_tokens_seen": 69111110, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.24316406, "step": 3201, "time_per_iteration": 2.5217678546905518 }, { "auxiliary_loss_clip": 0.06557955, "auxiliary_loss_mlp": 0.01278815, "balance_loss_clip": 0.0628983, "balance_loss_mlp": 0.01254961, "epoch": 0.19251465504283782, "flos": 20236530769920.0, "grad_norm": 2.356798211423804, "language_loss": 0.70836967, "learning_rate": 3.7292855334022927e-06, "loss": 0.78673738, "num_input_tokens_seen": 69130280, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.23852539, "step": 3202, "time_per_iteration": 2.55057954788208 }, { "auxiliary_loss_clip": 0.06554681, "auxiliary_loss_mlp": 0.01285457, "balance_loss_clip": 0.06297234, "balance_loss_mlp": 0.01261878, "epoch": 0.1925747782955058, "flos": 19470549870720.0, "grad_norm": 1.8778695205786617, "language_loss": 0.9210794, "learning_rate": 3.7290898400574627e-06, "loss": 0.99948072, "num_input_tokens_seen": 69149570, "router_z_loss_clip": 2.57617188, "router_z_loss_mlp": 0.23583984, "step": 3203, "time_per_iteration": 2.520298719406128 }, { "auxiliary_loss_clip": 0.06567451, "auxiliary_loss_mlp": 0.01284336, "balance_loss_clip": 0.06301238, "balance_loss_mlp": 0.01258158, "epoch": 0.19263490154817375, "flos": 17791407175680.0, "grad_norm": 2.180180248497546, "language_loss": 0.82363081, "learning_rate": 3.7288940811453725e-06, "loss": 0.90214866, "num_input_tokens_seen": 69168190, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.26184082, "step": 3204, "time_per_iteration": 2.5203773975372314 }, { "auxiliary_loss_clip": 0.06554898, "auxiliary_loss_mlp": 0.01283431, "balance_loss_clip": 0.062962, "balance_loss_mlp": 0.01258778, "epoch": 0.19269502480084172, "flos": 17462818189440.0, "grad_norm": 2.222766725925588, "language_loss": 0.76673341, "learning_rate": 3.7286982566734454e-06, "loss": 0.84511662, "num_input_tokens_seen": 69186950, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.24645996, "step": 3205, "time_per_iteration": 2.528719186782837 }, { "auxiliary_loss_clip": 0.06573448, "auxiliary_loss_mlp": 0.01282805, "balance_loss_clip": 0.06303548, "balance_loss_mlp": 0.01258713, "epoch": 0.19275514805350968, "flos": 21513305358720.0, "grad_norm": 3.4382833194799027, "language_loss": 0.835428, "learning_rate": 3.728502366649107e-06, "loss": 0.9139905, "num_input_tokens_seen": 69204850, "router_z_loss_clip": 2.6953125, "router_z_loss_mlp": 0.24084473, "step": 3206, "time_per_iteration": 2.5537803173065186 }, { "auxiliary_loss_clip": 0.06434441, "auxiliary_loss_mlp": 0.01255943, "balance_loss_clip": 0.06287721, "balance_loss_mlp": 0.0125048, "epoch": 0.19281527130617768, "flos": 47711578602240.0, "grad_norm": 0.8120321759891623, "language_loss": 0.60550392, "learning_rate": 3.728306411079786e-06, "loss": 0.68240774, "num_input_tokens_seen": 69259200, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.0546875, "step": 3207, "time_per_iteration": 3.005746603012085 }, { "auxiliary_loss_clip": 0.06573709, "auxiliary_loss_mlp": 0.01286043, "balance_loss_clip": 0.06303503, "balance_loss_mlp": 0.01259746, "epoch": 0.19287539455884564, "flos": 11805961196160.0, "grad_norm": 4.5342404325316545, "language_loss": 0.76138294, "learning_rate": 3.7281103899729125e-06, "loss": 0.83998048, "num_input_tokens_seen": 69275835, "router_z_loss_clip": 2.70507812, "router_z_loss_mlp": 0.26306152, "step": 3208, "time_per_iteration": 2.512152910232544 }, { "auxiliary_loss_clip": 0.06566785, "auxiliary_loss_mlp": 0.01286042, "balance_loss_clip": 0.06297609, "balance_loss_mlp": 0.01260435, "epoch": 0.1929355178115136, "flos": 20637724919040.0, "grad_norm": 2.1444010686975443, "language_loss": 0.61459303, "learning_rate": 3.7279143033359195e-06, "loss": 0.69312131, "num_input_tokens_seen": 69294810, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.25610352, "step": 3209, "time_per_iteration": 2.5462749004364014 }, { "auxiliary_loss_clip": 0.06573249, "auxiliary_loss_mlp": 0.0129114, "balance_loss_clip": 0.06304809, "balance_loss_mlp": 0.01265319, "epoch": 0.19299564106418157, "flos": 40817555602560.0, "grad_norm": 2.777528588026736, "language_loss": 0.81285805, "learning_rate": 3.727718151176243e-06, "loss": 0.89150196, "num_input_tokens_seen": 69316065, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.25830078, "step": 3210, "time_per_iteration": 2.729135751724243 }, { "auxiliary_loss_clip": 0.06561173, "auxiliary_loss_mlp": 0.0128349, "balance_loss_clip": 0.06303526, "balance_loss_mlp": 0.01259994, "epoch": 0.19305576431684954, "flos": 11365718244480.0, "grad_norm": 2.185783898040645, "language_loss": 0.83858752, "learning_rate": 3.7275219335013217e-06, "loss": 0.91703409, "num_input_tokens_seen": 69332900, "router_z_loss_clip": 2.57421875, "router_z_loss_mlp": 0.23498535, "step": 3211, "time_per_iteration": 2.6102914810180664 }, { "auxiliary_loss_clip": 0.06424443, "auxiliary_loss_mlp": 0.01262523, "balance_loss_clip": 0.0627909, "balance_loss_mlp": 0.01256366, "epoch": 0.1931158875695175, "flos": 54527476798080.0, "grad_norm": 0.9488699338496913, "language_loss": 0.63516569, "learning_rate": 3.7273256503185953e-06, "loss": 0.7120353, "num_input_tokens_seen": 69382535, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.06155396, "step": 3212, "time_per_iteration": 3.1288435459136963 }, { "auxiliary_loss_clip": 0.06560645, "auxiliary_loss_mlp": 0.01287066, "balance_loss_clip": 0.06300551, "balance_loss_mlp": 0.01261734, "epoch": 0.19317601082218547, "flos": 19834540007040.0, "grad_norm": 1.6995844436601253, "language_loss": 0.77290815, "learning_rate": 3.7271293016355074e-06, "loss": 0.85138524, "num_input_tokens_seen": 69400600, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.25341797, "step": 3213, "time_per_iteration": 2.566713571548462 }, { "auxiliary_loss_clip": 0.0656857, "auxiliary_loss_mlp": 0.0128551, "balance_loss_clip": 0.06299466, "balance_loss_mlp": 0.01261156, "epoch": 0.19323613407485346, "flos": 13157143810560.0, "grad_norm": 3.4007554166419114, "language_loss": 0.71297789, "learning_rate": 3.726932887459503e-06, "loss": 0.79151875, "num_input_tokens_seen": 69417350, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.2434082, "step": 3214, "time_per_iteration": 2.5191268920898438 }, { "auxiliary_loss_clip": 0.06564617, "auxiliary_loss_mlp": 0.01286406, "balance_loss_clip": 0.06299525, "balance_loss_mlp": 0.01260442, "epoch": 0.19329625732752143, "flos": 14032388833920.0, "grad_norm": 2.158400135104075, "language_loss": 0.76779926, "learning_rate": 3.72673640779803e-06, "loss": 0.84630948, "num_input_tokens_seen": 69431845, "router_z_loss_clip": 2.65234375, "router_z_loss_mlp": 0.25952148, "step": 3215, "time_per_iteration": 3.945453405380249 }, { "auxiliary_loss_clip": 0.06559886, "auxiliary_loss_mlp": 0.01278847, "balance_loss_clip": 0.06299014, "balance_loss_mlp": 0.0125553, "epoch": 0.1933563805801894, "flos": 23448641512320.0, "grad_norm": 1.6451658617860392, "language_loss": 0.88744223, "learning_rate": 3.72653986265854e-06, "loss": 0.96582949, "num_input_tokens_seen": 69453275, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.23339844, "step": 3216, "time_per_iteration": 2.593763589859009 }, { "auxiliary_loss_clip": 0.06551571, "auxiliary_loss_mlp": 0.01279443, "balance_loss_clip": 0.06293644, "balance_loss_mlp": 0.01255446, "epoch": 0.19341650383285736, "flos": 20491550271360.0, "grad_norm": 2.0448872586693376, "language_loss": 0.80182165, "learning_rate": 3.726343252048485e-06, "loss": 0.88013184, "num_input_tokens_seen": 69471830, "router_z_loss_clip": 2.58203125, "router_z_loss_mlp": 0.23986816, "step": 3217, "time_per_iteration": 2.5734663009643555 }, { "auxiliary_loss_clip": 0.06573892, "auxiliary_loss_mlp": 0.01286339, "balance_loss_clip": 0.06302994, "balance_loss_mlp": 0.01258706, "epoch": 0.19347662708552532, "flos": 17864305827840.0, "grad_norm": 2.777386116383901, "language_loss": 0.6318301, "learning_rate": 3.7261465759753206e-06, "loss": 0.71043247, "num_input_tokens_seen": 69489320, "router_z_loss_clip": 2.70898438, "router_z_loss_mlp": 0.27636719, "step": 3218, "time_per_iteration": 3.960317611694336 }, { "auxiliary_loss_clip": 0.06569123, "auxiliary_loss_mlp": 0.01285638, "balance_loss_clip": 0.06300177, "balance_loss_mlp": 0.01259484, "epoch": 0.1935367503381933, "flos": 18193188303360.0, "grad_norm": 1.7491464920000177, "language_loss": 0.80912852, "learning_rate": 3.7259498344465053e-06, "loss": 0.88767612, "num_input_tokens_seen": 69506665, "router_z_loss_clip": 2.69335938, "router_z_loss_mlp": 0.26171875, "step": 3219, "time_per_iteration": 2.530503273010254 }, { "auxiliary_loss_clip": 0.0655995, "auxiliary_loss_mlp": 0.01283403, "balance_loss_clip": 0.06300078, "balance_loss_mlp": 0.01259668, "epoch": 0.19359687359086128, "flos": 15961939056000.0, "grad_norm": 2.3509782699367308, "language_loss": 0.87098092, "learning_rate": 3.7257530274694993e-06, "loss": 0.94941437, "num_input_tokens_seen": 69523835, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.23730469, "step": 3220, "time_per_iteration": 2.5067532062530518 }, { "auxiliary_loss_clip": 0.06552724, "auxiliary_loss_mlp": 0.01283865, "balance_loss_clip": 0.0630091, "balance_loss_mlp": 0.01260536, "epoch": 0.19365699684352924, "flos": 21221584968960.0, "grad_norm": 1.9630331788984254, "language_loss": 0.84650195, "learning_rate": 3.725556155051766e-06, "loss": 0.92486781, "num_input_tokens_seen": 69542620, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.23339844, "step": 3221, "time_per_iteration": 2.562643527984619 }, { "auxiliary_loss_clip": 0.06549601, "auxiliary_loss_mlp": 0.0128454, "balance_loss_clip": 0.06293675, "balance_loss_mlp": 0.01260638, "epoch": 0.1937171200961972, "flos": 17316811249920.0, "grad_norm": 2.408871097986346, "language_loss": 0.86488467, "learning_rate": 3.7253592172007702e-06, "loss": 0.9432261, "num_input_tokens_seen": 69561130, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.23876953, "step": 3222, "time_per_iteration": 2.5236871242523193 }, { "auxiliary_loss_clip": 0.06554019, "auxiliary_loss_mlp": 0.01281629, "balance_loss_clip": 0.06288524, "balance_loss_mlp": 0.0125756, "epoch": 0.19377724334886517, "flos": 22642228218240.0, "grad_norm": 5.810958716259905, "language_loss": 0.78610241, "learning_rate": 3.72516221392398e-06, "loss": 0.86445892, "num_input_tokens_seen": 69580425, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.24047852, "step": 3223, "time_per_iteration": 3.858581304550171 }, { "auxiliary_loss_clip": 0.06556758, "auxiliary_loss_mlp": 0.01283121, "balance_loss_clip": 0.06294604, "balance_loss_mlp": 0.01259386, "epoch": 0.19383736660153314, "flos": 15081872423040.0, "grad_norm": 2.464923207374437, "language_loss": 0.75700903, "learning_rate": 3.7249651452288653e-06, "loss": 0.83540785, "num_input_tokens_seen": 69597085, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.23742676, "step": 3224, "time_per_iteration": 2.52414870262146 }, { "auxiliary_loss_clip": 0.06558035, "auxiliary_loss_mlp": 0.01283357, "balance_loss_clip": 0.06295717, "balance_loss_mlp": 0.01257655, "epoch": 0.1938974898542011, "flos": 47130626246400.0, "grad_norm": 2.5739862934165805, "language_loss": 0.71615517, "learning_rate": 3.7247680111229e-06, "loss": 0.79456908, "num_input_tokens_seen": 69618885, "router_z_loss_clip": 2.625, "router_z_loss_mlp": 0.25720215, "step": 3225, "time_per_iteration": 2.7660937309265137 }, { "auxiliary_loss_clip": 0.06562433, "auxiliary_loss_mlp": 0.01284439, "balance_loss_clip": 0.06296895, "balance_loss_mlp": 0.01259703, "epoch": 0.19395761310686907, "flos": 25819734424320.0, "grad_norm": 5.092810797953727, "language_loss": 0.69684464, "learning_rate": 3.7245708116135585e-06, "loss": 0.77531338, "num_input_tokens_seen": 69638200, "router_z_loss_clip": 2.65820312, "router_z_loss_mlp": 0.24755859, "step": 3226, "time_per_iteration": 4.0160582065582275 }, { "auxiliary_loss_clip": 0.06555431, "auxiliary_loss_mlp": 0.01288696, "balance_loss_clip": 0.06297284, "balance_loss_mlp": 0.01263066, "epoch": 0.19401773635953706, "flos": 23046315333120.0, "grad_norm": 2.4389779149253528, "language_loss": 0.770679, "learning_rate": 3.7243735467083193e-06, "loss": 0.84912026, "num_input_tokens_seen": 69657550, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.25610352, "step": 3227, "time_per_iteration": 2.5802974700927734 }, { "auxiliary_loss_clip": 0.06560017, "auxiliary_loss_mlp": 0.01291876, "balance_loss_clip": 0.06293418, "balance_loss_mlp": 0.01268404, "epoch": 0.19407785961220503, "flos": 15925615511040.0, "grad_norm": 3.5388134081184908, "language_loss": 0.70373201, "learning_rate": 3.724176216414662e-06, "loss": 0.782251, "num_input_tokens_seen": 69675005, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.23474121, "step": 3228, "time_per_iteration": 2.53769588470459 }, { "auxiliary_loss_clip": 0.06558903, "auxiliary_loss_mlp": 0.01286535, "balance_loss_clip": 0.06295532, "balance_loss_mlp": 0.01261203, "epoch": 0.194137982864873, "flos": 25928872767360.0, "grad_norm": 2.466260459334836, "language_loss": 0.74605376, "learning_rate": 3.72397882074007e-06, "loss": 0.82450819, "num_input_tokens_seen": 69696455, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.25317383, "step": 3229, "time_per_iteration": 2.592423915863037 }, { "auxiliary_loss_clip": 0.06560139, "auxiliary_loss_mlp": 0.01291817, "balance_loss_clip": 0.06297114, "balance_loss_mlp": 0.01266271, "epoch": 0.19419810611754096, "flos": 13266407934720.0, "grad_norm": 5.375295419366952, "language_loss": 0.66334349, "learning_rate": 3.7237813596920285e-06, "loss": 0.74186301, "num_input_tokens_seen": 69714245, "router_z_loss_clip": 2.63085938, "router_z_loss_mlp": 0.25537109, "step": 3230, "time_per_iteration": 2.618450164794922 }, { "auxiliary_loss_clip": 0.06558853, "auxiliary_loss_mlp": 0.01291178, "balance_loss_clip": 0.0629864, "balance_loss_mlp": 0.01266585, "epoch": 0.19425822937020892, "flos": 15710986477440.0, "grad_norm": 1.9472499841980055, "language_loss": 0.82559395, "learning_rate": 3.7235838332780254e-06, "loss": 0.90409434, "num_input_tokens_seen": 69731515, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.24621582, "step": 3231, "time_per_iteration": 2.5364418029785156 }, { "auxiliary_loss_clip": 0.06564944, "auxiliary_loss_mlp": 0.01289368, "balance_loss_clip": 0.06299853, "balance_loss_mlp": 0.01264561, "epoch": 0.1943183526228769, "flos": 23110912431360.0, "grad_norm": 1.8835043050661613, "language_loss": 0.87506694, "learning_rate": 3.72338624150555e-06, "loss": 0.95361006, "num_input_tokens_seen": 69748885, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.24829102, "step": 3232, "time_per_iteration": 2.5649492740631104 }, { "auxiliary_loss_clip": 0.06557282, "auxiliary_loss_mlp": 0.01280353, "balance_loss_clip": 0.06299061, "balance_loss_mlp": 0.01255641, "epoch": 0.19437847587554485, "flos": 24718707774720.0, "grad_norm": 3.370796543944349, "language_loss": 0.86158383, "learning_rate": 3.723188584382096e-06, "loss": 0.93996012, "num_input_tokens_seen": 69767540, "router_z_loss_clip": 2.58007812, "router_z_loss_mlp": 0.24719238, "step": 3233, "time_per_iteration": 2.5800533294677734 }, { "auxiliary_loss_clip": 0.06576683, "auxiliary_loss_mlp": 0.01286692, "balance_loss_clip": 0.06310052, "balance_loss_mlp": 0.01260299, "epoch": 0.19443859912821285, "flos": 23123448616320.0, "grad_norm": 1.7258692835678382, "language_loss": 0.8939876, "learning_rate": 3.722990861915158e-06, "loss": 0.97262132, "num_input_tokens_seen": 69789340, "router_z_loss_clip": 2.6640625, "router_z_loss_mlp": 0.26403809, "step": 3234, "time_per_iteration": 2.607447862625122 }, { "auxiliary_loss_clip": 0.06598233, "auxiliary_loss_mlp": 0.01297589, "balance_loss_clip": 0.06329684, "balance_loss_mlp": 0.01271375, "epoch": 0.1944987223808808, "flos": 15089545071360.0, "grad_norm": 2.610410457566145, "language_loss": 0.79074848, "learning_rate": 3.722793074112234e-06, "loss": 0.86970663, "num_input_tokens_seen": 69806470, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.26220703, "step": 3235, "time_per_iteration": 2.541757106781006 }, { "auxiliary_loss_clip": 0.06592801, "auxiliary_loss_mlp": 0.0128871, "balance_loss_clip": 0.06333907, "balance_loss_mlp": 0.01265953, "epoch": 0.19455884563354878, "flos": 17132258632320.0, "grad_norm": 3.2019881088264124, "language_loss": 0.79669213, "learning_rate": 3.7225952209808233e-06, "loss": 0.8755073, "num_input_tokens_seen": 69822655, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.22766113, "step": 3236, "time_per_iteration": 2.5128278732299805 }, { "auxiliary_loss_clip": 0.06590606, "auxiliary_loss_mlp": 0.01291184, "balance_loss_clip": 0.06330372, "balance_loss_mlp": 0.01265828, "epoch": 0.19461896888621674, "flos": 20199578319360.0, "grad_norm": 2.624899485944591, "language_loss": 0.76706231, "learning_rate": 3.72239730252843e-06, "loss": 0.84588021, "num_input_tokens_seen": 69841895, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.25366211, "step": 3237, "time_per_iteration": 2.569143056869507 }, { "auxiliary_loss_clip": 0.06604208, "auxiliary_loss_mlp": 0.01298275, "balance_loss_clip": 0.0633404, "balance_loss_mlp": 0.01271799, "epoch": 0.1946790921388847, "flos": 25308395683200.0, "grad_norm": 1.7419045148880457, "language_loss": 0.75495267, "learning_rate": 3.7221993187625583e-06, "loss": 0.83397746, "num_input_tokens_seen": 69862220, "router_z_loss_clip": 2.703125, "router_z_loss_mlp": 0.26452637, "step": 3238, "time_per_iteration": 2.611388683319092 }, { "auxiliary_loss_clip": 0.06595492, "auxiliary_loss_mlp": 0.01288655, "balance_loss_clip": 0.06332803, "balance_loss_mlp": 0.01263668, "epoch": 0.19473921539155267, "flos": 20199578319360.0, "grad_norm": 2.0288856197117413, "language_loss": 0.74054396, "learning_rate": 3.7220012696907155e-06, "loss": 0.81938547, "num_input_tokens_seen": 69881830, "router_z_loss_clip": 2.63085938, "router_z_loss_mlp": 0.25, "step": 3239, "time_per_iteration": 2.565948009490967 }, { "auxiliary_loss_clip": 0.06596151, "auxiliary_loss_mlp": 0.01284482, "balance_loss_clip": 0.063364, "balance_loss_mlp": 0.01259925, "epoch": 0.19479933864422067, "flos": 20894002231680.0, "grad_norm": 3.152503275323812, "language_loss": 0.73976183, "learning_rate": 3.721803155320412e-06, "loss": 0.81856817, "num_input_tokens_seen": 69900515, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.24560547, "step": 3240, "time_per_iteration": 2.5691373348236084 }, { "auxiliary_loss_clip": 0.06601106, "auxiliary_loss_mlp": 0.0128726, "balance_loss_clip": 0.06337487, "balance_loss_mlp": 0.01263264, "epoch": 0.19485946189688863, "flos": 23301837959040.0, "grad_norm": 2.010296493712955, "language_loss": 0.67451853, "learning_rate": 3.7216049756591606e-06, "loss": 0.75340223, "num_input_tokens_seen": 69920060, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.23974609, "step": 3241, "time_per_iteration": 2.680013418197632 }, { "auxiliary_loss_clip": 0.06590153, "auxiliary_loss_mlp": 0.01292403, "balance_loss_clip": 0.06328317, "balance_loss_mlp": 0.01265509, "epoch": 0.1949195851495566, "flos": 23301796032000.0, "grad_norm": 1.3820881934852272, "language_loss": 0.83076775, "learning_rate": 3.7214067307144754e-06, "loss": 0.90959334, "num_input_tokens_seen": 69939820, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.26904297, "step": 3242, "time_per_iteration": 2.594592332839966 }, { "auxiliary_loss_clip": 0.06483683, "auxiliary_loss_mlp": 0.01309785, "balance_loss_clip": 0.06336219, "balance_loss_mlp": 0.01301721, "epoch": 0.19497970840222456, "flos": 64982884285440.0, "grad_norm": 0.7980526104418099, "language_loss": 0.57395804, "learning_rate": 3.721208420493875e-06, "loss": 0.65189266, "num_input_tokens_seen": 70002145, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.08062744, "step": 3243, "time_per_iteration": 3.2212629318237305 }, { "auxiliary_loss_clip": 0.06593497, "auxiliary_loss_mlp": 0.01286279, "balance_loss_clip": 0.0632982, "balance_loss_mlp": 0.01259516, "epoch": 0.19503983165489253, "flos": 19650574368000.0, "grad_norm": 2.222682888300658, "language_loss": 0.84262294, "learning_rate": 3.7210100450048784e-06, "loss": 0.92142069, "num_input_tokens_seen": 70020510, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.26782227, "step": 3244, "time_per_iteration": 2.5565433502197266 }, { "auxiliary_loss_clip": 0.06594366, "auxiliary_loss_mlp": 0.0128343, "balance_loss_clip": 0.06324144, "balance_loss_mlp": 0.01258372, "epoch": 0.1950999549075605, "flos": 21148308973440.0, "grad_norm": 2.694231612029829, "language_loss": 0.7729153, "learning_rate": 3.7208116042550088e-06, "loss": 0.85169327, "num_input_tokens_seen": 70040760, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.25073242, "step": 3245, "time_per_iteration": 2.5854952335357666 }, { "auxiliary_loss_clip": 0.06584131, "auxiliary_loss_mlp": 0.01285686, "balance_loss_clip": 0.06318545, "balance_loss_mlp": 0.01259067, "epoch": 0.19516007816022846, "flos": 20890815776640.0, "grad_norm": 2.019575764350909, "language_loss": 0.8499428, "learning_rate": 3.7206130982517906e-06, "loss": 0.92864096, "num_input_tokens_seen": 70058720, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.26599121, "step": 3246, "time_per_iteration": 2.5784146785736084 }, { "auxiliary_loss_clip": 0.06588197, "auxiliary_loss_mlp": 0.01284752, "balance_loss_clip": 0.06319804, "balance_loss_mlp": 0.01261411, "epoch": 0.19522020141289645, "flos": 16916287933440.0, "grad_norm": 3.598862537662422, "language_loss": 0.77153313, "learning_rate": 3.7204145270027514e-06, "loss": 0.85026264, "num_input_tokens_seen": 70076470, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.23352051, "step": 3247, "time_per_iteration": 2.542954921722412 }, { "auxiliary_loss_clip": 0.06580489, "auxiliary_loss_mlp": 0.0128852, "balance_loss_clip": 0.06318909, "balance_loss_mlp": 0.01263796, "epoch": 0.19528032466556441, "flos": 26732183460480.0, "grad_norm": 1.664286659825605, "language_loss": 0.76088548, "learning_rate": 3.720215890515421e-06, "loss": 0.83957553, "num_input_tokens_seen": 70096220, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.24731445, "step": 3248, "time_per_iteration": 2.60410475730896 }, { "auxiliary_loss_clip": 0.06577042, "auxiliary_loss_mlp": 0.01285777, "balance_loss_clip": 0.06310949, "balance_loss_mlp": 0.01260684, "epoch": 0.19534044791823238, "flos": 21039170630400.0, "grad_norm": 2.3203088550078377, "language_loss": 0.79011941, "learning_rate": 3.7200171887973316e-06, "loss": 0.86874765, "num_input_tokens_seen": 70114800, "router_z_loss_clip": 2.6640625, "router_z_loss_mlp": 0.25036621, "step": 3249, "time_per_iteration": 2.5478992462158203 }, { "auxiliary_loss_clip": 0.06578436, "auxiliary_loss_mlp": 0.012879, "balance_loss_clip": 0.06313793, "balance_loss_mlp": 0.01262878, "epoch": 0.19540057117090034, "flos": 22350256266240.0, "grad_norm": 1.6850323929138913, "language_loss": 0.73729002, "learning_rate": 3.7198184218560176e-06, "loss": 0.81595343, "num_input_tokens_seen": 70134930, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.25048828, "step": 3250, "time_per_iteration": 2.566455841064453 }, { "auxiliary_loss_clip": 0.06566043, "auxiliary_loss_mlp": 0.01291613, "balance_loss_clip": 0.0630637, "balance_loss_mlp": 0.01266412, "epoch": 0.1954606944235683, "flos": 20307626559360.0, "grad_norm": 4.4814380988371685, "language_loss": 0.80125546, "learning_rate": 3.719619589699017e-06, "loss": 0.87983203, "num_input_tokens_seen": 70152045, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.25219727, "step": 3251, "time_per_iteration": 2.5512759685516357 }, { "auxiliary_loss_clip": 0.06576625, "auxiliary_loss_mlp": 0.01287137, "balance_loss_clip": 0.06311119, "balance_loss_mlp": 0.01262962, "epoch": 0.19552081767623627, "flos": 17352463962240.0, "grad_norm": 2.799832285104655, "language_loss": 0.8453514, "learning_rate": 3.7194206923338695e-06, "loss": 0.92398906, "num_input_tokens_seen": 70169240, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.24169922, "step": 3252, "time_per_iteration": 2.520543098449707 }, { "auxiliary_loss_clip": 0.06585015, "auxiliary_loss_mlp": 0.01290819, "balance_loss_clip": 0.06311544, "balance_loss_mlp": 0.01264235, "epoch": 0.19558094092890424, "flos": 31985666098560.0, "grad_norm": 1.620517512192069, "language_loss": 0.73717171, "learning_rate": 3.719221729768117e-06, "loss": 0.81593007, "num_input_tokens_seen": 70192690, "router_z_loss_clip": 2.73046875, "router_z_loss_mlp": 0.26574707, "step": 3253, "time_per_iteration": 2.6475884914398193 }, { "auxiliary_loss_clip": 0.06578259, "auxiliary_loss_mlp": 0.01286986, "balance_loss_clip": 0.06305083, "balance_loss_mlp": 0.01260772, "epoch": 0.19564106418157223, "flos": 22274716210560.0, "grad_norm": 2.0945377998925334, "language_loss": 0.77398759, "learning_rate": 3.7190227020093037e-06, "loss": 0.85264003, "num_input_tokens_seen": 70209685, "router_z_loss_clip": 2.734375, "router_z_loss_mlp": 0.26220703, "step": 3254, "time_per_iteration": 3.99912691116333 }, { "auxiliary_loss_clip": 0.06428351, "auxiliary_loss_mlp": 0.01265445, "balance_loss_clip": 0.0628251, "balance_loss_mlp": 0.01258638, "epoch": 0.1957011874342402, "flos": 54379876631040.0, "grad_norm": 0.724192141192442, "language_loss": 0.55022264, "learning_rate": 3.7188236090649774e-06, "loss": 0.62716055, "num_input_tokens_seen": 70265050, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.0682373, "step": 3255, "time_per_iteration": 3.1518607139587402 }, { "auxiliary_loss_clip": 0.06571525, "auxiliary_loss_mlp": 0.01289462, "balance_loss_clip": 0.0630006, "balance_loss_mlp": 0.01263189, "epoch": 0.19576131068690816, "flos": 16511991183360.0, "grad_norm": 4.017670460450298, "language_loss": 0.70790416, "learning_rate": 3.718624450942688e-06, "loss": 0.78651404, "num_input_tokens_seen": 70281830, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.26293945, "step": 3256, "time_per_iteration": 2.534898042678833 }, { "auxiliary_loss_clip": 0.06565905, "auxiliary_loss_mlp": 0.01284293, "balance_loss_clip": 0.06301007, "balance_loss_mlp": 0.01259402, "epoch": 0.19582143393957613, "flos": 14724800248320.0, "grad_norm": 2.2875815291947994, "language_loss": 0.81302989, "learning_rate": 3.718425227649987e-06, "loss": 0.89153188, "num_input_tokens_seen": 70297420, "router_z_loss_clip": 2.6484375, "router_z_loss_mlp": 0.24938965, "step": 3257, "time_per_iteration": 3.973170518875122 }, { "auxiliary_loss_clip": 0.06572357, "auxiliary_loss_mlp": 0.012825, "balance_loss_clip": 0.06305994, "balance_loss_mlp": 0.01257669, "epoch": 0.1958815571922441, "flos": 24432354046080.0, "grad_norm": 1.6924156076915022, "language_loss": 0.75807285, "learning_rate": 3.7182259391944292e-06, "loss": 0.8366214, "num_input_tokens_seen": 70319210, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.24804688, "step": 3258, "time_per_iteration": 2.6772849559783936 }, { "auxiliary_loss_clip": 0.0657105, "auxiliary_loss_mlp": 0.01284249, "balance_loss_clip": 0.06301577, "balance_loss_mlp": 0.01258905, "epoch": 0.19594168044491206, "flos": 24907285388160.0, "grad_norm": 1.6330519649270778, "language_loss": 0.74211061, "learning_rate": 3.7180265855835714e-06, "loss": 0.82066357, "num_input_tokens_seen": 70339045, "router_z_loss_clip": 2.69726562, "router_z_loss_mlp": 0.25354004, "step": 3259, "time_per_iteration": 2.7535526752471924 }, { "auxiliary_loss_clip": 0.0657144, "auxiliary_loss_mlp": 0.01282398, "balance_loss_clip": 0.0629983, "balance_loss_mlp": 0.01255838, "epoch": 0.19600180369758005, "flos": 12061819238400.0, "grad_norm": 2.4888831690785618, "language_loss": 0.775675, "learning_rate": 3.7178271668249735e-06, "loss": 0.85421336, "num_input_tokens_seen": 70356505, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.26574707, "step": 3260, "time_per_iteration": 2.54984450340271 }, { "auxiliary_loss_clip": 0.065685, "auxiliary_loss_mlp": 0.01285932, "balance_loss_clip": 0.06301485, "balance_loss_mlp": 0.01261089, "epoch": 0.19606192695024802, "flos": 20856504729600.0, "grad_norm": 2.2013275382878703, "language_loss": 0.83192354, "learning_rate": 3.7176276829261975e-06, "loss": 0.91046786, "num_input_tokens_seen": 70375410, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.24853516, "step": 3261, "time_per_iteration": 2.604604721069336 }, { "auxiliary_loss_clip": 0.0656291, "auxiliary_loss_mlp": 0.0129033, "balance_loss_clip": 0.06297436, "balance_loss_mlp": 0.01264367, "epoch": 0.19612205020291598, "flos": 28483050850560.0, "grad_norm": 1.8295102066607198, "language_loss": 0.77362704, "learning_rate": 3.717428133894807e-06, "loss": 0.85215944, "num_input_tokens_seen": 70396315, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.25964355, "step": 3262, "time_per_iteration": 4.119104385375977 }, { "auxiliary_loss_clip": 0.06561276, "auxiliary_loss_mlp": 0.0129081, "balance_loss_clip": 0.06298451, "balance_loss_mlp": 0.01267397, "epoch": 0.19618217345558395, "flos": 25563666746880.0, "grad_norm": 1.651544250741648, "language_loss": 0.87051398, "learning_rate": 3.71722851973837e-06, "loss": 0.94903481, "num_input_tokens_seen": 70417945, "router_z_loss_clip": 2.63085938, "router_z_loss_mlp": 0.23425293, "step": 3263, "time_per_iteration": 2.601956605911255 }, { "auxiliary_loss_clip": 0.06562418, "auxiliary_loss_mlp": 0.01291077, "balance_loss_clip": 0.06297875, "balance_loss_mlp": 0.0126596, "epoch": 0.1962422967082519, "flos": 25271359378560.0, "grad_norm": 1.6650411043289806, "language_loss": 0.74232471, "learning_rate": 3.717028840464455e-06, "loss": 0.82085967, "num_input_tokens_seen": 70438690, "router_z_loss_clip": 2.64453125, "router_z_loss_mlp": 0.2512207, "step": 3264, "time_per_iteration": 2.6301910877227783 }, { "auxiliary_loss_clip": 0.06557558, "auxiliary_loss_mlp": 0.01302515, "balance_loss_clip": 0.06297988, "balance_loss_mlp": 0.01276539, "epoch": 0.19630241996091988, "flos": 18813371898240.0, "grad_norm": 2.427497169259081, "language_loss": 0.79669821, "learning_rate": 3.7168290960806344e-06, "loss": 0.87529892, "num_input_tokens_seen": 70455385, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.25976562, "step": 3265, "time_per_iteration": 4.115947484970093 }, { "auxiliary_loss_clip": 0.06443848, "auxiliary_loss_mlp": 0.01272144, "balance_loss_clip": 0.0629871, "balance_loss_mlp": 0.01266157, "epoch": 0.19636254321358784, "flos": 62338240120320.0, "grad_norm": 0.7659295281961965, "language_loss": 0.530846, "learning_rate": 3.716629286594483e-06, "loss": 0.60800588, "num_input_tokens_seen": 70514280, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.05984497, "step": 3266, "time_per_iteration": 3.225681781768799 }, { "auxiliary_loss_clip": 0.0656643, "auxiliary_loss_mlp": 0.01288659, "balance_loss_clip": 0.06299341, "balance_loss_mlp": 0.01262505, "epoch": 0.19642266646625584, "flos": 21075703810560.0, "grad_norm": 1.8955816385570414, "language_loss": 0.80327606, "learning_rate": 3.7164294120135767e-06, "loss": 0.88182694, "num_input_tokens_seen": 70531800, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.26171875, "step": 3267, "time_per_iteration": 2.5546226501464844 }, { "auxiliary_loss_clip": 0.06553337, "auxiliary_loss_mlp": 0.01293758, "balance_loss_clip": 0.06295966, "balance_loss_mlp": 0.0126913, "epoch": 0.1964827897189238, "flos": 14543979137280.0, "grad_norm": 2.3467769383939254, "language_loss": 0.86869287, "learning_rate": 3.7162294723454953e-06, "loss": 0.94716382, "num_input_tokens_seen": 70550615, "router_z_loss_clip": 2.57617188, "router_z_loss_mlp": 0.24658203, "step": 3268, "time_per_iteration": 2.5362343788146973 }, { "auxiliary_loss_clip": 0.06557345, "auxiliary_loss_mlp": 0.01281476, "balance_loss_clip": 0.06300445, "balance_loss_mlp": 0.01259875, "epoch": 0.19654291297159177, "flos": 19250638030080.0, "grad_norm": 3.5536791658796663, "language_loss": 0.69660962, "learning_rate": 3.7160294675978197e-06, "loss": 0.77499783, "num_input_tokens_seen": 70568690, "router_z_loss_clip": 2.5703125, "router_z_loss_mlp": 0.21606445, "step": 3269, "time_per_iteration": 2.5577447414398193 }, { "auxiliary_loss_clip": 0.06564417, "auxiliary_loss_mlp": 0.01294925, "balance_loss_clip": 0.06302156, "balance_loss_mlp": 0.01270225, "epoch": 0.19660303622425973, "flos": 25782823900800.0, "grad_norm": 2.654435551952525, "language_loss": 0.80965638, "learning_rate": 3.715829397778135e-06, "loss": 0.88824975, "num_input_tokens_seen": 70588665, "router_z_loss_clip": 2.625, "router_z_loss_mlp": 0.2467041, "step": 3270, "time_per_iteration": 2.6112959384918213 }, { "auxiliary_loss_clip": 0.06559821, "auxiliary_loss_mlp": 0.01281263, "balance_loss_clip": 0.06299484, "balance_loss_mlp": 0.01257135, "epoch": 0.1966631594769277, "flos": 20601401374080.0, "grad_norm": 4.84805985250947, "language_loss": 0.84869337, "learning_rate": 3.715629262894028e-06, "loss": 0.92710418, "num_input_tokens_seen": 70606900, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.24108887, "step": 3271, "time_per_iteration": 2.564602851867676 }, { "auxiliary_loss_clip": 0.06549754, "auxiliary_loss_mlp": 0.01288897, "balance_loss_clip": 0.06298252, "balance_loss_mlp": 0.01265293, "epoch": 0.19672328272959566, "flos": 23629965747840.0, "grad_norm": 1.9032329498091647, "language_loss": 0.80389988, "learning_rate": 3.715429062953087e-06, "loss": 0.88228643, "num_input_tokens_seen": 70625955, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.23596191, "step": 3272, "time_per_iteration": 2.5934906005859375 }, { "auxiliary_loss_clip": 0.06564283, "auxiliary_loss_mlp": 0.01286186, "balance_loss_clip": 0.06300408, "balance_loss_mlp": 0.0126145, "epoch": 0.19678340598226365, "flos": 23117369195520.0, "grad_norm": 1.9733257795629193, "language_loss": 0.81242561, "learning_rate": 3.7152287979629043e-06, "loss": 0.89093029, "num_input_tokens_seen": 70646090, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.24768066, "step": 3273, "time_per_iteration": 2.637742042541504 }, { "auxiliary_loss_clip": 0.06563055, "auxiliary_loss_mlp": 0.01280246, "balance_loss_clip": 0.06299812, "balance_loss_mlp": 0.01257382, "epoch": 0.19684352923493162, "flos": 24541702024320.0, "grad_norm": 2.2180595918265906, "language_loss": 0.78247184, "learning_rate": 3.7150284679310735e-06, "loss": 0.86090487, "num_input_tokens_seen": 70666065, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.22875977, "step": 3274, "time_per_iteration": 2.5888671875 }, { "auxiliary_loss_clip": 0.06558681, "auxiliary_loss_mlp": 0.01288538, "balance_loss_clip": 0.06298147, "balance_loss_mlp": 0.01264255, "epoch": 0.19690365248759958, "flos": 21802510126080.0, "grad_norm": 8.322309958412141, "language_loss": 0.8268137, "learning_rate": 3.7148280728651914e-06, "loss": 0.90528584, "num_input_tokens_seen": 70681580, "router_z_loss_clip": 2.60742188, "router_z_loss_mlp": 0.24291992, "step": 3275, "time_per_iteration": 2.561450958251953 }, { "auxiliary_loss_clip": 0.0655583, "auxiliary_loss_mlp": 0.01284109, "balance_loss_clip": 0.06296837, "balance_loss_mlp": 0.01260458, "epoch": 0.19696377574026755, "flos": 19061683073280.0, "grad_norm": 2.1965278220500744, "language_loss": 0.81692123, "learning_rate": 3.7146276127728563e-06, "loss": 0.89532065, "num_input_tokens_seen": 70697745, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.23657227, "step": 3276, "time_per_iteration": 2.5445706844329834 }, { "auxiliary_loss_clip": 0.0655741, "auxiliary_loss_mlp": 0.01278599, "balance_loss_clip": 0.06299093, "balance_loss_mlp": 0.0125589, "epoch": 0.19702389899293551, "flos": 22827325887360.0, "grad_norm": 2.259155308174848, "language_loss": 0.90438044, "learning_rate": 3.7144270876616713e-06, "loss": 0.98274052, "num_input_tokens_seen": 70715110, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.22717285, "step": 3277, "time_per_iteration": 2.625527858734131 }, { "auxiliary_loss_clip": 0.06568338, "auxiliary_loss_mlp": 0.0128256, "balance_loss_clip": 0.06300303, "balance_loss_mlp": 0.01257347, "epoch": 0.19708402224560348, "flos": 22901021153280.0, "grad_norm": 2.70168413746218, "language_loss": 0.63741922, "learning_rate": 3.714226497539239e-06, "loss": 0.7159282, "num_input_tokens_seen": 70734715, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.25219727, "step": 3278, "time_per_iteration": 2.648210048675537 }, { "auxiliary_loss_clip": 0.06567056, "auxiliary_loss_mlp": 0.0128289, "balance_loss_clip": 0.06304041, "balance_loss_mlp": 0.01258226, "epoch": 0.19714414549827144, "flos": 25668989729280.0, "grad_norm": 2.3966737564872487, "language_loss": 0.74968827, "learning_rate": 3.714025842413166e-06, "loss": 0.8281877, "num_input_tokens_seen": 70752650, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.24658203, "step": 3279, "time_per_iteration": 2.5875601768493652 }, { "auxiliary_loss_clip": 0.06561081, "auxiliary_loss_mlp": 0.01279173, "balance_loss_clip": 0.06302121, "balance_loss_mlp": 0.01256296, "epoch": 0.19720426875093944, "flos": 23922776240640.0, "grad_norm": 1.6634039631112532, "language_loss": 0.83014977, "learning_rate": 3.713825122291061e-06, "loss": 0.90855229, "num_input_tokens_seen": 70772365, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.2286377, "step": 3280, "time_per_iteration": 2.583059787750244 }, { "auxiliary_loss_clip": 0.06565882, "auxiliary_loss_mlp": 0.01283369, "balance_loss_clip": 0.0630521, "balance_loss_mlp": 0.01260898, "epoch": 0.1972643920036074, "flos": 13887178508160.0, "grad_norm": 2.046431574783248, "language_loss": 0.78018636, "learning_rate": 3.713624337180536e-06, "loss": 0.85867888, "num_input_tokens_seen": 70790340, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.22460938, "step": 3281, "time_per_iteration": 2.547661542892456 }, { "auxiliary_loss_clip": 0.06551018, "auxiliary_loss_mlp": 0.01286264, "balance_loss_clip": 0.06298459, "balance_loss_mlp": 0.0126545, "epoch": 0.19732451525627537, "flos": 19869479959680.0, "grad_norm": 1.7426725030610746, "language_loss": 0.80234051, "learning_rate": 3.7134234870892045e-06, "loss": 0.8807134, "num_input_tokens_seen": 70809295, "router_z_loss_clip": 2.52734375, "router_z_loss_mlp": 0.20825195, "step": 3282, "time_per_iteration": 2.551631450653076 }, { "auxiliary_loss_clip": 0.06560318, "auxiliary_loss_mlp": 0.01284812, "balance_loss_clip": 0.06299675, "balance_loss_mlp": 0.012614, "epoch": 0.19738463850894333, "flos": 24980477529600.0, "grad_norm": 2.36398213735407, "language_loss": 0.72323763, "learning_rate": 3.7132225720246826e-06, "loss": 0.80168897, "num_input_tokens_seen": 70828765, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.23413086, "step": 3283, "time_per_iteration": 2.603116273880005 }, { "auxiliary_loss_clip": 0.06570686, "auxiliary_loss_mlp": 0.01282544, "balance_loss_clip": 0.06308389, "balance_loss_mlp": 0.01258917, "epoch": 0.1974447617616113, "flos": 18374722174080.0, "grad_norm": 2.749060088732829, "language_loss": 0.79777789, "learning_rate": 3.7130215919945886e-06, "loss": 0.87631023, "num_input_tokens_seen": 70846805, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.2364502, "step": 3284, "time_per_iteration": 2.535637378692627 }, { "auxiliary_loss_clip": 0.06567208, "auxiliary_loss_mlp": 0.01282562, "balance_loss_clip": 0.06305387, "balance_loss_mlp": 0.01258935, "epoch": 0.19750488501427926, "flos": 22899511779840.0, "grad_norm": 2.7000218653685115, "language_loss": 0.8657859, "learning_rate": 3.7128205470065445e-06, "loss": 0.9442836, "num_input_tokens_seen": 70863805, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.23620605, "step": 3285, "time_per_iteration": 2.5647199153900146 }, { "auxiliary_loss_clip": 0.06560684, "auxiliary_loss_mlp": 0.01282071, "balance_loss_clip": 0.06302647, "balance_loss_mlp": 0.0125997, "epoch": 0.19756500826694723, "flos": 21877924400640.0, "grad_norm": 6.101900213780323, "language_loss": 0.88827294, "learning_rate": 3.712619437068174e-06, "loss": 0.96670043, "num_input_tokens_seen": 70882660, "router_z_loss_clip": 2.58007812, "router_z_loss_mlp": 0.2208252, "step": 3286, "time_per_iteration": 2.5600457191467285 }, { "auxiliary_loss_clip": 0.06567635, "auxiliary_loss_mlp": 0.01283754, "balance_loss_clip": 0.0630305, "balance_loss_mlp": 0.01259614, "epoch": 0.19762513151961522, "flos": 15164414294400.0, "grad_norm": 2.3925730373962693, "language_loss": 0.78870136, "learning_rate": 3.712418262187102e-06, "loss": 0.86721528, "num_input_tokens_seen": 70898765, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.24145508, "step": 3287, "time_per_iteration": 2.549769401550293 }, { "auxiliary_loss_clip": 0.06573622, "auxiliary_loss_mlp": 0.01283517, "balance_loss_clip": 0.06307884, "balance_loss_mlp": 0.01260343, "epoch": 0.1976852547722832, "flos": 16984239194880.0, "grad_norm": 2.113956119051721, "language_loss": 0.81847215, "learning_rate": 3.7122170223709584e-06, "loss": 0.89704353, "num_input_tokens_seen": 70916370, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.23181152, "step": 3288, "time_per_iteration": 2.5567567348480225 }, { "auxiliary_loss_clip": 0.06557281, "auxiliary_loss_mlp": 0.0128219, "balance_loss_clip": 0.06301958, "balance_loss_mlp": 0.01260303, "epoch": 0.19774537802495115, "flos": 20309135932800.0, "grad_norm": 1.7633463710730681, "language_loss": 0.73311967, "learning_rate": 3.712015717627374e-06, "loss": 0.81151438, "num_input_tokens_seen": 70934870, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.21887207, "step": 3289, "time_per_iteration": 2.579355001449585 }, { "auxiliary_loss_clip": 0.065603, "auxiliary_loss_mlp": 0.01286626, "balance_loss_clip": 0.06301568, "balance_loss_mlp": 0.01263869, "epoch": 0.19780550127761912, "flos": 27242893296000.0, "grad_norm": 1.7689856307546181, "language_loss": 0.80134261, "learning_rate": 3.7118143479639813e-06, "loss": 0.87981188, "num_input_tokens_seen": 70955140, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.22766113, "step": 3290, "time_per_iteration": 2.617795705795288 }, { "auxiliary_loss_clip": 0.06463088, "auxiliary_loss_mlp": 0.01287897, "balance_loss_clip": 0.06320858, "balance_loss_mlp": 0.01282676, "epoch": 0.19786562453028708, "flos": 63572597015040.0, "grad_norm": 0.8940751668811489, "language_loss": 0.60313475, "learning_rate": 3.711612913388418e-06, "loss": 0.68064457, "num_input_tokens_seen": 71012005, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.05224609, "step": 3291, "time_per_iteration": 3.2179553508758545 }, { "auxiliary_loss_clip": 0.06578006, "auxiliary_loss_mlp": 0.01301739, "balance_loss_clip": 0.06307924, "balance_loss_mlp": 0.01275465, "epoch": 0.19792574778295505, "flos": 26293869152640.0, "grad_norm": 2.0349894803359176, "language_loss": 0.81877261, "learning_rate": 3.7114114139083204e-06, "loss": 0.89757007, "num_input_tokens_seen": 71031140, "router_z_loss_clip": 2.69921875, "router_z_loss_mlp": 0.26281738, "step": 3292, "time_per_iteration": 2.616046190261841 }, { "auxiliary_loss_clip": 0.06552395, "auxiliary_loss_mlp": 0.01300044, "balance_loss_clip": 0.06295368, "balance_loss_mlp": 0.01278741, "epoch": 0.19798587103562304, "flos": 19944265328640.0, "grad_norm": 1.9432362349038044, "language_loss": 0.82141685, "learning_rate": 3.7112098495313313e-06, "loss": 0.89994127, "num_input_tokens_seen": 71050250, "router_z_loss_clip": 2.5703125, "router_z_loss_mlp": 0.2130127, "step": 3293, "time_per_iteration": 2.5680899620056152 }, { "auxiliary_loss_clip": 0.0657787, "auxiliary_loss_mlp": 0.01294549, "balance_loss_clip": 0.06302962, "balance_loss_mlp": 0.01268847, "epoch": 0.198045994288291, "flos": 20126428104960.0, "grad_norm": 1.9730018005286571, "language_loss": 0.6180318, "learning_rate": 3.711008220265093e-06, "loss": 0.69675595, "num_input_tokens_seen": 71068665, "router_z_loss_clip": 2.74804688, "router_z_loss_mlp": 0.25695801, "step": 3294, "time_per_iteration": 3.957066774368286 }, { "auxiliary_loss_clip": 0.06557067, "auxiliary_loss_mlp": 0.01292832, "balance_loss_clip": 0.06298088, "balance_loss_mlp": 0.01271386, "epoch": 0.19810611754095897, "flos": 17973444170880.0, "grad_norm": 2.3954626524001297, "language_loss": 0.88260674, "learning_rate": 3.710806526117251e-06, "loss": 0.96110576, "num_input_tokens_seen": 71085320, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.21459961, "step": 3295, "time_per_iteration": 2.5410802364349365 }, { "auxiliary_loss_clip": 0.06555162, "auxiliary_loss_mlp": 0.01290288, "balance_loss_clip": 0.06299318, "balance_loss_mlp": 0.01268926, "epoch": 0.19816624079362694, "flos": 15090257831040.0, "grad_norm": 2.3338786560760734, "language_loss": 0.81432992, "learning_rate": 3.7106047670954544e-06, "loss": 0.89278442, "num_input_tokens_seen": 71102020, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.21374512, "step": 3296, "time_per_iteration": 2.5138893127441406 }, { "auxiliary_loss_clip": 0.06559631, "auxiliary_loss_mlp": 0.01285359, "balance_loss_clip": 0.062935, "balance_loss_mlp": 0.0126116, "epoch": 0.1982263640462949, "flos": 24907327315200.0, "grad_norm": 2.6969928876518066, "language_loss": 0.69099921, "learning_rate": 3.710402943207354e-06, "loss": 0.76944917, "num_input_tokens_seen": 71123390, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.24194336, "step": 3297, "time_per_iteration": 3.989293336868286 }, { "auxiliary_loss_clip": 0.06548852, "auxiliary_loss_mlp": 0.01285948, "balance_loss_clip": 0.06294116, "balance_loss_mlp": 0.01264907, "epoch": 0.19828648729896287, "flos": 20382453855360.0, "grad_norm": 1.8934315039164822, "language_loss": 0.82086515, "learning_rate": 3.7102010544606016e-06, "loss": 0.89921308, "num_input_tokens_seen": 71141800, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.21057129, "step": 3298, "time_per_iteration": 2.534975290298462 }, { "auxiliary_loss_clip": 0.06562842, "auxiliary_loss_mlp": 0.01292247, "balance_loss_clip": 0.0629694, "balance_loss_mlp": 0.01268191, "epoch": 0.19834661055163083, "flos": 18886018988160.0, "grad_norm": 2.394999641707543, "language_loss": 0.86265808, "learning_rate": 3.7099991008628544e-06, "loss": 0.94120896, "num_input_tokens_seen": 71159505, "router_z_loss_clip": 2.65820312, "router_z_loss_mlp": 0.24060059, "step": 3299, "time_per_iteration": 2.549135684967041 }, { "auxiliary_loss_clip": 0.06421559, "auxiliary_loss_mlp": 0.0125543, "balance_loss_clip": 0.06280554, "balance_loss_mlp": 0.01249881, "epoch": 0.19840673380429882, "flos": 60278908723200.0, "grad_norm": 0.7316024285251337, "language_loss": 0.53192294, "learning_rate": 3.7097970824217706e-06, "loss": 0.60869282, "num_input_tokens_seen": 71223265, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.05551147, "step": 3300, "time_per_iteration": 3.1334073543548584 }, { "auxiliary_loss_clip": 0.06551819, "auxiliary_loss_mlp": 0.01286651, "balance_loss_clip": 0.06291418, "balance_loss_mlp": 0.01262071, "epoch": 0.1984668570569668, "flos": 19908235272960.0, "grad_norm": 1.6532368764105332, "language_loss": 0.74071836, "learning_rate": 3.7095949991450093e-06, "loss": 0.81910306, "num_input_tokens_seen": 71242385, "router_z_loss_clip": 2.60742188, "router_z_loss_mlp": 0.24572754, "step": 3301, "time_per_iteration": 2.565601348876953 }, { "auxiliary_loss_clip": 0.06550995, "auxiliary_loss_mlp": 0.01279493, "balance_loss_clip": 0.06292805, "balance_loss_mlp": 0.01256832, "epoch": 0.19852698030963475, "flos": 15635865692160.0, "grad_norm": 2.3415650745860006, "language_loss": 0.89641058, "learning_rate": 3.709392851040235e-06, "loss": 0.97471547, "num_input_tokens_seen": 71258990, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 0.22680664, "step": 3302, "time_per_iteration": 3.9202048778533936 }, { "auxiliary_loss_clip": 0.06547316, "auxiliary_loss_mlp": 0.01280039, "balance_loss_clip": 0.06288172, "balance_loss_mlp": 0.01257616, "epoch": 0.19858710356230272, "flos": 43153037729280.0, "grad_norm": 1.719268242184337, "language_loss": 0.74288166, "learning_rate": 3.709190638115111e-06, "loss": 0.82115519, "num_input_tokens_seen": 71282770, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.22412109, "step": 3303, "time_per_iteration": 2.756568193435669 }, { "auxiliary_loss_clip": 0.06550816, "auxiliary_loss_mlp": 0.01288004, "balance_loss_clip": 0.0629342, "balance_loss_mlp": 0.01263197, "epoch": 0.19864722681497068, "flos": 35151348879360.0, "grad_norm": 2.0689138973350825, "language_loss": 0.75835705, "learning_rate": 3.7089883603773084e-06, "loss": 0.83674526, "num_input_tokens_seen": 71301410, "router_z_loss_clip": 2.57421875, "router_z_loss_mlp": 0.2479248, "step": 3304, "time_per_iteration": 2.6765637397766113 }, { "auxiliary_loss_clip": 0.06543332, "auxiliary_loss_mlp": 0.01286775, "balance_loss_clip": 0.06289501, "balance_loss_mlp": 0.0126496, "epoch": 0.19870735006763865, "flos": 19432088046720.0, "grad_norm": 1.7660875749957357, "language_loss": 0.86458099, "learning_rate": 3.7087860178344955e-06, "loss": 0.94288206, "num_input_tokens_seen": 71319670, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.21813965, "step": 3305, "time_per_iteration": 4.022382020950317 }, { "auxiliary_loss_clip": 0.06554982, "auxiliary_loss_mlp": 0.01298283, "balance_loss_clip": 0.06292935, "balance_loss_mlp": 0.01275776, "epoch": 0.19876747332030664, "flos": 23553671005440.0, "grad_norm": 1.8145435258079068, "language_loss": 0.69103003, "learning_rate": 3.7085836104943445e-06, "loss": 0.76956272, "num_input_tokens_seen": 71339850, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.22497559, "step": 3306, "time_per_iteration": 2.5826339721679688 }, { "auxiliary_loss_clip": 0.06544727, "auxiliary_loss_mlp": 0.01283569, "balance_loss_clip": 0.06286125, "balance_loss_mlp": 0.01261444, "epoch": 0.1988275965729746, "flos": 19835672037120.0, "grad_norm": 1.4624188890468937, "language_loss": 0.76932037, "learning_rate": 3.7083811383645332e-06, "loss": 0.84760332, "num_input_tokens_seen": 71359795, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.22131348, "step": 3307, "time_per_iteration": 2.5822370052337646 }, { "auxiliary_loss_clip": 0.06554599, "auxiliary_loss_mlp": 0.01302076, "balance_loss_clip": 0.06298801, "balance_loss_mlp": 0.01279951, "epoch": 0.19888771982564257, "flos": 23520366207360.0, "grad_norm": 2.6099343714999494, "language_loss": 0.76431149, "learning_rate": 3.708178601452737e-06, "loss": 0.84287822, "num_input_tokens_seen": 71378885, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.22119141, "step": 3308, "time_per_iteration": 2.565178871154785 }, { "auxiliary_loss_clip": 0.06548911, "auxiliary_loss_mlp": 0.01303336, "balance_loss_clip": 0.06291592, "balance_loss_mlp": 0.01279375, "epoch": 0.19894784307831054, "flos": 18156403560960.0, "grad_norm": 1.682525566013164, "language_loss": 0.76445973, "learning_rate": 3.7079759997666374e-06, "loss": 0.84298217, "num_input_tokens_seen": 71397285, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 0.23974609, "step": 3309, "time_per_iteration": 2.538893699645996 }, { "auxiliary_loss_clip": 0.06539542, "auxiliary_loss_mlp": 0.01305957, "balance_loss_clip": 0.06289471, "balance_loss_mlp": 0.01280136, "epoch": 0.1990079663309785, "flos": 24282280183680.0, "grad_norm": 1.6316861305606751, "language_loss": 0.88182402, "learning_rate": 3.707773333313917e-06, "loss": 0.96027899, "num_input_tokens_seen": 71415775, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.25842285, "step": 3310, "time_per_iteration": 2.5915439128875732 }, { "auxiliary_loss_clip": 0.06540747, "auxiliary_loss_mlp": 0.01308469, "balance_loss_clip": 0.06289077, "balance_loss_mlp": 0.01284926, "epoch": 0.19906808958364647, "flos": 34906391867520.0, "grad_norm": 9.963571763788881, "language_loss": 0.64933443, "learning_rate": 3.70757060210226e-06, "loss": 0.7278266, "num_input_tokens_seen": 71437315, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.23571777, "step": 3311, "time_per_iteration": 2.6583034992218018 }, { "auxiliary_loss_clip": 0.06549554, "auxiliary_loss_mlp": 0.01291858, "balance_loss_clip": 0.06288202, "balance_loss_mlp": 0.01267861, "epoch": 0.19912821283631443, "flos": 24031788802560.0, "grad_norm": 2.6061859350512813, "language_loss": 0.74923646, "learning_rate": 3.707367806139355e-06, "loss": 0.82765067, "num_input_tokens_seen": 71456320, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.2401123, "step": 3312, "time_per_iteration": 2.5759105682373047 }, { "auxiliary_loss_clip": 0.06546012, "auxiliary_loss_mlp": 0.01301827, "balance_loss_clip": 0.06289276, "balance_loss_mlp": 0.01277699, "epoch": 0.19918833608898243, "flos": 19864155225600.0, "grad_norm": 1.9677348679469846, "language_loss": 0.84265673, "learning_rate": 3.7071649454328915e-06, "loss": 0.92113507, "num_input_tokens_seen": 71475360, "router_z_loss_clip": 2.5703125, "router_z_loss_mlp": 0.24121094, "step": 3313, "time_per_iteration": 2.5481276512145996 }, { "auxiliary_loss_clip": 0.06545214, "auxiliary_loss_mlp": 0.01300689, "balance_loss_clip": 0.06291001, "balance_loss_mlp": 0.01277169, "epoch": 0.1992484593416504, "flos": 29103444080640.0, "grad_norm": 2.309394570391644, "language_loss": 0.81895983, "learning_rate": 3.7069620199905625e-06, "loss": 0.89741892, "num_input_tokens_seen": 71496155, "router_z_loss_clip": 2.54296875, "router_z_loss_mlp": 0.23510742, "step": 3314, "time_per_iteration": 2.616997480392456 }, { "auxiliary_loss_clip": 0.06534933, "auxiliary_loss_mlp": 0.01293838, "balance_loss_clip": 0.06288206, "balance_loss_mlp": 0.01272834, "epoch": 0.19930858259431836, "flos": 23301754104960.0, "grad_norm": 1.6656946015779148, "language_loss": 0.88211703, "learning_rate": 3.7067590298200627e-06, "loss": 0.96040475, "num_input_tokens_seen": 71517295, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.21008301, "step": 3315, "time_per_iteration": 2.5929114818573 }, { "auxiliary_loss_clip": 0.06543839, "auxiliary_loss_mlp": 0.01301497, "balance_loss_clip": 0.0628673, "balance_loss_mlp": 0.01277071, "epoch": 0.19936870584698632, "flos": 25386619069440.0, "grad_norm": 1.4831999084357168, "language_loss": 0.71343929, "learning_rate": 3.7065559749290892e-06, "loss": 0.79189265, "num_input_tokens_seen": 71540000, "router_z_loss_clip": 2.5703125, "router_z_loss_mlp": 0.24438477, "step": 3316, "time_per_iteration": 2.623885154724121 }, { "auxiliary_loss_clip": 0.06425592, "auxiliary_loss_mlp": 0.01299601, "balance_loss_clip": 0.06286121, "balance_loss_mlp": 0.01293158, "epoch": 0.1994288290996543, "flos": 62190038246400.0, "grad_norm": 0.840420273389928, "language_loss": 0.66296536, "learning_rate": 3.706352855325342e-06, "loss": 0.74021733, "num_input_tokens_seen": 71607880, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.06451416, "step": 3317, "time_per_iteration": 3.2495369911193848 }, { "auxiliary_loss_clip": 0.06544697, "auxiliary_loss_mlp": 0.0130685, "balance_loss_clip": 0.06282798, "balance_loss_mlp": 0.0128271, "epoch": 0.19948895235232225, "flos": 19031816292480.0, "grad_norm": 2.3699443317653013, "language_loss": 0.75301659, "learning_rate": 3.7061496710165233e-06, "loss": 0.83153206, "num_input_tokens_seen": 71625695, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.24157715, "step": 3318, "time_per_iteration": 2.5336976051330566 }, { "auxiliary_loss_clip": 0.0653453, "auxiliary_loss_mlp": 0.01300269, "balance_loss_clip": 0.06283958, "balance_loss_mlp": 0.01275736, "epoch": 0.19954907560499022, "flos": 37824895503360.0, "grad_norm": 1.9722812181714837, "language_loss": 0.79778427, "learning_rate": 3.7059464220103385e-06, "loss": 0.87613225, "num_input_tokens_seen": 71648520, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.2454834, "step": 3319, "time_per_iteration": 2.7037034034729004 }, { "auxiliary_loss_clip": 0.06546415, "auxiliary_loss_mlp": 0.01304623, "balance_loss_clip": 0.06290224, "balance_loss_mlp": 0.01279958, "epoch": 0.1996091988576582, "flos": 49576420673280.0, "grad_norm": 1.8810565009704423, "language_loss": 0.76235318, "learning_rate": 3.7057431083144945e-06, "loss": 0.84086359, "num_input_tokens_seen": 71672185, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.24645996, "step": 3320, "time_per_iteration": 2.8397092819213867 }, { "auxiliary_loss_clip": 0.0654221, "auxiliary_loss_mlp": 0.01306017, "balance_loss_clip": 0.06288889, "balance_loss_mlp": 0.01282437, "epoch": 0.19966932211032618, "flos": 22642018583040.0, "grad_norm": 3.074737059953239, "language_loss": 0.80476624, "learning_rate": 3.705539729936701e-06, "loss": 0.88324845, "num_input_tokens_seen": 71692890, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.23583984, "step": 3321, "time_per_iteration": 2.5743789672851562 }, { "auxiliary_loss_clip": 0.06417693, "auxiliary_loss_mlp": 0.01271182, "balance_loss_clip": 0.06278957, "balance_loss_mlp": 0.01265463, "epoch": 0.19972944536299414, "flos": 54098973417600.0, "grad_norm": 0.8396075500656395, "language_loss": 0.65259492, "learning_rate": 3.7053362868846696e-06, "loss": 0.72948372, "num_input_tokens_seen": 71745815, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.05712891, "step": 3322, "time_per_iteration": 2.982712984085083 }, { "auxiliary_loss_clip": 0.06417692, "auxiliary_loss_mlp": 0.01275146, "balance_loss_clip": 0.06280302, "balance_loss_mlp": 0.01269105, "epoch": 0.1997895686156621, "flos": 69371995731840.0, "grad_norm": 0.7709241713509142, "language_loss": 0.56840563, "learning_rate": 3.7051327791661153e-06, "loss": 0.64533401, "num_input_tokens_seen": 71806915, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.06039429, "step": 3323, "time_per_iteration": 3.299748420715332 }, { "auxiliary_loss_clip": 0.06537733, "auxiliary_loss_mlp": 0.01291065, "balance_loss_clip": 0.06287688, "balance_loss_mlp": 0.01265923, "epoch": 0.19984969186833007, "flos": 18558058907520.0, "grad_norm": 1.7724295350864918, "language_loss": 0.81100869, "learning_rate": 3.7049292067887555e-06, "loss": 0.88929665, "num_input_tokens_seen": 71824645, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.25158691, "step": 3324, "time_per_iteration": 2.5281007289886475 }, { "auxiliary_loss_clip": 0.06538264, "auxiliary_loss_mlp": 0.0129485, "balance_loss_clip": 0.06284435, "balance_loss_mlp": 0.01268731, "epoch": 0.19990981512099804, "flos": 26436438074880.0, "grad_norm": 1.764431204851317, "language_loss": 0.54320514, "learning_rate": 3.7047255697603092e-06, "loss": 0.62153631, "num_input_tokens_seen": 71845125, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.26147461, "step": 3325, "time_per_iteration": 2.6945083141326904 }, { "auxiliary_loss_clip": 0.06540807, "auxiliary_loss_mlp": 0.01287242, "balance_loss_clip": 0.0628349, "balance_loss_mlp": 0.01264307, "epoch": 0.19996993837366603, "flos": 16331547415680.0, "grad_norm": 2.1272864535069873, "language_loss": 0.86415637, "learning_rate": 3.7045218680884984e-06, "loss": 0.94243687, "num_input_tokens_seen": 71863500, "router_z_loss_clip": 2.57226562, "router_z_loss_mlp": 0.22949219, "step": 3326, "time_per_iteration": 2.534925699234009 }, { "auxiliary_loss_clip": 0.06528215, "auxiliary_loss_mlp": 0.01280538, "balance_loss_clip": 0.06280246, "balance_loss_mlp": 0.01258318, "epoch": 0.200030061626334, "flos": 20849460986880.0, "grad_norm": 1.961794095950944, "language_loss": 0.72612393, "learning_rate": 3.7043181017810476e-06, "loss": 0.80421144, "num_input_tokens_seen": 71881845, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.2220459, "step": 3327, "time_per_iteration": 2.557854175567627 }, { "auxiliary_loss_clip": 0.06543375, "auxiliary_loss_mlp": 0.0128077, "balance_loss_clip": 0.06284501, "balance_loss_mlp": 0.01254448, "epoch": 0.20009018487900196, "flos": 23768341966080.0, "grad_norm": 2.8932454092702216, "language_loss": 0.77654725, "learning_rate": 3.7041142708456833e-06, "loss": 0.85478872, "num_input_tokens_seen": 71900940, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.26293945, "step": 3328, "time_per_iteration": 2.6155385971069336 }, { "auxiliary_loss_clip": 0.06531516, "auxiliary_loss_mlp": 0.01276289, "balance_loss_clip": 0.06283394, "balance_loss_mlp": 0.01254009, "epoch": 0.20015030813166992, "flos": 28119186495360.0, "grad_norm": 1.8379145900945422, "language_loss": 0.69638389, "learning_rate": 3.7039103752901353e-06, "loss": 0.77446187, "num_input_tokens_seen": 71921925, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.22277832, "step": 3329, "time_per_iteration": 2.6272335052490234 }, { "auxiliary_loss_clip": 0.06543361, "auxiliary_loss_mlp": 0.01278312, "balance_loss_clip": 0.06285229, "balance_loss_mlp": 0.01253075, "epoch": 0.2002104313843379, "flos": 26074250801280.0, "grad_norm": 2.421437924103047, "language_loss": 0.81933641, "learning_rate": 3.7037064151221353e-06, "loss": 0.89755321, "num_input_tokens_seen": 71941855, "router_z_loss_clip": 2.57617188, "router_z_loss_mlp": 0.25231934, "step": 3330, "time_per_iteration": 2.672532320022583 }, { "auxiliary_loss_clip": 0.06541544, "auxiliary_loss_mlp": 0.01284279, "balance_loss_clip": 0.06286322, "balance_loss_mlp": 0.01259913, "epoch": 0.20027055463700585, "flos": 22973332826880.0, "grad_norm": 2.130511469111863, "language_loss": 0.77577639, "learning_rate": 3.703502390349417e-06, "loss": 0.85403466, "num_input_tokens_seen": 71960915, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.2434082, "step": 3331, "time_per_iteration": 2.5816497802734375 }, { "auxiliary_loss_clip": 0.06548783, "auxiliary_loss_mlp": 0.01288144, "balance_loss_clip": 0.06287965, "balance_loss_mlp": 0.012628, "epoch": 0.20033067788967382, "flos": 17171433216000.0, "grad_norm": 1.7734661356011587, "language_loss": 0.79778361, "learning_rate": 3.7032983009797176e-06, "loss": 0.87615287, "num_input_tokens_seen": 71979220, "router_z_loss_clip": 2.60742188, "router_z_loss_mlp": 0.25354004, "step": 3332, "time_per_iteration": 2.5205888748168945 }, { "auxiliary_loss_clip": 0.06444538, "auxiliary_loss_mlp": 0.0128403, "balance_loss_clip": 0.0630665, "balance_loss_mlp": 0.0127807, "epoch": 0.2003908011423418, "flos": 60842476085760.0, "grad_norm": 0.9409658941301591, "language_loss": 0.61884892, "learning_rate": 3.703094147020776e-06, "loss": 0.69613457, "num_input_tokens_seen": 72033950, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.05957031, "step": 3333, "time_per_iteration": 4.423895835876465 }, { "auxiliary_loss_clip": 0.06542687, "auxiliary_loss_mlp": 0.0127726, "balance_loss_clip": 0.06284995, "balance_loss_mlp": 0.01253204, "epoch": 0.20045092439500978, "flos": 24212987256960.0, "grad_norm": 1.9128198647413563, "language_loss": 0.81905007, "learning_rate": 3.7028899284803334e-06, "loss": 0.89724958, "num_input_tokens_seen": 72051395, "router_z_loss_clip": 2.57226562, "router_z_loss_mlp": 0.24060059, "step": 3334, "time_per_iteration": 2.559555768966675 }, { "auxiliary_loss_clip": 0.06543221, "auxiliary_loss_mlp": 0.01285201, "balance_loss_clip": 0.06281564, "balance_loss_mlp": 0.01258427, "epoch": 0.20051104764767774, "flos": 29395290251520.0, "grad_norm": 1.9606918543148326, "language_loss": 0.75061214, "learning_rate": 3.702685645366134e-06, "loss": 0.82889634, "num_input_tokens_seen": 72071305, "router_z_loss_clip": 2.61523438, "router_z_loss_mlp": 0.2677002, "step": 3335, "time_per_iteration": 2.6115832328796387 }, { "auxiliary_loss_clip": 0.06552087, "auxiliary_loss_mlp": 0.01283861, "balance_loss_clip": 0.06294526, "balance_loss_mlp": 0.01259161, "epoch": 0.2005711709003457, "flos": 23520575842560.0, "grad_norm": 1.8731532223258405, "language_loss": 0.80488116, "learning_rate": 3.7024812976859243e-06, "loss": 0.88324064, "num_input_tokens_seen": 72090165, "router_z_loss_clip": 2.57617188, "router_z_loss_mlp": 0.24707031, "step": 3336, "time_per_iteration": 4.039359092712402 }, { "auxiliary_loss_clip": 0.06549684, "auxiliary_loss_mlp": 0.01284282, "balance_loss_clip": 0.06286731, "balance_loss_mlp": 0.01257114, "epoch": 0.20063129415301367, "flos": 22529106806400.0, "grad_norm": 2.7722257123004956, "language_loss": 0.78272879, "learning_rate": 3.7022768854474532e-06, "loss": 0.86106849, "num_input_tokens_seen": 72107210, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.27160645, "step": 3337, "time_per_iteration": 2.614119291305542 }, { "auxiliary_loss_clip": 0.06548899, "auxiliary_loss_mlp": 0.01285879, "balance_loss_clip": 0.06289279, "balance_loss_mlp": 0.01260237, "epoch": 0.20069141740568164, "flos": 25965405947520.0, "grad_norm": 2.267186333970112, "language_loss": 0.69526899, "learning_rate": 3.7020724086584724e-06, "loss": 0.77361679, "num_input_tokens_seen": 72126315, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.25646973, "step": 3338, "time_per_iteration": 2.630255699157715 }, { "auxiliary_loss_clip": 0.06561659, "auxiliary_loss_mlp": 0.01287498, "balance_loss_clip": 0.0630127, "balance_loss_mlp": 0.01263262, "epoch": 0.2007515406583496, "flos": 24797560066560.0, "grad_norm": 2.833422553400173, "language_loss": 0.70040399, "learning_rate": 3.701867867326735e-06, "loss": 0.77889556, "num_input_tokens_seen": 72146470, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.24243164, "step": 3339, "time_per_iteration": 2.5812883377075195 }, { "auxiliary_loss_clip": 0.0655989, "auxiliary_loss_mlp": 0.01298145, "balance_loss_clip": 0.06296228, "balance_loss_mlp": 0.01274684, "epoch": 0.2008116639110176, "flos": 37934746606080.0, "grad_norm": 1.9743873068382711, "language_loss": 0.6772626, "learning_rate": 3.7016632614599974e-06, "loss": 0.75584292, "num_input_tokens_seen": 72166600, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.23474121, "step": 3340, "time_per_iteration": 2.7652370929718018 }, { "auxiliary_loss_clip": 0.06557875, "auxiliary_loss_mlp": 0.01294972, "balance_loss_clip": 0.06296323, "balance_loss_mlp": 0.0126871, "epoch": 0.20087178716368556, "flos": 20746779408000.0, "grad_norm": 2.8690688690574015, "language_loss": 0.74910349, "learning_rate": 3.701458591066019e-06, "loss": 0.82763195, "num_input_tokens_seen": 72185160, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.26269531, "step": 3341, "time_per_iteration": 4.167885780334473 }, { "auxiliary_loss_clip": 0.06546151, "auxiliary_loss_mlp": 0.01285844, "balance_loss_clip": 0.06295016, "balance_loss_mlp": 0.01261979, "epoch": 0.20093191041635353, "flos": 23849122901760.0, "grad_norm": 1.9523225616119915, "language_loss": 0.72729945, "learning_rate": 3.70125385615256e-06, "loss": 0.80561936, "num_input_tokens_seen": 72205160, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.23864746, "step": 3342, "time_per_iteration": 2.6121041774749756 }, { "auxiliary_loss_clip": 0.06551827, "auxiliary_loss_mlp": 0.01294653, "balance_loss_clip": 0.06293394, "balance_loss_mlp": 0.01270894, "epoch": 0.2009920336690215, "flos": 21797395027200.0, "grad_norm": 1.9728970458233512, "language_loss": 0.73149621, "learning_rate": 3.701049056727384e-06, "loss": 0.80996096, "num_input_tokens_seen": 72223555, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.23754883, "step": 3343, "time_per_iteration": 2.560781717300415 }, { "auxiliary_loss_clip": 0.06551284, "auxiliary_loss_mlp": 0.01301849, "balance_loss_clip": 0.0629483, "balance_loss_mlp": 0.01277292, "epoch": 0.20105215692168946, "flos": 26366390461440.0, "grad_norm": 1.9614877351433682, "language_loss": 0.81433034, "learning_rate": 3.7008441927982574e-06, "loss": 0.89286172, "num_input_tokens_seen": 72242465, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.2454834, "step": 3344, "time_per_iteration": 4.052962303161621 }, { "auxiliary_loss_clip": 0.06560262, "auxiliary_loss_mlp": 0.01295797, "balance_loss_clip": 0.06299821, "balance_loss_mlp": 0.0127217, "epoch": 0.20111228017435742, "flos": 18813288044160.0, "grad_norm": 2.277800974988114, "language_loss": 0.84231508, "learning_rate": 3.700639264372948e-06, "loss": 0.92087567, "num_input_tokens_seen": 72260655, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.23620605, "step": 3345, "time_per_iteration": 2.5503973960876465 }, { "auxiliary_loss_clip": 0.06549319, "auxiliary_loss_mlp": 0.01293098, "balance_loss_clip": 0.06303412, "balance_loss_mlp": 0.0127189, "epoch": 0.20117240342702541, "flos": 19981301633280.0, "grad_norm": 1.7673939393800822, "language_loss": 0.68653655, "learning_rate": 3.7004342714592283e-06, "loss": 0.76496071, "num_input_tokens_seen": 72279055, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.21203613, "step": 3346, "time_per_iteration": 2.545811176300049 }, { "auxiliary_loss_clip": 0.06545946, "auxiliary_loss_mlp": 0.0128354, "balance_loss_clip": 0.06293368, "balance_loss_mlp": 0.01260377, "epoch": 0.20123252667969338, "flos": 23148368006400.0, "grad_norm": 3.1257724810798098, "language_loss": 0.74904531, "learning_rate": 3.70022921406487e-06, "loss": 0.82734025, "num_input_tokens_seen": 72297895, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.23156738, "step": 3347, "time_per_iteration": 2.5610811710357666 }, { "auxiliary_loss_clip": 0.06554425, "auxiliary_loss_mlp": 0.0129231, "balance_loss_clip": 0.06303786, "balance_loss_mlp": 0.01270066, "epoch": 0.20129264993236134, "flos": 23228352328320.0, "grad_norm": 1.687860227265696, "language_loss": 0.86904895, "learning_rate": 3.70002409219765e-06, "loss": 0.94751632, "num_input_tokens_seen": 72318385, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.22229004, "step": 3348, "time_per_iteration": 2.5674242973327637 }, { "auxiliary_loss_clip": 0.06550759, "auxiliary_loss_mlp": 0.01288852, "balance_loss_clip": 0.06299552, "balance_loss_mlp": 0.01264629, "epoch": 0.2013527731850293, "flos": 21877882473600.0, "grad_norm": 1.7950408310285537, "language_loss": 0.7167598, "learning_rate": 3.699818905865346e-06, "loss": 0.79515594, "num_input_tokens_seen": 72338235, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.24255371, "step": 3349, "time_per_iteration": 2.5642638206481934 }, { "auxiliary_loss_clip": 0.06554388, "auxiliary_loss_mlp": 0.01292117, "balance_loss_clip": 0.06304125, "balance_loss_mlp": 0.01267417, "epoch": 0.20141289643769728, "flos": 18046636312320.0, "grad_norm": 1.670023645318429, "language_loss": 0.72256708, "learning_rate": 3.6996136550757377e-06, "loss": 0.80103213, "num_input_tokens_seen": 72357825, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.24707031, "step": 3350, "time_per_iteration": 2.540297269821167 }, { "auxiliary_loss_clip": 0.06554948, "auxiliary_loss_mlp": 0.01286711, "balance_loss_clip": 0.06300614, "balance_loss_mlp": 0.01261558, "epoch": 0.20147301969036524, "flos": 23958219317760.0, "grad_norm": 2.9269735406317197, "language_loss": 0.77235317, "learning_rate": 3.69940833983661e-06, "loss": 0.85076976, "num_input_tokens_seen": 72376335, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.25170898, "step": 3351, "time_per_iteration": 2.5650479793548584 }, { "auxiliary_loss_clip": 0.06563818, "auxiliary_loss_mlp": 0.01286277, "balance_loss_clip": 0.06303009, "balance_loss_mlp": 0.01259026, "epoch": 0.2015331429430332, "flos": 25594749411840.0, "grad_norm": 1.7743915329749453, "language_loss": 0.81405985, "learning_rate": 3.699202960155748e-06, "loss": 0.89256078, "num_input_tokens_seen": 72395440, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.27258301, "step": 3352, "time_per_iteration": 2.6051418781280518 }, { "auxiliary_loss_clip": 0.06554475, "auxiliary_loss_mlp": 0.0128694, "balance_loss_clip": 0.06301582, "balance_loss_mlp": 0.01261751, "epoch": 0.2015932661957012, "flos": 26732351168640.0, "grad_norm": 1.8810397065541018, "language_loss": 0.81422096, "learning_rate": 3.6989975160409396e-06, "loss": 0.89263511, "num_input_tokens_seen": 72414670, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.25195312, "step": 3353, "time_per_iteration": 2.608476161956787 }, { "auxiliary_loss_clip": 0.06548535, "auxiliary_loss_mlp": 0.01285698, "balance_loss_clip": 0.06299379, "balance_loss_mlp": 0.01261796, "epoch": 0.20165338944836916, "flos": 15638632876800.0, "grad_norm": 2.0571373665661183, "language_loss": 0.90624583, "learning_rate": 3.6987920074999747e-06, "loss": 0.98458815, "num_input_tokens_seen": 72432210, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.23925781, "step": 3354, "time_per_iteration": 2.530097484588623 }, { "auxiliary_loss_clip": 0.06484029, "auxiliary_loss_mlp": 0.01264365, "balance_loss_clip": 0.06344289, "balance_loss_mlp": 0.01259128, "epoch": 0.20171351270103713, "flos": 57929926089600.0, "grad_norm": 0.8579533056844582, "language_loss": 0.55768967, "learning_rate": 3.6985864345406465e-06, "loss": 0.63517356, "num_input_tokens_seen": 72489225, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.05239868, "step": 3355, "time_per_iteration": 3.1349196434020996 }, { "auxiliary_loss_clip": 0.06549795, "auxiliary_loss_mlp": 0.01281323, "balance_loss_clip": 0.06301619, "balance_loss_mlp": 0.01257695, "epoch": 0.2017736359537051, "flos": 20820768163200.0, "grad_norm": 1.6594424858982209, "language_loss": 0.84805632, "learning_rate": 3.698380797170751e-06, "loss": 0.92636752, "num_input_tokens_seen": 72508715, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.23596191, "step": 3356, "time_per_iteration": 2.554287910461426 }, { "auxiliary_loss_clip": 0.06566911, "auxiliary_loss_mlp": 0.01288436, "balance_loss_clip": 0.06299387, "balance_loss_mlp": 0.01260016, "epoch": 0.20183375920637306, "flos": 17097696023040.0, "grad_norm": 3.130109958727261, "language_loss": 0.70483565, "learning_rate": 3.698175095398085e-06, "loss": 0.78338909, "num_input_tokens_seen": 72525135, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.28369141, "step": 3357, "time_per_iteration": 2.5375664234161377 }, { "auxiliary_loss_clip": 0.06556909, "auxiliary_loss_mlp": 0.0129265, "balance_loss_clip": 0.06298172, "balance_loss_mlp": 0.01267271, "epoch": 0.20189388245904102, "flos": 18667323031680.0, "grad_norm": 6.79077801228721, "language_loss": 0.72790712, "learning_rate": 3.6979693292304493e-06, "loss": 0.80640268, "num_input_tokens_seen": 72543690, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.25402832, "step": 3358, "time_per_iteration": 2.5478920936584473 }, { "auxiliary_loss_clip": 0.06545958, "auxiliary_loss_mlp": 0.0128975, "balance_loss_clip": 0.06300101, "balance_loss_mlp": 0.01268197, "epoch": 0.20195400571170902, "flos": 16802705324160.0, "grad_norm": 1.9392205884876499, "language_loss": 0.83996737, "learning_rate": 3.6977634986756463e-06, "loss": 0.91832447, "num_input_tokens_seen": 72560725, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.21557617, "step": 3359, "time_per_iteration": 2.5332376956939697 }, { "auxiliary_loss_clip": 0.06485064, "auxiliary_loss_mlp": 0.01277598, "balance_loss_clip": 0.06344491, "balance_loss_mlp": 0.01272063, "epoch": 0.20201412896437698, "flos": 67192792669440.0, "grad_norm": 0.7490276539740686, "language_loss": 0.58817172, "learning_rate": 3.697557603741482e-06, "loss": 0.66579843, "num_input_tokens_seen": 72621940, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.0553894, "step": 3360, "time_per_iteration": 3.150702714920044 }, { "auxiliary_loss_clip": 0.06556085, "auxiliary_loss_mlp": 0.01292707, "balance_loss_clip": 0.06301846, "balance_loss_mlp": 0.01268197, "epoch": 0.20207425221704495, "flos": 21331477998720.0, "grad_norm": 2.851777207006819, "language_loss": 0.63982779, "learning_rate": 3.697351644435763e-06, "loss": 0.71831572, "num_input_tokens_seen": 72639135, "router_z_loss_clip": 2.54296875, "router_z_loss_mlp": 0.24511719, "step": 3361, "time_per_iteration": 2.5597574710845947 }, { "auxiliary_loss_clip": 0.06549235, "auxiliary_loss_mlp": 0.01289056, "balance_loss_clip": 0.06298261, "balance_loss_mlp": 0.01264689, "epoch": 0.2021343754697129, "flos": 22533509145600.0, "grad_norm": 1.9548630870875214, "language_loss": 0.76804602, "learning_rate": 3.6971456207662993e-06, "loss": 0.84642887, "num_input_tokens_seen": 72658525, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.24401855, "step": 3362, "time_per_iteration": 2.5859062671661377 }, { "auxiliary_loss_clip": 0.06550001, "auxiliary_loss_mlp": 0.01300447, "balance_loss_clip": 0.06298663, "balance_loss_mlp": 0.01274924, "epoch": 0.20219449872238088, "flos": 19068852597120.0, "grad_norm": 1.6984654031253703, "language_loss": 0.77644134, "learning_rate": 3.6969395327409035e-06, "loss": 0.85494584, "num_input_tokens_seen": 72678085, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.25537109, "step": 3363, "time_per_iteration": 2.544537305831909 }, { "auxiliary_loss_clip": 0.06546043, "auxiliary_loss_mlp": 0.01297805, "balance_loss_clip": 0.06294623, "balance_loss_mlp": 0.01275215, "epoch": 0.20225462197504884, "flos": 24723864800640.0, "grad_norm": 1.9277119426780536, "language_loss": 0.75584018, "learning_rate": 3.696733380367391e-06, "loss": 0.8342787, "num_input_tokens_seen": 72698695, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.22595215, "step": 3364, "time_per_iteration": 2.605602264404297 }, { "auxiliary_loss_clip": 0.06557161, "auxiliary_loss_mlp": 0.01288214, "balance_loss_clip": 0.06298654, "balance_loss_mlp": 0.01263276, "epoch": 0.2023147452277168, "flos": 22024895662080.0, "grad_norm": 14.090795849451979, "language_loss": 0.72528273, "learning_rate": 3.6965271636535783e-06, "loss": 0.80373645, "num_input_tokens_seen": 72717880, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.24963379, "step": 3365, "time_per_iteration": 2.5703792572021484 }, { "auxiliary_loss_clip": 0.06546411, "auxiliary_loss_mlp": 0.01297953, "balance_loss_clip": 0.06293598, "balance_loss_mlp": 0.01275554, "epoch": 0.2023748684803848, "flos": 17750555510400.0, "grad_norm": 2.214175589054578, "language_loss": 0.86091864, "learning_rate": 3.696320882607286e-06, "loss": 0.93936223, "num_input_tokens_seen": 72736410, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.22399902, "step": 3366, "time_per_iteration": 2.5267117023468018 }, { "auxiliary_loss_clip": 0.06547984, "auxiliary_loss_mlp": 0.01302005, "balance_loss_clip": 0.06298558, "balance_loss_mlp": 0.01279486, "epoch": 0.20243499173305277, "flos": 31146912328320.0, "grad_norm": 1.6427589214578961, "language_loss": 0.69939303, "learning_rate": 3.696114537236335e-06, "loss": 0.77789295, "num_input_tokens_seen": 72758295, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.2253418, "step": 3367, "time_per_iteration": 2.6167819499969482 }, { "auxiliary_loss_clip": 0.06557535, "auxiliary_loss_mlp": 0.01292066, "balance_loss_clip": 0.06296498, "balance_loss_mlp": 0.01264123, "epoch": 0.20249511498572073, "flos": 33847726256640.0, "grad_norm": 2.86808361963215, "language_loss": 0.68628454, "learning_rate": 3.6959081275485512e-06, "loss": 0.76478052, "num_input_tokens_seen": 72782495, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.2791748, "step": 3368, "time_per_iteration": 2.659282684326172 }, { "auxiliary_loss_clip": 0.06548496, "auxiliary_loss_mlp": 0.0130091, "balance_loss_clip": 0.06301886, "balance_loss_mlp": 0.01276472, "epoch": 0.2025552382383887, "flos": 21222088093440.0, "grad_norm": 1.6802226950099752, "language_loss": 0.78203523, "learning_rate": 3.6957016535517615e-06, "loss": 0.8605293, "num_input_tokens_seen": 72801885, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.24450684, "step": 3369, "time_per_iteration": 2.563291072845459 }, { "auxiliary_loss_clip": 0.06553406, "auxiliary_loss_mlp": 0.01288973, "balance_loss_clip": 0.06294736, "balance_loss_mlp": 0.01263963, "epoch": 0.20261536149105666, "flos": 14652614355840.0, "grad_norm": 5.579760963727243, "language_loss": 0.66770107, "learning_rate": 3.695495115253795e-06, "loss": 0.74612486, "num_input_tokens_seen": 72816990, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.25024414, "step": 3370, "time_per_iteration": 2.528045415878296 }, { "auxiliary_loss_clip": 0.06469151, "auxiliary_loss_mlp": 0.01254518, "balance_loss_clip": 0.06329104, "balance_loss_mlp": 0.01248641, "epoch": 0.20267548474372463, "flos": 66803380018560.0, "grad_norm": 0.6576609890842938, "language_loss": 0.58217156, "learning_rate": 3.6952885126624834e-06, "loss": 0.65940833, "num_input_tokens_seen": 72879240, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.05871582, "step": 3371, "time_per_iteration": 3.255983591079712 }, { "auxiliary_loss_clip": 0.06555347, "auxiliary_loss_mlp": 0.01288042, "balance_loss_clip": 0.06305443, "balance_loss_mlp": 0.01264081, "epoch": 0.2027356079963926, "flos": 24687667036800.0, "grad_norm": 2.060756962980693, "language_loss": 0.92380726, "learning_rate": 3.6950818457856617e-06, "loss": 1.00224113, "num_input_tokens_seen": 72899030, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.23925781, "step": 3372, "time_per_iteration": 4.029707670211792 }, { "auxiliary_loss_clip": 0.0654928, "auxiliary_loss_mlp": 0.01281675, "balance_loss_clip": 0.06295384, "balance_loss_mlp": 0.01257725, "epoch": 0.20279573124906058, "flos": 26399443697280.0, "grad_norm": 1.6594759931805825, "language_loss": 0.79290783, "learning_rate": 3.694875114631167e-06, "loss": 0.87121737, "num_input_tokens_seen": 72919190, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.23950195, "step": 3373, "time_per_iteration": 2.6233928203582764 }, { "auxiliary_loss_clip": 0.06547202, "auxiliary_loss_mlp": 0.0128432, "balance_loss_clip": 0.06303866, "balance_loss_mlp": 0.01261635, "epoch": 0.20285585450172855, "flos": 33808006621440.0, "grad_norm": 4.945825340852858, "language_loss": 0.72056484, "learning_rate": 3.6946683192068377e-06, "loss": 0.79887998, "num_input_tokens_seen": 72939720, "router_z_loss_clip": 2.43359375, "router_z_loss_mlp": 0.22692871, "step": 3374, "time_per_iteration": 2.7081761360168457 }, { "auxiliary_loss_clip": 0.0645705, "auxiliary_loss_mlp": 0.01263564, "balance_loss_clip": 0.06316431, "balance_loss_mlp": 0.01257239, "epoch": 0.20291597775439651, "flos": 71185768410240.0, "grad_norm": 0.9962268326262281, "language_loss": 0.6246053, "learning_rate": 3.694461459520516e-06, "loss": 0.70181137, "num_input_tokens_seen": 73000015, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.06329346, "step": 3375, "time_per_iteration": 4.587355136871338 }, { "auxiliary_loss_clip": 0.06548385, "auxiliary_loss_mlp": 0.01283382, "balance_loss_clip": 0.06299493, "balance_loss_mlp": 0.01260196, "epoch": 0.20297610100706448, "flos": 19499368475520.0, "grad_norm": 1.5261813702957334, "language_loss": 0.82997835, "learning_rate": 3.6942545355800463e-06, "loss": 0.90829611, "num_input_tokens_seen": 73017675, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.23205566, "step": 3376, "time_per_iteration": 2.5516600608825684 }, { "auxiliary_loss_clip": 0.0654941, "auxiliary_loss_mlp": 0.01284986, "balance_loss_clip": 0.06295072, "balance_loss_mlp": 0.01260596, "epoch": 0.20303622425973245, "flos": 25050944413440.0, "grad_norm": 2.196967433197991, "language_loss": 0.82076085, "learning_rate": 3.6940475473932743e-06, "loss": 0.89910483, "num_input_tokens_seen": 73036135, "router_z_loss_clip": 2.54296875, "router_z_loss_mlp": 0.24414062, "step": 3377, "time_per_iteration": 2.579921007156372 }, { "auxiliary_loss_clip": 0.06552227, "auxiliary_loss_mlp": 0.01280027, "balance_loss_clip": 0.06300913, "balance_loss_mlp": 0.01255649, "epoch": 0.2030963475124004, "flos": 21986266129920.0, "grad_norm": 1.877898363544366, "language_loss": 0.77198023, "learning_rate": 3.69384049496805e-06, "loss": 0.85030282, "num_input_tokens_seen": 73054075, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.24365234, "step": 3378, "time_per_iteration": 2.557244300842285 }, { "auxiliary_loss_clip": 0.0655366, "auxiliary_loss_mlp": 0.01281413, "balance_loss_clip": 0.0629673, "balance_loss_mlp": 0.01257523, "epoch": 0.2031564707650684, "flos": 19506496072320.0, "grad_norm": 1.8214767558491456, "language_loss": 0.80475819, "learning_rate": 3.6936333783122242e-06, "loss": 0.88310891, "num_input_tokens_seen": 73073530, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.2388916, "step": 3379, "time_per_iteration": 2.5441315174102783 }, { "auxiliary_loss_clip": 0.06543382, "auxiliary_loss_mlp": 0.01282298, "balance_loss_clip": 0.06297196, "balance_loss_mlp": 0.01259612, "epoch": 0.20321659401773637, "flos": 22753630621440.0, "grad_norm": 1.6885171823539964, "language_loss": 0.87602121, "learning_rate": 3.6934261974336505e-06, "loss": 0.95427799, "num_input_tokens_seen": 73092820, "router_z_loss_clip": 2.46289062, "router_z_loss_mlp": 0.22692871, "step": 3380, "time_per_iteration": 4.050171852111816 }, { "auxiliary_loss_clip": 0.06553744, "auxiliary_loss_mlp": 0.01289174, "balance_loss_clip": 0.063003, "balance_loss_mlp": 0.01265594, "epoch": 0.20327671727040433, "flos": 22462455283200.0, "grad_norm": 3.8973524301397022, "language_loss": 0.75130069, "learning_rate": 3.693218952340186e-06, "loss": 0.82972991, "num_input_tokens_seen": 73113385, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.23583984, "step": 3381, "time_per_iteration": 2.6002700328826904 }, { "auxiliary_loss_clip": 0.06559919, "auxiliary_loss_mlp": 0.01278218, "balance_loss_clip": 0.0630312, "balance_loss_mlp": 0.01254531, "epoch": 0.2033368405230723, "flos": 19540807119360.0, "grad_norm": 1.8837362531198683, "language_loss": 0.79947293, "learning_rate": 3.6930116430396895e-06, "loss": 0.87785435, "num_input_tokens_seen": 73131195, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.23657227, "step": 3382, "time_per_iteration": 2.5569581985473633 }, { "auxiliary_loss_clip": 0.06553004, "auxiliary_loss_mlp": 0.0128173, "balance_loss_clip": 0.06299374, "balance_loss_mlp": 0.01257912, "epoch": 0.20339696377574026, "flos": 13814489491200.0, "grad_norm": 1.86509055256373, "language_loss": 0.79946834, "learning_rate": 3.6928042695400214e-06, "loss": 0.87781572, "num_input_tokens_seen": 73148850, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.23828125, "step": 3383, "time_per_iteration": 3.971724510192871 }, { "auxiliary_loss_clip": 0.06548432, "auxiliary_loss_mlp": 0.01280804, "balance_loss_clip": 0.06298023, "balance_loss_mlp": 0.01257272, "epoch": 0.20345708702840823, "flos": 20345627185920.0, "grad_norm": 1.9180349492492215, "language_loss": 0.75092995, "learning_rate": 3.6925968318490464e-06, "loss": 0.82922232, "num_input_tokens_seen": 73166775, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.23547363, "step": 3384, "time_per_iteration": 2.6559412479400635 }, { "auxiliary_loss_clip": 0.06565895, "auxiliary_loss_mlp": 0.01286013, "balance_loss_clip": 0.06301379, "balance_loss_mlp": 0.01259632, "epoch": 0.2035172102810762, "flos": 20339254275840.0, "grad_norm": 2.3026224212420043, "language_loss": 0.77606118, "learning_rate": 3.6923893299746293e-06, "loss": 0.85458022, "num_input_tokens_seen": 73183215, "router_z_loss_clip": 2.6484375, "router_z_loss_mlp": 0.26391602, "step": 3385, "time_per_iteration": 2.56022047996521 }, { "auxiliary_loss_clip": 0.06547818, "auxiliary_loss_mlp": 0.01282475, "balance_loss_clip": 0.06297722, "balance_loss_mlp": 0.01258741, "epoch": 0.2035773335337442, "flos": 23337658379520.0, "grad_norm": 3.3169083805243185, "language_loss": 0.69996172, "learning_rate": 3.692181763924639e-06, "loss": 0.77826464, "num_input_tokens_seen": 73203290, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.23742676, "step": 3386, "time_per_iteration": 2.616325616836548 }, { "auxiliary_loss_clip": 0.06551208, "auxiliary_loss_mlp": 0.0128493, "balance_loss_clip": 0.06297576, "balance_loss_mlp": 0.012611, "epoch": 0.20363745678641215, "flos": 28337924378880.0, "grad_norm": 1.5149188962610458, "language_loss": 0.81640327, "learning_rate": 3.691974133706947e-06, "loss": 0.89476466, "num_input_tokens_seen": 73226185, "router_z_loss_clip": 2.53320312, "router_z_loss_mlp": 0.23840332, "step": 3387, "time_per_iteration": 2.638115882873535 }, { "auxiliary_loss_clip": 0.06544527, "auxiliary_loss_mlp": 0.01280411, "balance_loss_clip": 0.06300186, "balance_loss_mlp": 0.01257344, "epoch": 0.20369758003908012, "flos": 18921503992320.0, "grad_norm": 2.165077171724873, "language_loss": 0.80114323, "learning_rate": 3.6917664393294262e-06, "loss": 0.87939256, "num_input_tokens_seen": 73243300, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.23071289, "step": 3388, "time_per_iteration": 2.6636621952056885 }, { "auxiliary_loss_clip": 0.06550785, "auxiliary_loss_mlp": 0.01278973, "balance_loss_clip": 0.0629735, "balance_loss_mlp": 0.01256275, "epoch": 0.20375770329174808, "flos": 19212218133120.0, "grad_norm": 2.281007905665875, "language_loss": 0.72756004, "learning_rate": 3.6915586807999527e-06, "loss": 0.80585766, "num_input_tokens_seen": 73261490, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.22705078, "step": 3389, "time_per_iteration": 2.6036386489868164 }, { "auxiliary_loss_clip": 0.06544512, "auxiliary_loss_mlp": 0.012821, "balance_loss_clip": 0.06296689, "balance_loss_mlp": 0.01259164, "epoch": 0.20381782654441605, "flos": 19397106167040.0, "grad_norm": 2.0654103956012224, "language_loss": 0.87598002, "learning_rate": 3.691350858126404e-06, "loss": 0.95424616, "num_input_tokens_seen": 73280180, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.22937012, "step": 3390, "time_per_iteration": 2.5533668994903564 }, { "auxiliary_loss_clip": 0.06546026, "auxiliary_loss_mlp": 0.01281922, "balance_loss_clip": 0.06299426, "balance_loss_mlp": 0.01259951, "epoch": 0.203877949797084, "flos": 24834764079360.0, "grad_norm": 2.16877904698164, "language_loss": 0.71492898, "learning_rate": 3.691142971316662e-06, "loss": 0.79320842, "num_input_tokens_seen": 73300680, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.21972656, "step": 3391, "time_per_iteration": 2.590965986251831 }, { "auxiliary_loss_clip": 0.06543094, "auxiliary_loss_mlp": 0.01283367, "balance_loss_clip": 0.06294166, "balance_loss_mlp": 0.01260527, "epoch": 0.20393807304975198, "flos": 18009432299520.0, "grad_norm": 2.3952255686295842, "language_loss": 0.87472314, "learning_rate": 3.6909350203786086e-06, "loss": 0.95298779, "num_input_tokens_seen": 73316760, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.2286377, "step": 3392, "time_per_iteration": 2.5082573890686035 }, { "auxiliary_loss_clip": 0.06546731, "auxiliary_loss_mlp": 0.01286176, "balance_loss_clip": 0.06296757, "balance_loss_mlp": 0.01263157, "epoch": 0.20399819630241997, "flos": 24213867724800.0, "grad_norm": 3.2265564594260554, "language_loss": 0.81504601, "learning_rate": 3.69072700532013e-06, "loss": 0.8933751, "num_input_tokens_seen": 73339385, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.23046875, "step": 3393, "time_per_iteration": 2.578660488128662 }, { "auxiliary_loss_clip": 0.06546678, "auxiliary_loss_mlp": 0.01276285, "balance_loss_clip": 0.06300989, "balance_loss_mlp": 0.01255459, "epoch": 0.20405831955508794, "flos": 20783396442240.0, "grad_norm": 3.911040439926595, "language_loss": 0.8716296, "learning_rate": 3.6905189261491137e-06, "loss": 0.94985914, "num_input_tokens_seen": 73357235, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.20812988, "step": 3394, "time_per_iteration": 2.5556252002716064 }, { "auxiliary_loss_clip": 0.06541936, "auxiliary_loss_mlp": 0.01286597, "balance_loss_clip": 0.06294245, "balance_loss_mlp": 0.01264889, "epoch": 0.2041184428077559, "flos": 15492332448000.0, "grad_norm": 2.7136191001759142, "language_loss": 0.84219456, "learning_rate": 3.69031078287345e-06, "loss": 0.92047983, "num_input_tokens_seen": 73374435, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.21716309, "step": 3395, "time_per_iteration": 2.541857957839966 }, { "auxiliary_loss_clip": 0.06550741, "auxiliary_loss_mlp": 0.01282148, "balance_loss_clip": 0.06295301, "balance_loss_mlp": 0.01259975, "epoch": 0.20417856606042387, "flos": 15592582258560.0, "grad_norm": 2.2044652497889126, "language_loss": 0.84437549, "learning_rate": 3.690102575501033e-06, "loss": 0.92270446, "num_input_tokens_seen": 73391025, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.22180176, "step": 3396, "time_per_iteration": 2.548152446746826 }, { "auxiliary_loss_clip": 0.06543244, "auxiliary_loss_mlp": 0.01284306, "balance_loss_clip": 0.06296468, "balance_loss_mlp": 0.01262467, "epoch": 0.20423868931309183, "flos": 24286137471360.0, "grad_norm": 2.778269904044567, "language_loss": 0.78158611, "learning_rate": 3.6898943040397556e-06, "loss": 0.85986161, "num_input_tokens_seen": 73409270, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.21826172, "step": 3397, "time_per_iteration": 2.588963270187378 }, { "auxiliary_loss_clip": 0.06543544, "auxiliary_loss_mlp": 0.0128202, "balance_loss_clip": 0.06294782, "balance_loss_mlp": 0.01259633, "epoch": 0.2042988125657598, "flos": 18619176061440.0, "grad_norm": 2.752013685791773, "language_loss": 0.87852716, "learning_rate": 3.689685968497518e-06, "loss": 0.95678282, "num_input_tokens_seen": 73425225, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.22387695, "step": 3398, "time_per_iteration": 2.5203561782836914 }, { "auxiliary_loss_clip": 0.06547772, "auxiliary_loss_mlp": 0.01286656, "balance_loss_clip": 0.06295621, "balance_loss_mlp": 0.01263065, "epoch": 0.2043589358184278, "flos": 17855836565760.0, "grad_norm": 2.5519285616270846, "language_loss": 0.78312588, "learning_rate": 3.6894775688822186e-06, "loss": 0.86147016, "num_input_tokens_seen": 73440940, "router_z_loss_clip": 2.52148438, "router_z_loss_mlp": 0.23596191, "step": 3399, "time_per_iteration": 2.515766143798828 }, { "auxiliary_loss_clip": 0.06547771, "auxiliary_loss_mlp": 0.01284816, "balance_loss_clip": 0.06295463, "balance_loss_mlp": 0.01262476, "epoch": 0.20441905907109575, "flos": 21441832225920.0, "grad_norm": 1.9376940286546651, "language_loss": 0.76935637, "learning_rate": 3.6892691052017603e-06, "loss": 0.84768224, "num_input_tokens_seen": 73458805, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.22338867, "step": 3400, "time_per_iteration": 2.5573954582214355 }, { "auxiliary_loss_clip": 0.06540069, "auxiliary_loss_mlp": 0.01280259, "balance_loss_clip": 0.06295654, "balance_loss_mlp": 0.01260518, "epoch": 0.20447918232376372, "flos": 27714847818240.0, "grad_norm": 1.6746876901383871, "language_loss": 0.79878026, "learning_rate": 3.6890605774640487e-06, "loss": 0.87698352, "num_input_tokens_seen": 73479380, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.1973877, "step": 3401, "time_per_iteration": 2.657695770263672 }, { "auxiliary_loss_clip": 0.06549212, "auxiliary_loss_mlp": 0.0128134, "balance_loss_clip": 0.06299876, "balance_loss_mlp": 0.01260025, "epoch": 0.20453930557643168, "flos": 30533017789440.0, "grad_norm": 1.8890988260733144, "language_loss": 0.70499468, "learning_rate": 3.688851985676991e-06, "loss": 0.78330016, "num_input_tokens_seen": 73505105, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.21313477, "step": 3402, "time_per_iteration": 2.720113754272461 }, { "auxiliary_loss_clip": 0.06555802, "auxiliary_loss_mlp": 0.01277043, "balance_loss_clip": 0.06302933, "balance_loss_mlp": 0.01255681, "epoch": 0.20459942882909965, "flos": 18993480249600.0, "grad_norm": 2.5588409372144927, "language_loss": 0.81535459, "learning_rate": 3.688643329848496e-06, "loss": 0.89368308, "num_input_tokens_seen": 73523700, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.21362305, "step": 3403, "time_per_iteration": 2.5560994148254395 }, { "auxiliary_loss_clip": 0.06552072, "auxiliary_loss_mlp": 0.01282045, "balance_loss_clip": 0.06301625, "balance_loss_mlp": 0.01260575, "epoch": 0.20465955208176762, "flos": 20345207915520.0, "grad_norm": 2.6026783632519104, "language_loss": 0.84334052, "learning_rate": 3.6884346099864772e-06, "loss": 0.9216817, "num_input_tokens_seen": 73542625, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.21484375, "step": 3404, "time_per_iteration": 2.5645363330841064 }, { "auxiliary_loss_clip": 0.06560166, "auxiliary_loss_mlp": 0.01278721, "balance_loss_clip": 0.06306443, "balance_loss_mlp": 0.01257692, "epoch": 0.20471967533443558, "flos": 21257615024640.0, "grad_norm": 1.8796994754303746, "language_loss": 0.86569178, "learning_rate": 3.6882258260988487e-06, "loss": 0.94408071, "num_input_tokens_seen": 73561450, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.21044922, "step": 3405, "time_per_iteration": 2.5699119567871094 }, { "auxiliary_loss_clip": 0.06554329, "auxiliary_loss_mlp": 0.01278893, "balance_loss_clip": 0.0630542, "balance_loss_mlp": 0.01257459, "epoch": 0.20477979858710357, "flos": 14506775124480.0, "grad_norm": 2.65650478711063, "language_loss": 0.85197735, "learning_rate": 3.6880169781935276e-06, "loss": 0.93030953, "num_input_tokens_seen": 73577155, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.21435547, "step": 3406, "time_per_iteration": 2.534712314605713 }, { "auxiliary_loss_clip": 0.06552497, "auxiliary_loss_mlp": 0.01275795, "balance_loss_clip": 0.06303934, "balance_loss_mlp": 0.01254588, "epoch": 0.20483992183977154, "flos": 11405018609280.0, "grad_norm": 2.4763168653011536, "language_loss": 0.68550599, "learning_rate": 3.6878080662784336e-06, "loss": 0.76378888, "num_input_tokens_seen": 73594900, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.2121582, "step": 3407, "time_per_iteration": 2.5283241271972656 }, { "auxiliary_loss_clip": 0.0655725, "auxiliary_loss_mlp": 0.01281107, "balance_loss_clip": 0.06308676, "balance_loss_mlp": 0.01259614, "epoch": 0.2049000450924395, "flos": 19065917704320.0, "grad_norm": 3.5884732596157516, "language_loss": 0.86037904, "learning_rate": 3.6875990903614886e-06, "loss": 0.93876261, "num_input_tokens_seen": 73613810, "router_z_loss_clip": 2.49023438, "router_z_loss_mlp": 0.21508789, "step": 3408, "time_per_iteration": 2.5720458030700684 }, { "auxiliary_loss_clip": 0.06568567, "auxiliary_loss_mlp": 0.01282286, "balance_loss_clip": 0.0631337, "balance_loss_mlp": 0.01259398, "epoch": 0.20496016834510747, "flos": 14579799557760.0, "grad_norm": 2.539443481858464, "language_loss": 0.65152299, "learning_rate": 3.6873900504506166e-06, "loss": 0.73003149, "num_input_tokens_seen": 73631495, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.22888184, "step": 3409, "time_per_iteration": 2.5476460456848145 }, { "auxiliary_loss_clip": 0.0656409, "auxiliary_loss_mlp": 0.01280886, "balance_loss_clip": 0.06313652, "balance_loss_mlp": 0.0126006, "epoch": 0.20502029159777543, "flos": 22133069683200.0, "grad_norm": 1.6091138386851191, "language_loss": 0.81088603, "learning_rate": 3.687180946553745e-06, "loss": 0.88933575, "num_input_tokens_seen": 73652840, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.20825195, "step": 3410, "time_per_iteration": 2.579784631729126 }, { "auxiliary_loss_clip": 0.06563365, "auxiliary_loss_mlp": 0.01277362, "balance_loss_clip": 0.06314147, "balance_loss_mlp": 0.01256155, "epoch": 0.2050804148504434, "flos": 25373873249280.0, "grad_norm": 4.129422453463285, "language_loss": 0.76922286, "learning_rate": 3.686971778678803e-06, "loss": 0.84763014, "num_input_tokens_seen": 73672150, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.2121582, "step": 3411, "time_per_iteration": 3.9993667602539062 }, { "auxiliary_loss_clip": 0.06562399, "auxiliary_loss_mlp": 0.01276807, "balance_loss_clip": 0.06313695, "balance_loss_mlp": 0.01255575, "epoch": 0.2051405381031114, "flos": 23626443876480.0, "grad_norm": 2.3731120556358443, "language_loss": 0.74192643, "learning_rate": 3.686762546833722e-06, "loss": 0.82031846, "num_input_tokens_seen": 73691940, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.21240234, "step": 3412, "time_per_iteration": 2.5746471881866455 }, { "auxiliary_loss_clip": 0.06568457, "auxiliary_loss_mlp": 0.01279361, "balance_loss_clip": 0.06313829, "balance_loss_mlp": 0.01257713, "epoch": 0.20520066135577936, "flos": 19570338483840.0, "grad_norm": 4.4414081192041275, "language_loss": 0.78601575, "learning_rate": 3.6865532510264362e-06, "loss": 0.86449397, "num_input_tokens_seen": 73709080, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.2166748, "step": 3413, "time_per_iteration": 2.5464580059051514 }, { "auxiliary_loss_clip": 0.06556597, "auxiliary_loss_mlp": 0.01277046, "balance_loss_clip": 0.06314193, "balance_loss_mlp": 0.01256518, "epoch": 0.20526078460844732, "flos": 17682184978560.0, "grad_norm": 2.0543425262454913, "language_loss": 0.8516736, "learning_rate": 3.6863438912648823e-06, "loss": 0.93001002, "num_input_tokens_seen": 73727670, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.20532227, "step": 3414, "time_per_iteration": 2.5467796325683594 }, { "auxiliary_loss_clip": 0.06565066, "auxiliary_loss_mlp": 0.01277863, "balance_loss_clip": 0.06314845, "balance_loss_mlp": 0.01256882, "epoch": 0.2053209078611153, "flos": 21505632710400.0, "grad_norm": 1.9221706172002682, "language_loss": 0.81307048, "learning_rate": 3.6861344675569986e-06, "loss": 0.89149976, "num_input_tokens_seen": 73747170, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.20983887, "step": 3415, "time_per_iteration": 3.9736580848693848 }, { "auxiliary_loss_clip": 0.06561723, "auxiliary_loss_mlp": 0.01276686, "balance_loss_clip": 0.06314326, "balance_loss_mlp": 0.01256421, "epoch": 0.20538103111378325, "flos": 25670163686400.0, "grad_norm": 2.8717685171863243, "language_loss": 0.73421836, "learning_rate": 3.6859249799107275e-06, "loss": 0.8126024, "num_input_tokens_seen": 73767690, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.20263672, "step": 3416, "time_per_iteration": 2.6107587814331055 }, { "auxiliary_loss_clip": 0.06563064, "auxiliary_loss_mlp": 0.01278203, "balance_loss_clip": 0.06311429, "balance_loss_mlp": 0.01256435, "epoch": 0.20544115436645122, "flos": 23155663311360.0, "grad_norm": 7.394684330054377, "language_loss": 0.80046892, "learning_rate": 3.6857154283340115e-06, "loss": 0.87888157, "num_input_tokens_seen": 73786900, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.21777344, "step": 3417, "time_per_iteration": 2.6093976497650146 }, { "auxiliary_loss_clip": 0.06565988, "auxiliary_loss_mlp": 0.01278305, "balance_loss_clip": 0.06311999, "balance_loss_mlp": 0.01256895, "epoch": 0.20550127761911918, "flos": 19396435334400.0, "grad_norm": 2.304364002401147, "language_loss": 0.88815606, "learning_rate": 3.685505812834798e-06, "loss": 0.96659905, "num_input_tokens_seen": 73804515, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.21411133, "step": 3418, "time_per_iteration": 2.5485732555389404 }, { "auxiliary_loss_clip": 0.06565306, "auxiliary_loss_mlp": 0.01284352, "balance_loss_clip": 0.0631035, "balance_loss_mlp": 0.01261571, "epoch": 0.20556140087178718, "flos": 22899721415040.0, "grad_norm": 2.58088786627236, "language_loss": 0.62889534, "learning_rate": 3.685296133421035e-06, "loss": 0.70739192, "num_input_tokens_seen": 73822910, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.2277832, "step": 3419, "time_per_iteration": 4.074796438217163 }, { "auxiliary_loss_clip": 0.06564836, "auxiliary_loss_mlp": 0.01285508, "balance_loss_clip": 0.06307943, "balance_loss_mlp": 0.01262787, "epoch": 0.20562152412445514, "flos": 19795365423360.0, "grad_norm": 1.8396113401739085, "language_loss": 0.86759365, "learning_rate": 3.685086390100674e-06, "loss": 0.94609714, "num_input_tokens_seen": 73841160, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.22705078, "step": 3420, "time_per_iteration": 2.558760404586792 }, { "auxiliary_loss_clip": 0.06563841, "auxiliary_loss_mlp": 0.01282247, "balance_loss_clip": 0.0631224, "balance_loss_mlp": 0.01261326, "epoch": 0.2056816473771231, "flos": 31509728507520.0, "grad_norm": 2.449212277287179, "language_loss": 0.71440494, "learning_rate": 3.684876582881668e-06, "loss": 0.79286587, "num_input_tokens_seen": 73862795, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.20935059, "step": 3421, "time_per_iteration": 2.6551601886749268 }, { "auxiliary_loss_clip": 0.06557893, "auxiliary_loss_mlp": 0.01283027, "balance_loss_clip": 0.06308824, "balance_loss_mlp": 0.01261331, "epoch": 0.20574177062979107, "flos": 23265095143680.0, "grad_norm": 2.351592212605385, "language_loss": 0.71564126, "learning_rate": 3.6846667117719732e-06, "loss": 0.79405046, "num_input_tokens_seen": 73881525, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.21679688, "step": 3422, "time_per_iteration": 2.586951971054077 }, { "auxiliary_loss_clip": 0.06526943, "auxiliary_loss_mlp": 0.01296385, "balance_loss_clip": 0.06371887, "balance_loss_mlp": 0.01288291, "epoch": 0.20580189388245904, "flos": 70331124291840.0, "grad_norm": 0.7293911592593942, "language_loss": 0.55176377, "learning_rate": 3.684456776779548e-06, "loss": 0.62999707, "num_input_tokens_seen": 73937775, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.08093262, "step": 3423, "time_per_iteration": 4.622459650039673 }, { "auxiliary_loss_clip": 0.06560518, "auxiliary_loss_mlp": 0.01285883, "balance_loss_clip": 0.06308922, "balance_loss_mlp": 0.01264211, "epoch": 0.205862017135127, "flos": 30745802033280.0, "grad_norm": 3.1296508519807604, "language_loss": 0.72120404, "learning_rate": 3.684246777912353e-06, "loss": 0.79966801, "num_input_tokens_seen": 73958250, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.2166748, "step": 3424, "time_per_iteration": 2.7074568271636963 }, { "auxiliary_loss_clip": 0.06568706, "auxiliary_loss_mlp": 0.01290361, "balance_loss_clip": 0.06322412, "balance_loss_mlp": 0.01269165, "epoch": 0.20592214038779497, "flos": 21330932947200.0, "grad_norm": 1.6261821129430212, "language_loss": 0.75671887, "learning_rate": 3.684036715178351e-06, "loss": 0.83530951, "num_input_tokens_seen": 73977775, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.21203613, "step": 3425, "time_per_iteration": 2.5655953884124756 }, { "auxiliary_loss_clip": 0.06570394, "auxiliary_loss_mlp": 0.01293386, "balance_loss_clip": 0.06323562, "balance_loss_mlp": 0.01271749, "epoch": 0.20598226364046296, "flos": 22898002406400.0, "grad_norm": 1.95364571600445, "language_loss": 0.8827101, "learning_rate": 3.683826588585508e-06, "loss": 0.96134782, "num_input_tokens_seen": 73996590, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.21643066, "step": 3426, "time_per_iteration": 2.584923028945923 }, { "auxiliary_loss_clip": 0.06569426, "auxiliary_loss_mlp": 0.01289602, "balance_loss_clip": 0.06323108, "balance_loss_mlp": 0.01267942, "epoch": 0.20604238689313092, "flos": 23885362592640.0, "grad_norm": 1.7059546871206845, "language_loss": 0.77789956, "learning_rate": 3.6836163981417926e-06, "loss": 0.85648984, "num_input_tokens_seen": 74015935, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.21679688, "step": 3427, "time_per_iteration": 2.5938642024993896 }, { "auxiliary_loss_clip": 0.06571071, "auxiliary_loss_mlp": 0.0129093, "balance_loss_clip": 0.06318745, "balance_loss_mlp": 0.01270045, "epoch": 0.2061025101457989, "flos": 22498024141440.0, "grad_norm": 2.0580723993388577, "language_loss": 0.74918091, "learning_rate": 3.683406143855174e-06, "loss": 0.82780093, "num_input_tokens_seen": 74036575, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.20898438, "step": 3428, "time_per_iteration": 2.590498685836792 }, { "auxiliary_loss_clip": 0.06567979, "auxiliary_loss_mlp": 0.01285202, "balance_loss_clip": 0.06315038, "balance_loss_mlp": 0.01263518, "epoch": 0.20616263339846685, "flos": 22784713286400.0, "grad_norm": 2.6831969902601003, "language_loss": 0.73610985, "learning_rate": 3.6831958257336256e-06, "loss": 0.81464159, "num_input_tokens_seen": 74055365, "router_z_loss_clip": 2.53320312, "router_z_loss_mlp": 0.21691895, "step": 3429, "time_per_iteration": 2.5654642581939697 }, { "auxiliary_loss_clip": 0.06569096, "auxiliary_loss_mlp": 0.01292445, "balance_loss_clip": 0.06314877, "balance_loss_mlp": 0.01268496, "epoch": 0.20622275665113482, "flos": 20887755102720.0, "grad_norm": 53.792072301901854, "language_loss": 0.86134505, "learning_rate": 3.6829854437851237e-06, "loss": 0.93996042, "num_input_tokens_seen": 74074875, "router_z_loss_clip": 2.54296875, "router_z_loss_mlp": 0.23962402, "step": 3430, "time_per_iteration": 2.5632803440093994 }, { "auxiliary_loss_clip": 0.06565838, "auxiliary_loss_mlp": 0.01284035, "balance_loss_clip": 0.06315664, "balance_loss_mlp": 0.01262125, "epoch": 0.20628287990380278, "flos": 19360489132800.0, "grad_norm": 4.35404616110448, "language_loss": 0.69882655, "learning_rate": 3.6827749980176444e-06, "loss": 0.77732527, "num_input_tokens_seen": 74094505, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.21911621, "step": 3431, "time_per_iteration": 2.572899103164673 }, { "auxiliary_loss_clip": 0.06496906, "auxiliary_loss_mlp": 0.01259554, "balance_loss_clip": 0.06340478, "balance_loss_mlp": 0.01252163, "epoch": 0.20634300315647078, "flos": 71536970799360.0, "grad_norm": 0.8005307407321036, "language_loss": 0.60279405, "learning_rate": 3.6825644884391693e-06, "loss": 0.68035865, "num_input_tokens_seen": 74158500, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.07373047, "step": 3432, "time_per_iteration": 3.3920605182647705 }, { "auxiliary_loss_clip": 0.06558867, "auxiliary_loss_mlp": 0.01282788, "balance_loss_clip": 0.06309874, "balance_loss_mlp": 0.01262355, "epoch": 0.20640312640913874, "flos": 21730072671360.0, "grad_norm": 2.7063519142224326, "language_loss": 0.73014975, "learning_rate": 3.682353915057679e-06, "loss": 0.80856633, "num_input_tokens_seen": 74176685, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.20422363, "step": 3433, "time_per_iteration": 2.57869029045105 }, { "auxiliary_loss_clip": 0.06564479, "auxiliary_loss_mlp": 0.0128159, "balance_loss_clip": 0.06310825, "balance_loss_mlp": 0.01260704, "epoch": 0.2064632496618067, "flos": 20560256219520.0, "grad_norm": 2.281831401845248, "language_loss": 0.871849, "learning_rate": 3.6821432778811604e-06, "loss": 0.95030963, "num_input_tokens_seen": 74194935, "router_z_loss_clip": 2.5390625, "router_z_loss_mlp": 0.20874023, "step": 3434, "time_per_iteration": 2.580970048904419 }, { "auxiliary_loss_clip": 0.06569129, "auxiliary_loss_mlp": 0.01283011, "balance_loss_clip": 0.06311373, "balance_loss_mlp": 0.01260266, "epoch": 0.20652337291447467, "flos": 29830669666560.0, "grad_norm": 1.8932363048184204, "language_loss": 0.70141542, "learning_rate": 3.6819325769176004e-06, "loss": 0.77993685, "num_input_tokens_seen": 74215400, "router_z_loss_clip": 2.57421875, "router_z_loss_mlp": 0.22741699, "step": 3435, "time_per_iteration": 2.6492793560028076 }, { "auxiliary_loss_clip": 0.06560697, "auxiliary_loss_mlp": 0.01284309, "balance_loss_clip": 0.0631285, "balance_loss_mlp": 0.01262077, "epoch": 0.20658349616714264, "flos": 26220844719360.0, "grad_norm": 2.673716936655842, "language_loss": 0.89635271, "learning_rate": 3.681721812174988e-06, "loss": 0.97480279, "num_input_tokens_seen": 74234090, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.22241211, "step": 3436, "time_per_iteration": 2.805511951446533 }, { "auxiliary_loss_clip": 0.06562749, "auxiliary_loss_mlp": 0.01276476, "balance_loss_clip": 0.06313481, "balance_loss_mlp": 0.01255733, "epoch": 0.2066436194198106, "flos": 26001477930240.0, "grad_norm": 2.1490932232251763, "language_loss": 0.777686, "learning_rate": 3.6815109836613163e-06, "loss": 0.85607821, "num_input_tokens_seen": 74253345, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.20727539, "step": 3437, "time_per_iteration": 2.613217830657959 }, { "auxiliary_loss_clip": 0.06568848, "auxiliary_loss_mlp": 0.01278998, "balance_loss_clip": 0.06317312, "balance_loss_mlp": 0.01257839, "epoch": 0.20670374267247857, "flos": 21367466127360.0, "grad_norm": 2.272450835120714, "language_loss": 0.78741163, "learning_rate": 3.6813000913845795e-06, "loss": 0.86589015, "num_input_tokens_seen": 74271615, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.21166992, "step": 3438, "time_per_iteration": 2.5522360801696777 }, { "auxiliary_loss_clip": 0.06495424, "auxiliary_loss_mlp": 0.01292552, "balance_loss_clip": 0.06338601, "balance_loss_mlp": 0.0128494, "epoch": 0.20676386592514656, "flos": 66403108264320.0, "grad_norm": 0.8166644663213698, "language_loss": 0.6683954, "learning_rate": 3.6810891353527747e-06, "loss": 0.74627513, "num_input_tokens_seen": 74331390, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.07598877, "step": 3439, "time_per_iteration": 3.157388210296631 }, { "auxiliary_loss_clip": 0.06576684, "auxiliary_loss_mlp": 0.01288065, "balance_loss_clip": 0.06318917, "balance_loss_mlp": 0.01266393, "epoch": 0.20682398917781453, "flos": 17280278069760.0, "grad_norm": 2.246064035926792, "language_loss": 0.84673643, "learning_rate": 3.6808781155739014e-06, "loss": 0.92538399, "num_input_tokens_seen": 74347335, "router_z_loss_clip": 2.58007812, "router_z_loss_mlp": 0.21679688, "step": 3440, "time_per_iteration": 2.5320487022399902 }, { "auxiliary_loss_clip": 0.06566377, "auxiliary_loss_mlp": 0.01283705, "balance_loss_clip": 0.06314099, "balance_loss_mlp": 0.01263367, "epoch": 0.2068841124304825, "flos": 18083127565440.0, "grad_norm": 1.8647041098317, "language_loss": 0.85651273, "learning_rate": 3.6806670320559614e-06, "loss": 0.93501353, "num_input_tokens_seen": 74366310, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.20349121, "step": 3441, "time_per_iteration": 2.5568954944610596 }, { "auxiliary_loss_clip": 0.0656428, "auxiliary_loss_mlp": 0.01288436, "balance_loss_clip": 0.06316209, "balance_loss_mlp": 0.01265798, "epoch": 0.20694423568315046, "flos": 27354798823680.0, "grad_norm": 1.6109414064251413, "language_loss": 0.8617714, "learning_rate": 3.680455884806959e-06, "loss": 0.94029856, "num_input_tokens_seen": 74387100, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.22631836, "step": 3442, "time_per_iteration": 2.6313719749450684 }, { "auxiliary_loss_clip": 0.06581405, "auxiliary_loss_mlp": 0.01290044, "balance_loss_clip": 0.06324646, "balance_loss_mlp": 0.01267084, "epoch": 0.20700435893581842, "flos": 20236027645440.0, "grad_norm": 2.041240607418303, "language_loss": 0.73656356, "learning_rate": 3.6802446738349014e-06, "loss": 0.81527805, "num_input_tokens_seen": 74404460, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.22961426, "step": 3443, "time_per_iteration": 2.560818672180176 }, { "auxiliary_loss_clip": 0.06568466, "auxiliary_loss_mlp": 0.01294499, "balance_loss_clip": 0.06321386, "balance_loss_mlp": 0.01273649, "epoch": 0.2070644821884864, "flos": 20637347575680.0, "grad_norm": 1.898920290088267, "language_loss": 0.85771871, "learning_rate": 3.680033399147797e-06, "loss": 0.93634838, "num_input_tokens_seen": 74423790, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.20837402, "step": 3444, "time_per_iteration": 2.583573341369629 }, { "auxiliary_loss_clip": 0.0648589, "auxiliary_loss_mlp": 0.01267598, "balance_loss_clip": 0.06330359, "balance_loss_mlp": 0.01259969, "epoch": 0.20712460544115438, "flos": 65960098128000.0, "grad_norm": 0.6781464164572424, "language_loss": 0.56803501, "learning_rate": 3.6798220607536585e-06, "loss": 0.64556992, "num_input_tokens_seen": 74488130, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.07617188, "step": 3445, "time_per_iteration": 3.174548864364624 }, { "auxiliary_loss_clip": 0.06568654, "auxiliary_loss_mlp": 0.01280644, "balance_loss_clip": 0.06321968, "balance_loss_mlp": 0.01259329, "epoch": 0.20718472869382235, "flos": 19431542995200.0, "grad_norm": 2.0794588993462795, "language_loss": 0.78792018, "learning_rate": 3.6796106586604987e-06, "loss": 0.86641312, "num_input_tokens_seen": 74506720, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.21313477, "step": 3446, "time_per_iteration": 2.5936107635498047 }, { "auxiliary_loss_clip": 0.06576844, "auxiliary_loss_mlp": 0.01287575, "balance_loss_clip": 0.06312385, "balance_loss_mlp": 0.01263149, "epoch": 0.2072448519464903, "flos": 24506007384960.0, "grad_norm": 4.241933523493715, "language_loss": 0.63838738, "learning_rate": 3.679399192876334e-06, "loss": 0.7170316, "num_input_tokens_seen": 74525330, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.24438477, "step": 3447, "time_per_iteration": 2.679842710494995 }, { "auxiliary_loss_clip": 0.06564875, "auxiliary_loss_mlp": 0.01282323, "balance_loss_clip": 0.06313925, "balance_loss_mlp": 0.01260698, "epoch": 0.20730497519915828, "flos": 23082345388800.0, "grad_norm": 2.274083432334738, "language_loss": 0.86988389, "learning_rate": 3.679187663409184e-06, "loss": 0.94835579, "num_input_tokens_seen": 74544535, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.21643066, "step": 3448, "time_per_iteration": 2.58579421043396 }, { "auxiliary_loss_clip": 0.06560414, "auxiliary_loss_mlp": 0.01276939, "balance_loss_clip": 0.06310342, "balance_loss_mlp": 0.01255637, "epoch": 0.20736509845182624, "flos": 21075368394240.0, "grad_norm": 3.3389432796008887, "language_loss": 0.75549603, "learning_rate": 3.6789760702670696e-06, "loss": 0.83386958, "num_input_tokens_seen": 74562300, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.21289062, "step": 3449, "time_per_iteration": 2.6245529651641846 }, { "auxiliary_loss_clip": 0.06573246, "auxiliary_loss_mlp": 0.01283821, "balance_loss_clip": 0.06313391, "balance_loss_mlp": 0.01259311, "epoch": 0.2074252217044942, "flos": 17638021077120.0, "grad_norm": 1.9664471986465282, "language_loss": 0.77226096, "learning_rate": 3.6787644134580134e-06, "loss": 0.85083163, "num_input_tokens_seen": 74580080, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.24499512, "step": 3450, "time_per_iteration": 4.010894298553467 }, { "auxiliary_loss_clip": 0.06569739, "auxiliary_loss_mlp": 0.01280429, "balance_loss_clip": 0.06314199, "balance_loss_mlp": 0.01258077, "epoch": 0.20748534495716217, "flos": 23553209808000.0, "grad_norm": 17.545384867195285, "language_loss": 0.83133519, "learning_rate": 3.6785526929900436e-06, "loss": 0.90983689, "num_input_tokens_seen": 74598980, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.22351074, "step": 3451, "time_per_iteration": 2.6310675144195557 }, { "auxiliary_loss_clip": 0.06483722, "auxiliary_loss_mlp": 0.01339358, "balance_loss_clip": 0.06328495, "balance_loss_mlp": 0.01331246, "epoch": 0.20754546820983016, "flos": 52268666757120.0, "grad_norm": 0.7847680761921074, "language_loss": 0.56512487, "learning_rate": 3.6783409088711875e-06, "loss": 0.64335573, "num_input_tokens_seen": 74655275, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.08111572, "step": 3452, "time_per_iteration": 3.124998092651367 }, { "auxiliary_loss_clip": 0.06566104, "auxiliary_loss_mlp": 0.01283821, "balance_loss_clip": 0.06311833, "balance_loss_mlp": 0.01260707, "epoch": 0.20760559146249813, "flos": 20418609692160.0, "grad_norm": 3.6560845062142713, "language_loss": 0.88755393, "learning_rate": 3.6781290611094755e-06, "loss": 0.96605319, "num_input_tokens_seen": 74674560, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.23120117, "step": 3453, "time_per_iteration": 2.5895252227783203 }, { "auxiliary_loss_clip": 0.0656295, "auxiliary_loss_mlp": 0.01292039, "balance_loss_clip": 0.06310012, "balance_loss_mlp": 0.01267971, "epoch": 0.2076657147151661, "flos": 23192825397120.0, "grad_norm": 1.6031409232168792, "language_loss": 0.81141931, "learning_rate": 3.6779171497129407e-06, "loss": 0.88996923, "num_input_tokens_seen": 74694500, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.24084473, "step": 3454, "time_per_iteration": 4.056756496429443 }, { "auxiliary_loss_clip": 0.06561374, "auxiliary_loss_mlp": 0.01297951, "balance_loss_clip": 0.06308986, "balance_loss_mlp": 0.01273966, "epoch": 0.20772583796783406, "flos": 18298595139840.0, "grad_norm": 2.796660175145547, "language_loss": 0.77581024, "learning_rate": 3.6777051746896202e-06, "loss": 0.8544035, "num_input_tokens_seen": 74710485, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.23986816, "step": 3455, "time_per_iteration": 2.5280351638793945 }, { "auxiliary_loss_clip": 0.06561279, "auxiliary_loss_mlp": 0.01304255, "balance_loss_clip": 0.06310932, "balance_loss_mlp": 0.01279865, "epoch": 0.20778596122050202, "flos": 17608531639680.0, "grad_norm": 1.6693498322203333, "language_loss": 0.80587018, "learning_rate": 3.6774931360475516e-06, "loss": 0.8845256, "num_input_tokens_seen": 74727450, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.24389648, "step": 3456, "time_per_iteration": 2.5622377395629883 }, { "auxiliary_loss_clip": 0.06566875, "auxiliary_loss_mlp": 0.01312731, "balance_loss_clip": 0.06310414, "balance_loss_mlp": 0.01288269, "epoch": 0.20784608447317, "flos": 23812380086400.0, "grad_norm": 1.7237708229146926, "language_loss": 0.78927469, "learning_rate": 3.6772810337947745e-06, "loss": 0.86807072, "num_input_tokens_seen": 74746725, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.24475098, "step": 3457, "time_per_iteration": 2.5878093242645264 }, { "auxiliary_loss_clip": 0.06564215, "auxiliary_loss_mlp": 0.0129598, "balance_loss_clip": 0.06307773, "balance_loss_mlp": 0.01271316, "epoch": 0.20790620772583795, "flos": 17645022892800.0, "grad_norm": 1.8767712130701428, "language_loss": 0.84163618, "learning_rate": 3.677068867939333e-06, "loss": 0.92023814, "num_input_tokens_seen": 74765255, "router_z_loss_clip": 2.5703125, "router_z_loss_mlp": 0.2467041, "step": 3458, "time_per_iteration": 3.948655843734741 }, { "auxiliary_loss_clip": 0.06560011, "auxiliary_loss_mlp": 0.01300775, "balance_loss_clip": 0.06311829, "balance_loss_mlp": 0.0127642, "epoch": 0.20796633097850595, "flos": 27680997968640.0, "grad_norm": 4.0306227171805284, "language_loss": 0.76337337, "learning_rate": 3.676856638489272e-06, "loss": 0.84198123, "num_input_tokens_seen": 74785710, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.2434082, "step": 3459, "time_per_iteration": 2.6387038230895996 }, { "auxiliary_loss_clip": 0.06555898, "auxiliary_loss_mlp": 0.01290356, "balance_loss_clip": 0.06307276, "balance_loss_mlp": 0.01267849, "epoch": 0.2080264542311739, "flos": 19251770060160.0, "grad_norm": 2.3392064114907862, "language_loss": 0.77908486, "learning_rate": 3.6766443454526382e-06, "loss": 0.8575474, "num_input_tokens_seen": 74804490, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.22497559, "step": 3460, "time_per_iteration": 2.595540761947632 }, { "auxiliary_loss_clip": 0.06552662, "auxiliary_loss_mlp": 0.01294891, "balance_loss_clip": 0.06301875, "balance_loss_mlp": 0.01271991, "epoch": 0.20808657748384188, "flos": 27533146239360.0, "grad_norm": 1.8417593376926635, "language_loss": 0.76102257, "learning_rate": 3.6764319888374836e-06, "loss": 0.8394981, "num_input_tokens_seen": 74826340, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.22900391, "step": 3461, "time_per_iteration": 2.6376492977142334 }, { "auxiliary_loss_clip": 0.06565135, "auxiliary_loss_mlp": 0.01292881, "balance_loss_clip": 0.06305241, "balance_loss_mlp": 0.01267632, "epoch": 0.20814670073650984, "flos": 26914262382720.0, "grad_norm": 1.8348923303118287, "language_loss": 0.88970232, "learning_rate": 3.6762195686518604e-06, "loss": 0.96828246, "num_input_tokens_seen": 74844960, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.25268555, "step": 3462, "time_per_iteration": 4.092644214630127 }, { "auxiliary_loss_clip": 0.06462891, "auxiliary_loss_mlp": 0.01278218, "balance_loss_clip": 0.06306662, "balance_loss_mlp": 0.01268925, "epoch": 0.2082068239891778, "flos": 70195850674560.0, "grad_norm": 0.7405621785134914, "language_loss": 0.59038258, "learning_rate": 3.6760070849038226e-06, "loss": 0.66779369, "num_input_tokens_seen": 74909075, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.09277344, "step": 3463, "time_per_iteration": 3.3148114681243896 }, { "auxiliary_loss_clip": 0.06559242, "auxiliary_loss_mlp": 0.01290745, "balance_loss_clip": 0.06302845, "balance_loss_mlp": 0.01264972, "epoch": 0.20826694724184577, "flos": 24614978019840.0, "grad_norm": 3.056999840717272, "language_loss": 0.67364258, "learning_rate": 3.675794537601429e-06, "loss": 0.75214243, "num_input_tokens_seen": 74928125, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.25817871, "step": 3464, "time_per_iteration": 2.6004409790039062 }, { "auxiliary_loss_clip": 0.06558409, "auxiliary_loss_mlp": 0.01290903, "balance_loss_clip": 0.06300926, "balance_loss_mlp": 0.01264522, "epoch": 0.20832707049451377, "flos": 12897218845440.0, "grad_norm": 2.069823637128088, "language_loss": 0.83978772, "learning_rate": 3.6755819267527373e-06, "loss": 0.91828078, "num_input_tokens_seen": 74945090, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 0.26391602, "step": 3465, "time_per_iteration": 2.5544843673706055 }, { "auxiliary_loss_clip": 0.06554835, "auxiliary_loss_mlp": 0.01285319, "balance_loss_clip": 0.06300414, "balance_loss_mlp": 0.01259593, "epoch": 0.20838719374718173, "flos": 22205129794560.0, "grad_norm": 2.223314254381994, "language_loss": 0.81816781, "learning_rate": 3.6753692523658113e-06, "loss": 0.89656931, "num_input_tokens_seen": 74963630, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.25769043, "step": 3466, "time_per_iteration": 2.5559985637664795 }, { "auxiliary_loss_clip": 0.06546155, "auxiliary_loss_mlp": 0.01290302, "balance_loss_clip": 0.06299302, "balance_loss_mlp": 0.01267914, "epoch": 0.2084473169998497, "flos": 15164036951040.0, "grad_norm": 2.247415224315857, "language_loss": 0.82842857, "learning_rate": 3.675156514448716e-06, "loss": 0.90679306, "num_input_tokens_seen": 74981875, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.22399902, "step": 3467, "time_per_iteration": 2.5334701538085938 }, { "auxiliary_loss_clip": 0.0654363, "auxiliary_loss_mlp": 0.0128443, "balance_loss_clip": 0.06301562, "balance_loss_mlp": 0.01263199, "epoch": 0.20850744025251766, "flos": 17462482773120.0, "grad_norm": 2.026491940363319, "language_loss": 0.82198972, "learning_rate": 3.674943713009518e-06, "loss": 0.90027028, "num_input_tokens_seen": 74999155, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.21228027, "step": 3468, "time_per_iteration": 2.5376534461975098 }, { "auxiliary_loss_clip": 0.06562018, "auxiliary_loss_mlp": 0.0128382, "balance_loss_clip": 0.06301953, "balance_loss_mlp": 0.01259, "epoch": 0.20856756350518563, "flos": 25705439055360.0, "grad_norm": 1.9259585322839974, "language_loss": 0.90865779, "learning_rate": 3.6747308480562856e-06, "loss": 0.98711622, "num_input_tokens_seen": 75017850, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.24816895, "step": 3469, "time_per_iteration": 2.5916454792022705 }, { "auxiliary_loss_clip": 0.06556059, "auxiliary_loss_mlp": 0.01280987, "balance_loss_clip": 0.06303911, "balance_loss_mlp": 0.01258373, "epoch": 0.2086276867578536, "flos": 37898213425920.0, "grad_norm": 1.8224235478974071, "language_loss": 0.76772851, "learning_rate": 3.674517919597092e-06, "loss": 0.84609896, "num_input_tokens_seen": 75039270, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.22607422, "step": 3470, "time_per_iteration": 2.7109458446502686 }, { "auxiliary_loss_clip": 0.06543472, "auxiliary_loss_mlp": 0.01283162, "balance_loss_clip": 0.06298421, "balance_loss_mlp": 0.01259225, "epoch": 0.20868781001052156, "flos": 25564169871360.0, "grad_norm": 1.774065492210337, "language_loss": 0.76462507, "learning_rate": 3.674304927640011e-06, "loss": 0.8428914, "num_input_tokens_seen": 75059350, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.23962402, "step": 3471, "time_per_iteration": 2.592839241027832 }, { "auxiliary_loss_clip": 0.06555966, "auxiliary_loss_mlp": 0.01284115, "balance_loss_clip": 0.06295964, "balance_loss_mlp": 0.01261311, "epoch": 0.20874793326318955, "flos": 27536961600000.0, "grad_norm": 1.7261705450784701, "language_loss": 0.76693416, "learning_rate": 3.67409187219312e-06, "loss": 0.84533495, "num_input_tokens_seen": 75080150, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.22814941, "step": 3472, "time_per_iteration": 2.6235687732696533 }, { "auxiliary_loss_clip": 0.06543493, "auxiliary_loss_mlp": 0.01279528, "balance_loss_clip": 0.06293234, "balance_loss_mlp": 0.01257545, "epoch": 0.20880805651585752, "flos": 18554243546880.0, "grad_norm": 2.31227312350039, "language_loss": 0.84981716, "learning_rate": 3.6738787532644966e-06, "loss": 0.92804736, "num_input_tokens_seen": 75097920, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.21984863, "step": 3473, "time_per_iteration": 2.5639519691467285 }, { "auxiliary_loss_clip": 0.06450883, "auxiliary_loss_mlp": 0.01257754, "balance_loss_clip": 0.06299639, "balance_loss_mlp": 0.01249123, "epoch": 0.20886817976852548, "flos": 65966596819200.0, "grad_norm": 0.8753425773560274, "language_loss": 0.63781905, "learning_rate": 3.6736655708622235e-06, "loss": 0.71490544, "num_input_tokens_seen": 75152410, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.08642578, "step": 3474, "time_per_iteration": 3.1293251514434814 }, { "auxiliary_loss_clip": 0.06545222, "auxiliary_loss_mlp": 0.01278855, "balance_loss_clip": 0.06293193, "balance_loss_mlp": 0.01255741, "epoch": 0.20892830302119345, "flos": 36548120914560.0, "grad_norm": 2.361112755123806, "language_loss": 0.70566618, "learning_rate": 3.6734523249943844e-06, "loss": 0.78390694, "num_input_tokens_seen": 75173265, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.23095703, "step": 3475, "time_per_iteration": 2.709705114364624 }, { "auxiliary_loss_clip": 0.06548501, "auxiliary_loss_mlp": 0.01277517, "balance_loss_clip": 0.06294134, "balance_loss_mlp": 0.01253472, "epoch": 0.2089884262738614, "flos": 20962582398720.0, "grad_norm": 2.1401728883126387, "language_loss": 0.70949495, "learning_rate": 3.673239015669065e-06, "loss": 0.78775513, "num_input_tokens_seen": 75193640, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.24047852, "step": 3476, "time_per_iteration": 2.5962724685668945 }, { "auxiliary_loss_clip": 0.06533613, "auxiliary_loss_mlp": 0.01275933, "balance_loss_clip": 0.06289253, "balance_loss_mlp": 0.01254452, "epoch": 0.20904854952652938, "flos": 22790666926080.0, "grad_norm": 1.7616619176686117, "language_loss": 0.89769775, "learning_rate": 3.6730256428943544e-06, "loss": 0.97579324, "num_input_tokens_seen": 75212545, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.21496582, "step": 3477, "time_per_iteration": 2.6033077239990234 }, { "auxiliary_loss_clip": 0.06541686, "auxiliary_loss_mlp": 0.01280271, "balance_loss_clip": 0.06292702, "balance_loss_mlp": 0.0125811, "epoch": 0.20910867277919734, "flos": 27309838308480.0, "grad_norm": 3.01651105857758, "language_loss": 0.68684381, "learning_rate": 3.672812206678344e-06, "loss": 0.76506341, "num_input_tokens_seen": 75230865, "router_z_loss_clip": 2.49023438, "router_z_loss_mlp": 0.22180176, "step": 3478, "time_per_iteration": 2.6085054874420166 }, { "auxiliary_loss_clip": 0.06545427, "auxiliary_loss_mlp": 0.01278207, "balance_loss_clip": 0.06296587, "balance_loss_mlp": 0.01256761, "epoch": 0.20916879603186533, "flos": 14324444640000.0, "grad_norm": 3.4582027012292222, "language_loss": 0.85257304, "learning_rate": 3.672598707029127e-06, "loss": 0.93080938, "num_input_tokens_seen": 75248285, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.21459961, "step": 3479, "time_per_iteration": 2.6496551036834717 }, { "auxiliary_loss_clip": 0.06546111, "auxiliary_loss_mlp": 0.01283843, "balance_loss_clip": 0.06294369, "balance_loss_mlp": 0.01260359, "epoch": 0.2092289192845333, "flos": 22279537820160.0, "grad_norm": 4.596688753982838, "language_loss": 0.75876337, "learning_rate": 3.6723851439548003e-06, "loss": 0.8370629, "num_input_tokens_seen": 75266310, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.23474121, "step": 3480, "time_per_iteration": 2.5871217250823975 }, { "auxiliary_loss_clip": 0.06538141, "auxiliary_loss_mlp": 0.01280103, "balance_loss_clip": 0.06292806, "balance_loss_mlp": 0.0125905, "epoch": 0.20928904253720126, "flos": 14836118797440.0, "grad_norm": 2.3362818934679894, "language_loss": 0.76543087, "learning_rate": 3.67217151746346e-06, "loss": 0.84361333, "num_input_tokens_seen": 75284175, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.21057129, "step": 3481, "time_per_iteration": 2.555147171020508 }, { "auxiliary_loss_clip": 0.06546183, "auxiliary_loss_mlp": 0.01282073, "balance_loss_clip": 0.06298161, "balance_loss_mlp": 0.01260639, "epoch": 0.20934916578986923, "flos": 23266017538560.0, "grad_norm": 1.7165671100968176, "language_loss": 0.86048543, "learning_rate": 3.671957827563209e-06, "loss": 0.93876803, "num_input_tokens_seen": 75303465, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.21435547, "step": 3482, "time_per_iteration": 2.600435495376587 }, { "auxiliary_loss_clip": 0.06541276, "auxiliary_loss_mlp": 0.01276681, "balance_loss_clip": 0.06295399, "balance_loss_mlp": 0.01254735, "epoch": 0.2094092890425372, "flos": 32022492768000.0, "grad_norm": 2.0565506288372615, "language_loss": 0.72229016, "learning_rate": 3.6717440742621494e-06, "loss": 0.8004697, "num_input_tokens_seen": 75325290, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.21948242, "step": 3483, "time_per_iteration": 2.758492946624756 }, { "auxiliary_loss_clip": 0.06547858, "auxiliary_loss_mlp": 0.01283459, "balance_loss_clip": 0.06296424, "balance_loss_mlp": 0.01260356, "epoch": 0.20946941229520516, "flos": 20016744710400.0, "grad_norm": 2.9202458483937055, "language_loss": 0.75997305, "learning_rate": 3.6715302575683865e-06, "loss": 0.83828622, "num_input_tokens_seen": 75343895, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.23120117, "step": 3484, "time_per_iteration": 2.55989146232605 }, { "auxiliary_loss_clip": 0.06542122, "auxiliary_loss_mlp": 0.01276774, "balance_loss_clip": 0.06293049, "balance_loss_mlp": 0.01253504, "epoch": 0.20952953554787315, "flos": 30748401509760.0, "grad_norm": 2.046418880512904, "language_loss": 0.71358168, "learning_rate": 3.6713163774900292e-06, "loss": 0.7917707, "num_input_tokens_seen": 75367100, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.23278809, "step": 3485, "time_per_iteration": 2.652705430984497 }, { "auxiliary_loss_clip": 0.06552837, "auxiliary_loss_mlp": 0.0128151, "balance_loss_clip": 0.06301044, "balance_loss_mlp": 0.01256667, "epoch": 0.20958965880054112, "flos": 27055950837120.0, "grad_norm": 1.714472287282295, "language_loss": 0.83811009, "learning_rate": 3.6711024340351875e-06, "loss": 0.9164536, "num_input_tokens_seen": 75389925, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.24865723, "step": 3486, "time_per_iteration": 2.615830421447754 }, { "auxiliary_loss_clip": 0.06545253, "auxiliary_loss_mlp": 0.01276486, "balance_loss_clip": 0.06295018, "balance_loss_mlp": 0.01254862, "epoch": 0.20964978205320908, "flos": 34212680714880.0, "grad_norm": 1.8074557446657404, "language_loss": 0.87467974, "learning_rate": 3.6708884272119737e-06, "loss": 0.95289719, "num_input_tokens_seen": 75408575, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.21630859, "step": 3487, "time_per_iteration": 2.6890015602111816 }, { "auxiliary_loss_clip": 0.06539226, "auxiliary_loss_mlp": 0.01278755, "balance_loss_clip": 0.06293879, "balance_loss_mlp": 0.01255723, "epoch": 0.20970990530587705, "flos": 23484168443520.0, "grad_norm": 2.232533434054699, "language_loss": 0.72819519, "learning_rate": 3.670674357028504e-06, "loss": 0.80637497, "num_input_tokens_seen": 75427155, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.23046875, "step": 3488, "time_per_iteration": 2.5849037170410156 }, { "auxiliary_loss_clip": 0.06546244, "auxiliary_loss_mlp": 0.01278601, "balance_loss_clip": 0.06297451, "balance_loss_mlp": 0.01256393, "epoch": 0.209770028558545, "flos": 18557346147840.0, "grad_norm": 6.061187759797117, "language_loss": 0.81255585, "learning_rate": 3.6704602234928945e-06, "loss": 0.89080429, "num_input_tokens_seen": 75444450, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.22192383, "step": 3489, "time_per_iteration": 2.5575342178344727 }, { "auxiliary_loss_clip": 0.065495, "auxiliary_loss_mlp": 0.01275631, "balance_loss_clip": 0.06299399, "balance_loss_mlp": 0.01253649, "epoch": 0.20983015181121298, "flos": 21623533804800.0, "grad_norm": 3.0295860511987502, "language_loss": 0.7354002, "learning_rate": 3.670246026613266e-06, "loss": 0.81365156, "num_input_tokens_seen": 75462625, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.21984863, "step": 3490, "time_per_iteration": 3.975198745727539 }, { "auxiliary_loss_clip": 0.06534372, "auxiliary_loss_mlp": 0.01280148, "balance_loss_clip": 0.06296495, "balance_loss_mlp": 0.01258368, "epoch": 0.20989027506388094, "flos": 16619787861120.0, "grad_norm": 1.9998340852388472, "language_loss": 0.71563625, "learning_rate": 3.6700317663977415e-06, "loss": 0.79378146, "num_input_tokens_seen": 75480640, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.21789551, "step": 3491, "time_per_iteration": 2.576157331466675 }, { "auxiliary_loss_clip": 0.06547216, "auxiliary_loss_mlp": 0.01283336, "balance_loss_clip": 0.06295981, "balance_loss_mlp": 0.0125897, "epoch": 0.20995039831654894, "flos": 23222692177920.0, "grad_norm": 2.369329494436121, "language_loss": 0.80375218, "learning_rate": 3.669817442854444e-06, "loss": 0.88205767, "num_input_tokens_seen": 75494900, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.24401855, "step": 3492, "time_per_iteration": 2.5897388458251953 }, { "auxiliary_loss_clip": 0.06537955, "auxiliary_loss_mlp": 0.01283343, "balance_loss_clip": 0.06295677, "balance_loss_mlp": 0.01261194, "epoch": 0.2100105215692169, "flos": 18152881689600.0, "grad_norm": 2.3788100230430276, "language_loss": 0.87856448, "learning_rate": 3.669603055991502e-06, "loss": 0.95677745, "num_input_tokens_seen": 75513370, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 0.22143555, "step": 3493, "time_per_iteration": 4.009500503540039 }, { "auxiliary_loss_clip": 0.06534772, "auxiliary_loss_mlp": 0.01281672, "balance_loss_clip": 0.06293262, "balance_loss_mlp": 0.01259821, "epoch": 0.21007064482188487, "flos": 15967179936000.0, "grad_norm": 1.9466656431065654, "language_loss": 0.70102829, "learning_rate": 3.6693886058170455e-06, "loss": 0.77919275, "num_input_tokens_seen": 75532480, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.21850586, "step": 3494, "time_per_iteration": 2.552121639251709 }, { "auxiliary_loss_clip": 0.06549344, "auxiliary_loss_mlp": 0.01285725, "balance_loss_clip": 0.06299797, "balance_loss_mlp": 0.0126205, "epoch": 0.21013076807455283, "flos": 32242614243840.0, "grad_norm": 1.9239906155683537, "language_loss": 0.79640687, "learning_rate": 3.6691740923392053e-06, "loss": 0.87475759, "num_input_tokens_seen": 75552745, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.23669434, "step": 3495, "time_per_iteration": 2.6631250381469727 }, { "auxiliary_loss_clip": 0.06539144, "auxiliary_loss_mlp": 0.01282329, "balance_loss_clip": 0.06293116, "balance_loss_mlp": 0.01259203, "epoch": 0.2101908913272208, "flos": 23703493305600.0, "grad_norm": 2.7750176655575944, "language_loss": 0.77627027, "learning_rate": 3.668959515566116e-06, "loss": 0.85448498, "num_input_tokens_seen": 75574355, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.23132324, "step": 3496, "time_per_iteration": 2.6173760890960693 }, { "auxiliary_loss_clip": 0.06539504, "auxiliary_loss_mlp": 0.01280955, "balance_loss_clip": 0.06291207, "balance_loss_mlp": 0.01257578, "epoch": 0.21025101457988876, "flos": 20381992657920.0, "grad_norm": 1.7821182204485073, "language_loss": 0.82432729, "learning_rate": 3.668744875505915e-06, "loss": 0.90253192, "num_input_tokens_seen": 75592215, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.23400879, "step": 3497, "time_per_iteration": 2.565584897994995 }, { "auxiliary_loss_clip": 0.06551898, "auxiliary_loss_mlp": 0.01282271, "balance_loss_clip": 0.06297778, "balance_loss_mlp": 0.01258393, "epoch": 0.21031113783255675, "flos": 25782740046720.0, "grad_norm": 1.9007087195487933, "language_loss": 0.68135357, "learning_rate": 3.668530172166741e-06, "loss": 0.75969523, "num_input_tokens_seen": 75610740, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.23876953, "step": 3498, "time_per_iteration": 3.9751994609832764 }, { "auxiliary_loss_clip": 0.065464, "auxiliary_loss_mlp": 0.01285652, "balance_loss_clip": 0.06295598, "balance_loss_mlp": 0.01262275, "epoch": 0.21037126108522472, "flos": 22024769880960.0, "grad_norm": 2.398819874391115, "language_loss": 0.81300944, "learning_rate": 3.6683154055567352e-06, "loss": 0.89132994, "num_input_tokens_seen": 75631005, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.23376465, "step": 3499, "time_per_iteration": 2.5864665508270264 }, { "auxiliary_loss_clip": 0.06535661, "auxiliary_loss_mlp": 0.01277748, "balance_loss_clip": 0.06292032, "balance_loss_mlp": 0.0125728, "epoch": 0.21043138433789269, "flos": 25340861940480.0, "grad_norm": 1.6536240606932744, "language_loss": 0.78866321, "learning_rate": 3.668100575684043e-06, "loss": 0.86679733, "num_input_tokens_seen": 75650655, "router_z_loss_clip": 2.43945312, "router_z_loss_mlp": 0.20471191, "step": 3500, "time_per_iteration": 2.6250295639038086 }, { "auxiliary_loss_clip": 0.06539597, "auxiliary_loss_mlp": 0.01281457, "balance_loss_clip": 0.06293619, "balance_loss_mlp": 0.01258819, "epoch": 0.21049150759056065, "flos": 25563708673920.0, "grad_norm": 1.7597832236007493, "language_loss": 0.74505997, "learning_rate": 3.6678856825568094e-06, "loss": 0.82327044, "num_input_tokens_seen": 75669895, "router_z_loss_clip": 2.46289062, "router_z_loss_mlp": 0.2265625, "step": 3501, "time_per_iteration": 2.607487201690674 }, { "auxiliary_loss_clip": 0.06534778, "auxiliary_loss_mlp": 0.01276648, "balance_loss_clip": 0.06291081, "balance_loss_mlp": 0.01255285, "epoch": 0.21055163084322862, "flos": 24501982389120.0, "grad_norm": 1.518922896786556, "language_loss": 0.76102453, "learning_rate": 3.667670726183183e-06, "loss": 0.83913875, "num_input_tokens_seen": 75689535, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.21362305, "step": 3502, "time_per_iteration": 4.0543413162231445 }, { "auxiliary_loss_clip": 0.06538568, "auxiliary_loss_mlp": 0.01278435, "balance_loss_clip": 0.06294176, "balance_loss_mlp": 0.01255845, "epoch": 0.21061175409589658, "flos": 25746123012480.0, "grad_norm": 1.9440502469349714, "language_loss": 0.78134137, "learning_rate": 3.667455706571316e-06, "loss": 0.85951144, "num_input_tokens_seen": 75709265, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.22583008, "step": 3503, "time_per_iteration": 2.6191163063049316 }, { "auxiliary_loss_clip": 0.06544818, "auxiliary_loss_mlp": 0.01279972, "balance_loss_clip": 0.06290172, "balance_loss_mlp": 0.01254891, "epoch": 0.21067187734856455, "flos": 18995115404160.0, "grad_norm": 2.872365314636673, "language_loss": 0.78750479, "learning_rate": 3.6672406237293617e-06, "loss": 0.8657527, "num_input_tokens_seen": 75727050, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.25085449, "step": 3504, "time_per_iteration": 2.578744888305664 }, { "auxiliary_loss_clip": 0.06548011, "auxiliary_loss_mlp": 0.01279368, "balance_loss_clip": 0.06295928, "balance_loss_mlp": 0.01257004, "epoch": 0.21073200060123254, "flos": 24688337869440.0, "grad_norm": 2.228684678968389, "language_loss": 0.77800405, "learning_rate": 3.6670254776654754e-06, "loss": 0.85627782, "num_input_tokens_seen": 75747175, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.22363281, "step": 3505, "time_per_iteration": 2.5999577045440674 }, { "auxiliary_loss_clip": 0.06540024, "auxiliary_loss_mlp": 0.01279193, "balance_loss_clip": 0.06298961, "balance_loss_mlp": 0.01258546, "epoch": 0.2107921238539005, "flos": 28557039605760.0, "grad_norm": 1.9696239775393092, "language_loss": 0.64047277, "learning_rate": 3.6668102683878163e-06, "loss": 0.71866494, "num_input_tokens_seen": 75767690, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.2064209, "step": 3506, "time_per_iteration": 2.6681408882141113 }, { "auxiliary_loss_clip": 0.0653903, "auxiliary_loss_mlp": 0.01278196, "balance_loss_clip": 0.06293663, "balance_loss_mlp": 0.01256047, "epoch": 0.21085224710656847, "flos": 25893094273920.0, "grad_norm": 1.729249151911144, "language_loss": 0.82798052, "learning_rate": 3.6665949959045443e-06, "loss": 0.90615273, "num_input_tokens_seen": 75787255, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.22143555, "step": 3507, "time_per_iteration": 2.6336846351623535 }, { "auxiliary_loss_clip": 0.06544885, "auxiliary_loss_mlp": 0.01278539, "balance_loss_clip": 0.06298655, "balance_loss_mlp": 0.01256092, "epoch": 0.21091237035923643, "flos": 14981664539520.0, "grad_norm": 1.985878994308759, "language_loss": 0.76541913, "learning_rate": 3.666379660223824e-06, "loss": 0.84365332, "num_input_tokens_seen": 75805890, "router_z_loss_clip": 2.46289062, "router_z_loss_mlp": 0.22485352, "step": 3508, "time_per_iteration": 2.562514305114746 }, { "auxiliary_loss_clip": 0.0655135, "auxiliary_loss_mlp": 0.01282673, "balance_loss_clip": 0.0630075, "balance_loss_mlp": 0.01261084, "epoch": 0.2109724936119044, "flos": 16368080595840.0, "grad_norm": 2.3274757897510274, "language_loss": 0.8577565, "learning_rate": 3.6661642613538192e-06, "loss": 0.93609679, "num_input_tokens_seen": 75821620, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.21582031, "step": 3509, "time_per_iteration": 2.55464768409729 }, { "auxiliary_loss_clip": 0.06558478, "auxiliary_loss_mlp": 0.01281387, "balance_loss_clip": 0.06305999, "balance_loss_mlp": 0.01257784, "epoch": 0.21103261686457236, "flos": 31510315486080.0, "grad_norm": 1.6902207558679083, "language_loss": 0.68930513, "learning_rate": 3.6659487993026987e-06, "loss": 0.76770377, "num_input_tokens_seen": 75842490, "router_z_loss_clip": 2.52734375, "router_z_loss_mlp": 0.23596191, "step": 3510, "time_per_iteration": 2.6584925651550293 }, { "auxiliary_loss_clip": 0.06550091, "auxiliary_loss_mlp": 0.01283331, "balance_loss_clip": 0.06300013, "balance_loss_mlp": 0.01260884, "epoch": 0.21109274011724033, "flos": 27351360806400.0, "grad_norm": 1.6662319648327097, "language_loss": 0.73073816, "learning_rate": 3.6657332740786327e-06, "loss": 0.80907238, "num_input_tokens_seen": 75865985, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.22436523, "step": 3511, "time_per_iteration": 2.6382009983062744 }, { "auxiliary_loss_clip": 0.06558695, "auxiliary_loss_mlp": 0.01285755, "balance_loss_clip": 0.06304827, "balance_loss_mlp": 0.01261937, "epoch": 0.21115286336990832, "flos": 17825927857920.0, "grad_norm": 2.4202100083145144, "language_loss": 0.69406027, "learning_rate": 3.665517685689794e-06, "loss": 0.77250475, "num_input_tokens_seen": 75882745, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.23828125, "step": 3512, "time_per_iteration": 2.534055471420288 }, { "auxiliary_loss_clip": 0.06542812, "auxiliary_loss_mlp": 0.01279347, "balance_loss_clip": 0.06296539, "balance_loss_mlp": 0.01256424, "epoch": 0.2112129866225763, "flos": 27205228085760.0, "grad_norm": 2.0042448110879003, "language_loss": 0.73541111, "learning_rate": 3.6653020341443584e-06, "loss": 0.81363273, "num_input_tokens_seen": 75904305, "router_z_loss_clip": 2.46289062, "router_z_loss_mlp": 0.22912598, "step": 3513, "time_per_iteration": 2.6198625564575195 }, { "auxiliary_loss_clip": 0.06540719, "auxiliary_loss_mlp": 0.01280342, "balance_loss_clip": 0.06297596, "balance_loss_mlp": 0.01258384, "epoch": 0.21127310987524425, "flos": 23737846279680.0, "grad_norm": 2.025249018165617, "language_loss": 0.75168556, "learning_rate": 3.665086319450502e-06, "loss": 0.82989621, "num_input_tokens_seen": 75923710, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.21960449, "step": 3514, "time_per_iteration": 2.571054697036743 }, { "auxiliary_loss_clip": 0.06546288, "auxiliary_loss_mlp": 0.01277029, "balance_loss_clip": 0.06296539, "balance_loss_mlp": 0.01256168, "epoch": 0.21133323312791222, "flos": 18338356702080.0, "grad_norm": 1.7916182620037653, "language_loss": 0.77347231, "learning_rate": 3.6648705416164062e-06, "loss": 0.85170555, "num_input_tokens_seen": 75942625, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.20861816, "step": 3515, "time_per_iteration": 2.5379068851470947 }, { "auxiliary_loss_clip": 0.06550843, "auxiliary_loss_mlp": 0.01282404, "balance_loss_clip": 0.06303064, "balance_loss_mlp": 0.01261697, "epoch": 0.21139335638058018, "flos": 17936994844800.0, "grad_norm": 2.027607772447968, "language_loss": 0.69119835, "learning_rate": 3.6646547006502518e-06, "loss": 0.76953077, "num_input_tokens_seen": 75959930, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.20703125, "step": 3516, "time_per_iteration": 2.532087802886963 }, { "auxiliary_loss_clip": 0.06545375, "auxiliary_loss_mlp": 0.01277828, "balance_loss_clip": 0.06295516, "balance_loss_mlp": 0.01254404, "epoch": 0.21145347963324815, "flos": 24579073745280.0, "grad_norm": 2.2178764463386345, "language_loss": 0.8590501, "learning_rate": 3.664438796560225e-06, "loss": 0.93728209, "num_input_tokens_seen": 75980335, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.234375, "step": 3517, "time_per_iteration": 2.5899839401245117 }, { "auxiliary_loss_clip": 0.06552696, "auxiliary_loss_mlp": 0.01279001, "balance_loss_clip": 0.06302936, "balance_loss_mlp": 0.01256327, "epoch": 0.21151360288591614, "flos": 35854787105280.0, "grad_norm": 2.1815417520226563, "language_loss": 0.63548481, "learning_rate": 3.664222829354512e-06, "loss": 0.71380174, "num_input_tokens_seen": 76002095, "router_z_loss_clip": 2.49609375, "router_z_loss_mlp": 0.22680664, "step": 3518, "time_per_iteration": 2.68086576461792 }, { "auxiliary_loss_clip": 0.06544373, "auxiliary_loss_mlp": 0.01290298, "balance_loss_clip": 0.06299493, "balance_loss_mlp": 0.01268304, "epoch": 0.2115737261385841, "flos": 24647989328640.0, "grad_norm": 2.673192182938752, "language_loss": 0.89734954, "learning_rate": 3.664006799041303e-06, "loss": 0.97569627, "num_input_tokens_seen": 76020425, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.22009277, "step": 3519, "time_per_iteration": 2.574756145477295 }, { "auxiliary_loss_clip": 0.06551682, "auxiliary_loss_mlp": 0.0129281, "balance_loss_clip": 0.06302671, "balance_loss_mlp": 0.01268361, "epoch": 0.21163384939125207, "flos": 25233652241280.0, "grad_norm": 5.796064225650129, "language_loss": 0.82675397, "learning_rate": 3.6637907056287886e-06, "loss": 0.90519893, "num_input_tokens_seen": 76041210, "router_z_loss_clip": 2.49023438, "router_z_loss_mlp": 0.24475098, "step": 3520, "time_per_iteration": 2.5988874435424805 }, { "auxiliary_loss_clip": 0.06540998, "auxiliary_loss_mlp": 0.01285527, "balance_loss_clip": 0.0630136, "balance_loss_mlp": 0.01264689, "epoch": 0.21169397264392004, "flos": 26074670071680.0, "grad_norm": 1.6339838524311434, "language_loss": 0.76862031, "learning_rate": 3.6635745491251642e-06, "loss": 0.84688556, "num_input_tokens_seen": 76062685, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.20825195, "step": 3521, "time_per_iteration": 2.6215121746063232 }, { "auxiliary_loss_clip": 0.06540794, "auxiliary_loss_mlp": 0.01287766, "balance_loss_clip": 0.062976, "balance_loss_mlp": 0.01266093, "epoch": 0.211754095896588, "flos": 23114266594560.0, "grad_norm": 3.1768350378757013, "language_loss": 0.75737512, "learning_rate": 3.663358329538626e-06, "loss": 0.8356607, "num_input_tokens_seen": 76082300, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.21655273, "step": 3522, "time_per_iteration": 2.582669258117676 }, { "auxiliary_loss_clip": 0.06553388, "auxiliary_loss_mlp": 0.01281727, "balance_loss_clip": 0.06306422, "balance_loss_mlp": 0.01260043, "epoch": 0.21181421914925597, "flos": 27928806019200.0, "grad_norm": 1.7846519687593139, "language_loss": 0.70765197, "learning_rate": 3.663142046877374e-06, "loss": 0.78600311, "num_input_tokens_seen": 76101135, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.21691895, "step": 3523, "time_per_iteration": 2.61966872215271 }, { "auxiliary_loss_clip": 0.06545214, "auxiliary_loss_mlp": 0.01285041, "balance_loss_clip": 0.06301951, "balance_loss_mlp": 0.01262487, "epoch": 0.21187434240192393, "flos": 17134313057280.0, "grad_norm": 2.525204274270883, "language_loss": 0.78752512, "learning_rate": 3.6629257011496085e-06, "loss": 0.86582768, "num_input_tokens_seen": 76119320, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.22558594, "step": 3524, "time_per_iteration": 2.5687177181243896 }, { "auxiliary_loss_clip": 0.06554852, "auxiliary_loss_mlp": 0.01286038, "balance_loss_clip": 0.06302993, "balance_loss_mlp": 0.01262637, "epoch": 0.21193446565459192, "flos": 22354071626880.0, "grad_norm": 1.7305388351497066, "language_loss": 0.81955427, "learning_rate": 3.6627092923635338e-06, "loss": 0.89796317, "num_input_tokens_seen": 76137445, "router_z_loss_clip": 2.52148438, "router_z_loss_mlp": 0.23413086, "step": 3525, "time_per_iteration": 2.597813367843628 }, { "auxiliary_loss_clip": 0.06544932, "auxiliary_loss_mlp": 0.01282402, "balance_loss_clip": 0.06298759, "balance_loss_mlp": 0.01260897, "epoch": 0.2119945889072599, "flos": 27206779386240.0, "grad_norm": 1.7838856687073865, "language_loss": 0.75862062, "learning_rate": 3.662492820527356e-06, "loss": 0.83689398, "num_input_tokens_seen": 76159500, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.21508789, "step": 3526, "time_per_iteration": 2.644550085067749 }, { "auxiliary_loss_clip": 0.06551701, "auxiliary_loss_mlp": 0.01282673, "balance_loss_clip": 0.06301633, "balance_loss_mlp": 0.01261334, "epoch": 0.21205471215992786, "flos": 20997480424320.0, "grad_norm": 1.6886782094739061, "language_loss": 0.77147043, "learning_rate": 3.662276285649284e-06, "loss": 0.84981418, "num_input_tokens_seen": 76177990, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.21337891, "step": 3527, "time_per_iteration": 2.6859185695648193 }, { "auxiliary_loss_clip": 0.06544672, "auxiliary_loss_mlp": 0.01282849, "balance_loss_clip": 0.0630029, "balance_loss_mlp": 0.01260879, "epoch": 0.21211483541259582, "flos": 20784025347840.0, "grad_norm": 1.8940793320045108, "language_loss": 0.78544176, "learning_rate": 3.662059687737528e-06, "loss": 0.86371696, "num_input_tokens_seen": 76197125, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.21972656, "step": 3528, "time_per_iteration": 2.579636812210083 }, { "auxiliary_loss_clip": 0.06548527, "auxiliary_loss_mlp": 0.01282952, "balance_loss_clip": 0.06304349, "balance_loss_mlp": 0.01260207, "epoch": 0.21217495866526379, "flos": 18996079726080.0, "grad_norm": 1.7480688552987877, "language_loss": 0.82034647, "learning_rate": 3.6618430268003024e-06, "loss": 0.89866126, "num_input_tokens_seen": 76216215, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.22717285, "step": 3529, "time_per_iteration": 4.0336527824401855 }, { "auxiliary_loss_clip": 0.06553054, "auxiliary_loss_mlp": 0.01283335, "balance_loss_clip": 0.06304961, "balance_loss_mlp": 0.01260077, "epoch": 0.21223508191793175, "flos": 20673503412480.0, "grad_norm": 2.474646469524133, "language_loss": 0.77237034, "learning_rate": 3.6616263028458235e-06, "loss": 0.85073423, "num_input_tokens_seen": 76237010, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.23254395, "step": 3530, "time_per_iteration": 2.6832573413848877 }, { "auxiliary_loss_clip": 0.06547055, "auxiliary_loss_mlp": 0.01282953, "balance_loss_clip": 0.06305049, "balance_loss_mlp": 0.01260899, "epoch": 0.21229520517059972, "flos": 21622904899200.0, "grad_norm": 1.9479729328923965, "language_loss": 0.83668214, "learning_rate": 3.661409515882308e-06, "loss": 0.9149822, "num_input_tokens_seen": 76255965, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 0.22045898, "step": 3531, "time_per_iteration": 2.646005868911743 }, { "auxiliary_loss_clip": 0.06558702, "auxiliary_loss_mlp": 0.01286219, "balance_loss_clip": 0.06311597, "balance_loss_mlp": 0.01262294, "epoch": 0.2123553284232677, "flos": 13996232997120.0, "grad_norm": 2.7653125492823194, "language_loss": 0.74270028, "learning_rate": 3.661192665917977e-06, "loss": 0.82114947, "num_input_tokens_seen": 76272150, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.23925781, "step": 3532, "time_per_iteration": 2.55643892288208 }, { "auxiliary_loss_clip": 0.06556769, "auxiliary_loss_mlp": 0.01278017, "balance_loss_clip": 0.06310556, "balance_loss_mlp": 0.01255617, "epoch": 0.21241545167593567, "flos": 18302745916800.0, "grad_norm": 1.656781314695629, "language_loss": 0.74669683, "learning_rate": 3.660975752961054e-06, "loss": 0.82504463, "num_input_tokens_seen": 76291425, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.22424316, "step": 3533, "time_per_iteration": 3.971099376678467 }, { "auxiliary_loss_clip": 0.06555554, "auxiliary_loss_mlp": 0.01278113, "balance_loss_clip": 0.06305425, "balance_loss_mlp": 0.01255094, "epoch": 0.21247557492860364, "flos": 34721461906560.0, "grad_norm": 2.4805298432518086, "language_loss": 0.71908975, "learning_rate": 3.6607587770197634e-06, "loss": 0.79742634, "num_input_tokens_seen": 76313975, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.22998047, "step": 3534, "time_per_iteration": 2.6877963542938232 }, { "auxiliary_loss_clip": 0.06552647, "auxiliary_loss_mlp": 0.01281631, "balance_loss_clip": 0.0630552, "balance_loss_mlp": 0.0126028, "epoch": 0.2125356981812716, "flos": 22060254885120.0, "grad_norm": 2.9947104468792323, "language_loss": 0.72766834, "learning_rate": 3.6605417381023346e-06, "loss": 0.80601114, "num_input_tokens_seen": 76330955, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.21350098, "step": 3535, "time_per_iteration": 2.5671780109405518 }, { "auxiliary_loss_clip": 0.06547253, "auxiliary_loss_mlp": 0.01280036, "balance_loss_clip": 0.06303391, "balance_loss_mlp": 0.01257387, "epoch": 0.21259582143393957, "flos": 28555865648640.0, "grad_norm": 1.9048731550663305, "language_loss": 0.71082038, "learning_rate": 3.660324636216996e-06, "loss": 0.78909332, "num_input_tokens_seen": 76352680, "router_z_loss_clip": 2.43945312, "router_z_loss_mlp": 0.22644043, "step": 3536, "time_per_iteration": 2.6345443725585938 }, { "auxiliary_loss_clip": 0.06551965, "auxiliary_loss_mlp": 0.01279671, "balance_loss_clip": 0.06299491, "balance_loss_mlp": 0.01256914, "epoch": 0.21265594468660753, "flos": 20127140864640.0, "grad_norm": 1.8667424030004809, "language_loss": 0.87956905, "learning_rate": 3.660107471371981e-06, "loss": 0.95788538, "num_input_tokens_seen": 76370750, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.22753906, "step": 3537, "time_per_iteration": 2.5743658542633057 }, { "auxiliary_loss_clip": 0.06538372, "auxiliary_loss_mlp": 0.01278954, "balance_loss_clip": 0.06296904, "balance_loss_mlp": 0.01255768, "epoch": 0.21271606793927553, "flos": 23082890440320.0, "grad_norm": 1.8006153669563334, "language_loss": 0.80982453, "learning_rate": 3.659890243575524e-06, "loss": 0.88799781, "num_input_tokens_seen": 76390610, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.23193359, "step": 3538, "time_per_iteration": 3.91066575050354 }, { "auxiliary_loss_clip": 0.06540292, "auxiliary_loss_mlp": 0.01278336, "balance_loss_clip": 0.06296022, "balance_loss_mlp": 0.01256247, "epoch": 0.2127761911919435, "flos": 26394118963200.0, "grad_norm": 1.592778325506125, "language_loss": 0.87725425, "learning_rate": 3.659672952835863e-06, "loss": 0.95544046, "num_input_tokens_seen": 76408860, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.22070312, "step": 3539, "time_per_iteration": 2.6127631664276123 }, { "auxiliary_loss_clip": 0.06546368, "auxiliary_loss_mlp": 0.01279956, "balance_loss_clip": 0.06297965, "balance_loss_mlp": 0.01258081, "epoch": 0.21283631444461146, "flos": 20234182855680.0, "grad_norm": 2.013150581617102, "language_loss": 0.59414458, "learning_rate": 3.659455599161237e-06, "loss": 0.67240781, "num_input_tokens_seen": 76424980, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.21850586, "step": 3540, "time_per_iteration": 2.549023389816284 }, { "auxiliary_loss_clip": 0.06538159, "auxiliary_loss_mlp": 0.01280799, "balance_loss_clip": 0.06292845, "balance_loss_mlp": 0.01257338, "epoch": 0.21289643769727942, "flos": 13522140195840.0, "grad_norm": 1.9557782869690934, "language_loss": 0.76666874, "learning_rate": 3.659238182559888e-06, "loss": 0.84485829, "num_input_tokens_seen": 76443135, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.23474121, "step": 3541, "time_per_iteration": 3.9804508686065674 }, { "auxiliary_loss_clip": 0.06536015, "auxiliary_loss_mlp": 0.01277978, "balance_loss_clip": 0.06294364, "balance_loss_mlp": 0.0125658, "epoch": 0.2129565609499474, "flos": 24833967465600.0, "grad_norm": 1.6644989681217637, "language_loss": 0.70569092, "learning_rate": 3.6590207030400615e-06, "loss": 0.78383088, "num_input_tokens_seen": 76462470, "router_z_loss_clip": 2.41796875, "router_z_loss_mlp": 0.21386719, "step": 3542, "time_per_iteration": 2.595668315887451 }, { "auxiliary_loss_clip": 0.06536629, "auxiliary_loss_mlp": 0.01276274, "balance_loss_clip": 0.06297889, "balance_loss_mlp": 0.01255758, "epoch": 0.21301668420261535, "flos": 23665953876480.0, "grad_norm": 2.1126491337381106, "language_loss": 0.76392496, "learning_rate": 3.658803160610004e-06, "loss": 0.84205401, "num_input_tokens_seen": 76481995, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.20532227, "step": 3543, "time_per_iteration": 2.566084861755371 }, { "auxiliary_loss_clip": 0.06539389, "auxiliary_loss_mlp": 0.01279683, "balance_loss_clip": 0.06296468, "balance_loss_mlp": 0.01257379, "epoch": 0.21307680745528332, "flos": 16368416012160.0, "grad_norm": 1.791586901400964, "language_loss": 0.67110074, "learning_rate": 3.6585855552779634e-06, "loss": 0.74929154, "num_input_tokens_seen": 76500245, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.22302246, "step": 3544, "time_per_iteration": 2.5439655780792236 }, { "auxiliary_loss_clip": 0.06533439, "auxiliary_loss_mlp": 0.01280288, "balance_loss_clip": 0.06290187, "balance_loss_mlp": 0.01259438, "epoch": 0.2131369307079513, "flos": 19105092288000.0, "grad_norm": 2.2744566464174163, "language_loss": 0.71737963, "learning_rate": 3.6583678870521934e-06, "loss": 0.79551685, "num_input_tokens_seen": 76519535, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.20874023, "step": 3545, "time_per_iteration": 2.543686866760254 }, { "auxiliary_loss_clip": 0.06541806, "auxiliary_loss_mlp": 0.01279456, "balance_loss_clip": 0.06292713, "balance_loss_mlp": 0.01257855, "epoch": 0.21319705396061928, "flos": 30380050961280.0, "grad_norm": 1.6966641156645772, "language_loss": 0.7272802, "learning_rate": 3.658150155940946e-06, "loss": 0.80549282, "num_input_tokens_seen": 76542065, "router_z_loss_clip": 2.49023438, "router_z_loss_mlp": 0.21594238, "step": 3546, "time_per_iteration": 2.6397297382354736 }, { "auxiliary_loss_clip": 0.06541472, "auxiliary_loss_mlp": 0.01287192, "balance_loss_clip": 0.06297198, "balance_loss_mlp": 0.01265747, "epoch": 0.21325717721328724, "flos": 21761616533760.0, "grad_norm": 4.078771033729166, "language_loss": 0.81099391, "learning_rate": 3.657932361952479e-06, "loss": 0.88928062, "num_input_tokens_seen": 76560540, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.21435547, "step": 3547, "time_per_iteration": 2.5617306232452393 }, { "auxiliary_loss_clip": 0.06542522, "auxiliary_loss_mlp": 0.01285205, "balance_loss_clip": 0.06292489, "balance_loss_mlp": 0.01261292, "epoch": 0.2133173004659552, "flos": 28738447695360.0, "grad_norm": 2.8885150458004523, "language_loss": 0.75576031, "learning_rate": 3.6577145050950504e-06, "loss": 0.83403754, "num_input_tokens_seen": 76581760, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.23901367, "step": 3548, "time_per_iteration": 2.6096832752227783 }, { "auxiliary_loss_clip": 0.06539074, "auxiliary_loss_mlp": 0.01283857, "balance_loss_clip": 0.06289519, "balance_loss_mlp": 0.01261231, "epoch": 0.21337742371862317, "flos": 16842760375680.0, "grad_norm": 1.8221148836002898, "language_loss": 0.74934685, "learning_rate": 3.657496585376922e-06, "loss": 0.82757616, "num_input_tokens_seen": 76599940, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.22607422, "step": 3549, "time_per_iteration": 2.5447800159454346 }, { "auxiliary_loss_clip": 0.065361, "auxiliary_loss_mlp": 0.01278151, "balance_loss_clip": 0.06289904, "balance_loss_mlp": 0.01256145, "epoch": 0.21343754697129114, "flos": 24431683213440.0, "grad_norm": 2.1878303718309855, "language_loss": 0.81348097, "learning_rate": 3.657278602806357e-06, "loss": 0.8916235, "num_input_tokens_seen": 76619580, "router_z_loss_clip": 2.46289062, "router_z_loss_mlp": 0.22009277, "step": 3550, "time_per_iteration": 2.577362537384033 }, { "auxiliary_loss_clip": 0.06534626, "auxiliary_loss_mlp": 0.01280252, "balance_loss_clip": 0.06295162, "balance_loss_mlp": 0.01259855, "epoch": 0.21349767022395913, "flos": 19283271995520.0, "grad_norm": 1.5972858523833269, "language_loss": 0.88278615, "learning_rate": 3.657060557391621e-06, "loss": 0.96093494, "num_input_tokens_seen": 76638195, "router_z_loss_clip": 2.39453125, "router_z_loss_mlp": 0.20410156, "step": 3551, "time_per_iteration": 2.5568361282348633 }, { "auxiliary_loss_clip": 0.06535923, "auxiliary_loss_mlp": 0.01279213, "balance_loss_clip": 0.06293243, "balance_loss_mlp": 0.01257803, "epoch": 0.2135577934766271, "flos": 17353260576000.0, "grad_norm": 2.4624468543913385, "language_loss": 0.83489716, "learning_rate": 3.656842449140983e-06, "loss": 0.91304857, "num_input_tokens_seen": 76656695, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.21411133, "step": 3552, "time_per_iteration": 2.575366258621216 }, { "auxiliary_loss_clip": 0.06537426, "auxiliary_loss_mlp": 0.01281468, "balance_loss_clip": 0.06294918, "balance_loss_mlp": 0.0125914, "epoch": 0.21361791672929506, "flos": 24063416519040.0, "grad_norm": 1.8249840246045874, "language_loss": 0.77080762, "learning_rate": 3.656624278062713e-06, "loss": 0.84899652, "num_input_tokens_seen": 76677430, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 0.2232666, "step": 3553, "time_per_iteration": 2.582718849182129 }, { "auxiliary_loss_clip": 0.06536148, "auxiliary_loss_mlp": 0.01276707, "balance_loss_clip": 0.06293943, "balance_loss_mlp": 0.01255094, "epoch": 0.21367803998196302, "flos": 22168596614400.0, "grad_norm": 1.7799953710870098, "language_loss": 0.7335341, "learning_rate": 3.6564060441650843e-06, "loss": 0.81166261, "num_input_tokens_seen": 76697615, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 0.21606445, "step": 3554, "time_per_iteration": 2.5948150157928467 }, { "auxiliary_loss_clip": 0.06537913, "auxiliary_loss_mlp": 0.01281587, "balance_loss_clip": 0.06292714, "balance_loss_mlp": 0.01259486, "epoch": 0.213738163234631, "flos": 20893205617920.0, "grad_norm": 2.1202858772535285, "language_loss": 0.68584341, "learning_rate": 3.6561877474563724e-06, "loss": 0.76403844, "num_input_tokens_seen": 76715685, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.22094727, "step": 3555, "time_per_iteration": 2.59106183052063 }, { "auxiliary_loss_clip": 0.06537574, "auxiliary_loss_mlp": 0.01278943, "balance_loss_clip": 0.06290373, "balance_loss_mlp": 0.01256925, "epoch": 0.21379828648729896, "flos": 28410739176960.0, "grad_norm": 1.7934734214909638, "language_loss": 0.65932792, "learning_rate": 3.6559693879448553e-06, "loss": 0.7374931, "num_input_tokens_seen": 76735405, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.22009277, "step": 3556, "time_per_iteration": 2.6129000186920166 }, { "auxiliary_loss_clip": 0.0653715, "auxiliary_loss_mlp": 0.01280167, "balance_loss_clip": 0.06294221, "balance_loss_mlp": 0.01258364, "epoch": 0.21385840973996692, "flos": 25486030339200.0, "grad_norm": 2.460911123525826, "language_loss": 0.73152936, "learning_rate": 3.6557509656388125e-06, "loss": 0.80970252, "num_input_tokens_seen": 76754395, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.21789551, "step": 3557, "time_per_iteration": 2.5936241149902344 }, { "auxiliary_loss_clip": 0.06541601, "auxiliary_loss_mlp": 0.01278537, "balance_loss_clip": 0.06293889, "balance_loss_mlp": 0.01255458, "epoch": 0.2139185329926349, "flos": 28081772847360.0, "grad_norm": 3.0414498494992297, "language_loss": 0.6809305, "learning_rate": 3.655532480546528e-06, "loss": 0.75913191, "num_input_tokens_seen": 76777210, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.23083496, "step": 3558, "time_per_iteration": 2.6338095664978027 }, { "auxiliary_loss_clip": 0.06542942, "auxiliary_loss_mlp": 0.01280322, "balance_loss_clip": 0.06290819, "balance_loss_mlp": 0.01258268, "epoch": 0.21397865624530288, "flos": 19614628166400.0, "grad_norm": 1.7279686613739782, "language_loss": 0.80928957, "learning_rate": 3.655313932676286e-06, "loss": 0.88752222, "num_input_tokens_seen": 76795830, "router_z_loss_clip": 2.52148438, "router_z_loss_mlp": 0.22058105, "step": 3559, "time_per_iteration": 2.5651142597198486 }, { "auxiliary_loss_clip": 0.06532423, "auxiliary_loss_mlp": 0.01281653, "balance_loss_clip": 0.06293427, "balance_loss_mlp": 0.01262055, "epoch": 0.21403877949797084, "flos": 24688463650560.0, "grad_norm": 1.6585018029276952, "language_loss": 0.68333787, "learning_rate": 3.655095322036373e-06, "loss": 0.76147866, "num_input_tokens_seen": 76814700, "router_z_loss_clip": 2.38671875, "router_z_loss_mlp": 0.19592285, "step": 3560, "time_per_iteration": 2.5852017402648926 }, { "auxiliary_loss_clip": 0.06541082, "auxiliary_loss_mlp": 0.01277681, "balance_loss_clip": 0.06293166, "balance_loss_mlp": 0.01255198, "epoch": 0.2140989027506388, "flos": 19866628920960.0, "grad_norm": 2.2125973487477717, "language_loss": 0.73741925, "learning_rate": 3.65487664863508e-06, "loss": 0.81560689, "num_input_tokens_seen": 76833400, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.22485352, "step": 3561, "time_per_iteration": 2.5542755126953125 }, { "auxiliary_loss_clip": 0.06536791, "auxiliary_loss_mlp": 0.01282078, "balance_loss_clip": 0.06290223, "balance_loss_mlp": 0.01261252, "epoch": 0.21415902600330677, "flos": 19141331978880.0, "grad_norm": 1.9386437608618547, "language_loss": 0.78779221, "learning_rate": 3.654657912480698e-06, "loss": 0.86598092, "num_input_tokens_seen": 76850645, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.20837402, "step": 3562, "time_per_iteration": 2.5318145751953125 }, { "auxiliary_loss_clip": 0.06536391, "auxiliary_loss_mlp": 0.01279984, "balance_loss_clip": 0.06294358, "balance_loss_mlp": 0.01258598, "epoch": 0.21421914925597474, "flos": 22279076622720.0, "grad_norm": 1.4577030498547874, "language_loss": 0.85078967, "learning_rate": 3.6544391135815237e-06, "loss": 0.92895335, "num_input_tokens_seen": 76870135, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 0.21386719, "step": 3563, "time_per_iteration": 2.564633846282959 }, { "auxiliary_loss_clip": 0.06534425, "auxiliary_loss_mlp": 0.01275876, "balance_loss_clip": 0.0629328, "balance_loss_mlp": 0.0125561, "epoch": 0.2142792725086427, "flos": 33883504750080.0, "grad_norm": 1.4803254385598577, "language_loss": 0.77391964, "learning_rate": 3.6542202519458507e-06, "loss": 0.85202265, "num_input_tokens_seen": 76893905, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.20263672, "step": 3564, "time_per_iteration": 2.6609861850738525 }, { "auxiliary_loss_clip": 0.06529527, "auxiliary_loss_mlp": 0.01279228, "balance_loss_clip": 0.06289228, "balance_loss_mlp": 0.01257842, "epoch": 0.2143393957613107, "flos": 19865538817920.0, "grad_norm": 1.7416410049767976, "language_loss": 0.89154023, "learning_rate": 3.654001327581981e-06, "loss": 0.9696278, "num_input_tokens_seen": 76914205, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.21386719, "step": 3565, "time_per_iteration": 2.592297315597534 }, { "auxiliary_loss_clip": 0.06436307, "auxiliary_loss_mlp": 0.01284674, "balance_loss_clip": 0.06298426, "balance_loss_mlp": 0.01277695, "epoch": 0.21439951901397866, "flos": 68549300017920.0, "grad_norm": 0.8147305120207483, "language_loss": 0.52263653, "learning_rate": 3.653782340498215e-06, "loss": 0.59984636, "num_input_tokens_seen": 76975650, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.06988525, "step": 3566, "time_per_iteration": 3.1241793632507324 }, { "auxiliary_loss_clip": 0.0652534, "auxiliary_loss_mlp": 0.0127693, "balance_loss_clip": 0.06288844, "balance_loss_mlp": 0.01256629, "epoch": 0.21445964226664663, "flos": 19689161973120.0, "grad_norm": 3.697902922370556, "language_loss": 0.68136102, "learning_rate": 3.6535632907028566e-06, "loss": 0.75938368, "num_input_tokens_seen": 76992615, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.20300293, "step": 3567, "time_per_iteration": 2.563028573989868 }, { "auxiliary_loss_clip": 0.065355, "auxiliary_loss_mlp": 0.01278322, "balance_loss_clip": 0.06297686, "balance_loss_mlp": 0.0125802, "epoch": 0.2145197655193146, "flos": 31116039298560.0, "grad_norm": 1.5769358175372843, "language_loss": 0.74731469, "learning_rate": 3.6533441782042126e-06, "loss": 0.82545292, "num_input_tokens_seen": 77017005, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.203125, "step": 3568, "time_per_iteration": 2.679842710494995 }, { "auxiliary_loss_clip": 0.06530502, "auxiliary_loss_mlp": 0.0127944, "balance_loss_clip": 0.06290911, "balance_loss_mlp": 0.01257565, "epoch": 0.21457988877198256, "flos": 20127015083520.0, "grad_norm": 1.8575675421355637, "language_loss": 0.7875855, "learning_rate": 3.6531250030105917e-06, "loss": 0.86568487, "num_input_tokens_seen": 77034990, "router_z_loss_clip": 2.39453125, "router_z_loss_mlp": 0.21887207, "step": 3569, "time_per_iteration": 4.017252683639526 }, { "auxiliary_loss_clip": 0.06540982, "auxiliary_loss_mlp": 0.01280471, "balance_loss_clip": 0.06291253, "balance_loss_mlp": 0.01257142, "epoch": 0.21464001202465052, "flos": 18593963182080.0, "grad_norm": 2.1438528826137486, "language_loss": 0.70810521, "learning_rate": 3.6529057651303053e-06, "loss": 0.78631973, "num_input_tokens_seen": 77052610, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.23327637, "step": 3570, "time_per_iteration": 2.571953535079956 }, { "auxiliary_loss_clip": 0.06541489, "auxiliary_loss_mlp": 0.01284308, "balance_loss_clip": 0.06292585, "balance_loss_mlp": 0.01261956, "epoch": 0.21470013527731852, "flos": 21841600855680.0, "grad_norm": 2.0112593382549346, "language_loss": 0.79585683, "learning_rate": 3.6526864645716666e-06, "loss": 0.87411475, "num_input_tokens_seen": 77072475, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.22338867, "step": 3571, "time_per_iteration": 2.5629312992095947 }, { "auxiliary_loss_clip": 0.06537704, "auxiliary_loss_mlp": 0.01278808, "balance_loss_clip": 0.06293266, "balance_loss_mlp": 0.01257398, "epoch": 0.21476025852998648, "flos": 17608992837120.0, "grad_norm": 2.2920024573189877, "language_loss": 0.82842696, "learning_rate": 3.652467101342991e-06, "loss": 0.90659213, "num_input_tokens_seen": 77089930, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.21411133, "step": 3572, "time_per_iteration": 3.9906656742095947 }, { "auxiliary_loss_clip": 0.06536864, "auxiliary_loss_mlp": 0.01277418, "balance_loss_clip": 0.0628861, "balance_loss_mlp": 0.01256508, "epoch": 0.21482038178265445, "flos": 24835267203840.0, "grad_norm": 3.325433487242811, "language_loss": 0.66077572, "learning_rate": 3.652247675452598e-06, "loss": 0.73891854, "num_input_tokens_seen": 77108970, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.20910645, "step": 3573, "time_per_iteration": 2.5691020488739014 }, { "auxiliary_loss_clip": 0.06526145, "auxiliary_loss_mlp": 0.01279155, "balance_loss_clip": 0.06288462, "balance_loss_mlp": 0.01258436, "epoch": 0.2148805050353224, "flos": 23264927435520.0, "grad_norm": 2.408987948884692, "language_loss": 0.761145, "learning_rate": 3.652028186908807e-06, "loss": 0.83919805, "num_input_tokens_seen": 77126045, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.20703125, "step": 3574, "time_per_iteration": 2.6342132091522217 }, { "auxiliary_loss_clip": 0.06530041, "auxiliary_loss_mlp": 0.01280304, "balance_loss_clip": 0.06286337, "balance_loss_mlp": 0.01257917, "epoch": 0.21494062828799038, "flos": 21326907951360.0, "grad_norm": 2.2028463972414336, "language_loss": 0.72844458, "learning_rate": 3.6518086357199416e-06, "loss": 0.80654806, "num_input_tokens_seen": 77144600, "router_z_loss_clip": 2.43945312, "router_z_loss_mlp": 0.22387695, "step": 3575, "time_per_iteration": 2.543865203857422 }, { "auxiliary_loss_clip": 0.06530165, "auxiliary_loss_mlp": 0.01279138, "balance_loss_clip": 0.06288388, "balance_loss_mlp": 0.01258312, "epoch": 0.21500075154065834, "flos": 18849276172800.0, "grad_norm": 1.9613532393166704, "language_loss": 0.6920979, "learning_rate": 3.6515890218943277e-06, "loss": 0.77019095, "num_input_tokens_seen": 77162965, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.20812988, "step": 3576, "time_per_iteration": 2.5270023345947266 }, { "auxiliary_loss_clip": 0.06535509, "auxiliary_loss_mlp": 0.0128366, "balance_loss_clip": 0.06287748, "balance_loss_mlp": 0.01261487, "epoch": 0.2150608747933263, "flos": 18447872388480.0, "grad_norm": 1.91350579009716, "language_loss": 0.89314222, "learning_rate": 3.651369345440292e-06, "loss": 0.97133392, "num_input_tokens_seen": 77179960, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.22180176, "step": 3577, "time_per_iteration": 4.060603141784668 }, { "auxiliary_loss_clip": 0.06424661, "auxiliary_loss_mlp": 0.01257377, "balance_loss_clip": 0.06286506, "balance_loss_mlp": 0.01251065, "epoch": 0.2151209980459943, "flos": 66617443808640.0, "grad_norm": 0.7863630032129107, "language_loss": 0.563007, "learning_rate": 3.6511496063661654e-06, "loss": 0.63982737, "num_input_tokens_seen": 77239500, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.06314087, "step": 3578, "time_per_iteration": 3.20585298538208 }, { "auxiliary_loss_clip": 0.0653102, "auxiliary_loss_mlp": 0.0128274, "balance_loss_clip": 0.06289004, "balance_loss_mlp": 0.01261211, "epoch": 0.21518112129866226, "flos": 21581633963520.0, "grad_norm": 1.6728703829773113, "language_loss": 0.89220238, "learning_rate": 3.6509298046802807e-06, "loss": 0.97034001, "num_input_tokens_seen": 77254680, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.21533203, "step": 3579, "time_per_iteration": 2.546537160873413 }, { "auxiliary_loss_clip": 0.06536055, "auxiliary_loss_mlp": 0.01280616, "balance_loss_clip": 0.06288038, "balance_loss_mlp": 0.01258395, "epoch": 0.21524124455133023, "flos": 20053822942080.0, "grad_norm": 1.7719073122365443, "language_loss": 0.78606957, "learning_rate": 3.650709940390972e-06, "loss": 0.86423624, "num_input_tokens_seen": 77274060, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.22216797, "step": 3580, "time_per_iteration": 3.99688982963562 }, { "auxiliary_loss_clip": 0.06531046, "auxiliary_loss_mlp": 0.01279516, "balance_loss_clip": 0.06288383, "balance_loss_mlp": 0.01259108, "epoch": 0.2153013678039982, "flos": 23958680515200.0, "grad_norm": 1.826866189400915, "language_loss": 0.73366165, "learning_rate": 3.6504900135065775e-06, "loss": 0.81176722, "num_input_tokens_seen": 77293255, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.20397949, "step": 3581, "time_per_iteration": 2.5746302604675293 }, { "auxiliary_loss_clip": 0.06535675, "auxiliary_loss_mlp": 0.01285053, "balance_loss_clip": 0.0629092, "balance_loss_mlp": 0.01261986, "epoch": 0.21536149105666616, "flos": 20601107884800.0, "grad_norm": 2.3065504344201933, "language_loss": 0.71341527, "learning_rate": 3.6502700240354357e-06, "loss": 0.79162252, "num_input_tokens_seen": 77312390, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.23059082, "step": 3582, "time_per_iteration": 2.547032117843628 }, { "auxiliary_loss_clip": 0.06526519, "auxiliary_loss_mlp": 0.01278611, "balance_loss_clip": 0.06283922, "balance_loss_mlp": 0.01256187, "epoch": 0.21542161430933413, "flos": 12865046077440.0, "grad_norm": 3.199829551513024, "language_loss": 0.84962952, "learning_rate": 3.650049971985889e-06, "loss": 0.92768085, "num_input_tokens_seen": 77330985, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.22436523, "step": 3583, "time_per_iteration": 2.5711376667022705 }, { "auxiliary_loss_clip": 0.06538793, "auxiliary_loss_mlp": 0.01285017, "balance_loss_clip": 0.06292345, "balance_loss_mlp": 0.012635, "epoch": 0.21548173756200212, "flos": 26111077470720.0, "grad_norm": 2.4867185232890057, "language_loss": 0.83431268, "learning_rate": 3.6498298573662824e-06, "loss": 0.91255081, "num_input_tokens_seen": 77350770, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.21533203, "step": 3584, "time_per_iteration": 2.7275116443634033 }, { "auxiliary_loss_clip": 0.06534337, "auxiliary_loss_mlp": 0.01290384, "balance_loss_clip": 0.06294076, "balance_loss_mlp": 0.01268354, "epoch": 0.21554186081467008, "flos": 22170315623040.0, "grad_norm": 1.8859484439173106, "language_loss": 0.91060472, "learning_rate": 3.6496096801849625e-06, "loss": 0.9888519, "num_input_tokens_seen": 77370510, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.22033691, "step": 3585, "time_per_iteration": 2.5509567260742188 }, { "auxiliary_loss_clip": 0.0653406, "auxiliary_loss_mlp": 0.01281444, "balance_loss_clip": 0.06291855, "balance_loss_mlp": 0.01261548, "epoch": 0.21560198406733805, "flos": 22973458608000.0, "grad_norm": 1.7779147594859677, "language_loss": 0.75189614, "learning_rate": 3.649389440450277e-06, "loss": 0.83005118, "num_input_tokens_seen": 77390645, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 0.19897461, "step": 3586, "time_per_iteration": 2.550621747970581 }, { "auxiliary_loss_clip": 0.06539658, "auxiliary_loss_mlp": 0.01293781, "balance_loss_clip": 0.06293565, "balance_loss_mlp": 0.01273658, "epoch": 0.215662107320006, "flos": 22790708853120.0, "grad_norm": 2.0394285619767434, "language_loss": 0.83290595, "learning_rate": 3.6491691381705804e-06, "loss": 0.91124034, "num_input_tokens_seen": 77409655, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.20117188, "step": 3587, "time_per_iteration": 2.5649776458740234 }, { "auxiliary_loss_clip": 0.06531948, "auxiliary_loss_mlp": 0.01283187, "balance_loss_clip": 0.06290415, "balance_loss_mlp": 0.01262648, "epoch": 0.21572223057267398, "flos": 30891850899840.0, "grad_norm": 2.122725727924429, "language_loss": 0.76586783, "learning_rate": 3.648948773354224e-06, "loss": 0.84401917, "num_input_tokens_seen": 77430560, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.20532227, "step": 3588, "time_per_iteration": 2.6223881244659424 }, { "auxiliary_loss_clip": 0.06540225, "auxiliary_loss_mlp": 0.01281795, "balance_loss_clip": 0.06294642, "balance_loss_mlp": 0.01259825, "epoch": 0.21578235382534194, "flos": 26918413159680.0, "grad_norm": 1.666523444816244, "language_loss": 0.81713575, "learning_rate": 3.6487283460095643e-06, "loss": 0.895356, "num_input_tokens_seen": 77455000, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.21984863, "step": 3589, "time_per_iteration": 2.642335891723633 }, { "auxiliary_loss_clip": 0.06536719, "auxiliary_loss_mlp": 0.01282039, "balance_loss_clip": 0.06293115, "balance_loss_mlp": 0.01261881, "epoch": 0.2158424770780099, "flos": 24432605608320.0, "grad_norm": 3.3202499374102863, "language_loss": 0.74014914, "learning_rate": 3.648507856144961e-06, "loss": 0.81833673, "num_input_tokens_seen": 77475075, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.20153809, "step": 3590, "time_per_iteration": 2.5857746601104736 }, { "auxiliary_loss_clip": 0.06541452, "auxiliary_loss_mlp": 0.01281678, "balance_loss_clip": 0.06294664, "balance_loss_mlp": 0.01259111, "epoch": 0.2159026003306779, "flos": 23956542236160.0, "grad_norm": 2.0470974314934707, "language_loss": 0.85086024, "learning_rate": 3.648287303768775e-06, "loss": 0.92909157, "num_input_tokens_seen": 77495945, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.22570801, "step": 3591, "time_per_iteration": 2.58385968208313 }, { "auxiliary_loss_clip": 0.06541239, "auxiliary_loss_mlp": 0.01283499, "balance_loss_clip": 0.06292422, "balance_loss_mlp": 0.01260075, "epoch": 0.21596272358334587, "flos": 30048191665920.0, "grad_norm": 2.2947876695705633, "language_loss": 0.69753039, "learning_rate": 3.6480666888893686e-06, "loss": 0.77577782, "num_input_tokens_seen": 77517140, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.234375, "step": 3592, "time_per_iteration": 2.620638847351074 }, { "auxiliary_loss_clip": 0.06536512, "auxiliary_loss_mlp": 0.0127877, "balance_loss_clip": 0.06291977, "balance_loss_mlp": 0.01256562, "epoch": 0.21602284683601383, "flos": 20382495782400.0, "grad_norm": 2.8527169760798246, "language_loss": 0.84543419, "learning_rate": 3.647846011515108e-06, "loss": 0.92358696, "num_input_tokens_seen": 77536085, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.22192383, "step": 3593, "time_per_iteration": 2.5401549339294434 }, { "auxiliary_loss_clip": 0.06536805, "auxiliary_loss_mlp": 0.01280688, "balance_loss_clip": 0.0629239, "balance_loss_mlp": 0.01258158, "epoch": 0.2160829700886818, "flos": 20783648004480.0, "grad_norm": 2.3660465137168867, "language_loss": 0.76230508, "learning_rate": 3.6476252716543625e-06, "loss": 0.84047997, "num_input_tokens_seen": 77553675, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.22521973, "step": 3594, "time_per_iteration": 2.5447208881378174 }, { "auxiliary_loss_clip": 0.0652931, "auxiliary_loss_mlp": 0.01282535, "balance_loss_clip": 0.06290819, "balance_loss_mlp": 0.0126097, "epoch": 0.21614309334134976, "flos": 22316322562560.0, "grad_norm": 1.4478431205980211, "language_loss": 0.80836803, "learning_rate": 3.6474044693155007e-06, "loss": 0.88648653, "num_input_tokens_seen": 77573360, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.2154541, "step": 3595, "time_per_iteration": 2.553584575653076 }, { "auxiliary_loss_clip": 0.06542345, "auxiliary_loss_mlp": 0.01279564, "balance_loss_clip": 0.06290753, "balance_loss_mlp": 0.01259084, "epoch": 0.21620321659401773, "flos": 19615592488320.0, "grad_norm": 2.1367551516632557, "language_loss": 0.79048252, "learning_rate": 3.647183604506897e-06, "loss": 0.86870164, "num_input_tokens_seen": 77591865, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.20471191, "step": 3596, "time_per_iteration": 2.526993751525879 }, { "auxiliary_loss_clip": 0.06535278, "auxiliary_loss_mlp": 0.01279179, "balance_loss_clip": 0.06292116, "balance_loss_mlp": 0.01257388, "epoch": 0.2162633398466857, "flos": 18850701692160.0, "grad_norm": 1.5792249397126403, "language_loss": 0.83570898, "learning_rate": 3.6469626772369253e-06, "loss": 0.91385353, "num_input_tokens_seen": 77611600, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.21789551, "step": 3597, "time_per_iteration": 2.5589118003845215 }, { "auxiliary_loss_clip": 0.06540026, "auxiliary_loss_mlp": 0.01277323, "balance_loss_clip": 0.06294578, "balance_loss_mlp": 0.01255901, "epoch": 0.21632346309935369, "flos": 18774490803840.0, "grad_norm": 1.533017755098726, "language_loss": 0.81499988, "learning_rate": 3.6467416875139642e-06, "loss": 0.8931734, "num_input_tokens_seen": 77630665, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.21411133, "step": 3598, "time_per_iteration": 2.542987108230591 }, { "auxiliary_loss_clip": 0.06535607, "auxiliary_loss_mlp": 0.01281851, "balance_loss_clip": 0.06291659, "balance_loss_mlp": 0.01260119, "epoch": 0.21638358635202165, "flos": 26331576289920.0, "grad_norm": 1.6432248470747106, "language_loss": 0.82484829, "learning_rate": 3.6465206353463934e-06, "loss": 0.90302289, "num_input_tokens_seen": 77650835, "router_z_loss_clip": 2.43945312, "router_z_loss_mlp": 0.21740723, "step": 3599, "time_per_iteration": 2.6079766750335693 }, { "auxiliary_loss_clip": 0.06534044, "auxiliary_loss_mlp": 0.01277512, "balance_loss_clip": 0.06292753, "balance_loss_mlp": 0.01256615, "epoch": 0.21644370960468962, "flos": 20747156751360.0, "grad_norm": 2.0454024645391224, "language_loss": 0.76761246, "learning_rate": 3.6462995207425947e-06, "loss": 0.84572804, "num_input_tokens_seen": 77669000, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.20898438, "step": 3600, "time_per_iteration": 2.561034679412842 }, { "auxiliary_loss_clip": 0.06529606, "auxiliary_loss_mlp": 0.01274704, "balance_loss_clip": 0.06289519, "balance_loss_mlp": 0.01255404, "epoch": 0.21650383285735758, "flos": 23959183639680.0, "grad_norm": 15.952337153999267, "language_loss": 0.80803961, "learning_rate": 3.6460783437109533e-06, "loss": 0.88608271, "num_input_tokens_seen": 77688745, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.19311523, "step": 3601, "time_per_iteration": 2.60654878616333 }, { "auxiliary_loss_clip": 0.06533024, "auxiliary_loss_mlp": 0.01278994, "balance_loss_clip": 0.06288671, "balance_loss_mlp": 0.01258693, "epoch": 0.21656395611002555, "flos": 23702864400000.0, "grad_norm": 1.9388407334458913, "language_loss": 0.83822954, "learning_rate": 3.6458571042598565e-06, "loss": 0.91634977, "num_input_tokens_seen": 77708445, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.203125, "step": 3602, "time_per_iteration": 2.567187786102295 }, { "auxiliary_loss_clip": 0.06531464, "auxiliary_loss_mlp": 0.01280591, "balance_loss_clip": 0.0628999, "balance_loss_mlp": 0.01258537, "epoch": 0.2166240793626935, "flos": 20672035966080.0, "grad_norm": 2.199744558963034, "language_loss": 0.75736403, "learning_rate": 3.645635802397693e-06, "loss": 0.83548462, "num_input_tokens_seen": 77728465, "router_z_loss_clip": 2.41796875, "router_z_loss_mlp": 0.22058105, "step": 3603, "time_per_iteration": 2.555798053741455 }, { "auxiliary_loss_clip": 0.06523788, "auxiliary_loss_mlp": 0.01273638, "balance_loss_clip": 0.06289823, "balance_loss_mlp": 0.01253897, "epoch": 0.2166842026153615, "flos": 21586916770560.0, "grad_norm": 1.6788598497111424, "language_loss": 0.75347686, "learning_rate": 3.645414438132855e-06, "loss": 0.83145106, "num_input_tokens_seen": 77746735, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.19750977, "step": 3604, "time_per_iteration": 2.552936553955078 }, { "auxiliary_loss_clip": 0.06523272, "auxiliary_loss_mlp": 0.01278846, "balance_loss_clip": 0.06286885, "balance_loss_mlp": 0.01258711, "epoch": 0.21674432586802947, "flos": 25637068523520.0, "grad_norm": 1.8630138871822115, "language_loss": 0.80839568, "learning_rate": 3.6451930114737366e-06, "loss": 0.88641691, "num_input_tokens_seen": 77768105, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.20117188, "step": 3605, "time_per_iteration": 2.60320782661438 }, { "auxiliary_loss_clip": 0.06438122, "auxiliary_loss_mlp": 0.01260729, "balance_loss_clip": 0.06295931, "balance_loss_mlp": 0.01254763, "epoch": 0.21680444912069743, "flos": 56435126376960.0, "grad_norm": 0.6615292049669573, "language_loss": 0.58285773, "learning_rate": 3.6449715224287347e-06, "loss": 0.65984619, "num_input_tokens_seen": 77833750, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.05963135, "step": 3606, "time_per_iteration": 3.253887414932251 }, { "auxiliary_loss_clip": 0.06527986, "auxiliary_loss_mlp": 0.01277774, "balance_loss_clip": 0.06282756, "balance_loss_mlp": 0.01256483, "epoch": 0.2168645723733654, "flos": 23885823790080.0, "grad_norm": 2.0265552337523816, "language_loss": 0.73967379, "learning_rate": 3.644749971006248e-06, "loss": 0.81773138, "num_input_tokens_seen": 77853780, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.2130127, "step": 3607, "time_per_iteration": 2.5742363929748535 }, { "auxiliary_loss_clip": 0.06534348, "auxiliary_loss_mlp": 0.01276135, "balance_loss_clip": 0.06289087, "balance_loss_mlp": 0.01253736, "epoch": 0.21692469562603336, "flos": 16951814864640.0, "grad_norm": 2.399685512388795, "language_loss": 0.77442312, "learning_rate": 3.6445283572146765e-06, "loss": 0.85252804, "num_input_tokens_seen": 77872575, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.22399902, "step": 3608, "time_per_iteration": 3.939396619796753 }, { "auxiliary_loss_clip": 0.06535922, "auxiliary_loss_mlp": 0.01275807, "balance_loss_clip": 0.06290353, "balance_loss_mlp": 0.01255732, "epoch": 0.21698481887870133, "flos": 25126065198720.0, "grad_norm": 1.911588677694433, "language_loss": 0.74920106, "learning_rate": 3.6443066810624255e-06, "loss": 0.82731837, "num_input_tokens_seen": 77892700, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.20092773, "step": 3609, "time_per_iteration": 2.5824551582336426 }, { "auxiliary_loss_clip": 0.06535691, "auxiliary_loss_mlp": 0.01275395, "balance_loss_clip": 0.06291736, "balance_loss_mlp": 0.01253746, "epoch": 0.2170449421313693, "flos": 17900461664640.0, "grad_norm": 2.560494966463193, "language_loss": 0.89585221, "learning_rate": 3.6440849425579e-06, "loss": 0.97396314, "num_input_tokens_seen": 77911060, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.21643066, "step": 3610, "time_per_iteration": 2.527484655380249 }, { "auxiliary_loss_clip": 0.06528819, "auxiliary_loss_mlp": 0.01274953, "balance_loss_clip": 0.06289239, "balance_loss_mlp": 0.01254354, "epoch": 0.2171050653840373, "flos": 22645121184000.0, "grad_norm": 1.6763475597419837, "language_loss": 0.78224218, "learning_rate": 3.6438631417095095e-06, "loss": 0.86027992, "num_input_tokens_seen": 77929930, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.20617676, "step": 3611, "time_per_iteration": 2.5673224925994873 }, { "auxiliary_loss_clip": 0.065383, "auxiliary_loss_mlp": 0.01277124, "balance_loss_clip": 0.06294924, "balance_loss_mlp": 0.0125668, "epoch": 0.21716518863670525, "flos": 19506034874880.0, "grad_norm": 2.51783611759505, "language_loss": 0.63945687, "learning_rate": 3.6436412785256637e-06, "loss": 0.71761113, "num_input_tokens_seen": 77949060, "router_z_loss_clip": 2.43359375, "router_z_loss_mlp": 0.2043457, "step": 3612, "time_per_iteration": 4.009690284729004 }, { "auxiliary_loss_clip": 0.06539066, "auxiliary_loss_mlp": 0.01281145, "balance_loss_clip": 0.06295832, "balance_loss_mlp": 0.01260533, "epoch": 0.21722531188937322, "flos": 19798132608000.0, "grad_norm": 2.031493044295773, "language_loss": 0.76239467, "learning_rate": 3.643419353014776e-06, "loss": 0.8405968, "num_input_tokens_seen": 77967920, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.20617676, "step": 3613, "time_per_iteration": 2.548255681991577 }, { "auxiliary_loss_clip": 0.06524637, "auxiliary_loss_mlp": 0.01279519, "balance_loss_clip": 0.0628446, "balance_loss_mlp": 0.0125787, "epoch": 0.21728543514204118, "flos": 13339474295040.0, "grad_norm": 1.800676752495967, "language_loss": 0.71692955, "learning_rate": 3.643197365185261e-06, "loss": 0.79497111, "num_input_tokens_seen": 77985330, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.21630859, "step": 3614, "time_per_iteration": 2.533747673034668 }, { "auxiliary_loss_clip": 0.06530397, "auxiliary_loss_mlp": 0.01286917, "balance_loss_clip": 0.0629079, "balance_loss_mlp": 0.01266294, "epoch": 0.21734555839470915, "flos": 15237312946560.0, "grad_norm": 1.7235608715787103, "language_loss": 0.74051803, "learning_rate": 3.6429753150455378e-06, "loss": 0.81869113, "num_input_tokens_seen": 78003105, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.20617676, "step": 3615, "time_per_iteration": 2.539907932281494 }, { "auxiliary_loss_clip": 0.06539539, "auxiliary_loss_mlp": 0.01274963, "balance_loss_clip": 0.06288858, "balance_loss_mlp": 0.01253029, "epoch": 0.2174056816473771, "flos": 19980043822080.0, "grad_norm": 2.4257889493761864, "language_loss": 0.90534937, "learning_rate": 3.6427532026040263e-06, "loss": 0.98349428, "num_input_tokens_seen": 78019655, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.21936035, "step": 3616, "time_per_iteration": 2.5438292026519775 }, { "auxiliary_loss_clip": 0.06529151, "auxiliary_loss_mlp": 0.01283902, "balance_loss_clip": 0.06285819, "balance_loss_mlp": 0.01261312, "epoch": 0.21746580490004508, "flos": 16692309169920.0, "grad_norm": 3.2209745767385614, "language_loss": 0.81991613, "learning_rate": 3.642531027869148e-06, "loss": 0.89804673, "num_input_tokens_seen": 78036025, "router_z_loss_clip": 2.43359375, "router_z_loss_mlp": 0.22583008, "step": 3617, "time_per_iteration": 4.079348087310791 }, { "auxiliary_loss_clip": 0.06527121, "auxiliary_loss_mlp": 0.01283787, "balance_loss_clip": 0.06282357, "balance_loss_mlp": 0.01261948, "epoch": 0.21752592815271307, "flos": 25778840832000.0, "grad_norm": 1.7125262797923178, "language_loss": 0.76396084, "learning_rate": 3.642308790849329e-06, "loss": 0.84206992, "num_input_tokens_seen": 78055645, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.21826172, "step": 3618, "time_per_iteration": 2.589585304260254 }, { "auxiliary_loss_clip": 0.06529865, "auxiliary_loss_mlp": 0.01281457, "balance_loss_clip": 0.06285323, "balance_loss_mlp": 0.01259939, "epoch": 0.21758605140538104, "flos": 11259430940160.0, "grad_norm": 1.9129542341084194, "language_loss": 0.69692898, "learning_rate": 3.642086491552996e-06, "loss": 0.77504218, "num_input_tokens_seen": 78071660, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.21533203, "step": 3619, "time_per_iteration": 2.516080379486084 }, { "auxiliary_loss_clip": 0.06530088, "auxiliary_loss_mlp": 0.01285522, "balance_loss_clip": 0.06285444, "balance_loss_mlp": 0.01263862, "epoch": 0.217646174658049, "flos": 19248290115840.0, "grad_norm": 1.659786470767402, "language_loss": 0.78734386, "learning_rate": 3.641864129988579e-06, "loss": 0.86550003, "num_input_tokens_seen": 78091265, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.21655273, "step": 3620, "time_per_iteration": 3.996530294418335 }, { "auxiliary_loss_clip": 0.06524903, "auxiliary_loss_mlp": 0.01276617, "balance_loss_clip": 0.06286351, "balance_loss_mlp": 0.01257269, "epoch": 0.21770629791071697, "flos": 21951619666560.0, "grad_norm": 1.4452959490829533, "language_loss": 0.80671859, "learning_rate": 3.641641706164509e-06, "loss": 0.8847338, "num_input_tokens_seen": 78110095, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.19348145, "step": 3621, "time_per_iteration": 2.637356758117676 }, { "auxiliary_loss_clip": 0.06523848, "auxiliary_loss_mlp": 0.01277467, "balance_loss_clip": 0.06282748, "balance_loss_mlp": 0.01257046, "epoch": 0.21776642116338493, "flos": 24943776641280.0, "grad_norm": 1.5722332269828303, "language_loss": 0.88363492, "learning_rate": 3.641419220089221e-06, "loss": 0.96164811, "num_input_tokens_seen": 78129475, "router_z_loss_clip": 2.41015625, "router_z_loss_mlp": 0.20410156, "step": 3622, "time_per_iteration": 2.5785305500030518 }, { "auxiliary_loss_clip": 0.06532726, "auxiliary_loss_mlp": 0.01278523, "balance_loss_clip": 0.06284346, "balance_loss_mlp": 0.01255313, "epoch": 0.2178265444160529, "flos": 17827017960960.0, "grad_norm": 1.7334062255483231, "language_loss": 0.77645814, "learning_rate": 3.641196671771152e-06, "loss": 0.85457069, "num_input_tokens_seen": 78146880, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.23217773, "step": 3623, "time_per_iteration": 2.5181047916412354 }, { "auxiliary_loss_clip": 0.06529677, "auxiliary_loss_mlp": 0.01279648, "balance_loss_clip": 0.06284483, "balance_loss_mlp": 0.01258393, "epoch": 0.2178866676687209, "flos": 17718760085760.0, "grad_norm": 2.2619471393214536, "language_loss": 0.85412568, "learning_rate": 3.640974061218741e-06, "loss": 0.93221891, "num_input_tokens_seen": 78165065, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.21252441, "step": 3624, "time_per_iteration": 2.532413959503174 }, { "auxiliary_loss_clip": 0.06529697, "auxiliary_loss_mlp": 0.01283675, "balance_loss_clip": 0.06286851, "balance_loss_mlp": 0.01262122, "epoch": 0.21794679092138886, "flos": 16951437521280.0, "grad_norm": 2.37886757678603, "language_loss": 0.78909349, "learning_rate": 3.640751388440429e-06, "loss": 0.8672272, "num_input_tokens_seen": 78180005, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 0.21569824, "step": 3625, "time_per_iteration": 2.6565589904785156 }, { "auxiliary_loss_clip": 0.06409022, "auxiliary_loss_mlp": 0.0127324, "balance_loss_clip": 0.06268899, "balance_loss_mlp": 0.01267616, "epoch": 0.21800691417405682, "flos": 63737737413120.0, "grad_norm": 0.7761017491765535, "language_loss": 0.60688221, "learning_rate": 3.64052865344466e-06, "loss": 0.68370485, "num_input_tokens_seen": 78245350, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.05621338, "step": 3626, "time_per_iteration": 3.2850842475891113 }, { "auxiliary_loss_clip": 0.06530474, "auxiliary_loss_mlp": 0.01276494, "balance_loss_clip": 0.0628517, "balance_loss_mlp": 0.0125388, "epoch": 0.21806703742672479, "flos": 21622821045120.0, "grad_norm": 2.447041091666053, "language_loss": 0.90917706, "learning_rate": 3.6403058562398795e-06, "loss": 0.98724675, "num_input_tokens_seen": 78264165, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.22631836, "step": 3627, "time_per_iteration": 2.5559210777282715 }, { "auxiliary_loss_clip": 0.06529085, "auxiliary_loss_mlp": 0.01280109, "balance_loss_clip": 0.06287614, "balance_loss_mlp": 0.01258961, "epoch": 0.21812716067939275, "flos": 19361034184320.0, "grad_norm": 1.6763854000200331, "language_loss": 0.74169755, "learning_rate": 3.6400829968345365e-06, "loss": 0.81978953, "num_input_tokens_seen": 78283745, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.21154785, "step": 3628, "time_per_iteration": 2.557877779006958 }, { "auxiliary_loss_clip": 0.06528626, "auxiliary_loss_mlp": 0.01278945, "balance_loss_clip": 0.06286624, "balance_loss_mlp": 0.01257249, "epoch": 0.21818728393206072, "flos": 23554467619200.0, "grad_norm": 1.7792252477198545, "language_loss": 0.77547789, "learning_rate": 3.6398600752370826e-06, "loss": 0.85355365, "num_input_tokens_seen": 78302900, "router_z_loss_clip": 2.41796875, "router_z_loss_mlp": 0.21691895, "step": 3629, "time_per_iteration": 2.591768264770508 }, { "auxiliary_loss_clip": 0.06523831, "auxiliary_loss_mlp": 0.01283567, "balance_loss_clip": 0.06285177, "balance_loss_mlp": 0.01263134, "epoch": 0.21824740718472868, "flos": 30233289335040.0, "grad_norm": 1.6317924044822956, "language_loss": 0.71884358, "learning_rate": 3.63963709145597e-06, "loss": 0.79691756, "num_input_tokens_seen": 78326470, "router_z_loss_clip": 2.38671875, "router_z_loss_mlp": 0.20446777, "step": 3630, "time_per_iteration": 2.651095390319824 }, { "auxiliary_loss_clip": 0.06521121, "auxiliary_loss_mlp": 0.01281199, "balance_loss_clip": 0.06287343, "balance_loss_mlp": 0.0126147, "epoch": 0.21830753043739667, "flos": 26140860397440.0, "grad_norm": 1.814367752285477, "language_loss": 0.77100915, "learning_rate": 3.6394140454996544e-06, "loss": 0.84903234, "num_input_tokens_seen": 78345810, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.19726562, "step": 3631, "time_per_iteration": 2.5982818603515625 }, { "auxiliary_loss_clip": 0.06529677, "auxiliary_loss_mlp": 0.01282348, "balance_loss_clip": 0.06289969, "balance_loss_mlp": 0.01262154, "epoch": 0.21836765369006464, "flos": 21726299237760.0, "grad_norm": 3.4998104428518837, "language_loss": 0.75879145, "learning_rate": 3.639190937376594e-06, "loss": 0.83691168, "num_input_tokens_seen": 78364085, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.20178223, "step": 3632, "time_per_iteration": 2.559133529663086 }, { "auxiliary_loss_clip": 0.06526562, "auxiliary_loss_mlp": 0.01282204, "balance_loss_clip": 0.06290821, "balance_loss_mlp": 0.01263644, "epoch": 0.2184277769427326, "flos": 19943678350080.0, "grad_norm": 2.0980325881202626, "language_loss": 0.84759092, "learning_rate": 3.638967767095249e-06, "loss": 0.92567861, "num_input_tokens_seen": 78381385, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.18579102, "step": 3633, "time_per_iteration": 2.5541462898254395 }, { "auxiliary_loss_clip": 0.06527165, "auxiliary_loss_mlp": 0.01280488, "balance_loss_clip": 0.06288152, "balance_loss_mlp": 0.01259889, "epoch": 0.21848790019540057, "flos": 20346591507840.0, "grad_norm": 1.7857604509390919, "language_loss": 0.81934267, "learning_rate": 3.6387445346640823e-06, "loss": 0.89741921, "num_input_tokens_seen": 78400500, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.20605469, "step": 3634, "time_per_iteration": 2.5690348148345947 }, { "auxiliary_loss_clip": 0.06535117, "auxiliary_loss_mlp": 0.01274404, "balance_loss_clip": 0.06291129, "balance_loss_mlp": 0.01254246, "epoch": 0.21854802344806853, "flos": 15456302392320.0, "grad_norm": 1.9216654294276838, "language_loss": 0.75901735, "learning_rate": 3.638521240091558e-06, "loss": 0.83711255, "num_input_tokens_seen": 78418340, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.20153809, "step": 3635, "time_per_iteration": 2.5285212993621826 }, { "auxiliary_loss_clip": 0.06530987, "auxiliary_loss_mlp": 0.01285588, "balance_loss_clip": 0.06295321, "balance_loss_mlp": 0.01265751, "epoch": 0.2186081467007365, "flos": 16325384140800.0, "grad_norm": 2.074029019740913, "language_loss": 0.88654709, "learning_rate": 3.6382978833861445e-06, "loss": 0.96471286, "num_input_tokens_seen": 78434375, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.19848633, "step": 3636, "time_per_iteration": 2.5157923698425293 }, { "auxiliary_loss_clip": 0.06534147, "auxiliary_loss_mlp": 0.01286864, "balance_loss_clip": 0.06295823, "balance_loss_mlp": 0.01266658, "epoch": 0.2186682699534045, "flos": 21695677770240.0, "grad_norm": 2.2254009033103013, "language_loss": 0.77123809, "learning_rate": 3.638074464556311e-06, "loss": 0.8494482, "num_input_tokens_seen": 78451735, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.2019043, "step": 3637, "time_per_iteration": 2.5432002544403076 }, { "auxiliary_loss_clip": 0.06538547, "auxiliary_loss_mlp": 0.0128272, "balance_loss_clip": 0.06294604, "balance_loss_mlp": 0.01261167, "epoch": 0.21872839320607246, "flos": 17743427913600.0, "grad_norm": 2.433103513156196, "language_loss": 0.90053916, "learning_rate": 3.63785098361053e-06, "loss": 0.97875178, "num_input_tokens_seen": 78462730, "router_z_loss_clip": 2.43945312, "router_z_loss_mlp": 0.21557617, "step": 3638, "time_per_iteration": 2.4924209117889404 }, { "auxiliary_loss_clip": 0.06528364, "auxiliary_loss_mlp": 0.01293063, "balance_loss_clip": 0.06288786, "balance_loss_mlp": 0.01271843, "epoch": 0.21878851645874042, "flos": 18656757417600.0, "grad_norm": 2.478839572642557, "language_loss": 0.90775526, "learning_rate": 3.637627440557275e-06, "loss": 0.98596954, "num_input_tokens_seen": 78476300, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.21240234, "step": 3639, "time_per_iteration": 2.493516206741333 }, { "auxiliary_loss_clip": 0.06524144, "auxiliary_loss_mlp": 0.01289112, "balance_loss_clip": 0.06289467, "balance_loss_mlp": 0.0126949, "epoch": 0.2188486397114084, "flos": 25564463360640.0, "grad_norm": 1.724584847343082, "language_loss": 0.8020941, "learning_rate": 3.637403835405024e-06, "loss": 0.88022667, "num_input_tokens_seen": 78496135, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.19628906, "step": 3640, "time_per_iteration": 2.5932133197784424 }, { "auxiliary_loss_clip": 0.06537972, "auxiliary_loss_mlp": 0.01296476, "balance_loss_clip": 0.06298008, "balance_loss_mlp": 0.01275137, "epoch": 0.21890876296407635, "flos": 17897400990720.0, "grad_norm": 1.8175044620280119, "language_loss": 0.73490494, "learning_rate": 3.637180168162255e-06, "loss": 0.81324935, "num_input_tokens_seen": 78513855, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.21337891, "step": 3641, "time_per_iteration": 2.5363080501556396 }, { "auxiliary_loss_clip": 0.06530178, "auxiliary_loss_mlp": 0.01281433, "balance_loss_clip": 0.06291931, "balance_loss_mlp": 0.01261596, "epoch": 0.21896888621674432, "flos": 17754915922560.0, "grad_norm": 1.9520586527813595, "language_loss": 0.81872988, "learning_rate": 3.63695643883745e-06, "loss": 0.896846, "num_input_tokens_seen": 78531740, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.19836426, "step": 3642, "time_per_iteration": 2.5261178016662598 }, { "auxiliary_loss_clip": 0.06534447, "auxiliary_loss_mlp": 0.01292867, "balance_loss_clip": 0.06294177, "balance_loss_mlp": 0.01270706, "epoch": 0.21902900946941228, "flos": 23082890440320.0, "grad_norm": 1.6626889313990822, "language_loss": 0.72026634, "learning_rate": 3.6367326474390928e-06, "loss": 0.79853952, "num_input_tokens_seen": 78549600, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.22143555, "step": 3643, "time_per_iteration": 2.5870981216430664 }, { "auxiliary_loss_clip": 0.06529056, "auxiliary_loss_mlp": 0.01286971, "balance_loss_clip": 0.06288774, "balance_loss_mlp": 0.01266813, "epoch": 0.21908913272208028, "flos": 48189501492480.0, "grad_norm": 2.3204260912338333, "language_loss": 0.68747199, "learning_rate": 3.6365087939756696e-06, "loss": 0.76563227, "num_input_tokens_seen": 78573350, "router_z_loss_clip": 2.40429688, "router_z_loss_mlp": 0.20153809, "step": 3644, "time_per_iteration": 2.7999022006988525 }, { "auxiliary_loss_clip": 0.06541285, "auxiliary_loss_mlp": 0.01280167, "balance_loss_clip": 0.06291512, "balance_loss_mlp": 0.01259043, "epoch": 0.21914925597474824, "flos": 22243298129280.0, "grad_norm": 2.767419238834062, "language_loss": 0.77541828, "learning_rate": 3.636284878455669e-06, "loss": 0.85363281, "num_input_tokens_seen": 78591005, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.21130371, "step": 3645, "time_per_iteration": 2.5756845474243164 }, { "auxiliary_loss_clip": 0.06524716, "auxiliary_loss_mlp": 0.01281258, "balance_loss_clip": 0.06291951, "balance_loss_mlp": 0.01262352, "epoch": 0.2192093792274162, "flos": 22131853799040.0, "grad_norm": 1.5795635666043106, "language_loss": 0.82787073, "learning_rate": 3.636060900887582e-06, "loss": 0.90593052, "num_input_tokens_seen": 78610645, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.18908691, "step": 3646, "time_per_iteration": 2.5762534141540527 }, { "auxiliary_loss_clip": 0.06531642, "auxiliary_loss_mlp": 0.01278591, "balance_loss_clip": 0.06293541, "balance_loss_mlp": 0.01259375, "epoch": 0.21926950248008417, "flos": 15674914494720.0, "grad_norm": 1.5919986801129418, "language_loss": 0.8356421, "learning_rate": 3.635836861279901e-06, "loss": 0.91374445, "num_input_tokens_seen": 78628340, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.1920166, "step": 3647, "time_per_iteration": 3.9646642208099365 }, { "auxiliary_loss_clip": 0.06528972, "auxiliary_loss_mlp": 0.01277898, "balance_loss_clip": 0.06290658, "balance_loss_mlp": 0.01257179, "epoch": 0.21932962573275214, "flos": 30270199858560.0, "grad_norm": 1.6842000609864876, "language_loss": 0.72759807, "learning_rate": 3.635612759641123e-06, "loss": 0.8056668, "num_input_tokens_seen": 78649355, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.20715332, "step": 3648, "time_per_iteration": 2.6254184246063232 }, { "auxiliary_loss_clip": 0.06538042, "auxiliary_loss_mlp": 0.0127532, "balance_loss_clip": 0.06291214, "balance_loss_mlp": 0.01254029, "epoch": 0.2193897489854201, "flos": 10784751160320.0, "grad_norm": 2.2799741342696263, "language_loss": 0.75042784, "learning_rate": 3.635388595979745e-06, "loss": 0.82856143, "num_input_tokens_seen": 78664915, "router_z_loss_clip": 2.46679688, "router_z_loss_mlp": 0.2130127, "step": 3649, "time_per_iteration": 2.5226521492004395 }, { "auxiliary_loss_clip": 0.0653595, "auxiliary_loss_mlp": 0.0128235, "balance_loss_clip": 0.06299349, "balance_loss_mlp": 0.01262418, "epoch": 0.21944987223808807, "flos": 19138984064640.0, "grad_norm": 1.844073291847362, "language_loss": 0.86565584, "learning_rate": 3.635164370304267e-06, "loss": 0.94383878, "num_input_tokens_seen": 78681475, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.19921875, "step": 3650, "time_per_iteration": 2.5254459381103516 }, { "auxiliary_loss_clip": 0.06547211, "auxiliary_loss_mlp": 0.01277118, "balance_loss_clip": 0.06304903, "balance_loss_mlp": 0.01256364, "epoch": 0.21950999549075606, "flos": 22717726346880.0, "grad_norm": 1.8002111778121337, "language_loss": 0.84584236, "learning_rate": 3.6349400826231927e-06, "loss": 0.92408574, "num_input_tokens_seen": 78702300, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.20739746, "step": 3651, "time_per_iteration": 3.999859094619751 }, { "auxiliary_loss_clip": 0.06542894, "auxiliary_loss_mlp": 0.01275983, "balance_loss_clip": 0.06303098, "balance_loss_mlp": 0.01255407, "epoch": 0.21957011874342403, "flos": 10565929422720.0, "grad_norm": 1.9838158482678825, "language_loss": 0.75259537, "learning_rate": 3.634715732945027e-06, "loss": 0.83078414, "num_input_tokens_seen": 78720230, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.20568848, "step": 3652, "time_per_iteration": 2.5221917629241943 }, { "auxiliary_loss_clip": 0.06438021, "auxiliary_loss_mlp": 0.01259548, "balance_loss_clip": 0.0629739, "balance_loss_mlp": 0.01253022, "epoch": 0.219630241996092, "flos": 65765105677440.0, "grad_norm": 0.7214857133577021, "language_loss": 0.51535076, "learning_rate": 3.6344913212782764e-06, "loss": 0.5923264, "num_input_tokens_seen": 78780200, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.06536865, "step": 3653, "time_per_iteration": 3.168501138687134 }, { "auxiliary_loss_clip": 0.06546018, "auxiliary_loss_mlp": 0.01279029, "balance_loss_clip": 0.06306429, "balance_loss_mlp": 0.01258405, "epoch": 0.21969036524875996, "flos": 23703367524480.0, "grad_norm": 1.7398555421517021, "language_loss": 0.7610445, "learning_rate": 3.6342668476314514e-06, "loss": 0.83929503, "num_input_tokens_seen": 78800575, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.20617676, "step": 3654, "time_per_iteration": 2.5733771324157715 }, { "auxiliary_loss_clip": 0.06550446, "auxiliary_loss_mlp": 0.01279876, "balance_loss_clip": 0.06304744, "balance_loss_mlp": 0.0125781, "epoch": 0.21975048850142792, "flos": 19646130101760.0, "grad_norm": 1.8323703877868611, "language_loss": 0.73437196, "learning_rate": 3.634042312013064e-06, "loss": 0.81267512, "num_input_tokens_seen": 78819585, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.22070312, "step": 3655, "time_per_iteration": 2.5548410415649414 }, { "auxiliary_loss_clip": 0.0654039, "auxiliary_loss_mlp": 0.01277397, "balance_loss_clip": 0.06301614, "balance_loss_mlp": 0.01258109, "epoch": 0.21981061175409589, "flos": 22453944094080.0, "grad_norm": 13.687802840907755, "language_loss": 0.81823075, "learning_rate": 3.6338177144316276e-06, "loss": 0.89640862, "num_input_tokens_seen": 78837330, "router_z_loss_clip": 2.38671875, "router_z_loss_mlp": 0.19274902, "step": 3656, "time_per_iteration": 2.5730042457580566 }, { "auxiliary_loss_clip": 0.06543394, "auxiliary_loss_mlp": 0.01280353, "balance_loss_clip": 0.06302257, "balance_loss_mlp": 0.01259336, "epoch": 0.21987073500676388, "flos": 18157032466560.0, "grad_norm": 2.0673820853487945, "language_loss": 0.85640848, "learning_rate": 3.63359305489566e-06, "loss": 0.93464589, "num_input_tokens_seen": 78854955, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.21032715, "step": 3657, "time_per_iteration": 3.955451011657715 }, { "auxiliary_loss_clip": 0.0655624, "auxiliary_loss_mlp": 0.01277606, "balance_loss_clip": 0.06311181, "balance_loss_mlp": 0.0125678, "epoch": 0.21993085825943184, "flos": 25632666184320.0, "grad_norm": 1.6564138903329624, "language_loss": 0.81090295, "learning_rate": 3.6333683334136803e-06, "loss": 0.88924146, "num_input_tokens_seen": 78874965, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.20825195, "step": 3658, "time_per_iteration": 2.5946433544158936 }, { "auxiliary_loss_clip": 0.06426056, "auxiliary_loss_mlp": 0.01254104, "balance_loss_clip": 0.06287727, "balance_loss_mlp": 0.01247804, "epoch": 0.2199909815120998, "flos": 70946429621760.0, "grad_norm": 0.7555753263110757, "language_loss": 0.58069468, "learning_rate": 3.6331435499942095e-06, "loss": 0.65749621, "num_input_tokens_seen": 78937740, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.0630188, "step": 3659, "time_per_iteration": 3.2630372047424316 }, { "auxiliary_loss_clip": 0.06538951, "auxiliary_loss_mlp": 0.01278312, "balance_loss_clip": 0.06299874, "balance_loss_mlp": 0.01258976, "epoch": 0.22005110476476777, "flos": 21549964320000.0, "grad_norm": 2.345769501832032, "language_loss": 0.75039589, "learning_rate": 3.632918704645772e-06, "loss": 0.82856852, "num_input_tokens_seen": 78955055, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.19348145, "step": 3660, "time_per_iteration": 4.047354221343994 }, { "auxiliary_loss_clip": 0.06538473, "auxiliary_loss_mlp": 0.01275137, "balance_loss_clip": 0.06296606, "balance_loss_mlp": 0.01255575, "epoch": 0.22011122801743574, "flos": 22061051498880.0, "grad_norm": 1.6363942956717492, "language_loss": 0.81645459, "learning_rate": 3.632693797376893e-06, "loss": 0.89459068, "num_input_tokens_seen": 78974895, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.19555664, "step": 3661, "time_per_iteration": 2.566891670227051 }, { "auxiliary_loss_clip": 0.06530106, "auxiliary_loss_mlp": 0.01276082, "balance_loss_clip": 0.06290665, "balance_loss_mlp": 0.01257342, "epoch": 0.2201713512701037, "flos": 26694811739520.0, "grad_norm": 1.781401596434973, "language_loss": 0.74019194, "learning_rate": 3.632468828196102e-06, "loss": 0.81825387, "num_input_tokens_seen": 78994990, "router_z_loss_clip": 2.39453125, "router_z_loss_mlp": 0.18737793, "step": 3662, "time_per_iteration": 2.618195056915283 }, { "auxiliary_loss_clip": 0.06527928, "auxiliary_loss_mlp": 0.0127793, "balance_loss_clip": 0.06291416, "balance_loss_mlp": 0.01259525, "epoch": 0.22023147452277167, "flos": 22168470833280.0, "grad_norm": 1.8099584556416783, "language_loss": 0.79452562, "learning_rate": 3.632243797111929e-06, "loss": 0.87258422, "num_input_tokens_seen": 79014405, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.18408203, "step": 3663, "time_per_iteration": 2.5631372928619385 }, { "auxiliary_loss_clip": 0.06543434, "auxiliary_loss_mlp": 0.01279662, "balance_loss_clip": 0.06297661, "balance_loss_mlp": 0.0125861, "epoch": 0.22029159777543966, "flos": 22528981025280.0, "grad_norm": 1.7593887835408928, "language_loss": 0.81094325, "learning_rate": 3.632018704132908e-06, "loss": 0.88917416, "num_input_tokens_seen": 79032375, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.21069336, "step": 3664, "time_per_iteration": 2.5548200607299805 }, { "auxiliary_loss_clip": 0.0654851, "auxiliary_loss_mlp": 0.01275782, "balance_loss_clip": 0.06297112, "balance_loss_mlp": 0.01254277, "epoch": 0.22035172102810763, "flos": 13047502343040.0, "grad_norm": 2.604054929549286, "language_loss": 0.77621478, "learning_rate": 3.6317935492675742e-06, "loss": 0.85445768, "num_input_tokens_seen": 79049635, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.21508789, "step": 3665, "time_per_iteration": 2.5286803245544434 }, { "auxiliary_loss_clip": 0.06533431, "auxiliary_loss_mlp": 0.01277168, "balance_loss_clip": 0.06294713, "balance_loss_mlp": 0.0125732, "epoch": 0.2204118442807756, "flos": 12170538311040.0, "grad_norm": 2.5860343794341976, "language_loss": 0.99244887, "learning_rate": 3.631568332524466e-06, "loss": 1.07055485, "num_input_tokens_seen": 79062890, "router_z_loss_clip": 2.38671875, "router_z_loss_mlp": 0.1986084, "step": 3666, "time_per_iteration": 2.5128514766693115 }, { "auxiliary_loss_clip": 0.06535064, "auxiliary_loss_mlp": 0.01277903, "balance_loss_clip": 0.0629504, "balance_loss_mlp": 0.01257267, "epoch": 0.22047196753344356, "flos": 40117345758720.0, "grad_norm": 1.5858397901934924, "language_loss": 0.81408811, "learning_rate": 3.631343053912122e-06, "loss": 0.89221776, "num_input_tokens_seen": 79085495, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.2064209, "step": 3667, "time_per_iteration": 2.715712308883667 }, { "auxiliary_loss_clip": 0.06540576, "auxiliary_loss_mlp": 0.01282346, "balance_loss_clip": 0.06294885, "balance_loss_mlp": 0.01259982, "epoch": 0.22053209078611152, "flos": 20706892064640.0, "grad_norm": 1.7028465619961088, "language_loss": 0.77920425, "learning_rate": 3.631117713439087e-06, "loss": 0.85743344, "num_input_tokens_seen": 79101820, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.22363281, "step": 3668, "time_per_iteration": 2.5430703163146973 }, { "auxiliary_loss_clip": 0.0653294, "auxiliary_loss_mlp": 0.01281112, "balance_loss_clip": 0.06291984, "balance_loss_mlp": 0.01260763, "epoch": 0.2205922140387795, "flos": 24723026259840.0, "grad_norm": 1.5412538281538215, "language_loss": 0.71935201, "learning_rate": 3.630892311113904e-06, "loss": 0.79749262, "num_input_tokens_seen": 79123320, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.20361328, "step": 3669, "time_per_iteration": 2.681663990020752 }, { "auxiliary_loss_clip": 0.06532732, "auxiliary_loss_mlp": 0.01278718, "balance_loss_clip": 0.06293084, "balance_loss_mlp": 0.01259251, "epoch": 0.22065233729144745, "flos": 23484000735360.0, "grad_norm": 1.561158322372851, "language_loss": 0.85816491, "learning_rate": 3.6306668469451215e-06, "loss": 0.93627942, "num_input_tokens_seen": 79141615, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.19458008, "step": 3670, "time_per_iteration": 2.5875813961029053 }, { "auxiliary_loss_clip": 0.0653331, "auxiliary_loss_mlp": 0.01276083, "balance_loss_clip": 0.06289626, "balance_loss_mlp": 0.012567, "epoch": 0.22071246054411545, "flos": 35234268094080.0, "grad_norm": 2.085041458749133, "language_loss": 0.77396405, "learning_rate": 3.6304413209412886e-06, "loss": 0.85205793, "num_input_tokens_seen": 79164910, "router_z_loss_clip": 2.43554688, "router_z_loss_mlp": 0.19384766, "step": 3671, "time_per_iteration": 2.684621572494507 }, { "auxiliary_loss_clip": 0.06525455, "auxiliary_loss_mlp": 0.01278241, "balance_loss_clip": 0.06285382, "balance_loss_mlp": 0.01259013, "epoch": 0.2207725837967834, "flos": 18156151998720.0, "grad_norm": 2.8719189367108817, "language_loss": 0.81151974, "learning_rate": 3.6302157331109573e-06, "loss": 0.88955677, "num_input_tokens_seen": 79179685, "router_z_loss_clip": 2.40429688, "router_z_loss_mlp": 0.19226074, "step": 3672, "time_per_iteration": 2.5245234966278076 }, { "auxiliary_loss_clip": 0.06534448, "auxiliary_loss_mlp": 0.01282432, "balance_loss_clip": 0.06293324, "balance_loss_mlp": 0.01262811, "epoch": 0.22083270704945138, "flos": 20484967726080.0, "grad_norm": 3.6213251886935267, "language_loss": 0.74259704, "learning_rate": 3.629990083462682e-06, "loss": 0.82076585, "num_input_tokens_seen": 79196285, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.19628906, "step": 3673, "time_per_iteration": 2.671854257583618 }, { "auxiliary_loss_clip": 0.06531164, "auxiliary_loss_mlp": 0.01278043, "balance_loss_clip": 0.06291219, "balance_loss_mlp": 0.01256943, "epoch": 0.22089283030211934, "flos": 34133451079680.0, "grad_norm": 1.8162432296170876, "language_loss": 0.77084237, "learning_rate": 3.6297643720050203e-06, "loss": 0.84893441, "num_input_tokens_seen": 79216060, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.2109375, "step": 3674, "time_per_iteration": 2.673855781555176 }, { "auxiliary_loss_clip": 0.06526837, "auxiliary_loss_mlp": 0.01280348, "balance_loss_clip": 0.0628792, "balance_loss_mlp": 0.01258545, "epoch": 0.2209529535547873, "flos": 18083043711360.0, "grad_norm": 2.8067669413988607, "language_loss": 0.75675774, "learning_rate": 3.6295385987465293e-06, "loss": 0.83482963, "num_input_tokens_seen": 79235145, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.21826172, "step": 3675, "time_per_iteration": 2.550398826599121 }, { "auxiliary_loss_clip": 0.06532901, "auxiliary_loss_mlp": 0.0127797, "balance_loss_clip": 0.06291817, "balance_loss_mlp": 0.01257144, "epoch": 0.22101307680745527, "flos": 27242725587840.0, "grad_norm": 1.7031413100258683, "language_loss": 0.80920529, "learning_rate": 3.629312763695772e-06, "loss": 0.88731396, "num_input_tokens_seen": 79256960, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.20825195, "step": 3676, "time_per_iteration": 2.605663776397705 }, { "auxiliary_loss_clip": 0.06531121, "auxiliary_loss_mlp": 0.01283627, "balance_loss_clip": 0.06285219, "balance_loss_mlp": 0.01263183, "epoch": 0.22107320006012326, "flos": 16548566290560.0, "grad_norm": 2.2734490527681226, "language_loss": 0.76307714, "learning_rate": 3.6290868668613107e-06, "loss": 0.84122467, "num_input_tokens_seen": 79274860, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.20458984, "step": 3677, "time_per_iteration": 2.529816150665283 }, { "auxiliary_loss_clip": 0.06527799, "auxiliary_loss_mlp": 0.01278821, "balance_loss_clip": 0.06287477, "balance_loss_mlp": 0.01258901, "epoch": 0.22113332331279123, "flos": 22061009571840.0, "grad_norm": 1.6230697601940747, "language_loss": 0.83705479, "learning_rate": 3.628860908251712e-06, "loss": 0.91512096, "num_input_tokens_seen": 79294005, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.19921875, "step": 3678, "time_per_iteration": 2.579233169555664 }, { "auxiliary_loss_clip": 0.06529523, "auxiliary_loss_mlp": 0.0128605, "balance_loss_clip": 0.06292619, "balance_loss_mlp": 0.01265308, "epoch": 0.2211934465654592, "flos": 26619690954240.0, "grad_norm": 1.9087640588771113, "language_loss": 0.89600533, "learning_rate": 3.6286348878755452e-06, "loss": 0.97416109, "num_input_tokens_seen": 79314005, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.20727539, "step": 3679, "time_per_iteration": 2.603208303451538 }, { "auxiliary_loss_clip": 0.06535642, "auxiliary_loss_mlp": 0.01290096, "balance_loss_clip": 0.06292568, "balance_loss_mlp": 0.01268782, "epoch": 0.22125356981812716, "flos": 16365564973440.0, "grad_norm": 2.2974097630764936, "language_loss": 0.87147868, "learning_rate": 3.6284088057413803e-06, "loss": 0.94973606, "num_input_tokens_seen": 79331030, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.2130127, "step": 3680, "time_per_iteration": 2.5288589000701904 }, { "auxiliary_loss_clip": 0.0653135, "auxiliary_loss_mlp": 0.01282738, "balance_loss_clip": 0.06295027, "balance_loss_mlp": 0.01261233, "epoch": 0.22131369307079513, "flos": 21657257873280.0, "grad_norm": 2.676404060263874, "language_loss": 0.82221341, "learning_rate": 3.6281826618577894e-06, "loss": 0.90035433, "num_input_tokens_seen": 79348560, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.21496582, "step": 3681, "time_per_iteration": 2.551560401916504 }, { "auxiliary_loss_clip": 0.06518092, "auxiliary_loss_mlp": 0.0128227, "balance_loss_clip": 0.06288147, "balance_loss_mlp": 0.01263387, "epoch": 0.2213738163234631, "flos": 19615592488320.0, "grad_norm": 2.074191205834416, "language_loss": 0.8046453, "learning_rate": 3.62795645623335e-06, "loss": 0.88264894, "num_input_tokens_seen": 79367175, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.1887207, "step": 3682, "time_per_iteration": 2.5428214073181152 }, { "auxiliary_loss_clip": 0.06530824, "auxiliary_loss_mlp": 0.01281044, "balance_loss_clip": 0.0628926, "balance_loss_mlp": 0.01260027, "epoch": 0.22143393957613106, "flos": 23630217310080.0, "grad_norm": 1.7783515605613738, "language_loss": 0.7824468, "learning_rate": 3.627730188876638e-06, "loss": 0.86056542, "num_input_tokens_seen": 79388435, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.21008301, "step": 3683, "time_per_iteration": 2.5866498947143555 }, { "auxiliary_loss_clip": 0.0653791, "auxiliary_loss_mlp": 0.01287267, "balance_loss_clip": 0.06294495, "balance_loss_mlp": 0.01266907, "epoch": 0.22149406282879905, "flos": 26185108152960.0, "grad_norm": 2.143442562601323, "language_loss": 0.73677337, "learning_rate": 3.627503859796234e-06, "loss": 0.81502515, "num_input_tokens_seen": 79407910, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.20373535, "step": 3684, "time_per_iteration": 2.5960347652435303 }, { "auxiliary_loss_clip": 0.06529492, "auxiliary_loss_mlp": 0.01288988, "balance_loss_clip": 0.06290586, "balance_loss_mlp": 0.01268532, "epoch": 0.221554186081467, "flos": 14544104918400.0, "grad_norm": 1.956863836540981, "language_loss": 0.80626702, "learning_rate": 3.6272774690007207e-06, "loss": 0.88445181, "num_input_tokens_seen": 79424020, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.20458984, "step": 3685, "time_per_iteration": 2.522022247314453 }, { "auxiliary_loss_clip": 0.06524929, "auxiliary_loss_mlp": 0.01278716, "balance_loss_clip": 0.06290641, "balance_loss_mlp": 0.01260453, "epoch": 0.22161430933413498, "flos": 22245059064960.0, "grad_norm": 1.374328707338179, "language_loss": 0.87759328, "learning_rate": 3.6270510164986823e-06, "loss": 0.95562965, "num_input_tokens_seen": 79445605, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.18273926, "step": 3686, "time_per_iteration": 2.5956871509552 }, { "auxiliary_loss_clip": 0.06527875, "auxiliary_loss_mlp": 0.01281623, "balance_loss_clip": 0.06291829, "balance_loss_mlp": 0.01260844, "epoch": 0.22167443258680294, "flos": 23483162194560.0, "grad_norm": 2.0405278165731193, "language_loss": 0.78138971, "learning_rate": 3.626824502298707e-06, "loss": 0.85948467, "num_input_tokens_seen": 79463850, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.20776367, "step": 3687, "time_per_iteration": 3.977025032043457 }, { "auxiliary_loss_clip": 0.06538539, "auxiliary_loss_mlp": 0.0127691, "balance_loss_clip": 0.06291303, "balance_loss_mlp": 0.01255369, "epoch": 0.2217345558394709, "flos": 23227723422720.0, "grad_norm": 1.6880852598747236, "language_loss": 0.85011673, "learning_rate": 3.626597926409383e-06, "loss": 0.92827117, "num_input_tokens_seen": 79482845, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.2154541, "step": 3688, "time_per_iteration": 2.587298631668091 }, { "auxiliary_loss_clip": 0.06534155, "auxiliary_loss_mlp": 0.01274775, "balance_loss_clip": 0.06289543, "balance_loss_mlp": 0.0125352, "epoch": 0.22179467909213887, "flos": 20017247834880.0, "grad_norm": 1.7952662183320591, "language_loss": 0.81769907, "learning_rate": 3.6263712888393027e-06, "loss": 0.89578837, "num_input_tokens_seen": 79501550, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.21252441, "step": 3689, "time_per_iteration": 2.557556390762329 }, { "auxiliary_loss_clip": 0.06530317, "auxiliary_loss_mlp": 0.01277279, "balance_loss_clip": 0.06293775, "balance_loss_mlp": 0.01257538, "epoch": 0.22185480234480687, "flos": 19689203900160.0, "grad_norm": 1.8620173001855242, "language_loss": 0.70974314, "learning_rate": 3.626144589597061e-06, "loss": 0.78781915, "num_input_tokens_seen": 79519680, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.19750977, "step": 3690, "time_per_iteration": 2.551992893218994 }, { "auxiliary_loss_clip": 0.06534906, "auxiliary_loss_mlp": 0.01276749, "balance_loss_clip": 0.06291821, "balance_loss_mlp": 0.01255589, "epoch": 0.22191492559747483, "flos": 21987817430400.0, "grad_norm": 1.7782245643914376, "language_loss": 0.73327625, "learning_rate": 3.6259178286912528e-06, "loss": 0.81139278, "num_input_tokens_seen": 79539000, "router_z_loss_clip": 2.43359375, "router_z_loss_mlp": 0.21166992, "step": 3691, "time_per_iteration": 4.023361444473267 }, { "auxiliary_loss_clip": 0.06536843, "auxiliary_loss_mlp": 0.01273054, "balance_loss_clip": 0.06299391, "balance_loss_mlp": 0.01252395, "epoch": 0.2219750488501428, "flos": 23228813525760.0, "grad_norm": 2.202961798683009, "language_loss": 0.72175717, "learning_rate": 3.625691006130477e-06, "loss": 0.79985607, "num_input_tokens_seen": 79559695, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.20654297, "step": 3692, "time_per_iteration": 2.565483808517456 }, { "auxiliary_loss_clip": 0.06538771, "auxiliary_loss_mlp": 0.01275489, "balance_loss_clip": 0.06296705, "balance_loss_mlp": 0.01254544, "epoch": 0.22203517210281076, "flos": 22459939660800.0, "grad_norm": 1.5531590503764354, "language_loss": 0.87422633, "learning_rate": 3.6254641219233362e-06, "loss": 0.95236897, "num_input_tokens_seen": 79579095, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.20959473, "step": 3693, "time_per_iteration": 2.560406446456909 }, { "auxiliary_loss_clip": 0.06534384, "auxiliary_loss_mlp": 0.01277236, "balance_loss_clip": 0.0629995, "balance_loss_mlp": 0.0125815, "epoch": 0.22209529535547873, "flos": 17569985961600.0, "grad_norm": 1.8959506077171813, "language_loss": 0.86265743, "learning_rate": 3.6252371760784325e-06, "loss": 0.94077367, "num_input_tokens_seen": 79596430, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.19091797, "step": 3694, "time_per_iteration": 2.535881996154785 }, { "auxiliary_loss_clip": 0.06550194, "auxiliary_loss_mlp": 0.01276982, "balance_loss_clip": 0.06301348, "balance_loss_mlp": 0.01256443, "epoch": 0.2221554186081467, "flos": 21475178951040.0, "grad_norm": 2.363368966779423, "language_loss": 0.70211136, "learning_rate": 3.6250101686043725e-06, "loss": 0.78038311, "num_input_tokens_seen": 79615825, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.20544434, "step": 3695, "time_per_iteration": 2.564246654510498 }, { "auxiliary_loss_clip": 0.06533713, "auxiliary_loss_mlp": 0.01278055, "balance_loss_clip": 0.0630118, "balance_loss_mlp": 0.01258934, "epoch": 0.22221554186081466, "flos": 27680956041600.0, "grad_norm": 1.5674962866484947, "language_loss": 0.72048235, "learning_rate": 3.6247830995097637e-06, "loss": 0.79860002, "num_input_tokens_seen": 79637875, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.19128418, "step": 3696, "time_per_iteration": 2.6091113090515137 }, { "auxiliary_loss_clip": 0.06539566, "auxiliary_loss_mlp": 0.01274523, "balance_loss_clip": 0.06302606, "balance_loss_mlp": 0.01254115, "epoch": 0.22227566511348265, "flos": 25966202561280.0, "grad_norm": 1.6032911137285226, "language_loss": 0.8797543, "learning_rate": 3.624555968803217e-06, "loss": 0.95789522, "num_input_tokens_seen": 79656970, "router_z_loss_clip": 2.3671875, "router_z_loss_mlp": 0.20410156, "step": 3697, "time_per_iteration": 3.9965100288391113 }, { "auxiliary_loss_clip": 0.06539309, "auxiliary_loss_mlp": 0.01280442, "balance_loss_clip": 0.06309918, "balance_loss_mlp": 0.01261715, "epoch": 0.22233578836615062, "flos": 39213240203520.0, "grad_norm": 1.6827545224918221, "language_loss": 0.66600221, "learning_rate": 3.624328776493346e-06, "loss": 0.74419969, "num_input_tokens_seen": 79680275, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.18713379, "step": 3698, "time_per_iteration": 2.710693120956421 }, { "auxiliary_loss_clip": 0.06545047, "auxiliary_loss_mlp": 0.01279125, "balance_loss_clip": 0.06306616, "balance_loss_mlp": 0.01258549, "epoch": 0.22239591161881858, "flos": 36292682142720.0, "grad_norm": 1.6898819964444458, "language_loss": 0.83176666, "learning_rate": 3.6241015225887637e-06, "loss": 0.91000837, "num_input_tokens_seen": 79701255, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.20581055, "step": 3699, "time_per_iteration": 4.158734083175659 }, { "auxiliary_loss_clip": 0.06541517, "auxiliary_loss_mlp": 0.01276004, "balance_loss_clip": 0.06306174, "balance_loss_mlp": 0.01256216, "epoch": 0.22245603487148655, "flos": 19725779007360.0, "grad_norm": 2.0228767561231877, "language_loss": 0.80370533, "learning_rate": 3.62387420709809e-06, "loss": 0.88188052, "num_input_tokens_seen": 79721315, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.19787598, "step": 3700, "time_per_iteration": 2.559785842895508 }, { "auxiliary_loss_clip": 0.06554861, "auxiliary_loss_mlp": 0.01276594, "balance_loss_clip": 0.06310678, "balance_loss_mlp": 0.01256066, "epoch": 0.2225161581241545, "flos": 46290950081280.0, "grad_norm": 3.1192692263957373, "language_loss": 0.72796971, "learning_rate": 3.623646830029943e-06, "loss": 0.80628431, "num_input_tokens_seen": 79742705, "router_z_loss_clip": 2.43945312, "router_z_loss_mlp": 0.2052002, "step": 3701, "time_per_iteration": 2.769906759262085 }, { "auxiliary_loss_clip": 0.06540598, "auxiliary_loss_mlp": 0.01275117, "balance_loss_clip": 0.0630592, "balance_loss_mlp": 0.01256067, "epoch": 0.22257628137682248, "flos": 23702990181120.0, "grad_norm": 1.649170637822461, "language_loss": 0.80684054, "learning_rate": 3.6234193913929454e-06, "loss": 0.88499767, "num_input_tokens_seen": 79763000, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.19055176, "step": 3702, "time_per_iteration": 2.58807635307312 }, { "auxiliary_loss_clip": 0.06535593, "auxiliary_loss_mlp": 0.01278138, "balance_loss_clip": 0.06307916, "balance_loss_mlp": 0.0125885, "epoch": 0.22263640462949044, "flos": 19359986008320.0, "grad_norm": 1.8597490536410768, "language_loss": 0.78770435, "learning_rate": 3.623191891195723e-06, "loss": 0.86584163, "num_input_tokens_seen": 79781335, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.19274902, "step": 3703, "time_per_iteration": 2.561199426651001 }, { "auxiliary_loss_clip": 0.06548649, "auxiliary_loss_mlp": 0.012749, "balance_loss_clip": 0.06308694, "balance_loss_mlp": 0.01253884, "epoch": 0.22269652788215843, "flos": 20782138631040.0, "grad_norm": 2.071892208182786, "language_loss": 0.74948764, "learning_rate": 3.6229643294469005e-06, "loss": 0.82772315, "num_input_tokens_seen": 79800150, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.21020508, "step": 3704, "time_per_iteration": 2.58819580078125 }, { "auxiliary_loss_clip": 0.0654383, "auxiliary_loss_mlp": 0.01279088, "balance_loss_clip": 0.06313127, "balance_loss_mlp": 0.01260909, "epoch": 0.2227566511348264, "flos": 47969631578880.0, "grad_norm": 1.647616596349108, "language_loss": 0.64933491, "learning_rate": 3.6227367061551074e-06, "loss": 0.7275641, "num_input_tokens_seen": 79822390, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.18164062, "step": 3705, "time_per_iteration": 2.816148042678833 }, { "auxiliary_loss_clip": 0.06453681, "auxiliary_loss_mlp": 0.01300097, "balance_loss_clip": 0.06312761, "balance_loss_mlp": 0.01293475, "epoch": 0.22281677438749437, "flos": 66235676607360.0, "grad_norm": 1.1681138434216665, "language_loss": 0.65166306, "learning_rate": 3.6225090213289766e-06, "loss": 0.72920084, "num_input_tokens_seen": 79873350, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.06634521, "step": 3706, "time_per_iteration": 3.0593795776367188 }, { "auxiliary_loss_clip": 0.06542358, "auxiliary_loss_mlp": 0.01274727, "balance_loss_clip": 0.06305222, "balance_loss_mlp": 0.01255975, "epoch": 0.22287689764016233, "flos": 21878050181760.0, "grad_norm": 2.2878015343083704, "language_loss": 0.815633, "learning_rate": 3.622281274977141e-06, "loss": 0.89380383, "num_input_tokens_seen": 79891715, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.1875, "step": 3707, "time_per_iteration": 2.5719430446624756 }, { "auxiliary_loss_clip": 0.06544766, "auxiliary_loss_mlp": 0.01278075, "balance_loss_clip": 0.06310463, "balance_loss_mlp": 0.01258382, "epoch": 0.2229370208928303, "flos": 27679824011520.0, "grad_norm": 1.9508283307809566, "language_loss": 0.794819, "learning_rate": 3.6220534671082367e-06, "loss": 0.87304747, "num_input_tokens_seen": 79911175, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.19677734, "step": 3708, "time_per_iteration": 2.6220431327819824 }, { "auxiliary_loss_clip": 0.06542829, "auxiliary_loss_mlp": 0.01274915, "balance_loss_clip": 0.063043, "balance_loss_mlp": 0.01255889, "epoch": 0.22299714414549826, "flos": 30162612816000.0, "grad_norm": 2.6740282805676374, "language_loss": 0.8144201, "learning_rate": 3.6218255977309024e-06, "loss": 0.89259756, "num_input_tokens_seen": 79931875, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.19018555, "step": 3709, "time_per_iteration": 2.6442999839782715 }, { "auxiliary_loss_clip": 0.06542023, "auxiliary_loss_mlp": 0.01274104, "balance_loss_clip": 0.06301235, "balance_loss_mlp": 0.01254542, "epoch": 0.22305726739816625, "flos": 23148871130880.0, "grad_norm": 2.2317466537664714, "language_loss": 0.70054728, "learning_rate": 3.6215976668537787e-06, "loss": 0.77870852, "num_input_tokens_seen": 79952445, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 0.19555664, "step": 3710, "time_per_iteration": 2.5937726497650146 }, { "auxiliary_loss_clip": 0.0654363, "auxiliary_loss_mlp": 0.01279332, "balance_loss_clip": 0.06304824, "balance_loss_mlp": 0.01259233, "epoch": 0.22311739065083422, "flos": 19178116721280.0, "grad_norm": 2.6750332541336412, "language_loss": 0.91089606, "learning_rate": 3.6213696744855096e-06, "loss": 0.98912573, "num_input_tokens_seen": 79971030, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.20092773, "step": 3711, "time_per_iteration": 2.5507009029388428 }, { "auxiliary_loss_clip": 0.06536791, "auxiliary_loss_mlp": 0.01280069, "balance_loss_clip": 0.06296507, "balance_loss_mlp": 0.01259589, "epoch": 0.22317751390350218, "flos": 13621467611520.0, "grad_norm": 5.268919191736851, "language_loss": 0.89860821, "learning_rate": 3.6211416206347395e-06, "loss": 0.97677684, "num_input_tokens_seen": 79982085, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.20458984, "step": 3712, "time_per_iteration": 2.4713165760040283 }, { "auxiliary_loss_clip": 0.065349, "auxiliary_loss_mlp": 0.01272859, "balance_loss_clip": 0.06303404, "balance_loss_mlp": 0.01253523, "epoch": 0.22323763715617015, "flos": 11032643064960.0, "grad_norm": 3.9131004297122574, "language_loss": 0.75372005, "learning_rate": 3.620913505310117e-06, "loss": 0.8317976, "num_input_tokens_seen": 79997460, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.19348145, "step": 3713, "time_per_iteration": 2.504669189453125 }, { "auxiliary_loss_clip": 0.06534052, "auxiliary_loss_mlp": 0.01278589, "balance_loss_clip": 0.06297912, "balance_loss_mlp": 0.01259253, "epoch": 0.22329776040883811, "flos": 41360647841280.0, "grad_norm": 1.813551995813049, "language_loss": 0.63624305, "learning_rate": 3.6206853285202917e-06, "loss": 0.71436942, "num_input_tokens_seen": 80022450, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.19335938, "step": 3714, "time_per_iteration": 2.730003595352173 }, { "auxiliary_loss_clip": 0.06538044, "auxiliary_loss_mlp": 0.01278207, "balance_loss_clip": 0.06302144, "balance_loss_mlp": 0.01258895, "epoch": 0.22335788366150608, "flos": 25126568323200.0, "grad_norm": 2.3953785895353312, "language_loss": 0.79594123, "learning_rate": 3.6204570902739164e-06, "loss": 0.87410378, "num_input_tokens_seen": 80042100, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.1932373, "step": 3715, "time_per_iteration": 2.580397129058838 }, { "auxiliary_loss_clip": 0.06536861, "auxiliary_loss_mlp": 0.01280247, "balance_loss_clip": 0.06300189, "balance_loss_mlp": 0.01260816, "epoch": 0.22341800691417404, "flos": 16989144658560.0, "grad_norm": 3.1826645155988693, "language_loss": 0.77500772, "learning_rate": 3.620228790579645e-06, "loss": 0.8531788, "num_input_tokens_seen": 80059690, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.19433594, "step": 3716, "time_per_iteration": 2.5326740741729736 }, { "auxiliary_loss_clip": 0.06536853, "auxiliary_loss_mlp": 0.01279152, "balance_loss_clip": 0.06300043, "balance_loss_mlp": 0.01260496, "epoch": 0.22347813016684204, "flos": 14141904520320.0, "grad_norm": 2.2478269270225826, "language_loss": 0.8004092, "learning_rate": 3.6200004294461367e-06, "loss": 0.87856925, "num_input_tokens_seen": 80076060, "router_z_loss_clip": 2.37109375, "router_z_loss_mlp": 0.18640137, "step": 3717, "time_per_iteration": 2.615377426147461 }, { "auxiliary_loss_clip": 0.06540689, "auxiliary_loss_mlp": 0.01278894, "balance_loss_clip": 0.06298757, "balance_loss_mlp": 0.01258735, "epoch": 0.22353825341951, "flos": 23589323717760.0, "grad_norm": 2.1502776492774047, "language_loss": 0.68335962, "learning_rate": 3.6197720068820497e-06, "loss": 0.76155543, "num_input_tokens_seen": 80094760, "router_z_loss_clip": 2.41796875, "router_z_loss_mlp": 0.20153809, "step": 3718, "time_per_iteration": 2.584953784942627 }, { "auxiliary_loss_clip": 0.06540138, "auxiliary_loss_mlp": 0.01277164, "balance_loss_clip": 0.06299994, "balance_loss_mlp": 0.01256338, "epoch": 0.22359837667217797, "flos": 29831759769600.0, "grad_norm": 1.5238932442717186, "language_loss": 0.81220013, "learning_rate": 3.619543522896045e-06, "loss": 0.89037323, "num_input_tokens_seen": 80114475, "router_z_loss_clip": 2.40429688, "router_z_loss_mlp": 0.20800781, "step": 3719, "time_per_iteration": 2.6302947998046875 }, { "auxiliary_loss_clip": 0.06543876, "auxiliary_loss_mlp": 0.01282193, "balance_loss_clip": 0.0629981, "balance_loss_mlp": 0.01260521, "epoch": 0.22365849992484593, "flos": 17608867056000.0, "grad_norm": 2.482570789881206, "language_loss": 0.87120819, "learning_rate": 3.6193149774967885e-06, "loss": 0.94946885, "num_input_tokens_seen": 80132920, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.21655273, "step": 3720, "time_per_iteration": 2.5543906688690186 }, { "auxiliary_loss_clip": 0.06538113, "auxiliary_loss_mlp": 0.01278091, "balance_loss_clip": 0.0630574, "balance_loss_mlp": 0.01259006, "epoch": 0.2237186231775139, "flos": 22717558638720.0, "grad_norm": 1.9765458797090842, "language_loss": 0.7522943, "learning_rate": 3.619086370692945e-06, "loss": 0.83045638, "num_input_tokens_seen": 80152845, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.1907959, "step": 3721, "time_per_iteration": 2.708322048187256 }, { "auxiliary_loss_clip": 0.06542026, "auxiliary_loss_mlp": 0.01277324, "balance_loss_clip": 0.0629835, "balance_loss_mlp": 0.0125763, "epoch": 0.22377874643018186, "flos": 13376720234880.0, "grad_norm": 2.2911747605161388, "language_loss": 0.7979455, "learning_rate": 3.6188577024931844e-06, "loss": 0.87613899, "num_input_tokens_seen": 80170680, "router_z_loss_clip": 2.43359375, "router_z_loss_mlp": 0.19689941, "step": 3722, "time_per_iteration": 2.57924485206604 }, { "auxiliary_loss_clip": 0.06537658, "auxiliary_loss_mlp": 0.01279683, "balance_loss_clip": 0.0630227, "balance_loss_mlp": 0.01260931, "epoch": 0.22383886968284986, "flos": 17900797080960.0, "grad_norm": 2.255998115303415, "language_loss": 0.83517903, "learning_rate": 3.618628972906178e-06, "loss": 0.91335249, "num_input_tokens_seen": 80189030, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.1875, "step": 3723, "time_per_iteration": 2.569607734680176 }, { "auxiliary_loss_clip": 0.06534991, "auxiliary_loss_mlp": 0.0127394, "balance_loss_clip": 0.06294142, "balance_loss_mlp": 0.01254855, "epoch": 0.22389899293551782, "flos": 23886033425280.0, "grad_norm": 2.362844918549951, "language_loss": 0.85167098, "learning_rate": 3.6184001819405984e-06, "loss": 0.92976034, "num_input_tokens_seen": 80208365, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 0.19091797, "step": 3724, "time_per_iteration": 2.6068224906921387 }, { "auxiliary_loss_clip": 0.06535505, "auxiliary_loss_mlp": 0.01277147, "balance_loss_clip": 0.06296006, "balance_loss_mlp": 0.01257489, "epoch": 0.2239591161881858, "flos": 27279929600640.0, "grad_norm": 3.668827364562575, "language_loss": 0.80243552, "learning_rate": 3.618171329605121e-06, "loss": 0.88056207, "num_input_tokens_seen": 80228685, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.19641113, "step": 3725, "time_per_iteration": 2.607537269592285 }, { "auxiliary_loss_clip": 0.06531116, "auxiliary_loss_mlp": 0.01277232, "balance_loss_clip": 0.06294843, "balance_loss_mlp": 0.01258408, "epoch": 0.22401923944085375, "flos": 22243423910400.0, "grad_norm": 1.7383454708395245, "language_loss": 0.78265846, "learning_rate": 3.6179424159084254e-06, "loss": 0.86074191, "num_input_tokens_seen": 80247635, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.18823242, "step": 3726, "time_per_iteration": 3.9392096996307373 }, { "auxiliary_loss_clip": 0.06541507, "auxiliary_loss_mlp": 0.0128125, "balance_loss_clip": 0.06292404, "balance_loss_mlp": 0.01258386, "epoch": 0.22407936269352172, "flos": 12057920023680.0, "grad_norm": 3.073456482927531, "language_loss": 0.73663437, "learning_rate": 3.6177134408591914e-06, "loss": 0.81486189, "num_input_tokens_seen": 80260045, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.2286377, "step": 3727, "time_per_iteration": 2.5240318775177 }, { "auxiliary_loss_clip": 0.06537515, "auxiliary_loss_mlp": 0.01283164, "balance_loss_clip": 0.06294662, "balance_loss_mlp": 0.012621, "epoch": 0.22413948594618968, "flos": 19359482883840.0, "grad_norm": 1.9862596697078743, "language_loss": 0.87579334, "learning_rate": 3.6174844044661013e-06, "loss": 0.95400012, "num_input_tokens_seen": 80277680, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.21081543, "step": 3728, "time_per_iteration": 2.5489134788513184 }, { "auxiliary_loss_clip": 0.06531346, "auxiliary_loss_mlp": 0.01274952, "balance_loss_clip": 0.06293766, "balance_loss_mlp": 0.01254496, "epoch": 0.22419960919885765, "flos": 24176789493120.0, "grad_norm": 2.3986992177512434, "language_loss": 0.81271064, "learning_rate": 3.6172553067378406e-06, "loss": 0.89077365, "num_input_tokens_seen": 80294795, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.20446777, "step": 3729, "time_per_iteration": 2.5719735622406006 }, { "auxiliary_loss_clip": 0.06522711, "auxiliary_loss_mlp": 0.01273512, "balance_loss_clip": 0.062883, "balance_loss_mlp": 0.0125476, "epoch": 0.22425973245152564, "flos": 27386007269760.0, "grad_norm": 1.5537133560431695, "language_loss": 0.87737548, "learning_rate": 3.6170261476830964e-06, "loss": 0.95533764, "num_input_tokens_seen": 80315425, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.18762207, "step": 3730, "time_per_iteration": 2.6139333248138428 }, { "auxiliary_loss_clip": 0.06522769, "auxiliary_loss_mlp": 0.0127681, "balance_loss_clip": 0.06288178, "balance_loss_mlp": 0.01257951, "epoch": 0.2243198557041936, "flos": 13740794225280.0, "grad_norm": 1.687463899083719, "language_loss": 0.73902297, "learning_rate": 3.616796927310559e-06, "loss": 0.81701875, "num_input_tokens_seen": 80333905, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.1887207, "step": 3731, "time_per_iteration": 3.979857921600342 }, { "auxiliary_loss_clip": 0.06534225, "auxiliary_loss_mlp": 0.01278197, "balance_loss_clip": 0.06291226, "balance_loss_mlp": 0.01257407, "epoch": 0.22437997895686157, "flos": 19535775874560.0, "grad_norm": 2.392639233669914, "language_loss": 0.76232851, "learning_rate": 3.6165676456289195e-06, "loss": 0.84045273, "num_input_tokens_seen": 80352165, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.20788574, "step": 3732, "time_per_iteration": 2.598296880722046 }, { "auxiliary_loss_clip": 0.06528229, "auxiliary_loss_mlp": 0.0127886, "balance_loss_clip": 0.06288704, "balance_loss_mlp": 0.01258403, "epoch": 0.22444010220952954, "flos": 23703032108160.0, "grad_norm": 1.7283923422971057, "language_loss": 0.89190066, "learning_rate": 3.616338302646873e-06, "loss": 0.96997154, "num_input_tokens_seen": 80371305, "router_z_loss_clip": 2.39453125, "router_z_loss_mlp": 0.20446777, "step": 3733, "time_per_iteration": 2.598820924758911 }, { "auxiliary_loss_clip": 0.06525433, "auxiliary_loss_mlp": 0.01274776, "balance_loss_clip": 0.0628843, "balance_loss_mlp": 0.01253509, "epoch": 0.2245002254621975, "flos": 22389514704000.0, "grad_norm": 1.5381573237693391, "language_loss": 0.85139805, "learning_rate": 3.6161088983731166e-06, "loss": 0.92940015, "num_input_tokens_seen": 80391020, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.21276855, "step": 3734, "time_per_iteration": 2.5904765129089355 }, { "auxiliary_loss_clip": 0.06530102, "auxiliary_loss_mlp": 0.01279566, "balance_loss_clip": 0.06292038, "balance_loss_mlp": 0.01260731, "epoch": 0.22456034871486547, "flos": 26949453897600.0, "grad_norm": 1.5915542257486994, "language_loss": 0.77241397, "learning_rate": 3.6158794328163482e-06, "loss": 0.8505106, "num_input_tokens_seen": 80411365, "router_z_loss_clip": 2.37890625, "router_z_loss_mlp": 0.18847656, "step": 3735, "time_per_iteration": 2.6167635917663574 }, { "auxiliary_loss_clip": 0.06520874, "auxiliary_loss_mlp": 0.01272065, "balance_loss_clip": 0.06289445, "balance_loss_mlp": 0.01254029, "epoch": 0.22462047196753343, "flos": 28990700012160.0, "grad_norm": 3.515272274314363, "language_loss": 0.85035694, "learning_rate": 3.6156499059852702e-06, "loss": 0.92828631, "num_input_tokens_seen": 80431075, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.18041992, "step": 3736, "time_per_iteration": 4.032623529434204 }, { "auxiliary_loss_clip": 0.06530265, "auxiliary_loss_mlp": 0.01275608, "balance_loss_clip": 0.06291153, "balance_loss_mlp": 0.01254091, "epoch": 0.22468059522020142, "flos": 20017541324160.0, "grad_norm": 1.6442188722258368, "language_loss": 0.87436152, "learning_rate": 3.615420317888586e-06, "loss": 0.95242023, "num_input_tokens_seen": 80449240, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.21520996, "step": 3737, "time_per_iteration": 2.5761473178863525 }, { "auxiliary_loss_clip": 0.06532227, "auxiliary_loss_mlp": 0.01279426, "balance_loss_clip": 0.06288351, "balance_loss_mlp": 0.01257682, "epoch": 0.2247407184728694, "flos": 29321846547840.0, "grad_norm": 1.9044094285740905, "language_loss": 0.80068123, "learning_rate": 3.6151906685350006e-06, "loss": 0.87879777, "num_input_tokens_seen": 80467900, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.21740723, "step": 3738, "time_per_iteration": 4.07808256149292 }, { "auxiliary_loss_clip": 0.06531358, "auxiliary_loss_mlp": 0.01275612, "balance_loss_clip": 0.06291287, "balance_loss_mlp": 0.01256491, "epoch": 0.22480084172553735, "flos": 22317035322240.0, "grad_norm": 1.7611907242633758, "language_loss": 0.76939297, "learning_rate": 3.614960957933224e-06, "loss": 0.84746265, "num_input_tokens_seen": 80487100, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.19116211, "step": 3739, "time_per_iteration": 2.5568597316741943 }, { "auxiliary_loss_clip": 0.06534138, "auxiliary_loss_mlp": 0.01279023, "balance_loss_clip": 0.06291898, "balance_loss_mlp": 0.01258126, "epoch": 0.22486096497820532, "flos": 25598019720960.0, "grad_norm": 2.161374839475605, "language_loss": 0.75381452, "learning_rate": 3.6147311860919655e-06, "loss": 0.83194613, "num_input_tokens_seen": 80508625, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.20898438, "step": 3740, "time_per_iteration": 2.5902810096740723 }, { "auxiliary_loss_clip": 0.06526504, "auxiliary_loss_mlp": 0.01279295, "balance_loss_clip": 0.06290089, "balance_loss_mlp": 0.01259769, "epoch": 0.22492108823087328, "flos": 17645651798400.0, "grad_norm": 2.0929186004056395, "language_loss": 0.76267755, "learning_rate": 3.614501353019939e-06, "loss": 0.8407355, "num_input_tokens_seen": 80527345, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.1953125, "step": 3741, "time_per_iteration": 2.5498335361480713 }, { "auxiliary_loss_clip": 0.06529994, "auxiliary_loss_mlp": 0.01283665, "balance_loss_clip": 0.0629479, "balance_loss_mlp": 0.01264055, "epoch": 0.22498121148354125, "flos": 16040246296320.0, "grad_norm": 2.2645524574015927, "language_loss": 0.8807379, "learning_rate": 3.6142714587258592e-06, "loss": 0.95887452, "num_input_tokens_seen": 80545545, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.19604492, "step": 3742, "time_per_iteration": 2.5377237796783447 }, { "auxiliary_loss_clip": 0.06530369, "auxiliary_loss_mlp": 0.01281277, "balance_loss_clip": 0.06296778, "balance_loss_mlp": 0.01261381, "epoch": 0.22504133473620924, "flos": 24030489064320.0, "grad_norm": 1.6395913015529762, "language_loss": 0.81902534, "learning_rate": 3.614041503218444e-06, "loss": 0.89714181, "num_input_tokens_seen": 80565040, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.19897461, "step": 3743, "time_per_iteration": 2.597886323928833 }, { "auxiliary_loss_clip": 0.0653466, "auxiliary_loss_mlp": 0.01275627, "balance_loss_clip": 0.06295548, "balance_loss_mlp": 0.01256244, "epoch": 0.2251014579888772, "flos": 16769610161280.0, "grad_norm": 1.9657373102812183, "language_loss": 0.64196658, "learning_rate": 3.6138114865064134e-06, "loss": 0.72006947, "num_input_tokens_seen": 80582815, "router_z_loss_clip": 2.38671875, "router_z_loss_mlp": 0.19384766, "step": 3744, "time_per_iteration": 2.5292813777923584 }, { "auxiliary_loss_clip": 0.06529167, "auxiliary_loss_mlp": 0.01276178, "balance_loss_clip": 0.06292233, "balance_loss_mlp": 0.01256461, "epoch": 0.22516158124154517, "flos": 13996191070080.0, "grad_norm": 2.727019859535196, "language_loss": 0.78048056, "learning_rate": 3.613581408598489e-06, "loss": 0.85853404, "num_input_tokens_seen": 80600865, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.19702148, "step": 3745, "time_per_iteration": 2.5512263774871826 }, { "auxiliary_loss_clip": 0.06529169, "auxiliary_loss_mlp": 0.01279435, "balance_loss_clip": 0.06292747, "balance_loss_mlp": 0.01259873, "epoch": 0.22522170449421314, "flos": 14394869596800.0, "grad_norm": 1.7790877128006193, "language_loss": 0.8104707, "learning_rate": 3.6133512695033965e-06, "loss": 0.88855672, "num_input_tokens_seen": 80617455, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.19555664, "step": 3746, "time_per_iteration": 2.5401556491851807 }, { "auxiliary_loss_clip": 0.06533215, "auxiliary_loss_mlp": 0.01279119, "balance_loss_clip": 0.06291902, "balance_loss_mlp": 0.01259246, "epoch": 0.2252818277468811, "flos": 23812338159360.0, "grad_norm": 3.3230522622107697, "language_loss": 0.86847925, "learning_rate": 3.613121069229862e-06, "loss": 0.94660258, "num_input_tokens_seen": 80635125, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.19885254, "step": 3747, "time_per_iteration": 2.5835578441619873 }, { "auxiliary_loss_clip": 0.06531209, "auxiliary_loss_mlp": 0.01274116, "balance_loss_clip": 0.06292082, "balance_loss_mlp": 0.01255018, "epoch": 0.22534195099954907, "flos": 24725038757760.0, "grad_norm": 1.6788909815107218, "language_loss": 0.77293825, "learning_rate": 3.6128908077866145e-06, "loss": 0.85099149, "num_input_tokens_seen": 80656370, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.19104004, "step": 3748, "time_per_iteration": 2.611402750015259 }, { "auxiliary_loss_clip": 0.06531426, "auxiliary_loss_mlp": 0.01277307, "balance_loss_clip": 0.06292202, "balance_loss_mlp": 0.01257518, "epoch": 0.22540207425221703, "flos": 21038625578880.0, "grad_norm": 1.8794943125425567, "language_loss": 0.80478311, "learning_rate": 3.6126604851823864e-06, "loss": 0.88287044, "num_input_tokens_seen": 80676495, "router_z_loss_clip": 2.39453125, "router_z_loss_mlp": 0.19775391, "step": 3749, "time_per_iteration": 2.5742321014404297 }, { "auxiliary_loss_clip": 0.06529623, "auxiliary_loss_mlp": 0.01274518, "balance_loss_clip": 0.06295844, "balance_loss_mlp": 0.01255194, "epoch": 0.22546219750488503, "flos": 19396351480320.0, "grad_norm": 1.638613671880983, "language_loss": 0.79896867, "learning_rate": 3.6124301014259108e-06, "loss": 0.87701011, "num_input_tokens_seen": 80694755, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.1932373, "step": 3750, "time_per_iteration": 2.541565179824829 }, { "auxiliary_loss_clip": 0.06534652, "auxiliary_loss_mlp": 0.01276278, "balance_loss_clip": 0.0629303, "balance_loss_mlp": 0.01255774, "epoch": 0.225522320757553, "flos": 25199760464640.0, "grad_norm": 1.8414773671067863, "language_loss": 0.82650268, "learning_rate": 3.6121996565259244e-06, "loss": 0.90461195, "num_input_tokens_seen": 80713670, "router_z_loss_clip": 2.41796875, "router_z_loss_mlp": 0.20495605, "step": 3751, "time_per_iteration": 2.595451831817627 }, { "auxiliary_loss_clip": 0.065275, "auxiliary_loss_mlp": 0.01277018, "balance_loss_clip": 0.06290774, "balance_loss_mlp": 0.01257206, "epoch": 0.22558244401022096, "flos": 17168456396160.0, "grad_norm": 1.815743538895464, "language_loss": 0.83927405, "learning_rate": 3.611969150491165e-06, "loss": 0.91731918, "num_input_tokens_seen": 80731450, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.19812012, "step": 3752, "time_per_iteration": 2.532973527908325 }, { "auxiliary_loss_clip": 0.06524217, "auxiliary_loss_mlp": 0.01273975, "balance_loss_clip": 0.06290136, "balance_loss_mlp": 0.0125458, "epoch": 0.22564256726288892, "flos": 15236306697600.0, "grad_norm": 1.818313860960103, "language_loss": 0.78966117, "learning_rate": 3.611738583330375e-06, "loss": 0.86764312, "num_input_tokens_seen": 80748415, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.19396973, "step": 3753, "time_per_iteration": 2.527987241744995 }, { "auxiliary_loss_clip": 0.06527199, "auxiliary_loss_mlp": 0.01278041, "balance_loss_clip": 0.06292504, "balance_loss_mlp": 0.01256917, "epoch": 0.2257026905155569, "flos": 34577215902720.0, "grad_norm": 2.1535993483020803, "language_loss": 0.79294503, "learning_rate": 3.611507955052295e-06, "loss": 0.87099743, "num_input_tokens_seen": 80770835, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.21118164, "step": 3754, "time_per_iteration": 2.701690912246704 }, { "auxiliary_loss_clip": 0.06529919, "auxiliary_loss_mlp": 0.01278313, "balance_loss_clip": 0.06296295, "balance_loss_mlp": 0.01258322, "epoch": 0.22576281376822485, "flos": 19944642672000.0, "grad_norm": 2.027879475822774, "language_loss": 0.70808536, "learning_rate": 3.6112772656656727e-06, "loss": 0.78616762, "num_input_tokens_seen": 80787840, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.19970703, "step": 3755, "time_per_iteration": 2.558521270751953 }, { "auxiliary_loss_clip": 0.06535366, "auxiliary_loss_mlp": 0.01275309, "balance_loss_clip": 0.06292143, "balance_loss_mlp": 0.01254721, "epoch": 0.22582293702089282, "flos": 24607892350080.0, "grad_norm": 4.197579076112134, "language_loss": 0.77942783, "learning_rate": 3.6110465151792547e-06, "loss": 0.85753465, "num_input_tokens_seen": 80806335, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.20581055, "step": 3756, "time_per_iteration": 2.575338125228882 }, { "auxiliary_loss_clip": 0.06535171, "auxiliary_loss_mlp": 0.01275291, "balance_loss_clip": 0.06295308, "balance_loss_mlp": 0.01254179, "epoch": 0.2258830602735608, "flos": 23041451796480.0, "grad_norm": 2.013511999488849, "language_loss": 0.82543629, "learning_rate": 3.6108157036017916e-06, "loss": 0.90354091, "num_input_tokens_seen": 80825355, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.21118164, "step": 3757, "time_per_iteration": 2.573024034500122 }, { "auxiliary_loss_clip": 0.06538092, "auxiliary_loss_mlp": 0.01276751, "balance_loss_clip": 0.06298701, "balance_loss_mlp": 0.01256152, "epoch": 0.22594318352622877, "flos": 22164068494080.0, "grad_norm": 1.7901494828182676, "language_loss": 0.74166787, "learning_rate": 3.6105848309420358e-06, "loss": 0.81981629, "num_input_tokens_seen": 80842570, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.20605469, "step": 3758, "time_per_iteration": 2.5589070320129395 }, { "auxiliary_loss_clip": 0.06532995, "auxiliary_loss_mlp": 0.01277913, "balance_loss_clip": 0.06292934, "balance_loss_mlp": 0.0125748, "epoch": 0.22600330677889674, "flos": 20600478979200.0, "grad_norm": 2.1497131314498614, "language_loss": 0.776151, "learning_rate": 3.6103538972087412e-06, "loss": 0.85426003, "num_input_tokens_seen": 80858745, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.20422363, "step": 3759, "time_per_iteration": 2.542797803878784 }, { "auxiliary_loss_clip": 0.06541681, "auxiliary_loss_mlp": 0.01280105, "balance_loss_clip": 0.06300519, "balance_loss_mlp": 0.01259243, "epoch": 0.2260634300315647, "flos": 35667970427520.0, "grad_norm": 1.6932957799653927, "language_loss": 0.79257488, "learning_rate": 3.6101229024106655e-06, "loss": 0.87079275, "num_input_tokens_seen": 80880085, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.20861816, "step": 3760, "time_per_iteration": 2.6733126640319824 }, { "auxiliary_loss_clip": 0.06439345, "auxiliary_loss_mlp": 0.01267043, "balance_loss_clip": 0.06296866, "balance_loss_mlp": 0.01258674, "epoch": 0.22612355328423267, "flos": 72107707685760.0, "grad_norm": 0.899088596722646, "language_loss": 0.59941852, "learning_rate": 3.609891846556569e-06, "loss": 0.67648238, "num_input_tokens_seen": 80937660, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.08374023, "step": 3761, "time_per_iteration": 3.1206753253936768 }, { "auxiliary_loss_clip": 0.06542079, "auxiliary_loss_mlp": 0.01280817, "balance_loss_clip": 0.06297895, "balance_loss_mlp": 0.01259944, "epoch": 0.22618367653690064, "flos": 22790373436800.0, "grad_norm": 2.3391164111635043, "language_loss": 0.781986, "learning_rate": 3.609660729655211e-06, "loss": 0.86021495, "num_input_tokens_seen": 80956265, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.20874023, "step": 3762, "time_per_iteration": 2.617570638656616 }, { "auxiliary_loss_clip": 0.06533746, "auxiliary_loss_mlp": 0.01277635, "balance_loss_clip": 0.06291777, "balance_loss_mlp": 0.0125688, "epoch": 0.22624379978956863, "flos": 20454388185600.0, "grad_norm": 2.0142936987877498, "language_loss": 0.79734826, "learning_rate": 3.6094295517153573e-06, "loss": 0.87546206, "num_input_tokens_seen": 80975185, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.20751953, "step": 3763, "time_per_iteration": 2.5455188751220703 }, { "auxiliary_loss_clip": 0.06541503, "auxiliary_loss_mlp": 0.01280146, "balance_loss_clip": 0.06296438, "balance_loss_mlp": 0.01259022, "epoch": 0.2263039230422366, "flos": 17500189910400.0, "grad_norm": 2.3117138385659093, "language_loss": 0.91965735, "learning_rate": 3.6091983127457743e-06, "loss": 0.9978739, "num_input_tokens_seen": 80992830, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.21118164, "step": 3764, "time_per_iteration": 2.5601210594177246 }, { "auxiliary_loss_clip": 0.06530042, "auxiliary_loss_mlp": 0.01276287, "balance_loss_clip": 0.06294779, "balance_loss_mlp": 0.01256569, "epoch": 0.22636404629490456, "flos": 28337295473280.0, "grad_norm": 1.7742885826046062, "language_loss": 0.75858116, "learning_rate": 3.6089670127552293e-06, "loss": 0.83664453, "num_input_tokens_seen": 81013675, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.19714355, "step": 3765, "time_per_iteration": 2.7292866706848145 }, { "auxiliary_loss_clip": 0.06526098, "auxiliary_loss_mlp": 0.01279054, "balance_loss_clip": 0.06291331, "balance_loss_mlp": 0.01258407, "epoch": 0.22642416954757252, "flos": 17494152416640.0, "grad_norm": 1.9841801976129219, "language_loss": 0.90915847, "learning_rate": 3.608735651752494e-06, "loss": 0.98720998, "num_input_tokens_seen": 81030345, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.20654297, "step": 3766, "time_per_iteration": 4.077770948410034 }, { "auxiliary_loss_clip": 0.0652349, "auxiliary_loss_mlp": 0.0127547, "balance_loss_clip": 0.06291842, "balance_loss_mlp": 0.01254954, "epoch": 0.2264842928002405, "flos": 24390621912960.0, "grad_norm": 1.776990527270744, "language_loss": 0.75204509, "learning_rate": 3.6085042297463417e-06, "loss": 0.83003473, "num_input_tokens_seen": 81051000, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.2052002, "step": 3767, "time_per_iteration": 2.593554973602295 }, { "auxiliary_loss_clip": 0.06534479, "auxiliary_loss_mlp": 0.01277859, "balance_loss_clip": 0.06295086, "balance_loss_mlp": 0.0125658, "epoch": 0.22654441605290845, "flos": 19836971775360.0, "grad_norm": 1.4494827353835886, "language_loss": 0.72625971, "learning_rate": 3.6082727467455477e-06, "loss": 0.80438304, "num_input_tokens_seen": 81071205, "router_z_loss_clip": 2.39257812, "router_z_loss_mlp": 0.21289062, "step": 3768, "time_per_iteration": 2.615219831466675 }, { "auxiliary_loss_clip": 0.06534849, "auxiliary_loss_mlp": 0.01282577, "balance_loss_clip": 0.06297785, "balance_loss_mlp": 0.0126156, "epoch": 0.22660453930557642, "flos": 27462050449920.0, "grad_norm": 1.592090401726349, "language_loss": 0.78839165, "learning_rate": 3.6080412027588905e-06, "loss": 0.86656594, "num_input_tokens_seen": 81091880, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.21008301, "step": 3769, "time_per_iteration": 2.8460195064544678 }, { "auxiliary_loss_clip": 0.0653655, "auxiliary_loss_mlp": 0.01278807, "balance_loss_clip": 0.06292928, "balance_loss_mlp": 0.01258446, "epoch": 0.2266646625582444, "flos": 23995004060160.0, "grad_norm": 1.8250102009454396, "language_loss": 0.69465399, "learning_rate": 3.6078095977951488e-06, "loss": 0.77280754, "num_input_tokens_seen": 81113290, "router_z_loss_clip": 2.43359375, "router_z_loss_mlp": 0.20373535, "step": 3770, "time_per_iteration": 4.055614233016968 }, { "auxiliary_loss_clip": 0.06531711, "auxiliary_loss_mlp": 0.01278168, "balance_loss_clip": 0.06292006, "balance_loss_mlp": 0.01258642, "epoch": 0.22672478581091238, "flos": 26034698874240.0, "grad_norm": 8.091119704891609, "language_loss": 0.8089307, "learning_rate": 3.6075779318631067e-06, "loss": 0.88702953, "num_input_tokens_seen": 81133535, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.19506836, "step": 3771, "time_per_iteration": 2.6001079082489014 }, { "auxiliary_loss_clip": 0.06527214, "auxiliary_loss_mlp": 0.01282819, "balance_loss_clip": 0.06293517, "balance_loss_mlp": 0.01261874, "epoch": 0.22678490906358034, "flos": 23848577850240.0, "grad_norm": 1.5048707941588466, "language_loss": 0.79324251, "learning_rate": 3.6073462049715486e-06, "loss": 0.8713429, "num_input_tokens_seen": 81154650, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.20935059, "step": 3772, "time_per_iteration": 2.615551471710205 }, { "auxiliary_loss_clip": 0.06425929, "auxiliary_loss_mlp": 0.01270159, "balance_loss_clip": 0.06288229, "balance_loss_mlp": 0.0126374, "epoch": 0.2268450323162483, "flos": 65070163912320.0, "grad_norm": 0.6302486317144508, "language_loss": 0.54040629, "learning_rate": 3.607114417129261e-06, "loss": 0.61736715, "num_input_tokens_seen": 81221240, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.06427002, "step": 3773, "time_per_iteration": 3.273441791534424 }, { "auxiliary_loss_clip": 0.06527832, "auxiliary_loss_mlp": 0.01282704, "balance_loss_clip": 0.06292132, "balance_loss_mlp": 0.01263165, "epoch": 0.22690515556891627, "flos": 22532251334400.0, "grad_norm": 1.685726508738048, "language_loss": 0.70764768, "learning_rate": 3.6068825683450334e-06, "loss": 0.78575301, "num_input_tokens_seen": 81241520, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.19555664, "step": 3774, "time_per_iteration": 2.587339162826538 }, { "auxiliary_loss_clip": 0.06527129, "auxiliary_loss_mlp": 0.01275726, "balance_loss_clip": 0.06290106, "balance_loss_mlp": 0.01256057, "epoch": 0.22696527882158424, "flos": 18229344140160.0, "grad_norm": 2.333549458691471, "language_loss": 0.75228983, "learning_rate": 3.606650658627658e-06, "loss": 0.83031833, "num_input_tokens_seen": 81256825, "router_z_loss_clip": 2.37109375, "router_z_loss_mlp": 0.19677734, "step": 3775, "time_per_iteration": 3.9397730827331543 }, { "auxiliary_loss_clip": 0.06529707, "auxiliary_loss_mlp": 0.012786, "balance_loss_clip": 0.06292436, "balance_loss_mlp": 0.01259931, "epoch": 0.22702540207425223, "flos": 17024923152000.0, "grad_norm": 2.065673945393786, "language_loss": 0.83076286, "learning_rate": 3.606418687985928e-06, "loss": 0.9088459, "num_input_tokens_seen": 81275695, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.18664551, "step": 3776, "time_per_iteration": 2.5717642307281494 }, { "auxiliary_loss_clip": 0.06529766, "auxiliary_loss_mlp": 0.01277644, "balance_loss_clip": 0.06290306, "balance_loss_mlp": 0.01259059, "epoch": 0.2270855253269202, "flos": 21332316539520.0, "grad_norm": 1.7787654693985142, "language_loss": 0.83052409, "learning_rate": 3.606186656428641e-06, "loss": 0.90859824, "num_input_tokens_seen": 81294920, "router_z_loss_clip": 2.39453125, "router_z_loss_mlp": 0.18603516, "step": 3777, "time_per_iteration": 2.5868208408355713 }, { "auxiliary_loss_clip": 0.06526847, "auxiliary_loss_mlp": 0.01280232, "balance_loss_clip": 0.06289411, "balance_loss_mlp": 0.01260169, "epoch": 0.22714564857958816, "flos": 23557276730880.0, "grad_norm": 1.993733967570128, "language_loss": 0.74102569, "learning_rate": 3.6059545639645955e-06, "loss": 0.81909645, "num_input_tokens_seen": 81314275, "router_z_loss_clip": 2.37109375, "router_z_loss_mlp": 0.20043945, "step": 3778, "time_per_iteration": 4.056695461273193 }, { "auxiliary_loss_clip": 0.06527632, "auxiliary_loss_mlp": 0.0127378, "balance_loss_clip": 0.06287906, "balance_loss_mlp": 0.01254563, "epoch": 0.22720577183225613, "flos": 25996237050240.0, "grad_norm": 11.564726352580827, "language_loss": 0.65307009, "learning_rate": 3.605722410602591e-06, "loss": 0.73108423, "num_input_tokens_seen": 81333890, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.19226074, "step": 3779, "time_per_iteration": 2.5983901023864746 }, { "auxiliary_loss_clip": 0.06521505, "auxiliary_loss_mlp": 0.01274862, "balance_loss_clip": 0.06287913, "balance_loss_mlp": 0.01256313, "epoch": 0.2272658950849241, "flos": 20820432746880.0, "grad_norm": 1.6275026433801032, "language_loss": 0.71441799, "learning_rate": 3.6054901963514323e-06, "loss": 0.7923817, "num_input_tokens_seen": 81353640, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.18554688, "step": 3780, "time_per_iteration": 2.579123020172119 }, { "auxiliary_loss_clip": 0.06525001, "auxiliary_loss_mlp": 0.0127886, "balance_loss_clip": 0.06288664, "balance_loss_mlp": 0.01257581, "epoch": 0.22732601833759206, "flos": 23915187446400.0, "grad_norm": 1.789299768153445, "language_loss": 0.90118444, "learning_rate": 3.6052579212199246e-06, "loss": 0.97922301, "num_input_tokens_seen": 81371595, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.21289062, "step": 3781, "time_per_iteration": 2.5740649700164795 }, { "auxiliary_loss_clip": 0.06525634, "auxiliary_loss_mlp": 0.01276092, "balance_loss_clip": 0.06286357, "balance_loss_mlp": 0.0125529, "epoch": 0.22738614159026002, "flos": 15929850142080.0, "grad_norm": 2.8558358223581592, "language_loss": 0.75163722, "learning_rate": 3.6050255852168753e-06, "loss": 0.82965451, "num_input_tokens_seen": 81388435, "router_z_loss_clip": 2.39453125, "router_z_loss_mlp": 0.20812988, "step": 3782, "time_per_iteration": 2.5394527912139893 }, { "auxiliary_loss_clip": 0.06521909, "auxiliary_loss_mlp": 0.01277075, "balance_loss_clip": 0.06288761, "balance_loss_mlp": 0.01257846, "epoch": 0.22744626484292801, "flos": 24212148716160.0, "grad_norm": 1.409053056435902, "language_loss": 0.83029264, "learning_rate": 3.604793188351095e-06, "loss": 0.9082824, "num_input_tokens_seen": 81410195, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.19226074, "step": 3783, "time_per_iteration": 2.593855619430542 }, { "auxiliary_loss_clip": 0.06526037, "auxiliary_loss_mlp": 0.01280662, "balance_loss_clip": 0.06290337, "balance_loss_mlp": 0.01260658, "epoch": 0.22750638809559598, "flos": 24798734023680.0, "grad_norm": 2.4305222210830064, "language_loss": 0.76451498, "learning_rate": 3.6045607306313964e-06, "loss": 0.84258193, "num_input_tokens_seen": 81430060, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.20019531, "step": 3784, "time_per_iteration": 2.599184513092041 }, { "auxiliary_loss_clip": 0.06521417, "auxiliary_loss_mlp": 0.01274108, "balance_loss_clip": 0.06285556, "balance_loss_mlp": 0.01254772, "epoch": 0.22756651134826394, "flos": 22243004640000.0, "grad_norm": 1.4883490591901734, "language_loss": 0.71603346, "learning_rate": 3.604328212066594e-06, "loss": 0.79398865, "num_input_tokens_seen": 81447375, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.1932373, "step": 3785, "time_per_iteration": 2.5701253414154053 }, { "auxiliary_loss_clip": 0.06416821, "auxiliary_loss_mlp": 0.01261014, "balance_loss_clip": 0.06281832, "balance_loss_mlp": 0.01255247, "epoch": 0.2276266346009319, "flos": 62728225021440.0, "grad_norm": 1.4040283778972782, "language_loss": 0.62280482, "learning_rate": 3.6040956326655047e-06, "loss": 0.69958323, "num_input_tokens_seen": 81505235, "router_z_loss_clip": 1.34472656, "router_z_loss_mlp": 0.05758667, "step": 3786, "time_per_iteration": 3.1552674770355225 }, { "auxiliary_loss_clip": 0.06525306, "auxiliary_loss_mlp": 0.01276388, "balance_loss_clip": 0.06287138, "balance_loss_mlp": 0.01255467, "epoch": 0.22768675785359987, "flos": 18618085958400.0, "grad_norm": 6.294249611103932, "language_loss": 0.8724103, "learning_rate": 3.6038629924369486e-06, "loss": 0.95042729, "num_input_tokens_seen": 81518685, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.20922852, "step": 3787, "time_per_iteration": 2.52815842628479 }, { "auxiliary_loss_clip": 0.06524618, "auxiliary_loss_mlp": 0.01275889, "balance_loss_clip": 0.0629126, "balance_loss_mlp": 0.01257531, "epoch": 0.22774688110626784, "flos": 26877477640320.0, "grad_norm": 1.3070834659834072, "language_loss": 0.73069113, "learning_rate": 3.6036302913897474e-06, "loss": 0.80869621, "num_input_tokens_seen": 81538940, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.18347168, "step": 3788, "time_per_iteration": 2.5972788333892822 }, { "auxiliary_loss_clip": 0.06518674, "auxiliary_loss_mlp": 0.01277541, "balance_loss_clip": 0.06286241, "balance_loss_mlp": 0.0125836, "epoch": 0.2278070043589358, "flos": 15557977722240.0, "grad_norm": 10.316618922055067, "language_loss": 0.68353838, "learning_rate": 3.6033975295327243e-06, "loss": 0.76150048, "num_input_tokens_seen": 81555525, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.19189453, "step": 3789, "time_per_iteration": 2.5124011039733887 }, { "auxiliary_loss_clip": 0.06527053, "auxiliary_loss_mlp": 0.01281108, "balance_loss_clip": 0.06292877, "balance_loss_mlp": 0.01260461, "epoch": 0.2278671276116038, "flos": 22422987210240.0, "grad_norm": 1.9059458719631943, "language_loss": 0.76620317, "learning_rate": 3.6031647068747065e-06, "loss": 0.84428477, "num_input_tokens_seen": 81576305, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.20654297, "step": 3790, "time_per_iteration": 2.6024551391601562 }, { "auxiliary_loss_clip": 0.0651845, "auxiliary_loss_mlp": 0.01280442, "balance_loss_clip": 0.06286159, "balance_loss_mlp": 0.0125952, "epoch": 0.22792725086427176, "flos": 20637641064960.0, "grad_norm": 1.8919778449572073, "language_loss": 0.91769916, "learning_rate": 3.602931823424522e-06, "loss": 0.99568802, "num_input_tokens_seen": 81594115, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.20922852, "step": 3791, "time_per_iteration": 2.5685653686523438 }, { "auxiliary_loss_clip": 0.06527352, "auxiliary_loss_mlp": 0.0127227, "balance_loss_clip": 0.06288723, "balance_loss_mlp": 0.01253209, "epoch": 0.22798737411693973, "flos": 31436662147200.0, "grad_norm": 1.9470460016071895, "language_loss": 0.82930744, "learning_rate": 3.6026988791910026e-06, "loss": 0.90730369, "num_input_tokens_seen": 81615355, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.19067383, "step": 3792, "time_per_iteration": 2.640916347503662 }, { "auxiliary_loss_clip": 0.06415825, "auxiliary_loss_mlp": 0.01263673, "balance_loss_clip": 0.06281342, "balance_loss_mlp": 0.01258467, "epoch": 0.2280474973696077, "flos": 52412074220160.0, "grad_norm": 1.0843750464235886, "language_loss": 0.65663344, "learning_rate": 3.602465874182981e-06, "loss": 0.73342836, "num_input_tokens_seen": 81662075, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.05209351, "step": 3793, "time_per_iteration": 2.9385154247283936 }, { "auxiliary_loss_clip": 0.06531189, "auxiliary_loss_mlp": 0.01281402, "balance_loss_clip": 0.06288052, "balance_loss_mlp": 0.01259467, "epoch": 0.22810762062227566, "flos": 26403300984960.0, "grad_norm": 1.890150419694487, "language_loss": 0.7861681, "learning_rate": 3.602232808409293e-06, "loss": 0.86429405, "num_input_tokens_seen": 81681625, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.21948242, "step": 3794, "time_per_iteration": 2.6181209087371826 }, { "auxiliary_loss_clip": 0.06526062, "auxiliary_loss_mlp": 0.01278056, "balance_loss_clip": 0.06290486, "balance_loss_mlp": 0.01258243, "epoch": 0.22816774387494362, "flos": 25637445866880.0, "grad_norm": 1.7705817746614883, "language_loss": 0.81473148, "learning_rate": 3.6019996818787755e-06, "loss": 0.89277267, "num_input_tokens_seen": 81701170, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.19824219, "step": 3795, "time_per_iteration": 2.614090919494629 }, { "auxiliary_loss_clip": 0.06519388, "auxiliary_loss_mlp": 0.01287375, "balance_loss_clip": 0.0628921, "balance_loss_mlp": 0.01268874, "epoch": 0.22822786712761162, "flos": 22457507892480.0, "grad_norm": 2.2407716680674725, "language_loss": 0.77498233, "learning_rate": 3.6017664946002704e-06, "loss": 0.85304999, "num_input_tokens_seen": 81721265, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.18481445, "step": 3796, "time_per_iteration": 2.5990498065948486 }, { "auxiliary_loss_clip": 0.0652537, "auxiliary_loss_mlp": 0.01273754, "balance_loss_clip": 0.0628965, "balance_loss_mlp": 0.01255551, "epoch": 0.22828799038027958, "flos": 12207323053440.0, "grad_norm": 2.4674917383997363, "language_loss": 0.9593147, "learning_rate": 3.6015332465826188e-06, "loss": 1.03730595, "num_input_tokens_seen": 81736565, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.18188477, "step": 3797, "time_per_iteration": 2.5348947048187256 }, { "auxiliary_loss_clip": 0.06522216, "auxiliary_loss_mlp": 0.01279869, "balance_loss_clip": 0.06289041, "balance_loss_mlp": 0.01260604, "epoch": 0.22834811363294755, "flos": 22091379477120.0, "grad_norm": 1.6166567121015487, "language_loss": 0.82276249, "learning_rate": 3.601299937834666e-06, "loss": 0.9007833, "num_input_tokens_seen": 81756240, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.19262695, "step": 3798, "time_per_iteration": 2.5824451446533203 }, { "auxiliary_loss_clip": 0.06525698, "auxiliary_loss_mlp": 0.01277197, "balance_loss_clip": 0.06288112, "balance_loss_mlp": 0.01257242, "epoch": 0.2284082368856155, "flos": 24867104555520.0, "grad_norm": 2.519591031978874, "language_loss": 0.79610425, "learning_rate": 3.6010665683652596e-06, "loss": 0.87413323, "num_input_tokens_seen": 81775720, "router_z_loss_clip": 2.37890625, "router_z_loss_mlp": 0.19958496, "step": 3799, "time_per_iteration": 2.602605104446411 }, { "auxiliary_loss_clip": 0.06527392, "auxiliary_loss_mlp": 0.01278255, "balance_loss_clip": 0.06293674, "balance_loss_mlp": 0.01259062, "epoch": 0.22846836013828348, "flos": 23299280409600.0, "grad_norm": 1.5682061504462252, "language_loss": 0.75680709, "learning_rate": 3.6008331381832484e-06, "loss": 0.83486354, "num_input_tokens_seen": 81795830, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.19189453, "step": 3800, "time_per_iteration": 2.578855037689209 }, { "auxiliary_loss_clip": 0.06522171, "auxiliary_loss_mlp": 0.01275603, "balance_loss_clip": 0.06290733, "balance_loss_mlp": 0.01256911, "epoch": 0.22852848339095144, "flos": 27423462844800.0, "grad_norm": 2.419951617971007, "language_loss": 0.64484543, "learning_rate": 3.600599647297484e-06, "loss": 0.7228232, "num_input_tokens_seen": 81815745, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.18701172, "step": 3801, "time_per_iteration": 2.6285648345947266 }, { "auxiliary_loss_clip": 0.06522678, "auxiliary_loss_mlp": 0.01273968, "balance_loss_clip": 0.06295213, "balance_loss_mlp": 0.01255515, "epoch": 0.2285886066436194, "flos": 26328054418560.0, "grad_norm": 1.6677995279420632, "language_loss": 0.82079017, "learning_rate": 3.60036609571682e-06, "loss": 0.89875662, "num_input_tokens_seen": 81835155, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.18457031, "step": 3802, "time_per_iteration": 2.6388885974884033 }, { "auxiliary_loss_clip": 0.06520783, "auxiliary_loss_mlp": 0.01278748, "balance_loss_clip": 0.06287307, "balance_loss_mlp": 0.01258745, "epoch": 0.2286487298962874, "flos": 29724298508160.0, "grad_norm": 2.093614074540454, "language_loss": 0.7952199, "learning_rate": 3.600132483450114e-06, "loss": 0.8732152, "num_input_tokens_seen": 81855655, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.20007324, "step": 3803, "time_per_iteration": 2.639857530593872 }, { "auxiliary_loss_clip": 0.06524795, "auxiliary_loss_mlp": 0.01275759, "balance_loss_clip": 0.06287412, "balance_loss_mlp": 0.01255934, "epoch": 0.22870885314895537, "flos": 21293435445120.0, "grad_norm": 1.8286251714552813, "language_loss": 0.86218131, "learning_rate": 3.5998988105062235e-06, "loss": 0.94018686, "num_input_tokens_seen": 81876385, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.19836426, "step": 3804, "time_per_iteration": 2.596717119216919 }, { "auxiliary_loss_clip": 0.06525861, "auxiliary_loss_mlp": 0.01274787, "balance_loss_clip": 0.06288424, "balance_loss_mlp": 0.0125613, "epoch": 0.22876897640162333, "flos": 14944754016000.0, "grad_norm": 1.8760064261824771, "language_loss": 0.77323329, "learning_rate": 3.59966507689401e-06, "loss": 0.8512398, "num_input_tokens_seen": 81893225, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.18652344, "step": 3805, "time_per_iteration": 3.9479541778564453 }, { "auxiliary_loss_clip": 0.06530511, "auxiliary_loss_mlp": 0.01272544, "balance_loss_clip": 0.06288554, "balance_loss_mlp": 0.01253602, "epoch": 0.2288290996542913, "flos": 18119786526720.0, "grad_norm": 2.7200115976152, "language_loss": 0.79615271, "learning_rate": 3.5994312826223363e-06, "loss": 0.87418324, "num_input_tokens_seen": 81911350, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.18969727, "step": 3806, "time_per_iteration": 2.527967929840088 }, { "auxiliary_loss_clip": 0.06523518, "auxiliary_loss_mlp": 0.01276703, "balance_loss_clip": 0.06289802, "balance_loss_mlp": 0.01258309, "epoch": 0.22888922290695926, "flos": 39864296828160.0, "grad_norm": 2.3055723406110795, "language_loss": 0.70624602, "learning_rate": 3.5991974277000684e-06, "loss": 0.78424823, "num_input_tokens_seen": 81935420, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.18408203, "step": 3807, "time_per_iteration": 2.706599473953247 }, { "auxiliary_loss_clip": 0.06534916, "auxiliary_loss_mlp": 0.01286636, "balance_loss_clip": 0.06294546, "balance_loss_mlp": 0.0126649, "epoch": 0.22894934615962723, "flos": 23410431250560.0, "grad_norm": 2.4896869103552004, "language_loss": 0.65963393, "learning_rate": 3.5989635121360733e-06, "loss": 0.73784947, "num_input_tokens_seen": 81953845, "router_z_loss_clip": 2.40429688, "router_z_loss_mlp": 0.20153809, "step": 3808, "time_per_iteration": 2.581467390060425 }, { "auxiliary_loss_clip": 0.06525184, "auxiliary_loss_mlp": 0.0127677, "balance_loss_clip": 0.0628844, "balance_loss_mlp": 0.01257851, "epoch": 0.22900946941229522, "flos": 18848898829440.0, "grad_norm": 1.823452349709497, "language_loss": 0.75854659, "learning_rate": 3.598729535939222e-06, "loss": 0.83656621, "num_input_tokens_seen": 81972100, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.18933105, "step": 3809, "time_per_iteration": 2.566826820373535 }, { "auxiliary_loss_clip": 0.06525914, "auxiliary_loss_mlp": 0.01278658, "balance_loss_clip": 0.06292904, "balance_loss_mlp": 0.01259918, "epoch": 0.22906959266496318, "flos": 22935961105920.0, "grad_norm": 1.472216913310636, "language_loss": 0.82206529, "learning_rate": 3.5984954991183862e-06, "loss": 0.90011102, "num_input_tokens_seen": 81992760, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.18737793, "step": 3810, "time_per_iteration": 4.029102563858032 }, { "auxiliary_loss_clip": 0.06519978, "auxiliary_loss_mlp": 0.01276176, "balance_loss_clip": 0.06288371, "balance_loss_mlp": 0.0125839, "epoch": 0.22912971591763115, "flos": 19360614913920.0, "grad_norm": 4.057909176402216, "language_loss": 0.78953946, "learning_rate": 3.598261401682441e-06, "loss": 0.86750102, "num_input_tokens_seen": 82009080, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.17785645, "step": 3811, "time_per_iteration": 2.5582475662231445 }, { "auxiliary_loss_clip": 0.06526475, "auxiliary_loss_mlp": 0.01275466, "balance_loss_clip": 0.06292794, "balance_loss_mlp": 0.01256977, "epoch": 0.22918983917029911, "flos": 19938940594560.0, "grad_norm": 1.892493907472309, "language_loss": 0.83275568, "learning_rate": 3.5980272436402632e-06, "loss": 0.91077513, "num_input_tokens_seen": 82026705, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.18518066, "step": 3812, "time_per_iteration": 2.675013303756714 }, { "auxiliary_loss_clip": 0.06536272, "auxiliary_loss_mlp": 0.01284043, "balance_loss_clip": 0.0629337, "balance_loss_mlp": 0.01264529, "epoch": 0.22924996242296708, "flos": 16696501873920.0, "grad_norm": 3.460332014834686, "language_loss": 0.84193933, "learning_rate": 3.5977930250007324e-06, "loss": 0.92014253, "num_input_tokens_seen": 82043245, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.1953125, "step": 3813, "time_per_iteration": 2.554422616958618 }, { "auxiliary_loss_clip": 0.06525995, "auxiliary_loss_mlp": 0.01275164, "balance_loss_clip": 0.06291876, "balance_loss_mlp": 0.01257044, "epoch": 0.22931008567563504, "flos": 33044457490560.0, "grad_norm": 1.6217155742436697, "language_loss": 0.70885712, "learning_rate": 3.5975587457727298e-06, "loss": 0.78686875, "num_input_tokens_seen": 82066870, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.18115234, "step": 3814, "time_per_iteration": 2.689460277557373 }, { "auxiliary_loss_clip": 0.06523947, "auxiliary_loss_mlp": 0.01271808, "balance_loss_clip": 0.06292567, "balance_loss_mlp": 0.01253867, "epoch": 0.229370208928303, "flos": 23337322963200.0, "grad_norm": 2.8230965582630114, "language_loss": 0.67548239, "learning_rate": 3.597324405965139e-06, "loss": 0.75343996, "num_input_tokens_seen": 82083180, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.17956543, "step": 3815, "time_per_iteration": 4.060056686401367 }, { "auxiliary_loss_clip": 0.0653569, "auxiliary_loss_mlp": 0.01278751, "balance_loss_clip": 0.06299985, "balance_loss_mlp": 0.01259189, "epoch": 0.229430332180971, "flos": 28624068472320.0, "grad_norm": 3.1091382979330024, "language_loss": 0.83845133, "learning_rate": 3.597090005586848e-06, "loss": 0.91659582, "num_input_tokens_seen": 82102950, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.19567871, "step": 3816, "time_per_iteration": 2.734433889389038 }, { "auxiliary_loss_clip": 0.0653338, "auxiliary_loss_mlp": 0.01271781, "balance_loss_clip": 0.06299116, "balance_loss_mlp": 0.01252838, "epoch": 0.22949045543363897, "flos": 17243912597760.0, "grad_norm": 2.408408961022957, "language_loss": 0.87862182, "learning_rate": 3.596855544646742e-06, "loss": 0.95667338, "num_input_tokens_seen": 82119510, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.18933105, "step": 3817, "time_per_iteration": 2.551297664642334 }, { "auxiliary_loss_clip": 0.06535953, "auxiliary_loss_mlp": 0.0127184, "balance_loss_clip": 0.0629982, "balance_loss_mlp": 0.01252576, "epoch": 0.22955057868630693, "flos": 27496654986240.0, "grad_norm": 1.9657537401363165, "language_loss": 0.75463557, "learning_rate": 3.5966210231537154e-06, "loss": 0.83271348, "num_input_tokens_seen": 82140095, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.19262695, "step": 3818, "time_per_iteration": 4.068502902984619 }, { "auxiliary_loss_clip": 0.06536719, "auxiliary_loss_mlp": 0.01273669, "balance_loss_clip": 0.06301633, "balance_loss_mlp": 0.01253094, "epoch": 0.2296107019389749, "flos": 23483036413440.0, "grad_norm": 1.8520327525913833, "language_loss": 0.75313222, "learning_rate": 3.596386441116659e-06, "loss": 0.83123606, "num_input_tokens_seen": 82159510, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.20581055, "step": 3819, "time_per_iteration": 2.616800308227539 }, { "auxiliary_loss_clip": 0.06542967, "auxiliary_loss_mlp": 0.01276192, "balance_loss_clip": 0.06306081, "balance_loss_mlp": 0.01258108, "epoch": 0.22967082519164286, "flos": 31293212757120.0, "grad_norm": 1.5470371212291356, "language_loss": 0.81903541, "learning_rate": 3.5961517985444684e-06, "loss": 0.89722705, "num_input_tokens_seen": 82179580, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.18078613, "step": 3820, "time_per_iteration": 2.6669304370880127 }, { "auxiliary_loss_clip": 0.06545364, "auxiliary_loss_mlp": 0.01274197, "balance_loss_clip": 0.0630545, "balance_loss_mlp": 0.01255064, "epoch": 0.22973094844431083, "flos": 14647415402880.0, "grad_norm": 2.2951572894759567, "language_loss": 0.70231843, "learning_rate": 3.595917095446042e-06, "loss": 0.780514, "num_input_tokens_seen": 82195585, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.19140625, "step": 3821, "time_per_iteration": 2.551464796066284 }, { "auxiliary_loss_clip": 0.06540591, "auxiliary_loss_mlp": 0.01273878, "balance_loss_clip": 0.06307616, "balance_loss_mlp": 0.01254339, "epoch": 0.2297910716969788, "flos": 22831057393920.0, "grad_norm": 1.6182369632872518, "language_loss": 0.83715135, "learning_rate": 3.5956823318302796e-06, "loss": 0.91529608, "num_input_tokens_seen": 82217530, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.1953125, "step": 3822, "time_per_iteration": 2.6772451400756836 }, { "auxiliary_loss_clip": 0.06529879, "auxiliary_loss_mlp": 0.01277377, "balance_loss_clip": 0.06299502, "balance_loss_mlp": 0.01257529, "epoch": 0.2298511949496468, "flos": 23045644500480.0, "grad_norm": 1.5453990813834446, "language_loss": 0.66428882, "learning_rate": 3.5954475077060833e-06, "loss": 0.74236137, "num_input_tokens_seen": 82237980, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.19848633, "step": 3823, "time_per_iteration": 2.6136977672576904 }, { "auxiliary_loss_clip": 0.0642046, "auxiliary_loss_mlp": 0.01257076, "balance_loss_clip": 0.06288046, "balance_loss_mlp": 0.01251944, "epoch": 0.22991131820231475, "flos": 66910296228480.0, "grad_norm": 0.763558210822774, "language_loss": 0.56963909, "learning_rate": 3.595212623082357e-06, "loss": 0.6464144, "num_input_tokens_seen": 82301785, "router_z_loss_clip": 1.32617188, "router_z_loss_mlp": 0.05133057, "step": 3824, "time_per_iteration": 3.256777763366699 }, { "auxiliary_loss_clip": 0.06536312, "auxiliary_loss_mlp": 0.01279055, "balance_loss_clip": 0.06307616, "balance_loss_mlp": 0.01261102, "epoch": 0.22997144145498272, "flos": 17891782767360.0, "grad_norm": 2.0475777434197626, "language_loss": 0.7424655, "learning_rate": 3.594977677968009e-06, "loss": 0.82061923, "num_input_tokens_seen": 82317355, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.17944336, "step": 3825, "time_per_iteration": 2.5539796352386475 }, { "auxiliary_loss_clip": 0.0653528, "auxiliary_loss_mlp": 0.01277164, "balance_loss_clip": 0.06303304, "balance_loss_mlp": 0.01257518, "epoch": 0.23003156470765068, "flos": 24683055062400.0, "grad_norm": 1.92947458649127, "language_loss": 0.88316023, "learning_rate": 3.5947426723719473e-06, "loss": 0.96128464, "num_input_tokens_seen": 82336645, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.1965332, "step": 3826, "time_per_iteration": 2.614382266998291 }, { "auxiliary_loss_clip": 0.06546786, "auxiliary_loss_mlp": 0.01281316, "balance_loss_clip": 0.06306618, "balance_loss_mlp": 0.01260753, "epoch": 0.23009168796031865, "flos": 15819412060800.0, "grad_norm": 2.462249233433681, "language_loss": 0.82762259, "learning_rate": 3.594507606303083e-06, "loss": 0.90590358, "num_input_tokens_seen": 82354225, "router_z_loss_clip": 2.40429688, "router_z_loss_mlp": 0.20556641, "step": 3827, "time_per_iteration": 2.556148052215576 }, { "auxiliary_loss_clip": 0.06532948, "auxiliary_loss_mlp": 0.0128024, "balance_loss_clip": 0.0630262, "balance_loss_mlp": 0.01261167, "epoch": 0.2301518112129866, "flos": 16217755171200.0, "grad_norm": 1.9022883976487763, "language_loss": 0.87464201, "learning_rate": 3.5942724797703314e-06, "loss": 0.95277393, "num_input_tokens_seen": 82370240, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.1907959, "step": 3828, "time_per_iteration": 2.5412683486938477 }, { "auxiliary_loss_clip": 0.06536857, "auxiliary_loss_mlp": 0.0127639, "balance_loss_clip": 0.06303547, "balance_loss_mlp": 0.01257102, "epoch": 0.2302119344656546, "flos": 20601820644480.0, "grad_norm": 2.6991494730270045, "language_loss": 0.71168363, "learning_rate": 3.594037292782607e-06, "loss": 0.78981614, "num_input_tokens_seen": 82389145, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.19287109, "step": 3829, "time_per_iteration": 2.5681116580963135 }, { "auxiliary_loss_clip": 0.06535871, "auxiliary_loss_mlp": 0.01272784, "balance_loss_clip": 0.06308898, "balance_loss_mlp": 0.01255582, "epoch": 0.23027205771832257, "flos": 26804117790720.0, "grad_norm": 1.5370918065770263, "language_loss": 0.85052592, "learning_rate": 3.5938020453488293e-06, "loss": 0.92861247, "num_input_tokens_seen": 82409185, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.17199707, "step": 3830, "time_per_iteration": 2.6283133029937744 }, { "auxiliary_loss_clip": 0.06539425, "auxiliary_loss_mlp": 0.01278359, "balance_loss_clip": 0.06307819, "balance_loss_mlp": 0.01259572, "epoch": 0.23033218097099054, "flos": 43883365916160.0, "grad_norm": 1.7610137187698678, "language_loss": 0.67552447, "learning_rate": 3.5935667374779177e-06, "loss": 0.75370234, "num_input_tokens_seen": 82432070, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.18798828, "step": 3831, "time_per_iteration": 2.7697412967681885 }, { "auxiliary_loss_clip": 0.06539892, "auxiliary_loss_mlp": 0.01278393, "balance_loss_clip": 0.06306942, "balance_loss_mlp": 0.01257818, "epoch": 0.2303923042236585, "flos": 26074837779840.0, "grad_norm": 3.137842860157637, "language_loss": 0.76805806, "learning_rate": 3.5933313691787957e-06, "loss": 0.84624088, "num_input_tokens_seen": 82450625, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.20568848, "step": 3832, "time_per_iteration": 2.6065778732299805 }, { "auxiliary_loss_clip": 0.06542207, "auxiliary_loss_mlp": 0.01278449, "balance_loss_clip": 0.06307908, "balance_loss_mlp": 0.01258851, "epoch": 0.23045242747632647, "flos": 18302284719360.0, "grad_norm": 1.6944820262618179, "language_loss": 0.88104248, "learning_rate": 3.593095940460389e-06, "loss": 0.95924902, "num_input_tokens_seen": 82468575, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.19616699, "step": 3833, "time_per_iteration": 2.531233787536621 }, { "auxiliary_loss_clip": 0.06538235, "auxiliary_loss_mlp": 0.0127253, "balance_loss_clip": 0.06305111, "balance_loss_mlp": 0.01253099, "epoch": 0.23051255072899443, "flos": 25527636691200.0, "grad_norm": 1.8425263839445636, "language_loss": 0.75951958, "learning_rate": 3.592860451331624e-06, "loss": 0.83762717, "num_input_tokens_seen": 82488655, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.19458008, "step": 3834, "time_per_iteration": 2.608328342437744 }, { "auxiliary_loss_clip": 0.06540484, "auxiliary_loss_mlp": 0.01281108, "balance_loss_clip": 0.06309235, "balance_loss_mlp": 0.01261283, "epoch": 0.2305726739816624, "flos": 21221584968960.0, "grad_norm": 2.0740628094467852, "language_loss": 0.864833, "learning_rate": 3.592624901801432e-06, "loss": 0.94304883, "num_input_tokens_seen": 82507220, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.19812012, "step": 3835, "time_per_iteration": 2.594825506210327 }, { "auxiliary_loss_clip": 0.06547467, "auxiliary_loss_mlp": 0.01277472, "balance_loss_clip": 0.06310354, "balance_loss_mlp": 0.0125878, "epoch": 0.2306327972343304, "flos": 23337826087680.0, "grad_norm": 2.4195196258656844, "language_loss": 0.83645052, "learning_rate": 3.5923892918787432e-06, "loss": 0.91469991, "num_input_tokens_seen": 82527920, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.18688965, "step": 3836, "time_per_iteration": 2.594911575317383 }, { "auxiliary_loss_clip": 0.06544532, "auxiliary_loss_mlp": 0.01277433, "balance_loss_clip": 0.06312107, "balance_loss_mlp": 0.01258752, "epoch": 0.23069292048699835, "flos": 20672832579840.0, "grad_norm": 1.6054051358986183, "language_loss": 0.80272841, "learning_rate": 3.5921536215724934e-06, "loss": 0.88094807, "num_input_tokens_seen": 82549040, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.18676758, "step": 3837, "time_per_iteration": 2.5872533321380615 }, { "auxiliary_loss_clip": 0.06433336, "auxiliary_loss_mlp": 0.01255802, "balance_loss_clip": 0.06300974, "balance_loss_mlp": 0.01250607, "epoch": 0.23075304373966632, "flos": 70472854673280.0, "grad_norm": 0.8620694223715952, "language_loss": 0.65454912, "learning_rate": 3.5919178908916184e-06, "loss": 0.73144042, "num_input_tokens_seen": 82604070, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.05197144, "step": 3838, "time_per_iteration": 3.1097190380096436 }, { "auxiliary_loss_clip": 0.06538699, "auxiliary_loss_mlp": 0.01276624, "balance_loss_clip": 0.06307948, "balance_loss_mlp": 0.01258802, "epoch": 0.23081316699233428, "flos": 16623603221760.0, "grad_norm": 1.9289125395324564, "language_loss": 0.76616204, "learning_rate": 3.591682099845058e-06, "loss": 0.84431529, "num_input_tokens_seen": 82619665, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.17810059, "step": 3839, "time_per_iteration": 2.557483434677124 }, { "auxiliary_loss_clip": 0.06544127, "auxiliary_loss_mlp": 0.01278324, "balance_loss_clip": 0.06309746, "balance_loss_mlp": 0.01259596, "epoch": 0.23087329024500225, "flos": 13303192677120.0, "grad_norm": 1.740631223181113, "language_loss": 0.69258332, "learning_rate": 3.591446248441752e-06, "loss": 0.7708078, "num_input_tokens_seen": 82637530, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.18725586, "step": 3840, "time_per_iteration": 2.562643527984619 }, { "auxiliary_loss_clip": 0.06536937, "auxiliary_loss_mlp": 0.01280628, "balance_loss_clip": 0.06307441, "balance_loss_mlp": 0.01261483, "epoch": 0.23093341349767021, "flos": 17791574883840.0, "grad_norm": 2.0318565654246337, "language_loss": 0.79770315, "learning_rate": 3.591210336690645e-06, "loss": 0.87587875, "num_input_tokens_seen": 82656130, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.19152832, "step": 3841, "time_per_iteration": 2.580404043197632 }, { "auxiliary_loss_clip": 0.06533793, "auxiliary_loss_mlp": 0.01281634, "balance_loss_clip": 0.06304297, "balance_loss_mlp": 0.0126436, "epoch": 0.23099353675033818, "flos": 23994920206080.0, "grad_norm": 1.721732319033179, "language_loss": 0.83132362, "learning_rate": 3.590974364600683e-06, "loss": 0.90947783, "num_input_tokens_seen": 82675295, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.17297363, "step": 3842, "time_per_iteration": 2.610469102859497 }, { "auxiliary_loss_clip": 0.06538607, "auxiliary_loss_mlp": 0.01277156, "balance_loss_clip": 0.06308981, "balance_loss_mlp": 0.01258285, "epoch": 0.23105366000300617, "flos": 36004567478400.0, "grad_norm": 1.5652340199711232, "language_loss": 0.66869473, "learning_rate": 3.5907383321808135e-06, "loss": 0.74685234, "num_input_tokens_seen": 82703260, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.1887207, "step": 3843, "time_per_iteration": 2.783522844314575 }, { "auxiliary_loss_clip": 0.06538726, "auxiliary_loss_mlp": 0.01279525, "balance_loss_clip": 0.06312381, "balance_loss_mlp": 0.01261119, "epoch": 0.23111378325567414, "flos": 31252822289280.0, "grad_norm": 1.6614125422612804, "language_loss": 0.77474922, "learning_rate": 3.590502239439987e-06, "loss": 0.85293174, "num_input_tokens_seen": 82725060, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.18395996, "step": 3844, "time_per_iteration": 4.138825416564941 }, { "auxiliary_loss_clip": 0.06533986, "auxiliary_loss_mlp": 0.01282837, "balance_loss_clip": 0.06301723, "balance_loss_mlp": 0.01263633, "epoch": 0.2311739065083421, "flos": 19214230631040.0, "grad_norm": 1.7628543951280466, "language_loss": 0.78719318, "learning_rate": 3.590266086387156e-06, "loss": 0.86536145, "num_input_tokens_seen": 82742960, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.1920166, "step": 3845, "time_per_iteration": 2.6446657180786133 }, { "auxiliary_loss_clip": 0.06527045, "auxiliary_loss_mlp": 0.01280984, "balance_loss_clip": 0.06303966, "balance_loss_mlp": 0.01263901, "epoch": 0.23123402976101007, "flos": 23365638443520.0, "grad_norm": 1.8654079776077945, "language_loss": 0.76151812, "learning_rate": 3.590029873031276e-06, "loss": 0.83959842, "num_input_tokens_seen": 82760205, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.17077637, "step": 3846, "time_per_iteration": 2.6298773288726807 }, { "auxiliary_loss_clip": 0.06535898, "auxiliary_loss_mlp": 0.0128861, "balance_loss_clip": 0.06305566, "balance_loss_mlp": 0.01269394, "epoch": 0.23129415301367803, "flos": 13740458808960.0, "grad_norm": 1.7824324671343568, "language_loss": 0.70136189, "learning_rate": 3.589793599381304e-06, "loss": 0.77960694, "num_input_tokens_seen": 82778590, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.19213867, "step": 3847, "time_per_iteration": 2.5683791637420654 }, { "auxiliary_loss_clip": 0.06431936, "auxiliary_loss_mlp": 0.01258315, "balance_loss_clip": 0.06300512, "balance_loss_mlp": 0.01253412, "epoch": 0.231354276266346, "flos": 69756907461120.0, "grad_norm": 0.7634085173470085, "language_loss": 0.60985053, "learning_rate": 3.589557265446198e-06, "loss": 0.68675303, "num_input_tokens_seen": 82833925, "router_z_loss_clip": 1.3125, "router_z_loss_mlp": 0.04898071, "step": 3848, "time_per_iteration": 3.101536273956299 }, { "auxiliary_loss_clip": 0.06539819, "auxiliary_loss_mlp": 0.01283884, "balance_loss_clip": 0.06308444, "balance_loss_mlp": 0.01265311, "epoch": 0.231414399519014, "flos": 18840597275520.0, "grad_norm": 2.4665129259260565, "language_loss": 0.79108441, "learning_rate": 3.589320871234923e-06, "loss": 0.86932147, "num_input_tokens_seen": 82850625, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.18579102, "step": 3849, "time_per_iteration": 4.004235744476318 }, { "auxiliary_loss_clip": 0.06537601, "auxiliary_loss_mlp": 0.01283821, "balance_loss_clip": 0.06305767, "balance_loss_mlp": 0.01263615, "epoch": 0.23147452277168196, "flos": 36143949945600.0, "grad_norm": 2.069737697748291, "language_loss": 0.71640551, "learning_rate": 3.5890844167564405e-06, "loss": 0.79461974, "num_input_tokens_seen": 82872105, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.20227051, "step": 3850, "time_per_iteration": 2.708922863006592 }, { "auxiliary_loss_clip": 0.06534207, "auxiliary_loss_mlp": 0.01284985, "balance_loss_clip": 0.06305167, "balance_loss_mlp": 0.01266818, "epoch": 0.23153464602434992, "flos": 20819091081600.0, "grad_norm": 1.974211063738504, "language_loss": 0.7651521, "learning_rate": 3.588847902019718e-06, "loss": 0.84334397, "num_input_tokens_seen": 82890595, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.18164062, "step": 3851, "time_per_iteration": 2.5862877368927 }, { "auxiliary_loss_clip": 0.0652981, "auxiliary_loss_mlp": 0.0128281, "balance_loss_clip": 0.06302199, "balance_loss_mlp": 0.01263224, "epoch": 0.2315947692770179, "flos": 19945606993920.0, "grad_norm": 2.092771969102281, "language_loss": 0.70628679, "learning_rate": 3.588611327033723e-06, "loss": 0.78441298, "num_input_tokens_seen": 82908910, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.19604492, "step": 3852, "time_per_iteration": 2.5755350589752197 }, { "auxiliary_loss_clip": 0.06533916, "auxiliary_loss_mlp": 0.01282076, "balance_loss_clip": 0.06301436, "balance_loss_mlp": 0.01264016, "epoch": 0.23165489252968585, "flos": 12859805197440.0, "grad_norm": 3.282412511699313, "language_loss": 0.678563, "learning_rate": 3.588374691807428e-06, "loss": 0.75672287, "num_input_tokens_seen": 82925405, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.18078613, "step": 3853, "time_per_iteration": 2.5461461544036865 }, { "auxiliary_loss_clip": 0.06534852, "auxiliary_loss_mlp": 0.01275895, "balance_loss_clip": 0.06301416, "balance_loss_mlp": 0.01256392, "epoch": 0.23171501578235382, "flos": 30636202492800.0, "grad_norm": 1.7725672497650338, "language_loss": 0.80649328, "learning_rate": 3.5881379963498053e-06, "loss": 0.88460076, "num_input_tokens_seen": 82945615, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.19506836, "step": 3854, "time_per_iteration": 4.081850051879883 }, { "auxiliary_loss_clip": 0.06542845, "auxiliary_loss_mlp": 0.01277401, "balance_loss_clip": 0.06301448, "balance_loss_mlp": 0.01258042, "epoch": 0.23177513903502178, "flos": 23849709880320.0, "grad_norm": 2.245383763677251, "language_loss": 0.66076994, "learning_rate": 3.587901240669831e-06, "loss": 0.73897243, "num_input_tokens_seen": 82967570, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.19348145, "step": 3855, "time_per_iteration": 2.619837999343872 }, { "auxiliary_loss_clip": 0.06529758, "auxiliary_loss_mlp": 0.01282077, "balance_loss_clip": 0.0629763, "balance_loss_mlp": 0.01262479, "epoch": 0.23183526228768978, "flos": 29578040006400.0, "grad_norm": 2.1019444163095042, "language_loss": 0.71462417, "learning_rate": 3.5876644247764815e-06, "loss": 0.79274249, "num_input_tokens_seen": 82987435, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.19567871, "step": 3856, "time_per_iteration": 2.6529762744903564 }, { "auxiliary_loss_clip": 0.06529835, "auxiliary_loss_mlp": 0.01275299, "balance_loss_clip": 0.06299438, "balance_loss_mlp": 0.01257095, "epoch": 0.23189538554035774, "flos": 34467155164800.0, "grad_norm": 1.6218423309135321, "language_loss": 0.78216594, "learning_rate": 3.5874275486787387e-06, "loss": 0.86021721, "num_input_tokens_seen": 83010505, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.18225098, "step": 3857, "time_per_iteration": 4.2358105182647705 }, { "auxiliary_loss_clip": 0.06533338, "auxiliary_loss_mlp": 0.01278928, "balance_loss_clip": 0.06296965, "balance_loss_mlp": 0.01257888, "epoch": 0.2319555087930257, "flos": 18009558080640.0, "grad_norm": 2.205462157666678, "language_loss": 0.91744483, "learning_rate": 3.587190612385584e-06, "loss": 0.99556756, "num_input_tokens_seen": 83026705, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.21044922, "step": 3858, "time_per_iteration": 2.5547354221343994 }, { "auxiliary_loss_clip": 0.06530625, "auxiliary_loss_mlp": 0.01278345, "balance_loss_clip": 0.06304878, "balance_loss_mlp": 0.01259951, "epoch": 0.23201563204569367, "flos": 23149709671680.0, "grad_norm": 1.864557845256136, "language_loss": 0.77162862, "learning_rate": 3.5869536159060026e-06, "loss": 0.84971833, "num_input_tokens_seen": 83046500, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.18395996, "step": 3859, "time_per_iteration": 2.5792160034179688 }, { "auxiliary_loss_clip": 0.06531394, "auxiliary_loss_mlp": 0.01278209, "balance_loss_clip": 0.06301136, "balance_loss_mlp": 0.01259851, "epoch": 0.23207575529836164, "flos": 20674300026240.0, "grad_norm": 4.823917915724452, "language_loss": 0.85090816, "learning_rate": 3.58671655924898e-06, "loss": 0.92900419, "num_input_tokens_seen": 83065280, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.18359375, "step": 3860, "time_per_iteration": 2.570201873779297 }, { "auxiliary_loss_clip": 0.06528991, "auxiliary_loss_mlp": 0.01275611, "balance_loss_clip": 0.06301373, "balance_loss_mlp": 0.01255381, "epoch": 0.2321358785510296, "flos": 16477805917440.0, "grad_norm": 1.9964751774005687, "language_loss": 0.83639717, "learning_rate": 3.586479442423508e-06, "loss": 0.91444314, "num_input_tokens_seen": 83082310, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.20239258, "step": 3861, "time_per_iteration": 2.5274901390075684 }, { "auxiliary_loss_clip": 0.06532286, "auxiliary_loss_mlp": 0.01283118, "balance_loss_clip": 0.06302361, "balance_loss_mlp": 0.01263985, "epoch": 0.2321960018036976, "flos": 21622737191040.0, "grad_norm": 1.8262099471937672, "language_loss": 0.86403418, "learning_rate": 3.586242265438576e-06, "loss": 0.94218826, "num_input_tokens_seen": 83102065, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.19128418, "step": 3862, "time_per_iteration": 2.735530138015747 }, { "auxiliary_loss_clip": 0.06529437, "auxiliary_loss_mlp": 0.01282074, "balance_loss_clip": 0.06305077, "balance_loss_mlp": 0.01264466, "epoch": 0.23225612505636556, "flos": 22277734957440.0, "grad_norm": 1.3785594042803058, "language_loss": 0.75821292, "learning_rate": 3.5860050283031773e-06, "loss": 0.83632803, "num_input_tokens_seen": 83121445, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.17602539, "step": 3863, "time_per_iteration": 2.6091060638427734 }, { "auxiliary_loss_clip": 0.06522875, "auxiliary_loss_mlp": 0.01276143, "balance_loss_clip": 0.0629922, "balance_loss_mlp": 0.01257927, "epoch": 0.23231624830903352, "flos": 17057431336320.0, "grad_norm": 1.701100974481311, "language_loss": 0.74543011, "learning_rate": 3.58576773102631e-06, "loss": 0.82342029, "num_input_tokens_seen": 83138175, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.18225098, "step": 3864, "time_per_iteration": 2.541907548904419 }, { "auxiliary_loss_clip": 0.06528582, "auxiliary_loss_mlp": 0.01272194, "balance_loss_clip": 0.06299669, "balance_loss_mlp": 0.01253216, "epoch": 0.2323763715617015, "flos": 34648353619200.0, "grad_norm": 1.6637096683763029, "language_loss": 0.70653236, "learning_rate": 3.5855303736169714e-06, "loss": 0.78454012, "num_input_tokens_seen": 83161975, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.18981934, "step": 3865, "time_per_iteration": 2.7080724239349365 }, { "auxiliary_loss_clip": 0.06538023, "auxiliary_loss_mlp": 0.01275423, "balance_loss_clip": 0.06299108, "balance_loss_mlp": 0.01254907, "epoch": 0.23243649481436945, "flos": 25557922742400.0, "grad_norm": 1.7220869919318913, "language_loss": 0.95753837, "learning_rate": 3.5852929560841617e-06, "loss": 1.0356729, "num_input_tokens_seen": 83180905, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.20507812, "step": 3866, "time_per_iteration": 2.64217209815979 }, { "auxiliary_loss_clip": 0.06526998, "auxiliary_loss_mlp": 0.01276939, "balance_loss_clip": 0.06296959, "balance_loss_mlp": 0.01257973, "epoch": 0.23249661806703742, "flos": 20489411992320.0, "grad_norm": 4.117826057643592, "language_loss": 0.73423779, "learning_rate": 3.5850554784368846e-06, "loss": 0.8122772, "num_input_tokens_seen": 83196390, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.18969727, "step": 3867, "time_per_iteration": 2.5599257946014404 }, { "auxiliary_loss_clip": 0.06528619, "auxiliary_loss_mlp": 0.01275977, "balance_loss_clip": 0.06298009, "balance_loss_mlp": 0.01256212, "epoch": 0.23255674131970538, "flos": 20382956979840.0, "grad_norm": 2.557690219526364, "language_loss": 0.82616198, "learning_rate": 3.584817940684145e-06, "loss": 0.90420789, "num_input_tokens_seen": 83216165, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.19763184, "step": 3868, "time_per_iteration": 2.5975277423858643 }, { "auxiliary_loss_clip": 0.06527787, "auxiliary_loss_mlp": 0.01273101, "balance_loss_clip": 0.06300421, "balance_loss_mlp": 0.01255113, "epoch": 0.23261686457237338, "flos": 17061833675520.0, "grad_norm": 1.685126006202794, "language_loss": 0.73854816, "learning_rate": 3.58458034283495e-06, "loss": 0.81655705, "num_input_tokens_seen": 83233845, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.18005371, "step": 3869, "time_per_iteration": 2.5514283180236816 }, { "auxiliary_loss_clip": 0.06527777, "auxiliary_loss_mlp": 0.0127864, "balance_loss_clip": 0.0629879, "balance_loss_mlp": 0.01259257, "epoch": 0.23267698782504134, "flos": 29177726325120.0, "grad_norm": 4.884663112267204, "language_loss": 0.80641294, "learning_rate": 3.5843426848983097e-06, "loss": 0.88447714, "num_input_tokens_seen": 83254930, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.19384766, "step": 3870, "time_per_iteration": 2.622922658920288 }, { "auxiliary_loss_clip": 0.06534936, "auxiliary_loss_mlp": 0.01282066, "balance_loss_clip": 0.06299663, "balance_loss_mlp": 0.012631, "epoch": 0.2327371110777093, "flos": 21180355960320.0, "grad_norm": 2.149327830584134, "language_loss": 0.70938742, "learning_rate": 3.5841049668832357e-06, "loss": 0.78755736, "num_input_tokens_seen": 83272095, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.1895752, "step": 3871, "time_per_iteration": 2.5707714557647705 }, { "auxiliary_loss_clip": 0.06534933, "auxiliary_loss_mlp": 0.01278773, "balance_loss_clip": 0.06300196, "balance_loss_mlp": 0.01258365, "epoch": 0.23279723433037727, "flos": 24869997521280.0, "grad_norm": 1.8442031350461805, "language_loss": 0.69762725, "learning_rate": 3.5838671887987433e-06, "loss": 0.77576429, "num_input_tokens_seen": 83290980, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.20397949, "step": 3872, "time_per_iteration": 2.6413564682006836 }, { "auxiliary_loss_clip": 0.0653528, "auxiliary_loss_mlp": 0.01283373, "balance_loss_clip": 0.06298024, "balance_loss_mlp": 0.01263358, "epoch": 0.23285735758304524, "flos": 38809823921280.0, "grad_norm": 1.883870621086084, "language_loss": 0.78462225, "learning_rate": 3.5836293506538474e-06, "loss": 0.86280882, "num_input_tokens_seen": 83315175, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.20019531, "step": 3873, "time_per_iteration": 2.7246110439300537 }, { "auxiliary_loss_clip": 0.06404945, "auxiliary_loss_mlp": 0.01262598, "balance_loss_clip": 0.06274081, "balance_loss_mlp": 0.01257731, "epoch": 0.2329174808357132, "flos": 53962274280960.0, "grad_norm": 0.8279291561415516, "language_loss": 0.60177165, "learning_rate": 3.5833914524575687e-06, "loss": 0.67844713, "num_input_tokens_seen": 83372060, "router_z_loss_clip": 1.31152344, "router_z_loss_mlp": 0.0486145, "step": 3874, "time_per_iteration": 3.0884509086608887 }, { "auxiliary_loss_clip": 0.06525064, "auxiliary_loss_mlp": 0.01276374, "balance_loss_clip": 0.0629583, "balance_loss_mlp": 0.01256776, "epoch": 0.23297760408838117, "flos": 21222549290880.0, "grad_norm": 2.6166875135647216, "language_loss": 0.81934506, "learning_rate": 3.583153494218927e-06, "loss": 0.89735943, "num_input_tokens_seen": 83389795, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.19604492, "step": 3875, "time_per_iteration": 2.5819342136383057 }, { "auxiliary_loss_clip": 0.06525917, "auxiliary_loss_mlp": 0.01276012, "balance_loss_clip": 0.06298947, "balance_loss_mlp": 0.01257522, "epoch": 0.23303772734104916, "flos": 28410613395840.0, "grad_norm": 1.5930956841343955, "language_loss": 0.61383373, "learning_rate": 3.5829154759469464e-06, "loss": 0.69185299, "num_input_tokens_seen": 83410005, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.18493652, "step": 3876, "time_per_iteration": 2.6333515644073486 }, { "auxiliary_loss_clip": 0.06528533, "auxiliary_loss_mlp": 0.01278896, "balance_loss_clip": 0.06296846, "balance_loss_mlp": 0.01259525, "epoch": 0.23309785059371713, "flos": 24321328986240.0, "grad_norm": 1.6306951806245973, "language_loss": 0.71127999, "learning_rate": 3.5826773976506523e-06, "loss": 0.78935432, "num_input_tokens_seen": 83430250, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.19372559, "step": 3877, "time_per_iteration": 2.584667444229126 }, { "auxiliary_loss_clip": 0.06527398, "auxiliary_loss_mlp": 0.01276504, "balance_loss_clip": 0.06299064, "balance_loss_mlp": 0.01256238, "epoch": 0.2331579738463851, "flos": 15997633695360.0, "grad_norm": 4.830857898953259, "language_loss": 0.81489748, "learning_rate": 3.582439259339073e-06, "loss": 0.89293653, "num_input_tokens_seen": 83447950, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.20263672, "step": 3878, "time_per_iteration": 2.537277936935425 }, { "auxiliary_loss_clip": 0.0652885, "auxiliary_loss_mlp": 0.01282985, "balance_loss_clip": 0.06293909, "balance_loss_mlp": 0.01263447, "epoch": 0.23321809709905306, "flos": 36435418773120.0, "grad_norm": 1.590129324839292, "language_loss": 0.75123119, "learning_rate": 3.5822010610212374e-06, "loss": 0.82934952, "num_input_tokens_seen": 83467785, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.19555664, "step": 3879, "time_per_iteration": 2.680407762527466 }, { "auxiliary_loss_clip": 0.06527779, "auxiliary_loss_mlp": 0.01276786, "balance_loss_clip": 0.06299564, "balance_loss_mlp": 0.01257581, "epoch": 0.23327822035172102, "flos": 21331184509440.0, "grad_norm": 2.3481209533342704, "language_loss": 0.90502638, "learning_rate": 3.5819628027061795e-06, "loss": 0.98307198, "num_input_tokens_seen": 83485390, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.19213867, "step": 3880, "time_per_iteration": 2.5576012134552 }, { "auxiliary_loss_clip": 0.06528395, "auxiliary_loss_mlp": 0.01277672, "balance_loss_clip": 0.06295899, "balance_loss_mlp": 0.01259386, "epoch": 0.233338343604389, "flos": 19177907086080.0, "grad_norm": 1.7862247316388138, "language_loss": 0.72226512, "learning_rate": 3.5817244844029334e-06, "loss": 0.80032575, "num_input_tokens_seen": 83504890, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.1829834, "step": 3881, "time_per_iteration": 2.5689644813537598 }, { "auxiliary_loss_clip": 0.06525555, "auxiliary_loss_mlp": 0.01278226, "balance_loss_clip": 0.06296575, "balance_loss_mlp": 0.01259379, "epoch": 0.23339846685705698, "flos": 26915939464320.0, "grad_norm": 1.6443101752503377, "language_loss": 0.68460262, "learning_rate": 3.581486106120537e-06, "loss": 0.76264048, "num_input_tokens_seen": 83526475, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.18835449, "step": 3882, "time_per_iteration": 2.611602544784546 }, { "auxiliary_loss_clip": 0.06525957, "auxiliary_loss_mlp": 0.01277475, "balance_loss_clip": 0.06294597, "balance_loss_mlp": 0.01258592, "epoch": 0.23345859010972494, "flos": 32351375243520.0, "grad_norm": 2.90896322074649, "language_loss": 0.77633119, "learning_rate": 3.5812476678680287e-06, "loss": 0.85436553, "num_input_tokens_seen": 83546620, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.18896484, "step": 3883, "time_per_iteration": 4.0844948291778564 }, { "auxiliary_loss_clip": 0.06402728, "auxiliary_loss_mlp": 0.01255894, "balance_loss_clip": 0.06272633, "balance_loss_mlp": 0.01250944, "epoch": 0.2335187133623929, "flos": 58505805273600.0, "grad_norm": 0.7565849176012877, "language_loss": 0.59174728, "learning_rate": 3.58100916965445e-06, "loss": 0.66833353, "num_input_tokens_seen": 83616160, "router_z_loss_clip": 1.296875, "router_z_loss_mlp": 0.04946899, "step": 3884, "time_per_iteration": 3.3502235412597656 }, { "auxiliary_loss_clip": 0.06528331, "auxiliary_loss_mlp": 0.01275034, "balance_loss_clip": 0.06296967, "balance_loss_mlp": 0.0125633, "epoch": 0.23357883661506088, "flos": 24509822745600.0, "grad_norm": 2.0465580402335437, "language_loss": 0.80368471, "learning_rate": 3.5807706114888455e-06, "loss": 0.8817184, "num_input_tokens_seen": 83636795, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.18701172, "step": 3885, "time_per_iteration": 2.6924593448638916 }, { "auxiliary_loss_clip": 0.0652995, "auxiliary_loss_mlp": 0.01276269, "balance_loss_clip": 0.06304348, "balance_loss_mlp": 0.01257577, "epoch": 0.23363895986772884, "flos": 18953760614400.0, "grad_norm": 2.2159334757339217, "language_loss": 0.88767856, "learning_rate": 3.580531993380261e-06, "loss": 0.9657408, "num_input_tokens_seen": 83654050, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.18688965, "step": 3886, "time_per_iteration": 2.5833497047424316 }, { "auxiliary_loss_clip": 0.06526537, "auxiliary_loss_mlp": 0.01278105, "balance_loss_clip": 0.06296596, "balance_loss_mlp": 0.01259473, "epoch": 0.2336990831203968, "flos": 31694993884800.0, "grad_norm": 2.1757433083613713, "language_loss": 0.74374366, "learning_rate": 3.5802933153377445e-06, "loss": 0.8217901, "num_input_tokens_seen": 83673720, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.18615723, "step": 3887, "time_per_iteration": 2.641467571258545 }, { "auxiliary_loss_clip": 0.06523007, "auxiliary_loss_mlp": 0.01275607, "balance_loss_clip": 0.06293093, "balance_loss_mlp": 0.01256737, "epoch": 0.23375920637306477, "flos": 27717237659520.0, "grad_norm": 1.8377779096498177, "language_loss": 0.84856647, "learning_rate": 3.5800545773703475e-06, "loss": 0.92655265, "num_input_tokens_seen": 83693470, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.18884277, "step": 3888, "time_per_iteration": 2.67206072807312 }, { "auxiliary_loss_clip": 0.06530704, "auxiliary_loss_mlp": 0.01278213, "balance_loss_clip": 0.06304913, "balance_loss_mlp": 0.01259509, "epoch": 0.23381932962573276, "flos": 17681346437760.0, "grad_norm": 2.082685594374406, "language_loss": 0.87698311, "learning_rate": 3.5798157794871225e-06, "loss": 0.95507228, "num_input_tokens_seen": 83711620, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.18701172, "step": 3889, "time_per_iteration": 3.99568510055542 }, { "auxiliary_loss_clip": 0.06520887, "auxiliary_loss_mlp": 0.01277662, "balance_loss_clip": 0.06294809, "balance_loss_mlp": 0.01260317, "epoch": 0.23387945287840073, "flos": 14395833918720.0, "grad_norm": 2.288337774712326, "language_loss": 0.77371329, "learning_rate": 3.579576921697125e-06, "loss": 0.85169876, "num_input_tokens_seen": 83727890, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.17333984, "step": 3890, "time_per_iteration": 2.545877695083618 }, { "auxiliary_loss_clip": 0.06523925, "auxiliary_loss_mlp": 0.01280909, "balance_loss_clip": 0.06295902, "balance_loss_mlp": 0.01262515, "epoch": 0.2339395761310687, "flos": 46108451888640.0, "grad_norm": 1.7619886966256277, "language_loss": 0.73837066, "learning_rate": 3.579338004009412e-06, "loss": 0.81641901, "num_input_tokens_seen": 83749370, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.18383789, "step": 3891, "time_per_iteration": 2.7861132621765137 }, { "auxiliary_loss_clip": 0.06520805, "auxiliary_loss_mlp": 0.0127765, "balance_loss_clip": 0.0629653, "balance_loss_mlp": 0.012604, "epoch": 0.23399969938373666, "flos": 22388508455040.0, "grad_norm": 1.6413787115320275, "language_loss": 0.8353585, "learning_rate": 3.5790990264330433e-06, "loss": 0.91334307, "num_input_tokens_seen": 83769560, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.17248535, "step": 3892, "time_per_iteration": 2.6020426750183105 }, { "auxiliary_loss_clip": 0.06526256, "auxiliary_loss_mlp": 0.0127812, "balance_loss_clip": 0.06297207, "balance_loss_mlp": 0.01259738, "epoch": 0.23405982263640462, "flos": 43518746874240.0, "grad_norm": 1.658647965052962, "language_loss": 0.65937996, "learning_rate": 3.578859988977082e-06, "loss": 0.73742366, "num_input_tokens_seen": 83795635, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.18395996, "step": 3893, "time_per_iteration": 2.7577033042907715 }, { "auxiliary_loss_clip": 0.06517074, "auxiliary_loss_mlp": 0.01277119, "balance_loss_clip": 0.06293826, "balance_loss_mlp": 0.01258558, "epoch": 0.2341199458890726, "flos": 22571216282880.0, "grad_norm": 1.973632347298517, "language_loss": 0.79250395, "learning_rate": 3.5786208916505916e-06, "loss": 0.87044585, "num_input_tokens_seen": 83814090, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.18566895, "step": 3894, "time_per_iteration": 3.9945216178894043 }, { "auxiliary_loss_clip": 0.06518213, "auxiliary_loss_mlp": 0.01273285, "balance_loss_clip": 0.06295222, "balance_loss_mlp": 0.01256083, "epoch": 0.23418006914174055, "flos": 25641764352000.0, "grad_norm": 2.2721911216057276, "language_loss": 0.82294774, "learning_rate": 3.5783817344626383e-06, "loss": 0.90086269, "num_input_tokens_seen": 83836870, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.17199707, "step": 3895, "time_per_iteration": 2.6270906925201416 }, { "auxiliary_loss_clip": 0.06520723, "auxiliary_loss_mlp": 0.01272362, "balance_loss_clip": 0.0629519, "balance_loss_mlp": 0.01254743, "epoch": 0.23424019239440855, "flos": 13549826770560.0, "grad_norm": 2.266007663111427, "language_loss": 0.81058514, "learning_rate": 3.578142517422292e-06, "loss": 0.88851595, "num_input_tokens_seen": 83853275, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.17614746, "step": 3896, "time_per_iteration": 4.042308807373047 }, { "auxiliary_loss_clip": 0.06522422, "auxiliary_loss_mlp": 0.01276581, "balance_loss_clip": 0.06292588, "balance_loss_mlp": 0.01256744, "epoch": 0.2343003156470765, "flos": 22426131738240.0, "grad_norm": 2.2414180934239334, "language_loss": 0.83749771, "learning_rate": 3.577903240538623e-06, "loss": 0.91548777, "num_input_tokens_seen": 83872340, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.19824219, "step": 3897, "time_per_iteration": 2.582984685897827 }, { "auxiliary_loss_clip": 0.0652552, "auxiliary_loss_mlp": 0.01276565, "balance_loss_clip": 0.06293425, "balance_loss_mlp": 0.01257051, "epoch": 0.23436043889974448, "flos": 14795644475520.0, "grad_norm": 1.5607247326260465, "language_loss": 0.79773021, "learning_rate": 3.577663903820705e-06, "loss": 0.87575102, "num_input_tokens_seen": 83888795, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.19519043, "step": 3898, "time_per_iteration": 2.54407000541687 }, { "auxiliary_loss_clip": 0.06512154, "auxiliary_loss_mlp": 0.01273251, "balance_loss_clip": 0.06291382, "balance_loss_mlp": 0.0125643, "epoch": 0.23442056215241244, "flos": 22972242723840.0, "grad_norm": 2.04411432035709, "language_loss": 0.74340713, "learning_rate": 3.577424507277614e-06, "loss": 0.82126117, "num_input_tokens_seen": 83906820, "router_z_loss_clip": 2.20800781, "router_z_loss_mlp": 0.16821289, "step": 3899, "time_per_iteration": 2.571568250656128 }, { "auxiliary_loss_clip": 0.06519584, "auxiliary_loss_mlp": 0.01273973, "balance_loss_clip": 0.06290966, "balance_loss_mlp": 0.01256008, "epoch": 0.2344806854050804, "flos": 23077901122560.0, "grad_norm": 4.281737365362743, "language_loss": 0.76259017, "learning_rate": 3.5771850509184277e-06, "loss": 0.84052575, "num_input_tokens_seen": 83926370, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.17956543, "step": 3900, "time_per_iteration": 2.6569595336914062 }, { "auxiliary_loss_clip": 0.06517534, "auxiliary_loss_mlp": 0.01275382, "balance_loss_clip": 0.06291172, "balance_loss_mlp": 0.01256487, "epoch": 0.23454080865774837, "flos": 16332805226880.0, "grad_norm": 1.8157416173765824, "language_loss": 0.67733639, "learning_rate": 3.5769455347522256e-06, "loss": 0.75526553, "num_input_tokens_seen": 83944600, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.18884277, "step": 3901, "time_per_iteration": 2.544771909713745 }, { "auxiliary_loss_clip": 0.06400798, "auxiliary_loss_mlp": 0.0127033, "balance_loss_clip": 0.0627269, "balance_loss_mlp": 0.01265386, "epoch": 0.23460093191041637, "flos": 67779545685120.0, "grad_norm": 0.7403878315068693, "language_loss": 0.58154273, "learning_rate": 3.576705958788091e-06, "loss": 0.65825403, "num_input_tokens_seen": 84005100, "router_z_loss_clip": 1.28125, "router_z_loss_mlp": 0.04940796, "step": 3902, "time_per_iteration": 3.1328482627868652 }, { "auxiliary_loss_clip": 0.06519808, "auxiliary_loss_mlp": 0.01274618, "balance_loss_clip": 0.06292269, "balance_loss_mlp": 0.01256558, "epoch": 0.23466105516308433, "flos": 20082725400960.0, "grad_norm": 1.7390983780323688, "language_loss": 0.80535609, "learning_rate": 3.576466323035108e-06, "loss": 0.8833003, "num_input_tokens_seen": 84023775, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.18066406, "step": 3903, "time_per_iteration": 2.5827877521514893 }, { "auxiliary_loss_clip": 0.06516325, "auxiliary_loss_mlp": 0.01273217, "balance_loss_clip": 0.06288796, "balance_loss_mlp": 0.01255515, "epoch": 0.2347211784157523, "flos": 24542708273280.0, "grad_norm": 1.7694250090841785, "language_loss": 0.82588738, "learning_rate": 3.5762266275023645e-06, "loss": 0.90378278, "num_input_tokens_seen": 84042605, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.17700195, "step": 3904, "time_per_iteration": 2.5858609676361084 }, { "auxiliary_loss_clip": 0.06519607, "auxiliary_loss_mlp": 0.01282563, "balance_loss_clip": 0.06291553, "balance_loss_mlp": 0.01263788, "epoch": 0.23478130166842026, "flos": 23811751180800.0, "grad_norm": 1.8419105968067604, "language_loss": 0.7163204, "learning_rate": 3.57598687219895e-06, "loss": 0.7943421, "num_input_tokens_seen": 84061520, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.18762207, "step": 3905, "time_per_iteration": 2.6675045490264893 }, { "auxiliary_loss_clip": 0.06511552, "auxiliary_loss_mlp": 0.01274714, "balance_loss_clip": 0.06288917, "balance_loss_mlp": 0.01257631, "epoch": 0.23484142492108823, "flos": 24099823918080.0, "grad_norm": 1.8718140705421613, "language_loss": 0.7160368, "learning_rate": 3.5757470571339543e-06, "loss": 0.79389948, "num_input_tokens_seen": 84081800, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.1706543, "step": 3906, "time_per_iteration": 2.5893938541412354 }, { "auxiliary_loss_clip": 0.06525084, "auxiliary_loss_mlp": 0.0127784, "balance_loss_clip": 0.06291868, "balance_loss_mlp": 0.01258147, "epoch": 0.2349015481737562, "flos": 29103486007680.0, "grad_norm": 2.239980912541525, "language_loss": 0.73876876, "learning_rate": 3.575507182316473e-06, "loss": 0.81679797, "num_input_tokens_seen": 84102340, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.19689941, "step": 3907, "time_per_iteration": 2.645416736602783 }, { "auxiliary_loss_clip": 0.06522915, "auxiliary_loss_mlp": 0.01276144, "balance_loss_clip": 0.06293049, "balance_loss_mlp": 0.01257285, "epoch": 0.23496167142642416, "flos": 18922258679040.0, "grad_norm": 1.8412884777997223, "language_loss": 0.7317555, "learning_rate": 3.575267247755601e-06, "loss": 0.80974603, "num_input_tokens_seen": 84120370, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.18859863, "step": 3908, "time_per_iteration": 2.5556046962738037 }, { "auxiliary_loss_clip": 0.06399594, "auxiliary_loss_mlp": 0.01255372, "balance_loss_clip": 0.06271517, "balance_loss_mlp": 0.01250854, "epoch": 0.23502179467909215, "flos": 55884906541440.0, "grad_norm": 1.0075129177503557, "language_loss": 0.73421824, "learning_rate": 3.5750272534604367e-06, "loss": 0.81076789, "num_input_tokens_seen": 84165515, "router_z_loss_clip": 1.28125, "router_z_loss_mlp": 0.04525757, "step": 3909, "time_per_iteration": 3.049421548843384 }, { "auxiliary_loss_clip": 0.06514414, "auxiliary_loss_mlp": 0.01278804, "balance_loss_clip": 0.06287827, "balance_loss_mlp": 0.01260315, "epoch": 0.23508191793176011, "flos": 23408083336320.0, "grad_norm": 1.5268262613130092, "language_loss": 0.88723385, "learning_rate": 3.5747871994400822e-06, "loss": 0.96516603, "num_input_tokens_seen": 84184540, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.18493652, "step": 3910, "time_per_iteration": 2.680879831314087 }, { "auxiliary_loss_clip": 0.06520303, "auxiliary_loss_mlp": 0.01272374, "balance_loss_clip": 0.06294101, "balance_loss_mlp": 0.01254517, "epoch": 0.23514204118442808, "flos": 20053864869120.0, "grad_norm": 2.2309092218217015, "language_loss": 0.76778036, "learning_rate": 3.5745470857036386e-06, "loss": 0.84570712, "num_input_tokens_seen": 84202025, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.17858887, "step": 3911, "time_per_iteration": 2.5674896240234375 }, { "auxiliary_loss_clip": 0.06509326, "auxiliary_loss_mlp": 0.01275476, "balance_loss_clip": 0.06287953, "balance_loss_mlp": 0.01258584, "epoch": 0.23520216443709605, "flos": 21587126405760.0, "grad_norm": 1.9703427072515343, "language_loss": 0.8208124, "learning_rate": 3.5743069122602122e-06, "loss": 0.89866042, "num_input_tokens_seen": 84221895, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.16882324, "step": 3912, "time_per_iteration": 2.6364312171936035 }, { "auxiliary_loss_clip": 0.06510609, "auxiliary_loss_mlp": 0.0127973, "balance_loss_clip": 0.06291088, "balance_loss_mlp": 0.01262695, "epoch": 0.235262287689764, "flos": 23192573834880.0, "grad_norm": 2.882050878947163, "language_loss": 0.72238225, "learning_rate": 3.574066679118909e-06, "loss": 0.80028564, "num_input_tokens_seen": 84240455, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.17028809, "step": 3913, "time_per_iteration": 2.587344169616699 }, { "auxiliary_loss_clip": 0.06529538, "auxiliary_loss_mlp": 0.01274166, "balance_loss_clip": 0.06298871, "balance_loss_mlp": 0.01255057, "epoch": 0.23532241094243198, "flos": 23191903002240.0, "grad_norm": 1.8923654846351774, "language_loss": 0.76837349, "learning_rate": 3.57382638628884e-06, "loss": 0.84641051, "num_input_tokens_seen": 84261605, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.19116211, "step": 3914, "time_per_iteration": 2.607999563217163 }, { "auxiliary_loss_clip": 0.06524818, "auxiliary_loss_mlp": 0.01282335, "balance_loss_clip": 0.06294082, "balance_loss_mlp": 0.01263584, "epoch": 0.23538253419509997, "flos": 17025007006080.0, "grad_norm": 2.9979843769709027, "language_loss": 0.90364957, "learning_rate": 3.5735860337791174e-06, "loss": 0.9817211, "num_input_tokens_seen": 84278675, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.18774414, "step": 3915, "time_per_iteration": 2.5443434715270996 }, { "auxiliary_loss_clip": 0.0640604, "auxiliary_loss_mlp": 0.01263319, "balance_loss_clip": 0.06279627, "balance_loss_mlp": 0.01258914, "epoch": 0.23544265744776793, "flos": 63465276263040.0, "grad_norm": 0.7855110935475196, "language_loss": 0.59484744, "learning_rate": 3.573345621598854e-06, "loss": 0.67154109, "num_input_tokens_seen": 84329765, "router_z_loss_clip": 1.265625, "router_z_loss_mlp": 0.04412842, "step": 3916, "time_per_iteration": 3.1182610988616943 }, { "auxiliary_loss_clip": 0.06407394, "auxiliary_loss_mlp": 0.01261232, "balance_loss_clip": 0.0628013, "balance_loss_mlp": 0.01256794, "epoch": 0.2355027807004359, "flos": 70537395116160.0, "grad_norm": 0.7446816205119885, "language_loss": 0.49414474, "learning_rate": 3.5731051497571675e-06, "loss": 0.57083106, "num_input_tokens_seen": 84393680, "router_z_loss_clip": 1.26953125, "router_z_loss_mlp": 0.04446411, "step": 3917, "time_per_iteration": 3.20988130569458 }, { "auxiliary_loss_clip": 0.06523033, "auxiliary_loss_mlp": 0.01276141, "balance_loss_clip": 0.06293511, "balance_loss_mlp": 0.01258165, "epoch": 0.23556290395310386, "flos": 21440742122880.0, "grad_norm": 2.0236214061743736, "language_loss": 0.76638019, "learning_rate": 3.5728646182631756e-06, "loss": 0.84437191, "num_input_tokens_seen": 84412640, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.17980957, "step": 3918, "time_per_iteration": 2.5837111473083496 }, { "auxiliary_loss_clip": 0.06524515, "auxiliary_loss_mlp": 0.01274667, "balance_loss_clip": 0.06292363, "balance_loss_mlp": 0.01256535, "epoch": 0.23562302720577183, "flos": 18192223981440.0, "grad_norm": 1.8368711618993632, "language_loss": 0.70030087, "learning_rate": 3.5726240271259995e-06, "loss": 0.77829266, "num_input_tokens_seen": 84431605, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.18127441, "step": 3919, "time_per_iteration": 2.562645435333252 }, { "auxiliary_loss_clip": 0.06516271, "auxiliary_loss_mlp": 0.01272519, "balance_loss_clip": 0.06294411, "balance_loss_mlp": 0.01255115, "epoch": 0.2356831504584398, "flos": 33739091038080.0, "grad_norm": 11.122948927293082, "language_loss": 0.70492387, "learning_rate": 3.5723833763547634e-06, "loss": 0.78281176, "num_input_tokens_seen": 84454210, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.17407227, "step": 3920, "time_per_iteration": 2.7095162868499756 }, { "auxiliary_loss_clip": 0.06522149, "auxiliary_loss_mlp": 0.01269995, "balance_loss_clip": 0.06296918, "balance_loss_mlp": 0.01252674, "epoch": 0.23574327371110776, "flos": 24939122739840.0, "grad_norm": 2.1140047091840097, "language_loss": 0.77754849, "learning_rate": 3.5721426659585916e-06, "loss": 0.85546994, "num_input_tokens_seen": 84475540, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.1730957, "step": 3921, "time_per_iteration": 2.5923006534576416 }, { "auxiliary_loss_clip": 0.06520569, "auxiliary_loss_mlp": 0.01273505, "balance_loss_clip": 0.0629485, "balance_loss_mlp": 0.01255469, "epoch": 0.23580339696377575, "flos": 17827940355840.0, "grad_norm": 3.0014394432972904, "language_loss": 0.76493347, "learning_rate": 3.571901895946612e-06, "loss": 0.84287417, "num_input_tokens_seen": 84494580, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.18029785, "step": 3922, "time_per_iteration": 2.565253734588623 }, { "auxiliary_loss_clip": 0.06522036, "auxiliary_loss_mlp": 0.01271187, "balance_loss_clip": 0.06295803, "balance_loss_mlp": 0.01254271, "epoch": 0.23586352021644372, "flos": 26293827225600.0, "grad_norm": 2.1359269745843115, "language_loss": 0.80863076, "learning_rate": 3.571661066327956e-06, "loss": 0.886563, "num_input_tokens_seen": 84513850, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.16906738, "step": 3923, "time_per_iteration": 4.042971849441528 }, { "auxiliary_loss_clip": 0.06520745, "auxiliary_loss_mlp": 0.01273582, "balance_loss_clip": 0.062967, "balance_loss_mlp": 0.01256154, "epoch": 0.23592364346911168, "flos": 14251965258240.0, "grad_norm": 1.8271198400544584, "language_loss": 0.75250578, "learning_rate": 3.571420177111754e-06, "loss": 0.83044899, "num_input_tokens_seen": 84532315, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.17431641, "step": 3924, "time_per_iteration": 2.543584108352661 }, { "auxiliary_loss_clip": 0.06518766, "auxiliary_loss_mlp": 0.01275807, "balance_loss_clip": 0.06295028, "balance_loss_mlp": 0.01258712, "epoch": 0.23598376672177965, "flos": 18593837400960.0, "grad_norm": 1.761947734344094, "language_loss": 0.82941663, "learning_rate": 3.5711792283071416e-06, "loss": 0.9073624, "num_input_tokens_seen": 84550970, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.17077637, "step": 3925, "time_per_iteration": 2.5831048488616943 }, { "auxiliary_loss_clip": 0.06522362, "auxiliary_loss_mlp": 0.0127446, "balance_loss_clip": 0.0629521, "balance_loss_mlp": 0.01256138, "epoch": 0.2360438899744476, "flos": 22682325196800.0, "grad_norm": 1.6648105513298417, "language_loss": 0.60502458, "learning_rate": 3.5709382199232564e-06, "loss": 0.68299282, "num_input_tokens_seen": 84571655, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.18322754, "step": 3926, "time_per_iteration": 2.595207691192627 }, { "auxiliary_loss_clip": 0.06515482, "auxiliary_loss_mlp": 0.01273354, "balance_loss_clip": 0.06295003, "balance_loss_mlp": 0.01256116, "epoch": 0.23610401322711558, "flos": 29577872298240.0, "grad_norm": 1.7956351097463386, "language_loss": 0.71959543, "learning_rate": 3.570697151969235e-06, "loss": 0.7974838, "num_input_tokens_seen": 84593130, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.17236328, "step": 3927, "time_per_iteration": 2.6442573070526123 }, { "auxiliary_loss_clip": 0.06517689, "auxiliary_loss_mlp": 0.01275234, "balance_loss_clip": 0.06293097, "balance_loss_mlp": 0.01258401, "epoch": 0.23616413647978354, "flos": 17864347754880.0, "grad_norm": 2.0120915380831934, "language_loss": 0.75924528, "learning_rate": 3.570456024454221e-06, "loss": 0.83717453, "num_input_tokens_seen": 84612410, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.16833496, "step": 3928, "time_per_iteration": 4.020284414291382 }, { "auxiliary_loss_clip": 0.06524414, "auxiliary_loss_mlp": 0.01280635, "balance_loss_clip": 0.06294343, "balance_loss_mlp": 0.01261037, "epoch": 0.23622425973245154, "flos": 11039393318400.0, "grad_norm": 2.6830449189608734, "language_loss": 0.82617533, "learning_rate": 3.5702148373873576e-06, "loss": 0.90422583, "num_input_tokens_seen": 84627610, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.19592285, "step": 3929, "time_per_iteration": 2.5391438007354736 }, { "auxiliary_loss_clip": 0.06532183, "auxiliary_loss_mlp": 0.0127624, "balance_loss_clip": 0.06295713, "balance_loss_mlp": 0.01256606, "epoch": 0.2362843829851195, "flos": 23410766666880.0, "grad_norm": 1.6580878539160135, "language_loss": 0.72096384, "learning_rate": 3.569973590777789e-06, "loss": 0.79904807, "num_input_tokens_seen": 84648415, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.19616699, "step": 3930, "time_per_iteration": 2.59450101852417 }, { "auxiliary_loss_clip": 0.06522617, "auxiliary_loss_mlp": 0.01275023, "balance_loss_clip": 0.06295422, "balance_loss_mlp": 0.01256974, "epoch": 0.23634450623778747, "flos": 39539103932160.0, "grad_norm": 1.7370509057413017, "language_loss": 0.74420297, "learning_rate": 3.569732284634665e-06, "loss": 0.82217944, "num_input_tokens_seen": 84670080, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.18054199, "step": 3931, "time_per_iteration": 2.7736756801605225 }, { "auxiliary_loss_clip": 0.06518148, "auxiliary_loss_mlp": 0.01277103, "balance_loss_clip": 0.06291375, "balance_loss_mlp": 0.01258876, "epoch": 0.23640462949045543, "flos": 24214077360000.0, "grad_norm": 4.046069489764663, "language_loss": 0.81460273, "learning_rate": 3.569490918967136e-06, "loss": 0.89255524, "num_input_tokens_seen": 84686465, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.18237305, "step": 3932, "time_per_iteration": 2.620328426361084 }, { "auxiliary_loss_clip": 0.06508634, "auxiliary_loss_mlp": 0.01276509, "balance_loss_clip": 0.06289142, "balance_loss_mlp": 0.01259892, "epoch": 0.2364647527431234, "flos": 26184898517760.0, "grad_norm": 2.79861226831224, "language_loss": 0.86017299, "learning_rate": 3.5692494937843537e-06, "loss": 0.93802446, "num_input_tokens_seen": 84708825, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.1661377, "step": 3933, "time_per_iteration": 3.983289957046509 }, { "auxiliary_loss_clip": 0.0652521, "auxiliary_loss_mlp": 0.01278193, "balance_loss_clip": 0.06294176, "balance_loss_mlp": 0.01258965, "epoch": 0.23652487599579136, "flos": 22643444102400.0, "grad_norm": 2.5576558082928846, "language_loss": 0.83857471, "learning_rate": 3.5690080090954727e-06, "loss": 0.91660875, "num_input_tokens_seen": 84726165, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.19238281, "step": 3934, "time_per_iteration": 2.5756101608276367 }, { "auxiliary_loss_clip": 0.06517635, "auxiliary_loss_mlp": 0.01283057, "balance_loss_clip": 0.06291368, "balance_loss_mlp": 0.01265259, "epoch": 0.23658499924845935, "flos": 21768702203520.0, "grad_norm": 1.8845789066640282, "language_loss": 0.79417437, "learning_rate": 3.5687664649096515e-06, "loss": 0.8721813, "num_input_tokens_seen": 84745815, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.17810059, "step": 3935, "time_per_iteration": 2.559720993041992 }, { "auxiliary_loss_clip": 0.06516207, "auxiliary_loss_mlp": 0.01278603, "balance_loss_clip": 0.06296463, "balance_loss_mlp": 0.01261306, "epoch": 0.23664512250112732, "flos": 21805486945920.0, "grad_norm": 1.636002891426754, "language_loss": 0.80343688, "learning_rate": 3.5685248612360487e-06, "loss": 0.88138497, "num_input_tokens_seen": 84765415, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.17297363, "step": 3936, "time_per_iteration": 4.003138065338135 }, { "auxiliary_loss_clip": 0.06509584, "auxiliary_loss_mlp": 0.01273156, "balance_loss_clip": 0.06286445, "balance_loss_mlp": 0.01255704, "epoch": 0.23670524575379528, "flos": 22644450351360.0, "grad_norm": 1.4713584529805872, "language_loss": 0.79823339, "learning_rate": 3.568283198083826e-06, "loss": 0.87606084, "num_input_tokens_seen": 84787080, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.17456055, "step": 3937, "time_per_iteration": 2.58880352973938 }, { "auxiliary_loss_clip": 0.06508659, "auxiliary_loss_mlp": 0.01276433, "balance_loss_clip": 0.06289494, "balance_loss_mlp": 0.0125922, "epoch": 0.23676536900646325, "flos": 16730225942400.0, "grad_norm": 1.8727880653425817, "language_loss": 0.85916483, "learning_rate": 3.568041475462147e-06, "loss": 0.93701577, "num_input_tokens_seen": 84805395, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.17211914, "step": 3938, "time_per_iteration": 2.535029172897339 }, { "auxiliary_loss_clip": 0.06510689, "auxiliary_loss_mlp": 0.01273652, "balance_loss_clip": 0.0629043, "balance_loss_mlp": 0.01256068, "epoch": 0.23682549225913122, "flos": 11138720734080.0, "grad_norm": 2.182377981772076, "language_loss": 0.94532889, "learning_rate": 3.5677996933801785e-06, "loss": 1.02317238, "num_input_tokens_seen": 84818090, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.17578125, "step": 3939, "time_per_iteration": 2.515692949295044 }, { "auxiliary_loss_clip": 0.06517971, "auxiliary_loss_mlp": 0.01274826, "balance_loss_clip": 0.06292187, "balance_loss_mlp": 0.01256456, "epoch": 0.23688561551179918, "flos": 22564843372800.0, "grad_norm": 1.7513530846464784, "language_loss": 0.82895887, "learning_rate": 3.567557851847088e-06, "loss": 0.90688682, "num_input_tokens_seen": 84837695, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.18371582, "step": 3940, "time_per_iteration": 2.560314416885376 }, { "auxiliary_loss_clip": 0.06522346, "auxiliary_loss_mlp": 0.01276678, "balance_loss_clip": 0.0628933, "balance_loss_mlp": 0.01257104, "epoch": 0.23694573876446715, "flos": 18520771040640.0, "grad_norm": 2.1740565586283624, "language_loss": 0.89351219, "learning_rate": 3.5673159508720464e-06, "loss": 0.97150242, "num_input_tokens_seen": 84854630, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.19580078, "step": 3941, "time_per_iteration": 2.548288345336914 }, { "auxiliary_loss_clip": 0.06518589, "auxiliary_loss_mlp": 0.01273796, "balance_loss_clip": 0.06290042, "balance_loss_mlp": 0.01255628, "epoch": 0.23700586201713514, "flos": 15340246087680.0, "grad_norm": 2.0592903689774658, "language_loss": 0.84875578, "learning_rate": 3.5670739904642274e-06, "loss": 0.92667961, "num_input_tokens_seen": 84871805, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.1817627, "step": 3942, "time_per_iteration": 2.5335047245025635 }, { "auxiliary_loss_clip": 0.06521884, "auxiliary_loss_mlp": 0.01281256, "balance_loss_clip": 0.0629342, "balance_loss_mlp": 0.01261229, "epoch": 0.2370659852698031, "flos": 23953775051520.0, "grad_norm": 2.0316531063455088, "language_loss": 0.81761438, "learning_rate": 3.5668319706328065e-06, "loss": 0.8956458, "num_input_tokens_seen": 84889815, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.20019531, "step": 3943, "time_per_iteration": 2.5850744247436523 }, { "auxiliary_loss_clip": 0.06527382, "auxiliary_loss_mlp": 0.01274658, "balance_loss_clip": 0.06294189, "balance_loss_mlp": 0.01255465, "epoch": 0.23712610852247107, "flos": 15336514581120.0, "grad_norm": 2.9480053259483228, "language_loss": 0.6825453, "learning_rate": 3.566589891386959e-06, "loss": 0.7605657, "num_input_tokens_seen": 84904380, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.1920166, "step": 3944, "time_per_iteration": 2.530987024307251 }, { "auxiliary_loss_clip": 0.06516981, "auxiliary_loss_mlp": 0.01277929, "balance_loss_clip": 0.06290559, "balance_loss_mlp": 0.0125938, "epoch": 0.23718623177513903, "flos": 19688658848640.0, "grad_norm": 2.8300505180740805, "language_loss": 0.76194179, "learning_rate": 3.566347752735866e-06, "loss": 0.8398909, "num_input_tokens_seen": 84922935, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.18554688, "step": 3945, "time_per_iteration": 2.5468409061431885 }, { "auxiliary_loss_clip": 0.06519698, "auxiliary_loss_mlp": 0.01274636, "balance_loss_clip": 0.06293871, "balance_loss_mlp": 0.01256588, "epoch": 0.237246355027807, "flos": 24980351748480.0, "grad_norm": 1.5348841527826664, "language_loss": 0.64131975, "learning_rate": 3.5661055546887094e-06, "loss": 0.71926308, "num_input_tokens_seen": 84943685, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.18054199, "step": 3946, "time_per_iteration": 2.6037070751190186 }, { "auxiliary_loss_clip": 0.06516261, "auxiliary_loss_mlp": 0.01276052, "balance_loss_clip": 0.06292217, "balance_loss_mlp": 0.01257813, "epoch": 0.23730647828047496, "flos": 15382816761600.0, "grad_norm": 2.563155714217039, "language_loss": 0.77766216, "learning_rate": 3.5658632972546734e-06, "loss": 0.85558534, "num_input_tokens_seen": 84959505, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.18237305, "step": 3947, "time_per_iteration": 2.5393693447113037 }, { "auxiliary_loss_clip": 0.06522305, "auxiliary_loss_mlp": 0.01271373, "balance_loss_clip": 0.06296217, "balance_loss_mlp": 0.01253754, "epoch": 0.23736660153314296, "flos": 28158738422400.0, "grad_norm": 1.410609744223761, "language_loss": 0.80882561, "learning_rate": 3.565620980442944e-06, "loss": 0.88676238, "num_input_tokens_seen": 84982130, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.17602539, "step": 3948, "time_per_iteration": 2.6280412673950195 }, { "auxiliary_loss_clip": 0.06518703, "auxiliary_loss_mlp": 0.01277449, "balance_loss_clip": 0.06293132, "balance_loss_mlp": 0.01258518, "epoch": 0.23742672478581092, "flos": 22092385726080.0, "grad_norm": 1.6523497761348012, "language_loss": 0.80375433, "learning_rate": 3.5653786042627107e-06, "loss": 0.88171589, "num_input_tokens_seen": 85000640, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.18920898, "step": 3949, "time_per_iteration": 2.558171033859253 }, { "auxiliary_loss_clip": 0.06527287, "auxiliary_loss_mlp": 0.01275031, "balance_loss_clip": 0.06298809, "balance_loss_mlp": 0.01255481, "epoch": 0.2374868480384789, "flos": 19543238887680.0, "grad_norm": 1.6399950275240092, "language_loss": 0.73730326, "learning_rate": 3.565136168723163e-06, "loss": 0.81532645, "num_input_tokens_seen": 85018970, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.19555664, "step": 3950, "time_per_iteration": 2.561548948287964 }, { "auxiliary_loss_clip": 0.0651445, "auxiliary_loss_mlp": 0.0127086, "balance_loss_clip": 0.06291796, "balance_loss_mlp": 0.01254302, "epoch": 0.23754697129114685, "flos": 19427769561600.0, "grad_norm": 2.0578264326163738, "language_loss": 0.73286128, "learning_rate": 3.564893673833495e-06, "loss": 0.81071436, "num_input_tokens_seen": 85035905, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.16552734, "step": 3951, "time_per_iteration": 2.584592580795288 }, { "auxiliary_loss_clip": 0.06520332, "auxiliary_loss_mlp": 0.01280266, "balance_loss_clip": 0.06294587, "balance_loss_mlp": 0.01261014, "epoch": 0.23760709454381482, "flos": 19507208832000.0, "grad_norm": 1.7988409397205072, "language_loss": 0.74396104, "learning_rate": 3.564651119602903e-06, "loss": 0.82196701, "num_input_tokens_seen": 85054560, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.19262695, "step": 3952, "time_per_iteration": 2.577423095703125 }, { "auxiliary_loss_clip": 0.06519806, "auxiliary_loss_mlp": 0.01272128, "balance_loss_clip": 0.06294528, "balance_loss_mlp": 0.01255081, "epoch": 0.23766721779648278, "flos": 27644045518080.0, "grad_norm": 1.7984489669677144, "language_loss": 0.71723235, "learning_rate": 3.564408506040583e-06, "loss": 0.79515171, "num_input_tokens_seen": 85074425, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.17053223, "step": 3953, "time_per_iteration": 2.6842761039733887 }, { "auxiliary_loss_clip": 0.06522533, "auxiliary_loss_mlp": 0.01273762, "balance_loss_clip": 0.0629461, "balance_loss_mlp": 0.01255178, "epoch": 0.23772734104915075, "flos": 23411102083200.0, "grad_norm": 3.2755414151135875, "language_loss": 0.82450241, "learning_rate": 3.5641658331557356e-06, "loss": 0.90246534, "num_input_tokens_seen": 85092865, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.18566895, "step": 3954, "time_per_iteration": 2.5987584590911865 }, { "auxiliary_loss_clip": 0.06519096, "auxiliary_loss_mlp": 0.01276471, "balance_loss_clip": 0.06293134, "balance_loss_mlp": 0.01257743, "epoch": 0.23778746430181874, "flos": 15710902623360.0, "grad_norm": 2.4888263639362216, "language_loss": 0.67278874, "learning_rate": 3.5639231009575634e-06, "loss": 0.7507444, "num_input_tokens_seen": 85110175, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.18713379, "step": 3955, "time_per_iteration": 2.658005475997925 }, { "auxiliary_loss_clip": 0.06511264, "auxiliary_loss_mlp": 0.01274801, "balance_loss_clip": 0.06290399, "balance_loss_mlp": 0.01256955, "epoch": 0.2378475875544867, "flos": 19432381536000.0, "grad_norm": 1.3956814348984705, "language_loss": 0.8439368, "learning_rate": 3.5636803094552704e-06, "loss": 0.92179739, "num_input_tokens_seen": 85129925, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.1784668, "step": 3956, "time_per_iteration": 2.5702860355377197 }, { "auxiliary_loss_clip": 0.0651319, "auxiliary_loss_mlp": 0.01274698, "balance_loss_clip": 0.06293082, "balance_loss_mlp": 0.01257687, "epoch": 0.23790771080715467, "flos": 22274338867200.0, "grad_norm": 1.9890766711021304, "language_loss": 0.86149275, "learning_rate": 3.5634374586580635e-06, "loss": 0.93937159, "num_input_tokens_seen": 85147755, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.17004395, "step": 3957, "time_per_iteration": 2.8775479793548584 }, { "auxiliary_loss_clip": 0.06518579, "auxiliary_loss_mlp": 0.01276693, "balance_loss_clip": 0.06295785, "balance_loss_mlp": 0.01259074, "epoch": 0.23796783405982264, "flos": 20053445598720.0, "grad_norm": 1.9713994190190067, "language_loss": 0.70596927, "learning_rate": 3.563194548575151e-06, "loss": 0.78392196, "num_input_tokens_seen": 85165270, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.1763916, "step": 3958, "time_per_iteration": 2.6606359481811523 }, { "auxiliary_loss_clip": 0.06516412, "auxiliary_loss_mlp": 0.01273762, "balance_loss_clip": 0.06289804, "balance_loss_mlp": 0.01255332, "epoch": 0.2380279573124906, "flos": 14251084790400.0, "grad_norm": 2.933990361894587, "language_loss": 0.67009234, "learning_rate": 3.562951579215745e-06, "loss": 0.74799407, "num_input_tokens_seen": 85181555, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.18432617, "step": 3959, "time_per_iteration": 2.5847008228302 }, { "auxiliary_loss_clip": 0.06518099, "auxiliary_loss_mlp": 0.01275193, "balance_loss_clip": 0.06295961, "balance_loss_mlp": 0.01257407, "epoch": 0.23808808056515857, "flos": 21185638767360.0, "grad_norm": 2.8049740445662303, "language_loss": 0.73071545, "learning_rate": 3.5627085505890586e-06, "loss": 0.80864835, "num_input_tokens_seen": 85199455, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.17773438, "step": 3960, "time_per_iteration": 2.586240530014038 }, { "auxiliary_loss_clip": 0.0651193, "auxiliary_loss_mlp": 0.0127496, "balance_loss_clip": 0.06289583, "balance_loss_mlp": 0.01256804, "epoch": 0.23814820381782653, "flos": 22534850810880.0, "grad_norm": 1.8069076863142883, "language_loss": 0.74825513, "learning_rate": 3.562465462704307e-06, "loss": 0.82612407, "num_input_tokens_seen": 85219170, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.1817627, "step": 3961, "time_per_iteration": 2.588303565979004 }, { "auxiliary_loss_clip": 0.06514017, "auxiliary_loss_mlp": 0.01278599, "balance_loss_clip": 0.06288013, "balance_loss_mlp": 0.01259275, "epoch": 0.23820832707049452, "flos": 22309991579520.0, "grad_norm": 1.9566168567612343, "language_loss": 0.6679641, "learning_rate": 3.5622223155707085e-06, "loss": 0.74589026, "num_input_tokens_seen": 85238480, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.19311523, "step": 3962, "time_per_iteration": 4.02237606048584 }, { "auxiliary_loss_clip": 0.06511308, "auxiliary_loss_mlp": 0.01274682, "balance_loss_clip": 0.06288845, "balance_loss_mlp": 0.01257277, "epoch": 0.2382684503231625, "flos": 24871297259520.0, "grad_norm": 1.8076378528663584, "language_loss": 0.75040221, "learning_rate": 3.561979109197483e-06, "loss": 0.82826209, "num_input_tokens_seen": 85259180, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.1739502, "step": 3963, "time_per_iteration": 2.622511386871338 }, { "auxiliary_loss_clip": 0.06515899, "auxiliary_loss_mlp": 0.01275471, "balance_loss_clip": 0.06290305, "balance_loss_mlp": 0.01256862, "epoch": 0.23832857357583045, "flos": 21878050181760.0, "grad_norm": 2.8912462490866915, "language_loss": 0.78065038, "learning_rate": 3.5617358435938538e-06, "loss": 0.85856402, "num_input_tokens_seen": 85278550, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.1862793, "step": 3964, "time_per_iteration": 2.5675227642059326 }, { "auxiliary_loss_clip": 0.06506281, "auxiliary_loss_mlp": 0.01277241, "balance_loss_clip": 0.06288122, "balance_loss_mlp": 0.01259836, "epoch": 0.23838869682849842, "flos": 21294441694080.0, "grad_norm": 2.0378883847601537, "language_loss": 0.72414386, "learning_rate": 3.561492518769045e-06, "loss": 0.80197906, "num_input_tokens_seen": 85297345, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.17419434, "step": 3965, "time_per_iteration": 2.566462516784668 }, { "auxiliary_loss_clip": 0.06511387, "auxiliary_loss_mlp": 0.01285192, "balance_loss_clip": 0.06291005, "balance_loss_mlp": 0.01266941, "epoch": 0.23844882008116638, "flos": 16186211308800.0, "grad_norm": 2.5480198912723266, "language_loss": 0.78530204, "learning_rate": 3.561249134732282e-06, "loss": 0.86326784, "num_input_tokens_seen": 85315105, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.18249512, "step": 3966, "time_per_iteration": 2.530285358428955 }, { "auxiliary_loss_clip": 0.06512743, "auxiliary_loss_mlp": 0.01275899, "balance_loss_clip": 0.06289446, "balance_loss_mlp": 0.01258816, "epoch": 0.23850894333383435, "flos": 21076165008000.0, "grad_norm": 1.9788919668829361, "language_loss": 0.69295359, "learning_rate": 3.561005691492797e-06, "loss": 0.77084005, "num_input_tokens_seen": 85334735, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.1706543, "step": 3967, "time_per_iteration": 3.9951653480529785 }, { "auxiliary_loss_clip": 0.06517242, "auxiliary_loss_mlp": 0.01282277, "balance_loss_clip": 0.06295722, "balance_loss_mlp": 0.0126275, "epoch": 0.23856906658650234, "flos": 17207295563520.0, "grad_norm": 1.9842893555639822, "language_loss": 0.68476748, "learning_rate": 3.5607621890598185e-06, "loss": 0.76276267, "num_input_tokens_seen": 85352875, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.19519043, "step": 3968, "time_per_iteration": 2.547274589538574 }, { "auxiliary_loss_clip": 0.0651328, "auxiliary_loss_mlp": 0.01281333, "balance_loss_clip": 0.06291041, "balance_loss_mlp": 0.01264274, "epoch": 0.2386291898391703, "flos": 29501451774720.0, "grad_norm": 1.7628149362457426, "language_loss": 0.77817589, "learning_rate": 3.5605186274425823e-06, "loss": 0.85612202, "num_input_tokens_seen": 85372205, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.17077637, "step": 3969, "time_per_iteration": 2.6321911811828613 }, { "auxiliary_loss_clip": 0.06511253, "auxiliary_loss_mlp": 0.01275474, "balance_loss_clip": 0.062941, "balance_loss_mlp": 0.01257938, "epoch": 0.23868931309183827, "flos": 21148854024960.0, "grad_norm": 2.1548770896236475, "language_loss": 0.7710529, "learning_rate": 3.5602750066503225e-06, "loss": 0.84892011, "num_input_tokens_seen": 85389705, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.17529297, "step": 3970, "time_per_iteration": 2.5636651515960693 }, { "auxiliary_loss_clip": 0.06511156, "auxiliary_loss_mlp": 0.01276608, "balance_loss_clip": 0.06287253, "balance_loss_mlp": 0.01257678, "epoch": 0.23874943634450624, "flos": 25665342076800.0, "grad_norm": 2.052666523662753, "language_loss": 0.8578825, "learning_rate": 3.5600313266922793e-06, "loss": 0.9357602, "num_input_tokens_seen": 85407855, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.18933105, "step": 3971, "time_per_iteration": 2.6107354164123535 }, { "auxiliary_loss_clip": 0.06426775, "auxiliary_loss_mlp": 0.01277157, "balance_loss_clip": 0.06305028, "balance_loss_mlp": 0.01272484, "epoch": 0.2388095595971742, "flos": 59006871889920.0, "grad_norm": 0.7117715319718267, "language_loss": 0.62701464, "learning_rate": 3.5597875875776915e-06, "loss": 0.704054, "num_input_tokens_seen": 85470885, "router_z_loss_clip": 1.21875, "router_z_loss_mlp": 0.04663086, "step": 3972, "time_per_iteration": 3.246826648712158 }, { "auxiliary_loss_clip": 0.06508543, "auxiliary_loss_mlp": 0.01275677, "balance_loss_clip": 0.06288491, "balance_loss_mlp": 0.01258594, "epoch": 0.23886968284984217, "flos": 16805975633280.0, "grad_norm": 1.837444353052176, "language_loss": 0.8246671, "learning_rate": 3.5595437893158013e-06, "loss": 0.90250933, "num_input_tokens_seen": 85488460, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.17102051, "step": 3973, "time_per_iteration": 3.9082539081573486 }, { "auxiliary_loss_clip": 0.06511886, "auxiliary_loss_mlp": 0.01274413, "balance_loss_clip": 0.06290391, "balance_loss_mlp": 0.01256854, "epoch": 0.23892980610251013, "flos": 22389221214720.0, "grad_norm": 1.5790561542776171, "language_loss": 0.79912031, "learning_rate": 3.5592999319158546e-06, "loss": 0.87698328, "num_input_tokens_seen": 85508590, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.17578125, "step": 3974, "time_per_iteration": 2.595360517501831 }, { "auxiliary_loss_clip": 0.06519343, "auxiliary_loss_mlp": 0.01278233, "balance_loss_clip": 0.06294742, "balance_loss_mlp": 0.01259386, "epoch": 0.23898992935517813, "flos": 12828135553920.0, "grad_norm": 1.9878233486172827, "language_loss": 0.85539907, "learning_rate": 3.5590560153870984e-06, "loss": 0.93337482, "num_input_tokens_seen": 85525970, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.18859863, "step": 3975, "time_per_iteration": 4.1959240436553955 }, { "auxiliary_loss_clip": 0.06513081, "auxiliary_loss_mlp": 0.01274395, "balance_loss_clip": 0.06294964, "balance_loss_mlp": 0.01257253, "epoch": 0.2390500526078461, "flos": 22352142983040.0, "grad_norm": 2.555514062447676, "language_loss": 0.84664583, "learning_rate": 3.5588120397387816e-06, "loss": 0.92452061, "num_input_tokens_seen": 85543700, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.17138672, "step": 3976, "time_per_iteration": 2.5765552520751953 }, { "auxiliary_loss_clip": 0.06509793, "auxiliary_loss_mlp": 0.01274115, "balance_loss_clip": 0.06293852, "balance_loss_mlp": 0.01258106, "epoch": 0.23911017586051406, "flos": 22641263896320.0, "grad_norm": 2.094540371865896, "language_loss": 0.74973333, "learning_rate": 3.5585680049801566e-06, "loss": 0.82757241, "num_input_tokens_seen": 85562765, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.16003418, "step": 3977, "time_per_iteration": 2.590379476547241 }, { "auxiliary_loss_clip": 0.06512072, "auxiliary_loss_mlp": 0.01275753, "balance_loss_clip": 0.06291845, "balance_loss_mlp": 0.01256608, "epoch": 0.23917029911318202, "flos": 23658993987840.0, "grad_norm": 1.7717883150694935, "language_loss": 0.72590792, "learning_rate": 3.5583239111204764e-06, "loss": 0.80378616, "num_input_tokens_seen": 85581755, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.19128418, "step": 3978, "time_per_iteration": 2.574932336807251 }, { "auxiliary_loss_clip": 0.06517358, "auxiliary_loss_mlp": 0.01276226, "balance_loss_clip": 0.06292418, "balance_loss_mlp": 0.01258214, "epoch": 0.23923042236585, "flos": 22790163801600.0, "grad_norm": 3.911570625591956, "language_loss": 0.7944113, "learning_rate": 3.558079758168997e-06, "loss": 0.87234712, "num_input_tokens_seen": 85599455, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.18029785, "step": 3979, "time_per_iteration": 2.566915988922119 }, { "auxiliary_loss_clip": 0.0651134, "auxiliary_loss_mlp": 0.01277651, "balance_loss_clip": 0.06291682, "balance_loss_mlp": 0.01259365, "epoch": 0.23929054561851795, "flos": 28155300405120.0, "grad_norm": 1.6675113529010193, "language_loss": 0.82322413, "learning_rate": 3.557835546134977e-06, "loss": 0.90111405, "num_input_tokens_seen": 85619970, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.18286133, "step": 3980, "time_per_iteration": 2.6538290977478027 }, { "auxiliary_loss_clip": 0.06505546, "auxiliary_loss_mlp": 0.01274359, "balance_loss_clip": 0.06287672, "balance_loss_mlp": 0.01257216, "epoch": 0.23935066887118592, "flos": 21692491315200.0, "grad_norm": 1.8144306772839585, "language_loss": 0.8414675, "learning_rate": 3.5575912750276775e-06, "loss": 0.91926658, "num_input_tokens_seen": 85638850, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.17138672, "step": 3981, "time_per_iteration": 2.5696563720703125 }, { "auxiliary_loss_clip": 0.06519134, "auxiliary_loss_mlp": 0.01274056, "balance_loss_clip": 0.06293127, "balance_loss_mlp": 0.01255316, "epoch": 0.2394107921238539, "flos": 32130121737600.0, "grad_norm": 1.90201729716672, "language_loss": 0.77546561, "learning_rate": 3.5573469448563607e-06, "loss": 0.85339761, "num_input_tokens_seen": 85656285, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.1875, "step": 3982, "time_per_iteration": 2.624718189239502 }, { "auxiliary_loss_clip": 0.06513553, "auxiliary_loss_mlp": 0.01276825, "balance_loss_clip": 0.06294535, "balance_loss_mlp": 0.01259874, "epoch": 0.23947091537652188, "flos": 17024839297920.0, "grad_norm": 1.9217035175917536, "language_loss": 0.78350186, "learning_rate": 3.5571025556302915e-06, "loss": 0.86140567, "num_input_tokens_seen": 85673020, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.16955566, "step": 3983, "time_per_iteration": 2.556861639022827 }, { "auxiliary_loss_clip": 0.06513277, "auxiliary_loss_mlp": 0.01276724, "balance_loss_clip": 0.06292634, "balance_loss_mlp": 0.01258843, "epoch": 0.23953103862918984, "flos": 20599640438400.0, "grad_norm": 1.8290779945844633, "language_loss": 0.74129128, "learning_rate": 3.556858107358737e-06, "loss": 0.81919134, "num_input_tokens_seen": 85692565, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.17883301, "step": 3984, "time_per_iteration": 2.576094627380371 }, { "auxiliary_loss_clip": 0.06515452, "auxiliary_loss_mlp": 0.01274985, "balance_loss_clip": 0.06293096, "balance_loss_mlp": 0.01257366, "epoch": 0.2395911618818578, "flos": 20710707425280.0, "grad_norm": 2.6716680941789375, "language_loss": 0.79344344, "learning_rate": 3.5566136000509674e-06, "loss": 0.87134778, "num_input_tokens_seen": 85709730, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.17626953, "step": 3985, "time_per_iteration": 2.5637972354888916 }, { "auxiliary_loss_clip": 0.06518047, "auxiliary_loss_mlp": 0.01277134, "balance_loss_clip": 0.06294298, "balance_loss_mlp": 0.01258454, "epoch": 0.23965128513452577, "flos": 27060982081920.0, "grad_norm": 2.135861672880401, "language_loss": 0.73424494, "learning_rate": 3.556369033716254e-06, "loss": 0.81219673, "num_input_tokens_seen": 85730045, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.18676758, "step": 3986, "time_per_iteration": 2.612821578979492 }, { "auxiliary_loss_clip": 0.06523813, "auxiliary_loss_mlp": 0.0127434, "balance_loss_clip": 0.06293331, "balance_loss_mlp": 0.01255409, "epoch": 0.23971140838719374, "flos": 23150254723200.0, "grad_norm": 1.7559082269861839, "language_loss": 0.88169336, "learning_rate": 3.556124408363871e-06, "loss": 0.95967484, "num_input_tokens_seen": 85747590, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.18945312, "step": 3987, "time_per_iteration": 2.5789554119110107 }, { "auxiliary_loss_clip": 0.06509244, "auxiliary_loss_mlp": 0.012746, "balance_loss_clip": 0.06297389, "balance_loss_mlp": 0.01258435, "epoch": 0.23977153163986173, "flos": 18039341007360.0, "grad_norm": 2.1439415959340855, "language_loss": 0.84160262, "learning_rate": 3.5558797240030945e-06, "loss": 0.91944104, "num_input_tokens_seen": 85763460, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.16186523, "step": 3988, "time_per_iteration": 2.531189203262329 }, { "auxiliary_loss_clip": 0.06517741, "auxiliary_loss_mlp": 0.01275998, "balance_loss_clip": 0.06295731, "balance_loss_mlp": 0.01257897, "epoch": 0.2398316548925297, "flos": 18119157621120.0, "grad_norm": 1.7616075505584836, "language_loss": 0.85892522, "learning_rate": 3.5556349806432035e-06, "loss": 0.93686265, "num_input_tokens_seen": 85782050, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.18109131, "step": 3989, "time_per_iteration": 2.5739293098449707 }, { "auxiliary_loss_clip": 0.06508145, "auxiliary_loss_mlp": 0.01273762, "balance_loss_clip": 0.0629134, "balance_loss_mlp": 0.0125544, "epoch": 0.23989177814519766, "flos": 12572612928000.0, "grad_norm": 1.8554572340019004, "language_loss": 0.85661668, "learning_rate": 3.555390178293477e-06, "loss": 0.93443573, "num_input_tokens_seen": 85797400, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.18310547, "step": 3990, "time_per_iteration": 2.5652434825897217 }, { "auxiliary_loss_clip": 0.06508712, "auxiliary_loss_mlp": 0.01271757, "balance_loss_clip": 0.06289883, "balance_loss_mlp": 0.01255378, "epoch": 0.23995190139786562, "flos": 25271569013760.0, "grad_norm": 2.735502799969402, "language_loss": 0.7647503, "learning_rate": 3.5551453169631994e-06, "loss": 0.84255505, "num_input_tokens_seen": 85818995, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.16369629, "step": 3991, "time_per_iteration": 2.598860025405884 }, { "auxiliary_loss_clip": 0.06419708, "auxiliary_loss_mlp": 0.01266139, "balance_loss_clip": 0.06299814, "balance_loss_mlp": 0.01261862, "epoch": 0.2400120246505336, "flos": 61978107271680.0, "grad_norm": 0.8706524204833025, "language_loss": 0.63534307, "learning_rate": 3.554900396661656e-06, "loss": 0.71220148, "num_input_tokens_seen": 85876695, "router_z_loss_clip": 1.203125, "router_z_loss_mlp": 0.04281616, "step": 3992, "time_per_iteration": 3.1214849948883057 }, { "auxiliary_loss_clip": 0.06414601, "auxiliary_loss_mlp": 0.01261174, "balance_loss_clip": 0.06295794, "balance_loss_mlp": 0.01256453, "epoch": 0.24007214790320155, "flos": 66727923816960.0, "grad_norm": 0.734094704448957, "language_loss": 0.62828445, "learning_rate": 3.5546554173981334e-06, "loss": 0.70504218, "num_input_tokens_seen": 85940990, "router_z_loss_clip": 1.1875, "router_z_loss_mlp": 0.04711914, "step": 3993, "time_per_iteration": 3.310720443725586 }, { "auxiliary_loss_clip": 0.06522331, "auxiliary_loss_mlp": 0.01273648, "balance_loss_clip": 0.06296635, "balance_loss_mlp": 0.01254241, "epoch": 0.24013227115586952, "flos": 25815667501440.0, "grad_norm": 1.6308230752801738, "language_loss": 0.77728975, "learning_rate": 3.5544103791819218e-06, "loss": 0.85524952, "num_input_tokens_seen": 85961165, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.1940918, "step": 3994, "time_per_iteration": 2.622729778289795 }, { "auxiliary_loss_clip": 0.06524046, "auxiliary_loss_mlp": 0.01276445, "balance_loss_clip": 0.0629973, "balance_loss_mlp": 0.01257014, "epoch": 0.2401923944085375, "flos": 25564672995840.0, "grad_norm": 1.540427145502037, "language_loss": 0.78547263, "learning_rate": 3.5541652820223124e-06, "loss": 0.86347747, "num_input_tokens_seen": 85982710, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.19421387, "step": 3995, "time_per_iteration": 2.70875883102417 }, { "auxiliary_loss_clip": 0.06413363, "auxiliary_loss_mlp": 0.01256487, "balance_loss_clip": 0.06295018, "balance_loss_mlp": 0.01252258, "epoch": 0.24025251766120548, "flos": 54961457892480.0, "grad_norm": 0.8829364987937729, "language_loss": 0.63601995, "learning_rate": 3.5539201259286006e-06, "loss": 0.71271843, "num_input_tokens_seen": 86046935, "router_z_loss_clip": 1.1875, "router_z_loss_mlp": 0.04232788, "step": 3996, "time_per_iteration": 3.3117449283599854 }, { "auxiliary_loss_clip": 0.06523508, "auxiliary_loss_mlp": 0.01278018, "balance_loss_clip": 0.06296474, "balance_loss_mlp": 0.01259922, "epoch": 0.24031264091387344, "flos": 20637305648640.0, "grad_norm": 2.954448078451075, "language_loss": 0.70373082, "learning_rate": 3.5536749109100808e-06, "loss": 0.78174615, "num_input_tokens_seen": 86064355, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.1809082, "step": 3997, "time_per_iteration": 2.577582597732544 }, { "auxiliary_loss_clip": 0.06514154, "auxiliary_loss_mlp": 0.01280728, "balance_loss_clip": 0.06294352, "balance_loss_mlp": 0.01262751, "epoch": 0.2403727641665414, "flos": 20892492858240.0, "grad_norm": 1.8436012202877745, "language_loss": 0.87788248, "learning_rate": 3.5534296369760535e-06, "loss": 0.95583129, "num_input_tokens_seen": 86081340, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.1796875, "step": 3998, "time_per_iteration": 2.5787034034729004 }, { "auxiliary_loss_clip": 0.06525771, "auxiliary_loss_mlp": 0.01277716, "balance_loss_clip": 0.06298251, "balance_loss_mlp": 0.01258881, "epoch": 0.24043288741920937, "flos": 22826613127680.0, "grad_norm": 1.690038262000073, "language_loss": 0.75844514, "learning_rate": 3.5531843041358183e-06, "loss": 0.83648002, "num_input_tokens_seen": 86102260, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.18847656, "step": 3999, "time_per_iteration": 2.581127405166626 }, { "auxiliary_loss_clip": 0.06512441, "auxiliary_loss_mlp": 0.01272326, "balance_loss_clip": 0.06294573, "balance_loss_mlp": 0.0125504, "epoch": 0.24049301067187734, "flos": 27966261594240.0, "grad_norm": 2.315251542677561, "language_loss": 0.73006994, "learning_rate": 3.552938912398679e-06, "loss": 0.80791754, "num_input_tokens_seen": 86123400, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.17297363, "step": 4000, "time_per_iteration": 2.755478620529175 }, { "auxiliary_loss_clip": 0.06530684, "auxiliary_loss_mlp": 0.01279398, "balance_loss_clip": 0.06302132, "balance_loss_mlp": 0.01260658, "epoch": 0.24055313392454533, "flos": 27458360870400.0, "grad_norm": 1.689166706992594, "language_loss": 0.67366761, "learning_rate": 3.5526934617739397e-06, "loss": 0.75176841, "num_input_tokens_seen": 86144060, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.18737793, "step": 4001, "time_per_iteration": 4.048471212387085 }, { "auxiliary_loss_clip": 0.06519324, "auxiliary_loss_mlp": 0.01276761, "balance_loss_clip": 0.06296763, "balance_loss_mlp": 0.01258069, "epoch": 0.2406132571772133, "flos": 25563666746880.0, "grad_norm": 7.017051762578249, "language_loss": 0.8342098, "learning_rate": 3.5524479522709095e-06, "loss": 0.91217065, "num_input_tokens_seen": 86163005, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.18701172, "step": 4002, "time_per_iteration": 2.6224067211151123 }, { "auxiliary_loss_clip": 0.06514729, "auxiliary_loss_mlp": 0.01280194, "balance_loss_clip": 0.06293964, "balance_loss_mlp": 0.01261991, "epoch": 0.24067338042988126, "flos": 24798482461440.0, "grad_norm": 1.8539290519714975, "language_loss": 0.83398283, "learning_rate": 3.552202383898897e-06, "loss": 0.91193199, "num_input_tokens_seen": 86182580, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.18200684, "step": 4003, "time_per_iteration": 2.6939163208007812 }, { "auxiliary_loss_clip": 0.06520158, "auxiliary_loss_mlp": 0.01278579, "balance_loss_clip": 0.06294826, "balance_loss_mlp": 0.01260042, "epoch": 0.24073350368254923, "flos": 21184171320960.0, "grad_norm": 1.9754292078308717, "language_loss": 0.87920994, "learning_rate": 3.551956756667215e-06, "loss": 0.95719731, "num_input_tokens_seen": 86200665, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.1854248, "step": 4004, "time_per_iteration": 2.6324615478515625 }, { "auxiliary_loss_clip": 0.06519816, "auxiliary_loss_mlp": 0.01278943, "balance_loss_clip": 0.06293504, "balance_loss_mlp": 0.01260072, "epoch": 0.2407936269352172, "flos": 22501252523520.0, "grad_norm": 2.294485339665406, "language_loss": 0.78346884, "learning_rate": 3.551711070585177e-06, "loss": 0.86145639, "num_input_tokens_seen": 86221640, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.18859863, "step": 4005, "time_per_iteration": 2.63293194770813 }, { "auxiliary_loss_clip": 0.06512827, "auxiliary_loss_mlp": 0.01280028, "balance_loss_clip": 0.06296138, "balance_loss_mlp": 0.0126217, "epoch": 0.24085375018788516, "flos": 18556968804480.0, "grad_norm": 1.6316011379853468, "language_loss": 0.79716468, "learning_rate": 3.5514653256620995e-06, "loss": 0.87509322, "num_input_tokens_seen": 86240795, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.17858887, "step": 4006, "time_per_iteration": 2.556011915206909 }, { "auxiliary_loss_clip": 0.06528728, "auxiliary_loss_mlp": 0.01280411, "balance_loss_clip": 0.06295227, "balance_loss_mlp": 0.01260264, "epoch": 0.24091387344055312, "flos": 24177418398720.0, "grad_norm": 1.7975117273180894, "language_loss": 0.71933651, "learning_rate": 3.551219521907302e-06, "loss": 0.79742789, "num_input_tokens_seen": 86262000, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.20141602, "step": 4007, "time_per_iteration": 4.129130601882935 }, { "auxiliary_loss_clip": 0.06514673, "auxiliary_loss_mlp": 0.01289739, "balance_loss_clip": 0.06295104, "balance_loss_mlp": 0.01270249, "epoch": 0.24097399669322112, "flos": 11041112327040.0, "grad_norm": 1.9189035885111294, "language_loss": 0.76597005, "learning_rate": 3.5509736593301042e-06, "loss": 0.84401417, "num_input_tokens_seen": 86279680, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.19482422, "step": 4008, "time_per_iteration": 2.5335564613342285 }, { "auxiliary_loss_clip": 0.06514783, "auxiliary_loss_mlp": 0.01275043, "balance_loss_clip": 0.06291878, "balance_loss_mlp": 0.01256649, "epoch": 0.24103411994588908, "flos": 17170762383360.0, "grad_norm": 2.8366730536608444, "language_loss": 0.75308251, "learning_rate": 3.5507277379398295e-06, "loss": 0.83098084, "num_input_tokens_seen": 86297180, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.18395996, "step": 4009, "time_per_iteration": 2.530377149581909 }, { "auxiliary_loss_clip": 0.06517978, "auxiliary_loss_mlp": 0.01276373, "balance_loss_clip": 0.06295413, "balance_loss_mlp": 0.01257347, "epoch": 0.24109424319855705, "flos": 20674258099200.0, "grad_norm": 1.6716016456585487, "language_loss": 0.80559433, "learning_rate": 3.550481757745804e-06, "loss": 0.88353789, "num_input_tokens_seen": 86317660, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.19030762, "step": 4010, "time_per_iteration": 2.5518219470977783 }, { "auxiliary_loss_clip": 0.06522282, "auxiliary_loss_mlp": 0.01276969, "balance_loss_clip": 0.06296436, "balance_loss_mlp": 0.01256692, "epoch": 0.241154366451225, "flos": 28188982546560.0, "grad_norm": 1.7664133860178632, "language_loss": 0.70776373, "learning_rate": 3.5502357187573555e-06, "loss": 0.78575623, "num_input_tokens_seen": 86338325, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.20275879, "step": 4011, "time_per_iteration": 2.6178793907165527 }, { "auxiliary_loss_clip": 0.06520084, "auxiliary_loss_mlp": 0.0127144, "balance_loss_clip": 0.06297226, "balance_loss_mlp": 0.01252605, "epoch": 0.24121448970389298, "flos": 21696222821760.0, "grad_norm": 1.6742098423478935, "language_loss": 0.69445437, "learning_rate": 3.5499896209838118e-06, "loss": 0.77236962, "num_input_tokens_seen": 86357615, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.18811035, "step": 4012, "time_per_iteration": 2.5721964836120605 }, { "auxiliary_loss_clip": 0.06521247, "auxiliary_loss_mlp": 0.01277044, "balance_loss_clip": 0.0629746, "balance_loss_mlp": 0.01255646, "epoch": 0.24127461295656094, "flos": 39685530142080.0, "grad_norm": 2.0242998932901846, "language_loss": 0.7460891, "learning_rate": 3.5497434644345073e-06, "loss": 0.824072, "num_input_tokens_seen": 86380355, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.21398926, "step": 4013, "time_per_iteration": 4.099107503890991 }, { "auxiliary_loss_clip": 0.06525531, "auxiliary_loss_mlp": 0.01274749, "balance_loss_clip": 0.06301265, "balance_loss_mlp": 0.01256534, "epoch": 0.2413347362092289, "flos": 19141960884480.0, "grad_norm": 1.6509857836611999, "language_loss": 0.88834858, "learning_rate": 3.5494972491187753e-06, "loss": 0.96635133, "num_input_tokens_seen": 86399125, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.18212891, "step": 4014, "time_per_iteration": 2.5599515438079834 }, { "auxiliary_loss_clip": 0.065357, "auxiliary_loss_mlp": 0.01281418, "balance_loss_clip": 0.063051, "balance_loss_mlp": 0.01261486, "epoch": 0.2413948594618969, "flos": 26946099734400.0, "grad_norm": 4.965739036509119, "language_loss": 0.95203483, "learning_rate": 3.549250975045952e-06, "loss": 1.03020597, "num_input_tokens_seen": 86418625, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.19934082, "step": 4015, "time_per_iteration": 4.045902490615845 }, { "auxiliary_loss_clip": 0.06525807, "auxiliary_loss_mlp": 0.01274094, "balance_loss_clip": 0.06302083, "balance_loss_mlp": 0.01255771, "epoch": 0.24145498271456486, "flos": 25235077760640.0, "grad_norm": 1.749434117597301, "language_loss": 0.83588821, "learning_rate": 3.5490046422253768e-06, "loss": 0.9138872, "num_input_tokens_seen": 86438375, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.18310547, "step": 4016, "time_per_iteration": 2.608877182006836 }, { "auxiliary_loss_clip": 0.06525083, "auxiliary_loss_mlp": 0.01278119, "balance_loss_clip": 0.06308419, "balance_loss_mlp": 0.01260214, "epoch": 0.24151510596723283, "flos": 40671339027840.0, "grad_norm": 2.2260546622854926, "language_loss": 0.69169158, "learning_rate": 3.54875825066639e-06, "loss": 0.76972353, "num_input_tokens_seen": 86463230, "router_z_loss_clip": 2.16503906, "router_z_loss_mlp": 0.17895508, "step": 4017, "time_per_iteration": 2.745788812637329 }, { "auxiliary_loss_clip": 0.06528501, "auxiliary_loss_mlp": 0.01282906, "balance_loss_clip": 0.06301212, "balance_loss_mlp": 0.01262188, "epoch": 0.2415752292199008, "flos": 18151917367680.0, "grad_norm": 1.6273429042186978, "language_loss": 0.85425961, "learning_rate": 3.5485118003783353e-06, "loss": 0.93237364, "num_input_tokens_seen": 86481230, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.20715332, "step": 4018, "time_per_iteration": 2.5587148666381836 }, { "auxiliary_loss_clip": 0.0644426, "auxiliary_loss_mlp": 0.01271145, "balance_loss_clip": 0.06327604, "balance_loss_mlp": 0.01266705, "epoch": 0.24163535247256876, "flos": 67307213819520.0, "grad_norm": 0.8051215844284733, "language_loss": 0.60472673, "learning_rate": 3.548265291370558e-06, "loss": 0.68188077, "num_input_tokens_seen": 86541260, "router_z_loss_clip": 1.171875, "router_z_loss_mlp": 0.04449463, "step": 4019, "time_per_iteration": 3.275184392929077 }, { "auxiliary_loss_clip": 0.06527019, "auxiliary_loss_mlp": 0.01270737, "balance_loss_clip": 0.06304099, "balance_loss_mlp": 0.01253464, "epoch": 0.24169547572523672, "flos": 24935810503680.0, "grad_norm": 1.910870183645739, "language_loss": 0.74422944, "learning_rate": 3.5480187236524055e-06, "loss": 0.82220697, "num_input_tokens_seen": 86559580, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.17272949, "step": 4020, "time_per_iteration": 2.5903069972991943 }, { "auxiliary_loss_clip": 0.06532222, "auxiliary_loss_mlp": 0.01280531, "balance_loss_clip": 0.06311177, "balance_loss_mlp": 0.01261159, "epoch": 0.24175559897790472, "flos": 18733303722240.0, "grad_norm": 2.400303061674454, "language_loss": 0.82079387, "learning_rate": 3.5477720972332285e-06, "loss": 0.89892143, "num_input_tokens_seen": 86577560, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.19372559, "step": 4021, "time_per_iteration": 2.5839083194732666 }, { "auxiliary_loss_clip": 0.06527463, "auxiliary_loss_mlp": 0.01282734, "balance_loss_clip": 0.06301577, "balance_loss_mlp": 0.01261312, "epoch": 0.24181572223057268, "flos": 23045937989760.0, "grad_norm": 2.260672618287083, "language_loss": 0.76989245, "learning_rate": 3.547525412122378e-06, "loss": 0.84799439, "num_input_tokens_seen": 86595350, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.2142334, "step": 4022, "time_per_iteration": 2.6033263206481934 }, { "auxiliary_loss_clip": 0.06541254, "auxiliary_loss_mlp": 0.01280061, "balance_loss_clip": 0.06305698, "balance_loss_mlp": 0.01258663, "epoch": 0.24187584548324065, "flos": 20382411928320.0, "grad_norm": 5.313019603820648, "language_loss": 0.75850523, "learning_rate": 3.5472786683292083e-06, "loss": 0.83671832, "num_input_tokens_seen": 86614805, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.21398926, "step": 4023, "time_per_iteration": 2.594909429550171 }, { "auxiliary_loss_clip": 0.06533003, "auxiliary_loss_mlp": 0.01278843, "balance_loss_clip": 0.06306726, "balance_loss_mlp": 0.01258339, "epoch": 0.2419359687359086, "flos": 21403915453440.0, "grad_norm": 1.816927867764115, "language_loss": 0.83213705, "learning_rate": 3.5470318658630766e-06, "loss": 0.91025543, "num_input_tokens_seen": 86633700, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.20483398, "step": 4024, "time_per_iteration": 2.570997953414917 }, { "auxiliary_loss_clip": 0.06523281, "auxiliary_loss_mlp": 0.0128009, "balance_loss_clip": 0.06302062, "balance_loss_mlp": 0.01259765, "epoch": 0.24199609198857658, "flos": 18375309152640.0, "grad_norm": 3.622700787742357, "language_loss": 0.86628121, "learning_rate": 3.5467850047333424e-06, "loss": 0.94431496, "num_input_tokens_seen": 86650905, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.203125, "step": 4025, "time_per_iteration": 2.565856456756592 }, { "auxiliary_loss_clip": 0.06528719, "auxiliary_loss_mlp": 0.01283636, "balance_loss_clip": 0.06299011, "balance_loss_mlp": 0.01264026, "epoch": 0.24205621524124454, "flos": 19469962892160.0, "grad_norm": 2.431458399779516, "language_loss": 0.72213995, "learning_rate": 3.546538084949365e-06, "loss": 0.80026352, "num_input_tokens_seen": 86669185, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.19592285, "step": 4026, "time_per_iteration": 2.5715200901031494 }, { "auxiliary_loss_clip": 0.06515306, "auxiliary_loss_mlp": 0.0127953, "balance_loss_clip": 0.06295758, "balance_loss_mlp": 0.01260039, "epoch": 0.2421163384939125, "flos": 14981706466560.0, "grad_norm": 1.8337665321846262, "language_loss": 0.65020549, "learning_rate": 3.546291106520509e-06, "loss": 0.72815382, "num_input_tokens_seen": 86686805, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.19482422, "step": 4027, "time_per_iteration": 2.56156325340271 }, { "auxiliary_loss_clip": 0.06526658, "auxiliary_loss_mlp": 0.01276813, "balance_loss_clip": 0.06298785, "balance_loss_mlp": 0.01257966, "epoch": 0.2421764617465805, "flos": 18668161572480.0, "grad_norm": 2.5173917024434527, "language_loss": 0.71271348, "learning_rate": 3.5460440694561388e-06, "loss": 0.79074824, "num_input_tokens_seen": 86705520, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.18859863, "step": 4028, "time_per_iteration": 2.567420721054077 }, { "auxiliary_loss_clip": 0.06427158, "auxiliary_loss_mlp": 0.01253693, "balance_loss_clip": 0.06310099, "balance_loss_mlp": 0.01249577, "epoch": 0.24223658499924847, "flos": 64368025424640.0, "grad_norm": 0.8286460026276461, "language_loss": 0.55368519, "learning_rate": 3.545796973765623e-06, "loss": 0.63049376, "num_input_tokens_seen": 86767320, "router_z_loss_clip": 1.171875, "router_z_loss_mlp": 0.04116821, "step": 4029, "time_per_iteration": 3.21781325340271 }, { "auxiliary_loss_clip": 0.06526815, "auxiliary_loss_mlp": 0.01281541, "balance_loss_clip": 0.06301858, "balance_loss_mlp": 0.01261264, "epoch": 0.24229670825191643, "flos": 25782278849280.0, "grad_norm": 1.6333716749519551, "language_loss": 0.74294984, "learning_rate": 3.54554981945833e-06, "loss": 0.82103342, "num_input_tokens_seen": 86788110, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.20275879, "step": 4030, "time_per_iteration": 2.6062140464782715 }, { "auxiliary_loss_clip": 0.06520034, "auxiliary_loss_mlp": 0.01282666, "balance_loss_clip": 0.06296022, "balance_loss_mlp": 0.01262937, "epoch": 0.2423568315045844, "flos": 20673251850240.0, "grad_norm": 2.0739938740486794, "language_loss": 0.77159095, "learning_rate": 3.5453026065436343e-06, "loss": 0.84961796, "num_input_tokens_seen": 86807640, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.19726562, "step": 4031, "time_per_iteration": 2.6019492149353027 }, { "auxiliary_loss_clip": 0.06533296, "auxiliary_loss_mlp": 0.01278133, "balance_loss_clip": 0.0630248, "balance_loss_mlp": 0.01258761, "epoch": 0.24241695475725236, "flos": 22422987210240.0, "grad_norm": 2.0553243265772707, "language_loss": 0.66112298, "learning_rate": 3.5450553350309083e-06, "loss": 0.73923731, "num_input_tokens_seen": 86826795, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.19360352, "step": 4032, "time_per_iteration": 2.589769124984741 }, { "auxiliary_loss_clip": 0.06521235, "auxiliary_loss_mlp": 0.01279642, "balance_loss_clip": 0.06298756, "balance_loss_mlp": 0.01260319, "epoch": 0.24247707800992033, "flos": 17134732327680.0, "grad_norm": 2.0077733715350745, "language_loss": 0.81667918, "learning_rate": 3.5448080049295286e-06, "loss": 0.89468801, "num_input_tokens_seen": 86843175, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.1932373, "step": 4033, "time_per_iteration": 2.567622184753418 }, { "auxiliary_loss_clip": 0.06517915, "auxiliary_loss_mlp": 0.01282665, "balance_loss_clip": 0.06298724, "balance_loss_mlp": 0.01263365, "epoch": 0.2425372012625883, "flos": 31621885597440.0, "grad_norm": 2.2738661682703496, "language_loss": 0.6964637, "learning_rate": 3.5445606162488754e-06, "loss": 0.77446949, "num_input_tokens_seen": 86863185, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.19299316, "step": 4034, "time_per_iteration": 2.6475067138671875 }, { "auxiliary_loss_clip": 0.06518529, "auxiliary_loss_mlp": 0.01285634, "balance_loss_clip": 0.06297579, "balance_loss_mlp": 0.01266346, "epoch": 0.24259732451525629, "flos": 16331589342720.0, "grad_norm": 2.4392536443499835, "language_loss": 0.96609908, "learning_rate": 3.5443131689983283e-06, "loss": 1.04414058, "num_input_tokens_seen": 86880040, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.19299316, "step": 4035, "time_per_iteration": 2.570314645767212 }, { "auxiliary_loss_clip": 0.06513451, "auxiliary_loss_mlp": 0.01291122, "balance_loss_clip": 0.06297236, "balance_loss_mlp": 0.01272454, "epoch": 0.24265744776792425, "flos": 22863230161920.0, "grad_norm": 2.138747929369566, "language_loss": 0.78000891, "learning_rate": 3.5440656631872715e-06, "loss": 0.85805464, "num_input_tokens_seen": 86900610, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.18688965, "step": 4036, "time_per_iteration": 2.5713000297546387 }, { "auxiliary_loss_clip": 0.06523257, "auxiliary_loss_mlp": 0.01276845, "balance_loss_clip": 0.06302135, "balance_loss_mlp": 0.01257795, "epoch": 0.24271757102059222, "flos": 21878008254720.0, "grad_norm": 2.1038736388013515, "language_loss": 0.74863648, "learning_rate": 3.5438180988250898e-06, "loss": 0.82663751, "num_input_tokens_seen": 86919385, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.19055176, "step": 4037, "time_per_iteration": 2.606478452682495 }, { "auxiliary_loss_clip": 0.06520098, "auxiliary_loss_mlp": 0.01282141, "balance_loss_clip": 0.06297457, "balance_loss_mlp": 0.01263449, "epoch": 0.24277769427326018, "flos": 19214649901440.0, "grad_norm": 1.9529358970440194, "language_loss": 0.7740078, "learning_rate": 3.543570475921171e-06, "loss": 0.85203016, "num_input_tokens_seen": 86938885, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.18688965, "step": 4038, "time_per_iteration": 2.564720630645752 }, { "auxiliary_loss_clip": 0.06526106, "auxiliary_loss_mlp": 0.01279306, "balance_loss_clip": 0.06302428, "balance_loss_mlp": 0.01259589, "epoch": 0.24283781752592815, "flos": 19505909093760.0, "grad_norm": 1.7437177597503561, "language_loss": 0.72477949, "learning_rate": 3.543322794484905e-06, "loss": 0.80283362, "num_input_tokens_seen": 86957705, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.1973877, "step": 4039, "time_per_iteration": 2.562713623046875 }, { "auxiliary_loss_clip": 0.06527237, "auxiliary_loss_mlp": 0.01280781, "balance_loss_clip": 0.06302572, "balance_loss_mlp": 0.01260909, "epoch": 0.2428979407785961, "flos": 19908444908160.0, "grad_norm": 1.9182276508662308, "language_loss": 0.79540884, "learning_rate": 3.5430750545256843e-06, "loss": 0.87348902, "num_input_tokens_seen": 86975845, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.19873047, "step": 4040, "time_per_iteration": 2.577836513519287 }, { "auxiliary_loss_clip": 0.06507115, "auxiliary_loss_mlp": 0.01277399, "balance_loss_clip": 0.06293117, "balance_loss_mlp": 0.01260782, "epoch": 0.2429580640312641, "flos": 24722523135360.0, "grad_norm": 2.5119831469469776, "language_loss": 0.81030047, "learning_rate": 3.5428272560529027e-06, "loss": 0.88814557, "num_input_tokens_seen": 86994800, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.16625977, "step": 4041, "time_per_iteration": 4.045635938644409 }, { "auxiliary_loss_clip": 0.06512712, "auxiliary_loss_mlp": 0.01284606, "balance_loss_clip": 0.06294014, "balance_loss_mlp": 0.01266689, "epoch": 0.24301818728393207, "flos": 25637529720960.0, "grad_norm": 2.3504942853163096, "language_loss": 0.77021509, "learning_rate": 3.542579399075957e-06, "loss": 0.84818828, "num_input_tokens_seen": 87016845, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.17907715, "step": 4042, "time_per_iteration": 2.637568950653076 }, { "auxiliary_loss_clip": 0.06503736, "auxiliary_loss_mlp": 0.01278138, "balance_loss_clip": 0.06288566, "balance_loss_mlp": 0.01260507, "epoch": 0.24307831053660003, "flos": 26148700753920.0, "grad_norm": 1.8895959348812146, "language_loss": 0.8185969, "learning_rate": 3.542331483604246e-06, "loss": 0.89641559, "num_input_tokens_seen": 87036270, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.1763916, "step": 4043, "time_per_iteration": 2.6260664463043213 }, { "auxiliary_loss_clip": 0.06523955, "auxiliary_loss_mlp": 0.01279025, "balance_loss_clip": 0.06295937, "balance_loss_mlp": 0.0125907, "epoch": 0.243138433789268, "flos": 14977136419200.0, "grad_norm": 2.46286647549912, "language_loss": 0.73452526, "learning_rate": 3.5420835096471706e-06, "loss": 0.81255507, "num_input_tokens_seen": 87049920, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.19946289, "step": 4044, "time_per_iteration": 2.5204358100891113 }, { "auxiliary_loss_clip": 0.06515414, "auxiliary_loss_mlp": 0.01276379, "balance_loss_clip": 0.06294024, "balance_loss_mlp": 0.01257997, "epoch": 0.24319855704193596, "flos": 25198670361600.0, "grad_norm": 2.2222947743899772, "language_loss": 0.84265411, "learning_rate": 3.5418354772141337e-06, "loss": 0.92057204, "num_input_tokens_seen": 87068230, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.18383789, "step": 4045, "time_per_iteration": 2.6294796466827393 }, { "auxiliary_loss_clip": 0.06513146, "auxiliary_loss_mlp": 0.01276914, "balance_loss_clip": 0.06293207, "balance_loss_mlp": 0.01259426, "epoch": 0.24325868029460393, "flos": 22133740515840.0, "grad_norm": 1.628456802518516, "language_loss": 0.87057477, "learning_rate": 3.541587386314541e-06, "loss": 0.94847542, "num_input_tokens_seen": 87086435, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.17492676, "step": 4046, "time_per_iteration": 4.013038396835327 }, { "auxiliary_loss_clip": 0.06510256, "auxiliary_loss_mlp": 0.01282973, "balance_loss_clip": 0.0629411, "balance_loss_mlp": 0.01264197, "epoch": 0.2433188035472719, "flos": 23588107833600.0, "grad_norm": 1.803048936673545, "language_loss": 0.73048937, "learning_rate": 3.5413392369578e-06, "loss": 0.80842167, "num_input_tokens_seen": 87105340, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.18774414, "step": 4047, "time_per_iteration": 2.7209630012512207 }, { "auxiliary_loss_clip": 0.06518177, "auxiliary_loss_mlp": 0.01284301, "balance_loss_clip": 0.06295703, "balance_loss_mlp": 0.01264894, "epoch": 0.2433789267999399, "flos": 24469809621120.0, "grad_norm": 2.2940183189027383, "language_loss": 0.73720771, "learning_rate": 3.5410910291533213e-06, "loss": 0.81523246, "num_input_tokens_seen": 87125780, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.19396973, "step": 4048, "time_per_iteration": 2.5924007892608643 }, { "auxiliary_loss_clip": 0.06511311, "auxiliary_loss_mlp": 0.01271031, "balance_loss_clip": 0.06293012, "balance_loss_mlp": 0.01254187, "epoch": 0.24343905005260785, "flos": 16733622032640.0, "grad_norm": 2.9354086210071273, "language_loss": 0.73540246, "learning_rate": 3.5408427629105155e-06, "loss": 0.81322587, "num_input_tokens_seen": 87144470, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.16845703, "step": 4049, "time_per_iteration": 2.570204973220825 }, { "auxiliary_loss_clip": 0.06514911, "auxiliary_loss_mlp": 0.01277896, "balance_loss_clip": 0.06297374, "balance_loss_mlp": 0.01260301, "epoch": 0.24349917330527582, "flos": 20049294821760.0, "grad_norm": 1.608147411290256, "language_loss": 0.74192339, "learning_rate": 3.5405944382387985e-06, "loss": 0.81985146, "num_input_tokens_seen": 87162830, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.17602539, "step": 4050, "time_per_iteration": 2.608461380004883 }, { "auxiliary_loss_clip": 0.06509253, "auxiliary_loss_mlp": 0.01274768, "balance_loss_clip": 0.06293173, "balance_loss_mlp": 0.01257781, "epoch": 0.24355929655794378, "flos": 17426285009280.0, "grad_norm": 2.412037817002087, "language_loss": 0.76262081, "learning_rate": 3.5403460551475854e-06, "loss": 0.84046108, "num_input_tokens_seen": 87180905, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.1697998, "step": 4051, "time_per_iteration": 2.795881986618042 }, { "auxiliary_loss_clip": 0.06520288, "auxiliary_loss_mlp": 0.01277714, "balance_loss_clip": 0.06300408, "balance_loss_mlp": 0.01259809, "epoch": 0.24361941981061175, "flos": 25417995223680.0, "grad_norm": 2.2656525076768577, "language_loss": 0.7193532, "learning_rate": 3.540097613646296e-06, "loss": 0.79733318, "num_input_tokens_seen": 87202290, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.17895508, "step": 4052, "time_per_iteration": 4.0765700340271 }, { "auxiliary_loss_clip": 0.06513952, "auxiliary_loss_mlp": 0.01277225, "balance_loss_clip": 0.0629612, "balance_loss_mlp": 0.0125982, "epoch": 0.2436795430632797, "flos": 22827493595520.0, "grad_norm": 2.0405610570377575, "language_loss": 0.82197213, "learning_rate": 3.539849113744351e-06, "loss": 0.89988399, "num_input_tokens_seen": 87221650, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.17382812, "step": 4053, "time_per_iteration": 2.606809377670288 }, { "auxiliary_loss_clip": 0.06517905, "auxiliary_loss_mlp": 0.01278163, "balance_loss_clip": 0.06296052, "balance_loss_mlp": 0.01261116, "epoch": 0.2437396663159477, "flos": 15163030702080.0, "grad_norm": 1.4946809572117665, "language_loss": 0.78342795, "learning_rate": 3.539600555451172e-06, "loss": 0.86138856, "num_input_tokens_seen": 87238515, "router_z_loss_clip": 2.21777344, "router_z_loss_mlp": 0.17053223, "step": 4054, "time_per_iteration": 4.064074754714966 }, { "auxiliary_loss_clip": 0.06512484, "auxiliary_loss_mlp": 0.01282801, "balance_loss_clip": 0.06293234, "balance_loss_mlp": 0.01265254, "epoch": 0.24379978956861567, "flos": 22097710460160.0, "grad_norm": 1.6229675197214803, "language_loss": 0.84215266, "learning_rate": 3.5393519387761866e-06, "loss": 0.92010552, "num_input_tokens_seen": 87256290, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.17541504, "step": 4055, "time_per_iteration": 2.6302499771118164 }, { "auxiliary_loss_clip": 0.06526183, "auxiliary_loss_mlp": 0.01279647, "balance_loss_clip": 0.06298888, "balance_loss_mlp": 0.01261062, "epoch": 0.24385991282128364, "flos": 31475878657920.0, "grad_norm": 2.7383010114336086, "language_loss": 0.55613089, "learning_rate": 3.5391032637288217e-06, "loss": 0.63418913, "num_input_tokens_seen": 87277085, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.18591309, "step": 4056, "time_per_iteration": 2.6642189025878906 }, { "auxiliary_loss_clip": 0.06524176, "auxiliary_loss_mlp": 0.01288693, "balance_loss_clip": 0.06299561, "balance_loss_mlp": 0.01270693, "epoch": 0.2439200360739516, "flos": 23845055978880.0, "grad_norm": 2.10075351249507, "language_loss": 0.8080498, "learning_rate": 3.538854530318506e-06, "loss": 0.88617855, "num_input_tokens_seen": 87293020, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.17980957, "step": 4057, "time_per_iteration": 2.6080715656280518 }, { "auxiliary_loss_clip": 0.06511457, "auxiliary_loss_mlp": 0.01274342, "balance_loss_clip": 0.06293417, "balance_loss_mlp": 0.01257152, "epoch": 0.24398015932661957, "flos": 19175684952960.0, "grad_norm": 2.1216857230175505, "language_loss": 0.79743659, "learning_rate": 3.538605738554673e-06, "loss": 0.87529457, "num_input_tokens_seen": 87311445, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.171875, "step": 4058, "time_per_iteration": 2.6242029666900635 }, { "auxiliary_loss_clip": 0.06524321, "auxiliary_loss_mlp": 0.0128137, "balance_loss_clip": 0.06297206, "balance_loss_mlp": 0.01262404, "epoch": 0.24404028257928753, "flos": 25269095318400.0, "grad_norm": 1.5560882816050667, "language_loss": 0.86256397, "learning_rate": 3.538356888446756e-06, "loss": 0.9406209, "num_input_tokens_seen": 87332055, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.18969727, "step": 4059, "time_per_iteration": 2.6457760334014893 }, { "auxiliary_loss_clip": 0.06511753, "auxiliary_loss_mlp": 0.01271305, "balance_loss_clip": 0.06294686, "balance_loss_mlp": 0.0125458, "epoch": 0.2441004058319555, "flos": 26474606409600.0, "grad_norm": 3.1732212553629204, "language_loss": 0.74846965, "learning_rate": 3.5381079800041913e-06, "loss": 0.8263002, "num_input_tokens_seen": 87351295, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.16723633, "step": 4060, "time_per_iteration": 2.6249160766601562 }, { "auxiliary_loss_clip": 0.06530383, "auxiliary_loss_mlp": 0.01279138, "balance_loss_clip": 0.06301671, "balance_loss_mlp": 0.01259588, "epoch": 0.2441605290846235, "flos": 26767752318720.0, "grad_norm": 1.831738724741959, "language_loss": 0.73739481, "learning_rate": 3.5378590132364182e-06, "loss": 0.81549001, "num_input_tokens_seen": 87370650, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.19555664, "step": 4061, "time_per_iteration": 2.8187520503997803 }, { "auxiliary_loss_clip": 0.06512389, "auxiliary_loss_mlp": 0.01271623, "balance_loss_clip": 0.06296921, "balance_loss_mlp": 0.01253873, "epoch": 0.24422065233729146, "flos": 21112236990720.0, "grad_norm": 1.6725674222283655, "language_loss": 0.76748323, "learning_rate": 3.5376099881528768e-06, "loss": 0.84532326, "num_input_tokens_seen": 87389020, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.17749023, "step": 4062, "time_per_iteration": 2.6055190563201904 }, { "auxiliary_loss_clip": 0.06511334, "auxiliary_loss_mlp": 0.012761, "balance_loss_clip": 0.06296773, "balance_loss_mlp": 0.01257336, "epoch": 0.24428077558995942, "flos": 25269891932160.0, "grad_norm": 2.4291017513319395, "language_loss": 0.85204244, "learning_rate": 3.537360904763011e-06, "loss": 0.9299168, "num_input_tokens_seen": 87409695, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.18774414, "step": 4063, "time_per_iteration": 2.6539835929870605 }, { "auxiliary_loss_clip": 0.06527586, "auxiliary_loss_mlp": 0.0127318, "balance_loss_clip": 0.06300273, "balance_loss_mlp": 0.01253976, "epoch": 0.24434089884262739, "flos": 20491508344320.0, "grad_norm": 2.1041626195238377, "language_loss": 0.69113171, "learning_rate": 3.5371117630762656e-06, "loss": 0.76913941, "num_input_tokens_seen": 87428250, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.1920166, "step": 4064, "time_per_iteration": 2.592595338821411 }, { "auxiliary_loss_clip": 0.0652394, "auxiliary_loss_mlp": 0.01274357, "balance_loss_clip": 0.06298603, "balance_loss_mlp": 0.01255593, "epoch": 0.24440102209529535, "flos": 23628456374400.0, "grad_norm": 1.5947746039919386, "language_loss": 0.70549965, "learning_rate": 3.536862563102088e-06, "loss": 0.78348267, "num_input_tokens_seen": 87449380, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.18774414, "step": 4065, "time_per_iteration": 2.6484718322753906 }, { "auxiliary_loss_clip": 0.06530513, "auxiliary_loss_mlp": 0.01278199, "balance_loss_clip": 0.06304258, "balance_loss_mlp": 0.0125841, "epoch": 0.24446114534796332, "flos": 20560382000640.0, "grad_norm": 1.7625321435601304, "language_loss": 0.8462224, "learning_rate": 3.5366133048499282e-06, "loss": 0.92430949, "num_input_tokens_seen": 87465365, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.19787598, "step": 4066, "time_per_iteration": 2.6542210578918457 }, { "auxiliary_loss_clip": 0.06398283, "auxiliary_loss_mlp": 0.01258451, "balance_loss_clip": 0.06282073, "balance_loss_mlp": 0.01253542, "epoch": 0.24452126860063128, "flos": 60406719327360.0, "grad_norm": 0.7205984657879397, "language_loss": 0.52184802, "learning_rate": 3.5363639883292374e-06, "loss": 0.59841537, "num_input_tokens_seen": 87522525, "router_z_loss_clip": 1.15625, "router_z_loss_mlp": 0.04904175, "step": 4067, "time_per_iteration": 3.1394882202148438 }, { "auxiliary_loss_clip": 0.06530211, "auxiliary_loss_mlp": 0.0128044, "balance_loss_clip": 0.06305309, "balance_loss_mlp": 0.01261724, "epoch": 0.24458139185329927, "flos": 15126958719360.0, "grad_norm": 3.8565730920459007, "language_loss": 0.72922838, "learning_rate": 3.5361146135494706e-06, "loss": 0.8073349, "num_input_tokens_seen": 87539170, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.18713379, "step": 4068, "time_per_iteration": 2.5689847469329834 }, { "auxiliary_loss_clip": 0.06522009, "auxiliary_loss_mlp": 0.01275026, "balance_loss_clip": 0.0630246, "balance_loss_mlp": 0.01256512, "epoch": 0.24464151510596724, "flos": 28005771594240.0, "grad_norm": 1.4945715551455656, "language_loss": 0.78362072, "learning_rate": 3.5358651805200835e-06, "loss": 0.86159098, "num_input_tokens_seen": 87558875, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.18518066, "step": 4069, "time_per_iteration": 2.674133062362671 }, { "auxiliary_loss_clip": 0.0652047, "auxiliary_loss_mlp": 0.01279484, "balance_loss_clip": 0.06301449, "balance_loss_mlp": 0.01259815, "epoch": 0.2447016383586352, "flos": 19799138856960.0, "grad_norm": 1.9520835228778977, "language_loss": 0.80775952, "learning_rate": 3.5356156892505347e-06, "loss": 0.88575906, "num_input_tokens_seen": 87576485, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.19677734, "step": 4070, "time_per_iteration": 2.562664747238159 }, { "auxiliary_loss_clip": 0.06530267, "auxiliary_loss_mlp": 0.01277012, "balance_loss_clip": 0.06308524, "balance_loss_mlp": 0.012582, "epoch": 0.24476176161130317, "flos": 26074460436480.0, "grad_norm": 1.6109749649817182, "language_loss": 0.8485074, "learning_rate": 3.5353661397502854e-06, "loss": 0.92658025, "num_input_tokens_seen": 87598620, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.18811035, "step": 4071, "time_per_iteration": 2.639808177947998 }, { "auxiliary_loss_clip": 0.06538175, "auxiliary_loss_mlp": 0.0127508, "balance_loss_clip": 0.06309716, "balance_loss_mlp": 0.01255685, "epoch": 0.24482188486397113, "flos": 18849527735040.0, "grad_norm": 2.091392908355232, "language_loss": 0.80662686, "learning_rate": 3.535116532028798e-06, "loss": 0.88475937, "num_input_tokens_seen": 87616595, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.19421387, "step": 4072, "time_per_iteration": 2.569000720977783 }, { "auxiliary_loss_clip": 0.06522618, "auxiliary_loss_mlp": 0.01280781, "balance_loss_clip": 0.06306936, "balance_loss_mlp": 0.01263269, "epoch": 0.2448820081166391, "flos": 21258202003200.0, "grad_norm": 1.5392847929832303, "language_loss": 0.70304084, "learning_rate": 3.5348668660955382e-06, "loss": 0.78107482, "num_input_tokens_seen": 87635755, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.17504883, "step": 4073, "time_per_iteration": 2.6107308864593506 }, { "auxiliary_loss_clip": 0.06521606, "auxiliary_loss_mlp": 0.01276066, "balance_loss_clip": 0.06302588, "balance_loss_mlp": 0.01258017, "epoch": 0.2449421313693071, "flos": 23957254995840.0, "grad_norm": 2.3463418799781244, "language_loss": 0.6752106, "learning_rate": 3.5346171419599728e-06, "loss": 0.75318724, "num_input_tokens_seen": 87652885, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.18066406, "step": 4074, "time_per_iteration": 2.592031717300415 }, { "auxiliary_loss_clip": 0.06408301, "auxiliary_loss_mlp": 0.01260745, "balance_loss_clip": 0.06294337, "balance_loss_mlp": 0.01256829, "epoch": 0.24500225462197506, "flos": 60705902730240.0, "grad_norm": 0.8785371650069148, "language_loss": 0.68608207, "learning_rate": 3.5343673596315718e-06, "loss": 0.76277256, "num_input_tokens_seen": 87713220, "router_z_loss_clip": 1.140625, "router_z_loss_mlp": 0.03912354, "step": 4075, "time_per_iteration": 3.308628797531128 }, { "auxiliary_loss_clip": 0.06519563, "auxiliary_loss_mlp": 0.01282224, "balance_loss_clip": 0.06302176, "balance_loss_mlp": 0.01263258, "epoch": 0.24506237787464302, "flos": 26291018113920.0, "grad_norm": 1.6815372778072502, "language_loss": 0.79992425, "learning_rate": 3.5341175191198063e-06, "loss": 0.87794214, "num_input_tokens_seen": 87732680, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.1895752, "step": 4076, "time_per_iteration": 2.6111114025115967 }, { "auxiliary_loss_clip": 0.0652802, "auxiliary_loss_mlp": 0.01276745, "balance_loss_clip": 0.06300226, "balance_loss_mlp": 0.01256634, "epoch": 0.245122501127311, "flos": 20557530961920.0, "grad_norm": 1.8239325772453865, "language_loss": 0.82605344, "learning_rate": 3.533867620434151e-06, "loss": 0.90410101, "num_input_tokens_seen": 87751880, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.20129395, "step": 4077, "time_per_iteration": 2.5807669162750244 }, { "auxiliary_loss_clip": 0.06528023, "auxiliary_loss_mlp": 0.01282077, "balance_loss_clip": 0.06304206, "balance_loss_mlp": 0.01262192, "epoch": 0.24518262437997895, "flos": 29140312677120.0, "grad_norm": 1.949733728147188, "language_loss": 0.63026118, "learning_rate": 3.533617663584082e-06, "loss": 0.70836222, "num_input_tokens_seen": 87771795, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.19873047, "step": 4078, "time_per_iteration": 2.635772466659546 }, { "auxiliary_loss_clip": 0.06519622, "auxiliary_loss_mlp": 0.01278071, "balance_loss_clip": 0.06303152, "balance_loss_mlp": 0.01260177, "epoch": 0.24524274763264692, "flos": 23483623392000.0, "grad_norm": 1.4858701098880975, "language_loss": 0.76071578, "learning_rate": 3.5333676485790765e-06, "loss": 0.83869267, "num_input_tokens_seen": 87793640, "router_z_loss_clip": 2.16503906, "router_z_loss_mlp": 0.17895508, "step": 4079, "time_per_iteration": 2.6070644855499268 }, { "auxiliary_loss_clip": 0.06516314, "auxiliary_loss_mlp": 0.01274886, "balance_loss_clip": 0.06297623, "balance_loss_mlp": 0.01255682, "epoch": 0.24530287088531488, "flos": 17206792439040.0, "grad_norm": 1.7745752350053616, "language_loss": 0.75361276, "learning_rate": 3.5331175754286173e-06, "loss": 0.83152473, "num_input_tokens_seen": 87812390, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.1920166, "step": 4080, "time_per_iteration": 3.991119861602783 }, { "auxiliary_loss_clip": 0.06515758, "auxiliary_loss_mlp": 0.01283977, "balance_loss_clip": 0.06301074, "balance_loss_mlp": 0.0126463, "epoch": 0.24536299413798288, "flos": 14872903539840.0, "grad_norm": 2.093791173640357, "language_loss": 0.83109152, "learning_rate": 3.532867444142186e-06, "loss": 0.90908885, "num_input_tokens_seen": 87830640, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.19360352, "step": 4081, "time_per_iteration": 2.556535243988037 }, { "auxiliary_loss_clip": 0.06520809, "auxiliary_loss_mlp": 0.01275545, "balance_loss_clip": 0.06302726, "balance_loss_mlp": 0.01257056, "epoch": 0.24542311739065084, "flos": 35270759347200.0, "grad_norm": 2.0868458712194933, "language_loss": 0.73838151, "learning_rate": 3.532617254729267e-06, "loss": 0.81634504, "num_input_tokens_seen": 87850450, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.18493652, "step": 4082, "time_per_iteration": 2.7035748958587646 }, { "auxiliary_loss_clip": 0.06521972, "auxiliary_loss_mlp": 0.01277861, "balance_loss_clip": 0.06306428, "balance_loss_mlp": 0.0126048, "epoch": 0.2454832406433188, "flos": 21508903019520.0, "grad_norm": 1.6770950102779687, "language_loss": 0.72650653, "learning_rate": 3.5323670071993485e-06, "loss": 0.80450487, "num_input_tokens_seen": 87868810, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.17382812, "step": 4083, "time_per_iteration": 2.5672388076782227 }, { "auxiliary_loss_clip": 0.06530803, "auxiliary_loss_mlp": 0.01283358, "balance_loss_clip": 0.06307885, "balance_loss_mlp": 0.01263831, "epoch": 0.24554336389598677, "flos": 14761878480000.0, "grad_norm": 2.2680864526391704, "language_loss": 0.75394499, "learning_rate": 3.532116701561919e-06, "loss": 0.83208662, "num_input_tokens_seen": 87885685, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.19543457, "step": 4084, "time_per_iteration": 2.556601047515869 }, { "auxiliary_loss_clip": 0.06519249, "auxiliary_loss_mlp": 0.01278181, "balance_loss_clip": 0.06301586, "balance_loss_mlp": 0.01260062, "epoch": 0.24560348714865474, "flos": 14981790320640.0, "grad_norm": 2.171914659761368, "language_loss": 0.85632288, "learning_rate": 3.531866337826471e-06, "loss": 0.9342972, "num_input_tokens_seen": 87903715, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.18115234, "step": 4085, "time_per_iteration": 2.53342866897583 }, { "auxiliary_loss_clip": 0.06526048, "auxiliary_loss_mlp": 0.0127607, "balance_loss_clip": 0.06306191, "balance_loss_mlp": 0.01256889, "epoch": 0.2456636104013227, "flos": 22682073634560.0, "grad_norm": 1.7918687553990587, "language_loss": 0.79784858, "learning_rate": 3.5316159160024982e-06, "loss": 0.87586975, "num_input_tokens_seen": 87923375, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.19165039, "step": 4086, "time_per_iteration": 3.97216796875 }, { "auxiliary_loss_clip": 0.06521144, "auxiliary_loss_mlp": 0.01279857, "balance_loss_clip": 0.06304497, "balance_loss_mlp": 0.01261594, "epoch": 0.2457237336539907, "flos": 27425307634560.0, "grad_norm": 1.677353526472031, "language_loss": 0.75674689, "learning_rate": 3.531365436099496e-06, "loss": 0.83475685, "num_input_tokens_seen": 87943115, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.18261719, "step": 4087, "time_per_iteration": 2.6057486534118652 }, { "auxiliary_loss_clip": 0.0652994, "auxiliary_loss_mlp": 0.01276022, "balance_loss_clip": 0.06308845, "balance_loss_mlp": 0.01257199, "epoch": 0.24578385690665866, "flos": 20418609692160.0, "grad_norm": 2.3949423195437416, "language_loss": 0.79966849, "learning_rate": 3.5311148981269635e-06, "loss": 0.8777281, "num_input_tokens_seen": 87959505, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.18811035, "step": 4088, "time_per_iteration": 2.579773187637329 }, { "auxiliary_loss_clip": 0.06523199, "auxiliary_loss_mlp": 0.01277152, "balance_loss_clip": 0.06310333, "balance_loss_mlp": 0.01259736, "epoch": 0.24584398015932662, "flos": 23922273116160.0, "grad_norm": 1.8055508934753426, "language_loss": 0.77342314, "learning_rate": 3.5308643020944e-06, "loss": 0.85142666, "num_input_tokens_seen": 87979725, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.17407227, "step": 4089, "time_per_iteration": 2.6025760173797607 }, { "auxiliary_loss_clip": 0.06529401, "auxiliary_loss_mlp": 0.01276368, "balance_loss_clip": 0.06308926, "balance_loss_mlp": 0.01258475, "epoch": 0.2459041034119946, "flos": 41505313115520.0, "grad_norm": 1.8806749494842034, "language_loss": 0.81731635, "learning_rate": 3.530613648011309e-06, "loss": 0.89537406, "num_input_tokens_seen": 87998270, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.17895508, "step": 4090, "time_per_iteration": 2.7283196449279785 }, { "auxiliary_loss_clip": 0.06530007, "auxiliary_loss_mlp": 0.01278546, "balance_loss_clip": 0.06310591, "balance_loss_mlp": 0.01258876, "epoch": 0.24596422666466256, "flos": 19942755955200.0, "grad_norm": 1.882442282086326, "language_loss": 0.73889917, "learning_rate": 3.5303629358871946e-06, "loss": 0.81698465, "num_input_tokens_seen": 88016760, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.19677734, "step": 4091, "time_per_iteration": 2.5715136528015137 }, { "auxiliary_loss_clip": 0.06531032, "auxiliary_loss_mlp": 0.01278378, "balance_loss_clip": 0.06313873, "balance_loss_mlp": 0.01260806, "epoch": 0.24602434991733052, "flos": 21550970568960.0, "grad_norm": 1.983188588281154, "language_loss": 0.77280831, "learning_rate": 3.5301121657315653e-06, "loss": 0.85090232, "num_input_tokens_seen": 88036465, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.17578125, "step": 4092, "time_per_iteration": 3.98994779586792 }, { "auxiliary_loss_clip": 0.06537395, "auxiliary_loss_mlp": 0.01278447, "balance_loss_clip": 0.0631182, "balance_loss_mlp": 0.01259874, "epoch": 0.24608447316999849, "flos": 23191735294080.0, "grad_norm": 2.6680293949049916, "language_loss": 0.82113403, "learning_rate": 3.5298613375539287e-06, "loss": 0.89929247, "num_input_tokens_seen": 88053270, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.18579102, "step": 4093, "time_per_iteration": 4.124249219894409 }, { "auxiliary_loss_clip": 0.06533402, "auxiliary_loss_mlp": 0.01279229, "balance_loss_clip": 0.06307849, "balance_loss_mlp": 0.0125944, "epoch": 0.24614459642266648, "flos": 19647345985920.0, "grad_norm": 2.3210234149867532, "language_loss": 0.87726772, "learning_rate": 3.529610451363797e-06, "loss": 0.95539403, "num_input_tokens_seen": 88072305, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.19799805, "step": 4094, "time_per_iteration": 2.5662341117858887 }, { "auxiliary_loss_clip": 0.06412375, "auxiliary_loss_mlp": 0.01268448, "balance_loss_clip": 0.0629854, "balance_loss_mlp": 0.01264613, "epoch": 0.24620471967533444, "flos": 61757231109120.0, "grad_norm": 0.7509959695741839, "language_loss": 0.57005757, "learning_rate": 3.5293595071706833e-06, "loss": 0.64686573, "num_input_tokens_seen": 88137995, "router_z_loss_clip": 1.140625, "router_z_loss_mlp": 0.03829956, "step": 4095, "time_per_iteration": 3.269861936569214 }, { "auxiliary_loss_clip": 0.06400944, "auxiliary_loss_mlp": 0.01262829, "balance_loss_clip": 0.06287458, "balance_loss_mlp": 0.01258773, "epoch": 0.2462648429280024, "flos": 69174431003520.0, "grad_norm": 0.6274121693696385, "language_loss": 0.5622437, "learning_rate": 3.5291085049841042e-06, "loss": 0.63888144, "num_input_tokens_seen": 88208490, "router_z_loss_clip": 1.13964844, "router_z_loss_mlp": 0.04055786, "step": 4096, "time_per_iteration": 3.3533711433410645 }, { "auxiliary_loss_clip": 0.06536547, "auxiliary_loss_mlp": 0.01281613, "balance_loss_clip": 0.06315184, "balance_loss_mlp": 0.01262956, "epoch": 0.24632496618067037, "flos": 29467140727680.0, "grad_norm": 1.664470799246393, "language_loss": 0.7822969, "learning_rate": 3.5288574448135773e-06, "loss": 0.86047852, "num_input_tokens_seen": 88228050, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.18664551, "step": 4097, "time_per_iteration": 2.6804261207580566 }, { "auxiliary_loss_clip": 0.06542614, "auxiliary_loss_mlp": 0.01282836, "balance_loss_clip": 0.06314778, "balance_loss_mlp": 0.01263798, "epoch": 0.24638508943333834, "flos": 24323341484160.0, "grad_norm": 2.0469215305634005, "language_loss": 0.77050364, "learning_rate": 3.5286063266686235e-06, "loss": 0.84875822, "num_input_tokens_seen": 88248090, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.19030762, "step": 4098, "time_per_iteration": 2.5951666831970215 }, { "auxiliary_loss_clip": 0.0653093, "auxiliary_loss_mlp": 0.0127674, "balance_loss_clip": 0.06311287, "balance_loss_mlp": 0.01259729, "epoch": 0.2464452126860063, "flos": 26620236005760.0, "grad_norm": 2.3825804828280095, "language_loss": 0.69047976, "learning_rate": 3.528355150558764e-06, "loss": 0.76855648, "num_input_tokens_seen": 88267545, "router_z_loss_clip": 2.19824219, "router_z_loss_mlp": 0.17004395, "step": 4099, "time_per_iteration": 2.621500253677368 }, { "auxiliary_loss_clip": 0.06519786, "auxiliary_loss_mlp": 0.01273597, "balance_loss_clip": 0.06306749, "balance_loss_mlp": 0.01255978, "epoch": 0.24650533593867427, "flos": 31220481813120.0, "grad_norm": 2.5343962198083134, "language_loss": 0.66721249, "learning_rate": 3.5281039164935237e-06, "loss": 0.74514633, "num_input_tokens_seen": 88289785, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.17626953, "step": 4100, "time_per_iteration": 2.645655393600464 }, { "auxiliary_loss_clip": 0.0639111, "auxiliary_loss_mlp": 0.01255255, "balance_loss_clip": 0.06277523, "balance_loss_mlp": 0.01250794, "epoch": 0.24656545919134226, "flos": 68513269962240.0, "grad_norm": 0.6908113870776309, "language_loss": 0.61530364, "learning_rate": 3.5278526244824304e-06, "loss": 0.69176733, "num_input_tokens_seen": 88357320, "router_z_loss_clip": 1.140625, "router_z_loss_mlp": 0.04470825, "step": 4101, "time_per_iteration": 3.3005430698394775 }, { "auxiliary_loss_clip": 0.06528649, "auxiliary_loss_mlp": 0.01275805, "balance_loss_clip": 0.06312701, "balance_loss_mlp": 0.01258007, "epoch": 0.24662558244401023, "flos": 20090398049280.0, "grad_norm": 1.5821172998785162, "language_loss": 0.7385903, "learning_rate": 3.527601274535012e-06, "loss": 0.81663489, "num_input_tokens_seen": 88377040, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.17785645, "step": 4102, "time_per_iteration": 2.57738995552063 }, { "auxiliary_loss_clip": 0.0653437, "auxiliary_loss_mlp": 0.01277561, "balance_loss_clip": 0.06313109, "balance_loss_mlp": 0.01260073, "epoch": 0.2466857056966782, "flos": 30709310780160.0, "grad_norm": 2.0908583039984387, "language_loss": 0.76084399, "learning_rate": 3.5273498666608004e-06, "loss": 0.83896327, "num_input_tokens_seen": 88395085, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.17492676, "step": 4103, "time_per_iteration": 2.649017333984375 }, { "auxiliary_loss_clip": 0.06531182, "auxiliary_loss_mlp": 0.01283776, "balance_loss_clip": 0.06308466, "balance_loss_mlp": 0.01265167, "epoch": 0.24674582894934616, "flos": 22535102373120.0, "grad_norm": 1.9027368153694102, "language_loss": 0.78699893, "learning_rate": 3.5270984008693288e-06, "loss": 0.86514854, "num_input_tokens_seen": 88413205, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.18603516, "step": 4104, "time_per_iteration": 2.576831340789795 }, { "auxiliary_loss_clip": 0.06518641, "auxiliary_loss_mlp": 0.01275071, "balance_loss_clip": 0.06303624, "balance_loss_mlp": 0.01255878, "epoch": 0.24680595220201412, "flos": 20710581644160.0, "grad_norm": 1.7499123573523743, "language_loss": 0.84230804, "learning_rate": 3.526846877170133e-06, "loss": 0.92024511, "num_input_tokens_seen": 88431525, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.19189453, "step": 4105, "time_per_iteration": 2.5949199199676514 }, { "auxiliary_loss_clip": 0.06529008, "auxiliary_loss_mlp": 0.01275966, "balance_loss_clip": 0.06310862, "balance_loss_mlp": 0.01258776, "epoch": 0.2468660754546821, "flos": 21836946954240.0, "grad_norm": 1.848688069090491, "language_loss": 0.77062935, "learning_rate": 3.52659529557275e-06, "loss": 0.84867907, "num_input_tokens_seen": 88451210, "router_z_loss_clip": 2.18261719, "router_z_loss_mlp": 0.171875, "step": 4106, "time_per_iteration": 2.629214286804199 }, { "auxiliary_loss_clip": 0.0652651, "auxiliary_loss_mlp": 0.01277514, "balance_loss_clip": 0.06307422, "balance_loss_mlp": 0.01259871, "epoch": 0.24692619870735008, "flos": 15273049512960.0, "grad_norm": 2.277637327007916, "language_loss": 0.73116726, "learning_rate": 3.5263436560867205e-06, "loss": 0.80920756, "num_input_tokens_seen": 88467790, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.1763916, "step": 4107, "time_per_iteration": 2.573345422744751 }, { "auxiliary_loss_clip": 0.0652599, "auxiliary_loss_mlp": 0.01275044, "balance_loss_clip": 0.06305732, "balance_loss_mlp": 0.01257174, "epoch": 0.24698632196001805, "flos": 29687933036160.0, "grad_norm": 2.2232543162815506, "language_loss": 0.66311151, "learning_rate": 3.526091958721587e-06, "loss": 0.74112189, "num_input_tokens_seen": 88490330, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.17871094, "step": 4108, "time_per_iteration": 2.6415998935699463 }, { "auxiliary_loss_clip": 0.06526116, "auxiliary_loss_mlp": 0.01275217, "balance_loss_clip": 0.06304163, "balance_loss_mlp": 0.01257598, "epoch": 0.247046445212686, "flos": 39174736452480.0, "grad_norm": 1.7898878079116691, "language_loss": 0.73285866, "learning_rate": 3.5258402034868936e-06, "loss": 0.81087196, "num_input_tokens_seen": 88512435, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.17602539, "step": 4109, "time_per_iteration": 2.747220277786255 }, { "auxiliary_loss_clip": 0.06523034, "auxiliary_loss_mlp": 0.01274791, "balance_loss_clip": 0.06303251, "balance_loss_mlp": 0.01257458, "epoch": 0.24710656846535398, "flos": 23004834762240.0, "grad_norm": 1.9873528449019078, "language_loss": 0.79425424, "learning_rate": 3.5255883903921866e-06, "loss": 0.87223256, "num_input_tokens_seen": 88529780, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.17333984, "step": 4110, "time_per_iteration": 2.592374563217163 }, { "auxiliary_loss_clip": 0.06526767, "auxiliary_loss_mlp": 0.01280415, "balance_loss_clip": 0.06304181, "balance_loss_mlp": 0.01261246, "epoch": 0.24716669171802194, "flos": 26440085727360.0, "grad_norm": 2.4056995669493815, "language_loss": 0.81351089, "learning_rate": 3.5253365194470144e-06, "loss": 0.89158273, "num_input_tokens_seen": 88547200, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.19152832, "step": 4111, "time_per_iteration": 2.6110382080078125 }, { "auxiliary_loss_clip": 0.06521976, "auxiliary_loss_mlp": 0.01275974, "balance_loss_clip": 0.0629892, "balance_loss_mlp": 0.01257854, "epoch": 0.2472268149706899, "flos": 23336358641280.0, "grad_norm": 2.170027057527657, "language_loss": 0.75941688, "learning_rate": 3.5250845906609294e-06, "loss": 0.83739638, "num_input_tokens_seen": 88566415, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.18127441, "step": 4112, "time_per_iteration": 2.570446729660034 }, { "auxiliary_loss_clip": 0.06519805, "auxiliary_loss_mlp": 0.01277101, "balance_loss_clip": 0.06298297, "balance_loss_mlp": 0.01258648, "epoch": 0.24728693822335787, "flos": 23775469562880.0, "grad_norm": 1.8569036432924757, "language_loss": 0.83257598, "learning_rate": 3.5248326040434835e-06, "loss": 0.91054505, "num_input_tokens_seen": 88585225, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.18457031, "step": 4113, "time_per_iteration": 2.5772452354431152 }, { "auxiliary_loss_clip": 0.06519163, "auxiliary_loss_mlp": 0.01273866, "balance_loss_clip": 0.06300451, "balance_loss_mlp": 0.01254459, "epoch": 0.24734706147602586, "flos": 19323494755200.0, "grad_norm": 2.390457455150412, "language_loss": 0.87625355, "learning_rate": 3.5245805596042322e-06, "loss": 0.95418382, "num_input_tokens_seen": 88603280, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.19396973, "step": 4114, "time_per_iteration": 2.5524330139160156 }, { "auxiliary_loss_clip": 0.06518512, "auxiliary_loss_mlp": 0.01273229, "balance_loss_clip": 0.06299075, "balance_loss_mlp": 0.01255848, "epoch": 0.24740718472869383, "flos": 28044275345280.0, "grad_norm": 1.680550555308428, "language_loss": 0.75708067, "learning_rate": 3.524328457352734e-06, "loss": 0.83499801, "num_input_tokens_seen": 88624925, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.17370605, "step": 4115, "time_per_iteration": 2.6161060333251953 }, { "auxiliary_loss_clip": 0.06402312, "auxiliary_loss_mlp": 0.01266942, "balance_loss_clip": 0.06290495, "balance_loss_mlp": 0.01262767, "epoch": 0.2474673079813618, "flos": 68129265899520.0, "grad_norm": 0.6375495452252266, "language_loss": 0.58111233, "learning_rate": 3.5240762972985475e-06, "loss": 0.65780485, "num_input_tokens_seen": 88691475, "router_z_loss_clip": 1.1171875, "router_z_loss_mlp": 0.04177856, "step": 4116, "time_per_iteration": 3.2489709854125977 }, { "auxiliary_loss_clip": 0.06517234, "auxiliary_loss_mlp": 0.01279224, "balance_loss_clip": 0.06299281, "balance_loss_mlp": 0.0126102, "epoch": 0.24752743123402976, "flos": 29470075620480.0, "grad_norm": 2.385960847022137, "language_loss": 0.84155613, "learning_rate": 3.523824079451235e-06, "loss": 0.91952074, "num_input_tokens_seen": 88713425, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.18188477, "step": 4117, "time_per_iteration": 2.6601505279541016 }, { "auxiliary_loss_clip": 0.06410211, "auxiliary_loss_mlp": 0.0126436, "balance_loss_clip": 0.06299269, "balance_loss_mlp": 0.01260036, "epoch": 0.24758755448669773, "flos": 58367946908160.0, "grad_norm": 0.8860893216918088, "language_loss": 0.6357801, "learning_rate": 3.5235718038203602e-06, "loss": 0.71252578, "num_input_tokens_seen": 88769995, "router_z_loss_clip": 1.109375, "router_z_loss_mlp": 0.04330444, "step": 4118, "time_per_iteration": 3.062098979949951 }, { "auxiliary_loss_clip": 0.06516396, "auxiliary_loss_mlp": 0.01274177, "balance_loss_clip": 0.06298648, "balance_loss_mlp": 0.01256367, "epoch": 0.2476476777393657, "flos": 20490502095360.0, "grad_norm": 1.4689373635481286, "language_loss": 0.79853457, "learning_rate": 3.523319470415491e-06, "loss": 0.87644029, "num_input_tokens_seen": 88789970, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.17797852, "step": 4119, "time_per_iteration": 2.573521137237549 }, { "auxiliary_loss_clip": 0.06515358, "auxiliary_loss_mlp": 0.01280951, "balance_loss_clip": 0.06298614, "balance_loss_mlp": 0.01263439, "epoch": 0.24770780099203366, "flos": 20492179176960.0, "grad_norm": 1.6105289511800815, "language_loss": 0.74998879, "learning_rate": 3.5230670792461943e-06, "loss": 0.82795191, "num_input_tokens_seen": 88810000, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.1751709, "step": 4120, "time_per_iteration": 4.109975814819336 }, { "auxiliary_loss_clip": 0.065217, "auxiliary_loss_mlp": 0.01279089, "balance_loss_clip": 0.06301582, "balance_loss_mlp": 0.01259527, "epoch": 0.24776792424470165, "flos": 15157915603200.0, "grad_norm": 2.026099151635329, "language_loss": 0.89042228, "learning_rate": 3.522814630322041e-06, "loss": 0.96843016, "num_input_tokens_seen": 88827515, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.19555664, "step": 4121, "time_per_iteration": 2.62178897857666 }, { "auxiliary_loss_clip": 0.06520493, "auxiliary_loss_mlp": 0.01277863, "balance_loss_clip": 0.06301016, "balance_loss_mlp": 0.01259004, "epoch": 0.2478280474973696, "flos": 21731833607040.0, "grad_norm": 3.8638510421894514, "language_loss": 0.70196867, "learning_rate": 3.5225621236526045e-06, "loss": 0.77995229, "num_input_tokens_seen": 88845025, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.1887207, "step": 4122, "time_per_iteration": 2.576864004135132 }, { "auxiliary_loss_clip": 0.06521595, "auxiliary_loss_mlp": 0.01274746, "balance_loss_clip": 0.0630009, "balance_loss_mlp": 0.01255363, "epoch": 0.24788817075003758, "flos": 20418400056960.0, "grad_norm": 2.9904767744793754, "language_loss": 0.80541736, "learning_rate": 3.5223095592474596e-06, "loss": 0.88338077, "num_input_tokens_seen": 88861740, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.19384766, "step": 4123, "time_per_iteration": 2.5582878589630127 }, { "auxiliary_loss_clip": 0.06516881, "auxiliary_loss_mlp": 0.01278606, "balance_loss_clip": 0.06300868, "balance_loss_mlp": 0.01261738, "epoch": 0.24794829400270554, "flos": 22599867179520.0, "grad_norm": 17.275640971669212, "language_loss": 0.75300902, "learning_rate": 3.5220569371161846e-06, "loss": 0.83096385, "num_input_tokens_seen": 88879740, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.16882324, "step": 4124, "time_per_iteration": 2.587434768676758 }, { "auxiliary_loss_clip": 0.06514889, "auxiliary_loss_mlp": 0.01277089, "balance_loss_clip": 0.06302745, "balance_loss_mlp": 0.01260174, "epoch": 0.2480084172553735, "flos": 39685362433920.0, "grad_norm": 1.4420649444600686, "language_loss": 0.74065542, "learning_rate": 3.521804257268357e-06, "loss": 0.81857526, "num_input_tokens_seen": 88904095, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.16918945, "step": 4125, "time_per_iteration": 4.157441854476929 }, { "auxiliary_loss_clip": 0.06527399, "auxiliary_loss_mlp": 0.01278165, "balance_loss_clip": 0.06300279, "balance_loss_mlp": 0.01258781, "epoch": 0.24806854050804147, "flos": 22060129104000.0, "grad_norm": 2.040511685616215, "language_loss": 0.70175362, "learning_rate": 3.5215515197135595e-06, "loss": 0.77980924, "num_input_tokens_seen": 88920740, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.19372559, "step": 4126, "time_per_iteration": 2.5515942573547363 }, { "auxiliary_loss_clip": 0.06520166, "auxiliary_loss_mlp": 0.01275208, "balance_loss_clip": 0.06302314, "balance_loss_mlp": 0.01256993, "epoch": 0.24812866376070947, "flos": 15492164739840.0, "grad_norm": 1.9399989210777726, "language_loss": 0.82143009, "learning_rate": 3.5212987244613764e-06, "loss": 0.89938378, "num_input_tokens_seen": 88938510, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.18212891, "step": 4127, "time_per_iteration": 2.5610573291778564 }, { "auxiliary_loss_clip": 0.06519997, "auxiliary_loss_mlp": 0.01275116, "balance_loss_clip": 0.06297445, "balance_loss_mlp": 0.01257366, "epoch": 0.24818878701337743, "flos": 14762758947840.0, "grad_norm": 2.3255280767736726, "language_loss": 0.85050458, "learning_rate": 3.5210458715213927e-06, "loss": 0.92845571, "num_input_tokens_seen": 88955235, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.17749023, "step": 4128, "time_per_iteration": 2.545405626296997 }, { "auxiliary_loss_clip": 0.06522875, "auxiliary_loss_mlp": 0.01274139, "balance_loss_clip": 0.06303228, "balance_loss_mlp": 0.01255995, "epoch": 0.2482489102660454, "flos": 27096886356480.0, "grad_norm": 4.284259733283025, "language_loss": 0.66287893, "learning_rate": 3.5207929609031973e-06, "loss": 0.74084908, "num_input_tokens_seen": 88975210, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.18151855, "step": 4129, "time_per_iteration": 2.607008457183838 }, { "auxiliary_loss_clip": 0.0651768, "auxiliary_loss_mlp": 0.01278616, "balance_loss_clip": 0.06301064, "balance_loss_mlp": 0.01259936, "epoch": 0.24830903351871336, "flos": 26474522555520.0, "grad_norm": 5.327971890644309, "language_loss": 0.75664699, "learning_rate": 3.5205399926163806e-06, "loss": 0.83460999, "num_input_tokens_seen": 88996120, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.18688965, "step": 4130, "time_per_iteration": 2.6073875427246094 }, { "auxiliary_loss_clip": 0.0651658, "auxiliary_loss_mlp": 0.01273903, "balance_loss_clip": 0.0629683, "balance_loss_mlp": 0.01255211, "epoch": 0.24836915677138133, "flos": 10232225337600.0, "grad_norm": 1.9981874324701043, "language_loss": 0.77303946, "learning_rate": 3.520286966670535e-06, "loss": 0.85094428, "num_input_tokens_seen": 89008685, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.18688965, "step": 4131, "time_per_iteration": 3.864330530166626 }, { "auxiliary_loss_clip": 0.06510352, "auxiliary_loss_mlp": 0.01273751, "balance_loss_clip": 0.06297363, "balance_loss_mlp": 0.01257002, "epoch": 0.2484292800240493, "flos": 30088162863360.0, "grad_norm": 1.5849678563062073, "language_loss": 0.84859216, "learning_rate": 3.520033883075255e-06, "loss": 0.92643321, "num_input_tokens_seen": 89031160, "router_z_loss_clip": 2.12988281, "router_z_loss_mlp": 0.16748047, "step": 4132, "time_per_iteration": 2.6485435962677 }, { "auxiliary_loss_clip": 0.06519724, "auxiliary_loss_mlp": 0.0127886, "balance_loss_clip": 0.06302257, "balance_loss_mlp": 0.01260884, "epoch": 0.24848940327671726, "flos": 13447899878400.0, "grad_norm": 1.8213778832029144, "language_loss": 0.72078383, "learning_rate": 3.5197807418401386e-06, "loss": 0.79876971, "num_input_tokens_seen": 89047235, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.1796875, "step": 4133, "time_per_iteration": 4.088819742202759 }, { "auxiliary_loss_clip": 0.06530799, "auxiliary_loss_mlp": 0.01276064, "balance_loss_clip": 0.06301779, "balance_loss_mlp": 0.01255798, "epoch": 0.24854952652938525, "flos": 19975683409920.0, "grad_norm": 3.754177148694188, "language_loss": 0.62524092, "learning_rate": 3.5195275429747834e-06, "loss": 0.70330954, "num_input_tokens_seen": 89064790, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.20263672, "step": 4134, "time_per_iteration": 2.55244779586792 }, { "auxiliary_loss_clip": 0.06520164, "auxiliary_loss_mlp": 0.01272829, "balance_loss_clip": 0.06301191, "balance_loss_mlp": 0.01254864, "epoch": 0.24860964978205322, "flos": 18156026217600.0, "grad_norm": 1.8564690961769303, "language_loss": 0.79129297, "learning_rate": 3.5192742864887914e-06, "loss": 0.86922294, "num_input_tokens_seen": 89083250, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.17956543, "step": 4135, "time_per_iteration": 2.5559380054473877 }, { "auxiliary_loss_clip": 0.06519583, "auxiliary_loss_mlp": 0.01274558, "balance_loss_clip": 0.06300414, "balance_loss_mlp": 0.01256593, "epoch": 0.24866977303472118, "flos": 11733397960320.0, "grad_norm": 2.6560619828722385, "language_loss": 0.84096402, "learning_rate": 3.5190209723917662e-06, "loss": 0.9189055, "num_input_tokens_seen": 89100905, "router_z_loss_clip": 2.19042969, "router_z_loss_mlp": 0.1796875, "step": 4136, "time_per_iteration": 2.560894250869751 }, { "auxiliary_loss_clip": 0.0652525, "auxiliary_loss_mlp": 0.01273319, "balance_loss_clip": 0.06301145, "balance_loss_mlp": 0.01254854, "epoch": 0.24872989628738915, "flos": 34832109623040.0, "grad_norm": 1.7921158893224471, "language_loss": 0.72002268, "learning_rate": 3.518767600693314e-06, "loss": 0.79800832, "num_input_tokens_seen": 89122630, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.18457031, "step": 4137, "time_per_iteration": 2.6748592853546143 }, { "auxiliary_loss_clip": 0.06523837, "auxiliary_loss_mlp": 0.01276981, "balance_loss_clip": 0.06300022, "balance_loss_mlp": 0.0126003, "epoch": 0.2487900195400571, "flos": 13704512607360.0, "grad_norm": 1.8545987729453544, "language_loss": 0.67145228, "learning_rate": 3.518514171403042e-06, "loss": 0.74946046, "num_input_tokens_seen": 89141050, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.16955566, "step": 4138, "time_per_iteration": 2.555006742477417 }, { "auxiliary_loss_clip": 0.06521507, "auxiliary_loss_mlp": 0.01273197, "balance_loss_clip": 0.063057, "balance_loss_mlp": 0.01255995, "epoch": 0.24885014279272508, "flos": 25344845009280.0, "grad_norm": 1.7721094443859, "language_loss": 0.84010208, "learning_rate": 3.51826068453056e-06, "loss": 0.9180491, "num_input_tokens_seen": 89160810, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.171875, "step": 4139, "time_per_iteration": 2.6019153594970703 }, { "auxiliary_loss_clip": 0.06531699, "auxiliary_loss_mlp": 0.01279937, "balance_loss_clip": 0.06306729, "balance_loss_mlp": 0.01259815, "epoch": 0.24891026604539307, "flos": 20637724919040.0, "grad_norm": 1.5932315398118615, "language_loss": 0.7957589, "learning_rate": 3.518007140085481e-06, "loss": 0.87387526, "num_input_tokens_seen": 89180610, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.20129395, "step": 4140, "time_per_iteration": 2.692418336868286 }, { "auxiliary_loss_clip": 0.06444091, "auxiliary_loss_mlp": 0.01258283, "balance_loss_clip": 0.06330171, "balance_loss_mlp": 0.01254483, "epoch": 0.24897038929806103, "flos": 66979086030720.0, "grad_norm": 0.7960931665854807, "language_loss": 0.60706329, "learning_rate": 3.51775353807742e-06, "loss": 0.68408704, "num_input_tokens_seen": 89241880, "router_z_loss_clip": 1.140625, "router_z_loss_mlp": 0.03793335, "step": 4141, "time_per_iteration": 3.2564585208892822 }, { "auxiliary_loss_clip": 0.0652941, "auxiliary_loss_mlp": 0.01279579, "balance_loss_clip": 0.06307352, "balance_loss_mlp": 0.01261054, "epoch": 0.249030512550729, "flos": 36401359288320.0, "grad_norm": 1.8219990910094466, "language_loss": 0.73221046, "learning_rate": 3.5174998785159913e-06, "loss": 0.81030041, "num_input_tokens_seen": 89263340, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.18518066, "step": 4142, "time_per_iteration": 2.6925370693206787 }, { "auxiliary_loss_clip": 0.06528068, "auxiliary_loss_mlp": 0.01277137, "balance_loss_clip": 0.06311907, "balance_loss_mlp": 0.01259029, "epoch": 0.24909063580339696, "flos": 20160361808640.0, "grad_norm": 1.9117030178172638, "language_loss": 0.8179087, "learning_rate": 3.5172461614108157e-06, "loss": 0.89596081, "num_input_tokens_seen": 89282870, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.18103027, "step": 4143, "time_per_iteration": 2.6309010982513428 }, { "auxiliary_loss_clip": 0.06528726, "auxiliary_loss_mlp": 0.01276221, "balance_loss_clip": 0.0631021, "balance_loss_mlp": 0.01258054, "epoch": 0.24915075905606493, "flos": 26403887963520.0, "grad_norm": 1.8830412223342219, "language_loss": 0.59357029, "learning_rate": 3.5169923867715137e-06, "loss": 0.67161977, "num_input_tokens_seen": 89303830, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.18164062, "step": 4144, "time_per_iteration": 2.7216458320617676 }, { "auxiliary_loss_clip": 0.06528812, "auxiliary_loss_mlp": 0.01276589, "balance_loss_clip": 0.06311815, "balance_loss_mlp": 0.01258922, "epoch": 0.2492108823087329, "flos": 27534655612800.0, "grad_norm": 2.459137259274819, "language_loss": 0.79366952, "learning_rate": 3.516738554607708e-06, "loss": 0.87172353, "num_input_tokens_seen": 89324350, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.17651367, "step": 4145, "time_per_iteration": 2.6222622394561768 }, { "auxiliary_loss_clip": 0.06539606, "auxiliary_loss_mlp": 0.0127698, "balance_loss_clip": 0.06310558, "balance_loss_mlp": 0.01256679, "epoch": 0.24927100556140086, "flos": 16697088852480.0, "grad_norm": 2.298877882479648, "language_loss": 0.65747917, "learning_rate": 3.5164846649290253e-06, "loss": 0.735645, "num_input_tokens_seen": 89342875, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.20300293, "step": 4146, "time_per_iteration": 2.5674705505371094 }, { "auxiliary_loss_clip": 0.064436, "auxiliary_loss_mlp": 0.01258702, "balance_loss_clip": 0.06329723, "balance_loss_mlp": 0.01255018, "epoch": 0.24933112881406885, "flos": 62791899724800.0, "grad_norm": 0.9671353478320772, "language_loss": 0.67396045, "learning_rate": 3.5162307177450915e-06, "loss": 0.75098354, "num_input_tokens_seen": 89404925, "router_z_loss_clip": 1.140625, "router_z_loss_mlp": 0.03674316, "step": 4147, "time_per_iteration": 3.307387113571167 }, { "auxiliary_loss_clip": 0.06538475, "auxiliary_loss_mlp": 0.01276198, "balance_loss_clip": 0.0631945, "balance_loss_mlp": 0.01258078, "epoch": 0.24939125206673682, "flos": 26659242881280.0, "grad_norm": 1.6537688497818694, "language_loss": 0.89750421, "learning_rate": 3.5159767130655366e-06, "loss": 0.97565097, "num_input_tokens_seen": 89425090, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.18127441, "step": 4148, "time_per_iteration": 2.6081442832946777 }, { "auxiliary_loss_clip": 0.06547078, "auxiliary_loss_mlp": 0.01277933, "balance_loss_clip": 0.0631858, "balance_loss_mlp": 0.01257739, "epoch": 0.24945137531940478, "flos": 20710623571200.0, "grad_norm": 4.33920064708413, "language_loss": 0.68859684, "learning_rate": 3.5157226508999935e-06, "loss": 0.7668469, "num_input_tokens_seen": 89442615, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.20202637, "step": 4149, "time_per_iteration": 2.5723934173583984 }, { "auxiliary_loss_clip": 0.06533833, "auxiliary_loss_mlp": 0.01275839, "balance_loss_clip": 0.06316694, "balance_loss_mlp": 0.01257708, "epoch": 0.24951149857207275, "flos": 23775385708800.0, "grad_norm": 1.834828590642973, "language_loss": 0.71904582, "learning_rate": 3.515468531258095e-06, "loss": 0.79714251, "num_input_tokens_seen": 89463025, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.18127441, "step": 4150, "time_per_iteration": 2.620744466781616 }, { "auxiliary_loss_clip": 0.06536184, "auxiliary_loss_mlp": 0.01276574, "balance_loss_clip": 0.06314279, "balance_loss_mlp": 0.01257882, "epoch": 0.2495716218247407, "flos": 15669589760640.0, "grad_norm": 1.652659910922337, "language_loss": 0.73238057, "learning_rate": 3.515214354149478e-06, "loss": 0.81050813, "num_input_tokens_seen": 89480225, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.18701172, "step": 4151, "time_per_iteration": 2.563369035720825 }, { "auxiliary_loss_clip": 0.06546105, "auxiliary_loss_mlp": 0.01273431, "balance_loss_clip": 0.06317694, "balance_loss_mlp": 0.01253654, "epoch": 0.24963174507740868, "flos": 24057924076800.0, "grad_norm": 3.174844940714229, "language_loss": 0.64645052, "learning_rate": 3.514960119583781e-06, "loss": 0.72464591, "num_input_tokens_seen": 89496985, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.19775391, "step": 4152, "time_per_iteration": 2.58261775970459 }, { "auxiliary_loss_clip": 0.06535764, "auxiliary_loss_mlp": 0.01275269, "balance_loss_clip": 0.06318885, "balance_loss_mlp": 0.01257399, "epoch": 0.24969186833007664, "flos": 21806073924480.0, "grad_norm": 1.927127796276408, "language_loss": 0.78213561, "learning_rate": 3.514705827570645e-06, "loss": 0.86024594, "num_input_tokens_seen": 89514420, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.17871094, "step": 4153, "time_per_iteration": 2.603876829147339 }, { "auxiliary_loss_clip": 0.06540148, "auxiliary_loss_mlp": 0.0127249, "balance_loss_clip": 0.06322029, "balance_loss_mlp": 0.01254895, "epoch": 0.24975199158274464, "flos": 19944307255680.0, "grad_norm": 14.855124609330453, "language_loss": 0.77297741, "learning_rate": 3.514451478119711e-06, "loss": 0.85110378, "num_input_tokens_seen": 89532925, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.17602539, "step": 4154, "time_per_iteration": 2.5646047592163086 }, { "auxiliary_loss_clip": 0.06553791, "auxiliary_loss_mlp": 0.01281874, "balance_loss_clip": 0.06325135, "balance_loss_mlp": 0.01259951, "epoch": 0.2498121148354126, "flos": 25345515841920.0, "grad_norm": 2.0986003534637643, "language_loss": 0.71075326, "learning_rate": 3.5141970712406258e-06, "loss": 0.78910995, "num_input_tokens_seen": 89552855, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.21911621, "step": 4155, "time_per_iteration": 2.6136670112609863 }, { "auxiliary_loss_clip": 0.06552558, "auxiliary_loss_mlp": 0.01278513, "balance_loss_clip": 0.06328369, "balance_loss_mlp": 0.0125913, "epoch": 0.24987223808808057, "flos": 20565119756160.0, "grad_norm": 1.6510647501156492, "language_loss": 0.7560482, "learning_rate": 3.513942606943036e-06, "loss": 0.83435893, "num_input_tokens_seen": 89572830, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.19360352, "step": 4156, "time_per_iteration": 2.5865871906280518 }, { "auxiliary_loss_clip": 0.06545077, "auxiliary_loss_mlp": 0.01278887, "balance_loss_clip": 0.06326363, "balance_loss_mlp": 0.01260767, "epoch": 0.24993236134074853, "flos": 19754052560640.0, "grad_norm": 1.9275611647419868, "language_loss": 0.77700919, "learning_rate": 3.513688085236591e-06, "loss": 0.85524881, "num_input_tokens_seen": 89590345, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.18127441, "step": 4157, "time_per_iteration": 2.567173957824707 }, { "auxiliary_loss_clip": 0.06543615, "auxiliary_loss_mlp": 0.01275423, "balance_loss_clip": 0.06320266, "balance_loss_mlp": 0.0125648, "epoch": 0.2499924845934165, "flos": 18776209812480.0, "grad_norm": 1.7904918380944197, "language_loss": 0.82120758, "learning_rate": 3.513433506130942e-06, "loss": 0.89939797, "num_input_tokens_seen": 89610295, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.18945312, "step": 4158, "time_per_iteration": 2.5713329315185547 }, { "auxiliary_loss_clip": 0.0654888, "auxiliary_loss_mlp": 0.0127718, "balance_loss_clip": 0.06328013, "balance_loss_mlp": 0.01259275, "epoch": 0.25005260784608446, "flos": 16877658401280.0, "grad_norm": 1.7350780294419135, "language_loss": 0.75684261, "learning_rate": 3.5131788696357427e-06, "loss": 0.83510315, "num_input_tokens_seen": 89627795, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.17895508, "step": 4159, "time_per_iteration": 2.5806710720062256 }, { "auxiliary_loss_clip": 0.06546258, "auxiliary_loss_mlp": 0.01275732, "balance_loss_clip": 0.06319585, "balance_loss_mlp": 0.0125611, "epoch": 0.2501127310987524, "flos": 22131057185280.0, "grad_norm": 1.9478047257668258, "language_loss": 0.71477592, "learning_rate": 3.512924175760649e-06, "loss": 0.79299581, "num_input_tokens_seen": 89648090, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.19604492, "step": 4160, "time_per_iteration": 4.01399302482605 }, { "auxiliary_loss_clip": 0.06426668, "auxiliary_loss_mlp": 0.01262038, "balance_loss_clip": 0.06313349, "balance_loss_mlp": 0.01258182, "epoch": 0.2501728543514204, "flos": 69480071170560.0, "grad_norm": 0.7297515442142862, "language_loss": 0.56751758, "learning_rate": 3.5126694245153186e-06, "loss": 0.64440465, "num_input_tokens_seen": 89710345, "router_z_loss_clip": 1.1328125, "router_z_loss_mlp": 0.03851318, "step": 4161, "time_per_iteration": 3.2398433685302734 }, { "auxiliary_loss_clip": 0.06545512, "auxiliary_loss_mlp": 0.01277991, "balance_loss_clip": 0.06318216, "balance_loss_mlp": 0.01258989, "epoch": 0.25023297760408836, "flos": 16295601214080.0, "grad_norm": 1.6479093245560146, "language_loss": 0.8165431, "learning_rate": 3.5124146159094125e-06, "loss": 0.89477813, "num_input_tokens_seen": 89729390, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.18994141, "step": 4162, "time_per_iteration": 2.575122594833374 }, { "auxiliary_loss_clip": 0.06539032, "auxiliary_loss_mlp": 0.01275299, "balance_loss_clip": 0.06312119, "balance_loss_mlp": 0.01256189, "epoch": 0.2502931008567563, "flos": 12242598422400.0, "grad_norm": 2.127469631581566, "language_loss": 0.87956274, "learning_rate": 3.5121597499525927e-06, "loss": 0.95770609, "num_input_tokens_seen": 89742805, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.19116211, "step": 4163, "time_per_iteration": 2.526024341583252 }, { "auxiliary_loss_clip": 0.065403, "auxiliary_loss_mlp": 0.01274349, "balance_loss_clip": 0.06316312, "balance_loss_mlp": 0.01255204, "epoch": 0.25035322410942434, "flos": 23188003787520.0, "grad_norm": 1.6611271395655864, "language_loss": 0.83647567, "learning_rate": 3.5119048266545232e-06, "loss": 0.91462213, "num_input_tokens_seen": 89761145, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.19128418, "step": 4164, "time_per_iteration": 2.5864126682281494 }, { "auxiliary_loss_clip": 0.06531657, "auxiliary_loss_mlp": 0.01286444, "balance_loss_clip": 0.06317452, "balance_loss_mlp": 0.0126904, "epoch": 0.2504133473620923, "flos": 20922904690560.0, "grad_norm": 1.9574113469395025, "language_loss": 0.74704921, "learning_rate": 3.5116498460248716e-06, "loss": 0.82523024, "num_input_tokens_seen": 89780905, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.17382812, "step": 4165, "time_per_iteration": 4.026110649108887 }, { "auxiliary_loss_clip": 0.06541495, "auxiliary_loss_mlp": 0.01273818, "balance_loss_clip": 0.06314494, "balance_loss_mlp": 0.01255603, "epoch": 0.2504734706147603, "flos": 20782725609600.0, "grad_norm": 1.6197697233396926, "language_loss": 0.74591184, "learning_rate": 3.5113948080733062e-06, "loss": 0.82406497, "num_input_tokens_seen": 89799230, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.18225098, "step": 4166, "time_per_iteration": 2.594423294067383 }, { "auxiliary_loss_clip": 0.06529357, "auxiliary_loss_mlp": 0.01280441, "balance_loss_clip": 0.06310752, "balance_loss_mlp": 0.01261248, "epoch": 0.25053359386742824, "flos": 24355681960320.0, "grad_norm": 2.603280418237422, "language_loss": 0.82258415, "learning_rate": 3.5111397128094973e-06, "loss": 0.90068221, "num_input_tokens_seen": 89818240, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.19189453, "step": 4167, "time_per_iteration": 2.598820924758911 }, { "auxiliary_loss_clip": 0.06525166, "auxiliary_loss_mlp": 0.01276729, "balance_loss_clip": 0.06308403, "balance_loss_mlp": 0.01258169, "epoch": 0.2505937171200962, "flos": 21220578720000.0, "grad_norm": 2.070054627784518, "language_loss": 0.80350554, "learning_rate": 3.51088456024312e-06, "loss": 0.8815245, "num_input_tokens_seen": 89834485, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.18566895, "step": 4168, "time_per_iteration": 2.5751748085021973 }, { "auxiliary_loss_clip": 0.0654355, "auxiliary_loss_mlp": 0.0127891, "balance_loss_clip": 0.06315734, "balance_loss_mlp": 0.01258621, "epoch": 0.25065384037276417, "flos": 41436816802560.0, "grad_norm": 3.5280589468927035, "language_loss": 0.70471609, "learning_rate": 3.510629350383849e-06, "loss": 0.78294075, "num_input_tokens_seen": 89855645, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.20288086, "step": 4169, "time_per_iteration": 2.746149778366089 }, { "auxiliary_loss_clip": 0.06516986, "auxiliary_loss_mlp": 0.01278046, "balance_loss_clip": 0.06303218, "balance_loss_mlp": 0.01259998, "epoch": 0.25071396362543213, "flos": 26109274608000.0, "grad_norm": 1.7344265755744348, "language_loss": 0.78234065, "learning_rate": 3.510374083241361e-06, "loss": 0.860291, "num_input_tokens_seen": 89874895, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.18029785, "step": 4170, "time_per_iteration": 2.6005921363830566 }, { "auxiliary_loss_clip": 0.06526531, "auxiliary_loss_mlp": 0.01279556, "balance_loss_clip": 0.0630848, "balance_loss_mlp": 0.01260721, "epoch": 0.2507740868781001, "flos": 19105008433920.0, "grad_norm": 2.9517427453532274, "language_loss": 0.77219927, "learning_rate": 3.5101187588253368e-06, "loss": 0.85026014, "num_input_tokens_seen": 89891700, "router_z_loss_clip": 2.18261719, "router_z_loss_mlp": 0.18835449, "step": 4171, "time_per_iteration": 3.8735344409942627 }, { "auxiliary_loss_clip": 0.06429017, "auxiliary_loss_mlp": 0.01256395, "balance_loss_clip": 0.06316572, "balance_loss_mlp": 0.01251889, "epoch": 0.25083421013076806, "flos": 64361652514560.0, "grad_norm": 0.8025823941908006, "language_loss": 0.60015488, "learning_rate": 3.509863377145458e-06, "loss": 0.67700899, "num_input_tokens_seen": 89955775, "router_z_loss_clip": 1.125, "router_z_loss_mlp": 0.04516602, "step": 4172, "time_per_iteration": 3.1983516216278076 }, { "auxiliary_loss_clip": 0.0652997, "auxiliary_loss_mlp": 0.01281161, "balance_loss_clip": 0.06310897, "balance_loss_mlp": 0.01262303, "epoch": 0.25089433338343603, "flos": 24286430960640.0, "grad_norm": 1.5140402746085582, "language_loss": 0.79697436, "learning_rate": 3.509607938211409e-06, "loss": 0.87508571, "num_input_tokens_seen": 89977150, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.18847656, "step": 4173, "time_per_iteration": 4.033652067184448 }, { "auxiliary_loss_clip": 0.06528303, "auxiliary_loss_mlp": 0.01276259, "balance_loss_clip": 0.06309887, "balance_loss_mlp": 0.01258771, "epoch": 0.250954456636104, "flos": 14726896600320.0, "grad_norm": 2.5594778619328125, "language_loss": 0.84376657, "learning_rate": 3.509352442032875e-06, "loss": 0.92181218, "num_input_tokens_seen": 89994925, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.17480469, "step": 4174, "time_per_iteration": 2.5482635498046875 }, { "auxiliary_loss_clip": 0.06534356, "auxiliary_loss_mlp": 0.01279666, "balance_loss_clip": 0.06312481, "balance_loss_mlp": 0.01260771, "epoch": 0.25101457988877196, "flos": 22280208652800.0, "grad_norm": 2.1542346155038348, "language_loss": 0.72310424, "learning_rate": 3.509096888619545e-06, "loss": 0.80124438, "num_input_tokens_seen": 90013235, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.18908691, "step": 4175, "time_per_iteration": 2.59110426902771 }, { "auxiliary_loss_clip": 0.06537801, "auxiliary_loss_mlp": 0.0127688, "balance_loss_clip": 0.06313558, "balance_loss_mlp": 0.01256662, "epoch": 0.2510747031414399, "flos": 25195441979520.0, "grad_norm": 2.5857019674331747, "language_loss": 0.80661529, "learning_rate": 3.50884127798111e-06, "loss": 0.88476205, "num_input_tokens_seen": 90032150, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.20227051, "step": 4176, "time_per_iteration": 2.613696336746216 }, { "auxiliary_loss_clip": 0.0652342, "auxiliary_loss_mlp": 0.01279928, "balance_loss_clip": 0.06304714, "balance_loss_mlp": 0.01260032, "epoch": 0.25113482639410795, "flos": 20710455863040.0, "grad_norm": 1.9613484870141613, "language_loss": 0.83428895, "learning_rate": 3.5085856101272623e-06, "loss": 0.9123224, "num_input_tokens_seen": 90049085, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.19909668, "step": 4177, "time_per_iteration": 2.561772346496582 }, { "auxiliary_loss_clip": 0.06528492, "auxiliary_loss_mlp": 0.0127443, "balance_loss_clip": 0.06311601, "balance_loss_mlp": 0.01254915, "epoch": 0.2511949496467759, "flos": 21513347285760.0, "grad_norm": 3.25156216402432, "language_loss": 0.83837801, "learning_rate": 3.508329885067698e-06, "loss": 0.91640723, "num_input_tokens_seen": 90067695, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.19494629, "step": 4178, "time_per_iteration": 2.5787529945373535 }, { "auxiliary_loss_clip": 0.06523974, "auxiliary_loss_mlp": 0.01275812, "balance_loss_clip": 0.06307085, "balance_loss_mlp": 0.01257215, "epoch": 0.2512550728994439, "flos": 20707898313600.0, "grad_norm": 2.3412492881357867, "language_loss": 0.76105058, "learning_rate": 3.508074102812112e-06, "loss": 0.83904845, "num_input_tokens_seen": 90083890, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.18603516, "step": 4179, "time_per_iteration": 2.546607732772827 }, { "auxiliary_loss_clip": 0.06529562, "auxiliary_loss_mlp": 0.01275838, "balance_loss_clip": 0.06307866, "balance_loss_mlp": 0.01256133, "epoch": 0.25131519615211184, "flos": 18484531349760.0, "grad_norm": 1.9690860242387676, "language_loss": 0.70520687, "learning_rate": 3.507818263370206e-06, "loss": 0.78326094, "num_input_tokens_seen": 90100995, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.19714355, "step": 4180, "time_per_iteration": 2.5500707626342773 }, { "auxiliary_loss_clip": 0.06518443, "auxiliary_loss_mlp": 0.01277022, "balance_loss_clip": 0.06303151, "balance_loss_mlp": 0.01259295, "epoch": 0.2513753194047798, "flos": 20491131000960.0, "grad_norm": 2.1568964014106298, "language_loss": 0.86480427, "learning_rate": 3.5075623667516796e-06, "loss": 0.94275892, "num_input_tokens_seen": 90120365, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.17712402, "step": 4181, "time_per_iteration": 2.569939136505127 }, { "auxiliary_loss_clip": 0.06525597, "auxiliary_loss_mlp": 0.01272051, "balance_loss_clip": 0.06308596, "balance_loss_mlp": 0.01253991, "epoch": 0.25143544265744777, "flos": 37679182053120.0, "grad_norm": 2.106523910404514, "language_loss": 0.68403637, "learning_rate": 3.507306412966238e-06, "loss": 0.76201284, "num_input_tokens_seen": 90142610, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.18066406, "step": 4182, "time_per_iteration": 2.7161571979522705 }, { "auxiliary_loss_clip": 0.06411974, "auxiliary_loss_mlp": 0.01260881, "balance_loss_clip": 0.06300326, "balance_loss_mlp": 0.01256536, "epoch": 0.25149556591011574, "flos": 69386502487680.0, "grad_norm": 0.861079857942156, "language_loss": 0.70147544, "learning_rate": 3.5070504020235853e-06, "loss": 0.77820396, "num_input_tokens_seen": 90200555, "router_z_loss_clip": 1.1171875, "router_z_loss_mlp": 0.04351807, "step": 4183, "time_per_iteration": 3.216487169265747 }, { "auxiliary_loss_clip": 0.06523224, "auxiliary_loss_mlp": 0.01276472, "balance_loss_clip": 0.06304872, "balance_loss_mlp": 0.0125784, "epoch": 0.2515556891627837, "flos": 13995478310400.0, "grad_norm": 1.742878953535354, "language_loss": 0.74835277, "learning_rate": 3.506794333933431e-06, "loss": 0.82634974, "num_input_tokens_seen": 90218120, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.18640137, "step": 4184, "time_per_iteration": 2.55863618850708 }, { "auxiliary_loss_clip": 0.06527728, "auxiliary_loss_mlp": 0.01279446, "balance_loss_clip": 0.06309514, "balance_loss_mlp": 0.01259765, "epoch": 0.25161581241545167, "flos": 22170022133760.0, "grad_norm": 1.6701002134070853, "language_loss": 0.83040297, "learning_rate": 3.506538208705484e-06, "loss": 0.90847468, "num_input_tokens_seen": 90236790, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.19677734, "step": 4185, "time_per_iteration": 2.599621295928955 }, { "auxiliary_loss_clip": 0.06421131, "auxiliary_loss_mlp": 0.0126009, "balance_loss_clip": 0.06309491, "balance_loss_mlp": 0.01256267, "epoch": 0.25167593566811963, "flos": 69375936873600.0, "grad_norm": 0.7637468783545891, "language_loss": 0.61389732, "learning_rate": 3.5062820263494574e-06, "loss": 0.69070947, "num_input_tokens_seen": 90297070, "router_z_loss_clip": 1.1171875, "router_z_loss_mlp": 0.03817749, "step": 4186, "time_per_iteration": 3.188384532928467 }, { "auxiliary_loss_clip": 0.06520672, "auxiliary_loss_mlp": 0.01277915, "balance_loss_clip": 0.06302448, "balance_loss_mlp": 0.01259247, "epoch": 0.2517360589207876, "flos": 13266533715840.0, "grad_norm": 2.938953972861522, "language_loss": 0.79751122, "learning_rate": 3.5060257868750656e-06, "loss": 0.8754971, "num_input_tokens_seen": 90315255, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.18676758, "step": 4187, "time_per_iteration": 2.699650287628174 }, { "auxiliary_loss_clip": 0.06519622, "auxiliary_loss_mlp": 0.01279582, "balance_loss_clip": 0.06305466, "balance_loss_mlp": 0.01261355, "epoch": 0.25179618217345556, "flos": 20383208542080.0, "grad_norm": 1.5129403056812145, "language_loss": 0.80608898, "learning_rate": 3.5057694902920244e-06, "loss": 0.88408101, "num_input_tokens_seen": 90334990, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.18237305, "step": 4188, "time_per_iteration": 2.5728800296783447 }, { "auxiliary_loss_clip": 0.06521676, "auxiliary_loss_mlp": 0.01279821, "balance_loss_clip": 0.06306192, "balance_loss_mlp": 0.0126194, "epoch": 0.25185630542612353, "flos": 27670767770880.0, "grad_norm": 2.1600905842338745, "language_loss": 0.75036108, "learning_rate": 3.5055131366100534e-06, "loss": 0.82837605, "num_input_tokens_seen": 90351825, "router_z_loss_clip": 2.15722656, "router_z_loss_mlp": 0.17883301, "step": 4189, "time_per_iteration": 2.6099722385406494 }, { "auxiliary_loss_clip": 0.06517597, "auxiliary_loss_mlp": 0.01273954, "balance_loss_clip": 0.06306712, "balance_loss_mlp": 0.01257765, "epoch": 0.25191642867879155, "flos": 21002805158400.0, "grad_norm": 5.104118463667084, "language_loss": 0.85145956, "learning_rate": 3.5052567258388745e-06, "loss": 0.92937505, "num_input_tokens_seen": 90369860, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.1619873, "step": 4190, "time_per_iteration": 2.627995252609253 }, { "auxiliary_loss_clip": 0.0653076, "auxiliary_loss_mlp": 0.01278268, "balance_loss_clip": 0.06312642, "balance_loss_mlp": 0.01258289, "epoch": 0.2519765519314595, "flos": 21112027355520.0, "grad_norm": 1.732055812505734, "language_loss": 0.75841278, "learning_rate": 3.5050002579882082e-06, "loss": 0.83650303, "num_input_tokens_seen": 90389245, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.1998291, "step": 4191, "time_per_iteration": 2.684187650680542 }, { "auxiliary_loss_clip": 0.06406126, "auxiliary_loss_mlp": 0.01252867, "balance_loss_clip": 0.06294628, "balance_loss_mlp": 0.01248376, "epoch": 0.2520366751841275, "flos": 62765932158720.0, "grad_norm": 0.6992878862139088, "language_loss": 0.56888926, "learning_rate": 3.5047437330677823e-06, "loss": 0.6454792, "num_input_tokens_seen": 90456735, "router_z_loss_clip": 1.109375, "router_z_loss_mlp": 0.04501343, "step": 4192, "time_per_iteration": 3.2944507598876953 }, { "auxiliary_loss_clip": 0.06519239, "auxiliary_loss_mlp": 0.01274386, "balance_loss_clip": 0.06307602, "balance_loss_mlp": 0.012571, "epoch": 0.25209679843679544, "flos": 22236254386560.0, "grad_norm": 2.0834888694707763, "language_loss": 0.76715398, "learning_rate": 3.504487151087323e-06, "loss": 0.84509027, "num_input_tokens_seen": 90474165, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.17285156, "step": 4193, "time_per_iteration": 2.6042380332946777 }, { "auxiliary_loss_clip": 0.06525085, "auxiliary_loss_mlp": 0.01275296, "balance_loss_clip": 0.0630419, "balance_loss_mlp": 0.01256627, "epoch": 0.2521569216894634, "flos": 12171502632960.0, "grad_norm": 2.3554473784070997, "language_loss": 0.84214854, "learning_rate": 3.5042305120565598e-06, "loss": 0.92015231, "num_input_tokens_seen": 90491660, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.18676758, "step": 4194, "time_per_iteration": 2.594128370285034 }, { "auxiliary_loss_clip": 0.06529236, "auxiliary_loss_mlp": 0.01277659, "balance_loss_clip": 0.06308638, "balance_loss_mlp": 0.0125886, "epoch": 0.2522170449421314, "flos": 23707182885120.0, "grad_norm": 1.4283208188129737, "language_loss": 0.887474, "learning_rate": 3.5039738159852253e-06, "loss": 0.96554291, "num_input_tokens_seen": 90514025, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.18786621, "step": 4195, "time_per_iteration": 2.6347367763519287 }, { "auxiliary_loss_clip": 0.06528284, "auxiliary_loss_mlp": 0.01278647, "balance_loss_clip": 0.06308497, "balance_loss_mlp": 0.01258405, "epoch": 0.25227716819479934, "flos": 20961073025280.0, "grad_norm": 1.7956055872996486, "language_loss": 0.86749303, "learning_rate": 3.503717062883053e-06, "loss": 0.94556236, "num_input_tokens_seen": 90533530, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.20239258, "step": 4196, "time_per_iteration": 2.6009750366210938 }, { "auxiliary_loss_clip": 0.06525639, "auxiliary_loss_mlp": 0.01282487, "balance_loss_clip": 0.06308141, "balance_loss_mlp": 0.01265273, "epoch": 0.2523372914474673, "flos": 23338077649920.0, "grad_norm": 2.223066401735871, "language_loss": 0.83771956, "learning_rate": 3.5034602527597786e-06, "loss": 0.91580081, "num_input_tokens_seen": 90554025, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.17211914, "step": 4197, "time_per_iteration": 2.6425771713256836 }, { "auxiliary_loss_clip": 0.06529959, "auxiliary_loss_mlp": 0.01280968, "balance_loss_clip": 0.06309733, "balance_loss_mlp": 0.01261322, "epoch": 0.25239741470013527, "flos": 36978217522560.0, "grad_norm": 1.8332570119501899, "language_loss": 0.7308321, "learning_rate": 3.5032033856251405e-06, "loss": 0.80894136, "num_input_tokens_seen": 90576930, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.19628906, "step": 4198, "time_per_iteration": 2.757308006286621 }, { "auxiliary_loss_clip": 0.06539118, "auxiliary_loss_mlp": 0.01282529, "balance_loss_clip": 0.0631301, "balance_loss_mlp": 0.01263241, "epoch": 0.25245753795280323, "flos": 18521777289600.0, "grad_norm": 2.13925554067436, "language_loss": 0.77126884, "learning_rate": 3.50294646148888e-06, "loss": 0.84948534, "num_input_tokens_seen": 90595710, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.19299316, "step": 4199, "time_per_iteration": 4.051465034484863 }, { "auxiliary_loss_clip": 0.06533039, "auxiliary_loss_mlp": 0.01277905, "balance_loss_clip": 0.06310487, "balance_loss_mlp": 0.01259976, "epoch": 0.2525176612054712, "flos": 32353387741440.0, "grad_norm": 2.7963687508793624, "language_loss": 0.73985988, "learning_rate": 3.502689480360739e-06, "loss": 0.81796932, "num_input_tokens_seen": 90617945, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.17932129, "step": 4200, "time_per_iteration": 2.691354990005493 }, { "auxiliary_loss_clip": 0.06532608, "auxiliary_loss_mlp": 0.01280651, "balance_loss_clip": 0.06312254, "balance_loss_mlp": 0.01262626, "epoch": 0.25257778445813917, "flos": 45268440307200.0, "grad_norm": 1.6451182521313745, "language_loss": 0.83076614, "learning_rate": 3.5024324422504616e-06, "loss": 0.90889871, "num_input_tokens_seen": 90640855, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.18005371, "step": 4201, "time_per_iteration": 2.7868058681488037 }, { "auxiliary_loss_clip": 0.06534209, "auxiliary_loss_mlp": 0.01281687, "balance_loss_clip": 0.06311299, "balance_loss_mlp": 0.01263222, "epoch": 0.25263790771080713, "flos": 23374526976000.0, "grad_norm": 3.2683413889247417, "language_loss": 0.75256556, "learning_rate": 3.5021753471677965e-06, "loss": 0.8307246, "num_input_tokens_seen": 90661350, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.18457031, "step": 4202, "time_per_iteration": 2.637213706970215 }, { "auxiliary_loss_clip": 0.06527035, "auxiliary_loss_mlp": 0.01279744, "balance_loss_clip": 0.06312255, "balance_loss_mlp": 0.01261147, "epoch": 0.25269803096347515, "flos": 18520938748800.0, "grad_norm": 1.7511837826399026, "language_loss": 0.73928702, "learning_rate": 3.501918195122491e-06, "loss": 0.81735486, "num_input_tokens_seen": 90680540, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.18591309, "step": 4203, "time_per_iteration": 2.5480940341949463 }, { "auxiliary_loss_clip": 0.06536435, "auxiliary_loss_mlp": 0.01274228, "balance_loss_clip": 0.06314768, "balance_loss_mlp": 0.01255429, "epoch": 0.2527581542161431, "flos": 24617870985600.0, "grad_norm": 1.5852229670841396, "language_loss": 0.78395879, "learning_rate": 3.501660986124297e-06, "loss": 0.86206537, "num_input_tokens_seen": 90703460, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.18798828, "step": 4204, "time_per_iteration": 4.090890645980835 }, { "auxiliary_loss_clip": 0.06528109, "auxiliary_loss_mlp": 0.01282485, "balance_loss_clip": 0.06308389, "balance_loss_mlp": 0.01264186, "epoch": 0.2528182774688111, "flos": 12646266266880.0, "grad_norm": 2.6838608951617187, "language_loss": 0.72945046, "learning_rate": 3.5014037201829684e-06, "loss": 0.80755639, "num_input_tokens_seen": 90718815, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.1829834, "step": 4205, "time_per_iteration": 2.5290658473968506 }, { "auxiliary_loss_clip": 0.0652043, "auxiliary_loss_mlp": 0.01276456, "balance_loss_clip": 0.0631071, "balance_loss_mlp": 0.01260064, "epoch": 0.25287840072147905, "flos": 46947331440000.0, "grad_norm": 1.2635040913649718, "language_loss": 0.76159894, "learning_rate": 3.50114639730826e-06, "loss": 0.83956778, "num_input_tokens_seen": 90742125, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.16394043, "step": 4206, "time_per_iteration": 2.8159430027008057 }, { "auxiliary_loss_clip": 0.06527723, "auxiliary_loss_mlp": 0.01281131, "balance_loss_clip": 0.06310134, "balance_loss_mlp": 0.0126387, "epoch": 0.252938523974147, "flos": 18885641644800.0, "grad_norm": 1.657922066796538, "language_loss": 0.79793382, "learning_rate": 3.5008890175099296e-06, "loss": 0.87602234, "num_input_tokens_seen": 90760785, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.17272949, "step": 4207, "time_per_iteration": 2.5589547157287598 }, { "auxiliary_loss_clip": 0.06525486, "auxiliary_loss_mlp": 0.01274971, "balance_loss_clip": 0.06310913, "balance_loss_mlp": 0.01257412, "epoch": 0.252998647226815, "flos": 21441245247360.0, "grad_norm": 1.4017129968039717, "language_loss": 0.76817048, "learning_rate": 3.5006315807977375e-06, "loss": 0.84617507, "num_input_tokens_seen": 90780045, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.17553711, "step": 4208, "time_per_iteration": 2.583739757537842 }, { "auxiliary_loss_clip": 0.06522486, "auxiliary_loss_mlp": 0.0127519, "balance_loss_clip": 0.06310444, "balance_loss_mlp": 0.01258513, "epoch": 0.25305877047948294, "flos": 25448365128960.0, "grad_norm": 1.8204089256205582, "language_loss": 0.70049119, "learning_rate": 3.5003740871814456e-06, "loss": 0.77846789, "num_input_tokens_seen": 90797980, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.16674805, "step": 4209, "time_per_iteration": 2.6134793758392334 }, { "auxiliary_loss_clip": 0.06404801, "auxiliary_loss_mlp": 0.0126218, "balance_loss_clip": 0.06294228, "balance_loss_mlp": 0.01258157, "epoch": 0.2531188937321509, "flos": 60205213457280.0, "grad_norm": 0.7275441511156797, "language_loss": 0.55076283, "learning_rate": 3.5001165366708175e-06, "loss": 0.62743264, "num_input_tokens_seen": 90864865, "router_z_loss_clip": 1.109375, "router_z_loss_mlp": 0.04022217, "step": 4210, "time_per_iteration": 4.655597686767578 }, { "auxiliary_loss_clip": 0.06525344, "auxiliary_loss_mlp": 0.01281726, "balance_loss_clip": 0.06308534, "balance_loss_mlp": 0.01264392, "epoch": 0.25317901698481887, "flos": 19688449213440.0, "grad_norm": 1.819970676111773, "language_loss": 0.80772674, "learning_rate": 3.4998589292756204e-06, "loss": 0.8857975, "num_input_tokens_seen": 90882885, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.17346191, "step": 4211, "time_per_iteration": 2.5541927814483643 }, { "auxiliary_loss_clip": 0.06516001, "auxiliary_loss_mlp": 0.01269727, "balance_loss_clip": 0.06305321, "balance_loss_mlp": 0.01253932, "epoch": 0.25323914023748684, "flos": 24431012380800.0, "grad_norm": 1.9893797211970055, "language_loss": 0.79024744, "learning_rate": 3.499601265005622e-06, "loss": 0.8681047, "num_input_tokens_seen": 90902985, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.15783691, "step": 4212, "time_per_iteration": 4.026504755020142 }, { "auxiliary_loss_clip": 0.06521203, "auxiliary_loss_mlp": 0.01275532, "balance_loss_clip": 0.06302588, "balance_loss_mlp": 0.0125814, "epoch": 0.2532992634901548, "flos": 25454528403840.0, "grad_norm": 1.9218186968511815, "language_loss": 0.53922385, "learning_rate": 3.4993435438705938e-06, "loss": 0.6171912, "num_input_tokens_seen": 90923550, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.1739502, "step": 4213, "time_per_iteration": 2.611571788787842 }, { "auxiliary_loss_clip": 0.06522657, "auxiliary_loss_mlp": 0.01274667, "balance_loss_clip": 0.06306091, "balance_loss_mlp": 0.01257251, "epoch": 0.25335938674282277, "flos": 18886605966720.0, "grad_norm": 2.7511434392189527, "language_loss": 0.65826392, "learning_rate": 3.499085765880308e-06, "loss": 0.73623717, "num_input_tokens_seen": 90943260, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.17419434, "step": 4214, "time_per_iteration": 2.5853748321533203 }, { "auxiliary_loss_clip": 0.06403616, "auxiliary_loss_mlp": 0.01261869, "balance_loss_clip": 0.06292337, "balance_loss_mlp": 0.01258012, "epoch": 0.25341950999549073, "flos": 53079692025600.0, "grad_norm": 0.824202618514611, "language_loss": 0.57915485, "learning_rate": 3.4988279310445396e-06, "loss": 0.6558097, "num_input_tokens_seen": 90996295, "router_z_loss_clip": 1.109375, "router_z_loss_mlp": 0.03851318, "step": 4215, "time_per_iteration": 2.9297239780426025 }, { "auxiliary_loss_clip": 0.06515405, "auxiliary_loss_mlp": 0.01273937, "balance_loss_clip": 0.06302991, "balance_loss_mlp": 0.01256854, "epoch": 0.2534796332481587, "flos": 39029609980800.0, "grad_norm": 1.8797266981420797, "language_loss": 0.83948117, "learning_rate": 3.498570039373066e-06, "loss": 0.91737461, "num_input_tokens_seen": 91017545, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.17077637, "step": 4216, "time_per_iteration": 2.729405164718628 }, { "auxiliary_loss_clip": 0.06518264, "auxiliary_loss_mlp": 0.0127861, "balance_loss_clip": 0.06303378, "balance_loss_mlp": 0.01261873, "epoch": 0.2535397565008267, "flos": 23593809911040.0, "grad_norm": 2.3353130396103157, "language_loss": 0.81200862, "learning_rate": 3.498312090875666e-06, "loss": 0.88997734, "num_input_tokens_seen": 91037715, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.1673584, "step": 4217, "time_per_iteration": 2.586817741394043 }, { "auxiliary_loss_clip": 0.06516115, "auxiliary_loss_mlp": 0.01275943, "balance_loss_clip": 0.06302424, "balance_loss_mlp": 0.01258908, "epoch": 0.2535998797534947, "flos": 19287422772480.0, "grad_norm": 4.686504896060369, "language_loss": 0.7503885, "learning_rate": 3.4980540855621218e-06, "loss": 0.82830906, "num_input_tokens_seen": 91055295, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.17053223, "step": 4218, "time_per_iteration": 2.601195812225342 }, { "auxiliary_loss_clip": 0.06520426, "auxiliary_loss_mlp": 0.01281186, "balance_loss_clip": 0.06305054, "balance_loss_mlp": 0.01263293, "epoch": 0.25366000300616265, "flos": 24031201824000.0, "grad_norm": 1.6840116492091777, "language_loss": 0.74879605, "learning_rate": 3.4977960234422167e-06, "loss": 0.82681221, "num_input_tokens_seen": 91075485, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.17895508, "step": 4219, "time_per_iteration": 2.6281137466430664 }, { "auxiliary_loss_clip": 0.06525005, "auxiliary_loss_mlp": 0.01277851, "balance_loss_clip": 0.06308269, "balance_loss_mlp": 0.01260208, "epoch": 0.2537201262588306, "flos": 16294888454400.0, "grad_norm": 1.851773990767565, "language_loss": 0.8184588, "learning_rate": 3.497537904525736e-06, "loss": 0.89648736, "num_input_tokens_seen": 91093620, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.1763916, "step": 4220, "time_per_iteration": 2.5711631774902344 }, { "auxiliary_loss_clip": 0.06521092, "auxiliary_loss_mlp": 0.01275612, "balance_loss_clip": 0.06303535, "balance_loss_mlp": 0.01258625, "epoch": 0.2537802495114986, "flos": 23301376761600.0, "grad_norm": 2.0473507299125577, "language_loss": 0.71350372, "learning_rate": 3.497279728822468e-06, "loss": 0.79147077, "num_input_tokens_seen": 91114110, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.17004395, "step": 4221, "time_per_iteration": 2.595878839492798 }, { "auxiliary_loss_clip": 0.06521098, "auxiliary_loss_mlp": 0.01276425, "balance_loss_clip": 0.06303165, "balance_loss_mlp": 0.01258687, "epoch": 0.25384037276416654, "flos": 17644855184640.0, "grad_norm": 3.2782666801955767, "language_loss": 0.6232636, "learning_rate": 3.497021496342202e-06, "loss": 0.70123887, "num_input_tokens_seen": 91133135, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.17724609, "step": 4222, "time_per_iteration": 2.6058080196380615 }, { "auxiliary_loss_clip": 0.06520645, "auxiliary_loss_mlp": 0.01278413, "balance_loss_clip": 0.06303033, "balance_loss_mlp": 0.01260686, "epoch": 0.2539004960168345, "flos": 21513473066880.0, "grad_norm": 1.7464083979347573, "language_loss": 0.75307328, "learning_rate": 3.496763207094731e-06, "loss": 0.83106387, "num_input_tokens_seen": 91151805, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.17724609, "step": 4223, "time_per_iteration": 2.5791330337524414 }, { "auxiliary_loss_clip": 0.0650907, "auxiliary_loss_mlp": 0.01275383, "balance_loss_clip": 0.06299471, "balance_loss_mlp": 0.01259397, "epoch": 0.2539606192695025, "flos": 23957632339200.0, "grad_norm": 2.9253615851709305, "language_loss": 0.8039794, "learning_rate": 3.49650486108985e-06, "loss": 0.88182396, "num_input_tokens_seen": 91172270, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.15979004, "step": 4224, "time_per_iteration": 2.5924830436706543 }, { "auxiliary_loss_clip": 0.06512746, "auxiliary_loss_mlp": 0.01278495, "balance_loss_clip": 0.06299093, "balance_loss_mlp": 0.01261245, "epoch": 0.25402074252217044, "flos": 24176537930880.0, "grad_norm": 1.8287497469504688, "language_loss": 0.77519357, "learning_rate": 3.496246458337354e-06, "loss": 0.85310602, "num_input_tokens_seen": 91192080, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.17248535, "step": 4225, "time_per_iteration": 2.597278594970703 }, { "auxiliary_loss_clip": 0.06512975, "auxiliary_loss_mlp": 0.01284949, "balance_loss_clip": 0.0629947, "balance_loss_mlp": 0.01267866, "epoch": 0.2540808657748384, "flos": 22309320746880.0, "grad_norm": 1.6920838611132567, "language_loss": 0.85540891, "learning_rate": 3.4959879988470426e-06, "loss": 0.93338823, "num_input_tokens_seen": 91211450, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.17077637, "step": 4226, "time_per_iteration": 2.5931732654571533 }, { "auxiliary_loss_clip": 0.06512275, "auxiliary_loss_mlp": 0.01277413, "balance_loss_clip": 0.06298514, "balance_loss_mlp": 0.012612, "epoch": 0.25414098902750637, "flos": 27606883432320.0, "grad_norm": 1.437801699072145, "language_loss": 0.71465361, "learning_rate": 3.4957294826287164e-06, "loss": 0.7925505, "num_input_tokens_seen": 91231835, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.16210938, "step": 4227, "time_per_iteration": 2.6403472423553467 }, { "auxiliary_loss_clip": 0.06396808, "auxiliary_loss_mlp": 0.0127482, "balance_loss_clip": 0.06283893, "balance_loss_mlp": 0.01269074, "epoch": 0.25420111228017434, "flos": 58188760951680.0, "grad_norm": 0.9661187774230242, "language_loss": 0.618375, "learning_rate": 3.4954709096921785e-06, "loss": 0.69509125, "num_input_tokens_seen": 91288755, "router_z_loss_clip": 1.125, "router_z_loss_mlp": 0.05740356, "step": 4228, "time_per_iteration": 3.0267832279205322 }, { "auxiliary_loss_clip": 0.06514116, "auxiliary_loss_mlp": 0.0127697, "balance_loss_clip": 0.06298443, "balance_loss_mlp": 0.01259172, "epoch": 0.2542612355328423, "flos": 11467645136640.0, "grad_norm": 2.302832173678163, "language_loss": 0.8744756, "learning_rate": 3.4952122800472336e-06, "loss": 0.95238644, "num_input_tokens_seen": 91302485, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.17785645, "step": 4229, "time_per_iteration": 2.5391764640808105 }, { "auxiliary_loss_clip": 0.06520638, "auxiliary_loss_mlp": 0.01274788, "balance_loss_clip": 0.06302928, "balance_loss_mlp": 0.01257193, "epoch": 0.2543213587855103, "flos": 22972452359040.0, "grad_norm": 1.8146246225223064, "language_loss": 0.77408242, "learning_rate": 3.4949535937036892e-06, "loss": 0.8520366, "num_input_tokens_seen": 91321120, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.17590332, "step": 4230, "time_per_iteration": 2.6328623294830322 }, { "auxiliary_loss_clip": 0.0651052, "auxiliary_loss_mlp": 0.01278706, "balance_loss_clip": 0.06296784, "balance_loss_mlp": 0.01261564, "epoch": 0.2543814820381783, "flos": 18257953109760.0, "grad_norm": 2.4185575730283926, "language_loss": 0.75273901, "learning_rate": 3.4946948506713544e-06, "loss": 0.83063132, "num_input_tokens_seen": 91338575, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.17150879, "step": 4231, "time_per_iteration": 2.556267023086548 }, { "auxiliary_loss_clip": 0.06510555, "auxiliary_loss_mlp": 0.01274465, "balance_loss_clip": 0.06298001, "balance_loss_mlp": 0.01258348, "epoch": 0.25444160529084625, "flos": 15638129752320.0, "grad_norm": 1.8448202135859737, "language_loss": 0.74477595, "learning_rate": 3.4944360509600416e-06, "loss": 0.82262611, "num_input_tokens_seen": 91357355, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.16113281, "step": 4232, "time_per_iteration": 2.5716776847839355 }, { "auxiliary_loss_clip": 0.06514408, "auxiliary_loss_mlp": 0.01279989, "balance_loss_clip": 0.06302378, "balance_loss_mlp": 0.0126268, "epoch": 0.2545017285435142, "flos": 24607431152640.0, "grad_norm": 1.8986150069499437, "language_loss": 0.87379462, "learning_rate": 3.4941771945795637e-06, "loss": 0.9517386, "num_input_tokens_seen": 91376515, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.17321777, "step": 4233, "time_per_iteration": 2.6240108013153076 }, { "auxiliary_loss_clip": 0.0650505, "auxiliary_loss_mlp": 0.01274341, "balance_loss_clip": 0.062985, "balance_loss_mlp": 0.01258296, "epoch": 0.2545618517961822, "flos": 24685654538880.0, "grad_norm": 1.9364440474507487, "language_loss": 0.75103509, "learning_rate": 3.493918281539737e-06, "loss": 0.82882905, "num_input_tokens_seen": 91397595, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.16052246, "step": 4234, "time_per_iteration": 2.654946804046631 }, { "auxiliary_loss_clip": 0.06519476, "auxiliary_loss_mlp": 0.01278001, "balance_loss_clip": 0.06303132, "balance_loss_mlp": 0.01261574, "epoch": 0.25462197504885015, "flos": 23921937699840.0, "grad_norm": 1.6103745873682975, "language_loss": 0.75515079, "learning_rate": 3.493659311850379e-06, "loss": 0.83312553, "num_input_tokens_seen": 91417775, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.1640625, "step": 4235, "time_per_iteration": 2.676790237426758 }, { "auxiliary_loss_clip": 0.06532308, "auxiliary_loss_mlp": 0.01281174, "balance_loss_clip": 0.06304526, "balance_loss_mlp": 0.01262005, "epoch": 0.2546820983015181, "flos": 24796134547200.0, "grad_norm": 2.071274398217683, "language_loss": 0.65438795, "learning_rate": 3.4934002855213106e-06, "loss": 0.73252273, "num_input_tokens_seen": 91437665, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.19177246, "step": 4236, "time_per_iteration": 2.6281189918518066 }, { "auxiliary_loss_clip": 0.06514833, "auxiliary_loss_mlp": 0.0127159, "balance_loss_clip": 0.06302777, "balance_loss_mlp": 0.01256069, "epoch": 0.2547422215541861, "flos": 18740095902720.0, "grad_norm": 2.883620152723534, "language_loss": 0.67514497, "learning_rate": 3.493141202562354e-06, "loss": 0.7530092, "num_input_tokens_seen": 91456705, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.15527344, "step": 4237, "time_per_iteration": 2.6573407649993896 }, { "auxiliary_loss_clip": 0.0651571, "auxiliary_loss_mlp": 0.01273059, "balance_loss_clip": 0.06301519, "balance_loss_mlp": 0.01255583, "epoch": 0.25480234480685404, "flos": 21038751360000.0, "grad_norm": 2.1323595666842023, "language_loss": 0.7587955, "learning_rate": 3.492882062983333e-06, "loss": 0.83668315, "num_input_tokens_seen": 91475535, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.17492676, "step": 4238, "time_per_iteration": 2.6196227073669434 }, { "auxiliary_loss_clip": 0.06520933, "auxiliary_loss_mlp": 0.01273354, "balance_loss_clip": 0.06304637, "balance_loss_mlp": 0.01256593, "epoch": 0.254862468059522, "flos": 25089112748160.0, "grad_norm": 2.501131907196794, "language_loss": 0.80902421, "learning_rate": 3.492622866794074e-06, "loss": 0.88696706, "num_input_tokens_seen": 91499140, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.16760254, "step": 4239, "time_per_iteration": 4.014068841934204 }, { "auxiliary_loss_clip": 0.06517745, "auxiliary_loss_mlp": 0.01275257, "balance_loss_clip": 0.0630688, "balance_loss_mlp": 0.01258091, "epoch": 0.25492259131219, "flos": 20564658558720.0, "grad_norm": 2.1922838834282667, "language_loss": 0.77977854, "learning_rate": 3.492363614004407e-06, "loss": 0.85770851, "num_input_tokens_seen": 91518335, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.17163086, "step": 4240, "time_per_iteration": 2.6036789417266846 }, { "auxiliary_loss_clip": 0.0652291, "auxiliary_loss_mlp": 0.01271799, "balance_loss_clip": 0.06304391, "balance_loss_mlp": 0.01254299, "epoch": 0.25498271456485794, "flos": 25048889988480.0, "grad_norm": 1.8804512073234274, "language_loss": 0.83751202, "learning_rate": 3.492104304624162e-06, "loss": 0.91545916, "num_input_tokens_seen": 91537655, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.17480469, "step": 4241, "time_per_iteration": 2.603874683380127 }, { "auxiliary_loss_clip": 0.06521646, "auxiliary_loss_mlp": 0.01274773, "balance_loss_clip": 0.06309101, "balance_loss_mlp": 0.01258036, "epoch": 0.2550428378175259, "flos": 26185820912640.0, "grad_norm": 1.524233491692905, "language_loss": 0.73689109, "learning_rate": 3.4918449386631725e-06, "loss": 0.81485534, "num_input_tokens_seen": 91557545, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.1673584, "step": 4242, "time_per_iteration": 2.642305374145508 }, { "auxiliary_loss_clip": 0.06521682, "auxiliary_loss_mlp": 0.01274393, "balance_loss_clip": 0.06306529, "balance_loss_mlp": 0.0125827, "epoch": 0.2551029610701939, "flos": 15272420607360.0, "grad_norm": 2.155358774621667, "language_loss": 0.725173, "learning_rate": 3.491585516131273e-06, "loss": 0.80313373, "num_input_tokens_seen": 91574405, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.16119385, "step": 4243, "time_per_iteration": 4.024171352386475 }, { "auxiliary_loss_clip": 0.06516779, "auxiliary_loss_mlp": 0.01274324, "balance_loss_clip": 0.06304393, "balance_loss_mlp": 0.01258803, "epoch": 0.2551630843228619, "flos": 18117774028800.0, "grad_norm": 1.6639036149740756, "language_loss": 0.8187989, "learning_rate": 3.491326037038301e-06, "loss": 0.89670992, "num_input_tokens_seen": 91593755, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.15539551, "step": 4244, "time_per_iteration": 2.5664472579956055 }, { "auxiliary_loss_clip": 0.06456824, "auxiliary_loss_mlp": 0.01262824, "balance_loss_clip": 0.06344981, "balance_loss_mlp": 0.01258866, "epoch": 0.25522320757552985, "flos": 70543055266560.0, "grad_norm": 0.675784397795258, "language_loss": 0.57613778, "learning_rate": 3.4910665013940967e-06, "loss": 0.65333426, "num_input_tokens_seen": 91660335, "router_z_loss_clip": 1.1171875, "router_z_loss_mlp": 0.03955078, "step": 4245, "time_per_iteration": 3.30998158454895 }, { "auxiliary_loss_clip": 0.06531489, "auxiliary_loss_mlp": 0.01272302, "balance_loss_clip": 0.06313007, "balance_loss_mlp": 0.01255422, "epoch": 0.2552833308281978, "flos": 22899679488000.0, "grad_norm": 2.0569383665790393, "language_loss": 0.65859342, "learning_rate": 3.4908069092085015e-06, "loss": 0.73663139, "num_input_tokens_seen": 91678500, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.16870117, "step": 4246, "time_per_iteration": 2.5965259075164795 }, { "auxiliary_loss_clip": 0.06507382, "auxiliary_loss_mlp": 0.01279669, "balance_loss_clip": 0.06301302, "balance_loss_mlp": 0.01263075, "epoch": 0.2553434540808658, "flos": 22060003322880.0, "grad_norm": 1.6678891256613628, "language_loss": 0.81835473, "learning_rate": 3.4905472604913585e-06, "loss": 0.89622533, "num_input_tokens_seen": 91696430, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.1661377, "step": 4247, "time_per_iteration": 2.581709146499634 }, { "auxiliary_loss_clip": 0.06532992, "auxiliary_loss_mlp": 0.01276491, "balance_loss_clip": 0.06310653, "balance_loss_mlp": 0.01257203, "epoch": 0.25540357733353375, "flos": 16549656393600.0, "grad_norm": 3.904710291620065, "language_loss": 0.84272861, "learning_rate": 3.490287555252514e-06, "loss": 0.92082345, "num_input_tokens_seen": 91713270, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.19299316, "step": 4248, "time_per_iteration": 2.5571672916412354 }, { "auxiliary_loss_clip": 0.0652266, "auxiliary_loss_mlp": 0.01274497, "balance_loss_clip": 0.06307504, "balance_loss_mlp": 0.01257331, "epoch": 0.2554637005862017, "flos": 17570531013120.0, "grad_norm": 1.8856458052549674, "language_loss": 0.84523249, "learning_rate": 3.4900277935018166e-06, "loss": 0.92320406, "num_input_tokens_seen": 91728865, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.17175293, "step": 4249, "time_per_iteration": 2.544496774673462 }, { "auxiliary_loss_clip": 0.06426406, "auxiliary_loss_mlp": 0.0125398, "balance_loss_clip": 0.06315731, "balance_loss_mlp": 0.01249757, "epoch": 0.2555238238388697, "flos": 72263441698560.0, "grad_norm": 0.7255229317278261, "language_loss": 0.56192291, "learning_rate": 3.489767975249115e-06, "loss": 0.63872677, "num_input_tokens_seen": 91787470, "router_z_loss_clip": 1.109375, "router_z_loss_mlp": 0.04226685, "step": 4250, "time_per_iteration": 4.624901294708252 }, { "auxiliary_loss_clip": 0.06517244, "auxiliary_loss_mlp": 0.01277083, "balance_loss_clip": 0.06301773, "balance_loss_mlp": 0.01258856, "epoch": 0.25558394709153764, "flos": 24396323990400.0, "grad_norm": 1.88110383517462, "language_loss": 0.81027985, "learning_rate": 3.4895081005042632e-06, "loss": 0.88822305, "num_input_tokens_seen": 91805640, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.18225098, "step": 4251, "time_per_iteration": 2.608802080154419 }, { "auxiliary_loss_clip": 0.06419322, "auxiliary_loss_mlp": 0.01257782, "balance_loss_clip": 0.0630964, "balance_loss_mlp": 0.01253735, "epoch": 0.2556440703442056, "flos": 69251857776000.0, "grad_norm": 0.7709059810540282, "language_loss": 0.66096461, "learning_rate": 3.4892481692771146e-06, "loss": 0.73773557, "num_input_tokens_seen": 91869695, "router_z_loss_clip": 1.09375, "router_z_loss_mlp": 0.04046631, "step": 4252, "time_per_iteration": 4.696612358093262 }, { "auxiliary_loss_clip": 0.06506709, "auxiliary_loss_mlp": 0.01277499, "balance_loss_clip": 0.06296333, "balance_loss_mlp": 0.0126168, "epoch": 0.2557041935968736, "flos": 24870919916160.0, "grad_norm": 2.4163255165317867, "language_loss": 0.74224645, "learning_rate": 3.4889881815775267e-06, "loss": 0.82008851, "num_input_tokens_seen": 91889920, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.1583252, "step": 4253, "time_per_iteration": 2.6066458225250244 }, { "auxiliary_loss_clip": 0.06504092, "auxiliary_loss_mlp": 0.01277705, "balance_loss_clip": 0.06293286, "balance_loss_mlp": 0.01260515, "epoch": 0.25576431684954154, "flos": 22498694974080.0, "grad_norm": 2.4771859501687494, "language_loss": 0.74089587, "learning_rate": 3.488728137415357e-06, "loss": 0.81871378, "num_input_tokens_seen": 91908665, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.17199707, "step": 4254, "time_per_iteration": 2.604635715484619 }, { "auxiliary_loss_clip": 0.06513207, "auxiliary_loss_mlp": 0.01274927, "balance_loss_clip": 0.06299401, "balance_loss_mlp": 0.01257797, "epoch": 0.2558244401022095, "flos": 19832569436160.0, "grad_norm": 10.659183446166002, "language_loss": 0.81468832, "learning_rate": 3.4884680368004675e-06, "loss": 0.89256966, "num_input_tokens_seen": 91927855, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.17138672, "step": 4255, "time_per_iteration": 2.5639102458953857 }, { "auxiliary_loss_clip": 0.06509065, "auxiliary_loss_mlp": 0.01274864, "balance_loss_clip": 0.06297191, "balance_loss_mlp": 0.01258079, "epoch": 0.2558845633548775, "flos": 23226968736000.0, "grad_norm": 1.3318193831149483, "language_loss": 0.85602224, "learning_rate": 3.488207879742721e-06, "loss": 0.93386149, "num_input_tokens_seen": 91948500, "router_z_loss_clip": 2.11816406, "router_z_loss_mlp": 0.16784668, "step": 4256, "time_per_iteration": 2.5934832096099854 }, { "auxiliary_loss_clip": 0.06516286, "auxiliary_loss_mlp": 0.01281868, "balance_loss_clip": 0.06298055, "balance_loss_mlp": 0.01264535, "epoch": 0.2559446866075455, "flos": 16843682770560.0, "grad_norm": 1.5800418827472251, "language_loss": 0.75453794, "learning_rate": 3.4879476662519826e-06, "loss": 0.83251953, "num_input_tokens_seen": 91968375, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.17333984, "step": 4257, "time_per_iteration": 2.573211193084717 }, { "auxiliary_loss_clip": 0.06398547, "auxiliary_loss_mlp": 0.01260301, "balance_loss_clip": 0.0628839, "balance_loss_mlp": 0.01256165, "epoch": 0.25600480986021346, "flos": 57612741258240.0, "grad_norm": 0.7733659223356867, "language_loss": 0.65104604, "learning_rate": 3.4876873963381196e-06, "loss": 0.72763455, "num_input_tokens_seen": 92028490, "router_z_loss_clip": 1.10058594, "router_z_loss_mlp": 0.04138184, "step": 4258, "time_per_iteration": 3.1804323196411133 }, { "auxiliary_loss_clip": 0.06494518, "auxiliary_loss_mlp": 0.01276692, "balance_loss_clip": 0.06289733, "balance_loss_mlp": 0.01259323, "epoch": 0.2560649331128814, "flos": 27827088762240.0, "grad_norm": 1.5588210154953217, "language_loss": 0.76723981, "learning_rate": 3.4874270700110013e-06, "loss": 0.84495187, "num_input_tokens_seen": 92048060, "router_z_loss_clip": 2.04589844, "router_z_loss_mlp": 0.17358398, "step": 4259, "time_per_iteration": 2.6262669563293457 }, { "auxiliary_loss_clip": 0.06394804, "auxiliary_loss_mlp": 0.01257726, "balance_loss_clip": 0.06285445, "balance_loss_mlp": 0.01253998, "epoch": 0.2561250563655494, "flos": 70972187552640.0, "grad_norm": 0.765931299340686, "language_loss": 0.58468693, "learning_rate": 3.4871666872804994e-06, "loss": 0.66121221, "num_input_tokens_seen": 92118180, "router_z_loss_clip": 1.09375, "router_z_loss_mlp": 0.03720093, "step": 4260, "time_per_iteration": 3.308403968811035 }, { "auxiliary_loss_clip": 0.06506532, "auxiliary_loss_mlp": 0.01274349, "balance_loss_clip": 0.0629514, "balance_loss_mlp": 0.01257421, "epoch": 0.25618517961821735, "flos": 27018998386560.0, "grad_norm": 2.0273935888913983, "language_loss": 0.77243, "learning_rate": 3.4869062481564875e-06, "loss": 0.8502388, "num_input_tokens_seen": 92137570, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.16931152, "step": 4261, "time_per_iteration": 2.6069839000701904 }, { "auxiliary_loss_clip": 0.06506938, "auxiliary_loss_mlp": 0.01277073, "balance_loss_clip": 0.06295174, "balance_loss_mlp": 0.01260503, "epoch": 0.2562453028708853, "flos": 23073708418560.0, "grad_norm": 1.5634285684998663, "language_loss": 0.83208603, "learning_rate": 3.486645752648842e-06, "loss": 0.90992612, "num_input_tokens_seen": 92157625, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.16564941, "step": 4262, "time_per_iteration": 2.6421051025390625 }, { "auxiliary_loss_clip": 0.0651633, "auxiliary_loss_mlp": 0.01271626, "balance_loss_clip": 0.06299601, "balance_loss_mlp": 0.01254222, "epoch": 0.2563054261235533, "flos": 15126120178560.0, "grad_norm": 2.232322012671371, "language_loss": 0.74338353, "learning_rate": 3.4863852007674405e-06, "loss": 0.82126307, "num_input_tokens_seen": 92175350, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.17407227, "step": 4263, "time_per_iteration": 2.537257194519043 }, { "auxiliary_loss_clip": 0.06508441, "auxiliary_loss_mlp": 0.01271247, "balance_loss_clip": 0.06300889, "balance_loss_mlp": 0.01254736, "epoch": 0.25636554937622125, "flos": 27862238350080.0, "grad_norm": 1.6195031427255633, "language_loss": 0.83456802, "learning_rate": 3.486124592522163e-06, "loss": 0.91236496, "num_input_tokens_seen": 92196070, "router_z_loss_clip": 2.07324219, "router_z_loss_mlp": 0.16491699, "step": 4264, "time_per_iteration": 2.6402180194854736 }, { "auxiliary_loss_clip": 0.06512339, "auxiliary_loss_mlp": 0.01271205, "balance_loss_clip": 0.06299302, "balance_loss_mlp": 0.01253216, "epoch": 0.2564256726288892, "flos": 28912979750400.0, "grad_norm": 1.7365317434327228, "language_loss": 0.7490859, "learning_rate": 3.4858639279228924e-06, "loss": 0.82692134, "num_input_tokens_seen": 92216310, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.18005371, "step": 4265, "time_per_iteration": 2.632277727127075 }, { "auxiliary_loss_clip": 0.06508985, "auxiliary_loss_mlp": 0.01272194, "balance_loss_clip": 0.06295452, "balance_loss_mlp": 0.01255528, "epoch": 0.2564857958815572, "flos": 18520812967680.0, "grad_norm": 1.7034839172985585, "language_loss": 0.82501328, "learning_rate": 3.485603206979513e-06, "loss": 0.90282512, "num_input_tokens_seen": 92234510, "router_z_loss_clip": 2.13574219, "router_z_loss_mlp": 0.16662598, "step": 4266, "time_per_iteration": 2.5925557613372803 }, { "auxiliary_loss_clip": 0.06502424, "auxiliary_loss_mlp": 0.01276563, "balance_loss_clip": 0.06293929, "balance_loss_mlp": 0.01260339, "epoch": 0.25654591913422514, "flos": 25814745106560.0, "grad_norm": 1.4768341609800937, "language_loss": 0.79711127, "learning_rate": 3.4853424297019103e-06, "loss": 0.87490118, "num_input_tokens_seen": 92254070, "router_z_loss_clip": 2.08496094, "router_z_loss_mlp": 0.16235352, "step": 4267, "time_per_iteration": 2.609799861907959 }, { "auxiliary_loss_clip": 0.06495235, "auxiliary_loss_mlp": 0.01282476, "balance_loss_clip": 0.06292515, "balance_loss_mlp": 0.01265775, "epoch": 0.2566060423868931, "flos": 19105805047680.0, "grad_norm": 1.5794866340863924, "language_loss": 0.79468417, "learning_rate": 3.4850815960999736e-06, "loss": 0.87246132, "num_input_tokens_seen": 92275060, "router_z_loss_clip": 2.02636719, "router_z_loss_mlp": 0.16687012, "step": 4268, "time_per_iteration": 2.6084296703338623 }, { "auxiliary_loss_clip": 0.06505181, "auxiliary_loss_mlp": 0.01278053, "balance_loss_clip": 0.062936, "balance_loss_mlp": 0.01261412, "epoch": 0.25666616563956113, "flos": 23849584099200.0, "grad_norm": 2.3540567085600905, "language_loss": 0.68650544, "learning_rate": 3.484820706183595e-06, "loss": 0.76433778, "num_input_tokens_seen": 92293610, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.16625977, "step": 4269, "time_per_iteration": 2.6528522968292236 }, { "auxiliary_loss_clip": 0.0651356, "auxiliary_loss_mlp": 0.01276166, "balance_loss_clip": 0.06299402, "balance_loss_mlp": 0.0125832, "epoch": 0.2567262888922291, "flos": 14608366600320.0, "grad_norm": 3.0153857276315077, "language_loss": 0.79876, "learning_rate": 3.484559759962666e-06, "loss": 0.87665725, "num_input_tokens_seen": 92308305, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.17858887, "step": 4270, "time_per_iteration": 2.535935401916504 }, { "auxiliary_loss_clip": 0.065191, "auxiliary_loss_mlp": 0.01276208, "balance_loss_clip": 0.06299374, "balance_loss_mlp": 0.01257516, "epoch": 0.25678641214489706, "flos": 32930791027200.0, "grad_norm": 2.280398003365961, "language_loss": 0.68007052, "learning_rate": 3.4842987574470816e-06, "loss": 0.75802362, "num_input_tokens_seen": 92329875, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.18701172, "step": 4271, "time_per_iteration": 2.691084623336792 }, { "auxiliary_loss_clip": 0.06511902, "auxiliary_loss_mlp": 0.01273205, "balance_loss_clip": 0.06295069, "balance_loss_mlp": 0.01256135, "epoch": 0.256846535397565, "flos": 24106029120000.0, "grad_norm": 1.4193749211027198, "language_loss": 0.87622404, "learning_rate": 3.4840376986467403e-06, "loss": 0.9540751, "num_input_tokens_seen": 92348780, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.1706543, "step": 4272, "time_per_iteration": 2.620328903198242 }, { "auxiliary_loss_clip": 0.06510759, "auxiliary_loss_mlp": 0.01275636, "balance_loss_clip": 0.06297125, "balance_loss_mlp": 0.01257529, "epoch": 0.256906658650233, "flos": 19724437342080.0, "grad_norm": 1.7374971989920251, "language_loss": 0.81704509, "learning_rate": 3.483776583571541e-06, "loss": 0.89490908, "num_input_tokens_seen": 92368175, "router_z_loss_clip": 2.13769531, "router_z_loss_mlp": 0.1809082, "step": 4273, "time_per_iteration": 2.600846290588379 }, { "auxiliary_loss_clip": 0.06505424, "auxiliary_loss_mlp": 0.01271945, "balance_loss_clip": 0.06298731, "balance_loss_mlp": 0.01255732, "epoch": 0.25696678190290095, "flos": 22932019964160.0, "grad_norm": 1.5067844277513756, "language_loss": 0.77763742, "learning_rate": 3.4835154122313846e-06, "loss": 0.85541105, "num_input_tokens_seen": 92387755, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.16210938, "step": 4274, "time_per_iteration": 2.614583969116211 }, { "auxiliary_loss_clip": 0.06499504, "auxiliary_loss_mlp": 0.01271386, "balance_loss_clip": 0.06293195, "balance_loss_mlp": 0.01253862, "epoch": 0.2570269051555689, "flos": 27315163042560.0, "grad_norm": 1.6747132549361126, "language_loss": 0.84133106, "learning_rate": 3.4832541846361743e-06, "loss": 0.91903996, "num_input_tokens_seen": 92409850, "router_z_loss_clip": 2.06542969, "router_z_loss_mlp": 0.17504883, "step": 4275, "time_per_iteration": 2.6220602989196777 }, { "auxiliary_loss_clip": 0.06509905, "auxiliary_loss_mlp": 0.01270652, "balance_loss_clip": 0.06295395, "balance_loss_mlp": 0.01253354, "epoch": 0.2570870284082369, "flos": 27570811449600.0, "grad_norm": 2.0482272712073106, "language_loss": 0.78765231, "learning_rate": 3.4829929007958175e-06, "loss": 0.86545789, "num_input_tokens_seen": 92431250, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.17285156, "step": 4276, "time_per_iteration": 2.630460262298584 }, { "auxiliary_loss_clip": 0.06505974, "auxiliary_loss_mlp": 0.01278091, "balance_loss_clip": 0.06295572, "balance_loss_mlp": 0.0126108, "epoch": 0.25714715166090485, "flos": 28738405768320.0, "grad_norm": 1.8350710240813635, "language_loss": 0.80059266, "learning_rate": 3.4827315607202214e-06, "loss": 0.87843335, "num_input_tokens_seen": 92452065, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.17016602, "step": 4277, "time_per_iteration": 2.6270744800567627 }, { "auxiliary_loss_clip": 0.06504172, "auxiliary_loss_mlp": 0.01271579, "balance_loss_clip": 0.06293301, "balance_loss_mlp": 0.01255652, "epoch": 0.2572072749135728, "flos": 20121606495360.0, "grad_norm": 1.9500443427847705, "language_loss": 0.7952624, "learning_rate": 3.482470164419295e-06, "loss": 0.87301987, "num_input_tokens_seen": 92470025, "router_z_loss_clip": 2.10839844, "router_z_loss_mlp": 0.15917969, "step": 4278, "time_per_iteration": 3.9684431552886963 }, { "auxiliary_loss_clip": 0.06514508, "auxiliary_loss_mlp": 0.01273057, "balance_loss_clip": 0.0629933, "balance_loss_mlp": 0.0125607, "epoch": 0.2572673981662408, "flos": 26037969183360.0, "grad_norm": 1.9680662531058064, "language_loss": 0.7555387, "learning_rate": 3.482208711902952e-06, "loss": 0.83341432, "num_input_tokens_seen": 92489825, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.1697998, "step": 4279, "time_per_iteration": 2.6010899543762207 }, { "auxiliary_loss_clip": 0.06510951, "auxiliary_loss_mlp": 0.01277936, "balance_loss_clip": 0.06298031, "balance_loss_mlp": 0.01260388, "epoch": 0.25732752141890874, "flos": 16112054845440.0, "grad_norm": 1.99279883877908, "language_loss": 0.86774862, "learning_rate": 3.4819472031811065e-06, "loss": 0.94563746, "num_input_tokens_seen": 92507270, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.17541504, "step": 4280, "time_per_iteration": 2.766796350479126 }, { "auxiliary_loss_clip": 0.06510893, "auxiliary_loss_mlp": 0.01272404, "balance_loss_clip": 0.06297066, "balance_loss_mlp": 0.01254177, "epoch": 0.2573876446715767, "flos": 22530322690560.0, "grad_norm": 2.479981671267962, "language_loss": 0.79470754, "learning_rate": 3.4816856382636744e-06, "loss": 0.87254053, "num_input_tokens_seen": 92526300, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.18225098, "step": 4281, "time_per_iteration": 2.588242769241333 }, { "auxiliary_loss_clip": 0.06512433, "auxiliary_loss_mlp": 0.01273929, "balance_loss_clip": 0.06300288, "balance_loss_mlp": 0.01256704, "epoch": 0.2574477679242447, "flos": 23957548485120.0, "grad_norm": 1.663128043802257, "language_loss": 0.87986141, "learning_rate": 3.4814240171605737e-06, "loss": 0.95772505, "num_input_tokens_seen": 92546465, "router_z_loss_clip": 2.12011719, "router_z_loss_mlp": 0.17236328, "step": 4282, "time_per_iteration": 2.6246416568756104 }, { "auxiliary_loss_clip": 0.06512891, "auxiliary_loss_mlp": 0.01273435, "balance_loss_clip": 0.06299433, "balance_loss_mlp": 0.01257044, "epoch": 0.2575078911769127, "flos": 21988278627840.0, "grad_norm": 1.4711749565671874, "language_loss": 0.71203649, "learning_rate": 3.4811623398817267e-06, "loss": 0.78989971, "num_input_tokens_seen": 92567260, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.16394043, "step": 4283, "time_per_iteration": 4.232388019561768 }, { "auxiliary_loss_clip": 0.06503083, "auxiliary_loss_mlp": 0.01273828, "balance_loss_clip": 0.06297874, "balance_loss_mlp": 0.01257699, "epoch": 0.25756801442958066, "flos": 21951997009920.0, "grad_norm": 1.7599924566945255, "language_loss": 0.81015533, "learning_rate": 3.4809006064370553e-06, "loss": 0.88792443, "num_input_tokens_seen": 92585425, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.16125488, "step": 4284, "time_per_iteration": 2.687720775604248 }, { "auxiliary_loss_clip": 0.06503468, "auxiliary_loss_mlp": 0.01271287, "balance_loss_clip": 0.06292358, "balance_loss_mlp": 0.01255695, "epoch": 0.2576281376822486, "flos": 35270675493120.0, "grad_norm": 1.9538822415121213, "language_loss": 0.70922554, "learning_rate": 3.4806388168364835e-06, "loss": 0.78697312, "num_input_tokens_seen": 92604770, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.15588379, "step": 4285, "time_per_iteration": 2.7192835807800293 }, { "auxiliary_loss_clip": 0.06505788, "auxiliary_loss_mlp": 0.01272509, "balance_loss_clip": 0.06295571, "balance_loss_mlp": 0.01256916, "epoch": 0.2576882609349166, "flos": 14136705567360.0, "grad_norm": 1.9091576334154676, "language_loss": 0.59302354, "learning_rate": 3.4803769710899402e-06, "loss": 0.67080647, "num_input_tokens_seen": 92622635, "router_z_loss_clip": 2.10253906, "router_z_loss_mlp": 0.15576172, "step": 4286, "time_per_iteration": 2.5858426094055176 }, { "auxiliary_loss_clip": 0.06503969, "auxiliary_loss_mlp": 0.01273084, "balance_loss_clip": 0.06290037, "balance_loss_mlp": 0.01256395, "epoch": 0.25774838418758456, "flos": 23265053216640.0, "grad_norm": 1.8030600264683783, "language_loss": 0.65268886, "learning_rate": 3.480115069207354e-06, "loss": 0.73045939, "num_input_tokens_seen": 92642960, "router_z_loss_clip": 2.14160156, "router_z_loss_mlp": 0.16687012, "step": 4287, "time_per_iteration": 2.6176774501800537 }, { "auxiliary_loss_clip": 0.0650519, "auxiliary_loss_mlp": 0.0127672, "balance_loss_clip": 0.06290649, "balance_loss_mlp": 0.01259041, "epoch": 0.2578085074402525, "flos": 22608378368640.0, "grad_norm": 1.700672958204323, "language_loss": 0.7210108, "learning_rate": 3.4798531111986557e-06, "loss": 0.79882991, "num_input_tokens_seen": 92662455, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.17675781, "step": 4288, "time_per_iteration": 2.58963942527771 }, { "auxiliary_loss_clip": 0.06503595, "auxiliary_loss_mlp": 0.01276254, "balance_loss_clip": 0.06295079, "balance_loss_mlp": 0.01260197, "epoch": 0.2578686306929205, "flos": 24578780256000.0, "grad_norm": 1.9085454218454974, "language_loss": 0.77453756, "learning_rate": 3.4795910970737786e-06, "loss": 0.85233605, "num_input_tokens_seen": 92683520, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.16064453, "step": 4289, "time_per_iteration": 4.034632205963135 }, { "auxiliary_loss_clip": 0.06504482, "auxiliary_loss_mlp": 0.01279561, "balance_loss_clip": 0.06295554, "balance_loss_mlp": 0.01262491, "epoch": 0.25792875394558845, "flos": 18119828453760.0, "grad_norm": 1.9640309214636864, "language_loss": 0.85120356, "learning_rate": 3.4793290268426592e-06, "loss": 0.92904401, "num_input_tokens_seen": 92701450, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.1706543, "step": 4290, "time_per_iteration": 2.5402164459228516 }, { "auxiliary_loss_clip": 0.06515373, "auxiliary_loss_mlp": 0.01282283, "balance_loss_clip": 0.06301716, "balance_loss_mlp": 0.01264938, "epoch": 0.2579888771982564, "flos": 17718760085760.0, "grad_norm": 1.6828789854514037, "language_loss": 0.72845304, "learning_rate": 3.4790669005152354e-06, "loss": 0.80642956, "num_input_tokens_seen": 92720355, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.17333984, "step": 4291, "time_per_iteration": 4.019816160202026 }, { "auxiliary_loss_clip": 0.06511899, "auxiliary_loss_mlp": 0.01272758, "balance_loss_clip": 0.06298096, "balance_loss_mlp": 0.0125589, "epoch": 0.2580490004509244, "flos": 16440350342400.0, "grad_norm": 2.6141398831549343, "language_loss": 0.81581098, "learning_rate": 3.4788047181014458e-06, "loss": 0.89365757, "num_input_tokens_seen": 92736755, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.16870117, "step": 4292, "time_per_iteration": 2.607675313949585 }, { "auxiliary_loss_clip": 0.0650459, "auxiliary_loss_mlp": 0.01281866, "balance_loss_clip": 0.06295248, "balance_loss_mlp": 0.01264557, "epoch": 0.25810912370359235, "flos": 33842946574080.0, "grad_norm": 3.0979510589613932, "language_loss": 0.67815781, "learning_rate": 3.4785424796112337e-06, "loss": 0.75602233, "num_input_tokens_seen": 92757655, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.17321777, "step": 4293, "time_per_iteration": 2.6606087684631348 }, { "auxiliary_loss_clip": 0.06502315, "auxiliary_loss_mlp": 0.01272869, "balance_loss_clip": 0.06297462, "balance_loss_mlp": 0.01257026, "epoch": 0.2581692469562603, "flos": 25199257340160.0, "grad_norm": 8.613035614562246, "language_loss": 0.75692827, "learning_rate": 3.478280185054542e-06, "loss": 0.83468008, "num_input_tokens_seen": 92776100, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.1583252, "step": 4294, "time_per_iteration": 2.6833789348602295 }, { "auxiliary_loss_clip": 0.06501219, "auxiliary_loss_mlp": 0.01279777, "balance_loss_clip": 0.06292991, "balance_loss_mlp": 0.01263589, "epoch": 0.2582293702089283, "flos": 34940619060480.0, "grad_norm": 1.8935920450471986, "language_loss": 0.81837213, "learning_rate": 3.478017834441318e-06, "loss": 0.89618212, "num_input_tokens_seen": 92798880, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.1619873, "step": 4295, "time_per_iteration": 2.684030055999756 }, { "auxiliary_loss_clip": 0.0651072, "auxiliary_loss_mlp": 0.01274968, "balance_loss_clip": 0.06296202, "balance_loss_mlp": 0.01257206, "epoch": 0.2582894934615963, "flos": 26841028314240.0, "grad_norm": 2.1972146470799228, "language_loss": 0.7308023, "learning_rate": 3.4777554277815096e-06, "loss": 0.8086592, "num_input_tokens_seen": 92817750, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.1776123, "step": 4296, "time_per_iteration": 2.663404941558838 }, { "auxiliary_loss_clip": 0.06507987, "auxiliary_loss_mlp": 0.01273859, "balance_loss_clip": 0.0629482, "balance_loss_mlp": 0.01256312, "epoch": 0.25834961671426426, "flos": 23522252924160.0, "grad_norm": 1.845770543301498, "language_loss": 0.86820281, "learning_rate": 3.477492965085067e-06, "loss": 0.94602126, "num_input_tokens_seen": 92837995, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.17541504, "step": 4297, "time_per_iteration": 2.588751792907715 }, { "auxiliary_loss_clip": 0.06500991, "auxiliary_loss_mlp": 0.01274684, "balance_loss_clip": 0.06290077, "balance_loss_mlp": 0.01258889, "epoch": 0.25840973996693223, "flos": 22456837059840.0, "grad_norm": 1.7432303537462148, "language_loss": 0.84641659, "learning_rate": 3.477230446361943e-06, "loss": 0.92417341, "num_input_tokens_seen": 92857245, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.15783691, "step": 4298, "time_per_iteration": 2.5793299674987793 }, { "auxiliary_loss_clip": 0.0650824, "auxiliary_loss_mlp": 0.01271416, "balance_loss_clip": 0.06297575, "balance_loss_mlp": 0.01254596, "epoch": 0.2584698632196002, "flos": 11295544849920.0, "grad_norm": 3.4567012901620178, "language_loss": 0.85041338, "learning_rate": 3.4769678716220927e-06, "loss": 0.9282099, "num_input_tokens_seen": 92873265, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.16821289, "step": 4299, "time_per_iteration": 2.560791254043579 }, { "auxiliary_loss_clip": 0.06501572, "auxiliary_loss_mlp": 0.01273419, "balance_loss_clip": 0.06295258, "balance_loss_mlp": 0.0125698, "epoch": 0.25852998647226816, "flos": 17935569325440.0, "grad_norm": 2.791771549633581, "language_loss": 0.83593225, "learning_rate": 3.4767052408754726e-06, "loss": 0.9136821, "num_input_tokens_seen": 92890880, "router_z_loss_clip": 2.06347656, "router_z_loss_mlp": 0.16442871, "step": 4300, "time_per_iteration": 2.567060947418213 }, { "auxiliary_loss_clip": 0.06508674, "auxiliary_loss_mlp": 0.0127072, "balance_loss_clip": 0.06296559, "balance_loss_mlp": 0.01254007, "epoch": 0.2585901097249361, "flos": 33264620893440.0, "grad_norm": 2.174370647433651, "language_loss": 0.67815346, "learning_rate": 3.4764425541320417e-06, "loss": 0.75594735, "num_input_tokens_seen": 92910770, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.16711426, "step": 4301, "time_per_iteration": 2.6883764266967773 }, { "auxiliary_loss_clip": 0.0651032, "auxiliary_loss_mlp": 0.0127322, "balance_loss_clip": 0.06294782, "balance_loss_mlp": 0.01256388, "epoch": 0.2586502329776041, "flos": 18447033847680.0, "grad_norm": 6.5003155723501695, "language_loss": 0.81944513, "learning_rate": 3.4761798114017617e-06, "loss": 0.89728051, "num_input_tokens_seen": 92929520, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.16833496, "step": 4302, "time_per_iteration": 2.5715434551239014 }, { "auxiliary_loss_clip": 0.06501213, "auxiliary_loss_mlp": 0.0127632, "balance_loss_clip": 0.06292633, "balance_loss_mlp": 0.01259249, "epoch": 0.25871035623027205, "flos": 17973989222400.0, "grad_norm": 1.795933165261466, "language_loss": 0.92652667, "learning_rate": 3.475917012694595e-06, "loss": 1.00430202, "num_input_tokens_seen": 92947890, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.1706543, "step": 4303, "time_per_iteration": 2.564598560333252 }, { "auxiliary_loss_clip": 0.06503217, "auxiliary_loss_mlp": 0.0128119, "balance_loss_clip": 0.06293833, "balance_loss_mlp": 0.01264166, "epoch": 0.25877047948294, "flos": 27784392307200.0, "grad_norm": 2.24365886829954, "language_loss": 0.67911744, "learning_rate": 3.475654158020507e-06, "loss": 0.75696158, "num_input_tokens_seen": 92967690, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.17028809, "step": 4304, "time_per_iteration": 2.6267282962799072 }, { "auxiliary_loss_clip": 0.06508203, "auxiliary_loss_mlp": 0.01274822, "balance_loss_clip": 0.06292591, "balance_loss_mlp": 0.0125805, "epoch": 0.258830602735608, "flos": 27133209901440.0, "grad_norm": 2.4343102776886663, "language_loss": 0.73439914, "learning_rate": 3.4753912473894657e-06, "loss": 0.81222939, "num_input_tokens_seen": 92986830, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.16760254, "step": 4305, "time_per_iteration": 2.6201069355010986 }, { "auxiliary_loss_clip": 0.06510912, "auxiliary_loss_mlp": 0.01276044, "balance_loss_clip": 0.06295496, "balance_loss_mlp": 0.01258139, "epoch": 0.25889072598827595, "flos": 17896730158080.0, "grad_norm": 1.8962013893465717, "language_loss": 0.76402503, "learning_rate": 3.4751282808114403e-06, "loss": 0.84189463, "num_input_tokens_seen": 93002740, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.17919922, "step": 4306, "time_per_iteration": 2.5379581451416016 }, { "auxiliary_loss_clip": 0.06438586, "auxiliary_loss_mlp": 0.01278371, "balance_loss_clip": 0.06330341, "balance_loss_mlp": 0.01274038, "epoch": 0.2589508492409439, "flos": 53951582885760.0, "grad_norm": 0.8035889541768522, "language_loss": 0.5681991, "learning_rate": 3.474865258296403e-06, "loss": 0.6453687, "num_input_tokens_seen": 93058645, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 0.043396, "step": 4307, "time_per_iteration": 3.1345582008361816 }, { "auxiliary_loss_clip": 0.06495108, "auxiliary_loss_mlp": 0.01275152, "balance_loss_clip": 0.06292164, "balance_loss_mlp": 0.01259714, "epoch": 0.2590109724936119, "flos": 22132063434240.0, "grad_norm": 1.8503622271364586, "language_loss": 0.72524345, "learning_rate": 3.474602179854327e-06, "loss": 0.80294609, "num_input_tokens_seen": 93077140, "router_z_loss_clip": 2.02636719, "router_z_loss_mlp": 0.15441895, "step": 4308, "time_per_iteration": 2.5875399112701416 }, { "auxiliary_loss_clip": 0.06507379, "auxiliary_loss_mlp": 0.01278216, "balance_loss_clip": 0.06293901, "balance_loss_mlp": 0.012613, "epoch": 0.2590710957462799, "flos": 13478395564800.0, "grad_norm": 1.91788491041927, "language_loss": 0.84478629, "learning_rate": 3.4743390454951886e-06, "loss": 0.92264223, "num_input_tokens_seen": 93093580, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.16906738, "step": 4309, "time_per_iteration": 2.563432455062866 }, { "auxiliary_loss_clip": 0.065047, "auxiliary_loss_mlp": 0.01279598, "balance_loss_clip": 0.06297369, "balance_loss_mlp": 0.01263707, "epoch": 0.25913121899894787, "flos": 22313219961600.0, "grad_norm": 1.4478149290819613, "language_loss": 0.85104817, "learning_rate": 3.474075855228966e-06, "loss": 0.92889118, "num_input_tokens_seen": 93112345, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.15881348, "step": 4310, "time_per_iteration": 2.5830676555633545 }, { "auxiliary_loss_clip": 0.06509162, "auxiliary_loss_mlp": 0.01275849, "balance_loss_clip": 0.06296121, "balance_loss_mlp": 0.01258933, "epoch": 0.25919134225161583, "flos": 25818770102400.0, "grad_norm": 1.7495056197254806, "language_loss": 0.78051162, "learning_rate": 3.473812609065639e-06, "loss": 0.85836178, "num_input_tokens_seen": 93131545, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.16906738, "step": 4311, "time_per_iteration": 2.625913381576538 }, { "auxiliary_loss_clip": 0.06509776, "auxiliary_loss_mlp": 0.01276577, "balance_loss_clip": 0.0629842, "balance_loss_mlp": 0.01259447, "epoch": 0.2592514655042838, "flos": 31220314104960.0, "grad_norm": 1.7709967227892343, "language_loss": 0.72233069, "learning_rate": 3.4735493070151904e-06, "loss": 0.80019426, "num_input_tokens_seen": 93150730, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.17126465, "step": 4312, "time_per_iteration": 2.654406785964966 }, { "auxiliary_loss_clip": 0.06504397, "auxiliary_loss_mlp": 0.01271803, "balance_loss_clip": 0.06294595, "balance_loss_mlp": 0.01255185, "epoch": 0.25931158875695176, "flos": 18480296718720.0, "grad_norm": 1.7707223893275232, "language_loss": 0.70754272, "learning_rate": 3.4732859490876044e-06, "loss": 0.78530473, "num_input_tokens_seen": 93167895, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.1661377, "step": 4313, "time_per_iteration": 2.5390536785125732 }, { "auxiliary_loss_clip": 0.06506994, "auxiliary_loss_mlp": 0.01278335, "balance_loss_clip": 0.06299099, "balance_loss_mlp": 0.01261657, "epoch": 0.2593717120096197, "flos": 19213895214720.0, "grad_norm": 1.5964293124438922, "language_loss": 0.80883527, "learning_rate": 3.473022535292867e-06, "loss": 0.88668859, "num_input_tokens_seen": 93187650, "router_z_loss_clip": 2.08105469, "router_z_loss_mlp": 0.16687012, "step": 4314, "time_per_iteration": 2.6244871616363525 }, { "auxiliary_loss_clip": 0.06513665, "auxiliary_loss_mlp": 0.01276544, "balance_loss_clip": 0.06298167, "balance_loss_mlp": 0.01259175, "epoch": 0.2594318352622877, "flos": 31256050671360.0, "grad_norm": 3.325908234496684, "language_loss": 0.6718657, "learning_rate": 3.472759065640968e-06, "loss": 0.74976784, "num_input_tokens_seen": 93207370, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.17370605, "step": 4315, "time_per_iteration": 2.677659034729004 }, { "auxiliary_loss_clip": 0.06507087, "auxiliary_loss_mlp": 0.01273038, "balance_loss_clip": 0.06299533, "balance_loss_mlp": 0.01257303, "epoch": 0.25949195851495566, "flos": 22243759326720.0, "grad_norm": 1.4442835123490536, "language_loss": 0.80083549, "learning_rate": 3.4724955401418976e-06, "loss": 0.87863678, "num_input_tokens_seen": 93227925, "router_z_loss_clip": 2.08105469, "router_z_loss_mlp": 0.15734863, "step": 4316, "time_per_iteration": 2.702913522720337 }, { "auxiliary_loss_clip": 0.06514409, "auxiliary_loss_mlp": 0.01273737, "balance_loss_clip": 0.06301263, "balance_loss_mlp": 0.01256166, "epoch": 0.2595520817676236, "flos": 28083449928960.0, "grad_norm": 1.5329638986221648, "language_loss": 0.77964574, "learning_rate": 3.4722319588056487e-06, "loss": 0.85752714, "num_input_tokens_seen": 93250020, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.17565918, "step": 4317, "time_per_iteration": 4.067133665084839 }, { "auxiliary_loss_clip": 0.06504415, "auxiliary_loss_mlp": 0.01280254, "balance_loss_clip": 0.06296566, "balance_loss_mlp": 0.0126335, "epoch": 0.2596122050202916, "flos": 20196727280640.0, "grad_norm": 1.839006025119898, "language_loss": 0.78462529, "learning_rate": 3.4719683216422163e-06, "loss": 0.86247206, "num_input_tokens_seen": 93269070, "router_z_loss_clip": 2.08105469, "router_z_loss_mlp": 0.16906738, "step": 4318, "time_per_iteration": 2.631957530975342 }, { "auxiliary_loss_clip": 0.06502056, "auxiliary_loss_mlp": 0.01271474, "balance_loss_clip": 0.06294464, "balance_loss_mlp": 0.01254045, "epoch": 0.25967232827295955, "flos": 22534431540480.0, "grad_norm": 1.6348613110924597, "language_loss": 0.76821107, "learning_rate": 3.471704628661598e-06, "loss": 0.84594637, "num_input_tokens_seen": 93290250, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.17419434, "step": 4319, "time_per_iteration": 2.5848517417907715 }, { "auxiliary_loss_clip": 0.06499846, "auxiliary_loss_mlp": 0.01276591, "balance_loss_clip": 0.06292479, "balance_loss_mlp": 0.0126082, "epoch": 0.2597324515256275, "flos": 21074445999360.0, "grad_norm": 1.8669582423601059, "language_loss": 0.77067423, "learning_rate": 3.4714408798737925e-06, "loss": 0.84843862, "num_input_tokens_seen": 93310090, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.15771484, "step": 4320, "time_per_iteration": 2.606708526611328 }, { "auxiliary_loss_clip": 0.06505428, "auxiliary_loss_mlp": 0.01271684, "balance_loss_clip": 0.0629347, "balance_loss_mlp": 0.0125521, "epoch": 0.2597925747782955, "flos": 22055810618880.0, "grad_norm": 5.0677841227340785, "language_loss": 0.7193172, "learning_rate": 3.471177075288801e-06, "loss": 0.79708833, "num_input_tokens_seen": 93329570, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.16479492, "step": 4321, "time_per_iteration": 2.557807683944702 }, { "auxiliary_loss_clip": 0.06510073, "auxiliary_loss_mlp": 0.0127265, "balance_loss_clip": 0.06296271, "balance_loss_mlp": 0.01255769, "epoch": 0.2598526980309635, "flos": 19543071179520.0, "grad_norm": 3.2536750233934972, "language_loss": 0.75329268, "learning_rate": 3.4709132149166277e-06, "loss": 0.83111989, "num_input_tokens_seen": 93347920, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.16870117, "step": 4322, "time_per_iteration": 3.9837543964385986 }, { "auxiliary_loss_clip": 0.06512428, "auxiliary_loss_mlp": 0.01276164, "balance_loss_clip": 0.06299876, "balance_loss_mlp": 0.01258211, "epoch": 0.25991282128363147, "flos": 24501521191680.0, "grad_norm": 2.1734878546831555, "language_loss": 0.74274719, "learning_rate": 3.470649298767278e-06, "loss": 0.82063311, "num_input_tokens_seen": 93367145, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.17956543, "step": 4323, "time_per_iteration": 2.60353684425354 }, { "auxiliary_loss_clip": 0.06517962, "auxiliary_loss_mlp": 0.01277728, "balance_loss_clip": 0.06297433, "balance_loss_mlp": 0.01260645, "epoch": 0.25997294453629943, "flos": 24207410960640.0, "grad_norm": 1.7455977268774703, "language_loss": 0.67498207, "learning_rate": 3.4703853268507597e-06, "loss": 0.75293887, "num_input_tokens_seen": 93386555, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.17089844, "step": 4324, "time_per_iteration": 2.585247039794922 }, { "auxiliary_loss_clip": 0.06507775, "auxiliary_loss_mlp": 0.01274087, "balance_loss_clip": 0.06294866, "balance_loss_mlp": 0.01257767, "epoch": 0.2600330677889674, "flos": 31439597040000.0, "grad_norm": 1.7657714570471663, "language_loss": 0.71075404, "learning_rate": 3.470121299177082e-06, "loss": 0.78857267, "num_input_tokens_seen": 93405590, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.16320801, "step": 4325, "time_per_iteration": 2.6447715759277344 }, { "auxiliary_loss_clip": 0.06506102, "auxiliary_loss_mlp": 0.0127409, "balance_loss_clip": 0.06294544, "balance_loss_mlp": 0.01257519, "epoch": 0.26009319104163536, "flos": 32274116179200.0, "grad_norm": 2.6191217937428988, "language_loss": 0.73273057, "learning_rate": 3.469857215756257e-06, "loss": 0.81053245, "num_input_tokens_seen": 93424750, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.16564941, "step": 4326, "time_per_iteration": 2.6460132598876953 }, { "auxiliary_loss_clip": 0.0649956, "auxiliary_loss_mlp": 0.01275302, "balance_loss_clip": 0.06294274, "balance_loss_mlp": 0.0125934, "epoch": 0.26015331429430333, "flos": 26293994933760.0, "grad_norm": 1.6542906721733568, "language_loss": 0.87514949, "learning_rate": 3.4695930765982997e-06, "loss": 0.95289814, "num_input_tokens_seen": 93443465, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.15942383, "step": 4327, "time_per_iteration": 2.7149503231048584 }, { "auxiliary_loss_clip": 0.06507664, "auxiliary_loss_mlp": 0.01279833, "balance_loss_clip": 0.06293284, "balance_loss_mlp": 0.01261534, "epoch": 0.2602134375469713, "flos": 21148728243840.0, "grad_norm": 1.4280281073158392, "language_loss": 0.80772746, "learning_rate": 3.4693288817132255e-06, "loss": 0.88560241, "num_input_tokens_seen": 93462580, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.18310547, "step": 4328, "time_per_iteration": 3.9951467514038086 }, { "auxiliary_loss_clip": 0.06508411, "auxiliary_loss_mlp": 0.01278925, "balance_loss_clip": 0.0629845, "balance_loss_mlp": 0.01263154, "epoch": 0.26027356079963926, "flos": 25928411569920.0, "grad_norm": 1.4210079623035852, "language_loss": 0.88201237, "learning_rate": 3.4690646311110525e-06, "loss": 0.95988572, "num_input_tokens_seen": 93482790, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.15759277, "step": 4329, "time_per_iteration": 2.5986762046813965 }, { "auxiliary_loss_clip": 0.06507154, "auxiliary_loss_mlp": 0.01279375, "balance_loss_clip": 0.06298817, "balance_loss_mlp": 0.01262769, "epoch": 0.2603336840523072, "flos": 26366390461440.0, "grad_norm": 1.819298309035431, "language_loss": 0.78443635, "learning_rate": 3.468800324801802e-06, "loss": 0.86230165, "num_input_tokens_seen": 93498795, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.1661377, "step": 4330, "time_per_iteration": 2.721745014190674 }, { "auxiliary_loss_clip": 0.06514354, "auxiliary_loss_mlp": 0.01282235, "balance_loss_clip": 0.06298764, "balance_loss_mlp": 0.01264866, "epoch": 0.2603938073049752, "flos": 23520408134400.0, "grad_norm": 2.775109094368308, "language_loss": 0.7610122, "learning_rate": 3.4685359627954958e-06, "loss": 0.83897805, "num_input_tokens_seen": 93518335, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.17370605, "step": 4331, "time_per_iteration": 4.099231004714966 }, { "auxiliary_loss_clip": 0.06506391, "auxiliary_loss_mlp": 0.01272673, "balance_loss_clip": 0.0629894, "balance_loss_mlp": 0.01256687, "epoch": 0.26045393055764315, "flos": 25381336262400.0, "grad_norm": 1.520395493482442, "language_loss": 0.69365144, "learning_rate": 3.4682715451021584e-06, "loss": 0.77144206, "num_input_tokens_seen": 93539170, "router_z_loss_clip": 2.07714844, "router_z_loss_mlp": 0.15979004, "step": 4332, "time_per_iteration": 2.629225015640259 }, { "auxiliary_loss_clip": 0.06513131, "auxiliary_loss_mlp": 0.01276675, "balance_loss_clip": 0.06300758, "balance_loss_mlp": 0.01258746, "epoch": 0.2605140538103111, "flos": 27642494217600.0, "grad_norm": 2.161508815882493, "language_loss": 0.80306184, "learning_rate": 3.4680070717318174e-06, "loss": 0.88095987, "num_input_tokens_seen": 93558480, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.17919922, "step": 4333, "time_per_iteration": 2.6403069496154785 }, { "auxiliary_loss_clip": 0.06495199, "auxiliary_loss_mlp": 0.01281263, "balance_loss_clip": 0.06289066, "balance_loss_mlp": 0.01264824, "epoch": 0.2605741770629791, "flos": 13774602147840.0, "grad_norm": 4.686295686864288, "language_loss": 0.81041932, "learning_rate": 3.467742542694501e-06, "loss": 0.88818395, "num_input_tokens_seen": 93575220, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.16430664, "step": 4334, "time_per_iteration": 2.569246768951416 }, { "auxiliary_loss_clip": 0.06504281, "auxiliary_loss_mlp": 0.01280758, "balance_loss_clip": 0.06293312, "balance_loss_mlp": 0.01263068, "epoch": 0.26063430031564705, "flos": 26038933505280.0, "grad_norm": 1.7568356844554303, "language_loss": 0.80337644, "learning_rate": 3.46747795800024e-06, "loss": 0.88122678, "num_input_tokens_seen": 93597015, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.17687988, "step": 4335, "time_per_iteration": 2.6420512199401855 }, { "auxiliary_loss_clip": 0.06406057, "auxiliary_loss_mlp": 0.01257077, "balance_loss_clip": 0.0629916, "balance_loss_mlp": 0.01253745, "epoch": 0.26069442356831507, "flos": 62463143030400.0, "grad_norm": 0.8390852207043153, "language_loss": 0.60782766, "learning_rate": 3.467213317659068e-06, "loss": 0.68445897, "num_input_tokens_seen": 93657775, "router_z_loss_clip": 1.0703125, "router_z_loss_mlp": 0.03338623, "step": 4336, "time_per_iteration": 3.1700291633605957 }, { "auxiliary_loss_clip": 0.06509525, "auxiliary_loss_mlp": 0.012754, "balance_loss_clip": 0.06295548, "balance_loss_mlp": 0.0125846, "epoch": 0.26075454682098304, "flos": 13631530101120.0, "grad_norm": 2.603808009773287, "language_loss": 0.77857232, "learning_rate": 3.46694862168102e-06, "loss": 0.85642147, "num_input_tokens_seen": 93676145, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.16931152, "step": 4337, "time_per_iteration": 2.5769786834716797 }, { "auxiliary_loss_clip": 0.06508169, "auxiliary_loss_mlp": 0.01280652, "balance_loss_clip": 0.06293716, "balance_loss_mlp": 0.0126227, "epoch": 0.260814670073651, "flos": 12130776748800.0, "grad_norm": 1.9117328112365095, "language_loss": 0.7468586, "learning_rate": 3.4666838700761334e-06, "loss": 0.82474685, "num_input_tokens_seen": 93692480, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.18371582, "step": 4338, "time_per_iteration": 2.5648093223571777 }, { "auxiliary_loss_clip": 0.0651503, "auxiliary_loss_mlp": 0.01279877, "balance_loss_clip": 0.06296882, "balance_loss_mlp": 0.01261781, "epoch": 0.26087479332631897, "flos": 15127964968320.0, "grad_norm": 1.9260748451771785, "language_loss": 0.81239867, "learning_rate": 3.466419062854447e-06, "loss": 0.89034772, "num_input_tokens_seen": 93710165, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.18115234, "step": 4339, "time_per_iteration": 2.5773186683654785 }, { "auxiliary_loss_clip": 0.06498216, "auxiliary_loss_mlp": 0.01274215, "balance_loss_clip": 0.06290062, "balance_loss_mlp": 0.01258682, "epoch": 0.26093491657898693, "flos": 24687834744960.0, "grad_norm": 1.5911968299577957, "language_loss": 0.77100581, "learning_rate": 3.4661542000260033e-06, "loss": 0.84873015, "num_input_tokens_seen": 93730185, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.15527344, "step": 4340, "time_per_iteration": 2.612718105316162 }, { "auxiliary_loss_clip": 0.06500548, "auxiliary_loss_mlp": 0.01273065, "balance_loss_clip": 0.06287688, "balance_loss_mlp": 0.01255684, "epoch": 0.2609950398316549, "flos": 25122669108480.0, "grad_norm": 1.5030822413091616, "language_loss": 0.82393706, "learning_rate": 3.465889281600845e-06, "loss": 0.9016732, "num_input_tokens_seen": 93747690, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.17382812, "step": 4341, "time_per_iteration": 2.6048622131347656 }, { "auxiliary_loss_clip": 0.06500056, "auxiliary_loss_mlp": 0.01279471, "balance_loss_clip": 0.06288975, "balance_loss_mlp": 0.01261089, "epoch": 0.26105516308432286, "flos": 28556159137920.0, "grad_norm": 1.781148539751254, "language_loss": 0.77119285, "learning_rate": 3.4656243075890183e-06, "loss": 0.84898818, "num_input_tokens_seen": 93767405, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.18371582, "step": 4342, "time_per_iteration": 2.614896297454834 }, { "auxiliary_loss_clip": 0.06499705, "auxiliary_loss_mlp": 0.01274046, "balance_loss_clip": 0.0628789, "balance_loss_mlp": 0.01257094, "epoch": 0.2611152863369908, "flos": 39539984400000.0, "grad_norm": 1.8566035621601285, "language_loss": 0.66316378, "learning_rate": 3.4653592780005707e-06, "loss": 0.74090135, "num_input_tokens_seen": 93789950, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.16943359, "step": 4343, "time_per_iteration": 2.744260311126709 }, { "auxiliary_loss_clip": 0.06502492, "auxiliary_loss_mlp": 0.01275926, "balance_loss_clip": 0.06287814, "balance_loss_mlp": 0.01259249, "epoch": 0.2611754095896588, "flos": 13740416881920.0, "grad_norm": 2.1059835627720784, "language_loss": 0.73994172, "learning_rate": 3.465094192845553e-06, "loss": 0.8177259, "num_input_tokens_seen": 93807835, "router_z_loss_clip": 2.14746094, "router_z_loss_mlp": 0.16674805, "step": 4344, "time_per_iteration": 2.546405076980591 }, { "auxiliary_loss_clip": 0.0650083, "auxiliary_loss_mlp": 0.01277758, "balance_loss_clip": 0.06289227, "balance_loss_mlp": 0.01260723, "epoch": 0.26123553284232676, "flos": 21513011869440.0, "grad_norm": 1.9543695210042757, "language_loss": 0.87552738, "learning_rate": 3.4648290521340165e-06, "loss": 0.95331329, "num_input_tokens_seen": 93825670, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.17022705, "step": 4345, "time_per_iteration": 2.5739641189575195 }, { "auxiliary_loss_clip": 0.06494418, "auxiliary_loss_mlp": 0.01271403, "balance_loss_clip": 0.0628663, "balance_loss_mlp": 0.01254535, "epoch": 0.2612956560949947, "flos": 21145751424000.0, "grad_norm": 1.7388172678684888, "language_loss": 0.76773965, "learning_rate": 3.464563855876015e-06, "loss": 0.84539795, "num_input_tokens_seen": 93844045, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.1685791, "step": 4346, "time_per_iteration": 2.5807034969329834 }, { "auxiliary_loss_clip": 0.0649625, "auxiliary_loss_mlp": 0.01272112, "balance_loss_clip": 0.06285945, "balance_loss_mlp": 0.01255602, "epoch": 0.2613557793476627, "flos": 25126023271680.0, "grad_norm": 1.6732446983038611, "language_loss": 0.76523572, "learning_rate": 3.464298604081606e-06, "loss": 0.84291935, "num_input_tokens_seen": 93864380, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.16503906, "step": 4347, "time_per_iteration": 2.615940809249878 }, { "auxiliary_loss_clip": 0.06492648, "auxiliary_loss_mlp": 0.01275703, "balance_loss_clip": 0.06284978, "balance_loss_mlp": 0.01258919, "epoch": 0.26141590260033065, "flos": 26074879706880.0, "grad_norm": 1.344962320727314, "language_loss": 0.73755252, "learning_rate": 3.4640332967608476e-06, "loss": 0.81523603, "num_input_tokens_seen": 93885475, "router_z_loss_clip": 2.07519531, "router_z_loss_mlp": 0.16772461, "step": 4348, "time_per_iteration": 2.6168413162231445 }, { "auxiliary_loss_clip": 0.06498458, "auxiliary_loss_mlp": 0.01277532, "balance_loss_clip": 0.062868, "balance_loss_mlp": 0.01260127, "epoch": 0.2614760258529987, "flos": 25708415875200.0, "grad_norm": 1.8961950437276898, "language_loss": 0.91381991, "learning_rate": 3.463767933923799e-06, "loss": 0.99157983, "num_input_tokens_seen": 93905545, "router_z_loss_clip": 2.11425781, "router_z_loss_mlp": 0.17407227, "step": 4349, "time_per_iteration": 2.6043505668640137 }, { "auxiliary_loss_clip": 0.06490157, "auxiliary_loss_mlp": 0.01274024, "balance_loss_clip": 0.06285933, "balance_loss_mlp": 0.01257132, "epoch": 0.26153614910566664, "flos": 17462902043520.0, "grad_norm": 1.6965186449938603, "language_loss": 0.8025313, "learning_rate": 3.463502515580524e-06, "loss": 0.88017309, "num_input_tokens_seen": 93924185, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.16882324, "step": 4350, "time_per_iteration": 2.5689802169799805 }, { "auxiliary_loss_clip": 0.06485794, "auxiliary_loss_mlp": 0.01272546, "balance_loss_clip": 0.06282526, "balance_loss_mlp": 0.01255833, "epoch": 0.2615962723583346, "flos": 17718676231680.0, "grad_norm": 2.547751828471051, "language_loss": 0.6292603, "learning_rate": 3.4632370417410866e-06, "loss": 0.70684367, "num_input_tokens_seen": 93942825, "router_z_loss_clip": 2.03417969, "router_z_loss_mlp": 0.16723633, "step": 4351, "time_per_iteration": 2.5973618030548096 }, { "auxiliary_loss_clip": 0.06500292, "auxiliary_loss_mlp": 0.01276768, "balance_loss_clip": 0.06288527, "balance_loss_mlp": 0.01259482, "epoch": 0.26165639561100257, "flos": 23264340456960.0, "grad_norm": 1.8754493223634474, "language_loss": 0.84182882, "learning_rate": 3.462971512415555e-06, "loss": 0.91959941, "num_input_tokens_seen": 93962045, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.17285156, "step": 4352, "time_per_iteration": 2.5756328105926514 }, { "auxiliary_loss_clip": 0.06405432, "auxiliary_loss_mlp": 0.01355224, "balance_loss_clip": 0.0629989, "balance_loss_mlp": 0.01349696, "epoch": 0.26171651886367053, "flos": 66756155443200.0, "grad_norm": 0.9054145192238232, "language_loss": 0.70514816, "learning_rate": 3.462705927613996e-06, "loss": 0.78275472, "num_input_tokens_seen": 94021175, "router_z_loss_clip": 1.0546875, "router_z_loss_mlp": 0.05532837, "step": 4353, "time_per_iteration": 3.1365954875946045 }, { "auxiliary_loss_clip": 0.06490086, "auxiliary_loss_mlp": 0.01278589, "balance_loss_clip": 0.06285293, "balance_loss_mlp": 0.01261364, "epoch": 0.2617766421163385, "flos": 22356713030400.0, "grad_norm": 1.7095443967892106, "language_loss": 0.78020948, "learning_rate": 3.4624402873464816e-06, "loss": 0.85789621, "num_input_tokens_seen": 94043370, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.17224121, "step": 4354, "time_per_iteration": 2.6570675373077393 }, { "auxiliary_loss_clip": 0.06505668, "auxiliary_loss_mlp": 0.01277693, "balance_loss_clip": 0.06292485, "balance_loss_mlp": 0.01260777, "epoch": 0.26183676536900646, "flos": 26074208874240.0, "grad_norm": 2.784454950123462, "language_loss": 0.69156545, "learning_rate": 3.462174591623085e-06, "loss": 0.76939911, "num_input_tokens_seen": 94063510, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.16906738, "step": 4355, "time_per_iteration": 2.6030752658843994 }, { "auxiliary_loss_clip": 0.06503071, "auxiliary_loss_mlp": 0.0127705, "balance_loss_clip": 0.06293909, "balance_loss_mlp": 0.01258763, "epoch": 0.26189688862167443, "flos": 21002847085440.0, "grad_norm": 2.0274007573029094, "language_loss": 0.68322849, "learning_rate": 3.4619088404538815e-06, "loss": 0.76102972, "num_input_tokens_seen": 94083865, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.1829834, "step": 4356, "time_per_iteration": 4.022512912750244 }, { "auxiliary_loss_clip": 0.06405729, "auxiliary_loss_mlp": 0.01279447, "balance_loss_clip": 0.0630041, "balance_loss_mlp": 0.01275766, "epoch": 0.2619570118743424, "flos": 65817780768000.0, "grad_norm": 0.6687702463377423, "language_loss": 0.53186458, "learning_rate": 3.4616430338489487e-06, "loss": 0.60871637, "num_input_tokens_seen": 94144095, "router_z_loss_clip": 1.05371094, "router_z_loss_mlp": 0.03674316, "step": 4357, "time_per_iteration": 3.1414661407470703 }, { "auxiliary_loss_clip": 0.06510448, "auxiliary_loss_mlp": 0.01275781, "balance_loss_clip": 0.06295437, "balance_loss_mlp": 0.01258889, "epoch": 0.26201713512701036, "flos": 28774310042880.0, "grad_norm": 2.288732101915173, "language_loss": 0.85612953, "learning_rate": 3.4613771718183654e-06, "loss": 0.93399179, "num_input_tokens_seen": 94163035, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.16894531, "step": 4358, "time_per_iteration": 2.675205707550049 }, { "auxiliary_loss_clip": 0.06516972, "auxiliary_loss_mlp": 0.01275606, "balance_loss_clip": 0.06293614, "balance_loss_mlp": 0.01257319, "epoch": 0.2620772583796783, "flos": 26439750311040.0, "grad_norm": 2.3577890444561738, "language_loss": 0.67660713, "learning_rate": 3.4611112543722127e-06, "loss": 0.75453293, "num_input_tokens_seen": 94182520, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.18273926, "step": 4359, "time_per_iteration": 2.6375954151153564 }, { "auxiliary_loss_clip": 0.06504974, "auxiliary_loss_mlp": 0.01277868, "balance_loss_clip": 0.06293753, "balance_loss_mlp": 0.01260868, "epoch": 0.2621373816323463, "flos": 20162667795840.0, "grad_norm": 1.9930923004477168, "language_loss": 0.78832436, "learning_rate": 3.4608452815205757e-06, "loss": 0.86615276, "num_input_tokens_seen": 94201795, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.16992188, "step": 4360, "time_per_iteration": 2.5825142860412598 }, { "auxiliary_loss_clip": 0.06501165, "auxiliary_loss_mlp": 0.01278286, "balance_loss_clip": 0.06298308, "balance_loss_mlp": 0.01260762, "epoch": 0.26219750488501425, "flos": 28628764300800.0, "grad_norm": 2.26138401876097, "language_loss": 0.69247454, "learning_rate": 3.4605792532735387e-06, "loss": 0.77026904, "num_input_tokens_seen": 94222390, "router_z_loss_clip": 2.02734375, "router_z_loss_mlp": 0.1751709, "step": 4361, "time_per_iteration": 2.6739070415496826 }, { "auxiliary_loss_clip": 0.0651627, "auxiliary_loss_mlp": 0.01285889, "balance_loss_clip": 0.06301419, "balance_loss_mlp": 0.01267852, "epoch": 0.2622576281376823, "flos": 15046806689280.0, "grad_norm": 1.77702897168974, "language_loss": 0.84988952, "learning_rate": 3.46031316964119e-06, "loss": 0.92791104, "num_input_tokens_seen": 94239980, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.18041992, "step": 4362, "time_per_iteration": 3.9870948791503906 }, { "auxiliary_loss_clip": 0.06507481, "auxiliary_loss_mlp": 0.01283332, "balance_loss_clip": 0.06299679, "balance_loss_mlp": 0.01265999, "epoch": 0.26231775139035024, "flos": 26403426766080.0, "grad_norm": 1.6668330463845618, "language_loss": 0.65901923, "learning_rate": 3.4600470306336197e-06, "loss": 0.73692739, "num_input_tokens_seen": 94260715, "router_z_loss_clip": 2.07714844, "router_z_loss_mlp": 0.17333984, "step": 4363, "time_per_iteration": 2.6670918464660645 }, { "auxiliary_loss_clip": 0.06396444, "auxiliary_loss_mlp": 0.01320344, "balance_loss_clip": 0.06291604, "balance_loss_mlp": 0.0131605, "epoch": 0.2623778746430182, "flos": 65430380615040.0, "grad_norm": 0.8843267642022278, "language_loss": 0.61120367, "learning_rate": 3.4597808362609194e-06, "loss": 0.6883716, "num_input_tokens_seen": 94321285, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.04299927, "step": 4364, "time_per_iteration": 3.260255813598633 }, { "auxiliary_loss_clip": 0.06512673, "auxiliary_loss_mlp": 0.01285852, "balance_loss_clip": 0.06298672, "balance_loss_mlp": 0.01267613, "epoch": 0.26243799789568617, "flos": 12609104181120.0, "grad_norm": 2.335501712038107, "language_loss": 0.72756588, "learning_rate": 3.459514586533184e-06, "loss": 0.80555117, "num_input_tokens_seen": 94335420, "router_z_loss_clip": 2.13964844, "router_z_loss_mlp": 0.18225098, "step": 4365, "time_per_iteration": 2.6163647174835205 }, { "auxiliary_loss_clip": 0.065091, "auxiliary_loss_mlp": 0.01282058, "balance_loss_clip": 0.06301221, "balance_loss_mlp": 0.01265392, "epoch": 0.26249812114835414, "flos": 28631783047680.0, "grad_norm": 1.6231619690912755, "language_loss": 0.78178698, "learning_rate": 3.459248281460509e-06, "loss": 0.85969853, "num_input_tokens_seen": 94357440, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.16674805, "step": 4366, "time_per_iteration": 2.6715338230133057 }, { "auxiliary_loss_clip": 0.06513071, "auxiliary_loss_mlp": 0.01286401, "balance_loss_clip": 0.06301287, "balance_loss_mlp": 0.01269795, "epoch": 0.2625582444010221, "flos": 14470661214720.0, "grad_norm": 2.3462970721680354, "language_loss": 0.76370156, "learning_rate": 3.4589819210529927e-06, "loss": 0.84169626, "num_input_tokens_seen": 94375690, "router_z_loss_clip": 2.11816406, "router_z_loss_mlp": 0.16601562, "step": 4367, "time_per_iteration": 2.5517570972442627 }, { "auxiliary_loss_clip": 0.06515092, "auxiliary_loss_mlp": 0.01284061, "balance_loss_clip": 0.06304673, "balance_loss_mlp": 0.01268432, "epoch": 0.26261836765369007, "flos": 16617984998400.0, "grad_norm": 1.5392686543324663, "language_loss": 0.69670331, "learning_rate": 3.458715505320736e-06, "loss": 0.77469486, "num_input_tokens_seen": 94393190, "router_z_loss_clip": 2.10253906, "router_z_loss_mlp": 0.15637207, "step": 4368, "time_per_iteration": 3.9567654132843018 }, { "auxiliary_loss_clip": 0.06507221, "auxiliary_loss_mlp": 0.01282047, "balance_loss_clip": 0.06296667, "balance_loss_mlp": 0.01262461, "epoch": 0.26267849090635803, "flos": 20525861318400.0, "grad_norm": 12.649416028285462, "language_loss": 0.7931748, "learning_rate": 3.458449034273841e-06, "loss": 0.87106746, "num_input_tokens_seen": 94410975, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.19592285, "step": 4369, "time_per_iteration": 2.5927183628082275 }, { "auxiliary_loss_clip": 0.06513119, "auxiliary_loss_mlp": 0.01279115, "balance_loss_clip": 0.06304274, "balance_loss_mlp": 0.0126239, "epoch": 0.262738614159026, "flos": 21330220187520.0, "grad_norm": 4.296825980798066, "language_loss": 0.84034866, "learning_rate": 3.4581825079224133e-06, "loss": 0.91827106, "num_input_tokens_seen": 94429985, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.16711426, "step": 4370, "time_per_iteration": 4.0506205558776855 }, { "auxiliary_loss_clip": 0.06519189, "auxiliary_loss_mlp": 0.01274191, "balance_loss_clip": 0.06301708, "balance_loss_mlp": 0.01255356, "epoch": 0.26279873741169396, "flos": 17609454034560.0, "grad_norm": 1.6323690415227659, "language_loss": 0.71815002, "learning_rate": 3.4579159262765575e-06, "loss": 0.79608381, "num_input_tokens_seen": 94448660, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.18847656, "step": 4371, "time_per_iteration": 2.639265298843384 }, { "auxiliary_loss_clip": 0.06412719, "auxiliary_loss_mlp": 0.01300763, "balance_loss_clip": 0.06306773, "balance_loss_mlp": 0.01296189, "epoch": 0.2628588606643619, "flos": 60969139931520.0, "grad_norm": 0.6787010803899156, "language_loss": 0.56041074, "learning_rate": 3.457649289346384e-06, "loss": 0.63754553, "num_input_tokens_seen": 94515630, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 0.04577637, "step": 4372, "time_per_iteration": 3.4151670932769775 }, { "auxiliary_loss_clip": 0.06512487, "auxiliary_loss_mlp": 0.0127859, "balance_loss_clip": 0.06303118, "balance_loss_mlp": 0.01261627, "epoch": 0.2629189839170299, "flos": 27023652288000.0, "grad_norm": 1.7471493019658852, "language_loss": 0.78276968, "learning_rate": 3.4573825971420042e-06, "loss": 0.86068046, "num_input_tokens_seen": 94535385, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.16967773, "step": 4373, "time_per_iteration": 2.7073681354522705 }, { "auxiliary_loss_clip": 0.06514785, "auxiliary_loss_mlp": 0.01276859, "balance_loss_clip": 0.06303477, "balance_loss_mlp": 0.01259717, "epoch": 0.26297910716969786, "flos": 17025635911680.0, "grad_norm": 2.0003205343803554, "language_loss": 0.72370791, "learning_rate": 3.4571158496735294e-06, "loss": 0.8016243, "num_input_tokens_seen": 94552650, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.17126465, "step": 4374, "time_per_iteration": 2.6053879261016846 }, { "auxiliary_loss_clip": 0.06509227, "auxiliary_loss_mlp": 0.01274996, "balance_loss_clip": 0.06298199, "balance_loss_mlp": 0.01256995, "epoch": 0.2630392304223659, "flos": 24903889297920.0, "grad_norm": 1.8789337134172397, "language_loss": 0.80962497, "learning_rate": 3.4568490469510756e-06, "loss": 0.88746727, "num_input_tokens_seen": 94574075, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.18005371, "step": 4375, "time_per_iteration": 2.684370279312134 }, { "auxiliary_loss_clip": 0.06506892, "auxiliary_loss_mlp": 0.01279082, "balance_loss_clip": 0.06297413, "balance_loss_mlp": 0.01262298, "epoch": 0.26309935367503384, "flos": 32862336641280.0, "grad_norm": 2.1497846532907254, "language_loss": 0.67114538, "learning_rate": 3.4565821889847603e-06, "loss": 0.74900514, "num_input_tokens_seen": 94594255, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.16784668, "step": 4376, "time_per_iteration": 2.778381109237671 }, { "auxiliary_loss_clip": 0.06511118, "auxiliary_loss_mlp": 0.01274635, "balance_loss_clip": 0.06296994, "balance_loss_mlp": 0.01257373, "epoch": 0.2631594769277018, "flos": 15893400816000.0, "grad_norm": 1.9373724867128608, "language_loss": 0.69707465, "learning_rate": 3.4563152757847026e-06, "loss": 0.77493215, "num_input_tokens_seen": 94611410, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.17260742, "step": 4377, "time_per_iteration": 2.7495415210723877 }, { "auxiliary_loss_clip": 0.06507481, "auxiliary_loss_mlp": 0.01273182, "balance_loss_clip": 0.06295344, "balance_loss_mlp": 0.01255658, "epoch": 0.2632196001803698, "flos": 50816242811520.0, "grad_norm": 2.791785034544203, "language_loss": 0.8002578, "learning_rate": 3.4560483073610233e-06, "loss": 0.87806445, "num_input_tokens_seen": 94636575, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.17504883, "step": 4378, "time_per_iteration": 3.027714490890503 }, { "auxiliary_loss_clip": 0.06509262, "auxiliary_loss_mlp": 0.01272618, "balance_loss_clip": 0.06299114, "balance_loss_mlp": 0.01256286, "epoch": 0.26327972343303774, "flos": 13737733551360.0, "grad_norm": 2.4299672629182956, "language_loss": 0.77317858, "learning_rate": 3.455781283723846e-06, "loss": 0.85099733, "num_input_tokens_seen": 94654345, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.16333008, "step": 4379, "time_per_iteration": 2.6856234073638916 }, { "auxiliary_loss_clip": 0.06510741, "auxiliary_loss_mlp": 0.01275853, "balance_loss_clip": 0.06293143, "balance_loss_mlp": 0.01256851, "epoch": 0.2633398466857057, "flos": 23775846906240.0, "grad_norm": 2.0773913234265944, "language_loss": 0.78388417, "learning_rate": 3.4555142048832975e-06, "loss": 0.86175013, "num_input_tokens_seen": 94673985, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.19018555, "step": 4380, "time_per_iteration": 2.615443468093872 }, { "auxiliary_loss_clip": 0.06505498, "auxiliary_loss_mlp": 0.01276074, "balance_loss_clip": 0.0628966, "balance_loss_mlp": 0.01258157, "epoch": 0.26339996993837367, "flos": 27607680046080.0, "grad_norm": 1.736622181256375, "language_loss": 0.64260387, "learning_rate": 3.4552470708495036e-06, "loss": 0.72041959, "num_input_tokens_seen": 94693145, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.17919922, "step": 4381, "time_per_iteration": 2.62591552734375 }, { "auxiliary_loss_clip": 0.06506698, "auxiliary_loss_mlp": 0.01272778, "balance_loss_clip": 0.06293741, "balance_loss_mlp": 0.01255326, "epoch": 0.26346009319104163, "flos": 16951982572800.0, "grad_norm": 1.6575931493799403, "language_loss": 0.82677072, "learning_rate": 3.454979881632595e-06, "loss": 0.90456545, "num_input_tokens_seen": 94710185, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.17443848, "step": 4382, "time_per_iteration": 2.5605804920196533 }, { "auxiliary_loss_clip": 0.06518352, "auxiliary_loss_mlp": 0.01285372, "balance_loss_clip": 0.06298822, "balance_loss_mlp": 0.01266656, "epoch": 0.2635202164437096, "flos": 37241245088640.0, "grad_norm": 1.7565412093549038, "language_loss": 0.69679278, "learning_rate": 3.4547126372427035e-06, "loss": 0.77482998, "num_input_tokens_seen": 94730280, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.18713379, "step": 4383, "time_per_iteration": 2.736520767211914 }, { "auxiliary_loss_clip": 0.06502081, "auxiliary_loss_mlp": 0.01277261, "balance_loss_clip": 0.06288633, "balance_loss_mlp": 0.01259546, "epoch": 0.26358033969637756, "flos": 21002721304320.0, "grad_norm": 2.21545711537774, "language_loss": 0.69760311, "learning_rate": 3.4544453376899638e-06, "loss": 0.77539653, "num_input_tokens_seen": 94748560, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.17724609, "step": 4384, "time_per_iteration": 2.572072744369507 }, { "auxiliary_loss_clip": 0.06503637, "auxiliary_loss_mlp": 0.01272578, "balance_loss_clip": 0.0629378, "balance_loss_mlp": 0.01255174, "epoch": 0.26364046294904553, "flos": 27753561204480.0, "grad_norm": 2.2520796922088233, "language_loss": 0.70256001, "learning_rate": 3.45417798298451e-06, "loss": 0.78032219, "num_input_tokens_seen": 94767570, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.17407227, "step": 4385, "time_per_iteration": 2.617521286010742 }, { "auxiliary_loss_clip": 0.06501971, "auxiliary_loss_mlp": 0.01274916, "balance_loss_clip": 0.06290433, "balance_loss_mlp": 0.01257261, "epoch": 0.2637005862017135, "flos": 22899679488000.0, "grad_norm": 2.1655793276637425, "language_loss": 0.85553998, "learning_rate": 3.453910573136482e-06, "loss": 0.93330884, "num_input_tokens_seen": 94784985, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.17651367, "step": 4386, "time_per_iteration": 2.5699896812438965 }, { "auxiliary_loss_clip": 0.06500924, "auxiliary_loss_mlp": 0.01278778, "balance_loss_clip": 0.06289682, "balance_loss_mlp": 0.01260873, "epoch": 0.26376070945438146, "flos": 15054143921280.0, "grad_norm": 2.3492470879834415, "language_loss": 0.77746326, "learning_rate": 3.4536431081560196e-06, "loss": 0.85526025, "num_input_tokens_seen": 94802545, "router_z_loss_clip": 2.11230469, "router_z_loss_mlp": 0.17907715, "step": 4387, "time_per_iteration": 2.5782952308654785 }, { "auxiliary_loss_clip": 0.0650055, "auxiliary_loss_mlp": 0.0127771, "balance_loss_clip": 0.06289527, "balance_loss_mlp": 0.01260401, "epoch": 0.2638208327070494, "flos": 21148141265280.0, "grad_norm": 2.4099417539848953, "language_loss": 0.76259333, "learning_rate": 3.453375588053264e-06, "loss": 0.84037596, "num_input_tokens_seen": 94820730, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.17297363, "step": 4388, "time_per_iteration": 2.5537407398223877 }, { "auxiliary_loss_clip": 0.06501108, "auxiliary_loss_mlp": 0.01274169, "balance_loss_clip": 0.06289431, "balance_loss_mlp": 0.01256097, "epoch": 0.26388095595971744, "flos": 21732001315200.0, "grad_norm": 2.112752191563902, "language_loss": 0.87155885, "learning_rate": 3.4531080128383617e-06, "loss": 0.94931161, "num_input_tokens_seen": 94839175, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.18066406, "step": 4389, "time_per_iteration": 2.5867867469787598 }, { "auxiliary_loss_clip": 0.06385556, "auxiliary_loss_mlp": 0.01271531, "balance_loss_clip": 0.06281263, "balance_loss_mlp": 0.0126735, "epoch": 0.2639410792123854, "flos": 65536542138240.0, "grad_norm": 0.784391128130035, "language_loss": 0.60235113, "learning_rate": 3.452840382521457e-06, "loss": 0.678922, "num_input_tokens_seen": 94898865, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.0418396, "step": 4390, "time_per_iteration": 3.2214691638946533 }, { "auxiliary_loss_clip": 0.06506754, "auxiliary_loss_mlp": 0.01276269, "balance_loss_clip": 0.06288646, "balance_loss_mlp": 0.01258507, "epoch": 0.2640012024650534, "flos": 23954907081600.0, "grad_norm": 1.6019754241806083, "language_loss": 0.77978271, "learning_rate": 3.4525726971127e-06, "loss": 0.85761297, "num_input_tokens_seen": 94917490, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.17773438, "step": 4391, "time_per_iteration": 2.667586088180542 }, { "auxiliary_loss_clip": 0.06375141, "auxiliary_loss_mlp": 0.01260696, "balance_loss_clip": 0.06270493, "balance_loss_mlp": 0.01256741, "epoch": 0.26406132571772134, "flos": 56462420880000.0, "grad_norm": 0.8128110002111184, "language_loss": 0.58844817, "learning_rate": 3.45230495662224e-06, "loss": 0.6648066, "num_input_tokens_seen": 94969065, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.03952026, "step": 4392, "time_per_iteration": 3.189330816268921 }, { "auxiliary_loss_clip": 0.06506795, "auxiliary_loss_mlp": 0.01275474, "balance_loss_clip": 0.06290119, "balance_loss_mlp": 0.0125733, "epoch": 0.2641214489703893, "flos": 22097039627520.0, "grad_norm": 1.8060874927402535, "language_loss": 0.69431293, "learning_rate": 3.4520371610602306e-06, "loss": 0.77213562, "num_input_tokens_seen": 94988540, "router_z_loss_clip": 2.16699219, "router_z_loss_mlp": 0.18151855, "step": 4393, "time_per_iteration": 2.602951765060425 }, { "auxiliary_loss_clip": 0.06513735, "auxiliary_loss_mlp": 0.01278557, "balance_loss_clip": 0.0629276, "balance_loss_mlp": 0.0125828, "epoch": 0.26418157222305727, "flos": 16550327226240.0, "grad_norm": 3.5559138857833927, "language_loss": 0.84678757, "learning_rate": 3.4517693104368267e-06, "loss": 0.92471045, "num_input_tokens_seen": 95004810, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.20300293, "step": 4394, "time_per_iteration": 2.564785957336426 }, { "auxiliary_loss_clip": 0.06515507, "auxiliary_loss_mlp": 0.01280999, "balance_loss_clip": 0.06293164, "balance_loss_mlp": 0.012609, "epoch": 0.26424169547572524, "flos": 18008006780160.0, "grad_norm": 2.3334845994042483, "language_loss": 0.70917922, "learning_rate": 3.4515014047621856e-06, "loss": 0.7871443, "num_input_tokens_seen": 95024085, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.2010498, "step": 4395, "time_per_iteration": 2.5898404121398926 }, { "auxiliary_loss_clip": 0.06499168, "auxiliary_loss_mlp": 0.01277251, "balance_loss_clip": 0.06288587, "balance_loss_mlp": 0.01258964, "epoch": 0.2643018187283932, "flos": 16988893096320.0, "grad_norm": 1.805385062423579, "language_loss": 0.87389469, "learning_rate": 3.4512334440464655e-06, "loss": 0.95165879, "num_input_tokens_seen": 95042515, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.18286133, "step": 4396, "time_per_iteration": 3.9649455547332764 }, { "auxiliary_loss_clip": 0.06375581, "auxiliary_loss_mlp": 0.01258573, "balance_loss_clip": 0.06271544, "balance_loss_mlp": 0.01254082, "epoch": 0.26436194198106117, "flos": 59682135144960.0, "grad_norm": 0.7672338231601564, "language_loss": 0.55115569, "learning_rate": 3.4509654282998277e-06, "loss": 0.6274972, "num_input_tokens_seen": 95094835, "router_z_loss_clip": 1.04101562, "router_z_loss_mlp": 0.04498291, "step": 4397, "time_per_iteration": 2.9711697101593018 }, { "auxiliary_loss_clip": 0.06497723, "auxiliary_loss_mlp": 0.01272047, "balance_loss_clip": 0.06288087, "balance_loss_mlp": 0.01254929, "epoch": 0.26442206523372913, "flos": 32928694675200.0, "grad_norm": 2.1308330279258545, "language_loss": 0.78626132, "learning_rate": 3.450697357532435e-06, "loss": 0.86395901, "num_input_tokens_seen": 95113480, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.17102051, "step": 4398, "time_per_iteration": 2.6693825721740723 }, { "auxiliary_loss_clip": 0.06505613, "auxiliary_loss_mlp": 0.01278118, "balance_loss_clip": 0.06288752, "balance_loss_mlp": 0.01259498, "epoch": 0.2644821884863971, "flos": 21037409694720.0, "grad_norm": 1.8499952540074636, "language_loss": 0.67912561, "learning_rate": 3.4504292317544534e-06, "loss": 0.7569629, "num_input_tokens_seen": 95132580, "router_z_loss_clip": 2.17089844, "router_z_loss_mlp": 0.18640137, "step": 4399, "time_per_iteration": 2.5825531482696533 }, { "auxiliary_loss_clip": 0.06496722, "auxiliary_loss_mlp": 0.01276154, "balance_loss_clip": 0.06293789, "balance_loss_mlp": 0.01258821, "epoch": 0.26454231173906506, "flos": 20783019098880.0, "grad_norm": 3.0450906757154117, "language_loss": 0.87035227, "learning_rate": 3.4501610509760504e-06, "loss": 0.94808108, "num_input_tokens_seen": 95152375, "router_z_loss_clip": 2.03027344, "router_z_loss_mlp": 0.17321777, "step": 4400, "time_per_iteration": 2.6072187423706055 }, { "auxiliary_loss_clip": 0.06507255, "auxiliary_loss_mlp": 0.01272642, "balance_loss_clip": 0.06292304, "balance_loss_mlp": 0.01253628, "epoch": 0.264602434991733, "flos": 16624399835520.0, "grad_norm": 1.9901898089707997, "language_loss": 0.77385527, "learning_rate": 3.4498928152073944e-06, "loss": 0.85165429, "num_input_tokens_seen": 95170265, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.19018555, "step": 4401, "time_per_iteration": 4.009115695953369 }, { "auxiliary_loss_clip": 0.06508504, "auxiliary_loss_mlp": 0.01279091, "balance_loss_clip": 0.06292368, "balance_loss_mlp": 0.0126016, "epoch": 0.26466255824440105, "flos": 19068726816000.0, "grad_norm": 2.001913512167761, "language_loss": 0.88070703, "learning_rate": 3.4496245244586577e-06, "loss": 0.95858294, "num_input_tokens_seen": 95188655, "router_z_loss_clip": 2.16113281, "router_z_loss_mlp": 0.18920898, "step": 4402, "time_per_iteration": 2.5658578872680664 }, { "auxiliary_loss_clip": 0.0650451, "auxiliary_loss_mlp": 0.01280717, "balance_loss_clip": 0.06292304, "balance_loss_mlp": 0.01263575, "epoch": 0.264722681497069, "flos": 22645246965120.0, "grad_norm": 1.8521702494814627, "language_loss": 0.78413647, "learning_rate": 3.4493561787400137e-06, "loss": 0.86198872, "num_input_tokens_seen": 95209615, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.17150879, "step": 4403, "time_per_iteration": 2.601844310760498 }, { "auxiliary_loss_clip": 0.06501863, "auxiliary_loss_mlp": 0.0127194, "balance_loss_clip": 0.06289512, "balance_loss_mlp": 0.01254559, "epoch": 0.264782804749737, "flos": 22498862682240.0, "grad_norm": 2.8440046923235305, "language_loss": 0.88854432, "learning_rate": 3.4490877780616387e-06, "loss": 0.96628237, "num_input_tokens_seen": 95227810, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.17382812, "step": 4404, "time_per_iteration": 2.564965009689331 }, { "auxiliary_loss_clip": 0.0650209, "auxiliary_loss_mlp": 0.01272891, "balance_loss_clip": 0.06288325, "balance_loss_mlp": 0.01255737, "epoch": 0.26484292800240494, "flos": 16805891779200.0, "grad_norm": 1.590951004504798, "language_loss": 0.768372, "learning_rate": 3.448819322433709e-06, "loss": 0.84612179, "num_input_tokens_seen": 95245890, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.17163086, "step": 4405, "time_per_iteration": 2.5800304412841797 }, { "auxiliary_loss_clip": 0.06510167, "auxiliary_loss_mlp": 0.01274532, "balance_loss_clip": 0.06295547, "balance_loss_mlp": 0.012559, "epoch": 0.2649030512550729, "flos": 20455939486080.0, "grad_norm": 1.8522140285901416, "language_loss": 0.7073583, "learning_rate": 3.4485508118664066e-06, "loss": 0.78520525, "num_input_tokens_seen": 95264955, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.1862793, "step": 4406, "time_per_iteration": 2.564084768295288 }, { "auxiliary_loss_clip": 0.06501409, "auxiliary_loss_mlp": 0.01280662, "balance_loss_clip": 0.06292858, "balance_loss_mlp": 0.01262709, "epoch": 0.2649631745077409, "flos": 22422190596480.0, "grad_norm": 1.7116247992014653, "language_loss": 0.84014636, "learning_rate": 3.448282246369912e-06, "loss": 0.91796708, "num_input_tokens_seen": 95284245, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.17956543, "step": 4407, "time_per_iteration": 2.576099157333374 }, { "auxiliary_loss_clip": 0.06501865, "auxiliary_loss_mlp": 0.01272705, "balance_loss_clip": 0.06290558, "balance_loss_mlp": 0.01255325, "epoch": 0.26502329776040884, "flos": 35124794334720.0, "grad_norm": 1.5940261898790682, "language_loss": 0.766518, "learning_rate": 3.4480136259544084e-06, "loss": 0.84426379, "num_input_tokens_seen": 95307125, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.17382812, "step": 4408, "time_per_iteration": 4.108827590942383 }, { "auxiliary_loss_clip": 0.06496199, "auxiliary_loss_mlp": 0.01277362, "balance_loss_clip": 0.06287816, "balance_loss_mlp": 0.01259696, "epoch": 0.2650834210130768, "flos": 38696073603840.0, "grad_norm": 1.7468182103429888, "language_loss": 0.71607572, "learning_rate": 3.447744950630084e-06, "loss": 0.79381132, "num_input_tokens_seen": 95329150, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.17675781, "step": 4409, "time_per_iteration": 2.691774606704712 }, { "auxiliary_loss_clip": 0.06507437, "auxiliary_loss_mlp": 0.01277779, "balance_loss_clip": 0.06291446, "balance_loss_mlp": 0.01259862, "epoch": 0.26514354426574477, "flos": 24723655165440.0, "grad_norm": 1.8801562636707156, "language_loss": 0.74198806, "learning_rate": 3.4474762204071253e-06, "loss": 0.81984019, "num_input_tokens_seen": 95349880, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.17919922, "step": 4410, "time_per_iteration": 4.0552544593811035 }, { "auxiliary_loss_clip": 0.06502476, "auxiliary_loss_mlp": 0.0127088, "balance_loss_clip": 0.06288136, "balance_loss_mlp": 0.01254167, "epoch": 0.26520366751841273, "flos": 20346381872640.0, "grad_norm": 1.7165941731694336, "language_loss": 0.74287355, "learning_rate": 3.4472074352957244e-06, "loss": 0.82060719, "num_input_tokens_seen": 95368570, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.16711426, "step": 4411, "time_per_iteration": 2.5639307498931885 }, { "auxiliary_loss_clip": 0.06498012, "auxiliary_loss_mlp": 0.01277436, "balance_loss_clip": 0.06288921, "balance_loss_mlp": 0.01260282, "epoch": 0.2652637907710807, "flos": 22350046631040.0, "grad_norm": 1.7371554670674014, "language_loss": 0.82711256, "learning_rate": 3.446938595306071e-06, "loss": 0.90486705, "num_input_tokens_seen": 95387065, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.17163086, "step": 4412, "time_per_iteration": 2.5621583461761475 }, { "auxiliary_loss_clip": 0.06497154, "auxiliary_loss_mlp": 0.01278563, "balance_loss_clip": 0.06286791, "balance_loss_mlp": 0.01261004, "epoch": 0.26532391402374866, "flos": 19360279497600.0, "grad_norm": 1.8277732362311099, "language_loss": 0.74947381, "learning_rate": 3.4466697004483622e-06, "loss": 0.82723105, "num_input_tokens_seen": 95406345, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.17553711, "step": 4413, "time_per_iteration": 2.5593109130859375 }, { "auxiliary_loss_clip": 0.06380688, "auxiliary_loss_mlp": 0.01258329, "balance_loss_clip": 0.06277373, "balance_loss_mlp": 0.01253751, "epoch": 0.26538403727641663, "flos": 44804479121280.0, "grad_norm": 0.8526300593306344, "language_loss": 0.57005763, "learning_rate": 3.446400750732793e-06, "loss": 0.64644772, "num_input_tokens_seen": 95463595, "router_z_loss_clip": 1.03125, "router_z_loss_mlp": 0.04577637, "step": 4414, "time_per_iteration": 3.1083719730377197 }, { "auxiliary_loss_clip": 0.06494461, "auxiliary_loss_mlp": 0.0127074, "balance_loss_clip": 0.06289855, "balance_loss_mlp": 0.01254575, "epoch": 0.26544416052908465, "flos": 28189359889920.0, "grad_norm": 1.4788350804917827, "language_loss": 0.74793983, "learning_rate": 3.4461317461695625e-06, "loss": 0.82559186, "num_input_tokens_seen": 95484115, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.16162109, "step": 4415, "time_per_iteration": 2.633606433868408 }, { "auxiliary_loss_clip": 0.06504561, "auxiliary_loss_mlp": 0.01276687, "balance_loss_clip": 0.06290393, "balance_loss_mlp": 0.01257864, "epoch": 0.2655042837817526, "flos": 17570824502400.0, "grad_norm": 2.2737445178468647, "language_loss": 0.87771183, "learning_rate": 3.4458626867688707e-06, "loss": 0.95552433, "num_input_tokens_seen": 95501435, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.18835449, "step": 4416, "time_per_iteration": 2.5322768688201904 }, { "auxiliary_loss_clip": 0.06498045, "auxiliary_loss_mlp": 0.01274713, "balance_loss_clip": 0.06286422, "balance_loss_mlp": 0.01256581, "epoch": 0.2655644070344206, "flos": 23411437499520.0, "grad_norm": 1.8020118785154473, "language_loss": 0.7747125, "learning_rate": 3.4455935725409217e-06, "loss": 0.85244012, "num_input_tokens_seen": 95520135, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.18115234, "step": 4417, "time_per_iteration": 2.5998802185058594 }, { "auxiliary_loss_clip": 0.06490208, "auxiliary_loss_mlp": 0.01273035, "balance_loss_clip": 0.06282437, "balance_loss_mlp": 0.01256083, "epoch": 0.26562453028708854, "flos": 26475612658560.0, "grad_norm": 1.5630139060531236, "language_loss": 0.80299604, "learning_rate": 3.4453244034959196e-06, "loss": 0.88062847, "num_input_tokens_seen": 95541705, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.16943359, "step": 4418, "time_per_iteration": 2.615626096725464 }, { "auxiliary_loss_clip": 0.06501371, "auxiliary_loss_mlp": 0.01275047, "balance_loss_clip": 0.06289642, "balance_loss_mlp": 0.01257368, "epoch": 0.2656846535397565, "flos": 19213475944320.0, "grad_norm": 2.304281451070774, "language_loss": 0.68207622, "learning_rate": 3.445055179644071e-06, "loss": 0.75984037, "num_input_tokens_seen": 95560300, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.17663574, "step": 4419, "time_per_iteration": 2.625227689743042 }, { "auxiliary_loss_clip": 0.06502342, "auxiliary_loss_mlp": 0.01278335, "balance_loss_clip": 0.06290469, "balance_loss_mlp": 0.01259417, "epoch": 0.2657447767924245, "flos": 30558566085120.0, "grad_norm": 1.5829310341226515, "language_loss": 0.80017555, "learning_rate": 3.444785900995585e-06, "loss": 0.87798232, "num_input_tokens_seen": 95580150, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.18933105, "step": 4420, "time_per_iteration": 2.6776857376098633 }, { "auxiliary_loss_clip": 0.0650761, "auxiliary_loss_mlp": 0.01280258, "balance_loss_clip": 0.06290014, "balance_loss_mlp": 0.01261637, "epoch": 0.26580490004509244, "flos": 20928984111360.0, "grad_norm": 2.126032506872954, "language_loss": 0.82064676, "learning_rate": 3.444516567560673e-06, "loss": 0.89852554, "num_input_tokens_seen": 95597570, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.1862793, "step": 4421, "time_per_iteration": 2.5987887382507324 }, { "auxiliary_loss_clip": 0.06495114, "auxiliary_loss_mlp": 0.01274623, "balance_loss_clip": 0.06285845, "balance_loss_mlp": 0.01257302, "epoch": 0.2658650232977604, "flos": 43955845297920.0, "grad_norm": 1.6396806913468545, "language_loss": 0.66383028, "learning_rate": 3.444247179349548e-06, "loss": 0.74152768, "num_input_tokens_seen": 95619415, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.17321777, "step": 4422, "time_per_iteration": 2.7748167514801025 }, { "auxiliary_loss_clip": 0.0650324, "auxiliary_loss_mlp": 0.01280348, "balance_loss_clip": 0.06290881, "balance_loss_mlp": 0.01263301, "epoch": 0.26592514655042837, "flos": 29724256581120.0, "grad_norm": 2.130692166905416, "language_loss": 0.74962246, "learning_rate": 3.4439777363724252e-06, "loss": 0.82745832, "num_input_tokens_seen": 95639155, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.17041016, "step": 4423, "time_per_iteration": 2.680961847305298 }, { "auxiliary_loss_clip": 0.06503473, "auxiliary_loss_mlp": 0.01273164, "balance_loss_clip": 0.06288955, "balance_loss_mlp": 0.01255533, "epoch": 0.26598526980309634, "flos": 46687616110080.0, "grad_norm": 1.5486573077167072, "language_loss": 0.78431815, "learning_rate": 3.443708238639522e-06, "loss": 0.86208451, "num_input_tokens_seen": 95663320, "router_z_loss_clip": 2.14160156, "router_z_loss_mlp": 0.1763916, "step": 4424, "time_per_iteration": 2.921430826187134 }, { "auxiliary_loss_clip": 0.06511664, "auxiliary_loss_mlp": 0.01281503, "balance_loss_clip": 0.06296265, "balance_loss_mlp": 0.01263789, "epoch": 0.2660453930557643, "flos": 11514115025280.0, "grad_norm": 2.1356274936136628, "language_loss": 0.79862726, "learning_rate": 3.4434386861610573e-06, "loss": 0.8765589, "num_input_tokens_seen": 95680260, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.17712402, "step": 4425, "time_per_iteration": 2.53792667388916 }, { "auxiliary_loss_clip": 0.06497131, "auxiliary_loss_mlp": 0.01273698, "balance_loss_clip": 0.06287857, "balance_loss_mlp": 0.01257318, "epoch": 0.26610551630843227, "flos": 24798692096640.0, "grad_norm": 1.7116427565183023, "language_loss": 0.81624401, "learning_rate": 3.4431690789472532e-06, "loss": 0.89395231, "num_input_tokens_seen": 95701140, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.16381836, "step": 4426, "time_per_iteration": 2.6179869174957275 }, { "auxiliary_loss_clip": 0.06506111, "auxiliary_loss_mlp": 0.01277136, "balance_loss_clip": 0.0629181, "balance_loss_mlp": 0.01258778, "epoch": 0.26616563956110023, "flos": 27643793955840.0, "grad_norm": 1.6332526647739078, "language_loss": 0.77190882, "learning_rate": 3.442899417008333e-06, "loss": 0.84974122, "num_input_tokens_seen": 95722060, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.18359375, "step": 4427, "time_per_iteration": 2.6319053173065186 }, { "auxiliary_loss_clip": 0.06502724, "auxiliary_loss_mlp": 0.01277823, "balance_loss_clip": 0.06296358, "balance_loss_mlp": 0.01261479, "epoch": 0.26622576281376825, "flos": 28369887511680.0, "grad_norm": 1.7506048850517548, "language_loss": 0.77649987, "learning_rate": 3.4426297003545227e-06, "loss": 0.85430539, "num_input_tokens_seen": 95742495, "router_z_loss_clip": 2.06347656, "router_z_loss_mlp": 0.16345215, "step": 4428, "time_per_iteration": 2.639115571975708 }, { "auxiliary_loss_clip": 0.0650764, "auxiliary_loss_mlp": 0.01271626, "balance_loss_clip": 0.06293125, "balance_loss_mlp": 0.01255151, "epoch": 0.2662858860664362, "flos": 18047265217920.0, "grad_norm": 1.89695500616738, "language_loss": 0.83285224, "learning_rate": 3.4423599289960495e-06, "loss": 0.91064489, "num_input_tokens_seen": 95761510, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.16479492, "step": 4429, "time_per_iteration": 2.55387806892395 }, { "auxiliary_loss_clip": 0.06508815, "auxiliary_loss_mlp": 0.01277539, "balance_loss_clip": 0.06299278, "balance_loss_mlp": 0.0125837, "epoch": 0.2663460093191042, "flos": 22752163175040.0, "grad_norm": 1.5023501880346801, "language_loss": 0.72425258, "learning_rate": 3.442090102943143e-06, "loss": 0.80211604, "num_input_tokens_seen": 95782385, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.19165039, "step": 4430, "time_per_iteration": 2.599900007247925 }, { "auxiliary_loss_clip": 0.06503117, "auxiliary_loss_mlp": 0.01282047, "balance_loss_clip": 0.06291191, "balance_loss_mlp": 0.01263605, "epoch": 0.26640613257177215, "flos": 16514422951680.0, "grad_norm": 1.9985731300512781, "language_loss": 0.82816279, "learning_rate": 3.441820222206035e-06, "loss": 0.90601438, "num_input_tokens_seen": 95800595, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.18432617, "step": 4431, "time_per_iteration": 2.5549356937408447 }, { "auxiliary_loss_clip": 0.06516881, "auxiliary_loss_mlp": 0.01284268, "balance_loss_clip": 0.06296997, "balance_loss_mlp": 0.01265087, "epoch": 0.2664662558244401, "flos": 23082638878080.0, "grad_norm": 3.253681540181593, "language_loss": 0.76707721, "learning_rate": 3.44155028679496e-06, "loss": 0.84508866, "num_input_tokens_seen": 95818480, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.19177246, "step": 4432, "time_per_iteration": 2.60127329826355 }, { "auxiliary_loss_clip": 0.06512128, "auxiliary_loss_mlp": 0.01279997, "balance_loss_clip": 0.06297196, "balance_loss_mlp": 0.01261162, "epoch": 0.2665263790771081, "flos": 23776098468480.0, "grad_norm": 1.9767723761458258, "language_loss": 0.83288932, "learning_rate": 3.441280296720154e-06, "loss": 0.91081053, "num_input_tokens_seen": 95837205, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.18847656, "step": 4433, "time_per_iteration": 2.600412607192993 }, { "auxiliary_loss_clip": 0.06504966, "auxiliary_loss_mlp": 0.01281246, "balance_loss_clip": 0.06292742, "balance_loss_mlp": 0.01263365, "epoch": 0.26658650232977604, "flos": 28008748414080.0, "grad_norm": 1.9771282022386594, "language_loss": 0.77057511, "learning_rate": 3.441010251991854e-06, "loss": 0.84843719, "num_input_tokens_seen": 95858395, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.17883301, "step": 4434, "time_per_iteration": 2.6709821224212646 }, { "auxiliary_loss_clip": 0.06506929, "auxiliary_loss_mlp": 0.01279372, "balance_loss_clip": 0.06297025, "balance_loss_mlp": 0.01261074, "epoch": 0.266646625582444, "flos": 22170147914880.0, "grad_norm": 1.8463991635118726, "language_loss": 0.83059406, "learning_rate": 3.440740152620301e-06, "loss": 0.90845704, "num_input_tokens_seen": 95877875, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.18286133, "step": 4435, "time_per_iteration": 4.0801496505737305 }, { "auxiliary_loss_clip": 0.0651507, "auxiliary_loss_mlp": 0.01276006, "balance_loss_clip": 0.06296155, "balance_loss_mlp": 0.01256444, "epoch": 0.266706748835112, "flos": 27860687049600.0, "grad_norm": 2.6505387596396432, "language_loss": 0.88102698, "learning_rate": 3.4404699986157376e-06, "loss": 0.95893776, "num_input_tokens_seen": 95895820, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.19580078, "step": 4436, "time_per_iteration": 2.6276185512542725 }, { "auxiliary_loss_clip": 0.06510176, "auxiliary_loss_mlp": 0.01274061, "balance_loss_clip": 0.06293434, "balance_loss_mlp": 0.01256681, "epoch": 0.26676687208777994, "flos": 25819231299840.0, "grad_norm": 1.3518314529491535, "language_loss": 0.79420948, "learning_rate": 3.440199789988407e-06, "loss": 0.87205184, "num_input_tokens_seen": 95918025, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.17382812, "step": 4437, "time_per_iteration": 2.6476900577545166 }, { "auxiliary_loss_clip": 0.06507044, "auxiliary_loss_mlp": 0.01276286, "balance_loss_clip": 0.06293105, "balance_loss_mlp": 0.01258178, "epoch": 0.2668269953404479, "flos": 36073399207680.0, "grad_norm": 2.414056796205333, "language_loss": 0.64990413, "learning_rate": 3.439929526748556e-06, "loss": 0.72773749, "num_input_tokens_seen": 95937725, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.18115234, "step": 4438, "time_per_iteration": 2.706242084503174 }, { "auxiliary_loss_clip": 0.06513868, "auxiliary_loss_mlp": 0.01286695, "balance_loss_clip": 0.06296714, "balance_loss_mlp": 0.01268504, "epoch": 0.26688711859311587, "flos": 26576994499200.0, "grad_norm": 1.7803232651760998, "language_loss": 0.75992894, "learning_rate": 3.4396592089064334e-06, "loss": 0.83793461, "num_input_tokens_seen": 95956335, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.18188477, "step": 4439, "time_per_iteration": 2.6037914752960205 }, { "auxiliary_loss_clip": 0.06512718, "auxiliary_loss_mlp": 0.01274749, "balance_loss_clip": 0.062972, "balance_loss_mlp": 0.0125552, "epoch": 0.26694724184578383, "flos": 26768968202880.0, "grad_norm": 1.8038635611044946, "language_loss": 0.71812165, "learning_rate": 3.4393888364722897e-06, "loss": 0.79599637, "num_input_tokens_seen": 95977135, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.19238281, "step": 4440, "time_per_iteration": 2.6089446544647217 }, { "auxiliary_loss_clip": 0.06513664, "auxiliary_loss_mlp": 0.01286116, "balance_loss_clip": 0.06297025, "balance_loss_mlp": 0.01266923, "epoch": 0.2670073650984518, "flos": 20965894634880.0, "grad_norm": 1.8518482536087417, "language_loss": 0.67696023, "learning_rate": 3.439118409456376e-06, "loss": 0.75495797, "num_input_tokens_seen": 95995435, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.19189453, "step": 4441, "time_per_iteration": 3.9980976581573486 }, { "auxiliary_loss_clip": 0.06511506, "auxiliary_loss_mlp": 0.01279744, "balance_loss_clip": 0.06294356, "balance_loss_mlp": 0.01260838, "epoch": 0.2670674883511198, "flos": 28373577091200.0, "grad_norm": 1.5847813019737171, "language_loss": 0.7704525, "learning_rate": 3.4388479278689486e-06, "loss": 0.84836495, "num_input_tokens_seen": 96016340, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.18896484, "step": 4442, "time_per_iteration": 2.635220766067505 }, { "auxiliary_loss_clip": 0.06393145, "auxiliary_loss_mlp": 0.01266112, "balance_loss_clip": 0.06288883, "balance_loss_mlp": 0.01261501, "epoch": 0.2671276116037878, "flos": 58989010970880.0, "grad_norm": 0.9811116490763919, "language_loss": 0.61805856, "learning_rate": 3.4385773917202637e-06, "loss": 0.69465113, "num_input_tokens_seen": 96071205, "router_z_loss_clip": 1.04394531, "router_z_loss_mlp": 0.04605103, "step": 4443, "time_per_iteration": 3.1030948162078857 }, { "auxiliary_loss_clip": 0.06511617, "auxiliary_loss_mlp": 0.01281968, "balance_loss_clip": 0.06295185, "balance_loss_mlp": 0.01262894, "epoch": 0.26718773485645575, "flos": 43955132538240.0, "grad_norm": 1.5228077510518143, "language_loss": 0.76893061, "learning_rate": 3.4383068010205793e-06, "loss": 0.84686649, "num_input_tokens_seen": 96094240, "router_z_loss_clip": 2.16503906, "router_z_loss_mlp": 0.19067383, "step": 4444, "time_per_iteration": 2.7625303268432617 }, { "auxiliary_loss_clip": 0.06511895, "auxiliary_loss_mlp": 0.01279934, "balance_loss_clip": 0.06298471, "balance_loss_mlp": 0.01260109, "epoch": 0.2672478581091237, "flos": 25235329322880.0, "grad_norm": 1.7891926006605385, "language_loss": 0.80819893, "learning_rate": 3.438036155780158e-06, "loss": 0.88611722, "num_input_tokens_seen": 96114105, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.19824219, "step": 4445, "time_per_iteration": 2.6061182022094727 }, { "auxiliary_loss_clip": 0.06513037, "auxiliary_loss_mlp": 0.01273867, "balance_loss_clip": 0.06296931, "balance_loss_mlp": 0.01255867, "epoch": 0.2673079813617917, "flos": 15273594564480.0, "grad_norm": 2.047861359150934, "language_loss": 0.89662784, "learning_rate": 3.43776545600926e-06, "loss": 0.97449684, "num_input_tokens_seen": 96132140, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.18005371, "step": 4446, "time_per_iteration": 2.562359571456909 }, { "auxiliary_loss_clip": 0.06517742, "auxiliary_loss_mlp": 0.01272356, "balance_loss_clip": 0.06299888, "balance_loss_mlp": 0.0125457, "epoch": 0.26736810461445965, "flos": 25819944059520.0, "grad_norm": 1.7001565277787516, "language_loss": 0.68693757, "learning_rate": 3.437494701718153e-06, "loss": 0.76483852, "num_input_tokens_seen": 96152090, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.17785645, "step": 4447, "time_per_iteration": 2.614624500274658 }, { "auxiliary_loss_clip": 0.06508717, "auxiliary_loss_mlp": 0.01275903, "balance_loss_clip": 0.06293362, "balance_loss_mlp": 0.01257557, "epoch": 0.2674282278671276, "flos": 24318981072000.0, "grad_norm": 1.940707818803994, "language_loss": 0.83625352, "learning_rate": 3.4372238929171026e-06, "loss": 0.91409981, "num_input_tokens_seen": 96170015, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.18347168, "step": 4448, "time_per_iteration": 4.05219030380249 }, { "auxiliary_loss_clip": 0.06503204, "auxiliary_loss_mlp": 0.01276585, "balance_loss_clip": 0.06293215, "balance_loss_mlp": 0.01258298, "epoch": 0.2674883511197956, "flos": 22821330320640.0, "grad_norm": 1.498399930163992, "language_loss": 0.84754938, "learning_rate": 3.436953029616378e-06, "loss": 0.92534727, "num_input_tokens_seen": 96188065, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.18310547, "step": 4449, "time_per_iteration": 4.031713485717773 }, { "auxiliary_loss_clip": 0.06524255, "auxiliary_loss_mlp": 0.01281758, "balance_loss_clip": 0.06297394, "balance_loss_mlp": 0.01261778, "epoch": 0.26754847437246354, "flos": 25376514652800.0, "grad_norm": 1.6078856796871728, "language_loss": 0.84416771, "learning_rate": 3.4366821118262506e-06, "loss": 0.92222786, "num_input_tokens_seen": 96205780, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.19970703, "step": 4450, "time_per_iteration": 2.608973741531372 }, { "auxiliary_loss_clip": 0.06499749, "auxiliary_loss_mlp": 0.01275124, "balance_loss_clip": 0.06289503, "balance_loss_mlp": 0.01257195, "epoch": 0.2676085976251315, "flos": 20236698478080.0, "grad_norm": 2.0972189163787176, "language_loss": 0.81661886, "learning_rate": 3.4364111395569937e-06, "loss": 0.89436758, "num_input_tokens_seen": 96224990, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.17932129, "step": 4451, "time_per_iteration": 2.575209617614746 }, { "auxiliary_loss_clip": 0.06508891, "auxiliary_loss_mlp": 0.01273226, "balance_loss_clip": 0.06298854, "balance_loss_mlp": 0.01255344, "epoch": 0.26766872087779947, "flos": 28045784718720.0, "grad_norm": 1.621000723119536, "language_loss": 0.86607033, "learning_rate": 3.436140112818882e-06, "loss": 0.94389153, "num_input_tokens_seen": 96245345, "router_z_loss_clip": 2.09863281, "router_z_loss_mlp": 0.17871094, "step": 4452, "time_per_iteration": 2.6077945232391357 }, { "auxiliary_loss_clip": 0.06510705, "auxiliary_loss_mlp": 0.01278127, "balance_loss_clip": 0.06297439, "balance_loss_mlp": 0.01259852, "epoch": 0.26772884413046744, "flos": 18329803585920.0, "grad_norm": 2.0657970070538085, "language_loss": 0.83581412, "learning_rate": 3.435869031622194e-06, "loss": 0.91370249, "num_input_tokens_seen": 96259000, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.18273926, "step": 4453, "time_per_iteration": 2.515562057495117 }, { "auxiliary_loss_clip": 0.06510039, "auxiliary_loss_mlp": 0.01279887, "balance_loss_clip": 0.06297331, "balance_loss_mlp": 0.01261803, "epoch": 0.2677889673831354, "flos": 22134075932160.0, "grad_norm": 6.324126056268082, "language_loss": 0.79696667, "learning_rate": 3.435597895977208e-06, "loss": 0.87486595, "num_input_tokens_seen": 96277000, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.1809082, "step": 4454, "time_per_iteration": 2.5552287101745605 }, { "auxiliary_loss_clip": 0.06509034, "auxiliary_loss_mlp": 0.01279021, "balance_loss_clip": 0.0629263, "balance_loss_mlp": 0.01260615, "epoch": 0.2678490906358034, "flos": 23736001489920.0, "grad_norm": 1.5706277128543242, "language_loss": 0.72825122, "learning_rate": 3.435326705894206e-06, "loss": 0.80613178, "num_input_tokens_seen": 96297010, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.18408203, "step": 4455, "time_per_iteration": 2.575374126434326 }, { "auxiliary_loss_clip": 0.06500682, "auxiliary_loss_mlp": 0.01276778, "balance_loss_clip": 0.06292551, "balance_loss_mlp": 0.0125935, "epoch": 0.2679092138884714, "flos": 21769414963200.0, "grad_norm": 1.5240608068129897, "language_loss": 0.73967534, "learning_rate": 3.435055461383471e-06, "loss": 0.81744999, "num_input_tokens_seen": 96315780, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.17419434, "step": 4456, "time_per_iteration": 2.5701615810394287 }, { "auxiliary_loss_clip": 0.06520003, "auxiliary_loss_mlp": 0.01274921, "balance_loss_clip": 0.06300326, "balance_loss_mlp": 0.01256241, "epoch": 0.26796933714113935, "flos": 19866670848000.0, "grad_norm": 2.7303739374618328, "language_loss": 0.71670878, "learning_rate": 3.4347841624552896e-06, "loss": 0.79465795, "num_input_tokens_seen": 96333465, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.18676758, "step": 4457, "time_per_iteration": 2.5666162967681885 }, { "auxiliary_loss_clip": 0.06515722, "auxiliary_loss_mlp": 0.01275678, "balance_loss_clip": 0.06300897, "balance_loss_mlp": 0.01256628, "epoch": 0.2680294603938073, "flos": 20054116431360.0, "grad_norm": 1.8039451462381555, "language_loss": 0.79452729, "learning_rate": 3.4345128091199493e-06, "loss": 0.87244135, "num_input_tokens_seen": 96352005, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.19055176, "step": 4458, "time_per_iteration": 2.5615973472595215 }, { "auxiliary_loss_clip": 0.06426029, "auxiliary_loss_mlp": 0.01255818, "balance_loss_clip": 0.06322937, "balance_loss_mlp": 0.0125124, "epoch": 0.2680895836464753, "flos": 72134918334720.0, "grad_norm": 0.8316287579186351, "language_loss": 0.58614933, "learning_rate": 3.434241401387739e-06, "loss": 0.6629678, "num_input_tokens_seen": 96406265, "router_z_loss_clip": 1.03125, "router_z_loss_mlp": 0.04577637, "step": 4459, "time_per_iteration": 3.171090841293335 }, { "auxiliary_loss_clip": 0.06510539, "auxiliary_loss_mlp": 0.01274022, "balance_loss_clip": 0.06298032, "balance_loss_mlp": 0.01256475, "epoch": 0.26814970689914325, "flos": 20455310580480.0, "grad_norm": 2.00171316225356, "language_loss": 0.85465062, "learning_rate": 3.4339699392689507e-06, "loss": 0.93249631, "num_input_tokens_seen": 96425225, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.17541504, "step": 4460, "time_per_iteration": 2.5653111934661865 }, { "auxiliary_loss_clip": 0.06509854, "auxiliary_loss_mlp": 0.01275908, "balance_loss_clip": 0.0629963, "balance_loss_mlp": 0.01258707, "epoch": 0.2682098301518112, "flos": 17572459656960.0, "grad_norm": 2.1008704407762573, "language_loss": 0.69327152, "learning_rate": 3.4336984227738796e-06, "loss": 0.77112919, "num_input_tokens_seen": 96443780, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.17199707, "step": 4461, "time_per_iteration": 2.549633741378784 }, { "auxiliary_loss_clip": 0.06503313, "auxiliary_loss_mlp": 0.01281138, "balance_loss_clip": 0.06292406, "balance_loss_mlp": 0.01262136, "epoch": 0.2682699534044792, "flos": 18339237169920.0, "grad_norm": 2.299591091736878, "language_loss": 0.67844474, "learning_rate": 3.43342685191282e-06, "loss": 0.75628924, "num_input_tokens_seen": 96464530, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.19006348, "step": 4462, "time_per_iteration": 2.6236398220062256 }, { "auxiliary_loss_clip": 0.06503171, "auxiliary_loss_mlp": 0.01278145, "balance_loss_clip": 0.06293099, "balance_loss_mlp": 0.01259703, "epoch": 0.26833007665714714, "flos": 25308311829120.0, "grad_norm": 1.6063471945264247, "language_loss": 0.70000196, "learning_rate": 3.4331552266960705e-06, "loss": 0.7778151, "num_input_tokens_seen": 96483345, "router_z_loss_clip": 2.09863281, "router_z_loss_mlp": 0.18444824, "step": 4463, "time_per_iteration": 2.6138947010040283 }, { "auxiliary_loss_clip": 0.06510614, "auxiliary_loss_mlp": 0.01277484, "balance_loss_clip": 0.06296065, "balance_loss_mlp": 0.01258852, "epoch": 0.2683901999098151, "flos": 16104046780800.0, "grad_norm": 2.9524534131840183, "language_loss": 0.77923024, "learning_rate": 3.432883547133931e-06, "loss": 0.85711122, "num_input_tokens_seen": 96498305, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.1862793, "step": 4464, "time_per_iteration": 2.5292139053344727 }, { "auxiliary_loss_clip": 0.06504047, "auxiliary_loss_mlp": 0.01275705, "balance_loss_clip": 0.06291686, "balance_loss_mlp": 0.01257621, "epoch": 0.2684503231624831, "flos": 27315414604800.0, "grad_norm": 1.6677832740601983, "language_loss": 0.71171695, "learning_rate": 3.432611813236704e-06, "loss": 0.78951448, "num_input_tokens_seen": 96519740, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.18078613, "step": 4465, "time_per_iteration": 2.611098051071167 }, { "auxiliary_loss_clip": 0.06416272, "auxiliary_loss_mlp": 0.01270061, "balance_loss_clip": 0.06313965, "balance_loss_mlp": 0.01265397, "epoch": 0.26851044641515104, "flos": 71879060292480.0, "grad_norm": 0.6718259270687664, "language_loss": 0.52733153, "learning_rate": 3.4323400250146943e-06, "loss": 0.60419488, "num_input_tokens_seen": 96588870, "router_z_loss_clip": 1.0234375, "router_z_loss_mlp": 0.04656982, "step": 4466, "time_per_iteration": 3.3362045288085938 }, { "auxiliary_loss_clip": 0.06502537, "auxiliary_loss_mlp": 0.01281948, "balance_loss_clip": 0.06290551, "balance_loss_mlp": 0.01262791, "epoch": 0.268570569667819, "flos": 18739676632320.0, "grad_norm": 2.0651032796710247, "language_loss": 0.7471652, "learning_rate": 3.4320681824782057e-06, "loss": 0.82501006, "num_input_tokens_seen": 96605100, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.19165039, "step": 4467, "time_per_iteration": 2.6556060314178467 }, { "auxiliary_loss_clip": 0.06514826, "auxiliary_loss_mlp": 0.01272989, "balance_loss_clip": 0.06299094, "balance_loss_mlp": 0.01254858, "epoch": 0.268630692920487, "flos": 18182832324480.0, "grad_norm": 2.0814619254283264, "language_loss": 0.81113607, "learning_rate": 3.4317962856375493e-06, "loss": 0.88901418, "num_input_tokens_seen": 96621410, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.18151855, "step": 4468, "time_per_iteration": 2.565593957901001 }, { "auxiliary_loss_clip": 0.0640231, "auxiliary_loss_mlp": 0.01254327, "balance_loss_clip": 0.0630027, "balance_loss_mlp": 0.01249192, "epoch": 0.268690816173155, "flos": 68754229176960.0, "grad_norm": 0.9473015998458432, "language_loss": 0.5938369, "learning_rate": 3.4315243345030334e-06, "loss": 0.6704033, "num_input_tokens_seen": 96684810, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 0.05136108, "step": 4469, "time_per_iteration": 3.26700496673584 }, { "auxiliary_loss_clip": 0.06506151, "auxiliary_loss_mlp": 0.01275766, "balance_loss_clip": 0.06291535, "balance_loss_mlp": 0.01255739, "epoch": 0.26875093942582295, "flos": 23300160877440.0, "grad_norm": 20.285267781683856, "language_loss": 0.81974638, "learning_rate": 3.431252329084972e-06, "loss": 0.8975656, "num_input_tokens_seen": 96701920, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.20019531, "step": 4470, "time_per_iteration": 2.6157078742980957 }, { "auxiliary_loss_clip": 0.06497768, "auxiliary_loss_mlp": 0.01278555, "balance_loss_clip": 0.06291942, "balance_loss_mlp": 0.01260828, "epoch": 0.2688110626784909, "flos": 21549880465920.0, "grad_norm": 1.5604864904500837, "language_loss": 0.83195758, "learning_rate": 3.4309802693936786e-06, "loss": 0.90972084, "num_input_tokens_seen": 96721260, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.17736816, "step": 4471, "time_per_iteration": 2.6723673343658447 }, { "auxiliary_loss_clip": 0.06496139, "auxiliary_loss_mlp": 0.01278035, "balance_loss_clip": 0.06290037, "balance_loss_mlp": 0.01260464, "epoch": 0.2688711859311589, "flos": 28407804284160.0, "grad_norm": 2.3894719787037926, "language_loss": 0.69980145, "learning_rate": 3.43070815543947e-06, "loss": 0.77754319, "num_input_tokens_seen": 96740385, "router_z_loss_clip": 2.05859375, "router_z_loss_mlp": 0.17565918, "step": 4472, "time_per_iteration": 2.6422908306121826 }, { "auxiliary_loss_clip": 0.0650344, "auxiliary_loss_mlp": 0.01276993, "balance_loss_clip": 0.06292557, "balance_loss_mlp": 0.01259612, "epoch": 0.26893130918382685, "flos": 26002148762880.0, "grad_norm": 1.7961180485071142, "language_loss": 0.68485606, "learning_rate": 3.4304359872326656e-06, "loss": 0.76266038, "num_input_tokens_seen": 96761860, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.1739502, "step": 4473, "time_per_iteration": 2.7366552352905273 }, { "auxiliary_loss_clip": 0.06498845, "auxiliary_loss_mlp": 0.01280742, "balance_loss_clip": 0.06293385, "balance_loss_mlp": 0.01262503, "epoch": 0.2689914324364948, "flos": 20345878748160.0, "grad_norm": 1.6703364610470275, "language_loss": 0.83732975, "learning_rate": 3.4301637647835843e-06, "loss": 0.91512561, "num_input_tokens_seen": 96781890, "router_z_loss_clip": 2.05566406, "router_z_loss_mlp": 0.18225098, "step": 4474, "time_per_iteration": 2.561577081680298 }, { "auxiliary_loss_clip": 0.06497613, "auxiliary_loss_mlp": 0.01276725, "balance_loss_clip": 0.06292687, "balance_loss_mlp": 0.01259368, "epoch": 0.2690515556891628, "flos": 19470759505920.0, "grad_norm": 1.8778654943840034, "language_loss": 0.71313357, "learning_rate": 3.4298914881025494e-06, "loss": 0.79087698, "num_input_tokens_seen": 96800390, "router_z_loss_clip": 2.04785156, "router_z_loss_mlp": 0.17370605, "step": 4475, "time_per_iteration": 3.941702127456665 }, { "auxiliary_loss_clip": 0.06498078, "auxiliary_loss_mlp": 0.01275506, "balance_loss_clip": 0.06284053, "balance_loss_mlp": 0.0125673, "epoch": 0.26911167894183075, "flos": 18151875440640.0, "grad_norm": 1.6207366289249552, "language_loss": 0.73493433, "learning_rate": 3.4296191571998863e-06, "loss": 0.81267011, "num_input_tokens_seen": 96816685, "router_z_loss_clip": 2.13964844, "router_z_loss_mlp": 0.18786621, "step": 4476, "time_per_iteration": 2.5486178398132324 }, { "auxiliary_loss_clip": 0.06499598, "auxiliary_loss_mlp": 0.01272585, "balance_loss_clip": 0.06287789, "balance_loss_mlp": 0.01256278, "epoch": 0.2691718021944987, "flos": 19981385487360.0, "grad_norm": 1.8721724729331772, "language_loss": 0.81359333, "learning_rate": 3.429346772085922e-06, "loss": 0.89131516, "num_input_tokens_seen": 96836285, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.16320801, "step": 4477, "time_per_iteration": 2.5669665336608887 }, { "auxiliary_loss_clip": 0.06503119, "auxiliary_loss_mlp": 0.0127688, "balance_loss_clip": 0.06286581, "balance_loss_mlp": 0.01257866, "epoch": 0.2692319254471667, "flos": 37455622560000.0, "grad_norm": 2.076433880514757, "language_loss": 0.66029894, "learning_rate": 3.429074332770984e-06, "loss": 0.73809898, "num_input_tokens_seen": 96857745, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.19018555, "step": 4478, "time_per_iteration": 2.7186408042907715 }, { "auxiliary_loss_clip": 0.06500408, "auxiliary_loss_mlp": 0.01278113, "balance_loss_clip": 0.06289517, "balance_loss_mlp": 0.01259767, "epoch": 0.26929204869983464, "flos": 22134411348480.0, "grad_norm": 1.817401035166408, "language_loss": 0.81540006, "learning_rate": 3.4288018392654047e-06, "loss": 0.89318526, "num_input_tokens_seen": 96877295, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.18347168, "step": 4479, "time_per_iteration": 2.5711212158203125 }, { "auxiliary_loss_clip": 0.06506255, "auxiliary_loss_mlp": 0.01273358, "balance_loss_clip": 0.06293222, "balance_loss_mlp": 0.01255405, "epoch": 0.2693521719525026, "flos": 19799055002880.0, "grad_norm": 2.182557272370516, "language_loss": 0.81788707, "learning_rate": 3.4285292915795166e-06, "loss": 0.89568317, "num_input_tokens_seen": 96896160, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.17956543, "step": 4480, "time_per_iteration": 3.9993090629577637 }, { "auxiliary_loss_clip": 0.064981, "auxiliary_loss_mlp": 0.01277535, "balance_loss_clip": 0.06292555, "balance_loss_mlp": 0.01260786, "epoch": 0.2694122952051706, "flos": 21000415317120.0, "grad_norm": 2.542599332881626, "language_loss": 0.78306329, "learning_rate": 3.4282566897236543e-06, "loss": 0.86081964, "num_input_tokens_seen": 96915410, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.16748047, "step": 4481, "time_per_iteration": 2.587355375289917 }, { "auxiliary_loss_clip": 0.06499396, "auxiliary_loss_mlp": 0.01271561, "balance_loss_clip": 0.06287938, "balance_loss_mlp": 0.01253418, "epoch": 0.2694724184578386, "flos": 25856519166720.0, "grad_norm": 2.175327182027217, "language_loss": 0.74578106, "learning_rate": 3.4279840337081547e-06, "loss": 0.82349068, "num_input_tokens_seen": 96937865, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.18151855, "step": 4482, "time_per_iteration": 2.626589298248291 }, { "auxiliary_loss_clip": 0.06503494, "auxiliary_loss_mlp": 0.01278897, "balance_loss_clip": 0.06290929, "balance_loss_mlp": 0.01260622, "epoch": 0.26953254171050656, "flos": 21733594542720.0, "grad_norm": 2.0605237800880776, "language_loss": 0.73063147, "learning_rate": 3.4277113235433584e-06, "loss": 0.80845535, "num_input_tokens_seen": 96957710, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.18273926, "step": 4483, "time_per_iteration": 2.5786380767822266 }, { "auxiliary_loss_clip": 0.06507446, "auxiliary_loss_mlp": 0.01279475, "balance_loss_clip": 0.06289751, "balance_loss_mlp": 0.01260163, "epoch": 0.2695926649631745, "flos": 19689078119040.0, "grad_norm": 2.7295891100654965, "language_loss": 0.87697291, "learning_rate": 3.427438559239605e-06, "loss": 0.95484209, "num_input_tokens_seen": 96975890, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.1932373, "step": 4484, "time_per_iteration": 2.5522146224975586 }, { "auxiliary_loss_clip": 0.06501345, "auxiliary_loss_mlp": 0.01275078, "balance_loss_clip": 0.0628607, "balance_loss_mlp": 0.01256695, "epoch": 0.2696527882158425, "flos": 32894257847040.0, "grad_norm": 1.4847683080168765, "language_loss": 0.67257035, "learning_rate": 3.427165740807239e-06, "loss": 0.75033456, "num_input_tokens_seen": 96998595, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.18371582, "step": 4485, "time_per_iteration": 2.663280725479126 }, { "auxiliary_loss_clip": 0.06503005, "auxiliary_loss_mlp": 0.01279159, "balance_loss_clip": 0.06289633, "balance_loss_mlp": 0.0126148, "epoch": 0.26971291146851045, "flos": 12128806177920.0, "grad_norm": 9.600876498295234, "language_loss": 0.73337936, "learning_rate": 3.426892868256604e-06, "loss": 0.81120092, "num_input_tokens_seen": 97013715, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.17663574, "step": 4486, "time_per_iteration": 2.51863956451416 }, { "auxiliary_loss_clip": 0.06504488, "auxiliary_loss_mlp": 0.01277654, "balance_loss_clip": 0.06288889, "balance_loss_mlp": 0.01259308, "epoch": 0.2697730347211784, "flos": 22640467282560.0, "grad_norm": 2.475990702553355, "language_loss": 0.84418702, "learning_rate": 3.4266199415980495e-06, "loss": 0.92200851, "num_input_tokens_seen": 97031570, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.18334961, "step": 4487, "time_per_iteration": 3.9759082794189453 }, { "auxiliary_loss_clip": 0.06507059, "auxiliary_loss_mlp": 0.01281438, "balance_loss_clip": 0.06291755, "balance_loss_mlp": 0.01261387, "epoch": 0.2698331579738464, "flos": 23519695374720.0, "grad_norm": 4.843792978756702, "language_loss": 0.72674811, "learning_rate": 3.4263469608419234e-06, "loss": 0.80463302, "num_input_tokens_seen": 97049815, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.20056152, "step": 4488, "time_per_iteration": 4.024530410766602 }, { "auxiliary_loss_clip": 0.06502008, "auxiliary_loss_mlp": 0.01278611, "balance_loss_clip": 0.06288217, "balance_loss_mlp": 0.01259752, "epoch": 0.26989328122651435, "flos": 24647360423040.0, "grad_norm": 1.6613977924226355, "language_loss": 0.84402353, "learning_rate": 3.426073925998578e-06, "loss": 0.9218297, "num_input_tokens_seen": 97067570, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.1887207, "step": 4489, "time_per_iteration": 2.591001272201538 }, { "auxiliary_loss_clip": 0.06506231, "auxiliary_loss_mlp": 0.01283782, "balance_loss_clip": 0.06289077, "balance_loss_mlp": 0.01264458, "epoch": 0.2699534044791823, "flos": 10775904554880.0, "grad_norm": 5.988480665914319, "language_loss": 0.90016681, "learning_rate": 3.4258008370783656e-06, "loss": 0.97806686, "num_input_tokens_seen": 97082180, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.19299316, "step": 4490, "time_per_iteration": 2.50667142868042 }, { "auxiliary_loss_clip": 0.06497735, "auxiliary_loss_mlp": 0.01274844, "balance_loss_clip": 0.06291477, "balance_loss_mlp": 0.01257225, "epoch": 0.2700135277318503, "flos": 36180021928320.0, "grad_norm": 1.7237950764746022, "language_loss": 0.73478562, "learning_rate": 3.4255276940916434e-06, "loss": 0.81251144, "num_input_tokens_seen": 97103470, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.17614746, "step": 4491, "time_per_iteration": 2.691051721572876 }, { "auxiliary_loss_clip": 0.06503892, "auxiliary_loss_mlp": 0.01276249, "balance_loss_clip": 0.06291365, "balance_loss_mlp": 0.01258046, "epoch": 0.27007365098451824, "flos": 17424020949120.0, "grad_norm": 2.2037075796386643, "language_loss": 0.7451033, "learning_rate": 3.4252544970487676e-06, "loss": 0.82290471, "num_input_tokens_seen": 97118100, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.18225098, "step": 4492, "time_per_iteration": 2.5438201427459717 }, { "auxiliary_loss_clip": 0.06500311, "auxiliary_loss_mlp": 0.0127101, "balance_loss_clip": 0.06290865, "balance_loss_mlp": 0.01253319, "epoch": 0.2701337742371862, "flos": 23192448053760.0, "grad_norm": 2.0483974269361735, "language_loss": 0.89109176, "learning_rate": 3.4249812459600986e-06, "loss": 0.96880496, "num_input_tokens_seen": 97136765, "router_z_loss_clip": 2.09472656, "router_z_loss_mlp": 0.17700195, "step": 4493, "time_per_iteration": 2.679621934890747 }, { "auxiliary_loss_clip": 0.06505674, "auxiliary_loss_mlp": 0.01275551, "balance_loss_clip": 0.06295248, "balance_loss_mlp": 0.0125879, "epoch": 0.2701938974898542, "flos": 24396365917440.0, "grad_norm": 1.7107631672603725, "language_loss": 0.71828574, "learning_rate": 3.424707940835998e-06, "loss": 0.79609799, "num_input_tokens_seen": 97157470, "router_z_loss_clip": 2.10644531, "router_z_loss_mlp": 0.16772461, "step": 4494, "time_per_iteration": 2.613224744796753 }, { "auxiliary_loss_clip": 0.06494216, "auxiliary_loss_mlp": 0.01272864, "balance_loss_clip": 0.06287485, "balance_loss_mlp": 0.01255423, "epoch": 0.2702540207425222, "flos": 26221641333120.0, "grad_norm": 1.9510281532196194, "language_loss": 0.86930835, "learning_rate": 3.42443458168683e-06, "loss": 0.94697917, "num_input_tokens_seen": 97176905, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.17443848, "step": 4495, "time_per_iteration": 2.607048988342285 }, { "auxiliary_loss_clip": 0.06502461, "auxiliary_loss_mlp": 0.01276039, "balance_loss_clip": 0.06292065, "balance_loss_mlp": 0.01258956, "epoch": 0.27031414399519016, "flos": 22932439234560.0, "grad_norm": 5.8825949568071, "language_loss": 0.76943445, "learning_rate": 3.424161168522959e-06, "loss": 0.84721947, "num_input_tokens_seen": 97196380, "router_z_loss_clip": 2.10253906, "router_z_loss_mlp": 0.17077637, "step": 4496, "time_per_iteration": 2.5669126510620117 }, { "auxiliary_loss_clip": 0.06396413, "auxiliary_loss_mlp": 0.01257099, "balance_loss_clip": 0.0629441, "balance_loss_mlp": 0.01252334, "epoch": 0.2703742672478581, "flos": 63037904912640.0, "grad_norm": 0.6565752727624057, "language_loss": 0.49939889, "learning_rate": 3.423887701354754e-06, "loss": 0.57593399, "num_input_tokens_seen": 97260100, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 0.0475769, "step": 4497, "time_per_iteration": 3.227477550506592 }, { "auxiliary_loss_clip": 0.06499335, "auxiliary_loss_mlp": 0.01277131, "balance_loss_clip": 0.06289795, "balance_loss_mlp": 0.012608, "epoch": 0.2704343905005261, "flos": 18846341280000.0, "grad_norm": 3.282172471005798, "language_loss": 0.73326695, "learning_rate": 3.4236141801925847e-06, "loss": 0.81103158, "num_input_tokens_seen": 97277935, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.16320801, "step": 4498, "time_per_iteration": 2.572444200515747 }, { "auxiliary_loss_clip": 0.06380871, "auxiliary_loss_mlp": 0.01258564, "balance_loss_clip": 0.0627809, "balance_loss_mlp": 0.01253691, "epoch": 0.27049451375319405, "flos": 71253635817600.0, "grad_norm": 0.7317361687781341, "language_loss": 0.5898267, "learning_rate": 3.4233406050468237e-06, "loss": 0.66622108, "num_input_tokens_seen": 97338845, "router_z_loss_clip": 1.02929688, "router_z_loss_mlp": 0.04867554, "step": 4499, "time_per_iteration": 3.2181265354156494 }, { "auxiliary_loss_clip": 0.06500889, "auxiliary_loss_mlp": 0.01276041, "balance_loss_clip": 0.06291543, "balance_loss_mlp": 0.01257921, "epoch": 0.270554637005862, "flos": 24285257003520.0, "grad_norm": 1.99686047267916, "language_loss": 0.74329579, "learning_rate": 3.4230669759278438e-06, "loss": 0.82106507, "num_input_tokens_seen": 97356640, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.18115234, "step": 4500, "time_per_iteration": 2.579066753387451 }, { "auxiliary_loss_clip": 0.06498355, "auxiliary_loss_mlp": 0.01275073, "balance_loss_clip": 0.06287724, "balance_loss_mlp": 0.01258098, "epoch": 0.27061476025853, "flos": 17636889047040.0, "grad_norm": 2.477566165997206, "language_loss": 0.81325853, "learning_rate": 3.4227932928460215e-06, "loss": 0.89099276, "num_input_tokens_seen": 97372585, "router_z_loss_clip": 2.10449219, "router_z_loss_mlp": 0.16967773, "step": 4501, "time_per_iteration": 2.536155939102173 }, { "auxiliary_loss_clip": 0.065051, "auxiliary_loss_mlp": 0.01282499, "balance_loss_clip": 0.06291045, "balance_loss_mlp": 0.01264641, "epoch": 0.27067488351119795, "flos": 22716594316800.0, "grad_norm": 1.7700806416122017, "language_loss": 0.73270714, "learning_rate": 3.422519555811735e-06, "loss": 0.81058317, "num_input_tokens_seen": 97393315, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.1784668, "step": 4502, "time_per_iteration": 2.57924747467041 }, { "auxiliary_loss_clip": 0.06507567, "auxiliary_loss_mlp": 0.01275623, "balance_loss_clip": 0.06289744, "balance_loss_mlp": 0.01256693, "epoch": 0.2707350067638659, "flos": 41729333806080.0, "grad_norm": 2.174107248899368, "language_loss": 0.69055599, "learning_rate": 3.4222457648353642e-06, "loss": 0.76838791, "num_input_tokens_seen": 97417860, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.18933105, "step": 4503, "time_per_iteration": 2.7422258853912354 }, { "auxiliary_loss_clip": 0.06503069, "auxiliary_loss_mlp": 0.01282075, "balance_loss_clip": 0.06290968, "balance_loss_mlp": 0.01263466, "epoch": 0.2707951300165339, "flos": 20199159048960.0, "grad_norm": 3.2538886326752188, "language_loss": 0.68790478, "learning_rate": 3.4219719199272918e-06, "loss": 0.76575625, "num_input_tokens_seen": 97436780, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.18603516, "step": 4504, "time_per_iteration": 2.5545036792755127 }, { "auxiliary_loss_clip": 0.06501503, "auxiliary_loss_mlp": 0.01282715, "balance_loss_clip": 0.06291457, "balance_loss_mlp": 0.01263916, "epoch": 0.27085525326920185, "flos": 21440364779520.0, "grad_norm": 1.4286129847552052, "language_loss": 0.75823027, "learning_rate": 3.421698021097902e-06, "loss": 0.83607244, "num_input_tokens_seen": 97456190, "router_z_loss_clip": 2.10253906, "router_z_loss_mlp": 0.18798828, "step": 4505, "time_per_iteration": 2.5660781860351562 }, { "auxiliary_loss_clip": 0.06508805, "auxiliary_loss_mlp": 0.01281405, "balance_loss_clip": 0.06293256, "balance_loss_mlp": 0.01262665, "epoch": 0.2709153765218698, "flos": 17680885240320.0, "grad_norm": 2.870244843131717, "language_loss": 0.74971533, "learning_rate": 3.42142406835758e-06, "loss": 0.82761741, "num_input_tokens_seen": 97474545, "router_z_loss_clip": 2.15722656, "router_z_loss_mlp": 0.1875, "step": 4506, "time_per_iteration": 2.531036376953125 }, { "auxiliary_loss_clip": 0.06504238, "auxiliary_loss_mlp": 0.01280657, "balance_loss_clip": 0.06292301, "balance_loss_mlp": 0.01261298, "epoch": 0.2709754997745378, "flos": 24462136972800.0, "grad_norm": 2.0286675480587157, "language_loss": 0.81408429, "learning_rate": 3.421150061716715e-06, "loss": 0.89193332, "num_input_tokens_seen": 97494520, "router_z_loss_clip": 2.11816406, "router_z_loss_mlp": 0.19360352, "step": 4507, "time_per_iteration": 2.6011712551116943 }, { "auxiliary_loss_clip": 0.06372792, "auxiliary_loss_mlp": 0.01258017, "balance_loss_clip": 0.06270482, "balance_loss_mlp": 0.01253603, "epoch": 0.2710356230272058, "flos": 65229602232960.0, "grad_norm": 0.7126602614820505, "language_loss": 0.50679922, "learning_rate": 3.420876001185698e-06, "loss": 0.58310723, "num_input_tokens_seen": 97552455, "router_z_loss_clip": 1.02148438, "router_z_loss_mlp": 0.04421997, "step": 4508, "time_per_iteration": 3.101318120956421 }, { "auxiliary_loss_clip": 0.06505429, "auxiliary_loss_mlp": 0.01270743, "balance_loss_clip": 0.06296872, "balance_loss_mlp": 0.01254125, "epoch": 0.27109574627987376, "flos": 25491606635520.0, "grad_norm": 3.0543007631587065, "language_loss": 0.74890786, "learning_rate": 3.4206018867749197e-06, "loss": 0.82666963, "num_input_tokens_seen": 97572650, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.16638184, "step": 4509, "time_per_iteration": 2.6224327087402344 }, { "auxiliary_loss_clip": 0.06500581, "auxiliary_loss_mlp": 0.01274526, "balance_loss_clip": 0.06296486, "balance_loss_mlp": 0.01256418, "epoch": 0.2711558695325417, "flos": 19688910410880.0, "grad_norm": 1.6560408889042808, "language_loss": 0.71688282, "learning_rate": 3.4203277184947757e-06, "loss": 0.79463387, "num_input_tokens_seen": 97591150, "router_z_loss_clip": 2.0390625, "router_z_loss_mlp": 0.18127441, "step": 4510, "time_per_iteration": 2.5777792930603027 }, { "auxiliary_loss_clip": 0.06512213, "auxiliary_loss_mlp": 0.012775, "balance_loss_clip": 0.06303266, "balance_loss_mlp": 0.01259691, "epoch": 0.2712159927852097, "flos": 18593627765760.0, "grad_norm": 2.5427317691893507, "language_loss": 0.71052277, "learning_rate": 3.4200534963556627e-06, "loss": 0.78841984, "num_input_tokens_seen": 97607410, "router_z_loss_clip": 2.09082031, "router_z_loss_mlp": 0.17810059, "step": 4511, "time_per_iteration": 2.584895133972168 }, { "auxiliary_loss_clip": 0.06509729, "auxiliary_loss_mlp": 0.01274186, "balance_loss_clip": 0.06296381, "balance_loss_mlp": 0.01255983, "epoch": 0.27127611603787766, "flos": 25637403939840.0, "grad_norm": 2.420972614420113, "language_loss": 0.81487632, "learning_rate": 3.419779220367979e-06, "loss": 0.89271545, "num_input_tokens_seen": 97626870, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.18200684, "step": 4512, "time_per_iteration": 2.6309216022491455 }, { "auxiliary_loss_clip": 0.06508297, "auxiliary_loss_mlp": 0.012749, "balance_loss_clip": 0.06300617, "balance_loss_mlp": 0.01257364, "epoch": 0.2713362392905456, "flos": 23155663311360.0, "grad_norm": 2.3798514848291803, "language_loss": 0.80856568, "learning_rate": 3.419504890542124e-06, "loss": 0.88639766, "num_input_tokens_seen": 97646595, "router_z_loss_clip": 2.07714844, "router_z_loss_mlp": 0.17541504, "step": 4513, "time_per_iteration": 2.607597827911377 }, { "auxiliary_loss_clip": 0.06505048, "auxiliary_loss_mlp": 0.01275504, "balance_loss_clip": 0.0629379, "balance_loss_mlp": 0.01257957, "epoch": 0.2713963625432136, "flos": 18371409937920.0, "grad_norm": 1.8235899836378875, "language_loss": 0.88895345, "learning_rate": 3.4192305068885026e-06, "loss": 0.96675897, "num_input_tokens_seen": 97665485, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.17553711, "step": 4514, "time_per_iteration": 2.6248674392700195 }, { "auxiliary_loss_clip": 0.06506583, "auxiliary_loss_mlp": 0.01280616, "balance_loss_clip": 0.06296003, "balance_loss_mlp": 0.01263009, "epoch": 0.27145648579588155, "flos": 22498275703680.0, "grad_norm": 2.375828083460037, "language_loss": 0.92377949, "learning_rate": 3.418956069417517e-06, "loss": 1.00165141, "num_input_tokens_seen": 97683800, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.17590332, "step": 4515, "time_per_iteration": 4.032464504241943 }, { "auxiliary_loss_clip": 0.0651211, "auxiliary_loss_mlp": 0.01285925, "balance_loss_clip": 0.06296187, "balance_loss_mlp": 0.01264825, "epoch": 0.2715166090485495, "flos": 19244265120000.0, "grad_norm": 2.2802674966356316, "language_loss": 0.74525636, "learning_rate": 3.4186815781395756e-06, "loss": 0.8232367, "num_input_tokens_seen": 97700505, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.2109375, "step": 4516, "time_per_iteration": 2.568434715270996 }, { "auxiliary_loss_clip": 0.06509124, "auxiliary_loss_mlp": 0.0127548, "balance_loss_clip": 0.06297915, "balance_loss_mlp": 0.012568, "epoch": 0.2715767323012175, "flos": 17714902798080.0, "grad_norm": 2.4522245302836554, "language_loss": 0.76586056, "learning_rate": 3.4184070330650866e-06, "loss": 0.84370673, "num_input_tokens_seen": 97717410, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.18701172, "step": 4517, "time_per_iteration": 2.5968570709228516 }, { "auxiliary_loss_clip": 0.06506097, "auxiliary_loss_mlp": 0.01276998, "balance_loss_clip": 0.06294665, "balance_loss_mlp": 0.01258628, "epoch": 0.27163685555388545, "flos": 22389430849920.0, "grad_norm": 2.9715161661649896, "language_loss": 0.78291792, "learning_rate": 3.4181324342044607e-06, "loss": 0.86074889, "num_input_tokens_seen": 97734545, "router_z_loss_clip": 2.11621094, "router_z_loss_mlp": 0.18371582, "step": 4518, "time_per_iteration": 2.70438551902771 }, { "auxiliary_loss_clip": 0.0651046, "auxiliary_loss_mlp": 0.01279624, "balance_loss_clip": 0.06298556, "balance_loss_mlp": 0.01262434, "epoch": 0.2716969788065534, "flos": 22353358867200.0, "grad_norm": 1.6392774551502836, "language_loss": 0.683999, "learning_rate": 3.41785778156811e-06, "loss": 0.76189983, "num_input_tokens_seen": 97754000, "router_z_loss_clip": 2.11816406, "router_z_loss_mlp": 0.171875, "step": 4519, "time_per_iteration": 2.5752153396606445 }, { "auxiliary_loss_clip": 0.06505829, "auxiliary_loss_mlp": 0.01274898, "balance_loss_clip": 0.06296322, "balance_loss_mlp": 0.0125704, "epoch": 0.2717571020592214, "flos": 25235497031040.0, "grad_norm": 3.6613661266230237, "language_loss": 0.76574641, "learning_rate": 3.417583075166451e-06, "loss": 0.84355366, "num_input_tokens_seen": 97772080, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.17871094, "step": 4520, "time_per_iteration": 3.991757392883301 }, { "auxiliary_loss_clip": 0.06516688, "auxiliary_loss_mlp": 0.01282369, "balance_loss_clip": 0.06302319, "balance_loss_mlp": 0.01263915, "epoch": 0.2718172253118894, "flos": 20195343688320.0, "grad_norm": 2.229967848574381, "language_loss": 0.76228839, "learning_rate": 3.4173083150099e-06, "loss": 0.84027898, "num_input_tokens_seen": 97789370, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.18457031, "step": 4521, "time_per_iteration": 2.5808112621307373 }, { "auxiliary_loss_clip": 0.06524368, "auxiliary_loss_mlp": 0.01285124, "balance_loss_clip": 0.06306408, "balance_loss_mlp": 0.01264584, "epoch": 0.27187734856455736, "flos": 14324318858880.0, "grad_norm": 2.988006441997176, "language_loss": 0.75594771, "learning_rate": 3.417033501108875e-06, "loss": 0.83404261, "num_input_tokens_seen": 97807385, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.20532227, "step": 4522, "time_per_iteration": 2.576029062271118 }, { "auxiliary_loss_clip": 0.06522918, "auxiliary_loss_mlp": 0.01282872, "balance_loss_clip": 0.06308439, "balance_loss_mlp": 0.01264073, "epoch": 0.27193747181722533, "flos": 21114375269760.0, "grad_norm": 1.9300197649275483, "language_loss": 0.73229897, "learning_rate": 3.416758633473798e-06, "loss": 0.8103568, "num_input_tokens_seen": 97827930, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.18798828, "step": 4523, "time_per_iteration": 2.59808349609375 }, { "auxiliary_loss_clip": 0.06517033, "auxiliary_loss_mlp": 0.01279941, "balance_loss_clip": 0.06307864, "balance_loss_mlp": 0.01261905, "epoch": 0.2719975950698933, "flos": 19688910410880.0, "grad_norm": 1.853151186047877, "language_loss": 0.74585485, "learning_rate": 3.4164837121150915e-06, "loss": 0.82382458, "num_input_tokens_seen": 97847440, "router_z_loss_clip": 2.09082031, "router_z_loss_mlp": 0.18029785, "step": 4524, "time_per_iteration": 2.5861356258392334 }, { "auxiliary_loss_clip": 0.06530545, "auxiliary_loss_mlp": 0.01283861, "balance_loss_clip": 0.06314231, "balance_loss_mlp": 0.01265634, "epoch": 0.27205771832256126, "flos": 24761488083840.0, "grad_norm": 1.69000303071242, "language_loss": 0.76657307, "learning_rate": 3.4162087370431803e-06, "loss": 0.84471714, "num_input_tokens_seen": 97867620, "router_z_loss_clip": 2.16308594, "router_z_loss_mlp": 0.18237305, "step": 4525, "time_per_iteration": 2.6226513385772705 }, { "auxiliary_loss_clip": 0.06518853, "auxiliary_loss_mlp": 0.01278391, "balance_loss_clip": 0.06307897, "balance_loss_mlp": 0.01260641, "epoch": 0.2721178415752292, "flos": 21760903774080.0, "grad_norm": 2.20310581465408, "language_loss": 0.819139, "learning_rate": 3.4159337082684926e-06, "loss": 0.89711142, "num_input_tokens_seen": 97884345, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.1776123, "step": 4526, "time_per_iteration": 3.9731764793395996 }, { "auxiliary_loss_clip": 0.06537143, "auxiliary_loss_mlp": 0.01288263, "balance_loss_clip": 0.06314866, "balance_loss_mlp": 0.01268665, "epoch": 0.2721779648278972, "flos": 12681667416960.0, "grad_norm": 2.6404073993912056, "language_loss": 0.76934505, "learning_rate": 3.4156586258014566e-06, "loss": 0.84759915, "num_input_tokens_seen": 97901500, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.19604492, "step": 4527, "time_per_iteration": 2.5683672428131104 }, { "auxiliary_loss_clip": 0.06527267, "auxiliary_loss_mlp": 0.01290967, "balance_loss_clip": 0.06312874, "balance_loss_mlp": 0.01271989, "epoch": 0.27223808808056515, "flos": 16258774544640.0, "grad_norm": 1.9714364528686183, "language_loss": 0.81693804, "learning_rate": 3.415383489652503e-06, "loss": 0.89512032, "num_input_tokens_seen": 97917800, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.18981934, "step": 4528, "time_per_iteration": 4.011265993118286 }, { "auxiliary_loss_clip": 0.06521371, "auxiliary_loss_mlp": 0.01278592, "balance_loss_clip": 0.06312534, "balance_loss_mlp": 0.01260293, "epoch": 0.2722982113332331, "flos": 27753225788160.0, "grad_norm": 1.821381217458859, "language_loss": 0.78058034, "learning_rate": 3.4151082998320666e-06, "loss": 0.85857999, "num_input_tokens_seen": 97937225, "router_z_loss_clip": 2.08496094, "router_z_loss_mlp": 0.18286133, "step": 4529, "time_per_iteration": 2.6202144622802734 }, { "auxiliary_loss_clip": 0.06520341, "auxiliary_loss_mlp": 0.01276383, "balance_loss_clip": 0.06306158, "balance_loss_mlp": 0.01258489, "epoch": 0.2723583345859011, "flos": 21732756001920.0, "grad_norm": 5.337469986190473, "language_loss": 0.83132273, "learning_rate": 3.4148330563505805e-06, "loss": 0.90928996, "num_input_tokens_seen": 97956845, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.17895508, "step": 4530, "time_per_iteration": 2.5989038944244385 }, { "auxiliary_loss_clip": 0.06521025, "auxiliary_loss_mlp": 0.01285249, "balance_loss_clip": 0.06308895, "balance_loss_mlp": 0.01266486, "epoch": 0.27241845783856905, "flos": 17352925159680.0, "grad_norm": 13.443101347380534, "language_loss": 0.91621304, "learning_rate": 3.4145577592184838e-06, "loss": 0.99427581, "num_input_tokens_seen": 97972465, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.18762207, "step": 4531, "time_per_iteration": 2.5425496101379395 }, { "auxiliary_loss_clip": 0.0652706, "auxiliary_loss_mlp": 0.01282136, "balance_loss_clip": 0.06309405, "balance_loss_mlp": 0.01263993, "epoch": 0.272478581091237, "flos": 24761278448640.0, "grad_norm": 1.8205125549905494, "language_loss": 0.76757848, "learning_rate": 3.4142824084462155e-06, "loss": 0.84567046, "num_input_tokens_seen": 97990770, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.18139648, "step": 4532, "time_per_iteration": 2.6167218685150146 }, { "auxiliary_loss_clip": 0.06515141, "auxiliary_loss_mlp": 0.01271641, "balance_loss_clip": 0.06307311, "balance_loss_mlp": 0.01254832, "epoch": 0.272538704343905, "flos": 17895723909120.0, "grad_norm": 2.3530988115399207, "language_loss": 0.88801718, "learning_rate": 3.4140070040442162e-06, "loss": 0.96588504, "num_input_tokens_seen": 98005775, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.16796875, "step": 4533, "time_per_iteration": 2.5310840606689453 }, { "auxiliary_loss_clip": 0.06517398, "auxiliary_loss_mlp": 0.01275522, "balance_loss_clip": 0.06312063, "balance_loss_mlp": 0.01259465, "epoch": 0.272598827596573, "flos": 22939021779840.0, "grad_norm": 3.4535956829604695, "language_loss": 0.72028357, "learning_rate": 3.413731546022929e-06, "loss": 0.79821277, "num_input_tokens_seen": 98025750, "router_z_loss_clip": 2.05175781, "router_z_loss_mlp": 0.16052246, "step": 4534, "time_per_iteration": 2.615762948989868 }, { "auxiliary_loss_clip": 0.06529761, "auxiliary_loss_mlp": 0.0127856, "balance_loss_clip": 0.0631474, "balance_loss_mlp": 0.01259069, "epoch": 0.27265895084924097, "flos": 24244447265280.0, "grad_norm": 1.6354910565753613, "language_loss": 0.91664302, "learning_rate": 3.4134560343928005e-06, "loss": 0.99472612, "num_input_tokens_seen": 98044955, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.19482422, "step": 4535, "time_per_iteration": 2.6038007736206055 }, { "auxiliary_loss_clip": 0.06525269, "auxiliary_loss_mlp": 0.01273003, "balance_loss_clip": 0.06310645, "balance_loss_mlp": 0.01254741, "epoch": 0.27271907410190893, "flos": 27019962708480.0, "grad_norm": 1.7451221888309127, "language_loss": 0.73578542, "learning_rate": 3.4131804691642778e-06, "loss": 0.81376815, "num_input_tokens_seen": 98065860, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.18261719, "step": 4536, "time_per_iteration": 2.624701976776123 }, { "auxiliary_loss_clip": 0.06517823, "auxiliary_loss_mlp": 0.012763, "balance_loss_clip": 0.06306091, "balance_loss_mlp": 0.01258502, "epoch": 0.2727791973545769, "flos": 34460027568000.0, "grad_norm": 2.1084634244919025, "language_loss": 0.7247172, "learning_rate": 3.41290485034781e-06, "loss": 0.80265844, "num_input_tokens_seen": 98085450, "router_z_loss_clip": 2.11621094, "router_z_loss_mlp": 0.17797852, "step": 4537, "time_per_iteration": 2.7264578342437744 }, { "auxiliary_loss_clip": 0.06514658, "auxiliary_loss_mlp": 0.01277448, "balance_loss_clip": 0.06305464, "balance_loss_mlp": 0.01259674, "epoch": 0.27283932060724486, "flos": 15045842367360.0, "grad_norm": 2.4792474571397833, "language_loss": 0.78197026, "learning_rate": 3.4126291779538485e-06, "loss": 0.85989136, "num_input_tokens_seen": 98099115, "router_z_loss_clip": 2.09082031, "router_z_loss_mlp": 0.17785645, "step": 4538, "time_per_iteration": 2.539989709854126 }, { "auxiliary_loss_clip": 0.06514454, "auxiliary_loss_mlp": 0.01281874, "balance_loss_clip": 0.06303681, "balance_loss_mlp": 0.01264278, "epoch": 0.2728994438599128, "flos": 21658767246720.0, "grad_norm": 1.5487595683386135, "language_loss": 0.90244806, "learning_rate": 3.412353451992847e-06, "loss": 0.98041129, "num_input_tokens_seen": 98118415, "router_z_loss_clip": 2.10839844, "router_z_loss_mlp": 0.17590332, "step": 4539, "time_per_iteration": 2.59832763671875 }, { "auxiliary_loss_clip": 0.06510819, "auxiliary_loss_mlp": 0.01272867, "balance_loss_clip": 0.06301241, "balance_loss_mlp": 0.01254687, "epoch": 0.2729595671125808, "flos": 17493313875840.0, "grad_norm": 2.0326162106614873, "language_loss": 0.8836177, "learning_rate": 3.4120776724752607e-06, "loss": 0.96145451, "num_input_tokens_seen": 98136300, "router_z_loss_clip": 2.09277344, "router_z_loss_mlp": 0.1817627, "step": 4540, "time_per_iteration": 2.584972381591797 }, { "auxiliary_loss_clip": 0.06512971, "auxiliary_loss_mlp": 0.01276668, "balance_loss_clip": 0.06300525, "balance_loss_mlp": 0.01259561, "epoch": 0.27301969036524876, "flos": 19324249441920.0, "grad_norm": 2.044828735172673, "language_loss": 0.8257814, "learning_rate": 3.4118018394115476e-06, "loss": 0.90367776, "num_input_tokens_seen": 98154580, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.17114258, "step": 4541, "time_per_iteration": 2.583336353302002 }, { "auxiliary_loss_clip": 0.06510692, "auxiliary_loss_mlp": 0.01277545, "balance_loss_clip": 0.06299204, "balance_loss_mlp": 0.01259473, "epoch": 0.2730798136179167, "flos": 21071427252480.0, "grad_norm": 2.20879190659069, "language_loss": 0.80004489, "learning_rate": 3.4115259528121678e-06, "loss": 0.8779273, "num_input_tokens_seen": 98173115, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.1809082, "step": 4542, "time_per_iteration": 2.5899977684020996 }, { "auxiliary_loss_clip": 0.06514112, "auxiliary_loss_mlp": 0.01278927, "balance_loss_clip": 0.06303017, "balance_loss_mlp": 0.01261499, "epoch": 0.2731399368705847, "flos": 19177739377920.0, "grad_norm": 2.04956133188159, "language_loss": 0.90062732, "learning_rate": 3.411250012687582e-06, "loss": 0.97855777, "num_input_tokens_seen": 98190260, "router_z_loss_clip": 2.11035156, "router_z_loss_mlp": 0.17443848, "step": 4543, "time_per_iteration": 2.5639638900756836 }, { "auxiliary_loss_clip": 0.06519053, "auxiliary_loss_mlp": 0.01276484, "balance_loss_clip": 0.06303309, "balance_loss_mlp": 0.0125778, "epoch": 0.27320006012325265, "flos": 18294989414400.0, "grad_norm": 1.8369802599742937, "language_loss": 0.63474369, "learning_rate": 3.410974019048255e-06, "loss": 0.71269906, "num_input_tokens_seen": 98207115, "router_z_loss_clip": 2.15332031, "router_z_loss_mlp": 0.18688965, "step": 4544, "time_per_iteration": 2.555145740509033 }, { "auxiliary_loss_clip": 0.06510513, "auxiliary_loss_mlp": 0.01280036, "balance_loss_clip": 0.06300862, "balance_loss_mlp": 0.01261248, "epoch": 0.2732601833759206, "flos": 34869607125120.0, "grad_norm": 1.6734082100854204, "language_loss": 0.69816494, "learning_rate": 3.410697971904651e-06, "loss": 0.77607042, "num_input_tokens_seen": 98230610, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.18786621, "step": 4545, "time_per_iteration": 2.697434425354004 }, { "auxiliary_loss_clip": 0.06393808, "auxiliary_loss_mlp": 0.01281249, "balance_loss_clip": 0.0628937, "balance_loss_mlp": 0.01276108, "epoch": 0.2733203066285886, "flos": 53929514534400.0, "grad_norm": 0.7080381104847169, "language_loss": 0.61668521, "learning_rate": 3.4104218712672383e-06, "loss": 0.69343579, "num_input_tokens_seen": 98293585, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.05142212, "step": 4546, "time_per_iteration": 3.217345714569092 }, { "auxiliary_loss_clip": 0.06511781, "auxiliary_loss_mlp": 0.01279235, "balance_loss_clip": 0.06301987, "balance_loss_mlp": 0.0126183, "epoch": 0.2733804298812566, "flos": 20665411493760.0, "grad_norm": 1.90162151170672, "language_loss": 0.65508753, "learning_rate": 3.410145717146488e-06, "loss": 0.73299778, "num_input_tokens_seen": 98311680, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.17419434, "step": 4547, "time_per_iteration": 2.58255672454834 }, { "auxiliary_loss_clip": 0.06509086, "auxiliary_loss_mlp": 0.01275208, "balance_loss_clip": 0.06305562, "balance_loss_mlp": 0.01258888, "epoch": 0.27344055313392457, "flos": 25891333338240.0, "grad_norm": 2.262661230539188, "language_loss": 0.78374869, "learning_rate": 3.4098695095528694e-06, "loss": 0.86159164, "num_input_tokens_seen": 98330770, "router_z_loss_clip": 2.03222656, "router_z_loss_mlp": 0.16308594, "step": 4548, "time_per_iteration": 2.629760980606079 }, { "auxiliary_loss_clip": 0.06514558, "auxiliary_loss_mlp": 0.01279252, "balance_loss_clip": 0.06306753, "balance_loss_mlp": 0.01262492, "epoch": 0.27350067638659253, "flos": 22936380376320.0, "grad_norm": 3.4412276580159658, "language_loss": 0.83161795, "learning_rate": 3.4095932484968585e-06, "loss": 0.90955609, "num_input_tokens_seen": 98349860, "router_z_loss_clip": 2.07910156, "router_z_loss_mlp": 0.16748047, "step": 4549, "time_per_iteration": 2.5913140773773193 }, { "auxiliary_loss_clip": 0.06511027, "auxiliary_loss_mlp": 0.01277933, "balance_loss_clip": 0.06297256, "balance_loss_mlp": 0.01259361, "epoch": 0.2735607996392605, "flos": 16579313539200.0, "grad_norm": 2.417839435071035, "language_loss": 0.71448696, "learning_rate": 3.4093169339889305e-06, "loss": 0.79237664, "num_input_tokens_seen": 98367040, "router_z_loss_clip": 2.13769531, "router_z_loss_mlp": 0.18554688, "step": 4550, "time_per_iteration": 2.58847975730896 }, { "auxiliary_loss_clip": 0.06514075, "auxiliary_loss_mlp": 0.01272528, "balance_loss_clip": 0.06307541, "balance_loss_mlp": 0.01254563, "epoch": 0.27362092289192846, "flos": 19651245200640.0, "grad_norm": 2.019235272077413, "language_loss": 0.78637278, "learning_rate": 3.409040566039563e-06, "loss": 0.86423886, "num_input_tokens_seen": 98384010, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.17956543, "step": 4551, "time_per_iteration": 2.559567451477051 }, { "auxiliary_loss_clip": 0.06515038, "auxiliary_loss_mlp": 0.01276618, "balance_loss_clip": 0.06304935, "balance_loss_mlp": 0.01259762, "epoch": 0.27368104614459643, "flos": 17644855184640.0, "grad_norm": 2.3915156516853777, "language_loss": 0.71569979, "learning_rate": 3.4087641446592362e-06, "loss": 0.79361641, "num_input_tokens_seen": 98399625, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.1685791, "step": 4552, "time_per_iteration": 2.5559329986572266 }, { "auxiliary_loss_clip": 0.06512813, "auxiliary_loss_mlp": 0.01276388, "balance_loss_clip": 0.06302325, "balance_loss_mlp": 0.01259139, "epoch": 0.2737411693972644, "flos": 21586455573120.0, "grad_norm": 2.1350342803717717, "language_loss": 0.71940142, "learning_rate": 3.408487669858431e-06, "loss": 0.79729342, "num_input_tokens_seen": 98417310, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.17248535, "step": 4553, "time_per_iteration": 2.575284004211426 }, { "auxiliary_loss_clip": 0.06511034, "auxiliary_loss_mlp": 0.01274036, "balance_loss_clip": 0.063016, "balance_loss_mlp": 0.01257073, "epoch": 0.27380129264993236, "flos": 25491145438080.0, "grad_norm": 2.9049230196304467, "language_loss": 0.59853029, "learning_rate": 3.4082111416476337e-06, "loss": 0.67638099, "num_input_tokens_seen": 98438670, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.16967773, "step": 4554, "time_per_iteration": 4.0806968212127686 }, { "auxiliary_loss_clip": 0.06522334, "auxiliary_loss_mlp": 0.01278953, "balance_loss_clip": 0.06302635, "balance_loss_mlp": 0.0126075, "epoch": 0.2738614159026003, "flos": 18667155323520.0, "grad_norm": 1.8864078653860143, "language_loss": 0.74661833, "learning_rate": 3.4079345600373275e-06, "loss": 0.82463121, "num_input_tokens_seen": 98456060, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.18188477, "step": 4555, "time_per_iteration": 2.5686607360839844 }, { "auxiliary_loss_clip": 0.06519663, "auxiliary_loss_mlp": 0.012774, "balance_loss_clip": 0.0630779, "balance_loss_mlp": 0.01259745, "epoch": 0.2739215391552683, "flos": 23483874954240.0, "grad_norm": 3.4642243143422053, "language_loss": 0.7749666, "learning_rate": 3.407657925038002e-06, "loss": 0.85293716, "num_input_tokens_seen": 98473765, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.1763916, "step": 4556, "time_per_iteration": 2.611172914505005 }, { "auxiliary_loss_clip": 0.06533008, "auxiliary_loss_mlp": 0.01277153, "balance_loss_clip": 0.06308687, "balance_loss_mlp": 0.01257996, "epoch": 0.27398166240793626, "flos": 17134313057280.0, "grad_norm": 1.9440890751808777, "language_loss": 0.83273828, "learning_rate": 3.4073812366601473e-06, "loss": 0.91083992, "num_input_tokens_seen": 98490590, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.19165039, "step": 4557, "time_per_iteration": 2.5683279037475586 }, { "auxiliary_loss_clip": 0.06520036, "auxiliary_loss_mlp": 0.01273722, "balance_loss_clip": 0.06309297, "balance_loss_mlp": 0.01256675, "epoch": 0.2740417856606042, "flos": 23411563280640.0, "grad_norm": 2.027283606524613, "language_loss": 0.73415971, "learning_rate": 3.4071044949142547e-06, "loss": 0.81209731, "num_input_tokens_seen": 98510590, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.17041016, "step": 4558, "time_per_iteration": 2.5971529483795166 }, { "auxiliary_loss_clip": 0.0651907, "auxiliary_loss_mlp": 0.01279882, "balance_loss_clip": 0.06308933, "balance_loss_mlp": 0.01262298, "epoch": 0.2741019089132722, "flos": 12784307068800.0, "grad_norm": 2.181473796275626, "language_loss": 0.69013816, "learning_rate": 3.406827699810819e-06, "loss": 0.76812768, "num_input_tokens_seen": 98527875, "router_z_loss_clip": 2.10058594, "router_z_loss_mlp": 0.17590332, "step": 4559, "time_per_iteration": 2.5646345615386963 }, { "auxiliary_loss_clip": 0.06523664, "auxiliary_loss_mlp": 0.01274807, "balance_loss_clip": 0.06316266, "balance_loss_mlp": 0.01258725, "epoch": 0.27416203216594015, "flos": 20637850700160.0, "grad_norm": 1.7505218499590696, "language_loss": 0.72173309, "learning_rate": 3.4065508513603353e-06, "loss": 0.79971778, "num_input_tokens_seen": 98547575, "router_z_loss_clip": 2.07324219, "router_z_loss_mlp": 0.16088867, "step": 4560, "time_per_iteration": 3.99124813079834 }, { "auxiliary_loss_clip": 0.06520009, "auxiliary_loss_mlp": 0.01274071, "balance_loss_clip": 0.06310023, "balance_loss_mlp": 0.0125731, "epoch": 0.27422215541860817, "flos": 26548762872960.0, "grad_norm": 1.7786188361906017, "language_loss": 0.81636178, "learning_rate": 3.406273949573303e-06, "loss": 0.89430261, "num_input_tokens_seen": 98566290, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.16772461, "step": 4561, "time_per_iteration": 2.700216054916382 }, { "auxiliary_loss_clip": 0.06523953, "auxiliary_loss_mlp": 0.0127548, "balance_loss_clip": 0.06312925, "balance_loss_mlp": 0.01258684, "epoch": 0.27428227867127614, "flos": 23337868014720.0, "grad_norm": 2.4915399089598287, "language_loss": 0.75629956, "learning_rate": 3.4059969944602214e-06, "loss": 0.8342939, "num_input_tokens_seen": 98586255, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.16796875, "step": 4562, "time_per_iteration": 2.5914266109466553 }, { "auxiliary_loss_clip": 0.06522465, "auxiliary_loss_mlp": 0.0127462, "balance_loss_clip": 0.06311438, "balance_loss_mlp": 0.01257859, "epoch": 0.2743424019239441, "flos": 23041074453120.0, "grad_norm": 7.161689308973713, "language_loss": 0.75411749, "learning_rate": 3.4057199860315928e-06, "loss": 0.83208835, "num_input_tokens_seen": 98606030, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.16760254, "step": 4563, "time_per_iteration": 2.5993173122406006 }, { "auxiliary_loss_clip": 0.06529767, "auxiliary_loss_mlp": 0.01276113, "balance_loss_clip": 0.06310847, "balance_loss_mlp": 0.0125717, "epoch": 0.27440252517661207, "flos": 21987565868160.0, "grad_norm": 1.8033943110381407, "language_loss": 0.63603079, "learning_rate": 3.4054429242979213e-06, "loss": 0.71408963, "num_input_tokens_seen": 98625225, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.18945312, "step": 4564, "time_per_iteration": 2.5928938388824463 }, { "auxiliary_loss_clip": 0.06517609, "auxiliary_loss_mlp": 0.01277463, "balance_loss_clip": 0.06306258, "balance_loss_mlp": 0.01260308, "epoch": 0.27446264842928003, "flos": 40196952737280.0, "grad_norm": 2.975144540892223, "language_loss": 0.78704107, "learning_rate": 3.4051658092697135e-06, "loss": 0.86499178, "num_input_tokens_seen": 98649470, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.17150879, "step": 4565, "time_per_iteration": 2.884385824203491 }, { "auxiliary_loss_clip": 0.06512216, "auxiliary_loss_mlp": 0.01277463, "balance_loss_clip": 0.06302362, "balance_loss_mlp": 0.01260535, "epoch": 0.274522771681948, "flos": 13484684620800.0, "grad_norm": 1.93980822502123, "language_loss": 0.69552982, "learning_rate": 3.404888640957477e-06, "loss": 0.77342665, "num_input_tokens_seen": 98666915, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.16931152, "step": 4566, "time_per_iteration": 3.907263994216919 }, { "auxiliary_loss_clip": 0.06509914, "auxiliary_loss_mlp": 0.01281326, "balance_loss_clip": 0.06302871, "balance_loss_mlp": 0.01264589, "epoch": 0.27458289493461596, "flos": 28629812476800.0, "grad_norm": 1.9659877155273102, "language_loss": 0.61730754, "learning_rate": 3.404611419371723e-06, "loss": 0.69521993, "num_input_tokens_seen": 98688240, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.1673584, "step": 4567, "time_per_iteration": 4.104169130325317 }, { "auxiliary_loss_clip": 0.06514989, "auxiliary_loss_mlp": 0.01281229, "balance_loss_clip": 0.0630141, "balance_loss_mlp": 0.01262859, "epoch": 0.2746430181872839, "flos": 20125883053440.0, "grad_norm": 2.157052676741448, "language_loss": 0.83202392, "learning_rate": 3.4043341445229627e-06, "loss": 0.90998608, "num_input_tokens_seen": 98708245, "router_z_loss_clip": 2.13574219, "router_z_loss_mlp": 0.18371582, "step": 4568, "time_per_iteration": 2.6321322917938232 }, { "auxiliary_loss_clip": 0.0652065, "auxiliary_loss_mlp": 0.0127425, "balance_loss_clip": 0.06304534, "balance_loss_mlp": 0.01256739, "epoch": 0.2747031414399519, "flos": 20199662173440.0, "grad_norm": 2.8573935283421372, "language_loss": 0.69000924, "learning_rate": 3.4040568164217117e-06, "loss": 0.76795828, "num_input_tokens_seen": 98724575, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.1751709, "step": 4569, "time_per_iteration": 2.5823428630828857 }, { "auxiliary_loss_clip": 0.06514172, "auxiliary_loss_mlp": 0.01275373, "balance_loss_clip": 0.0629967, "balance_loss_mlp": 0.01257313, "epoch": 0.27476326469261986, "flos": 13521385509120.0, "grad_norm": 2.0606805640604815, "language_loss": 0.71340001, "learning_rate": 3.4037794350784848e-06, "loss": 0.79129541, "num_input_tokens_seen": 98740700, "router_z_loss_clip": 2.14355469, "router_z_loss_mlp": 0.18054199, "step": 4570, "time_per_iteration": 2.5564754009246826 }, { "auxiliary_loss_clip": 0.06428431, "auxiliary_loss_mlp": 0.01263127, "balance_loss_clip": 0.06323555, "balance_loss_mlp": 0.01258234, "epoch": 0.2748233879452878, "flos": 65955486153600.0, "grad_norm": 0.7019438972606763, "language_loss": 0.55807728, "learning_rate": 3.4035020005038014e-06, "loss": 0.63499284, "num_input_tokens_seen": 98803030, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.04888916, "step": 4571, "time_per_iteration": 3.307060956954956 }, { "auxiliary_loss_clip": 0.06514642, "auxiliary_loss_mlp": 0.01277618, "balance_loss_clip": 0.06299338, "balance_loss_mlp": 0.01260619, "epoch": 0.2748835111979558, "flos": 17389961464320.0, "grad_norm": 2.2606673644648274, "language_loss": 0.77942276, "learning_rate": 3.4032245127081812e-06, "loss": 0.8573454, "num_input_tokens_seen": 98820505, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.17004395, "step": 4572, "time_per_iteration": 2.5648584365844727 }, { "auxiliary_loss_clip": 0.06508631, "auxiliary_loss_mlp": 0.01284744, "balance_loss_clip": 0.06304085, "balance_loss_mlp": 0.01269079, "epoch": 0.27494363445062375, "flos": 23594480743680.0, "grad_norm": 1.6612797183304893, "language_loss": 0.82012194, "learning_rate": 3.402946971702147e-06, "loss": 0.89805567, "num_input_tokens_seen": 98842150, "router_z_loss_clip": 2.04589844, "router_z_loss_mlp": 0.15661621, "step": 4573, "time_per_iteration": 2.6136314868927 }, { "auxiliary_loss_clip": 0.0650297, "auxiliary_loss_mlp": 0.01281243, "balance_loss_clip": 0.06295168, "balance_loss_mlp": 0.01263409, "epoch": 0.2750037577032918, "flos": 17170175404800.0, "grad_norm": 1.5578674091669127, "language_loss": 0.79630274, "learning_rate": 3.402669377496223e-06, "loss": 0.87414491, "num_input_tokens_seen": 98861050, "router_z_loss_clip": 2.07519531, "router_z_loss_mlp": 0.1784668, "step": 4574, "time_per_iteration": 2.5688042640686035 }, { "auxiliary_loss_clip": 0.06516073, "auxiliary_loss_mlp": 0.01287616, "balance_loss_clip": 0.06302853, "balance_loss_mlp": 0.01270736, "epoch": 0.27506388095595974, "flos": 24497663904000.0, "grad_norm": 1.8761057194593547, "language_loss": 0.74974799, "learning_rate": 3.402391730100936e-06, "loss": 0.8277849, "num_input_tokens_seen": 98879695, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.1685791, "step": 4575, "time_per_iteration": 2.609997510910034 }, { "auxiliary_loss_clip": 0.06505647, "auxiliary_loss_mlp": 0.0128213, "balance_loss_clip": 0.06296587, "balance_loss_mlp": 0.01265118, "epoch": 0.2751240042086277, "flos": 38774003500800.0, "grad_norm": 2.0218103883196763, "language_loss": 0.71858162, "learning_rate": 3.402114029526814e-06, "loss": 0.79645938, "num_input_tokens_seen": 98902035, "router_z_loss_clip": 2.09082031, "router_z_loss_mlp": 0.17004395, "step": 4576, "time_per_iteration": 2.728606700897217 }, { "auxiliary_loss_clip": 0.06505463, "auxiliary_loss_mlp": 0.0128441, "balance_loss_clip": 0.06296244, "balance_loss_mlp": 0.01266397, "epoch": 0.27518412746129567, "flos": 26914388163840.0, "grad_norm": 2.0530363933853435, "language_loss": 0.73159456, "learning_rate": 3.4018362757843866e-06, "loss": 0.8094933, "num_input_tokens_seen": 98921835, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.17993164, "step": 4577, "time_per_iteration": 2.6328952312469482 }, { "auxiliary_loss_clip": 0.06512526, "auxiliary_loss_mlp": 0.01273969, "balance_loss_clip": 0.06298821, "balance_loss_mlp": 0.01256112, "epoch": 0.27524425071396363, "flos": 24907578877440.0, "grad_norm": 2.318062415024384, "language_loss": 0.7627815, "learning_rate": 3.401558468884188e-06, "loss": 0.84064645, "num_input_tokens_seen": 98939610, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.17858887, "step": 4578, "time_per_iteration": 2.601501941680908 }, { "auxiliary_loss_clip": 0.06510648, "auxiliary_loss_mlp": 0.01277212, "balance_loss_clip": 0.0629548, "balance_loss_mlp": 0.01257209, "epoch": 0.2753043739666316, "flos": 26295504307200.0, "grad_norm": 1.476979831826334, "language_loss": 0.6684863, "learning_rate": 3.4012806088367516e-06, "loss": 0.74636495, "num_input_tokens_seen": 98962250, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.20019531, "step": 4579, "time_per_iteration": 2.648435354232788 }, { "auxiliary_loss_clip": 0.06509373, "auxiliary_loss_mlp": 0.01278905, "balance_loss_clip": 0.06293934, "balance_loss_mlp": 0.0126044, "epoch": 0.27536449721929956, "flos": 24213616162560.0, "grad_norm": 1.9605044031788823, "language_loss": 0.80704761, "learning_rate": 3.4010026956526137e-06, "loss": 0.88493037, "num_input_tokens_seen": 98981845, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.18469238, "step": 4580, "time_per_iteration": 2.6441075801849365 }, { "auxiliary_loss_clip": 0.06508773, "auxiliary_loss_mlp": 0.0127699, "balance_loss_clip": 0.06297401, "balance_loss_mlp": 0.01257976, "epoch": 0.27542462047196753, "flos": 19543448522880.0, "grad_norm": 1.6231353143423541, "language_loss": 0.68295044, "learning_rate": 3.4007247293423137e-06, "loss": 0.76080805, "num_input_tokens_seen": 99001855, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.19030762, "step": 4581, "time_per_iteration": 2.610874891281128 }, { "auxiliary_loss_clip": 0.06513938, "auxiliary_loss_mlp": 0.01271674, "balance_loss_clip": 0.06296262, "balance_loss_mlp": 0.01254937, "epoch": 0.2754847437246355, "flos": 14324360785920.0, "grad_norm": 1.8230452272420128, "language_loss": 0.79105139, "learning_rate": 3.400446709916392e-06, "loss": 0.86890751, "num_input_tokens_seen": 99019880, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.16748047, "step": 4582, "time_per_iteration": 2.5677897930145264 }, { "auxiliary_loss_clip": 0.06497659, "auxiliary_loss_mlp": 0.01274048, "balance_loss_clip": 0.06291533, "balance_loss_mlp": 0.01257574, "epoch": 0.27554486697730346, "flos": 18843951438720.0, "grad_norm": 1.5987346313740576, "language_loss": 0.84623778, "learning_rate": 3.4001686373853895e-06, "loss": 0.92395484, "num_input_tokens_seen": 99037570, "router_z_loss_clip": 2.06445312, "router_z_loss_mlp": 0.16479492, "step": 4583, "time_per_iteration": 2.5615713596343994 }, { "auxiliary_loss_clip": 0.06510597, "auxiliary_loss_mlp": 0.01278939, "balance_loss_clip": 0.06294237, "balance_loss_mlp": 0.01261344, "epoch": 0.2756049902299714, "flos": 22388801944320.0, "grad_norm": 2.0453637374989944, "language_loss": 0.67311925, "learning_rate": 3.3998905117598528e-06, "loss": 0.75101459, "num_input_tokens_seen": 99056875, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.17590332, "step": 4584, "time_per_iteration": 2.6123697757720947 }, { "auxiliary_loss_clip": 0.06503516, "auxiliary_loss_mlp": 0.01273951, "balance_loss_clip": 0.0629449, "balance_loss_mlp": 0.01257214, "epoch": 0.2756651134826394, "flos": 19580107484160.0, "grad_norm": 1.7169377924869447, "language_loss": 0.77309501, "learning_rate": 3.399612333050327e-06, "loss": 0.85086966, "num_input_tokens_seen": 99074685, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.1673584, "step": 4585, "time_per_iteration": 2.5941507816314697 }, { "auxiliary_loss_clip": 0.06514189, "auxiliary_loss_mlp": 0.01278524, "balance_loss_clip": 0.06295449, "balance_loss_mlp": 0.01260476, "epoch": 0.27572523673530736, "flos": 23593306786560.0, "grad_norm": 1.6019510270651771, "language_loss": 0.72914505, "learning_rate": 3.399334101267362e-06, "loss": 0.80707216, "num_input_tokens_seen": 99095300, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.18066406, "step": 4586, "time_per_iteration": 2.603055477142334 }, { "auxiliary_loss_clip": 0.06503599, "auxiliary_loss_mlp": 0.01273722, "balance_loss_clip": 0.06293553, "balance_loss_mlp": 0.01256717, "epoch": 0.2757853599879754, "flos": 22826696981760.0, "grad_norm": 1.6383761398885077, "language_loss": 0.80616701, "learning_rate": 3.3990558164215073e-06, "loss": 0.88394022, "num_input_tokens_seen": 99115965, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.17010498, "step": 4587, "time_per_iteration": 2.60073184967041 }, { "auxiliary_loss_clip": 0.06504305, "auxiliary_loss_mlp": 0.01280046, "balance_loss_clip": 0.06293599, "balance_loss_mlp": 0.01262343, "epoch": 0.27584548324064334, "flos": 18557639637120.0, "grad_norm": 1.892992291759188, "language_loss": 0.83732867, "learning_rate": 3.398777478523316e-06, "loss": 0.91517222, "num_input_tokens_seen": 99134265, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.17700195, "step": 4588, "time_per_iteration": 2.573047161102295 }, { "auxiliary_loss_clip": 0.06497357, "auxiliary_loss_mlp": 0.01274172, "balance_loss_clip": 0.06290496, "balance_loss_mlp": 0.01257268, "epoch": 0.2759056064933113, "flos": 23776811228160.0, "grad_norm": 1.4276654240232591, "language_loss": 0.75725627, "learning_rate": 3.398499087583342e-06, "loss": 0.83497161, "num_input_tokens_seen": 99156185, "router_z_loss_clip": 2.06738281, "router_z_loss_mlp": 0.16894531, "step": 4589, "time_per_iteration": 2.6107466220855713 }, { "auxiliary_loss_clip": 0.06501539, "auxiliary_loss_mlp": 0.01276044, "balance_loss_clip": 0.06293991, "balance_loss_mlp": 0.01258329, "epoch": 0.27596572974597927, "flos": 24289114291200.0, "grad_norm": 1.7276540598415446, "language_loss": 0.88984346, "learning_rate": 3.398220643612143e-06, "loss": 0.9676193, "num_input_tokens_seen": 99176735, "router_z_loss_clip": 2.07324219, "router_z_loss_mlp": 0.17712402, "step": 4590, "time_per_iteration": 2.6329104900360107 }, { "auxiliary_loss_clip": 0.0650358, "auxiliary_loss_mlp": 0.01275506, "balance_loss_clip": 0.06294261, "balance_loss_mlp": 0.01257851, "epoch": 0.27602585299864724, "flos": 35049296206080.0, "grad_norm": 1.7021409635209825, "language_loss": 0.71521103, "learning_rate": 3.397942146620277e-06, "loss": 0.79300189, "num_input_tokens_seen": 99199765, "router_z_loss_clip": 2.09277344, "router_z_loss_mlp": 0.17651367, "step": 4591, "time_per_iteration": 2.70648455619812 }, { "auxiliary_loss_clip": 0.06502355, "auxiliary_loss_mlp": 0.01273205, "balance_loss_clip": 0.06291921, "balance_loss_mlp": 0.0125642, "epoch": 0.2760859762513152, "flos": 24315123784320.0, "grad_norm": 2.0439034900054156, "language_loss": 0.8031553, "learning_rate": 3.3976635966183046e-06, "loss": 0.88091087, "num_input_tokens_seen": 99218435, "router_z_loss_clip": 2.10449219, "router_z_loss_mlp": 0.16784668, "step": 4592, "time_per_iteration": 2.616198778152466 }, { "auxiliary_loss_clip": 0.06408265, "auxiliary_loss_mlp": 0.01272193, "balance_loss_clip": 0.06301124, "balance_loss_mlp": 0.01267252, "epoch": 0.27614609950398317, "flos": 71279435675520.0, "grad_norm": 0.6956741396347423, "language_loss": 0.61561251, "learning_rate": 3.3973849936167886e-06, "loss": 0.69241709, "num_input_tokens_seen": 99276200, "router_z_loss_clip": 1.0703125, "router_z_loss_mlp": 0.04937744, "step": 4593, "time_per_iteration": 3.1621174812316895 }, { "auxiliary_loss_clip": 0.06498219, "auxiliary_loss_mlp": 0.01273915, "balance_loss_clip": 0.06291053, "balance_loss_mlp": 0.01257297, "epoch": 0.27620622275665113, "flos": 29681811688320.0, "grad_norm": 1.820987325922073, "language_loss": 0.77721536, "learning_rate": 3.3971063376262937e-06, "loss": 0.85493672, "num_input_tokens_seen": 99297625, "router_z_loss_clip": 2.07324219, "router_z_loss_mlp": 0.1661377, "step": 4594, "time_per_iteration": 4.047141790390015 }, { "auxiliary_loss_clip": 0.06496491, "auxiliary_loss_mlp": 0.01273744, "balance_loss_clip": 0.06290194, "balance_loss_mlp": 0.01256602, "epoch": 0.2762663460093191, "flos": 15383571448320.0, "grad_norm": 1.8755635236190862, "language_loss": 0.91815543, "learning_rate": 3.3968276286573866e-06, "loss": 0.99585778, "num_input_tokens_seen": 99315790, "router_z_loss_clip": 2.06445312, "router_z_loss_mlp": 0.17150879, "step": 4595, "time_per_iteration": 2.5680086612701416 }, { "auxiliary_loss_clip": 0.06502903, "auxiliary_loss_mlp": 0.01286076, "balance_loss_clip": 0.06289926, "balance_loss_mlp": 0.01267814, "epoch": 0.27632646926198706, "flos": 20710330081920.0, "grad_norm": 2.233838558662907, "language_loss": 0.69817519, "learning_rate": 3.3965488667206353e-06, "loss": 0.77606499, "num_input_tokens_seen": 99334615, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.18261719, "step": 4596, "time_per_iteration": 2.5770621299743652 }, { "auxiliary_loss_clip": 0.06515349, "auxiliary_loss_mlp": 0.01275562, "balance_loss_clip": 0.0629746, "balance_loss_mlp": 0.01258753, "epoch": 0.276386592514655, "flos": 32820981851520.0, "grad_norm": 1.8504243228288846, "language_loss": 0.64304507, "learning_rate": 3.3962700518266113e-06, "loss": 0.72095418, "num_input_tokens_seen": 99356685, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.16809082, "step": 4597, "time_per_iteration": 2.6760404109954834 }, { "auxiliary_loss_clip": 0.06495169, "auxiliary_loss_mlp": 0.01279846, "balance_loss_clip": 0.06289199, "balance_loss_mlp": 0.01261572, "epoch": 0.276446715767323, "flos": 18557639637120.0, "grad_norm": 2.1264857207648005, "language_loss": 0.86731851, "learning_rate": 3.395991183985887e-06, "loss": 0.94506866, "num_input_tokens_seen": 99374810, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.18286133, "step": 4598, "time_per_iteration": 2.5766732692718506 }, { "auxiliary_loss_clip": 0.06504682, "auxiliary_loss_mlp": 0.01277061, "balance_loss_clip": 0.06291413, "balance_loss_mlp": 0.01258774, "epoch": 0.27650683901999096, "flos": 22826110003200.0, "grad_norm": 4.297499454939647, "language_loss": 0.80252743, "learning_rate": 3.395712263209037e-06, "loss": 0.88034487, "num_input_tokens_seen": 99391290, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.18286133, "step": 4599, "time_per_iteration": 4.089247941970825 }, { "auxiliary_loss_clip": 0.06515694, "auxiliary_loss_mlp": 0.01285442, "balance_loss_clip": 0.06297754, "balance_loss_mlp": 0.01267298, "epoch": 0.276566962272659, "flos": 21368011178880.0, "grad_norm": 1.6843105950011454, "language_loss": 0.79162216, "learning_rate": 3.395433289506639e-06, "loss": 0.86963356, "num_input_tokens_seen": 99409120, "router_z_loss_clip": 2.17871094, "router_z_loss_mlp": 0.18139648, "step": 4600, "time_per_iteration": 2.5841705799102783 }, { "auxiliary_loss_clip": 0.0651506, "auxiliary_loss_mlp": 0.01278859, "balance_loss_clip": 0.06300402, "balance_loss_mlp": 0.01261073, "epoch": 0.27662708552532694, "flos": 17716076755200.0, "grad_norm": 2.220778733514616, "language_loss": 0.74220252, "learning_rate": 3.3951542628892694e-06, "loss": 0.82014167, "num_input_tokens_seen": 99426180, "router_z_loss_clip": 2.14550781, "router_z_loss_mlp": 0.17785645, "step": 4601, "time_per_iteration": 2.5545873641967773 }, { "auxiliary_loss_clip": 0.06506585, "auxiliary_loss_mlp": 0.01288509, "balance_loss_clip": 0.06296088, "balance_loss_mlp": 0.01270604, "epoch": 0.2766872087779949, "flos": 21259292106240.0, "grad_norm": 1.4456943643266784, "language_loss": 0.80599755, "learning_rate": 3.3948751833675113e-06, "loss": 0.88394845, "num_input_tokens_seen": 99447720, "router_z_loss_clip": 2.10253906, "router_z_loss_mlp": 0.17907715, "step": 4602, "time_per_iteration": 2.627256393432617 }, { "auxiliary_loss_clip": 0.06517558, "auxiliary_loss_mlp": 0.01285031, "balance_loss_clip": 0.06297389, "balance_loss_mlp": 0.01266339, "epoch": 0.2767473320306629, "flos": 12936728845440.0, "grad_norm": 3.446534778337936, "language_loss": 0.77032459, "learning_rate": 3.3945960509519455e-06, "loss": 0.84835041, "num_input_tokens_seen": 99464720, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.18701172, "step": 4603, "time_per_iteration": 2.5650174617767334 }, { "auxiliary_loss_clip": 0.06502279, "auxiliary_loss_mlp": 0.01277567, "balance_loss_clip": 0.06297055, "balance_loss_mlp": 0.01261474, "epoch": 0.27680745528333084, "flos": 15018239646720.0, "grad_norm": 1.509213476185362, "language_loss": 0.81987059, "learning_rate": 3.3943168656531585e-06, "loss": 0.89766902, "num_input_tokens_seen": 99482310, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.16088867, "step": 4604, "time_per_iteration": 2.554169178009033 }, { "auxiliary_loss_clip": 0.06509112, "auxiliary_loss_mlp": 0.01281906, "balance_loss_clip": 0.06295688, "balance_loss_mlp": 0.01265372, "epoch": 0.2768675785359988, "flos": 22644408424320.0, "grad_norm": 1.6965901282226175, "language_loss": 0.70204705, "learning_rate": 3.3940376274817363e-06, "loss": 0.77995723, "num_input_tokens_seen": 99501255, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.1652832, "step": 4605, "time_per_iteration": 2.5945801734924316 }, { "auxiliary_loss_clip": 0.064138, "auxiliary_loss_mlp": 0.01267393, "balance_loss_clip": 0.06306174, "balance_loss_mlp": 0.01262183, "epoch": 0.27692770178866677, "flos": 66150772093440.0, "grad_norm": 0.6824831942721422, "language_loss": 0.57144594, "learning_rate": 3.3937583364482673e-06, "loss": 0.64825785, "num_input_tokens_seen": 99568925, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 0.05212402, "step": 4606, "time_per_iteration": 4.746167421340942 }, { "auxiliary_loss_clip": 0.06516872, "auxiliary_loss_mlp": 0.01282656, "balance_loss_clip": 0.06299474, "balance_loss_mlp": 0.01264155, "epoch": 0.27698782504133473, "flos": 26471545735680.0, "grad_norm": 2.16029419921608, "language_loss": 0.69638956, "learning_rate": 3.3934789925633424e-06, "loss": 0.7743848, "num_input_tokens_seen": 99588455, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.18493652, "step": 4607, "time_per_iteration": 4.153919458389282 }, { "auxiliary_loss_clip": 0.06507321, "auxiliary_loss_mlp": 0.01278008, "balance_loss_clip": 0.06301534, "balance_loss_mlp": 0.01261259, "epoch": 0.2770479482940027, "flos": 25891878389760.0, "grad_norm": 1.5928034836387186, "language_loss": 0.70322013, "learning_rate": 3.393199595837555e-06, "loss": 0.78107339, "num_input_tokens_seen": 99609355, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.16748047, "step": 4608, "time_per_iteration": 2.6301722526550293 }, { "auxiliary_loss_clip": 0.06512122, "auxiliary_loss_mlp": 0.01283217, "balance_loss_clip": 0.06297918, "balance_loss_mlp": 0.01266146, "epoch": 0.27710807154667066, "flos": 22863942921600.0, "grad_norm": 3.18073743386843, "language_loss": 0.73664844, "learning_rate": 3.392920146281499e-06, "loss": 0.81460178, "num_input_tokens_seen": 99628780, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.1706543, "step": 4609, "time_per_iteration": 2.601005792617798 }, { "auxiliary_loss_clip": 0.06512484, "auxiliary_loss_mlp": 0.01277982, "balance_loss_clip": 0.06298748, "balance_loss_mlp": 0.0126115, "epoch": 0.27716819479933863, "flos": 17716621806720.0, "grad_norm": 2.5210626061584787, "language_loss": 0.84772551, "learning_rate": 3.3926406439057714e-06, "loss": 0.92563009, "num_input_tokens_seen": 99644545, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.16833496, "step": 4610, "time_per_iteration": 2.5533292293548584 }, { "auxiliary_loss_clip": 0.06514178, "auxiliary_loss_mlp": 0.01275839, "balance_loss_clip": 0.06297763, "balance_loss_mlp": 0.01257183, "epoch": 0.2772283180520066, "flos": 19652125668480.0, "grad_norm": 1.8042389752860226, "language_loss": 0.69040185, "learning_rate": 3.3923610887209705e-06, "loss": 0.76830202, "num_input_tokens_seen": 99663125, "router_z_loss_clip": 2.16503906, "router_z_loss_mlp": 0.18664551, "step": 4611, "time_per_iteration": 2.7208714485168457 }, { "auxiliary_loss_clip": 0.06500407, "auxiliary_loss_mlp": 0.01276089, "balance_loss_clip": 0.06294729, "balance_loss_mlp": 0.01259305, "epoch": 0.27728844130467456, "flos": 21038960995200.0, "grad_norm": 1.8629245556832654, "language_loss": 0.74061984, "learning_rate": 3.392081480737698e-06, "loss": 0.81838477, "num_input_tokens_seen": 99682645, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.16796875, "step": 4612, "time_per_iteration": 2.583212375640869 }, { "auxiliary_loss_clip": 0.06512717, "auxiliary_loss_mlp": 0.01288334, "balance_loss_clip": 0.06297729, "balance_loss_mlp": 0.0126969, "epoch": 0.2773485645573425, "flos": 18995157331200.0, "grad_norm": 2.3100663992851733, "language_loss": 0.67146742, "learning_rate": 3.3918018199665563e-06, "loss": 0.74947798, "num_input_tokens_seen": 99700520, "router_z_loss_clip": 2.14746094, "router_z_loss_mlp": 0.1862793, "step": 4613, "time_per_iteration": 2.5740208625793457 }, { "auxiliary_loss_clip": 0.06506117, "auxiliary_loss_mlp": 0.01276783, "balance_loss_clip": 0.06297918, "balance_loss_mlp": 0.01258544, "epoch": 0.27740868781001055, "flos": 21474508118400.0, "grad_norm": 1.6706585663452302, "language_loss": 0.8011719, "learning_rate": 3.39152210641815e-06, "loss": 0.8790009, "num_input_tokens_seen": 99720355, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.18249512, "step": 4614, "time_per_iteration": 2.597900390625 }, { "auxiliary_loss_clip": 0.06508166, "auxiliary_loss_mlp": 0.01280117, "balance_loss_clip": 0.06295441, "balance_loss_mlp": 0.01260841, "epoch": 0.2774688110626785, "flos": 19833827247360.0, "grad_norm": 2.456244314430047, "language_loss": 0.80653012, "learning_rate": 3.3912423401030865e-06, "loss": 0.88441288, "num_input_tokens_seen": 99736090, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.19274902, "step": 4615, "time_per_iteration": 2.579213857650757 }, { "auxiliary_loss_clip": 0.06517009, "auxiliary_loss_mlp": 0.01278084, "balance_loss_clip": 0.06297582, "balance_loss_mlp": 0.01260584, "epoch": 0.2775289343153465, "flos": 18220916805120.0, "grad_norm": 3.0329738014858583, "language_loss": 0.64015764, "learning_rate": 3.3909625210319735e-06, "loss": 0.71810853, "num_input_tokens_seen": 99751805, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.17492676, "step": 4616, "time_per_iteration": 2.554746627807617 }, { "auxiliary_loss_clip": 0.06513439, "auxiliary_loss_mlp": 0.01282024, "balance_loss_clip": 0.06300057, "balance_loss_mlp": 0.01262522, "epoch": 0.27758905756801444, "flos": 16478141333760.0, "grad_norm": 1.8181507378747657, "language_loss": 0.82748473, "learning_rate": 3.3906826492154226e-06, "loss": 0.90543938, "num_input_tokens_seen": 99770610, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.19494629, "step": 4617, "time_per_iteration": 2.575435161590576 }, { "auxiliary_loss_clip": 0.06512295, "auxiliary_loss_mlp": 0.01278597, "balance_loss_clip": 0.06297959, "balance_loss_mlp": 0.01260918, "epoch": 0.2776491808206824, "flos": 18733219868160.0, "grad_norm": 2.5226843572151485, "language_loss": 0.77788818, "learning_rate": 3.3904027246640458e-06, "loss": 0.85579711, "num_input_tokens_seen": 99787305, "router_z_loss_clip": 2.14355469, "router_z_loss_mlp": 0.17675781, "step": 4618, "time_per_iteration": 2.5597898960113525 }, { "auxiliary_loss_clip": 0.06515203, "auxiliary_loss_mlp": 0.01279749, "balance_loss_clip": 0.06300932, "balance_loss_mlp": 0.01262809, "epoch": 0.27770930407335037, "flos": 28045742791680.0, "grad_norm": 1.6033922796601447, "language_loss": 0.85229778, "learning_rate": 3.390122747388459e-06, "loss": 0.93024731, "num_input_tokens_seen": 99808940, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.16943359, "step": 4619, "time_per_iteration": 2.624006748199463 }, { "auxiliary_loss_clip": 0.06498766, "auxiliary_loss_mlp": 0.0128665, "balance_loss_clip": 0.06294306, "balance_loss_mlp": 0.01269627, "epoch": 0.27776942732601834, "flos": 23556522044160.0, "grad_norm": 1.4080209185351498, "language_loss": 0.77202058, "learning_rate": 3.3898427173992778e-06, "loss": 0.84987473, "num_input_tokens_seen": 99829575, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.17028809, "step": 4620, "time_per_iteration": 2.6007964611053467 }, { "auxiliary_loss_clip": 0.0650364, "auxiliary_loss_mlp": 0.01286947, "balance_loss_clip": 0.06295573, "balance_loss_mlp": 0.01268124, "epoch": 0.2778295505786863, "flos": 23914474686720.0, "grad_norm": 1.827391265902833, "language_loss": 0.7916398, "learning_rate": 3.389562634707122e-06, "loss": 0.8695457, "num_input_tokens_seen": 99847575, "router_z_loss_clip": 2.07910156, "router_z_loss_mlp": 0.18811035, "step": 4621, "time_per_iteration": 2.5876832008361816 }, { "auxiliary_loss_clip": 0.06515114, "auxiliary_loss_mlp": 0.01280793, "balance_loss_clip": 0.06301101, "balance_loss_mlp": 0.01262113, "epoch": 0.27788967383135427, "flos": 25561276905600.0, "grad_norm": 2.172261160167225, "language_loss": 0.88581866, "learning_rate": 3.389282499322611e-06, "loss": 0.96377778, "num_input_tokens_seen": 99864995, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.18676758, "step": 4622, "time_per_iteration": 2.594437837600708 }, { "auxiliary_loss_clip": 0.06504343, "auxiliary_loss_mlp": 0.01275919, "balance_loss_clip": 0.06291766, "balance_loss_mlp": 0.01257799, "epoch": 0.27794979708402223, "flos": 16258103712000.0, "grad_norm": 2.09135611746021, "language_loss": 0.81373775, "learning_rate": 3.389002311256369e-06, "loss": 0.89154029, "num_input_tokens_seen": 99881540, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.18115234, "step": 4623, "time_per_iteration": 2.5547029972076416 }, { "auxiliary_loss_clip": 0.06509246, "auxiliary_loss_mlp": 0.01280456, "balance_loss_clip": 0.06299241, "balance_loss_mlp": 0.01262431, "epoch": 0.2780099203366902, "flos": 20673880755840.0, "grad_norm": 1.7248058007517808, "language_loss": 0.81950593, "learning_rate": 3.3887220705190204e-06, "loss": 0.89740288, "num_input_tokens_seen": 99899595, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.18017578, "step": 4624, "time_per_iteration": 2.5716969966888428 }, { "auxiliary_loss_clip": 0.06508955, "auxiliary_loss_mlp": 0.01274892, "balance_loss_clip": 0.06300688, "balance_loss_mlp": 0.01256713, "epoch": 0.27807004358935816, "flos": 17743805256960.0, "grad_norm": 2.89571599095952, "language_loss": 0.77248871, "learning_rate": 3.388441777121191e-06, "loss": 0.85032719, "num_input_tokens_seen": 99913020, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.1817627, "step": 4625, "time_per_iteration": 2.5043461322784424 }, { "auxiliary_loss_clip": 0.06507725, "auxiliary_loss_mlp": 0.01278964, "balance_loss_clip": 0.06300378, "balance_loss_mlp": 0.01261845, "epoch": 0.2781301668420261, "flos": 16732699637760.0, "grad_norm": 2.353157409839831, "language_loss": 0.70450485, "learning_rate": 3.388161431073511e-06, "loss": 0.78237176, "num_input_tokens_seen": 99931405, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.17102051, "step": 4626, "time_per_iteration": 2.5668118000030518 }, { "auxiliary_loss_clip": 0.0651709, "auxiliary_loss_mlp": 0.01282759, "balance_loss_clip": 0.06301989, "balance_loss_mlp": 0.01264616, "epoch": 0.27819029009469415, "flos": 13849848714240.0, "grad_norm": 3.1492071293605832, "language_loss": 0.93233836, "learning_rate": 3.38788103238661e-06, "loss": 1.01033688, "num_input_tokens_seen": 99948100, "router_z_loss_clip": 2.15136719, "router_z_loss_mlp": 0.18164062, "step": 4627, "time_per_iteration": 2.543226957321167 }, { "auxiliary_loss_clip": 0.06515488, "auxiliary_loss_mlp": 0.01280652, "balance_loss_clip": 0.06300171, "balance_loss_mlp": 0.01262639, "epoch": 0.2782504133473621, "flos": 27096634794240.0, "grad_norm": 1.6581358915051672, "language_loss": 0.85562366, "learning_rate": 3.387600581071121e-06, "loss": 0.93358505, "num_input_tokens_seen": 99966470, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.18005371, "step": 4628, "time_per_iteration": 2.6076138019561768 }, { "auxiliary_loss_clip": 0.06509903, "auxiliary_loss_mlp": 0.01286736, "balance_loss_clip": 0.06299132, "balance_loss_mlp": 0.01268711, "epoch": 0.2783105366000301, "flos": 21075116832000.0, "grad_norm": 1.4631088909762477, "language_loss": 0.80019403, "learning_rate": 3.387320077137679e-06, "loss": 0.87816048, "num_input_tokens_seen": 99985930, "router_z_loss_clip": 2.10449219, "router_z_loss_mlp": 0.18029785, "step": 4629, "time_per_iteration": 2.5802664756774902 }, { "auxiliary_loss_clip": 0.06499936, "auxiliary_loss_mlp": 0.01287399, "balance_loss_clip": 0.06295975, "balance_loss_mlp": 0.01270162, "epoch": 0.27837065985269804, "flos": 26508456259200.0, "grad_norm": 2.2465730554186796, "language_loss": 0.85192114, "learning_rate": 3.3870395205969208e-06, "loss": 0.92979449, "num_input_tokens_seen": 100006235, "router_z_loss_clip": 2.0390625, "router_z_loss_mlp": 0.17236328, "step": 4630, "time_per_iteration": 2.621962547302246 }, { "auxiliary_loss_clip": 0.06511262, "auxiliary_loss_mlp": 0.0128254, "balance_loss_clip": 0.06296448, "balance_loss_mlp": 0.01263932, "epoch": 0.278430783105366, "flos": 20228271143040.0, "grad_norm": 2.1609807829804897, "language_loss": 0.82036942, "learning_rate": 3.386758911459485e-06, "loss": 0.89830744, "num_input_tokens_seen": 100023655, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.18603516, "step": 4631, "time_per_iteration": 2.5644118785858154 }, { "auxiliary_loss_clip": 0.06513963, "auxiliary_loss_mlp": 0.01278891, "balance_loss_clip": 0.06297617, "balance_loss_mlp": 0.01261415, "epoch": 0.278490906358034, "flos": 25599906437760.0, "grad_norm": 1.7893426214040657, "language_loss": 0.71468747, "learning_rate": 3.3864782497360126e-06, "loss": 0.79261601, "num_input_tokens_seen": 100043280, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.17480469, "step": 4632, "time_per_iteration": 2.622192144393921 }, { "auxiliary_loss_clip": 0.0650124, "auxiliary_loss_mlp": 0.01292772, "balance_loss_clip": 0.06296872, "balance_loss_mlp": 0.01275475, "epoch": 0.27855102961070194, "flos": 16175645694720.0, "grad_norm": 1.7479846816419888, "language_loss": 0.82846165, "learning_rate": 3.386197535437145e-06, "loss": 0.90640175, "num_input_tokens_seen": 100057690, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.1730957, "step": 4633, "time_per_iteration": 2.5349807739257812 }, { "auxiliary_loss_clip": 0.06506816, "auxiliary_loss_mlp": 0.01290184, "balance_loss_clip": 0.06295267, "balance_loss_mlp": 0.01270598, "epoch": 0.2786111528633699, "flos": 22933864753920.0, "grad_norm": 1.728058623976338, "language_loss": 0.8849141, "learning_rate": 3.385916768573529e-06, "loss": 0.96288413, "num_input_tokens_seen": 100075875, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.19592285, "step": 4634, "time_per_iteration": 4.020534515380859 }, { "auxiliary_loss_clip": 0.06510802, "auxiliary_loss_mlp": 0.01285885, "balance_loss_clip": 0.0629732, "balance_loss_mlp": 0.01266943, "epoch": 0.27867127611603787, "flos": 23410934375040.0, "grad_norm": 1.7035445004677257, "language_loss": 0.77274251, "learning_rate": 3.38563594915581e-06, "loss": 0.85070932, "num_input_tokens_seen": 100092930, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.1895752, "step": 4635, "time_per_iteration": 2.597691297531128 }, { "auxiliary_loss_clip": 0.06509695, "auxiliary_loss_mlp": 0.01280096, "balance_loss_clip": 0.06298777, "balance_loss_mlp": 0.01263049, "epoch": 0.27873139936870583, "flos": 19835210839680.0, "grad_norm": 1.6916628110198304, "language_loss": 0.66344506, "learning_rate": 3.385355077194637e-06, "loss": 0.7413429, "num_input_tokens_seen": 100110790, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.17053223, "step": 4636, "time_per_iteration": 2.5599236488342285 }, { "auxiliary_loss_clip": 0.06516692, "auxiliary_loss_mlp": 0.01280187, "balance_loss_clip": 0.06299158, "balance_loss_mlp": 0.01261519, "epoch": 0.2787915226213738, "flos": 17712638737920.0, "grad_norm": 2.7862406372439392, "language_loss": 0.84146202, "learning_rate": 3.3850741527006604e-06, "loss": 0.91943073, "num_input_tokens_seen": 100126970, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.18664551, "step": 4637, "time_per_iteration": 2.530677080154419 }, { "auxiliary_loss_clip": 0.06503434, "auxiliary_loss_mlp": 0.01279209, "balance_loss_clip": 0.06295777, "balance_loss_mlp": 0.01262424, "epoch": 0.27885164587404176, "flos": 22097039627520.0, "grad_norm": 1.7068684712162236, "language_loss": 0.76067817, "learning_rate": 3.384793175684533e-06, "loss": 0.83850467, "num_input_tokens_seen": 100146720, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.16796875, "step": 4638, "time_per_iteration": 4.024580478668213 }, { "auxiliary_loss_clip": 0.06508227, "auxiliary_loss_mlp": 0.0128046, "balance_loss_clip": 0.06295672, "balance_loss_mlp": 0.01262674, "epoch": 0.27891176912670973, "flos": 19213601725440.0, "grad_norm": 1.4433441170796426, "language_loss": 0.72028565, "learning_rate": 3.38451214615691e-06, "loss": 0.79817259, "num_input_tokens_seen": 100165920, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.17797852, "step": 4639, "time_per_iteration": 2.5521328449249268 }, { "auxiliary_loss_clip": 0.06511725, "auxiliary_loss_mlp": 0.01274744, "balance_loss_clip": 0.06299588, "balance_loss_mlp": 0.0125672, "epoch": 0.27897189237937775, "flos": 27607428483840.0, "grad_norm": 3.4439191750743383, "language_loss": 0.66077167, "learning_rate": 3.384231064128447e-06, "loss": 0.73863637, "num_input_tokens_seen": 100185525, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.18017578, "step": 4640, "time_per_iteration": 2.6132514476776123 }, { "auxiliary_loss_clip": 0.06508341, "auxiliary_loss_mlp": 0.01274083, "balance_loss_clip": 0.06296958, "balance_loss_mlp": 0.01257346, "epoch": 0.2790320156320457, "flos": 21184506737280.0, "grad_norm": 1.9544881191712893, "language_loss": 0.7254045, "learning_rate": 3.383949929609804e-06, "loss": 0.80322868, "num_input_tokens_seen": 100204850, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.16748047, "step": 4641, "time_per_iteration": 2.59529709815979 }, { "auxiliary_loss_clip": 0.0651703, "auxiliary_loss_mlp": 0.01282902, "balance_loss_clip": 0.06299844, "balance_loss_mlp": 0.01264377, "epoch": 0.2790921388847137, "flos": 22790541144960.0, "grad_norm": 1.8374290710029757, "language_loss": 0.76111829, "learning_rate": 3.383668742611641e-06, "loss": 0.83911753, "num_input_tokens_seen": 100224520, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.18530273, "step": 4642, "time_per_iteration": 2.6032822132110596 }, { "auxiliary_loss_clip": 0.06514063, "auxiliary_loss_mlp": 0.01274515, "balance_loss_clip": 0.06300102, "balance_loss_mlp": 0.01255346, "epoch": 0.27915226213738165, "flos": 23406783598080.0, "grad_norm": 1.8273702538338776, "language_loss": 0.86547142, "learning_rate": 3.3833875031446205e-06, "loss": 0.94335723, "num_input_tokens_seen": 100243935, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.19177246, "step": 4643, "time_per_iteration": 2.5931103229522705 }, { "auxiliary_loss_clip": 0.06509171, "auxiliary_loss_mlp": 0.01274205, "balance_loss_clip": 0.06298001, "balance_loss_mlp": 0.01257099, "epoch": 0.2792123853900496, "flos": 22754469162240.0, "grad_norm": 2.3889453256514406, "language_loss": 0.83413744, "learning_rate": 3.383106211219407e-06, "loss": 0.91197115, "num_input_tokens_seen": 100262290, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.17114258, "step": 4644, "time_per_iteration": 2.6005427837371826 }, { "auxiliary_loss_clip": 0.06507453, "auxiliary_loss_mlp": 0.01273892, "balance_loss_clip": 0.06296309, "balance_loss_mlp": 0.01256105, "epoch": 0.2792725086427176, "flos": 15054772826880.0, "grad_norm": 1.7410275860088835, "language_loss": 0.79314852, "learning_rate": 3.3828248668466673e-06, "loss": 0.8709619, "num_input_tokens_seen": 100280015, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.17785645, "step": 4645, "time_per_iteration": 3.979400157928467 }, { "auxiliary_loss_clip": 0.06426115, "auxiliary_loss_mlp": 0.01292038, "balance_loss_clip": 0.06319365, "balance_loss_mlp": 0.01287106, "epoch": 0.27933263189538554, "flos": 62562805862400.0, "grad_norm": 0.7671189095209804, "language_loss": 0.6210016, "learning_rate": 3.3825434700370705e-06, "loss": 0.69818318, "num_input_tokens_seen": 100338935, "router_z_loss_clip": 1.06347656, "router_z_loss_mlp": 0.04928589, "step": 4646, "time_per_iteration": 3.183959722518921 }, { "auxiliary_loss_clip": 0.06509653, "auxiliary_loss_mlp": 0.01274567, "balance_loss_clip": 0.06303217, "balance_loss_mlp": 0.01257211, "epoch": 0.2793927551480535, "flos": 25125268584960.0, "grad_norm": 1.6618145498824426, "language_loss": 0.89850575, "learning_rate": 3.3822620208012865e-06, "loss": 0.97634792, "num_input_tokens_seen": 100359905, "router_z_loss_clip": 2.06152344, "router_z_loss_mlp": 0.17358398, "step": 4647, "time_per_iteration": 4.066488265991211 }, { "auxiliary_loss_clip": 0.06515495, "auxiliary_loss_mlp": 0.0127698, "balance_loss_clip": 0.06299899, "balance_loss_mlp": 0.01258729, "epoch": 0.27945287840072147, "flos": 21330974874240.0, "grad_norm": 1.5467447267008732, "language_loss": 0.87583202, "learning_rate": 3.381980519149988e-06, "loss": 0.95375681, "num_input_tokens_seen": 100376955, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.18237305, "step": 4648, "time_per_iteration": 2.5656559467315674 }, { "auxiliary_loss_clip": 0.06515987, "auxiliary_loss_mlp": 0.01275689, "balance_loss_clip": 0.06300297, "balance_loss_mlp": 0.01257462, "epoch": 0.27951300165338944, "flos": 27457354621440.0, "grad_norm": 2.3207273296459454, "language_loss": 0.73519701, "learning_rate": 3.38169896509385e-06, "loss": 0.81311381, "num_input_tokens_seen": 100397545, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.18212891, "step": 4649, "time_per_iteration": 2.6368536949157715 }, { "auxiliary_loss_clip": 0.06514679, "auxiliary_loss_mlp": 0.01282253, "balance_loss_clip": 0.06302802, "balance_loss_mlp": 0.01263942, "epoch": 0.2795731249060574, "flos": 15164456221440.0, "grad_norm": 2.064602278646374, "language_loss": 0.81222957, "learning_rate": 3.381417358643549e-06, "loss": 0.89019895, "num_input_tokens_seen": 100415080, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.18310547, "step": 4650, "time_per_iteration": 2.5535593032836914 }, { "auxiliary_loss_clip": 0.06409328, "auxiliary_loss_mlp": 0.01259481, "balance_loss_clip": 0.06301939, "balance_loss_mlp": 0.01254346, "epoch": 0.27963324815872537, "flos": 60140951775360.0, "grad_norm": 0.7803085635608354, "language_loss": 0.5874058, "learning_rate": 3.3811356998097624e-06, "loss": 0.66409385, "num_input_tokens_seen": 100471105, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 0.05136108, "step": 4651, "time_per_iteration": 3.2318434715270996 }, { "auxiliary_loss_clip": 0.06520258, "auxiliary_loss_mlp": 0.01281627, "balance_loss_clip": 0.06301757, "balance_loss_mlp": 0.01263353, "epoch": 0.27969337141139333, "flos": 21773020688640.0, "grad_norm": 1.687434741514707, "language_loss": 0.75017929, "learning_rate": 3.3808539886031726e-06, "loss": 0.82819813, "num_input_tokens_seen": 100492520, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.18286133, "step": 4652, "time_per_iteration": 2.6021692752838135 }, { "auxiliary_loss_clip": 0.06518104, "auxiliary_loss_mlp": 0.01283973, "balance_loss_clip": 0.06302898, "balance_loss_mlp": 0.01266604, "epoch": 0.27975349466406135, "flos": 39859559072640.0, "grad_norm": 2.4716069813006207, "language_loss": 0.79968464, "learning_rate": 3.380572225034461e-06, "loss": 0.87770534, "num_input_tokens_seen": 100512870, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.17370605, "step": 4653, "time_per_iteration": 2.703747034072876 }, { "auxiliary_loss_clip": 0.06509872, "auxiliary_loss_mlp": 0.0128742, "balance_loss_clip": 0.06298055, "balance_loss_mlp": 0.01270254, "epoch": 0.2798136179167293, "flos": 21586204010880.0, "grad_norm": 2.2087005065921157, "language_loss": 0.79211462, "learning_rate": 3.380290409114312e-06, "loss": 0.8700875, "num_input_tokens_seen": 100531655, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.17163086, "step": 4654, "time_per_iteration": 2.576678514480591 }, { "auxiliary_loss_clip": 0.06522661, "auxiliary_loss_mlp": 0.01289574, "balance_loss_clip": 0.06303069, "balance_loss_mlp": 0.01270882, "epoch": 0.2798737411693973, "flos": 21543130212480.0, "grad_norm": 2.2751010066166435, "language_loss": 0.80962473, "learning_rate": 3.3800085408534127e-06, "loss": 0.88774717, "num_input_tokens_seen": 100548005, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.18688965, "step": 4655, "time_per_iteration": 2.6731886863708496 }, { "auxiliary_loss_clip": 0.06510564, "auxiliary_loss_mlp": 0.01283874, "balance_loss_clip": 0.06297353, "balance_loss_mlp": 0.01265432, "epoch": 0.27993386442206525, "flos": 26988586554240.0, "grad_norm": 2.171200492700414, "language_loss": 0.82034016, "learning_rate": 3.3797266202624506e-06, "loss": 0.89828449, "num_input_tokens_seen": 100567980, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.18444824, "step": 4656, "time_per_iteration": 2.6419496536254883 }, { "auxiliary_loss_clip": 0.06508762, "auxiliary_loss_mlp": 0.01289581, "balance_loss_clip": 0.0629748, "balance_loss_mlp": 0.01271235, "epoch": 0.2799939876747332, "flos": 24356268938880.0, "grad_norm": 1.872222445478087, "language_loss": 0.83547455, "learning_rate": 3.3794446473521176e-06, "loss": 0.91345799, "num_input_tokens_seen": 100588630, "router_z_loss_clip": 2.11425781, "router_z_loss_mlp": 0.18334961, "step": 4657, "time_per_iteration": 2.599072217941284 }, { "auxiliary_loss_clip": 0.06514128, "auxiliary_loss_mlp": 0.01283915, "balance_loss_clip": 0.06300899, "balance_loss_mlp": 0.0126607, "epoch": 0.2800541109274012, "flos": 33665479626240.0, "grad_norm": 2.079384573483238, "language_loss": 0.65510404, "learning_rate": 3.379162622133105e-06, "loss": 0.73308444, "num_input_tokens_seen": 100608775, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.17858887, "step": 4658, "time_per_iteration": 2.690825939178467 }, { "auxiliary_loss_clip": 0.0650598, "auxiliary_loss_mlp": 0.01288206, "balance_loss_clip": 0.06295058, "balance_loss_mlp": 0.01269943, "epoch": 0.28011423418006914, "flos": 21620515057920.0, "grad_norm": 2.048379256411462, "language_loss": 0.78728414, "learning_rate": 3.3788805446161073e-06, "loss": 0.86522603, "num_input_tokens_seen": 100627975, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.18261719, "step": 4659, "time_per_iteration": 2.7594716548919678 }, { "auxiliary_loss_clip": 0.06507747, "auxiliary_loss_mlp": 0.01280895, "balance_loss_clip": 0.06295821, "balance_loss_mlp": 0.01263061, "epoch": 0.2801743574327371, "flos": 23119130131200.0, "grad_norm": 2.026997153666328, "language_loss": 0.7928707, "learning_rate": 3.3785984148118215e-06, "loss": 0.8707571, "num_input_tokens_seen": 100645430, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.17822266, "step": 4660, "time_per_iteration": 2.613037109375 }, { "auxiliary_loss_clip": 0.06507154, "auxiliary_loss_mlp": 0.01276997, "balance_loss_clip": 0.06300262, "balance_loss_mlp": 0.01259211, "epoch": 0.2802344806854051, "flos": 12646433975040.0, "grad_norm": 1.7566433258996774, "language_loss": 0.80736661, "learning_rate": 3.3783162327309453e-06, "loss": 0.88520819, "num_input_tokens_seen": 100663775, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.17773438, "step": 4661, "time_per_iteration": 2.563378095626831 }, { "auxiliary_loss_clip": 0.06509119, "auxiliary_loss_mlp": 0.01288646, "balance_loss_clip": 0.06298479, "balance_loss_mlp": 0.01270884, "epoch": 0.28029460393807304, "flos": 37276772019840.0, "grad_norm": 1.594232375543556, "language_loss": 0.79577839, "learning_rate": 3.3780339983841794e-06, "loss": 0.87375617, "num_input_tokens_seen": 100686085, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.1776123, "step": 4662, "time_per_iteration": 2.7134368419647217 }, { "auxiliary_loss_clip": 0.06520776, "auxiliary_loss_mlp": 0.0128572, "balance_loss_clip": 0.06302311, "balance_loss_mlp": 0.01266575, "epoch": 0.280354727190741, "flos": 20747450240640.0, "grad_norm": 1.8886307870055214, "language_loss": 0.70038271, "learning_rate": 3.377751711782227e-06, "loss": 0.77844769, "num_input_tokens_seen": 100705135, "router_z_loss_clip": 2.18261719, "router_z_loss_mlp": 0.19140625, "step": 4663, "time_per_iteration": 2.5718510150909424 }, { "auxiliary_loss_clip": 0.06514513, "auxiliary_loss_mlp": 0.01286143, "balance_loss_clip": 0.06299199, "balance_loss_mlp": 0.01265734, "epoch": 0.28041485044340897, "flos": 21477526865280.0, "grad_norm": 1.7811109476491154, "language_loss": 0.78050154, "learning_rate": 3.377469372935791e-06, "loss": 0.85850811, "num_input_tokens_seen": 100724960, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.20397949, "step": 4664, "time_per_iteration": 2.5874602794647217 }, { "auxiliary_loss_clip": 0.06503396, "auxiliary_loss_mlp": 0.01281319, "balance_loss_clip": 0.06298913, "balance_loss_mlp": 0.01264761, "epoch": 0.28047497369607693, "flos": 14799669471360.0, "grad_norm": 1.9729822687045335, "language_loss": 0.79643142, "learning_rate": 3.377186981855578e-06, "loss": 0.8742786, "num_input_tokens_seen": 100741995, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.16552734, "step": 4665, "time_per_iteration": 2.531602382659912 }, { "auxiliary_loss_clip": 0.06508426, "auxiliary_loss_mlp": 0.0127493, "balance_loss_clip": 0.06299749, "balance_loss_mlp": 0.01258646, "epoch": 0.2805350969487449, "flos": 23076559457280.0, "grad_norm": 1.6767272059598235, "language_loss": 0.80911696, "learning_rate": 3.3769045385522968e-06, "loss": 0.88695055, "num_input_tokens_seen": 100758985, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.16271973, "step": 4666, "time_per_iteration": 2.5899720191955566 }, { "auxiliary_loss_clip": 0.06514998, "auxiliary_loss_mlp": 0.01275388, "balance_loss_clip": 0.06303302, "balance_loss_mlp": 0.0125622, "epoch": 0.2805952202014129, "flos": 20485177361280.0, "grad_norm": 2.0419100151958185, "language_loss": 0.84844953, "learning_rate": 3.376622043036658e-06, "loss": 0.92635339, "num_input_tokens_seen": 100777820, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.19165039, "step": 4667, "time_per_iteration": 2.5651144981384277 }, { "auxiliary_loss_clip": 0.06515411, "auxiliary_loss_mlp": 0.01275575, "balance_loss_clip": 0.06302612, "balance_loss_mlp": 0.01257587, "epoch": 0.2806553434540809, "flos": 27424678728960.0, "grad_norm": 1.763039566903387, "language_loss": 0.80451655, "learning_rate": 3.376339495319373e-06, "loss": 0.88242644, "num_input_tokens_seen": 100798205, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.17980957, "step": 4668, "time_per_iteration": 2.6247236728668213 }, { "auxiliary_loss_clip": 0.06518485, "auxiliary_loss_mlp": 0.01273094, "balance_loss_clip": 0.06303729, "balance_loss_mlp": 0.01255809, "epoch": 0.28071546670674885, "flos": 26512187765760.0, "grad_norm": 1.8707100484326864, "language_loss": 0.76612401, "learning_rate": 3.3760568954111563e-06, "loss": 0.8440398, "num_input_tokens_seen": 100819800, "router_z_loss_clip": 2.14941406, "router_z_loss_mlp": 0.17285156, "step": 4669, "time_per_iteration": 2.6159911155700684 }, { "auxiliary_loss_clip": 0.06509089, "auxiliary_loss_mlp": 0.01285665, "balance_loss_clip": 0.06297901, "balance_loss_mlp": 0.01267772, "epoch": 0.2807755899594168, "flos": 20564993975040.0, "grad_norm": 3.6101884194518554, "language_loss": 0.79583937, "learning_rate": 3.375774243322725e-06, "loss": 0.87378693, "num_input_tokens_seen": 100837880, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.17883301, "step": 4670, "time_per_iteration": 2.5650529861450195 }, { "auxiliary_loss_clip": 0.06514367, "auxiliary_loss_mlp": 0.01276909, "balance_loss_clip": 0.06298301, "balance_loss_mlp": 0.01259016, "epoch": 0.2808357132120848, "flos": 24319693831680.0, "grad_norm": 1.859100105183679, "language_loss": 0.79978716, "learning_rate": 3.3754915390647955e-06, "loss": 0.87769997, "num_input_tokens_seen": 100856350, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.17895508, "step": 4671, "time_per_iteration": 2.596482753753662 }, { "auxiliary_loss_clip": 0.06504397, "auxiliary_loss_mlp": 0.01270213, "balance_loss_clip": 0.06299879, "balance_loss_mlp": 0.01253846, "epoch": 0.28089583646475275, "flos": 26439624529920.0, "grad_norm": 1.99258299092616, "language_loss": 0.75469977, "learning_rate": 3.37520878264809e-06, "loss": 0.83244592, "num_input_tokens_seen": 100876135, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.16357422, "step": 4672, "time_per_iteration": 2.61737322807312 }, { "auxiliary_loss_clip": 0.06517271, "auxiliary_loss_mlp": 0.01272914, "balance_loss_clip": 0.06302831, "balance_loss_mlp": 0.01254115, "epoch": 0.2809559597174207, "flos": 23118417371520.0, "grad_norm": 2.368210865892898, "language_loss": 0.76003784, "learning_rate": 3.3749259740833286e-06, "loss": 0.83793968, "num_input_tokens_seen": 100894790, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.18811035, "step": 4673, "time_per_iteration": 4.132713079452515 }, { "auxiliary_loss_clip": 0.06512688, "auxiliary_loss_mlp": 0.01279829, "balance_loss_clip": 0.06301831, "balance_loss_mlp": 0.01262401, "epoch": 0.2810160829700887, "flos": 20929864579200.0, "grad_norm": 2.692283681089038, "language_loss": 0.73009968, "learning_rate": 3.374643113381237e-06, "loss": 0.80802488, "num_input_tokens_seen": 100915100, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.17431641, "step": 4674, "time_per_iteration": 2.5970265865325928 }, { "auxiliary_loss_clip": 0.0651089, "auxiliary_loss_mlp": 0.01276005, "balance_loss_clip": 0.062994, "balance_loss_mlp": 0.01258367, "epoch": 0.28107620622275664, "flos": 14361145528320.0, "grad_norm": 1.823703697570541, "language_loss": 0.77964127, "learning_rate": 3.374360200552541e-06, "loss": 0.85751015, "num_input_tokens_seen": 100932795, "router_z_loss_clip": 2.11425781, "router_z_loss_mlp": 0.17633057, "step": 4675, "time_per_iteration": 2.5621142387390137 }, { "auxiliary_loss_clip": 0.06513613, "auxiliary_loss_mlp": 0.01278563, "balance_loss_clip": 0.0629977, "balance_loss_mlp": 0.01260539, "epoch": 0.2811363294754246, "flos": 20924707553280.0, "grad_norm": 4.185444085247077, "language_loss": 0.7087028, "learning_rate": 3.374077235607968e-06, "loss": 0.78662461, "num_input_tokens_seen": 100950505, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.18017578, "step": 4676, "time_per_iteration": 2.5687170028686523 }, { "auxiliary_loss_clip": 0.06498665, "auxiliary_loss_mlp": 0.01274572, "balance_loss_clip": 0.06298921, "balance_loss_mlp": 0.01257907, "epoch": 0.28119645272809257, "flos": 20601107884800.0, "grad_norm": 1.6075064598316728, "language_loss": 0.70660728, "learning_rate": 3.3737942185582487e-06, "loss": 0.78433967, "num_input_tokens_seen": 100968790, "router_z_loss_clip": 1.99609375, "router_z_loss_mlp": 0.16662598, "step": 4677, "time_per_iteration": 2.5817127227783203 }, { "auxiliary_loss_clip": 0.06508036, "auxiliary_loss_mlp": 0.0127911, "balance_loss_clip": 0.06299563, "balance_loss_mlp": 0.01260049, "epoch": 0.28125657598076054, "flos": 25344383811840.0, "grad_norm": 1.703568096148395, "language_loss": 0.64077377, "learning_rate": 3.3735111494141153e-06, "loss": 0.71864522, "num_input_tokens_seen": 100990205, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.19055176, "step": 4678, "time_per_iteration": 4.036980867385864 }, { "auxiliary_loss_clip": 0.0650167, "auxiliary_loss_mlp": 0.01284837, "balance_loss_clip": 0.06293412, "balance_loss_mlp": 0.01268088, "epoch": 0.2813166992334285, "flos": 24834051319680.0, "grad_norm": 1.495289997358307, "language_loss": 0.71210372, "learning_rate": 3.3732280281863013e-06, "loss": 0.78996873, "num_input_tokens_seen": 101009815, "router_z_loss_clip": 2.08300781, "router_z_loss_mlp": 0.1673584, "step": 4679, "time_per_iteration": 2.629681348800659 }, { "auxiliary_loss_clip": 0.06512935, "auxiliary_loss_mlp": 0.01280342, "balance_loss_clip": 0.06300355, "balance_loss_mlp": 0.01261888, "epoch": 0.2813768224860965, "flos": 21766941267840.0, "grad_norm": 1.778257877659253, "language_loss": 0.75318056, "learning_rate": 3.3729448548855422e-06, "loss": 0.83111334, "num_input_tokens_seen": 101026780, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.18444824, "step": 4680, "time_per_iteration": 2.564197063446045 }, { "auxiliary_loss_clip": 0.06503617, "auxiliary_loss_mlp": 0.01275885, "balance_loss_clip": 0.0629352, "balance_loss_mlp": 0.01259195, "epoch": 0.2814369457387645, "flos": 24323760754560.0, "grad_norm": 1.650139267540454, "language_loss": 0.77846354, "learning_rate": 3.3726616295225774e-06, "loss": 0.85625851, "num_input_tokens_seen": 101046215, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.16687012, "step": 4681, "time_per_iteration": 2.6241798400878906 }, { "auxiliary_loss_clip": 0.06512983, "auxiliary_loss_mlp": 0.01289928, "balance_loss_clip": 0.06299835, "balance_loss_mlp": 0.01272381, "epoch": 0.28149706899143245, "flos": 18521274165120.0, "grad_norm": 2.05365822377347, "language_loss": 0.74833304, "learning_rate": 3.372378352108146e-06, "loss": 0.82636213, "num_input_tokens_seen": 101063365, "router_z_loss_clip": 2.13183594, "router_z_loss_mlp": 0.1751709, "step": 4682, "time_per_iteration": 2.5637519359588623 }, { "auxiliary_loss_clip": 0.06502919, "auxiliary_loss_mlp": 0.0128518, "balance_loss_clip": 0.06297392, "balance_loss_mlp": 0.01268515, "epoch": 0.2815571922441004, "flos": 24870165229440.0, "grad_norm": 1.458929598601525, "language_loss": 0.81453484, "learning_rate": 3.3720950226529894e-06, "loss": 0.89241582, "num_input_tokens_seen": 101083835, "router_z_loss_clip": 2.05761719, "router_z_loss_mlp": 0.16662598, "step": 4683, "time_per_iteration": 2.5848896503448486 }, { "auxiliary_loss_clip": 0.0650617, "auxiliary_loss_mlp": 0.01285533, "balance_loss_clip": 0.06293365, "balance_loss_mlp": 0.01267735, "epoch": 0.2816173154967684, "flos": 19907774075520.0, "grad_norm": 1.8825751265444743, "language_loss": 0.77025712, "learning_rate": 3.371811641167852e-06, "loss": 0.84817421, "num_input_tokens_seen": 101101740, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.17810059, "step": 4684, "time_per_iteration": 3.952099561691284 }, { "auxiliary_loss_clip": 0.06499749, "auxiliary_loss_mlp": 0.01276307, "balance_loss_clip": 0.0629114, "balance_loss_mlp": 0.01258926, "epoch": 0.28167743874943635, "flos": 17496709966080.0, "grad_norm": 1.8300593191739705, "language_loss": 0.77037132, "learning_rate": 3.3715282076634807e-06, "loss": 0.8481319, "num_input_tokens_seen": 101120480, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.17382812, "step": 4685, "time_per_iteration": 2.5402095317840576 }, { "auxiliary_loss_clip": 0.06499627, "auxiliary_loss_mlp": 0.01282702, "balance_loss_clip": 0.06293553, "balance_loss_mlp": 0.01266299, "epoch": 0.2817375620021043, "flos": 25309276151040.0, "grad_norm": 1.5557418773180545, "language_loss": 0.76325995, "learning_rate": 3.3712447221506218e-06, "loss": 0.84108329, "num_input_tokens_seen": 101142910, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.16394043, "step": 4686, "time_per_iteration": 4.070720911026001 }, { "auxiliary_loss_clip": 0.06507604, "auxiliary_loss_mlp": 0.01282755, "balance_loss_clip": 0.06294449, "balance_loss_mlp": 0.01264278, "epoch": 0.2817976852547723, "flos": 18698447623680.0, "grad_norm": 2.4723742854376214, "language_loss": 0.64121294, "learning_rate": 3.370961184640025e-06, "loss": 0.71911657, "num_input_tokens_seen": 101160030, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.18469238, "step": 4687, "time_per_iteration": 2.5397562980651855 }, { "auxiliary_loss_clip": 0.06505511, "auxiliary_loss_mlp": 0.01283209, "balance_loss_clip": 0.06292874, "balance_loss_mlp": 0.01265649, "epoch": 0.28185780850744024, "flos": 22748012398080.0, "grad_norm": 2.036398570945939, "language_loss": 0.77005422, "learning_rate": 3.3706775951424433e-06, "loss": 0.8479414, "num_input_tokens_seen": 101177675, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.17553711, "step": 4688, "time_per_iteration": 2.584791898727417 }, { "auxiliary_loss_clip": 0.06503227, "auxiliary_loss_mlp": 0.01279864, "balance_loss_clip": 0.06293958, "balance_loss_mlp": 0.01263961, "epoch": 0.2819179317601082, "flos": 14938297251840.0, "grad_norm": 1.9032827204603893, "language_loss": 0.79148442, "learning_rate": 3.37039395366863e-06, "loss": 0.86931533, "num_input_tokens_seen": 101192225, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.15893555, "step": 4689, "time_per_iteration": 2.5162367820739746 }, { "auxiliary_loss_clip": 0.06502923, "auxiliary_loss_mlp": 0.01281899, "balance_loss_clip": 0.06294243, "balance_loss_mlp": 0.01264494, "epoch": 0.2819780550127762, "flos": 23151428680320.0, "grad_norm": 1.7190391219702754, "language_loss": 0.77997512, "learning_rate": 3.37011026022934e-06, "loss": 0.85782337, "num_input_tokens_seen": 101210870, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.1739502, "step": 4690, "time_per_iteration": 2.5616018772125244 }, { "auxiliary_loss_clip": 0.06509486, "auxiliary_loss_mlp": 0.01280939, "balance_loss_clip": 0.06298173, "balance_loss_mlp": 0.01264309, "epoch": 0.28203817826544414, "flos": 21622779118080.0, "grad_norm": 2.7668568380798573, "language_loss": 0.88556606, "learning_rate": 3.369826514835332e-06, "loss": 0.96347034, "num_input_tokens_seen": 101229965, "router_z_loss_clip": 2.11425781, "router_z_loss_mlp": 0.1661377, "step": 4691, "time_per_iteration": 2.5691585540771484 }, { "auxiliary_loss_clip": 0.06517316, "auxiliary_loss_mlp": 0.0127789, "balance_loss_clip": 0.06299609, "balance_loss_mlp": 0.01261523, "epoch": 0.2820983015181121, "flos": 24034010935680.0, "grad_norm": 1.7147061782905917, "language_loss": 0.82221138, "learning_rate": 3.3695427174973654e-06, "loss": 0.90016341, "num_input_tokens_seen": 101250980, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.16357422, "step": 4692, "time_per_iteration": 2.60258150100708 }, { "auxiliary_loss_clip": 0.06506602, "auxiliary_loss_mlp": 0.01282237, "balance_loss_clip": 0.06295653, "balance_loss_mlp": 0.01265094, "epoch": 0.2821584247707801, "flos": 30015725408640.0, "grad_norm": 1.4308598262070362, "language_loss": 0.74997151, "learning_rate": 3.3692588682262022e-06, "loss": 0.82785988, "num_input_tokens_seen": 101273335, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.17138672, "step": 4693, "time_per_iteration": 2.649968385696411 }, { "auxiliary_loss_clip": 0.06514095, "auxiliary_loss_mlp": 0.01277204, "balance_loss_clip": 0.063003, "balance_loss_mlp": 0.01260551, "epoch": 0.2822185480234481, "flos": 21403034985600.0, "grad_norm": 1.7785029895322897, "language_loss": 0.78288388, "learning_rate": 3.3689749670326046e-06, "loss": 0.86079687, "num_input_tokens_seen": 101292110, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.16638184, "step": 4694, "time_per_iteration": 2.559786558151245 }, { "auxiliary_loss_clip": 0.06508545, "auxiliary_loss_mlp": 0.01275674, "balance_loss_clip": 0.06300581, "balance_loss_mlp": 0.01258365, "epoch": 0.28227867127611606, "flos": 27459996024960.0, "grad_norm": 1.8992125857289146, "language_loss": 0.66778201, "learning_rate": 3.3686910139273392e-06, "loss": 0.74562418, "num_input_tokens_seen": 101312815, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.17333984, "step": 4695, "time_per_iteration": 2.6052346229553223 }, { "auxiliary_loss_clip": 0.06519763, "auxiliary_loss_mlp": 0.0127412, "balance_loss_clip": 0.06305275, "balance_loss_mlp": 0.0125631, "epoch": 0.282338794528784, "flos": 22599028638720.0, "grad_norm": 2.163081103156057, "language_loss": 0.76322675, "learning_rate": 3.3684070089211736e-06, "loss": 0.84116566, "num_input_tokens_seen": 101329045, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.17810059, "step": 4696, "time_per_iteration": 2.5368452072143555 }, { "auxiliary_loss_clip": 0.06511107, "auxiliary_loss_mlp": 0.01275305, "balance_loss_clip": 0.0630013, "balance_loss_mlp": 0.01258819, "epoch": 0.282398917781452, "flos": 42020592998400.0, "grad_norm": 3.4113370639712075, "language_loss": 0.62877631, "learning_rate": 3.368122952024877e-06, "loss": 0.70664048, "num_input_tokens_seen": 101352715, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.16479492, "step": 4697, "time_per_iteration": 2.739314556121826 }, { "auxiliary_loss_clip": 0.06507705, "auxiliary_loss_mlp": 0.01278583, "balance_loss_clip": 0.06298161, "balance_loss_mlp": 0.01262776, "epoch": 0.28245904103411995, "flos": 23231916126720.0, "grad_norm": 1.8575608430794364, "language_loss": 0.73533702, "learning_rate": 3.3678388432492214e-06, "loss": 0.81319988, "num_input_tokens_seen": 101374640, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.15795898, "step": 4698, "time_per_iteration": 2.6144113540649414 }, { "auxiliary_loss_clip": 0.06509343, "auxiliary_loss_mlp": 0.01274855, "balance_loss_clip": 0.06302177, "balance_loss_mlp": 0.0125881, "epoch": 0.2825191642867879, "flos": 25381713605760.0, "grad_norm": 1.677216216370461, "language_loss": 0.75576669, "learning_rate": 3.3675546826049788e-06, "loss": 0.83360875, "num_input_tokens_seen": 101393595, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.16040039, "step": 4699, "time_per_iteration": 2.6022815704345703 }, { "auxiliary_loss_clip": 0.06524798, "auxiliary_loss_mlp": 0.01282253, "balance_loss_clip": 0.06309897, "balance_loss_mlp": 0.01264109, "epoch": 0.2825792875394559, "flos": 17242277443200.0, "grad_norm": 2.855982810508353, "language_loss": 0.80543679, "learning_rate": 3.3672704701029265e-06, "loss": 0.88350731, "num_input_tokens_seen": 101409265, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.18139648, "step": 4700, "time_per_iteration": 2.5322775840759277 }, { "auxiliary_loss_clip": 0.06504633, "auxiliary_loss_mlp": 0.01274046, "balance_loss_clip": 0.06299592, "balance_loss_mlp": 0.01258334, "epoch": 0.28263941079212385, "flos": 26731177211520.0, "grad_norm": 1.8490458755022978, "language_loss": 0.82150286, "learning_rate": 3.3669862057538402e-06, "loss": 0.89928973, "num_input_tokens_seen": 101428365, "router_z_loss_clip": 2.04980469, "router_z_loss_mlp": 0.15710449, "step": 4701, "time_per_iteration": 2.612391471862793 }, { "auxiliary_loss_clip": 0.06509756, "auxiliary_loss_mlp": 0.01272638, "balance_loss_clip": 0.0630105, "balance_loss_mlp": 0.0125664, "epoch": 0.2826995340447918, "flos": 25928411569920.0, "grad_norm": 2.141399595175655, "language_loss": 0.73474836, "learning_rate": 3.3667018895685004e-06, "loss": 0.8125723, "num_input_tokens_seen": 101447280, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.15979004, "step": 4702, "time_per_iteration": 2.6038434505462646 }, { "auxiliary_loss_clip": 0.06506531, "auxiliary_loss_mlp": 0.01272746, "balance_loss_clip": 0.06300605, "balance_loss_mlp": 0.01256831, "epoch": 0.2827596572974598, "flos": 22385783197440.0, "grad_norm": 1.7783539447957502, "language_loss": 0.78726041, "learning_rate": 3.3664175215576886e-06, "loss": 0.86505318, "num_input_tokens_seen": 101465435, "router_z_loss_clip": 2.06347656, "router_z_loss_mlp": 0.15917969, "step": 4703, "time_per_iteration": 2.709378957748413 }, { "auxiliary_loss_clip": 0.06512877, "auxiliary_loss_mlp": 0.01278798, "balance_loss_clip": 0.06301996, "balance_loss_mlp": 0.01261775, "epoch": 0.28281978055012774, "flos": 33555544669440.0, "grad_norm": 1.5285584010622182, "language_loss": 0.70187378, "learning_rate": 3.3661331017321867e-06, "loss": 0.77979052, "num_input_tokens_seen": 101486355, "router_z_loss_clip": 2.10839844, "router_z_loss_mlp": 0.17016602, "step": 4704, "time_per_iteration": 2.684183120727539 }, { "auxiliary_loss_clip": 0.06517459, "auxiliary_loss_mlp": 0.012796, "balance_loss_clip": 0.06308652, "balance_loss_mlp": 0.01262589, "epoch": 0.2828799038027957, "flos": 23447635263360.0, "grad_norm": 2.242722424897055, "language_loss": 0.71016985, "learning_rate": 3.3658486301027807e-06, "loss": 0.78814042, "num_input_tokens_seen": 101505875, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.17016602, "step": 4705, "time_per_iteration": 2.5889294147491455 }, { "auxiliary_loss_clip": 0.06417621, "auxiliary_loss_mlp": 0.01273271, "balance_loss_clip": 0.06308714, "balance_loss_mlp": 0.01268675, "epoch": 0.2829400270554637, "flos": 69892055297280.0, "grad_norm": 0.7148787350410355, "language_loss": 0.59225261, "learning_rate": 3.3655641066802577e-06, "loss": 0.66916156, "num_input_tokens_seen": 101565045, "router_z_loss_clip": 1.09375, "router_z_loss_mlp": 0.04592896, "step": 4706, "time_per_iteration": 3.2570042610168457 }, { "auxiliary_loss_clip": 0.06505654, "auxiliary_loss_mlp": 0.01273261, "balance_loss_clip": 0.06303525, "balance_loss_mlp": 0.01258539, "epoch": 0.2830001503081317, "flos": 24795715276800.0, "grad_norm": 1.953286768117434, "language_loss": 0.82628143, "learning_rate": 3.365279531475407e-06, "loss": 0.90407062, "num_input_tokens_seen": 101585825, "router_z_loss_clip": 2.01757812, "router_z_loss_mlp": 0.14733887, "step": 4707, "time_per_iteration": 2.740595579147339 }, { "auxiliary_loss_clip": 0.06523678, "auxiliary_loss_mlp": 0.01275151, "balance_loss_clip": 0.06308462, "balance_loss_mlp": 0.01258426, "epoch": 0.28306027356079966, "flos": 27676218286080.0, "grad_norm": 1.605101473492648, "language_loss": 0.80894887, "learning_rate": 3.36499490449902e-06, "loss": 0.8869372, "num_input_tokens_seen": 101606105, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.16711426, "step": 4708, "time_per_iteration": 2.613429546356201 }, { "auxiliary_loss_clip": 0.06406525, "auxiliary_loss_mlp": 0.01266761, "balance_loss_clip": 0.06299366, "balance_loss_mlp": 0.01262592, "epoch": 0.2831203968134676, "flos": 60543837734400.0, "grad_norm": 0.8553212192393096, "language_loss": 0.62703347, "learning_rate": 3.3647102257618895e-06, "loss": 0.70376635, "num_input_tokens_seen": 101656875, "router_z_loss_clip": 1.0703125, "router_z_loss_mlp": 0.04171753, "step": 4709, "time_per_iteration": 3.085008382797241 }, { "auxiliary_loss_clip": 0.06505759, "auxiliary_loss_mlp": 0.0127252, "balance_loss_clip": 0.06298612, "balance_loss_mlp": 0.01257226, "epoch": 0.2831805200661356, "flos": 22061386915200.0, "grad_norm": 1.4159275856582758, "language_loss": 0.74393654, "learning_rate": 3.3644254952748103e-06, "loss": 0.82171929, "num_input_tokens_seen": 101676225, "router_z_loss_clip": 2.07128906, "router_z_loss_mlp": 0.15319824, "step": 4710, "time_per_iteration": 2.585179090499878 }, { "auxiliary_loss_clip": 0.06511037, "auxiliary_loss_mlp": 0.01273469, "balance_loss_clip": 0.06298797, "balance_loss_mlp": 0.01256577, "epoch": 0.28324064331880355, "flos": 22607120557440.0, "grad_norm": 1.9288594794756193, "language_loss": 0.79866827, "learning_rate": 3.364140713048579e-06, "loss": 0.87651336, "num_input_tokens_seen": 101693710, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.16906738, "step": 4711, "time_per_iteration": 2.5688939094543457 }, { "auxiliary_loss_clip": 0.06516048, "auxiliary_loss_mlp": 0.01275058, "balance_loss_clip": 0.06304548, "balance_loss_mlp": 0.01259358, "epoch": 0.2833007665714715, "flos": 30411133626240.0, "grad_norm": 1.989661165290748, "language_loss": 0.71380669, "learning_rate": 3.363855879093996e-06, "loss": 0.79171777, "num_input_tokens_seen": 101714010, "router_z_loss_clip": 2.11621094, "router_z_loss_mlp": 0.15710449, "step": 4712, "time_per_iteration": 4.052680015563965 }, { "auxiliary_loss_clip": 0.06509414, "auxiliary_loss_mlp": 0.01284084, "balance_loss_clip": 0.06298765, "balance_loss_mlp": 0.01266214, "epoch": 0.2833608898241395, "flos": 23556144700800.0, "grad_norm": 1.934107410065238, "language_loss": 0.82442963, "learning_rate": 3.3635709934218605e-06, "loss": 0.90236461, "num_input_tokens_seen": 101732995, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.17883301, "step": 4713, "time_per_iteration": 2.5791356563568115 }, { "auxiliary_loss_clip": 0.06515352, "auxiliary_loss_mlp": 0.01273004, "balance_loss_clip": 0.06302908, "balance_loss_mlp": 0.01255122, "epoch": 0.28342101307680745, "flos": 20272980096000.0, "grad_norm": 2.306893893493065, "language_loss": 0.75857025, "learning_rate": 3.3632860560429766e-06, "loss": 0.83645386, "num_input_tokens_seen": 101751385, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.17883301, "step": 4714, "time_per_iteration": 2.5688557624816895 }, { "auxiliary_loss_clip": 0.065099, "auxiliary_loss_mlp": 0.01273453, "balance_loss_clip": 0.06299888, "balance_loss_mlp": 0.01256966, "epoch": 0.2834811363294754, "flos": 30854982303360.0, "grad_norm": 1.3886646116775878, "language_loss": 0.78364837, "learning_rate": 3.3630010669681494e-06, "loss": 0.86148196, "num_input_tokens_seen": 101773825, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.16491699, "step": 4715, "time_per_iteration": 2.652388334274292 }, { "auxiliary_loss_clip": 0.06504545, "auxiliary_loss_mlp": 0.01274932, "balance_loss_clip": 0.06295578, "balance_loss_mlp": 0.01259065, "epoch": 0.2835412595821434, "flos": 22717642492800.0, "grad_norm": 1.9375753348258367, "language_loss": 0.74612534, "learning_rate": 3.3627160262081845e-06, "loss": 0.82392007, "num_input_tokens_seen": 101791920, "router_z_loss_clip": 2.08886719, "router_z_loss_mlp": 0.15856934, "step": 4716, "time_per_iteration": 2.5696401596069336 }, { "auxiliary_loss_clip": 0.06516972, "auxiliary_loss_mlp": 0.01274857, "balance_loss_clip": 0.06297034, "balance_loss_mlp": 0.01257059, "epoch": 0.28360138283481134, "flos": 18083630689920.0, "grad_norm": 2.071237516251584, "language_loss": 0.75050491, "learning_rate": 3.3624309337738917e-06, "loss": 0.8284232, "num_input_tokens_seen": 101809515, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.17797852, "step": 4717, "time_per_iteration": 4.05983567237854 }, { "auxiliary_loss_clip": 0.06513405, "auxiliary_loss_mlp": 0.01282269, "balance_loss_clip": 0.06298406, "balance_loss_mlp": 0.01264495, "epoch": 0.2836615060874793, "flos": 17859987342720.0, "grad_norm": 1.8829563212219964, "language_loss": 0.67590237, "learning_rate": 3.3621457896760813e-06, "loss": 0.7538591, "num_input_tokens_seen": 101827735, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.17773438, "step": 4718, "time_per_iteration": 2.551290988922119 }, { "auxiliary_loss_clip": 0.06509921, "auxiliary_loss_mlp": 0.01274735, "balance_loss_clip": 0.06294094, "balance_loss_mlp": 0.01256854, "epoch": 0.2837216293401473, "flos": 25747590458880.0, "grad_norm": 1.9268188070822834, "language_loss": 0.73085427, "learning_rate": 3.361860593925566e-06, "loss": 0.8087008, "num_input_tokens_seen": 101845970, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.17895508, "step": 4719, "time_per_iteration": 2.6007652282714844 }, { "auxiliary_loss_clip": 0.06506576, "auxiliary_loss_mlp": 0.01276064, "balance_loss_clip": 0.0629713, "balance_loss_mlp": 0.01259029, "epoch": 0.2837817525928153, "flos": 20929906506240.0, "grad_norm": 2.116012281584491, "language_loss": 0.80842769, "learning_rate": 3.3615753465331605e-06, "loss": 0.88625407, "num_input_tokens_seen": 101865040, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.17028809, "step": 4720, "time_per_iteration": 2.6317524909973145 }, { "auxiliary_loss_clip": 0.06514988, "auxiliary_loss_mlp": 0.01280319, "balance_loss_clip": 0.06301576, "balance_loss_mlp": 0.01263439, "epoch": 0.28384187584548326, "flos": 18922719876480.0, "grad_norm": 1.8288377110210983, "language_loss": 0.80440283, "learning_rate": 3.3612900475096817e-06, "loss": 0.88235593, "num_input_tokens_seen": 101883735, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.16870117, "step": 4721, "time_per_iteration": 2.537935256958008 }, { "auxiliary_loss_clip": 0.06505602, "auxiliary_loss_mlp": 0.01275039, "balance_loss_clip": 0.06297688, "balance_loss_mlp": 0.0125941, "epoch": 0.2839019990981512, "flos": 27351235025280.0, "grad_norm": 2.0047513356808615, "language_loss": 0.83360767, "learning_rate": 3.3610046968659474e-06, "loss": 0.91141403, "num_input_tokens_seen": 101903025, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.15612793, "step": 4722, "time_per_iteration": 2.6021430492401123 }, { "auxiliary_loss_clip": 0.06511198, "auxiliary_loss_mlp": 0.01276356, "balance_loss_clip": 0.0629989, "balance_loss_mlp": 0.0126018, "epoch": 0.2839621223508192, "flos": 18120247724160.0, "grad_norm": 1.6475919964261507, "language_loss": 0.70956004, "learning_rate": 3.3607192946127785e-06, "loss": 0.78743559, "num_input_tokens_seen": 101922255, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.16162109, "step": 4723, "time_per_iteration": 2.5404882431030273 }, { "auxiliary_loss_clip": 0.06510496, "auxiliary_loss_mlp": 0.01277082, "balance_loss_clip": 0.06299455, "balance_loss_mlp": 0.01259654, "epoch": 0.28402224560348716, "flos": 26365384212480.0, "grad_norm": 1.6111388766848107, "language_loss": 0.78630507, "learning_rate": 3.360433840760998e-06, "loss": 0.86418086, "num_input_tokens_seen": 101943100, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.17431641, "step": 4724, "time_per_iteration": 3.998032331466675 }, { "auxiliary_loss_clip": 0.06512225, "auxiliary_loss_mlp": 0.01274837, "balance_loss_clip": 0.06301178, "balance_loss_mlp": 0.01257837, "epoch": 0.2840823688561551, "flos": 24067609223040.0, "grad_norm": 1.9623520446661131, "language_loss": 0.93386054, "learning_rate": 3.36014833532143e-06, "loss": 1.01173115, "num_input_tokens_seen": 101963160, "router_z_loss_clip": 2.11230469, "router_z_loss_mlp": 0.17004395, "step": 4725, "time_per_iteration": 2.573580026626587 }, { "auxiliary_loss_clip": 0.06512146, "auxiliary_loss_mlp": 0.01277902, "balance_loss_clip": 0.06298241, "balance_loss_mlp": 0.01259806, "epoch": 0.2841424921088231, "flos": 29467392289920.0, "grad_norm": 2.02100201384038, "language_loss": 0.89161336, "learning_rate": 3.3598627783049e-06, "loss": 0.96951389, "num_input_tokens_seen": 101984300, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.18103027, "step": 4726, "time_per_iteration": 4.10633111000061 }, { "auxiliary_loss_clip": 0.06514812, "auxiliary_loss_mlp": 0.01276205, "balance_loss_clip": 0.06301775, "balance_loss_mlp": 0.01258884, "epoch": 0.28420261536149105, "flos": 48110439565440.0, "grad_norm": 2.6075875617506052, "language_loss": 0.79746675, "learning_rate": 3.359577169722238e-06, "loss": 0.87537688, "num_input_tokens_seen": 102005765, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.17333984, "step": 4727, "time_per_iteration": 2.7964398860931396 }, { "auxiliary_loss_clip": 0.06506294, "auxiliary_loss_mlp": 0.01272095, "balance_loss_clip": 0.06299829, "balance_loss_mlp": 0.01256538, "epoch": 0.284262738614159, "flos": 25673224360320.0, "grad_norm": 9.78513807950518, "language_loss": 0.66696143, "learning_rate": 3.3592915095842733e-06, "loss": 0.74474537, "num_input_tokens_seen": 102022755, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.15551758, "step": 4728, "time_per_iteration": 2.581942081451416 }, { "auxiliary_loss_clip": 0.06503396, "auxiliary_loss_mlp": 0.01274987, "balance_loss_clip": 0.06295134, "balance_loss_mlp": 0.0125856, "epoch": 0.284322861866827, "flos": 19725066247680.0, "grad_norm": 1.8520364724583627, "language_loss": 0.76609826, "learning_rate": 3.3590057979018386e-06, "loss": 0.84388208, "num_input_tokens_seen": 102041850, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.16430664, "step": 4729, "time_per_iteration": 2.53725004196167 }, { "auxiliary_loss_clip": 0.06515215, "auxiliary_loss_mlp": 0.01272174, "balance_loss_clip": 0.06301484, "balance_loss_mlp": 0.01254566, "epoch": 0.28438298511949495, "flos": 23922105408000.0, "grad_norm": 4.107775134424107, "language_loss": 0.67459404, "learning_rate": 3.3587200346857674e-06, "loss": 0.75246799, "num_input_tokens_seen": 102059500, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.17602539, "step": 4730, "time_per_iteration": 2.5796611309051514 }, { "auxiliary_loss_clip": 0.06511393, "auxiliary_loss_mlp": 0.01272661, "balance_loss_clip": 0.06298558, "balance_loss_mlp": 0.01255089, "epoch": 0.2844431083721629, "flos": 26074460436480.0, "grad_norm": 1.8380097251114929, "language_loss": 0.7491076, "learning_rate": 3.3584342199468965e-06, "loss": 0.82694811, "num_input_tokens_seen": 102080460, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.17590332, "step": 4731, "time_per_iteration": 2.6259145736694336 }, { "auxiliary_loss_clip": 0.06507597, "auxiliary_loss_mlp": 0.01273191, "balance_loss_clip": 0.06296, "balance_loss_mlp": 0.01256394, "epoch": 0.2845032316248309, "flos": 25817260728960.0, "grad_norm": 1.986964508048804, "language_loss": 0.84371555, "learning_rate": 3.3581483536960638e-06, "loss": 0.92152345, "num_input_tokens_seen": 102100950, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.16796875, "step": 4732, "time_per_iteration": 2.6158711910247803 }, { "auxiliary_loss_clip": 0.06511201, "auxiliary_loss_mlp": 0.01276872, "balance_loss_clip": 0.06296784, "balance_loss_mlp": 0.01258204, "epoch": 0.2845633548774989, "flos": 19828418659200.0, "grad_norm": 1.8098978562791164, "language_loss": 0.79642391, "learning_rate": 3.357862435944109e-06, "loss": 0.87430465, "num_input_tokens_seen": 102119345, "router_z_loss_clip": 2.14550781, "router_z_loss_mlp": 0.18640137, "step": 4733, "time_per_iteration": 2.5534162521362305 }, { "auxiliary_loss_clip": 0.06515002, "auxiliary_loss_mlp": 0.01275581, "balance_loss_clip": 0.06298517, "balance_loss_mlp": 0.01258474, "epoch": 0.28462347813016686, "flos": 23189093890560.0, "grad_norm": 4.152047467544692, "language_loss": 0.72635639, "learning_rate": 3.357576466701875e-06, "loss": 0.80426216, "num_input_tokens_seen": 102139050, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.17114258, "step": 4734, "time_per_iteration": 2.592677116394043 }, { "auxiliary_loss_clip": 0.06505366, "auxiliary_loss_mlp": 0.01271174, "balance_loss_clip": 0.06294376, "balance_loss_mlp": 0.01254306, "epoch": 0.2846836013828348, "flos": 18666316782720.0, "grad_norm": 1.8735891547784336, "language_loss": 0.74882483, "learning_rate": 3.3572904459802056e-06, "loss": 0.8265903, "num_input_tokens_seen": 102157935, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.1685791, "step": 4735, "time_per_iteration": 2.5527594089508057 }, { "auxiliary_loss_clip": 0.06504217, "auxiliary_loss_mlp": 0.01273544, "balance_loss_clip": 0.06293449, "balance_loss_mlp": 0.01256354, "epoch": 0.2847437246355028, "flos": 14178731189760.0, "grad_norm": 1.7667290639125177, "language_loss": 0.79889464, "learning_rate": 3.357004373789946e-06, "loss": 0.87667227, "num_input_tokens_seen": 102175325, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.17175293, "step": 4736, "time_per_iteration": 2.547106981277466 }, { "auxiliary_loss_clip": 0.0650866, "auxiliary_loss_mlp": 0.01274403, "balance_loss_clip": 0.06295501, "balance_loss_mlp": 0.01257059, "epoch": 0.28480384788817076, "flos": 29286068054400.0, "grad_norm": 2.2521540235424906, "language_loss": 0.60653245, "learning_rate": 3.3567182501419453e-06, "loss": 0.68436313, "num_input_tokens_seen": 102196625, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.17333984, "step": 4737, "time_per_iteration": 2.6414682865142822 }, { "auxiliary_loss_clip": 0.06505691, "auxiliary_loss_mlp": 0.01272647, "balance_loss_clip": 0.06298786, "balance_loss_mlp": 0.01256495, "epoch": 0.2848639711408387, "flos": 22607875244160.0, "grad_norm": 1.865242830802039, "language_loss": 0.87303054, "learning_rate": 3.356432075047052e-06, "loss": 0.95081395, "num_input_tokens_seen": 102214975, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.16149902, "step": 4738, "time_per_iteration": 2.5753777027130127 }, { "auxiliary_loss_clip": 0.06508346, "auxiliary_loss_mlp": 0.01279383, "balance_loss_clip": 0.06294276, "balance_loss_mlp": 0.01261036, "epoch": 0.2849240943935067, "flos": 17604632424960.0, "grad_norm": 2.532477701409276, "language_loss": 0.90607226, "learning_rate": 3.356145848516118e-06, "loss": 0.98394954, "num_input_tokens_seen": 102231885, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.18347168, "step": 4739, "time_per_iteration": 2.524334669113159 }, { "auxiliary_loss_clip": 0.06505685, "auxiliary_loss_mlp": 0.01270847, "balance_loss_clip": 0.06296714, "balance_loss_mlp": 0.01254385, "epoch": 0.28498421764617465, "flos": 24869368615680.0, "grad_norm": 1.3769325297490196, "language_loss": 0.72763789, "learning_rate": 3.355859570559998e-06, "loss": 0.80540323, "num_input_tokens_seen": 102252725, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.16455078, "step": 4740, "time_per_iteration": 2.602874279022217 }, { "auxiliary_loss_clip": 0.06502156, "auxiliary_loss_mlp": 0.01270762, "balance_loss_clip": 0.06296042, "balance_loss_mlp": 0.0125393, "epoch": 0.2850443408988426, "flos": 22788947917440.0, "grad_norm": 1.7215196152383538, "language_loss": 0.7822876, "learning_rate": 3.3555732411895477e-06, "loss": 0.86001682, "num_input_tokens_seen": 102271730, "router_z_loss_clip": 2.05859375, "router_z_loss_mlp": 0.16833496, "step": 4741, "time_per_iteration": 2.56381893157959 }, { "auxiliary_loss_clip": 0.06517977, "auxiliary_loss_mlp": 0.01273205, "balance_loss_clip": 0.06299817, "balance_loss_mlp": 0.01255526, "epoch": 0.2851044641515106, "flos": 18850114713600.0, "grad_norm": 2.366140050319683, "language_loss": 0.76951945, "learning_rate": 3.3552868604156235e-06, "loss": 0.8474313, "num_input_tokens_seen": 102291325, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.17675781, "step": 4742, "time_per_iteration": 2.5595338344573975 }, { "auxiliary_loss_clip": 0.06516083, "auxiliary_loss_mlp": 0.01277677, "balance_loss_clip": 0.0629948, "balance_loss_mlp": 0.01259355, "epoch": 0.28516458740417855, "flos": 18886564039680.0, "grad_norm": 2.023659302244122, "language_loss": 0.5785858, "learning_rate": 3.355000428249086e-06, "loss": 0.65652335, "num_input_tokens_seen": 102309000, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.18322754, "step": 4743, "time_per_iteration": 2.537496328353882 }, { "auxiliary_loss_clip": 0.06511536, "auxiliary_loss_mlp": 0.01279592, "balance_loss_clip": 0.06298529, "balance_loss_mlp": 0.012614, "epoch": 0.2852247106568465, "flos": 25306592820480.0, "grad_norm": 1.6719739290667741, "language_loss": 0.7512008, "learning_rate": 3.354713944700797e-06, "loss": 0.82911217, "num_input_tokens_seen": 102329240, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.18188477, "step": 4744, "time_per_iteration": 2.603935718536377 }, { "auxiliary_loss_clip": 0.06505702, "auxiliary_loss_mlp": 0.0127822, "balance_loss_clip": 0.0629621, "balance_loss_mlp": 0.01261066, "epoch": 0.2852848339095145, "flos": 11660080037760.0, "grad_norm": 2.296306941017831, "language_loss": 0.77464437, "learning_rate": 3.3544274097816185e-06, "loss": 0.85248351, "num_input_tokens_seen": 102344440, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.17138672, "step": 4745, "time_per_iteration": 2.521878719329834 }, { "auxiliary_loss_clip": 0.06505141, "auxiliary_loss_mlp": 0.01275353, "balance_loss_clip": 0.06301597, "balance_loss_mlp": 0.01258413, "epoch": 0.2853449571621825, "flos": 12938280145920.0, "grad_norm": 1.9862860412404844, "language_loss": 0.82905185, "learning_rate": 3.3541408235024173e-06, "loss": 0.90685678, "num_input_tokens_seen": 102360985, "router_z_loss_clip": 2.0390625, "router_z_loss_mlp": 0.16918945, "step": 4746, "time_per_iteration": 2.520923137664795 }, { "auxiliary_loss_clip": 0.06522167, "auxiliary_loss_mlp": 0.01278487, "balance_loss_clip": 0.06302168, "balance_loss_mlp": 0.01259986, "epoch": 0.28540508041485046, "flos": 20016660856320.0, "grad_norm": 1.8729059805302384, "language_loss": 0.80147117, "learning_rate": 3.3538541858740604e-06, "loss": 0.87947774, "num_input_tokens_seen": 102380320, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.18505859, "step": 4747, "time_per_iteration": 2.563823699951172 }, { "auxiliary_loss_clip": 0.0641647, "auxiliary_loss_mlp": 0.01257157, "balance_loss_clip": 0.06311779, "balance_loss_mlp": 0.01252049, "epoch": 0.28546520366751843, "flos": 68160264710400.0, "grad_norm": 0.7468869879166954, "language_loss": 0.60133553, "learning_rate": 3.3535674969074173e-06, "loss": 0.67807186, "num_input_tokens_seen": 102439140, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.05108643, "step": 4748, "time_per_iteration": 3.19390606880188 }, { "auxiliary_loss_clip": 0.06503204, "auxiliary_loss_mlp": 0.0128347, "balance_loss_clip": 0.06292633, "balance_loss_mlp": 0.01266697, "epoch": 0.2855253269201864, "flos": 13254961852800.0, "grad_norm": 2.335467781414139, "language_loss": 0.80560821, "learning_rate": 3.3532807566133592e-06, "loss": 0.88347495, "num_input_tokens_seen": 102450990, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.16760254, "step": 4749, "time_per_iteration": 2.516573190689087 }, { "auxiliary_loss_clip": 0.06512128, "auxiliary_loss_mlp": 0.01276784, "balance_loss_clip": 0.06301494, "balance_loss_mlp": 0.01259666, "epoch": 0.28558545017285436, "flos": 28628345030400.0, "grad_norm": 1.7243393956149513, "language_loss": 0.70900035, "learning_rate": 3.3529939650027587e-06, "loss": 0.78688943, "num_input_tokens_seen": 102471820, "router_z_loss_clip": 2.10644531, "router_z_loss_mlp": 0.17114258, "step": 4750, "time_per_iteration": 2.659198045730591 }, { "auxiliary_loss_clip": 0.06507593, "auxiliary_loss_mlp": 0.0127975, "balance_loss_clip": 0.06302832, "balance_loss_mlp": 0.01263061, "epoch": 0.2856455734255223, "flos": 34138901594880.0, "grad_norm": 1.4800040167577335, "language_loss": 0.82141662, "learning_rate": 3.3527071220864917e-06, "loss": 0.89929008, "num_input_tokens_seen": 102492625, "router_z_loss_clip": 2.05175781, "router_z_loss_mlp": 0.16674805, "step": 4751, "time_per_iteration": 2.7524731159210205 }, { "auxiliary_loss_clip": 0.06508847, "auxiliary_loss_mlp": 0.01281219, "balance_loss_clip": 0.06299964, "balance_loss_mlp": 0.01263278, "epoch": 0.2857056966781903, "flos": 39795590880000.0, "grad_norm": 1.8356655788571585, "language_loss": 0.80478525, "learning_rate": 3.3524202278754353e-06, "loss": 0.8826859, "num_input_tokens_seen": 102514145, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.17956543, "step": 4752, "time_per_iteration": 4.148143768310547 }, { "auxiliary_loss_clip": 0.06506651, "auxiliary_loss_mlp": 0.01274564, "balance_loss_clip": 0.06295335, "balance_loss_mlp": 0.01257482, "epoch": 0.28576581993085826, "flos": 21878846795520.0, "grad_norm": 1.7063582656179066, "language_loss": 0.79221618, "learning_rate": 3.3521332823804676e-06, "loss": 0.87002838, "num_input_tokens_seen": 102532365, "router_z_loss_clip": 2.11425781, "router_z_loss_mlp": 0.1706543, "step": 4753, "time_per_iteration": 2.5705971717834473 }, { "auxiliary_loss_clip": 0.06517778, "auxiliary_loss_mlp": 0.01277912, "balance_loss_clip": 0.06302465, "balance_loss_mlp": 0.01257968, "epoch": 0.2858259431835262, "flos": 19096455317760.0, "grad_norm": 2.40480597530516, "language_loss": 0.90100527, "learning_rate": 3.3518462856124704e-06, "loss": 0.97896212, "num_input_tokens_seen": 102548425, "router_z_loss_clip": 2.15332031, "router_z_loss_mlp": 0.19946289, "step": 4754, "time_per_iteration": 2.6303956508636475 }, { "auxiliary_loss_clip": 0.06506168, "auxiliary_loss_mlp": 0.01281502, "balance_loss_clip": 0.06301713, "balance_loss_mlp": 0.01264574, "epoch": 0.2858860664361942, "flos": 20339673546240.0, "grad_norm": 1.5611084029832278, "language_loss": 0.82489359, "learning_rate": 3.3515592375823267e-06, "loss": 0.90277028, "num_input_tokens_seen": 102566370, "router_z_loss_clip": 2.04394531, "router_z_loss_mlp": 0.16931152, "step": 4755, "time_per_iteration": 2.5667569637298584 }, { "auxiliary_loss_clip": 0.06501237, "auxiliary_loss_mlp": 0.01275671, "balance_loss_clip": 0.06291407, "balance_loss_mlp": 0.01258874, "epoch": 0.28594618968886215, "flos": 24468551809920.0, "grad_norm": 1.4966295114777095, "language_loss": 0.83889079, "learning_rate": 3.351272138300922e-06, "loss": 0.91665989, "num_input_tokens_seen": 102588715, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.16772461, "step": 4756, "time_per_iteration": 4.026106834411621 }, { "auxiliary_loss_clip": 0.06409591, "auxiliary_loss_mlp": 0.01261519, "balance_loss_clip": 0.06304942, "balance_loss_mlp": 0.01257278, "epoch": 0.2860063129415301, "flos": 71676170830080.0, "grad_norm": 0.8646067909242754, "language_loss": 0.60905242, "learning_rate": 3.350984987779142e-06, "loss": 0.68576354, "num_input_tokens_seen": 102656715, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.04244995, "step": 4757, "time_per_iteration": 3.341113805770874 }, { "auxiliary_loss_clip": 0.06509286, "auxiliary_loss_mlp": 0.01274448, "balance_loss_clip": 0.06301225, "balance_loss_mlp": 0.01258521, "epoch": 0.2860664361941981, "flos": 20564993975040.0, "grad_norm": 2.2326865207227096, "language_loss": 0.65860522, "learning_rate": 3.3506977860278756e-06, "loss": 0.73644257, "num_input_tokens_seen": 102676545, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.15917969, "step": 4758, "time_per_iteration": 2.578518867492676 }, { "auxiliary_loss_clip": 0.0650961, "auxiliary_loss_mlp": 0.01275588, "balance_loss_clip": 0.06297281, "balance_loss_mlp": 0.01257874, "epoch": 0.2861265594468661, "flos": 36005992997760.0, "grad_norm": 1.473158192118903, "language_loss": 0.63348532, "learning_rate": 3.3504105330580143e-06, "loss": 0.71133727, "num_input_tokens_seen": 102702875, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.17712402, "step": 4759, "time_per_iteration": 2.72367262840271 }, { "auxiliary_loss_clip": 0.06514987, "auxiliary_loss_mlp": 0.01280034, "balance_loss_clip": 0.06303877, "balance_loss_mlp": 0.01261628, "epoch": 0.28618668269953407, "flos": 20053571379840.0, "grad_norm": 1.7989585005231892, "language_loss": 0.74631077, "learning_rate": 3.3501232288804496e-06, "loss": 0.82426095, "num_input_tokens_seen": 102723160, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.18408203, "step": 4760, "time_per_iteration": 2.568484306335449 }, { "auxiliary_loss_clip": 0.06502365, "auxiliary_loss_mlp": 0.01280019, "balance_loss_clip": 0.06298935, "balance_loss_mlp": 0.01264474, "epoch": 0.28624680595220203, "flos": 24978632739840.0, "grad_norm": 1.9237050560551991, "language_loss": 0.72629905, "learning_rate": 3.3498358735060773e-06, "loss": 0.80412292, "num_input_tokens_seen": 102743855, "router_z_loss_clip": 2.03808594, "router_z_loss_mlp": 0.15515137, "step": 4761, "time_per_iteration": 2.595460891723633 }, { "auxiliary_loss_clip": 0.06514926, "auxiliary_loss_mlp": 0.01276603, "balance_loss_clip": 0.06303193, "balance_loss_mlp": 0.01259508, "epoch": 0.28630692920487, "flos": 22498862682240.0, "grad_norm": 2.1174289495773024, "language_loss": 0.74478555, "learning_rate": 3.349548466945793e-06, "loss": 0.8227008, "num_input_tokens_seen": 102761370, "router_z_loss_clip": 2.11816406, "router_z_loss_mlp": 0.17102051, "step": 4762, "time_per_iteration": 2.5791659355163574 }, { "auxiliary_loss_clip": 0.06504807, "auxiliary_loss_mlp": 0.01275899, "balance_loss_clip": 0.06297462, "balance_loss_mlp": 0.01258292, "epoch": 0.28636705245753796, "flos": 21255979870080.0, "grad_norm": 1.4280150429060205, "language_loss": 0.76383579, "learning_rate": 3.349261009210496e-06, "loss": 0.84164286, "num_input_tokens_seen": 102780885, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.17602539, "step": 4763, "time_per_iteration": 4.005809545516968 }, { "auxiliary_loss_clip": 0.06512229, "auxiliary_loss_mlp": 0.01280224, "balance_loss_clip": 0.0630101, "balance_loss_mlp": 0.01262092, "epoch": 0.28642717571020593, "flos": 24102339540480.0, "grad_norm": 1.7063380992994868, "language_loss": 0.77893019, "learning_rate": 3.348973500311086e-06, "loss": 0.85685468, "num_input_tokens_seen": 102801000, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.18139648, "step": 4764, "time_per_iteration": 2.595695734024048 }, { "auxiliary_loss_clip": 0.06514814, "auxiliary_loss_mlp": 0.01280631, "balance_loss_clip": 0.06302429, "balance_loss_mlp": 0.01260806, "epoch": 0.2864872989628739, "flos": 22607959098240.0, "grad_norm": 1.8229093371685299, "language_loss": 0.71632522, "learning_rate": 3.348685940258466e-06, "loss": 0.79427969, "num_input_tokens_seen": 102820230, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.19824219, "step": 4765, "time_per_iteration": 4.053240537643433 }, { "auxiliary_loss_clip": 0.06512051, "auxiliary_loss_mlp": 0.01272991, "balance_loss_clip": 0.06304234, "balance_loss_mlp": 0.01256159, "epoch": 0.28654742221554186, "flos": 32753449860480.0, "grad_norm": 1.3873481644327557, "language_loss": 0.76217926, "learning_rate": 3.3483983290635395e-06, "loss": 0.84002966, "num_input_tokens_seen": 102842670, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.16833496, "step": 4766, "time_per_iteration": 2.6595871448516846 }, { "auxiliary_loss_clip": 0.06502958, "auxiliary_loss_mlp": 0.01275057, "balance_loss_clip": 0.06298876, "balance_loss_mlp": 0.01257795, "epoch": 0.2866075454682098, "flos": 26989257386880.0, "grad_norm": 1.5652741326990036, "language_loss": 0.7825042, "learning_rate": 3.348110666737214e-06, "loss": 0.86028433, "num_input_tokens_seen": 102864480, "router_z_loss_clip": 2.04003906, "router_z_loss_mlp": 0.17248535, "step": 4767, "time_per_iteration": 2.6071536540985107 }, { "auxiliary_loss_clip": 0.06509069, "auxiliary_loss_mlp": 0.01278619, "balance_loss_clip": 0.06301302, "balance_loss_mlp": 0.01260892, "epoch": 0.2866676687208778, "flos": 23259812336640.0, "grad_norm": 2.067175847089881, "language_loss": 0.65262407, "learning_rate": 3.3478229532903956e-06, "loss": 0.73050094, "num_input_tokens_seen": 102883740, "router_z_loss_clip": 2.07910156, "router_z_loss_mlp": 0.17712402, "step": 4768, "time_per_iteration": 2.5804057121276855 }, { "auxiliary_loss_clip": 0.06516315, "auxiliary_loss_mlp": 0.01276931, "balance_loss_clip": 0.06299043, "balance_loss_mlp": 0.01258192, "epoch": 0.28672779197354575, "flos": 21586120156800.0, "grad_norm": 1.7379781029368868, "language_loss": 0.70658219, "learning_rate": 3.3475351887339967e-06, "loss": 0.78451455, "num_input_tokens_seen": 102902945, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.1875, "step": 4769, "time_per_iteration": 2.5514721870422363 }, { "auxiliary_loss_clip": 0.06510756, "auxiliary_loss_mlp": 0.01275251, "balance_loss_clip": 0.0629833, "balance_loss_mlp": 0.01258394, "epoch": 0.2867879152262137, "flos": 19871785946880.0, "grad_norm": 1.7432241050505286, "language_loss": 0.75205755, "learning_rate": 3.3472473730789288e-06, "loss": 0.82991767, "num_input_tokens_seen": 102922405, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.16845703, "step": 4770, "time_per_iteration": 2.5617942810058594 }, { "auxiliary_loss_clip": 0.06504677, "auxiliary_loss_mlp": 0.01276354, "balance_loss_clip": 0.06291316, "balance_loss_mlp": 0.01258579, "epoch": 0.2868480384788817, "flos": 28219687868160.0, "grad_norm": 2.337699926040931, "language_loss": 0.67935884, "learning_rate": 3.3469595063361045e-06, "loss": 0.75716919, "num_input_tokens_seen": 102938980, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.17773438, "step": 4771, "time_per_iteration": 2.5802013874053955 }, { "auxiliary_loss_clip": 0.0640156, "auxiliary_loss_mlp": 0.01266283, "balance_loss_clip": 0.06297709, "balance_loss_mlp": 0.01262233, "epoch": 0.2869081617315497, "flos": 65442218768640.0, "grad_norm": 0.7752078024275798, "language_loss": 0.5670296, "learning_rate": 3.3466715885164414e-06, "loss": 0.64370799, "num_input_tokens_seen": 103000405, "router_z_loss_clip": 1.0390625, "router_z_loss_mlp": 0.04049683, "step": 4772, "time_per_iteration": 3.144221305847168 }, { "auxiliary_loss_clip": 0.06505607, "auxiliary_loss_mlp": 0.01274238, "balance_loss_clip": 0.06292614, "balance_loss_mlp": 0.01255498, "epoch": 0.28696828498421767, "flos": 18666610272000.0, "grad_norm": 2.7035774100415484, "language_loss": 0.83753264, "learning_rate": 3.346383619630856e-06, "loss": 0.91533113, "num_input_tokens_seen": 103017970, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.18762207, "step": 4773, "time_per_iteration": 2.545870065689087 }, { "auxiliary_loss_clip": 0.06507112, "auxiliary_loss_mlp": 0.01275713, "balance_loss_clip": 0.06292967, "balance_loss_mlp": 0.01257272, "epoch": 0.28702840823688563, "flos": 23666540855040.0, "grad_norm": 2.290158254039026, "language_loss": 0.7806977, "learning_rate": 3.34609559969027e-06, "loss": 0.85852593, "num_input_tokens_seen": 103036385, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.18444824, "step": 4774, "time_per_iteration": 2.6204543113708496 }, { "auxiliary_loss_clip": 0.06508821, "auxiliary_loss_mlp": 0.01274896, "balance_loss_clip": 0.06296548, "balance_loss_mlp": 0.01256728, "epoch": 0.2870885314895536, "flos": 13809248611200.0, "grad_norm": 1.8093526085964677, "language_loss": 0.7391969, "learning_rate": 3.3458075287056034e-06, "loss": 0.81703407, "num_input_tokens_seen": 103052170, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.18188477, "step": 4775, "time_per_iteration": 2.5486817359924316 }, { "auxiliary_loss_clip": 0.06509111, "auxiliary_loss_mlp": 0.01274959, "balance_loss_clip": 0.06295644, "balance_loss_mlp": 0.01256267, "epoch": 0.28714865474222157, "flos": 17792790768000.0, "grad_norm": 1.9154615321481536, "language_loss": 0.8863073, "learning_rate": 3.34551940668778e-06, "loss": 0.96414804, "num_input_tokens_seen": 103070510, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.18676758, "step": 4776, "time_per_iteration": 2.5641367435455322 }, { "auxiliary_loss_clip": 0.06506534, "auxiliary_loss_mlp": 0.01275007, "balance_loss_clip": 0.06296811, "balance_loss_mlp": 0.01257292, "epoch": 0.28720877799488953, "flos": 16002958429440.0, "grad_norm": 2.006277276940668, "language_loss": 0.74305093, "learning_rate": 3.345231233647726e-06, "loss": 0.82086647, "num_input_tokens_seen": 103089590, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.17724609, "step": 4777, "time_per_iteration": 2.541508913040161 }, { "auxiliary_loss_clip": 0.06514659, "auxiliary_loss_mlp": 0.01279568, "balance_loss_clip": 0.06295935, "balance_loss_mlp": 0.01259767, "epoch": 0.2872689012475575, "flos": 20929445308800.0, "grad_norm": 2.2717255815545796, "language_loss": 0.80539995, "learning_rate": 3.3449430095963696e-06, "loss": 0.88334221, "num_input_tokens_seen": 103109080, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.19799805, "step": 4778, "time_per_iteration": 2.5643532276153564 }, { "auxiliary_loss_clip": 0.06500857, "auxiliary_loss_mlp": 0.01277758, "balance_loss_clip": 0.06293809, "balance_loss_mlp": 0.01260019, "epoch": 0.28732902450022546, "flos": 21331603779840.0, "grad_norm": 1.5097202932740157, "language_loss": 0.74167579, "learning_rate": 3.3446547345446386e-06, "loss": 0.81946194, "num_input_tokens_seen": 103127755, "router_z_loss_clip": 2.07128906, "router_z_loss_mlp": 0.17724609, "step": 4779, "time_per_iteration": 2.595940351486206 }, { "auxiliary_loss_clip": 0.0651253, "auxiliary_loss_mlp": 0.01272747, "balance_loss_clip": 0.06300924, "balance_loss_mlp": 0.01255211, "epoch": 0.2873891477528934, "flos": 20856714364800.0, "grad_norm": 1.8305072140797076, "language_loss": 0.76672983, "learning_rate": 3.3443664085034656e-06, "loss": 0.84458256, "num_input_tokens_seen": 103147035, "router_z_loss_clip": 2.11425781, "router_z_loss_mlp": 0.17523193, "step": 4780, "time_per_iteration": 2.570101261138916 }, { "auxiliary_loss_clip": 0.06505261, "auxiliary_loss_mlp": 0.01269757, "balance_loss_clip": 0.06297725, "balance_loss_mlp": 0.01253235, "epoch": 0.2874492710055614, "flos": 17425698030720.0, "grad_norm": 2.326205096145187, "language_loss": 0.81937313, "learning_rate": 3.344078031483784e-06, "loss": 0.89712328, "num_input_tokens_seen": 103165410, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.16503906, "step": 4781, "time_per_iteration": 2.5491881370544434 }, { "auxiliary_loss_clip": 0.06512649, "auxiliary_loss_mlp": 0.01274099, "balance_loss_clip": 0.0629698, "balance_loss_mlp": 0.01254525, "epoch": 0.28750939425822936, "flos": 13411827895680.0, "grad_norm": 2.038346969409036, "language_loss": 0.86687267, "learning_rate": 3.3437896034965283e-06, "loss": 0.94474018, "num_input_tokens_seen": 103183710, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.19580078, "step": 4782, "time_per_iteration": 2.5591371059417725 }, { "auxiliary_loss_clip": 0.06516306, "auxiliary_loss_mlp": 0.01282459, "balance_loss_clip": 0.06300133, "balance_loss_mlp": 0.01263612, "epoch": 0.2875695175108973, "flos": 21876205392000.0, "grad_norm": 1.7681034944905372, "language_loss": 0.71511972, "learning_rate": 3.3435011245526357e-06, "loss": 0.79310739, "num_input_tokens_seen": 103203790, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.1887207, "step": 4783, "time_per_iteration": 2.56980562210083 }, { "auxiliary_loss_clip": 0.06512588, "auxiliary_loss_mlp": 0.01279487, "balance_loss_clip": 0.06304072, "balance_loss_mlp": 0.01261379, "epoch": 0.2876296407635653, "flos": 26251885457280.0, "grad_norm": 1.7447621086741625, "language_loss": 0.7757014, "learning_rate": 3.343212594663047e-06, "loss": 0.85362208, "num_input_tokens_seen": 103223925, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.18103027, "step": 4784, "time_per_iteration": 2.623727321624756 }, { "auxiliary_loss_clip": 0.06505489, "auxiliary_loss_mlp": 0.01278024, "balance_loss_clip": 0.06300107, "balance_loss_mlp": 0.01260262, "epoch": 0.28768976401623325, "flos": 25380581575680.0, "grad_norm": 1.8885194475810425, "language_loss": 0.76070243, "learning_rate": 3.3429240138387015e-06, "loss": 0.83853751, "num_input_tokens_seen": 103244760, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.1776123, "step": 4785, "time_per_iteration": 2.6019365787506104 }, { "auxiliary_loss_clip": 0.06517391, "auxiliary_loss_mlp": 0.01278647, "balance_loss_clip": 0.06306819, "balance_loss_mlp": 0.01260277, "epoch": 0.28774988726890127, "flos": 30672232548480.0, "grad_norm": 2.0158204628004444, "language_loss": 0.8348273, "learning_rate": 3.3426353820905425e-06, "loss": 0.91278768, "num_input_tokens_seen": 103261995, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.18371582, "step": 4786, "time_per_iteration": 2.7111876010894775 }, { "auxiliary_loss_clip": 0.06510408, "auxiliary_loss_mlp": 0.01276138, "balance_loss_clip": 0.06302289, "balance_loss_mlp": 0.01258626, "epoch": 0.28781001052156924, "flos": 20601820644480.0, "grad_norm": 1.6383118954069298, "language_loss": 0.80592179, "learning_rate": 3.342346699429516e-06, "loss": 0.88378721, "num_input_tokens_seen": 103279780, "router_z_loss_clip": 2.08105469, "router_z_loss_mlp": 0.1751709, "step": 4787, "time_per_iteration": 2.5511436462402344 }, { "auxiliary_loss_clip": 0.06514255, "auxiliary_loss_mlp": 0.01275085, "balance_loss_clip": 0.06300744, "balance_loss_mlp": 0.01256667, "epoch": 0.2878701337742372, "flos": 26549643340800.0, "grad_norm": 1.912272839890935, "language_loss": 0.83657277, "learning_rate": 3.3420579658665677e-06, "loss": 0.91446614, "num_input_tokens_seen": 103300580, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.1842041, "step": 4788, "time_per_iteration": 2.6073358058929443 }, { "auxiliary_loss_clip": 0.06523213, "auxiliary_loss_mlp": 0.01276328, "balance_loss_clip": 0.0630599, "balance_loss_mlp": 0.01257648, "epoch": 0.28793025702690517, "flos": 28154294156160.0, "grad_norm": 1.7467698374400045, "language_loss": 0.74229062, "learning_rate": 3.3417691814126468e-06, "loss": 0.82028604, "num_input_tokens_seen": 103320430, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.18701172, "step": 4789, "time_per_iteration": 2.6190786361694336 }, { "auxiliary_loss_clip": 0.0650707, "auxiliary_loss_mlp": 0.01273676, "balance_loss_clip": 0.06302084, "balance_loss_mlp": 0.01257118, "epoch": 0.28799038027957313, "flos": 23812254305280.0, "grad_norm": 1.75627544449478, "language_loss": 0.84227771, "learning_rate": 3.341480346078704e-06, "loss": 0.92008519, "num_input_tokens_seen": 103337695, "router_z_loss_clip": 2.04980469, "router_z_loss_mlp": 0.16564941, "step": 4790, "time_per_iteration": 2.5741567611694336 }, { "auxiliary_loss_clip": 0.06513362, "auxiliary_loss_mlp": 0.01271951, "balance_loss_clip": 0.06300043, "balance_loss_mlp": 0.01254344, "epoch": 0.2880505035322411, "flos": 22350340120320.0, "grad_norm": 2.522326282819836, "language_loss": 0.78895378, "learning_rate": 3.3411914598756922e-06, "loss": 0.86680698, "num_input_tokens_seen": 103357010, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.17614746, "step": 4791, "time_per_iteration": 2.5693788528442383 }, { "auxiliary_loss_clip": 0.0652519, "auxiliary_loss_mlp": 0.01274916, "balance_loss_clip": 0.06307653, "balance_loss_mlp": 0.0125583, "epoch": 0.28811062678490906, "flos": 18010061205120.0, "grad_norm": 1.8304731969332364, "language_loss": 0.71111804, "learning_rate": 3.3409025228145654e-06, "loss": 0.78911912, "num_input_tokens_seen": 103375600, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.1907959, "step": 4792, "time_per_iteration": 3.976578950881958 }, { "auxiliary_loss_clip": 0.06516825, "auxiliary_loss_mlp": 0.01273924, "balance_loss_clip": 0.0630526, "balance_loss_mlp": 0.01256579, "epoch": 0.28817075003757703, "flos": 22097416970880.0, "grad_norm": 2.736305090724935, "language_loss": 0.80349016, "learning_rate": 3.3406135349062812e-06, "loss": 0.8813976, "num_input_tokens_seen": 103395225, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.17346191, "step": 4793, "time_per_iteration": 2.5785841941833496 }, { "auxiliary_loss_clip": 0.06500795, "auxiliary_loss_mlp": 0.01272102, "balance_loss_clip": 0.06296402, "balance_loss_mlp": 0.01254471, "epoch": 0.288230873290245, "flos": 41692842552960.0, "grad_norm": 1.8716337282953501, "language_loss": 0.78546464, "learning_rate": 3.340324496161797e-06, "loss": 0.86319369, "num_input_tokens_seen": 103417245, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.17651367, "step": 4794, "time_per_iteration": 2.7498810291290283 }, { "auxiliary_loss_clip": 0.06508411, "auxiliary_loss_mlp": 0.01279682, "balance_loss_clip": 0.06294583, "balance_loss_mlp": 0.0126198, "epoch": 0.28829099654291296, "flos": 18630328654080.0, "grad_norm": 1.9569381870988936, "language_loss": 0.83281994, "learning_rate": 3.340035406592074e-06, "loss": 0.91070092, "num_input_tokens_seen": 103435500, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.17700195, "step": 4795, "time_per_iteration": 2.540863513946533 }, { "auxiliary_loss_clip": 0.06500706, "auxiliary_loss_mlp": 0.0127361, "balance_loss_clip": 0.06298386, "balance_loss_mlp": 0.01257958, "epoch": 0.2883511197955809, "flos": 24680707148160.0, "grad_norm": 2.109388688515939, "language_loss": 0.75385219, "learning_rate": 3.339746266208074e-06, "loss": 0.8315953, "num_input_tokens_seen": 103451040, "router_z_loss_clip": 2.0234375, "router_z_loss_mlp": 0.15649414, "step": 4796, "time_per_iteration": 3.992629289627075 }, { "auxiliary_loss_clip": 0.0651115, "auxiliary_loss_mlp": 0.01273148, "balance_loss_clip": 0.06295737, "balance_loss_mlp": 0.01254337, "epoch": 0.2884112430482489, "flos": 23118794714880.0, "grad_norm": 1.8294148819435163, "language_loss": 0.73104042, "learning_rate": 3.3394570750207614e-06, "loss": 0.80888337, "num_input_tokens_seen": 103471330, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.18811035, "step": 4797, "time_per_iteration": 2.6477761268615723 }, { "auxiliary_loss_clip": 0.065078, "auxiliary_loss_mlp": 0.01270684, "balance_loss_clip": 0.06297575, "balance_loss_mlp": 0.01252755, "epoch": 0.28847136630091685, "flos": 16879000066560.0, "grad_norm": 1.8723589642962413, "language_loss": 0.75399804, "learning_rate": 3.3391678330411017e-06, "loss": 0.83178294, "num_input_tokens_seen": 103488060, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.17907715, "step": 4798, "time_per_iteration": 2.5644209384918213 }, { "auxiliary_loss_clip": 0.06511612, "auxiliary_loss_mlp": 0.01281172, "balance_loss_clip": 0.06299253, "balance_loss_mlp": 0.01262266, "epoch": 0.2885314895535849, "flos": 25663161870720.0, "grad_norm": 2.403113830110825, "language_loss": 0.66151416, "learning_rate": 3.3388785402800642e-06, "loss": 0.73944205, "num_input_tokens_seen": 103503600, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.18908691, "step": 4799, "time_per_iteration": 2.5558533668518066 }, { "auxiliary_loss_clip": 0.06513254, "auxiliary_loss_mlp": 0.01273621, "balance_loss_clip": 0.06298891, "balance_loss_mlp": 0.01255895, "epoch": 0.28859161280625284, "flos": 21113872145280.0, "grad_norm": 1.6185726119166435, "language_loss": 0.82242346, "learning_rate": 3.3385891967486178e-06, "loss": 0.90029228, "num_input_tokens_seen": 103524195, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.17736816, "step": 4800, "time_per_iteration": 2.5865285396575928 }, { "auxiliary_loss_clip": 0.06495688, "auxiliary_loss_mlp": 0.01273708, "balance_loss_clip": 0.06291294, "balance_loss_mlp": 0.01257436, "epoch": 0.2886517360589208, "flos": 26476870469760.0, "grad_norm": 1.5782087214289904, "language_loss": 0.91012186, "learning_rate": 3.3382998024577347e-06, "loss": 0.98781586, "num_input_tokens_seen": 103545235, "router_z_loss_clip": 2.04199219, "router_z_loss_mlp": 0.16271973, "step": 4801, "time_per_iteration": 2.6945056915283203 }, { "auxiliary_loss_clip": 0.06510115, "auxiliary_loss_mlp": 0.01273481, "balance_loss_clip": 0.06298172, "balance_loss_mlp": 0.01256333, "epoch": 0.28871185931158877, "flos": 25272365627520.0, "grad_norm": 6.760538583060045, "language_loss": 0.73789322, "learning_rate": 3.33801035741839e-06, "loss": 0.8157292, "num_input_tokens_seen": 103563305, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.17156982, "step": 4802, "time_per_iteration": 2.693896532058716 }, { "auxiliary_loss_clip": 0.06395827, "auxiliary_loss_mlp": 0.01254767, "balance_loss_clip": 0.06292515, "balance_loss_mlp": 0.01250168, "epoch": 0.28877198256425674, "flos": 66683676061440.0, "grad_norm": 0.7578761391276558, "language_loss": 0.6293658, "learning_rate": 3.337720861641558e-06, "loss": 0.70587176, "num_input_tokens_seen": 103625025, "router_z_loss_clip": 1.03125, "router_z_loss_mlp": 0.04595947, "step": 4803, "time_per_iteration": 4.594449281692505 }, { "auxiliary_loss_clip": 0.06506053, "auxiliary_loss_mlp": 0.01275264, "balance_loss_clip": 0.06296635, "balance_loss_mlp": 0.01257954, "epoch": 0.2888321058169247, "flos": 20309261713920.0, "grad_norm": 1.7714112915192126, "language_loss": 0.71360666, "learning_rate": 3.3374313151382165e-06, "loss": 0.79141986, "num_input_tokens_seen": 103644235, "router_z_loss_clip": 2.09472656, "router_z_loss_mlp": 0.1730957, "step": 4804, "time_per_iteration": 2.5749194622039795 }, { "auxiliary_loss_clip": 0.06507879, "auxiliary_loss_mlp": 0.01276526, "balance_loss_clip": 0.06293648, "balance_loss_mlp": 0.01258502, "epoch": 0.28889222906959267, "flos": 25523192424960.0, "grad_norm": 1.9337441340243455, "language_loss": 0.68113667, "learning_rate": 3.337141717919346e-06, "loss": 0.75898075, "num_input_tokens_seen": 103664700, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.18017578, "step": 4805, "time_per_iteration": 4.035159587860107 }, { "auxiliary_loss_clip": 0.06510547, "auxiliary_loss_mlp": 0.01277651, "balance_loss_clip": 0.06297341, "balance_loss_mlp": 0.01260247, "epoch": 0.28895235232226063, "flos": 32679544959360.0, "grad_norm": 1.5349880109754854, "language_loss": 0.69862807, "learning_rate": 3.3368520699959272e-06, "loss": 0.77651, "num_input_tokens_seen": 103686595, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.17407227, "step": 4806, "time_per_iteration": 2.6757562160491943 }, { "auxiliary_loss_clip": 0.06500178, "auxiliary_loss_mlp": 0.0127247, "balance_loss_clip": 0.06294879, "balance_loss_mlp": 0.01254947, "epoch": 0.2890124755749286, "flos": 29722202156160.0, "grad_norm": 1.8657382870460968, "language_loss": 0.72423673, "learning_rate": 3.3365623713789443e-06, "loss": 0.80196321, "num_input_tokens_seen": 103707525, "router_z_loss_clip": 2.05078125, "router_z_loss_mlp": 0.17529297, "step": 4807, "time_per_iteration": 2.6346969604492188 }, { "auxiliary_loss_clip": 0.0650526, "auxiliary_loss_mlp": 0.01283187, "balance_loss_clip": 0.0629503, "balance_loss_mlp": 0.01265437, "epoch": 0.28907259882759656, "flos": 22681067385600.0, "grad_norm": 4.780143494289361, "language_loss": 0.81553882, "learning_rate": 3.336272622079382e-06, "loss": 0.89342332, "num_input_tokens_seen": 103727905, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.17736816, "step": 4808, "time_per_iteration": 2.5851194858551025 }, { "auxiliary_loss_clip": 0.06500618, "auxiliary_loss_mlp": 0.01283759, "balance_loss_clip": 0.0629716, "balance_loss_mlp": 0.01265496, "epoch": 0.2891327220802645, "flos": 22572809510400.0, "grad_norm": 1.5887874870229064, "language_loss": 0.79220223, "learning_rate": 3.3359828221082276e-06, "loss": 0.87004602, "num_input_tokens_seen": 103748335, "router_z_loss_clip": 2.03710938, "router_z_loss_mlp": 0.18261719, "step": 4809, "time_per_iteration": 2.5868465900421143 }, { "auxiliary_loss_clip": 0.06513197, "auxiliary_loss_mlp": 0.01276472, "balance_loss_clip": 0.06294143, "balance_loss_mlp": 0.01257196, "epoch": 0.2891928453329325, "flos": 21659228444160.0, "grad_norm": 1.7556187307879596, "language_loss": 0.79662824, "learning_rate": 3.3356929714764714e-06, "loss": 0.87452489, "num_input_tokens_seen": 103767020, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.19262695, "step": 4810, "time_per_iteration": 2.583357095718384 }, { "auxiliary_loss_clip": 0.06503297, "auxiliary_loss_mlp": 0.01278099, "balance_loss_clip": 0.06297222, "balance_loss_mlp": 0.01260694, "epoch": 0.28925296858560046, "flos": 23228855452800.0, "grad_norm": 1.5728305397847726, "language_loss": 0.77670425, "learning_rate": 3.3354030701951032e-06, "loss": 0.85451818, "num_input_tokens_seen": 103786355, "router_z_loss_clip": 2.06152344, "router_z_loss_mlp": 0.17419434, "step": 4811, "time_per_iteration": 2.592034339904785 }, { "auxiliary_loss_clip": 0.06510276, "auxiliary_loss_mlp": 0.01275952, "balance_loss_clip": 0.06301384, "balance_loss_mlp": 0.01257951, "epoch": 0.2893130918382685, "flos": 28629267425280.0, "grad_norm": 1.3261841183744112, "language_loss": 0.77724957, "learning_rate": 3.335113118275117e-06, "loss": 0.85511184, "num_input_tokens_seen": 103809345, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.18005371, "step": 4812, "time_per_iteration": 2.647190809249878 }, { "auxiliary_loss_clip": 0.06377804, "auxiliary_loss_mlp": 0.01257924, "balance_loss_clip": 0.06274962, "balance_loss_mlp": 0.01253034, "epoch": 0.28937321509093644, "flos": 72323328240000.0, "grad_norm": 0.8014777745471711, "language_loss": 0.6003561, "learning_rate": 3.3348231157275085e-06, "loss": 0.67671341, "num_input_tokens_seen": 103871180, "router_z_loss_clip": 1.03125, "router_z_loss_mlp": 0.04885864, "step": 4813, "time_per_iteration": 3.3428711891174316 }, { "auxiliary_loss_clip": 0.06505479, "auxiliary_loss_mlp": 0.01279506, "balance_loss_clip": 0.06298426, "balance_loss_mlp": 0.01261469, "epoch": 0.2894333383436044, "flos": 16221905948160.0, "grad_norm": 2.1248128825953456, "language_loss": 0.8280443, "learning_rate": 3.3345330625632725e-06, "loss": 0.90589416, "num_input_tokens_seen": 103889040, "router_z_loss_clip": 2.06738281, "router_z_loss_mlp": 0.18029785, "step": 4814, "time_per_iteration": 2.5421526432037354 }, { "auxiliary_loss_clip": 0.06516486, "auxiliary_loss_mlp": 0.01275307, "balance_loss_clip": 0.06299847, "balance_loss_mlp": 0.01256901, "epoch": 0.2894934615962724, "flos": 24835434912000.0, "grad_norm": 1.5490046417739818, "language_loss": 0.72509837, "learning_rate": 3.3342429587934094e-06, "loss": 0.8030163, "num_input_tokens_seen": 103910380, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.18408203, "step": 4815, "time_per_iteration": 2.6022403240203857 }, { "auxiliary_loss_clip": 0.06497641, "auxiliary_loss_mlp": 0.01277595, "balance_loss_clip": 0.06295757, "balance_loss_mlp": 0.01260738, "epoch": 0.28955358484894034, "flos": 20456400683520.0, "grad_norm": 1.532909946965216, "language_loss": 0.70724052, "learning_rate": 3.3339528044289198e-06, "loss": 0.78499293, "num_input_tokens_seen": 103929955, "router_z_loss_clip": 2.01953125, "router_z_loss_mlp": 0.1685791, "step": 4816, "time_per_iteration": 2.569472551345825 }, { "auxiliary_loss_clip": 0.06514454, "auxiliary_loss_mlp": 0.01275069, "balance_loss_clip": 0.06298101, "balance_loss_mlp": 0.01256306, "epoch": 0.2896137081016083, "flos": 22571803261440.0, "grad_norm": 2.2913987265730205, "language_loss": 0.75992316, "learning_rate": 3.3336625994808055e-06, "loss": 0.83781838, "num_input_tokens_seen": 103948020, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.18774414, "step": 4817, "time_per_iteration": 2.5821022987365723 }, { "auxiliary_loss_clip": 0.06515439, "auxiliary_loss_mlp": 0.01275682, "balance_loss_clip": 0.06303333, "balance_loss_mlp": 0.01256334, "epoch": 0.28967383135427627, "flos": 26695231009920.0, "grad_norm": 2.0033785050686923, "language_loss": 0.76713014, "learning_rate": 3.3333723439600723e-06, "loss": 0.84504128, "num_input_tokens_seen": 103968740, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.19348145, "step": 4818, "time_per_iteration": 2.627581834793091 }, { "auxiliary_loss_clip": 0.06513275, "auxiliary_loss_mlp": 0.01277044, "balance_loss_clip": 0.06300648, "balance_loss_mlp": 0.01259413, "epoch": 0.28973395460694423, "flos": 15563428237440.0, "grad_norm": 1.75568624055684, "language_loss": 0.8067379, "learning_rate": 3.3330820378777263e-06, "loss": 0.88464105, "num_input_tokens_seen": 103986005, "router_z_loss_clip": 2.12792969, "router_z_loss_mlp": 0.1763916, "step": 4819, "time_per_iteration": 2.5450966358184814 }, { "auxiliary_loss_clip": 0.06517375, "auxiliary_loss_mlp": 0.01276738, "balance_loss_clip": 0.06300871, "balance_loss_mlp": 0.01257605, "epoch": 0.2897940778596122, "flos": 18703395014400.0, "grad_norm": 1.7945427938565583, "language_loss": 0.79276216, "learning_rate": 3.332791681244776e-06, "loss": 0.87070328, "num_input_tokens_seen": 104005070, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.19116211, "step": 4820, "time_per_iteration": 2.553741693496704 }, { "auxiliary_loss_clip": 0.06519436, "auxiliary_loss_mlp": 0.01273965, "balance_loss_clip": 0.06304283, "balance_loss_mlp": 0.01256477, "epoch": 0.28985420111228016, "flos": 18776209812480.0, "grad_norm": 1.8945456904517914, "language_loss": 0.72888315, "learning_rate": 3.332501274072231e-06, "loss": 0.80681717, "num_input_tokens_seen": 104022945, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.17480469, "step": 4821, "time_per_iteration": 2.571448564529419 }, { "auxiliary_loss_clip": 0.06510926, "auxiliary_loss_mlp": 0.01279667, "balance_loss_clip": 0.06300513, "balance_loss_mlp": 0.01261094, "epoch": 0.28991432436494813, "flos": 23075511281280.0, "grad_norm": 1.7310056780963983, "language_loss": 0.72553349, "learning_rate": 3.332210816371104e-06, "loss": 0.80343938, "num_input_tokens_seen": 104042080, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.18554688, "step": 4822, "time_per_iteration": 2.582949638366699 }, { "auxiliary_loss_clip": 0.06507849, "auxiliary_loss_mlp": 0.01274652, "balance_loss_clip": 0.06300011, "balance_loss_mlp": 0.01256282, "epoch": 0.2899744476176161, "flos": 17608992837120.0, "grad_norm": 1.7465006832594694, "language_loss": 0.67044204, "learning_rate": 3.3319203081524102e-06, "loss": 0.74826705, "num_input_tokens_seen": 104060975, "router_z_loss_clip": 2.07714844, "router_z_loss_mlp": 0.18371582, "step": 4823, "time_per_iteration": 2.5533196926116943 }, { "auxiliary_loss_clip": 0.06509548, "auxiliary_loss_mlp": 0.01273324, "balance_loss_clip": 0.06300064, "balance_loss_mlp": 0.01256087, "epoch": 0.29003457087028406, "flos": 22315861365120.0, "grad_norm": 4.636936341062071, "language_loss": 0.81742156, "learning_rate": 3.331629749427164e-06, "loss": 0.89525026, "num_input_tokens_seen": 104081395, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.17236328, "step": 4824, "time_per_iteration": 2.5842723846435547 }, { "auxiliary_loss_clip": 0.06517643, "auxiliary_loss_mlp": 0.01275595, "balance_loss_clip": 0.06304887, "balance_loss_mlp": 0.01257833, "epoch": 0.2900946941229521, "flos": 21951493885440.0, "grad_norm": 2.0438687019426203, "language_loss": 0.73226273, "learning_rate": 3.331339140206385e-06, "loss": 0.81019515, "num_input_tokens_seen": 104099995, "router_z_loss_clip": 2.12597656, "router_z_loss_mlp": 0.1776123, "step": 4825, "time_per_iteration": 2.576725482940674 }, { "auxiliary_loss_clip": 0.06525759, "auxiliary_loss_mlp": 0.0127914, "balance_loss_clip": 0.06312664, "balance_loss_mlp": 0.01260889, "epoch": 0.29015481737562004, "flos": 17938126874880.0, "grad_norm": 3.5980821754733565, "language_loss": 0.74366081, "learning_rate": 3.331048480501092e-06, "loss": 0.82170981, "num_input_tokens_seen": 104118930, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.18261719, "step": 4826, "time_per_iteration": 2.56290602684021 }, { "auxiliary_loss_clip": 0.06525102, "auxiliary_loss_mlp": 0.01277548, "balance_loss_clip": 0.06312204, "balance_loss_mlp": 0.01260441, "epoch": 0.290214940628288, "flos": 22790079947520.0, "grad_norm": 1.9068536483413907, "language_loss": 0.69399333, "learning_rate": 3.3307577703223073e-06, "loss": 0.7720198, "num_input_tokens_seen": 104136940, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.17102051, "step": 4827, "time_per_iteration": 2.56546950340271 }, { "auxiliary_loss_clip": 0.0652084, "auxiliary_loss_mlp": 0.01273256, "balance_loss_clip": 0.06309359, "balance_loss_mlp": 0.01254767, "epoch": 0.290275063880956, "flos": 20011881173760.0, "grad_norm": 1.9791754367354462, "language_loss": 0.80781877, "learning_rate": 3.3304670096810545e-06, "loss": 0.88575971, "num_input_tokens_seen": 104154280, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.18493652, "step": 4828, "time_per_iteration": 2.5605266094207764 }, { "auxiliary_loss_clip": 0.06519932, "auxiliary_loss_mlp": 0.01273779, "balance_loss_clip": 0.06312214, "balance_loss_mlp": 0.01256231, "epoch": 0.29033518713362394, "flos": 22060003322880.0, "grad_norm": 1.866305028591949, "language_loss": 0.80515039, "learning_rate": 3.33017619858836e-06, "loss": 0.88308752, "num_input_tokens_seen": 104172605, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.17541504, "step": 4829, "time_per_iteration": 2.5661659240722656 }, { "auxiliary_loss_clip": 0.06514408, "auxiliary_loss_mlp": 0.01273351, "balance_loss_clip": 0.06309626, "balance_loss_mlp": 0.01256578, "epoch": 0.2903953103862919, "flos": 25637194304640.0, "grad_norm": 1.4518871402369764, "language_loss": 0.83000129, "learning_rate": 3.329885337055249e-06, "loss": 0.90787888, "num_input_tokens_seen": 104194120, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.16772461, "step": 4830, "time_per_iteration": 2.6090123653411865 }, { "auxiliary_loss_clip": 0.06525184, "auxiliary_loss_mlp": 0.0127683, "balance_loss_clip": 0.06313259, "balance_loss_mlp": 0.0125964, "epoch": 0.29045543363895987, "flos": 16951437521280.0, "grad_norm": 2.403228735751762, "language_loss": 0.80603456, "learning_rate": 3.3295944250927546e-06, "loss": 0.88405466, "num_input_tokens_seen": 104210875, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.171875, "step": 4831, "time_per_iteration": 4.0175371170043945 }, { "auxiliary_loss_clip": 0.06515329, "auxiliary_loss_mlp": 0.01277269, "balance_loss_clip": 0.06310196, "balance_loss_mlp": 0.01260544, "epoch": 0.29051555689162784, "flos": 26402630152320.0, "grad_norm": 1.8651502651415037, "language_loss": 0.75051385, "learning_rate": 3.3293034627119055e-06, "loss": 0.82843983, "num_input_tokens_seen": 104229875, "router_z_loss_clip": 2.05078125, "router_z_loss_mlp": 0.16723633, "step": 4832, "time_per_iteration": 2.6073381900787354 }, { "auxiliary_loss_clip": 0.0651492, "auxiliary_loss_mlp": 0.01273608, "balance_loss_clip": 0.06309555, "balance_loss_mlp": 0.01257622, "epoch": 0.2905756801442958, "flos": 21109931003520.0, "grad_norm": 1.8158681141321935, "language_loss": 0.76265842, "learning_rate": 3.329012449923736e-06, "loss": 0.84054369, "num_input_tokens_seen": 104250405, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.15966797, "step": 4833, "time_per_iteration": 2.57098388671875 }, { "auxiliary_loss_clip": 0.06510807, "auxiliary_loss_mlp": 0.01273175, "balance_loss_clip": 0.06306845, "balance_loss_mlp": 0.0125707, "epoch": 0.29063580339696377, "flos": 15711573456000.0, "grad_norm": 1.7191233250060856, "language_loss": 0.6565218, "learning_rate": 3.3287213867392813e-06, "loss": 0.73436165, "num_input_tokens_seen": 104269185, "router_z_loss_clip": 2.0390625, "router_z_loss_mlp": 0.16113281, "step": 4834, "time_per_iteration": 2.5449061393737793 }, { "auxiliary_loss_clip": 0.06514964, "auxiliary_loss_mlp": 0.01271297, "balance_loss_clip": 0.06312326, "balance_loss_mlp": 0.01255418, "epoch": 0.29069592664963173, "flos": 24651972397440.0, "grad_norm": 1.7117849359976116, "language_loss": 0.72301304, "learning_rate": 3.3284302731695783e-06, "loss": 0.80087566, "num_input_tokens_seen": 104289400, "router_z_loss_clip": 2.02539062, "router_z_loss_mlp": 0.15869141, "step": 4835, "time_per_iteration": 4.117199659347534 }, { "auxiliary_loss_clip": 0.06518663, "auxiliary_loss_mlp": 0.01271917, "balance_loss_clip": 0.06312793, "balance_loss_mlp": 0.01256551, "epoch": 0.2907560499022997, "flos": 24980854872960.0, "grad_norm": 1.78490036465769, "language_loss": 0.80264372, "learning_rate": 3.3281391092256668e-06, "loss": 0.88054955, "num_input_tokens_seen": 104310485, "router_z_loss_clip": 2.05859375, "router_z_loss_mlp": 0.15356445, "step": 4836, "time_per_iteration": 2.599616050720215 }, { "auxiliary_loss_clip": 0.0651348, "auxiliary_loss_mlp": 0.0127539, "balance_loss_clip": 0.063094, "balance_loss_mlp": 0.01259297, "epoch": 0.29081617315496766, "flos": 18662836838400.0, "grad_norm": 1.8470870717520917, "language_loss": 0.81185925, "learning_rate": 3.3278478949185865e-06, "loss": 0.88974798, "num_input_tokens_seen": 104327330, "router_z_loss_clip": 2.04199219, "router_z_loss_mlp": 0.16088867, "step": 4837, "time_per_iteration": 2.556553363800049 }, { "auxiliary_loss_clip": 0.06521434, "auxiliary_loss_mlp": 0.01273962, "balance_loss_clip": 0.0631275, "balance_loss_mlp": 0.01256212, "epoch": 0.2908762964076356, "flos": 35339087952000.0, "grad_norm": 2.179120237675362, "language_loss": 0.67764491, "learning_rate": 3.327556630259381e-06, "loss": 0.7555989, "num_input_tokens_seen": 104350350, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.17749023, "step": 4838, "time_per_iteration": 2.6924331188201904 }, { "auxiliary_loss_clip": 0.06518313, "auxiliary_loss_mlp": 0.01273019, "balance_loss_clip": 0.06308167, "balance_loss_mlp": 0.01256187, "epoch": 0.29093641966030365, "flos": 23083058148480.0, "grad_norm": 1.7580310274226174, "language_loss": 0.71655613, "learning_rate": 3.327265315259095e-06, "loss": 0.79446948, "num_input_tokens_seen": 104369995, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.16845703, "step": 4839, "time_per_iteration": 2.618541955947876 }, { "auxiliary_loss_clip": 0.0651705, "auxiliary_loss_mlp": 0.01271897, "balance_loss_clip": 0.06309319, "balance_loss_mlp": 0.01255172, "epoch": 0.2909965429129716, "flos": 35964260864640.0, "grad_norm": 2.199932545624964, "language_loss": 0.76331437, "learning_rate": 3.326973949928776e-06, "loss": 0.84120381, "num_input_tokens_seen": 104392285, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.16711426, "step": 4840, "time_per_iteration": 2.7237207889556885 }, { "auxiliary_loss_clip": 0.0651393, "auxiliary_loss_mlp": 0.01276786, "balance_loss_clip": 0.06306029, "balance_loss_mlp": 0.01260299, "epoch": 0.2910566661656396, "flos": 30887616268800.0, "grad_norm": 1.7042174011803064, "language_loss": 0.60939759, "learning_rate": 3.326682534279471e-06, "loss": 0.68730474, "num_input_tokens_seen": 104412640, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.16491699, "step": 4841, "time_per_iteration": 2.6603524684906006 }, { "auxiliary_loss_clip": 0.06513327, "auxiliary_loss_mlp": 0.0127138, "balance_loss_clip": 0.06307846, "balance_loss_mlp": 0.01254559, "epoch": 0.29111678941830754, "flos": 30018366812160.0, "grad_norm": 3.253018988154887, "language_loss": 0.72159696, "learning_rate": 3.326391068322232e-06, "loss": 0.79944396, "num_input_tokens_seen": 104435245, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.16821289, "step": 4842, "time_per_iteration": 3.967763900756836 }, { "auxiliary_loss_clip": 0.06511369, "auxiliary_loss_mlp": 0.01277367, "balance_loss_clip": 0.06305119, "balance_loss_mlp": 0.01261298, "epoch": 0.2911769126709755, "flos": 22864110629760.0, "grad_norm": 57.97889868551965, "language_loss": 0.7398327, "learning_rate": 3.3260995520681098e-06, "loss": 0.81772006, "num_input_tokens_seen": 104455395, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.1607666, "step": 4843, "time_per_iteration": 2.585315227508545 }, { "auxiliary_loss_clip": 0.06515999, "auxiliary_loss_mlp": 0.01275106, "balance_loss_clip": 0.06307182, "balance_loss_mlp": 0.01259287, "epoch": 0.2912370359236435, "flos": 21656545113600.0, "grad_norm": 1.9329440208025381, "language_loss": 0.58913249, "learning_rate": 3.3258079855281602e-06, "loss": 0.66704357, "num_input_tokens_seen": 104473350, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.15808105, "step": 4844, "time_per_iteration": 4.073885202407837 }, { "auxiliary_loss_clip": 0.06519035, "auxiliary_loss_mlp": 0.01282877, "balance_loss_clip": 0.06306851, "balance_loss_mlp": 0.01265055, "epoch": 0.29129715917631144, "flos": 22899972977280.0, "grad_norm": 2.029542079853055, "language_loss": 0.87061602, "learning_rate": 3.3255163687134396e-06, "loss": 0.9486351, "num_input_tokens_seen": 104492265, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.17822266, "step": 4845, "time_per_iteration": 2.570794105529785 }, { "auxiliary_loss_clip": 0.0651001, "auxiliary_loss_mlp": 0.01276623, "balance_loss_clip": 0.0630188, "balance_loss_mlp": 0.01258825, "epoch": 0.2913572824289794, "flos": 22681067385600.0, "grad_norm": 1.8831977189463294, "language_loss": 0.6763891, "learning_rate": 3.3252247016350046e-06, "loss": 0.75425541, "num_input_tokens_seen": 104510755, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.17810059, "step": 4846, "time_per_iteration": 2.597752332687378 }, { "auxiliary_loss_clip": 0.06508776, "auxiliary_loss_mlp": 0.01276456, "balance_loss_clip": 0.06302403, "balance_loss_mlp": 0.01260136, "epoch": 0.29141740568164737, "flos": 23113260345600.0, "grad_norm": 1.654435559027861, "language_loss": 0.70920336, "learning_rate": 3.3249329843039166e-06, "loss": 0.78705561, "num_input_tokens_seen": 104530830, "router_z_loss_clip": 2.06445312, "router_z_loss_mlp": 0.16333008, "step": 4847, "time_per_iteration": 2.5664403438568115 }, { "auxiliary_loss_clip": 0.06512506, "auxiliary_loss_mlp": 0.01275904, "balance_loss_clip": 0.06305674, "balance_loss_mlp": 0.01258499, "epoch": 0.29147752893431533, "flos": 23593851838080.0, "grad_norm": 1.5519938147538659, "language_loss": 0.74251974, "learning_rate": 3.324641216731237e-06, "loss": 0.82040393, "num_input_tokens_seen": 104550115, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.17419434, "step": 4848, "time_per_iteration": 2.7562923431396484 }, { "auxiliary_loss_clip": 0.06514212, "auxiliary_loss_mlp": 0.01272549, "balance_loss_clip": 0.0630595, "balance_loss_mlp": 0.0125586, "epoch": 0.2915376521869833, "flos": 20597753721600.0, "grad_norm": 2.0916250798093, "language_loss": 0.76881325, "learning_rate": 3.3243493989280295e-06, "loss": 0.84668088, "num_input_tokens_seen": 104566255, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.16687012, "step": 4849, "time_per_iteration": 2.5905678272247314 }, { "auxiliary_loss_clip": 0.06519984, "auxiliary_loss_mlp": 0.01275775, "balance_loss_clip": 0.06308042, "balance_loss_mlp": 0.01259133, "epoch": 0.29159777543965126, "flos": 20817414000000.0, "grad_norm": 1.9640468686436727, "language_loss": 0.79614019, "learning_rate": 3.3240575309053596e-06, "loss": 0.8740977, "num_input_tokens_seen": 104585235, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.16625977, "step": 4850, "time_per_iteration": 2.6009480953216553 }, { "auxiliary_loss_clip": 0.06507299, "auxiliary_loss_mlp": 0.01277706, "balance_loss_clip": 0.06305575, "balance_loss_mlp": 0.01260969, "epoch": 0.29165789869231923, "flos": 24251155591680.0, "grad_norm": 4.463769370408401, "language_loss": 0.76295209, "learning_rate": 3.323765612674296e-06, "loss": 0.84080219, "num_input_tokens_seen": 104605315, "router_z_loss_clip": 2.01953125, "router_z_loss_mlp": 0.16748047, "step": 4851, "time_per_iteration": 2.58876633644104 }, { "auxiliary_loss_clip": 0.06505243, "auxiliary_loss_mlp": 0.0127694, "balance_loss_clip": 0.06302722, "balance_loss_mlp": 0.01260274, "epoch": 0.29171802194498725, "flos": 28957562922240.0, "grad_norm": 1.35614814621701, "language_loss": 0.7823621, "learning_rate": 3.3234736442459078e-06, "loss": 0.86018395, "num_input_tokens_seen": 104626055, "router_z_loss_clip": 2.02636719, "router_z_loss_mlp": 0.16674805, "step": 4852, "time_per_iteration": 2.633575201034546 }, { "auxiliary_loss_clip": 0.06513259, "auxiliary_loss_mlp": 0.01274849, "balance_loss_clip": 0.06308436, "balance_loss_mlp": 0.01257588, "epoch": 0.2917781451976552, "flos": 22604269518720.0, "grad_norm": 1.645408991867106, "language_loss": 0.78675622, "learning_rate": 3.3231816256312665e-06, "loss": 0.86463732, "num_input_tokens_seen": 104646005, "router_z_loss_clip": 2.04980469, "router_z_loss_mlp": 0.17248535, "step": 4853, "time_per_iteration": 2.6280181407928467 }, { "auxiliary_loss_clip": 0.06514542, "auxiliary_loss_mlp": 0.01275254, "balance_loss_clip": 0.0630623, "balance_loss_mlp": 0.01257837, "epoch": 0.2918382684503232, "flos": 21579956881920.0, "grad_norm": 2.4403745471096716, "language_loss": 0.88162184, "learning_rate": 3.322889556841445e-06, "loss": 0.9595198, "num_input_tokens_seen": 104661620, "router_z_loss_clip": 2.08300781, "router_z_loss_mlp": 0.17419434, "step": 4854, "time_per_iteration": 2.572603940963745 }, { "auxiliary_loss_clip": 0.06506801, "auxiliary_loss_mlp": 0.01274654, "balance_loss_clip": 0.06303293, "balance_loss_mlp": 0.01257464, "epoch": 0.29189839170299114, "flos": 24360503569920.0, "grad_norm": 2.015991400138594, "language_loss": 0.86433131, "learning_rate": 3.322597437887519e-06, "loss": 0.94214588, "num_input_tokens_seen": 104681445, "router_z_loss_clip": 2.03222656, "router_z_loss_mlp": 0.17199707, "step": 4855, "time_per_iteration": 2.623765707015991 }, { "auxiliary_loss_clip": 0.06384028, "auxiliary_loss_mlp": 0.01255889, "balance_loss_clip": 0.06279384, "balance_loss_mlp": 0.01251094, "epoch": 0.2919585149556591, "flos": 71338693311360.0, "grad_norm": 0.7779411147251746, "language_loss": 0.6016199, "learning_rate": 3.322305268780566e-06, "loss": 0.67801905, "num_input_tokens_seen": 104747945, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.04788208, "step": 4856, "time_per_iteration": 3.281765937805176 }, { "auxiliary_loss_clip": 0.06504361, "auxiliary_loss_mlp": 0.01270242, "balance_loss_clip": 0.06298236, "balance_loss_mlp": 0.01254721, "epoch": 0.2920186382083271, "flos": 15638716730880.0, "grad_norm": 2.26437761571655, "language_loss": 0.69096869, "learning_rate": 3.322013049531664e-06, "loss": 0.76871473, "num_input_tokens_seen": 104766225, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.15527344, "step": 4857, "time_per_iteration": 2.548103094100952 }, { "auxiliary_loss_clip": 0.06498909, "auxiliary_loss_mlp": 0.01272497, "balance_loss_clip": 0.06297626, "balance_loss_mlp": 0.01257345, "epoch": 0.29207876146099504, "flos": 28373535164160.0, "grad_norm": 2.031738790878717, "language_loss": 0.83909839, "learning_rate": 3.321720780151895e-06, "loss": 0.91681248, "num_input_tokens_seen": 104785345, "router_z_loss_clip": 2.01464844, "router_z_loss_mlp": 0.15136719, "step": 4858, "time_per_iteration": 2.611936569213867 }, { "auxiliary_loss_clip": 0.06504136, "auxiliary_loss_mlp": 0.01271376, "balance_loss_clip": 0.06300234, "balance_loss_mlp": 0.01255557, "epoch": 0.292138884713663, "flos": 21877295495040.0, "grad_norm": 1.5940021442478036, "language_loss": 0.77831322, "learning_rate": 3.321428460652342e-06, "loss": 0.85606831, "num_input_tokens_seen": 104804560, "router_z_loss_clip": 2.0390625, "router_z_loss_mlp": 0.1583252, "step": 4859, "time_per_iteration": 2.5714025497436523 }, { "auxiliary_loss_clip": 0.06509003, "auxiliary_loss_mlp": 0.01273058, "balance_loss_clip": 0.06298748, "balance_loss_mlp": 0.01256512, "epoch": 0.29219900796633097, "flos": 20998277038080.0, "grad_norm": 1.9164138141991487, "language_loss": 0.68238366, "learning_rate": 3.3211360910440885e-06, "loss": 0.76020426, "num_input_tokens_seen": 104821105, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.16540527, "step": 4860, "time_per_iteration": 2.570542573928833 }, { "auxiliary_loss_clip": 0.06500182, "auxiliary_loss_mlp": 0.01272617, "balance_loss_clip": 0.06299224, "balance_loss_mlp": 0.01257036, "epoch": 0.29225913121899894, "flos": 35012930734080.0, "grad_norm": 2.105916788538675, "language_loss": 0.75770217, "learning_rate": 3.320843671338222e-06, "loss": 0.83543015, "num_input_tokens_seen": 104841440, "router_z_loss_clip": 2.00976562, "router_z_loss_mlp": 0.15576172, "step": 4861, "time_per_iteration": 2.6982438564300537 }, { "auxiliary_loss_clip": 0.06494952, "auxiliary_loss_mlp": 0.01276742, "balance_loss_clip": 0.06292254, "balance_loss_mlp": 0.01261257, "epoch": 0.2923192544716669, "flos": 13520588895360.0, "grad_norm": 1.7828921433379838, "language_loss": 0.9173764, "learning_rate": 3.320551201545832e-06, "loss": 0.99509335, "num_input_tokens_seen": 104858210, "router_z_loss_clip": 2.02636719, "router_z_loss_mlp": 0.15490723, "step": 4862, "time_per_iteration": 2.550750255584717 }, { "auxiliary_loss_clip": 0.06498083, "auxiliary_loss_mlp": 0.01275847, "balance_loss_clip": 0.06296092, "balance_loss_mlp": 0.01260147, "epoch": 0.29237937772433487, "flos": 19469543621760.0, "grad_norm": 2.1168640479958145, "language_loss": 0.74207127, "learning_rate": 3.320258681678008e-06, "loss": 0.81981057, "num_input_tokens_seen": 104875620, "router_z_loss_clip": 2.02050781, "router_z_loss_mlp": 0.15698242, "step": 4863, "time_per_iteration": 2.5463883876800537 }, { "auxiliary_loss_clip": 0.0649469, "auxiliary_loss_mlp": 0.01273637, "balance_loss_clip": 0.06296302, "balance_loss_mlp": 0.01257591, "epoch": 0.29243950097700283, "flos": 20856965927040.0, "grad_norm": 2.7117150792305496, "language_loss": 0.78430897, "learning_rate": 3.319966111745842e-06, "loss": 0.86199224, "num_input_tokens_seen": 104894600, "router_z_loss_clip": 1.98828125, "router_z_loss_mlp": 0.16040039, "step": 4864, "time_per_iteration": 2.5789830684661865 }, { "auxiliary_loss_clip": 0.06501788, "auxiliary_loss_mlp": 0.01272415, "balance_loss_clip": 0.06294559, "balance_loss_mlp": 0.01255595, "epoch": 0.29249962422967085, "flos": 23590581528960.0, "grad_norm": 1.5676821337111055, "language_loss": 0.82546985, "learning_rate": 3.319673491760429e-06, "loss": 0.90321189, "num_input_tokens_seen": 104914530, "router_z_loss_clip": 2.07519531, "router_z_loss_mlp": 0.16833496, "step": 4865, "time_per_iteration": 2.5995020866394043 }, { "auxiliary_loss_clip": 0.06503976, "auxiliary_loss_mlp": 0.01270531, "balance_loss_clip": 0.06299223, "balance_loss_mlp": 0.01253949, "epoch": 0.2925597474823388, "flos": 22279915163520.0, "grad_norm": 2.0075663680250546, "language_loss": 0.85907078, "learning_rate": 3.3193808217328645e-06, "loss": 0.93681586, "num_input_tokens_seen": 104933460, "router_z_loss_clip": 2.04785156, "router_z_loss_mlp": 0.16589355, "step": 4866, "time_per_iteration": 2.5874361991882324 }, { "auxiliary_loss_clip": 0.06497228, "auxiliary_loss_mlp": 0.01272062, "balance_loss_clip": 0.06295876, "balance_loss_mlp": 0.01256052, "epoch": 0.2926198707350068, "flos": 34464136417920.0, "grad_norm": 1.5948708604854451, "language_loss": 0.75746751, "learning_rate": 3.3190881016742476e-06, "loss": 0.83516049, "num_input_tokens_seen": 104954495, "router_z_loss_clip": 2.01464844, "router_z_loss_mlp": 0.16003418, "step": 4867, "time_per_iteration": 2.712951421737671 }, { "auxiliary_loss_clip": 0.06503025, "auxiliary_loss_mlp": 0.0127322, "balance_loss_clip": 0.0629715, "balance_loss_mlp": 0.01256411, "epoch": 0.29267999398767475, "flos": 20710413936000.0, "grad_norm": 2.166367408616269, "language_loss": 0.73017013, "learning_rate": 3.3187953315956776e-06, "loss": 0.80793256, "num_input_tokens_seen": 104971915, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.16809082, "step": 4868, "time_per_iteration": 2.56115984916687 }, { "auxiliary_loss_clip": 0.06502129, "auxiliary_loss_mlp": 0.01269724, "balance_loss_clip": 0.06301151, "balance_loss_mlp": 0.01253607, "epoch": 0.2927401172403427, "flos": 18374470611840.0, "grad_norm": 1.3014721165059482, "language_loss": 0.74776644, "learning_rate": 3.3185025115082566e-06, "loss": 0.82548499, "num_input_tokens_seen": 104991335, "router_z_loss_clip": 2.00976562, "router_z_loss_mlp": 0.16131592, "step": 4869, "time_per_iteration": 2.590144634246826 }, { "auxiliary_loss_clip": 0.0650603, "auxiliary_loss_mlp": 0.01272752, "balance_loss_clip": 0.06302941, "balance_loss_mlp": 0.01256599, "epoch": 0.2928002404930107, "flos": 26111203251840.0, "grad_norm": 1.4154696286293376, "language_loss": 0.7701804, "learning_rate": 3.318209641423088e-06, "loss": 0.84796822, "num_input_tokens_seen": 105012015, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.16149902, "step": 4870, "time_per_iteration": 4.0723206996917725 }, { "auxiliary_loss_clip": 0.06507226, "auxiliary_loss_mlp": 0.0127433, "balance_loss_clip": 0.06297508, "balance_loss_mlp": 0.01257569, "epoch": 0.29286036374567864, "flos": 21331142582400.0, "grad_norm": 1.9820709724209384, "language_loss": 0.68434149, "learning_rate": 3.3179167213512777e-06, "loss": 0.76215708, "num_input_tokens_seen": 105031460, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.16772461, "step": 4871, "time_per_iteration": 2.571948528289795 }, { "auxiliary_loss_clip": 0.06499955, "auxiliary_loss_mlp": 0.01272936, "balance_loss_clip": 0.06298441, "balance_loss_mlp": 0.01258392, "epoch": 0.2929204869983466, "flos": 29577117611520.0, "grad_norm": 1.9420395300949773, "language_loss": 0.78031182, "learning_rate": 3.317623751303933e-06, "loss": 0.85804069, "num_input_tokens_seen": 105052965, "router_z_loss_clip": 2.01269531, "router_z_loss_mlp": 0.14538574, "step": 4872, "time_per_iteration": 2.618164300918579 }, { "auxiliary_loss_clip": 0.065166, "auxiliary_loss_mlp": 0.01276705, "balance_loss_clip": 0.06307622, "balance_loss_mlp": 0.01259467, "epoch": 0.2929806102510146, "flos": 19063569790080.0, "grad_norm": 1.942332010836177, "language_loss": 0.73006904, "learning_rate": 3.317330731292164e-06, "loss": 0.80800205, "num_input_tokens_seen": 105071840, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.17224121, "step": 4873, "time_per_iteration": 2.5544273853302 }, { "auxiliary_loss_clip": 0.06511816, "auxiliary_loss_mlp": 0.01269705, "balance_loss_clip": 0.06303882, "balance_loss_mlp": 0.01253158, "epoch": 0.29304073350368254, "flos": 21950613417600.0, "grad_norm": 2.0874684514002007, "language_loss": 0.79129666, "learning_rate": 3.3170376613270812e-06, "loss": 0.86911184, "num_input_tokens_seen": 105089445, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.16564941, "step": 4874, "time_per_iteration": 2.5716640949249268 }, { "auxiliary_loss_clip": 0.06518038, "auxiliary_loss_mlp": 0.01270638, "balance_loss_clip": 0.06305617, "balance_loss_mlp": 0.01253532, "epoch": 0.2931008567563505, "flos": 15456302392320.0, "grad_norm": 1.866520176212831, "language_loss": 0.78065163, "learning_rate": 3.3167445414197985e-06, "loss": 0.85853839, "num_input_tokens_seen": 105106210, "router_z_loss_clip": 2.12402344, "router_z_loss_mlp": 0.17102051, "step": 4875, "time_per_iteration": 4.001799583435059 }, { "auxiliary_loss_clip": 0.06512143, "auxiliary_loss_mlp": 0.01275654, "balance_loss_clip": 0.0630565, "balance_loss_mlp": 0.0125937, "epoch": 0.29316098000901847, "flos": 16988893096320.0, "grad_norm": 1.7140673104581678, "language_loss": 0.6998145, "learning_rate": 3.316451371581431e-06, "loss": 0.7776925, "num_input_tokens_seen": 105124200, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.1628418, "step": 4876, "time_per_iteration": 2.5533599853515625 }, { "auxiliary_loss_clip": 0.06504855, "auxiliary_loss_mlp": 0.01273045, "balance_loss_clip": 0.06301089, "balance_loss_mlp": 0.01257536, "epoch": 0.29322110326168643, "flos": 16362462372480.0, "grad_norm": 2.0406621333994273, "language_loss": 0.83044475, "learning_rate": 3.316158151823096e-06, "loss": 0.90822375, "num_input_tokens_seen": 105140400, "router_z_loss_clip": 2.03515625, "router_z_loss_mlp": 0.15509033, "step": 4877, "time_per_iteration": 2.542029619216919 }, { "auxiliary_loss_clip": 0.06509757, "auxiliary_loss_mlp": 0.01270691, "balance_loss_clip": 0.06298156, "balance_loss_mlp": 0.01254705, "epoch": 0.29328122651435445, "flos": 13996023361920.0, "grad_norm": 2.0246399912687236, "language_loss": 0.68292832, "learning_rate": 3.315864882155911e-06, "loss": 0.76073277, "num_input_tokens_seen": 105157535, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.15991211, "step": 4878, "time_per_iteration": 2.536036729812622 }, { "auxiliary_loss_clip": 0.06508571, "auxiliary_loss_mlp": 0.01273919, "balance_loss_clip": 0.06303138, "balance_loss_mlp": 0.01257475, "epoch": 0.2933413497670224, "flos": 25271569013760.0, "grad_norm": 5.804630261526246, "language_loss": 0.74271405, "learning_rate": 3.3155715625909982e-06, "loss": 0.82053894, "num_input_tokens_seen": 105175185, "router_z_loss_clip": 2.05175781, "router_z_loss_mlp": 0.16448975, "step": 4879, "time_per_iteration": 2.591768264770508 }, { "auxiliary_loss_clip": 0.06508416, "auxiliary_loss_mlp": 0.01278313, "balance_loss_clip": 0.06302056, "balance_loss_mlp": 0.01260383, "epoch": 0.2934014730196904, "flos": 32131840746240.0, "grad_norm": 1.9738750824558668, "language_loss": 0.66413784, "learning_rate": 3.3152781931394803e-06, "loss": 0.74200517, "num_input_tokens_seen": 105194540, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.17932129, "step": 4880, "time_per_iteration": 2.64211368560791 }, { "auxiliary_loss_clip": 0.06508467, "auxiliary_loss_mlp": 0.01272516, "balance_loss_clip": 0.06299537, "balance_loss_mlp": 0.01255827, "epoch": 0.29346159627235835, "flos": 24359329612800.0, "grad_norm": 4.012376120464844, "language_loss": 0.71195537, "learning_rate": 3.314984773812481e-06, "loss": 0.78976518, "num_input_tokens_seen": 105213215, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.16687012, "step": 4881, "time_per_iteration": 2.6085867881774902 }, { "auxiliary_loss_clip": 0.0651053, "auxiliary_loss_mlp": 0.01276444, "balance_loss_clip": 0.06302586, "balance_loss_mlp": 0.01259778, "epoch": 0.2935217195250263, "flos": 22753253278080.0, "grad_norm": 1.7883034084922387, "language_loss": 0.8365463, "learning_rate": 3.314691304621127e-06, "loss": 0.91441607, "num_input_tokens_seen": 105231585, "router_z_loss_clip": 2.07910156, "router_z_loss_mlp": 0.16662598, "step": 4882, "time_per_iteration": 3.956916093826294 }, { "auxiliary_loss_clip": 0.06507009, "auxiliary_loss_mlp": 0.0127842, "balance_loss_clip": 0.06295252, "balance_loss_mlp": 0.01261421, "epoch": 0.2935818427776943, "flos": 21731959388160.0, "grad_norm": 2.5076263954094427, "language_loss": 0.72240746, "learning_rate": 3.314397785576548e-06, "loss": 0.80026174, "num_input_tokens_seen": 105250120, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.17004395, "step": 4883, "time_per_iteration": 2.598961114883423 }, { "auxiliary_loss_clip": 0.06507874, "auxiliary_loss_mlp": 0.01280633, "balance_loss_clip": 0.06299233, "balance_loss_mlp": 0.01264611, "epoch": 0.29364196603036224, "flos": 23811667326720.0, "grad_norm": 1.9561171022486044, "language_loss": 0.92771983, "learning_rate": 3.3141042166898726e-06, "loss": 1.00560486, "num_input_tokens_seen": 105266065, "router_z_loss_clip": 2.08496094, "router_z_loss_mlp": 0.16027832, "step": 4884, "time_per_iteration": 4.061708688735962 }, { "auxiliary_loss_clip": 0.06509373, "auxiliary_loss_mlp": 0.01274572, "balance_loss_clip": 0.06302278, "balance_loss_mlp": 0.01257609, "epoch": 0.2937020892830302, "flos": 23475615327360.0, "grad_norm": 3.1144325761588703, "language_loss": 0.73832345, "learning_rate": 3.313810597972234e-06, "loss": 0.81616294, "num_input_tokens_seen": 105282155, "router_z_loss_clip": 2.06738281, "router_z_loss_mlp": 0.16967773, "step": 4885, "time_per_iteration": 2.5783536434173584 }, { "auxiliary_loss_clip": 0.06509157, "auxiliary_loss_mlp": 0.01279675, "balance_loss_clip": 0.06301399, "balance_loss_mlp": 0.01263463, "epoch": 0.2937622125356982, "flos": 24278422896000.0, "grad_norm": 1.8119414810812222, "language_loss": 0.85153115, "learning_rate": 3.3135169294347655e-06, "loss": 0.92941952, "num_input_tokens_seen": 105299225, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.16210938, "step": 4886, "time_per_iteration": 2.5893990993499756 }, { "auxiliary_loss_clip": 0.06519298, "auxiliary_loss_mlp": 0.01279828, "balance_loss_clip": 0.0630745, "balance_loss_mlp": 0.01263878, "epoch": 0.29382233578836614, "flos": 20667843262080.0, "grad_norm": 2.6160794747611944, "language_loss": 0.78146398, "learning_rate": 3.313223211088603e-06, "loss": 0.85945523, "num_input_tokens_seen": 105315710, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.1595459, "step": 4887, "time_per_iteration": 2.561230421066284 }, { "auxiliary_loss_clip": 0.0650873, "auxiliary_loss_mlp": 0.01274379, "balance_loss_clip": 0.06296258, "balance_loss_mlp": 0.01257904, "epoch": 0.2938824590410341, "flos": 16550662642560.0, "grad_norm": 2.3123132957892945, "language_loss": 0.80460882, "learning_rate": 3.3129294429448855e-06, "loss": 0.88243985, "num_input_tokens_seen": 105333505, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.16479492, "step": 4888, "time_per_iteration": 2.5374915599823 }, { "auxiliary_loss_clip": 0.06511672, "auxiliary_loss_mlp": 0.01278912, "balance_loss_clip": 0.06304817, "balance_loss_mlp": 0.01262461, "epoch": 0.29394258229370207, "flos": 37934620824960.0, "grad_norm": 2.1386167143117816, "language_loss": 0.55833876, "learning_rate": 3.3126356250147517e-06, "loss": 0.63624454, "num_input_tokens_seen": 105355605, "router_z_loss_clip": 2.06738281, "router_z_loss_mlp": 0.16442871, "step": 4889, "time_per_iteration": 2.701871156692505 }, { "auxiliary_loss_clip": 0.06515625, "auxiliary_loss_mlp": 0.01275578, "balance_loss_clip": 0.06304646, "balance_loss_mlp": 0.01259044, "epoch": 0.29400270554637004, "flos": 20050384924800.0, "grad_norm": 5.365474438935059, "language_loss": 0.85404575, "learning_rate": 3.3123417573093434e-06, "loss": 0.93195772, "num_input_tokens_seen": 105374225, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.1652832, "step": 4890, "time_per_iteration": 2.5636849403381348 }, { "auxiliary_loss_clip": 0.06515935, "auxiliary_loss_mlp": 0.0128501, "balance_loss_clip": 0.06305068, "balance_loss_mlp": 0.01268392, "epoch": 0.294062828799038, "flos": 15271498212480.0, "grad_norm": 2.435786341081583, "language_loss": 0.73124641, "learning_rate": 3.3120478398398046e-06, "loss": 0.80925584, "num_input_tokens_seen": 105391565, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.16601562, "step": 4891, "time_per_iteration": 2.559663772583008 }, { "auxiliary_loss_clip": 0.06518658, "auxiliary_loss_mlp": 0.012779, "balance_loss_clip": 0.06308468, "balance_loss_mlp": 0.01261175, "epoch": 0.294122952051706, "flos": 22753714475520.0, "grad_norm": 2.061859430876263, "language_loss": 0.77431345, "learning_rate": 3.3117538726172797e-06, "loss": 0.85227907, "num_input_tokens_seen": 105409840, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.16729736, "step": 4892, "time_per_iteration": 2.644178628921509 }, { "auxiliary_loss_clip": 0.06507286, "auxiliary_loss_mlp": 0.0127558, "balance_loss_clip": 0.06299919, "balance_loss_mlp": 0.01259356, "epoch": 0.294183075304374, "flos": 24979848624000.0, "grad_norm": 1.8327575101411646, "language_loss": 0.78264785, "learning_rate": 3.3114598556529164e-06, "loss": 0.86047649, "num_input_tokens_seen": 105428645, "router_z_loss_clip": 2.07324219, "router_z_loss_mlp": 0.1619873, "step": 4893, "time_per_iteration": 2.5914111137390137 }, { "auxiliary_loss_clip": 0.06514271, "auxiliary_loss_mlp": 0.01274881, "balance_loss_clip": 0.06307294, "balance_loss_mlp": 0.01259634, "epoch": 0.29424319855704195, "flos": 30960347212800.0, "grad_norm": 1.8090689374453146, "language_loss": 0.84755808, "learning_rate": 3.311165788957864e-06, "loss": 0.92544961, "num_input_tokens_seen": 105447480, "router_z_loss_clip": 2.07128906, "router_z_loss_mlp": 0.15270996, "step": 4894, "time_per_iteration": 2.6363508701324463 }, { "auxiliary_loss_clip": 0.06517541, "auxiliary_loss_mlp": 0.01287617, "balance_loss_clip": 0.06306613, "balance_loss_mlp": 0.01271977, "epoch": 0.2943033218097099, "flos": 15236977530240.0, "grad_norm": 2.5263455341189522, "language_loss": 0.90896869, "learning_rate": 3.310871672543274e-06, "loss": 0.98702025, "num_input_tokens_seen": 105464600, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.15661621, "step": 4895, "time_per_iteration": 2.557737112045288 }, { "auxiliary_loss_clip": 0.06529716, "auxiliary_loss_mlp": 0.01279618, "balance_loss_clip": 0.06313898, "balance_loss_mlp": 0.01262679, "epoch": 0.2943634450623779, "flos": 21732336731520.0, "grad_norm": 2.9845228753318125, "language_loss": 0.87474257, "learning_rate": 3.3105775064202982e-06, "loss": 0.95283586, "num_input_tokens_seen": 105481510, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.16943359, "step": 4896, "time_per_iteration": 2.7213873863220215 }, { "auxiliary_loss_clip": 0.0652106, "auxiliary_loss_mlp": 0.01286435, "balance_loss_clip": 0.06309097, "balance_loss_mlp": 0.01270497, "epoch": 0.29442356831504585, "flos": 22608797639040.0, "grad_norm": 2.6013623519107476, "language_loss": 0.74335742, "learning_rate": 3.3102832906000924e-06, "loss": 0.82143235, "num_input_tokens_seen": 105501390, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.15930176, "step": 4897, "time_per_iteration": 2.6144819259643555 }, { "auxiliary_loss_clip": 0.06523593, "auxiliary_loss_mlp": 0.01290685, "balance_loss_clip": 0.0630874, "balance_loss_mlp": 0.01274068, "epoch": 0.2944836915677138, "flos": 20017625178240.0, "grad_norm": 2.430868238404697, "language_loss": 0.74704444, "learning_rate": 3.309989025093813e-06, "loss": 0.82518721, "num_input_tokens_seen": 105519600, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.16625977, "step": 4898, "time_per_iteration": 2.581359624862671 }, { "auxiliary_loss_clip": 0.06530036, "auxiliary_loss_mlp": 0.01290643, "balance_loss_clip": 0.06313059, "balance_loss_mlp": 0.01273298, "epoch": 0.2945438148203818, "flos": 20051768517120.0, "grad_norm": 6.925842901961196, "language_loss": 0.70889533, "learning_rate": 3.309694709912618e-06, "loss": 0.78710204, "num_input_tokens_seen": 105535970, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.17333984, "step": 4899, "time_per_iteration": 2.603287696838379 }, { "auxiliary_loss_clip": 0.06524534, "auxiliary_loss_mlp": 0.01297772, "balance_loss_clip": 0.06314611, "balance_loss_mlp": 0.01280785, "epoch": 0.29460393807304974, "flos": 23740487683200.0, "grad_norm": 2.517721929539311, "language_loss": 0.79647535, "learning_rate": 3.3094003450676685e-06, "loss": 0.8746984, "num_input_tokens_seen": 105556735, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.16992188, "step": 4900, "time_per_iteration": 2.6101036071777344 }, { "auxiliary_loss_clip": 0.06523024, "auxiliary_loss_mlp": 0.0130748, "balance_loss_clip": 0.0631404, "balance_loss_mlp": 0.0129085, "epoch": 0.2946640613257177, "flos": 14981412977280.0, "grad_norm": 2.0101979447950655, "language_loss": 0.81293201, "learning_rate": 3.3091059305701268e-06, "loss": 0.89123714, "num_input_tokens_seen": 105574875, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.16625977, "step": 4901, "time_per_iteration": 2.5526180267333984 }, { "auxiliary_loss_clip": 0.06521351, "auxiliary_loss_mlp": 0.013059, "balance_loss_clip": 0.06319879, "balance_loss_mlp": 0.01290843, "epoch": 0.2947241845783857, "flos": 24250862102400.0, "grad_norm": 2.3422613401805634, "language_loss": 0.58370006, "learning_rate": 3.308811466431157e-06, "loss": 0.66197252, "num_input_tokens_seen": 105594225, "router_z_loss_clip": 2.01660156, "router_z_loss_mlp": 0.1505127, "step": 4902, "time_per_iteration": 2.611234426498413 }, { "auxiliary_loss_clip": 0.06522644, "auxiliary_loss_mlp": 0.01321787, "balance_loss_clip": 0.06313901, "balance_loss_mlp": 0.01306362, "epoch": 0.29478430783105364, "flos": 19944600744960.0, "grad_norm": 4.783195958842345, "language_loss": 0.76353341, "learning_rate": 3.308516952661925e-06, "loss": 0.84197772, "num_input_tokens_seen": 105614000, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.15429688, "step": 4903, "time_per_iteration": 2.5863454341888428 }, { "auxiliary_loss_clip": 0.06526309, "auxiliary_loss_mlp": 0.01322215, "balance_loss_clip": 0.06318706, "balance_loss_mlp": 0.01304369, "epoch": 0.2948444310837216, "flos": 27388774454400.0, "grad_norm": 2.1362748667574474, "language_loss": 0.62657022, "learning_rate": 3.3082223892736e-06, "loss": 0.70505548, "num_input_tokens_seen": 105634575, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.17834473, "step": 4904, "time_per_iteration": 2.6356093883514404 }, { "auxiliary_loss_clip": 0.06527567, "auxiliary_loss_mlp": 0.0132029, "balance_loss_clip": 0.06316342, "balance_loss_mlp": 0.0130372, "epoch": 0.2949045543363896, "flos": 23412401821440.0, "grad_norm": 2.9827694929452266, "language_loss": 0.73574746, "learning_rate": 3.3079277762773496e-06, "loss": 0.81422603, "num_input_tokens_seen": 105654385, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.16564941, "step": 4905, "time_per_iteration": 2.5957977771759033 }, { "auxiliary_loss_clip": 0.0652862, "auxiliary_loss_mlp": 0.01303769, "balance_loss_clip": 0.06321797, "balance_loss_mlp": 0.01287485, "epoch": 0.2949646775890576, "flos": 23958303171840.0, "grad_norm": 1.7403749226540302, "language_loss": 0.81926322, "learning_rate": 3.3076331136843476e-06, "loss": 0.89758712, "num_input_tokens_seen": 105673570, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.1628418, "step": 4906, "time_per_iteration": 2.608660936355591 }, { "auxiliary_loss_clip": 0.06529194, "auxiliary_loss_mlp": 0.01300901, "balance_loss_clip": 0.06326081, "balance_loss_mlp": 0.01285547, "epoch": 0.29502480084172555, "flos": 22791002342400.0, "grad_norm": 3.5103587782662364, "language_loss": 0.87593436, "learning_rate": 3.3073384015057667e-06, "loss": 0.95423532, "num_input_tokens_seen": 105691940, "router_z_loss_clip": 2.03320312, "router_z_loss_mlp": 0.15332031, "step": 4907, "time_per_iteration": 2.5907139778137207 }, { "auxiliary_loss_clip": 0.06539181, "auxiliary_loss_mlp": 0.01315671, "balance_loss_clip": 0.06326938, "balance_loss_mlp": 0.01298267, "epoch": 0.2950849240943935, "flos": 19652838428160.0, "grad_norm": 2.669626448411688, "language_loss": 0.83276689, "learning_rate": 3.307043639752782e-06, "loss": 0.91131544, "num_input_tokens_seen": 105709825, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.17382812, "step": 4908, "time_per_iteration": 2.582308530807495 }, { "auxiliary_loss_clip": 0.06432299, "auxiliary_loss_mlp": 0.01338102, "balance_loss_clip": 0.06327529, "balance_loss_mlp": 0.01330741, "epoch": 0.2951450473470615, "flos": 71021062010880.0, "grad_norm": 0.7943370537396709, "language_loss": 0.57353413, "learning_rate": 3.3067488284365728e-06, "loss": 0.65123808, "num_input_tokens_seen": 105766880, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.07342529, "step": 4909, "time_per_iteration": 4.581135988235474 }, { "auxiliary_loss_clip": 0.06530463, "auxiliary_loss_mlp": 0.01316765, "balance_loss_clip": 0.06323917, "balance_loss_mlp": 0.01301244, "epoch": 0.29520517059972945, "flos": 22972955483520.0, "grad_norm": 1.8157730743819034, "language_loss": 0.87051648, "learning_rate": 3.3064539675683163e-06, "loss": 0.9489888, "num_input_tokens_seen": 105786875, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.15527344, "step": 4910, "time_per_iteration": 2.607468605041504 }, { "auxiliary_loss_clip": 0.06528035, "auxiliary_loss_mlp": 0.01304152, "balance_loss_clip": 0.06325872, "balance_loss_mlp": 0.01288321, "epoch": 0.2952652938523974, "flos": 20491969541760.0, "grad_norm": 2.1095318655623703, "language_loss": 0.7296074, "learning_rate": 3.3061590571591946e-06, "loss": 0.80792928, "num_input_tokens_seen": 105805315, "router_z_loss_clip": 2.02148438, "router_z_loss_mlp": 0.15808105, "step": 4911, "time_per_iteration": 2.5961596965789795 }, { "auxiliary_loss_clip": 0.06531135, "auxiliary_loss_mlp": 0.01306787, "balance_loss_clip": 0.06327655, "balance_loss_mlp": 0.01291135, "epoch": 0.2953254171050654, "flos": 19652754574080.0, "grad_norm": 1.9377736286264986, "language_loss": 0.90254623, "learning_rate": 3.3058640972203904e-06, "loss": 0.98092544, "num_input_tokens_seen": 105825125, "router_z_loss_clip": 2.03515625, "router_z_loss_mlp": 0.15661621, "step": 4912, "time_per_iteration": 2.577460527420044 }, { "auxiliary_loss_clip": 0.06523362, "auxiliary_loss_mlp": 0.0131053, "balance_loss_clip": 0.06319091, "balance_loss_mlp": 0.01293936, "epoch": 0.29538554035773334, "flos": 22754678797440.0, "grad_norm": 12.993480581328422, "language_loss": 0.8371191, "learning_rate": 3.3055690877630894e-06, "loss": 0.91545796, "num_input_tokens_seen": 105846085, "router_z_loss_clip": 2.04199219, "router_z_loss_mlp": 0.16601562, "step": 4913, "time_per_iteration": 2.636124610900879 }, { "auxiliary_loss_clip": 0.06529211, "auxiliary_loss_mlp": 0.01297741, "balance_loss_clip": 0.06325194, "balance_loss_mlp": 0.01282703, "epoch": 0.2954456636104013, "flos": 21878343671040.0, "grad_norm": 1.783294905068158, "language_loss": 0.77209949, "learning_rate": 3.3052740287984765e-06, "loss": 0.85036898, "num_input_tokens_seen": 105865400, "router_z_loss_clip": 2.03808594, "router_z_loss_mlp": 0.15032959, "step": 4914, "time_per_iteration": 4.0245444774627686 }, { "auxiliary_loss_clip": 0.06533527, "auxiliary_loss_mlp": 0.01303666, "balance_loss_clip": 0.06329961, "balance_loss_mlp": 0.01286976, "epoch": 0.2955057868630693, "flos": 40452056092800.0, "grad_norm": 3.7263713916863024, "language_loss": 0.82215863, "learning_rate": 3.3049789203377424e-06, "loss": 0.90053058, "num_input_tokens_seen": 105887920, "router_z_loss_clip": 2.03222656, "router_z_loss_mlp": 0.16687012, "step": 4915, "time_per_iteration": 2.7606866359710693 }, { "auxiliary_loss_clip": 0.0653432, "auxiliary_loss_mlp": 0.01297959, "balance_loss_clip": 0.06326899, "balance_loss_mlp": 0.01281747, "epoch": 0.29556591011573724, "flos": 22571006647680.0, "grad_norm": 2.3311031561341635, "language_loss": 0.84741443, "learning_rate": 3.3046837623920772e-06, "loss": 0.92573726, "num_input_tokens_seen": 105904035, "router_z_loss_clip": 2.07519531, "router_z_loss_mlp": 0.16210938, "step": 4916, "time_per_iteration": 2.601069211959839 }, { "auxiliary_loss_clip": 0.06539688, "auxiliary_loss_mlp": 0.01302751, "balance_loss_clip": 0.06333888, "balance_loss_mlp": 0.01287385, "epoch": 0.2956260333684052, "flos": 22095572181120.0, "grad_norm": 4.02965232424233, "language_loss": 0.70528793, "learning_rate": 3.3043885549726723e-06, "loss": 0.78371227, "num_input_tokens_seen": 105922685, "router_z_loss_clip": 2.05371094, "router_z_loss_mlp": 0.15356445, "step": 4917, "time_per_iteration": 2.59769868850708 }, { "auxiliary_loss_clip": 0.06532642, "auxiliary_loss_mlp": 0.0129711, "balance_loss_clip": 0.06326933, "balance_loss_mlp": 0.01280886, "epoch": 0.2956861566210732, "flos": 16441063102080.0, "grad_norm": 2.4748168835779243, "language_loss": 0.91236395, "learning_rate": 3.3040932980907226e-06, "loss": 0.99066156, "num_input_tokens_seen": 105940425, "router_z_loss_clip": 2.05566406, "router_z_loss_mlp": 0.16223145, "step": 4918, "time_per_iteration": 2.554405927658081 }, { "auxiliary_loss_clip": 0.06541637, "auxiliary_loss_mlp": 0.01294611, "balance_loss_clip": 0.06335266, "balance_loss_mlp": 0.01277076, "epoch": 0.2957462798737412, "flos": 25819189372800.0, "grad_norm": 43.21581921523074, "language_loss": 0.73285317, "learning_rate": 3.303797991757425e-06, "loss": 0.81121564, "num_input_tokens_seen": 105960550, "router_z_loss_clip": 2.06347656, "router_z_loss_mlp": 0.17541504, "step": 4919, "time_per_iteration": 2.628422975540161 }, { "auxiliary_loss_clip": 0.06527507, "auxiliary_loss_mlp": 0.01303588, "balance_loss_clip": 0.06324376, "balance_loss_mlp": 0.012879, "epoch": 0.29580640312640916, "flos": 16696459946880.0, "grad_norm": 1.8278083299822305, "language_loss": 0.76532942, "learning_rate": 3.3035026359839763e-06, "loss": 0.84364033, "num_input_tokens_seen": 105978820, "router_z_loss_clip": 2.02832031, "router_z_loss_mlp": 0.15686035, "step": 4920, "time_per_iteration": 2.561267375946045 }, { "auxiliary_loss_clip": 0.06536074, "auxiliary_loss_mlp": 0.0130334, "balance_loss_clip": 0.06327476, "balance_loss_mlp": 0.0128708, "epoch": 0.2958665263790771, "flos": 23951427137280.0, "grad_norm": 37.137240791216975, "language_loss": 0.69797808, "learning_rate": 3.3032072307815774e-06, "loss": 0.77637219, "num_input_tokens_seen": 105997545, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.16259766, "step": 4921, "time_per_iteration": 4.0389087200164795 }, { "auxiliary_loss_clip": 0.06540757, "auxiliary_loss_mlp": 0.01319255, "balance_loss_clip": 0.06328686, "balance_loss_mlp": 0.0130272, "epoch": 0.2959266496317451, "flos": 18484279787520.0, "grad_norm": 5.214868746804024, "language_loss": 0.74610734, "learning_rate": 3.3029117761614298e-06, "loss": 0.82470745, "num_input_tokens_seen": 106015320, "router_z_loss_clip": 2.12402344, "router_z_loss_mlp": 0.1652832, "step": 4922, "time_per_iteration": 2.5722384452819824 }, { "auxiliary_loss_clip": 0.0653318, "auxiliary_loss_mlp": 0.01303262, "balance_loss_clip": 0.06321026, "balance_loss_mlp": 0.01285726, "epoch": 0.29598677288441305, "flos": 25964525479680.0, "grad_norm": 2.5138416323405486, "language_loss": 0.77209336, "learning_rate": 3.302616272134737e-06, "loss": 0.85045779, "num_input_tokens_seen": 106034555, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.17529297, "step": 4923, "time_per_iteration": 2.6047182083129883 }, { "auxiliary_loss_clip": 0.06531788, "auxiliary_loss_mlp": 0.01304893, "balance_loss_clip": 0.0632503, "balance_loss_mlp": 0.01289205, "epoch": 0.296046896137081, "flos": 25163101503360.0, "grad_norm": 1.6584950854022187, "language_loss": 0.87032902, "learning_rate": 3.3023207187127042e-06, "loss": 0.94869584, "num_input_tokens_seen": 106054200, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.15686035, "step": 4924, "time_per_iteration": 4.070467948913574 }, { "auxiliary_loss_clip": 0.06527227, "auxiliary_loss_mlp": 0.01283853, "balance_loss_clip": 0.06322847, "balance_loss_mlp": 0.01268391, "epoch": 0.296107019389749, "flos": 21767402465280.0, "grad_norm": 1.5756630195990442, "language_loss": 0.81948733, "learning_rate": 3.3020251159065396e-06, "loss": 0.89759815, "num_input_tokens_seen": 106074700, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.15454102, "step": 4925, "time_per_iteration": 2.604063034057617 }, { "auxiliary_loss_clip": 0.06519216, "auxiliary_loss_mlp": 0.01293137, "balance_loss_clip": 0.06314784, "balance_loss_mlp": 0.01277079, "epoch": 0.29616714264241695, "flos": 17964555638400.0, "grad_norm": 4.080993670340291, "language_loss": 0.86567295, "learning_rate": 3.301729463727452e-06, "loss": 0.94379652, "num_input_tokens_seen": 106091415, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.16040039, "step": 4926, "time_per_iteration": 2.543314218521118 }, { "auxiliary_loss_clip": 0.06531481, "auxiliary_loss_mlp": 0.01290895, "balance_loss_clip": 0.06322102, "balance_loss_mlp": 0.0127454, "epoch": 0.2962272658950849, "flos": 15018155792640.0, "grad_norm": 4.609229713591211, "language_loss": 0.86387765, "learning_rate": 3.3014337621866527e-06, "loss": 0.94210142, "num_input_tokens_seen": 106109135, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.16345215, "step": 4927, "time_per_iteration": 2.558847188949585 }, { "auxiliary_loss_clip": 0.06525593, "auxiliary_loss_mlp": 0.01294977, "balance_loss_clip": 0.06322837, "balance_loss_mlp": 0.01279825, "epoch": 0.2962873891477529, "flos": 14726183840640.0, "grad_norm": 1.7840916194500005, "language_loss": 0.8094058, "learning_rate": 3.3011380112953553e-06, "loss": 0.88761151, "num_input_tokens_seen": 106125750, "router_z_loss_clip": 2.02636719, "router_z_loss_mlp": 0.15148926, "step": 4928, "time_per_iteration": 2.557969331741333 }, { "auxiliary_loss_clip": 0.06552966, "auxiliary_loss_mlp": 0.01291685, "balance_loss_clip": 0.06337849, "balance_loss_mlp": 0.01273327, "epoch": 0.29634751240042084, "flos": 26730967576320.0, "grad_norm": 2.525063321410948, "language_loss": 0.7359975, "learning_rate": 3.300842211064773e-06, "loss": 0.81444401, "num_input_tokens_seen": 106142835, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.18347168, "step": 4929, "time_per_iteration": 2.625649929046631 }, { "auxiliary_loss_clip": 0.06538752, "auxiliary_loss_mlp": 0.01282524, "balance_loss_clip": 0.06329586, "balance_loss_mlp": 0.01265107, "epoch": 0.2964076356530888, "flos": 14575984197120.0, "grad_norm": 2.8785618574966882, "language_loss": 0.72559607, "learning_rate": 3.3005463615061246e-06, "loss": 0.80380881, "num_input_tokens_seen": 106160680, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.17419434, "step": 4930, "time_per_iteration": 2.5558481216430664 }, { "auxiliary_loss_clip": 0.06456135, "auxiliary_loss_mlp": 0.01305492, "balance_loss_clip": 0.06354635, "balance_loss_mlp": 0.01298566, "epoch": 0.29646775890575683, "flos": 63124387925760.0, "grad_norm": 0.8063313442704761, "language_loss": 0.6069681, "learning_rate": 3.3002504626306275e-06, "loss": 0.68458438, "num_input_tokens_seen": 106224415, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 0.06939697, "step": 4931, "time_per_iteration": 3.156874656677246 }, { "auxiliary_loss_clip": 0.06460001, "auxiliary_loss_mlp": 0.01300349, "balance_loss_clip": 0.06359155, "balance_loss_mlp": 0.01292988, "epoch": 0.2965278821584248, "flos": 63087728964480.0, "grad_norm": 0.7384579541671625, "language_loss": 0.52379119, "learning_rate": 3.2999545144495023e-06, "loss": 0.60139465, "num_input_tokens_seen": 106279140, "router_z_loss_clip": 1.00683594, "router_z_loss_mlp": 0.07342529, "step": 4932, "time_per_iteration": 3.1158740520477295 }, { "auxiliary_loss_clip": 0.06535073, "auxiliary_loss_mlp": 0.01284039, "balance_loss_clip": 0.06330249, "balance_loss_mlp": 0.01268423, "epoch": 0.29658800541109276, "flos": 23775469562880.0, "grad_norm": 1.8198665169829957, "language_loss": 0.82176173, "learning_rate": 3.299658516973972e-06, "loss": 0.89995283, "num_input_tokens_seen": 106298190, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.15612793, "step": 4933, "time_per_iteration": 2.597083330154419 }, { "auxiliary_loss_clip": 0.06525296, "auxiliary_loss_mlp": 0.012757, "balance_loss_clip": 0.06325236, "balance_loss_mlp": 0.0126043, "epoch": 0.2966481286637607, "flos": 23995465257600.0, "grad_norm": 2.174559798016131, "language_loss": 0.75757003, "learning_rate": 3.299362470215261e-06, "loss": 0.83557999, "num_input_tokens_seen": 106319065, "router_z_loss_clip": 2.00292969, "router_z_loss_mlp": 0.15283203, "step": 4934, "time_per_iteration": 2.6301026344299316 }, { "auxiliary_loss_clip": 0.06540419, "auxiliary_loss_mlp": 0.01285814, "balance_loss_clip": 0.06330694, "balance_loss_mlp": 0.01269315, "epoch": 0.2967082519164287, "flos": 17170846237440.0, "grad_norm": 1.9214157507022818, "language_loss": 0.63357013, "learning_rate": 3.299066374184594e-06, "loss": 0.71183252, "num_input_tokens_seen": 106338040, "router_z_loss_clip": 2.09863281, "router_z_loss_mlp": 0.16491699, "step": 4935, "time_per_iteration": 2.5864739418029785 }, { "auxiliary_loss_clip": 0.06525029, "auxiliary_loss_mlp": 0.01278626, "balance_loss_clip": 0.06321764, "balance_loss_mlp": 0.01262163, "epoch": 0.29676837516909665, "flos": 29395416032640.0, "grad_norm": 2.0392642884292846, "language_loss": 0.79657239, "learning_rate": 3.2987702288932e-06, "loss": 0.87460899, "num_input_tokens_seen": 106358900, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.16455078, "step": 4936, "time_per_iteration": 2.6507461071014404 }, { "auxiliary_loss_clip": 0.06537752, "auxiliary_loss_mlp": 0.01277981, "balance_loss_clip": 0.06326343, "balance_loss_mlp": 0.01260397, "epoch": 0.2968284984217646, "flos": 34759839876480.0, "grad_norm": 2.26409853617813, "language_loss": 0.74548519, "learning_rate": 3.298474034352309e-06, "loss": 0.82364249, "num_input_tokens_seen": 106381805, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.17578125, "step": 4937, "time_per_iteration": 2.6942555904388428 }, { "auxiliary_loss_clip": 0.06526694, "auxiliary_loss_mlp": 0.01275, "balance_loss_clip": 0.06320123, "balance_loss_mlp": 0.0125818, "epoch": 0.2968886216744326, "flos": 21550635152640.0, "grad_norm": 1.5156621560869867, "language_loss": 0.78480184, "learning_rate": 3.2981777905731526e-06, "loss": 0.86281878, "num_input_tokens_seen": 106402365, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.16809082, "step": 4938, "time_per_iteration": 2.589751958847046 }, { "auxiliary_loss_clip": 0.06526064, "auxiliary_loss_mlp": 0.01278176, "balance_loss_clip": 0.06316258, "balance_loss_mlp": 0.01260414, "epoch": 0.29694874492710055, "flos": 12792357060480.0, "grad_norm": 2.146006183899097, "language_loss": 0.77268982, "learning_rate": 3.297881497566964e-06, "loss": 0.85073227, "num_input_tokens_seen": 106419800, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.1776123, "step": 4939, "time_per_iteration": 2.5580811500549316 }, { "auxiliary_loss_clip": 0.06535776, "auxiliary_loss_mlp": 0.01274395, "balance_loss_clip": 0.06321356, "balance_loss_mlp": 0.0125804, "epoch": 0.2970088681797685, "flos": 24576600049920.0, "grad_norm": 2.1501558657504285, "language_loss": 0.78352642, "learning_rate": 3.297585155344979e-06, "loss": 0.86162817, "num_input_tokens_seen": 106440300, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.16345215, "step": 4940, "time_per_iteration": 2.7519454956054688 }, { "auxiliary_loss_clip": 0.06526375, "auxiliary_loss_mlp": 0.01279858, "balance_loss_clip": 0.06314685, "balance_loss_mlp": 0.01261786, "epoch": 0.2970689914324365, "flos": 23665870022400.0, "grad_norm": 1.986613271367337, "language_loss": 0.75410247, "learning_rate": 3.297288763918435e-06, "loss": 0.83216476, "num_input_tokens_seen": 106460035, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.18066406, "step": 4941, "time_per_iteration": 2.609699249267578 }, { "auxiliary_loss_clip": 0.06521668, "auxiliary_loss_mlp": 0.01280787, "balance_loss_clip": 0.06309145, "balance_loss_mlp": 0.01263561, "epoch": 0.29712911468510445, "flos": 39678654107520.0, "grad_norm": 2.475314589998328, "language_loss": 0.74636942, "learning_rate": 3.2969923232985712e-06, "loss": 0.82439405, "num_input_tokens_seen": 106481095, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.17224121, "step": 4942, "time_per_iteration": 2.7417855262756348 }, { "auxiliary_loss_clip": 0.06522237, "auxiliary_loss_mlp": 0.01284237, "balance_loss_clip": 0.06307226, "balance_loss_mlp": 0.01265331, "epoch": 0.2971892379377724, "flos": 26402420517120.0, "grad_norm": 2.1556586742818156, "language_loss": 0.70869303, "learning_rate": 3.2966958334966287e-06, "loss": 0.78675777, "num_input_tokens_seen": 106501590, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.18908691, "step": 4943, "time_per_iteration": 2.736639976501465 }, { "auxiliary_loss_clip": 0.06523765, "auxiliary_loss_mlp": 0.01274882, "balance_loss_clip": 0.0631182, "balance_loss_mlp": 0.01258181, "epoch": 0.2972493611904404, "flos": 17608992837120.0, "grad_norm": 2.3398099502210097, "language_loss": 0.80568433, "learning_rate": 3.2963992945238497e-06, "loss": 0.88367081, "num_input_tokens_seen": 106519430, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.16699219, "step": 4944, "time_per_iteration": 2.6056437492370605 }, { "auxiliary_loss_clip": 0.06514431, "auxiliary_loss_mlp": 0.01278901, "balance_loss_clip": 0.06311518, "balance_loss_mlp": 0.01262462, "epoch": 0.2973094844431084, "flos": 20419070889600.0, "grad_norm": 2.8879587980022836, "language_loss": 0.83710748, "learning_rate": 3.2961027063914795e-06, "loss": 0.91504079, "num_input_tokens_seen": 106535870, "router_z_loss_clip": 2.03027344, "router_z_loss_mlp": 0.16442871, "step": 4945, "time_per_iteration": 2.5793676376342773 }, { "auxiliary_loss_clip": 0.06510831, "auxiliary_loss_mlp": 0.01281024, "balance_loss_clip": 0.06307137, "balance_loss_mlp": 0.01264824, "epoch": 0.29736960769577636, "flos": 17499225588480.0, "grad_norm": 2.891891969391055, "language_loss": 0.67858958, "learning_rate": 3.2958060691107654e-06, "loss": 0.75650811, "num_input_tokens_seen": 106553560, "router_z_loss_clip": 2.03613281, "router_z_loss_mlp": 0.16186523, "step": 4946, "time_per_iteration": 2.564002275466919 }, { "auxiliary_loss_clip": 0.06515396, "auxiliary_loss_mlp": 0.0128893, "balance_loss_clip": 0.06307495, "balance_loss_mlp": 0.01272086, "epoch": 0.2974297309484443, "flos": 26111119397760.0, "grad_norm": 2.23686811873874, "language_loss": 0.74742675, "learning_rate": 3.2955093826929547e-06, "loss": 0.82546997, "num_input_tokens_seen": 106574115, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.16845703, "step": 4947, "time_per_iteration": 2.631344795227051 }, { "auxiliary_loss_clip": 0.06520368, "auxiliary_loss_mlp": 0.01285215, "balance_loss_clip": 0.06309755, "balance_loss_mlp": 0.01268299, "epoch": 0.2974898542011123, "flos": 25673559776640.0, "grad_norm": 2.2405769588181377, "language_loss": 0.73700732, "learning_rate": 3.2952126471492985e-06, "loss": 0.81506318, "num_input_tokens_seen": 106593070, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.16918945, "step": 4948, "time_per_iteration": 2.604691743850708 }, { "auxiliary_loss_clip": 0.0650612, "auxiliary_loss_mlp": 0.01282873, "balance_loss_clip": 0.06304038, "balance_loss_mlp": 0.01267543, "epoch": 0.29754997745378026, "flos": 18667323031680.0, "grad_norm": 2.483282567901815, "language_loss": 0.83880651, "learning_rate": 3.2949158624910497e-06, "loss": 0.91669643, "num_input_tokens_seen": 106610695, "router_z_loss_clip": 2.01953125, "router_z_loss_mlp": 0.15319824, "step": 4949, "time_per_iteration": 4.083361625671387 }, { "auxiliary_loss_clip": 0.0650886, "auxiliary_loss_mlp": 0.01290001, "balance_loss_clip": 0.06304082, "balance_loss_mlp": 0.01273074, "epoch": 0.2976101007064482, "flos": 22281382609920.0, "grad_norm": 4.335804474658739, "language_loss": 0.71417201, "learning_rate": 3.2946190287294603e-06, "loss": 0.79216057, "num_input_tokens_seen": 106631300, "router_z_loss_clip": 2.04785156, "router_z_loss_mlp": 0.16931152, "step": 4950, "time_per_iteration": 2.597932815551758 }, { "auxiliary_loss_clip": 0.06508145, "auxiliary_loss_mlp": 0.01288486, "balance_loss_clip": 0.06311886, "balance_loss_mlp": 0.01272023, "epoch": 0.2976702239591162, "flos": 21952290499200.0, "grad_norm": 3.641728944621253, "language_loss": 0.83158529, "learning_rate": 3.294322145875789e-06, "loss": 0.90955162, "num_input_tokens_seen": 106650065, "router_z_loss_clip": 1.96289062, "router_z_loss_mlp": 0.16467285, "step": 4951, "time_per_iteration": 2.5752298831939697 }, { "auxiliary_loss_clip": 0.06514266, "auxiliary_loss_mlp": 0.01287739, "balance_loss_clip": 0.06305991, "balance_loss_mlp": 0.01271479, "epoch": 0.29773034721178415, "flos": 24642874229760.0, "grad_norm": 9.534920760764782, "language_loss": 0.74120438, "learning_rate": 3.2940252139412912e-06, "loss": 0.81922448, "num_input_tokens_seen": 106668230, "router_z_loss_clip": 2.08300781, "router_z_loss_mlp": 0.16247559, "step": 4952, "time_per_iteration": 2.593264102935791 }, { "auxiliary_loss_clip": 0.06512106, "auxiliary_loss_mlp": 0.01289874, "balance_loss_clip": 0.06307612, "balance_loss_mlp": 0.01271838, "epoch": 0.2977904704644521, "flos": 20563694236800.0, "grad_norm": 2.7019274564291362, "language_loss": 0.84065896, "learning_rate": 3.293728232937228e-06, "loss": 0.91867876, "num_input_tokens_seen": 106687785, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.18041992, "step": 4953, "time_per_iteration": 2.5825135707855225 }, { "auxiliary_loss_clip": 0.06515286, "auxiliary_loss_mlp": 0.0128949, "balance_loss_clip": 0.06306268, "balance_loss_mlp": 0.01272801, "epoch": 0.2978505937171201, "flos": 18922426387200.0, "grad_norm": 2.286528844217973, "language_loss": 0.74643528, "learning_rate": 3.2934312028748597e-06, "loss": 0.82448304, "num_input_tokens_seen": 106706875, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.16699219, "step": 4954, "time_per_iteration": 3.9950692653656006 }, { "auxiliary_loss_clip": 0.06512871, "auxiliary_loss_mlp": 0.01283266, "balance_loss_clip": 0.06308682, "balance_loss_mlp": 0.01267912, "epoch": 0.29791071696978805, "flos": 19323788244480.0, "grad_norm": 8.167917267370502, "language_loss": 0.75852281, "learning_rate": 3.293134123765452e-06, "loss": 0.83648419, "num_input_tokens_seen": 106725105, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.15368652, "step": 4955, "time_per_iteration": 2.570254325866699 }, { "auxiliary_loss_clip": 0.06521112, "auxiliary_loss_mlp": 0.01286555, "balance_loss_clip": 0.06311686, "balance_loss_mlp": 0.01270057, "epoch": 0.297970840222456, "flos": 18812742992640.0, "grad_norm": 2.988354997814855, "language_loss": 0.73026776, "learning_rate": 3.2928369956202684e-06, "loss": 0.80834442, "num_input_tokens_seen": 106744780, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.16503906, "step": 4956, "time_per_iteration": 2.581080913543701 }, { "auxiliary_loss_clip": 0.06524287, "auxiliary_loss_mlp": 0.0128662, "balance_loss_clip": 0.06311414, "balance_loss_mlp": 0.01269347, "epoch": 0.298030963475124, "flos": 22858702041600.0, "grad_norm": 1.8409923601352685, "language_loss": 0.79387391, "learning_rate": 3.2925398184505754e-06, "loss": 0.87198293, "num_input_tokens_seen": 106764670, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.17272949, "step": 4957, "time_per_iteration": 2.572296380996704 }, { "auxiliary_loss_clip": 0.06524819, "auxiliary_loss_mlp": 0.01285594, "balance_loss_clip": 0.06318974, "balance_loss_mlp": 0.01268797, "epoch": 0.298091086727792, "flos": 21874402529280.0, "grad_norm": 1.5655047521910228, "language_loss": 0.70318246, "learning_rate": 3.2922425922676437e-06, "loss": 0.7812866, "num_input_tokens_seen": 106783695, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.16784668, "step": 4958, "time_per_iteration": 2.588003158569336 }, { "auxiliary_loss_clip": 0.06518609, "auxiliary_loss_mlp": 0.01290139, "balance_loss_clip": 0.06316118, "balance_loss_mlp": 0.01272937, "epoch": 0.29815120998045996, "flos": 21180775230720.0, "grad_norm": 5.143694509387509, "language_loss": 0.7933898, "learning_rate": 3.291945317082743e-06, "loss": 0.87147725, "num_input_tokens_seen": 106803150, "router_z_loss_clip": 2.0234375, "router_z_loss_mlp": 0.17199707, "step": 4959, "time_per_iteration": 2.5898239612579346 }, { "auxiliary_loss_clip": 0.06518684, "auxiliary_loss_mlp": 0.01281787, "balance_loss_clip": 0.06314667, "balance_loss_mlp": 0.01265801, "epoch": 0.29821133323312793, "flos": 19901526946560.0, "grad_norm": 1.9622626450374199, "language_loss": 0.79821998, "learning_rate": 3.291647992907147e-06, "loss": 0.8762247, "num_input_tokens_seen": 106820705, "router_z_loss_clip": 2.0390625, "router_z_loss_mlp": 0.15979004, "step": 4960, "time_per_iteration": 2.5909979343414307 }, { "auxiliary_loss_clip": 0.06523488, "auxiliary_loss_mlp": 0.01291946, "balance_loss_clip": 0.06314275, "balance_loss_mlp": 0.01274553, "epoch": 0.2982714564857959, "flos": 12755781953280.0, "grad_norm": 4.077020605874822, "language_loss": 0.74438328, "learning_rate": 3.291350619752129e-06, "loss": 0.82253766, "num_input_tokens_seen": 106837335, "router_z_loss_clip": 2.09082031, "router_z_loss_mlp": 0.17407227, "step": 4961, "time_per_iteration": 3.989494800567627 }, { "auxiliary_loss_clip": 0.06521679, "auxiliary_loss_mlp": 0.01276422, "balance_loss_clip": 0.06315225, "balance_loss_mlp": 0.01259995, "epoch": 0.29833157973846386, "flos": 22278238081920.0, "grad_norm": 1.9220780545590639, "language_loss": 0.62366867, "learning_rate": 3.291053197628967e-06, "loss": 0.70164967, "num_input_tokens_seen": 106856250, "router_z_loss_clip": 2.06445312, "router_z_loss_mlp": 0.16430664, "step": 4962, "time_per_iteration": 2.586150646209717 }, { "auxiliary_loss_clip": 0.06519747, "auxiliary_loss_mlp": 0.01286692, "balance_loss_clip": 0.06315604, "balance_loss_mlp": 0.01270181, "epoch": 0.2983917029911318, "flos": 15377659735680.0, "grad_norm": 1.853219722461803, "language_loss": 0.82900733, "learning_rate": 3.2907557265489375e-06, "loss": 0.90707171, "num_input_tokens_seen": 106873370, "router_z_loss_clip": 2.04003906, "router_z_loss_mlp": 0.16503906, "step": 4963, "time_per_iteration": 3.9594357013702393 }, { "auxiliary_loss_clip": 0.06523488, "auxiliary_loss_mlp": 0.01282848, "balance_loss_clip": 0.06320876, "balance_loss_mlp": 0.01266278, "epoch": 0.2984518262437998, "flos": 15383068323840.0, "grad_norm": 2.9157774333174333, "language_loss": 0.66911477, "learning_rate": 3.290458206523322e-06, "loss": 0.74717808, "num_input_tokens_seen": 106890330, "router_z_loss_clip": 2.02929688, "router_z_loss_mlp": 0.16577148, "step": 4964, "time_per_iteration": 2.560418128967285 }, { "auxiliary_loss_clip": 0.06522017, "auxiliary_loss_mlp": 0.01272275, "balance_loss_clip": 0.06319667, "balance_loss_mlp": 0.01256933, "epoch": 0.29851194949646775, "flos": 18113413616640.0, "grad_norm": 2.1936589220475757, "language_loss": 0.71346694, "learning_rate": 3.2901606375634015e-06, "loss": 0.79140985, "num_input_tokens_seen": 106909190, "router_z_loss_clip": 2.0234375, "router_z_loss_mlp": 0.15332031, "step": 4965, "time_per_iteration": 2.572920560836792 }, { "auxiliary_loss_clip": 0.06526625, "auxiliary_loss_mlp": 0.01276327, "balance_loss_clip": 0.06319514, "balance_loss_mlp": 0.01259768, "epoch": 0.2985720727491357, "flos": 22024811808000.0, "grad_norm": 2.5137248809204653, "language_loss": 0.67071551, "learning_rate": 3.289863019680461e-06, "loss": 0.74874508, "num_input_tokens_seen": 106927825, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.16540527, "step": 4966, "time_per_iteration": 2.5919857025146484 }, { "auxiliary_loss_clip": 0.06532617, "auxiliary_loss_mlp": 0.01271629, "balance_loss_clip": 0.06327584, "balance_loss_mlp": 0.01255512, "epoch": 0.2986321960018037, "flos": 13046202604800.0, "grad_norm": 2.997105946993406, "language_loss": 0.74265718, "learning_rate": 3.289565352885785e-06, "loss": 0.82069963, "num_input_tokens_seen": 106943155, "router_z_loss_clip": 2.04980469, "router_z_loss_mlp": 0.16125488, "step": 4967, "time_per_iteration": 2.5340499877929688 }, { "auxiliary_loss_clip": 0.06521703, "auxiliary_loss_mlp": 0.01274487, "balance_loss_clip": 0.0631595, "balance_loss_mlp": 0.01258311, "epoch": 0.29869231925447165, "flos": 14470241944320.0, "grad_norm": 2.1637779493572076, "language_loss": 0.71278429, "learning_rate": 3.2892676371906614e-06, "loss": 0.79074615, "num_input_tokens_seen": 106960295, "router_z_loss_clip": 2.05761719, "router_z_loss_mlp": 0.16174316, "step": 4968, "time_per_iteration": 2.5613553524017334 }, { "auxiliary_loss_clip": 0.065208, "auxiliary_loss_mlp": 0.01274962, "balance_loss_clip": 0.06313076, "balance_loss_mlp": 0.01258893, "epoch": 0.2987524425071396, "flos": 31658376850560.0, "grad_norm": 1.7615285401978518, "language_loss": 0.77011538, "learning_rate": 3.2889698726063805e-06, "loss": 0.84807301, "num_input_tokens_seen": 106982870, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.16064453, "step": 4969, "time_per_iteration": 2.6895222663879395 }, { "auxiliary_loss_clip": 0.0652176, "auxiliary_loss_mlp": 0.01272759, "balance_loss_clip": 0.06316209, "balance_loss_mlp": 0.01257214, "epoch": 0.2988125657598076, "flos": 21439735873920.0, "grad_norm": 2.0060971981799565, "language_loss": 0.70823443, "learning_rate": 3.2886720591442327e-06, "loss": 0.78617966, "num_input_tokens_seen": 107002405, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.15527344, "step": 4970, "time_per_iteration": 2.5689079761505127 }, { "auxiliary_loss_clip": 0.06533752, "auxiliary_loss_mlp": 0.01282247, "balance_loss_clip": 0.0632088, "balance_loss_mlp": 0.012647, "epoch": 0.2988726890124756, "flos": 18082750222080.0, "grad_norm": 2.819498108416601, "language_loss": 0.8546617, "learning_rate": 3.2883741968155103e-06, "loss": 0.93282163, "num_input_tokens_seen": 107017310, "router_z_loss_clip": 2.12988281, "router_z_loss_mlp": 0.17553711, "step": 4971, "time_per_iteration": 2.5580263137817383 }, { "auxiliary_loss_clip": 0.06533825, "auxiliary_loss_mlp": 0.01276789, "balance_loss_clip": 0.06333785, "balance_loss_mlp": 0.01260767, "epoch": 0.29893281226514357, "flos": 21760987628160.0, "grad_norm": 2.408571896611228, "language_loss": 0.79293001, "learning_rate": 3.2880762856315107e-06, "loss": 0.87103617, "num_input_tokens_seen": 107034645, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.16027832, "step": 4972, "time_per_iteration": 2.5884478092193604 }, { "auxiliary_loss_clip": 0.06524137, "auxiliary_loss_mlp": 0.01282785, "balance_loss_clip": 0.06320066, "balance_loss_mlp": 0.01267335, "epoch": 0.29899293551781153, "flos": 16842341105280.0, "grad_norm": 3.5162821562465436, "language_loss": 0.85700941, "learning_rate": 3.2877783256035285e-06, "loss": 0.93507862, "num_input_tokens_seen": 107051125, "router_z_loss_clip": 2.04101562, "router_z_loss_mlp": 0.15447998, "step": 4973, "time_per_iteration": 2.5747110843658447 }, { "auxiliary_loss_clip": 0.06523129, "auxiliary_loss_mlp": 0.01273658, "balance_loss_clip": 0.06326596, "balance_loss_mlp": 0.01258017, "epoch": 0.2990530587704795, "flos": 11734068792960.0, "grad_norm": 1.7112096987837646, "language_loss": 0.78064799, "learning_rate": 3.287480316742863e-06, "loss": 0.85861588, "num_input_tokens_seen": 107068815, "router_z_loss_clip": 1.96484375, "router_z_loss_mlp": 0.15637207, "step": 4974, "time_per_iteration": 2.558079242706299 }, { "auxiliary_loss_clip": 0.0653069, "auxiliary_loss_mlp": 0.01279649, "balance_loss_clip": 0.06324186, "balance_loss_mlp": 0.01263556, "epoch": 0.29911318202314746, "flos": 28047713362560.0, "grad_norm": 1.8712390256725948, "language_loss": 0.73088056, "learning_rate": 3.287182259060815e-06, "loss": 0.80898392, "num_input_tokens_seen": 107090420, "router_z_loss_clip": 2.06542969, "router_z_loss_mlp": 0.16088867, "step": 4975, "time_per_iteration": 2.653242588043213 }, { "auxiliary_loss_clip": 0.06521629, "auxiliary_loss_mlp": 0.01281383, "balance_loss_clip": 0.06317043, "balance_loss_mlp": 0.01265767, "epoch": 0.2991733052758154, "flos": 18739425070080.0, "grad_norm": 8.329040813113247, "language_loss": 0.76988018, "learning_rate": 3.286884152568687e-06, "loss": 0.84791034, "num_input_tokens_seen": 107107255, "router_z_loss_clip": 2.04589844, "router_z_loss_mlp": 0.15612793, "step": 4976, "time_per_iteration": 2.552610397338867 }, { "auxiliary_loss_clip": 0.06532098, "auxiliary_loss_mlp": 0.01277844, "balance_loss_clip": 0.06328312, "balance_loss_mlp": 0.0126184, "epoch": 0.2992334285284834, "flos": 15564476413440.0, "grad_norm": 2.0750200776332157, "language_loss": 0.87493241, "learning_rate": 3.2865859972777827e-06, "loss": 0.9530319, "num_input_tokens_seen": 107123840, "router_z_loss_clip": 2.03320312, "router_z_loss_mlp": 0.15997314, "step": 4977, "time_per_iteration": 2.5675160884857178 }, { "auxiliary_loss_clip": 0.06530023, "auxiliary_loss_mlp": 0.01280745, "balance_loss_clip": 0.06326908, "balance_loss_mlp": 0.01265033, "epoch": 0.29929355178115136, "flos": 21803809864320.0, "grad_norm": 1.477213562406726, "language_loss": 0.68627322, "learning_rate": 3.2862877931994088e-06, "loss": 0.76438081, "num_input_tokens_seen": 107143475, "router_z_loss_clip": 2.03222656, "router_z_loss_mlp": 0.15710449, "step": 4978, "time_per_iteration": 2.576314687728882 }, { "auxiliary_loss_clip": 0.06536903, "auxiliary_loss_mlp": 0.01281195, "balance_loss_clip": 0.06333929, "balance_loss_mlp": 0.0126509, "epoch": 0.2993536750338193, "flos": 21184884080640.0, "grad_norm": 5.057313223758887, "language_loss": 0.77220333, "learning_rate": 3.2859895403448726e-06, "loss": 0.85038435, "num_input_tokens_seen": 107161725, "router_z_loss_clip": 2.02734375, "router_z_loss_mlp": 0.16101074, "step": 4979, "time_per_iteration": 2.5833332538604736 }, { "auxiliary_loss_clip": 0.0652969, "auxiliary_loss_mlp": 0.01287558, "balance_loss_clip": 0.06322487, "balance_loss_mlp": 0.01271227, "epoch": 0.2994137982864873, "flos": 32129954029440.0, "grad_norm": 2.000212691780347, "language_loss": 0.69215882, "learning_rate": 3.285691238725484e-06, "loss": 0.77033126, "num_input_tokens_seen": 107183935, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.16333008, "step": 4980, "time_per_iteration": 2.6740031242370605 }, { "auxiliary_loss_clip": 0.06519407, "auxiliary_loss_mlp": 0.01280674, "balance_loss_clip": 0.06316392, "balance_loss_mlp": 0.01264784, "epoch": 0.29947392153915525, "flos": 21111733866240.0, "grad_norm": 9.403157187236177, "language_loss": 0.74107575, "learning_rate": 3.285392888352555e-06, "loss": 0.8190766, "num_input_tokens_seen": 107204285, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.15869141, "step": 4981, "time_per_iteration": 2.6028530597686768 }, { "auxiliary_loss_clip": 0.06528297, "auxiliary_loss_mlp": 0.01279736, "balance_loss_clip": 0.06316797, "balance_loss_mlp": 0.0126232, "epoch": 0.2995340447918232, "flos": 21548916144000.0, "grad_norm": 1.6361596787941923, "language_loss": 0.87258023, "learning_rate": 3.2850944892373987e-06, "loss": 0.95066059, "num_input_tokens_seen": 107225265, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.17407227, "step": 4982, "time_per_iteration": 2.58554744720459 }, { "auxiliary_loss_clip": 0.0653529, "auxiliary_loss_mlp": 0.01282054, "balance_loss_clip": 0.06320278, "balance_loss_mlp": 0.01264089, "epoch": 0.2995941680444912, "flos": 16730393650560.0, "grad_norm": 2.3375928166021036, "language_loss": 0.86708713, "learning_rate": 3.2847960413913307e-06, "loss": 0.94526058, "num_input_tokens_seen": 107241335, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.17956543, "step": 4983, "time_per_iteration": 2.5253653526306152 }, { "auxiliary_loss_clip": 0.06529128, "auxiliary_loss_mlp": 0.01276906, "balance_loss_clip": 0.06323554, "balance_loss_mlp": 0.01260896, "epoch": 0.2996542912971592, "flos": 20929864579200.0, "grad_norm": 2.039166149480628, "language_loss": 0.78547966, "learning_rate": 3.284497544825668e-06, "loss": 0.86353993, "num_input_tokens_seen": 107259375, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.15991211, "step": 4984, "time_per_iteration": 2.585012197494507 }, { "auxiliary_loss_clip": 0.06515074, "auxiliary_loss_mlp": 0.01285623, "balance_loss_clip": 0.06307799, "balance_loss_mlp": 0.01268994, "epoch": 0.29971441454982717, "flos": 25086429417600.0, "grad_norm": 1.5930188810907968, "language_loss": 0.79116005, "learning_rate": 3.2841989995517303e-06, "loss": 0.86916703, "num_input_tokens_seen": 107279890, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.1661377, "step": 4985, "time_per_iteration": 2.600133180618286 }, { "auxiliary_loss_clip": 0.06523396, "auxiliary_loss_mlp": 0.01282837, "balance_loss_clip": 0.0631071, "balance_loss_mlp": 0.01265301, "epoch": 0.29977453780249513, "flos": 52567445617920.0, "grad_norm": 2.1606936573653965, "language_loss": 0.72196674, "learning_rate": 3.283900405580837e-06, "loss": 0.8000291, "num_input_tokens_seen": 107303430, "router_z_loss_clip": 2.12597656, "router_z_loss_mlp": 0.17529297, "step": 4986, "time_per_iteration": 2.8462741374969482 }, { "auxiliary_loss_clip": 0.06526502, "auxiliary_loss_mlp": 0.01277686, "balance_loss_clip": 0.06315842, "balance_loss_mlp": 0.01260365, "epoch": 0.2998346610551631, "flos": 22243759326720.0, "grad_norm": 2.311685584304333, "language_loss": 0.7408638, "learning_rate": 3.283601762924312e-06, "loss": 0.81890565, "num_input_tokens_seen": 107323700, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.17321777, "step": 4987, "time_per_iteration": 2.587996482849121 }, { "auxiliary_loss_clip": 0.06519224, "auxiliary_loss_mlp": 0.01275994, "balance_loss_clip": 0.06314901, "balance_loss_mlp": 0.01260104, "epoch": 0.29989478430783106, "flos": 16878832358400.0, "grad_norm": 2.752554766804164, "language_loss": 0.80842996, "learning_rate": 3.2833030715934793e-06, "loss": 0.8863821, "num_input_tokens_seen": 107341965, "router_z_loss_clip": 2.04101562, "router_z_loss_mlp": 0.15893555, "step": 4988, "time_per_iteration": 4.03902792930603 }, { "auxiliary_loss_clip": 0.06520711, "auxiliary_loss_mlp": 0.01273799, "balance_loss_clip": 0.06315151, "balance_loss_mlp": 0.01258231, "epoch": 0.29995490756049903, "flos": 23775637271040.0, "grad_norm": 9.611981951487794, "language_loss": 0.71082157, "learning_rate": 3.2830043315996658e-06, "loss": 0.78876668, "num_input_tokens_seen": 107362615, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.15576172, "step": 4989, "time_per_iteration": 2.6239495277404785 }, { "auxiliary_loss_clip": 0.06528063, "auxiliary_loss_mlp": 0.0128478, "balance_loss_clip": 0.06316744, "balance_loss_mlp": 0.01267172, "epoch": 0.300015030813167, "flos": 14470577360640.0, "grad_norm": 2.320744500751608, "language_loss": 0.85580587, "learning_rate": 3.282705542954199e-06, "loss": 0.93393433, "num_input_tokens_seen": 107378980, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.17590332, "step": 4990, "time_per_iteration": 2.536125421524048 }, { "auxiliary_loss_clip": 0.06523094, "auxiliary_loss_mlp": 0.0128276, "balance_loss_clip": 0.0631209, "balance_loss_mlp": 0.01265474, "epoch": 0.30007515406583496, "flos": 25199005777920.0, "grad_norm": 1.566355473549733, "language_loss": 0.67229301, "learning_rate": 3.28240670566841e-06, "loss": 0.75035155, "num_input_tokens_seen": 107397640, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.17285156, "step": 4991, "time_per_iteration": 2.821868658065796 }, { "auxiliary_loss_clip": 0.06522945, "auxiliary_loss_mlp": 0.01282689, "balance_loss_clip": 0.06312551, "balance_loss_mlp": 0.01265237, "epoch": 0.3001352773185029, "flos": 19397315802240.0, "grad_norm": 2.1027763693258366, "language_loss": 0.79781038, "learning_rate": 3.28210781975363e-06, "loss": 0.87586677, "num_input_tokens_seen": 107416020, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.17456055, "step": 4992, "time_per_iteration": 2.600139617919922 }, { "auxiliary_loss_clip": 0.06523212, "auxiliary_loss_mlp": 0.01276812, "balance_loss_clip": 0.06318465, "balance_loss_mlp": 0.01260301, "epoch": 0.3001954005711709, "flos": 21550341663360.0, "grad_norm": 2.1686339592392008, "language_loss": 0.82659227, "learning_rate": 3.281808885221193e-06, "loss": 0.90459251, "num_input_tokens_seen": 107436340, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.16503906, "step": 4993, "time_per_iteration": 4.030942678451538 }, { "auxiliary_loss_clip": 0.06525333, "auxiliary_loss_mlp": 0.01285731, "balance_loss_clip": 0.06312805, "balance_loss_mlp": 0.01267706, "epoch": 0.30025552382383885, "flos": 17390087245440.0, "grad_norm": 2.1806011503831377, "language_loss": 0.87106812, "learning_rate": 3.2815099020824345e-06, "loss": 0.9491787, "num_input_tokens_seen": 107454585, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.18005371, "step": 4994, "time_per_iteration": 2.563694715499878 }, { "auxiliary_loss_clip": 0.06517208, "auxiliary_loss_mlp": 0.01275285, "balance_loss_clip": 0.06310666, "balance_loss_mlp": 0.01258929, "epoch": 0.3003156470765068, "flos": 29541003701760.0, "grad_norm": 1.4181615668577399, "language_loss": 0.81411654, "learning_rate": 3.2812108703486924e-06, "loss": 0.89204144, "num_input_tokens_seen": 107477180, "router_z_loss_clip": 2.06542969, "router_z_loss_mlp": 0.16357422, "step": 4995, "time_per_iteration": 2.6347434520721436 }, { "auxiliary_loss_clip": 0.06515428, "auxiliary_loss_mlp": 0.01274383, "balance_loss_clip": 0.0631059, "balance_loss_mlp": 0.0125699, "epoch": 0.3003757703291748, "flos": 43655278302720.0, "grad_norm": 1.8189704438026773, "language_loss": 0.67541921, "learning_rate": 3.2809117900313055e-06, "loss": 0.75331736, "num_input_tokens_seen": 107500250, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.17419434, "step": 4996, "time_per_iteration": 2.766296863555908 }, { "auxiliary_loss_clip": 0.06514509, "auxiliary_loss_mlp": 0.01273996, "balance_loss_clip": 0.06311081, "balance_loss_mlp": 0.01257449, "epoch": 0.30043589358184275, "flos": 22534934664960.0, "grad_norm": 17.82191152882373, "language_loss": 0.75655961, "learning_rate": 3.280612661141615e-06, "loss": 0.83444464, "num_input_tokens_seen": 107520070, "router_z_loss_clip": 2.03417969, "router_z_loss_mlp": 0.16540527, "step": 4997, "time_per_iteration": 2.5781733989715576 }, { "auxiliary_loss_clip": 0.06512856, "auxiliary_loss_mlp": 0.01274379, "balance_loss_clip": 0.06311087, "balance_loss_mlp": 0.01258739, "epoch": 0.30049601683451077, "flos": 21002176252800.0, "grad_norm": 2.5603649668114685, "language_loss": 0.783503, "learning_rate": 3.2803134836909646e-06, "loss": 0.86137533, "num_input_tokens_seen": 107539285, "router_z_loss_clip": 2.015625, "router_z_loss_mlp": 0.15612793, "step": 4998, "time_per_iteration": 2.567586898803711 }, { "auxiliary_loss_clip": 0.06516467, "auxiliary_loss_mlp": 0.01274856, "balance_loss_clip": 0.06317386, "balance_loss_mlp": 0.01258632, "epoch": 0.30055614008717874, "flos": 23922985875840.0, "grad_norm": 1.8176727219677173, "language_loss": 0.73651457, "learning_rate": 3.2800142576906985e-06, "loss": 0.81442785, "num_input_tokens_seen": 107560260, "router_z_loss_clip": 1.98535156, "router_z_loss_mlp": 0.16229248, "step": 4999, "time_per_iteration": 2.5938968658447266 }, { "auxiliary_loss_clip": 0.06512515, "auxiliary_loss_mlp": 0.01271234, "balance_loss_clip": 0.0630731, "balance_loss_mlp": 0.01253901, "epoch": 0.3006162633398467, "flos": 19175475317760.0, "grad_norm": 3.0599564457980812, "language_loss": 0.76271319, "learning_rate": 3.2797149831521626e-06, "loss": 0.84055078, "num_input_tokens_seen": 107579260, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.17346191, "step": 5000, "time_per_iteration": 2.5495412349700928 }, { "auxiliary_loss_clip": 0.06510757, "auxiliary_loss_mlp": 0.01275696, "balance_loss_clip": 0.06311452, "balance_loss_mlp": 0.01259353, "epoch": 0.30067638659251467, "flos": 14683697020800.0, "grad_norm": 7.11143802838438, "language_loss": 0.82051098, "learning_rate": 3.2794156600867073e-06, "loss": 0.89837557, "num_input_tokens_seen": 107595245, "router_z_loss_clip": 1.99023438, "router_z_loss_mlp": 0.16333008, "step": 5001, "time_per_iteration": 3.940286159515381 }, { "auxiliary_loss_clip": 0.06510014, "auxiliary_loss_mlp": 0.01274334, "balance_loss_clip": 0.06306471, "balance_loss_mlp": 0.01256954, "epoch": 0.30073650984518263, "flos": 23374778538240.0, "grad_norm": 5.0246916329628935, "language_loss": 0.81600964, "learning_rate": 3.2791162885056815e-06, "loss": 0.89385313, "num_input_tokens_seen": 107613985, "router_z_loss_clip": 2.03222656, "router_z_loss_mlp": 0.1739502, "step": 5002, "time_per_iteration": 4.059413194656372 }, { "auxiliary_loss_clip": 0.06524554, "auxiliary_loss_mlp": 0.01273699, "balance_loss_clip": 0.06314079, "balance_loss_mlp": 0.01257105, "epoch": 0.3007966330978506, "flos": 22973332826880.0, "grad_norm": 1.8219823972518037, "language_loss": 0.71402121, "learning_rate": 3.2788168684204376e-06, "loss": 0.79200369, "num_input_tokens_seen": 107631435, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.16589355, "step": 5003, "time_per_iteration": 2.604221820831299 }, { "auxiliary_loss_clip": 0.06525571, "auxiliary_loss_mlp": 0.0127319, "balance_loss_clip": 0.06316191, "balance_loss_mlp": 0.01256477, "epoch": 0.30085675635051856, "flos": 27825830951040.0, "grad_norm": 4.053469315250672, "language_loss": 0.71040225, "learning_rate": 3.27851739984233e-06, "loss": 0.78838986, "num_input_tokens_seen": 107650530, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.16711426, "step": 5004, "time_per_iteration": 2.6235454082489014 }, { "auxiliary_loss_clip": 0.06518403, "auxiliary_loss_mlp": 0.0127638, "balance_loss_clip": 0.06312972, "balance_loss_mlp": 0.01259595, "epoch": 0.3009168796031865, "flos": 10886216855040.0, "grad_norm": 2.923441767313359, "language_loss": 0.81771445, "learning_rate": 3.278217882782715e-06, "loss": 0.89566231, "num_input_tokens_seen": 107662240, "router_z_loss_clip": 2.05371094, "router_z_loss_mlp": 0.16784668, "step": 5005, "time_per_iteration": 2.5282294750213623 }, { "auxiliary_loss_clip": 0.06516753, "auxiliary_loss_mlp": 0.01269879, "balance_loss_clip": 0.06312399, "balance_loss_mlp": 0.0125381, "epoch": 0.3009770028558545, "flos": 23812170451200.0, "grad_norm": 3.913540577818443, "language_loss": 0.75128168, "learning_rate": 3.2779183172529497e-06, "loss": 0.82914793, "num_input_tokens_seen": 107680330, "router_z_loss_clip": 2.04101562, "router_z_loss_mlp": 0.16064453, "step": 5006, "time_per_iteration": 2.5930840969085693 }, { "auxiliary_loss_clip": 0.06520202, "auxiliary_loss_mlp": 0.0127373, "balance_loss_clip": 0.06316691, "balance_loss_mlp": 0.01256671, "epoch": 0.30103712610852246, "flos": 26475319169280.0, "grad_norm": 2.646896046091198, "language_loss": 0.71791655, "learning_rate": 3.2776187032643932e-06, "loss": 0.79585588, "num_input_tokens_seen": 107700020, "router_z_loss_clip": 2.03710938, "router_z_loss_mlp": 0.1706543, "step": 5007, "time_per_iteration": 2.604003429412842 }, { "auxiliary_loss_clip": 0.06519031, "auxiliary_loss_mlp": 0.01274792, "balance_loss_clip": 0.06314696, "balance_loss_mlp": 0.0125753, "epoch": 0.3010972493611904, "flos": 22863020526720.0, "grad_norm": 2.146220154310208, "language_loss": 0.76565707, "learning_rate": 3.2773190408284075e-06, "loss": 0.84359527, "num_input_tokens_seen": 107718575, "router_z_loss_clip": 2.04199219, "router_z_loss_mlp": 0.17236328, "step": 5008, "time_per_iteration": 2.57106876373291 }, { "auxiliary_loss_clip": 0.06519855, "auxiliary_loss_mlp": 0.01276609, "balance_loss_clip": 0.06314383, "balance_loss_mlp": 0.01260456, "epoch": 0.3011573726138584, "flos": 24059307669120.0, "grad_norm": 1.870098898367541, "language_loss": 0.84826207, "learning_rate": 3.2770193299563564e-06, "loss": 0.92622674, "num_input_tokens_seen": 107738635, "router_z_loss_clip": 2.05371094, "router_z_loss_mlp": 0.16131592, "step": 5009, "time_per_iteration": 2.6056816577911377 }, { "auxiliary_loss_clip": 0.06526839, "auxiliary_loss_mlp": 0.01277286, "balance_loss_clip": 0.06314495, "balance_loss_mlp": 0.01258809, "epoch": 0.30121749586652635, "flos": 20264762396160.0, "grad_norm": 2.004514265813894, "language_loss": 0.84451699, "learning_rate": 3.276719570659604e-06, "loss": 0.92255819, "num_input_tokens_seen": 107753415, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.18469238, "step": 5010, "time_per_iteration": 2.5480659008026123 }, { "auxiliary_loss_clip": 0.06522986, "auxiliary_loss_mlp": 0.01270736, "balance_loss_clip": 0.06318055, "balance_loss_mlp": 0.0125481, "epoch": 0.3012776191191944, "flos": 26950334365440.0, "grad_norm": 4.374724450871222, "language_loss": 0.85320324, "learning_rate": 3.2764197629495176e-06, "loss": 0.93114048, "num_input_tokens_seen": 107773840, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.15942383, "step": 5011, "time_per_iteration": 2.6116108894348145 }, { "auxiliary_loss_clip": 0.06519984, "auxiliary_loss_mlp": 0.0127305, "balance_loss_clip": 0.06310702, "balance_loss_mlp": 0.01256063, "epoch": 0.30133774237186234, "flos": 20418525838080.0, "grad_norm": 3.0043492890278674, "language_loss": 0.724684, "learning_rate": 3.2761199068374656e-06, "loss": 0.80261433, "num_input_tokens_seen": 107792020, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.17004395, "step": 5012, "time_per_iteration": 2.5686709880828857 }, { "auxiliary_loss_clip": 0.06520006, "auxiliary_loss_mlp": 0.012757, "balance_loss_clip": 0.06313166, "balance_loss_mlp": 0.01258378, "epoch": 0.3013978656245303, "flos": 19798635732480.0, "grad_norm": 4.0344978286701645, "language_loss": 0.88199824, "learning_rate": 3.275820002334819e-06, "loss": 0.95995533, "num_input_tokens_seen": 107809595, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.1730957, "step": 5013, "time_per_iteration": 2.5713562965393066 }, { "auxiliary_loss_clip": 0.0653179, "auxiliary_loss_mlp": 0.01276443, "balance_loss_clip": 0.06321436, "balance_loss_mlp": 0.01258311, "epoch": 0.30145798887719827, "flos": 16254623767680.0, "grad_norm": 2.9948072694463277, "language_loss": 0.84010756, "learning_rate": 3.2755200494529496e-06, "loss": 0.91818982, "num_input_tokens_seen": 107827230, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.18127441, "step": 5014, "time_per_iteration": 2.541830539703369 }, { "auxiliary_loss_clip": 0.06513862, "auxiliary_loss_mlp": 0.01271896, "balance_loss_clip": 0.06312342, "balance_loss_mlp": 0.01255874, "epoch": 0.30151811212986623, "flos": 24578654474880.0, "grad_norm": 1.9938660779600093, "language_loss": 0.68368399, "learning_rate": 3.2752200482032323e-06, "loss": 0.7615416, "num_input_tokens_seen": 107847195, "router_z_loss_clip": 2.01464844, "router_z_loss_mlp": 0.16015625, "step": 5015, "time_per_iteration": 2.602658748626709 }, { "auxiliary_loss_clip": 0.06518918, "auxiliary_loss_mlp": 0.0127953, "balance_loss_clip": 0.06314892, "balance_loss_mlp": 0.01262507, "epoch": 0.3015782353825342, "flos": 21878595233280.0, "grad_norm": 4552.419755666989, "language_loss": 0.75495112, "learning_rate": 3.2749199985970436e-06, "loss": 0.83293557, "num_input_tokens_seen": 107866420, "router_z_loss_clip": 2.04199219, "router_z_loss_mlp": 0.17016602, "step": 5016, "time_per_iteration": 2.578808546066284 }, { "auxiliary_loss_clip": 0.0651998, "auxiliary_loss_mlp": 0.01274982, "balance_loss_clip": 0.06313162, "balance_loss_mlp": 0.01258602, "epoch": 0.30163835863520216, "flos": 28777244935680.0, "grad_norm": 1.9903164117258414, "language_loss": 0.65643883, "learning_rate": 3.2746199006457603e-06, "loss": 0.73438847, "num_input_tokens_seen": 107889090, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.16369629, "step": 5017, "time_per_iteration": 2.6520345211029053 }, { "auxiliary_loss_clip": 0.06518333, "auxiliary_loss_mlp": 0.01274803, "balance_loss_clip": 0.06311116, "balance_loss_mlp": 0.01258018, "epoch": 0.30169848188787013, "flos": 22972829702400.0, "grad_norm": 2.711384192747667, "language_loss": 0.69102085, "learning_rate": 3.2743197543607628e-06, "loss": 0.76895219, "num_input_tokens_seen": 107907520, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.16796875, "step": 5018, "time_per_iteration": 2.572453022003174 }, { "auxiliary_loss_clip": 0.06514271, "auxiliary_loss_mlp": 0.01286524, "balance_loss_clip": 0.06314169, "balance_loss_mlp": 0.01270717, "epoch": 0.3017586051405381, "flos": 21841726636800.0, "grad_norm": 3.0127730672540207, "language_loss": 0.79362237, "learning_rate": 3.2740195597534327e-06, "loss": 0.87163037, "num_input_tokens_seen": 107925650, "router_z_loss_clip": 2.00097656, "router_z_loss_mlp": 0.15820312, "step": 5019, "time_per_iteration": 2.581866979598999 }, { "auxiliary_loss_clip": 0.06516438, "auxiliary_loss_mlp": 0.01281, "balance_loss_clip": 0.06309547, "balance_loss_mlp": 0.01264537, "epoch": 0.30181872839320606, "flos": 22166374481280.0, "grad_norm": 8.430447642461909, "language_loss": 0.70480001, "learning_rate": 3.2737193168351527e-06, "loss": 0.78277433, "num_input_tokens_seen": 107943975, "router_z_loss_clip": 2.06738281, "router_z_loss_mlp": 0.16467285, "step": 5020, "time_per_iteration": 2.5923242568969727 }, { "auxiliary_loss_clip": 0.06521447, "auxiliary_loss_mlp": 0.01283088, "balance_loss_clip": 0.06311245, "balance_loss_mlp": 0.01266196, "epoch": 0.301878851645874, "flos": 18120080016000.0, "grad_norm": 2.3090492586249605, "language_loss": 0.7884599, "learning_rate": 3.2734190256173085e-06, "loss": 0.86650527, "num_input_tokens_seen": 107962950, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.16894531, "step": 5021, "time_per_iteration": 2.5743141174316406 }, { "auxiliary_loss_clip": 0.06518058, "auxiliary_loss_mlp": 0.01297108, "balance_loss_clip": 0.06311645, "balance_loss_mlp": 0.01280633, "epoch": 0.301938974898542, "flos": 17607860807040.0, "grad_norm": 14.202495752951052, "language_loss": 0.77346432, "learning_rate": 3.2731186861112877e-06, "loss": 0.85161591, "num_input_tokens_seen": 107979700, "router_z_loss_clip": 2.06347656, "router_z_loss_mlp": 0.16467285, "step": 5022, "time_per_iteration": 2.5622618198394775 }, { "auxiliary_loss_clip": 0.06520887, "auxiliary_loss_mlp": 0.01297276, "balance_loss_clip": 0.06313744, "balance_loss_mlp": 0.01279561, "epoch": 0.30199909815120995, "flos": 11185861455360.0, "grad_norm": 2.422543826420744, "language_loss": 0.70125186, "learning_rate": 3.2728182983284793e-06, "loss": 0.77943349, "num_input_tokens_seen": 107996645, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.17724609, "step": 5023, "time_per_iteration": 2.5379223823547363 }, { "auxiliary_loss_clip": 0.06526972, "auxiliary_loss_mlp": 0.01304516, "balance_loss_clip": 0.06315427, "balance_loss_mlp": 0.01287612, "epoch": 0.302059221403878, "flos": 21914247945600.0, "grad_norm": 2.425609734307011, "language_loss": 0.71808004, "learning_rate": 3.2725178622802724e-06, "loss": 0.79639494, "num_input_tokens_seen": 108015020, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.16906738, "step": 5024, "time_per_iteration": 2.57503604888916 }, { "auxiliary_loss_clip": 0.06516087, "auxiliary_loss_mlp": 0.01331023, "balance_loss_clip": 0.06313417, "balance_loss_mlp": 0.01311783, "epoch": 0.30211934465654594, "flos": 26403678328320.0, "grad_norm": 2.9074432949752613, "language_loss": 0.74700713, "learning_rate": 3.272217377978061e-06, "loss": 0.8254782, "num_input_tokens_seen": 108036430, "router_z_loss_clip": 2.02539062, "router_z_loss_mlp": 0.19250488, "step": 5025, "time_per_iteration": 2.6057236194610596 }, { "auxiliary_loss_clip": 0.0651872, "auxiliary_loss_mlp": 0.01330986, "balance_loss_clip": 0.06318255, "balance_loss_mlp": 0.01313415, "epoch": 0.3021794679092139, "flos": 23406573962880.0, "grad_norm": 1.7199689625238137, "language_loss": 0.67483783, "learning_rate": 3.2719168454332387e-06, "loss": 0.75333488, "num_input_tokens_seen": 108054250, "router_z_loss_clip": 2.00390625, "router_z_loss_mlp": 0.17578125, "step": 5026, "time_per_iteration": 2.5868682861328125 }, { "auxiliary_loss_clip": 0.06524462, "auxiliary_loss_mlp": 0.01314482, "balance_loss_clip": 0.0631896, "balance_loss_mlp": 0.01295981, "epoch": 0.30223959116188187, "flos": 20266271769600.0, "grad_norm": 1.9213375366938301, "language_loss": 0.85277075, "learning_rate": 3.2716162646572034e-06, "loss": 0.93116015, "num_input_tokens_seen": 108071495, "router_z_loss_clip": 2.05566406, "router_z_loss_mlp": 0.18493652, "step": 5027, "time_per_iteration": 2.5599565505981445 }, { "auxiliary_loss_clip": 0.06521769, "auxiliary_loss_mlp": 0.01307752, "balance_loss_clip": 0.06318755, "balance_loss_mlp": 0.01290181, "epoch": 0.30229971441454984, "flos": 26695105228800.0, "grad_norm": 1.6816903594183965, "language_loss": 0.79029614, "learning_rate": 3.271315635661351e-06, "loss": 0.86859137, "num_input_tokens_seen": 108092135, "router_z_loss_clip": 2.02734375, "router_z_loss_mlp": 0.17590332, "step": 5028, "time_per_iteration": 4.032615661621094 }, { "auxiliary_loss_clip": 0.06525718, "auxiliary_loss_mlp": 0.0130541, "balance_loss_clip": 0.06321071, "balance_loss_mlp": 0.01288411, "epoch": 0.3023598376672178, "flos": 34353111358080.0, "grad_norm": 2.0118948349104437, "language_loss": 0.77556276, "learning_rate": 3.2710149584570826e-06, "loss": 0.85387403, "num_input_tokens_seen": 108112945, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.1697998, "step": 5029, "time_per_iteration": 2.6711838245391846 }, { "auxiliary_loss_clip": 0.06527214, "auxiliary_loss_mlp": 0.01288941, "balance_loss_clip": 0.06320006, "balance_loss_mlp": 0.01270166, "epoch": 0.30241996091988577, "flos": 23118794714880.0, "grad_norm": 22.17293540407649, "language_loss": 0.82301009, "learning_rate": 3.2707142330557993e-06, "loss": 0.90117168, "num_input_tokens_seen": 108130325, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.18774414, "step": 5030, "time_per_iteration": 2.5818045139312744 }, { "auxiliary_loss_clip": 0.06527971, "auxiliary_loss_mlp": 0.01287251, "balance_loss_clip": 0.06321552, "balance_loss_mlp": 0.01268977, "epoch": 0.30248008417255373, "flos": 19395932209920.0, "grad_norm": 4.318519831673436, "language_loss": 0.7001093, "learning_rate": 3.270413459468905e-06, "loss": 0.77826148, "num_input_tokens_seen": 108150300, "router_z_loss_clip": 2.06542969, "router_z_loss_mlp": 0.18286133, "step": 5031, "time_per_iteration": 2.5970520973205566 }, { "auxiliary_loss_clip": 0.06530529, "auxiliary_loss_mlp": 0.01286345, "balance_loss_clip": 0.06324479, "balance_loss_mlp": 0.01267426, "epoch": 0.3025402074252217, "flos": 23776601592960.0, "grad_norm": 2.1766706991046627, "language_loss": 0.83078098, "learning_rate": 3.2701126377078047e-06, "loss": 0.90894973, "num_input_tokens_seen": 108170330, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.18908691, "step": 5032, "time_per_iteration": 2.61146879196167 }, { "auxiliary_loss_clip": 0.0653298, "auxiliary_loss_mlp": 0.01286257, "balance_loss_clip": 0.06322445, "balance_loss_mlp": 0.01266814, "epoch": 0.30260033067788966, "flos": 26001184440960.0, "grad_norm": 3.7396545011982063, "language_loss": 0.73964155, "learning_rate": 3.269811767783906e-06, "loss": 0.8178339, "num_input_tokens_seen": 108191265, "router_z_loss_clip": 2.10839844, "router_z_loss_mlp": 0.19421387, "step": 5033, "time_per_iteration": 4.05850076675415 }, { "auxiliary_loss_clip": 0.0652426, "auxiliary_loss_mlp": 0.0129007, "balance_loss_clip": 0.06321777, "balance_loss_mlp": 0.01271163, "epoch": 0.3026604539305576, "flos": 25381629751680.0, "grad_norm": 2.9320562140350956, "language_loss": 0.74752212, "learning_rate": 3.2695108497086185e-06, "loss": 0.82566535, "num_input_tokens_seen": 108211615, "router_z_loss_clip": 2.02050781, "router_z_loss_mlp": 0.18908691, "step": 5034, "time_per_iteration": 2.6122610569000244 }, { "auxiliary_loss_clip": 0.06530338, "auxiliary_loss_mlp": 0.01284196, "balance_loss_clip": 0.06323901, "balance_loss_mlp": 0.01265433, "epoch": 0.3027205771832256, "flos": 25819944059520.0, "grad_norm": 3.5751597962703467, "language_loss": 0.73000968, "learning_rate": 3.269209883493352e-06, "loss": 0.80815506, "num_input_tokens_seen": 108231080, "router_z_loss_clip": 2.06152344, "router_z_loss_mlp": 0.1875, "step": 5035, "time_per_iteration": 2.705622673034668 }, { "auxiliary_loss_clip": 0.065285, "auxiliary_loss_mlp": 0.01276636, "balance_loss_clip": 0.06327748, "balance_loss_mlp": 0.01259243, "epoch": 0.30278070043589356, "flos": 27351905857920.0, "grad_norm": 2.1952530846866756, "language_loss": 0.88009834, "learning_rate": 3.2689088691495196e-06, "loss": 0.95814967, "num_input_tokens_seen": 108251125, "router_z_loss_clip": 2.00878906, "router_z_loss_mlp": 0.17407227, "step": 5036, "time_per_iteration": 2.653073310852051 }, { "auxiliary_loss_clip": 0.06526768, "auxiliary_loss_mlp": 0.01278725, "balance_loss_clip": 0.06325006, "balance_loss_mlp": 0.01261047, "epoch": 0.3028408236885616, "flos": 24792444967680.0, "grad_norm": 1.6361052086398948, "language_loss": 0.77851027, "learning_rate": 3.268607806688536e-06, "loss": 0.85656524, "num_input_tokens_seen": 108272545, "router_z_loss_clip": 2.01953125, "router_z_loss_mlp": 0.17675781, "step": 5037, "time_per_iteration": 2.608660936355591 }, { "auxiliary_loss_clip": 0.06533939, "auxiliary_loss_mlp": 0.01280811, "balance_loss_clip": 0.06326695, "balance_loss_mlp": 0.01261797, "epoch": 0.30290094694122954, "flos": 12937399678080.0, "grad_norm": 2.7912709450120254, "language_loss": 0.78071582, "learning_rate": 3.268306696121816e-06, "loss": 0.85886323, "num_input_tokens_seen": 108289725, "router_z_loss_clip": 2.07128906, "router_z_loss_mlp": 0.19006348, "step": 5038, "time_per_iteration": 2.6028997898101807 }, { "auxiliary_loss_clip": 0.06527478, "auxiliary_loss_mlp": 0.01279919, "balance_loss_clip": 0.0632683, "balance_loss_mlp": 0.01261715, "epoch": 0.3029610701938975, "flos": 25922709492480.0, "grad_norm": 1.9067541713402971, "language_loss": 0.744807, "learning_rate": 3.2680055374607804e-06, "loss": 0.82288098, "num_input_tokens_seen": 108310690, "router_z_loss_clip": 2.00488281, "router_z_loss_mlp": 0.18200684, "step": 5039, "time_per_iteration": 2.6733736991882324 }, { "auxiliary_loss_clip": 0.06523336, "auxiliary_loss_mlp": 0.01274744, "balance_loss_clip": 0.06322382, "balance_loss_mlp": 0.01257924, "epoch": 0.3030211934465655, "flos": 21987440087040.0, "grad_norm": 1.8645453348833927, "language_loss": 0.79873526, "learning_rate": 3.267704330716847e-06, "loss": 0.87671608, "num_input_tokens_seen": 108328905, "router_z_loss_clip": 2.00878906, "router_z_loss_mlp": 0.16821289, "step": 5040, "time_per_iteration": 4.000753879547119 }, { "auxiliary_loss_clip": 0.0652305, "auxiliary_loss_mlp": 0.01276097, "balance_loss_clip": 0.06322183, "balance_loss_mlp": 0.01258359, "epoch": 0.30308131669923344, "flos": 20997606205440.0, "grad_norm": 5.912732657280741, "language_loss": 0.82566094, "learning_rate": 3.267403075901438e-06, "loss": 0.90365249, "num_input_tokens_seen": 108346680, "router_z_loss_clip": 2.0078125, "router_z_loss_mlp": 0.17736816, "step": 5041, "time_per_iteration": 2.5703911781311035 }, { "auxiliary_loss_clip": 0.0653092, "auxiliary_loss_mlp": 0.01350654, "balance_loss_clip": 0.06421854, "balance_loss_mlp": 0.01345483, "epoch": 0.3031414399519014, "flos": 60568281198720.0, "grad_norm": 0.763562049380091, "language_loss": 0.5921371, "learning_rate": 3.267101773025978e-06, "loss": 0.67095286, "num_input_tokens_seen": 108413885, "router_z_loss_clip": 1.09179688, "router_z_loss_mlp": 0.05172729, "step": 5042, "time_per_iteration": 4.752094507217407 }, { "auxiliary_loss_clip": 0.06530808, "auxiliary_loss_mlp": 0.01273002, "balance_loss_clip": 0.06323661, "balance_loss_mlp": 0.01255466, "epoch": 0.30320156320456937, "flos": 21914038310400.0, "grad_norm": 7.637193872636886, "language_loss": 0.7222048, "learning_rate": 3.266800422101892e-06, "loss": 0.80024296, "num_input_tokens_seen": 108433640, "router_z_loss_clip": 2.07324219, "router_z_loss_mlp": 0.17541504, "step": 5043, "time_per_iteration": 2.597541332244873 }, { "auxiliary_loss_clip": 0.06539539, "auxiliary_loss_mlp": 0.01273854, "balance_loss_clip": 0.06334393, "balance_loss_mlp": 0.01256307, "epoch": 0.30326168645723733, "flos": 21659186517120.0, "grad_norm": 42.919784978083804, "language_loss": 0.70149922, "learning_rate": 3.266499023140606e-06, "loss": 0.77963316, "num_input_tokens_seen": 108452640, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.17541504, "step": 5044, "time_per_iteration": 2.583547592163086 }, { "auxiliary_loss_clip": 0.06528552, "auxiliary_loss_mlp": 0.01275039, "balance_loss_clip": 0.06327426, "balance_loss_mlp": 0.01258946, "epoch": 0.3033218097099053, "flos": 21877672838400.0, "grad_norm": 1.4992418546950714, "language_loss": 0.77604246, "learning_rate": 3.2661975761535513e-06, "loss": 0.85407841, "num_input_tokens_seen": 108472470, "router_z_loss_clip": 2.01171875, "router_z_loss_mlp": 0.16088867, "step": 5045, "time_per_iteration": 2.5795016288757324 }, { "auxiliary_loss_clip": 0.06529824, "auxiliary_loss_mlp": 0.01280234, "balance_loss_clip": 0.06326326, "balance_loss_mlp": 0.01263157, "epoch": 0.30338193296257326, "flos": 27097137918720.0, "grad_norm": 2.041351957716354, "language_loss": 0.72819227, "learning_rate": 3.2658960811521564e-06, "loss": 0.80629283, "num_input_tokens_seen": 108493025, "router_z_loss_clip": 2.03222656, "router_z_loss_mlp": 0.1708374, "step": 5046, "time_per_iteration": 2.618535280227661 }, { "auxiliary_loss_clip": 0.06541544, "auxiliary_loss_mlp": 0.01289431, "balance_loss_clip": 0.0633283, "balance_loss_mlp": 0.01271371, "epoch": 0.30344205621524123, "flos": 19540052432640.0, "grad_norm": 1.9479495956629451, "language_loss": 0.81199825, "learning_rate": 3.2655945381478564e-06, "loss": 0.89030802, "num_input_tokens_seen": 108513480, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.18078613, "step": 5047, "time_per_iteration": 2.6092967987060547 }, { "auxiliary_loss_clip": 0.06541365, "auxiliary_loss_mlp": 0.01285591, "balance_loss_clip": 0.06336888, "balance_loss_mlp": 0.01269581, "epoch": 0.3035021794679092, "flos": 23917116090240.0, "grad_norm": 1.9748315233076785, "language_loss": 0.71849418, "learning_rate": 3.265292947152084e-06, "loss": 0.79676366, "num_input_tokens_seen": 108533155, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.16015625, "step": 5048, "time_per_iteration": 2.594573736190796 }, { "auxiliary_loss_clip": 0.06535348, "auxiliary_loss_mlp": 0.01289816, "balance_loss_clip": 0.06331579, "balance_loss_mlp": 0.0127482, "epoch": 0.30356230272057716, "flos": 16149133077120.0, "grad_norm": 1.9044871971029986, "language_loss": 0.76047885, "learning_rate": 3.2649913081762763e-06, "loss": 0.83873045, "num_input_tokens_seen": 108551900, "router_z_loss_clip": 2.03710938, "router_z_loss_mlp": 0.15014648, "step": 5049, "time_per_iteration": 2.580308675765991 }, { "auxiliary_loss_clip": 0.06546652, "auxiliary_loss_mlp": 0.01298462, "balance_loss_clip": 0.0633921, "balance_loss_mlp": 0.01282106, "epoch": 0.3036224259732452, "flos": 28922539115520.0, "grad_norm": 5.478657453839225, "language_loss": 0.82620299, "learning_rate": 3.2646896212318717e-06, "loss": 0.90465415, "num_input_tokens_seen": 108574005, "router_z_loss_clip": 2.07519531, "router_z_loss_mlp": 0.16345215, "step": 5050, "time_per_iteration": 2.651994466781616 }, { "auxiliary_loss_clip": 0.0654203, "auxiliary_loss_mlp": 0.01295551, "balance_loss_clip": 0.06337783, "balance_loss_mlp": 0.01278838, "epoch": 0.30368254922591315, "flos": 21111943501440.0, "grad_norm": 2.2835771335656863, "language_loss": 0.74312675, "learning_rate": 3.2643878863303106e-06, "loss": 0.82150257, "num_input_tokens_seen": 108592715, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.16711426, "step": 5051, "time_per_iteration": 2.573981285095215 }, { "auxiliary_loss_clip": 0.06543661, "auxiliary_loss_mlp": 0.01293612, "balance_loss_clip": 0.06337448, "balance_loss_mlp": 0.01276362, "epoch": 0.3037426724785811, "flos": 23008859758080.0, "grad_norm": 2.0601867854722085, "language_loss": 0.76526761, "learning_rate": 3.264086103483033e-06, "loss": 0.84364033, "num_input_tokens_seen": 108611770, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.17236328, "step": 5052, "time_per_iteration": 2.5809922218322754 }, { "auxiliary_loss_clip": 0.06548548, "auxiliary_loss_mlp": 0.01296439, "balance_loss_clip": 0.06338532, "balance_loss_mlp": 0.01279225, "epoch": 0.3038027957312491, "flos": 15638129752320.0, "grad_norm": 2.8942810235589533, "language_loss": 0.82901686, "learning_rate": 3.2637842727014836e-06, "loss": 0.90746671, "num_input_tokens_seen": 108629070, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.17224121, "step": 5053, "time_per_iteration": 2.5455679893493652 }, { "auxiliary_loss_clip": 0.06552555, "auxiliary_loss_mlp": 0.0129649, "balance_loss_clip": 0.06347084, "balance_loss_mlp": 0.01279562, "epoch": 0.30386291898391704, "flos": 12718955283840.0, "grad_norm": 1.5417400469754667, "language_loss": 0.71697772, "learning_rate": 3.2634823939971083e-06, "loss": 0.79546815, "num_input_tokens_seen": 108646315, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.16918945, "step": 5054, "time_per_iteration": 2.552597761154175 }, { "auxiliary_loss_clip": 0.06544162, "auxiliary_loss_mlp": 0.01291668, "balance_loss_clip": 0.06337327, "balance_loss_mlp": 0.01275027, "epoch": 0.303923042236585, "flos": 26366642023680.0, "grad_norm": 2.6457714107189503, "language_loss": 0.70317888, "learning_rate": 3.2631804673813545e-06, "loss": 0.78153718, "num_input_tokens_seen": 108665920, "router_z_loss_clip": 2.06933594, "router_z_loss_mlp": 0.16662598, "step": 5055, "time_per_iteration": 2.610541582107544 }, { "auxiliary_loss_clip": 0.0654723, "auxiliary_loss_mlp": 0.01292545, "balance_loss_clip": 0.06339906, "balance_loss_mlp": 0.0127495, "epoch": 0.30398316548925297, "flos": 19725359736960.0, "grad_norm": 20.471250626146315, "language_loss": 0.67987782, "learning_rate": 3.2628784928656707e-06, "loss": 0.75827557, "num_input_tokens_seen": 108683485, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.17602539, "step": 5056, "time_per_iteration": 2.5622708797454834 }, { "auxiliary_loss_clip": 0.06541532, "auxiliary_loss_mlp": 0.01292138, "balance_loss_clip": 0.06338513, "balance_loss_mlp": 0.01275175, "epoch": 0.30404328874192094, "flos": 24246124346880.0, "grad_norm": 10.6777564307264, "language_loss": 0.82797194, "learning_rate": 3.262576470461507e-06, "loss": 0.90630865, "num_input_tokens_seen": 108702700, "router_z_loss_clip": 2.02929688, "router_z_loss_mlp": 0.16955566, "step": 5057, "time_per_iteration": 2.6052510738372803 }, { "auxiliary_loss_clip": 0.0654224, "auxiliary_loss_mlp": 0.01285634, "balance_loss_clip": 0.06338515, "balance_loss_mlp": 0.01268921, "epoch": 0.3041034119945889, "flos": 24505881603840.0, "grad_norm": 2.3798150866406367, "language_loss": 0.89421308, "learning_rate": 3.2622744001803176e-06, "loss": 0.97249186, "num_input_tokens_seen": 108721860, "router_z_loss_clip": 2.03808594, "router_z_loss_mlp": 0.16711426, "step": 5058, "time_per_iteration": 2.6028685569763184 }, { "auxiliary_loss_clip": 0.0654321, "auxiliary_loss_mlp": 0.01294388, "balance_loss_clip": 0.06333838, "balance_loss_mlp": 0.01277067, "epoch": 0.30416353524725687, "flos": 28295689121280.0, "grad_norm": 2.9011142046817895, "language_loss": 0.71418417, "learning_rate": 3.2619722820335564e-06, "loss": 0.7925601, "num_input_tokens_seen": 108743215, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.1730957, "step": 5059, "time_per_iteration": 2.632012128829956 }, { "auxiliary_loss_clip": 0.06534772, "auxiliary_loss_mlp": 0.01287024, "balance_loss_clip": 0.06328566, "balance_loss_mlp": 0.01270335, "epoch": 0.30422365849992483, "flos": 23667295541760.0, "grad_norm": 1.6751216417670818, "language_loss": 0.73007345, "learning_rate": 3.26167011603268e-06, "loss": 0.80829138, "num_input_tokens_seen": 108765505, "router_z_loss_clip": 2.05957031, "router_z_loss_mlp": 0.16674805, "step": 5060, "time_per_iteration": 2.636955499649048 }, { "auxiliary_loss_clip": 0.06529403, "auxiliary_loss_mlp": 0.01295134, "balance_loss_clip": 0.06322004, "balance_loss_mlp": 0.01278385, "epoch": 0.3042837817525928, "flos": 23004750908160.0, "grad_norm": 1.7285452061260593, "language_loss": 0.77404982, "learning_rate": 3.2613679021891463e-06, "loss": 0.85229516, "num_input_tokens_seen": 108783370, "router_z_loss_clip": 2.07128906, "router_z_loss_mlp": 0.16748047, "step": 5061, "time_per_iteration": 2.576876401901245 }, { "auxiliary_loss_clip": 0.06537637, "auxiliary_loss_mlp": 0.01292401, "balance_loss_clip": 0.0632645, "balance_loss_mlp": 0.01274102, "epoch": 0.30434390500526076, "flos": 22087438335360.0, "grad_norm": 2.5292164138551194, "language_loss": 0.82150602, "learning_rate": 3.261065640514415e-06, "loss": 0.89980638, "num_input_tokens_seen": 108797430, "router_z_loss_clip": 2.11035156, "router_z_loss_mlp": 0.18310547, "step": 5062, "time_per_iteration": 2.5512287616729736 }, { "auxiliary_loss_clip": 0.06529469, "auxiliary_loss_mlp": 0.01279715, "balance_loss_clip": 0.06323297, "balance_loss_mlp": 0.01263562, "epoch": 0.3044040282579287, "flos": 25490516532480.0, "grad_norm": 2.0568203102510356, "language_loss": 0.75084251, "learning_rate": 3.2607633310199483e-06, "loss": 0.82893443, "num_input_tokens_seen": 108816945, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.16149902, "step": 5063, "time_per_iteration": 2.6193695068359375 }, { "auxiliary_loss_clip": 0.06528587, "auxiliary_loss_mlp": 0.0129093, "balance_loss_clip": 0.06322314, "balance_loss_mlp": 0.01273382, "epoch": 0.30446415151059675, "flos": 21952080864000.0, "grad_norm": 1.716853136314989, "language_loss": 0.84038931, "learning_rate": 3.26046097371721e-06, "loss": 0.91858447, "num_input_tokens_seen": 108836615, "router_z_loss_clip": 2.06152344, "router_z_loss_mlp": 0.17565918, "step": 5064, "time_per_iteration": 2.603952407836914 }, { "auxiliary_loss_clip": 0.06528604, "auxiliary_loss_mlp": 0.01292536, "balance_loss_clip": 0.06319904, "balance_loss_mlp": 0.01274392, "epoch": 0.3045242747632647, "flos": 16440979248000.0, "grad_norm": 2.5984321427556774, "language_loss": 0.76093048, "learning_rate": 3.2601585686176655e-06, "loss": 0.83914191, "num_input_tokens_seen": 108855165, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.18151855, "step": 5065, "time_per_iteration": 2.608084201812744 }, { "auxiliary_loss_clip": 0.06524796, "auxiliary_loss_mlp": 0.01297486, "balance_loss_clip": 0.06314713, "balance_loss_mlp": 0.01278746, "epoch": 0.3045843980159327, "flos": 31548399966720.0, "grad_norm": 2.0946970424373568, "language_loss": 0.62203372, "learning_rate": 3.2598561157327814e-06, "loss": 0.70025659, "num_input_tokens_seen": 108874690, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.18737793, "step": 5066, "time_per_iteration": 2.6862127780914307 }, { "auxiliary_loss_clip": 0.06528875, "auxiliary_loss_mlp": 0.01277959, "balance_loss_clip": 0.06315642, "balance_loss_mlp": 0.01259684, "epoch": 0.30464452126860064, "flos": 17858645677440.0, "grad_norm": 1.8498772047819794, "language_loss": 0.83052456, "learning_rate": 3.2595536150740265e-06, "loss": 0.90859294, "num_input_tokens_seen": 108893140, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.18286133, "step": 5067, "time_per_iteration": 4.0455238819122314 }, { "auxiliary_loss_clip": 0.06515863, "auxiliary_loss_mlp": 0.0128646, "balance_loss_clip": 0.06311607, "balance_loss_mlp": 0.01270056, "epoch": 0.3047046445212686, "flos": 20637682992000.0, "grad_norm": 1.6986072181615661, "language_loss": 0.63101906, "learning_rate": 3.259251066652873e-06, "loss": 0.70904231, "num_input_tokens_seen": 108911880, "router_z_loss_clip": 2.04101562, "router_z_loss_mlp": 0.16418457, "step": 5068, "time_per_iteration": 2.5579674243927 }, { "auxiliary_loss_clip": 0.06517872, "auxiliary_loss_mlp": 0.01285002, "balance_loss_clip": 0.06311759, "balance_loss_mlp": 0.01268075, "epoch": 0.3047647677739366, "flos": 21293896642560.0, "grad_norm": 1.7717421132695375, "language_loss": 0.75148392, "learning_rate": 3.258948470480793e-06, "loss": 0.82951272, "num_input_tokens_seen": 108930440, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.16918945, "step": 5069, "time_per_iteration": 2.5704562664031982 }, { "auxiliary_loss_clip": 0.06511636, "auxiliary_loss_mlp": 0.01284454, "balance_loss_clip": 0.0631046, "balance_loss_mlp": 0.01267801, "epoch": 0.30482489102660454, "flos": 21002218179840.0, "grad_norm": 2.80229417976023, "language_loss": 0.75901437, "learning_rate": 3.258645826569261e-06, "loss": 0.83697528, "num_input_tokens_seen": 108949125, "router_z_loss_clip": 2.01171875, "router_z_loss_mlp": 0.16638184, "step": 5070, "time_per_iteration": 2.5714991092681885 }, { "auxiliary_loss_clip": 0.06526012, "auxiliary_loss_mlp": 0.01281433, "balance_loss_clip": 0.06313404, "balance_loss_mlp": 0.01263051, "epoch": 0.3048850142792725, "flos": 26298732689280.0, "grad_norm": 1.868449221571115, "language_loss": 0.81724656, "learning_rate": 3.2583431349297527e-06, "loss": 0.89532101, "num_input_tokens_seen": 108972190, "router_z_loss_clip": 2.12792969, "router_z_loss_mlp": 0.18371582, "step": 5071, "time_per_iteration": 2.637880802154541 }, { "auxiliary_loss_clip": 0.06524031, "auxiliary_loss_mlp": 0.01278723, "balance_loss_clip": 0.06310792, "balance_loss_mlp": 0.01260353, "epoch": 0.30494513753194047, "flos": 22352813815680.0, "grad_norm": 2.121661858195865, "language_loss": 0.76574266, "learning_rate": 3.2580403955737467e-06, "loss": 0.84377027, "num_input_tokens_seen": 108990325, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.18371582, "step": 5072, "time_per_iteration": 4.13037371635437 }, { "auxiliary_loss_clip": 0.06513807, "auxiliary_loss_mlp": 0.01280931, "balance_loss_clip": 0.06307217, "balance_loss_mlp": 0.01263038, "epoch": 0.30500526078460843, "flos": 19543909720320.0, "grad_norm": 3.5646930965581847, "language_loss": 0.71636856, "learning_rate": 3.257737608512723e-06, "loss": 0.79431593, "num_input_tokens_seen": 109009505, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.17895508, "step": 5073, "time_per_iteration": 2.556663751602173 }, { "auxiliary_loss_clip": 0.06525565, "auxiliary_loss_mlp": 0.01277661, "balance_loss_clip": 0.06315025, "balance_loss_mlp": 0.01260078, "epoch": 0.3050653840372764, "flos": 14470577360640.0, "grad_norm": 2.6012131319385023, "language_loss": 0.77285159, "learning_rate": 3.257434773758163e-06, "loss": 0.85088384, "num_input_tokens_seen": 109026350, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.17578125, "step": 5074, "time_per_iteration": 2.567944288253784 }, { "auxiliary_loss_clip": 0.0652142, "auxiliary_loss_mlp": 0.01272494, "balance_loss_clip": 0.0631595, "balance_loss_mlp": 0.0125553, "epoch": 0.30512550728994436, "flos": 24250736321280.0, "grad_norm": 1.8537507209709763, "language_loss": 0.74545157, "learning_rate": 3.25713189132155e-06, "loss": 0.82339072, "num_input_tokens_seen": 109044165, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.16955566, "step": 5075, "time_per_iteration": 2.5866289138793945 }, { "auxiliary_loss_clip": 0.06524231, "auxiliary_loss_mlp": 0.01273365, "balance_loss_clip": 0.06311986, "balance_loss_mlp": 0.01255459, "epoch": 0.30518563054261233, "flos": 16365774608640.0, "grad_norm": 3.9781055431845305, "language_loss": 0.76655686, "learning_rate": 3.2568289612143703e-06, "loss": 0.84453285, "num_input_tokens_seen": 109060665, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.17907715, "step": 5076, "time_per_iteration": 2.535351037979126 }, { "auxiliary_loss_clip": 0.06526893, "auxiliary_loss_mlp": 0.01273821, "balance_loss_clip": 0.06319401, "balance_loss_mlp": 0.01256452, "epoch": 0.30524575379528035, "flos": 21585952448640.0, "grad_norm": 6.342594547850706, "language_loss": 0.79528761, "learning_rate": 3.25652598344811e-06, "loss": 0.87329471, "num_input_tokens_seen": 109080035, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.17370605, "step": 5077, "time_per_iteration": 2.577613115310669 }, { "auxiliary_loss_clip": 0.06523103, "auxiliary_loss_mlp": 0.01274222, "balance_loss_clip": 0.06322388, "balance_loss_mlp": 0.01257616, "epoch": 0.3053058770479483, "flos": 16550872277760.0, "grad_norm": 1.7829702790165434, "language_loss": 0.75542325, "learning_rate": 3.256222958034259e-06, "loss": 0.83339655, "num_input_tokens_seen": 109097385, "router_z_loss_clip": 2.0078125, "router_z_loss_mlp": 0.16619873, "step": 5078, "time_per_iteration": 2.5680277347564697 }, { "auxiliary_loss_clip": 0.06526089, "auxiliary_loss_mlp": 0.01278281, "balance_loss_clip": 0.06321847, "balance_loss_mlp": 0.01262116, "epoch": 0.3053660003006163, "flos": 12317844988800.0, "grad_norm": 1.8352912822220873, "language_loss": 0.67971891, "learning_rate": 3.255919884984307e-06, "loss": 0.75776267, "num_input_tokens_seen": 109115495, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.16149902, "step": 5079, "time_per_iteration": 2.5471208095550537 }, { "auxiliary_loss_clip": 0.06525384, "auxiliary_loss_mlp": 0.01277264, "balance_loss_clip": 0.06319344, "balance_loss_mlp": 0.01259526, "epoch": 0.30542612355328425, "flos": 23118962423040.0, "grad_norm": 1.8235834238916628, "language_loss": 0.80809164, "learning_rate": 3.2556167643097477e-06, "loss": 0.88611805, "num_input_tokens_seen": 109134235, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.17736816, "step": 5080, "time_per_iteration": 3.930286169052124 }, { "auxiliary_loss_clip": 0.06536427, "auxiliary_loss_mlp": 0.01284829, "balance_loss_clip": 0.06329798, "balance_loss_mlp": 0.0126777, "epoch": 0.3054862468059522, "flos": 24396365917440.0, "grad_norm": 2.3675104643397558, "language_loss": 0.8155157, "learning_rate": 3.255313596022074e-06, "loss": 0.89372826, "num_input_tokens_seen": 109152760, "router_z_loss_clip": 2.06542969, "router_z_loss_mlp": 0.1706543, "step": 5081, "time_per_iteration": 2.5984623432159424 }, { "auxiliary_loss_clip": 0.06526423, "auxiliary_loss_mlp": 0.01280722, "balance_loss_clip": 0.06322259, "balance_loss_mlp": 0.01264247, "epoch": 0.3055463700586202, "flos": 29393529315840.0, "grad_norm": 2.2539748627888643, "language_loss": 0.72394133, "learning_rate": 3.255010380132783e-06, "loss": 0.8020128, "num_input_tokens_seen": 109173925, "router_z_loss_clip": 2.04003906, "router_z_loss_mlp": 0.16473389, "step": 5082, "time_per_iteration": 4.18803596496582 }, { "auxiliary_loss_clip": 0.06529844, "auxiliary_loss_mlp": 0.01278985, "balance_loss_clip": 0.0631977, "balance_loss_mlp": 0.01260758, "epoch": 0.30560649331128814, "flos": 25598606699520.0, "grad_norm": 2.3891227056286377, "language_loss": 0.73515749, "learning_rate": 3.2547071166533736e-06, "loss": 0.81324577, "num_input_tokens_seen": 109192510, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.18225098, "step": 5083, "time_per_iteration": 2.5959768295288086 }, { "auxiliary_loss_clip": 0.06532304, "auxiliary_loss_mlp": 0.01282481, "balance_loss_clip": 0.06324472, "balance_loss_mlp": 0.0126547, "epoch": 0.3056666165639561, "flos": 19133156206080.0, "grad_norm": 1.7305106636623253, "language_loss": 0.71043724, "learning_rate": 3.254403805595344e-06, "loss": 0.78858507, "num_input_tokens_seen": 109210885, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.16992188, "step": 5084, "time_per_iteration": 2.5577077865600586 }, { "auxiliary_loss_clip": 0.06538568, "auxiliary_loss_mlp": 0.01273824, "balance_loss_clip": 0.06329504, "balance_loss_mlp": 0.01258219, "epoch": 0.30572673981662407, "flos": 15529368752640.0, "grad_norm": 2.5263937370521954, "language_loss": 0.79302806, "learning_rate": 3.2541004469701962e-06, "loss": 0.87115192, "num_input_tokens_seen": 109229180, "router_z_loss_clip": 2.08886719, "router_z_loss_mlp": 0.15600586, "step": 5085, "time_per_iteration": 2.620373010635376 }, { "auxiliary_loss_clip": 0.06528792, "auxiliary_loss_mlp": 0.01282574, "balance_loss_clip": 0.06326464, "balance_loss_mlp": 0.01265336, "epoch": 0.30578686306929204, "flos": 21512886088320.0, "grad_norm": 8.531946474374758, "language_loss": 0.78294683, "learning_rate": 3.2537970407894342e-06, "loss": 0.8610605, "num_input_tokens_seen": 109249510, "router_z_loss_clip": 2.0234375, "router_z_loss_mlp": 0.17224121, "step": 5086, "time_per_iteration": 2.652343988418579 }, { "auxiliary_loss_clip": 0.06534691, "auxiliary_loss_mlp": 0.01279674, "balance_loss_clip": 0.06330556, "balance_loss_mlp": 0.01262103, "epoch": 0.30584698632196, "flos": 20959689432960.0, "grad_norm": 2.358978984549269, "language_loss": 0.77435881, "learning_rate": 3.253493587064563e-06, "loss": 0.85250247, "num_input_tokens_seen": 109268200, "router_z_loss_clip": 2.04003906, "router_z_loss_mlp": 0.17565918, "step": 5087, "time_per_iteration": 2.586566925048828 }, { "auxiliary_loss_clip": 0.06530536, "auxiliary_loss_mlp": 0.01277664, "balance_loss_clip": 0.06321676, "balance_loss_mlp": 0.01259341, "epoch": 0.30590710957462797, "flos": 24688044380160.0, "grad_norm": 1.9130892585560524, "language_loss": 0.72582471, "learning_rate": 3.2531900858070885e-06, "loss": 0.80390674, "num_input_tokens_seen": 109288370, "router_z_loss_clip": 2.08691406, "router_z_loss_mlp": 0.18322754, "step": 5088, "time_per_iteration": 2.5953080654144287 }, { "auxiliary_loss_clip": 0.06532422, "auxiliary_loss_mlp": 0.01274484, "balance_loss_clip": 0.06319474, "balance_loss_mlp": 0.01257008, "epoch": 0.30596723282729593, "flos": 17091700456320.0, "grad_norm": 2.588015939266201, "language_loss": 0.79521096, "learning_rate": 3.252886537028521e-06, "loss": 0.87328005, "num_input_tokens_seen": 109306730, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.17480469, "step": 5089, "time_per_iteration": 2.551939010620117 }, { "auxiliary_loss_clip": 0.06532393, "auxiliary_loss_mlp": 0.01276352, "balance_loss_clip": 0.0632563, "balance_loss_mlp": 0.01259007, "epoch": 0.30602735607996395, "flos": 22863775213440.0, "grad_norm": 2.317720977871149, "language_loss": 0.77476346, "learning_rate": 3.2525829407403703e-06, "loss": 0.85285085, "num_input_tokens_seen": 109327360, "router_z_loss_clip": 2.06445312, "router_z_loss_mlp": 0.17333984, "step": 5090, "time_per_iteration": 2.5951504707336426 }, { "auxiliary_loss_clip": 0.06534469, "auxiliary_loss_mlp": 0.01279703, "balance_loss_clip": 0.06324092, "balance_loss_mlp": 0.01262096, "epoch": 0.3060874793326319, "flos": 29869173417600.0, "grad_norm": 2.852661978738405, "language_loss": 0.77158523, "learning_rate": 3.2522792969541488e-06, "loss": 0.84972692, "num_input_tokens_seen": 109348135, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.17602539, "step": 5091, "time_per_iteration": 2.6212031841278076 }, { "auxiliary_loss_clip": 0.06530199, "auxiliary_loss_mlp": 0.01275499, "balance_loss_clip": 0.06320715, "balance_loss_mlp": 0.01258488, "epoch": 0.3061476025852999, "flos": 20454765528960.0, "grad_norm": 22.178827556683814, "language_loss": 0.71973276, "learning_rate": 3.2519756056813705e-06, "loss": 0.79778975, "num_input_tokens_seen": 109366220, "router_z_loss_clip": 2.09082031, "router_z_loss_mlp": 0.17028809, "step": 5092, "time_per_iteration": 2.57692289352417 }, { "auxiliary_loss_clip": 0.06528817, "auxiliary_loss_mlp": 0.01271449, "balance_loss_clip": 0.0632301, "balance_loss_mlp": 0.0125532, "epoch": 0.30620772583796785, "flos": 19397651218560.0, "grad_norm": 1.9331852199239556, "language_loss": 0.82821739, "learning_rate": 3.2516718669335522e-06, "loss": 0.90622008, "num_input_tokens_seen": 109385260, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.16137695, "step": 5093, "time_per_iteration": 2.555797576904297 }, { "auxiliary_loss_clip": 0.06532525, "auxiliary_loss_mlp": 0.0127851, "balance_loss_clip": 0.06327353, "balance_loss_mlp": 0.01262345, "epoch": 0.3062678490906358, "flos": 24031411459200.0, "grad_norm": 2.286379471613708, "language_loss": 0.75391591, "learning_rate": 3.2513680807222114e-06, "loss": 0.8320263, "num_input_tokens_seen": 109405025, "router_z_loss_clip": 2.05078125, "router_z_loss_mlp": 0.16174316, "step": 5094, "time_per_iteration": 2.5961365699768066 }, { "auxiliary_loss_clip": 0.06523447, "auxiliary_loss_mlp": 0.01273608, "balance_loss_clip": 0.06318745, "balance_loss_mlp": 0.01257825, "epoch": 0.3063279723433038, "flos": 19760593178880.0, "grad_norm": 3.1195508600232595, "language_loss": 0.75794148, "learning_rate": 3.251064247058868e-06, "loss": 0.83591199, "num_input_tokens_seen": 109422465, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.15783691, "step": 5095, "time_per_iteration": 2.5695648193359375 }, { "auxiliary_loss_clip": 0.06525756, "auxiliary_loss_mlp": 0.01274249, "balance_loss_clip": 0.06323065, "balance_loss_mlp": 0.0125713, "epoch": 0.30638809559597174, "flos": 22455663102720.0, "grad_norm": 1.8306586910711444, "language_loss": 0.80942744, "learning_rate": 3.250760365955042e-06, "loss": 0.88742745, "num_input_tokens_seen": 109440575, "router_z_loss_clip": 2.02636719, "router_z_loss_mlp": 0.17114258, "step": 5096, "time_per_iteration": 2.5902318954467773 }, { "auxiliary_loss_clip": 0.06529814, "auxiliary_loss_mlp": 0.01275104, "balance_loss_clip": 0.06320703, "balance_loss_mlp": 0.01259166, "epoch": 0.3064482188486397, "flos": 17170846237440.0, "grad_norm": 2.471104974156979, "language_loss": 0.82557434, "learning_rate": 3.250456437422258e-06, "loss": 0.90362346, "num_input_tokens_seen": 109459050, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.15942383, "step": 5097, "time_per_iteration": 2.5615789890289307 }, { "auxiliary_loss_clip": 0.06529269, "auxiliary_loss_mlp": 0.01279787, "balance_loss_clip": 0.06322493, "balance_loss_mlp": 0.01261584, "epoch": 0.3065083421013077, "flos": 23775176073600.0, "grad_norm": 3.4777352427790476, "language_loss": 0.79166919, "learning_rate": 3.250152461472041e-06, "loss": 0.8697598, "num_input_tokens_seen": 109475860, "router_z_loss_clip": 2.06738281, "router_z_loss_mlp": 0.18188477, "step": 5098, "time_per_iteration": 2.587460994720459 }, { "auxiliary_loss_clip": 0.06523623, "auxiliary_loss_mlp": 0.01272636, "balance_loss_clip": 0.06320612, "balance_loss_mlp": 0.01256995, "epoch": 0.30656846535397564, "flos": 26438953697280.0, "grad_norm": 1.7311603178614758, "language_loss": 0.84221089, "learning_rate": 3.249848438115917e-06, "loss": 0.92017353, "num_input_tokens_seen": 109494760, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.15649414, "step": 5099, "time_per_iteration": 2.590529441833496 }, { "auxiliary_loss_clip": 0.06525324, "auxiliary_loss_mlp": 0.01273113, "balance_loss_clip": 0.0631613, "balance_loss_mlp": 0.01255756, "epoch": 0.3066285886066436, "flos": 26659117100160.0, "grad_norm": 2.9854914885220785, "language_loss": 0.85868388, "learning_rate": 3.2495443673654148e-06, "loss": 0.93666828, "num_input_tokens_seen": 109516480, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.17346191, "step": 5100, "time_per_iteration": 2.608924388885498 }, { "auxiliary_loss_clip": 0.06523937, "auxiliary_loss_mlp": 0.01274239, "balance_loss_clip": 0.06318051, "balance_loss_mlp": 0.01257037, "epoch": 0.30668871185931157, "flos": 15055443659520.0, "grad_norm": 2.0570565070423914, "language_loss": 0.79367542, "learning_rate": 3.249240249232065e-06, "loss": 0.87165719, "num_input_tokens_seen": 109534615, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.17211914, "step": 5101, "time_per_iteration": 2.5677638053894043 }, { "auxiliary_loss_clip": 0.06526528, "auxiliary_loss_mlp": 0.01276287, "balance_loss_clip": 0.06316622, "balance_loss_mlp": 0.01259264, "epoch": 0.30674883511197953, "flos": 20087966280960.0, "grad_norm": 1.703197312234331, "language_loss": 0.80231106, "learning_rate": 3.2489360837273998e-06, "loss": 0.88033926, "num_input_tokens_seen": 109554040, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.17028809, "step": 5102, "time_per_iteration": 2.564847946166992 }, { "auxiliary_loss_clip": 0.06526084, "auxiliary_loss_mlp": 0.01275273, "balance_loss_clip": 0.06318834, "balance_loss_mlp": 0.01258238, "epoch": 0.30680895836464755, "flos": 22900518028800.0, "grad_norm": 2.0637330867863315, "language_loss": 0.89243299, "learning_rate": 3.2486318708629532e-06, "loss": 0.97044659, "num_input_tokens_seen": 109574345, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.17028809, "step": 5103, "time_per_iteration": 2.584484577178955 }, { "auxiliary_loss_clip": 0.06523575, "auxiliary_loss_mlp": 0.01273708, "balance_loss_clip": 0.06318772, "balance_loss_mlp": 0.01257341, "epoch": 0.3068690816173155, "flos": 23702948254080.0, "grad_norm": 6.063450491085786, "language_loss": 0.74568468, "learning_rate": 3.2483276106502607e-06, "loss": 0.82365751, "num_input_tokens_seen": 109593670, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.16345215, "step": 5104, "time_per_iteration": 2.5898942947387695 }, { "auxiliary_loss_clip": 0.06529628, "auxiliary_loss_mlp": 0.01273318, "balance_loss_clip": 0.06317082, "balance_loss_mlp": 0.01256796, "epoch": 0.3069292048699835, "flos": 23557947563520.0, "grad_norm": 2.299003008388518, "language_loss": 0.73699403, "learning_rate": 3.2480233031008605e-06, "loss": 0.81502354, "num_input_tokens_seen": 109613385, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.16516113, "step": 5105, "time_per_iteration": 2.581678867340088 }, { "auxiliary_loss_clip": 0.06519556, "auxiliary_loss_mlp": 0.0127857, "balance_loss_clip": 0.06312899, "balance_loss_mlp": 0.01261475, "epoch": 0.30698932812265145, "flos": 24537970517760.0, "grad_norm": 1.9017011611769952, "language_loss": 0.87614042, "learning_rate": 3.2477189482262916e-06, "loss": 0.95412171, "num_input_tokens_seen": 109632395, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.17102051, "step": 5106, "time_per_iteration": 4.0185651779174805 }, { "auxiliary_loss_clip": 0.06527209, "auxiliary_loss_mlp": 0.01274472, "balance_loss_clip": 0.06312877, "balance_loss_mlp": 0.01257544, "epoch": 0.3070494513753194, "flos": 21002805158400.0, "grad_norm": 14.345853440390522, "language_loss": 0.71941775, "learning_rate": 3.2474145460380945e-06, "loss": 0.79743457, "num_input_tokens_seen": 109651380, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.16918945, "step": 5107, "time_per_iteration": 2.5669500827789307 }, { "auxiliary_loss_clip": 0.0651867, "auxiliary_loss_mlp": 0.01276482, "balance_loss_clip": 0.06314479, "balance_loss_mlp": 0.01258803, "epoch": 0.3071095746279874, "flos": 19031942073600.0, "grad_norm": 2.755175200462444, "language_loss": 0.73079735, "learning_rate": 3.247110096547814e-06, "loss": 0.80874884, "num_input_tokens_seen": 109670240, "router_z_loss_clip": 2.03808594, "router_z_loss_mlp": 0.17663574, "step": 5108, "time_per_iteration": 2.5647826194763184 }, { "auxiliary_loss_clip": 0.06527732, "auxiliary_loss_mlp": 0.01279164, "balance_loss_clip": 0.06321232, "balance_loss_mlp": 0.01262892, "epoch": 0.30716969788065535, "flos": 21221962312320.0, "grad_norm": 2.215013257449961, "language_loss": 0.86345863, "learning_rate": 3.2468055997669926e-06, "loss": 0.94152761, "num_input_tokens_seen": 109690810, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.16271973, "step": 5109, "time_per_iteration": 2.5904619693756104 }, { "auxiliary_loss_clip": 0.06527096, "auxiliary_loss_mlp": 0.01277929, "balance_loss_clip": 0.06320289, "balance_loss_mlp": 0.01260858, "epoch": 0.3072298211333233, "flos": 25779385883520.0, "grad_norm": 1.6212041157038017, "language_loss": 0.67776746, "learning_rate": 3.2465010557071788e-06, "loss": 0.75581765, "num_input_tokens_seen": 109711145, "router_z_loss_clip": 2.06445312, "router_z_loss_mlp": 0.17089844, "step": 5110, "time_per_iteration": 2.612837314605713 }, { "auxiliary_loss_clip": 0.06525147, "auxiliary_loss_mlp": 0.01272414, "balance_loss_clip": 0.06322029, "balance_loss_mlp": 0.01256965, "epoch": 0.3072899443859913, "flos": 25856099896320.0, "grad_norm": 2.0132959113877225, "language_loss": 0.76906538, "learning_rate": 3.246196464379919e-06, "loss": 0.84704101, "num_input_tokens_seen": 109731425, "router_z_loss_clip": 2.03027344, "router_z_loss_mlp": 0.15441895, "step": 5111, "time_per_iteration": 2.6322648525238037 }, { "auxiliary_loss_clip": 0.06532478, "auxiliary_loss_mlp": 0.01277299, "balance_loss_clip": 0.06321201, "balance_loss_mlp": 0.0125987, "epoch": 0.30735006763865924, "flos": 25930130578560.0, "grad_norm": 1.9663567615756705, "language_loss": 0.67570734, "learning_rate": 3.245891825796765e-06, "loss": 0.75380504, "num_input_tokens_seen": 109752720, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.17431641, "step": 5112, "time_per_iteration": 4.052348375320435 }, { "auxiliary_loss_clip": 0.06542049, "auxiliary_loss_mlp": 0.01272857, "balance_loss_clip": 0.06329469, "balance_loss_mlp": 0.01254344, "epoch": 0.3074101908913272, "flos": 30924442938240.0, "grad_norm": 2.82136417932689, "language_loss": 0.80005902, "learning_rate": 3.2455871399692678e-06, "loss": 0.87820798, "num_input_tokens_seen": 109772840, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.18518066, "step": 5113, "time_per_iteration": 2.6627037525177 }, { "auxiliary_loss_clip": 0.0654109, "auxiliary_loss_mlp": 0.01273171, "balance_loss_clip": 0.06329643, "balance_loss_mlp": 0.01255791, "epoch": 0.30747031414399517, "flos": 18406182182400.0, "grad_norm": 2.021156179983298, "language_loss": 0.77921158, "learning_rate": 3.2452824069089815e-06, "loss": 0.85735416, "num_input_tokens_seen": 109790150, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.17382812, "step": 5114, "time_per_iteration": 2.5378644466400146 }, { "auxiliary_loss_clip": 0.06540917, "auxiliary_loss_mlp": 0.01277031, "balance_loss_clip": 0.06334408, "balance_loss_mlp": 0.01259078, "epoch": 0.30753043739666314, "flos": 22638957909120.0, "grad_norm": 1.7713769243571433, "language_loss": 0.61886698, "learning_rate": 3.2449776266274623e-06, "loss": 0.69704646, "num_input_tokens_seen": 109807985, "router_z_loss_clip": 2.06445312, "router_z_loss_mlp": 0.17956543, "step": 5115, "time_per_iteration": 2.57409930229187 }, { "auxiliary_loss_clip": 0.0653864, "auxiliary_loss_mlp": 0.01275326, "balance_loss_clip": 0.06329258, "balance_loss_mlp": 0.01257551, "epoch": 0.3075905606493311, "flos": 27351360806400.0, "grad_norm": 2.362942152264421, "language_loss": 0.83096313, "learning_rate": 3.2446727991362657e-06, "loss": 0.9091028, "num_input_tokens_seen": 109825920, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.17773438, "step": 5116, "time_per_iteration": 2.619229793548584 }, { "auxiliary_loss_clip": 0.0653533, "auxiliary_loss_mlp": 0.01274271, "balance_loss_clip": 0.06327148, "balance_loss_mlp": 0.01257129, "epoch": 0.3076506839019991, "flos": 22097333116800.0, "grad_norm": 1.6798668809866322, "language_loss": 0.76357371, "learning_rate": 3.244367924446952e-06, "loss": 0.84166968, "num_input_tokens_seen": 109846220, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.17126465, "step": 5117, "time_per_iteration": 2.5836312770843506 }, { "auxiliary_loss_clip": 0.06537064, "auxiliary_loss_mlp": 0.01276361, "balance_loss_clip": 0.06327623, "balance_loss_mlp": 0.01258503, "epoch": 0.3077108071546671, "flos": 21296160702720.0, "grad_norm": 3.1856794882319046, "language_loss": 0.7186023, "learning_rate": 3.2440630025710826e-06, "loss": 0.79673654, "num_input_tokens_seen": 109863870, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.17858887, "step": 5118, "time_per_iteration": 2.57287859916687 }, { "auxiliary_loss_clip": 0.06529099, "auxiliary_loss_mlp": 0.01275796, "balance_loss_clip": 0.06320029, "balance_loss_mlp": 0.01258809, "epoch": 0.30777093040733505, "flos": 21436884835200.0, "grad_norm": 1.5630477050363871, "language_loss": 0.74700141, "learning_rate": 3.243758033520219e-06, "loss": 0.82505035, "num_input_tokens_seen": 109883500, "router_z_loss_clip": 2.09472656, "router_z_loss_mlp": 0.1697998, "step": 5119, "time_per_iteration": 3.9141697883605957 }, { "auxiliary_loss_clip": 0.06536862, "auxiliary_loss_mlp": 0.01276987, "balance_loss_clip": 0.06324925, "balance_loss_mlp": 0.01257687, "epoch": 0.307831053660003, "flos": 23156040654720.0, "grad_norm": 2.082873385397417, "language_loss": 0.80543393, "learning_rate": 3.243453017305926e-06, "loss": 0.8835724, "num_input_tokens_seen": 109904620, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.19299316, "step": 5120, "time_per_iteration": 2.621696949005127 }, { "auxiliary_loss_clip": 0.06528731, "auxiliary_loss_mlp": 0.01270899, "balance_loss_clip": 0.06322718, "balance_loss_mlp": 0.01254293, "epoch": 0.307891176912671, "flos": 17025510130560.0, "grad_norm": 3.2265466987897238, "language_loss": 0.80354965, "learning_rate": 3.24314795393977e-06, "loss": 0.8815459, "num_input_tokens_seen": 109922275, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.16601562, "step": 5121, "time_per_iteration": 4.014821290969849 }, { "auxiliary_loss_clip": 0.06532881, "auxiliary_loss_mlp": 0.01272866, "balance_loss_clip": 0.06326628, "balance_loss_mlp": 0.01255891, "epoch": 0.30795130016533895, "flos": 27711745217280.0, "grad_norm": 2.3330849886618337, "language_loss": 0.8318162, "learning_rate": 3.242842843433319e-06, "loss": 0.90987366, "num_input_tokens_seen": 109944265, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.16955566, "step": 5122, "time_per_iteration": 2.671880006790161 }, { "auxiliary_loss_clip": 0.06497507, "auxiliary_loss_mlp": 0.01276186, "balance_loss_clip": 0.06386491, "balance_loss_mlp": 0.01269773, "epoch": 0.3080114234180069, "flos": 69080973373440.0, "grad_norm": 0.7351666099603476, "language_loss": 0.58663929, "learning_rate": 3.242537685798143e-06, "loss": 0.66437626, "num_input_tokens_seen": 110014160, "router_z_loss_clip": 1.109375, "router_z_loss_mlp": 0.06420898, "step": 5123, "time_per_iteration": 3.3773245811462402 }, { "auxiliary_loss_clip": 0.06536394, "auxiliary_loss_mlp": 0.01280872, "balance_loss_clip": 0.0632254, "balance_loss_mlp": 0.01263074, "epoch": 0.3080715466706749, "flos": 24066938390400.0, "grad_norm": 2.211365658632679, "language_loss": 0.83734787, "learning_rate": 3.242232481045813e-06, "loss": 0.91552055, "num_input_tokens_seen": 110034865, "router_z_loss_clip": 2.13769531, "router_z_loss_mlp": 0.17810059, "step": 5124, "time_per_iteration": 2.6314468383789062 }, { "auxiliary_loss_clip": 0.06534402, "auxiliary_loss_mlp": 0.01272371, "balance_loss_clip": 0.06323364, "balance_loss_mlp": 0.01255693, "epoch": 0.30813166992334284, "flos": 25855806407040.0, "grad_norm": 83.52619443795061, "language_loss": 0.79907441, "learning_rate": 3.2419272291879035e-06, "loss": 0.87714213, "num_input_tokens_seen": 110052930, "router_z_loss_clip": 2.10839844, "router_z_loss_mlp": 0.16662598, "step": 5125, "time_per_iteration": 2.60790753364563 }, { "auxiliary_loss_clip": 0.06538041, "auxiliary_loss_mlp": 0.01275395, "balance_loss_clip": 0.06324184, "balance_loss_mlp": 0.01257192, "epoch": 0.3081917931760108, "flos": 20455981413120.0, "grad_norm": 1.8038520261033084, "language_loss": 0.65133899, "learning_rate": 3.241621930235989e-06, "loss": 0.72947335, "num_input_tokens_seen": 110071765, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.18212891, "step": 5126, "time_per_iteration": 2.5525662899017334 }, { "auxiliary_loss_clip": 0.06522332, "auxiliary_loss_mlp": 0.01274009, "balance_loss_clip": 0.0631891, "balance_loss_mlp": 0.01256998, "epoch": 0.3082519164286788, "flos": 22173208588800.0, "grad_norm": 2.3252999365813487, "language_loss": 0.87112755, "learning_rate": 3.241316584201646e-06, "loss": 0.94909096, "num_input_tokens_seen": 110092660, "router_z_loss_clip": 2.03710938, "router_z_loss_mlp": 0.17004395, "step": 5127, "time_per_iteration": 2.590322732925415 }, { "auxiliary_loss_clip": 0.06520911, "auxiliary_loss_mlp": 0.01271771, "balance_loss_clip": 0.06314521, "balance_loss_mlp": 0.01255022, "epoch": 0.30831203968134674, "flos": 28921029742080.0, "grad_norm": 1.741830378317085, "language_loss": 0.69094026, "learning_rate": 3.2410111910964538e-06, "loss": 0.76886714, "num_input_tokens_seen": 110114960, "router_z_loss_clip": 2.06347656, "router_z_loss_mlp": 0.16760254, "step": 5128, "time_per_iteration": 2.6340177059173584 }, { "auxiliary_loss_clip": 0.06532493, "auxiliary_loss_mlp": 0.01278958, "balance_loss_clip": 0.06321029, "balance_loss_mlp": 0.01260957, "epoch": 0.3083721629340147, "flos": 25675069150080.0, "grad_norm": 1.8559578360615039, "language_loss": 0.71722686, "learning_rate": 3.240705750931993e-06, "loss": 0.79534137, "num_input_tokens_seen": 110135750, "router_z_loss_clip": 2.11621094, "router_z_loss_mlp": 0.18017578, "step": 5129, "time_per_iteration": 2.6026957035064697 }, { "auxiliary_loss_clip": 0.06480652, "auxiliary_loss_mlp": 0.01270407, "balance_loss_clip": 0.06369989, "balance_loss_mlp": 0.01264321, "epoch": 0.3084322861866827, "flos": 68233666487040.0, "grad_norm": 0.8117106393836929, "language_loss": 0.58988726, "learning_rate": 3.240400263719846e-06, "loss": 0.66739786, "num_input_tokens_seen": 110189480, "router_z_loss_clip": 1.109375, "router_z_loss_mlp": 0.06082153, "step": 5130, "time_per_iteration": 3.2755796909332275 }, { "auxiliary_loss_clip": 0.06531253, "auxiliary_loss_mlp": 0.01276938, "balance_loss_clip": 0.06318657, "balance_loss_mlp": 0.0125939, "epoch": 0.3084924094393507, "flos": 20301630992640.0, "grad_norm": 4.616203868837887, "language_loss": 0.73274541, "learning_rate": 3.2400947294715957e-06, "loss": 0.81082737, "num_input_tokens_seen": 110206445, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.17541504, "step": 5131, "time_per_iteration": 2.546383857727051 }, { "auxiliary_loss_clip": 0.06518957, "auxiliary_loss_mlp": 0.01273416, "balance_loss_clip": 0.06310713, "balance_loss_mlp": 0.01256369, "epoch": 0.30855253269201866, "flos": 23956374528000.0, "grad_norm": 1.7164765785897917, "language_loss": 0.71320534, "learning_rate": 3.2397891481988303e-06, "loss": 0.79112905, "num_input_tokens_seen": 110226845, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.17053223, "step": 5132, "time_per_iteration": 2.587052583694458 }, { "auxiliary_loss_clip": 0.06510961, "auxiliary_loss_mlp": 0.01272404, "balance_loss_clip": 0.06305704, "balance_loss_mlp": 0.01255977, "epoch": 0.3086126559446866, "flos": 19288009751040.0, "grad_norm": 2.8469606210278213, "language_loss": 0.9061498, "learning_rate": 3.239483519913136e-06, "loss": 0.9839834, "num_input_tokens_seen": 110244095, "router_z_loss_clip": 2.05078125, "router_z_loss_mlp": 0.16418457, "step": 5133, "time_per_iteration": 2.7412807941436768 }, { "auxiliary_loss_clip": 0.06526122, "auxiliary_loss_mlp": 0.01277016, "balance_loss_clip": 0.06313337, "balance_loss_mlp": 0.01258873, "epoch": 0.3086727791973546, "flos": 33768328913280.0, "grad_norm": 2.3010977587130306, "language_loss": 0.68000042, "learning_rate": 3.239177844626102e-06, "loss": 0.75803179, "num_input_tokens_seen": 110264240, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.18151855, "step": 5134, "time_per_iteration": 2.7177135944366455 }, { "auxiliary_loss_clip": 0.06525761, "auxiliary_loss_mlp": 0.01282627, "balance_loss_clip": 0.06312259, "balance_loss_mlp": 0.0126508, "epoch": 0.30873290245002255, "flos": 16039659317760.0, "grad_norm": 3.5260916501425057, "language_loss": 0.83508468, "learning_rate": 3.2388721223493197e-06, "loss": 0.91316855, "num_input_tokens_seen": 110282450, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.17553711, "step": 5135, "time_per_iteration": 2.551744222640991 }, { "auxiliary_loss_clip": 0.06434441, "auxiliary_loss_mlp": 0.0128093, "balance_loss_clip": 0.06325096, "balance_loss_mlp": 0.01274582, "epoch": 0.3087930257026905, "flos": 65070415474560.0, "grad_norm": 0.6899788221406663, "language_loss": 0.55235857, "learning_rate": 3.2385663530943824e-06, "loss": 0.62951231, "num_input_tokens_seen": 110343715, "router_z_loss_clip": 1.09375, "router_z_loss_mlp": 0.0635376, "step": 5136, "time_per_iteration": 3.249021530151367 }, { "auxiliary_loss_clip": 0.06515201, "auxiliary_loss_mlp": 0.01275899, "balance_loss_clip": 0.06306183, "balance_loss_mlp": 0.012589, "epoch": 0.3088531489553585, "flos": 74754001733760.0, "grad_norm": 2.3771995825039025, "language_loss": 0.7641598, "learning_rate": 3.2382605368728852e-06, "loss": 0.84207082, "num_input_tokens_seen": 110368430, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.17004395, "step": 5137, "time_per_iteration": 2.974426746368408 }, { "auxiliary_loss_clip": 0.06509812, "auxiliary_loss_mlp": 0.01274294, "balance_loss_clip": 0.06302658, "balance_loss_mlp": 0.01258189, "epoch": 0.30891327220802645, "flos": 21148686316800.0, "grad_norm": 1.7469989521161324, "language_loss": 0.80145609, "learning_rate": 3.237954673696424e-06, "loss": 0.8792972, "num_input_tokens_seen": 110386735, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.16088867, "step": 5138, "time_per_iteration": 2.5841968059539795 }, { "auxiliary_loss_clip": 0.06516023, "auxiliary_loss_mlp": 0.01282843, "balance_loss_clip": 0.06305284, "balance_loss_mlp": 0.01265546, "epoch": 0.3089733954606944, "flos": 25671295716480.0, "grad_norm": 2.43919889860907, "language_loss": 0.81616813, "learning_rate": 3.2376487635765983e-06, "loss": 0.89415681, "num_input_tokens_seen": 110406820, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.17285156, "step": 5139, "time_per_iteration": 2.6230506896972656 }, { "auxiliary_loss_clip": 0.06523733, "auxiliary_loss_mlp": 0.0128233, "balance_loss_clip": 0.06305289, "balance_loss_mlp": 0.01263913, "epoch": 0.3090335187133624, "flos": 19433429712000.0, "grad_norm": 3.1358224349421526, "language_loss": 0.77312291, "learning_rate": 3.2373428065250067e-06, "loss": 0.85118353, "num_input_tokens_seen": 110424225, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.18395996, "step": 5140, "time_per_iteration": 2.586731433868408 }, { "auxiliary_loss_clip": 0.06497891, "auxiliary_loss_mlp": 0.01276743, "balance_loss_clip": 0.06296563, "balance_loss_mlp": 0.01260817, "epoch": 0.30909364196603034, "flos": 20017541324160.0, "grad_norm": 2.307519708466903, "language_loss": 0.78851527, "learning_rate": 3.237036802553252e-06, "loss": 0.8662616, "num_input_tokens_seen": 110443310, "router_z_loss_clip": 2.01269531, "router_z_loss_mlp": 0.15905762, "step": 5141, "time_per_iteration": 2.582371234893799 }, { "auxiliary_loss_clip": 0.06511845, "auxiliary_loss_mlp": 0.01282805, "balance_loss_clip": 0.0629873, "balance_loss_mlp": 0.01265281, "epoch": 0.3091537652186983, "flos": 19682830990080.0, "grad_norm": 2.1353603656982836, "language_loss": 0.87349939, "learning_rate": 3.2367307516729377e-06, "loss": 0.95144582, "num_input_tokens_seen": 110460215, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.17529297, "step": 5142, "time_per_iteration": 2.5594594478607178 }, { "auxiliary_loss_clip": 0.06517793, "auxiliary_loss_mlp": 0.01283695, "balance_loss_clip": 0.06304167, "balance_loss_mlp": 0.01266755, "epoch": 0.3092138884713663, "flos": 17025845546880.0, "grad_norm": 1.741166501949385, "language_loss": 0.79484284, "learning_rate": 3.23642465389567e-06, "loss": 0.87285769, "num_input_tokens_seen": 110479385, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.16931152, "step": 5143, "time_per_iteration": 2.5637009143829346 }, { "auxiliary_loss_clip": 0.06507794, "auxiliary_loss_mlp": 0.01273868, "balance_loss_clip": 0.06300692, "balance_loss_mlp": 0.01256881, "epoch": 0.3092740117240343, "flos": 25017052636800.0, "grad_norm": 1.66209130913083, "language_loss": 0.7254194, "learning_rate": 3.236118509233055e-06, "loss": 0.80323601, "num_input_tokens_seen": 110499885, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.17004395, "step": 5144, "time_per_iteration": 2.602858543395996 }, { "auxiliary_loss_clip": 0.06513996, "auxiliary_loss_mlp": 0.01273577, "balance_loss_clip": 0.06301796, "balance_loss_mlp": 0.01256733, "epoch": 0.30933413497670226, "flos": 25597013472000.0, "grad_norm": 1.9865597490751565, "language_loss": 0.74703908, "learning_rate": 3.235812317696702e-06, "loss": 0.82491481, "num_input_tokens_seen": 110519690, "router_z_loss_clip": 2.11816406, "router_z_loss_mlp": 0.16845703, "step": 5145, "time_per_iteration": 2.645388126373291 }, { "auxiliary_loss_clip": 0.06505968, "auxiliary_loss_mlp": 0.01274466, "balance_loss_clip": 0.06296719, "balance_loss_mlp": 0.01257503, "epoch": 0.3093942582293702, "flos": 24396617479680.0, "grad_norm": 1.7396985941090584, "language_loss": 0.76794863, "learning_rate": 3.2355060792982224e-06, "loss": 0.84575295, "num_input_tokens_seen": 110540520, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.16955566, "step": 5146, "time_per_iteration": 4.011385917663574 }, { "auxiliary_loss_clip": 0.06510374, "auxiliary_loss_mlp": 0.01270795, "balance_loss_clip": 0.06303158, "balance_loss_mlp": 0.0125438, "epoch": 0.3094543814820382, "flos": 19652586865920.0, "grad_norm": 5.348784579319492, "language_loss": 0.67013848, "learning_rate": 3.2351997940492286e-06, "loss": 0.7479502, "num_input_tokens_seen": 110557950, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.16430664, "step": 5147, "time_per_iteration": 2.5649049282073975 }, { "auxiliary_loss_clip": 0.06509463, "auxiliary_loss_mlp": 0.01271888, "balance_loss_clip": 0.06297199, "balance_loss_mlp": 0.01254948, "epoch": 0.30951450473470615, "flos": 25670499102720.0, "grad_norm": 3.352551618183583, "language_loss": 0.74974918, "learning_rate": 3.2348934619613346e-06, "loss": 0.82756263, "num_input_tokens_seen": 110578215, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.16943359, "step": 5148, "time_per_iteration": 2.5873262882232666 }, { "auxiliary_loss_clip": 0.06516521, "auxiliary_loss_mlp": 0.01276164, "balance_loss_clip": 0.06297482, "balance_loss_mlp": 0.01258306, "epoch": 0.3095746279873741, "flos": 12025202204160.0, "grad_norm": 2.4444907404560734, "language_loss": 0.73274469, "learning_rate": 3.2345870830461567e-06, "loss": 0.81067157, "num_input_tokens_seen": 110592990, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.17871094, "step": 5149, "time_per_iteration": 2.5480899810791016 }, { "auxiliary_loss_clip": 0.06514278, "auxiliary_loss_mlp": 0.01283212, "balance_loss_clip": 0.06298959, "balance_loss_mlp": 0.01265224, "epoch": 0.3096347512400421, "flos": 23629798039680.0, "grad_norm": 4.117194587368444, "language_loss": 0.85347092, "learning_rate": 3.2342806573153132e-06, "loss": 0.93144572, "num_input_tokens_seen": 110612130, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.17993164, "step": 5150, "time_per_iteration": 2.5761077404022217 }, { "auxiliary_loss_clip": 0.06497918, "auxiliary_loss_mlp": 0.0127037, "balance_loss_clip": 0.06290652, "balance_loss_mlp": 0.01253168, "epoch": 0.30969487449271005, "flos": 22536024768000.0, "grad_norm": 1.7938989130314786, "language_loss": 0.79531044, "learning_rate": 3.233974184780424e-06, "loss": 0.87299335, "num_input_tokens_seen": 110632045, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.17199707, "step": 5151, "time_per_iteration": 4.005594730377197 }, { "auxiliary_loss_clip": 0.06502392, "auxiliary_loss_mlp": 0.01274034, "balance_loss_clip": 0.06292079, "balance_loss_mlp": 0.01257058, "epoch": 0.309754997745378, "flos": 15273301075200.0, "grad_norm": 7.870484341295499, "language_loss": 0.67993438, "learning_rate": 3.2336676654531084e-06, "loss": 0.75769871, "num_input_tokens_seen": 110649340, "router_z_loss_clip": 2.10449219, "router_z_loss_mlp": 0.16967773, "step": 5152, "time_per_iteration": 2.561593532562256 }, { "auxiliary_loss_clip": 0.06502928, "auxiliary_loss_mlp": 0.01274698, "balance_loss_clip": 0.06296552, "balance_loss_mlp": 0.01256697, "epoch": 0.309815120998046, "flos": 26986532129280.0, "grad_norm": 1.9648396231817282, "language_loss": 0.83188456, "learning_rate": 3.2333610993449926e-06, "loss": 0.90966082, "num_input_tokens_seen": 110668450, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.17993164, "step": 5153, "time_per_iteration": 2.5987627506256104 }, { "auxiliary_loss_clip": 0.06498405, "auxiliary_loss_mlp": 0.01272088, "balance_loss_clip": 0.06292633, "balance_loss_mlp": 0.01255637, "epoch": 0.30987524425071394, "flos": 21149692565760.0, "grad_norm": 2.5971406813078732, "language_loss": 0.74540931, "learning_rate": 3.2330544864676997e-06, "loss": 0.82311422, "num_input_tokens_seen": 110689410, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.16442871, "step": 5154, "time_per_iteration": 2.628952980041504 }, { "auxiliary_loss_clip": 0.06498022, "auxiliary_loss_mlp": 0.01277878, "balance_loss_clip": 0.06293041, "balance_loss_mlp": 0.01260986, "epoch": 0.3099353675033819, "flos": 15273720345600.0, "grad_norm": 1.9066381267938233, "language_loss": 0.76652342, "learning_rate": 3.232747826832858e-06, "loss": 0.84428239, "num_input_tokens_seen": 110707350, "router_z_loss_clip": 2.04980469, "router_z_loss_mlp": 0.16894531, "step": 5155, "time_per_iteration": 2.5454344749450684 }, { "auxiliary_loss_clip": 0.06501806, "auxiliary_loss_mlp": 0.01270717, "balance_loss_clip": 0.06291986, "balance_loss_mlp": 0.01253944, "epoch": 0.30999549075604993, "flos": 15419182233600.0, "grad_norm": 2.069890005276822, "language_loss": 0.79580212, "learning_rate": 3.232441120452094e-06, "loss": 0.87352729, "num_input_tokens_seen": 110724910, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.16760254, "step": 5156, "time_per_iteration": 2.5495402812957764 }, { "auxiliary_loss_clip": 0.06501843, "auxiliary_loss_mlp": 0.01276655, "balance_loss_clip": 0.06291739, "balance_loss_mlp": 0.01259751, "epoch": 0.3100556140087179, "flos": 23191106388480.0, "grad_norm": 2.1071104559062297, "language_loss": 0.75513387, "learning_rate": 3.23213436733704e-06, "loss": 0.83291888, "num_input_tokens_seen": 110744010, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.16894531, "step": 5157, "time_per_iteration": 2.5621588230133057 }, { "auxiliary_loss_clip": 0.06499546, "auxiliary_loss_mlp": 0.0127342, "balance_loss_clip": 0.06295522, "balance_loss_mlp": 0.01257708, "epoch": 0.31011573726138586, "flos": 25749770664960.0, "grad_norm": 1.6114340988975606, "language_loss": 0.69657218, "learning_rate": 3.231827567499327e-06, "loss": 0.77430177, "num_input_tokens_seen": 110765835, "router_z_loss_clip": 2.0390625, "router_z_loss_mlp": 0.15710449, "step": 5158, "time_per_iteration": 4.052194595336914 }, { "auxiliary_loss_clip": 0.06504522, "auxiliary_loss_mlp": 0.01273176, "balance_loss_clip": 0.06300481, "balance_loss_mlp": 0.01257798, "epoch": 0.3101758605140538, "flos": 20017541324160.0, "grad_norm": 2.636953379086012, "language_loss": 0.85095257, "learning_rate": 3.2315207209505896e-06, "loss": 0.92872953, "num_input_tokens_seen": 110784655, "router_z_loss_clip": 2.04003906, "router_z_loss_mlp": 0.15368652, "step": 5159, "time_per_iteration": 2.557788610458374 }, { "auxiliary_loss_clip": 0.0650074, "auxiliary_loss_mlp": 0.0127577, "balance_loss_clip": 0.06294897, "balance_loss_mlp": 0.01258795, "epoch": 0.3102359837667218, "flos": 19141751249280.0, "grad_norm": 2.7373977722791514, "language_loss": 0.84825265, "learning_rate": 3.231213827702462e-06, "loss": 0.92601776, "num_input_tokens_seen": 110802545, "router_z_loss_clip": 2.05957031, "router_z_loss_mlp": 0.16967773, "step": 5160, "time_per_iteration": 2.5538337230682373 }, { "auxiliary_loss_clip": 0.06499752, "auxiliary_loss_mlp": 0.01271313, "balance_loss_clip": 0.06296406, "balance_loss_mlp": 0.01255577, "epoch": 0.31029610701938976, "flos": 22270649287680.0, "grad_norm": 2.5304222132656333, "language_loss": 0.76241994, "learning_rate": 3.230906887766584e-06, "loss": 0.84013057, "num_input_tokens_seen": 110820265, "router_z_loss_clip": 2.03320312, "router_z_loss_mlp": 0.15759277, "step": 5161, "time_per_iteration": 4.020951271057129 }, { "auxiliary_loss_clip": 0.06505205, "auxiliary_loss_mlp": 0.01275533, "balance_loss_clip": 0.0629645, "balance_loss_mlp": 0.01258498, "epoch": 0.3103562302720577, "flos": 20810244476160.0, "grad_norm": 1.9226945816500056, "language_loss": 0.82505727, "learning_rate": 3.2305999011545924e-06, "loss": 0.90286469, "num_input_tokens_seen": 110836195, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.17028809, "step": 5162, "time_per_iteration": 2.556889533996582 }, { "auxiliary_loss_clip": 0.06501912, "auxiliary_loss_mlp": 0.01268975, "balance_loss_clip": 0.06298827, "balance_loss_mlp": 0.01253573, "epoch": 0.3104163535247257, "flos": 22350382047360.0, "grad_norm": 2.084244002720156, "language_loss": 0.82759225, "learning_rate": 3.2302928678781295e-06, "loss": 0.90530109, "num_input_tokens_seen": 110856420, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.15405273, "step": 5163, "time_per_iteration": 2.5902724266052246 }, { "auxiliary_loss_clip": 0.06505944, "auxiliary_loss_mlp": 0.01268706, "balance_loss_clip": 0.06299725, "balance_loss_mlp": 0.01252648, "epoch": 0.31047647677739365, "flos": 21695803551360.0, "grad_norm": 1.886234465317836, "language_loss": 0.76279283, "learning_rate": 3.2299857879488376e-06, "loss": 0.84053928, "num_input_tokens_seen": 110876650, "router_z_loss_clip": 2.06152344, "router_z_loss_mlp": 0.16052246, "step": 5164, "time_per_iteration": 2.5884382724761963 }, { "auxiliary_loss_clip": 0.0649998, "auxiliary_loss_mlp": 0.01274584, "balance_loss_clip": 0.0629565, "balance_loss_mlp": 0.01258253, "epoch": 0.3105366000300616, "flos": 18923390709120.0, "grad_norm": 1.9492038966004421, "language_loss": 0.75316709, "learning_rate": 3.2296786613783626e-06, "loss": 0.83091271, "num_input_tokens_seen": 110894445, "router_z_loss_clip": 2.04101562, "router_z_loss_mlp": 0.16320801, "step": 5165, "time_per_iteration": 2.5781514644622803 }, { "auxiliary_loss_clip": 0.06498455, "auxiliary_loss_mlp": 0.01270843, "balance_loss_clip": 0.0629425, "balance_loss_mlp": 0.01254464, "epoch": 0.3105967232827296, "flos": 18266380444800.0, "grad_norm": 1.5407722717287893, "language_loss": 0.76638854, "learning_rate": 3.229371488178348e-06, "loss": 0.84408146, "num_input_tokens_seen": 110912855, "router_z_loss_clip": 2.03808594, "router_z_loss_mlp": 0.16381836, "step": 5166, "time_per_iteration": 2.645725965499878 }, { "auxiliary_loss_clip": 0.06501429, "auxiliary_loss_mlp": 0.01275122, "balance_loss_clip": 0.0629618, "balance_loss_mlp": 0.01258302, "epoch": 0.31065684653539755, "flos": 17677279514880.0, "grad_norm": 2.7048437424107292, "language_loss": 0.74290192, "learning_rate": 3.229064268360444e-06, "loss": 0.82066739, "num_input_tokens_seen": 110928025, "router_z_loss_clip": 2.05371094, "router_z_loss_mlp": 0.16821289, "step": 5167, "time_per_iteration": 2.541994094848633 }, { "auxiliary_loss_clip": 0.06435375, "auxiliary_loss_mlp": 0.01279502, "balance_loss_clip": 0.0633086, "balance_loss_mlp": 0.01273363, "epoch": 0.3107169697880655, "flos": 68551522151040.0, "grad_norm": 0.7274442323928012, "language_loss": 0.52975553, "learning_rate": 3.2287570019362997e-06, "loss": 0.60690427, "num_input_tokens_seen": 110992215, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.06130981, "step": 5168, "time_per_iteration": 3.274554491043091 }, { "auxiliary_loss_clip": 0.06506252, "auxiliary_loss_mlp": 0.01272616, "balance_loss_clip": 0.06297691, "balance_loss_mlp": 0.01256523, "epoch": 0.3107770930407335, "flos": 13193844698880.0, "grad_norm": 1.9060292454686982, "language_loss": 0.79277498, "learning_rate": 3.2284496889175668e-06, "loss": 0.87056363, "num_input_tokens_seen": 111010400, "router_z_loss_clip": 2.08496094, "router_z_loss_mlp": 0.16088867, "step": 5169, "time_per_iteration": 2.5563721656799316 }, { "auxiliary_loss_clip": 0.06503172, "auxiliary_loss_mlp": 0.01276569, "balance_loss_clip": 0.06294741, "balance_loss_mlp": 0.01258747, "epoch": 0.3108372162934015, "flos": 31589587048320.0, "grad_norm": 1.802461459459535, "language_loss": 0.64530742, "learning_rate": 3.2281423293158986e-06, "loss": 0.72310483, "num_input_tokens_seen": 111033960, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.17822266, "step": 5170, "time_per_iteration": 2.6969523429870605 }, { "auxiliary_loss_clip": 0.06504799, "auxiliary_loss_mlp": 0.01274251, "balance_loss_clip": 0.06299755, "balance_loss_mlp": 0.01257239, "epoch": 0.31089733954606946, "flos": 28737231811200.0, "grad_norm": 2.288853460202599, "language_loss": 0.78205943, "learning_rate": 3.22783492314295e-06, "loss": 0.85984999, "num_input_tokens_seen": 111053265, "router_z_loss_clip": 2.05371094, "router_z_loss_mlp": 0.17028809, "step": 5171, "time_per_iteration": 2.636543035507202 }, { "auxiliary_loss_clip": 0.06508109, "auxiliary_loss_mlp": 0.01280527, "balance_loss_clip": 0.06303615, "balance_loss_mlp": 0.01263623, "epoch": 0.3109574627987374, "flos": 19689455462400.0, "grad_norm": 2.1222002122260513, "language_loss": 0.84422076, "learning_rate": 3.2275274704103785e-06, "loss": 0.9221071, "num_input_tokens_seen": 111071130, "router_z_loss_clip": 2.04394531, "router_z_loss_mlp": 0.16906738, "step": 5172, "time_per_iteration": 2.6326375007629395 }, { "auxiliary_loss_clip": 0.06512207, "auxiliary_loss_mlp": 0.01277615, "balance_loss_clip": 0.06303893, "balance_loss_mlp": 0.01261534, "epoch": 0.3110175860514054, "flos": 14689231390080.0, "grad_norm": 2.3989488703227937, "language_loss": 0.84534013, "learning_rate": 3.227219971129842e-06, "loss": 0.92323828, "num_input_tokens_seen": 111089560, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.1607666, "step": 5173, "time_per_iteration": 2.548241138458252 }, { "auxiliary_loss_clip": 0.06501603, "auxiliary_loss_mlp": 0.01272302, "balance_loss_clip": 0.06302847, "balance_loss_mlp": 0.0125665, "epoch": 0.31107770930407336, "flos": 25746835772160.0, "grad_norm": 2.2662514193689187, "language_loss": 0.83711004, "learning_rate": 3.226912425313001e-06, "loss": 0.9148491, "num_input_tokens_seen": 111109960, "router_z_loss_clip": 1.98925781, "router_z_loss_mlp": 0.15661621, "step": 5174, "time_per_iteration": 2.6181821823120117 }, { "auxiliary_loss_clip": 0.06504968, "auxiliary_loss_mlp": 0.01277342, "balance_loss_clip": 0.06299694, "balance_loss_mlp": 0.01260986, "epoch": 0.3111378325567413, "flos": 19214272558080.0, "grad_norm": 1.9224638699169923, "language_loss": 0.86069429, "learning_rate": 3.2266048329715183e-06, "loss": 0.93851739, "num_input_tokens_seen": 111127960, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.16357422, "step": 5175, "time_per_iteration": 2.549846887588501 }, { "auxiliary_loss_clip": 0.06499384, "auxiliary_loss_mlp": 0.01279463, "balance_loss_clip": 0.06300172, "balance_loss_mlp": 0.01262631, "epoch": 0.3111979558094093, "flos": 23703199816320.0, "grad_norm": 5.398579194768534, "language_loss": 0.83924681, "learning_rate": 3.2262971941170575e-06, "loss": 0.91703522, "num_input_tokens_seen": 111146730, "router_z_loss_clip": 1.99316406, "router_z_loss_mlp": 0.16833496, "step": 5176, "time_per_iteration": 2.585549831390381 }, { "auxiliary_loss_clip": 0.06497301, "auxiliary_loss_mlp": 0.0127786, "balance_loss_clip": 0.06293704, "balance_loss_mlp": 0.01260718, "epoch": 0.31125807906207725, "flos": 21039422192640.0, "grad_norm": 2.402255152079848, "language_loss": 0.8124975, "learning_rate": 3.2259895087612837e-06, "loss": 0.89024913, "num_input_tokens_seen": 111166295, "router_z_loss_clip": 2.03515625, "router_z_loss_mlp": 0.17138672, "step": 5177, "time_per_iteration": 2.5685033798217773 }, { "auxiliary_loss_clip": 0.06502226, "auxiliary_loss_mlp": 0.01280513, "balance_loss_clip": 0.06296796, "balance_loss_mlp": 0.01263347, "epoch": 0.3113182023147452, "flos": 23083435491840.0, "grad_norm": 1.9123644322732798, "language_loss": 0.81035787, "learning_rate": 3.2256817769158657e-06, "loss": 0.88818526, "num_input_tokens_seen": 111185665, "router_z_loss_clip": 2.05566406, "router_z_loss_mlp": 0.17150879, "step": 5178, "time_per_iteration": 2.5760879516601562 }, { "auxiliary_loss_clip": 0.06500989, "auxiliary_loss_mlp": 0.01278071, "balance_loss_clip": 0.06294057, "balance_loss_mlp": 0.01261763, "epoch": 0.3113783255674132, "flos": 11843919895680.0, "grad_norm": 3.0730821664744523, "language_loss": 0.8107456, "learning_rate": 3.225373998592471e-06, "loss": 0.88853621, "num_input_tokens_seen": 111201615, "router_z_loss_clip": 2.06933594, "router_z_loss_mlp": 0.16308594, "step": 5179, "time_per_iteration": 2.519495964050293 }, { "auxiliary_loss_clip": 0.06500454, "auxiliary_loss_mlp": 0.01278553, "balance_loss_clip": 0.06296499, "balance_loss_mlp": 0.01262603, "epoch": 0.31143844882008115, "flos": 16295098089600.0, "grad_norm": 1.8337249165221108, "language_loss": 0.79028618, "learning_rate": 3.2250661738027715e-06, "loss": 0.86807621, "num_input_tokens_seen": 111220515, "router_z_loss_clip": 2.04003906, "router_z_loss_mlp": 0.15942383, "step": 5180, "time_per_iteration": 2.5603792667388916 }, { "auxiliary_loss_clip": 0.06500477, "auxiliary_loss_mlp": 0.01281012, "balance_loss_clip": 0.06296442, "balance_loss_mlp": 0.01264251, "epoch": 0.3114985720727491, "flos": 23223824208000.0, "grad_norm": 1.7741123599238524, "language_loss": 0.83523941, "learning_rate": 3.22475830255844e-06, "loss": 0.91305435, "num_input_tokens_seen": 111240395, "router_z_loss_clip": 2.04003906, "router_z_loss_mlp": 0.16772461, "step": 5181, "time_per_iteration": 2.700148582458496 }, { "auxiliary_loss_clip": 0.06496798, "auxiliary_loss_mlp": 0.01276796, "balance_loss_clip": 0.06295517, "balance_loss_mlp": 0.01261775, "epoch": 0.3115586953254171, "flos": 30052468224000.0, "grad_norm": 1.6213390459346597, "language_loss": 0.74544781, "learning_rate": 3.2244503848711516e-06, "loss": 0.82318377, "num_input_tokens_seen": 111261100, "router_z_loss_clip": 2.01367188, "router_z_loss_mlp": 0.15026855, "step": 5182, "time_per_iteration": 2.6437418460845947 }, { "auxiliary_loss_clip": 0.06504013, "auxiliary_loss_mlp": 0.01273523, "balance_loss_clip": 0.06296363, "balance_loss_mlp": 0.01258013, "epoch": 0.3116188185780851, "flos": 25673433995520.0, "grad_norm": 2.054112045953273, "language_loss": 0.7139293, "learning_rate": 3.2241424207525815e-06, "loss": 0.79170465, "num_input_tokens_seen": 111281320, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.1550293, "step": 5183, "time_per_iteration": 2.5944974422454834 }, { "auxiliary_loss_clip": 0.06383995, "auxiliary_loss_mlp": 0.01255968, "balance_loss_clip": 0.06279957, "balance_loss_mlp": 0.01251036, "epoch": 0.31167894183075306, "flos": 69528568285440.0, "grad_norm": 0.9388908444898688, "language_loss": 0.59155768, "learning_rate": 3.223834410214408e-06, "loss": 0.66795731, "num_input_tokens_seen": 111341405, "router_z_loss_clip": 1.04003906, "router_z_loss_mlp": 0.04928589, "step": 5184, "time_per_iteration": 3.222395181655884 }, { "auxiliary_loss_clip": 0.06500305, "auxiliary_loss_mlp": 0.01274691, "balance_loss_clip": 0.06294739, "balance_loss_mlp": 0.01258765, "epoch": 0.31173906508342103, "flos": 14945215213440.0, "grad_norm": 3.0699414812913886, "language_loss": 0.7008667, "learning_rate": 3.223526353268311e-06, "loss": 0.77861673, "num_input_tokens_seen": 111358975, "router_z_loss_clip": 2.05371094, "router_z_loss_mlp": 0.15942383, "step": 5185, "time_per_iteration": 4.107775449752808 }, { "auxiliary_loss_clip": 0.06503738, "auxiliary_loss_mlp": 0.0127211, "balance_loss_clip": 0.06297088, "balance_loss_mlp": 0.01255468, "epoch": 0.311799188336089, "flos": 16180886574720.0, "grad_norm": 2.51711861476628, "language_loss": 0.64041483, "learning_rate": 3.2232182499259725e-06, "loss": 0.71817338, "num_input_tokens_seen": 111375845, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.16638184, "step": 5186, "time_per_iteration": 2.545426368713379 }, { "auxiliary_loss_clip": 0.06507244, "auxiliary_loss_mlp": 0.01272515, "balance_loss_clip": 0.06296392, "balance_loss_mlp": 0.0125542, "epoch": 0.31185931158875696, "flos": 25016633366400.0, "grad_norm": 2.9356096397313722, "language_loss": 0.86799467, "learning_rate": 3.2229101001990747e-06, "loss": 0.94579226, "num_input_tokens_seen": 111394150, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.17114258, "step": 5187, "time_per_iteration": 2.6409072875976562 }, { "auxiliary_loss_clip": 0.06497468, "auxiliary_loss_mlp": 0.01276866, "balance_loss_clip": 0.06290329, "balance_loss_mlp": 0.01260987, "epoch": 0.3119194348414249, "flos": 37242041702400.0, "grad_norm": 2.074772520104307, "language_loss": 0.63113379, "learning_rate": 3.2226019040993036e-06, "loss": 0.70887709, "num_input_tokens_seen": 111418355, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.15893555, "step": 5188, "time_per_iteration": 2.6986632347106934 }, { "auxiliary_loss_clip": 0.0650083, "auxiliary_loss_mlp": 0.01274412, "balance_loss_clip": 0.06295855, "balance_loss_mlp": 0.01258462, "epoch": 0.3119795580940929, "flos": 15018155792640.0, "grad_norm": 2.158114699323703, "language_loss": 0.83757818, "learning_rate": 3.222293661638346e-06, "loss": 0.91533059, "num_input_tokens_seen": 111435445, "router_z_loss_clip": 2.05175781, "router_z_loss_mlp": 0.1595459, "step": 5189, "time_per_iteration": 2.539203643798828 }, { "auxiliary_loss_clip": 0.06496395, "auxiliary_loss_mlp": 0.0127547, "balance_loss_clip": 0.06295711, "balance_loss_mlp": 0.01259139, "epoch": 0.31203968134676086, "flos": 16003755043200.0, "grad_norm": 4.448615430324576, "language_loss": 0.79449797, "learning_rate": 3.22198537282789e-06, "loss": 0.87221664, "num_input_tokens_seen": 111453430, "router_z_loss_clip": 2.00585938, "router_z_loss_mlp": 0.16320801, "step": 5190, "time_per_iteration": 2.5493481159210205 }, { "auxiliary_loss_clip": 0.06501891, "auxiliary_loss_mlp": 0.0127443, "balance_loss_clip": 0.06295339, "balance_loss_mlp": 0.01258897, "epoch": 0.3120998045994288, "flos": 23843378897280.0, "grad_norm": 1.4475924567787868, "language_loss": 0.75334334, "learning_rate": 3.2216770376796262e-06, "loss": 0.83110654, "num_input_tokens_seen": 111475325, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.15527344, "step": 5191, "time_per_iteration": 4.028865814208984 }, { "auxiliary_loss_clip": 0.06393889, "auxiliary_loss_mlp": 0.0125891, "balance_loss_clip": 0.06289932, "balance_loss_mlp": 0.01254133, "epoch": 0.3121599278520968, "flos": 69203081900160.0, "grad_norm": 0.8269955172531595, "language_loss": 0.63917041, "learning_rate": 3.221368656205247e-06, "loss": 0.71569836, "num_input_tokens_seen": 111533960, "router_z_loss_clip": 1.04003906, "router_z_loss_mlp": 0.04769897, "step": 5192, "time_per_iteration": 3.2540299892425537 }, { "auxiliary_loss_clip": 0.06505729, "auxiliary_loss_mlp": 0.01273935, "balance_loss_clip": 0.06296325, "balance_loss_mlp": 0.0125665, "epoch": 0.31222005110476475, "flos": 23813302481280.0, "grad_norm": 2.5309668208616998, "language_loss": 0.80169731, "learning_rate": 3.221060228416446e-06, "loss": 0.87949395, "num_input_tokens_seen": 111554055, "router_z_loss_clip": 2.09082031, "router_z_loss_mlp": 0.17285156, "step": 5193, "time_per_iteration": 2.5760793685913086 }, { "auxiliary_loss_clip": 0.0650347, "auxiliary_loss_mlp": 0.01274532, "balance_loss_clip": 0.06296859, "balance_loss_mlp": 0.0125739, "epoch": 0.3122801743574327, "flos": 25232771773440.0, "grad_norm": 1.863464256906829, "language_loss": 0.72105503, "learning_rate": 3.2207517543249183e-06, "loss": 0.79883504, "num_input_tokens_seen": 111574305, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.17138672, "step": 5194, "time_per_iteration": 2.6091740131378174 }, { "auxiliary_loss_clip": 0.06498972, "auxiliary_loss_mlp": 0.01270787, "balance_loss_clip": 0.06296654, "balance_loss_mlp": 0.01254944, "epoch": 0.3123402976101007, "flos": 22973165118720.0, "grad_norm": 3.397291649556132, "language_loss": 0.77030712, "learning_rate": 3.2204432339423616e-06, "loss": 0.8480047, "num_input_tokens_seen": 111595680, "router_z_loss_clip": 2.02050781, "router_z_loss_mlp": 0.15856934, "step": 5195, "time_per_iteration": 2.5958681106567383 }, { "auxiliary_loss_clip": 0.0650348, "auxiliary_loss_mlp": 0.01272687, "balance_loss_clip": 0.06294073, "balance_loss_mlp": 0.0125657, "epoch": 0.3124004208627687, "flos": 25199131559040.0, "grad_norm": 1.3994261546314093, "language_loss": 0.78530121, "learning_rate": 3.220134667280476e-06, "loss": 0.86306286, "num_input_tokens_seen": 111618135, "router_z_loss_clip": 2.09082031, "router_z_loss_mlp": 0.16137695, "step": 5196, "time_per_iteration": 2.590491771697998 }, { "auxiliary_loss_clip": 0.06386882, "auxiliary_loss_mlp": 0.01256547, "balance_loss_clip": 0.06283661, "balance_loss_mlp": 0.01251526, "epoch": 0.31246054411543667, "flos": 67506398974080.0, "grad_norm": 0.7593434702964696, "language_loss": 0.54678518, "learning_rate": 3.2198260543509613e-06, "loss": 0.62321949, "num_input_tokens_seen": 111682220, "router_z_loss_clip": 1.03125, "router_z_loss_mlp": 0.05020142, "step": 5197, "time_per_iteration": 3.248377561569214 }, { "auxiliary_loss_clip": 0.06499276, "auxiliary_loss_mlp": 0.01273838, "balance_loss_clip": 0.06297226, "balance_loss_mlp": 0.01257113, "epoch": 0.31252066736810463, "flos": 17864347754880.0, "grad_norm": 2.850122310465846, "language_loss": 0.67241693, "learning_rate": 3.21951739516552e-06, "loss": 0.75014806, "num_input_tokens_seen": 111700815, "router_z_loss_clip": 2.01953125, "router_z_loss_mlp": 0.16723633, "step": 5198, "time_per_iteration": 4.064748525619507 }, { "auxiliary_loss_clip": 0.06504067, "auxiliary_loss_mlp": 0.01272884, "balance_loss_clip": 0.06295095, "balance_loss_mlp": 0.01255039, "epoch": 0.3125807906207726, "flos": 18480338645760.0, "grad_norm": 2.754038231367475, "language_loss": 0.69747829, "learning_rate": 3.219208689735857e-06, "loss": 0.77524787, "num_input_tokens_seen": 111718195, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.17858887, "step": 5199, "time_per_iteration": 2.5814030170440674 }, { "auxiliary_loss_clip": 0.06503283, "auxiliary_loss_mlp": 0.01277389, "balance_loss_clip": 0.06297502, "balance_loss_mlp": 0.01260318, "epoch": 0.31264091387344056, "flos": 18951454627200.0, "grad_norm": 3.675337412130963, "language_loss": 0.79156387, "learning_rate": 3.2188999380736785e-06, "loss": 0.86937064, "num_input_tokens_seen": 111734440, "router_z_loss_clip": 2.05761719, "router_z_loss_mlp": 0.1706543, "step": 5200, "time_per_iteration": 2.539358377456665 }, { "auxiliary_loss_clip": 0.06496312, "auxiliary_loss_mlp": 0.01273101, "balance_loss_clip": 0.06295089, "balance_loss_mlp": 0.0125721, "epoch": 0.3127010371261085, "flos": 21474591972480.0, "grad_norm": 2.696616403877227, "language_loss": 0.8420313, "learning_rate": 3.2185911401906917e-06, "loss": 0.91972542, "num_input_tokens_seen": 111751960, "router_z_loss_clip": 2.01074219, "router_z_loss_mlp": 0.15881348, "step": 5201, "time_per_iteration": 4.0117127895355225 }, { "auxiliary_loss_clip": 0.06502855, "auxiliary_loss_mlp": 0.01271913, "balance_loss_clip": 0.06297958, "balance_loss_mlp": 0.01254281, "epoch": 0.3127611603787765, "flos": 15340623431040.0, "grad_norm": 2.4198088890853757, "language_loss": 0.6971302, "learning_rate": 3.2182822960986072e-06, "loss": 0.77487791, "num_input_tokens_seen": 111769585, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.1763916, "step": 5202, "time_per_iteration": 2.542609930038452 }, { "auxiliary_loss_clip": 0.06506, "auxiliary_loss_mlp": 0.0127644, "balance_loss_clip": 0.0629907, "balance_loss_mlp": 0.01260001, "epoch": 0.31282128363144446, "flos": 17608741274880.0, "grad_norm": 2.2947556938086837, "language_loss": 0.84006345, "learning_rate": 3.2179734058091358e-06, "loss": 0.91788793, "num_input_tokens_seen": 111787880, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.16442871, "step": 5203, "time_per_iteration": 2.537954330444336 }, { "auxiliary_loss_clip": 0.06502145, "auxiliary_loss_mlp": 0.01274657, "balance_loss_clip": 0.06294037, "balance_loss_mlp": 0.01258051, "epoch": 0.3128814068841124, "flos": 26763349979520.0, "grad_norm": 2.848549418731283, "language_loss": 0.61473107, "learning_rate": 3.2176644693339913e-06, "loss": 0.69249904, "num_input_tokens_seen": 111805950, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.1661377, "step": 5204, "time_per_iteration": 2.610492706298828 }, { "auxiliary_loss_clip": 0.06494333, "auxiliary_loss_mlp": 0.01273254, "balance_loss_clip": 0.06292735, "balance_loss_mlp": 0.0125784, "epoch": 0.3129415301367804, "flos": 22278783133440.0, "grad_norm": 2.051723829324211, "language_loss": 0.66085613, "learning_rate": 3.217355486684887e-06, "loss": 0.73853195, "num_input_tokens_seen": 111826135, "router_z_loss_clip": 2.01757812, "router_z_loss_mlp": 0.1541748, "step": 5205, "time_per_iteration": 2.5519156455993652 }, { "auxiliary_loss_clip": 0.06507781, "auxiliary_loss_mlp": 0.01278642, "balance_loss_clip": 0.06301474, "balance_loss_mlp": 0.01261487, "epoch": 0.31300165338944835, "flos": 26471461881600.0, "grad_norm": 2.4880338442944554, "language_loss": 0.7689988, "learning_rate": 3.2170464578735414e-06, "loss": 0.84686303, "num_input_tokens_seen": 111844700, "router_z_loss_clip": 2.06347656, "router_z_loss_mlp": 0.17163086, "step": 5206, "time_per_iteration": 2.6195449829101562 }, { "auxiliary_loss_clip": 0.06498488, "auxiliary_loss_mlp": 0.01272605, "balance_loss_clip": 0.06296154, "balance_loss_mlp": 0.0125706, "epoch": 0.3130617766421163, "flos": 21951116542080.0, "grad_norm": 1.957820369847809, "language_loss": 0.83190858, "learning_rate": 3.216737382911672e-06, "loss": 0.90961945, "num_input_tokens_seen": 111861585, "router_z_loss_clip": 2.0234375, "router_z_loss_mlp": 0.15563965, "step": 5207, "time_per_iteration": 2.556345224380493 }, { "auxiliary_loss_clip": 0.06497553, "auxiliary_loss_mlp": 0.01275914, "balance_loss_clip": 0.06297009, "balance_loss_mlp": 0.01260179, "epoch": 0.3131218998947843, "flos": 23299154628480.0, "grad_norm": 5.090997647203341, "language_loss": 0.71929002, "learning_rate": 3.216428261810999e-06, "loss": 0.79702473, "num_input_tokens_seen": 111882950, "router_z_loss_clip": 2.00488281, "router_z_loss_mlp": 0.1574707, "step": 5208, "time_per_iteration": 2.612720012664795 }, { "auxiliary_loss_clip": 0.06505165, "auxiliary_loss_mlp": 0.01270914, "balance_loss_clip": 0.06300941, "balance_loss_mlp": 0.01255154, "epoch": 0.3131820231474523, "flos": 21145583715840.0, "grad_norm": 2.7738374346487, "language_loss": 0.75067258, "learning_rate": 3.2161190945832445e-06, "loss": 0.82843339, "num_input_tokens_seen": 111901640, "router_z_loss_clip": 2.04199219, "router_z_loss_mlp": 0.15759277, "step": 5209, "time_per_iteration": 2.567678451538086 }, { "auxiliary_loss_clip": 0.06505595, "auxiliary_loss_mlp": 0.01269925, "balance_loss_clip": 0.06301152, "balance_loss_mlp": 0.01254702, "epoch": 0.31324214640012027, "flos": 23915816352000.0, "grad_norm": 2.6615278288071216, "language_loss": 0.77459973, "learning_rate": 3.2158098812401325e-06, "loss": 0.85235494, "num_input_tokens_seen": 111919615, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.15222168, "step": 5210, "time_per_iteration": 2.5778186321258545 }, { "auxiliary_loss_clip": 0.06490597, "auxiliary_loss_mlp": 0.01275525, "balance_loss_clip": 0.0629488, "balance_loss_mlp": 0.01260671, "epoch": 0.31330226965278823, "flos": 22243507764480.0, "grad_norm": 1.8261232344588518, "language_loss": 0.79156673, "learning_rate": 3.2155006217933874e-06, "loss": 0.86922795, "num_input_tokens_seen": 111938485, "router_z_loss_clip": 1.95605469, "router_z_loss_mlp": 0.14855957, "step": 5211, "time_per_iteration": 2.5847346782684326 }, { "auxiliary_loss_clip": 0.06501761, "auxiliary_loss_mlp": 0.01267815, "balance_loss_clip": 0.06301837, "balance_loss_mlp": 0.01253051, "epoch": 0.3133623929054562, "flos": 19759838492160.0, "grad_norm": 2.0698045378286163, "language_loss": 0.79432201, "learning_rate": 3.2151913162547367e-06, "loss": 0.87201774, "num_input_tokens_seen": 111956425, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.14764404, "step": 5212, "time_per_iteration": 2.57916522026062 }, { "auxiliary_loss_clip": 0.06508082, "auxiliary_loss_mlp": 0.01273903, "balance_loss_clip": 0.06303431, "balance_loss_mlp": 0.0125713, "epoch": 0.31342251615812416, "flos": 27169617300480.0, "grad_norm": 5.126590609116748, "language_loss": 0.71482241, "learning_rate": 3.2148819646359097e-06, "loss": 0.7926423, "num_input_tokens_seen": 111975915, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.16760254, "step": 5213, "time_per_iteration": 2.609484910964966 }, { "auxiliary_loss_clip": 0.06506786, "auxiliary_loss_mlp": 0.01276692, "balance_loss_clip": 0.06301985, "balance_loss_mlp": 0.01261111, "epoch": 0.31348263941079213, "flos": 20235985718400.0, "grad_norm": 1.960406198332198, "language_loss": 0.77907598, "learning_rate": 3.2145725669486374e-06, "loss": 0.85691071, "num_input_tokens_seen": 111995055, "router_z_loss_clip": 2.04980469, "router_z_loss_mlp": 0.15576172, "step": 5214, "time_per_iteration": 2.5678701400756836 }, { "auxiliary_loss_clip": 0.06500934, "auxiliary_loss_mlp": 0.01275909, "balance_loss_clip": 0.06302708, "balance_loss_mlp": 0.01260925, "epoch": 0.3135427626634601, "flos": 24614474895360.0, "grad_norm": 1.6020737478573572, "language_loss": 0.83041012, "learning_rate": 3.2142631232046517e-06, "loss": 0.90817863, "num_input_tokens_seen": 112015830, "router_z_loss_clip": 1.98144531, "router_z_loss_mlp": 0.14978027, "step": 5215, "time_per_iteration": 2.594219923019409 }, { "auxiliary_loss_clip": 0.06502689, "auxiliary_loss_mlp": 0.01274078, "balance_loss_clip": 0.06298991, "balance_loss_mlp": 0.01258497, "epoch": 0.31360288591612806, "flos": 20966230051200.0, "grad_norm": 1.9819135291808405, "language_loss": 0.79884362, "learning_rate": 3.213953633415686e-06, "loss": 0.87661135, "num_input_tokens_seen": 112035065, "router_z_loss_clip": 2.03710938, "router_z_loss_mlp": 0.15576172, "step": 5216, "time_per_iteration": 2.5709853172302246 }, { "auxiliary_loss_clip": 0.06504896, "auxiliary_loss_mlp": 0.01272942, "balance_loss_clip": 0.06297337, "balance_loss_mlp": 0.01255359, "epoch": 0.313663009168796, "flos": 26987957648640.0, "grad_norm": 2.1808328074464747, "language_loss": 0.68915135, "learning_rate": 3.213644097593477e-06, "loss": 0.76692975, "num_input_tokens_seen": 112058405, "router_z_loss_clip": 2.07714844, "router_z_loss_mlp": 0.17590332, "step": 5217, "time_per_iteration": 2.638733386993408 }, { "auxiliary_loss_clip": 0.06504846, "auxiliary_loss_mlp": 0.01274364, "balance_loss_clip": 0.06302118, "balance_loss_mlp": 0.01258664, "epoch": 0.313723132421464, "flos": 18046762093440.0, "grad_norm": 1.729473934844603, "language_loss": 0.80541408, "learning_rate": 3.2133345157497624e-06, "loss": 0.88320613, "num_input_tokens_seen": 112076420, "router_z_loss_clip": 2.02734375, "router_z_loss_mlp": 0.15686035, "step": 5218, "time_per_iteration": 2.5572609901428223 }, { "auxiliary_loss_clip": 0.06497609, "auxiliary_loss_mlp": 0.01274752, "balance_loss_clip": 0.06292639, "balance_loss_mlp": 0.01258993, "epoch": 0.31378325567413196, "flos": 22494963467520.0, "grad_norm": 3.4201307175163347, "language_loss": 0.6979546, "learning_rate": 3.2130248878962813e-06, "loss": 0.77567816, "num_input_tokens_seen": 112090775, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.15759277, "step": 5219, "time_per_iteration": 2.546840190887451 }, { "auxiliary_loss_clip": 0.06497208, "auxiliary_loss_mlp": 0.01270693, "balance_loss_clip": 0.06296124, "balance_loss_mlp": 0.0125566, "epoch": 0.3138433789267999, "flos": 22425838248960.0, "grad_norm": 4.294429144019294, "language_loss": 0.80506957, "learning_rate": 3.2127152140447747e-06, "loss": 0.88274854, "num_input_tokens_seen": 112110980, "router_z_loss_clip": 2.00878906, "router_z_loss_mlp": 0.15039062, "step": 5220, "time_per_iteration": 2.5889639854431152 }, { "auxiliary_loss_clip": 0.06499755, "auxiliary_loss_mlp": 0.0127565, "balance_loss_clip": 0.06296707, "balance_loss_mlp": 0.01260189, "epoch": 0.3139035021794679, "flos": 13010927235840.0, "grad_norm": 1.8528046850462305, "language_loss": 0.73541594, "learning_rate": 3.212405494206986e-06, "loss": 0.81317002, "num_input_tokens_seen": 112129020, "router_z_loss_clip": 2.03027344, "router_z_loss_mlp": 0.15454102, "step": 5221, "time_per_iteration": 2.566230535507202 }, { "auxiliary_loss_clip": 0.06496575, "auxiliary_loss_mlp": 0.01271896, "balance_loss_clip": 0.06296724, "balance_loss_mlp": 0.0125647, "epoch": 0.31396362543213585, "flos": 16951605229440.0, "grad_norm": 1.6554825826804231, "language_loss": 0.82084858, "learning_rate": 3.2120957283946588e-06, "loss": 0.89853334, "num_input_tokens_seen": 112147865, "router_z_loss_clip": 1.99902344, "router_z_loss_mlp": 0.1541748, "step": 5222, "time_per_iteration": 2.636914014816284 }, { "auxiliary_loss_clip": 0.06500362, "auxiliary_loss_mlp": 0.01272404, "balance_loss_clip": 0.06294616, "balance_loss_mlp": 0.01255333, "epoch": 0.31402374868480387, "flos": 20162877431040.0, "grad_norm": 2.9374266869546974, "language_loss": 0.70662344, "learning_rate": 3.2117859166195407e-06, "loss": 0.78435111, "num_input_tokens_seen": 112166745, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.17059326, "step": 5223, "time_per_iteration": 2.5550944805145264 }, { "auxiliary_loss_clip": 0.06495903, "auxiliary_loss_mlp": 0.01268266, "balance_loss_clip": 0.06295799, "balance_loss_mlp": 0.01253389, "epoch": 0.31408387193747184, "flos": 21257363462400.0, "grad_norm": 2.1034451581206226, "language_loss": 0.8072269, "learning_rate": 3.211476058893379e-06, "loss": 0.88486862, "num_input_tokens_seen": 112185895, "router_z_loss_clip": 2.00097656, "router_z_loss_mlp": 0.14892578, "step": 5224, "time_per_iteration": 2.564750909805298 }, { "auxiliary_loss_clip": 0.06504172, "auxiliary_loss_mlp": 0.01271931, "balance_loss_clip": 0.06296209, "balance_loss_mlp": 0.01256088, "epoch": 0.3141439951901398, "flos": 27490617492480.0, "grad_norm": 12.354386268984756, "language_loss": 0.58328331, "learning_rate": 3.2111661552279243e-06, "loss": 0.66104424, "num_input_tokens_seen": 112204465, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.1583252, "step": 5225, "time_per_iteration": 3.990405321121216 }, { "auxiliary_loss_clip": 0.06494698, "auxiliary_loss_mlp": 0.0126918, "balance_loss_clip": 0.06297669, "balance_loss_mlp": 0.01254744, "epoch": 0.31420411844280777, "flos": 17857010522880.0, "grad_norm": 3.0769943048941277, "language_loss": 0.82036209, "learning_rate": 3.2108562056349273e-06, "loss": 0.89800084, "num_input_tokens_seen": 112221635, "router_z_loss_clip": 1.96777344, "router_z_loss_mlp": 0.14428711, "step": 5226, "time_per_iteration": 2.5449554920196533 }, { "auxiliary_loss_clip": 0.06493711, "auxiliary_loss_mlp": 0.01274669, "balance_loss_clip": 0.06291787, "balance_loss_mlp": 0.01259422, "epoch": 0.31426424169547573, "flos": 21623491877760.0, "grad_norm": 4.5301508502160415, "language_loss": 0.74230933, "learning_rate": 3.210546210126141e-06, "loss": 0.8199932, "num_input_tokens_seen": 112241240, "router_z_loss_clip": 2.01855469, "router_z_loss_mlp": 0.15246582, "step": 5227, "time_per_iteration": 2.5743260383605957 }, { "auxiliary_loss_clip": 0.06498574, "auxiliary_loss_mlp": 0.01270795, "balance_loss_clip": 0.06297367, "balance_loss_mlp": 0.01255131, "epoch": 0.3143243649481437, "flos": 30928677569280.0, "grad_norm": 2.078525970074079, "language_loss": 0.68569052, "learning_rate": 3.2102361687133213e-06, "loss": 0.76338422, "num_input_tokens_seen": 112262350, "router_z_loss_clip": 2.00976562, "router_z_loss_mlp": 0.15661621, "step": 5228, "time_per_iteration": 2.8407342433929443 }, { "auxiliary_loss_clip": 0.06498803, "auxiliary_loss_mlp": 0.01273263, "balance_loss_clip": 0.06297497, "balance_loss_mlp": 0.01258291, "epoch": 0.31438448820081166, "flos": 22828206355200.0, "grad_norm": 1.7535951269475862, "language_loss": 0.80149496, "learning_rate": 3.2099260814082254e-06, "loss": 0.8792156, "num_input_tokens_seen": 112283710, "router_z_loss_clip": 2.01464844, "router_z_loss_mlp": 0.14978027, "step": 5229, "time_per_iteration": 2.8174688816070557 }, { "auxiliary_loss_clip": 0.06489199, "auxiliary_loss_mlp": 0.01270917, "balance_loss_clip": 0.06288771, "balance_loss_mlp": 0.01255956, "epoch": 0.3144446114534796, "flos": 23298399941760.0, "grad_norm": 1.8552534639089973, "language_loss": 0.70378441, "learning_rate": 3.209615948222611e-06, "loss": 0.78138554, "num_input_tokens_seen": 112304285, "router_z_loss_clip": 2.00488281, "router_z_loss_mlp": 0.14953613, "step": 5230, "time_per_iteration": 2.6681175231933594 }, { "auxiliary_loss_clip": 0.06497615, "auxiliary_loss_mlp": 0.01272142, "balance_loss_clip": 0.06294583, "balance_loss_mlp": 0.01256418, "epoch": 0.3145047347061476, "flos": 31363679640960.0, "grad_norm": 1.6661296705013313, "language_loss": 0.80180442, "learning_rate": 3.209305769168239e-06, "loss": 0.87950194, "num_input_tokens_seen": 112325110, "router_z_loss_clip": 2.02734375, "router_z_loss_mlp": 0.15722656, "step": 5231, "time_per_iteration": 4.12939453125 }, { "auxiliary_loss_clip": 0.06496754, "auxiliary_loss_mlp": 0.01270051, "balance_loss_clip": 0.06297875, "balance_loss_mlp": 0.01255567, "epoch": 0.31456485795881556, "flos": 10894182992640.0, "grad_norm": 1.9357496170029203, "language_loss": 0.84920537, "learning_rate": 3.2089955442568704e-06, "loss": 0.92687345, "num_input_tokens_seen": 112339855, "router_z_loss_clip": 1.98828125, "router_z_loss_mlp": 0.14483643, "step": 5232, "time_per_iteration": 2.5575809478759766 }, { "auxiliary_loss_clip": 0.06492892, "auxiliary_loss_mlp": 0.01278776, "balance_loss_clip": 0.06296067, "balance_loss_mlp": 0.01263005, "epoch": 0.3146249812114835, "flos": 17098157220480.0, "grad_norm": 5.272597167381731, "language_loss": 0.80616164, "learning_rate": 3.2086852735002692e-06, "loss": 0.88387835, "num_input_tokens_seen": 112358480, "router_z_loss_clip": 1.97167969, "router_z_loss_mlp": 0.15783691, "step": 5233, "time_per_iteration": 2.551213502883911 }, { "auxiliary_loss_clip": 0.06504168, "auxiliary_loss_mlp": 0.01272615, "balance_loss_clip": 0.06299576, "balance_loss_mlp": 0.01257129, "epoch": 0.3146851044641515, "flos": 55303283352960.0, "grad_norm": 1.9279361530024757, "language_loss": 0.71566164, "learning_rate": 3.2083749569102024e-06, "loss": 0.79342949, "num_input_tokens_seen": 112382350, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.15466309, "step": 5234, "time_per_iteration": 2.848661184310913 }, { "auxiliary_loss_clip": 0.06494553, "auxiliary_loss_mlp": 0.01272829, "balance_loss_clip": 0.06291172, "balance_loss_mlp": 0.01256986, "epoch": 0.31474522771681945, "flos": 27023149163520.0, "grad_norm": 1.9388287574895016, "language_loss": 0.72717535, "learning_rate": 3.2080645944984356e-06, "loss": 0.80484921, "num_input_tokens_seen": 112400260, "router_z_loss_clip": 2.03222656, "router_z_loss_mlp": 0.15844727, "step": 5235, "time_per_iteration": 2.6046993732452393 }, { "auxiliary_loss_clip": 0.06486642, "auxiliary_loss_mlp": 0.01270956, "balance_loss_clip": 0.06287065, "balance_loss_mlp": 0.01256353, "epoch": 0.3148053509694875, "flos": 21258369711360.0, "grad_norm": 1.824970732725274, "language_loss": 0.79033315, "learning_rate": 3.2077541862767384e-06, "loss": 0.86790913, "num_input_tokens_seen": 112419400, "router_z_loss_clip": 1.99511719, "router_z_loss_mlp": 0.14599609, "step": 5236, "time_per_iteration": 2.5785958766937256 }, { "auxiliary_loss_clip": 0.06501575, "auxiliary_loss_mlp": 0.01276589, "balance_loss_clip": 0.06294358, "balance_loss_mlp": 0.01261068, "epoch": 0.31486547422215544, "flos": 31256721504000.0, "grad_norm": 1.9620179941599247, "language_loss": 0.76664603, "learning_rate": 3.207443732256881e-06, "loss": 0.8444277, "num_input_tokens_seen": 112440825, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.15539551, "step": 5237, "time_per_iteration": 4.0021281242370605 }, { "auxiliary_loss_clip": 0.06494401, "auxiliary_loss_mlp": 0.01275251, "balance_loss_clip": 0.06297252, "balance_loss_mlp": 0.01260642, "epoch": 0.3149255974748234, "flos": 19834749642240.0, "grad_norm": 9.890128666729815, "language_loss": 0.79837668, "learning_rate": 3.2071332324506372e-06, "loss": 0.87607324, "num_input_tokens_seen": 112459180, "router_z_loss_clip": 1.96972656, "router_z_loss_mlp": 0.1461792, "step": 5238, "time_per_iteration": 2.542884349822998 }, { "auxiliary_loss_clip": 0.06385292, "auxiliary_loss_mlp": 0.01273952, "balance_loss_clip": 0.06284548, "balance_loss_mlp": 0.0126871, "epoch": 0.31498572072749137, "flos": 67701867350400.0, "grad_norm": 0.8529018567114542, "language_loss": 0.67965943, "learning_rate": 3.2068226868697795e-06, "loss": 0.75625181, "num_input_tokens_seen": 112516680, "router_z_loss_clip": 1.0078125, "router_z_loss_mlp": 0.05245972, "step": 5239, "time_per_iteration": 3.165137529373169 }, { "auxiliary_loss_clip": 0.06499535, "auxiliary_loss_mlp": 0.01273103, "balance_loss_clip": 0.06292669, "balance_loss_mlp": 0.0125633, "epoch": 0.31504584398015933, "flos": 19799432346240.0, "grad_norm": 2.494489147109084, "language_loss": 0.83005202, "learning_rate": 3.2065120955260846e-06, "loss": 0.90777838, "num_input_tokens_seen": 112535895, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.16772461, "step": 5240, "time_per_iteration": 4.019512891769409 }, { "auxiliary_loss_clip": 0.06495658, "auxiliary_loss_mlp": 0.01279242, "balance_loss_clip": 0.06294118, "balance_loss_mlp": 0.0126421, "epoch": 0.3151059672328273, "flos": 26622751628160.0, "grad_norm": 1.7077154423554448, "language_loss": 0.80925822, "learning_rate": 3.2062014584313302e-06, "loss": 0.88700724, "num_input_tokens_seen": 112557490, "router_z_loss_clip": 2.01367188, "router_z_loss_mlp": 0.15039062, "step": 5241, "time_per_iteration": 2.6008448600769043 }, { "auxiliary_loss_clip": 0.0648772, "auxiliary_loss_mlp": 0.01277765, "balance_loss_clip": 0.06292588, "balance_loss_mlp": 0.01262113, "epoch": 0.31516609048549526, "flos": 24210890904960.0, "grad_norm": 2.3190812821085927, "language_loss": 0.7462365, "learning_rate": 3.2058907755972956e-06, "loss": 0.82389134, "num_input_tokens_seen": 112577075, "router_z_loss_clip": 1.95410156, "router_z_loss_mlp": 0.15637207, "step": 5242, "time_per_iteration": 2.592907667160034 }, { "auxiliary_loss_clip": 0.06495751, "auxiliary_loss_mlp": 0.01271775, "balance_loss_clip": 0.06295286, "balance_loss_mlp": 0.0125642, "epoch": 0.31522621373816323, "flos": 25965950999040.0, "grad_norm": 4.447656045167664, "language_loss": 0.73709863, "learning_rate": 3.2055800470357626e-06, "loss": 0.81477392, "num_input_tokens_seen": 112597620, "router_z_loss_clip": 2.00585938, "router_z_loss_mlp": 0.15356445, "step": 5243, "time_per_iteration": 2.601017951965332 }, { "auxiliary_loss_clip": 0.06497008, "auxiliary_loss_mlp": 0.01272109, "balance_loss_clip": 0.06295938, "balance_loss_mlp": 0.01256528, "epoch": 0.3152863369908312, "flos": 21915379975680.0, "grad_norm": 18.730322733938593, "language_loss": 0.64691991, "learning_rate": 3.205269272758513e-06, "loss": 0.72461104, "num_input_tokens_seen": 112617150, "router_z_loss_clip": 2.01171875, "router_z_loss_mlp": 0.15576172, "step": 5244, "time_per_iteration": 2.5730648040771484 }, { "auxiliary_loss_clip": 0.06493979, "auxiliary_loss_mlp": 0.01267209, "balance_loss_clip": 0.06291839, "balance_loss_mlp": 0.01252248, "epoch": 0.31534646024349916, "flos": 16285203308160.0, "grad_norm": 3.5548295815608157, "language_loss": 0.91093272, "learning_rate": 3.2049584527773313e-06, "loss": 0.98854458, "num_input_tokens_seen": 112631090, "router_z_loss_clip": 2.01757812, "router_z_loss_mlp": 0.14953613, "step": 5245, "time_per_iteration": 2.5220251083374023 }, { "auxiliary_loss_clip": 0.06496453, "auxiliary_loss_mlp": 0.0127375, "balance_loss_clip": 0.06295591, "balance_loss_mlp": 0.0125867, "epoch": 0.3154065834961671, "flos": 24724116362880.0, "grad_norm": 1.6731225762050936, "language_loss": 0.76239574, "learning_rate": 3.2046475871040048e-06, "loss": 0.84009778, "num_input_tokens_seen": 112651220, "router_z_loss_clip": 2.00683594, "router_z_loss_mlp": 0.15087891, "step": 5246, "time_per_iteration": 2.6838338375091553 }, { "auxiliary_loss_clip": 0.0649486, "auxiliary_loss_mlp": 0.01271387, "balance_loss_clip": 0.06293866, "balance_loss_mlp": 0.01255771, "epoch": 0.3154667067488351, "flos": 35379813836160.0, "grad_norm": 1.896218433023162, "language_loss": 0.61776763, "learning_rate": 3.204336675750321e-06, "loss": 0.69543016, "num_input_tokens_seen": 112671560, "router_z_loss_clip": 2.0078125, "router_z_loss_mlp": 0.15625, "step": 5247, "time_per_iteration": 2.6839418411254883 }, { "auxiliary_loss_clip": 0.06494548, "auxiliary_loss_mlp": 0.01273518, "balance_loss_clip": 0.06290306, "balance_loss_mlp": 0.01257449, "epoch": 0.31552683000150306, "flos": 17462105429760.0, "grad_norm": 3.009787976843829, "language_loss": 0.82837951, "learning_rate": 3.2040257187280693e-06, "loss": 0.90606022, "num_input_tokens_seen": 112689790, "router_z_loss_clip": 2.04199219, "router_z_loss_mlp": 0.16064453, "step": 5248, "time_per_iteration": 2.5310556888580322 }, { "auxiliary_loss_clip": 0.06498362, "auxiliary_loss_mlp": 0.01275026, "balance_loss_clip": 0.06297255, "balance_loss_mlp": 0.01258373, "epoch": 0.3155869532541711, "flos": 18411674624640.0, "grad_norm": 7.82947283617293, "language_loss": 0.85965097, "learning_rate": 3.2037147160490423e-06, "loss": 0.93738484, "num_input_tokens_seen": 112708265, "router_z_loss_clip": 2.01074219, "router_z_loss_mlp": 0.16650391, "step": 5249, "time_per_iteration": 2.527456283569336 }, { "auxiliary_loss_clip": 0.06492095, "auxiliary_loss_mlp": 0.01272202, "balance_loss_clip": 0.06290857, "balance_loss_mlp": 0.01255548, "epoch": 0.31564707650683904, "flos": 21586162083840.0, "grad_norm": 1.887077382963581, "language_loss": 0.86185688, "learning_rate": 3.2034036677250322e-06, "loss": 0.93949986, "num_input_tokens_seen": 112727820, "router_z_loss_clip": 2.01660156, "router_z_loss_mlp": 0.16650391, "step": 5250, "time_per_iteration": 2.5556390285491943 }, { "auxiliary_loss_clip": 0.06497367, "auxiliary_loss_mlp": 0.01273337, "balance_loss_clip": 0.06297205, "balance_loss_mlp": 0.01257173, "epoch": 0.315707199759507, "flos": 21037032351360.0, "grad_norm": 3.140118299845283, "language_loss": 0.68624473, "learning_rate": 3.203092573767835e-06, "loss": 0.76395172, "num_input_tokens_seen": 112743140, "router_z_loss_clip": 1.99902344, "router_z_loss_mlp": 0.16174316, "step": 5251, "time_per_iteration": 2.5413219928741455 }, { "auxiliary_loss_clip": 0.06493722, "auxiliary_loss_mlp": 0.01270929, "balance_loss_clip": 0.06293255, "balance_loss_mlp": 0.01254835, "epoch": 0.31576732301217497, "flos": 26835326236800.0, "grad_norm": 1.970248910942557, "language_loss": 0.78762996, "learning_rate": 3.202781434189246e-06, "loss": 0.86527646, "num_input_tokens_seen": 112764705, "router_z_loss_clip": 2.00390625, "router_z_loss_mlp": 0.16088867, "step": 5252, "time_per_iteration": 2.58870267868042 }, { "auxiliary_loss_clip": 0.06496565, "auxiliary_loss_mlp": 0.01272424, "balance_loss_clip": 0.06297638, "balance_loss_mlp": 0.0125757, "epoch": 0.31582744626484294, "flos": 22717810200960.0, "grad_norm": 2.2015420132654615, "language_loss": 0.74724638, "learning_rate": 3.202470249001066e-06, "loss": 0.82493627, "num_input_tokens_seen": 112785310, "router_z_loss_clip": 1.98828125, "router_z_loss_mlp": 0.14855957, "step": 5253, "time_per_iteration": 2.58052921295166 }, { "auxiliary_loss_clip": 0.06494383, "auxiliary_loss_mlp": 0.01270779, "balance_loss_clip": 0.06292018, "balance_loss_mlp": 0.01255222, "epoch": 0.3158875695175109, "flos": 23958806296320.0, "grad_norm": 2.1204817937510603, "language_loss": 0.74115312, "learning_rate": 3.2021590182150924e-06, "loss": 0.8188048, "num_input_tokens_seen": 112802905, "router_z_loss_clip": 2.0234375, "router_z_loss_mlp": 0.15551758, "step": 5254, "time_per_iteration": 2.574192762374878 }, { "auxiliary_loss_clip": 0.06498225, "auxiliary_loss_mlp": 0.01271297, "balance_loss_clip": 0.06293406, "balance_loss_mlp": 0.01255412, "epoch": 0.31594769277017887, "flos": 13267036840320.0, "grad_norm": 1.7736609184056444, "language_loss": 0.77915144, "learning_rate": 3.201847741843128e-06, "loss": 0.85684669, "num_input_tokens_seen": 112820305, "router_z_loss_clip": 2.05078125, "router_z_loss_mlp": 0.15875244, "step": 5255, "time_per_iteration": 2.5303430557250977 }, { "auxiliary_loss_clip": 0.06497046, "auxiliary_loss_mlp": 0.01272991, "balance_loss_clip": 0.06296545, "balance_loss_mlp": 0.01255765, "epoch": 0.31600781602284683, "flos": 23375072027520.0, "grad_norm": 2.1693555312239274, "language_loss": 0.78676105, "learning_rate": 3.2015364198969772e-06, "loss": 0.86446142, "num_input_tokens_seen": 112841185, "router_z_loss_clip": 2.00292969, "router_z_loss_mlp": 0.17248535, "step": 5256, "time_per_iteration": 2.5672175884246826 }, { "auxiliary_loss_clip": 0.06491298, "auxiliary_loss_mlp": 0.01272443, "balance_loss_clip": 0.06298671, "balance_loss_mlp": 0.01257709, "epoch": 0.3160679392755148, "flos": 19834707715200.0, "grad_norm": 4.0599201681706845, "language_loss": 0.7182157, "learning_rate": 3.2012250523884453e-06, "loss": 0.79585314, "num_input_tokens_seen": 112860570, "router_z_loss_clip": 1.92675781, "router_z_loss_mlp": 0.14709473, "step": 5257, "time_per_iteration": 2.551830530166626 }, { "auxiliary_loss_clip": 0.06502236, "auxiliary_loss_mlp": 0.01272658, "balance_loss_clip": 0.06297665, "balance_loss_mlp": 0.01256565, "epoch": 0.31612806252818276, "flos": 20199368684160.0, "grad_norm": 1.9969128621675831, "language_loss": 0.7669245, "learning_rate": 3.2009136393293393e-06, "loss": 0.84467345, "num_input_tokens_seen": 112877975, "router_z_loss_clip": 2.04394531, "router_z_loss_mlp": 0.16088867, "step": 5258, "time_per_iteration": 2.5569498538970947 }, { "auxiliary_loss_clip": 0.06496677, "auxiliary_loss_mlp": 0.01270877, "balance_loss_clip": 0.06293094, "balance_loss_mlp": 0.0125532, "epoch": 0.31618818578085073, "flos": 24241596226560.0, "grad_norm": 1.9171618369922516, "language_loss": 0.7317822, "learning_rate": 3.200602180731467e-06, "loss": 0.80945772, "num_input_tokens_seen": 112896170, "router_z_loss_clip": 2.03710938, "router_z_loss_mlp": 0.15551758, "step": 5259, "time_per_iteration": 2.566157579421997 }, { "auxiliary_loss_clip": 0.06501582, "auxiliary_loss_mlp": 0.01274533, "balance_loss_clip": 0.06297972, "balance_loss_mlp": 0.01258201, "epoch": 0.3162483090335187, "flos": 25088735404800.0, "grad_norm": 5.182726402527669, "language_loss": 0.66865778, "learning_rate": 3.20029067660664e-06, "loss": 0.74641889, "num_input_tokens_seen": 112916180, "router_z_loss_clip": 2.03808594, "router_z_loss_mlp": 0.16320801, "step": 5260, "time_per_iteration": 2.592057228088379 }, { "auxiliary_loss_clip": 0.06493738, "auxiliary_loss_mlp": 0.01284118, "balance_loss_clip": 0.06291145, "balance_loss_mlp": 0.0126818, "epoch": 0.31630843228618666, "flos": 26330653895040.0, "grad_norm": 2.24266316065932, "language_loss": 0.72758847, "learning_rate": 3.1999791269666706e-06, "loss": 0.80536705, "num_input_tokens_seen": 112936745, "router_z_loss_clip": 2.02832031, "router_z_loss_mlp": 0.15930176, "step": 5261, "time_per_iteration": 2.6356327533721924 }, { "auxiliary_loss_clip": 0.06386963, "auxiliary_loss_mlp": 0.01276784, "balance_loss_clip": 0.06285766, "balance_loss_mlp": 0.0127212, "epoch": 0.3163685555388547, "flos": 66780053856000.0, "grad_norm": 0.7222020665854625, "language_loss": 0.50659657, "learning_rate": 3.1996675318233716e-06, "loss": 0.58323401, "num_input_tokens_seen": 112994845, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 0.04656982, "step": 5262, "time_per_iteration": 3.2081727981567383 }, { "auxiliary_loss_clip": 0.06495522, "auxiliary_loss_mlp": 0.01276521, "balance_loss_clip": 0.06292871, "balance_loss_mlp": 0.01259939, "epoch": 0.31642867879152264, "flos": 26002987303680.0, "grad_norm": 1.5029040022028204, "language_loss": 0.85537016, "learning_rate": 3.19935589118856e-06, "loss": 0.93309063, "num_input_tokens_seen": 113015125, "router_z_loss_clip": 2.02441406, "router_z_loss_mlp": 0.16577148, "step": 5263, "time_per_iteration": 2.5962307453155518 }, { "auxiliary_loss_clip": 0.06487649, "auxiliary_loss_mlp": 0.01281951, "balance_loss_clip": 0.06291799, "balance_loss_mlp": 0.01266305, "epoch": 0.3164888020441906, "flos": 25781943432960.0, "grad_norm": 2.9016757686327534, "language_loss": 0.82112533, "learning_rate": 3.1990442050740535e-06, "loss": 0.89882135, "num_input_tokens_seen": 113035535, "router_z_loss_clip": 1.95605469, "router_z_loss_mlp": 0.15631104, "step": 5264, "time_per_iteration": 4.004236459732056 }, { "auxiliary_loss_clip": 0.06500256, "auxiliary_loss_mlp": 0.01283754, "balance_loss_clip": 0.06294809, "balance_loss_mlp": 0.01266397, "epoch": 0.3165489252968586, "flos": 19762437968640.0, "grad_norm": 1.9772522614321029, "language_loss": 0.8015331, "learning_rate": 3.19873247349167e-06, "loss": 0.87937319, "num_input_tokens_seen": 113052720, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.17358398, "step": 5265, "time_per_iteration": 2.550633430480957 }, { "auxiliary_loss_clip": 0.0649781, "auxiliary_loss_mlp": 0.01277969, "balance_loss_clip": 0.06290127, "balance_loss_mlp": 0.01260588, "epoch": 0.31660904854952654, "flos": 23190393628800.0, "grad_norm": 1.7997610810489748, "language_loss": 0.75113487, "learning_rate": 3.1984206964532307e-06, "loss": 0.82889259, "num_input_tokens_seen": 113071435, "router_z_loss_clip": 2.07519531, "router_z_loss_mlp": 0.17382812, "step": 5266, "time_per_iteration": 2.572385549545288 }, { "auxiliary_loss_clip": 0.06494564, "auxiliary_loss_mlp": 0.0128517, "balance_loss_clip": 0.06288195, "balance_loss_mlp": 0.0126773, "epoch": 0.3166691718021945, "flos": 20414081571840.0, "grad_norm": 65.12608150643715, "language_loss": 0.80158442, "learning_rate": 3.1981088739705585e-06, "loss": 0.87938178, "num_input_tokens_seen": 113088645, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.17443848, "step": 5267, "time_per_iteration": 2.560999631881714 }, { "auxiliary_loss_clip": 0.06385924, "auxiliary_loss_mlp": 0.01270683, "balance_loss_clip": 0.06284772, "balance_loss_mlp": 0.01265963, "epoch": 0.31672929505486247, "flos": 70165816185600.0, "grad_norm": 0.7773950799914352, "language_loss": 0.57604873, "learning_rate": 3.197797006055478e-06, "loss": 0.65261483, "num_input_tokens_seen": 113152775, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 0.04711914, "step": 5268, "time_per_iteration": 3.2265408039093018 }, { "auxiliary_loss_clip": 0.06498074, "auxiliary_loss_mlp": 0.01284789, "balance_loss_clip": 0.06291619, "balance_loss_mlp": 0.01266776, "epoch": 0.31678941830753043, "flos": 14360977820160.0, "grad_norm": 2.536312257406298, "language_loss": 0.73603392, "learning_rate": 3.197485092719815e-06, "loss": 0.81386256, "num_input_tokens_seen": 113171410, "router_z_loss_clip": 2.06152344, "router_z_loss_mlp": 0.18017578, "step": 5269, "time_per_iteration": 3.969982624053955 }, { "auxiliary_loss_clip": 0.06494296, "auxiliary_loss_mlp": 0.01294022, "balance_loss_clip": 0.06292149, "balance_loss_mlp": 0.0127601, "epoch": 0.3168495415601984, "flos": 22754385308160.0, "grad_norm": 2.2739779710251344, "language_loss": 0.79868895, "learning_rate": 3.1971731339753973e-06, "loss": 0.87657213, "num_input_tokens_seen": 113189965, "router_z_loss_clip": 2.02246094, "router_z_loss_mlp": 0.18017578, "step": 5270, "time_per_iteration": 2.5863122940063477 }, { "auxiliary_loss_clip": 0.06498358, "auxiliary_loss_mlp": 0.01285329, "balance_loss_clip": 0.06291787, "balance_loss_mlp": 0.01267531, "epoch": 0.31690966481286637, "flos": 20120558319360.0, "grad_norm": 3.3083833569826275, "language_loss": 0.80164027, "learning_rate": 3.1968611298340545e-06, "loss": 0.87947714, "num_input_tokens_seen": 113206355, "router_z_loss_clip": 2.06542969, "router_z_loss_mlp": 0.17810059, "step": 5271, "time_per_iteration": 2.531507968902588 }, { "auxiliary_loss_clip": 0.06502989, "auxiliary_loss_mlp": 0.01292523, "balance_loss_clip": 0.06296574, "balance_loss_mlp": 0.01274368, "epoch": 0.31696978806553433, "flos": 21185345278080.0, "grad_norm": 4.207045536624656, "language_loss": 0.73376215, "learning_rate": 3.1965490803076173e-06, "loss": 0.81171733, "num_input_tokens_seen": 113225440, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.18151855, "step": 5272, "time_per_iteration": 2.564788818359375 }, { "auxiliary_loss_clip": 0.06507288, "auxiliary_loss_mlp": 0.01292776, "balance_loss_clip": 0.06297299, "balance_loss_mlp": 0.01272379, "epoch": 0.3170299113182023, "flos": 43007030789760.0, "grad_norm": 4.545399961113533, "language_loss": 0.69582748, "learning_rate": 3.1962369854079194e-06, "loss": 0.77382815, "num_input_tokens_seen": 113248840, "router_z_loss_clip": 2.09863281, "router_z_loss_mlp": 0.20397949, "step": 5273, "time_per_iteration": 2.7591984272003174 }, { "auxiliary_loss_clip": 0.06508359, "auxiliary_loss_mlp": 0.0129195, "balance_loss_clip": 0.06302704, "balance_loss_mlp": 0.01274331, "epoch": 0.31709003457087026, "flos": 24466707020160.0, "grad_norm": 2.7585618636157823, "language_loss": 0.68251038, "learning_rate": 3.195924845146795e-06, "loss": 0.76051342, "num_input_tokens_seen": 113269630, "router_z_loss_clip": 2.05566406, "router_z_loss_mlp": 0.17614746, "step": 5274, "time_per_iteration": 2.604736804962158 }, { "auxiliary_loss_clip": 0.06510881, "auxiliary_loss_mlp": 0.01310576, "balance_loss_clip": 0.06311005, "balance_loss_mlp": 0.01293076, "epoch": 0.3171501578235382, "flos": 24142394592000.0, "grad_norm": 1.492949957723513, "language_loss": 0.80932993, "learning_rate": 3.195612659536081e-06, "loss": 0.88754451, "num_input_tokens_seen": 113291200, "router_z_loss_clip": 1.99804688, "router_z_loss_mlp": 0.17504883, "step": 5275, "time_per_iteration": 2.7301547527313232 }, { "auxiliary_loss_clip": 0.0651661, "auxiliary_loss_mlp": 0.01304562, "balance_loss_clip": 0.06309483, "balance_loss_mlp": 0.0128531, "epoch": 0.31721028107620625, "flos": 18885641644800.0, "grad_norm": 2.2096065920261974, "language_loss": 0.73721981, "learning_rate": 3.1953004285876147e-06, "loss": 0.81543148, "num_input_tokens_seen": 113310170, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.19250488, "step": 5276, "time_per_iteration": 2.6065945625305176 }, { "auxiliary_loss_clip": 0.06508918, "auxiliary_loss_mlp": 0.01317248, "balance_loss_clip": 0.06306468, "balance_loss_mlp": 0.01297913, "epoch": 0.3172704043288742, "flos": 23154405500160.0, "grad_norm": 2.383338966254294, "language_loss": 0.78517556, "learning_rate": 3.194988152313236e-06, "loss": 0.86343729, "num_input_tokens_seen": 113331140, "router_z_loss_clip": 2.02539062, "router_z_loss_mlp": 0.19335938, "step": 5277, "time_per_iteration": 3.984330415725708 }, { "auxiliary_loss_clip": 0.06528009, "auxiliary_loss_mlp": 0.01317167, "balance_loss_clip": 0.06319802, "balance_loss_mlp": 0.01295721, "epoch": 0.3173305275815422, "flos": 17864347754880.0, "grad_norm": 2.2346835739012043, "language_loss": 0.80093145, "learning_rate": 3.1946758307247878e-06, "loss": 0.87938321, "num_input_tokens_seen": 113350030, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.21447754, "step": 5278, "time_per_iteration": 2.5654094219207764 }, { "auxiliary_loss_clip": 0.06395272, "auxiliary_loss_mlp": 0.01257309, "balance_loss_clip": 0.06292319, "balance_loss_mlp": 0.01252043, "epoch": 0.31739065083421014, "flos": 59988083529600.0, "grad_norm": 0.9673361789827316, "language_loss": 0.62887788, "learning_rate": 3.1943634638341114e-06, "loss": 0.70540369, "num_input_tokens_seen": 113395820, "router_z_loss_clip": 1.03125, "router_z_loss_mlp": 0.05270386, "step": 5279, "time_per_iteration": 2.968745231628418 }, { "auxiliary_loss_clip": 0.06523305, "auxiliary_loss_mlp": 0.0133843, "balance_loss_clip": 0.0631098, "balance_loss_mlp": 0.01314707, "epoch": 0.3174507740868781, "flos": 23807013425280.0, "grad_norm": 1.781411328482952, "language_loss": 0.81605816, "learning_rate": 3.194051051653053e-06, "loss": 0.89467555, "num_input_tokens_seen": 113416835, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.23742676, "step": 5280, "time_per_iteration": 4.02339506149292 }, { "auxiliary_loss_clip": 0.06515254, "auxiliary_loss_mlp": 0.01319822, "balance_loss_clip": 0.06312931, "balance_loss_mlp": 0.01298686, "epoch": 0.31751089733954607, "flos": 27646728848640.0, "grad_norm": 1.4544018359454336, "language_loss": 0.78492594, "learning_rate": 3.19373859419346e-06, "loss": 0.86327672, "num_input_tokens_seen": 113440850, "router_z_loss_clip": 2.02148438, "router_z_loss_mlp": 0.21105957, "step": 5281, "time_per_iteration": 2.66400146484375 }, { "auxiliary_loss_clip": 0.06512889, "auxiliary_loss_mlp": 0.01308741, "balance_loss_clip": 0.06306785, "balance_loss_mlp": 0.01287677, "epoch": 0.31757102059221404, "flos": 23776098468480.0, "grad_norm": 1.5549230487155945, "language_loss": 0.78712332, "learning_rate": 3.193426091467179e-06, "loss": 0.8653397, "num_input_tokens_seen": 113461000, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.21057129, "step": 5282, "time_per_iteration": 2.5839595794677734 }, { "auxiliary_loss_clip": 0.06521523, "auxiliary_loss_mlp": 0.01301876, "balance_loss_clip": 0.06311223, "balance_loss_mlp": 0.01280382, "epoch": 0.317631143844882, "flos": 25271485159680.0, "grad_norm": 2.2822648316405423, "language_loss": 0.67568231, "learning_rate": 3.193113543486061e-06, "loss": 0.75391626, "num_input_tokens_seen": 113480820, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.21484375, "step": 5283, "time_per_iteration": 2.6186957359313965 }, { "auxiliary_loss_clip": 0.06385493, "auxiliary_loss_mlp": 0.0126012, "balance_loss_clip": 0.06282938, "balance_loss_mlp": 0.01255903, "epoch": 0.31769126709754997, "flos": 55841832743040.0, "grad_norm": 0.7161045849090023, "language_loss": 0.52595198, "learning_rate": 3.192800950261958e-06, "loss": 0.60240817, "num_input_tokens_seen": 113536910, "router_z_loss_clip": 1.0234375, "router_z_loss_mlp": 0.04220581, "step": 5284, "time_per_iteration": 3.1589713096618652 }, { "auxiliary_loss_clip": 0.06520237, "auxiliary_loss_mlp": 0.01294495, "balance_loss_clip": 0.06305218, "balance_loss_mlp": 0.01273026, "epoch": 0.31775139035021793, "flos": 16696124530560.0, "grad_norm": 2.0158306706350366, "language_loss": 0.70700133, "learning_rate": 3.1924883118067235e-06, "loss": 0.78514862, "num_input_tokens_seen": 113555480, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.21472168, "step": 5285, "time_per_iteration": 2.545088052749634 }, { "auxiliary_loss_clip": 0.06375375, "auxiliary_loss_mlp": 0.01254374, "balance_loss_clip": 0.06272028, "balance_loss_mlp": 0.01250026, "epoch": 0.3178115136028859, "flos": 64246141261440.0, "grad_norm": 0.8124097931675159, "language_loss": 0.60550559, "learning_rate": 3.1921756281322123e-06, "loss": 0.68180305, "num_input_tokens_seen": 113616790, "router_z_loss_clip": 1.03125, "router_z_loss_mlp": 0.04354858, "step": 5286, "time_per_iteration": 3.243647813796997 }, { "auxiliary_loss_clip": 0.06518099, "auxiliary_loss_mlp": 0.0130779, "balance_loss_clip": 0.06307407, "balance_loss_mlp": 0.01286225, "epoch": 0.31787163685555386, "flos": 18703395014400.0, "grad_norm": 2.564514145253648, "language_loss": 0.72466177, "learning_rate": 3.1918628992502826e-06, "loss": 0.8029207, "num_input_tokens_seen": 113635320, "router_z_loss_clip": 2.10449219, "router_z_loss_mlp": 0.21582031, "step": 5287, "time_per_iteration": 2.577974557876587 }, { "auxiliary_loss_clip": 0.06507781, "auxiliary_loss_mlp": 0.01294613, "balance_loss_clip": 0.06296737, "balance_loss_mlp": 0.01273143, "epoch": 0.31793176010822183, "flos": 21331184509440.0, "grad_norm": 2.8197776009660043, "language_loss": 0.76243168, "learning_rate": 3.191550125172792e-06, "loss": 0.84045565, "num_input_tokens_seen": 113654000, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.21484375, "step": 5288, "time_per_iteration": 2.570361375808716 }, { "auxiliary_loss_clip": 0.06497812, "auxiliary_loss_mlp": 0.01296072, "balance_loss_clip": 0.06294902, "balance_loss_mlp": 0.01277428, "epoch": 0.31799188336088985, "flos": 20964846458880.0, "grad_norm": 2.4558109570925617, "language_loss": 0.88339609, "learning_rate": 3.1912373059116007e-06, "loss": 0.96133494, "num_input_tokens_seen": 113672375, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.1862793, "step": 5289, "time_per_iteration": 2.5897955894470215 }, { "auxiliary_loss_clip": 0.06497748, "auxiliary_loss_mlp": 0.01300006, "balance_loss_clip": 0.06293531, "balance_loss_mlp": 0.01280456, "epoch": 0.3180520066135578, "flos": 22498485338880.0, "grad_norm": 1.7642395063900933, "language_loss": 0.68274766, "learning_rate": 3.190924441478572e-06, "loss": 0.7607252, "num_input_tokens_seen": 113692385, "router_z_loss_clip": 2.04101562, "router_z_loss_mlp": 0.19543457, "step": 5290, "time_per_iteration": 2.5914039611816406 }, { "auxiliary_loss_clip": 0.0650694, "auxiliary_loss_mlp": 0.01299557, "balance_loss_clip": 0.06293148, "balance_loss_mlp": 0.01277241, "epoch": 0.3181121298662258, "flos": 27242725587840.0, "grad_norm": 1.647458576007488, "language_loss": 0.79846686, "learning_rate": 3.1906115318855687e-06, "loss": 0.87653184, "num_input_tokens_seen": 113712145, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.22338867, "step": 5291, "time_per_iteration": 2.6222267150878906 }, { "auxiliary_loss_clip": 0.06500493, "auxiliary_loss_mlp": 0.01282334, "balance_loss_clip": 0.06291638, "balance_loss_mlp": 0.01261079, "epoch": 0.31817225311889374, "flos": 23185991289600.0, "grad_norm": 2.1634645141165896, "language_loss": 0.80059612, "learning_rate": 3.1902985771444577e-06, "loss": 0.87842441, "num_input_tokens_seen": 113731435, "router_z_loss_clip": 2.08886719, "router_z_loss_mlp": 0.21264648, "step": 5292, "time_per_iteration": 2.5689525604248047 }, { "auxiliary_loss_clip": 0.06487101, "auxiliary_loss_mlp": 0.01280494, "balance_loss_clip": 0.06288052, "balance_loss_mlp": 0.01262207, "epoch": 0.3182323763715617, "flos": 23265598268160.0, "grad_norm": 2.091879184545463, "language_loss": 0.75698501, "learning_rate": 3.1899855772671043e-06, "loss": 0.83466101, "num_input_tokens_seen": 113750825, "router_z_loss_clip": 1.99023438, "router_z_loss_mlp": 0.1829834, "step": 5293, "time_per_iteration": 2.616328239440918 }, { "auxiliary_loss_clip": 0.0648739, "auxiliary_loss_mlp": 0.01279602, "balance_loss_clip": 0.06284812, "balance_loss_mlp": 0.01261839, "epoch": 0.3182924996242297, "flos": 29023292050560.0, "grad_norm": 1.9761212712687548, "language_loss": 0.7531383, "learning_rate": 3.189672532265379e-06, "loss": 0.83080816, "num_input_tokens_seen": 113770010, "router_z_loss_clip": 2.02734375, "router_z_loss_mlp": 0.17773438, "step": 5294, "time_per_iteration": 2.6530797481536865 }, { "auxiliary_loss_clip": 0.06494991, "auxiliary_loss_mlp": 0.0128676, "balance_loss_clip": 0.06290583, "balance_loss_mlp": 0.01265862, "epoch": 0.31835262287689764, "flos": 20455478288640.0, "grad_norm": 1.8587319526037611, "language_loss": 0.76518553, "learning_rate": 3.189359442151152e-06, "loss": 0.84300303, "num_input_tokens_seen": 113788640, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.2088623, "step": 5295, "time_per_iteration": 2.5704617500305176 }, { "auxiliary_loss_clip": 0.06503975, "auxiliary_loss_mlp": 0.01280995, "balance_loss_clip": 0.06293187, "balance_loss_mlp": 0.01261672, "epoch": 0.3184127461295656, "flos": 25126568323200.0, "grad_norm": 1.46383481877475, "language_loss": 0.69970298, "learning_rate": 3.189046306936296e-06, "loss": 0.7775526, "num_input_tokens_seen": 113809515, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.19335938, "step": 5296, "time_per_iteration": 2.6230030059814453 }, { "auxiliary_loss_clip": 0.06492409, "auxiliary_loss_mlp": 0.01277005, "balance_loss_clip": 0.06287587, "balance_loss_mlp": 0.01258158, "epoch": 0.31847286938223357, "flos": 25557377690880.0, "grad_norm": 1.5688780171526215, "language_loss": 0.77774334, "learning_rate": 3.1887331266326846e-06, "loss": 0.85543752, "num_input_tokens_seen": 113829770, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.18835449, "step": 5297, "time_per_iteration": 2.6017889976501465 }, { "auxiliary_loss_clip": 0.06490218, "auxiliary_loss_mlp": 0.01275197, "balance_loss_clip": 0.06285296, "balance_loss_mlp": 0.01255539, "epoch": 0.31853299263490154, "flos": 27789926676480.0, "grad_norm": 1.8035124527003281, "language_loss": 0.79458129, "learning_rate": 3.1884199012521942e-06, "loss": 0.87223548, "num_input_tokens_seen": 113849320, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.1965332, "step": 5298, "time_per_iteration": 2.599212884902954 }, { "auxiliary_loss_clip": 0.06499767, "auxiliary_loss_mlp": 0.01273445, "balance_loss_clip": 0.06289827, "balance_loss_mlp": 0.01254956, "epoch": 0.3185931158875695, "flos": 22712653175040.0, "grad_norm": 1.924381845411763, "language_loss": 0.74660653, "learning_rate": 3.1881066308067016e-06, "loss": 0.82433867, "num_input_tokens_seen": 113867860, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.18481445, "step": 5299, "time_per_iteration": 2.5692710876464844 }, { "auxiliary_loss_clip": 0.06502007, "auxiliary_loss_mlp": 0.01277785, "balance_loss_clip": 0.06291775, "balance_loss_mlp": 0.0125808, "epoch": 0.31865323914023747, "flos": 24578402912640.0, "grad_norm": 2.347618929129865, "language_loss": 0.78142834, "learning_rate": 3.1877933153080873e-06, "loss": 0.85922623, "num_input_tokens_seen": 113886375, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.19714355, "step": 5300, "time_per_iteration": 2.5942094326019287 }, { "auxiliary_loss_clip": 0.06491758, "auxiliary_loss_mlp": 0.01276272, "balance_loss_clip": 0.06286821, "balance_loss_mlp": 0.01257401, "epoch": 0.31871336239290543, "flos": 18192391689600.0, "grad_norm": 2.135666445097304, "language_loss": 0.84932011, "learning_rate": 3.1874799547682304e-06, "loss": 0.9270004, "num_input_tokens_seen": 113904065, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.1887207, "step": 5301, "time_per_iteration": 2.5365142822265625 }, { "auxiliary_loss_clip": 0.06490596, "auxiliary_loss_mlp": 0.01274964, "balance_loss_clip": 0.06289262, "balance_loss_mlp": 0.01254853, "epoch": 0.31877348564557345, "flos": 21831789928320.0, "grad_norm": 2.182229207155364, "language_loss": 0.77542037, "learning_rate": 3.187166549199015e-06, "loss": 0.85307592, "num_input_tokens_seen": 113918415, "router_z_loss_clip": 2.01171875, "router_z_loss_mlp": 0.20129395, "step": 5302, "time_per_iteration": 2.545283555984497 }, { "auxiliary_loss_clip": 0.06488065, "auxiliary_loss_mlp": 0.01275185, "balance_loss_clip": 0.06289099, "balance_loss_mlp": 0.01257602, "epoch": 0.3188336088982414, "flos": 22021331863680.0, "grad_norm": 1.6681400134129016, "language_loss": 0.7970773, "learning_rate": 3.1868530986123255e-06, "loss": 0.87470984, "num_input_tokens_seen": 113938135, "router_z_loss_clip": 1.99023438, "router_z_loss_mlp": 0.17590332, "step": 5303, "time_per_iteration": 4.034299373626709 }, { "auxiliary_loss_clip": 0.06512433, "auxiliary_loss_mlp": 0.01279624, "balance_loss_clip": 0.06295146, "balance_loss_mlp": 0.0125943, "epoch": 0.3188937321509094, "flos": 20054116431360.0, "grad_norm": 1.9640992829311603, "language_loss": 0.73298299, "learning_rate": 3.186539603020047e-06, "loss": 0.81090367, "num_input_tokens_seen": 113957125, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.2019043, "step": 5304, "time_per_iteration": 2.6099655628204346 }, { "auxiliary_loss_clip": 0.06491769, "auxiliary_loss_mlp": 0.01273172, "balance_loss_clip": 0.0629142, "balance_loss_mlp": 0.01255911, "epoch": 0.31895385540357735, "flos": 25855135574400.0, "grad_norm": 1.88645318054349, "language_loss": 0.73083282, "learning_rate": 3.186226062434068e-06, "loss": 0.80848223, "num_input_tokens_seen": 113974875, "router_z_loss_clip": 2.00292969, "router_z_loss_mlp": 0.17285156, "step": 5305, "time_per_iteration": 2.6328659057617188 }, { "auxiliary_loss_clip": 0.06494163, "auxiliary_loss_mlp": 0.01272959, "balance_loss_clip": 0.06291262, "balance_loss_mlp": 0.01256174, "epoch": 0.3190139786562453, "flos": 23484545786880.0, "grad_norm": 1.689192806888103, "language_loss": 0.64121956, "learning_rate": 3.1859124768662778e-06, "loss": 0.71889079, "num_input_tokens_seen": 113994450, "router_z_loss_clip": 2.02636719, "router_z_loss_mlp": 0.16796875, "step": 5306, "time_per_iteration": 2.582775354385376 }, { "auxiliary_loss_clip": 0.06496659, "auxiliary_loss_mlp": 0.01275556, "balance_loss_clip": 0.06290932, "balance_loss_mlp": 0.01256172, "epoch": 0.3190741019089133, "flos": 29103150591360.0, "grad_norm": 2.5513543743750686, "language_loss": 0.80075234, "learning_rate": 3.1855988463285678e-06, "loss": 0.87847447, "num_input_tokens_seen": 114013945, "router_z_loss_clip": 2.05859375, "router_z_loss_mlp": 0.19372559, "step": 5307, "time_per_iteration": 2.6158792972564697 }, { "auxiliary_loss_clip": 0.0648884, "auxiliary_loss_mlp": 0.01278015, "balance_loss_clip": 0.06289243, "balance_loss_mlp": 0.01259395, "epoch": 0.31913422516158124, "flos": 17135361233280.0, "grad_norm": 2.178628112757677, "language_loss": 0.78349853, "learning_rate": 3.1852851708328308e-06, "loss": 0.86116707, "num_input_tokens_seen": 114031375, "router_z_loss_clip": 1.99316406, "router_z_loss_mlp": 0.18591309, "step": 5308, "time_per_iteration": 2.5316247940063477 }, { "auxiliary_loss_clip": 0.06508496, "auxiliary_loss_mlp": 0.01275635, "balance_loss_clip": 0.06297286, "balance_loss_mlp": 0.01256501, "epoch": 0.3191943484142492, "flos": 16075228176000.0, "grad_norm": 2.5897874070349984, "language_loss": 0.74434006, "learning_rate": 3.184971450390961e-06, "loss": 0.8221814, "num_input_tokens_seen": 114048465, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.19140625, "step": 5309, "time_per_iteration": 3.9915599822998047 }, { "auxiliary_loss_clip": 0.06496973, "auxiliary_loss_mlp": 0.01271576, "balance_loss_clip": 0.06293401, "balance_loss_mlp": 0.01255399, "epoch": 0.3192544716669172, "flos": 22972787775360.0, "grad_norm": 1.9677089052948173, "language_loss": 0.83149379, "learning_rate": 3.184657685014856e-06, "loss": 0.90917927, "num_input_tokens_seen": 114068415, "router_z_loss_clip": 2.03417969, "router_z_loss_mlp": 0.16174316, "step": 5310, "time_per_iteration": 2.5810906887054443 }, { "auxiliary_loss_clip": 0.06495216, "auxiliary_loss_mlp": 0.01276899, "balance_loss_clip": 0.06292433, "balance_loss_mlp": 0.01260532, "epoch": 0.31931459491958514, "flos": 26877645348480.0, "grad_norm": 1.387167517991584, "language_loss": 0.78640848, "learning_rate": 3.184343874716412e-06, "loss": 0.86412966, "num_input_tokens_seen": 114088565, "router_z_loss_clip": 2.02539062, "router_z_loss_mlp": 0.16357422, "step": 5311, "time_per_iteration": 2.607335090637207 }, { "auxiliary_loss_clip": 0.06494198, "auxiliary_loss_mlp": 0.01273515, "balance_loss_clip": 0.06294307, "balance_loss_mlp": 0.01257505, "epoch": 0.3193747181722531, "flos": 21843194083200.0, "grad_norm": 1.6849346924497062, "language_loss": 0.8509711, "learning_rate": 3.1840300195075295e-06, "loss": 0.92864823, "num_input_tokens_seen": 114107160, "router_z_loss_clip": 1.99511719, "router_z_loss_mlp": 0.16009521, "step": 5312, "time_per_iteration": 2.569079637527466 }, { "auxiliary_loss_clip": 0.06501992, "auxiliary_loss_mlp": 0.01276372, "balance_loss_clip": 0.06292412, "balance_loss_mlp": 0.01259432, "epoch": 0.31943484142492107, "flos": 18329593950720.0, "grad_norm": 4.106049171978666, "language_loss": 0.78796363, "learning_rate": 3.1837161194001102e-06, "loss": 0.86574733, "num_input_tokens_seen": 114123420, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.16943359, "step": 5313, "time_per_iteration": 2.519191026687622 }, { "auxiliary_loss_clip": 0.06496058, "auxiliary_loss_mlp": 0.01274726, "balance_loss_clip": 0.06293836, "balance_loss_mlp": 0.01258705, "epoch": 0.31949496467758903, "flos": 21622150212480.0, "grad_norm": 3.2238580948612383, "language_loss": 0.86143595, "learning_rate": 3.183402174406057e-06, "loss": 0.93914378, "num_input_tokens_seen": 114139230, "router_z_loss_clip": 2.02050781, "router_z_loss_mlp": 0.16015625, "step": 5314, "time_per_iteration": 2.554093837738037 }, { "auxiliary_loss_clip": 0.0649294, "auxiliary_loss_mlp": 0.01271807, "balance_loss_clip": 0.06291001, "balance_loss_mlp": 0.01255606, "epoch": 0.31955508793025705, "flos": 21766312362240.0, "grad_norm": 2.8057493218370997, "language_loss": 0.79947692, "learning_rate": 3.1830881845372747e-06, "loss": 0.87712431, "num_input_tokens_seen": 114159290, "router_z_loss_clip": 2.01855469, "router_z_loss_mlp": 0.1619873, "step": 5315, "time_per_iteration": 2.541764497756958 }, { "auxiliary_loss_clip": 0.06492715, "auxiliary_loss_mlp": 0.01277353, "balance_loss_clip": 0.06290592, "balance_loss_mlp": 0.01260139, "epoch": 0.319615211182925, "flos": 17169881915520.0, "grad_norm": 1.6116878238175152, "language_loss": 0.67854416, "learning_rate": 3.18277414980567e-06, "loss": 0.7562449, "num_input_tokens_seen": 114177655, "router_z_loss_clip": 2.01855469, "router_z_loss_mlp": 0.17224121, "step": 5316, "time_per_iteration": 3.980894088745117 }, { "auxiliary_loss_clip": 0.06495544, "auxiliary_loss_mlp": 0.01272924, "balance_loss_clip": 0.06296667, "balance_loss_mlp": 0.01256748, "epoch": 0.319675334435593, "flos": 28120653941760.0, "grad_norm": 2.1270710545518514, "language_loss": 0.69430697, "learning_rate": 3.1824600702231515e-06, "loss": 0.77199161, "num_input_tokens_seen": 114200880, "router_z_loss_clip": 1.98632812, "router_z_loss_mlp": 0.16186523, "step": 5317, "time_per_iteration": 2.665893077850342 }, { "auxiliary_loss_clip": 0.06379604, "auxiliary_loss_mlp": 0.01273756, "balance_loss_clip": 0.0627876, "balance_loss_mlp": 0.01270328, "epoch": 0.31973545768826095, "flos": 69524235072000.0, "grad_norm": 0.7180808232474781, "language_loss": 0.52821809, "learning_rate": 3.182145945801628e-06, "loss": 0.60475171, "num_input_tokens_seen": 114267145, "router_z_loss_clip": 1.0078125, "router_z_loss_mlp": 0.03436279, "step": 5318, "time_per_iteration": 3.3385322093963623 }, { "auxiliary_loss_clip": 0.06490792, "auxiliary_loss_mlp": 0.01270342, "balance_loss_clip": 0.06293251, "balance_loss_mlp": 0.01254559, "epoch": 0.3197955809409289, "flos": 13704344899200.0, "grad_norm": 1.601190430147581, "language_loss": 0.84270632, "learning_rate": 3.181831776553012e-06, "loss": 0.92031771, "num_input_tokens_seen": 114284630, "router_z_loss_clip": 1.97558594, "router_z_loss_mlp": 0.15771484, "step": 5319, "time_per_iteration": 2.5545880794525146 }, { "auxiliary_loss_clip": 0.06493548, "auxiliary_loss_mlp": 0.01279878, "balance_loss_clip": 0.06295461, "balance_loss_mlp": 0.01263164, "epoch": 0.3198557041935969, "flos": 33226368704640.0, "grad_norm": 1.5982270612413776, "language_loss": 0.63898432, "learning_rate": 3.1815175624892165e-06, "loss": 0.71671855, "num_input_tokens_seen": 114305830, "router_z_loss_clip": 1.98242188, "router_z_loss_mlp": 0.1673584, "step": 5320, "time_per_iteration": 4.122129440307617 }, { "auxiliary_loss_clip": 0.065027, "auxiliary_loss_mlp": 0.01272445, "balance_loss_clip": 0.06296182, "balance_loss_mlp": 0.01255744, "epoch": 0.31991582744626484, "flos": 23738726747520.0, "grad_norm": 2.8234168640149715, "language_loss": 0.71019757, "learning_rate": 3.1812033036221567e-06, "loss": 0.78794909, "num_input_tokens_seen": 114325165, "router_z_loss_clip": 2.06542969, "router_z_loss_mlp": 0.16687012, "step": 5321, "time_per_iteration": 2.707578182220459 }, { "auxiliary_loss_clip": 0.06512072, "auxiliary_loss_mlp": 0.01282862, "balance_loss_clip": 0.06302729, "balance_loss_mlp": 0.01264826, "epoch": 0.3199759506989328, "flos": 18556633388160.0, "grad_norm": 2.6363301123525553, "language_loss": 0.86980253, "learning_rate": 3.180888999963749e-06, "loss": 0.94775194, "num_input_tokens_seen": 114341310, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.18029785, "step": 5322, "time_per_iteration": 2.621342897415161 }, { "auxiliary_loss_clip": 0.06497061, "auxiliary_loss_mlp": 0.01271339, "balance_loss_clip": 0.06295463, "balance_loss_mlp": 0.01255722, "epoch": 0.3200360739516008, "flos": 22425418978560.0, "grad_norm": 2.219192079599595, "language_loss": 0.83643317, "learning_rate": 3.1805746515259123e-06, "loss": 0.91411716, "num_input_tokens_seen": 114360355, "router_z_loss_clip": 2.01367188, "router_z_loss_mlp": 0.15612793, "step": 5323, "time_per_iteration": 2.579373598098755 }, { "auxiliary_loss_clip": 0.06491591, "auxiliary_loss_mlp": 0.01274977, "balance_loss_clip": 0.06292991, "balance_loss_mlp": 0.01257048, "epoch": 0.32009619720426874, "flos": 20601569082240.0, "grad_norm": 1.8728630005809344, "language_loss": 0.78661132, "learning_rate": 3.1802602583205663e-06, "loss": 0.86427701, "num_input_tokens_seen": 114379220, "router_z_loss_clip": 1.98339844, "router_z_loss_mlp": 0.17932129, "step": 5324, "time_per_iteration": 2.5517396926879883 }, { "auxiliary_loss_clip": 0.06495216, "auxiliary_loss_mlp": 0.01270988, "balance_loss_clip": 0.0629498, "balance_loss_mlp": 0.01255324, "epoch": 0.3201563204569367, "flos": 18153049397760.0, "grad_norm": 1.7666373321560724, "language_loss": 0.80670965, "learning_rate": 3.1799458203596333e-06, "loss": 0.88437176, "num_input_tokens_seen": 114396365, "router_z_loss_clip": 2.00195312, "router_z_loss_mlp": 0.15649414, "step": 5325, "time_per_iteration": 2.5275533199310303 }, { "auxiliary_loss_clip": 0.06496416, "auxiliary_loss_mlp": 0.01275314, "balance_loss_clip": 0.06295057, "balance_loss_mlp": 0.0125903, "epoch": 0.32021644370960467, "flos": 31691975137920.0, "grad_norm": 1.8636566652695212, "language_loss": 0.75280941, "learning_rate": 3.179631337655037e-06, "loss": 0.83052677, "num_input_tokens_seen": 114416780, "router_z_loss_clip": 2.01367188, "router_z_loss_mlp": 0.16278076, "step": 5326, "time_per_iteration": 2.6294631958007812 }, { "auxiliary_loss_clip": 0.06487171, "auxiliary_loss_mlp": 0.01274828, "balance_loss_clip": 0.06290378, "balance_loss_mlp": 0.01258413, "epoch": 0.32027656696227264, "flos": 26872488322560.0, "grad_norm": 3.6468673282210053, "language_loss": 0.81064737, "learning_rate": 3.179316810218701e-06, "loss": 0.88826734, "num_input_tokens_seen": 114437405, "router_z_loss_clip": 1.96777344, "router_z_loss_mlp": 0.1640625, "step": 5327, "time_per_iteration": 2.5981287956237793 }, { "auxiliary_loss_clip": 0.06507075, "auxiliary_loss_mlp": 0.01271301, "balance_loss_clip": 0.06299729, "balance_loss_mlp": 0.01254833, "epoch": 0.32033669021494066, "flos": 24176705639040.0, "grad_norm": 1.7656066336784477, "language_loss": 0.77936536, "learning_rate": 3.179002238062554e-06, "loss": 0.85714912, "num_input_tokens_seen": 114458505, "router_z_loss_clip": 2.07128906, "router_z_loss_mlp": 0.16461182, "step": 5328, "time_per_iteration": 2.564056634902954 }, { "auxiliary_loss_clip": 0.06503601, "auxiliary_loss_mlp": 0.01278368, "balance_loss_clip": 0.06300443, "balance_loss_mlp": 0.01261678, "epoch": 0.3203968134676086, "flos": 24467419779840.0, "grad_norm": 1.6657806943731195, "language_loss": 0.73964685, "learning_rate": 3.178687621198524e-06, "loss": 0.8174665, "num_input_tokens_seen": 114479050, "router_z_loss_clip": 2.03027344, "router_z_loss_mlp": 0.16687012, "step": 5329, "time_per_iteration": 2.587512731552124 }, { "auxiliary_loss_clip": 0.06486346, "auxiliary_loss_mlp": 0.01276837, "balance_loss_clip": 0.06292263, "balance_loss_mlp": 0.01261209, "epoch": 0.3204569367202766, "flos": 18010606256640.0, "grad_norm": 1.5555526453564785, "language_loss": 0.71165329, "learning_rate": 3.1783729596385415e-06, "loss": 0.78928512, "num_input_tokens_seen": 114497415, "router_z_loss_clip": 1.94335938, "router_z_loss_mlp": 0.15625, "step": 5330, "time_per_iteration": 2.527160406112671 }, { "auxiliary_loss_clip": 0.06501267, "auxiliary_loss_mlp": 0.01272778, "balance_loss_clip": 0.06294289, "balance_loss_mlp": 0.01254348, "epoch": 0.32051705997294455, "flos": 30597237544320.0, "grad_norm": 2.0655081434088025, "language_loss": 0.80423063, "learning_rate": 3.1780582533945376e-06, "loss": 0.88197112, "num_input_tokens_seen": 114518785, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.1842041, "step": 5331, "time_per_iteration": 2.6218881607055664 }, { "auxiliary_loss_clip": 0.06376946, "auxiliary_loss_mlp": 0.01252691, "balance_loss_clip": 0.06276496, "balance_loss_mlp": 0.01249213, "epoch": 0.3205771832256125, "flos": 68436723657600.0, "grad_norm": 0.8084035481034351, "language_loss": 0.57657576, "learning_rate": 3.177743502478447e-06, "loss": 0.65287209, "num_input_tokens_seen": 114577710, "router_z_loss_clip": 1.0, "router_z_loss_mlp": 0.03488159, "step": 5332, "time_per_iteration": 3.1359996795654297 }, { "auxiliary_loss_clip": 0.06504957, "auxiliary_loss_mlp": 0.01275116, "balance_loss_clip": 0.06299377, "balance_loss_mlp": 0.01258665, "epoch": 0.3206373064782805, "flos": 30451524094080.0, "grad_norm": 1.7153298028332244, "language_loss": 0.73782611, "learning_rate": 3.177428706902205e-06, "loss": 0.81562686, "num_input_tokens_seen": 114598640, "router_z_loss_clip": 2.05371094, "router_z_loss_mlp": 0.16455078, "step": 5333, "time_per_iteration": 2.6332108974456787 }, { "auxiliary_loss_clip": 0.06499721, "auxiliary_loss_mlp": 0.01275601, "balance_loss_clip": 0.06297661, "balance_loss_mlp": 0.01258327, "epoch": 0.32069742973094845, "flos": 22061051498880.0, "grad_norm": 2.2973407695257464, "language_loss": 0.71433568, "learning_rate": 3.1771138666777485e-06, "loss": 0.79208887, "num_input_tokens_seen": 114618780, "router_z_loss_clip": 2.01953125, "router_z_loss_mlp": 0.17260742, "step": 5334, "time_per_iteration": 2.5958516597747803 }, { "auxiliary_loss_clip": 0.06497343, "auxiliary_loss_mlp": 0.01274043, "balance_loss_clip": 0.06296511, "balance_loss_mlp": 0.01257055, "epoch": 0.3207575529836164, "flos": 22060464520320.0, "grad_norm": 1.7407119561360416, "language_loss": 0.77494621, "learning_rate": 3.1767989818170156e-06, "loss": 0.85266006, "num_input_tokens_seen": 114637525, "router_z_loss_clip": 2.0078125, "router_z_loss_mlp": 0.16992188, "step": 5335, "time_per_iteration": 2.5756521224975586 }, { "auxiliary_loss_clip": 0.06496193, "auxiliary_loss_mlp": 0.01274189, "balance_loss_clip": 0.06296022, "balance_loss_mlp": 0.01257858, "epoch": 0.3208176762362844, "flos": 34065961015680.0, "grad_norm": 1.530070058212237, "language_loss": 0.69169474, "learning_rate": 3.1764840523319477e-06, "loss": 0.76939851, "num_input_tokens_seen": 114659705, "router_z_loss_clip": 2.00195312, "router_z_loss_mlp": 0.16333008, "step": 5336, "time_per_iteration": 2.6865835189819336 }, { "auxiliary_loss_clip": 0.06496522, "auxiliary_loss_mlp": 0.01277474, "balance_loss_clip": 0.06295782, "balance_loss_mlp": 0.01260499, "epoch": 0.32087779948895234, "flos": 21805151529600.0, "grad_norm": 2.123373847534383, "language_loss": 0.79461348, "learning_rate": 3.176169078234487e-06, "loss": 0.87235343, "num_input_tokens_seen": 114678340, "router_z_loss_clip": 2.00585938, "router_z_loss_mlp": 0.1697998, "step": 5337, "time_per_iteration": 2.5593814849853516 }, { "auxiliary_loss_clip": 0.06483894, "auxiliary_loss_mlp": 0.01276696, "balance_loss_clip": 0.06289485, "balance_loss_mlp": 0.01261742, "epoch": 0.3209379227416203, "flos": 21440532487680.0, "grad_norm": 1.5920742800112486, "language_loss": 0.75165451, "learning_rate": 3.1758540595365766e-06, "loss": 0.82926035, "num_input_tokens_seen": 114696980, "router_z_loss_clip": 1.94335938, "router_z_loss_mlp": 0.14959717, "step": 5338, "time_per_iteration": 2.5678977966308594 }, { "auxiliary_loss_clip": 0.06500797, "auxiliary_loss_mlp": 0.01278431, "balance_loss_clip": 0.06296246, "balance_loss_mlp": 0.0126055, "epoch": 0.3209980459942883, "flos": 25856267604480.0, "grad_norm": 1.8269216696314456, "language_loss": 0.63545126, "learning_rate": 3.1755389962501626e-06, "loss": 0.71324348, "num_input_tokens_seen": 114717330, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.17883301, "step": 5339, "time_per_iteration": 2.598637819290161 }, { "auxiliary_loss_clip": 0.06500804, "auxiliary_loss_mlp": 0.01274646, "balance_loss_clip": 0.06296156, "balance_loss_mlp": 0.012576, "epoch": 0.32105816924695624, "flos": 19105218069120.0, "grad_norm": 2.3808673912443066, "language_loss": 0.81817406, "learning_rate": 3.175223888387192e-06, "loss": 0.89592856, "num_input_tokens_seen": 114736320, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.17041016, "step": 5340, "time_per_iteration": 2.565891742706299 }, { "auxiliary_loss_clip": 0.06492014, "auxiliary_loss_mlp": 0.01273141, "balance_loss_clip": 0.0629186, "balance_loss_mlp": 0.01256953, "epoch": 0.3211182924996242, "flos": 16587531239040.0, "grad_norm": 2.4095371773381347, "language_loss": 0.76909214, "learning_rate": 3.1749087359596137e-06, "loss": 0.8467437, "num_input_tokens_seen": 114754575, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.16174316, "step": 5341, "time_per_iteration": 2.591444253921509 }, { "auxiliary_loss_clip": 0.06486939, "auxiliary_loss_mlp": 0.01273658, "balance_loss_clip": 0.06287595, "balance_loss_mlp": 0.01257827, "epoch": 0.3211784157522922, "flos": 22678425982080.0, "grad_norm": 1.7211920367967624, "language_loss": 0.79659051, "learning_rate": 3.1745935389793786e-06, "loss": 0.87419641, "num_input_tokens_seen": 114773590, "router_z_loss_clip": 1.99316406, "router_z_loss_mlp": 0.15844727, "step": 5342, "time_per_iteration": 2.642244338989258 }, { "auxiliary_loss_clip": 0.06501836, "auxiliary_loss_mlp": 0.01279628, "balance_loss_clip": 0.06299028, "balance_loss_mlp": 0.01262104, "epoch": 0.3212385390049602, "flos": 20565119756160.0, "grad_norm": 2.1716310796526237, "language_loss": 0.751203, "learning_rate": 3.174278297458438e-06, "loss": 0.82901764, "num_input_tokens_seen": 114790775, "router_z_loss_clip": 2.02539062, "router_z_loss_mlp": 0.17529297, "step": 5343, "time_per_iteration": 3.947681188583374 }, { "auxiliary_loss_clip": 0.06492676, "auxiliary_loss_mlp": 0.01272176, "balance_loss_clip": 0.06291755, "balance_loss_mlp": 0.01254986, "epoch": 0.32129866225762815, "flos": 24798188972160.0, "grad_norm": 1.674391659344979, "language_loss": 0.83226687, "learning_rate": 3.173963011408748e-06, "loss": 0.90991545, "num_input_tokens_seen": 114809835, "router_z_loss_clip": 2.0078125, "router_z_loss_mlp": 0.17211914, "step": 5344, "time_per_iteration": 2.5923967361450195 }, { "auxiliary_loss_clip": 0.06492373, "auxiliary_loss_mlp": 0.01273339, "balance_loss_clip": 0.06290448, "balance_loss_mlp": 0.01257436, "epoch": 0.3213587855102961, "flos": 18372374259840.0, "grad_norm": 2.6003069836721826, "language_loss": 0.80510825, "learning_rate": 3.173647680842262e-06, "loss": 0.88276535, "num_input_tokens_seen": 114826505, "router_z_loss_clip": 2.01757812, "router_z_loss_mlp": 0.15905762, "step": 5345, "time_per_iteration": 2.5304811000823975 }, { "auxiliary_loss_clip": 0.06494172, "auxiliary_loss_mlp": 0.01271993, "balance_loss_clip": 0.06290735, "balance_loss_mlp": 0.01255804, "epoch": 0.3214189087629641, "flos": 27023274944640.0, "grad_norm": 1.7366815161148326, "language_loss": 0.83505952, "learning_rate": 3.1733323057709384e-06, "loss": 0.91272122, "num_input_tokens_seen": 114846140, "router_z_loss_clip": 2.03515625, "router_z_loss_mlp": 0.16174316, "step": 5346, "time_per_iteration": 2.5914127826690674 }, { "auxiliary_loss_clip": 0.06497908, "auxiliary_loss_mlp": 0.0127345, "balance_loss_clip": 0.06291088, "balance_loss_mlp": 0.01256511, "epoch": 0.32147903201563205, "flos": 23154866697600.0, "grad_norm": 1.5810576675752694, "language_loss": 0.81779504, "learning_rate": 3.1730168862067366e-06, "loss": 0.89550865, "num_input_tokens_seen": 114866660, "router_z_loss_clip": 2.06738281, "router_z_loss_mlp": 0.16943359, "step": 5347, "time_per_iteration": 2.5724680423736572 }, { "auxiliary_loss_clip": 0.06492363, "auxiliary_loss_mlp": 0.01274604, "balance_loss_clip": 0.06292678, "balance_loss_mlp": 0.01258117, "epoch": 0.3215391552683, "flos": 16586231500800.0, "grad_norm": 2.5026181801839695, "language_loss": 0.79878616, "learning_rate": 3.1727014221616164e-06, "loss": 0.87645578, "num_input_tokens_seen": 114882820, "router_z_loss_clip": 1.99609375, "router_z_loss_mlp": 0.16467285, "step": 5348, "time_per_iteration": 2.5212504863739014 }, { "auxiliary_loss_clip": 0.06488541, "auxiliary_loss_mlp": 0.01274416, "balance_loss_clip": 0.06287365, "balance_loss_mlp": 0.01258108, "epoch": 0.321599278520968, "flos": 17827604939520.0, "grad_norm": 1.9598225077344198, "language_loss": 0.86144173, "learning_rate": 3.172385913647542e-06, "loss": 0.9390713, "num_input_tokens_seen": 114900745, "router_z_loss_clip": 2.01171875, "router_z_loss_mlp": 0.16308594, "step": 5349, "time_per_iteration": 4.027801275253296 }, { "auxiliary_loss_clip": 0.06492528, "auxiliary_loss_mlp": 0.01274145, "balance_loss_clip": 0.06289744, "balance_loss_mlp": 0.01257229, "epoch": 0.32165940177363594, "flos": 16257097463040.0, "grad_norm": 1.5539204733361947, "language_loss": 0.80962497, "learning_rate": 3.172070360676475e-06, "loss": 0.88729167, "num_input_tokens_seen": 114917940, "router_z_loss_clip": 2.02539062, "router_z_loss_mlp": 0.16906738, "step": 5350, "time_per_iteration": 2.5274033546447754 }, { "auxiliary_loss_clip": 0.06484783, "auxiliary_loss_mlp": 0.01274067, "balance_loss_clip": 0.06288588, "balance_loss_mlp": 0.01258856, "epoch": 0.3217195250263039, "flos": 27607302702720.0, "grad_norm": 1.5979739234532075, "language_loss": 0.8025369, "learning_rate": 3.1717547632603828e-06, "loss": 0.8801254, "num_input_tokens_seen": 114937735, "router_z_loss_clip": 1.96484375, "router_z_loss_mlp": 0.15197754, "step": 5351, "time_per_iteration": 2.603332042694092 }, { "auxiliary_loss_clip": 0.0648814, "auxiliary_loss_mlp": 0.01273801, "balance_loss_clip": 0.06288575, "balance_loss_mlp": 0.01257147, "epoch": 0.3217796482789719, "flos": 21477023740800.0, "grad_norm": 1.7191315631998536, "language_loss": 0.76374567, "learning_rate": 3.1714391214112326e-06, "loss": 0.84136504, "num_input_tokens_seen": 114956630, "router_z_loss_clip": 1.99609375, "router_z_loss_mlp": 0.16650391, "step": 5352, "time_per_iteration": 2.5516843795776367 }, { "auxiliary_loss_clip": 0.06491689, "auxiliary_loss_mlp": 0.01277118, "balance_loss_clip": 0.06291974, "balance_loss_mlp": 0.01261156, "epoch": 0.32183977153163984, "flos": 21222046166400.0, "grad_norm": 3.025395528488203, "language_loss": 0.82172275, "learning_rate": 3.1711234351409933e-06, "loss": 0.89941078, "num_input_tokens_seen": 114976470, "router_z_loss_clip": 1.99511719, "router_z_loss_mlp": 0.15966797, "step": 5353, "time_per_iteration": 2.5478060245513916 }, { "auxiliary_loss_clip": 0.0649105, "auxiliary_loss_mlp": 0.01271131, "balance_loss_clip": 0.06294063, "balance_loss_mlp": 0.0125561, "epoch": 0.3218998947843078, "flos": 24615103800960.0, "grad_norm": 1.4922867471469932, "language_loss": 0.73734117, "learning_rate": 3.1708077044616365e-06, "loss": 0.81496298, "num_input_tokens_seen": 114996710, "router_z_loss_clip": 1.97167969, "router_z_loss_mlp": 0.15515137, "step": 5354, "time_per_iteration": 2.5839478969573975 }, { "auxiliary_loss_clip": 0.06494468, "auxiliary_loss_mlp": 0.01277144, "balance_loss_clip": 0.06291494, "balance_loss_mlp": 0.01261325, "epoch": 0.3219600180369758, "flos": 22276686781440.0, "grad_norm": 1.5543204534313186, "language_loss": 0.83950222, "learning_rate": 3.1704919293851334e-06, "loss": 0.91721833, "num_input_tokens_seen": 115015775, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.15820312, "step": 5355, "time_per_iteration": 4.06453800201416 }, { "auxiliary_loss_clip": 0.06490049, "auxiliary_loss_mlp": 0.01270413, "balance_loss_clip": 0.06287864, "balance_loss_mlp": 0.01254344, "epoch": 0.3220201412896438, "flos": 14944376672640.0, "grad_norm": 2.5289257434979335, "language_loss": 0.71840048, "learning_rate": 3.1701761099234597e-06, "loss": 0.79600507, "num_input_tokens_seen": 115034265, "router_z_loss_clip": 2.02050781, "router_z_loss_mlp": 0.1607666, "step": 5356, "time_per_iteration": 2.5335240364074707 }, { "auxiliary_loss_clip": 0.06503597, "auxiliary_loss_mlp": 0.01276104, "balance_loss_clip": 0.06293052, "balance_loss_mlp": 0.01259176, "epoch": 0.32208026454231176, "flos": 22672807758720.0, "grad_norm": 8.203000947271859, "language_loss": 0.68166667, "learning_rate": 3.1698602460885903e-06, "loss": 0.75946367, "num_input_tokens_seen": 115051945, "router_z_loss_clip": 2.10644531, "router_z_loss_mlp": 0.16918945, "step": 5357, "time_per_iteration": 2.5539703369140625 }, { "auxiliary_loss_clip": 0.06392878, "auxiliary_loss_mlp": 0.01270612, "balance_loss_clip": 0.06293479, "balance_loss_mlp": 0.01267083, "epoch": 0.3221403877949797, "flos": 64626273308160.0, "grad_norm": 0.679003845851277, "language_loss": 0.58229148, "learning_rate": 3.1695443378925035e-06, "loss": 0.65892637, "num_input_tokens_seen": 115119090, "router_z_loss_clip": 0.9921875, "router_z_loss_mlp": 0.03530884, "step": 5358, "time_per_iteration": 3.288797616958618 }, { "auxiliary_loss_clip": 0.06488284, "auxiliary_loss_mlp": 0.01278494, "balance_loss_clip": 0.06286063, "balance_loss_mlp": 0.01262497, "epoch": 0.3222005110476477, "flos": 20163212847360.0, "grad_norm": 2.0588756767638765, "language_loss": 0.83988833, "learning_rate": 3.1692283853471777e-06, "loss": 0.91755605, "num_input_tokens_seen": 115137755, "router_z_loss_clip": 2.02148438, "router_z_loss_mlp": 0.15991211, "step": 5359, "time_per_iteration": 2.5448904037475586 }, { "auxiliary_loss_clip": 0.06491541, "auxiliary_loss_mlp": 0.01271106, "balance_loss_clip": 0.06289979, "balance_loss_mlp": 0.01254715, "epoch": 0.32226063430031565, "flos": 22680731969280.0, "grad_norm": 1.8113814191486244, "language_loss": 0.80323029, "learning_rate": 3.168912388464595e-06, "loss": 0.88085675, "num_input_tokens_seen": 115158150, "router_z_loss_clip": 2.015625, "router_z_loss_mlp": 0.16381836, "step": 5360, "time_per_iteration": 4.031552314758301 }, { "auxiliary_loss_clip": 0.06392208, "auxiliary_loss_mlp": 0.01257761, "balance_loss_clip": 0.06293321, "balance_loss_mlp": 0.01254024, "epoch": 0.3223207575529836, "flos": 63847798151040.0, "grad_norm": 0.6372361376665328, "language_loss": 0.56743455, "learning_rate": 3.168596347256737e-06, "loss": 0.64393425, "num_input_tokens_seen": 115212755, "router_z_loss_clip": 0.98828125, "router_z_loss_mlp": 0.03729248, "step": 5361, "time_per_iteration": 3.0583994388580322 }, { "auxiliary_loss_clip": 0.06487297, "auxiliary_loss_mlp": 0.01271986, "balance_loss_clip": 0.06289411, "balance_loss_mlp": 0.01256298, "epoch": 0.3223808808056516, "flos": 26877393786240.0, "grad_norm": 2.108241609655693, "language_loss": 0.7136569, "learning_rate": 3.168280261735588e-06, "loss": 0.79124975, "num_input_tokens_seen": 115233090, "router_z_loss_clip": 1.97753906, "router_z_loss_mlp": 0.15661621, "step": 5362, "time_per_iteration": 2.61380672454834 }, { "auxiliary_loss_clip": 0.06490683, "auxiliary_loss_mlp": 0.01273495, "balance_loss_clip": 0.0629278, "balance_loss_mlp": 0.01257843, "epoch": 0.32244100405831955, "flos": 26768716640640.0, "grad_norm": 1.5562619588469537, "language_loss": 0.74034494, "learning_rate": 3.167964131913135e-06, "loss": 0.81798673, "num_input_tokens_seen": 115252645, "router_z_loss_clip": 1.98046875, "router_z_loss_mlp": 0.15649414, "step": 5363, "time_per_iteration": 2.6069657802581787 }, { "auxiliary_loss_clip": 0.06495944, "auxiliary_loss_mlp": 0.01276486, "balance_loss_clip": 0.06290911, "balance_loss_mlp": 0.0125994, "epoch": 0.3225011273109875, "flos": 23809403266560.0, "grad_norm": 2.7105298894899086, "language_loss": 0.77023983, "learning_rate": 3.167647957801365e-06, "loss": 0.84796405, "num_input_tokens_seen": 115269085, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.16540527, "step": 5364, "time_per_iteration": 2.5707647800445557 }, { "auxiliary_loss_clip": 0.06490468, "auxiliary_loss_mlp": 0.01274935, "balance_loss_clip": 0.06292396, "balance_loss_mlp": 0.01258317, "epoch": 0.3225612505636555, "flos": 17280194215680.0, "grad_norm": 2.7446080251116114, "language_loss": 0.77426881, "learning_rate": 3.1673317394122672e-06, "loss": 0.85192281, "num_input_tokens_seen": 115286470, "router_z_loss_clip": 1.97753906, "router_z_loss_mlp": 0.1661377, "step": 5365, "time_per_iteration": 2.532216787338257 }, { "auxiliary_loss_clip": 0.06490287, "auxiliary_loss_mlp": 0.01273537, "balance_loss_clip": 0.0629132, "balance_loss_mlp": 0.01257348, "epoch": 0.32262137381632344, "flos": 23372724113280.0, "grad_norm": 1.5991501226244078, "language_loss": 0.7728433, "learning_rate": 3.1670154767578333e-06, "loss": 0.85048157, "num_input_tokens_seen": 115307000, "router_z_loss_clip": 1.99023438, "router_z_loss_mlp": 0.16174316, "step": 5366, "time_per_iteration": 2.5941665172576904 }, { "auxiliary_loss_clip": 0.06484606, "auxiliary_loss_mlp": 0.01270897, "balance_loss_clip": 0.06287186, "balance_loss_mlp": 0.01255769, "epoch": 0.3226814970689914, "flos": 23265598268160.0, "grad_norm": 1.750508397644075, "language_loss": 0.72377431, "learning_rate": 3.166699169850055e-06, "loss": 0.80132937, "num_input_tokens_seen": 115325925, "router_z_loss_clip": 1.97363281, "router_z_loss_mlp": 0.15148926, "step": 5367, "time_per_iteration": 2.5726606845855713 }, { "auxiliary_loss_clip": 0.06485832, "auxiliary_loss_mlp": 0.01276009, "balance_loss_clip": 0.06287904, "balance_loss_mlp": 0.01260523, "epoch": 0.32274162032165943, "flos": 16400127582720.0, "grad_norm": 1.9356152405132296, "language_loss": 0.74630272, "learning_rate": 3.1663828187009274e-06, "loss": 0.82392108, "num_input_tokens_seen": 115343705, "router_z_loss_clip": 1.97753906, "router_z_loss_mlp": 0.15472412, "step": 5368, "time_per_iteration": 2.516934871673584 }, { "auxiliary_loss_clip": 0.06486779, "auxiliary_loss_mlp": 0.01272679, "balance_loss_clip": 0.0629058, "balance_loss_mlp": 0.01257348, "epoch": 0.3228017435743274, "flos": 27862489912320.0, "grad_norm": 2.147368157561579, "language_loss": 0.78941441, "learning_rate": 3.1660664233224467e-06, "loss": 0.86700904, "num_input_tokens_seen": 115364170, "router_z_loss_clip": 1.9609375, "router_z_loss_mlp": 0.15338135, "step": 5369, "time_per_iteration": 2.679711103439331 }, { "auxiliary_loss_clip": 0.06480651, "auxiliary_loss_mlp": 0.01276092, "balance_loss_clip": 0.06288835, "balance_loss_mlp": 0.01260821, "epoch": 0.32286186682699536, "flos": 19614712020480.0, "grad_norm": 34.966487853945786, "language_loss": 0.83697599, "learning_rate": 3.16574998372661e-06, "loss": 0.91454345, "num_input_tokens_seen": 115382495, "router_z_loss_clip": 1.91894531, "router_z_loss_mlp": 0.15270996, "step": 5370, "time_per_iteration": 2.6587064266204834 }, { "auxiliary_loss_clip": 0.0648925, "auxiliary_loss_mlp": 0.01279631, "balance_loss_clip": 0.06291658, "balance_loss_mlp": 0.01264825, "epoch": 0.3229219900796633, "flos": 24140885218560.0, "grad_norm": 1.829560209174708, "language_loss": 0.82688701, "learning_rate": 3.1654334999254177e-06, "loss": 0.90457582, "num_input_tokens_seen": 115399450, "router_z_loss_clip": 1.97558594, "router_z_loss_mlp": 0.14807129, "step": 5371, "time_per_iteration": 2.5570316314697266 }, { "auxiliary_loss_clip": 0.06490419, "auxiliary_loss_mlp": 0.01273504, "balance_loss_clip": 0.06288351, "balance_loss_mlp": 0.01256493, "epoch": 0.3229821133323313, "flos": 17754454725120.0, "grad_norm": 3.0986172647419097, "language_loss": 0.89098406, "learning_rate": 3.1651169719308695e-06, "loss": 0.96862322, "num_input_tokens_seen": 115417700, "router_z_loss_clip": 2.02148438, "router_z_loss_mlp": 0.17016602, "step": 5372, "time_per_iteration": 2.534912109375 }, { "auxiliary_loss_clip": 0.06489272, "auxiliary_loss_mlp": 0.01271896, "balance_loss_clip": 0.0629008, "balance_loss_mlp": 0.01256041, "epoch": 0.32304223658499925, "flos": 22352562253440.0, "grad_norm": 1.8879591534882079, "language_loss": 0.73306894, "learning_rate": 3.1648003997549694e-06, "loss": 0.81068063, "num_input_tokens_seen": 115435840, "router_z_loss_clip": 1.99414062, "router_z_loss_mlp": 0.15856934, "step": 5373, "time_per_iteration": 2.564943790435791 }, { "auxiliary_loss_clip": 0.06487533, "auxiliary_loss_mlp": 0.01269799, "balance_loss_clip": 0.06294812, "balance_loss_mlp": 0.01254433, "epoch": 0.3231023598376672, "flos": 18484154006400.0, "grad_norm": 2.7704949970833663, "language_loss": 0.82424861, "learning_rate": 3.1644837834097214e-06, "loss": 0.90182191, "num_input_tokens_seen": 115454210, "router_z_loss_clip": 1.92578125, "router_z_loss_mlp": 0.15356445, "step": 5374, "time_per_iteration": 2.534182071685791 }, { "auxiliary_loss_clip": 0.06482772, "auxiliary_loss_mlp": 0.01272231, "balance_loss_clip": 0.06289003, "balance_loss_mlp": 0.01256829, "epoch": 0.3231624830903352, "flos": 27643710101760.0, "grad_norm": 3.363232520391793, "language_loss": 0.88042152, "learning_rate": 3.1641671229071317e-06, "loss": 0.95797157, "num_input_tokens_seen": 115471785, "router_z_loss_clip": 1.93945312, "router_z_loss_mlp": 0.15405273, "step": 5375, "time_per_iteration": 2.5873281955718994 }, { "auxiliary_loss_clip": 0.06496686, "auxiliary_loss_mlp": 0.01274543, "balance_loss_clip": 0.06293363, "balance_loss_mlp": 0.01258134, "epoch": 0.32322260634300315, "flos": 21732965637120.0, "grad_norm": 2.7092675588323347, "language_loss": 0.76358736, "learning_rate": 3.1638504182592076e-06, "loss": 0.84129971, "num_input_tokens_seen": 115491405, "router_z_loss_clip": 2.03320312, "router_z_loss_mlp": 0.16387939, "step": 5376, "time_per_iteration": 2.6215226650238037 }, { "auxiliary_loss_clip": 0.06486502, "auxiliary_loss_mlp": 0.01269824, "balance_loss_clip": 0.06288751, "balance_loss_mlp": 0.01254869, "epoch": 0.3232827295956711, "flos": 22644198789120.0, "grad_norm": 1.589679958470093, "language_loss": 0.67339951, "learning_rate": 3.1635336694779594e-06, "loss": 0.75096273, "num_input_tokens_seen": 115511555, "router_z_loss_clip": 1.97558594, "router_z_loss_mlp": 0.1494751, "step": 5377, "time_per_iteration": 2.5816874504089355 }, { "auxiliary_loss_clip": 0.06490351, "auxiliary_loss_mlp": 0.01278867, "balance_loss_clip": 0.06292447, "balance_loss_mlp": 0.01261105, "epoch": 0.3233428528483391, "flos": 26329731500160.0, "grad_norm": 1.4706815088419232, "language_loss": 0.72837478, "learning_rate": 3.1632168765753982e-06, "loss": 0.80606693, "num_input_tokens_seen": 115532860, "router_z_loss_clip": 1.97949219, "router_z_loss_mlp": 0.17749023, "step": 5378, "time_per_iteration": 2.6081690788269043 }, { "auxiliary_loss_clip": 0.06487221, "auxiliary_loss_mlp": 0.01270313, "balance_loss_clip": 0.06289586, "balance_loss_mlp": 0.01255162, "epoch": 0.32340297610100704, "flos": 28592818099200.0, "grad_norm": 2.106751045569448, "language_loss": 0.82333827, "learning_rate": 3.1629000395635357e-06, "loss": 0.9009136, "num_input_tokens_seen": 115553850, "router_z_loss_clip": 1.97265625, "router_z_loss_mlp": 0.15161133, "step": 5379, "time_per_iteration": 2.6246840953826904 }, { "auxiliary_loss_clip": 0.06494828, "auxiliary_loss_mlp": 0.012721, "balance_loss_clip": 0.06290707, "balance_loss_mlp": 0.01256495, "epoch": 0.323463099353675, "flos": 30781664380800.0, "grad_norm": 1.7121725062076332, "language_loss": 0.78592747, "learning_rate": 3.162583158454388e-06, "loss": 0.8635968, "num_input_tokens_seen": 115575530, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.15600586, "step": 5380, "time_per_iteration": 2.6731324195861816 }, { "auxiliary_loss_clip": 0.06498097, "auxiliary_loss_mlp": 0.0127384, "balance_loss_clip": 0.06297015, "balance_loss_mlp": 0.01257795, "epoch": 0.32352322260634303, "flos": 25235664739200.0, "grad_norm": 1.6020774848238288, "language_loss": 0.77378827, "learning_rate": 3.1622662332599697e-06, "loss": 0.85150766, "num_input_tokens_seen": 115594885, "router_z_loss_clip": 2.00976562, "router_z_loss_mlp": 0.16040039, "step": 5381, "time_per_iteration": 2.600559949874878 }, { "auxiliary_loss_clip": 0.06489429, "auxiliary_loss_mlp": 0.01271735, "balance_loss_clip": 0.06294742, "balance_loss_mlp": 0.01257501, "epoch": 0.323583345859011, "flos": 23337071400960.0, "grad_norm": 2.498388774385737, "language_loss": 0.72517574, "learning_rate": 3.1619492639922998e-06, "loss": 0.80278742, "num_input_tokens_seen": 115614080, "router_z_loss_clip": 1.94238281, "router_z_loss_mlp": 0.14227295, "step": 5382, "time_per_iteration": 4.117245435714722 }, { "auxiliary_loss_clip": 0.06495291, "auxiliary_loss_mlp": 0.01274019, "balance_loss_clip": 0.06291498, "balance_loss_mlp": 0.0125851, "epoch": 0.32364346911167896, "flos": 26213675195520.0, "grad_norm": 2.706734808321492, "language_loss": 0.71360135, "learning_rate": 3.1616322506633964e-06, "loss": 0.79129446, "num_input_tokens_seen": 115632820, "router_z_loss_clip": 2.0390625, "router_z_loss_mlp": 0.15490723, "step": 5383, "time_per_iteration": 2.603193521499634 }, { "auxiliary_loss_clip": 0.06486577, "auxiliary_loss_mlp": 0.01272153, "balance_loss_clip": 0.06292057, "balance_loss_mlp": 0.01257728, "epoch": 0.3237035923643469, "flos": 23702487056640.0, "grad_norm": 1.760345611129107, "language_loss": 0.78684646, "learning_rate": 3.161315193285283e-06, "loss": 0.86443377, "num_input_tokens_seen": 115652860, "router_z_loss_clip": 1.9453125, "router_z_loss_mlp": 0.14428711, "step": 5384, "time_per_iteration": 2.560779333114624 }, { "auxiliary_loss_clip": 0.06495126, "auxiliary_loss_mlp": 0.01272567, "balance_loss_clip": 0.06291028, "balance_loss_mlp": 0.01256128, "epoch": 0.3237637156170149, "flos": 14433960326400.0, "grad_norm": 2.538123602276641, "language_loss": 0.75143558, "learning_rate": 3.16099809186998e-06, "loss": 0.82911253, "num_input_tokens_seen": 115670940, "router_z_loss_clip": 2.03808594, "router_z_loss_mlp": 0.16418457, "step": 5385, "time_per_iteration": 2.5338916778564453 }, { "auxiliary_loss_clip": 0.06485957, "auxiliary_loss_mlp": 0.01272408, "balance_loss_clip": 0.06288656, "balance_loss_mlp": 0.0125762, "epoch": 0.32382383886968286, "flos": 31070449877760.0, "grad_norm": 1.746775817739213, "language_loss": 0.72012568, "learning_rate": 3.1606809464295145e-06, "loss": 0.79770935, "num_input_tokens_seen": 115691155, "router_z_loss_clip": 1.97167969, "router_z_loss_mlp": 0.14788818, "step": 5386, "time_per_iteration": 2.6327197551727295 }, { "auxiliary_loss_clip": 0.06491455, "auxiliary_loss_mlp": 0.0127383, "balance_loss_clip": 0.06288354, "balance_loss_mlp": 0.012572, "epoch": 0.3238839621223508, "flos": 23263418062080.0, "grad_norm": 2.2463105269259933, "language_loss": 0.94592416, "learning_rate": 3.1603637569759095e-06, "loss": 1.02357697, "num_input_tokens_seen": 115710340, "router_z_loss_clip": 2.03222656, "router_z_loss_mlp": 0.16625977, "step": 5387, "time_per_iteration": 2.5841424465179443 }, { "auxiliary_loss_clip": 0.0649564, "auxiliary_loss_mlp": 0.01285542, "balance_loss_clip": 0.06291422, "balance_loss_mlp": 0.01269639, "epoch": 0.3239440853750188, "flos": 22971026839680.0, "grad_norm": 2.8385170270725943, "language_loss": 0.77435553, "learning_rate": 3.1600465235211956e-06, "loss": 0.85216737, "num_input_tokens_seen": 115726745, "router_z_loss_clip": 2.04199219, "router_z_loss_mlp": 0.15881348, "step": 5388, "time_per_iteration": 4.076035499572754 }, { "auxiliary_loss_clip": 0.06490723, "auxiliary_loss_mlp": 0.0127099, "balance_loss_clip": 0.06290822, "balance_loss_mlp": 0.0125504, "epoch": 0.32400420862768675, "flos": 36255394275840.0, "grad_norm": 2.0329259041554533, "language_loss": 0.72276616, "learning_rate": 3.1597292460774006e-06, "loss": 0.80038333, "num_input_tokens_seen": 115749385, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.15942383, "step": 5389, "time_per_iteration": 2.7462527751922607 }, { "auxiliary_loss_clip": 0.06489128, "auxiliary_loss_mlp": 0.01272649, "balance_loss_clip": 0.06292389, "balance_loss_mlp": 0.01257033, "epoch": 0.3240643318803547, "flos": 21622946826240.0, "grad_norm": 1.7385870186886254, "language_loss": 0.81435275, "learning_rate": 3.159411924656557e-06, "loss": 0.89197052, "num_input_tokens_seen": 115768105, "router_z_loss_clip": 1.96679688, "router_z_loss_mlp": 0.15600586, "step": 5390, "time_per_iteration": 2.577305793762207 }, { "auxiliary_loss_clip": 0.06496645, "auxiliary_loss_mlp": 0.01270808, "balance_loss_clip": 0.06296211, "balance_loss_mlp": 0.01255024, "epoch": 0.3241244551330227, "flos": 23302466864640.0, "grad_norm": 2.037576065119124, "language_loss": 0.73837668, "learning_rate": 3.1590945592706967e-06, "loss": 0.81605119, "num_input_tokens_seen": 115787340, "router_z_loss_clip": 2.00292969, "router_z_loss_mlp": 0.15789795, "step": 5391, "time_per_iteration": 2.596470355987549 }, { "auxiliary_loss_clip": 0.0648735, "auxiliary_loss_mlp": 0.01276084, "balance_loss_clip": 0.06288108, "balance_loss_mlp": 0.01260492, "epoch": 0.32418457838569065, "flos": 14101891395840.0, "grad_norm": 1.8461355769350432, "language_loss": 0.77681732, "learning_rate": 3.158777149931855e-06, "loss": 0.85445166, "num_input_tokens_seen": 115805565, "router_z_loss_clip": 1.9921875, "router_z_loss_mlp": 0.15588379, "step": 5392, "time_per_iteration": 2.570359945297241 }, { "auxiliary_loss_clip": 0.06491743, "auxiliary_loss_mlp": 0.01270226, "balance_loss_clip": 0.06287159, "balance_loss_mlp": 0.01253966, "epoch": 0.3242447016383586, "flos": 29760454344960.0, "grad_norm": 2.025565403421961, "language_loss": 0.62904209, "learning_rate": 3.158459696652067e-06, "loss": 0.70666182, "num_input_tokens_seen": 115826725, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.16271973, "step": 5393, "time_per_iteration": 2.6271779537200928 }, { "auxiliary_loss_clip": 0.06487583, "auxiliary_loss_mlp": 0.01272637, "balance_loss_clip": 0.06287952, "balance_loss_mlp": 0.01257557, "epoch": 0.3243048248910266, "flos": 24357820239360.0, "grad_norm": 1.5385279244380408, "language_loss": 0.83198112, "learning_rate": 3.158142199443371e-06, "loss": 0.90958333, "num_input_tokens_seen": 115846955, "router_z_loss_clip": 1.99707031, "router_z_loss_mlp": 0.15087891, "step": 5394, "time_per_iteration": 2.607238531112671 }, { "auxiliary_loss_clip": 0.06485546, "auxiliary_loss_mlp": 0.01277194, "balance_loss_clip": 0.06290873, "balance_loss_mlp": 0.01263354, "epoch": 0.3243649481436946, "flos": 24359958518400.0, "grad_norm": 2.107623181868041, "language_loss": 0.82352972, "learning_rate": 3.1578246583178076e-06, "loss": 0.90115714, "num_input_tokens_seen": 115865975, "router_z_loss_clip": 1.94628906, "router_z_loss_mlp": 0.13830566, "step": 5395, "time_per_iteration": 4.035175800323486 }, { "auxiliary_loss_clip": 0.06483047, "auxiliary_loss_mlp": 0.01271283, "balance_loss_clip": 0.06291049, "balance_loss_mlp": 0.01257335, "epoch": 0.32442507139636256, "flos": 22931097569280.0, "grad_norm": 2.32147322954055, "language_loss": 0.83751935, "learning_rate": 3.157507073287417e-06, "loss": 0.91506267, "num_input_tokens_seen": 115884950, "router_z_loss_clip": 1.92089844, "router_z_loss_mlp": 0.13934326, "step": 5396, "time_per_iteration": 2.5818865299224854 }, { "auxiliary_loss_clip": 0.06496625, "auxiliary_loss_mlp": 0.0127417, "balance_loss_clip": 0.06291215, "balance_loss_mlp": 0.01257815, "epoch": 0.32448519464903053, "flos": 22206723022080.0, "grad_norm": 2.601829854254763, "language_loss": 0.76569247, "learning_rate": 3.1571894443642414e-06, "loss": 0.84340048, "num_input_tokens_seen": 115904170, "router_z_loss_clip": 2.05566406, "router_z_loss_mlp": 0.16369629, "step": 5397, "time_per_iteration": 2.614325523376465 }, { "auxiliary_loss_clip": 0.06485003, "auxiliary_loss_mlp": 0.01271438, "balance_loss_clip": 0.06290074, "balance_loss_mlp": 0.01256203, "epoch": 0.3245453179016985, "flos": 18843574095360.0, "grad_norm": 2.365679221512261, "language_loss": 0.6797967, "learning_rate": 3.1568717715603263e-06, "loss": 0.75736111, "num_input_tokens_seen": 115919255, "router_z_loss_clip": 1.94824219, "router_z_loss_mlp": 0.15246582, "step": 5398, "time_per_iteration": 2.525110960006714 }, { "auxiliary_loss_clip": 0.06484282, "auxiliary_loss_mlp": 0.01270203, "balance_loss_clip": 0.06286627, "balance_loss_mlp": 0.0125473, "epoch": 0.32460544115436646, "flos": 21184716372480.0, "grad_norm": 1.7939693335801539, "language_loss": 0.73400432, "learning_rate": 3.156554054887718e-06, "loss": 0.81154919, "num_input_tokens_seen": 115938535, "router_z_loss_clip": 1.97558594, "router_z_loss_mlp": 0.15478516, "step": 5399, "time_per_iteration": 4.209081649780273 }, { "auxiliary_loss_clip": 0.06491473, "auxiliary_loss_mlp": 0.01279663, "balance_loss_clip": 0.06291476, "balance_loss_mlp": 0.01263999, "epoch": 0.3246655644070344, "flos": 21987607795200.0, "grad_norm": 2.056755824779651, "language_loss": 0.71773112, "learning_rate": 3.1562362943584645e-06, "loss": 0.79544246, "num_input_tokens_seen": 115955005, "router_z_loss_clip": 2.00097656, "router_z_loss_mlp": 0.15661621, "step": 5400, "time_per_iteration": 2.574235677719116 }, { "auxiliary_loss_clip": 0.06489132, "auxiliary_loss_mlp": 0.01272303, "balance_loss_clip": 0.06287336, "balance_loss_mlp": 0.01257199, "epoch": 0.3247256876597024, "flos": 32167745020800.0, "grad_norm": 1.9865763166591928, "language_loss": 0.80268687, "learning_rate": 3.155918489984614e-06, "loss": 0.88030124, "num_input_tokens_seen": 115975305, "router_z_loss_clip": 2.01953125, "router_z_loss_mlp": 0.15100098, "step": 5401, "time_per_iteration": 2.6415586471557617 }, { "auxiliary_loss_clip": 0.06488077, "auxiliary_loss_mlp": 0.01269987, "balance_loss_clip": 0.0628669, "balance_loss_mlp": 0.01253882, "epoch": 0.32478581091237035, "flos": 21004104896640.0, "grad_norm": 1.4338152814163108, "language_loss": 0.8798188, "learning_rate": 3.1556006417782196e-06, "loss": 0.95739949, "num_input_tokens_seen": 115994810, "router_z_loss_clip": 2.01464844, "router_z_loss_mlp": 0.16113281, "step": 5402, "time_per_iteration": 2.5542681217193604 }, { "auxiliary_loss_clip": 0.06488344, "auxiliary_loss_mlp": 0.01269229, "balance_loss_clip": 0.06291866, "balance_loss_mlp": 0.01254649, "epoch": 0.3248459341650383, "flos": 17929741466880.0, "grad_norm": 2.4785270080782147, "language_loss": 0.8505193, "learning_rate": 3.155282749751332e-06, "loss": 0.92809498, "num_input_tokens_seen": 116011095, "router_z_loss_clip": 1.96582031, "router_z_loss_mlp": 0.14562988, "step": 5403, "time_per_iteration": 2.52886700630188 }, { "auxiliary_loss_clip": 0.0648068, "auxiliary_loss_mlp": 0.01266891, "balance_loss_clip": 0.06289534, "balance_loss_mlp": 0.01253599, "epoch": 0.3249060574177063, "flos": 24542582492160.0, "grad_norm": 2.191175510072926, "language_loss": 0.87868679, "learning_rate": 3.154964813916007e-06, "loss": 0.95616251, "num_input_tokens_seen": 116028805, "router_z_loss_clip": 1.91113281, "router_z_loss_mlp": 0.13293457, "step": 5404, "time_per_iteration": 2.5950825214385986 }, { "auxiliary_loss_clip": 0.06480138, "auxiliary_loss_mlp": 0.01270302, "balance_loss_clip": 0.06285375, "balance_loss_mlp": 0.01255282, "epoch": 0.32496618067037425, "flos": 26001939127680.0, "grad_norm": 6.082564506885314, "language_loss": 0.73350275, "learning_rate": 3.1546468342843008e-06, "loss": 0.81100714, "num_input_tokens_seen": 116047765, "router_z_loss_clip": 1.94824219, "router_z_loss_mlp": 0.15014648, "step": 5405, "time_per_iteration": 2.5901973247528076 }, { "auxiliary_loss_clip": 0.06484902, "auxiliary_loss_mlp": 0.01272094, "balance_loss_clip": 0.06288603, "balance_loss_mlp": 0.01257532, "epoch": 0.3250263039230422, "flos": 19579939776000.0, "grad_norm": 2.459516897895449, "language_loss": 0.83579355, "learning_rate": 3.1543288108682707e-06, "loss": 0.91336352, "num_input_tokens_seen": 116068385, "router_z_loss_clip": 1.9609375, "router_z_loss_mlp": 0.14569092, "step": 5406, "time_per_iteration": 2.592090368270874 }, { "auxiliary_loss_clip": 0.06484151, "auxiliary_loss_mlp": 0.01270465, "balance_loss_clip": 0.06291336, "balance_loss_mlp": 0.01255659, "epoch": 0.3250864271757102, "flos": 16769232817920.0, "grad_norm": 2.157092277577217, "language_loss": 0.87737483, "learning_rate": 3.1540107436799764e-06, "loss": 0.95492101, "num_input_tokens_seen": 116085350, "router_z_loss_clip": 1.92773438, "router_z_loss_mlp": 0.14794922, "step": 5407, "time_per_iteration": 2.540668487548828 }, { "auxiliary_loss_clip": 0.06483753, "auxiliary_loss_mlp": 0.01271757, "balance_loss_clip": 0.06287873, "balance_loss_mlp": 0.012562, "epoch": 0.3251465504283782, "flos": 27827004908160.0, "grad_norm": 1.3681079748089715, "language_loss": 0.70079124, "learning_rate": 3.153692632731479e-06, "loss": 0.77834636, "num_input_tokens_seen": 116107560, "router_z_loss_clip": 1.95703125, "router_z_loss_mlp": 0.15576172, "step": 5408, "time_per_iteration": 2.644556760787964 }, { "auxiliary_loss_clip": 0.06494744, "auxiliary_loss_mlp": 0.01273261, "balance_loss_clip": 0.06288674, "balance_loss_mlp": 0.01257269, "epoch": 0.32520667368104617, "flos": 19069271867520.0, "grad_norm": 3.066153728078198, "language_loss": 0.78007358, "learning_rate": 3.153374478034841e-06, "loss": 0.85775363, "num_input_tokens_seen": 116125980, "router_z_loss_clip": 2.05761719, "router_z_loss_mlp": 0.15985107, "step": 5409, "time_per_iteration": 2.5805015563964844 }, { "auxiliary_loss_clip": 0.06493719, "auxiliary_loss_mlp": 0.01270287, "balance_loss_clip": 0.06292427, "balance_loss_mlp": 0.01254408, "epoch": 0.32526679693371413, "flos": 29388917341440.0, "grad_norm": 1.9687010199333992, "language_loss": 0.83551347, "learning_rate": 3.1530562796021285e-06, "loss": 0.91315353, "num_input_tokens_seen": 116146530, "router_z_loss_clip": 2.01367188, "router_z_loss_mlp": 0.15893555, "step": 5410, "time_per_iteration": 2.648413896560669 }, { "auxiliary_loss_clip": 0.06478699, "auxiliary_loss_mlp": 0.01270236, "balance_loss_clip": 0.06288213, "balance_loss_mlp": 0.01256271, "epoch": 0.3253269201863821, "flos": 20710833206400.0, "grad_norm": 1.5906805148395382, "language_loss": 0.71712148, "learning_rate": 3.152738037445405e-06, "loss": 0.79461086, "num_input_tokens_seen": 116165695, "router_z_loss_clip": 1.90722656, "router_z_loss_mlp": 0.13970947, "step": 5411, "time_per_iteration": 2.5771279335021973 }, { "auxiliary_loss_clip": 0.06489253, "auxiliary_loss_mlp": 0.01274828, "balance_loss_clip": 0.06289834, "balance_loss_mlp": 0.01259534, "epoch": 0.32538704343905006, "flos": 29101515436800.0, "grad_norm": 1.6350561354682327, "language_loss": 0.83568752, "learning_rate": 3.1524197515767403e-06, "loss": 0.91332841, "num_input_tokens_seen": 116185375, "router_z_loss_clip": 1.99707031, "router_z_loss_mlp": 0.15283203, "step": 5412, "time_per_iteration": 2.595197916030884 }, { "auxiliary_loss_clip": 0.06490765, "auxiliary_loss_mlp": 0.01271713, "balance_loss_clip": 0.06289877, "balance_loss_mlp": 0.01255012, "epoch": 0.325447166691718, "flos": 24682216521600.0, "grad_norm": 1.671640814068554, "language_loss": 0.81247878, "learning_rate": 3.152101422008203e-06, "loss": 0.89010358, "num_input_tokens_seen": 116204335, "router_z_loss_clip": 2.00976562, "router_z_loss_mlp": 0.16711426, "step": 5413, "time_per_iteration": 2.5683655738830566 }, { "auxiliary_loss_clip": 0.06486918, "auxiliary_loss_mlp": 0.01271243, "balance_loss_clip": 0.0628791, "balance_loss_mlp": 0.01255197, "epoch": 0.325507289944386, "flos": 21549503122560.0, "grad_norm": 1.654229271612235, "language_loss": 0.77195996, "learning_rate": 3.151783048751864e-06, "loss": 0.84954154, "num_input_tokens_seen": 116222840, "router_z_loss_clip": 1.99023438, "router_z_loss_mlp": 0.16040039, "step": 5414, "time_per_iteration": 2.560828447341919 }, { "auxiliary_loss_clip": 0.06377231, "auxiliary_loss_mlp": 0.01261099, "balance_loss_clip": 0.06279758, "balance_loss_mlp": 0.01257487, "epoch": 0.32556741319705396, "flos": 71537893194240.0, "grad_norm": 0.9019488101464784, "language_loss": 0.63982713, "learning_rate": 3.1514646318197965e-06, "loss": 0.71621037, "num_input_tokens_seen": 116274940, "router_z_loss_clip": 0.9765625, "router_z_loss_mlp": 0.03607178, "step": 5415, "time_per_iteration": 3.1160097122192383 }, { "auxiliary_loss_clip": 0.06487516, "auxiliary_loss_mlp": 0.01271484, "balance_loss_clip": 0.06290174, "balance_loss_mlp": 0.01256368, "epoch": 0.3256275364497219, "flos": 23739187944960.0, "grad_norm": 1.372911682623873, "language_loss": 0.74506074, "learning_rate": 3.151146171224075e-06, "loss": 0.82265073, "num_input_tokens_seen": 116297300, "router_z_loss_clip": 1.97167969, "router_z_loss_mlp": 0.15124512, "step": 5416, "time_per_iteration": 2.610266923904419 }, { "auxiliary_loss_clip": 0.0637982, "auxiliary_loss_mlp": 0.0125826, "balance_loss_clip": 0.06283145, "balance_loss_mlp": 0.01254821, "epoch": 0.3256876597023899, "flos": 67308136214400.0, "grad_norm": 0.7568411577198375, "language_loss": 0.57793921, "learning_rate": 3.1508276669767757e-06, "loss": 0.65432, "num_input_tokens_seen": 116362370, "router_z_loss_clip": 0.96679688, "router_z_loss_mlp": 0.03448486, "step": 5417, "time_per_iteration": 3.351712226867676 }, { "auxiliary_loss_clip": 0.06381755, "auxiliary_loss_mlp": 0.01256566, "balance_loss_clip": 0.06285089, "balance_loss_mlp": 0.01253342, "epoch": 0.32574778295505785, "flos": 71304633826560.0, "grad_norm": 0.8242852793060244, "language_loss": 0.63521075, "learning_rate": 3.150509119089975e-06, "loss": 0.71159399, "num_input_tokens_seen": 116430365, "router_z_loss_clip": 0.96679688, "router_z_loss_mlp": 0.03225708, "step": 5418, "time_per_iteration": 3.3269917964935303 }, { "auxiliary_loss_clip": 0.06487449, "auxiliary_loss_mlp": 0.01272063, "balance_loss_clip": 0.06293248, "balance_loss_mlp": 0.01257466, "epoch": 0.3258079062077258, "flos": 20782515974400.0, "grad_norm": 18.924622536858806, "language_loss": 0.69721842, "learning_rate": 3.1501905275757537e-06, "loss": 0.77481353, "num_input_tokens_seen": 116447525, "router_z_loss_clip": 1.94335938, "router_z_loss_mlp": 0.14593506, "step": 5419, "time_per_iteration": 2.550936460494995 }, { "auxiliary_loss_clip": 0.06490967, "auxiliary_loss_mlp": 0.01271981, "balance_loss_clip": 0.0629198, "balance_loss_mlp": 0.0125683, "epoch": 0.3258680294603938, "flos": 22241788755840.0, "grad_norm": 1.6884087321736463, "language_loss": 0.77460229, "learning_rate": 3.1498718924461926e-06, "loss": 0.85223174, "num_input_tokens_seen": 116466310, "router_z_loss_clip": 1.99023438, "router_z_loss_mlp": 0.15148926, "step": 5420, "time_per_iteration": 2.5697579383850098 }, { "auxiliary_loss_clip": 0.06489342, "auxiliary_loss_mlp": 0.01270236, "balance_loss_clip": 0.06290048, "balance_loss_mlp": 0.01255561, "epoch": 0.3259281527130618, "flos": 26987328743040.0, "grad_norm": 1.586185127332399, "language_loss": 0.80572414, "learning_rate": 3.1495532137133736e-06, "loss": 0.88331991, "num_input_tokens_seen": 116487825, "router_z_loss_clip": 1.9921875, "router_z_loss_mlp": 0.14685059, "step": 5421, "time_per_iteration": 2.6216232776641846 }, { "auxiliary_loss_clip": 0.06485315, "auxiliary_loss_mlp": 0.01268559, "balance_loss_clip": 0.06292526, "balance_loss_mlp": 0.01254301, "epoch": 0.32598827596572977, "flos": 26221557479040.0, "grad_norm": 1.4679763133166306, "language_loss": 0.75701976, "learning_rate": 3.149234491389381e-06, "loss": 0.83455849, "num_input_tokens_seen": 116509950, "router_z_loss_clip": 1.92871094, "router_z_loss_mlp": 0.14282227, "step": 5422, "time_per_iteration": 4.055182933807373 }, { "auxiliary_loss_clip": 0.06494369, "auxiliary_loss_mlp": 0.01270619, "balance_loss_clip": 0.06295691, "balance_loss_mlp": 0.01255634, "epoch": 0.32604839921839773, "flos": 17645567944320.0, "grad_norm": 2.2815734397351126, "language_loss": 0.63457257, "learning_rate": 3.1489157254863026e-06, "loss": 0.71222246, "num_input_tokens_seen": 116527695, "router_z_loss_clip": 1.98339844, "router_z_loss_mlp": 0.14984131, "step": 5423, "time_per_iteration": 2.551487445831299 }, { "auxiliary_loss_clip": 0.06480994, "auxiliary_loss_mlp": 0.01272595, "balance_loss_clip": 0.06291488, "balance_loss_mlp": 0.01258243, "epoch": 0.3261085224710657, "flos": 23629420696320.0, "grad_norm": 1.5268669290567316, "language_loss": 0.74724859, "learning_rate": 3.148596916016224e-06, "loss": 0.82478446, "num_input_tokens_seen": 116547800, "router_z_loss_clip": 1.89257812, "router_z_loss_mlp": 0.14355469, "step": 5424, "time_per_iteration": 2.61383056640625 }, { "auxiliary_loss_clip": 0.06487858, "auxiliary_loss_mlp": 0.01275268, "balance_loss_clip": 0.06296285, "balance_loss_mlp": 0.01259383, "epoch": 0.32616864572373366, "flos": 23267526912000.0, "grad_norm": 1.846426000156746, "language_loss": 0.77125174, "learning_rate": 3.1482780629912355e-06, "loss": 0.84888297, "num_input_tokens_seen": 116568460, "router_z_loss_clip": 1.91503906, "router_z_loss_mlp": 0.15887451, "step": 5425, "time_per_iteration": 2.6107797622680664 }, { "auxiliary_loss_clip": 0.06499609, "auxiliary_loss_mlp": 0.01278824, "balance_loss_clip": 0.06296837, "balance_loss_mlp": 0.012626, "epoch": 0.32622876897640163, "flos": 25600535343360.0, "grad_norm": 2.6288721140827214, "language_loss": 0.78745341, "learning_rate": 3.147959166423428e-06, "loss": 0.86523771, "num_input_tokens_seen": 116588705, "router_z_loss_clip": 2.02636719, "router_z_loss_mlp": 0.16210938, "step": 5426, "time_per_iteration": 2.6149001121520996 }, { "auxiliary_loss_clip": 0.06493977, "auxiliary_loss_mlp": 0.01275584, "balance_loss_clip": 0.06296798, "balance_loss_mlp": 0.01259598, "epoch": 0.3262888922290696, "flos": 22425544759680.0, "grad_norm": 2.0107161462325087, "language_loss": 0.75129211, "learning_rate": 3.147640226324893e-06, "loss": 0.82898772, "num_input_tokens_seen": 116608845, "router_z_loss_clip": 1.97363281, "router_z_loss_mlp": 0.15991211, "step": 5427, "time_per_iteration": 2.5856926441192627 }, { "auxiliary_loss_clip": 0.06493221, "auxiliary_loss_mlp": 0.01271941, "balance_loss_clip": 0.06294658, "balance_loss_mlp": 0.01256468, "epoch": 0.32634901548173756, "flos": 19724982393600.0, "grad_norm": 1.6504931343994955, "language_loss": 0.79415727, "learning_rate": 3.1473212427077266e-06, "loss": 0.87180889, "num_input_tokens_seen": 116628145, "router_z_loss_clip": 1.98339844, "router_z_loss_mlp": 0.15472412, "step": 5428, "time_per_iteration": 3.9819934368133545 }, { "auxiliary_loss_clip": 0.06493106, "auxiliary_loss_mlp": 0.01270768, "balance_loss_clip": 0.06296739, "balance_loss_mlp": 0.01255295, "epoch": 0.3264091387344055, "flos": 16148336463360.0, "grad_norm": 1.6396900099470644, "language_loss": 0.72048151, "learning_rate": 3.147002215584023e-06, "loss": 0.79812026, "num_input_tokens_seen": 116646920, "router_z_loss_clip": 1.96289062, "router_z_loss_mlp": 0.15478516, "step": 5429, "time_per_iteration": 2.5445847511291504 }, { "auxiliary_loss_clip": 0.06492087, "auxiliary_loss_mlp": 0.01271298, "balance_loss_clip": 0.06297147, "balance_loss_mlp": 0.01255908, "epoch": 0.3264692619870735, "flos": 16404655703040.0, "grad_norm": 1.619529175800295, "language_loss": 0.78738737, "learning_rate": 3.146683144965881e-06, "loss": 0.86502123, "num_input_tokens_seen": 116665100, "router_z_loss_clip": 1.95019531, "router_z_loss_mlp": 0.15393066, "step": 5430, "time_per_iteration": 2.5270183086395264 }, { "auxiliary_loss_clip": 0.06492689, "auxiliary_loss_mlp": 0.01273702, "balance_loss_clip": 0.06294681, "balance_loss_mlp": 0.01256834, "epoch": 0.32652938523974145, "flos": 22388843871360.0, "grad_norm": 1.8538733298422232, "language_loss": 0.84631193, "learning_rate": 3.146364030865399e-06, "loss": 0.92397588, "num_input_tokens_seen": 116682205, "router_z_loss_clip": 1.98242188, "router_z_loss_mlp": 0.16870117, "step": 5431, "time_per_iteration": 2.547144889831543 }, { "auxiliary_loss_clip": 0.06488679, "auxiliary_loss_mlp": 0.01270046, "balance_loss_clip": 0.06296249, "balance_loss_mlp": 0.01255252, "epoch": 0.3265895084924094, "flos": 21914499507840.0, "grad_norm": 1.6004715688292106, "language_loss": 0.70860064, "learning_rate": 3.146044873294678e-06, "loss": 0.78618789, "num_input_tokens_seen": 116702575, "router_z_loss_clip": 1.92285156, "router_z_loss_mlp": 0.14770508, "step": 5432, "time_per_iteration": 2.574702262878418 }, { "auxiliary_loss_clip": 0.06492498, "auxiliary_loss_mlp": 0.0126947, "balance_loss_clip": 0.06295787, "balance_loss_mlp": 0.01255165, "epoch": 0.3266496317450774, "flos": 16072083648000.0, "grad_norm": 1.3923545251503688, "language_loss": 0.84589535, "learning_rate": 3.1457256722658203e-06, "loss": 0.92351502, "num_input_tokens_seen": 116720885, "router_z_loss_clip": 1.96679688, "router_z_loss_mlp": 0.14300537, "step": 5433, "time_per_iteration": 2.5399022102355957 }, { "auxiliary_loss_clip": 0.06481928, "auxiliary_loss_mlp": 0.01271891, "balance_loss_clip": 0.06290705, "balance_loss_mlp": 0.01256656, "epoch": 0.3267097549977454, "flos": 22534766956800.0, "grad_norm": 1.4933144712288453, "language_loss": 0.8615824, "learning_rate": 3.145406427790931e-06, "loss": 0.93912053, "num_input_tokens_seen": 116740395, "router_z_loss_clip": 1.91113281, "router_z_loss_mlp": 0.15234375, "step": 5434, "time_per_iteration": 2.5699269771575928 }, { "auxiliary_loss_clip": 0.06491546, "auxiliary_loss_mlp": 0.01272896, "balance_loss_clip": 0.06292294, "balance_loss_mlp": 0.01256148, "epoch": 0.32676987825041337, "flos": 27277581686400.0, "grad_norm": 1.994692037180079, "language_loss": 0.87996674, "learning_rate": 3.1450871398821147e-06, "loss": 0.95761108, "num_input_tokens_seen": 116758870, "router_z_loss_clip": 1.99121094, "router_z_loss_mlp": 0.16748047, "step": 5435, "time_per_iteration": 4.114036560058594 }, { "auxiliary_loss_clip": 0.0648624, "auxiliary_loss_mlp": 0.01270534, "balance_loss_clip": 0.06290211, "balance_loss_mlp": 0.01254763, "epoch": 0.32683000150308134, "flos": 11512731432960.0, "grad_norm": 2.9553019785650205, "language_loss": 0.76539564, "learning_rate": 3.144767808551479e-06, "loss": 0.84296334, "num_input_tokens_seen": 116773440, "router_z_loss_clip": 1.95800781, "router_z_loss_mlp": 0.15771484, "step": 5436, "time_per_iteration": 2.572664737701416 }, { "auxiliary_loss_clip": 0.06482436, "auxiliary_loss_mlp": 0.01271914, "balance_loss_clip": 0.06288335, "balance_loss_mlp": 0.01256804, "epoch": 0.3268901247557493, "flos": 25637362012800.0, "grad_norm": 1.617632556831747, "language_loss": 0.72212803, "learning_rate": 3.144448433811134e-06, "loss": 0.79967153, "num_input_tokens_seen": 116794375, "router_z_loss_clip": 1.93945312, "router_z_loss_mlp": 0.15124512, "step": 5437, "time_per_iteration": 2.627671480178833 }, { "auxiliary_loss_clip": 0.0648898, "auxiliary_loss_mlp": 0.01278173, "balance_loss_clip": 0.06286566, "balance_loss_mlp": 0.01260768, "epoch": 0.32695024800841727, "flos": 24867356117760.0, "grad_norm": 3.0220465626656217, "language_loss": 0.64180595, "learning_rate": 3.144129015673189e-06, "loss": 0.71947747, "num_input_tokens_seen": 116815095, "router_z_loss_clip": 2.02636719, "router_z_loss_mlp": 0.1739502, "step": 5438, "time_per_iteration": 2.608703374862671 }, { "auxiliary_loss_clip": 0.06478971, "auxiliary_loss_mlp": 0.01269323, "balance_loss_clip": 0.06285595, "balance_loss_mlp": 0.01253635, "epoch": 0.32701037126108523, "flos": 28846663643520.0, "grad_norm": 1.7930483030979885, "language_loss": 0.74387521, "learning_rate": 3.1438095541497576e-06, "loss": 0.8213582, "num_input_tokens_seen": 116836630, "router_z_loss_clip": 1.93554688, "router_z_loss_mlp": 0.15673828, "step": 5439, "time_per_iteration": 4.08735466003418 }, { "auxiliary_loss_clip": 0.06492645, "auxiliary_loss_mlp": 0.01272711, "balance_loss_clip": 0.06294644, "balance_loss_mlp": 0.01256677, "epoch": 0.3270704945137532, "flos": 27972592577280.0, "grad_norm": 1.8516812762186112, "language_loss": 0.74863708, "learning_rate": 3.1434900492529527e-06, "loss": 0.82629067, "num_input_tokens_seen": 116856880, "router_z_loss_clip": 1.98144531, "router_z_loss_mlp": 0.16027832, "step": 5440, "time_per_iteration": 2.606727123260498 }, { "auxiliary_loss_clip": 0.0647731, "auxiliary_loss_mlp": 0.0127373, "balance_loss_clip": 0.06283256, "balance_loss_mlp": 0.01258162, "epoch": 0.32713061776642116, "flos": 23696575344000.0, "grad_norm": 2.218284420594085, "language_loss": 0.84818113, "learning_rate": 3.1431705009948914e-06, "loss": 0.92569155, "num_input_tokens_seen": 116873770, "router_z_loss_clip": 1.94042969, "router_z_loss_mlp": 0.15588379, "step": 5441, "time_per_iteration": 2.55741286277771 }, { "auxiliary_loss_clip": 0.06486515, "auxiliary_loss_mlp": 0.01271936, "balance_loss_clip": 0.06288588, "balance_loss_mlp": 0.012557, "epoch": 0.3271907410190891, "flos": 22462203720960.0, "grad_norm": 2.0443445132894382, "language_loss": 0.87114489, "learning_rate": 3.1428509093876897e-06, "loss": 0.9487294, "num_input_tokens_seen": 116891225, "router_z_loss_clip": 1.97949219, "router_z_loss_mlp": 0.16247559, "step": 5442, "time_per_iteration": 2.5421361923217773 }, { "auxiliary_loss_clip": 0.06485112, "auxiliary_loss_mlp": 0.01269174, "balance_loss_clip": 0.06284258, "balance_loss_mlp": 0.01253331, "epoch": 0.3272508642717571, "flos": 22826696981760.0, "grad_norm": 1.8838791124321657, "language_loss": 0.77636182, "learning_rate": 3.1425312744434668e-06, "loss": 0.8539046, "num_input_tokens_seen": 116912300, "router_z_loss_clip": 2.00976562, "router_z_loss_mlp": 0.1583252, "step": 5443, "time_per_iteration": 2.577070951461792 }, { "auxiliary_loss_clip": 0.06485599, "auxiliary_loss_mlp": 0.0127183, "balance_loss_clip": 0.06286502, "balance_loss_mlp": 0.01255463, "epoch": 0.32731098752442506, "flos": 11806086977280.0, "grad_norm": 2.4800969265354724, "language_loss": 0.82687014, "learning_rate": 3.142211596174343e-06, "loss": 0.90444446, "num_input_tokens_seen": 116929425, "router_z_loss_clip": 1.99316406, "router_z_loss_mlp": 0.16357422, "step": 5444, "time_per_iteration": 2.5197269916534424 }, { "auxiliary_loss_clip": 0.06489295, "auxiliary_loss_mlp": 0.01271603, "balance_loss_clip": 0.06291422, "balance_loss_mlp": 0.01254866, "epoch": 0.327371110777093, "flos": 21033300844800.0, "grad_norm": 2.194909591194618, "language_loss": 0.58584654, "learning_rate": 3.1418918745924423e-06, "loss": 0.66345555, "num_input_tokens_seen": 116948255, "router_z_loss_clip": 1.97851562, "router_z_loss_mlp": 0.16748047, "step": 5445, "time_per_iteration": 2.549193859100342 }, { "auxiliary_loss_clip": 0.06487465, "auxiliary_loss_mlp": 0.01277493, "balance_loss_clip": 0.06289601, "balance_loss_mlp": 0.0126066, "epoch": 0.327431234029761, "flos": 19068055983360.0, "grad_norm": 2.044830987589755, "language_loss": 0.88876832, "learning_rate": 3.1415721097098865e-06, "loss": 0.96641785, "num_input_tokens_seen": 116964905, "router_z_loss_clip": 1.97949219, "router_z_loss_mlp": 0.16845703, "step": 5446, "time_per_iteration": 2.5458261966705322 }, { "auxiliary_loss_clip": 0.06494232, "auxiliary_loss_mlp": 0.01276551, "balance_loss_clip": 0.06288005, "balance_loss_mlp": 0.01257728, "epoch": 0.32749135728242895, "flos": 25856435312640.0, "grad_norm": 1.52587057753421, "language_loss": 0.79560483, "learning_rate": 3.141252301538802e-06, "loss": 0.87331271, "num_input_tokens_seen": 116983650, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.18811035, "step": 5447, "time_per_iteration": 2.5777132511138916 }, { "auxiliary_loss_clip": 0.06479777, "auxiliary_loss_mlp": 0.01274231, "balance_loss_clip": 0.06284357, "balance_loss_mlp": 0.01259258, "epoch": 0.327551480535097, "flos": 20126721594240.0, "grad_norm": 7.4526390584365005, "language_loss": 0.73092192, "learning_rate": 3.1409324500913157e-06, "loss": 0.80846202, "num_input_tokens_seen": 117003265, "router_z_loss_clip": 1.95214844, "router_z_loss_mlp": 0.1496582, "step": 5448, "time_per_iteration": 2.5496840476989746 }, { "auxiliary_loss_clip": 0.06485237, "auxiliary_loss_mlp": 0.01270955, "balance_loss_clip": 0.0628949, "balance_loss_mlp": 0.01254707, "epoch": 0.32761160378776494, "flos": 28811094785280.0, "grad_norm": 1.4759408446005937, "language_loss": 0.67608613, "learning_rate": 3.1406125553795567e-06, "loss": 0.75364804, "num_input_tokens_seen": 117025370, "router_z_loss_clip": 1.95507812, "router_z_loss_mlp": 0.16247559, "step": 5449, "time_per_iteration": 2.6038272380828857 }, { "auxiliary_loss_clip": 0.06486919, "auxiliary_loss_mlp": 0.01270748, "balance_loss_clip": 0.06291033, "balance_loss_mlp": 0.0125494, "epoch": 0.3276717270404329, "flos": 26944171090560.0, "grad_norm": 1.465260183744459, "language_loss": 0.66056359, "learning_rate": 3.1402926174156556e-06, "loss": 0.73814023, "num_input_tokens_seen": 117044350, "router_z_loss_clip": 1.95800781, "router_z_loss_mlp": 0.15808105, "step": 5450, "time_per_iteration": 2.59761381149292 }, { "auxiliary_loss_clip": 0.06483489, "auxiliary_loss_mlp": 0.01270039, "balance_loss_clip": 0.06286319, "balance_loss_mlp": 0.01253063, "epoch": 0.32773185029310087, "flos": 25345557768960.0, "grad_norm": 2.877002895319345, "language_loss": 0.77837908, "learning_rate": 3.1399726362117437e-06, "loss": 0.85591435, "num_input_tokens_seen": 117064450, "router_z_loss_clip": 1.97167969, "router_z_loss_mlp": 0.1697998, "step": 5451, "time_per_iteration": 2.5821797847747803 }, { "auxiliary_loss_clip": 0.06487337, "auxiliary_loss_mlp": 0.01276343, "balance_loss_clip": 0.06287816, "balance_loss_mlp": 0.01260143, "epoch": 0.32779197354576883, "flos": 26398227813120.0, "grad_norm": 2.167153899022782, "language_loss": 0.71375346, "learning_rate": 3.1396526117799555e-06, "loss": 0.7913903, "num_input_tokens_seen": 117083060, "router_z_loss_clip": 1.99707031, "router_z_loss_mlp": 0.1618042, "step": 5452, "time_per_iteration": 2.6024115085601807 }, { "auxiliary_loss_clip": 0.06478827, "auxiliary_loss_mlp": 0.01271812, "balance_loss_clip": 0.06287065, "balance_loss_mlp": 0.01256124, "epoch": 0.3278520967984368, "flos": 24906237212160.0, "grad_norm": 1.7163855565634236, "language_loss": 0.78843105, "learning_rate": 3.1393325441324256e-06, "loss": 0.86593741, "num_input_tokens_seen": 117101860, "router_z_loss_clip": 1.91601562, "router_z_loss_mlp": 0.15686035, "step": 5453, "time_per_iteration": 2.590975046157837 }, { "auxiliary_loss_clip": 0.06481644, "auxiliary_loss_mlp": 0.01275463, "balance_loss_clip": 0.06283817, "balance_loss_mlp": 0.01259179, "epoch": 0.32791222005110476, "flos": 29760831688320.0, "grad_norm": 2.3524404232040745, "language_loss": 0.75898093, "learning_rate": 3.1390124332812916e-06, "loss": 0.83655202, "num_input_tokens_seen": 117123100, "router_z_loss_clip": 1.97753906, "router_z_loss_mlp": 0.16278076, "step": 5454, "time_per_iteration": 2.617703914642334 }, { "auxiliary_loss_clip": 0.06477319, "auxiliary_loss_mlp": 0.01267746, "balance_loss_clip": 0.06287593, "balance_loss_mlp": 0.01254055, "epoch": 0.32797234330377273, "flos": 16513584410880.0, "grad_norm": 1.879032713539904, "language_loss": 0.77467626, "learning_rate": 3.1386922792386924e-06, "loss": 0.85212696, "num_input_tokens_seen": 117140515, "router_z_loss_clip": 1.8984375, "router_z_loss_mlp": 0.13702393, "step": 5455, "time_per_iteration": 2.548229932785034 }, { "auxiliary_loss_clip": 0.06496032, "auxiliary_loss_mlp": 0.01274967, "balance_loss_clip": 0.06294192, "balance_loss_mlp": 0.01258409, "epoch": 0.3280324665564407, "flos": 26585086417920.0, "grad_norm": 1.9588287474141142, "language_loss": 0.73911232, "learning_rate": 3.138372082016768e-06, "loss": 0.81682229, "num_input_tokens_seen": 117161485, "router_z_loss_clip": 2.01757812, "router_z_loss_mlp": 0.16552734, "step": 5456, "time_per_iteration": 2.6118931770324707 }, { "auxiliary_loss_clip": 0.06483398, "auxiliary_loss_mlp": 0.01276744, "balance_loss_clip": 0.06288227, "balance_loss_mlp": 0.01260901, "epoch": 0.32809258980910866, "flos": 22936631938560.0, "grad_norm": 1.6020635346064684, "language_loss": 0.78555644, "learning_rate": 3.1380518416276596e-06, "loss": 0.86315787, "num_input_tokens_seen": 117181870, "router_z_loss_clip": 1.95117188, "router_z_loss_mlp": 0.15844727, "step": 5457, "time_per_iteration": 2.568495273590088 }, { "auxiliary_loss_clip": 0.06489926, "auxiliary_loss_mlp": 0.01273373, "balance_loss_clip": 0.06287722, "balance_loss_mlp": 0.01258055, "epoch": 0.3281527130617766, "flos": 22790457290880.0, "grad_norm": 1.8295551901381526, "language_loss": 0.79113173, "learning_rate": 3.1377315580835115e-06, "loss": 0.8687647, "num_input_tokens_seen": 117201380, "router_z_loss_clip": 2.02441406, "router_z_loss_mlp": 0.15319824, "step": 5458, "time_per_iteration": 2.5754339694976807 }, { "auxiliary_loss_clip": 0.0648583, "auxiliary_loss_mlp": 0.01275184, "balance_loss_clip": 0.06291075, "balance_loss_mlp": 0.01258972, "epoch": 0.3282128363144446, "flos": 21256902264960.0, "grad_norm": 1.8866214433503328, "language_loss": 0.73551798, "learning_rate": 3.1374112313964686e-06, "loss": 0.81312811, "num_input_tokens_seen": 117221040, "router_z_loss_clip": 1.94726562, "router_z_loss_mlp": 0.16223145, "step": 5459, "time_per_iteration": 2.555173397064209 }, { "auxiliary_loss_clip": 0.06490184, "auxiliary_loss_mlp": 0.01272791, "balance_loss_clip": 0.06291983, "balance_loss_mlp": 0.01257151, "epoch": 0.32827295956711255, "flos": 30850328401920.0, "grad_norm": 1.903664958657258, "language_loss": 0.84846151, "learning_rate": 3.1370908615786783e-06, "loss": 0.92609119, "num_input_tokens_seen": 117241395, "router_z_loss_clip": 1.98339844, "router_z_loss_mlp": 0.15649414, "step": 5460, "time_per_iteration": 2.621889591217041 }, { "auxiliary_loss_clip": 0.06485908, "auxiliary_loss_mlp": 0.01276541, "balance_loss_clip": 0.0628874, "balance_loss_mlp": 0.01261354, "epoch": 0.3283330828197806, "flos": 25921032410880.0, "grad_norm": 3.237244562352146, "language_loss": 0.7749877, "learning_rate": 3.136770448642288e-06, "loss": 0.85261214, "num_input_tokens_seen": 117259340, "router_z_loss_clip": 1.97070312, "router_z_loss_mlp": 0.15185547, "step": 5461, "time_per_iteration": 4.025079011917114 }, { "auxiliary_loss_clip": 0.06482965, "auxiliary_loss_mlp": 0.01278724, "balance_loss_clip": 0.06287502, "balance_loss_mlp": 0.01261569, "epoch": 0.32839320607244854, "flos": 38591295672960.0, "grad_norm": 1.756656396701566, "language_loss": 0.63023031, "learning_rate": 3.1364499925994484e-06, "loss": 0.70784718, "num_input_tokens_seen": 117282375, "router_z_loss_clip": 1.95605469, "router_z_loss_mlp": 0.17163086, "step": 5462, "time_per_iteration": 2.6927683353424072 }, { "auxiliary_loss_clip": 0.06486777, "auxiliary_loss_mlp": 0.0127696, "balance_loss_clip": 0.06294754, "balance_loss_mlp": 0.01261946, "epoch": 0.3284533293251165, "flos": 26658068924160.0, "grad_norm": 1.427312471541666, "language_loss": 0.78727007, "learning_rate": 3.1361294934623115e-06, "loss": 0.86490744, "num_input_tokens_seen": 117303830, "router_z_loss_clip": 1.91894531, "router_z_loss_mlp": 0.15008545, "step": 5463, "time_per_iteration": 2.608985424041748 }, { "auxiliary_loss_clip": 0.06485416, "auxiliary_loss_mlp": 0.01274341, "balance_loss_clip": 0.0628915, "balance_loss_mlp": 0.01257878, "epoch": 0.32851345257778447, "flos": 15309498839040.0, "grad_norm": 1.73899701551779, "language_loss": 0.69945556, "learning_rate": 3.1358089512430303e-06, "loss": 0.77705312, "num_input_tokens_seen": 117320665, "router_z_loss_clip": 1.95996094, "router_z_loss_mlp": 0.16455078, "step": 5464, "time_per_iteration": 2.634092330932617 }, { "auxiliary_loss_clip": 0.06480017, "auxiliary_loss_mlp": 0.01272193, "balance_loss_clip": 0.06289808, "balance_loss_mlp": 0.01256779, "epoch": 0.32857357583045244, "flos": 23520491988480.0, "grad_norm": 3.646383924491144, "language_loss": 0.72382581, "learning_rate": 3.1354883659537594e-06, "loss": 0.80134791, "num_input_tokens_seen": 117339795, "router_z_loss_clip": 1.90136719, "router_z_loss_mlp": 0.1541748, "step": 5465, "time_per_iteration": 2.6809840202331543 }, { "auxiliary_loss_clip": 0.06488622, "auxiliary_loss_mlp": 0.01278132, "balance_loss_clip": 0.06291194, "balance_loss_mlp": 0.01262003, "epoch": 0.3286336990831204, "flos": 21001379639040.0, "grad_norm": 1.3743953323203124, "language_loss": 0.83223963, "learning_rate": 3.1351677376066567e-06, "loss": 0.90990722, "num_input_tokens_seen": 117359525, "router_z_loss_clip": 1.97460938, "router_z_loss_mlp": 0.16125488, "step": 5466, "time_per_iteration": 2.5949413776397705 }, { "auxiliary_loss_clip": 0.06483825, "auxiliary_loss_mlp": 0.01272315, "balance_loss_clip": 0.06286561, "balance_loss_mlp": 0.01257271, "epoch": 0.32869382233578837, "flos": 23665450752000.0, "grad_norm": 2.419570514252141, "language_loss": 0.7973004, "learning_rate": 3.134847066213879e-06, "loss": 0.87486184, "num_input_tokens_seen": 117380320, "router_z_loss_clip": 1.97070312, "router_z_loss_mlp": 0.15039062, "step": 5467, "time_per_iteration": 2.5922093391418457 }, { "auxiliary_loss_clip": 0.06485875, "auxiliary_loss_mlp": 0.01270931, "balance_loss_clip": 0.06285301, "balance_loss_mlp": 0.01254302, "epoch": 0.32875394558845633, "flos": 25343335635840.0, "grad_norm": 2.104784931217333, "language_loss": 0.7503643, "learning_rate": 3.134526351787587e-06, "loss": 0.82793242, "num_input_tokens_seen": 117400695, "router_z_loss_clip": 2.00488281, "router_z_loss_mlp": 0.16625977, "step": 5468, "time_per_iteration": 4.047188997268677 }, { "auxiliary_loss_clip": 0.06495232, "auxiliary_loss_mlp": 0.0127722, "balance_loss_clip": 0.06293425, "balance_loss_mlp": 0.01259684, "epoch": 0.3288140688411243, "flos": 14908430471040.0, "grad_norm": 5.948878890234276, "language_loss": 0.7872591, "learning_rate": 3.134205594339942e-06, "loss": 0.86498356, "num_input_tokens_seen": 117418800, "router_z_loss_clip": 2.015625, "router_z_loss_mlp": 0.17541504, "step": 5469, "time_per_iteration": 2.5810141563415527 }, { "auxiliary_loss_clip": 0.06487159, "auxiliary_loss_mlp": 0.01272874, "balance_loss_clip": 0.06288994, "balance_loss_mlp": 0.0125678, "epoch": 0.32887419209379226, "flos": 18557220366720.0, "grad_norm": 1.7334075833567983, "language_loss": 0.82322848, "learning_rate": 3.133884793883107e-06, "loss": 0.90082884, "num_input_tokens_seen": 117438220, "router_z_loss_clip": 1.98046875, "router_z_loss_mlp": 0.16088867, "step": 5470, "time_per_iteration": 2.559739828109741 }, { "auxiliary_loss_clip": 0.06484611, "auxiliary_loss_mlp": 0.01272962, "balance_loss_clip": 0.0628456, "balance_loss_mlp": 0.01255629, "epoch": 0.3289343153464602, "flos": 48116560913280.0, "grad_norm": 1.7601046042565172, "language_loss": 0.68678689, "learning_rate": 3.1335639504292478e-06, "loss": 0.76436257, "num_input_tokens_seen": 117462560, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.17321777, "step": 5471, "time_per_iteration": 2.803812026977539 }, { "auxiliary_loss_clip": 0.06494435, "auxiliary_loss_mlp": 0.01278207, "balance_loss_clip": 0.06288949, "balance_loss_mlp": 0.01260386, "epoch": 0.3289944385991282, "flos": 27607763900160.0, "grad_norm": 1.6062733903976214, "language_loss": 0.6570453, "learning_rate": 3.1332430639905288e-06, "loss": 0.73477173, "num_input_tokens_seen": 117483665, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.17822266, "step": 5472, "time_per_iteration": 2.6297881603240967 }, { "auxiliary_loss_clip": 0.06490444, "auxiliary_loss_mlp": 0.01272728, "balance_loss_clip": 0.06289393, "balance_loss_mlp": 0.01254799, "epoch": 0.32905456185179616, "flos": 20126470032000.0, "grad_norm": 1.6523478639528941, "language_loss": 0.89183974, "learning_rate": 3.13292213457912e-06, "loss": 0.96947145, "num_input_tokens_seen": 117503565, "router_z_loss_clip": 2.00976562, "router_z_loss_mlp": 0.17944336, "step": 5473, "time_per_iteration": 2.563811779022217 }, { "auxiliary_loss_clip": 0.06490625, "auxiliary_loss_mlp": 0.01272963, "balance_loss_clip": 0.06290704, "balance_loss_mlp": 0.01256047, "epoch": 0.3291146851044642, "flos": 23186075143680.0, "grad_norm": 1.8486135132154398, "language_loss": 0.78554356, "learning_rate": 3.1326011622071903e-06, "loss": 0.86317945, "num_input_tokens_seen": 117521460, "router_z_loss_clip": 1.99707031, "router_z_loss_mlp": 0.16918945, "step": 5474, "time_per_iteration": 3.9418954849243164 }, { "auxiliary_loss_clip": 0.06413923, "auxiliary_loss_mlp": 0.01256893, "balance_loss_clip": 0.06317595, "balance_loss_mlp": 0.0125275, "epoch": 0.32917480835713214, "flos": 67641630664320.0, "grad_norm": 0.8036709066488853, "language_loss": 0.60249203, "learning_rate": 3.132280146886911e-06, "loss": 0.67920017, "num_input_tokens_seen": 117580550, "router_z_loss_clip": 0.96142578, "router_z_loss_mlp": 0.04144287, "step": 5475, "time_per_iteration": 3.1708593368530273 }, { "auxiliary_loss_clip": 0.06499004, "auxiliary_loss_mlp": 0.01278577, "balance_loss_clip": 0.06291993, "balance_loss_mlp": 0.01260231, "epoch": 0.3292349316098001, "flos": 27971963671680.0, "grad_norm": 3.5381387756124227, "language_loss": 0.76967394, "learning_rate": 3.131959088630455e-06, "loss": 0.84744978, "num_input_tokens_seen": 117600645, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.18334961, "step": 5476, "time_per_iteration": 2.5970675945281982 }, { "auxiliary_loss_clip": 0.0648091, "auxiliary_loss_mlp": 0.01271835, "balance_loss_clip": 0.0628524, "balance_loss_mlp": 0.01255897, "epoch": 0.3292950548624681, "flos": 20269416297600.0, "grad_norm": 1.7892647278174763, "language_loss": 0.74741197, "learning_rate": 3.131637987449997e-06, "loss": 0.82493937, "num_input_tokens_seen": 117618880, "router_z_loss_clip": 1.95800781, "router_z_loss_mlp": 0.15948486, "step": 5477, "time_per_iteration": 2.555638551712036 }, { "auxiliary_loss_clip": 0.06483293, "auxiliary_loss_mlp": 0.01272123, "balance_loss_clip": 0.06290899, "balance_loss_mlp": 0.01256375, "epoch": 0.32935517811513604, "flos": 20819174935680.0, "grad_norm": 2.478898405769071, "language_loss": 0.7634238, "learning_rate": 3.131316843357713e-06, "loss": 0.84097791, "num_input_tokens_seen": 117636445, "router_z_loss_clip": 1.921875, "router_z_loss_mlp": 0.1574707, "step": 5478, "time_per_iteration": 3.991666793823242 }, { "auxiliary_loss_clip": 0.06481741, "auxiliary_loss_mlp": 0.01272057, "balance_loss_clip": 0.06287287, "balance_loss_mlp": 0.01256715, "epoch": 0.329415301367804, "flos": 18447704680320.0, "grad_norm": 1.8205372112937466, "language_loss": 0.80738229, "learning_rate": 3.1309956563657807e-06, "loss": 0.88492024, "num_input_tokens_seen": 117653105, "router_z_loss_clip": 1.94628906, "router_z_loss_mlp": 0.15338135, "step": 5479, "time_per_iteration": 2.5394160747528076 }, { "auxiliary_loss_clip": 0.06401373, "auxiliary_loss_mlp": 0.01254536, "balance_loss_clip": 0.06304662, "balance_loss_mlp": 0.0125031, "epoch": 0.32947542462047197, "flos": 66344967930240.0, "grad_norm": 0.7792119270246873, "language_loss": 0.56334949, "learning_rate": 3.1306744264863804e-06, "loss": 0.63990861, "num_input_tokens_seen": 117719225, "router_z_loss_clip": 0.96630859, "router_z_loss_mlp": 0.04229736, "step": 5480, "time_per_iteration": 3.2762796878814697 }, { "auxiliary_loss_clip": 0.06488277, "auxiliary_loss_mlp": 0.01279757, "balance_loss_clip": 0.06290083, "balance_loss_mlp": 0.01263545, "epoch": 0.32953554787313993, "flos": 23228268474240.0, "grad_norm": 2.0472960262149313, "language_loss": 0.77727699, "learning_rate": 3.1303531537316915e-06, "loss": 0.85495734, "num_input_tokens_seen": 117738725, "router_z_loss_clip": 1.98046875, "router_z_loss_mlp": 0.16210938, "step": 5481, "time_per_iteration": 2.586601972579956 }, { "auxiliary_loss_clip": 0.06490518, "auxiliary_loss_mlp": 0.01274245, "balance_loss_clip": 0.06287334, "balance_loss_mlp": 0.01258223, "epoch": 0.3295956711258079, "flos": 27015686150400.0, "grad_norm": 1.6222061191325634, "language_loss": 0.79228592, "learning_rate": 3.130031838113899e-06, "loss": 0.86993355, "num_input_tokens_seen": 117757765, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.16027832, "step": 5482, "time_per_iteration": 2.6239378452301025 }, { "auxiliary_loss_clip": 0.06485672, "auxiliary_loss_mlp": 0.01270057, "balance_loss_clip": 0.06286313, "balance_loss_mlp": 0.01254494, "epoch": 0.32965579437847586, "flos": 19177697450880.0, "grad_norm": 2.07519915766771, "language_loss": 0.74808013, "learning_rate": 3.129710479645185e-06, "loss": 0.8256374, "num_input_tokens_seen": 117776810, "router_z_loss_clip": 1.99414062, "router_z_loss_mlp": 0.15557861, "step": 5483, "time_per_iteration": 2.5777552127838135 }, { "auxiliary_loss_clip": 0.06487543, "auxiliary_loss_mlp": 0.0127297, "balance_loss_clip": 0.06290436, "balance_loss_mlp": 0.01257676, "epoch": 0.32971591763114383, "flos": 30490447115520.0, "grad_norm": 1.928112369393318, "language_loss": 0.76226151, "learning_rate": 3.1293890783377366e-06, "loss": 0.83986664, "num_input_tokens_seen": 117797730, "router_z_loss_clip": 1.97460938, "router_z_loss_mlp": 0.1529541, "step": 5484, "time_per_iteration": 2.6385304927825928 }, { "auxiliary_loss_clip": 0.06478956, "auxiliary_loss_mlp": 0.01274267, "balance_loss_clip": 0.06284182, "balance_loss_mlp": 0.01258257, "epoch": 0.3297760408838118, "flos": 16295140016640.0, "grad_norm": 2.506722451420254, "language_loss": 0.72571409, "learning_rate": 3.129067634203742e-06, "loss": 0.80324626, "num_input_tokens_seen": 117815365, "router_z_loss_clip": 1.94921875, "router_z_loss_mlp": 0.16015625, "step": 5485, "time_per_iteration": 2.5618410110473633 }, { "auxiliary_loss_clip": 0.0647975, "auxiliary_loss_mlp": 0.01275832, "balance_loss_clip": 0.06286277, "balance_loss_mlp": 0.0126024, "epoch": 0.32983616413647976, "flos": 29538194590080.0, "grad_norm": 1.7013771585695274, "language_loss": 0.80873775, "learning_rate": 3.128746147255388e-06, "loss": 0.88629359, "num_input_tokens_seen": 117836095, "router_z_loss_clip": 1.93359375, "router_z_loss_mlp": 0.15600586, "step": 5486, "time_per_iteration": 2.62622332572937 }, { "auxiliary_loss_clip": 0.0648071, "auxiliary_loss_mlp": 0.01274502, "balance_loss_clip": 0.06285913, "balance_loss_mlp": 0.01259005, "epoch": 0.3298962873891478, "flos": 20637682992000.0, "grad_norm": 1.99664336595549, "language_loss": 0.85131717, "learning_rate": 3.1284246175048683e-06, "loss": 0.92886925, "num_input_tokens_seen": 117854655, "router_z_loss_clip": 1.94921875, "router_z_loss_mlp": 0.1550293, "step": 5487, "time_per_iteration": 2.5422561168670654 }, { "auxiliary_loss_clip": 0.0648939, "auxiliary_loss_mlp": 0.01274178, "balance_loss_clip": 0.06289146, "balance_loss_mlp": 0.01256833, "epoch": 0.32995641064181574, "flos": 14981329123200.0, "grad_norm": 2.604395611122822, "language_loss": 0.75012362, "learning_rate": 3.1281030449643735e-06, "loss": 0.82775933, "num_input_tokens_seen": 117873300, "router_z_loss_clip": 2.00292969, "router_z_loss_mlp": 0.17346191, "step": 5488, "time_per_iteration": 2.527977466583252 }, { "auxiliary_loss_clip": 0.06486475, "auxiliary_loss_mlp": 0.01272547, "balance_loss_clip": 0.06288687, "balance_loss_mlp": 0.01256418, "epoch": 0.3300165338944837, "flos": 18667448812800.0, "grad_norm": 2.056811519762147, "language_loss": 0.73225659, "learning_rate": 3.127781429646098e-06, "loss": 0.80984682, "num_input_tokens_seen": 117891540, "router_z_loss_clip": 1.97851562, "router_z_loss_mlp": 0.16113281, "step": 5489, "time_per_iteration": 2.533299446105957 }, { "auxiliary_loss_clip": 0.06480801, "auxiliary_loss_mlp": 0.01273426, "balance_loss_clip": 0.06284954, "balance_loss_mlp": 0.01258156, "epoch": 0.3300766571471517, "flos": 25589215042560.0, "grad_norm": 3.1735889391718217, "language_loss": 0.89299995, "learning_rate": 3.127459771562238e-06, "loss": 0.97054219, "num_input_tokens_seen": 117907690, "router_z_loss_clip": 1.95800781, "router_z_loss_mlp": 0.15270996, "step": 5490, "time_per_iteration": 2.573622226715088 }, { "auxiliary_loss_clip": 0.06478599, "auxiliary_loss_mlp": 0.01271308, "balance_loss_clip": 0.06283886, "balance_loss_mlp": 0.01256562, "epoch": 0.33013678039981964, "flos": 11368150012800.0, "grad_norm": 2.0903723845491773, "language_loss": 0.83745688, "learning_rate": 3.1271380707249907e-06, "loss": 0.91495597, "num_input_tokens_seen": 117925640, "router_z_loss_clip": 1.94921875, "router_z_loss_mlp": 0.1473999, "step": 5491, "time_per_iteration": 2.5491890907287598 }, { "auxiliary_loss_clip": 0.06481117, "auxiliary_loss_mlp": 0.01271642, "balance_loss_clip": 0.06285228, "balance_loss_mlp": 0.01256001, "epoch": 0.3301969036524876, "flos": 24827175285120.0, "grad_norm": 2.3693424180005676, "language_loss": 0.7866801, "learning_rate": 3.126816327146554e-06, "loss": 0.86420768, "num_input_tokens_seen": 117944525, "router_z_loss_clip": 1.95703125, "router_z_loss_mlp": 0.15637207, "step": 5492, "time_per_iteration": 2.5801360607147217 }, { "auxiliary_loss_clip": 0.06490218, "auxiliary_loss_mlp": 0.0127249, "balance_loss_clip": 0.06290336, "balance_loss_mlp": 0.01255622, "epoch": 0.33025702690515557, "flos": 15966634884480.0, "grad_norm": 2.436466477396105, "language_loss": 0.75439537, "learning_rate": 3.12649454083913e-06, "loss": 0.83202243, "num_input_tokens_seen": 117962515, "router_z_loss_clip": 1.99804688, "router_z_loss_mlp": 0.16882324, "step": 5493, "time_per_iteration": 2.546098470687866 }, { "auxiliary_loss_clip": 0.06400958, "auxiliary_loss_mlp": 0.01258597, "balance_loss_clip": 0.06304851, "balance_loss_mlp": 0.01254082, "epoch": 0.33031715015782354, "flos": 59435794540800.0, "grad_norm": 0.7628030324855922, "language_loss": 0.53926152, "learning_rate": 3.12617271181492e-06, "loss": 0.61585701, "num_input_tokens_seen": 118018780, "router_z_loss_clip": 0.9609375, "router_z_loss_mlp": 0.04519653, "step": 5494, "time_per_iteration": 3.1415913105010986 }, { "auxiliary_loss_clip": 0.06490691, "auxiliary_loss_mlp": 0.01277025, "balance_loss_clip": 0.06290571, "balance_loss_mlp": 0.01260825, "epoch": 0.3303772734104915, "flos": 23190896753280.0, "grad_norm": 1.6373748677928968, "language_loss": 0.87136579, "learning_rate": 3.1258508400861276e-06, "loss": 0.94904292, "num_input_tokens_seen": 118038610, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.1619873, "step": 5495, "time_per_iteration": 2.5958478450775146 }, { "auxiliary_loss_clip": 0.06487384, "auxiliary_loss_mlp": 0.01273828, "balance_loss_clip": 0.06287673, "balance_loss_mlp": 0.01257258, "epoch": 0.33043739666315947, "flos": 33080068275840.0, "grad_norm": 1.8589277848665784, "language_loss": 0.73830777, "learning_rate": 3.1255289256649587e-06, "loss": 0.81591988, "num_input_tokens_seen": 118055905, "router_z_loss_clip": 1.99511719, "router_z_loss_mlp": 0.16589355, "step": 5496, "time_per_iteration": 2.656463623046875 }, { "auxiliary_loss_clip": 0.06479512, "auxiliary_loss_mlp": 0.01272686, "balance_loss_clip": 0.06285183, "balance_loss_mlp": 0.01256652, "epoch": 0.33049751991582743, "flos": 24901625237760.0, "grad_norm": 1.9432476962067267, "language_loss": 0.72891808, "learning_rate": 3.1252069685636196e-06, "loss": 0.80644011, "num_input_tokens_seen": 118073695, "router_z_loss_clip": 1.94335938, "router_z_loss_mlp": 0.16040039, "step": 5497, "time_per_iteration": 2.5890839099884033 }, { "auxiliary_loss_clip": 0.06486233, "auxiliary_loss_mlp": 0.0127148, "balance_loss_clip": 0.06291439, "balance_loss_mlp": 0.01256496, "epoch": 0.3305576431684954, "flos": 29468272757760.0, "grad_norm": 2.23846196694962, "language_loss": 0.80659914, "learning_rate": 3.124884968794321e-06, "loss": 0.88417625, "num_input_tokens_seen": 118094030, "router_z_loss_clip": 1.94726562, "router_z_loss_mlp": 0.14990234, "step": 5498, "time_per_iteration": 2.682004928588867 }, { "auxiliary_loss_clip": 0.06483226, "auxiliary_loss_mlp": 0.01275794, "balance_loss_clip": 0.06283633, "balance_loss_mlp": 0.01258342, "epoch": 0.33061776642116336, "flos": 22637951660160.0, "grad_norm": 2.2524541364574624, "language_loss": 0.75838006, "learning_rate": 3.12456292636927e-06, "loss": 0.83597016, "num_input_tokens_seen": 118111665, "router_z_loss_clip": 1.99609375, "router_z_loss_mlp": 0.17456055, "step": 5499, "time_per_iteration": 2.5972423553466797 }, { "auxiliary_loss_clip": 0.06483451, "auxiliary_loss_mlp": 0.01272752, "balance_loss_clip": 0.06287796, "balance_loss_mlp": 0.01257839, "epoch": 0.3306778896738313, "flos": 25783536660480.0, "grad_norm": 2.0037603712037777, "language_loss": 0.7920779, "learning_rate": 3.124240841300681e-06, "loss": 0.86963993, "num_input_tokens_seen": 118132435, "router_z_loss_clip": 1.95703125, "router_z_loss_mlp": 0.14929199, "step": 5500, "time_per_iteration": 2.604142665863037 }, { "auxiliary_loss_clip": 0.06491024, "auxiliary_loss_mlp": 0.0127202, "balance_loss_clip": 0.06293687, "balance_loss_mlp": 0.01255379, "epoch": 0.33073801292649935, "flos": 36949566625920.0, "grad_norm": 2.046944329663866, "language_loss": 0.66460872, "learning_rate": 3.1239187136007665e-06, "loss": 0.74223912, "num_input_tokens_seen": 118155255, "router_z_loss_clip": 1.97363281, "router_z_loss_mlp": 0.1661377, "step": 5501, "time_per_iteration": 4.13093900680542 }, { "auxiliary_loss_clip": 0.06485368, "auxiliary_loss_mlp": 0.01270012, "balance_loss_clip": 0.06287349, "balance_loss_mlp": 0.01252822, "epoch": 0.3307981361791673, "flos": 12972465411840.0, "grad_norm": 2.3913284037439806, "language_loss": 0.77976793, "learning_rate": 3.1235965432817417e-06, "loss": 0.85732174, "num_input_tokens_seen": 118169865, "router_z_loss_clip": 1.97851562, "router_z_loss_mlp": 0.17199707, "step": 5502, "time_per_iteration": 2.5185348987579346 }, { "auxiliary_loss_clip": 0.06485141, "auxiliary_loss_mlp": 0.01271265, "balance_loss_clip": 0.0628771, "balance_loss_mlp": 0.0125361, "epoch": 0.3308582594318353, "flos": 25381420116480.0, "grad_norm": 2.1535305528686166, "language_loss": 0.73088145, "learning_rate": 3.123274330355824e-06, "loss": 0.80844557, "num_input_tokens_seen": 118190760, "router_z_loss_clip": 1.97167969, "router_z_loss_mlp": 0.17651367, "step": 5503, "time_per_iteration": 2.5833897590637207 }, { "auxiliary_loss_clip": 0.06483694, "auxiliary_loss_mlp": 0.0127272, "balance_loss_clip": 0.06288624, "balance_loss_mlp": 0.01256657, "epoch": 0.33091838268450324, "flos": 26475738439680.0, "grad_norm": 1.6279387884176573, "language_loss": 0.75285548, "learning_rate": 3.12295207483523e-06, "loss": 0.83041966, "num_input_tokens_seen": 118213620, "router_z_loss_clip": 1.95019531, "router_z_loss_mlp": 0.1605835, "step": 5504, "time_per_iteration": 2.6195969581604004 }, { "auxiliary_loss_clip": 0.0648801, "auxiliary_loss_mlp": 0.01272414, "balance_loss_clip": 0.06293052, "balance_loss_mlp": 0.01257459, "epoch": 0.3309785059371712, "flos": 24977836126080.0, "grad_norm": 1.615490800617595, "language_loss": 0.69995975, "learning_rate": 3.1226297767321816e-06, "loss": 0.77756393, "num_input_tokens_seen": 118235010, "router_z_loss_clip": 1.95019531, "router_z_loss_mlp": 0.1494751, "step": 5505, "time_per_iteration": 2.610260009765625 }, { "auxiliary_loss_clip": 0.06490335, "auxiliary_loss_mlp": 0.01271052, "balance_loss_clip": 0.06295969, "balance_loss_mlp": 0.01256043, "epoch": 0.3310386291898392, "flos": 20452585322880.0, "grad_norm": 1.6201178162248284, "language_loss": 0.82560617, "learning_rate": 3.122307436058899e-06, "loss": 0.90322, "num_input_tokens_seen": 118255820, "router_z_loss_clip": 1.94335938, "router_z_loss_mlp": 0.15014648, "step": 5506, "time_per_iteration": 2.5993356704711914 }, { "auxiliary_loss_clip": 0.06491292, "auxiliary_loss_mlp": 0.01271443, "balance_loss_clip": 0.06297272, "balance_loss_mlp": 0.01255052, "epoch": 0.33109875244250714, "flos": 23188926182400.0, "grad_norm": 1.9054540023607451, "language_loss": 0.79184496, "learning_rate": 3.121985052827606e-06, "loss": 0.86947227, "num_input_tokens_seen": 118274160, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.16394043, "step": 5507, "time_per_iteration": 4.015244007110596 }, { "auxiliary_loss_clip": 0.06488267, "auxiliary_loss_mlp": 0.01271614, "balance_loss_clip": 0.06294317, "balance_loss_mlp": 0.01256045, "epoch": 0.3311588756951751, "flos": 24174902776320.0, "grad_norm": 1.4968117328637436, "language_loss": 0.72199202, "learning_rate": 3.1216626270505274e-06, "loss": 0.79959089, "num_input_tokens_seen": 118294385, "router_z_loss_clip": 1.93945312, "router_z_loss_mlp": 0.15576172, "step": 5508, "time_per_iteration": 2.578463077545166 }, { "auxiliary_loss_clip": 0.06489962, "auxiliary_loss_mlp": 0.01269094, "balance_loss_clip": 0.06301109, "balance_loss_mlp": 0.01254372, "epoch": 0.33121899894784307, "flos": 28152994417920.0, "grad_norm": 2.052763730616814, "language_loss": 0.72074628, "learning_rate": 3.12134015873989e-06, "loss": 0.79833686, "num_input_tokens_seen": 118313105, "router_z_loss_clip": 1.88769531, "router_z_loss_mlp": 0.14715576, "step": 5509, "time_per_iteration": 2.6034340858459473 }, { "auxiliary_loss_clip": 0.06487346, "auxiliary_loss_mlp": 0.01274933, "balance_loss_clip": 0.06296062, "balance_loss_mlp": 0.01259889, "epoch": 0.33127912220051103, "flos": 29574979332480.0, "grad_norm": 1.8966759735448047, "language_loss": 0.74655545, "learning_rate": 3.121017647907921e-06, "loss": 0.82417822, "num_input_tokens_seen": 118335250, "router_z_loss_clip": 1.90917969, "router_z_loss_mlp": 0.15026855, "step": 5510, "time_per_iteration": 2.6156694889068604 }, { "auxiliary_loss_clip": 0.06489666, "auxiliary_loss_mlp": 0.01270533, "balance_loss_clip": 0.06298044, "balance_loss_mlp": 0.01255406, "epoch": 0.331339245453179, "flos": 14434086107520.0, "grad_norm": 2.1345040593317592, "language_loss": 0.88529241, "learning_rate": 3.1206950945668508e-06, "loss": 0.96289444, "num_input_tokens_seen": 118351470, "router_z_loss_clip": 1.91894531, "router_z_loss_mlp": 0.15112305, "step": 5511, "time_per_iteration": 2.6672518253326416 }, { "auxiliary_loss_clip": 0.06478877, "auxiliary_loss_mlp": 0.01271158, "balance_loss_clip": 0.06294948, "balance_loss_mlp": 0.01257199, "epoch": 0.33139936870584696, "flos": 20893499107200.0, "grad_norm": 1.569268194236114, "language_loss": 0.73505759, "learning_rate": 3.12037249872891e-06, "loss": 0.812558, "num_input_tokens_seen": 118370970, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.13946533, "step": 5512, "time_per_iteration": 2.6872446537017822 }, { "auxiliary_loss_clip": 0.06487289, "auxiliary_loss_mlp": 0.01269679, "balance_loss_clip": 0.06298781, "balance_loss_mlp": 0.0125442, "epoch": 0.33145949195851493, "flos": 36293352975360.0, "grad_norm": 1.9921704843599723, "language_loss": 0.73084229, "learning_rate": 3.1200498604063317e-06, "loss": 0.80841196, "num_input_tokens_seen": 118393125, "router_z_loss_clip": 1.88671875, "router_z_loss_mlp": 0.15240479, "step": 5513, "time_per_iteration": 4.120615720748901 }, { "auxiliary_loss_clip": 0.06488639, "auxiliary_loss_mlp": 0.01274838, "balance_loss_clip": 0.06296203, "balance_loss_mlp": 0.01259651, "epoch": 0.33151961521118295, "flos": 14284431515520.0, "grad_norm": 1.9709972434157028, "language_loss": 0.68618488, "learning_rate": 3.1197271796113507e-06, "loss": 0.76381969, "num_input_tokens_seen": 118410860, "router_z_loss_clip": 1.92285156, "router_z_loss_mlp": 0.15197754, "step": 5514, "time_per_iteration": 2.560438632965088 }, { "auxiliary_loss_clip": 0.06489652, "auxiliary_loss_mlp": 0.01276856, "balance_loss_clip": 0.06297702, "balance_loss_mlp": 0.0126, "epoch": 0.3315797384638509, "flos": 20780126133120.0, "grad_norm": 2.978710581629462, "language_loss": 0.66901642, "learning_rate": 3.1194044563562026e-06, "loss": 0.74668145, "num_input_tokens_seen": 118429570, "router_z_loss_clip": 1.91992188, "router_z_loss_mlp": 0.1685791, "step": 5515, "time_per_iteration": 2.5690009593963623 }, { "auxiliary_loss_clip": 0.0649133, "auxiliary_loss_mlp": 0.01271972, "balance_loss_clip": 0.06297746, "balance_loss_mlp": 0.0125626, "epoch": 0.3316398617165189, "flos": 24686115736320.0, "grad_norm": 1.6660851470528595, "language_loss": 0.6981746, "learning_rate": 3.1190816906531257e-06, "loss": 0.77580762, "num_input_tokens_seen": 118450285, "router_z_loss_clip": 1.93554688, "router_z_loss_mlp": 0.15710449, "step": 5516, "time_per_iteration": 2.6102511882781982 }, { "auxiliary_loss_clip": 0.06489751, "auxiliary_loss_mlp": 0.01269646, "balance_loss_clip": 0.06294692, "balance_loss_mlp": 0.01254661, "epoch": 0.33169998496918685, "flos": 18593879328000.0, "grad_norm": 2.2285845393084114, "language_loss": 0.80897796, "learning_rate": 3.118758882514359e-06, "loss": 0.88657194, "num_input_tokens_seen": 118468270, "router_z_loss_clip": 1.95019531, "router_z_loss_mlp": 0.14978027, "step": 5517, "time_per_iteration": 2.5625743865966797 }, { "auxiliary_loss_clip": 0.06475903, "auxiliary_loss_mlp": 0.01277797, "balance_loss_clip": 0.06289679, "balance_loss_mlp": 0.0126255, "epoch": 0.3317601082218548, "flos": 20199871808640.0, "grad_norm": 1.6801660708511341, "language_loss": 0.74949479, "learning_rate": 3.118436031952143e-06, "loss": 0.82703185, "num_input_tokens_seen": 118486615, "router_z_loss_clip": 1.86328125, "router_z_loss_mlp": 0.15246582, "step": 5518, "time_per_iteration": 4.027996778488159 }, { "auxiliary_loss_clip": 0.06394118, "auxiliary_loss_mlp": 0.01261431, "balance_loss_clip": 0.06299549, "balance_loss_mlp": 0.01257801, "epoch": 0.3318202314745228, "flos": 68995119265920.0, "grad_norm": 0.6105460419846764, "language_loss": 0.54257834, "learning_rate": 3.1181131389787206e-06, "loss": 0.61913383, "num_input_tokens_seen": 118553580, "router_z_loss_clip": 0.9453125, "router_z_loss_mlp": 0.03619385, "step": 5519, "time_per_iteration": 3.3047094345092773 }, { "auxiliary_loss_clip": 0.06484608, "auxiliary_loss_mlp": 0.01278593, "balance_loss_clip": 0.06293017, "balance_loss_mlp": 0.01262083, "epoch": 0.33188035472719074, "flos": 21505381148160.0, "grad_norm": 2.223065026697433, "language_loss": 0.79900748, "learning_rate": 3.117790203606336e-06, "loss": 0.87663954, "num_input_tokens_seen": 118570280, "router_z_loss_clip": 1.91601562, "router_z_loss_mlp": 0.16503906, "step": 5520, "time_per_iteration": 2.569272756576538 }, { "auxiliary_loss_clip": 0.06479949, "auxiliary_loss_mlp": 0.01272476, "balance_loss_clip": 0.06290922, "balance_loss_mlp": 0.0125838, "epoch": 0.3319404779798587, "flos": 28877033548800.0, "grad_norm": 2.383059257005906, "language_loss": 0.77240932, "learning_rate": 3.1174672258472344e-06, "loss": 0.84993351, "num_input_tokens_seen": 118590455, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.14093018, "step": 5521, "time_per_iteration": 2.642982244491577 }, { "auxiliary_loss_clip": 0.06483687, "auxiliary_loss_mlp": 0.01274792, "balance_loss_clip": 0.06289105, "balance_loss_mlp": 0.01258663, "epoch": 0.33200060123252667, "flos": 23083770908160.0, "grad_norm": 2.235966662584185, "language_loss": 0.70458424, "learning_rate": 3.117144205713664e-06, "loss": 0.7821691, "num_input_tokens_seen": 118609495, "router_z_loss_clip": 1.94726562, "router_z_loss_mlp": 0.16125488, "step": 5522, "time_per_iteration": 2.5710904598236084 }, { "auxiliary_loss_clip": 0.06478165, "auxiliary_loss_mlp": 0.01271221, "balance_loss_clip": 0.06289039, "balance_loss_mlp": 0.01256641, "epoch": 0.33206072448519464, "flos": 21148895952000.0, "grad_norm": 1.8360749613498433, "language_loss": 0.74654186, "learning_rate": 3.1168211432178735e-06, "loss": 0.82403564, "num_input_tokens_seen": 118628720, "router_z_loss_clip": 1.89355469, "router_z_loss_mlp": 0.14587402, "step": 5523, "time_per_iteration": 2.5623459815979004 }, { "auxiliary_loss_clip": 0.06476031, "auxiliary_loss_mlp": 0.01273415, "balance_loss_clip": 0.06288981, "balance_loss_mlp": 0.01257906, "epoch": 0.3321208477378626, "flos": 13084161304320.0, "grad_norm": 1.7933565120399932, "language_loss": 0.82300985, "learning_rate": 3.116498038372114e-06, "loss": 0.90050435, "num_input_tokens_seen": 118645955, "router_z_loss_clip": 1.87011719, "router_z_loss_mlp": 0.15515137, "step": 5524, "time_per_iteration": 2.5504868030548096 }, { "auxiliary_loss_clip": 0.06473269, "auxiliary_loss_mlp": 0.01277147, "balance_loss_clip": 0.06285775, "balance_loss_mlp": 0.01262609, "epoch": 0.33218097099053057, "flos": 21221836531200.0, "grad_norm": 1.9357377445516395, "language_loss": 0.83501863, "learning_rate": 3.116174891188636e-06, "loss": 0.91252285, "num_input_tokens_seen": 118665605, "router_z_loss_clip": 1.87402344, "router_z_loss_mlp": 0.14520264, "step": 5525, "time_per_iteration": 2.5639662742614746 }, { "auxiliary_loss_clip": 0.06382468, "auxiliary_loss_mlp": 0.01255053, "balance_loss_clip": 0.0628836, "balance_loss_mlp": 0.01251677, "epoch": 0.33224109424319853, "flos": 64369954068480.0, "grad_norm": 0.7482430653154047, "language_loss": 0.52537692, "learning_rate": 3.1158517016796945e-06, "loss": 0.60175216, "num_input_tokens_seen": 118728155, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.03384399, "step": 5526, "time_per_iteration": 3.1799674034118652 }, { "auxiliary_loss_clip": 0.06477328, "auxiliary_loss_mlp": 0.01274574, "balance_loss_clip": 0.06284497, "balance_loss_mlp": 0.01258457, "epoch": 0.33230121749586655, "flos": 17351457713280.0, "grad_norm": 4.477886653677356, "language_loss": 0.78684986, "learning_rate": 3.1155284698575445e-06, "loss": 0.86436892, "num_input_tokens_seen": 118743955, "router_z_loss_clip": 1.92773438, "router_z_loss_mlp": 0.16113281, "step": 5527, "time_per_iteration": 2.545025587081909 }, { "auxiliary_loss_clip": 0.06480458, "auxiliary_loss_mlp": 0.01274629, "balance_loss_clip": 0.06292239, "balance_loss_mlp": 0.01260622, "epoch": 0.3323613407485345, "flos": 21003517918080.0, "grad_norm": 2.084631734944191, "language_loss": 0.73118329, "learning_rate": 3.1152051957344434e-06, "loss": 0.80873418, "num_input_tokens_seen": 118763275, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.14013672, "step": 5528, "time_per_iteration": 2.581970691680908 }, { "auxiliary_loss_clip": 0.06474419, "auxiliary_loss_mlp": 0.01275531, "balance_loss_clip": 0.06284927, "balance_loss_mlp": 0.01260928, "epoch": 0.3324214640012025, "flos": 13157688862080.0, "grad_norm": 1.6450268882327914, "language_loss": 0.83605129, "learning_rate": 3.1148818793226497e-06, "loss": 0.91355079, "num_input_tokens_seen": 118781110, "router_z_loss_clip": 1.89257812, "router_z_loss_mlp": 0.14599609, "step": 5529, "time_per_iteration": 2.546992301940918 }, { "auxiliary_loss_clip": 0.06480349, "auxiliary_loss_mlp": 0.01268442, "balance_loss_clip": 0.06286682, "balance_loss_mlp": 0.01253666, "epoch": 0.33248158725387045, "flos": 22280124798720.0, "grad_norm": 1.7979157818484277, "language_loss": 0.70046103, "learning_rate": 3.114558520634423e-06, "loss": 0.77794892, "num_input_tokens_seen": 118800620, "router_z_loss_clip": 1.93554688, "router_z_loss_mlp": 0.14764404, "step": 5530, "time_per_iteration": 2.7004590034484863 }, { "auxiliary_loss_clip": 0.06480995, "auxiliary_loss_mlp": 0.01274845, "balance_loss_clip": 0.06288578, "balance_loss_mlp": 0.01259228, "epoch": 0.3325417105065384, "flos": 20747324459520.0, "grad_norm": 2.5275486060521444, "language_loss": 0.76665491, "learning_rate": 3.1142351196820256e-06, "loss": 0.84421337, "num_input_tokens_seen": 118818725, "router_z_loss_clip": 1.92382812, "router_z_loss_mlp": 0.15600586, "step": 5531, "time_per_iteration": 2.6198625564575195 }, { "auxiliary_loss_clip": 0.06481902, "auxiliary_loss_mlp": 0.01274409, "balance_loss_clip": 0.06289789, "balance_loss_mlp": 0.01259377, "epoch": 0.3326018337592064, "flos": 24797476212480.0, "grad_norm": 1.7314446644097723, "language_loss": 0.73474008, "learning_rate": 3.1139116764777206e-06, "loss": 0.81230319, "num_input_tokens_seen": 118839390, "router_z_loss_clip": 1.91992188, "router_z_loss_mlp": 0.15039062, "step": 5532, "time_per_iteration": 2.613351345062256 }, { "auxiliary_loss_clip": 0.06475431, "auxiliary_loss_mlp": 0.01270732, "balance_loss_clip": 0.06285676, "balance_loss_mlp": 0.01256892, "epoch": 0.33266195701187434, "flos": 14506942832640.0, "grad_norm": 1.8975773602474333, "language_loss": 0.6629619, "learning_rate": 3.1135881910337735e-06, "loss": 0.74042356, "num_input_tokens_seen": 118856275, "router_z_loss_clip": 1.89746094, "router_z_loss_mlp": 0.13848877, "step": 5533, "time_per_iteration": 2.570284366607666 }, { "auxiliary_loss_clip": 0.06479272, "auxiliary_loss_mlp": 0.01274025, "balance_loss_clip": 0.06289857, "balance_loss_mlp": 0.01258134, "epoch": 0.3327220802645423, "flos": 15309792328320.0, "grad_norm": 1.8154056709614097, "language_loss": 0.71403635, "learning_rate": 3.113264663362451e-06, "loss": 0.79156935, "num_input_tokens_seen": 118873830, "router_z_loss_clip": 1.89355469, "router_z_loss_mlp": 0.15881348, "step": 5534, "time_per_iteration": 2.5413496494293213 }, { "auxiliary_loss_clip": 0.0647723, "auxiliary_loss_mlp": 0.01270189, "balance_loss_clip": 0.06289213, "balance_loss_mlp": 0.01255491, "epoch": 0.3327822035172103, "flos": 23484336151680.0, "grad_norm": 2.5393537163577253, "language_loss": 0.67472768, "learning_rate": 3.1129410934760204e-06, "loss": 0.75220186, "num_input_tokens_seen": 118891560, "router_z_loss_clip": 1.87890625, "router_z_loss_mlp": 0.14685059, "step": 5535, "time_per_iteration": 2.5773818492889404 }, { "auxiliary_loss_clip": 0.06475413, "auxiliary_loss_mlp": 0.01274853, "balance_loss_clip": 0.0628483, "balance_loss_mlp": 0.01259559, "epoch": 0.33284232676987824, "flos": 25381587824640.0, "grad_norm": 3.7618557548895013, "language_loss": 0.73018491, "learning_rate": 3.1126174813867517e-06, "loss": 0.80768752, "num_input_tokens_seen": 118910260, "router_z_loss_clip": 1.90527344, "router_z_loss_mlp": 0.1529541, "step": 5536, "time_per_iteration": 2.590381622314453 }, { "auxiliary_loss_clip": 0.06474561, "auxiliary_loss_mlp": 0.01270888, "balance_loss_clip": 0.06284668, "balance_loss_mlp": 0.01255713, "epoch": 0.3329024500225462, "flos": 23700851902080.0, "grad_norm": 1.5566197454077793, "language_loss": 0.82055902, "learning_rate": 3.112293827106917e-06, "loss": 0.89801353, "num_input_tokens_seen": 118929985, "router_z_loss_clip": 1.8984375, "router_z_loss_mlp": 0.15161133, "step": 5537, "time_per_iteration": 2.5658748149871826 }, { "auxiliary_loss_clip": 0.06476026, "auxiliary_loss_mlp": 0.01268396, "balance_loss_clip": 0.06285285, "balance_loss_mlp": 0.01253436, "epoch": 0.33296257327521417, "flos": 31731317429760.0, "grad_norm": 3.665652446699324, "language_loss": 0.71987581, "learning_rate": 3.111970130648789e-06, "loss": 0.79732007, "num_input_tokens_seen": 118951355, "router_z_loss_clip": 1.90722656, "router_z_loss_mlp": 0.14953613, "step": 5538, "time_per_iteration": 2.6388046741485596 }, { "auxiliary_loss_clip": 0.06476051, "auxiliary_loss_mlp": 0.01270098, "balance_loss_clip": 0.06287602, "balance_loss_mlp": 0.01255161, "epoch": 0.33302269652788213, "flos": 22750863436800.0, "grad_norm": 1.916289973038589, "language_loss": 0.74794823, "learning_rate": 3.1116463920246424e-06, "loss": 0.82540971, "num_input_tokens_seen": 118970910, "router_z_loss_clip": 1.88183594, "router_z_loss_mlp": 0.14953613, "step": 5539, "time_per_iteration": 2.57200026512146 }, { "auxiliary_loss_clip": 0.06480901, "auxiliary_loss_mlp": 0.01274977, "balance_loss_clip": 0.06287201, "balance_loss_mlp": 0.01259027, "epoch": 0.33308281978055015, "flos": 11478546167040.0, "grad_norm": 1.788579055400327, "language_loss": 0.71562445, "learning_rate": 3.1113226112467527e-06, "loss": 0.79318327, "num_input_tokens_seen": 118989200, "router_z_loss_clip": 1.93554688, "router_z_loss_mlp": 0.15942383, "step": 5540, "time_per_iteration": 3.996490240097046 }, { "auxiliary_loss_clip": 0.06470808, "auxiliary_loss_mlp": 0.01269303, "balance_loss_clip": 0.06282364, "balance_loss_mlp": 0.01255034, "epoch": 0.3331429430332181, "flos": 38222274291840.0, "grad_norm": 1.6731286253160356, "language_loss": 0.60939997, "learning_rate": 3.1109987883273983e-06, "loss": 0.68680108, "num_input_tokens_seen": 119011030, "router_z_loss_clip": 1.88378906, "router_z_loss_mlp": 0.14282227, "step": 5541, "time_per_iteration": 2.7237255573272705 }, { "auxiliary_loss_clip": 0.06479269, "auxiliary_loss_mlp": 0.01274382, "balance_loss_clip": 0.06286053, "balance_loss_mlp": 0.01258957, "epoch": 0.3332030662858861, "flos": 22535270081280.0, "grad_norm": 1.7392795960024985, "language_loss": 0.6932053, "learning_rate": 3.1106749232788584e-06, "loss": 0.77074182, "num_input_tokens_seen": 119030620, "router_z_loss_clip": 1.93164062, "router_z_loss_mlp": 0.15441895, "step": 5542, "time_per_iteration": 2.667935371398926 }, { "auxiliary_loss_clip": 0.06477119, "auxiliary_loss_mlp": 0.012731, "balance_loss_clip": 0.06286816, "balance_loss_mlp": 0.0125889, "epoch": 0.33326318953855405, "flos": 16003293845760.0, "grad_norm": 1.7664374085625825, "language_loss": 0.7554177, "learning_rate": 3.110351016113414e-06, "loss": 0.8329199, "num_input_tokens_seen": 119048015, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.14221191, "step": 5543, "time_per_iteration": 2.550658941268921 }, { "auxiliary_loss_clip": 0.06477426, "auxiliary_loss_mlp": 0.01271317, "balance_loss_clip": 0.06285199, "balance_loss_mlp": 0.01256571, "epoch": 0.333323312791222, "flos": 25600661124480.0, "grad_norm": 1.7533248632807077, "language_loss": 0.75783789, "learning_rate": 3.110027066843348e-06, "loss": 0.8353253, "num_input_tokens_seen": 119066280, "router_z_loss_clip": 1.91992188, "router_z_loss_mlp": 0.14733887, "step": 5544, "time_per_iteration": 2.6033661365509033 }, { "auxiliary_loss_clip": 0.06473796, "auxiliary_loss_mlp": 0.01268856, "balance_loss_clip": 0.06285195, "balance_loss_mlp": 0.01254313, "epoch": 0.33338343604389, "flos": 25126652177280.0, "grad_norm": 2.7608479304454745, "language_loss": 0.70805359, "learning_rate": 3.1097030754809456e-06, "loss": 0.78548008, "num_input_tokens_seen": 119087680, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.14550781, "step": 5545, "time_per_iteration": 2.6078267097473145 }, { "auxiliary_loss_clip": 0.0646974, "auxiliary_loss_mlp": 0.01272084, "balance_loss_clip": 0.06283332, "balance_loss_mlp": 0.01257612, "epoch": 0.33344355929655795, "flos": 16953114602880.0, "grad_norm": 1.7615406878002218, "language_loss": 0.69216526, "learning_rate": 3.1093790420384894e-06, "loss": 0.76958346, "num_input_tokens_seen": 119105820, "router_z_loss_clip": 1.86328125, "router_z_loss_mlp": 0.14477539, "step": 5546, "time_per_iteration": 2.538877248764038 }, { "auxiliary_loss_clip": 0.06476963, "auxiliary_loss_mlp": 0.0127003, "balance_loss_clip": 0.06283061, "balance_loss_mlp": 0.01255081, "epoch": 0.3335036825492259, "flos": 27896675178240.0, "grad_norm": 1.633045388700478, "language_loss": 0.6491617, "learning_rate": 3.1090549665282702e-06, "loss": 0.72663164, "num_input_tokens_seen": 119126630, "router_z_loss_clip": 1.94140625, "router_z_loss_mlp": 0.14953613, "step": 5547, "time_per_iteration": 4.045672178268433 }, { "auxiliary_loss_clip": 0.06473217, "auxiliary_loss_mlp": 0.01270409, "balance_loss_clip": 0.0628574, "balance_loss_mlp": 0.01256151, "epoch": 0.3335638058018939, "flos": 16184995424640.0, "grad_norm": 3.911030496772327, "language_loss": 0.85605365, "learning_rate": 3.1087308489625742e-06, "loss": 0.93348992, "num_input_tokens_seen": 119143375, "router_z_loss_clip": 1.87402344, "router_z_loss_mlp": 0.1427002, "step": 5548, "time_per_iteration": 2.53883957862854 }, { "auxiliary_loss_clip": 0.06478611, "auxiliary_loss_mlp": 0.01271518, "balance_loss_clip": 0.06285303, "balance_loss_mlp": 0.01255627, "epoch": 0.33362392905456184, "flos": 39905651617920.0, "grad_norm": 2.3156355474281445, "language_loss": 0.74736935, "learning_rate": 3.1084066893536945e-06, "loss": 0.82487059, "num_input_tokens_seen": 119166450, "router_z_loss_clip": 1.93164062, "router_z_loss_mlp": 0.15881348, "step": 5549, "time_per_iteration": 2.7239608764648438 }, { "auxiliary_loss_clip": 0.06477328, "auxiliary_loss_mlp": 0.01276251, "balance_loss_clip": 0.06284761, "balance_loss_mlp": 0.01260134, "epoch": 0.3336840523072298, "flos": 44280954339840.0, "grad_norm": 2.165010701006442, "language_loss": 0.68927163, "learning_rate": 3.108082487713921e-06, "loss": 0.76680744, "num_input_tokens_seen": 119189645, "router_z_loss_clip": 1.92773438, "router_z_loss_mlp": 0.16113281, "step": 5550, "time_per_iteration": 2.7581586837768555 }, { "auxiliary_loss_clip": 0.06480923, "auxiliary_loss_mlp": 0.01275192, "balance_loss_clip": 0.06289721, "balance_loss_mlp": 0.01259951, "epoch": 0.33374417555989777, "flos": 15091054444800.0, "grad_norm": 9.409017257196838, "language_loss": 0.6110391, "learning_rate": 3.1077582440555495e-06, "loss": 0.68860018, "num_input_tokens_seen": 119208045, "router_z_loss_clip": 1.91210938, "router_z_loss_mlp": 0.15240479, "step": 5551, "time_per_iteration": 2.5787580013275146 }, { "auxiliary_loss_clip": 0.06473462, "auxiliary_loss_mlp": 0.01271213, "balance_loss_clip": 0.06284689, "balance_loss_mlp": 0.01256688, "epoch": 0.33380429881256574, "flos": 15854226232320.0, "grad_norm": 1.7376080474310407, "language_loss": 0.71169782, "learning_rate": 3.1074339583908746e-06, "loss": 0.78914464, "num_input_tokens_seen": 119224910, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.14532471, "step": 5552, "time_per_iteration": 3.947030544281006 }, { "auxiliary_loss_clip": 0.06473531, "auxiliary_loss_mlp": 0.01271563, "balance_loss_clip": 0.0628379, "balance_loss_mlp": 0.01257037, "epoch": 0.33386442206523376, "flos": 13485439307520.0, "grad_norm": 2.5826180559938012, "language_loss": 0.84003359, "learning_rate": 3.107109630732192e-06, "loss": 0.91748452, "num_input_tokens_seen": 119243290, "router_z_loss_clip": 1.89550781, "router_z_loss_mlp": 0.14501953, "step": 5553, "time_per_iteration": 2.5210368633270264 }, { "auxiliary_loss_clip": 0.06476564, "auxiliary_loss_mlp": 0.01269215, "balance_loss_clip": 0.06284592, "balance_loss_mlp": 0.01253623, "epoch": 0.3339245453179017, "flos": 16696250311680.0, "grad_norm": 1.8305367419144392, "language_loss": 0.81333458, "learning_rate": 3.1067852610918017e-06, "loss": 0.89079237, "num_input_tokens_seen": 119261195, "router_z_loss_clip": 1.91992188, "router_z_loss_mlp": 0.15588379, "step": 5554, "time_per_iteration": 2.574183702468872 }, { "auxiliary_loss_clip": 0.06475101, "auxiliary_loss_mlp": 0.0127637, "balance_loss_clip": 0.06283328, "balance_loss_mlp": 0.0126067, "epoch": 0.3339846685705697, "flos": 24617954839680.0, "grad_norm": 1.4718022405788482, "language_loss": 0.8169992, "learning_rate": 3.1064608494820032e-06, "loss": 0.89451385, "num_input_tokens_seen": 119282845, "router_z_loss_clip": 1.91503906, "router_z_loss_mlp": 0.15698242, "step": 5555, "time_per_iteration": 2.6037685871124268 }, { "auxiliary_loss_clip": 0.0647622, "auxiliary_loss_mlp": 0.01273469, "balance_loss_clip": 0.06286114, "balance_loss_mlp": 0.01257697, "epoch": 0.33404479182323765, "flos": 30961311534720.0, "grad_norm": 1.7289464716312226, "language_loss": 0.74871051, "learning_rate": 3.106136395915099e-06, "loss": 0.8262074, "num_input_tokens_seen": 119304430, "router_z_loss_clip": 1.89941406, "router_z_loss_mlp": 0.15777588, "step": 5556, "time_per_iteration": 2.6509604454040527 }, { "auxiliary_loss_clip": 0.06470827, "auxiliary_loss_mlp": 0.01274906, "balance_loss_clip": 0.06284504, "balance_loss_mlp": 0.01260004, "epoch": 0.3341049150759056, "flos": 23519988864000.0, "grad_norm": 1.471835905659336, "language_loss": 0.82538301, "learning_rate": 3.105811900403391e-06, "loss": 0.90284032, "num_input_tokens_seen": 119323830, "router_z_loss_clip": 1.86425781, "router_z_loss_mlp": 0.14904785, "step": 5557, "time_per_iteration": 2.608982563018799 }, { "auxiliary_loss_clip": 0.06477971, "auxiliary_loss_mlp": 0.01270996, "balance_loss_clip": 0.06288125, "balance_loss_mlp": 0.01255248, "epoch": 0.3341650383285736, "flos": 24034052862720.0, "grad_norm": 1.3830080170209686, "language_loss": 0.80415803, "learning_rate": 3.1054873629591855e-06, "loss": 0.88164771, "num_input_tokens_seen": 119346340, "router_z_loss_clip": 1.89941406, "router_z_loss_mlp": 0.15759277, "step": 5558, "time_per_iteration": 4.1984007358551025 }, { "auxiliary_loss_clip": 0.06478097, "auxiliary_loss_mlp": 0.01271414, "balance_loss_clip": 0.06285423, "balance_loss_mlp": 0.01256322, "epoch": 0.33422516158124155, "flos": 24909255959040.0, "grad_norm": 1.535333230653277, "language_loss": 0.81634545, "learning_rate": 3.105162783594788e-06, "loss": 0.89384055, "num_input_tokens_seen": 119367285, "router_z_loss_clip": 1.92285156, "router_z_loss_mlp": 0.15087891, "step": 5559, "time_per_iteration": 2.6264803409576416 }, { "auxiliary_loss_clip": 0.06471951, "auxiliary_loss_mlp": 0.01269938, "balance_loss_clip": 0.06284829, "balance_loss_mlp": 0.01254977, "epoch": 0.3342852848339095, "flos": 18339404878080.0, "grad_norm": 1.7374468965361551, "language_loss": 0.72250265, "learning_rate": 3.1048381623225074e-06, "loss": 0.79992157, "num_input_tokens_seen": 119385370, "router_z_loss_clip": 1.87304688, "router_z_loss_mlp": 0.14953613, "step": 5560, "time_per_iteration": 2.560267210006714 }, { "auxiliary_loss_clip": 0.06479803, "auxiliary_loss_mlp": 0.01272175, "balance_loss_clip": 0.06284995, "balance_loss_mlp": 0.0125607, "epoch": 0.3343454080865775, "flos": 30054690357120.0, "grad_norm": 1.3168371697102266, "language_loss": 0.75346881, "learning_rate": 3.1045134991546526e-06, "loss": 0.83098853, "num_input_tokens_seen": 119409150, "router_z_loss_clip": 1.94726562, "router_z_loss_mlp": 0.16113281, "step": 5561, "time_per_iteration": 2.6455721855163574 }, { "auxiliary_loss_clip": 0.06476374, "auxiliary_loss_mlp": 0.01276312, "balance_loss_clip": 0.06287559, "balance_loss_mlp": 0.012606, "epoch": 0.33440553133924544, "flos": 16404362213760.0, "grad_norm": 3.471445275605031, "language_loss": 0.6989271, "learning_rate": 3.1041887941035355e-06, "loss": 0.77645397, "num_input_tokens_seen": 119426475, "router_z_loss_clip": 1.88671875, "router_z_loss_mlp": 0.15722656, "step": 5562, "time_per_iteration": 2.5527899265289307 }, { "auxiliary_loss_clip": 0.06475773, "auxiliary_loss_mlp": 0.01272002, "balance_loss_clip": 0.06286091, "balance_loss_mlp": 0.01256708, "epoch": 0.3344656545919134, "flos": 24248723823360.0, "grad_norm": 1.6413564328051584, "language_loss": 0.65767598, "learning_rate": 3.1038640471814685e-06, "loss": 0.73515368, "num_input_tokens_seen": 119446900, "router_z_loss_clip": 1.89746094, "router_z_loss_mlp": 0.1529541, "step": 5563, "time_per_iteration": 2.591923236846924 }, { "auxiliary_loss_clip": 0.0647905, "auxiliary_loss_mlp": 0.01272823, "balance_loss_clip": 0.06285766, "balance_loss_mlp": 0.01255263, "epoch": 0.3345257778445814, "flos": 52130431048320.0, "grad_norm": 2.570898204355061, "language_loss": 0.7428292, "learning_rate": 3.103539258400766e-06, "loss": 0.82034791, "num_input_tokens_seen": 119470945, "router_z_loss_clip": 1.93261719, "router_z_loss_mlp": 0.17578125, "step": 5564, "time_per_iteration": 2.835411310195923 }, { "auxiliary_loss_clip": 0.06385215, "auxiliary_loss_mlp": 0.01255769, "balance_loss_clip": 0.06292079, "balance_loss_mlp": 0.01252219, "epoch": 0.33458590109724934, "flos": 68066528319360.0, "grad_norm": 0.7604117310078757, "language_loss": 0.55296195, "learning_rate": 3.103214427773745e-06, "loss": 0.62937188, "num_input_tokens_seen": 119529925, "router_z_loss_clip": 0.9296875, "router_z_loss_mlp": 0.03549194, "step": 5565, "time_per_iteration": 3.1701598167419434 }, { "auxiliary_loss_clip": 0.06472242, "auxiliary_loss_mlp": 0.01271239, "balance_loss_clip": 0.06285839, "balance_loss_mlp": 0.01256124, "epoch": 0.3346460243499173, "flos": 37423869062400.0, "grad_norm": 1.9603856452526065, "language_loss": 0.65072119, "learning_rate": 3.102889555312721e-06, "loss": 0.72815603, "num_input_tokens_seen": 119550700, "router_z_loss_clip": 1.86425781, "router_z_loss_mlp": 0.15112305, "step": 5566, "time_per_iteration": 2.692594051361084 }, { "auxiliary_loss_clip": 0.06477527, "auxiliary_loss_mlp": 0.01276665, "balance_loss_clip": 0.06289929, "balance_loss_mlp": 0.01260452, "epoch": 0.3347061476025853, "flos": 18703269233280.0, "grad_norm": 4.172190183300859, "language_loss": 0.77707225, "learning_rate": 3.102564641030016e-06, "loss": 0.8546142, "num_input_tokens_seen": 119569295, "router_z_loss_clip": 1.875, "router_z_loss_mlp": 0.16223145, "step": 5567, "time_per_iteration": 2.532371759414673 }, { "auxiliary_loss_clip": 0.06480473, "auxiliary_loss_mlp": 0.01277433, "balance_loss_clip": 0.06288108, "balance_loss_mlp": 0.01260827, "epoch": 0.3347662708552533, "flos": 13922957001600.0, "grad_norm": 2.0880575056455166, "language_loss": 0.76682091, "learning_rate": 3.102239684937949e-06, "loss": 0.84439993, "num_input_tokens_seen": 119587375, "router_z_loss_clip": 1.92285156, "router_z_loss_mlp": 0.16601562, "step": 5568, "time_per_iteration": 2.5266847610473633 }, { "auxiliary_loss_clip": 0.06480125, "auxiliary_loss_mlp": 0.01276957, "balance_loss_clip": 0.06290193, "balance_loss_mlp": 0.01260899, "epoch": 0.33482639410792125, "flos": 19755645788160.0, "grad_norm": 3.732571747317119, "language_loss": 0.71923935, "learning_rate": 3.101914687048842e-06, "loss": 0.79681015, "num_input_tokens_seen": 119604530, "router_z_loss_clip": 1.89941406, "router_z_loss_mlp": 0.16052246, "step": 5569, "time_per_iteration": 2.553825855255127 }, { "auxiliary_loss_clip": 0.06477777, "auxiliary_loss_mlp": 0.0127359, "balance_loss_clip": 0.06284451, "balance_loss_mlp": 0.01257639, "epoch": 0.3348865173605892, "flos": 16107820214400.0, "grad_norm": 1.8556351439154357, "language_loss": 0.90156722, "learning_rate": 3.10158964737502e-06, "loss": 0.97908092, "num_input_tokens_seen": 119621025, "router_z_loss_clip": 1.93066406, "router_z_loss_mlp": 0.15930176, "step": 5570, "time_per_iteration": 2.5489704608917236 }, { "auxiliary_loss_clip": 0.06475738, "auxiliary_loss_mlp": 0.01272434, "balance_loss_clip": 0.06287307, "balance_loss_mlp": 0.01257437, "epoch": 0.3349466406132572, "flos": 25015836752640.0, "grad_norm": 1.5291963471515786, "language_loss": 0.80388975, "learning_rate": 3.101264565928808e-06, "loss": 0.88137138, "num_input_tokens_seen": 119641725, "router_z_loss_clip": 1.88476562, "router_z_loss_mlp": 0.14978027, "step": 5571, "time_per_iteration": 2.593212127685547 }, { "auxiliary_loss_clip": 0.06371553, "auxiliary_loss_mlp": 0.01254204, "balance_loss_clip": 0.06279264, "balance_loss_mlp": 0.01250967, "epoch": 0.33500676386592515, "flos": 54340058413440.0, "grad_norm": 0.8543969204612137, "language_loss": 0.55767834, "learning_rate": 3.1009394427225335e-06, "loss": 0.63393593, "num_input_tokens_seen": 119693560, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.03240967, "step": 5572, "time_per_iteration": 3.163482427597046 }, { "auxiliary_loss_clip": 0.06478837, "auxiliary_loss_mlp": 0.01277239, "balance_loss_clip": 0.06290624, "balance_loss_mlp": 0.01261813, "epoch": 0.3350668871185931, "flos": 26804620915200.0, "grad_norm": 2.2057637542667634, "language_loss": 0.78627622, "learning_rate": 3.1006142777685257e-06, "loss": 0.863837, "num_input_tokens_seen": 119712935, "router_z_loss_clip": 1.8828125, "router_z_loss_mlp": 0.15423584, "step": 5573, "time_per_iteration": 2.6220974922180176 }, { "auxiliary_loss_clip": 0.06480139, "auxiliary_loss_mlp": 0.01276837, "balance_loss_clip": 0.06289296, "balance_loss_mlp": 0.01260159, "epoch": 0.3351270103712611, "flos": 33518885708160.0, "grad_norm": 2.452751306916076, "language_loss": 0.72727597, "learning_rate": 3.1002890710791133e-06, "loss": 0.80484569, "num_input_tokens_seen": 119731680, "router_z_loss_clip": 1.90722656, "router_z_loss_mlp": 0.16662598, "step": 5574, "time_per_iteration": 2.66414475440979 }, { "auxiliary_loss_clip": 0.0647327, "auxiliary_loss_mlp": 0.01275369, "balance_loss_clip": 0.06288914, "balance_loss_mlp": 0.01259908, "epoch": 0.33518713362392905, "flos": 26513613285120.0, "grad_norm": 1.6512641949181068, "language_loss": 0.88443017, "learning_rate": 3.0999638226666287e-06, "loss": 0.96191657, "num_input_tokens_seen": 119752155, "router_z_loss_clip": 1.84277344, "router_z_loss_mlp": 0.15454102, "step": 5575, "time_per_iteration": 2.6024484634399414 }, { "auxiliary_loss_clip": 0.06486277, "auxiliary_loss_mlp": 0.01274903, "balance_loss_clip": 0.06289604, "balance_loss_mlp": 0.01257856, "epoch": 0.335247256876597, "flos": 17237078490240.0, "grad_norm": 3.0504390285313288, "language_loss": 0.82639521, "learning_rate": 3.0996385325434063e-06, "loss": 0.90400702, "num_input_tokens_seen": 119769195, "router_z_loss_clip": 1.96582031, "router_z_loss_mlp": 0.17041016, "step": 5576, "time_per_iteration": 2.5493273735046387 }, { "auxiliary_loss_clip": 0.06480227, "auxiliary_loss_mlp": 0.01273296, "balance_loss_clip": 0.06286855, "balance_loss_mlp": 0.01257, "epoch": 0.335307380129265, "flos": 25636397690880.0, "grad_norm": 2.6213415804132887, "language_loss": 0.73012757, "learning_rate": 3.0993132007217806e-06, "loss": 0.80766284, "num_input_tokens_seen": 119786810, "router_z_loss_clip": 1.93457031, "router_z_loss_mlp": 0.16296387, "step": 5577, "time_per_iteration": 2.620527505874634 }, { "auxiliary_loss_clip": 0.06482758, "auxiliary_loss_mlp": 0.01271672, "balance_loss_clip": 0.06292257, "balance_loss_mlp": 0.01255496, "epoch": 0.33536750338193294, "flos": 19685765882880.0, "grad_norm": 1.5782969669112197, "language_loss": 0.81919825, "learning_rate": 3.0989878272140883e-06, "loss": 0.89674258, "num_input_tokens_seen": 119805395, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.1619873, "step": 5578, "time_per_iteration": 2.634612560272217 }, { "auxiliary_loss_clip": 0.0647154, "auxiliary_loss_mlp": 0.01273665, "balance_loss_clip": 0.06286868, "balance_loss_mlp": 0.01258251, "epoch": 0.3354276266346009, "flos": 18338482483200.0, "grad_norm": 1.8540585832074605, "language_loss": 0.72236085, "learning_rate": 3.0986624120326676e-06, "loss": 0.79981291, "num_input_tokens_seen": 119823135, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.15423584, "step": 5579, "time_per_iteration": 2.5262451171875 }, { "auxiliary_loss_clip": 0.06482735, "auxiliary_loss_mlp": 0.01280953, "balance_loss_clip": 0.06291101, "balance_loss_mlp": 0.01265837, "epoch": 0.3354877498872689, "flos": 17864389681920.0, "grad_norm": 2.0415737323279526, "language_loss": 0.81800914, "learning_rate": 3.0983369551898573e-06, "loss": 0.89564604, "num_input_tokens_seen": 119842265, "router_z_loss_clip": 1.91699219, "router_z_loss_mlp": 0.15112305, "step": 5580, "time_per_iteration": 3.9847073554992676 }, { "auxiliary_loss_clip": 0.06484006, "auxiliary_loss_mlp": 0.01275049, "balance_loss_clip": 0.06290359, "balance_loss_mlp": 0.01258896, "epoch": 0.3355478731399369, "flos": 24724703341440.0, "grad_norm": 1.7717281381657093, "language_loss": 0.7820155, "learning_rate": 3.0980114566980003e-06, "loss": 0.85960603, "num_input_tokens_seen": 119862500, "router_z_loss_clip": 1.93457031, "router_z_loss_mlp": 0.16137695, "step": 5581, "time_per_iteration": 2.612795829772949 }, { "auxiliary_loss_clip": 0.06484699, "auxiliary_loss_mlp": 0.01270916, "balance_loss_clip": 0.06288769, "balance_loss_mlp": 0.01254179, "epoch": 0.33560799639260486, "flos": 16879628972160.0, "grad_norm": 2.203502685702421, "language_loss": 0.74928892, "learning_rate": 3.0976859165694384e-06, "loss": 0.82684505, "num_input_tokens_seen": 119880160, "router_z_loss_clip": 1.95800781, "router_z_loss_mlp": 0.16748047, "step": 5582, "time_per_iteration": 2.5601541996002197 }, { "auxiliary_loss_clip": 0.06481045, "auxiliary_loss_mlp": 0.01279842, "balance_loss_clip": 0.06287386, "balance_loss_mlp": 0.01264189, "epoch": 0.3356681196452728, "flos": 18339530659200.0, "grad_norm": 1.7147469093254004, "language_loss": 0.8287335, "learning_rate": 3.0973603348165166e-06, "loss": 0.90634239, "num_input_tokens_seen": 119899040, "router_z_loss_clip": 1.93457031, "router_z_loss_mlp": 0.15637207, "step": 5583, "time_per_iteration": 2.547311544418335 }, { "auxiliary_loss_clip": 0.06476769, "auxiliary_loss_mlp": 0.01277998, "balance_loss_clip": 0.06287193, "balance_loss_mlp": 0.01262906, "epoch": 0.3357282428979408, "flos": 34759127116800.0, "grad_norm": 1.8597915286786357, "language_loss": 0.77692044, "learning_rate": 3.097034711451581e-06, "loss": 0.85446805, "num_input_tokens_seen": 119921120, "router_z_loss_clip": 1.89746094, "router_z_loss_mlp": 0.15087891, "step": 5584, "time_per_iteration": 2.685591697692871 }, { "auxiliary_loss_clip": 0.0648429, "auxiliary_loss_mlp": 0.01274524, "balance_loss_clip": 0.06290568, "balance_loss_mlp": 0.01258919, "epoch": 0.33578836615060875, "flos": 21586539427200.0, "grad_norm": 1.4953862388559949, "language_loss": 0.76405513, "learning_rate": 3.0967090464869795e-06, "loss": 0.84164327, "num_input_tokens_seen": 119940165, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.15600586, "step": 5585, "time_per_iteration": 2.578947067260742 }, { "auxiliary_loss_clip": 0.06477267, "auxiliary_loss_mlp": 0.01273857, "balance_loss_clip": 0.06290062, "balance_loss_mlp": 0.01257925, "epoch": 0.3358484894032767, "flos": 24536377290240.0, "grad_norm": 1.7641240703478607, "language_loss": 0.78165931, "learning_rate": 3.0963833399350608e-06, "loss": 0.85917056, "num_input_tokens_seen": 119959730, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.15917969, "step": 5586, "time_per_iteration": 4.024035930633545 }, { "auxiliary_loss_clip": 0.06493236, "auxiliary_loss_mlp": 0.01280008, "balance_loss_clip": 0.06295791, "balance_loss_mlp": 0.01263635, "epoch": 0.3359086126559447, "flos": 22462161793920.0, "grad_norm": 2.090051008605941, "language_loss": 0.81688428, "learning_rate": 3.0960575918081756e-06, "loss": 0.89461672, "num_input_tokens_seen": 119979315, "router_z_loss_clip": 1.97558594, "router_z_loss_mlp": 0.16351318, "step": 5587, "time_per_iteration": 2.584846258163452 }, { "auxiliary_loss_clip": 0.06471863, "auxiliary_loss_mlp": 0.01268973, "balance_loss_clip": 0.06289388, "balance_loss_mlp": 0.01254656, "epoch": 0.33596873590861265, "flos": 16549069415040.0, "grad_norm": 2.0645298486206083, "language_loss": 0.68038875, "learning_rate": 3.095731802118677e-06, "loss": 0.75779712, "num_input_tokens_seen": 119996140, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.14331055, "step": 5588, "time_per_iteration": 2.5501549243927 }, { "auxiliary_loss_clip": 0.06485827, "auxiliary_loss_mlp": 0.01272702, "balance_loss_clip": 0.06294687, "balance_loss_mlp": 0.01256167, "epoch": 0.3360288591612806, "flos": 31183864778880.0, "grad_norm": 1.7752095625127995, "language_loss": 0.70129234, "learning_rate": 3.095405970878919e-06, "loss": 0.77887768, "num_input_tokens_seen": 120017720, "router_z_loss_clip": 1.91015625, "router_z_loss_mlp": 0.16534424, "step": 5589, "time_per_iteration": 2.641336679458618 }, { "auxiliary_loss_clip": 0.0648693, "auxiliary_loss_mlp": 0.01272415, "balance_loss_clip": 0.06295897, "balance_loss_mlp": 0.01257442, "epoch": 0.3360889824139486, "flos": 23703828721920.0, "grad_norm": 1.871164920784661, "language_loss": 0.67121625, "learning_rate": 3.0950800981012567e-06, "loss": 0.7488097, "num_input_tokens_seen": 120036335, "router_z_loss_clip": 1.91113281, "router_z_loss_mlp": 0.14978027, "step": 5590, "time_per_iteration": 2.582211971282959 }, { "auxiliary_loss_clip": 0.06482991, "auxiliary_loss_mlp": 0.01276707, "balance_loss_clip": 0.06297274, "balance_loss_mlp": 0.01261282, "epoch": 0.33614910566661654, "flos": 19324207514880.0, "grad_norm": 2.162664936741354, "language_loss": 0.74253643, "learning_rate": 3.094754183798047e-06, "loss": 0.82013339, "num_input_tokens_seen": 120056120, "router_z_loss_clip": 1.85644531, "router_z_loss_mlp": 0.15429688, "step": 5591, "time_per_iteration": 2.589958906173706 }, { "auxiliary_loss_clip": 0.06479768, "auxiliary_loss_mlp": 0.01268983, "balance_loss_clip": 0.06293717, "balance_loss_mlp": 0.01253736, "epoch": 0.3362092289192845, "flos": 16477889771520.0, "grad_norm": 2.0094365805896466, "language_loss": 0.70112824, "learning_rate": 3.0944282279816493e-06, "loss": 0.77861571, "num_input_tokens_seen": 120073650, "router_z_loss_clip": 1.859375, "router_z_loss_mlp": 0.15258789, "step": 5592, "time_per_iteration": 3.973351240158081 }, { "auxiliary_loss_clip": 0.06485382, "auxiliary_loss_mlp": 0.01272221, "balance_loss_clip": 0.06298035, "balance_loss_mlp": 0.01258577, "epoch": 0.33626935217195253, "flos": 24250484759040.0, "grad_norm": 2.7236861811885325, "language_loss": 0.76888883, "learning_rate": 3.094102230664423e-06, "loss": 0.84646481, "num_input_tokens_seen": 120093260, "router_z_loss_clip": 1.875, "router_z_loss_mlp": 0.13653564, "step": 5593, "time_per_iteration": 2.6052563190460205 }, { "auxiliary_loss_clip": 0.06488584, "auxiliary_loss_mlp": 0.01276524, "balance_loss_clip": 0.06294996, "balance_loss_mlp": 0.01259466, "epoch": 0.3363294754246205, "flos": 19724814685440.0, "grad_norm": 2.945952752365954, "language_loss": 0.72596151, "learning_rate": 3.093776191858731e-06, "loss": 0.80361259, "num_input_tokens_seen": 120111830, "router_z_loss_clip": 1.93554688, "router_z_loss_mlp": 0.1706543, "step": 5594, "time_per_iteration": 2.563993215560913 }, { "auxiliary_loss_clip": 0.06488445, "auxiliary_loss_mlp": 0.01274351, "balance_loss_clip": 0.062968, "balance_loss_mlp": 0.01258353, "epoch": 0.33638959867728846, "flos": 22602005458560.0, "grad_norm": 1.7020942762555518, "language_loss": 0.80305243, "learning_rate": 3.0934501115769363e-06, "loss": 0.88068032, "num_input_tokens_seen": 120130470, "router_z_loss_clip": 1.91601562, "router_z_loss_mlp": 0.16015625, "step": 5595, "time_per_iteration": 2.5724050998687744 }, { "auxiliary_loss_clip": 0.06480554, "auxiliary_loss_mlp": 0.01269368, "balance_loss_clip": 0.06293996, "balance_loss_mlp": 0.012556, "epoch": 0.3364497219299564, "flos": 21000834587520.0, "grad_norm": 1.8601330561942815, "language_loss": 0.82126164, "learning_rate": 3.0931239898314037e-06, "loss": 0.8987608, "num_input_tokens_seen": 120150735, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.13769531, "step": 5596, "time_per_iteration": 2.5736680030822754 }, { "auxiliary_loss_clip": 0.0648275, "auxiliary_loss_mlp": 0.01274685, "balance_loss_clip": 0.06294711, "balance_loss_mlp": 0.01260463, "epoch": 0.3365098451826244, "flos": 25235664739200.0, "grad_norm": 1.5562526865913873, "language_loss": 0.76382285, "learning_rate": 3.0927978266344995e-06, "loss": 0.84139723, "num_input_tokens_seen": 120173230, "router_z_loss_clip": 1.87890625, "router_z_loss_mlp": 0.14233398, "step": 5597, "time_per_iteration": 4.0791943073272705 }, { "auxiliary_loss_clip": 0.06480817, "auxiliary_loss_mlp": 0.01270496, "balance_loss_clip": 0.06295851, "balance_loss_mlp": 0.01256549, "epoch": 0.33656996843529235, "flos": 24578612547840.0, "grad_norm": 2.3113171113561064, "language_loss": 0.79144597, "learning_rate": 3.0924716219985916e-06, "loss": 0.86895907, "num_input_tokens_seen": 120191860, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.13952637, "step": 5598, "time_per_iteration": 2.5920298099517822 }, { "auxiliary_loss_clip": 0.06493545, "auxiliary_loss_mlp": 0.01274958, "balance_loss_clip": 0.06297877, "balance_loss_mlp": 0.01259246, "epoch": 0.3366300916879603, "flos": 44101223331840.0, "grad_norm": 1.768601838631645, "language_loss": 0.65035212, "learning_rate": 3.0921453759360514e-06, "loss": 0.72803712, "num_input_tokens_seen": 120219195, "router_z_loss_clip": 1.95117188, "router_z_loss_mlp": 0.1572876, "step": 5599, "time_per_iteration": 2.7916924953460693 }, { "auxiliary_loss_clip": 0.0649709, "auxiliary_loss_mlp": 0.01274609, "balance_loss_clip": 0.06299388, "balance_loss_mlp": 0.01256942, "epoch": 0.3366902149406283, "flos": 13884746739840.0, "grad_norm": 3.92714403045386, "language_loss": 0.82460976, "learning_rate": 3.091819088459249e-06, "loss": 0.9023267, "num_input_tokens_seen": 120232950, "router_z_loss_clip": 1.97949219, "router_z_loss_mlp": 0.17675781, "step": 5600, "time_per_iteration": 2.5326619148254395 }, { "auxiliary_loss_clip": 0.06489992, "auxiliary_loss_mlp": 0.01272471, "balance_loss_clip": 0.06297128, "balance_loss_mlp": 0.01256592, "epoch": 0.33675033819329625, "flos": 16258648763520.0, "grad_norm": 2.1596008487058675, "language_loss": 0.83685744, "learning_rate": 3.0914927595805573e-06, "loss": 0.914482, "num_input_tokens_seen": 120248865, "router_z_loss_clip": 1.92773438, "router_z_loss_mlp": 0.15893555, "step": 5601, "time_per_iteration": 2.5580248832702637 }, { "auxiliary_loss_clip": 0.06475134, "auxiliary_loss_mlp": 0.01273347, "balance_loss_clip": 0.06294231, "balance_loss_mlp": 0.01259972, "epoch": 0.3368104614459642, "flos": 17061498259200.0, "grad_norm": 1.5298335375664858, "language_loss": 0.8375532, "learning_rate": 3.0911663893123507e-06, "loss": 0.91503799, "num_input_tokens_seen": 120267820, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.1338501, "step": 5602, "time_per_iteration": 2.5674664974212646 }, { "auxiliary_loss_clip": 0.06484307, "auxiliary_loss_mlp": 0.01276177, "balance_loss_clip": 0.0629451, "balance_loss_mlp": 0.01260644, "epoch": 0.3368705846986322, "flos": 17864473536000.0, "grad_norm": 2.189186347337122, "language_loss": 0.70016253, "learning_rate": 3.0908399776670048e-06, "loss": 0.77776742, "num_input_tokens_seen": 120286540, "router_z_loss_clip": 1.8984375, "router_z_loss_mlp": 0.15527344, "step": 5603, "time_per_iteration": 2.600973129272461 }, { "auxiliary_loss_clip": 0.06494288, "auxiliary_loss_mlp": 0.0127107, "balance_loss_clip": 0.06301168, "balance_loss_mlp": 0.01255442, "epoch": 0.33693070795130015, "flos": 22936086887040.0, "grad_norm": 1.533195753028559, "language_loss": 0.83788228, "learning_rate": 3.090513524656898e-06, "loss": 0.91553593, "num_input_tokens_seen": 120307305, "router_z_loss_clip": 1.93164062, "router_z_loss_mlp": 0.15637207, "step": 5604, "time_per_iteration": 2.5962941646575928 }, { "auxiliary_loss_clip": 0.06492917, "auxiliary_loss_mlp": 0.01271458, "balance_loss_clip": 0.06300566, "balance_loss_mlp": 0.01255448, "epoch": 0.3369908312039681, "flos": 22023889413120.0, "grad_norm": 1.5127138429078886, "language_loss": 0.74073082, "learning_rate": 3.090187030294409e-06, "loss": 0.81837463, "num_input_tokens_seen": 120327845, "router_z_loss_clip": 1.92382812, "router_z_loss_mlp": 0.16015625, "step": 5605, "time_per_iteration": 2.6524851322174072 }, { "auxiliary_loss_clip": 0.0649076, "auxiliary_loss_mlp": 0.01270029, "balance_loss_clip": 0.06296717, "balance_loss_mlp": 0.01254162, "epoch": 0.33705095445663613, "flos": 11806799736960.0, "grad_norm": 3.7796532517163817, "language_loss": 0.83694416, "learning_rate": 3.089860494591919e-06, "loss": 0.91455209, "num_input_tokens_seen": 120343255, "router_z_loss_clip": 1.94140625, "router_z_loss_mlp": 0.15869141, "step": 5606, "time_per_iteration": 2.6686060428619385 }, { "auxiliary_loss_clip": 0.064846, "auxiliary_loss_mlp": 0.01270175, "balance_loss_clip": 0.06294431, "balance_loss_mlp": 0.01255483, "epoch": 0.3371110777093041, "flos": 25053460035840.0, "grad_norm": 1.5264460906723056, "language_loss": 0.67934561, "learning_rate": 3.089533917561809e-06, "loss": 0.75689328, "num_input_tokens_seen": 120361745, "router_z_loss_clip": 1.90039062, "router_z_loss_mlp": 0.14691162, "step": 5607, "time_per_iteration": 2.6075053215026855 }, { "auxiliary_loss_clip": 0.06492525, "auxiliary_loss_mlp": 0.01274861, "balance_loss_clip": 0.06295887, "balance_loss_mlp": 0.0125878, "epoch": 0.33717120096197206, "flos": 26586386156160.0, "grad_norm": 1.8380165084808562, "language_loss": 0.70946878, "learning_rate": 3.089207299216464e-06, "loss": 0.78714263, "num_input_tokens_seen": 120380565, "router_z_loss_clip": 1.96972656, "router_z_loss_mlp": 0.1607666, "step": 5608, "time_per_iteration": 2.5851433277130127 }, { "auxiliary_loss_clip": 0.06485623, "auxiliary_loss_mlp": 0.01277914, "balance_loss_clip": 0.06296396, "balance_loss_mlp": 0.01262381, "epoch": 0.33723132421464, "flos": 15163911169920.0, "grad_norm": 2.2383035319313325, "language_loss": 0.79999858, "learning_rate": 3.088880639568269e-06, "loss": 0.87763393, "num_input_tokens_seen": 120399235, "router_z_loss_clip": 1.89453125, "router_z_loss_mlp": 0.15539551, "step": 5609, "time_per_iteration": 2.545842409133911 }, { "auxiliary_loss_clip": 0.06486145, "auxiliary_loss_mlp": 0.01276153, "balance_loss_clip": 0.06295109, "balance_loss_mlp": 0.01259035, "epoch": 0.337291447467308, "flos": 23442058967040.0, "grad_norm": 1.9368123921783407, "language_loss": 0.82701242, "learning_rate": 3.0885539386296114e-06, "loss": 0.90463543, "num_input_tokens_seen": 120420095, "router_z_loss_clip": 1.91210938, "router_z_loss_mlp": 0.17126465, "step": 5610, "time_per_iteration": 2.6017541885375977 }, { "auxiliary_loss_clip": 0.06481612, "auxiliary_loss_mlp": 0.01269043, "balance_loss_clip": 0.06297341, "balance_loss_mlp": 0.01253832, "epoch": 0.33735157071997596, "flos": 17243870670720.0, "grad_norm": 1.8814904277347895, "language_loss": 0.82346714, "learning_rate": 3.088227196412879e-06, "loss": 0.90097368, "num_input_tokens_seen": 120437690, "router_z_loss_clip": 1.84179688, "router_z_loss_mlp": 0.15197754, "step": 5611, "time_per_iteration": 2.550493001937866 }, { "auxiliary_loss_clip": 0.06488365, "auxiliary_loss_mlp": 0.01276201, "balance_loss_clip": 0.06297478, "balance_loss_mlp": 0.01258373, "epoch": 0.3374116939726439, "flos": 28265025726720.0, "grad_norm": 1.8104928400345974, "language_loss": 0.79947364, "learning_rate": 3.0879004129304626e-06, "loss": 0.87711924, "num_input_tokens_seen": 120459240, "router_z_loss_clip": 1.90625, "router_z_loss_mlp": 0.1784668, "step": 5612, "time_per_iteration": 2.6270129680633545 }, { "auxiliary_loss_clip": 0.06490174, "auxiliary_loss_mlp": 0.01275665, "balance_loss_clip": 0.06298314, "balance_loss_mlp": 0.0126037, "epoch": 0.3374718172253119, "flos": 35928314663040.0, "grad_norm": 2.1894459602785563, "language_loss": 0.70390344, "learning_rate": 3.087573588194753e-06, "loss": 0.78156185, "num_input_tokens_seen": 120481090, "router_z_loss_clip": 1.91796875, "router_z_loss_mlp": 0.1529541, "step": 5613, "time_per_iteration": 2.7046945095062256 }, { "auxiliary_loss_clip": 0.06484407, "auxiliary_loss_mlp": 0.01275126, "balance_loss_clip": 0.06292044, "balance_loss_mlp": 0.01259593, "epoch": 0.33753194047797985, "flos": 18192517470720.0, "grad_norm": 1.7059535366918237, "language_loss": 0.79574716, "learning_rate": 3.087246722218144e-06, "loss": 0.87334251, "num_input_tokens_seen": 120500045, "router_z_loss_clip": 1.92578125, "router_z_loss_mlp": 0.15527344, "step": 5614, "time_per_iteration": 2.5466036796569824 }, { "auxiliary_loss_clip": 0.06489199, "auxiliary_loss_mlp": 0.01275513, "balance_loss_clip": 0.06296614, "balance_loss_mlp": 0.01258693, "epoch": 0.3375920637306478, "flos": 23155621384320.0, "grad_norm": 1.6087695907160642, "language_loss": 0.91068554, "learning_rate": 3.086919815013031e-06, "loss": 0.98833269, "num_input_tokens_seen": 120521125, "router_z_loss_clip": 1.92480469, "router_z_loss_mlp": 0.16821289, "step": 5615, "time_per_iteration": 2.6017274856567383 }, { "auxiliary_loss_clip": 0.06485799, "auxiliary_loss_mlp": 0.01281579, "balance_loss_clip": 0.06297511, "balance_loss_mlp": 0.01265903, "epoch": 0.3376521869833158, "flos": 23118878568960.0, "grad_norm": 2.0652761817252996, "language_loss": 0.81468225, "learning_rate": 3.086592866591809e-06, "loss": 0.89235604, "num_input_tokens_seen": 120539180, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.15661621, "step": 5616, "time_per_iteration": 2.582998275756836 }, { "auxiliary_loss_clip": 0.06489442, "auxiliary_loss_mlp": 0.01277959, "balance_loss_clip": 0.06293836, "balance_loss_mlp": 0.01260745, "epoch": 0.33771231023598375, "flos": 19279498561920.0, "grad_norm": 2.7341951496300547, "language_loss": 0.84258652, "learning_rate": 3.0862658769668774e-06, "loss": 0.92026055, "num_input_tokens_seen": 120556280, "router_z_loss_clip": 1.95605469, "router_z_loss_mlp": 0.17224121, "step": 5617, "time_per_iteration": 2.5601320266723633 }, { "auxiliary_loss_clip": 0.06483095, "auxiliary_loss_mlp": 0.01275591, "balance_loss_clip": 0.06293988, "balance_loss_mlp": 0.01259736, "epoch": 0.3377724334886517, "flos": 18156026217600.0, "grad_norm": 1.3891527387115, "language_loss": 0.80528396, "learning_rate": 3.0859388461506343e-06, "loss": 0.88287091, "num_input_tokens_seen": 120575395, "router_z_loss_clip": 1.89160156, "router_z_loss_mlp": 0.15844727, "step": 5618, "time_per_iteration": 2.560054302215576 }, { "auxiliary_loss_clip": 0.06487982, "auxiliary_loss_mlp": 0.01277389, "balance_loss_clip": 0.06297088, "balance_loss_mlp": 0.01261272, "epoch": 0.3378325567413197, "flos": 25783159317120.0, "grad_norm": 1.7085777885735933, "language_loss": 0.71529597, "learning_rate": 3.085611774155481e-06, "loss": 0.79294968, "num_input_tokens_seen": 120596075, "router_z_loss_clip": 1.90722656, "router_z_loss_mlp": 0.16113281, "step": 5619, "time_per_iteration": 4.026709794998169 }, { "auxiliary_loss_clip": 0.06484224, "auxiliary_loss_mlp": 0.01270033, "balance_loss_clip": 0.06295041, "balance_loss_mlp": 0.01254405, "epoch": 0.3378926799939877, "flos": 21322254049920.0, "grad_norm": 4.617713461410896, "language_loss": 0.70852053, "learning_rate": 3.085284660993821e-06, "loss": 0.78606308, "num_input_tokens_seen": 120614195, "router_z_loss_clip": 1.88964844, "router_z_loss_mlp": 0.15637207, "step": 5620, "time_per_iteration": 2.5762252807617188 }, { "auxiliary_loss_clip": 0.06484361, "auxiliary_loss_mlp": 0.01274753, "balance_loss_clip": 0.06296641, "balance_loss_mlp": 0.01258695, "epoch": 0.33795280324665566, "flos": 24906991898880.0, "grad_norm": 2.2423508606538483, "language_loss": 0.68587005, "learning_rate": 3.084957506678058e-06, "loss": 0.76346117, "num_input_tokens_seen": 120634475, "router_z_loss_clip": 1.87597656, "router_z_loss_mlp": 0.16052246, "step": 5621, "time_per_iteration": 2.6259617805480957 }, { "auxiliary_loss_clip": 0.06481428, "auxiliary_loss_mlp": 0.01271978, "balance_loss_clip": 0.06295393, "balance_loss_mlp": 0.01257244, "epoch": 0.33801292649932363, "flos": 24760859178240.0, "grad_norm": 2.104984271985636, "language_loss": 0.83103859, "learning_rate": 3.0846303112205975e-06, "loss": 0.90857267, "num_input_tokens_seen": 120654980, "router_z_loss_clip": 1.86132812, "router_z_loss_mlp": 0.14746094, "step": 5622, "time_per_iteration": 2.605133295059204 }, { "auxiliary_loss_clip": 0.06481342, "auxiliary_loss_mlp": 0.01269858, "balance_loss_clip": 0.06297565, "balance_loss_mlp": 0.01255279, "epoch": 0.3380730497519916, "flos": 26731177211520.0, "grad_norm": 1.7776578492169537, "language_loss": 0.7400375, "learning_rate": 3.0843030746338464e-06, "loss": 0.81754959, "num_input_tokens_seen": 120676245, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.14562988, "step": 5623, "time_per_iteration": 2.6864264011383057 }, { "auxiliary_loss_clip": 0.06383017, "auxiliary_loss_mlp": 0.0126772, "balance_loss_clip": 0.06293075, "balance_loss_mlp": 0.01264731, "epoch": 0.33813317300465956, "flos": 70056845550720.0, "grad_norm": 0.7194037587116471, "language_loss": 0.54937714, "learning_rate": 3.083975796930215e-06, "loss": 0.62588453, "num_input_tokens_seen": 120741965, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.02987671, "step": 5624, "time_per_iteration": 3.3195137977600098 }, { "auxiliary_loss_clip": 0.06490065, "auxiliary_loss_mlp": 0.01272948, "balance_loss_clip": 0.06297931, "balance_loss_mlp": 0.01257618, "epoch": 0.3381932962573275, "flos": 24104142403200.0, "grad_norm": 1.9825966509000437, "language_loss": 0.72889435, "learning_rate": 3.083648478122111e-06, "loss": 0.80652452, "num_input_tokens_seen": 120760410, "router_z_loss_clip": 1.921875, "router_z_loss_mlp": 0.15313721, "step": 5625, "time_per_iteration": 4.161866903305054 }, { "auxiliary_loss_clip": 0.06490991, "auxiliary_loss_mlp": 0.01272559, "balance_loss_clip": 0.06296539, "balance_loss_mlp": 0.01256144, "epoch": 0.3382534195099955, "flos": 19283775120000.0, "grad_norm": 1.6909279374621082, "language_loss": 0.71497899, "learning_rate": 3.0833211182219497e-06, "loss": 0.79261446, "num_input_tokens_seen": 120777705, "router_z_loss_clip": 1.9453125, "router_z_loss_mlp": 0.1640625, "step": 5626, "time_per_iteration": 2.574187755584717 }, { "auxiliary_loss_clip": 0.06479674, "auxiliary_loss_mlp": 0.01267832, "balance_loss_clip": 0.06295942, "balance_loss_mlp": 0.01252466, "epoch": 0.33831354276266346, "flos": 25232897554560.0, "grad_norm": 1.588145065797496, "language_loss": 0.81349653, "learning_rate": 3.0829937172421425e-06, "loss": 0.89097154, "num_input_tokens_seen": 120798660, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.15356445, "step": 5627, "time_per_iteration": 2.620120048522949 }, { "auxiliary_loss_clip": 0.06489214, "auxiliary_loss_mlp": 0.01269524, "balance_loss_clip": 0.06295329, "balance_loss_mlp": 0.01253347, "epoch": 0.3383736660153314, "flos": 23118627006720.0, "grad_norm": 2.6488135081996593, "language_loss": 0.80403435, "learning_rate": 3.0826662751951055e-06, "loss": 0.88162172, "num_input_tokens_seen": 120816705, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.16186523, "step": 5628, "time_per_iteration": 2.5842061042785645 }, { "auxiliary_loss_clip": 0.06486882, "auxiliary_loss_mlp": 0.01269932, "balance_loss_clip": 0.06295786, "balance_loss_mlp": 0.01254054, "epoch": 0.3384337892679994, "flos": 23483874954240.0, "grad_norm": 2.1714396326677563, "language_loss": 0.77453506, "learning_rate": 3.082338792093254e-06, "loss": 0.85210323, "num_input_tokens_seen": 120835375, "router_z_loss_clip": 1.91113281, "router_z_loss_mlp": 0.15881348, "step": 5629, "time_per_iteration": 2.5915367603302 }, { "auxiliary_loss_clip": 0.06488989, "auxiliary_loss_mlp": 0.01271768, "balance_loss_clip": 0.06295622, "balance_loss_mlp": 0.01255436, "epoch": 0.33849391252066735, "flos": 19431626849280.0, "grad_norm": 2.196409210349484, "language_loss": 0.85744321, "learning_rate": 3.0820112679490074e-06, "loss": 0.93505079, "num_input_tokens_seen": 120854260, "router_z_loss_clip": 1.93457031, "router_z_loss_mlp": 0.16333008, "step": 5630, "time_per_iteration": 2.5804789066314697 }, { "auxiliary_loss_clip": 0.06486699, "auxiliary_loss_mlp": 0.01270766, "balance_loss_clip": 0.06297787, "balance_loss_mlp": 0.012559, "epoch": 0.3385540357733353, "flos": 21070462930560.0, "grad_norm": 1.948350343399069, "language_loss": 0.72386599, "learning_rate": 3.0816837027747857e-06, "loss": 0.8014406, "num_input_tokens_seen": 120871590, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.1484375, "step": 5631, "time_per_iteration": 4.007920026779175 }, { "auxiliary_loss_clip": 0.06368053, "auxiliary_loss_mlp": 0.01258441, "balance_loss_clip": 0.06278017, "balance_loss_mlp": 0.01255403, "epoch": 0.3386141590260033, "flos": 69224772908160.0, "grad_norm": 0.8310268185043918, "language_loss": 0.56143153, "learning_rate": 3.0813560965830084e-06, "loss": 0.63769656, "num_input_tokens_seen": 120925550, "router_z_loss_clip": 0.89892578, "router_z_loss_mlp": 0.03036499, "step": 5632, "time_per_iteration": 3.2461965084075928 }, { "auxiliary_loss_clip": 0.06485637, "auxiliary_loss_mlp": 0.01271772, "balance_loss_clip": 0.0629566, "balance_loss_mlp": 0.01256287, "epoch": 0.3386742822786713, "flos": 25526420807040.0, "grad_norm": 1.5439314295456845, "language_loss": 0.80568409, "learning_rate": 3.0810284493861005e-06, "loss": 0.88325816, "num_input_tokens_seen": 120947620, "router_z_loss_clip": 1.90039062, "router_z_loss_mlp": 0.15466309, "step": 5633, "time_per_iteration": 2.612859010696411 }, { "auxiliary_loss_clip": 0.06485379, "auxiliary_loss_mlp": 0.01272586, "balance_loss_clip": 0.06296323, "balance_loss_mlp": 0.01257482, "epoch": 0.33873440553133927, "flos": 23629881893760.0, "grad_norm": 4.426244544868653, "language_loss": 0.59050101, "learning_rate": 3.0807007611964855e-06, "loss": 0.66808069, "num_input_tokens_seen": 120965205, "router_z_loss_clip": 1.88867188, "router_z_loss_mlp": 0.15093994, "step": 5634, "time_per_iteration": 2.5900464057922363 }, { "auxiliary_loss_clip": 0.06487395, "auxiliary_loss_mlp": 0.01270148, "balance_loss_clip": 0.06298514, "balance_loss_mlp": 0.01255617, "epoch": 0.33879452878400723, "flos": 17094006443520.0, "grad_norm": 1.7207484092027976, "language_loss": 0.92617261, "learning_rate": 3.080373032026589e-06, "loss": 1.00374794, "num_input_tokens_seen": 120983560, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.14520264, "step": 5635, "time_per_iteration": 2.570570945739746 }, { "auxiliary_loss_clip": 0.06484933, "auxiliary_loss_mlp": 0.01273578, "balance_loss_clip": 0.06302373, "balance_loss_mlp": 0.01258844, "epoch": 0.3388546520366752, "flos": 15747477730560.0, "grad_norm": 1.7159406534002708, "language_loss": 0.759422, "learning_rate": 3.0800452618888386e-06, "loss": 0.83700716, "num_input_tokens_seen": 121001400, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.14746094, "step": 5636, "time_per_iteration": 2.5541365146636963 }, { "auxiliary_loss_clip": 0.06486683, "auxiliary_loss_mlp": 0.01272751, "balance_loss_clip": 0.06299795, "balance_loss_mlp": 0.01256658, "epoch": 0.33891477528934316, "flos": 22425251270400.0, "grad_norm": 1.6440282720265222, "language_loss": 0.8344391, "learning_rate": 3.0797174507956637e-06, "loss": 0.9120335, "num_input_tokens_seen": 121021760, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.16088867, "step": 5637, "time_per_iteration": 4.039005756378174 }, { "auxiliary_loss_clip": 0.06490925, "auxiliary_loss_mlp": 0.01273298, "balance_loss_clip": 0.06299727, "balance_loss_mlp": 0.01256012, "epoch": 0.3389748985420111, "flos": 17280571559040.0, "grad_norm": 1.850511221545264, "language_loss": 0.70521456, "learning_rate": 3.079389598759495e-06, "loss": 0.78285682, "num_input_tokens_seen": 121041070, "router_z_loss_clip": 1.91015625, "router_z_loss_mlp": 0.17297363, "step": 5638, "time_per_iteration": 2.541952133178711 }, { "auxiliary_loss_clip": 0.06488566, "auxiliary_loss_mlp": 0.01272556, "balance_loss_clip": 0.062987, "balance_loss_mlp": 0.01256963, "epoch": 0.3390350217946791, "flos": 27752261466240.0, "grad_norm": 1.6364888928482642, "language_loss": 0.81186247, "learning_rate": 3.079061705792765e-06, "loss": 0.88947368, "num_input_tokens_seen": 121060890, "router_z_loss_clip": 1.90039062, "router_z_loss_mlp": 0.15588379, "step": 5639, "time_per_iteration": 2.6146185398101807 }, { "auxiliary_loss_clip": 0.06493299, "auxiliary_loss_mlp": 0.01276089, "balance_loss_clip": 0.06297913, "balance_loss_mlp": 0.01259435, "epoch": 0.33909514504734706, "flos": 20346088383360.0, "grad_norm": 2.023584344869204, "language_loss": 0.68249255, "learning_rate": 3.078733771907907e-06, "loss": 0.76018643, "num_input_tokens_seen": 121079135, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 0.16662598, "step": 5640, "time_per_iteration": 2.5521926879882812 }, { "auxiliary_loss_clip": 0.06486219, "auxiliary_loss_mlp": 0.01276407, "balance_loss_clip": 0.06297457, "balance_loss_mlp": 0.01259688, "epoch": 0.339155268300015, "flos": 14835322183680.0, "grad_norm": 1.7044847371828726, "language_loss": 0.70262015, "learning_rate": 3.0784057971173554e-06, "loss": 0.78024638, "num_input_tokens_seen": 121097685, "router_z_loss_clip": 1.88769531, "router_z_loss_mlp": 0.16729736, "step": 5641, "time_per_iteration": 2.573333501815796 }, { "auxiliary_loss_clip": 0.06490307, "auxiliary_loss_mlp": 0.01277474, "balance_loss_clip": 0.06299742, "balance_loss_mlp": 0.01261619, "epoch": 0.339215391552683, "flos": 26075173196160.0, "grad_norm": 2.54839720413013, "language_loss": 0.88034952, "learning_rate": 3.0780777814335483e-06, "loss": 0.95802736, "num_input_tokens_seen": 121115640, "router_z_loss_clip": 1.90820312, "router_z_loss_mlp": 0.15844727, "step": 5642, "time_per_iteration": 2.6076672077178955 }, { "auxiliary_loss_clip": 0.06477894, "auxiliary_loss_mlp": 0.01271877, "balance_loss_clip": 0.0629752, "balance_loss_mlp": 0.01258036, "epoch": 0.33927551480535095, "flos": 14579967265920.0, "grad_norm": 1.8701501965559904, "language_loss": 0.84321201, "learning_rate": 3.077749724868924e-06, "loss": 0.92070973, "num_input_tokens_seen": 121132485, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.13848877, "step": 5643, "time_per_iteration": 2.551260471343994 }, { "auxiliary_loss_clip": 0.06485562, "auxiliary_loss_mlp": 0.01272379, "balance_loss_clip": 0.06298535, "balance_loss_mlp": 0.01257096, "epoch": 0.3393356380580189, "flos": 23812380086400.0, "grad_norm": 1.5820330390759958, "language_loss": 0.77293372, "learning_rate": 3.077421627435922e-06, "loss": 0.8505131, "num_input_tokens_seen": 121152935, "router_z_loss_clip": 1.87011719, "router_z_loss_mlp": 0.15270996, "step": 5644, "time_per_iteration": 2.585797071456909 }, { "auxiliary_loss_clip": 0.06487767, "auxiliary_loss_mlp": 0.01278259, "balance_loss_clip": 0.0629911, "balance_loss_mlp": 0.01261367, "epoch": 0.3393957613106869, "flos": 17353637919360.0, "grad_norm": 2.7701019026223266, "language_loss": 0.63666201, "learning_rate": 3.0770934891469832e-06, "loss": 0.71432227, "num_input_tokens_seen": 121169835, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.16894531, "step": 5645, "time_per_iteration": 2.5544209480285645 }, { "auxiliary_loss_clip": 0.06479406, "auxiliary_loss_mlp": 0.01269995, "balance_loss_clip": 0.06294197, "balance_loss_mlp": 0.01254783, "epoch": 0.3394558845633549, "flos": 28440647884800.0, "grad_norm": 2.3875880322359997, "language_loss": 0.77171814, "learning_rate": 3.076765310014552e-06, "loss": 0.84921211, "num_input_tokens_seen": 121190290, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.15209961, "step": 5646, "time_per_iteration": 2.616668224334717 }, { "auxiliary_loss_clip": 0.06489696, "auxiliary_loss_mlp": 0.01274633, "balance_loss_clip": 0.06294516, "balance_loss_mlp": 0.0125829, "epoch": 0.33951600781602287, "flos": 22092804996480.0, "grad_norm": 3.564214559574854, "language_loss": 0.79515636, "learning_rate": 3.0764370900510727e-06, "loss": 0.87279969, "num_input_tokens_seen": 121209060, "router_z_loss_clip": 1.95410156, "router_z_loss_mlp": 0.16345215, "step": 5647, "time_per_iteration": 2.571650743484497 }, { "auxiliary_loss_clip": 0.06486028, "auxiliary_loss_mlp": 0.0127083, "balance_loss_clip": 0.06294666, "balance_loss_mlp": 0.01254641, "epoch": 0.33957613106869083, "flos": 23885027176320.0, "grad_norm": 3.083746523060201, "language_loss": 0.7754817, "learning_rate": 3.0761088292689904e-06, "loss": 0.85305023, "num_input_tokens_seen": 121227480, "router_z_loss_clip": 1.91601562, "router_z_loss_mlp": 0.1619873, "step": 5648, "time_per_iteration": 2.6102161407470703 }, { "auxiliary_loss_clip": 0.06374733, "auxiliary_loss_mlp": 0.01257258, "balance_loss_clip": 0.06285588, "balance_loss_mlp": 0.0125431, "epoch": 0.3396362543213588, "flos": 71264411066880.0, "grad_norm": 0.762643470149736, "language_loss": 0.56018162, "learning_rate": 3.075780527680754e-06, "loss": 0.63650155, "num_input_tokens_seen": 121291305, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.02946472, "step": 5649, "time_per_iteration": 3.222139835357666 }, { "auxiliary_loss_clip": 0.06484834, "auxiliary_loss_mlp": 0.01276521, "balance_loss_clip": 0.06294157, "balance_loss_mlp": 0.0126069, "epoch": 0.33969637757402676, "flos": 25928746986240.0, "grad_norm": 2.152039775406335, "language_loss": 0.86044991, "learning_rate": 3.0754521852988117e-06, "loss": 0.93806344, "num_input_tokens_seen": 121312740, "router_z_loss_clip": 1.90820312, "router_z_loss_mlp": 0.1583252, "step": 5650, "time_per_iteration": 2.6282401084899902 }, { "auxiliary_loss_clip": 0.06483991, "auxiliary_loss_mlp": 0.01272989, "balance_loss_clip": 0.06296802, "balance_loss_mlp": 0.01258052, "epoch": 0.33975650082669473, "flos": 35270382003840.0, "grad_norm": 1.794320877782522, "language_loss": 0.71284163, "learning_rate": 3.0751238021356152e-06, "loss": 0.79041147, "num_input_tokens_seen": 121334220, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.14935303, "step": 5651, "time_per_iteration": 2.7126715183258057 }, { "auxiliary_loss_clip": 0.06484324, "auxiliary_loss_mlp": 0.01272396, "balance_loss_clip": 0.06296952, "balance_loss_mlp": 0.01256732, "epoch": 0.3398166240793627, "flos": 16651373650560.0, "grad_norm": 1.7920829477021765, "language_loss": 0.81956565, "learning_rate": 3.074795378203616e-06, "loss": 0.89713287, "num_input_tokens_seen": 121351870, "router_z_loss_clip": 1.87304688, "router_z_loss_mlp": 0.15661621, "step": 5652, "time_per_iteration": 2.5529959201812744 }, { "auxiliary_loss_clip": 0.06491646, "auxiliary_loss_mlp": 0.01277898, "balance_loss_clip": 0.06299447, "balance_loss_mlp": 0.01261411, "epoch": 0.33987674733203066, "flos": 24069244377600.0, "grad_norm": 3.1549295720243156, "language_loss": 0.77888453, "learning_rate": 3.0744669135152685e-06, "loss": 0.85657996, "num_input_tokens_seen": 121373400, "router_z_loss_clip": 1.92285156, "router_z_loss_mlp": 0.16467285, "step": 5653, "time_per_iteration": 2.761587619781494 }, { "auxiliary_loss_clip": 0.06486392, "auxiliary_loss_mlp": 0.01267801, "balance_loss_clip": 0.06298899, "balance_loss_mlp": 0.01253317, "epoch": 0.3399368705846986, "flos": 13253955603840.0, "grad_norm": 2.873176476745163, "language_loss": 0.86260974, "learning_rate": 3.0741384080830278e-06, "loss": 0.94015169, "num_input_tokens_seen": 121385225, "router_z_loss_clip": 1.87597656, "router_z_loss_mlp": 0.14477539, "step": 5654, "time_per_iteration": 2.5830373764038086 }, { "auxiliary_loss_clip": 0.06480758, "auxiliary_loss_mlp": 0.01274749, "balance_loss_clip": 0.06293487, "balance_loss_mlp": 0.01258561, "epoch": 0.3399969938373666, "flos": 27019585365120.0, "grad_norm": 2.252900641091977, "language_loss": 0.66118801, "learning_rate": 3.073809861919351e-06, "loss": 0.73874313, "num_input_tokens_seen": 121404735, "router_z_loss_clip": 1.87304688, "router_z_loss_mlp": 0.16186523, "step": 5655, "time_per_iteration": 2.6072325706481934 }, { "auxiliary_loss_clip": 0.06483847, "auxiliary_loss_mlp": 0.01271788, "balance_loss_clip": 0.0629654, "balance_loss_mlp": 0.01257292, "epoch": 0.34005711709003456, "flos": 28557920073600.0, "grad_norm": 1.4249399336313602, "language_loss": 0.76815391, "learning_rate": 3.073481275036697e-06, "loss": 0.84571028, "num_input_tokens_seen": 121426780, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.14489746, "step": 5656, "time_per_iteration": 2.631497383117676 }, { "auxiliary_loss_clip": 0.06490952, "auxiliary_loss_mlp": 0.01275399, "balance_loss_clip": 0.06297595, "balance_loss_mlp": 0.01259938, "epoch": 0.3401172403427025, "flos": 21623533804800.0, "grad_norm": 2.2145155227145685, "language_loss": 0.8324104, "learning_rate": 3.073152647447525e-06, "loss": 0.91007388, "num_input_tokens_seen": 121447245, "router_z_loss_clip": 1.93359375, "router_z_loss_mlp": 0.15478516, "step": 5657, "time_per_iteration": 2.5944173336029053 }, { "auxiliary_loss_clip": 0.06486299, "auxiliary_loss_mlp": 0.01274325, "balance_loss_clip": 0.06299905, "balance_loss_mlp": 0.01259602, "epoch": 0.3401773635953705, "flos": 25893010419840.0, "grad_norm": 2.0544500751910246, "language_loss": 0.86252356, "learning_rate": 3.0728239791642976e-06, "loss": 0.94012976, "num_input_tokens_seen": 121468165, "router_z_loss_clip": 1.86425781, "router_z_loss_mlp": 0.1472168, "step": 5658, "time_per_iteration": 4.0168232917785645 }, { "auxiliary_loss_clip": 0.0638282, "auxiliary_loss_mlp": 0.01262118, "balance_loss_clip": 0.06292816, "balance_loss_mlp": 0.01258923, "epoch": 0.3402374868480385, "flos": 65527737459840.0, "grad_norm": 0.7930828865913722, "language_loss": 0.59828568, "learning_rate": 3.072495270199477e-06, "loss": 0.67473507, "num_input_tokens_seen": 121523795, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.03198242, "step": 5659, "time_per_iteration": 3.1494204998016357 }, { "auxiliary_loss_clip": 0.0648205, "auxiliary_loss_mlp": 0.01273866, "balance_loss_clip": 0.06299508, "balance_loss_mlp": 0.01259131, "epoch": 0.34029761010070647, "flos": 24067357660800.0, "grad_norm": 3.2756305998550443, "language_loss": 0.68602294, "learning_rate": 3.0721665205655284e-06, "loss": 0.76358205, "num_input_tokens_seen": 121542950, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.14715576, "step": 5660, "time_per_iteration": 2.5925252437591553 }, { "auxiliary_loss_clip": 0.06484757, "auxiliary_loss_mlp": 0.01275449, "balance_loss_clip": 0.06299335, "balance_loss_mlp": 0.01260202, "epoch": 0.34035773335337444, "flos": 27607093067520.0, "grad_norm": 1.9997922958112986, "language_loss": 0.6810087, "learning_rate": 3.071837730274918e-06, "loss": 0.75861079, "num_input_tokens_seen": 121562765, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.15246582, "step": 5661, "time_per_iteration": 2.6331095695495605 }, { "auxiliary_loss_clip": 0.06480978, "auxiliary_loss_mlp": 0.0127153, "balance_loss_clip": 0.06298862, "balance_loss_mlp": 0.01257195, "epoch": 0.3404178566060424, "flos": 20818923373440.0, "grad_norm": 2.1576885723049286, "language_loss": 0.79698312, "learning_rate": 3.071508899340113e-06, "loss": 0.87450814, "num_input_tokens_seen": 121581610, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.14337158, "step": 5662, "time_per_iteration": 2.5718228816986084 }, { "auxiliary_loss_clip": 0.06484535, "auxiliary_loss_mlp": 0.01270316, "balance_loss_clip": 0.06298444, "balance_loss_mlp": 0.01254425, "epoch": 0.34047797985871037, "flos": 26840818679040.0, "grad_norm": 2.1684122106659105, "language_loss": 0.73825908, "learning_rate": 3.0711800277735833e-06, "loss": 0.81580758, "num_input_tokens_seen": 121601885, "router_z_loss_clip": 1.86328125, "router_z_loss_mlp": 0.15905762, "step": 5663, "time_per_iteration": 2.6219382286071777 }, { "auxiliary_loss_clip": 0.0648023, "auxiliary_loss_mlp": 0.01272068, "balance_loss_clip": 0.06298438, "balance_loss_mlp": 0.01257578, "epoch": 0.34053810311137833, "flos": 19688742702720.0, "grad_norm": 1.512302378577422, "language_loss": 0.86500049, "learning_rate": 3.0708511155877997e-06, "loss": 0.94252354, "num_input_tokens_seen": 121621335, "router_z_loss_clip": 1.81933594, "router_z_loss_mlp": 0.1449585, "step": 5664, "time_per_iteration": 2.577171802520752 }, { "auxiliary_loss_clip": 0.06489517, "auxiliary_loss_mlp": 0.01272704, "balance_loss_clip": 0.06301091, "balance_loss_mlp": 0.01258411, "epoch": 0.3405982263640463, "flos": 21732169023360.0, "grad_norm": 1.8763391951540154, "language_loss": 0.69172382, "learning_rate": 3.070522162795235e-06, "loss": 0.76934606, "num_input_tokens_seen": 121641310, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.14300537, "step": 5665, "time_per_iteration": 4.02979040145874 }, { "auxiliary_loss_clip": 0.06490809, "auxiliary_loss_mlp": 0.01269523, "balance_loss_clip": 0.06300249, "balance_loss_mlp": 0.01254157, "epoch": 0.34065834961671426, "flos": 18047600634240.0, "grad_norm": 2.7287761852633765, "language_loss": 0.73136032, "learning_rate": 3.0701931694083626e-06, "loss": 0.80896366, "num_input_tokens_seen": 121659625, "router_z_loss_clip": 1.90527344, "router_z_loss_mlp": 0.15356445, "step": 5666, "time_per_iteration": 2.551133155822754 }, { "auxiliary_loss_clip": 0.06490707, "auxiliary_loss_mlp": 0.0127047, "balance_loss_clip": 0.06301247, "balance_loss_mlp": 0.01256028, "epoch": 0.3407184728693822, "flos": 21403705818240.0, "grad_norm": 2.4377413415995512, "language_loss": 0.73341501, "learning_rate": 3.0698641354396576e-06, "loss": 0.81102681, "num_input_tokens_seen": 121679205, "router_z_loss_clip": 1.89648438, "router_z_loss_mlp": 0.14422607, "step": 5667, "time_per_iteration": 2.5709879398345947 }, { "auxiliary_loss_clip": 0.06383882, "auxiliary_loss_mlp": 0.01268564, "balance_loss_clip": 0.06294465, "balance_loss_mlp": 0.01265539, "epoch": 0.3407785961220502, "flos": 68709352515840.0, "grad_norm": 1.119670096581563, "language_loss": 0.63357913, "learning_rate": 3.069535060901597e-06, "loss": 0.71010363, "num_input_tokens_seen": 121751085, "router_z_loss_clip": 0.89160156, "router_z_loss_mlp": 0.03022766, "step": 5668, "time_per_iteration": 3.3519535064697266 }, { "auxiliary_loss_clip": 0.06489752, "auxiliary_loss_mlp": 0.01270381, "balance_loss_clip": 0.06301052, "balance_loss_mlp": 0.01255659, "epoch": 0.34083871937471816, "flos": 14069634773760.0, "grad_norm": 2.146636482715249, "language_loss": 0.73036999, "learning_rate": 3.0692059458066596e-06, "loss": 0.8079713, "num_input_tokens_seen": 121768565, "router_z_loss_clip": 1.88671875, "router_z_loss_mlp": 0.1472168, "step": 5669, "time_per_iteration": 2.5437793731689453 }, { "auxiliary_loss_clip": 0.06491489, "auxiliary_loss_mlp": 0.01271907, "balance_loss_clip": 0.06300973, "balance_loss_mlp": 0.01256827, "epoch": 0.3408988426273861, "flos": 17089981447680.0, "grad_norm": 1.7773511806084101, "language_loss": 0.80456698, "learning_rate": 3.0688767901673265e-06, "loss": 0.88220096, "num_input_tokens_seen": 121784925, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.15087891, "step": 5670, "time_per_iteration": 2.54552960395813 }, { "auxiliary_loss_clip": 0.06496339, "auxiliary_loss_mlp": 0.01271768, "balance_loss_clip": 0.06305361, "balance_loss_mlp": 0.01257129, "epoch": 0.3409589658800541, "flos": 24031411459200.0, "grad_norm": 2.2696989965454053, "language_loss": 0.77242184, "learning_rate": 3.068547593996078e-06, "loss": 0.8501029, "num_input_tokens_seen": 121804425, "router_z_loss_clip": 1.91210938, "router_z_loss_mlp": 0.1463623, "step": 5671, "time_per_iteration": 4.0462729930877686 }, { "auxiliary_loss_clip": 0.0649182, "auxiliary_loss_mlp": 0.01272404, "balance_loss_clip": 0.06304259, "balance_loss_mlp": 0.01256299, "epoch": 0.34101908913272205, "flos": 21148350900480.0, "grad_norm": 2.640977977547872, "language_loss": 0.74543387, "learning_rate": 3.0682183573053974e-06, "loss": 0.82307613, "num_input_tokens_seen": 121825145, "router_z_loss_clip": 1.87597656, "router_z_loss_mlp": 0.16101074, "step": 5672, "time_per_iteration": 2.5880980491638184 }, { "auxiliary_loss_clip": 0.06489138, "auxiliary_loss_mlp": 0.01270245, "balance_loss_clip": 0.06299846, "balance_loss_mlp": 0.01255618, "epoch": 0.3410792123853901, "flos": 15706835700480.0, "grad_norm": 1.7999589931085238, "language_loss": 0.74121678, "learning_rate": 3.06788908010777e-06, "loss": 0.81881058, "num_input_tokens_seen": 121842185, "router_z_loss_clip": 1.89160156, "router_z_loss_mlp": 0.14624023, "step": 5673, "time_per_iteration": 2.526047468185425 }, { "auxiliary_loss_clip": 0.06483635, "auxiliary_loss_mlp": 0.0127505, "balance_loss_clip": 0.0630032, "balance_loss_mlp": 0.01260054, "epoch": 0.34113933563805804, "flos": 23042122629120.0, "grad_norm": 1.838579331568315, "language_loss": 0.80221277, "learning_rate": 3.067559762415682e-06, "loss": 0.8797996, "num_input_tokens_seen": 121862260, "router_z_loss_clip": 1.83203125, "router_z_loss_mlp": 0.14996338, "step": 5674, "time_per_iteration": 2.5842182636260986 }, { "auxiliary_loss_clip": 0.06382041, "auxiliary_loss_mlp": 0.0126069, "balance_loss_clip": 0.06293058, "balance_loss_mlp": 0.01257227, "epoch": 0.341199458890726, "flos": 69631878769920.0, "grad_norm": 0.7724465603684811, "language_loss": 0.56075335, "learning_rate": 3.0672304042416198e-06, "loss": 0.63718063, "num_input_tokens_seen": 121923560, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.034729, "step": 5675, "time_per_iteration": 3.357722759246826 }, { "auxiliary_loss_clip": 0.0648544, "auxiliary_loss_mlp": 0.01272125, "balance_loss_clip": 0.06304088, "balance_loss_mlp": 0.01258565, "epoch": 0.34125958214339397, "flos": 22352939596800.0, "grad_norm": 1.7029454266089192, "language_loss": 0.79994524, "learning_rate": 3.0669010055980734e-06, "loss": 0.87752092, "num_input_tokens_seen": 121943515, "router_z_loss_clip": 1.81347656, "router_z_loss_mlp": 0.13555908, "step": 5676, "time_per_iteration": 4.085116624832153 }, { "auxiliary_loss_clip": 0.06486701, "auxiliary_loss_mlp": 0.01273419, "balance_loss_clip": 0.06298567, "balance_loss_mlp": 0.01259037, "epoch": 0.34131970539606193, "flos": 21878427525120.0, "grad_norm": 2.7068208230945734, "language_loss": 0.86553252, "learning_rate": 3.0665715664975357e-06, "loss": 0.94313371, "num_input_tokens_seen": 121962540, "router_z_loss_clip": 1.88085938, "router_z_loss_mlp": 0.14385986, "step": 5677, "time_per_iteration": 2.5793004035949707 }, { "auxiliary_loss_clip": 0.06482373, "auxiliary_loss_mlp": 0.01273331, "balance_loss_clip": 0.0629674, "balance_loss_mlp": 0.01258156, "epoch": 0.3413798286487299, "flos": 24942560757120.0, "grad_norm": 1.865389533437916, "language_loss": 0.80053329, "learning_rate": 3.0662420869524966e-06, "loss": 0.87809026, "num_input_tokens_seen": 121979830, "router_z_loss_clip": 1.85644531, "router_z_loss_mlp": 0.15185547, "step": 5678, "time_per_iteration": 2.577228307723999 }, { "auxiliary_loss_clip": 0.06489071, "auxiliary_loss_mlp": 0.01268794, "balance_loss_clip": 0.06299957, "balance_loss_mlp": 0.01254644, "epoch": 0.34143995190139786, "flos": 25381420116480.0, "grad_norm": 2.099797022577286, "language_loss": 0.75605202, "learning_rate": 3.0659125669754506e-06, "loss": 0.83363068, "num_input_tokens_seen": 121999055, "router_z_loss_clip": 1.89355469, "router_z_loss_mlp": 0.14154053, "step": 5679, "time_per_iteration": 2.6096832752227783 }, { "auxiliary_loss_clip": 0.0636747, "auxiliary_loss_mlp": 0.01255254, "balance_loss_clip": 0.06278294, "balance_loss_mlp": 0.01251478, "epoch": 0.34150007515406583, "flos": 67804785763200.0, "grad_norm": 1.3049002224764792, "language_loss": 0.5934428, "learning_rate": 3.0655830065788923e-06, "loss": 0.66966999, "num_input_tokens_seen": 122067015, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.03768921, "step": 5680, "time_per_iteration": 3.2796730995178223 }, { "auxiliary_loss_clip": 0.06485517, "auxiliary_loss_mlp": 0.01272144, "balance_loss_clip": 0.06300998, "balance_loss_mlp": 0.01257362, "epoch": 0.3415601984067338, "flos": 20308548954240.0, "grad_norm": 1.858390424973034, "language_loss": 0.72204095, "learning_rate": 3.0652534057753206e-06, "loss": 0.79961753, "num_input_tokens_seen": 122085295, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.14782715, "step": 5681, "time_per_iteration": 2.556135654449463 }, { "auxiliary_loss_clip": 0.06481317, "auxiliary_loss_mlp": 0.01273863, "balance_loss_clip": 0.06297427, "balance_loss_mlp": 0.01259642, "epoch": 0.34162032165940176, "flos": 26038346526720.0, "grad_norm": 1.8973753101155775, "language_loss": 0.71578419, "learning_rate": 3.064923764577233e-06, "loss": 0.79333597, "num_input_tokens_seen": 122104020, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.14215088, "step": 5682, "time_per_iteration": 2.611036777496338 }, { "auxiliary_loss_clip": 0.0648836, "auxiliary_loss_mlp": 0.01275185, "balance_loss_clip": 0.06300819, "balance_loss_mlp": 0.01260332, "epoch": 0.3416804449120697, "flos": 28810843223040.0, "grad_norm": 3.910599628838324, "language_loss": 0.84371239, "learning_rate": 3.0645940829971295e-06, "loss": 0.92134792, "num_input_tokens_seen": 122125080, "router_z_loss_clip": 1.875, "router_z_loss_mlp": 0.1484375, "step": 5683, "time_per_iteration": 2.634289264678955 }, { "auxiliary_loss_clip": 0.06495963, "auxiliary_loss_mlp": 0.01271032, "balance_loss_clip": 0.06307532, "balance_loss_mlp": 0.01256411, "epoch": 0.3417405681647377, "flos": 22608210660480.0, "grad_norm": 2.273707488042629, "language_loss": 0.71111608, "learning_rate": 3.0642643610475116e-06, "loss": 0.78878599, "num_input_tokens_seen": 122146350, "router_z_loss_clip": 1.88476562, "router_z_loss_mlp": 0.14624023, "step": 5684, "time_per_iteration": 2.6007437705993652 }, { "auxiliary_loss_clip": 0.06483015, "auxiliary_loss_mlp": 0.01271048, "balance_loss_clip": 0.06300201, "balance_loss_mlp": 0.01257846, "epoch": 0.34180069141740566, "flos": 24722942405760.0, "grad_norm": 1.6801779640995544, "language_loss": 0.75162864, "learning_rate": 3.0639345987408823e-06, "loss": 0.82916927, "num_input_tokens_seen": 122168085, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.13226318, "step": 5685, "time_per_iteration": 2.613532066345215 }, { "auxiliary_loss_clip": 0.06481465, "auxiliary_loss_mlp": 0.01271309, "balance_loss_clip": 0.06299873, "balance_loss_mlp": 0.01257535, "epoch": 0.3418608146700737, "flos": 30526644879360.0, "grad_norm": 1.7877817372346152, "language_loss": 0.70966589, "learning_rate": 3.0636047960897468e-06, "loss": 0.78719366, "num_input_tokens_seen": 122191040, "router_z_loss_clip": 1.81738281, "router_z_loss_mlp": 0.13769531, "step": 5686, "time_per_iteration": 2.6441116333007812 }, { "auxiliary_loss_clip": 0.06491792, "auxiliary_loss_mlp": 0.01274272, "balance_loss_clip": 0.06303672, "balance_loss_mlp": 0.01259651, "epoch": 0.34192093792274164, "flos": 15127755333120.0, "grad_norm": 2.2492875831981616, "language_loss": 0.78027678, "learning_rate": 3.06327495310661e-06, "loss": 0.85793734, "num_input_tokens_seen": 122209225, "router_z_loss_clip": 1.88183594, "router_z_loss_mlp": 0.14630127, "step": 5687, "time_per_iteration": 2.5738604068756104 }, { "auxiliary_loss_clip": 0.06491295, "auxiliary_loss_mlp": 0.01271234, "balance_loss_clip": 0.06308736, "balance_loss_mlp": 0.01256589, "epoch": 0.3419810611754096, "flos": 13192754595840.0, "grad_norm": 2.9100807616537976, "language_loss": 0.87500787, "learning_rate": 3.062945069803981e-06, "loss": 0.95263314, "num_input_tokens_seen": 122226160, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.14642334, "step": 5688, "time_per_iteration": 2.539602041244507 }, { "auxiliary_loss_clip": 0.06492491, "auxiliary_loss_mlp": 0.01279381, "balance_loss_clip": 0.06301852, "balance_loss_mlp": 0.01263729, "epoch": 0.34204118442807757, "flos": 19542274565760.0, "grad_norm": 1.7338663976064332, "language_loss": 0.80016154, "learning_rate": 3.0626151461943684e-06, "loss": 0.87788022, "num_input_tokens_seen": 122243115, "router_z_loss_clip": 1.90625, "router_z_loss_mlp": 0.15637207, "step": 5689, "time_per_iteration": 2.6017844676971436 }, { "auxiliary_loss_clip": 0.06493102, "auxiliary_loss_mlp": 0.01277175, "balance_loss_clip": 0.06302938, "balance_loss_mlp": 0.01263263, "epoch": 0.34210130768074554, "flos": 15200192787840.0, "grad_norm": 1.7581381531863718, "language_loss": 0.74112427, "learning_rate": 3.0622851822902834e-06, "loss": 0.81882697, "num_input_tokens_seen": 122261105, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.13916016, "step": 5690, "time_per_iteration": 2.637042760848999 }, { "auxiliary_loss_clip": 0.06492712, "auxiliary_loss_mlp": 0.01281253, "balance_loss_clip": 0.06304125, "balance_loss_mlp": 0.01267466, "epoch": 0.3421614309334135, "flos": 24943147735680.0, "grad_norm": 1.8295174438691324, "language_loss": 0.76497018, "learning_rate": 3.061955178104237e-06, "loss": 0.84270978, "num_input_tokens_seen": 122279995, "router_z_loss_clip": 1.88476562, "router_z_loss_mlp": 0.13787842, "step": 5691, "time_per_iteration": 2.667534589767456 }, { "auxiliary_loss_clip": 0.06481871, "auxiliary_loss_mlp": 0.01281573, "balance_loss_clip": 0.06297304, "balance_loss_mlp": 0.01268281, "epoch": 0.34222155418608147, "flos": 21915170340480.0, "grad_norm": 1.9109720462201876, "language_loss": 0.69064879, "learning_rate": 3.0616251336487447e-06, "loss": 0.76828325, "num_input_tokens_seen": 122299070, "router_z_loss_clip": 1.84863281, "router_z_loss_mlp": 0.13293457, "step": 5692, "time_per_iteration": 2.5929906368255615 }, { "auxiliary_loss_clip": 0.06492522, "auxiliary_loss_mlp": 0.01283152, "balance_loss_clip": 0.06303382, "balance_loss_mlp": 0.01268597, "epoch": 0.34228167743874943, "flos": 18119954234880.0, "grad_norm": 2.3545921224243243, "language_loss": 0.72926819, "learning_rate": 3.06129504893632e-06, "loss": 0.80702496, "num_input_tokens_seen": 122316800, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.14550781, "step": 5693, "time_per_iteration": 2.5780491828918457 }, { "auxiliary_loss_clip": 0.06489667, "auxiliary_loss_mlp": 0.01269699, "balance_loss_clip": 0.06305519, "balance_loss_mlp": 0.0125655, "epoch": 0.3423418006914174, "flos": 21295070599680.0, "grad_norm": 2.9305735920789044, "language_loss": 0.75830829, "learning_rate": 3.0609649239794813e-06, "loss": 0.83590198, "num_input_tokens_seen": 122335275, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.1315918, "step": 5694, "time_per_iteration": 2.679852247238159 }, { "auxiliary_loss_clip": 0.06486066, "auxiliary_loss_mlp": 0.01275113, "balance_loss_clip": 0.06302489, "balance_loss_mlp": 0.01261439, "epoch": 0.34240192394408536, "flos": 19828754075520.0, "grad_norm": 3.77450481485353, "language_loss": 0.7983048, "learning_rate": 3.060634758790747e-06, "loss": 0.8759166, "num_input_tokens_seen": 122353215, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.13677979, "step": 5695, "time_per_iteration": 2.563754081726074 }, { "auxiliary_loss_clip": 0.06489333, "auxiliary_loss_mlp": 0.01282164, "balance_loss_clip": 0.06302831, "balance_loss_mlp": 0.01267948, "epoch": 0.3424620471967533, "flos": 24542498638080.0, "grad_norm": 1.9648923053403724, "language_loss": 0.73433173, "learning_rate": 3.060304553382635e-06, "loss": 0.81204671, "num_input_tokens_seen": 122372495, "router_z_loss_clip": 1.86523438, "router_z_loss_mlp": 0.14202881, "step": 5696, "time_per_iteration": 2.61484432220459 }, { "auxiliary_loss_clip": 0.06487044, "auxiliary_loss_mlp": 0.01280806, "balance_loss_clip": 0.06301063, "balance_loss_mlp": 0.01267461, "epoch": 0.3425221704494213, "flos": 25856057969280.0, "grad_norm": 1.7058424138251755, "language_loss": 0.71723902, "learning_rate": 3.0599743077676685e-06, "loss": 0.79491746, "num_input_tokens_seen": 122394600, "router_z_loss_clip": 1.85839844, "router_z_loss_mlp": 0.13354492, "step": 5697, "time_per_iteration": 2.5981271266937256 }, { "auxiliary_loss_clip": 0.06482823, "auxiliary_loss_mlp": 0.01277141, "balance_loss_clip": 0.06299673, "balance_loss_mlp": 0.01264279, "epoch": 0.34258229370208926, "flos": 21546442448640.0, "grad_norm": 2.202464486623376, "language_loss": 0.82163453, "learning_rate": 3.05964402195837e-06, "loss": 0.89923424, "num_input_tokens_seen": 122414700, "router_z_loss_clip": 1.83300781, "router_z_loss_mlp": 0.12860107, "step": 5698, "time_per_iteration": 4.006196975708008 }, { "auxiliary_loss_clip": 0.06493997, "auxiliary_loss_mlp": 0.01281813, "balance_loss_clip": 0.06305224, "balance_loss_mlp": 0.01267365, "epoch": 0.3426424169547573, "flos": 23658407009280.0, "grad_norm": 1.8459649960876907, "language_loss": 0.690458, "learning_rate": 3.0593136959672645e-06, "loss": 0.76821613, "num_input_tokens_seen": 122432760, "router_z_loss_clip": 1.88964844, "router_z_loss_mlp": 0.14465332, "step": 5699, "time_per_iteration": 2.662269115447998 }, { "auxiliary_loss_clip": 0.06487739, "auxiliary_loss_mlp": 0.012734, "balance_loss_clip": 0.06301416, "balance_loss_mlp": 0.01259333, "epoch": 0.34270254020742524, "flos": 24651846616320.0, "grad_norm": 2.3857596404726094, "language_loss": 0.72737628, "learning_rate": 3.058983329806877e-06, "loss": 0.80498767, "num_input_tokens_seen": 122449105, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.14074707, "step": 5700, "time_per_iteration": 2.656104803085327 }, { "auxiliary_loss_clip": 0.06485867, "auxiliary_loss_mlp": 0.01274143, "balance_loss_clip": 0.06302734, "balance_loss_mlp": 0.01260571, "epoch": 0.3427626634600932, "flos": 21003182501760.0, "grad_norm": 2.1900861641837155, "language_loss": 0.82242572, "learning_rate": 3.0586529234897354e-06, "loss": 0.90002584, "num_input_tokens_seen": 122468700, "router_z_loss_clip": 1.83007812, "router_z_loss_mlp": 0.13568115, "step": 5701, "time_per_iteration": 2.5657927989959717 }, { "auxiliary_loss_clip": 0.06485918, "auxiliary_loss_mlp": 0.01272372, "balance_loss_clip": 0.06298158, "balance_loss_mlp": 0.01258353, "epoch": 0.3428227867127612, "flos": 21440155144320.0, "grad_norm": 1.6774563266567468, "language_loss": 0.72223222, "learning_rate": 3.0583224770283694e-06, "loss": 0.79981512, "num_input_tokens_seen": 122488160, "router_z_loss_clip": 1.87792969, "router_z_loss_mlp": 0.14025879, "step": 5702, "time_per_iteration": 2.5443239212036133 }, { "auxiliary_loss_clip": 0.0637117, "auxiliary_loss_mlp": 0.01257617, "balance_loss_clip": 0.0628264, "balance_loss_mlp": 0.0125385, "epoch": 0.34288290996542914, "flos": 55750219902720.0, "grad_norm": 0.8199771433568164, "language_loss": 0.56796038, "learning_rate": 3.057991990435309e-06, "loss": 0.64424825, "num_input_tokens_seen": 122542890, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.03759766, "step": 5703, "time_per_iteration": 3.0926456451416016 }, { "auxiliary_loss_clip": 0.0648787, "auxiliary_loss_mlp": 0.01271872, "balance_loss_clip": 0.06301186, "balance_loss_mlp": 0.01255647, "epoch": 0.3429430332180971, "flos": 20162961285120.0, "grad_norm": 1.7613629515526028, "language_loss": 0.75400126, "learning_rate": 3.057661463723086e-06, "loss": 0.8315987, "num_input_tokens_seen": 122561770, "router_z_loss_clip": 1.87011719, "router_z_loss_mlp": 0.16223145, "step": 5704, "time_per_iteration": 4.024562358856201 }, { "auxiliary_loss_clip": 0.06486899, "auxiliary_loss_mlp": 0.01270182, "balance_loss_clip": 0.06303583, "balance_loss_mlp": 0.01256914, "epoch": 0.34300315647076507, "flos": 17971347818880.0, "grad_norm": 1.8776152858786757, "language_loss": 0.730533, "learning_rate": 3.0573308969042346e-06, "loss": 0.8081038, "num_input_tokens_seen": 122580580, "router_z_loss_clip": 1.83203125, "router_z_loss_mlp": 0.13275146, "step": 5705, "time_per_iteration": 2.590876579284668 }, { "auxiliary_loss_clip": 0.0649365, "auxiliary_loss_mlp": 0.01270062, "balance_loss_clip": 0.06306472, "balance_loss_mlp": 0.01256216, "epoch": 0.34306327972343303, "flos": 22092679215360.0, "grad_norm": 1.738428593409615, "language_loss": 0.80064827, "learning_rate": 3.057000289991289e-06, "loss": 0.87828541, "num_input_tokens_seen": 122599810, "router_z_loss_clip": 1.87402344, "router_z_loss_mlp": 0.1385498, "step": 5706, "time_per_iteration": 2.577960729598999 }, { "auxiliary_loss_clip": 0.06487841, "auxiliary_loss_mlp": 0.01270307, "balance_loss_clip": 0.06299148, "balance_loss_mlp": 0.01255603, "epoch": 0.343123402976101, "flos": 18448669002240.0, "grad_norm": 2.0001213315279807, "language_loss": 0.83231103, "learning_rate": 3.056669642996787e-06, "loss": 0.9098925, "num_input_tokens_seen": 122616035, "router_z_loss_clip": 1.88867188, "router_z_loss_mlp": 0.14685059, "step": 5707, "time_per_iteration": 2.5870275497436523 }, { "auxiliary_loss_clip": 0.06491023, "auxiliary_loss_mlp": 0.01272295, "balance_loss_clip": 0.06305769, "balance_loss_mlp": 0.01258264, "epoch": 0.34318352622876896, "flos": 17169127228800.0, "grad_norm": 1.6642559133029027, "language_loss": 0.75544471, "learning_rate": 3.056338955933266e-06, "loss": 0.83307785, "num_input_tokens_seen": 122633785, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.14025879, "step": 5708, "time_per_iteration": 2.553056240081787 }, { "auxiliary_loss_clip": 0.06485656, "auxiliary_loss_mlp": 0.01270435, "balance_loss_clip": 0.0630334, "balance_loss_mlp": 0.01256416, "epoch": 0.34324364948143693, "flos": 26695482572160.0, "grad_norm": 1.7334682047607655, "language_loss": 0.814116, "learning_rate": 3.0560082288132662e-06, "loss": 0.8916769, "num_input_tokens_seen": 122652100, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.14013672, "step": 5709, "time_per_iteration": 2.6048314571380615 }, { "auxiliary_loss_clip": 0.06498213, "auxiliary_loss_mlp": 0.01277088, "balance_loss_clip": 0.0630786, "balance_loss_mlp": 0.01260327, "epoch": 0.3433037727341049, "flos": 21257950440960.0, "grad_norm": 4.777566922825445, "language_loss": 0.79588449, "learning_rate": 3.055677461649329e-06, "loss": 0.87363744, "num_input_tokens_seen": 122669720, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.16760254, "step": 5710, "time_per_iteration": 4.003442764282227 }, { "auxiliary_loss_clip": 0.06492122, "auxiliary_loss_mlp": 0.0126937, "balance_loss_clip": 0.06302375, "balance_loss_mlp": 0.0125395, "epoch": 0.34336389598677286, "flos": 20635377004800.0, "grad_norm": 3.512989181614223, "language_loss": 0.7114619, "learning_rate": 3.055346654453996e-06, "loss": 0.78907681, "num_input_tokens_seen": 122688715, "router_z_loss_clip": 1.89550781, "router_z_loss_mlp": 0.15435791, "step": 5711, "time_per_iteration": 2.5694994926452637 }, { "auxiliary_loss_clip": 0.06493457, "auxiliary_loss_mlp": 0.01270495, "balance_loss_clip": 0.0630672, "balance_loss_mlp": 0.01255534, "epoch": 0.3434240192394409, "flos": 14543895283200.0, "grad_norm": 1.7812420127295108, "language_loss": 0.67428303, "learning_rate": 3.055015807239812e-06, "loss": 0.75192261, "num_input_tokens_seen": 122706970, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.1496582, "step": 5712, "time_per_iteration": 2.5529420375823975 }, { "auxiliary_loss_clip": 0.06379545, "auxiliary_loss_mlp": 0.01265263, "balance_loss_clip": 0.06291357, "balance_loss_mlp": 0.01261639, "epoch": 0.34348414249210885, "flos": 58067799183360.0, "grad_norm": 0.8402997093541752, "language_loss": 0.58068782, "learning_rate": 3.0546849200193226e-06, "loss": 0.65713584, "num_input_tokens_seen": 122758095, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.03613281, "step": 5713, "time_per_iteration": 3.1610794067382812 }, { "auxiliary_loss_clip": 0.06492521, "auxiliary_loss_mlp": 0.0127004, "balance_loss_clip": 0.06304687, "balance_loss_mlp": 0.01255657, "epoch": 0.3435442657447768, "flos": 20710749352320.0, "grad_norm": 1.654686525629205, "language_loss": 0.81466824, "learning_rate": 3.054353992805076e-06, "loss": 0.89229381, "num_input_tokens_seen": 122777815, "router_z_loss_clip": 1.87792969, "router_z_loss_mlp": 0.14373779, "step": 5714, "time_per_iteration": 2.5949862003326416 }, { "auxiliary_loss_clip": 0.06486091, "auxiliary_loss_mlp": 0.01277721, "balance_loss_clip": 0.06300762, "balance_loss_mlp": 0.01263082, "epoch": 0.3436043889974448, "flos": 22936967354880.0, "grad_norm": 2.178194614867047, "language_loss": 0.72247422, "learning_rate": 3.05402302560962e-06, "loss": 0.80011237, "num_input_tokens_seen": 122797555, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.14630127, "step": 5715, "time_per_iteration": 2.5965795516967773 }, { "auxiliary_loss_clip": 0.06377167, "auxiliary_loss_mlp": 0.01261758, "balance_loss_clip": 0.0628958, "balance_loss_mlp": 0.01258375, "epoch": 0.34366451225011274, "flos": 58423514964480.0, "grad_norm": 0.8774237598797053, "language_loss": 0.65590942, "learning_rate": 3.053692018445505e-06, "loss": 0.73229867, "num_input_tokens_seen": 122863955, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.03390503, "step": 5716, "time_per_iteration": 4.710184812545776 }, { "auxiliary_loss_clip": 0.06485545, "auxiliary_loss_mlp": 0.01278086, "balance_loss_clip": 0.06302305, "balance_loss_mlp": 0.01264627, "epoch": 0.3437246355027807, "flos": 15601722353280.0, "grad_norm": 1.6653354865343408, "language_loss": 0.74844515, "learning_rate": 3.0533609713252838e-06, "loss": 0.82608145, "num_input_tokens_seen": 122883000, "router_z_loss_clip": 1.83203125, "router_z_loss_mlp": 0.13464355, "step": 5717, "time_per_iteration": 2.583468198776245 }, { "auxiliary_loss_clip": 0.06481526, "auxiliary_loss_mlp": 0.01277431, "balance_loss_clip": 0.06296694, "balance_loss_mlp": 0.01263484, "epoch": 0.34378475875544867, "flos": 27679572449280.0, "grad_norm": 70.1234708985223, "language_loss": 0.75244129, "learning_rate": 3.0530298842615077e-06, "loss": 0.83003092, "num_input_tokens_seen": 122903265, "router_z_loss_clip": 1.84765625, "router_z_loss_mlp": 0.13952637, "step": 5718, "time_per_iteration": 2.6441915035247803 }, { "auxiliary_loss_clip": 0.06481312, "auxiliary_loss_mlp": 0.01275387, "balance_loss_clip": 0.0629473, "balance_loss_mlp": 0.0125964, "epoch": 0.34384488200811664, "flos": 31439638967040.0, "grad_norm": 1.9568396379388644, "language_loss": 0.64837408, "learning_rate": 3.052698757266734e-06, "loss": 0.72594112, "num_input_tokens_seen": 122923860, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.1574707, "step": 5719, "time_per_iteration": 2.6771621704101562 }, { "auxiliary_loss_clip": 0.06487159, "auxiliary_loss_mlp": 0.01277633, "balance_loss_clip": 0.06299049, "balance_loss_mlp": 0.01261516, "epoch": 0.3439050052607846, "flos": 24906866117760.0, "grad_norm": 1.7629478168313755, "language_loss": 0.73238051, "learning_rate": 3.0523675903535183e-06, "loss": 0.81002843, "num_input_tokens_seen": 122945305, "router_z_loss_clip": 1.88085938, "router_z_loss_mlp": 0.16101074, "step": 5720, "time_per_iteration": 2.6202383041381836 }, { "auxiliary_loss_clip": 0.06485245, "auxiliary_loss_mlp": 0.01284308, "balance_loss_clip": 0.06300265, "balance_loss_mlp": 0.01268704, "epoch": 0.34396512851345257, "flos": 18155900436480.0, "grad_norm": 3.9719093653432274, "language_loss": 0.7423535, "learning_rate": 3.0520363835344173e-06, "loss": 0.82004905, "num_input_tokens_seen": 122962535, "router_z_loss_clip": 1.84960938, "router_z_loss_mlp": 0.15600586, "step": 5721, "time_per_iteration": 2.5527427196502686 }, { "auxiliary_loss_clip": 0.06486581, "auxiliary_loss_mlp": 0.0128344, "balance_loss_clip": 0.0629836, "balance_loss_mlp": 0.01267365, "epoch": 0.34402525176612053, "flos": 16039994734080.0, "grad_norm": 2.111448587061548, "language_loss": 0.80466801, "learning_rate": 3.051705136821992e-06, "loss": 0.88236821, "num_input_tokens_seen": 122979750, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.16070557, "step": 5722, "time_per_iteration": 2.537309169769287 }, { "auxiliary_loss_clip": 0.06482205, "auxiliary_loss_mlp": 0.01291282, "balance_loss_clip": 0.06297216, "balance_loss_mlp": 0.01276238, "epoch": 0.3440853750187885, "flos": 21185009861760.0, "grad_norm": 1.9596038607823578, "language_loss": 0.81699467, "learning_rate": 3.051373850228801e-06, "loss": 0.89472955, "num_input_tokens_seen": 122998955, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.1505127, "step": 5723, "time_per_iteration": 2.6320459842681885 }, { "auxiliary_loss_clip": 0.06486472, "auxiliary_loss_mlp": 0.01284941, "balance_loss_clip": 0.06298295, "balance_loss_mlp": 0.01269659, "epoch": 0.34414549827145646, "flos": 12682883301120.0, "grad_norm": 1.8102150313819831, "language_loss": 0.80909842, "learning_rate": 3.0510425237674096e-06, "loss": 0.88681257, "num_input_tokens_seen": 123016165, "router_z_loss_clip": 1.87890625, "router_z_loss_mlp": 0.15283203, "step": 5724, "time_per_iteration": 2.5516180992126465 }, { "auxiliary_loss_clip": 0.06488308, "auxiliary_loss_mlp": 0.01288267, "balance_loss_clip": 0.06301071, "balance_loss_mlp": 0.01272746, "epoch": 0.3442056215241244, "flos": 31292458070400.0, "grad_norm": 1.7525030310971386, "language_loss": 0.69056076, "learning_rate": 3.05071115745038e-06, "loss": 0.76832652, "num_input_tokens_seen": 123036900, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.15515137, "step": 5725, "time_per_iteration": 2.652015447616577 }, { "auxiliary_loss_clip": 0.06497459, "auxiliary_loss_mlp": 0.01284149, "balance_loss_clip": 0.06302493, "balance_loss_mlp": 0.01266756, "epoch": 0.34426574477679245, "flos": 23373939997440.0, "grad_norm": 2.4247400014940914, "language_loss": 0.69784975, "learning_rate": 3.0503797512902773e-06, "loss": 0.77566588, "num_input_tokens_seen": 123057480, "router_z_loss_clip": 1.95019531, "router_z_loss_mlp": 0.17407227, "step": 5726, "time_per_iteration": 2.5773351192474365 }, { "auxiliary_loss_clip": 0.06486639, "auxiliary_loss_mlp": 0.01284846, "balance_loss_clip": 0.06296862, "balance_loss_mlp": 0.01269086, "epoch": 0.3443258680294604, "flos": 24542372856960.0, "grad_norm": 2.0256496128502204, "language_loss": 0.73740637, "learning_rate": 3.0500483052996703e-06, "loss": 0.81512123, "num_input_tokens_seen": 123076890, "router_z_loss_clip": 1.89648438, "router_z_loss_mlp": 0.15771484, "step": 5727, "time_per_iteration": 2.613069534301758 }, { "auxiliary_loss_clip": 0.06487918, "auxiliary_loss_mlp": 0.01282299, "balance_loss_clip": 0.06302696, "balance_loss_mlp": 0.01267041, "epoch": 0.3443859912821284, "flos": 20236363061760.0, "grad_norm": 1.9364618832369995, "language_loss": 0.88789093, "learning_rate": 3.0497168194911257e-06, "loss": 0.96559316, "num_input_tokens_seen": 123092530, "router_z_loss_clip": 1.8515625, "router_z_loss_mlp": 0.15258789, "step": 5728, "time_per_iteration": 2.559983253479004 }, { "auxiliary_loss_clip": 0.06483238, "auxiliary_loss_mlp": 0.01280621, "balance_loss_clip": 0.06295896, "balance_loss_mlp": 0.01264826, "epoch": 0.34444611453479634, "flos": 24323425338240.0, "grad_norm": 2.0232403429424126, "language_loss": 0.70827591, "learning_rate": 3.0493852938772143e-06, "loss": 0.78591454, "num_input_tokens_seen": 123110560, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.15808105, "step": 5729, "time_per_iteration": 2.571596145629883 }, { "auxiliary_loss_clip": 0.06481473, "auxiliary_loss_mlp": 0.01290252, "balance_loss_clip": 0.06298251, "balance_loss_mlp": 0.01274731, "epoch": 0.3445062377874643, "flos": 16989186585600.0, "grad_norm": 1.8416738610551202, "language_loss": 0.74047816, "learning_rate": 3.0490537284705078e-06, "loss": 0.81819546, "num_input_tokens_seen": 123128655, "router_z_loss_clip": 1.82910156, "router_z_loss_mlp": 0.15515137, "step": 5730, "time_per_iteration": 2.550455331802368 }, { "auxiliary_loss_clip": 0.06485574, "auxiliary_loss_mlp": 0.01280166, "balance_loss_clip": 0.06297527, "balance_loss_mlp": 0.01263607, "epoch": 0.3445663610401323, "flos": 20308884370560.0, "grad_norm": 5.413279853752337, "language_loss": 0.80301964, "learning_rate": 3.048722123283578e-06, "loss": 0.88067704, "num_input_tokens_seen": 123145130, "router_z_loss_clip": 1.88085938, "router_z_loss_mlp": 0.16552734, "step": 5731, "time_per_iteration": 2.554692268371582 }, { "auxiliary_loss_clip": 0.06486002, "auxiliary_loss_mlp": 0.01279226, "balance_loss_clip": 0.06298769, "balance_loss_mlp": 0.012633, "epoch": 0.34462648429280024, "flos": 15893568524160.0, "grad_norm": 2.9239139681358712, "language_loss": 0.79003978, "learning_rate": 3.0483904783290006e-06, "loss": 0.86769199, "num_input_tokens_seen": 123162265, "router_z_loss_clip": 1.87011719, "router_z_loss_mlp": 0.15930176, "step": 5732, "time_per_iteration": 2.5540266036987305 }, { "auxiliary_loss_clip": 0.06366415, "auxiliary_loss_mlp": 0.01260719, "balance_loss_clip": 0.06279085, "balance_loss_mlp": 0.01257501, "epoch": 0.3446866075454682, "flos": 59330681193600.0, "grad_norm": 0.7249526511662915, "language_loss": 0.53256404, "learning_rate": 3.0480587936193505e-06, "loss": 0.60883546, "num_input_tokens_seen": 123218620, "router_z_loss_clip": 0.87158203, "router_z_loss_mlp": 0.03222656, "step": 5733, "time_per_iteration": 3.2181060314178467 }, { "auxiliary_loss_clip": 0.06486659, "auxiliary_loss_mlp": 0.01286728, "balance_loss_clip": 0.06300993, "balance_loss_mlp": 0.01270253, "epoch": 0.34474673079813617, "flos": 22349962776960.0, "grad_norm": 1.6736472249242988, "language_loss": 0.83859706, "learning_rate": 3.047727069167207e-06, "loss": 0.91633099, "num_input_tokens_seen": 123237325, "router_z_loss_clip": 1.85644531, "router_z_loss_mlp": 0.16467285, "step": 5734, "time_per_iteration": 2.735442638397217 }, { "auxiliary_loss_clip": 0.06490526, "auxiliary_loss_mlp": 0.01288358, "balance_loss_clip": 0.06302582, "balance_loss_mlp": 0.01271359, "epoch": 0.34480685405080413, "flos": 27677098753920.0, "grad_norm": 2.3411721165137918, "language_loss": 0.92965782, "learning_rate": 3.0473953049851478e-06, "loss": 1.00744677, "num_input_tokens_seen": 123258650, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.16992188, "step": 5735, "time_per_iteration": 2.643561363220215 }, { "auxiliary_loss_clip": 0.06489129, "auxiliary_loss_mlp": 0.01287257, "balance_loss_clip": 0.06300331, "balance_loss_mlp": 0.01268875, "epoch": 0.3448669773034721, "flos": 22462664918400.0, "grad_norm": 1.9445242688050421, "language_loss": 0.76978135, "learning_rate": 3.0470635010857533e-06, "loss": 0.84754521, "num_input_tokens_seen": 123277155, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.18395996, "step": 5736, "time_per_iteration": 2.5994889736175537 }, { "auxiliary_loss_clip": 0.06491786, "auxiliary_loss_mlp": 0.01285706, "balance_loss_clip": 0.06301057, "balance_loss_mlp": 0.01268921, "epoch": 0.34492710055614006, "flos": 24943105808640.0, "grad_norm": 1.8610952463366577, "language_loss": 0.79390013, "learning_rate": 3.0467316574816064e-06, "loss": 0.87167501, "num_input_tokens_seen": 123297640, "router_z_loss_clip": 1.91015625, "router_z_loss_mlp": 0.16796875, "step": 5737, "time_per_iteration": 4.12772011756897 }, { "auxiliary_loss_clip": 0.0649235, "auxiliary_loss_mlp": 0.0127741, "balance_loss_clip": 0.06298605, "balance_loss_mlp": 0.01260649, "epoch": 0.34498722380880803, "flos": 20127057010560.0, "grad_norm": 3.288948667343945, "language_loss": 0.71917844, "learning_rate": 3.0463997741852893e-06, "loss": 0.79687607, "num_input_tokens_seen": 123314370, "router_z_loss_clip": 1.93554688, "router_z_loss_mlp": 0.16772461, "step": 5738, "time_per_iteration": 2.5567574501037598 }, { "auxiliary_loss_clip": 0.06491695, "auxiliary_loss_mlp": 0.0128116, "balance_loss_clip": 0.06299376, "balance_loss_mlp": 0.01263016, "epoch": 0.34504734706147605, "flos": 28445511421440.0, "grad_norm": 1.6331520146175602, "language_loss": 0.82280898, "learning_rate": 3.046067851209389e-06, "loss": 0.90053761, "num_input_tokens_seen": 123336085, "router_z_loss_clip": 1.92480469, "router_z_loss_mlp": 0.18139648, "step": 5739, "time_per_iteration": 2.641373872756958 }, { "auxiliary_loss_clip": 0.06493244, "auxiliary_loss_mlp": 0.01283399, "balance_loss_clip": 0.06304479, "balance_loss_mlp": 0.01266602, "epoch": 0.345107470314144, "flos": 22681067385600.0, "grad_norm": 1.8904906091211884, "language_loss": 0.83259225, "learning_rate": 3.0457358885664898e-06, "loss": 0.91035873, "num_input_tokens_seen": 123354460, "router_z_loss_clip": 1.88867188, "router_z_loss_mlp": 0.16796875, "step": 5740, "time_per_iteration": 2.66860032081604 }, { "auxiliary_loss_clip": 0.06490782, "auxiliary_loss_mlp": 0.01280604, "balance_loss_clip": 0.06303544, "balance_loss_mlp": 0.01263652, "epoch": 0.345167593566812, "flos": 20636886378240.0, "grad_norm": 2.3982511877323724, "language_loss": 0.7730819, "learning_rate": 3.045403886269181e-06, "loss": 0.85079575, "num_input_tokens_seen": 123373420, "router_z_loss_clip": 1.87109375, "router_z_loss_mlp": 0.16931152, "step": 5741, "time_per_iteration": 2.734424591064453 }, { "auxiliary_loss_clip": 0.06491472, "auxiliary_loss_mlp": 0.01275504, "balance_loss_clip": 0.06301072, "balance_loss_mlp": 0.01259149, "epoch": 0.34522771681947995, "flos": 26221683260160.0, "grad_norm": 1.7637870800055715, "language_loss": 0.78162563, "learning_rate": 3.045071844330053e-06, "loss": 0.85929543, "num_input_tokens_seen": 123394730, "router_z_loss_clip": 1.90332031, "router_z_loss_mlp": 0.16351318, "step": 5742, "time_per_iteration": 2.6292450428009033 }, { "auxiliary_loss_clip": 0.06491958, "auxiliary_loss_mlp": 0.01279074, "balance_loss_clip": 0.06305276, "balance_loss_mlp": 0.01261109, "epoch": 0.3452878400721479, "flos": 19068349472640.0, "grad_norm": 2.2617225799832816, "language_loss": 0.76855183, "learning_rate": 3.0447397627616955e-06, "loss": 0.84626216, "num_input_tokens_seen": 123412895, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.1796875, "step": 5743, "time_per_iteration": 2.5583364963531494 }, { "auxiliary_loss_clip": 0.06483681, "auxiliary_loss_mlp": 0.01276219, "balance_loss_clip": 0.06298593, "balance_loss_mlp": 0.01260853, "epoch": 0.3453479633248159, "flos": 27937442989440.0, "grad_norm": 1.5433193659739781, "language_loss": 0.70879495, "learning_rate": 3.0444076415767016e-06, "loss": 0.78639388, "num_input_tokens_seen": 123432320, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.15393066, "step": 5744, "time_per_iteration": 4.104948043823242 }, { "auxiliary_loss_clip": 0.06481937, "auxiliary_loss_mlp": 0.01275721, "balance_loss_clip": 0.06299672, "balance_loss_mlp": 0.01260093, "epoch": 0.34540808657748384, "flos": 19611609419520.0, "grad_norm": 1.6192791398346624, "language_loss": 0.80189568, "learning_rate": 3.044075480787665e-06, "loss": 0.87947226, "num_input_tokens_seen": 123450980, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.15625, "step": 5745, "time_per_iteration": 2.5716497898101807 }, { "auxiliary_loss_clip": 0.0648658, "auxiliary_loss_mlp": 0.01274141, "balance_loss_clip": 0.06297383, "balance_loss_mlp": 0.01257691, "epoch": 0.3454682098301518, "flos": 20417771151360.0, "grad_norm": 5.626701599950169, "language_loss": 0.89598787, "learning_rate": 3.043743280407182e-06, "loss": 0.97359508, "num_input_tokens_seen": 123469365, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.16455078, "step": 5746, "time_per_iteration": 2.6995420455932617 }, { "auxiliary_loss_clip": 0.06490982, "auxiliary_loss_mlp": 0.01276177, "balance_loss_clip": 0.06296832, "balance_loss_mlp": 0.01259058, "epoch": 0.34552833308281977, "flos": 21331603779840.0, "grad_norm": 2.640866186878873, "language_loss": 0.64878649, "learning_rate": 3.043411040447849e-06, "loss": 0.72645807, "num_input_tokens_seen": 123489425, "router_z_loss_clip": 1.94238281, "router_z_loss_mlp": 0.17114258, "step": 5747, "time_per_iteration": 2.6724822521209717 }, { "auxiliary_loss_clip": 0.06480398, "auxiliary_loss_mlp": 0.01278576, "balance_loss_clip": 0.06293483, "balance_loss_mlp": 0.01263472, "epoch": 0.34558845633548774, "flos": 36251914331520.0, "grad_norm": 1.5891429048236252, "language_loss": 0.73325205, "learning_rate": 3.043078760922264e-06, "loss": 0.8108418, "num_input_tokens_seen": 123509970, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.15124512, "step": 5748, "time_per_iteration": 2.767803907394409 }, { "auxiliary_loss_clip": 0.06479452, "auxiliary_loss_mlp": 0.01280925, "balance_loss_clip": 0.06296057, "balance_loss_mlp": 0.01266417, "epoch": 0.3456485795881557, "flos": 22456292008320.0, "grad_norm": 1.6766700038450622, "language_loss": 0.75762677, "learning_rate": 3.042746441843029e-06, "loss": 0.83523053, "num_input_tokens_seen": 123531055, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.1451416, "step": 5749, "time_per_iteration": 2.6142258644104004 }, { "auxiliary_loss_clip": 0.06357282, "auxiliary_loss_mlp": 0.01257212, "balance_loss_clip": 0.06269876, "balance_loss_mlp": 0.01254193, "epoch": 0.34570870284082367, "flos": 62023277422080.0, "grad_norm": 0.8817416488260212, "language_loss": 0.62772173, "learning_rate": 3.0424140832227437e-06, "loss": 0.70386666, "num_input_tokens_seen": 123584720, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.03018188, "step": 5750, "time_per_iteration": 4.481480836868286 }, { "auxiliary_loss_clip": 0.06472763, "auxiliary_loss_mlp": 0.01268756, "balance_loss_clip": 0.06291379, "balance_loss_mlp": 0.01255106, "epoch": 0.34576882609349163, "flos": 22788528647040.0, "grad_norm": 1.6494331546660839, "language_loss": 0.81233227, "learning_rate": 3.042081685074012e-06, "loss": 0.88974744, "num_input_tokens_seen": 123604465, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.13647461, "step": 5751, "time_per_iteration": 2.5945403575897217 }, { "auxiliary_loss_clip": 0.06481482, "auxiliary_loss_mlp": 0.01276026, "balance_loss_clip": 0.06297719, "balance_loss_mlp": 0.01260702, "epoch": 0.34582894934615965, "flos": 12353665409280.0, "grad_norm": 2.011443262632584, "language_loss": 0.84331554, "learning_rate": 3.041749247409439e-06, "loss": 0.92089057, "num_input_tokens_seen": 123622320, "router_z_loss_clip": 1.83691406, "router_z_loss_mlp": 0.15313721, "step": 5752, "time_per_iteration": 2.5509023666381836 }, { "auxiliary_loss_clip": 0.0635506, "auxiliary_loss_mlp": 0.01256037, "balance_loss_clip": 0.06267218, "balance_loss_mlp": 0.01252969, "epoch": 0.3458890725988276, "flos": 70186459017600.0, "grad_norm": 0.7212275408177979, "language_loss": 0.63134259, "learning_rate": 3.0414167702416296e-06, "loss": 0.70745361, "num_input_tokens_seen": 123678010, "router_z_loss_clip": 0.87695312, "router_z_loss_mlp": 0.03068542, "step": 5753, "time_per_iteration": 3.1005725860595703 }, { "auxiliary_loss_clip": 0.06474338, "auxiliary_loss_mlp": 0.01273827, "balance_loss_clip": 0.06291361, "balance_loss_mlp": 0.01257913, "epoch": 0.3459491958514956, "flos": 17098324928640.0, "grad_norm": 1.7517887197039108, "language_loss": 0.71487474, "learning_rate": 3.0410842535831914e-06, "loss": 0.79235637, "num_input_tokens_seen": 123696830, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.15930176, "step": 5754, "time_per_iteration": 2.599689483642578 }, { "auxiliary_loss_clip": 0.06487679, "auxiliary_loss_mlp": 0.01273708, "balance_loss_clip": 0.06294397, "balance_loss_mlp": 0.01257925, "epoch": 0.34600931910416355, "flos": 16655985624960.0, "grad_norm": 1.6056275174891672, "language_loss": 0.73268539, "learning_rate": 3.0407516974467343e-06, "loss": 0.81029928, "num_input_tokens_seen": 123714360, "router_z_loss_clip": 1.93261719, "router_z_loss_mlp": 0.15783691, "step": 5755, "time_per_iteration": 4.007187128067017 }, { "auxiliary_loss_clip": 0.06481618, "auxiliary_loss_mlp": 0.01271287, "balance_loss_clip": 0.0629772, "balance_loss_mlp": 0.01257155, "epoch": 0.3460694423568315, "flos": 38555517179520.0, "grad_norm": 1.6301908221615473, "language_loss": 0.72711229, "learning_rate": 3.040419101844869e-06, "loss": 0.80464137, "num_input_tokens_seen": 123739250, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.14123535, "step": 5756, "time_per_iteration": 2.7318999767303467 }, { "auxiliary_loss_clip": 0.06360953, "auxiliary_loss_mlp": 0.01253046, "balance_loss_clip": 0.06272765, "balance_loss_mlp": 0.01249974, "epoch": 0.3461295656094995, "flos": 72103332545280.0, "grad_norm": 0.6905451284333017, "language_loss": 0.62304103, "learning_rate": 3.040086466790207e-06, "loss": 0.69918102, "num_input_tokens_seen": 123802845, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.03071594, "step": 5757, "time_per_iteration": 3.2093722820281982 }, { "auxiliary_loss_clip": 0.06355887, "auxiliary_loss_mlp": 0.01254216, "balance_loss_clip": 0.06267344, "balance_loss_mlp": 0.01251042, "epoch": 0.34618968886216744, "flos": 65477913408000.0, "grad_norm": 0.8016123679446867, "language_loss": 0.59166682, "learning_rate": 3.039753792295362e-06, "loss": 0.66776788, "num_input_tokens_seen": 123861805, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.03175354, "step": 5758, "time_per_iteration": 3.191582441329956 }, { "auxiliary_loss_clip": 0.0648158, "auxiliary_loss_mlp": 0.01271582, "balance_loss_clip": 0.06299028, "balance_loss_mlp": 0.01257426, "epoch": 0.3462498121148354, "flos": 23478508293120.0, "grad_norm": 1.5922807639665624, "language_loss": 0.72283334, "learning_rate": 3.0394210783729487e-06, "loss": 0.80036497, "num_input_tokens_seen": 123881820, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.14154053, "step": 5759, "time_per_iteration": 2.5879015922546387 }, { "auxiliary_loss_clip": 0.06474701, "auxiliary_loss_mlp": 0.01276366, "balance_loss_clip": 0.06291105, "balance_loss_mlp": 0.01260368, "epoch": 0.3463099353675034, "flos": 24177711888000.0, "grad_norm": 1.7876904090344277, "language_loss": 0.8361634, "learning_rate": 3.0390883250355836e-06, "loss": 0.913674, "num_input_tokens_seen": 123903700, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.16003418, "step": 5760, "time_per_iteration": 2.6308698654174805 }, { "auxiliary_loss_clip": 0.06350437, "auxiliary_loss_mlp": 0.01253459, "balance_loss_clip": 0.06262414, "balance_loss_mlp": 0.0125038, "epoch": 0.34637005862017134, "flos": 63716773893120.0, "grad_norm": 0.7838327883648153, "language_loss": 0.56453651, "learning_rate": 3.0387555322958865e-06, "loss": 0.64057541, "num_input_tokens_seen": 123960075, "router_z_loss_clip": 0.87988281, "router_z_loss_mlp": 0.03074646, "step": 5761, "time_per_iteration": 3.255788564682007 }, { "auxiliary_loss_clip": 0.0647015, "auxiliary_loss_mlp": 0.01269336, "balance_loss_clip": 0.06287973, "balance_loss_mlp": 0.01254864, "epoch": 0.3464301818728393, "flos": 13149513089280.0, "grad_norm": 1.908403101748942, "language_loss": 0.95910418, "learning_rate": 3.038422700166474e-06, "loss": 1.03649902, "num_input_tokens_seen": 123975805, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.14489746, "step": 5762, "time_per_iteration": 2.565929889678955 }, { "auxiliary_loss_clip": 0.06484999, "auxiliary_loss_mlp": 0.01271289, "balance_loss_clip": 0.06295171, "balance_loss_mlp": 0.01256018, "epoch": 0.34649030512550727, "flos": 29322936650880.0, "grad_norm": 1.5486783670211517, "language_loss": 0.69620025, "learning_rate": 3.0380898286599692e-06, "loss": 0.77376318, "num_input_tokens_seen": 123997530, "router_z_loss_clip": 1.8984375, "router_z_loss_mlp": 0.15258789, "step": 5763, "time_per_iteration": 2.6596968173980713 }, { "auxiliary_loss_clip": 0.06485216, "auxiliary_loss_mlp": 0.01271918, "balance_loss_clip": 0.06292417, "balance_loss_mlp": 0.01254383, "epoch": 0.34655042837817523, "flos": 23737385082240.0, "grad_norm": 1.7679795784551116, "language_loss": 0.84199071, "learning_rate": 3.0377569177889945e-06, "loss": 0.91956198, "num_input_tokens_seen": 124016375, "router_z_loss_clip": 1.92871094, "router_z_loss_mlp": 0.17541504, "step": 5764, "time_per_iteration": 2.5945746898651123 }, { "auxiliary_loss_clip": 0.06476458, "auxiliary_loss_mlp": 0.01273917, "balance_loss_clip": 0.06291346, "balance_loss_mlp": 0.01257866, "epoch": 0.34661055163084326, "flos": 22060716082560.0, "grad_norm": 3.571355114373572, "language_loss": 0.68112254, "learning_rate": 3.0374239675661722e-06, "loss": 0.75862634, "num_input_tokens_seen": 124033975, "router_z_loss_clip": 1.8515625, "router_z_loss_mlp": 0.16046143, "step": 5765, "time_per_iteration": 2.581174612045288 }, { "auxiliary_loss_clip": 0.0647922, "auxiliary_loss_mlp": 0.01277896, "balance_loss_clip": 0.06295277, "balance_loss_mlp": 0.0126203, "epoch": 0.3466706748835112, "flos": 21805738508160.0, "grad_norm": 2.1468491803058276, "language_loss": 0.78197825, "learning_rate": 3.03709097800413e-06, "loss": 0.8595494, "num_input_tokens_seen": 124051930, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.15875244, "step": 5766, "time_per_iteration": 2.598273754119873 }, { "auxiliary_loss_clip": 0.0647848, "auxiliary_loss_mlp": 0.01275206, "balance_loss_clip": 0.06290899, "balance_loss_mlp": 0.01260436, "epoch": 0.3467307981361792, "flos": 19467405342720.0, "grad_norm": 1.5140567521630661, "language_loss": 0.73740196, "learning_rate": 3.0367579491154943e-06, "loss": 0.81493884, "num_input_tokens_seen": 124071220, "router_z_loss_clip": 1.87304688, "router_z_loss_mlp": 0.14764404, "step": 5767, "time_per_iteration": 2.5825388431549072 }, { "auxiliary_loss_clip": 0.06477819, "auxiliary_loss_mlp": 0.01276481, "balance_loss_clip": 0.06291701, "balance_loss_mlp": 0.0125935, "epoch": 0.34679092138884715, "flos": 24834470590080.0, "grad_norm": 2.1044497541631797, "language_loss": 0.78824234, "learning_rate": 3.036424880912893e-06, "loss": 0.86578536, "num_input_tokens_seen": 124090140, "router_z_loss_clip": 1.86035156, "router_z_loss_mlp": 0.17138672, "step": 5768, "time_per_iteration": 2.6145336627960205 }, { "auxiliary_loss_clip": 0.0636117, "auxiliary_loss_mlp": 0.01268539, "balance_loss_clip": 0.06273918, "balance_loss_mlp": 0.01265324, "epoch": 0.3468510446415151, "flos": 63253791757440.0, "grad_norm": 0.7519420658684385, "language_loss": 0.57381809, "learning_rate": 3.036091773408956e-06, "loss": 0.65011513, "num_input_tokens_seen": 124152025, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 0.03219604, "step": 5769, "time_per_iteration": 3.269181966781616 }, { "auxiliary_loss_clip": 0.06492895, "auxiliary_loss_mlp": 0.01273176, "balance_loss_clip": 0.06293502, "balance_loss_mlp": 0.01255557, "epoch": 0.3469111678941831, "flos": 12123984568320.0, "grad_norm": 2.1957824556346104, "language_loss": 0.86223948, "learning_rate": 3.0357586266163154e-06, "loss": 0.93990022, "num_input_tokens_seen": 124165795, "router_z_loss_clip": 1.9921875, "router_z_loss_mlp": 0.17626953, "step": 5770, "time_per_iteration": 2.5350112915039062 }, { "auxiliary_loss_clip": 0.06354994, "auxiliary_loss_mlp": 0.01261893, "balance_loss_clip": 0.06267913, "balance_loss_mlp": 0.01258792, "epoch": 0.34697129114685105, "flos": 65951964282240.0, "grad_norm": 0.7595443224519105, "language_loss": 0.5983783, "learning_rate": 3.0354254405476036e-06, "loss": 0.6745472, "num_input_tokens_seen": 124222925, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.03097534, "step": 5771, "time_per_iteration": 2.9932024478912354 }, { "auxiliary_loss_clip": 0.06474324, "auxiliary_loss_mlp": 0.01273692, "balance_loss_clip": 0.06288513, "balance_loss_mlp": 0.01258254, "epoch": 0.347031414399519, "flos": 34461914284800.0, "grad_norm": 1.664362626536795, "language_loss": 0.72520161, "learning_rate": 3.0350922152154557e-06, "loss": 0.8026818, "num_input_tokens_seen": 124240915, "router_z_loss_clip": 1.86132812, "router_z_loss_mlp": 0.15429688, "step": 5772, "time_per_iteration": 2.66900372505188 }, { "auxiliary_loss_clip": 0.06475595, "auxiliary_loss_mlp": 0.01275476, "balance_loss_clip": 0.06289062, "balance_loss_mlp": 0.01259096, "epoch": 0.347091537652187, "flos": 26951592176640.0, "grad_norm": 1.395795914507344, "language_loss": 0.7637558, "learning_rate": 3.034758950632507e-06, "loss": 0.84126645, "num_input_tokens_seen": 124262770, "router_z_loss_clip": 1.86523438, "router_z_loss_mlp": 0.16394043, "step": 5773, "time_per_iteration": 2.6069273948669434 }, { "auxiliary_loss_clip": 0.06480579, "auxiliary_loss_mlp": 0.01270188, "balance_loss_clip": 0.06290297, "balance_loss_mlp": 0.01253331, "epoch": 0.34715166090485494, "flos": 21148602462720.0, "grad_norm": 2.086254240981307, "language_loss": 0.70597887, "learning_rate": 3.034425646811396e-06, "loss": 0.78348655, "num_input_tokens_seen": 124280950, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.16845703, "step": 5774, "time_per_iteration": 2.5686187744140625 }, { "auxiliary_loss_clip": 0.06473463, "auxiliary_loss_mlp": 0.01272499, "balance_loss_clip": 0.06289722, "balance_loss_mlp": 0.01257001, "epoch": 0.3472117841575229, "flos": 23484881203200.0, "grad_norm": 2.346033865819942, "language_loss": 0.76147735, "learning_rate": 3.0340923037647602e-06, "loss": 0.83893698, "num_input_tokens_seen": 124299540, "router_z_loss_clip": 1.83691406, "router_z_loss_mlp": 0.15490723, "step": 5775, "time_per_iteration": 2.576792001724243 }, { "auxiliary_loss_clip": 0.06489013, "auxiliary_loss_mlp": 0.01270971, "balance_loss_clip": 0.06295224, "balance_loss_mlp": 0.01253763, "epoch": 0.34727190741019087, "flos": 17498428974720.0, "grad_norm": 2.159292370682998, "language_loss": 0.78322232, "learning_rate": 3.0337589215052404e-06, "loss": 0.8608222, "num_input_tokens_seen": 124316285, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.17211914, "step": 5776, "time_per_iteration": 2.537071943283081 }, { "auxiliary_loss_clip": 0.0636158, "auxiliary_loss_mlp": 0.01256911, "balance_loss_clip": 0.06274717, "balance_loss_mlp": 0.01254156, "epoch": 0.34733203066285884, "flos": 65287350495360.0, "grad_norm": 0.8203582658306466, "language_loss": 0.63432908, "learning_rate": 3.033425500045478e-06, "loss": 0.71051395, "num_input_tokens_seen": 124376650, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.02757263, "step": 5777, "time_per_iteration": 4.6027607917785645 }, { "auxiliary_loss_clip": 0.06478439, "auxiliary_loss_mlp": 0.01269977, "balance_loss_clip": 0.06287089, "balance_loss_mlp": 0.01253609, "epoch": 0.3473921539155268, "flos": 28666429511040.0, "grad_norm": 2.0414759021364675, "language_loss": 0.65186167, "learning_rate": 3.033092039398119e-06, "loss": 0.7293458, "num_input_tokens_seen": 124396475, "router_z_loss_clip": 1.91015625, "router_z_loss_mlp": 0.16357422, "step": 5778, "time_per_iteration": 2.633286476135254 }, { "auxiliary_loss_clip": 0.06479254, "auxiliary_loss_mlp": 0.01277083, "balance_loss_clip": 0.06288494, "balance_loss_mlp": 0.01259929, "epoch": 0.3474522771681948, "flos": 40845284104320.0, "grad_norm": 2.0396719689814127, "language_loss": 0.72279382, "learning_rate": 3.0327585395758046e-06, "loss": 0.80035722, "num_input_tokens_seen": 124416480, "router_z_loss_clip": 1.90625, "router_z_loss_mlp": 0.17163086, "step": 5779, "time_per_iteration": 2.796356678009033 }, { "auxiliary_loss_clip": 0.06482246, "auxiliary_loss_mlp": 0.01275644, "balance_loss_clip": 0.06288432, "balance_loss_mlp": 0.0125874, "epoch": 0.3475124004208628, "flos": 24615564998400.0, "grad_norm": 2.672360765088393, "language_loss": 0.62660974, "learning_rate": 3.0324250005911837e-06, "loss": 0.70418864, "num_input_tokens_seen": 124435950, "router_z_loss_clip": 1.93945312, "router_z_loss_mlp": 0.16906738, "step": 5780, "time_per_iteration": 2.635899066925049 }, { "auxiliary_loss_clip": 0.06481934, "auxiliary_loss_mlp": 0.01269236, "balance_loss_clip": 0.06293559, "balance_loss_mlp": 0.01253536, "epoch": 0.34757252367353075, "flos": 22717977909120.0, "grad_norm": 1.9757755879297438, "language_loss": 0.72413909, "learning_rate": 3.0320914224569033e-06, "loss": 0.80165076, "num_input_tokens_seen": 124455410, "router_z_loss_clip": 1.88476562, "router_z_loss_mlp": 0.15686035, "step": 5781, "time_per_iteration": 2.6373109817504883 }, { "auxiliary_loss_clip": 0.06481766, "auxiliary_loss_mlp": 0.01277118, "balance_loss_clip": 0.06289179, "balance_loss_mlp": 0.0125988, "epoch": 0.3476326469261987, "flos": 19834246517760.0, "grad_norm": 2.0398647659239035, "language_loss": 0.76997119, "learning_rate": 3.031757805185612e-06, "loss": 0.84756005, "num_input_tokens_seen": 124474870, "router_z_loss_clip": 1.92578125, "router_z_loss_mlp": 0.17236328, "step": 5782, "time_per_iteration": 2.5650794506073 }, { "auxiliary_loss_clip": 0.06479688, "auxiliary_loss_mlp": 0.01272919, "balance_loss_clip": 0.0629255, "balance_loss_mlp": 0.0125738, "epoch": 0.3476927701788667, "flos": 19944265328640.0, "grad_norm": 3.4825301359091587, "language_loss": 0.63928813, "learning_rate": 3.0314241487899622e-06, "loss": 0.71681416, "num_input_tokens_seen": 124494105, "router_z_loss_clip": 1.875, "router_z_loss_mlp": 0.15539551, "step": 5783, "time_per_iteration": 4.036992311477661 }, { "auxiliary_loss_clip": 0.06472467, "auxiliary_loss_mlp": 0.01272678, "balance_loss_clip": 0.0628854, "balance_loss_mlp": 0.01258253, "epoch": 0.34775289343153465, "flos": 20740448424960.0, "grad_norm": 1.944426265238345, "language_loss": 0.88802242, "learning_rate": 3.031090453282605e-06, "loss": 0.96547389, "num_input_tokens_seen": 124512030, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.144104, "step": 5784, "time_per_iteration": 2.5697457790374756 }, { "auxiliary_loss_clip": 0.06477994, "auxiliary_loss_mlp": 0.0127088, "balance_loss_clip": 0.0629358, "balance_loss_mlp": 0.01254953, "epoch": 0.3478130166842026, "flos": 19360992257280.0, "grad_norm": 1.8723416031522593, "language_loss": 0.81998301, "learning_rate": 3.0307567186761946e-06, "loss": 0.89747173, "num_input_tokens_seen": 124530980, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.1595459, "step": 5785, "time_per_iteration": 2.565307855606079 }, { "auxiliary_loss_clip": 0.06477046, "auxiliary_loss_mlp": 0.01276072, "balance_loss_clip": 0.06291985, "balance_loss_mlp": 0.01261678, "epoch": 0.3478731399368706, "flos": 22057194211200.0, "grad_norm": 1.9309281633172632, "language_loss": 0.80716157, "learning_rate": 3.0304229449833862e-06, "loss": 0.88469273, "num_input_tokens_seen": 124549330, "router_z_loss_clip": 1.8515625, "router_z_loss_mlp": 0.14385986, "step": 5786, "time_per_iteration": 2.6599533557891846 }, { "auxiliary_loss_clip": 0.06470296, "auxiliary_loss_mlp": 0.01271704, "balance_loss_clip": 0.06287257, "balance_loss_mlp": 0.01255969, "epoch": 0.34793326318953854, "flos": 18047390999040.0, "grad_norm": 1.5961535519276406, "language_loss": 0.75497508, "learning_rate": 3.030089132216836e-06, "loss": 0.83239508, "num_input_tokens_seen": 124567200, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.15734863, "step": 5787, "time_per_iteration": 2.534763813018799 }, { "auxiliary_loss_clip": 0.06474243, "auxiliary_loss_mlp": 0.01273518, "balance_loss_clip": 0.06286432, "balance_loss_mlp": 0.01258056, "epoch": 0.3479933864422065, "flos": 29322349672320.0, "grad_norm": 1.5722986251613418, "language_loss": 0.8184613, "learning_rate": 3.029755280389203e-06, "loss": 0.89593887, "num_input_tokens_seen": 124587025, "router_z_loss_clip": 1.87890625, "router_z_loss_mlp": 0.15466309, "step": 5788, "time_per_iteration": 2.6150152683258057 }, { "auxiliary_loss_clip": 0.06483987, "auxiliary_loss_mlp": 0.01274297, "balance_loss_clip": 0.06291489, "balance_loss_mlp": 0.01258073, "epoch": 0.3480535096948745, "flos": 20126931229440.0, "grad_norm": 1.890842783275377, "language_loss": 0.86377001, "learning_rate": 3.029421389513147e-06, "loss": 0.94135284, "num_input_tokens_seen": 124605860, "router_z_loss_clip": 1.92675781, "router_z_loss_mlp": 0.16210938, "step": 5789, "time_per_iteration": 3.963305950164795 }, { "auxiliary_loss_clip": 0.06482555, "auxiliary_loss_mlp": 0.01272741, "balance_loss_clip": 0.0629171, "balance_loss_mlp": 0.01257088, "epoch": 0.34811363294754244, "flos": 18554453182080.0, "grad_norm": 6.137756769222177, "language_loss": 0.85324693, "learning_rate": 3.029087459601328e-06, "loss": 0.9307999, "num_input_tokens_seen": 124624270, "router_z_loss_clip": 1.90722656, "router_z_loss_mlp": 0.15643311, "step": 5790, "time_per_iteration": 2.562053918838501 }, { "auxiliary_loss_clip": 0.06477721, "auxiliary_loss_mlp": 0.0127631, "balance_loss_clip": 0.06291601, "balance_loss_mlp": 0.01260586, "epoch": 0.3481737562002104, "flos": 26877603421440.0, "grad_norm": 2.576848716737671, "language_loss": 0.81295002, "learning_rate": 3.0287534906664097e-06, "loss": 0.89049029, "num_input_tokens_seen": 124644005, "router_z_loss_clip": 1.86035156, "router_z_loss_mlp": 0.15710449, "step": 5791, "time_per_iteration": 2.6266891956329346 }, { "auxiliary_loss_clip": 0.06484725, "auxiliary_loss_mlp": 0.01272249, "balance_loss_clip": 0.06294252, "balance_loss_mlp": 0.012555, "epoch": 0.3482338794528784, "flos": 28915495372800.0, "grad_norm": 1.739801787361185, "language_loss": 0.77836901, "learning_rate": 3.028419482721056e-06, "loss": 0.85593879, "num_input_tokens_seen": 124663020, "router_z_loss_clip": 1.90625, "router_z_loss_mlp": 0.16748047, "step": 5792, "time_per_iteration": 2.8149588108062744 }, { "auxiliary_loss_clip": 0.06474736, "auxiliary_loss_mlp": 0.01270205, "balance_loss_clip": 0.06285984, "balance_loss_mlp": 0.01254648, "epoch": 0.3482940027055464, "flos": 22207393854720.0, "grad_norm": 1.615156889386538, "language_loss": 0.82317102, "learning_rate": 3.0280854357779325e-06, "loss": 0.90062046, "num_input_tokens_seen": 124682975, "router_z_loss_clip": 1.88769531, "router_z_loss_mlp": 0.15576172, "step": 5793, "time_per_iteration": 2.6841580867767334 }, { "auxiliary_loss_clip": 0.06478308, "auxiliary_loss_mlp": 0.01274124, "balance_loss_clip": 0.06288067, "balance_loss_mlp": 0.01257768, "epoch": 0.34835412595821436, "flos": 20308884370560.0, "grad_norm": 2.789958311461188, "language_loss": 0.76462519, "learning_rate": 3.027751349849706e-06, "loss": 0.84214938, "num_input_tokens_seen": 124701340, "router_z_loss_clip": 1.90039062, "router_z_loss_mlp": 0.16345215, "step": 5794, "time_per_iteration": 2.5717899799346924 }, { "auxiliary_loss_clip": 0.06474672, "auxiliary_loss_mlp": 0.01273679, "balance_loss_clip": 0.06287867, "balance_loss_mlp": 0.01257526, "epoch": 0.3484142492108823, "flos": 20456065267200.0, "grad_norm": 1.8564898128635776, "language_loss": 0.57699811, "learning_rate": 3.0274172249490456e-06, "loss": 0.65448165, "num_input_tokens_seen": 124719165, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.16149902, "step": 5795, "time_per_iteration": 4.015840768814087 }, { "auxiliary_loss_clip": 0.06470694, "auxiliary_loss_mlp": 0.01270216, "balance_loss_clip": 0.06285569, "balance_loss_mlp": 0.01255613, "epoch": 0.3484743724635503, "flos": 24359832737280.0, "grad_norm": 1.5308436865045896, "language_loss": 0.8304261, "learning_rate": 3.0270830610886213e-06, "loss": 0.90783519, "num_input_tokens_seen": 124738670, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.14599609, "step": 5796, "time_per_iteration": 2.599379539489746 }, { "auxiliary_loss_clip": 0.06468132, "auxiliary_loss_mlp": 0.01271489, "balance_loss_clip": 0.06287857, "balance_loss_mlp": 0.01257017, "epoch": 0.34853449571621825, "flos": 24359916591360.0, "grad_norm": 1.5409146339307114, "language_loss": 0.83751655, "learning_rate": 3.0267488582811033e-06, "loss": 0.9149127, "num_input_tokens_seen": 124758760, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.14489746, "step": 5797, "time_per_iteration": 2.5965988636016846 }, { "auxiliary_loss_clip": 0.06471689, "auxiliary_loss_mlp": 0.01270419, "balance_loss_clip": 0.06287932, "balance_loss_mlp": 0.01256174, "epoch": 0.3485946189688862, "flos": 27274395231360.0, "grad_norm": 1.754331572458436, "language_loss": 0.73590541, "learning_rate": 3.026414616539167e-06, "loss": 0.81332648, "num_input_tokens_seen": 124777765, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.1427002, "step": 5798, "time_per_iteration": 2.617356777191162 }, { "auxiliary_loss_clip": 0.06476569, "auxiliary_loss_mlp": 0.01272968, "balance_loss_clip": 0.06285225, "balance_loss_mlp": 0.01256624, "epoch": 0.3486547422215542, "flos": 20162835504000.0, "grad_norm": 4.428605496796427, "language_loss": 0.76324379, "learning_rate": 3.026080335875485e-06, "loss": 0.84073919, "num_input_tokens_seen": 124796775, "router_z_loss_clip": 1.91308594, "router_z_loss_mlp": 0.16333008, "step": 5799, "time_per_iteration": 2.5576765537261963 }, { "auxiliary_loss_clip": 0.06473625, "auxiliary_loss_mlp": 0.01269544, "balance_loss_clip": 0.06285453, "balance_loss_mlp": 0.01254309, "epoch": 0.34871486547422215, "flos": 20236614624000.0, "grad_norm": 2.4359698354137724, "language_loss": 0.76307517, "learning_rate": 3.025746016302734e-06, "loss": 0.84050691, "num_input_tokens_seen": 124815825, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.15234375, "step": 5800, "time_per_iteration": 2.564171314239502 }, { "auxiliary_loss_clip": 0.06478575, "auxiliary_loss_mlp": 0.01271769, "balance_loss_clip": 0.06288098, "balance_loss_mlp": 0.01256593, "epoch": 0.3487749887268901, "flos": 44063096924160.0, "grad_norm": 2.4924982009403363, "language_loss": 0.67393625, "learning_rate": 3.025411657833591e-06, "loss": 0.75143975, "num_input_tokens_seen": 124838420, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.1517334, "step": 5801, "time_per_iteration": 2.753993272781372 }, { "auxiliary_loss_clip": 0.06472078, "auxiliary_loss_mlp": 0.01274453, "balance_loss_clip": 0.0628821, "balance_loss_mlp": 0.01258359, "epoch": 0.3488351119795581, "flos": 23301921813120.0, "grad_norm": 1.8018511689084136, "language_loss": 0.77121997, "learning_rate": 3.025077260480735e-06, "loss": 0.84868526, "num_input_tokens_seen": 124857320, "router_z_loss_clip": 1.83789062, "router_z_loss_mlp": 0.16088867, "step": 5802, "time_per_iteration": 2.59305739402771 }, { "auxiliary_loss_clip": 0.06463669, "auxiliary_loss_mlp": 0.01274883, "balance_loss_clip": 0.06284708, "balance_loss_mlp": 0.01261829, "epoch": 0.34889523523222604, "flos": 19940449968000.0, "grad_norm": 1.5950493086347368, "language_loss": 0.79622841, "learning_rate": 3.0247428242568474e-06, "loss": 0.87361395, "num_input_tokens_seen": 124875685, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.13049316, "step": 5803, "time_per_iteration": 2.5620203018188477 }, { "auxiliary_loss_clip": 0.06474401, "auxiliary_loss_mlp": 0.01270377, "balance_loss_clip": 0.06281948, "balance_loss_mlp": 0.01254438, "epoch": 0.348955358484894, "flos": 30454123570560.0, "grad_norm": 2.4093489926763074, "language_loss": 0.68117189, "learning_rate": 3.0244083491746085e-06, "loss": 0.75861967, "num_input_tokens_seen": 124895960, "router_z_loss_clip": 1.92285156, "router_z_loss_mlp": 0.15942383, "step": 5804, "time_per_iteration": 2.632176399230957 }, { "auxiliary_loss_clip": 0.0646973, "auxiliary_loss_mlp": 0.01271333, "balance_loss_clip": 0.06288974, "balance_loss_mlp": 0.01255514, "epoch": 0.349015481737562, "flos": 18005071887360.0, "grad_norm": 1.7837939580238693, "language_loss": 0.7662406, "learning_rate": 3.024073835246702e-06, "loss": 0.84365129, "num_input_tokens_seen": 124914140, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.15820312, "step": 5805, "time_per_iteration": 2.527104377746582 }, { "auxiliary_loss_clip": 0.06472412, "auxiliary_loss_mlp": 0.01271904, "balance_loss_clip": 0.06285864, "balance_loss_mlp": 0.01257039, "epoch": 0.34907560499023, "flos": 27205815064320.0, "grad_norm": 2.0756316033756543, "language_loss": 0.67925954, "learning_rate": 3.023739282485814e-06, "loss": 0.75670278, "num_input_tokens_seen": 124934180, "router_z_loss_clip": 1.86523438, "router_z_loss_mlp": 0.14855957, "step": 5806, "time_per_iteration": 2.587883472442627 }, { "auxiliary_loss_clip": 0.06470917, "auxiliary_loss_mlp": 0.01272279, "balance_loss_clip": 0.0628634, "balance_loss_mlp": 0.01256883, "epoch": 0.34913572824289796, "flos": 30234714854400.0, "grad_norm": 1.6923677312046221, "language_loss": 0.72484356, "learning_rate": 3.023404690904629e-06, "loss": 0.80227554, "num_input_tokens_seen": 124956060, "router_z_loss_clip": 1.84472656, "router_z_loss_mlp": 0.15393066, "step": 5807, "time_per_iteration": 2.654726505279541 }, { "auxiliary_loss_clip": 0.06475221, "auxiliary_loss_mlp": 0.01273104, "balance_loss_clip": 0.06284304, "balance_loss_mlp": 0.01256713, "epoch": 0.3491958514955659, "flos": 29979779207040.0, "grad_norm": 2.1671426132165403, "language_loss": 0.74843085, "learning_rate": 3.0230700605158364e-06, "loss": 0.82591408, "num_input_tokens_seen": 124976070, "router_z_loss_clip": 1.91113281, "router_z_loss_mlp": 0.16394043, "step": 5808, "time_per_iteration": 2.6473355293273926 }, { "auxiliary_loss_clip": 0.06465133, "auxiliary_loss_mlp": 0.01272627, "balance_loss_clip": 0.06283866, "balance_loss_mlp": 0.01257297, "epoch": 0.3492559747482339, "flos": 22789786458240.0, "grad_norm": 1.3385841537515193, "language_loss": 0.8481009, "learning_rate": 3.0227353913321238e-06, "loss": 0.92547846, "num_input_tokens_seen": 124996995, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.15332031, "step": 5809, "time_per_iteration": 2.588925838470459 }, { "auxiliary_loss_clip": 0.06467839, "auxiliary_loss_mlp": 0.01274955, "balance_loss_clip": 0.06287843, "balance_loss_mlp": 0.01260138, "epoch": 0.34931609800090185, "flos": 26075257050240.0, "grad_norm": 3.0726987684201967, "language_loss": 0.81205833, "learning_rate": 3.0224006833661835e-06, "loss": 0.88948625, "num_input_tokens_seen": 125015600, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.14819336, "step": 5810, "time_per_iteration": 2.634371042251587 }, { "auxiliary_loss_clip": 0.06472644, "auxiliary_loss_mlp": 0.01278819, "balance_loss_clip": 0.06287235, "balance_loss_mlp": 0.01262797, "epoch": 0.3493762212535698, "flos": 29249744509440.0, "grad_norm": 1.6211252696091591, "language_loss": 0.7598424, "learning_rate": 3.0220659366307057e-06, "loss": 0.83735704, "num_input_tokens_seen": 125035290, "router_z_loss_clip": 1.85449219, "router_z_loss_mlp": 0.16027832, "step": 5811, "time_per_iteration": 2.6430628299713135 }, { "auxiliary_loss_clip": 0.06470045, "auxiliary_loss_mlp": 0.01270845, "balance_loss_clip": 0.06283699, "balance_loss_mlp": 0.01255992, "epoch": 0.3494363445062378, "flos": 27133461463680.0, "grad_norm": 2.1498496157242704, "language_loss": 0.80471492, "learning_rate": 3.021731151138386e-06, "loss": 0.88212383, "num_input_tokens_seen": 125057130, "router_z_loss_clip": 1.86523438, "router_z_loss_mlp": 0.14837646, "step": 5812, "time_per_iteration": 2.6278553009033203 }, { "auxiliary_loss_clip": 0.06467411, "auxiliary_loss_mlp": 0.01270925, "balance_loss_clip": 0.0628079, "balance_loss_mlp": 0.012555, "epoch": 0.34949646775890575, "flos": 12281102173440.0, "grad_norm": 1.9284782394112525, "language_loss": 0.69865167, "learning_rate": 3.021396326901918e-06, "loss": 0.77603501, "num_input_tokens_seen": 125073720, "router_z_loss_clip": 1.86523438, "router_z_loss_mlp": 0.15423584, "step": 5813, "time_per_iteration": 2.5236082077026367 }, { "auxiliary_loss_clip": 0.064671, "auxiliary_loss_mlp": 0.0127628, "balance_loss_clip": 0.06284526, "balance_loss_mlp": 0.01260723, "epoch": 0.3495565910115737, "flos": 17171265507840.0, "grad_norm": 1.8174144328407382, "language_loss": 0.76952457, "learning_rate": 3.0210614639339998e-06, "loss": 0.8469584, "num_input_tokens_seen": 125090635, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.15551758, "step": 5814, "time_per_iteration": 2.540905714035034 }, { "auxiliary_loss_clip": 0.06472778, "auxiliary_loss_mlp": 0.01271343, "balance_loss_clip": 0.06285253, "balance_loss_mlp": 0.01255262, "epoch": 0.3496167142642417, "flos": 26472342349440.0, "grad_norm": 1.6098241554045056, "language_loss": 0.85456049, "learning_rate": 3.020726562247328e-06, "loss": 0.93200171, "num_input_tokens_seen": 125110070, "router_z_loss_clip": 1.875, "router_z_loss_mlp": 0.1607666, "step": 5815, "time_per_iteration": 2.5997486114501953 }, { "auxiliary_loss_clip": 0.06473579, "auxiliary_loss_mlp": 0.01279357, "balance_loss_clip": 0.06286993, "balance_loss_mlp": 0.01264122, "epoch": 0.34967683751690964, "flos": 17419618609920.0, "grad_norm": 1.998601922810785, "language_loss": 0.77811569, "learning_rate": 3.0203916218546024e-06, "loss": 0.85564506, "num_input_tokens_seen": 125125730, "router_z_loss_clip": 1.86425781, "router_z_loss_mlp": 0.15246582, "step": 5816, "time_per_iteration": 3.99534010887146 }, { "auxiliary_loss_clip": 0.06480245, "auxiliary_loss_mlp": 0.01275988, "balance_loss_clip": 0.06292253, "balance_loss_mlp": 0.0125993, "epoch": 0.3497369607695776, "flos": 22606365870720.0, "grad_norm": 2.1577604069954988, "language_loss": 0.59591001, "learning_rate": 3.0200566427685246e-06, "loss": 0.6734724, "num_input_tokens_seen": 125146195, "router_z_loss_clip": 1.88085938, "router_z_loss_mlp": 0.16064453, "step": 5817, "time_per_iteration": 2.5832836627960205 }, { "auxiliary_loss_clip": 0.06364666, "auxiliary_loss_mlp": 0.01254273, "balance_loss_clip": 0.06279521, "balance_loss_mlp": 0.01250736, "epoch": 0.34979708402224563, "flos": 68548461477120.0, "grad_norm": 0.8586178180047839, "language_loss": 0.59791422, "learning_rate": 3.0197216250017975e-06, "loss": 0.6741035, "num_input_tokens_seen": 125207790, "router_z_loss_clip": 0.8515625, "router_z_loss_mlp": 0.03540039, "step": 5818, "time_per_iteration": 3.248943567276001 }, { "auxiliary_loss_clip": 0.06467859, "auxiliary_loss_mlp": 0.01271357, "balance_loss_clip": 0.06284735, "balance_loss_mlp": 0.01255896, "epoch": 0.3498572072749136, "flos": 18995660455680.0, "grad_norm": 1.888992499585025, "language_loss": 0.83986795, "learning_rate": 3.019386568567123e-06, "loss": 0.91726017, "num_input_tokens_seen": 125226220, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.15466309, "step": 5819, "time_per_iteration": 2.590104579925537 }, { "auxiliary_loss_clip": 0.06471495, "auxiliary_loss_mlp": 0.01273788, "balance_loss_clip": 0.06284082, "balance_loss_mlp": 0.01258613, "epoch": 0.34991733052758156, "flos": 27826334075520.0, "grad_norm": 1.8104666662232327, "language_loss": 0.71072775, "learning_rate": 3.0190514734772083e-06, "loss": 0.78818065, "num_input_tokens_seen": 125247485, "router_z_loss_clip": 1.87109375, "router_z_loss_mlp": 0.15161133, "step": 5820, "time_per_iteration": 2.6107752323150635 }, { "auxiliary_loss_clip": 0.0647462, "auxiliary_loss_mlp": 0.01270132, "balance_loss_clip": 0.06287959, "balance_loss_mlp": 0.01254957, "epoch": 0.3499774537802495, "flos": 33592706755200.0, "grad_norm": 2.013763680660465, "language_loss": 0.70397687, "learning_rate": 3.018716339744759e-06, "loss": 0.7814244, "num_input_tokens_seen": 125268625, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.1517334, "step": 5821, "time_per_iteration": 2.670499563217163 }, { "auxiliary_loss_clip": 0.06480719, "auxiliary_loss_mlp": 0.01276614, "balance_loss_clip": 0.06288473, "balance_loss_mlp": 0.01258923, "epoch": 0.3500375770329175, "flos": 23483413756800.0, "grad_norm": 2.548655192856, "language_loss": 0.7472958, "learning_rate": 3.0183811673824842e-06, "loss": 0.82486916, "num_input_tokens_seen": 125287530, "router_z_loss_clip": 1.92382812, "router_z_loss_mlp": 0.17675781, "step": 5822, "time_per_iteration": 4.029402732849121 }, { "auxiliary_loss_clip": 0.06473553, "auxiliary_loss_mlp": 0.01275872, "balance_loss_clip": 0.06285213, "balance_loss_mlp": 0.01259219, "epoch": 0.35009770028558546, "flos": 19032067854720.0, "grad_norm": 1.4953536969502523, "language_loss": 0.78549612, "learning_rate": 3.018045956403094e-06, "loss": 0.86299038, "num_input_tokens_seen": 125307020, "router_z_loss_clip": 1.88378906, "router_z_loss_mlp": 0.16644287, "step": 5823, "time_per_iteration": 2.590606689453125 }, { "auxiliary_loss_clip": 0.06358803, "auxiliary_loss_mlp": 0.01253896, "balance_loss_clip": 0.06274158, "balance_loss_mlp": 0.01250838, "epoch": 0.3501578235382534, "flos": 68371749216000.0, "grad_norm": 0.6932063498788134, "language_loss": 0.5891614, "learning_rate": 3.017710706819298e-06, "loss": 0.66528833, "num_input_tokens_seen": 125370445, "router_z_loss_clip": 0.84375, "router_z_loss_mlp": 0.03057861, "step": 5824, "time_per_iteration": 3.2267189025878906 }, { "auxiliary_loss_clip": 0.06469382, "auxiliary_loss_mlp": 0.01271594, "balance_loss_clip": 0.06283599, "balance_loss_mlp": 0.01255512, "epoch": 0.3502179467909214, "flos": 21257153827200.0, "grad_norm": 4.446561771448851, "language_loss": 0.849962, "learning_rate": 3.017375418643811e-06, "loss": 0.92737174, "num_input_tokens_seen": 125388900, "router_z_loss_clip": 1.85742188, "router_z_loss_mlp": 0.16088867, "step": 5825, "time_per_iteration": 2.582442045211792 }, { "auxiliary_loss_clip": 0.06465118, "auxiliary_loss_mlp": 0.01273841, "balance_loss_clip": 0.06281526, "balance_loss_mlp": 0.01257676, "epoch": 0.35027807004358935, "flos": 11946978817920.0, "grad_norm": 2.9881509628644913, "language_loss": 0.83053327, "learning_rate": 3.0170400918893464e-06, "loss": 0.90792286, "num_input_tokens_seen": 125402675, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.16162109, "step": 5826, "time_per_iteration": 2.5379302501678467 }, { "auxiliary_loss_clip": 0.06475202, "auxiliary_loss_mlp": 0.01273275, "balance_loss_clip": 0.0628562, "balance_loss_mlp": 0.01255632, "epoch": 0.3503381932962573, "flos": 21477401084160.0, "grad_norm": 1.5211562117491164, "language_loss": 0.809609, "learning_rate": 3.0167047265686186e-06, "loss": 0.88709378, "num_input_tokens_seen": 125421360, "router_z_loss_clip": 1.89550781, "router_z_loss_mlp": 0.17633057, "step": 5827, "time_per_iteration": 2.586322546005249 }, { "auxiliary_loss_clip": 0.06467265, "auxiliary_loss_mlp": 0.01271599, "balance_loss_clip": 0.06282422, "balance_loss_mlp": 0.01256823, "epoch": 0.3503983165489253, "flos": 21257405389440.0, "grad_norm": 3.9304350231719454, "language_loss": 0.71827537, "learning_rate": 3.0163693226943467e-06, "loss": 0.79566395, "num_input_tokens_seen": 125440000, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.14764404, "step": 5828, "time_per_iteration": 2.564885377883911 }, { "auxiliary_loss_clip": 0.06469513, "auxiliary_loss_mlp": 0.01276495, "balance_loss_clip": 0.06282306, "balance_loss_mlp": 0.01258971, "epoch": 0.35045843980159325, "flos": 27822644496000.0, "grad_norm": 1.6832669061442131, "language_loss": 0.79920703, "learning_rate": 3.016033880279248e-06, "loss": 0.87666708, "num_input_tokens_seen": 125460390, "router_z_loss_clip": 1.87304688, "router_z_loss_mlp": 0.17541504, "step": 5829, "time_per_iteration": 4.03369665145874 }, { "auxiliary_loss_clip": 0.0647692, "auxiliary_loss_mlp": 0.01279554, "balance_loss_clip": 0.06283266, "balance_loss_mlp": 0.01261828, "epoch": 0.3505185630542612, "flos": 25928201934720.0, "grad_norm": 1.828942100278785, "language_loss": 0.72547013, "learning_rate": 3.0156983993360417e-06, "loss": 0.80303484, "num_input_tokens_seen": 125478410, "router_z_loss_clip": 1.93164062, "router_z_loss_mlp": 0.17724609, "step": 5830, "time_per_iteration": 2.6126768589019775 }, { "auxiliary_loss_clip": 0.06464946, "auxiliary_loss_mlp": 0.01271953, "balance_loss_clip": 0.06280548, "balance_loss_mlp": 0.01256217, "epoch": 0.35057868630692923, "flos": 20527999597440.0, "grad_norm": 1.9328293997905654, "language_loss": 0.88927662, "learning_rate": 3.0153628798774513e-06, "loss": 0.9666456, "num_input_tokens_seen": 125495975, "router_z_loss_clip": 1.84472656, "router_z_loss_mlp": 0.15722656, "step": 5831, "time_per_iteration": 2.576011896133423 }, { "auxiliary_loss_clip": 0.06468594, "auxiliary_loss_mlp": 0.01272362, "balance_loss_clip": 0.06282091, "balance_loss_mlp": 0.01257282, "epoch": 0.3506388095595972, "flos": 20454849383040.0, "grad_norm": 2.5240514658933275, "language_loss": 0.78650981, "learning_rate": 3.0150273219161985e-06, "loss": 0.86391938, "num_input_tokens_seen": 125515035, "router_z_loss_clip": 1.86425781, "router_z_loss_mlp": 0.15081787, "step": 5832, "time_per_iteration": 2.5788462162017822 }, { "auxiliary_loss_clip": 0.0647198, "auxiliary_loss_mlp": 0.01273489, "balance_loss_clip": 0.06282133, "balance_loss_mlp": 0.01256228, "epoch": 0.35069893281226516, "flos": 23115901749120.0, "grad_norm": 2.334974472010864, "language_loss": 0.71919692, "learning_rate": 3.014691725465008e-06, "loss": 0.7966516, "num_input_tokens_seen": 125535555, "router_z_loss_clip": 1.89746094, "router_z_loss_mlp": 0.17260742, "step": 5833, "time_per_iteration": 2.7128968238830566 }, { "auxiliary_loss_clip": 0.0646421, "auxiliary_loss_mlp": 0.0127101, "balance_loss_clip": 0.06283325, "balance_loss_mlp": 0.01255567, "epoch": 0.35075905606493313, "flos": 27279426476160.0, "grad_norm": 1.4274384753520863, "language_loss": 0.81118739, "learning_rate": 3.014356090536606e-06, "loss": 0.88853961, "num_input_tokens_seen": 125558195, "router_z_loss_clip": 1.80859375, "router_z_loss_mlp": 0.15435791, "step": 5834, "time_per_iteration": 4.136436462402344 }, { "auxiliary_loss_clip": 0.06468005, "auxiliary_loss_mlp": 0.01271746, "balance_loss_clip": 0.06282948, "balance_loss_mlp": 0.01255939, "epoch": 0.3508191793176011, "flos": 19133491622400.0, "grad_norm": 2.132518137275804, "language_loss": 0.8392362, "learning_rate": 3.0140204171437183e-06, "loss": 0.91663373, "num_input_tokens_seen": 125575375, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.15820312, "step": 5835, "time_per_iteration": 2.5364933013916016 }, { "auxiliary_loss_clip": 0.06468523, "auxiliary_loss_mlp": 0.01271593, "balance_loss_clip": 0.06282847, "balance_loss_mlp": 0.01256513, "epoch": 0.35087930257026906, "flos": 25564798776960.0, "grad_norm": 1.7828953528537657, "language_loss": 0.77262694, "learning_rate": 3.0136847052990754e-06, "loss": 0.85002816, "num_input_tokens_seen": 125596745, "router_z_loss_clip": 1.85644531, "router_z_loss_mlp": 0.15063477, "step": 5836, "time_per_iteration": 2.599569320678711 }, { "auxiliary_loss_clip": 0.06467872, "auxiliary_loss_mlp": 0.01275127, "balance_loss_clip": 0.06285422, "balance_loss_mlp": 0.01259749, "epoch": 0.350939425822937, "flos": 18010061205120.0, "grad_norm": 2.137652558566755, "language_loss": 0.7821151, "learning_rate": 3.0133489550154074e-06, "loss": 0.85954505, "num_input_tokens_seen": 125613980, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.15368652, "step": 5837, "time_per_iteration": 2.5374767780303955 }, { "auxiliary_loss_clip": 0.06467008, "auxiliary_loss_mlp": 0.01271569, "balance_loss_clip": 0.0628401, "balance_loss_mlp": 0.01256763, "epoch": 0.350999549075605, "flos": 22279747455360.0, "grad_norm": 1.656783480770433, "language_loss": 0.68745577, "learning_rate": 3.0130131663054442e-06, "loss": 0.76484156, "num_input_tokens_seen": 125632100, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.14807129, "step": 5838, "time_per_iteration": 2.5982072353363037 }, { "auxiliary_loss_clip": 0.06466621, "auxiliary_loss_mlp": 0.01269886, "balance_loss_clip": 0.06284292, "balance_loss_mlp": 0.01253959, "epoch": 0.35105967232827295, "flos": 14397511000320.0, "grad_norm": 1.9578252922309511, "language_loss": 0.83696318, "learning_rate": 3.0126773391819215e-06, "loss": 0.91432822, "num_input_tokens_seen": 125649190, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.15917969, "step": 5839, "time_per_iteration": 2.688133478164673 }, { "auxiliary_loss_clip": 0.06473701, "auxiliary_loss_mlp": 0.0127303, "balance_loss_clip": 0.06283708, "balance_loss_mlp": 0.01257306, "epoch": 0.3511197955809409, "flos": 25089322383360.0, "grad_norm": 1.9126681984966925, "language_loss": 0.59393805, "learning_rate": 3.012341473657572e-06, "loss": 0.67140532, "num_input_tokens_seen": 125668680, "router_z_loss_clip": 1.8984375, "router_z_loss_mlp": 0.15710449, "step": 5840, "time_per_iteration": 2.6415627002716064 }, { "auxiliary_loss_clip": 0.06468448, "auxiliary_loss_mlp": 0.01270895, "balance_loss_clip": 0.06282987, "balance_loss_mlp": 0.01255362, "epoch": 0.3511799188336089, "flos": 25891123703040.0, "grad_norm": 3.039387873281861, "language_loss": 0.88562876, "learning_rate": 3.0120055697451322e-06, "loss": 0.96302223, "num_input_tokens_seen": 125686935, "router_z_loss_clip": 1.85449219, "router_z_loss_mlp": 0.15527344, "step": 5841, "time_per_iteration": 2.606177568435669 }, { "auxiliary_loss_clip": 0.06472868, "auxiliary_loss_mlp": 0.01273883, "balance_loss_clip": 0.06282554, "balance_loss_mlp": 0.01257336, "epoch": 0.35124004208627685, "flos": 20089852997760.0, "grad_norm": 1.9073911622310367, "language_loss": 0.75382805, "learning_rate": 3.0116696274573406e-06, "loss": 0.83129561, "num_input_tokens_seen": 125707180, "router_z_loss_clip": 1.90136719, "router_z_loss_mlp": 0.16552734, "step": 5842, "time_per_iteration": 2.6017098426818848 }, { "auxiliary_loss_clip": 0.06473182, "auxiliary_loss_mlp": 0.01275175, "balance_loss_clip": 0.06285357, "balance_loss_mlp": 0.01259302, "epoch": 0.3513001653389448, "flos": 17788891553280.0, "grad_norm": 1.851599295066269, "language_loss": 0.69179058, "learning_rate": 3.0113336468069346e-06, "loss": 0.76927412, "num_input_tokens_seen": 125722780, "router_z_loss_clip": 1.87890625, "router_z_loss_mlp": 0.15863037, "step": 5843, "time_per_iteration": 2.605365514755249 }, { "auxiliary_loss_clip": 0.06466326, "auxiliary_loss_mlp": 0.01277305, "balance_loss_clip": 0.06281439, "balance_loss_mlp": 0.01260985, "epoch": 0.3513602885916128, "flos": 29394745200000.0, "grad_norm": 2.1869289621226256, "language_loss": 0.66077399, "learning_rate": 3.010997627806655e-06, "loss": 0.73821032, "num_input_tokens_seen": 125742110, "router_z_loss_clip": 1.84863281, "router_z_loss_mlp": 0.16320801, "step": 5844, "time_per_iteration": 2.634448766708374 }, { "auxiliary_loss_clip": 0.06471064, "auxiliary_loss_mlp": 0.01272837, "balance_loss_clip": 0.06285992, "balance_loss_mlp": 0.01256362, "epoch": 0.3514204118442808, "flos": 16185372768000.0, "grad_norm": 2.310170662393711, "language_loss": 0.76067907, "learning_rate": 3.010661570469245e-06, "loss": 0.83811808, "num_input_tokens_seen": 125759980, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.16467285, "step": 5845, "time_per_iteration": 2.555851936340332 }, { "auxiliary_loss_clip": 0.06464444, "auxiliary_loss_mlp": 0.01272724, "balance_loss_clip": 0.06281839, "balance_loss_mlp": 0.01257358, "epoch": 0.35148053509694877, "flos": 23840234369280.0, "grad_norm": 3.1024840470370103, "language_loss": 0.74230802, "learning_rate": 3.0103254748074465e-06, "loss": 0.81967962, "num_input_tokens_seen": 125772660, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.15368652, "step": 5846, "time_per_iteration": 2.560037136077881 }, { "auxiliary_loss_clip": 0.0647378, "auxiliary_loss_mlp": 0.01277614, "balance_loss_clip": 0.06286616, "balance_loss_mlp": 0.01261997, "epoch": 0.35154065834961673, "flos": 20996809591680.0, "grad_norm": 1.609909529866876, "language_loss": 0.75946677, "learning_rate": 3.0099893408340046e-06, "loss": 0.8369807, "num_input_tokens_seen": 125791935, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.15625, "step": 5847, "time_per_iteration": 2.5663859844207764 }, { "auxiliary_loss_clip": 0.06470749, "auxiliary_loss_mlp": 0.01269665, "balance_loss_clip": 0.06282846, "balance_loss_mlp": 0.01255205, "epoch": 0.3516007816022847, "flos": 33263866206720.0, "grad_norm": 2.194482392848483, "language_loss": 0.72727937, "learning_rate": 3.009653168561666e-06, "loss": 0.80468351, "num_input_tokens_seen": 125813455, "router_z_loss_clip": 1.87695312, "router_z_loss_mlp": 0.14447021, "step": 5848, "time_per_iteration": 2.6619465351104736 }, { "auxiliary_loss_clip": 0.06471166, "auxiliary_loss_mlp": 0.01273634, "balance_loss_clip": 0.06284407, "balance_loss_mlp": 0.01257809, "epoch": 0.35166090485495266, "flos": 11731427389440.0, "grad_norm": 2.417466508095473, "language_loss": 0.90474224, "learning_rate": 3.009316958003178e-06, "loss": 0.98219025, "num_input_tokens_seen": 125827660, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.15826416, "step": 5849, "time_per_iteration": 2.54529070854187 }, { "auxiliary_loss_clip": 0.06464751, "auxiliary_loss_mlp": 0.01272973, "balance_loss_clip": 0.06282093, "balance_loss_mlp": 0.01257738, "epoch": 0.3517210281076206, "flos": 22645121184000.0, "grad_norm": 1.9023761219866973, "language_loss": 0.75889957, "learning_rate": 3.0089807091712897e-06, "loss": 0.83627677, "num_input_tokens_seen": 125846655, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.15234375, "step": 5850, "time_per_iteration": 2.583740234375 }, { "auxiliary_loss_clip": 0.06465784, "auxiliary_loss_mlp": 0.0127015, "balance_loss_clip": 0.06283067, "balance_loss_mlp": 0.01255356, "epoch": 0.3517811513602886, "flos": 21328836595200.0, "grad_norm": 1.5035734209218414, "language_loss": 0.76283395, "learning_rate": 3.0086444220787515e-06, "loss": 0.84019333, "num_input_tokens_seen": 125866290, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.14794922, "step": 5851, "time_per_iteration": 2.6247761249542236 }, { "auxiliary_loss_clip": 0.06464824, "auxiliary_loss_mlp": 0.012739, "balance_loss_clip": 0.06281504, "balance_loss_mlp": 0.01258212, "epoch": 0.35184127461295656, "flos": 21039254484480.0, "grad_norm": 1.892789168264524, "language_loss": 0.88086486, "learning_rate": 3.0083080967383165e-06, "loss": 0.95825213, "num_input_tokens_seen": 125884620, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.15686035, "step": 5852, "time_per_iteration": 2.5929977893829346 }, { "auxiliary_loss_clip": 0.06465434, "auxiliary_loss_mlp": 0.01271766, "balance_loss_clip": 0.06283687, "balance_loss_mlp": 0.01256483, "epoch": 0.3519013978656245, "flos": 22461784450560.0, "grad_norm": 2.53349799955589, "language_loss": 0.68055391, "learning_rate": 3.007971733162737e-06, "loss": 0.75792587, "num_input_tokens_seen": 125902430, "router_z_loss_clip": 1.81738281, "router_z_loss_mlp": 0.15283203, "step": 5853, "time_per_iteration": 2.5814132690429688 }, { "auxiliary_loss_clip": 0.06471318, "auxiliary_loss_mlp": 0.01275984, "balance_loss_clip": 0.0628615, "balance_loss_mlp": 0.01260069, "epoch": 0.3519615211182925, "flos": 13120317141120.0, "grad_norm": 1.7279229912980527, "language_loss": 0.82020801, "learning_rate": 3.0076353313647686e-06, "loss": 0.897681, "num_input_tokens_seen": 125920570, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.15917969, "step": 5854, "time_per_iteration": 2.561506509780884 }, { "auxiliary_loss_clip": 0.06459436, "auxiliary_loss_mlp": 0.01268852, "balance_loss_clip": 0.06280915, "balance_loss_mlp": 0.01255137, "epoch": 0.35202164437096045, "flos": 19141122343680.0, "grad_norm": 1.318930312146895, "language_loss": 0.7341488, "learning_rate": 3.0072988913571666e-06, "loss": 0.81143171, "num_input_tokens_seen": 125939800, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.13726807, "step": 5855, "time_per_iteration": 2.5903005599975586 }, { "auxiliary_loss_clip": 0.06459889, "auxiliary_loss_mlp": 0.0127024, "balance_loss_clip": 0.06279801, "balance_loss_mlp": 0.01256018, "epoch": 0.3520817676236284, "flos": 26549475632640.0, "grad_norm": 2.0477150240008455, "language_loss": 0.71298313, "learning_rate": 3.006962413152691e-06, "loss": 0.7902844, "num_input_tokens_seen": 125958720, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.14221191, "step": 5856, "time_per_iteration": 4.070830345153809 }, { "auxiliary_loss_clip": 0.06475905, "auxiliary_loss_mlp": 0.01271925, "balance_loss_clip": 0.06288119, "balance_loss_mlp": 0.01255987, "epoch": 0.3521418908762964, "flos": 44903653557120.0, "grad_norm": 1.6964280981119948, "language_loss": 0.6153698, "learning_rate": 3.0066258967640987e-06, "loss": 0.69284809, "num_input_tokens_seen": 125984310, "router_z_loss_clip": 1.87792969, "router_z_loss_mlp": 0.15930176, "step": 5857, "time_per_iteration": 2.786252975463867 }, { "auxiliary_loss_clip": 0.0646994, "auxiliary_loss_mlp": 0.01268693, "balance_loss_clip": 0.06284795, "balance_loss_mlp": 0.01252988, "epoch": 0.3522020141289644, "flos": 20192576503680.0, "grad_norm": 1.6539460277547042, "language_loss": 0.73603827, "learning_rate": 3.006289342204152e-06, "loss": 0.81342459, "num_input_tokens_seen": 126002410, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.15692139, "step": 5858, "time_per_iteration": 2.5835883617401123 }, { "auxiliary_loss_clip": 0.06471242, "auxiliary_loss_mlp": 0.01270364, "balance_loss_clip": 0.06287422, "balance_loss_mlp": 0.01255534, "epoch": 0.35226213738163237, "flos": 27571398428160.0, "grad_norm": 1.7254483934070166, "language_loss": 0.76273358, "learning_rate": 3.0059527494856126e-06, "loss": 0.84014964, "num_input_tokens_seen": 126022490, "router_z_loss_clip": 1.83789062, "router_z_loss_mlp": 0.14831543, "step": 5859, "time_per_iteration": 2.6251652240753174 }, { "auxiliary_loss_clip": 0.0648304, "auxiliary_loss_mlp": 0.01273765, "balance_loss_clip": 0.06291164, "balance_loss_mlp": 0.01258202, "epoch": 0.35232226063430033, "flos": 22972955483520.0, "grad_norm": 1.7620658438481696, "language_loss": 0.72097421, "learning_rate": 3.0056161186212435e-06, "loss": 0.79854226, "num_input_tokens_seen": 126042895, "router_z_loss_clip": 1.91699219, "router_z_loss_mlp": 0.15570068, "step": 5860, "time_per_iteration": 2.5999979972839355 }, { "auxiliary_loss_clip": 0.06476898, "auxiliary_loss_mlp": 0.01273139, "balance_loss_clip": 0.06286831, "balance_loss_mlp": 0.0125701, "epoch": 0.3523823838869683, "flos": 19173714382080.0, "grad_norm": 1.9970672568591172, "language_loss": 0.67222428, "learning_rate": 3.005279449623811e-06, "loss": 0.74972475, "num_input_tokens_seen": 126060130, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.16137695, "step": 5861, "time_per_iteration": 2.560492992401123 }, { "auxiliary_loss_clip": 0.06474502, "auxiliary_loss_mlp": 0.01269456, "balance_loss_clip": 0.06292714, "balance_loss_mlp": 0.01255109, "epoch": 0.35244250713963626, "flos": 17936743282560.0, "grad_norm": 2.3393166145486206, "language_loss": 0.67303938, "learning_rate": 3.0049427425060815e-06, "loss": 0.75047898, "num_input_tokens_seen": 126077850, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.14337158, "step": 5862, "time_per_iteration": 4.014210224151611 }, { "auxiliary_loss_clip": 0.06474039, "auxiliary_loss_mlp": 0.0127697, "balance_loss_clip": 0.06286055, "balance_loss_mlp": 0.01260674, "epoch": 0.35250263039230423, "flos": 21438687697920.0, "grad_norm": 1.929077354396276, "language_loss": 0.77460039, "learning_rate": 3.0046059972808215e-06, "loss": 0.85211045, "num_input_tokens_seen": 126095985, "router_z_loss_clip": 1.88085938, "router_z_loss_mlp": 0.16296387, "step": 5863, "time_per_iteration": 2.5616352558135986 }, { "auxiliary_loss_clip": 0.06475432, "auxiliary_loss_mlp": 0.01270324, "balance_loss_clip": 0.06290951, "balance_loss_mlp": 0.01256042, "epoch": 0.3525627536449722, "flos": 27424133677440.0, "grad_norm": 1.812253769255842, "language_loss": 0.75465226, "learning_rate": 3.0042692139608024e-06, "loss": 0.83210981, "num_input_tokens_seen": 126116070, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.1428833, "step": 5864, "time_per_iteration": 2.611226797103882 }, { "auxiliary_loss_clip": 0.06474021, "auxiliary_loss_mlp": 0.01270626, "balance_loss_clip": 0.06289814, "balance_loss_mlp": 0.01256452, "epoch": 0.35262287689764016, "flos": 24796637671680.0, "grad_norm": 2.5422953586520975, "language_loss": 0.79989511, "learning_rate": 3.003932392558793e-06, "loss": 0.87734163, "num_input_tokens_seen": 126135205, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.14196777, "step": 5865, "time_per_iteration": 2.6138052940368652 }, { "auxiliary_loss_clip": 0.06488538, "auxiliary_loss_mlp": 0.01272428, "balance_loss_clip": 0.06300026, "balance_loss_mlp": 0.01257003, "epoch": 0.3526830001503081, "flos": 17827353377280.0, "grad_norm": 5.138317891081969, "language_loss": 0.81901157, "learning_rate": 3.0035955330875677e-06, "loss": 0.89662123, "num_input_tokens_seen": 126151895, "router_z_loss_clip": 1.88476562, "router_z_loss_mlp": 0.1541748, "step": 5866, "time_per_iteration": 2.5670650005340576 }, { "auxiliary_loss_clip": 0.0649311, "auxiliary_loss_mlp": 0.01272204, "balance_loss_clip": 0.06298084, "balance_loss_mlp": 0.01255277, "epoch": 0.3527431234029761, "flos": 18084091887360.0, "grad_norm": 2.081073850922842, "language_loss": 0.84811628, "learning_rate": 3.0032586355598986e-06, "loss": 0.92576933, "num_input_tokens_seen": 126168515, "router_z_loss_clip": 1.95117188, "router_z_loss_mlp": 0.16918945, "step": 5867, "time_per_iteration": 2.5751683712005615 }, { "auxiliary_loss_clip": 0.06480755, "auxiliary_loss_mlp": 0.01270085, "balance_loss_clip": 0.06293453, "balance_loss_mlp": 0.01254015, "epoch": 0.35280324665564405, "flos": 19433429712000.0, "grad_norm": 2.0655899004658673, "language_loss": 0.74636269, "learning_rate": 3.0029216999885613e-06, "loss": 0.82387108, "num_input_tokens_seen": 126186460, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.16046143, "step": 5868, "time_per_iteration": 3.95906400680542 }, { "auxiliary_loss_clip": 0.06486978, "auxiliary_loss_mlp": 0.01274769, "balance_loss_clip": 0.06298794, "balance_loss_mlp": 0.01257984, "epoch": 0.352863369908312, "flos": 21509951195520.0, "grad_norm": 2.8239848518809803, "language_loss": 0.62371469, "learning_rate": 3.0025847263863327e-06, "loss": 0.70133209, "num_input_tokens_seen": 126206170, "router_z_loss_clip": 1.88183594, "router_z_loss_mlp": 0.16809082, "step": 5869, "time_per_iteration": 2.6353893280029297 }, { "auxiliary_loss_clip": 0.06477287, "auxiliary_loss_mlp": 0.01276386, "balance_loss_clip": 0.06289861, "balance_loss_mlp": 0.01261198, "epoch": 0.35292349316098, "flos": 22316029073280.0, "grad_norm": 1.8199926262091524, "language_loss": 0.74826634, "learning_rate": 3.0022477147659917e-06, "loss": 0.82580304, "num_input_tokens_seen": 126225605, "router_z_loss_clip": 1.87304688, "router_z_loss_mlp": 0.15185547, "step": 5870, "time_per_iteration": 2.575552463531494 }, { "auxiliary_loss_clip": 0.06480719, "auxiliary_loss_mlp": 0.01268315, "balance_loss_clip": 0.06295392, "balance_loss_mlp": 0.01253056, "epoch": 0.352983616413648, "flos": 33118152756480.0, "grad_norm": 1.6370124955229766, "language_loss": 0.7268585, "learning_rate": 3.001910665140316e-06, "loss": 0.80434883, "num_input_tokens_seen": 126250230, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.15270996, "step": 5871, "time_per_iteration": 2.738332748413086 }, { "auxiliary_loss_clip": 0.06467061, "auxiliary_loss_mlp": 0.01271848, "balance_loss_clip": 0.06287844, "balance_loss_mlp": 0.01258115, "epoch": 0.35304373966631597, "flos": 18702388765440.0, "grad_norm": 1.7382600016481622, "language_loss": 0.74252653, "learning_rate": 3.0015735775220873e-06, "loss": 0.81991565, "num_input_tokens_seen": 126268315, "router_z_loss_clip": 1.79199219, "router_z_loss_mlp": 0.137146, "step": 5872, "time_per_iteration": 2.609248399734497 }, { "auxiliary_loss_clip": 0.06476021, "auxiliary_loss_mlp": 0.01266915, "balance_loss_clip": 0.06291832, "balance_loss_mlp": 0.01253015, "epoch": 0.35310386291898394, "flos": 23371214739840.0, "grad_norm": 1.8295273554367293, "language_loss": 0.83059388, "learning_rate": 3.001236451924089e-06, "loss": 0.90802324, "num_input_tokens_seen": 126288390, "router_z_loss_clip": 1.83789062, "router_z_loss_mlp": 0.13903809, "step": 5873, "time_per_iteration": 2.608396291732788 }, { "auxiliary_loss_clip": 0.06483988, "auxiliary_loss_mlp": 0.01274432, "balance_loss_clip": 0.06292839, "balance_loss_mlp": 0.01257707, "epoch": 0.3531639861716519, "flos": 24468803372160.0, "grad_norm": 1.980667729696505, "language_loss": 0.66406763, "learning_rate": 3.000899288359104e-06, "loss": 0.74165177, "num_input_tokens_seen": 126305750, "router_z_loss_clip": 1.91210938, "router_z_loss_mlp": 0.16717529, "step": 5874, "time_per_iteration": 4.056976795196533 }, { "auxiliary_loss_clip": 0.06361531, "auxiliary_loss_mlp": 0.01278777, "balance_loss_clip": 0.06277093, "balance_loss_mlp": 0.01275903, "epoch": 0.35322410942431987, "flos": 70331040437760.0, "grad_norm": 0.7549189735457639, "language_loss": 0.61581016, "learning_rate": 3.000562086839917e-06, "loss": 0.69221324, "num_input_tokens_seen": 126362495, "router_z_loss_clip": 0.84375, "router_z_loss_mlp": 0.02871704, "step": 5875, "time_per_iteration": 3.085461139678955 }, { "auxiliary_loss_clip": 0.06476035, "auxiliary_loss_mlp": 0.01274379, "balance_loss_clip": 0.06291503, "balance_loss_mlp": 0.01259758, "epoch": 0.35328423267698783, "flos": 19825735328640.0, "grad_norm": 2.0374682418433325, "language_loss": 0.80368131, "learning_rate": 3.0002248473793163e-06, "loss": 0.88118541, "num_input_tokens_seen": 126378320, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.1461792, "step": 5876, "time_per_iteration": 2.5972495079040527 }, { "auxiliary_loss_clip": 0.06357273, "auxiliary_loss_mlp": 0.01260764, "balance_loss_clip": 0.06273269, "balance_loss_mlp": 0.01257567, "epoch": 0.3533443559296558, "flos": 60843398480640.0, "grad_norm": 0.6629952601849503, "language_loss": 0.56716895, "learning_rate": 2.999887569990088e-06, "loss": 0.64334929, "num_input_tokens_seen": 126442735, "router_z_loss_clip": 0.83837891, "router_z_loss_mlp": 0.03201294, "step": 5877, "time_per_iteration": 3.2848925590515137 }, { "auxiliary_loss_clip": 0.06475478, "auxiliary_loss_mlp": 0.01274198, "balance_loss_clip": 0.06290351, "balance_loss_mlp": 0.01258391, "epoch": 0.35340447918232376, "flos": 24762997457280.0, "grad_norm": 1.4892969534644227, "language_loss": 0.72494107, "learning_rate": 2.999550254685024e-06, "loss": 0.80243778, "num_input_tokens_seen": 126463090, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.15808105, "step": 5878, "time_per_iteration": 2.601990222930908 }, { "auxiliary_loss_clip": 0.06473425, "auxiliary_loss_mlp": 0.01272532, "balance_loss_clip": 0.06289096, "balance_loss_mlp": 0.01257846, "epoch": 0.3534646024349917, "flos": 21802342417920.0, "grad_norm": 1.7590076272366155, "language_loss": 0.79074764, "learning_rate": 2.9992129014769136e-06, "loss": 0.86820722, "num_input_tokens_seen": 126482105, "router_z_loss_clip": 1.84179688, "router_z_loss_mlp": 0.14685059, "step": 5879, "time_per_iteration": 2.572979688644409 }, { "auxiliary_loss_clip": 0.06480504, "auxiliary_loss_mlp": 0.01270887, "balance_loss_clip": 0.06289217, "balance_loss_mlp": 0.01253542, "epoch": 0.3535247256876597, "flos": 20018463719040.0, "grad_norm": 2.037224868314407, "language_loss": 0.63485485, "learning_rate": 2.9988755103785493e-06, "loss": 0.71236873, "num_input_tokens_seen": 126502125, "router_z_loss_clip": 1.91210938, "router_z_loss_mlp": 0.17333984, "step": 5880, "time_per_iteration": 2.63651442527771 }, { "auxiliary_loss_clip": 0.06478067, "auxiliary_loss_mlp": 0.01270629, "balance_loss_clip": 0.06289752, "balance_loss_mlp": 0.01254405, "epoch": 0.35358484894032766, "flos": 18193984917120.0, "grad_norm": 2.154330980608583, "language_loss": 0.66435158, "learning_rate": 2.998538081402727e-06, "loss": 0.74183851, "num_input_tokens_seen": 126521950, "router_z_loss_clip": 1.88183594, "router_z_loss_mlp": 0.16235352, "step": 5881, "time_per_iteration": 2.6709494590759277 }, { "auxiliary_loss_clip": 0.06467684, "auxiliary_loss_mlp": 0.01275255, "balance_loss_clip": 0.06288894, "balance_loss_mlp": 0.01261308, "epoch": 0.3536449721929956, "flos": 22826990471040.0, "grad_norm": 1.3552016132016176, "language_loss": 0.76119757, "learning_rate": 2.998200614562239e-06, "loss": 0.83862698, "num_input_tokens_seen": 126542445, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.1395874, "step": 5882, "time_per_iteration": 2.6066014766693115 }, { "auxiliary_loss_clip": 0.06473306, "auxiliary_loss_mlp": 0.01272033, "balance_loss_clip": 0.06287748, "balance_loss_mlp": 0.01257096, "epoch": 0.3537050954456636, "flos": 26439540675840.0, "grad_norm": 1.94742086774899, "language_loss": 0.7072559, "learning_rate": 2.9978631098698847e-06, "loss": 0.78470927, "num_input_tokens_seen": 126560690, "router_z_loss_clip": 1.85644531, "router_z_loss_mlp": 0.14941406, "step": 5883, "time_per_iteration": 2.6329190731048584 }, { "auxiliary_loss_clip": 0.06481615, "auxiliary_loss_mlp": 0.01271163, "balance_loss_clip": 0.06289718, "balance_loss_mlp": 0.01256315, "epoch": 0.3537652186983316, "flos": 17202096610560.0, "grad_norm": 1.9597695128262425, "language_loss": 0.78891718, "learning_rate": 2.9975255673384614e-06, "loss": 0.86644495, "num_input_tokens_seen": 126577620, "router_z_loss_clip": 1.91894531, "router_z_loss_mlp": 0.14837646, "step": 5884, "time_per_iteration": 2.5539798736572266 }, { "auxiliary_loss_clip": 0.06469164, "auxiliary_loss_mlp": 0.01270476, "balance_loss_clip": 0.06287082, "balance_loss_mlp": 0.01255825, "epoch": 0.3538253419509996, "flos": 19542861544320.0, "grad_norm": 1.8899819456810563, "language_loss": 0.75363588, "learning_rate": 2.9971879869807673e-06, "loss": 0.83103228, "num_input_tokens_seen": 126596235, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.14660645, "step": 5885, "time_per_iteration": 2.5845508575439453 }, { "auxiliary_loss_clip": 0.06475472, "auxiliary_loss_mlp": 0.0127199, "balance_loss_clip": 0.06287542, "balance_loss_mlp": 0.01256147, "epoch": 0.35388546520366754, "flos": 12133166590080.0, "grad_norm": 2.127704670990518, "language_loss": 0.83857679, "learning_rate": 2.996850368809606e-06, "loss": 0.91605151, "num_input_tokens_seen": 126612830, "router_z_loss_clip": 1.88085938, "router_z_loss_mlp": 0.15826416, "step": 5886, "time_per_iteration": 2.65798020362854 }, { "auxiliary_loss_clip": 0.06469453, "auxiliary_loss_mlp": 0.01270563, "balance_loss_clip": 0.06286788, "balance_loss_mlp": 0.01254446, "epoch": 0.3539455884563355, "flos": 19683501822720.0, "grad_norm": 2.021280975066769, "language_loss": 0.78598207, "learning_rate": 2.9965127128377787e-06, "loss": 0.86338222, "num_input_tokens_seen": 126630910, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.16125488, "step": 5887, "time_per_iteration": 2.789033889770508 }, { "auxiliary_loss_clip": 0.06469843, "auxiliary_loss_mlp": 0.01267311, "balance_loss_clip": 0.06285223, "balance_loss_mlp": 0.01253208, "epoch": 0.35400571170900347, "flos": 18077006217600.0, "grad_norm": 1.802908414365561, "language_loss": 0.66265774, "learning_rate": 2.996175019078089e-06, "loss": 0.74002922, "num_input_tokens_seen": 126648365, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.14111328, "step": 5888, "time_per_iteration": 2.6097280979156494 }, { "auxiliary_loss_clip": 0.06470773, "auxiliary_loss_mlp": 0.0127127, "balance_loss_clip": 0.06285858, "balance_loss_mlp": 0.01256619, "epoch": 0.35406583496167143, "flos": 26075298977280.0, "grad_norm": 1.7701493220764848, "language_loss": 0.77490026, "learning_rate": 2.9958372875433437e-06, "loss": 0.85232073, "num_input_tokens_seen": 126667500, "router_z_loss_clip": 1.84960938, "router_z_loss_mlp": 0.14648438, "step": 5889, "time_per_iteration": 2.6132640838623047 }, { "auxiliary_loss_clip": 0.06470559, "auxiliary_loss_mlp": 0.01274657, "balance_loss_clip": 0.06287931, "balance_loss_mlp": 0.01259613, "epoch": 0.3541259582143394, "flos": 19798635732480.0, "grad_norm": 1.9277430122218144, "language_loss": 0.81355703, "learning_rate": 2.9954995182463478e-06, "loss": 0.89100921, "num_input_tokens_seen": 126686820, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.1505127, "step": 5890, "time_per_iteration": 2.5496063232421875 }, { "auxiliary_loss_clip": 0.06469855, "auxiliary_loss_mlp": 0.01271684, "balance_loss_clip": 0.0628729, "balance_loss_mlp": 0.01258213, "epoch": 0.35418608146700736, "flos": 24028518493440.0, "grad_norm": 1.7351124988167244, "language_loss": 0.80172288, "learning_rate": 2.99516171119991e-06, "loss": 0.87913823, "num_input_tokens_seen": 126706965, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.13470459, "step": 5891, "time_per_iteration": 2.604328155517578 }, { "auxiliary_loss_clip": 0.06473483, "auxiliary_loss_mlp": 0.01273765, "balance_loss_clip": 0.06291454, "balance_loss_mlp": 0.01257982, "epoch": 0.35424620471967533, "flos": 12390701713920.0, "grad_norm": 2.5117442604411537, "language_loss": 0.73973668, "learning_rate": 2.9948238664168415e-06, "loss": 0.81720912, "num_input_tokens_seen": 126724015, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.15795898, "step": 5892, "time_per_iteration": 2.550981044769287 }, { "auxiliary_loss_clip": 0.06473006, "auxiliary_loss_mlp": 0.01273596, "balance_loss_clip": 0.06286262, "balance_loss_mlp": 0.01258826, "epoch": 0.3543063279723433, "flos": 19678219015680.0, "grad_norm": 1.97492933887675, "language_loss": 0.66947746, "learning_rate": 2.9944859839099518e-06, "loss": 0.74694359, "num_input_tokens_seen": 126737565, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.14782715, "step": 5893, "time_per_iteration": 2.5344560146331787 }, { "auxiliary_loss_clip": 0.06478126, "auxiliary_loss_mlp": 0.01274328, "balance_loss_clip": 0.06294551, "balance_loss_mlp": 0.01257806, "epoch": 0.35436645122501126, "flos": 21915841173120.0, "grad_norm": 6.358628414845315, "language_loss": 0.70197213, "learning_rate": 2.9941480636920533e-06, "loss": 0.77949667, "num_input_tokens_seen": 126756095, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.1652832, "step": 5894, "time_per_iteration": 2.565791606903076 }, { "auxiliary_loss_clip": 0.06477386, "auxiliary_loss_mlp": 0.01271555, "balance_loss_clip": 0.06295669, "balance_loss_mlp": 0.01257393, "epoch": 0.3544265744776792, "flos": 21724915645440.0, "grad_norm": 2.0468751139653873, "language_loss": 0.7485677, "learning_rate": 2.9938101057759615e-06, "loss": 0.82605708, "num_input_tokens_seen": 126775455, "router_z_loss_clip": 1.81738281, "router_z_loss_mlp": 0.14147949, "step": 5895, "time_per_iteration": 2.6041934490203857 }, { "auxiliary_loss_clip": 0.06478877, "auxiliary_loss_mlp": 0.01273859, "balance_loss_clip": 0.06293237, "balance_loss_mlp": 0.01259637, "epoch": 0.3544866977303472, "flos": 21219278981760.0, "grad_norm": 2.188536638745796, "language_loss": 0.84006268, "learning_rate": 2.993472110174491e-06, "loss": 0.91759008, "num_input_tokens_seen": 126792320, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.14233398, "step": 5896, "time_per_iteration": 3.9683055877685547 }, { "auxiliary_loss_clip": 0.06472042, "auxiliary_loss_mlp": 0.01273853, "balance_loss_clip": 0.06289461, "balance_loss_mlp": 0.01257056, "epoch": 0.35454682098301515, "flos": 29318534311680.0, "grad_norm": 4.860688131570379, "language_loss": 0.70658088, "learning_rate": 2.9931340769004576e-06, "loss": 0.78403986, "num_input_tokens_seen": 126813680, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.16809082, "step": 5897, "time_per_iteration": 2.6253318786621094 }, { "auxiliary_loss_clip": 0.06475295, "auxiliary_loss_mlp": 0.0127103, "balance_loss_clip": 0.06291033, "balance_loss_mlp": 0.0125694, "epoch": 0.3546069442356832, "flos": 24323509192320.0, "grad_norm": 1.675016649562506, "language_loss": 0.82038534, "learning_rate": 2.9927960059666816e-06, "loss": 0.89784861, "num_input_tokens_seen": 126834395, "router_z_loss_clip": 1.84472656, "router_z_loss_mlp": 0.14099121, "step": 5898, "time_per_iteration": 2.617047071456909 }, { "auxiliary_loss_clip": 0.06469817, "auxiliary_loss_mlp": 0.01276515, "balance_loss_clip": 0.06290394, "balance_loss_mlp": 0.01262234, "epoch": 0.35466706748835114, "flos": 22863984848640.0, "grad_norm": 1.4441939007919347, "language_loss": 0.74735582, "learning_rate": 2.9924578973859804e-06, "loss": 0.82481921, "num_input_tokens_seen": 126855145, "router_z_loss_clip": 1.79199219, "router_z_loss_mlp": 0.14306641, "step": 5899, "time_per_iteration": 2.603911876678467 }, { "auxiliary_loss_clip": 0.06475753, "auxiliary_loss_mlp": 0.01274112, "balance_loss_clip": 0.06290428, "balance_loss_mlp": 0.01258698, "epoch": 0.3547271907410191, "flos": 28337714743680.0, "grad_norm": 1.5591616169110658, "language_loss": 0.79717803, "learning_rate": 2.9921197511711763e-06, "loss": 0.8746767, "num_input_tokens_seen": 126873790, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.1541748, "step": 5900, "time_per_iteration": 2.6568493843078613 }, { "auxiliary_loss_clip": 0.06478321, "auxiliary_loss_mlp": 0.01270309, "balance_loss_clip": 0.06295076, "balance_loss_mlp": 0.012548, "epoch": 0.35478731399368707, "flos": 23520911258880.0, "grad_norm": 4.212779926180838, "language_loss": 0.82250631, "learning_rate": 2.991781567335093e-06, "loss": 0.89999264, "num_input_tokens_seen": 126892865, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.1550293, "step": 5901, "time_per_iteration": 4.020668029785156 }, { "auxiliary_loss_clip": 0.06483643, "auxiliary_loss_mlp": 0.0127525, "balance_loss_clip": 0.06296212, "balance_loss_mlp": 0.01259967, "epoch": 0.35484743724635504, "flos": 18630202872960.0, "grad_norm": 1.8331967914581142, "language_loss": 0.7636134, "learning_rate": 2.9914433458905525e-06, "loss": 0.84120238, "num_input_tokens_seen": 126911935, "router_z_loss_clip": 1.87304688, "router_z_loss_mlp": 0.1529541, "step": 5902, "time_per_iteration": 2.5796549320220947 }, { "auxiliary_loss_clip": 0.06480484, "auxiliary_loss_mlp": 0.01274468, "balance_loss_clip": 0.06296328, "balance_loss_mlp": 0.01258911, "epoch": 0.354907560499023, "flos": 17390296880640.0, "grad_norm": 2.0910049122952157, "language_loss": 0.71453881, "learning_rate": 2.991105086850381e-06, "loss": 0.79208827, "num_input_tokens_seen": 126930040, "router_z_loss_clip": 1.84179688, "router_z_loss_mlp": 0.15551758, "step": 5903, "time_per_iteration": 2.5539026260375977 }, { "auxiliary_loss_clip": 0.06478946, "auxiliary_loss_mlp": 0.01271329, "balance_loss_clip": 0.06290349, "balance_loss_mlp": 0.01254067, "epoch": 0.35496768375169097, "flos": 19214607974400.0, "grad_norm": 2.223961136541281, "language_loss": 0.74432802, "learning_rate": 2.9907667902274053e-06, "loss": 0.82183075, "num_input_tokens_seen": 126948390, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.17260742, "step": 5904, "time_per_iteration": 2.541524648666382 }, { "auxiliary_loss_clip": 0.06480788, "auxiliary_loss_mlp": 0.01270787, "balance_loss_clip": 0.06292647, "balance_loss_mlp": 0.01255892, "epoch": 0.35502780700435893, "flos": 18338692118400.0, "grad_norm": 2.957636759380377, "language_loss": 0.79259241, "learning_rate": 2.9904284560344536e-06, "loss": 0.87010813, "num_input_tokens_seen": 126964905, "router_z_loss_clip": 1.88085938, "router_z_loss_mlp": 0.14880371, "step": 5905, "time_per_iteration": 2.5479001998901367 }, { "auxiliary_loss_clip": 0.06461434, "auxiliary_loss_mlp": 0.01267572, "balance_loss_clip": 0.06288981, "balance_loss_mlp": 0.01253881, "epoch": 0.3550879302570269, "flos": 15453660988800.0, "grad_norm": 2.299209384389433, "language_loss": 0.72705233, "learning_rate": 2.990090084284356e-06, "loss": 0.80434239, "num_input_tokens_seen": 126982000, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.13696289, "step": 5906, "time_per_iteration": 2.5426759719848633 }, { "auxiliary_loss_clip": 0.06491185, "auxiliary_loss_mlp": 0.01272596, "balance_loss_clip": 0.06302293, "balance_loss_mlp": 0.01255871, "epoch": 0.35514805350969486, "flos": 21985343735040.0, "grad_norm": 1.711498765664193, "language_loss": 0.74816799, "learning_rate": 2.9897516749899426e-06, "loss": 0.82580584, "num_input_tokens_seen": 126998390, "router_z_loss_clip": 1.88867188, "router_z_loss_mlp": 0.16723633, "step": 5907, "time_per_iteration": 2.5816986560821533 }, { "auxiliary_loss_clip": 0.06479986, "auxiliary_loss_mlp": 0.0127516, "balance_loss_clip": 0.06297816, "balance_loss_mlp": 0.01259276, "epoch": 0.3552081767623628, "flos": 29869718469120.0, "grad_norm": 1.946160613584192, "language_loss": 0.7625801, "learning_rate": 2.989413228164047e-06, "loss": 0.84013158, "num_input_tokens_seen": 127020220, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.15875244, "step": 5908, "time_per_iteration": 4.04041051864624 }, { "auxiliary_loss_clip": 0.06483023, "auxiliary_loss_mlp": 0.0127357, "balance_loss_clip": 0.06300625, "balance_loss_mlp": 0.0125756, "epoch": 0.3552683000150308, "flos": 26439456821760.0, "grad_norm": 2.3941732083334406, "language_loss": 0.68923068, "learning_rate": 2.989074743819502e-06, "loss": 0.76679659, "num_input_tokens_seen": 127038585, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.16003418, "step": 5909, "time_per_iteration": 2.6201701164245605 }, { "auxiliary_loss_clip": 0.06477384, "auxiliary_loss_mlp": 0.01283764, "balance_loss_clip": 0.06299077, "balance_loss_mlp": 0.01268755, "epoch": 0.35532842326769876, "flos": 19791088865280.0, "grad_norm": 2.1169918192219184, "language_loss": 0.790676, "learning_rate": 2.988736221969144e-06, "loss": 0.86828744, "num_input_tokens_seen": 127056215, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.15026855, "step": 5910, "time_per_iteration": 2.565624713897705 }, { "auxiliary_loss_clip": 0.0648322, "auxiliary_loss_mlp": 0.0127575, "balance_loss_clip": 0.0629479, "balance_loss_mlp": 0.01258739, "epoch": 0.3553885465203668, "flos": 17245170408960.0, "grad_norm": 1.5904593227669066, "language_loss": 0.71089417, "learning_rate": 2.98839766262581e-06, "loss": 0.78848386, "num_input_tokens_seen": 127075825, "router_z_loss_clip": 1.88671875, "router_z_loss_mlp": 0.17004395, "step": 5911, "time_per_iteration": 2.7225992679595947 }, { "auxiliary_loss_clip": 0.06482592, "auxiliary_loss_mlp": 0.01274314, "balance_loss_clip": 0.06304114, "balance_loss_mlp": 0.01258852, "epoch": 0.35544866977303474, "flos": 14938800376320.0, "grad_norm": 2.058314808554453, "language_loss": 0.87582958, "learning_rate": 2.9880590658023366e-06, "loss": 0.95339859, "num_input_tokens_seen": 127091205, "router_z_loss_clip": 1.78417969, "router_z_loss_mlp": 0.15466309, "step": 5912, "time_per_iteration": 2.5933773517608643 }, { "auxiliary_loss_clip": 0.06476606, "auxiliary_loss_mlp": 0.01280458, "balance_loss_clip": 0.06296188, "balance_loss_mlp": 0.0126626, "epoch": 0.3555087930257027, "flos": 19762228333440.0, "grad_norm": 2.1816274735606207, "language_loss": 0.77930939, "learning_rate": 2.9877204315115646e-06, "loss": 0.85688007, "num_input_tokens_seen": 127109210, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.14196777, "step": 5913, "time_per_iteration": 4.046305418014526 }, { "auxiliary_loss_clip": 0.0647433, "auxiliary_loss_mlp": 0.01281182, "balance_loss_clip": 0.062952, "balance_loss_mlp": 0.01265327, "epoch": 0.3555689162783707, "flos": 21074445999360.0, "grad_norm": 1.2911820465524622, "language_loss": 0.82868254, "learning_rate": 2.9873817597663353e-06, "loss": 0.9062376, "num_input_tokens_seen": 127128400, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.15856934, "step": 5914, "time_per_iteration": 2.610443592071533 }, { "auxiliary_loss_clip": 0.06481843, "auxiliary_loss_mlp": 0.01271183, "balance_loss_clip": 0.06298296, "balance_loss_mlp": 0.01255721, "epoch": 0.35562903953103864, "flos": 33077426872320.0, "grad_norm": 2.101296289771362, "language_loss": 0.704247, "learning_rate": 2.98704305057949e-06, "loss": 0.78177726, "num_input_tokens_seen": 127149965, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.15454102, "step": 5915, "time_per_iteration": 2.679921865463257 }, { "auxiliary_loss_clip": 0.06475148, "auxiliary_loss_mlp": 0.01271174, "balance_loss_clip": 0.06293159, "balance_loss_mlp": 0.01256463, "epoch": 0.3556891627837066, "flos": 20564029653120.0, "grad_norm": 3.0028345871934827, "language_loss": 0.76523364, "learning_rate": 2.9867043039638737e-06, "loss": 0.84269685, "num_input_tokens_seen": 127169865, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 0.14709473, "step": 5916, "time_per_iteration": 2.584616184234619 }, { "auxiliary_loss_clip": 0.06477781, "auxiliary_loss_mlp": 0.01278129, "balance_loss_clip": 0.0629475, "balance_loss_mlp": 0.01262369, "epoch": 0.35574928603637457, "flos": 20709449614080.0, "grad_norm": 2.2565243161422583, "language_loss": 0.88746595, "learning_rate": 2.986365519932332e-06, "loss": 0.96502507, "num_input_tokens_seen": 127188075, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.15771484, "step": 5917, "time_per_iteration": 2.665036678314209 }, { "auxiliary_loss_clip": 0.06479058, "auxiliary_loss_mlp": 0.0127207, "balance_loss_clip": 0.06298012, "balance_loss_mlp": 0.01255941, "epoch": 0.35580940928904253, "flos": 15199899298560.0, "grad_norm": 2.1308111045379974, "language_loss": 0.75272, "learning_rate": 2.98602669849771e-06, "loss": 0.83023125, "num_input_tokens_seen": 127206065, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.16125488, "step": 5918, "time_per_iteration": 2.539393186569214 }, { "auxiliary_loss_clip": 0.06358784, "auxiliary_loss_mlp": 0.01289746, "balance_loss_clip": 0.06275603, "balance_loss_mlp": 0.01284844, "epoch": 0.3558695325417105, "flos": 58656145426560.0, "grad_norm": 0.9201927746220788, "language_loss": 0.63616371, "learning_rate": 2.985687839672857e-06, "loss": 0.71264899, "num_input_tokens_seen": 127257885, "router_z_loss_clip": 0.82910156, "router_z_loss_mlp": 0.04898071, "step": 5919, "time_per_iteration": 2.971118450164795 }, { "auxiliary_loss_clip": 0.06482154, "auxiliary_loss_mlp": 0.01272928, "balance_loss_clip": 0.06296015, "balance_loss_mlp": 0.01256596, "epoch": 0.35592965579437846, "flos": 22024811808000.0, "grad_norm": 2.495510600961152, "language_loss": 0.74001628, "learning_rate": 2.9853489434706223e-06, "loss": 0.81756711, "num_input_tokens_seen": 127275550, "router_z_loss_clip": 1.86035156, "router_z_loss_mlp": 0.16320801, "step": 5920, "time_per_iteration": 2.5975167751312256 }, { "auxiliary_loss_clip": 0.06473021, "auxiliary_loss_mlp": 0.01277797, "balance_loss_clip": 0.06291358, "balance_loss_mlp": 0.01261906, "epoch": 0.35598977904704643, "flos": 23374401194880.0, "grad_norm": 2.066616038277249, "language_loss": 0.7738508, "learning_rate": 2.985010009903857e-06, "loss": 0.85135901, "num_input_tokens_seen": 127295110, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.15905762, "step": 5921, "time_per_iteration": 2.623802900314331 }, { "auxiliary_loss_clip": 0.06480089, "auxiliary_loss_mlp": 0.01277387, "balance_loss_clip": 0.0629607, "balance_loss_mlp": 0.01261675, "epoch": 0.3560499022997144, "flos": 17791113686400.0, "grad_norm": 3.655823149458822, "language_loss": 0.67885458, "learning_rate": 2.9846710389854133e-06, "loss": 0.75642931, "num_input_tokens_seen": 127312865, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.15710449, "step": 5922, "time_per_iteration": 2.5324511528015137 }, { "auxiliary_loss_clip": 0.06476848, "auxiliary_loss_mlp": 0.01272566, "balance_loss_clip": 0.06296385, "balance_loss_mlp": 0.01257593, "epoch": 0.35611002555238236, "flos": 20746695553920.0, "grad_norm": 1.8951565670405504, "language_loss": 0.79764616, "learning_rate": 2.9843320307281454e-06, "loss": 0.87514031, "num_input_tokens_seen": 127331710, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.14971924, "step": 5923, "time_per_iteration": 2.5804519653320312 }, { "auxiliary_loss_clip": 0.06471782, "auxiliary_loss_mlp": 0.0127178, "balance_loss_clip": 0.06290722, "balance_loss_mlp": 0.01256485, "epoch": 0.3561701488050504, "flos": 19468034248320.0, "grad_norm": 1.983128441101393, "language_loss": 0.85756749, "learning_rate": 2.983992985144908e-06, "loss": 0.93500316, "num_input_tokens_seen": 127350950, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.15283203, "step": 5924, "time_per_iteration": 2.5466837882995605 }, { "auxiliary_loss_clip": 0.06474385, "auxiliary_loss_mlp": 0.0127435, "balance_loss_clip": 0.06293872, "balance_loss_mlp": 0.01258459, "epoch": 0.35623027205771834, "flos": 30783006046080.0, "grad_norm": 2.488283551608766, "language_loss": 0.77362764, "learning_rate": 2.9836539022485578e-06, "loss": 0.85111499, "num_input_tokens_seen": 127369385, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.15881348, "step": 5925, "time_per_iteration": 2.668710231781006 }, { "auxiliary_loss_clip": 0.06474071, "auxiliary_loss_mlp": 0.01276513, "balance_loss_clip": 0.06290503, "balance_loss_mlp": 0.01260963, "epoch": 0.3562903953103863, "flos": 16986461328000.0, "grad_norm": 2.439261376493277, "language_loss": 0.75949275, "learning_rate": 2.9833147820519535e-06, "loss": 0.83699858, "num_input_tokens_seen": 127386965, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.15545654, "step": 5926, "time_per_iteration": 2.5526697635650635 }, { "auxiliary_loss_clip": 0.0647987, "auxiliary_loss_mlp": 0.01276096, "balance_loss_clip": 0.06292987, "balance_loss_mlp": 0.01260134, "epoch": 0.3563505185630543, "flos": 23846271863040.0, "grad_norm": 2.090299663306321, "language_loss": 0.70244849, "learning_rate": 2.9829756245679544e-06, "loss": 0.7800082, "num_input_tokens_seen": 127406075, "router_z_loss_clip": 1.87011719, "router_z_loss_mlp": 0.15948486, "step": 5927, "time_per_iteration": 2.593564748764038 }, { "auxiliary_loss_clip": 0.06471229, "auxiliary_loss_mlp": 0.01275608, "balance_loss_clip": 0.0629087, "balance_loss_mlp": 0.01261171, "epoch": 0.35641064181572224, "flos": 22280040944640.0, "grad_norm": 1.8505943725160947, "language_loss": 0.80047143, "learning_rate": 2.9826364298094212e-06, "loss": 0.87793982, "num_input_tokens_seen": 127425350, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.14453125, "step": 5928, "time_per_iteration": 2.6374309062957764 }, { "auxiliary_loss_clip": 0.06476815, "auxiliary_loss_mlp": 0.01274001, "balance_loss_clip": 0.06295191, "balance_loss_mlp": 0.01259791, "epoch": 0.3564707650683902, "flos": 23007643873920.0, "grad_norm": 1.7682454788832422, "language_loss": 0.82248735, "learning_rate": 2.982297197789215e-06, "loss": 0.89999545, "num_input_tokens_seen": 127446335, "router_z_loss_clip": 1.81738281, "router_z_loss_mlp": 0.14208984, "step": 5929, "time_per_iteration": 2.7598495483398438 }, { "auxiliary_loss_clip": 0.0646432, "auxiliary_loss_mlp": 0.01272921, "balance_loss_clip": 0.06287307, "balance_loss_mlp": 0.01257829, "epoch": 0.35653088832105817, "flos": 14689566806400.0, "grad_norm": 2.1785294996963533, "language_loss": 0.70608997, "learning_rate": 2.981957928520201e-06, "loss": 0.78346241, "num_input_tokens_seen": 127462795, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.15093994, "step": 5930, "time_per_iteration": 2.5657224655151367 }, { "auxiliary_loss_clip": 0.06477267, "auxiliary_loss_mlp": 0.01270958, "balance_loss_clip": 0.06291013, "balance_loss_mlp": 0.01255127, "epoch": 0.35659101157372614, "flos": 23483791100160.0, "grad_norm": 2.1250094795694934, "language_loss": 0.68263489, "learning_rate": 2.981618622015244e-06, "loss": 0.76011717, "num_input_tokens_seen": 127482675, "router_z_loss_clip": 1.86132812, "router_z_loss_mlp": 0.1583252, "step": 5931, "time_per_iteration": 2.5969929695129395 }, { "auxiliary_loss_clip": 0.0646828, "auxiliary_loss_mlp": 0.01269348, "balance_loss_clip": 0.06289978, "balance_loss_mlp": 0.01254941, "epoch": 0.3566511348263941, "flos": 26585966885760.0, "grad_norm": 6.548589380101832, "language_loss": 0.68473673, "learning_rate": 2.981279278287211e-06, "loss": 0.76211298, "num_input_tokens_seen": 127502275, "router_z_loss_clip": 1.78222656, "router_z_loss_mlp": 0.144104, "step": 5932, "time_per_iteration": 2.6282756328582764 }, { "auxiliary_loss_clip": 0.06470652, "auxiliary_loss_mlp": 0.01273385, "balance_loss_clip": 0.0629282, "balance_loss_mlp": 0.01258877, "epoch": 0.35671125807906207, "flos": 13119981724800.0, "grad_norm": 3.0994957638494247, "language_loss": 0.80487251, "learning_rate": 2.980939897348969e-06, "loss": 0.88231289, "num_input_tokens_seen": 127520195, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.1451416, "step": 5933, "time_per_iteration": 2.5553336143493652 }, { "auxiliary_loss_clip": 0.06473281, "auxiliary_loss_mlp": 0.01269668, "balance_loss_clip": 0.0628975, "balance_loss_mlp": 0.01255255, "epoch": 0.35677138133173003, "flos": 33009014413440.0, "grad_norm": 1.5659041403482563, "language_loss": 0.70132828, "learning_rate": 2.980600479213388e-06, "loss": 0.77875781, "num_input_tokens_seen": 127544495, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.14416504, "step": 5934, "time_per_iteration": 2.7654242515563965 }, { "auxiliary_loss_clip": 0.06485704, "auxiliary_loss_mlp": 0.01270654, "balance_loss_clip": 0.06295747, "balance_loss_mlp": 0.01253881, "epoch": 0.356831504584398, "flos": 20784234983040.0, "grad_norm": 6.836235628901861, "language_loss": 0.71382844, "learning_rate": 2.9802610238933384e-06, "loss": 0.79139203, "num_input_tokens_seen": 127563810, "router_z_loss_clip": 1.8984375, "router_z_loss_mlp": 0.16784668, "step": 5935, "time_per_iteration": 4.15117883682251 }, { "auxiliary_loss_clip": 0.06476632, "auxiliary_loss_mlp": 0.01272703, "balance_loss_clip": 0.0629383, "balance_loss_mlp": 0.01256955, "epoch": 0.35689162783706596, "flos": 12170244821760.0, "grad_norm": 1.9658659279743158, "language_loss": 0.78471136, "learning_rate": 2.979921531401692e-06, "loss": 0.86220473, "num_input_tokens_seen": 127579065, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.15771484, "step": 5936, "time_per_iteration": 2.586775064468384 }, { "auxiliary_loss_clip": 0.06477947, "auxiliary_loss_mlp": 0.01275048, "balance_loss_clip": 0.06298229, "balance_loss_mlp": 0.01259134, "epoch": 0.356951751089734, "flos": 23848200506880.0, "grad_norm": 1.6092782466159063, "language_loss": 0.65186471, "learning_rate": 2.9795820017513242e-06, "loss": 0.72939461, "num_input_tokens_seen": 127599105, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.15917969, "step": 5937, "time_per_iteration": 2.6448593139648438 }, { "auxiliary_loss_clip": 0.06478778, "auxiliary_loss_mlp": 0.01269443, "balance_loss_clip": 0.06295015, "balance_loss_mlp": 0.01254184, "epoch": 0.35701187434240195, "flos": 11725851093120.0, "grad_norm": 2.64339152619034, "language_loss": 0.78631663, "learning_rate": 2.9792424349551073e-06, "loss": 0.86379886, "num_input_tokens_seen": 127614940, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.15264893, "step": 5938, "time_per_iteration": 2.5845861434936523 }, { "auxiliary_loss_clip": 0.06475195, "auxiliary_loss_mlp": 0.01271573, "balance_loss_clip": 0.06291981, "balance_loss_mlp": 0.01256302, "epoch": 0.3570719975950699, "flos": 24905650233600.0, "grad_norm": 1.4677058759855461, "language_loss": 0.80851674, "learning_rate": 2.9789028310259202e-06, "loss": 0.88598442, "num_input_tokens_seen": 127634960, "router_z_loss_clip": 1.83300781, "router_z_loss_mlp": 0.15270996, "step": 5939, "time_per_iteration": 2.607052803039551 }, { "auxiliary_loss_clip": 0.06492697, "auxiliary_loss_mlp": 0.01272283, "balance_loss_clip": 0.06302287, "balance_loss_mlp": 0.01256661, "epoch": 0.3571321208477379, "flos": 26002022981760.0, "grad_norm": 1.9758119584262352, "language_loss": 0.79928243, "learning_rate": 2.9785631899766395e-06, "loss": 0.87693226, "num_input_tokens_seen": 127654545, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.15612793, "step": 5940, "time_per_iteration": 2.6683433055877686 }, { "auxiliary_loss_clip": 0.06487901, "auxiliary_loss_mlp": 0.01274883, "balance_loss_clip": 0.06302331, "balance_loss_mlp": 0.01259058, "epoch": 0.35719224410040584, "flos": 14506900905600.0, "grad_norm": 2.3008709225173325, "language_loss": 0.7306639, "learning_rate": 2.9782235118201443e-06, "loss": 0.80829167, "num_input_tokens_seen": 127672320, "router_z_loss_clip": 1.85449219, "router_z_loss_mlp": 0.15820312, "step": 5941, "time_per_iteration": 4.0483574867248535 }, { "auxiliary_loss_clip": 0.06482643, "auxiliary_loss_mlp": 0.01275337, "balance_loss_clip": 0.06300637, "balance_loss_mlp": 0.0125953, "epoch": 0.3572523673530738, "flos": 31183445508480.0, "grad_norm": 1.8766816959537835, "language_loss": 0.65034723, "learning_rate": 2.9778837965693154e-06, "loss": 0.72792709, "num_input_tokens_seen": 127693315, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 0.15795898, "step": 5942, "time_per_iteration": 2.66434645652771 }, { "auxiliary_loss_clip": 0.06481265, "auxiliary_loss_mlp": 0.01272822, "balance_loss_clip": 0.06297669, "balance_loss_mlp": 0.01255948, "epoch": 0.3573124906057418, "flos": 15857496541440.0, "grad_norm": 1.6647421056293363, "language_loss": 0.74336553, "learning_rate": 2.9775440442370354e-06, "loss": 0.8209064, "num_input_tokens_seen": 127711570, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.16876221, "step": 5943, "time_per_iteration": 2.5909204483032227 }, { "auxiliary_loss_clip": 0.06350088, "auxiliary_loss_mlp": 0.0125291, "balance_loss_clip": 0.0626809, "balance_loss_mlp": 0.01249863, "epoch": 0.35737261385840974, "flos": 60839163849600.0, "grad_norm": 0.7702134684734145, "language_loss": 0.60601205, "learning_rate": 2.9772042548361867e-06, "loss": 0.68204206, "num_input_tokens_seen": 127772475, "router_z_loss_clip": 0.8203125, "router_z_loss_mlp": 0.03044128, "step": 5944, "time_per_iteration": 3.2879128456115723 }, { "auxiliary_loss_clip": 0.06482187, "auxiliary_loss_mlp": 0.01273747, "balance_loss_clip": 0.06301846, "balance_loss_mlp": 0.01258298, "epoch": 0.3574327371110777, "flos": 18849779297280.0, "grad_norm": 3.1623050797196615, "language_loss": 0.728688, "learning_rate": 2.976864428379655e-06, "loss": 0.80624729, "num_input_tokens_seen": 127790940, "router_z_loss_clip": 1.80078125, "router_z_loss_mlp": 0.15466309, "step": 5945, "time_per_iteration": 2.5986387729644775 }, { "auxiliary_loss_clip": 0.06479915, "auxiliary_loss_mlp": 0.01268959, "balance_loss_clip": 0.0629891, "balance_loss_mlp": 0.01253962, "epoch": 0.35749286036374567, "flos": 23556354336000.0, "grad_norm": 1.5941559852756058, "language_loss": 0.81672662, "learning_rate": 2.976524564880326e-06, "loss": 0.89421535, "num_input_tokens_seen": 127808275, "router_z_loss_clip": 1.80957031, "router_z_loss_mlp": 0.14990234, "step": 5946, "time_per_iteration": 2.6101438999176025 }, { "auxiliary_loss_clip": 0.06485596, "auxiliary_loss_mlp": 0.0127552, "balance_loss_clip": 0.0630079, "balance_loss_mlp": 0.0126007, "epoch": 0.35755298361641363, "flos": 21111817720320.0, "grad_norm": 1.4597551453588813, "language_loss": 0.69400752, "learning_rate": 2.9761846643510882e-06, "loss": 0.7716186, "num_input_tokens_seen": 127828840, "router_z_loss_clip": 1.84863281, "router_z_loss_mlp": 0.15454102, "step": 5947, "time_per_iteration": 4.034064531326294 }, { "auxiliary_loss_clip": 0.06476995, "auxiliary_loss_mlp": 0.01270631, "balance_loss_clip": 0.06298798, "balance_loss_mlp": 0.01256576, "epoch": 0.3576131068690816, "flos": 19251099227520.0, "grad_norm": 1.8486016252664976, "language_loss": 0.76296312, "learning_rate": 2.9758447268048297e-06, "loss": 0.84043938, "num_input_tokens_seen": 127846240, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.14074707, "step": 5948, "time_per_iteration": 2.5716445446014404 }, { "auxiliary_loss_clip": 0.06483577, "auxiliary_loss_mlp": 0.01273614, "balance_loss_clip": 0.0629973, "balance_loss_mlp": 0.01257485, "epoch": 0.35767323012174956, "flos": 28661733682560.0, "grad_norm": 2.8173147693915634, "language_loss": 0.72097093, "learning_rate": 2.9755047522544415e-06, "loss": 0.79854286, "num_input_tokens_seen": 127866880, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.16101074, "step": 5949, "time_per_iteration": 2.63967227935791 }, { "auxiliary_loss_clip": 0.06479969, "auxiliary_loss_mlp": 0.01275191, "balance_loss_clip": 0.06297688, "balance_loss_mlp": 0.0126032, "epoch": 0.35773335337441753, "flos": 17089897593600.0, "grad_norm": 2.2398181496167484, "language_loss": 0.77597916, "learning_rate": 2.9751647407128154e-06, "loss": 0.85353082, "num_input_tokens_seen": 127883560, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.14862061, "step": 5950, "time_per_iteration": 2.598951816558838 }, { "auxiliary_loss_clip": 0.06476702, "auxiliary_loss_mlp": 0.01272228, "balance_loss_clip": 0.0629154, "balance_loss_mlp": 0.01257386, "epoch": 0.35779347662708555, "flos": 15894155502720.0, "grad_norm": 1.614586452278047, "language_loss": 0.72800982, "learning_rate": 2.9748246921928445e-06, "loss": 0.80549914, "num_input_tokens_seen": 127902330, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.1484375, "step": 5951, "time_per_iteration": 2.596226215362549 }, { "auxiliary_loss_clip": 0.06479587, "auxiliary_loss_mlp": 0.01273461, "balance_loss_clip": 0.06292863, "balance_loss_mlp": 0.0125751, "epoch": 0.3578535998797535, "flos": 28666555292160.0, "grad_norm": 4.72624245070371, "language_loss": 0.70173788, "learning_rate": 2.9744846067074236e-06, "loss": 0.77926838, "num_input_tokens_seen": 127922325, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.15930176, "step": 5952, "time_per_iteration": 4.11937952041626 }, { "auxiliary_loss_clip": 0.06474594, "auxiliary_loss_mlp": 0.01272696, "balance_loss_clip": 0.06294286, "balance_loss_mlp": 0.01257753, "epoch": 0.3579137231324215, "flos": 37861554464640.0, "grad_norm": 2.3592749351727, "language_loss": 0.70170629, "learning_rate": 2.974144484269449e-06, "loss": 0.77917922, "num_input_tokens_seen": 127942635, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.1494751, "step": 5953, "time_per_iteration": 2.727219820022583 }, { "auxiliary_loss_clip": 0.06471233, "auxiliary_loss_mlp": 0.01274258, "balance_loss_clip": 0.06290819, "balance_loss_mlp": 0.01258368, "epoch": 0.35797384638508944, "flos": 22353526575360.0, "grad_norm": 1.9921319084861084, "language_loss": 0.67238051, "learning_rate": 2.9738043248918175e-06, "loss": 0.74983549, "num_input_tokens_seen": 127962520, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.15881348, "step": 5954, "time_per_iteration": 2.596013307571411 }, { "auxiliary_loss_clip": 0.06469396, "auxiliary_loss_mlp": 0.01276546, "balance_loss_clip": 0.06291528, "balance_loss_mlp": 0.01262479, "epoch": 0.3580339696377574, "flos": 13594829212800.0, "grad_norm": 1.7034272727237887, "language_loss": 0.75577831, "learning_rate": 2.9734641285874282e-06, "loss": 0.83323777, "num_input_tokens_seen": 127981180, "router_z_loss_clip": 1.78027344, "router_z_loss_mlp": 0.14074707, "step": 5955, "time_per_iteration": 2.556018590927124 }, { "auxiliary_loss_clip": 0.06467962, "auxiliary_loss_mlp": 0.01271315, "balance_loss_clip": 0.06290309, "balance_loss_mlp": 0.01257558, "epoch": 0.3580940928904254, "flos": 23774882584320.0, "grad_norm": 1.5809545698422502, "language_loss": 0.76075411, "learning_rate": 2.973123895369182e-06, "loss": 0.83814687, "num_input_tokens_seen": 127999725, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.13769531, "step": 5956, "time_per_iteration": 2.6117208003997803 }, { "auxiliary_loss_clip": 0.06463011, "auxiliary_loss_mlp": 0.01275568, "balance_loss_clip": 0.06288257, "balance_loss_mlp": 0.01260971, "epoch": 0.35815421614309334, "flos": 19469962892160.0, "grad_norm": 1.6856059989841863, "language_loss": 0.73918509, "learning_rate": 2.9727836252499805e-06, "loss": 0.81657088, "num_input_tokens_seen": 128018885, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.14593506, "step": 5957, "time_per_iteration": 2.5893638134002686 }, { "auxiliary_loss_clip": 0.06469157, "auxiliary_loss_mlp": 0.01274077, "balance_loss_clip": 0.06288882, "balance_loss_mlp": 0.01258759, "epoch": 0.3582143393957613, "flos": 23374988173440.0, "grad_norm": 1.8498253357145427, "language_loss": 0.71658725, "learning_rate": 2.972443318242726e-06, "loss": 0.79401958, "num_input_tokens_seen": 128037875, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.15319824, "step": 5958, "time_per_iteration": 2.6165623664855957 }, { "auxiliary_loss_clip": 0.06468207, "auxiliary_loss_mlp": 0.01270452, "balance_loss_clip": 0.06293279, "balance_loss_mlp": 0.01256529, "epoch": 0.35827446264842927, "flos": 26330528113920.0, "grad_norm": 1.5928069044725013, "language_loss": 0.88732404, "learning_rate": 2.972102974360324e-06, "loss": 0.96471059, "num_input_tokens_seen": 128056045, "router_z_loss_clip": 1.74902344, "router_z_loss_mlp": 0.1394043, "step": 5959, "time_per_iteration": 2.6506993770599365 }, { "auxiliary_loss_clip": 0.06469631, "auxiliary_loss_mlp": 0.01270381, "balance_loss_clip": 0.06290767, "balance_loss_mlp": 0.01256344, "epoch": 0.35833458590109724, "flos": 30454626695040.0, "grad_norm": 1.6441913266510961, "language_loss": 0.58710814, "learning_rate": 2.971762593615679e-06, "loss": 0.66450828, "num_input_tokens_seen": 128077815, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.14038086, "step": 5960, "time_per_iteration": 2.6918256282806396 }, { "auxiliary_loss_clip": 0.06464539, "auxiliary_loss_mlp": 0.0127284, "balance_loss_clip": 0.062848, "balance_loss_mlp": 0.01257766, "epoch": 0.3583947091537652, "flos": 14835154475520.0, "grad_norm": 2.503116570009794, "language_loss": 0.76937902, "learning_rate": 2.9714221760216993e-06, "loss": 0.84675276, "num_input_tokens_seen": 128095460, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.15081787, "step": 5961, "time_per_iteration": 2.6190829277038574 }, { "auxiliary_loss_clip": 0.06466269, "auxiliary_loss_mlp": 0.01273252, "balance_loss_clip": 0.06285074, "balance_loss_mlp": 0.01258577, "epoch": 0.35845483240643317, "flos": 34249213895040.0, "grad_norm": 2.070775149610978, "language_loss": 0.70120287, "learning_rate": 2.971081721591294e-06, "loss": 0.77859813, "num_input_tokens_seen": 128118605, "router_z_loss_clip": 1.81347656, "router_z_loss_mlp": 0.14672852, "step": 5962, "time_per_iteration": 2.753713846206665 }, { "auxiliary_loss_clip": 0.06466739, "auxiliary_loss_mlp": 0.01271706, "balance_loss_clip": 0.06289028, "balance_loss_mlp": 0.01257621, "epoch": 0.35851495565910113, "flos": 20966481613440.0, "grad_norm": 1.7164031176684582, "language_loss": 0.74846631, "learning_rate": 2.9707412303373716e-06, "loss": 0.82585073, "num_input_tokens_seen": 128139205, "router_z_loss_clip": 1.77636719, "router_z_loss_mlp": 0.140625, "step": 5963, "time_per_iteration": 2.6063127517700195 }, { "auxiliary_loss_clip": 0.06471114, "auxiliary_loss_mlp": 0.01271818, "balance_loss_clip": 0.06292853, "balance_loss_mlp": 0.01257632, "epoch": 0.35857507891176915, "flos": 22316448343680.0, "grad_norm": 1.6133643363881356, "language_loss": 0.79164433, "learning_rate": 2.9704007022728447e-06, "loss": 0.86907363, "num_input_tokens_seen": 128158765, "router_z_loss_clip": 1.78222656, "router_z_loss_mlp": 0.14196777, "step": 5964, "time_per_iteration": 2.618683338165283 }, { "auxiliary_loss_clip": 0.06474236, "auxiliary_loss_mlp": 0.01273669, "balance_loss_clip": 0.06291994, "balance_loss_mlp": 0.01258744, "epoch": 0.3586352021644371, "flos": 23374610830080.0, "grad_norm": 4.669218172614954, "language_loss": 0.67062688, "learning_rate": 2.970060137410626e-06, "loss": 0.74810588, "num_input_tokens_seen": 128177850, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.14916992, "step": 5965, "time_per_iteration": 2.6027071475982666 }, { "auxiliary_loss_clip": 0.06470157, "auxiliary_loss_mlp": 0.01274782, "balance_loss_clip": 0.06292415, "balance_loss_mlp": 0.01259953, "epoch": 0.3586953254171051, "flos": 27855655804800.0, "grad_norm": 1.583913252890455, "language_loss": 0.79577911, "learning_rate": 2.9697195357636294e-06, "loss": 0.87322855, "num_input_tokens_seen": 128196925, "router_z_loss_clip": 1.77441406, "router_z_loss_mlp": 0.1484375, "step": 5966, "time_per_iteration": 2.6419548988342285 }, { "auxiliary_loss_clip": 0.06468451, "auxiliary_loss_mlp": 0.01268415, "balance_loss_clip": 0.06289774, "balance_loss_mlp": 0.01253424, "epoch": 0.35875544866977305, "flos": 19506621853440.0, "grad_norm": 2.0616449931529868, "language_loss": 0.91899788, "learning_rate": 2.9693788973447715e-06, "loss": 0.99636656, "num_input_tokens_seen": 128213955, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.14984131, "step": 5967, "time_per_iteration": 2.5498008728027344 }, { "auxiliary_loss_clip": 0.06465171, "auxiliary_loss_mlp": 0.01273562, "balance_loss_clip": 0.06285693, "balance_loss_mlp": 0.01259287, "epoch": 0.358815571922441, "flos": 21477652646400.0, "grad_norm": 1.7518297359182537, "language_loss": 0.80308843, "learning_rate": 2.9690382221669682e-06, "loss": 0.88047576, "num_input_tokens_seen": 128232980, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.1427002, "step": 5968, "time_per_iteration": 2.5832433700561523 }, { "auxiliary_loss_clip": 0.06466981, "auxiliary_loss_mlp": 0.01274827, "balance_loss_clip": 0.06285062, "balance_loss_mlp": 0.01259115, "epoch": 0.358875695175109, "flos": 21841894344960.0, "grad_norm": 1.996413992405991, "language_loss": 0.84593523, "learning_rate": 2.9686975102431384e-06, "loss": 0.92335331, "num_input_tokens_seen": 128252795, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.15710449, "step": 5969, "time_per_iteration": 2.5671496391296387 }, { "auxiliary_loss_clip": 0.06468033, "auxiliary_loss_mlp": 0.01272139, "balance_loss_clip": 0.06291296, "balance_loss_mlp": 0.01258883, "epoch": 0.35893581842777694, "flos": 32019264385920.0, "grad_norm": 2.09236454560565, "language_loss": 0.73227108, "learning_rate": 2.968356761586202e-06, "loss": 0.80967277, "num_input_tokens_seen": 128273115, "router_z_loss_clip": 1.76757812, "router_z_loss_mlp": 0.13262939, "step": 5970, "time_per_iteration": 2.6659417152404785 }, { "auxiliary_loss_clip": 0.0646753, "auxiliary_loss_mlp": 0.01272963, "balance_loss_clip": 0.06289524, "balance_loss_mlp": 0.01258455, "epoch": 0.3589959416804449, "flos": 20492137249920.0, "grad_norm": 2.183043349967941, "language_loss": 0.79905772, "learning_rate": 2.9680159762090805e-06, "loss": 0.8764627, "num_input_tokens_seen": 128292220, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.14501953, "step": 5971, "time_per_iteration": 2.568881034851074 }, { "auxiliary_loss_clip": 0.06468377, "auxiliary_loss_mlp": 0.01271238, "balance_loss_clip": 0.06285128, "balance_loss_mlp": 0.01256814, "epoch": 0.3590560649331129, "flos": 16186295162880.0, "grad_norm": 1.6752873028140312, "language_loss": 0.79269445, "learning_rate": 2.967675154124696e-06, "loss": 0.8700906, "num_input_tokens_seen": 128310305, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.14428711, "step": 5972, "time_per_iteration": 2.57515287399292 }, { "auxiliary_loss_clip": 0.06465039, "auxiliary_loss_mlp": 0.01275443, "balance_loss_clip": 0.06285611, "balance_loss_mlp": 0.01261454, "epoch": 0.35911618818578084, "flos": 20381531460480.0, "grad_norm": 1.7651520763464539, "language_loss": 0.81709838, "learning_rate": 2.9673342953459722e-06, "loss": 0.89450324, "num_input_tokens_seen": 128328305, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.13977051, "step": 5973, "time_per_iteration": 2.5777504444122314 }, { "auxiliary_loss_clip": 0.06353912, "auxiliary_loss_mlp": 0.01259104, "balance_loss_clip": 0.06271993, "balance_loss_mlp": 0.01256199, "epoch": 0.3591763114384488, "flos": 41250991645440.0, "grad_norm": 0.8899452980390081, "language_loss": 0.56422704, "learning_rate": 2.9669933998858355e-06, "loss": 0.6403572, "num_input_tokens_seen": 128378380, "router_z_loss_clip": 0.8203125, "router_z_loss_mlp": 0.02902222, "step": 5974, "time_per_iteration": 3.0661330223083496 }, { "auxiliary_loss_clip": 0.06463727, "auxiliary_loss_mlp": 0.01276366, "balance_loss_clip": 0.0628504, "balance_loss_mlp": 0.01262466, "epoch": 0.35923643469111677, "flos": 18701047100160.0, "grad_norm": 1.9818945180981973, "language_loss": 0.69379038, "learning_rate": 2.9666524677572114e-06, "loss": 0.77119124, "num_input_tokens_seen": 128394315, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.13916016, "step": 5975, "time_per_iteration": 4.0314781665802 }, { "auxiliary_loss_clip": 0.06460682, "auxiliary_loss_mlp": 0.01269163, "balance_loss_clip": 0.06283617, "balance_loss_mlp": 0.01254953, "epoch": 0.35929655794378473, "flos": 25017010709760.0, "grad_norm": 2.2460509467389986, "language_loss": 0.80629265, "learning_rate": 2.96631149897303e-06, "loss": 0.88359112, "num_input_tokens_seen": 128414515, "router_z_loss_clip": 1.76953125, "router_z_loss_mlp": 0.14221191, "step": 5976, "time_per_iteration": 2.751023054122925 }, { "auxiliary_loss_clip": 0.06461912, "auxiliary_loss_mlp": 0.01271577, "balance_loss_clip": 0.06283581, "balance_loss_mlp": 0.01256664, "epoch": 0.35935668119645275, "flos": 14980825998720.0, "grad_norm": 1.6626177461573741, "language_loss": 0.79311359, "learning_rate": 2.9659704935462194e-06, "loss": 0.87044847, "num_input_tokens_seen": 128430615, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.14910889, "step": 5977, "time_per_iteration": 2.5693295001983643 }, { "auxiliary_loss_clip": 0.06463601, "auxiliary_loss_mlp": 0.01270977, "balance_loss_clip": 0.0628456, "balance_loss_mlp": 0.01256887, "epoch": 0.3594168044491207, "flos": 21184422883200.0, "grad_norm": 1.7748085393942206, "language_loss": 0.80917412, "learning_rate": 2.9656294514897102e-06, "loss": 0.88651991, "num_input_tokens_seen": 128449480, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.14093018, "step": 5978, "time_per_iteration": 2.589738130569458 }, { "auxiliary_loss_clip": 0.06461865, "auxiliary_loss_mlp": 0.01270809, "balance_loss_clip": 0.06282839, "balance_loss_mlp": 0.01256462, "epoch": 0.3594769277017887, "flos": 27679446668160.0, "grad_norm": 1.4722461634849768, "language_loss": 0.67798638, "learning_rate": 2.965288372816436e-06, "loss": 0.7553131, "num_input_tokens_seen": 128471465, "router_z_loss_clip": 1.79101562, "router_z_loss_mlp": 0.14349365, "step": 5979, "time_per_iteration": 2.642568588256836 }, { "auxiliary_loss_clip": 0.0646163, "auxiliary_loss_mlp": 0.01271022, "balance_loss_clip": 0.06283174, "balance_loss_mlp": 0.01256538, "epoch": 0.35953705095445665, "flos": 23008901685120.0, "grad_norm": 2.1111811446689157, "language_loss": 0.67951477, "learning_rate": 2.9649472575393296e-06, "loss": 0.7568413, "num_input_tokens_seen": 128490645, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.14489746, "step": 5980, "time_per_iteration": 4.048844814300537 }, { "auxiliary_loss_clip": 0.06475063, "auxiliary_loss_mlp": 0.0127498, "balance_loss_clip": 0.06286698, "balance_loss_mlp": 0.01258994, "epoch": 0.3595971742071246, "flos": 25520005969920.0, "grad_norm": 1.8489585285085188, "language_loss": 0.71259022, "learning_rate": 2.964606105671327e-06, "loss": 0.79009068, "num_input_tokens_seen": 128510225, "router_z_loss_clip": 1.88476562, "router_z_loss_mlp": 0.15979004, "step": 5981, "time_per_iteration": 2.715482234954834 }, { "auxiliary_loss_clip": 0.06469394, "auxiliary_loss_mlp": 0.01272919, "balance_loss_clip": 0.06287488, "balance_loss_mlp": 0.01258018, "epoch": 0.3596572974597926, "flos": 29870431228800.0, "grad_norm": 2.6997315324083915, "language_loss": 0.71723229, "learning_rate": 2.9642649172253635e-06, "loss": 0.79465544, "num_input_tokens_seen": 128530195, "router_z_loss_clip": 1.81933594, "router_z_loss_mlp": 0.14898682, "step": 5982, "time_per_iteration": 2.685011625289917 }, { "auxiliary_loss_clip": 0.06456874, "auxiliary_loss_mlp": 0.01268675, "balance_loss_clip": 0.0628271, "balance_loss_mlp": 0.0125502, "epoch": 0.35971742071246054, "flos": 23119255912320.0, "grad_norm": 1.7756732254636967, "language_loss": 0.76935136, "learning_rate": 2.9639236922143786e-06, "loss": 0.84660685, "num_input_tokens_seen": 128549990, "router_z_loss_clip": 1.74023438, "router_z_loss_mlp": 0.13647461, "step": 5983, "time_per_iteration": 2.600292682647705 }, { "auxiliary_loss_clip": 0.06470622, "auxiliary_loss_mlp": 0.01271716, "balance_loss_clip": 0.06285545, "balance_loss_mlp": 0.01255598, "epoch": 0.3597775439651285, "flos": 16730645212800.0, "grad_norm": 2.4040058651242933, "language_loss": 0.76832128, "learning_rate": 2.96358243065131e-06, "loss": 0.84574461, "num_input_tokens_seen": 128567925, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.16125488, "step": 5984, "time_per_iteration": 2.604884386062622 }, { "auxiliary_loss_clip": 0.06455241, "auxiliary_loss_mlp": 0.01273106, "balance_loss_clip": 0.06280239, "balance_loss_mlp": 0.01259236, "epoch": 0.3598376672177965, "flos": 19725653226240.0, "grad_norm": 1.8882438283983525, "language_loss": 0.86571991, "learning_rate": 2.9632411325490993e-06, "loss": 0.94300342, "num_input_tokens_seen": 128585655, "router_z_loss_clip": 1.74902344, "router_z_loss_mlp": 0.13867188, "step": 5985, "time_per_iteration": 2.543628454208374 }, { "auxiliary_loss_clip": 0.06460491, "auxiliary_loss_mlp": 0.01271082, "balance_loss_clip": 0.0628437, "balance_loss_mlp": 0.01256336, "epoch": 0.35989779047046444, "flos": 17317314374400.0, "grad_norm": 2.9441785429740035, "language_loss": 0.73111093, "learning_rate": 2.9628997979206884e-06, "loss": 0.80842668, "num_input_tokens_seen": 128604820, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.14746094, "step": 5986, "time_per_iteration": 2.583130121231079 }, { "auxiliary_loss_clip": 0.06466866, "auxiliary_loss_mlp": 0.01269845, "balance_loss_clip": 0.06282344, "balance_loss_mlp": 0.01255362, "epoch": 0.3599579137231324, "flos": 22717894055040.0, "grad_norm": 1.6122129150779816, "language_loss": 0.73953235, "learning_rate": 2.9625584267790204e-06, "loss": 0.81689942, "num_input_tokens_seen": 128623070, "router_z_loss_clip": 1.84472656, "router_z_loss_mlp": 0.14489746, "step": 5987, "time_per_iteration": 4.111433982849121 }, { "auxiliary_loss_clip": 0.06464332, "auxiliary_loss_mlp": 0.01268773, "balance_loss_clip": 0.06282879, "balance_loss_mlp": 0.0125442, "epoch": 0.36001803697580037, "flos": 20966230051200.0, "grad_norm": 1.9965274867710106, "language_loss": 0.70231611, "learning_rate": 2.9622170191370404e-06, "loss": 0.77964711, "num_input_tokens_seen": 128642430, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.14331055, "step": 5988, "time_per_iteration": 2.5715367794036865 }, { "auxiliary_loss_clip": 0.06470115, "auxiliary_loss_mlp": 0.01273108, "balance_loss_clip": 0.06286967, "balance_loss_mlp": 0.01257831, "epoch": 0.36007816022846834, "flos": 20491843760640.0, "grad_norm": 1.8627204683700864, "language_loss": 0.73498386, "learning_rate": 2.9618755750076953e-06, "loss": 0.81241614, "num_input_tokens_seen": 128661285, "router_z_loss_clip": 1.83203125, "router_z_loss_mlp": 0.15270996, "step": 5989, "time_per_iteration": 2.624394416809082 }, { "auxiliary_loss_clip": 0.06465492, "auxiliary_loss_mlp": 0.01267105, "balance_loss_clip": 0.06286976, "balance_loss_mlp": 0.01253021, "epoch": 0.36013828348113636, "flos": 28008706487040.0, "grad_norm": 1.5907869690047967, "language_loss": 0.80190802, "learning_rate": 2.961534094403931e-06, "loss": 0.87923396, "num_input_tokens_seen": 128682210, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.14080811, "step": 5990, "time_per_iteration": 2.644334316253662 }, { "auxiliary_loss_clip": 0.06460969, "auxiliary_loss_mlp": 0.01270485, "balance_loss_clip": 0.06283154, "balance_loss_mlp": 0.01255626, "epoch": 0.3601984067338043, "flos": 20088050135040.0, "grad_norm": 1.7391438392378806, "language_loss": 0.84395874, "learning_rate": 2.961192577338698e-06, "loss": 0.92127329, "num_input_tokens_seen": 128700445, "router_z_loss_clip": 1.77539062, "router_z_loss_mlp": 0.14849854, "step": 5991, "time_per_iteration": 2.6125288009643555 }, { "auxiliary_loss_clip": 0.06474456, "auxiliary_loss_mlp": 0.01276281, "balance_loss_clip": 0.06289719, "balance_loss_mlp": 0.01261332, "epoch": 0.3602585299864723, "flos": 18622362516480.0, "grad_norm": 2.2511030227831035, "language_loss": 0.75383878, "learning_rate": 2.9608510238249463e-06, "loss": 0.83134609, "num_input_tokens_seen": 128716855, "router_z_loss_clip": 1.84765625, "router_z_loss_mlp": 0.14953613, "step": 5992, "time_per_iteration": 4.019655466079712 }, { "auxiliary_loss_clip": 0.06463696, "auxiliary_loss_mlp": 0.01273722, "balance_loss_clip": 0.06285067, "balance_loss_mlp": 0.01259143, "epoch": 0.36031865323914025, "flos": 19579059308160.0, "grad_norm": 1.8288984273931883, "language_loss": 0.78068715, "learning_rate": 2.960509433875627e-06, "loss": 0.85806131, "num_input_tokens_seen": 128735835, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.14575195, "step": 5993, "time_per_iteration": 2.5545992851257324 }, { "auxiliary_loss_clip": 0.0646604, "auxiliary_loss_mlp": 0.01270653, "balance_loss_clip": 0.0628396, "balance_loss_mlp": 0.01256169, "epoch": 0.3603787764918082, "flos": 17495871425280.0, "grad_norm": 2.806843601521359, "language_loss": 0.7482481, "learning_rate": 2.9601678075036943e-06, "loss": 0.82561505, "num_input_tokens_seen": 128752465, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.14483643, "step": 5994, "time_per_iteration": 2.5311195850372314 }, { "auxiliary_loss_clip": 0.06466404, "auxiliary_loss_mlp": 0.01267212, "balance_loss_clip": 0.06282102, "balance_loss_mlp": 0.01253038, "epoch": 0.3604388997444762, "flos": 15528823701120.0, "grad_norm": 1.8612255648416136, "language_loss": 0.69967091, "learning_rate": 2.9598261447221024e-06, "loss": 0.77700704, "num_input_tokens_seen": 128770865, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.14196777, "step": 5995, "time_per_iteration": 2.53989577293396 }, { "auxiliary_loss_clip": 0.06468232, "auxiliary_loss_mlp": 0.01272866, "balance_loss_clip": 0.06283721, "balance_loss_mlp": 0.01257511, "epoch": 0.36049902299714415, "flos": 17316559687680.0, "grad_norm": 1.98605054208373, "language_loss": 0.82746494, "learning_rate": 2.9594844455438057e-06, "loss": 0.90487587, "num_input_tokens_seen": 128789730, "router_z_loss_clip": 1.84472656, "router_z_loss_mlp": 0.15338135, "step": 5996, "time_per_iteration": 2.5620968341827393 }, { "auxiliary_loss_clip": 0.06464434, "auxiliary_loss_mlp": 0.01270584, "balance_loss_clip": 0.06282618, "balance_loss_mlp": 0.01255677, "epoch": 0.3605591462498121, "flos": 17061749821440.0, "grad_norm": 1.8963903529559847, "language_loss": 0.73703367, "learning_rate": 2.959142709981763e-06, "loss": 0.8143838, "num_input_tokens_seen": 128806610, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 0.14910889, "step": 5997, "time_per_iteration": 2.5529775619506836 }, { "auxiliary_loss_clip": 0.06454624, "auxiliary_loss_mlp": 0.01271089, "balance_loss_clip": 0.06278965, "balance_loss_mlp": 0.01256629, "epoch": 0.3606192695024801, "flos": 16842508813440.0, "grad_norm": 8.063035911681352, "language_loss": 0.70047385, "learning_rate": 2.9588009380489337e-06, "loss": 0.777731, "num_input_tokens_seen": 128824830, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.14453125, "step": 5998, "time_per_iteration": 2.5579187870025635 }, { "auxiliary_loss_clip": 0.06459549, "auxiliary_loss_mlp": 0.01269887, "balance_loss_clip": 0.06282002, "balance_loss_mlp": 0.01255093, "epoch": 0.36067939275514804, "flos": 12134424401280.0, "grad_norm": 2.525988966575301, "language_loss": 0.77458864, "learning_rate": 2.9584591297582758e-06, "loss": 0.85188305, "num_input_tokens_seen": 128838170, "router_z_loss_clip": 1.77441406, "router_z_loss_mlp": 0.14770508, "step": 5999, "time_per_iteration": 2.508737087249756 }, { "auxiliary_loss_clip": 0.06464683, "auxiliary_loss_mlp": 0.0127111, "balance_loss_clip": 0.06281643, "balance_loss_mlp": 0.01256918, "epoch": 0.360739516007816, "flos": 18047390999040.0, "grad_norm": 2.0574344978377392, "language_loss": 0.78958988, "learning_rate": 2.9581172851227516e-06, "loss": 0.86694777, "num_input_tokens_seen": 128855625, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.14190674, "step": 6000, "time_per_iteration": 2.5366435050964355 }, { "auxiliary_loss_clip": 0.06463245, "auxiliary_loss_mlp": 0.01270496, "balance_loss_clip": 0.06285706, "balance_loss_mlp": 0.01256859, "epoch": 0.360799639260484, "flos": 18555417504000.0, "grad_norm": 1.664026507192656, "language_loss": 0.7827152, "learning_rate": 2.9577754041553243e-06, "loss": 0.86005259, "num_input_tokens_seen": 128873540, "router_z_loss_clip": 1.77441406, "router_z_loss_mlp": 0.13635254, "step": 6001, "time_per_iteration": 2.558568000793457 }, { "auxiliary_loss_clip": 0.0645529, "auxiliary_loss_mlp": 0.01268964, "balance_loss_clip": 0.06280501, "balance_loss_mlp": 0.01254778, "epoch": 0.36085976251315194, "flos": 19688029943040.0, "grad_norm": 1.8487704179318414, "language_loss": 0.83329546, "learning_rate": 2.9574334868689575e-06, "loss": 0.91053808, "num_input_tokens_seen": 128889925, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.14172363, "step": 6002, "time_per_iteration": 2.555941343307495 }, { "auxiliary_loss_clip": 0.06454697, "auxiliary_loss_mlp": 0.01272611, "balance_loss_clip": 0.06283592, "balance_loss_mlp": 0.01259474, "epoch": 0.3609198857658199, "flos": 24204476067840.0, "grad_norm": 1.8886517433436114, "language_loss": 0.91721594, "learning_rate": 2.9570915332766165e-06, "loss": 0.99448895, "num_input_tokens_seen": 128906890, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.13146973, "step": 6003, "time_per_iteration": 2.5789217948913574 }, { "auxiliary_loss_clip": 0.06348413, "auxiliary_loss_mlp": 0.01255127, "balance_loss_clip": 0.06265658, "balance_loss_mlp": 0.0125164, "epoch": 0.3609800090184879, "flos": 57134288044800.0, "grad_norm": 0.8574456086303067, "language_loss": 0.53494865, "learning_rate": 2.9567495433912693e-06, "loss": 0.61098403, "num_input_tokens_seen": 128965940, "router_z_loss_clip": 0.828125, "router_z_loss_mlp": 0.03494263, "step": 6004, "time_per_iteration": 3.133066415786743 }, { "auxiliary_loss_clip": 0.06470815, "auxiliary_loss_mlp": 0.01270197, "balance_loss_clip": 0.06287653, "balance_loss_mlp": 0.01254879, "epoch": 0.3610401322711559, "flos": 20817120510720.0, "grad_norm": 1.6628383367906912, "language_loss": 0.78081435, "learning_rate": 2.956407517225883e-06, "loss": 0.85822439, "num_input_tokens_seen": 128985835, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.15332031, "step": 6005, "time_per_iteration": 2.5911643505096436 }, { "auxiliary_loss_clip": 0.06459391, "auxiliary_loss_mlp": 0.01271554, "balance_loss_clip": 0.0628108, "balance_loss_mlp": 0.0125707, "epoch": 0.36110025552382385, "flos": 13704302972160.0, "grad_norm": 2.1244442041280664, "language_loss": 0.79099631, "learning_rate": 2.956065454793429e-06, "loss": 0.8683058, "num_input_tokens_seen": 129003120, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.14489746, "step": 6006, "time_per_iteration": 2.6150543689727783 }, { "auxiliary_loss_clip": 0.06464959, "auxiliary_loss_mlp": 0.01271204, "balance_loss_clip": 0.06285422, "balance_loss_mlp": 0.01255266, "epoch": 0.3611603787764918, "flos": 22461490961280.0, "grad_norm": 2.006225548624323, "language_loss": 0.84939706, "learning_rate": 2.955723356106876e-06, "loss": 0.92675877, "num_input_tokens_seen": 129021645, "router_z_loss_clip": 1.79394531, "router_z_loss_mlp": 0.15942383, "step": 6007, "time_per_iteration": 2.596550941467285 }, { "auxiliary_loss_clip": 0.06476133, "auxiliary_loss_mlp": 0.01268968, "balance_loss_clip": 0.06286846, "balance_loss_mlp": 0.01253364, "epoch": 0.3612205020291598, "flos": 20892954055680.0, "grad_norm": 4.432927756988416, "language_loss": 0.73163652, "learning_rate": 2.955381221179198e-06, "loss": 0.80908751, "num_input_tokens_seen": 129038375, "router_z_loss_clip": 1.89257812, "router_z_loss_mlp": 0.15588379, "step": 6008, "time_per_iteration": 2.580578565597534 }, { "auxiliary_loss_clip": 0.06466424, "auxiliary_loss_mlp": 0.01270478, "balance_loss_clip": 0.062859, "balance_loss_mlp": 0.01256131, "epoch": 0.36128062528182775, "flos": 15747393876480.0, "grad_norm": 1.920464368962787, "language_loss": 0.83503062, "learning_rate": 2.955039050023368e-06, "loss": 0.91239965, "num_input_tokens_seen": 129056235, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.14343262, "step": 6009, "time_per_iteration": 2.534513473510742 }, { "auxiliary_loss_clip": 0.06465524, "auxiliary_loss_mlp": 0.01270177, "balance_loss_clip": 0.06285929, "balance_loss_mlp": 0.01254787, "epoch": 0.3613407485344957, "flos": 16770239066880.0, "grad_norm": 1.78935437579177, "language_loss": 0.7635752, "learning_rate": 2.954696842652362e-06, "loss": 0.84093225, "num_input_tokens_seen": 129072405, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.15393066, "step": 6010, "time_per_iteration": 2.5632452964782715 }, { "auxiliary_loss_clip": 0.06474615, "auxiliary_loss_mlp": 0.01274164, "balance_loss_clip": 0.06294048, "balance_loss_mlp": 0.01259274, "epoch": 0.3614008717871637, "flos": 20376625996800.0, "grad_norm": 1.6670633130300292, "language_loss": 0.83468759, "learning_rate": 2.9543545990791554e-06, "loss": 0.91217542, "num_input_tokens_seen": 129090225, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.14904785, "step": 6011, "time_per_iteration": 2.560868501663208 }, { "auxiliary_loss_clip": 0.06475654, "auxiliary_loss_mlp": 0.01272125, "balance_loss_clip": 0.06288911, "balance_loss_mlp": 0.01256628, "epoch": 0.36146099503983165, "flos": 22782071882880.0, "grad_norm": 2.1242876548444456, "language_loss": 0.63073343, "learning_rate": 2.954012319316727e-06, "loss": 0.70821118, "num_input_tokens_seen": 129107685, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.15466309, "step": 6012, "time_per_iteration": 2.5661911964416504 }, { "auxiliary_loss_clip": 0.06470601, "auxiliary_loss_mlp": 0.01272502, "balance_loss_clip": 0.06292736, "balance_loss_mlp": 0.01258423, "epoch": 0.3615211182924996, "flos": 23002277212800.0, "grad_norm": 1.733566540243851, "language_loss": 0.8411274, "learning_rate": 2.9536700033780565e-06, "loss": 0.91855848, "num_input_tokens_seen": 129125315, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.14086914, "step": 6013, "time_per_iteration": 2.5788121223449707 }, { "auxiliary_loss_clip": 0.06470692, "auxiliary_loss_mlp": 0.01273563, "balance_loss_clip": 0.06290372, "balance_loss_mlp": 0.01257851, "epoch": 0.3615812415451676, "flos": 16652631461760.0, "grad_norm": 1.8297565379689162, "language_loss": 0.91999257, "learning_rate": 2.9533276512761228e-06, "loss": 0.99743509, "num_input_tokens_seen": 129141600, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.15710449, "step": 6014, "time_per_iteration": 2.5258264541625977 }, { "auxiliary_loss_clip": 0.06467391, "auxiliary_loss_mlp": 0.01278876, "balance_loss_clip": 0.062887, "balance_loss_mlp": 0.01263737, "epoch": 0.36164136479783554, "flos": 21325733994240.0, "grad_norm": 1.8668194588268419, "language_loss": 0.74425042, "learning_rate": 2.95298526302391e-06, "loss": 0.82171309, "num_input_tokens_seen": 129160665, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.15148926, "step": 6015, "time_per_iteration": 3.960221290588379 }, { "auxiliary_loss_clip": 0.06468968, "auxiliary_loss_mlp": 0.01274185, "balance_loss_clip": 0.06287415, "balance_loss_mlp": 0.01258569, "epoch": 0.3617014880505035, "flos": 24176286368640.0, "grad_norm": 1.5745489320822605, "language_loss": 0.65481067, "learning_rate": 2.9526428386344e-06, "loss": 0.73224223, "num_input_tokens_seen": 129179220, "router_z_loss_clip": 1.81347656, "router_z_loss_mlp": 0.15600586, "step": 6016, "time_per_iteration": 2.612611770629883 }, { "auxiliary_loss_clip": 0.0647283, "auxiliary_loss_mlp": 0.01280231, "balance_loss_clip": 0.06290101, "balance_loss_mlp": 0.01263577, "epoch": 0.3617616113031715, "flos": 39023278997760.0, "grad_norm": 1.6972198029995689, "language_loss": 0.72378588, "learning_rate": 2.9523003781205785e-06, "loss": 0.80131644, "num_input_tokens_seen": 129200385, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.16662598, "step": 6017, "time_per_iteration": 2.740156650543213 }, { "auxiliary_loss_clip": 0.06471056, "auxiliary_loss_mlp": 0.01271565, "balance_loss_clip": 0.06285923, "balance_loss_mlp": 0.01256676, "epoch": 0.3618217345558395, "flos": 12135807993600.0, "grad_norm": 1.8545196407285194, "language_loss": 0.74014473, "learning_rate": 2.9519578814954307e-06, "loss": 0.81757092, "num_input_tokens_seen": 129217395, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.14886475, "step": 6018, "time_per_iteration": 2.553149938583374 }, { "auxiliary_loss_clip": 0.06459861, "auxiliary_loss_mlp": 0.01271178, "balance_loss_clip": 0.062866, "balance_loss_mlp": 0.01256599, "epoch": 0.36188185780850746, "flos": 24941722216320.0, "grad_norm": 1.6112159075505235, "language_loss": 0.69501001, "learning_rate": 2.9516153487719448e-06, "loss": 0.77232039, "num_input_tokens_seen": 129238940, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.14587402, "step": 6019, "time_per_iteration": 2.6236302852630615 }, { "auxiliary_loss_clip": 0.06468616, "auxiliary_loss_mlp": 0.01273437, "balance_loss_clip": 0.06283382, "balance_loss_mlp": 0.01258321, "epoch": 0.3619419810611754, "flos": 20965014167040.0, "grad_norm": 1.7403326675724908, "language_loss": 0.76991433, "learning_rate": 2.95127277996311e-06, "loss": 0.84733486, "num_input_tokens_seen": 129258240, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.15112305, "step": 6020, "time_per_iteration": 4.010326385498047 }, { "auxiliary_loss_clip": 0.0647315, "auxiliary_loss_mlp": 0.01275105, "balance_loss_clip": 0.0628923, "balance_loss_mlp": 0.01258952, "epoch": 0.3620021043138434, "flos": 22535521643520.0, "grad_norm": 1.7123163386387543, "language_loss": 0.74463534, "learning_rate": 2.9509301750819156e-06, "loss": 0.82211792, "num_input_tokens_seen": 129279040, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.16162109, "step": 6021, "time_per_iteration": 2.5921623706817627 }, { "auxiliary_loss_clip": 0.06468906, "auxiliary_loss_mlp": 0.0127091, "balance_loss_clip": 0.06287771, "balance_loss_mlp": 0.01256616, "epoch": 0.36206222756651135, "flos": 15602183550720.0, "grad_norm": 1.9123987226881425, "language_loss": 0.81423187, "learning_rate": 2.9505875341413533e-06, "loss": 0.89162999, "num_input_tokens_seen": 129295415, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.1428833, "step": 6022, "time_per_iteration": 2.5569379329681396 }, { "auxiliary_loss_clip": 0.06461644, "auxiliary_loss_mlp": 0.01273206, "balance_loss_clip": 0.06287917, "balance_loss_mlp": 0.01258948, "epoch": 0.3621223508191793, "flos": 23594019546240.0, "grad_norm": 1.6354610523092883, "language_loss": 0.81503904, "learning_rate": 2.950244857154417e-06, "loss": 0.89238757, "num_input_tokens_seen": 129312620, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.14245605, "step": 6023, "time_per_iteration": 2.6595053672790527 }, { "auxiliary_loss_clip": 0.06474313, "auxiliary_loss_mlp": 0.01278552, "balance_loss_clip": 0.06290516, "balance_loss_mlp": 0.01262304, "epoch": 0.3621824740718473, "flos": 22316490270720.0, "grad_norm": 2.2953683425917757, "language_loss": 0.80377382, "learning_rate": 2.9499021441341e-06, "loss": 0.88130248, "num_input_tokens_seen": 129331825, "router_z_loss_clip": 1.83691406, "router_z_loss_mlp": 0.16247559, "step": 6024, "time_per_iteration": 2.679324150085449 }, { "auxiliary_loss_clip": 0.06465714, "auxiliary_loss_mlp": 0.01278683, "balance_loss_clip": 0.06292135, "balance_loss_mlp": 0.01264068, "epoch": 0.36224259732451525, "flos": 16769232817920.0, "grad_norm": 1.9197097767498337, "language_loss": 0.75634295, "learning_rate": 2.9495593950933997e-06, "loss": 0.83378696, "num_input_tokens_seen": 129350400, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.1461792, "step": 6025, "time_per_iteration": 2.5614311695098877 }, { "auxiliary_loss_clip": 0.06468346, "auxiliary_loss_mlp": 0.01270107, "balance_loss_clip": 0.06290621, "balance_loss_mlp": 0.01255694, "epoch": 0.3623027205771832, "flos": 23156585706240.0, "grad_norm": 1.5328102964584418, "language_loss": 0.73123324, "learning_rate": 2.9492166100453107e-06, "loss": 0.80861777, "num_input_tokens_seen": 129371155, "router_z_loss_clip": 1.77636719, "router_z_loss_mlp": 0.14416504, "step": 6026, "time_per_iteration": 4.037768840789795 }, { "auxiliary_loss_clip": 0.06477685, "auxiliary_loss_mlp": 0.0127948, "balance_loss_clip": 0.06291356, "balance_loss_mlp": 0.01263506, "epoch": 0.3623628438298512, "flos": 28556829970560.0, "grad_norm": 4.207192728299297, "language_loss": 0.7886824, "learning_rate": 2.948873789002833e-06, "loss": 0.86625409, "num_input_tokens_seen": 129391230, "router_z_loss_clip": 1.86328125, "router_z_loss_mlp": 0.15979004, "step": 6027, "time_per_iteration": 2.6186869144439697 }, { "auxiliary_loss_clip": 0.06476676, "auxiliary_loss_mlp": 0.01272919, "balance_loss_clip": 0.06296279, "balance_loss_mlp": 0.01256456, "epoch": 0.36242296708251914, "flos": 25492193614080.0, "grad_norm": 1.7429866813021109, "language_loss": 0.68231022, "learning_rate": 2.9485309319789667e-06, "loss": 0.75980616, "num_input_tokens_seen": 129410065, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.16479492, "step": 6028, "time_per_iteration": 2.7586405277252197 }, { "auxiliary_loss_clip": 0.06465876, "auxiliary_loss_mlp": 0.01273426, "balance_loss_clip": 0.06288733, "balance_loss_mlp": 0.01258823, "epoch": 0.3624830903351871, "flos": 16296062411520.0, "grad_norm": 1.6254485437255295, "language_loss": 0.86081475, "learning_rate": 2.9481880389867117e-06, "loss": 0.93820775, "num_input_tokens_seen": 129428655, "router_z_loss_clip": 1.76953125, "router_z_loss_mlp": 0.14599609, "step": 6029, "time_per_iteration": 2.7147953510284424 }, { "auxiliary_loss_clip": 0.06474387, "auxiliary_loss_mlp": 0.01276308, "balance_loss_clip": 0.06297085, "balance_loss_mlp": 0.01261473, "epoch": 0.36254321358785513, "flos": 18302200865280.0, "grad_norm": 1.5210082715208166, "language_loss": 0.73291427, "learning_rate": 2.9478451100390714e-06, "loss": 0.81042123, "num_input_tokens_seen": 129447845, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.14837646, "step": 6030, "time_per_iteration": 2.5800042152404785 }, { "auxiliary_loss_clip": 0.06476288, "auxiliary_loss_mlp": 0.01276264, "balance_loss_clip": 0.06291147, "balance_loss_mlp": 0.01259491, "epoch": 0.3626033368405231, "flos": 14870387917440.0, "grad_norm": 2.612535771536801, "language_loss": 0.75341177, "learning_rate": 2.94750214514905e-06, "loss": 0.83093727, "num_input_tokens_seen": 129463275, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.16784668, "step": 6031, "time_per_iteration": 4.045806646347046 }, { "auxiliary_loss_clip": 0.06474788, "auxiliary_loss_mlp": 0.0127226, "balance_loss_clip": 0.06296995, "balance_loss_mlp": 0.01258152, "epoch": 0.36266346009319106, "flos": 22312632983040.0, "grad_norm": 1.6702481462963332, "language_loss": 0.7423861, "learning_rate": 2.9471591443296516e-06, "loss": 0.81985658, "num_input_tokens_seen": 129483205, "router_z_loss_clip": 1.78027344, "router_z_loss_mlp": 0.14117432, "step": 6032, "time_per_iteration": 2.5999999046325684 }, { "auxiliary_loss_clip": 0.06474338, "auxiliary_loss_mlp": 0.01270405, "balance_loss_clip": 0.06292453, "balance_loss_mlp": 0.01255915, "epoch": 0.362723583345859, "flos": 18228044401920.0, "grad_norm": 2.12426400804177, "language_loss": 0.78028715, "learning_rate": 2.946816107593884e-06, "loss": 0.85773456, "num_input_tokens_seen": 129499885, "router_z_loss_clip": 1.81933594, "router_z_loss_mlp": 0.14508057, "step": 6033, "time_per_iteration": 2.594688653945923 }, { "auxiliary_loss_clip": 0.06408025, "auxiliary_loss_mlp": 0.01256894, "balance_loss_clip": 0.06326624, "balance_loss_mlp": 0.0125333, "epoch": 0.362783706598527, "flos": 68519307456000.0, "grad_norm": 0.7632100030791835, "language_loss": 0.648224, "learning_rate": 2.9464730349547547e-06, "loss": 0.7248733, "num_input_tokens_seen": 129561885, "router_z_loss_clip": 0.8125, "router_z_loss_mlp": 0.03564453, "step": 6034, "time_per_iteration": 3.290825128555298 }, { "auxiliary_loss_clip": 0.06463614, "auxiliary_loss_mlp": 0.01272523, "balance_loss_clip": 0.06286924, "balance_loss_mlp": 0.01257551, "epoch": 0.36284382985119495, "flos": 26583535117440.0, "grad_norm": 1.4539959757202918, "language_loss": 0.89724904, "learning_rate": 2.946129926425273e-06, "loss": 0.97461039, "num_input_tokens_seen": 129582325, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.14990234, "step": 6035, "time_per_iteration": 2.607100009918213 }, { "auxiliary_loss_clip": 0.06475177, "auxiliary_loss_mlp": 0.01278487, "balance_loss_clip": 0.06290243, "balance_loss_mlp": 0.01262846, "epoch": 0.3629039531038629, "flos": 20162919358080.0, "grad_norm": 2.1812572707454607, "language_loss": 0.74095356, "learning_rate": 2.9457867820184496e-06, "loss": 0.81849021, "num_input_tokens_seen": 129600350, "router_z_loss_clip": 1.8515625, "router_z_loss_mlp": 0.15649414, "step": 6036, "time_per_iteration": 2.6144349575042725 }, { "auxiliary_loss_clip": 0.06471384, "auxiliary_loss_mlp": 0.01273758, "balance_loss_clip": 0.06285967, "balance_loss_mlp": 0.01258999, "epoch": 0.3629640763565309, "flos": 18631838027520.0, "grad_norm": 1.7655775615324645, "language_loss": 0.76138926, "learning_rate": 2.945443601747297e-06, "loss": 0.83884066, "num_input_tokens_seen": 129618425, "router_z_loss_clip": 1.85449219, "router_z_loss_mlp": 0.14758301, "step": 6037, "time_per_iteration": 2.5892834663391113 }, { "auxiliary_loss_clip": 0.06470263, "auxiliary_loss_mlp": 0.01275104, "balance_loss_clip": 0.06295546, "balance_loss_mlp": 0.01259404, "epoch": 0.36302419960919885, "flos": 19577256445440.0, "grad_norm": 1.5408919429209271, "language_loss": 0.78822446, "learning_rate": 2.945100385624828e-06, "loss": 0.86567813, "num_input_tokens_seen": 129636750, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.15698242, "step": 6038, "time_per_iteration": 2.5951287746429443 }, { "auxiliary_loss_clip": 0.06383754, "auxiliary_loss_mlp": 0.01256734, "balance_loss_clip": 0.0630361, "balance_loss_mlp": 0.01254151, "epoch": 0.3630843228618668, "flos": 63817805589120.0, "grad_norm": 0.8327964920578925, "language_loss": 0.63302135, "learning_rate": 2.9447571336640573e-06, "loss": 0.70942628, "num_input_tokens_seen": 129699030, "router_z_loss_clip": 0.79980469, "router_z_loss_mlp": 0.02583313, "step": 6039, "time_per_iteration": 3.312286615371704 }, { "auxiliary_loss_clip": 0.06470992, "auxiliary_loss_mlp": 0.01266187, "balance_loss_clip": 0.06292915, "balance_loss_mlp": 0.01251525, "epoch": 0.3631444461145348, "flos": 21841600855680.0, "grad_norm": 1.8156559049018266, "language_loss": 0.71336067, "learning_rate": 2.944413845878002e-06, "loss": 0.7907325, "num_input_tokens_seen": 129717135, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.14672852, "step": 6040, "time_per_iteration": 2.6062839031219482 }, { "auxiliary_loss_clip": 0.06473793, "auxiliary_loss_mlp": 0.01269964, "balance_loss_clip": 0.06288609, "balance_loss_mlp": 0.0125526, "epoch": 0.36320456936720275, "flos": 21727850538240.0, "grad_norm": 3.0334677334204097, "language_loss": 0.81679696, "learning_rate": 2.9440705222796783e-06, "loss": 0.89423454, "num_input_tokens_seen": 129735940, "router_z_loss_clip": 1.8515625, "router_z_loss_mlp": 0.14703369, "step": 6041, "time_per_iteration": 2.605440616607666 }, { "auxiliary_loss_clip": 0.0647125, "auxiliary_loss_mlp": 0.01273623, "balance_loss_clip": 0.06289649, "balance_loss_mlp": 0.01257828, "epoch": 0.3632646926198707, "flos": 17024713516800.0, "grad_norm": 2.0243891210572147, "language_loss": 0.84361196, "learning_rate": 2.943727162882107e-06, "loss": 0.92106068, "num_input_tokens_seen": 129752790, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.15783691, "step": 6042, "time_per_iteration": 2.588399648666382 }, { "auxiliary_loss_clip": 0.06465994, "auxiliary_loss_mlp": 0.01271891, "balance_loss_clip": 0.06288654, "balance_loss_mlp": 0.01257061, "epoch": 0.36332481587253873, "flos": 23337868014720.0, "grad_norm": 3.7018226275465977, "language_loss": 0.78656769, "learning_rate": 2.9433837676983064e-06, "loss": 0.86394656, "num_input_tokens_seen": 129773655, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.14837646, "step": 6043, "time_per_iteration": 2.5939431190490723 }, { "auxiliary_loss_clip": 0.06464498, "auxiliary_loss_mlp": 0.01273616, "balance_loss_clip": 0.06288145, "balance_loss_mlp": 0.0125769, "epoch": 0.3633849391252067, "flos": 10748134126080.0, "grad_norm": 1.9002739610334285, "language_loss": 0.66356087, "learning_rate": 2.943040336741298e-06, "loss": 0.740942, "num_input_tokens_seen": 129791605, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.15930176, "step": 6044, "time_per_iteration": 2.5681095123291016 }, { "auxiliary_loss_clip": 0.06469037, "auxiliary_loss_mlp": 0.01268443, "balance_loss_clip": 0.06291533, "balance_loss_mlp": 0.0125381, "epoch": 0.36344506237787466, "flos": 25856351458560.0, "grad_norm": 1.654436366473847, "language_loss": 0.81212628, "learning_rate": 2.9426968700241066e-06, "loss": 0.88950109, "num_input_tokens_seen": 129811075, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.1463623, "step": 6045, "time_per_iteration": 2.6376988887786865 }, { "auxiliary_loss_clip": 0.06474218, "auxiliary_loss_mlp": 0.01271253, "balance_loss_clip": 0.06292916, "balance_loss_mlp": 0.01256006, "epoch": 0.3635051856305426, "flos": 30161900056320.0, "grad_norm": 2.1276411561566997, "language_loss": 0.64802301, "learning_rate": 2.942353367559755e-06, "loss": 0.7254777, "num_input_tokens_seen": 129833755, "router_z_loss_clip": 1.81445312, "router_z_loss_mlp": 0.15246582, "step": 6046, "time_per_iteration": 2.67047119140625 }, { "auxiliary_loss_clip": 0.06473222, "auxiliary_loss_mlp": 0.01270313, "balance_loss_clip": 0.06295104, "balance_loss_mlp": 0.01255865, "epoch": 0.3635653088832106, "flos": 22204626670080.0, "grad_norm": 1.5098226602771327, "language_loss": 0.78127354, "learning_rate": 2.9420098293612692e-06, "loss": 0.85870886, "num_input_tokens_seen": 129854475, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.14465332, "step": 6047, "time_per_iteration": 2.70876407623291 }, { "auxiliary_loss_clip": 0.06483829, "auxiliary_loss_mlp": 0.01273179, "balance_loss_clip": 0.06296546, "balance_loss_mlp": 0.01256728, "epoch": 0.36362543213587856, "flos": 24793409289600.0, "grad_norm": 2.263659957709091, "language_loss": 0.80257154, "learning_rate": 2.9416662554416767e-06, "loss": 0.88014162, "num_input_tokens_seen": 129873530, "router_z_loss_clip": 1.87109375, "router_z_loss_mlp": 0.16442871, "step": 6048, "time_per_iteration": 2.6423776149749756 }, { "auxiliary_loss_clip": 0.06384438, "auxiliary_loss_mlp": 0.01253642, "balance_loss_clip": 0.06301883, "balance_loss_mlp": 0.01250634, "epoch": 0.3636855553885465, "flos": 62547320056320.0, "grad_norm": 0.7548905399413395, "language_loss": 0.52535272, "learning_rate": 2.9413226458140054e-06, "loss": 0.60173345, "num_input_tokens_seen": 129940400, "router_z_loss_clip": 0.82470703, "router_z_loss_mlp": 0.03005981, "step": 6049, "time_per_iteration": 3.3343610763549805 }, { "auxiliary_loss_clip": 0.06473847, "auxiliary_loss_mlp": 0.0127327, "balance_loss_clip": 0.0629308, "balance_loss_mlp": 0.01258512, "epoch": 0.3637456786412145, "flos": 24067441514880.0, "grad_norm": 1.6978659744754312, "language_loss": 0.87156218, "learning_rate": 2.9409790004912845e-06, "loss": 0.94903332, "num_input_tokens_seen": 129958635, "router_z_loss_clip": 1.80859375, "router_z_loss_mlp": 0.14758301, "step": 6050, "time_per_iteration": 2.634963274002075 }, { "auxiliary_loss_clip": 0.06475271, "auxiliary_loss_mlp": 0.01269684, "balance_loss_clip": 0.06297976, "balance_loss_mlp": 0.01254652, "epoch": 0.36380580189388245, "flos": 16697214633600.0, "grad_norm": 2.6825583472857213, "language_loss": 0.78330815, "learning_rate": 2.940635319486546e-06, "loss": 0.86075777, "num_input_tokens_seen": 129977685, "router_z_loss_clip": 1.77441406, "router_z_loss_mlp": 0.15026855, "step": 6051, "time_per_iteration": 2.5985095500946045 }, { "auxiliary_loss_clip": 0.06476051, "auxiliary_loss_mlp": 0.01269374, "balance_loss_clip": 0.0629679, "balance_loss_mlp": 0.0125532, "epoch": 0.3638659251465504, "flos": 25120279267200.0, "grad_norm": 2.046939240422785, "language_loss": 0.83221674, "learning_rate": 2.940291602812822e-06, "loss": 0.90967095, "num_input_tokens_seen": 129997530, "router_z_loss_clip": 1.79394531, "router_z_loss_mlp": 0.14050293, "step": 6052, "time_per_iteration": 2.6138792037963867 }, { "auxiliary_loss_clip": 0.06468403, "auxiliary_loss_mlp": 0.01274516, "balance_loss_clip": 0.06293868, "balance_loss_mlp": 0.01260831, "epoch": 0.3639260483992184, "flos": 23009698298880.0, "grad_norm": 1.8794899279518833, "language_loss": 0.72627264, "learning_rate": 2.939947850483145e-06, "loss": 0.80370176, "num_input_tokens_seen": 130017955, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.13677979, "step": 6053, "time_per_iteration": 2.6255226135253906 }, { "auxiliary_loss_clip": 0.06388703, "auxiliary_loss_mlp": 0.01254744, "balance_loss_clip": 0.06306078, "balance_loss_mlp": 0.01251835, "epoch": 0.36398617165188635, "flos": 70735043698560.0, "grad_norm": 0.7410224763535993, "language_loss": 0.61090744, "learning_rate": 2.9396040625105532e-06, "loss": 0.68734193, "num_input_tokens_seen": 130074275, "router_z_loss_clip": 0.828125, "router_z_loss_mlp": 0.02905273, "step": 6054, "time_per_iteration": 4.65570330619812 }, { "auxiliary_loss_clip": 0.06475952, "auxiliary_loss_mlp": 0.01276679, "balance_loss_clip": 0.06296048, "balance_loss_mlp": 0.0126077, "epoch": 0.3640462949045543, "flos": 22241788755840.0, "grad_norm": 2.0071670478329158, "language_loss": 0.76110184, "learning_rate": 2.9392602389080802e-06, "loss": 0.83862817, "num_input_tokens_seen": 130091375, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.15911865, "step": 6055, "time_per_iteration": 2.599235773086548 }, { "auxiliary_loss_clip": 0.0647143, "auxiliary_loss_mlp": 0.01271997, "balance_loss_clip": 0.06293134, "balance_loss_mlp": 0.01256655, "epoch": 0.3641064181572223, "flos": 21549964320000.0, "grad_norm": 1.6544146574508893, "language_loss": 0.7594347, "learning_rate": 2.938916379688765e-06, "loss": 0.83686888, "num_input_tokens_seen": 130111595, "router_z_loss_clip": 1.78222656, "router_z_loss_mlp": 0.15344238, "step": 6056, "time_per_iteration": 2.6203062534332275 }, { "auxiliary_loss_clip": 0.06468157, "auxiliary_loss_mlp": 0.01270897, "balance_loss_clip": 0.06290493, "balance_loss_mlp": 0.01255162, "epoch": 0.3641665414098903, "flos": 22279873236480.0, "grad_norm": 1.752683043952362, "language_loss": 0.80662245, "learning_rate": 2.9385724848656468e-06, "loss": 0.88401294, "num_input_tokens_seen": 130131440, "router_z_loss_clip": 1.77539062, "router_z_loss_mlp": 0.1574707, "step": 6057, "time_per_iteration": 2.6695501804351807 }, { "auxiliary_loss_clip": 0.06475015, "auxiliary_loss_mlp": 0.01270868, "balance_loss_clip": 0.06298719, "balance_loss_mlp": 0.01255001, "epoch": 0.36422666466255826, "flos": 28337211619200.0, "grad_norm": 1.864543241343626, "language_loss": 0.80361819, "learning_rate": 2.9382285544517647e-06, "loss": 0.88107705, "num_input_tokens_seen": 130151375, "router_z_loss_clip": 1.76171875, "router_z_loss_mlp": 0.15856934, "step": 6058, "time_per_iteration": 2.6810507774353027 }, { "auxiliary_loss_clip": 0.06472257, "auxiliary_loss_mlp": 0.01271452, "balance_loss_clip": 0.06291769, "balance_loss_mlp": 0.01256014, "epoch": 0.36428678791522623, "flos": 24177376471680.0, "grad_norm": 1.621471860100361, "language_loss": 0.85155952, "learning_rate": 2.9378845884601636e-06, "loss": 0.92899656, "num_input_tokens_seen": 130169960, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.15441895, "step": 6059, "time_per_iteration": 4.080186128616333 }, { "auxiliary_loss_clip": 0.06475249, "auxiliary_loss_mlp": 0.01275473, "balance_loss_clip": 0.06296353, "balance_loss_mlp": 0.01260691, "epoch": 0.3643469111678942, "flos": 22535018519040.0, "grad_norm": 1.4998248500745546, "language_loss": 0.88062376, "learning_rate": 2.937540586903884e-06, "loss": 0.95813096, "num_input_tokens_seen": 130189800, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.14794922, "step": 6060, "time_per_iteration": 2.585235118865967 }, { "auxiliary_loss_clip": 0.06477336, "auxiliary_loss_mlp": 0.01274719, "balance_loss_clip": 0.06294892, "balance_loss_mlp": 0.01258244, "epoch": 0.36440703442056216, "flos": 19432549244160.0, "grad_norm": 1.9335127500277376, "language_loss": 0.67492932, "learning_rate": 2.937196549795971e-06, "loss": 0.75244987, "num_input_tokens_seen": 130206370, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.16467285, "step": 6061, "time_per_iteration": 2.600123882293701 }, { "auxiliary_loss_clip": 0.06481012, "auxiliary_loss_mlp": 0.01271641, "balance_loss_clip": 0.06298608, "balance_loss_mlp": 0.01256573, "epoch": 0.3644671576732301, "flos": 18046300896000.0, "grad_norm": 2.2013638503607096, "language_loss": 0.75589585, "learning_rate": 2.9368524771494718e-06, "loss": 0.83342236, "num_input_tokens_seen": 130224445, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.15063477, "step": 6062, "time_per_iteration": 2.555363655090332 }, { "auxiliary_loss_clip": 0.06469591, "auxiliary_loss_mlp": 0.0127098, "balance_loss_clip": 0.06290894, "balance_loss_mlp": 0.01255936, "epoch": 0.3645272809258981, "flos": 21549125779200.0, "grad_norm": 4.843632696798331, "language_loss": 0.72548026, "learning_rate": 2.936508368977432e-06, "loss": 0.80288601, "num_input_tokens_seen": 130245380, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.1505127, "step": 6063, "time_per_iteration": 2.6410107612609863 }, { "auxiliary_loss_clip": 0.06471054, "auxiliary_loss_mlp": 0.01271271, "balance_loss_clip": 0.06295522, "balance_loss_mlp": 0.01256227, "epoch": 0.36458740417856605, "flos": 22753379059200.0, "grad_norm": 4.982943080011604, "language_loss": 0.67894334, "learning_rate": 2.936164225292901e-06, "loss": 0.75636661, "num_input_tokens_seen": 130265575, "router_z_loss_clip": 1.75390625, "router_z_loss_mlp": 0.15039062, "step": 6064, "time_per_iteration": 2.6847591400146484 }, { "auxiliary_loss_clip": 0.06479067, "auxiliary_loss_mlp": 0.01273082, "balance_loss_clip": 0.06296655, "balance_loss_mlp": 0.0125656, "epoch": 0.364647527431234, "flos": 26147862213120.0, "grad_norm": 3.7118439163588337, "language_loss": 0.75156897, "learning_rate": 2.9358200461089297e-06, "loss": 0.82909048, "num_input_tokens_seen": 130286195, "router_z_loss_clip": 1.82324219, "router_z_loss_mlp": 0.16503906, "step": 6065, "time_per_iteration": 3.989439010620117 }, { "auxiliary_loss_clip": 0.06487341, "auxiliary_loss_mlp": 0.01270066, "balance_loss_clip": 0.06303343, "balance_loss_mlp": 0.01253794, "epoch": 0.364707650683902, "flos": 31037941693440.0, "grad_norm": 2.020859457671249, "language_loss": 0.75342137, "learning_rate": 2.9354758314385676e-06, "loss": 0.83099544, "num_input_tokens_seen": 130306095, "router_z_loss_clip": 1.84179688, "router_z_loss_mlp": 0.16271973, "step": 6066, "time_per_iteration": 2.714555501937866 }, { "auxiliary_loss_clip": 0.06479332, "auxiliary_loss_mlp": 0.01268472, "balance_loss_clip": 0.06300922, "balance_loss_mlp": 0.01254429, "epoch": 0.36476777393656995, "flos": 19578933527040.0, "grad_norm": 2.0182136760994016, "language_loss": 0.77029616, "learning_rate": 2.9351315812948684e-06, "loss": 0.84777415, "num_input_tokens_seen": 130324685, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.14050293, "step": 6067, "time_per_iteration": 2.5933892726898193 }, { "auxiliary_loss_clip": 0.06479216, "auxiliary_loss_mlp": 0.01271588, "balance_loss_clip": 0.06303649, "balance_loss_mlp": 0.0125736, "epoch": 0.3648278971892379, "flos": 17754622433280.0, "grad_norm": 2.9839620302870817, "language_loss": 0.7110191, "learning_rate": 2.934787295690886e-06, "loss": 0.78852713, "num_input_tokens_seen": 130343855, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.14221191, "step": 6068, "time_per_iteration": 2.5848774909973145 }, { "auxiliary_loss_clip": 0.06487308, "auxiliary_loss_mlp": 0.01272587, "balance_loss_clip": 0.06302746, "balance_loss_mlp": 0.01257137, "epoch": 0.3648880204419059, "flos": 17936952917760.0, "grad_norm": 2.0511689520215435, "language_loss": 0.74466288, "learning_rate": 2.9344429746396755e-06, "loss": 0.82226181, "num_input_tokens_seen": 130362320, "router_z_loss_clip": 1.84765625, "router_z_loss_mlp": 0.15454102, "step": 6069, "time_per_iteration": 2.6347737312316895 }, { "auxiliary_loss_clip": 0.06486614, "auxiliary_loss_mlp": 0.01276085, "balance_loss_clip": 0.06302768, "balance_loss_mlp": 0.01261183, "epoch": 0.3649481436945739, "flos": 22644911548800.0, "grad_norm": 1.9267783927292415, "language_loss": 0.66781837, "learning_rate": 2.9340986181542945e-06, "loss": 0.74544537, "num_input_tokens_seen": 130383165, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.14892578, "step": 6070, "time_per_iteration": 4.122167110443115 }, { "auxiliary_loss_clip": 0.0647705, "auxiliary_loss_mlp": 0.01276587, "balance_loss_clip": 0.06300253, "balance_loss_mlp": 0.01261925, "epoch": 0.36500826694724187, "flos": 21586036302720.0, "grad_norm": 1.5657988176875772, "language_loss": 0.74576026, "learning_rate": 2.9337542262477994e-06, "loss": 0.82329667, "num_input_tokens_seen": 130402425, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.14660645, "step": 6071, "time_per_iteration": 2.5996627807617188 }, { "auxiliary_loss_clip": 0.06478514, "auxiliary_loss_mlp": 0.01276711, "balance_loss_clip": 0.06299225, "balance_loss_mlp": 0.01261256, "epoch": 0.36506839019990983, "flos": 13777746675840.0, "grad_norm": 1.9440669652484739, "language_loss": 0.88421965, "learning_rate": 2.9334097989332506e-06, "loss": 0.96177185, "num_input_tokens_seen": 130419440, "router_z_loss_clip": 1.79101562, "router_z_loss_mlp": 0.15454102, "step": 6072, "time_per_iteration": 2.5847980976104736 }, { "auxiliary_loss_clip": 0.06477911, "auxiliary_loss_mlp": 0.01274968, "balance_loss_clip": 0.06297526, "balance_loss_mlp": 0.01259281, "epoch": 0.3651285134525778, "flos": 17280739267200.0, "grad_norm": 2.662718807653743, "language_loss": 0.7362473, "learning_rate": 2.9330653362237094e-06, "loss": 0.81377602, "num_input_tokens_seen": 130438495, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.15698242, "step": 6073, "time_per_iteration": 2.5897529125213623 }, { "auxiliary_loss_clip": 0.06484193, "auxiliary_loss_mlp": 0.01275134, "balance_loss_clip": 0.06302074, "balance_loss_mlp": 0.01258934, "epoch": 0.36518863670524576, "flos": 21914415653760.0, "grad_norm": 2.0834257420530364, "language_loss": 0.67564362, "learning_rate": 2.932720838132236e-06, "loss": 0.75323689, "num_input_tokens_seen": 130455575, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.1619873, "step": 6074, "time_per_iteration": 2.650014638900757 }, { "auxiliary_loss_clip": 0.06471258, "auxiliary_loss_mlp": 0.0127001, "balance_loss_clip": 0.0629234, "balance_loss_mlp": 0.01254513, "epoch": 0.3652487599579137, "flos": 27128933343360.0, "grad_norm": 1.812418908628288, "language_loss": 0.73172778, "learning_rate": 2.9323763046718954e-06, "loss": 0.80914044, "num_input_tokens_seen": 130476385, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.15490723, "step": 6075, "time_per_iteration": 2.6748149394989014 }, { "auxiliary_loss_clip": 0.06478839, "auxiliary_loss_mlp": 0.01275932, "balance_loss_clip": 0.06292659, "balance_loss_mlp": 0.01259648, "epoch": 0.3653088832105817, "flos": 19761683281920.0, "grad_norm": 4.380642184742716, "language_loss": 0.89938104, "learning_rate": 2.9320317358557524e-06, "loss": 0.97692877, "num_input_tokens_seen": 130493630, "router_z_loss_clip": 1.86132812, "router_z_loss_mlp": 0.1628418, "step": 6076, "time_per_iteration": 2.591416597366333 }, { "auxiliary_loss_clip": 0.06471573, "auxiliary_loss_mlp": 0.01277681, "balance_loss_clip": 0.06292306, "balance_loss_mlp": 0.01262023, "epoch": 0.36536900646324966, "flos": 13119981724800.0, "grad_norm": 1.8938537408786777, "language_loss": 0.69955236, "learning_rate": 2.931687131696872e-06, "loss": 0.77704489, "num_input_tokens_seen": 130510735, "router_z_loss_clip": 1.79199219, "router_z_loss_mlp": 0.15667725, "step": 6077, "time_per_iteration": 2.5266635417938232 }, { "auxiliary_loss_clip": 0.06370698, "auxiliary_loss_mlp": 0.01272437, "balance_loss_clip": 0.06289183, "balance_loss_mlp": 0.01269504, "epoch": 0.3654291297159176, "flos": 71122848393600.0, "grad_norm": 0.7334066509649548, "language_loss": 0.61721641, "learning_rate": 2.9313424922083224e-06, "loss": 0.69364774, "num_input_tokens_seen": 130577050, "router_z_loss_clip": 0.81298828, "router_z_loss_mlp": 0.02929688, "step": 6078, "time_per_iteration": 3.297985553741455 }, { "auxiliary_loss_clip": 0.06473137, "auxiliary_loss_mlp": 0.0128086, "balance_loss_clip": 0.06291853, "balance_loss_mlp": 0.01265005, "epoch": 0.3654892529685856, "flos": 23623299348480.0, "grad_norm": 2.0076088169246646, "language_loss": 0.78629279, "learning_rate": 2.930997817403173e-06, "loss": 0.86383271, "num_input_tokens_seen": 130593780, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.15856934, "step": 6079, "time_per_iteration": 2.587718963623047 }, { "auxiliary_loss_clip": 0.06469205, "auxiliary_loss_mlp": 0.01281431, "balance_loss_clip": 0.06286845, "balance_loss_mlp": 0.01265325, "epoch": 0.36554937622125355, "flos": 43480788174720.0, "grad_norm": 1.881049668975671, "language_loss": 0.63142192, "learning_rate": 2.9306531072944913e-06, "loss": 0.70892823, "num_input_tokens_seen": 130615510, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.16101074, "step": 6080, "time_per_iteration": 2.780439853668213 }, { "auxiliary_loss_clip": 0.0647469, "auxiliary_loss_mlp": 0.01283448, "balance_loss_clip": 0.06290369, "balance_loss_mlp": 0.01267558, "epoch": 0.3656094994739215, "flos": 23301334834560.0, "grad_norm": 3.1651201226447916, "language_loss": 0.68457019, "learning_rate": 2.930308361895352e-06, "loss": 0.7621516, "num_input_tokens_seen": 130635410, "router_z_loss_clip": 1.84179688, "router_z_loss_mlp": 0.15893555, "step": 6081, "time_per_iteration": 2.6095094680786133 }, { "auxiliary_loss_clip": 0.06486415, "auxiliary_loss_mlp": 0.01284385, "balance_loss_clip": 0.06297079, "balance_loss_mlp": 0.01268375, "epoch": 0.3656696227265895, "flos": 24578947964160.0, "grad_norm": 1.9813227176372397, "language_loss": 0.7546978, "learning_rate": 2.9299635812188257e-06, "loss": 0.83240587, "num_input_tokens_seen": 130657725, "router_z_loss_clip": 1.89355469, "router_z_loss_mlp": 0.16003418, "step": 6082, "time_per_iteration": 2.649113416671753 }, { "auxiliary_loss_clip": 0.06472974, "auxiliary_loss_mlp": 0.01278092, "balance_loss_clip": 0.06292845, "balance_loss_mlp": 0.01264079, "epoch": 0.3657297459792575, "flos": 27935849761920.0, "grad_norm": 1.9188239509084974, "language_loss": 0.8324967, "learning_rate": 2.929618765277987e-06, "loss": 0.91000736, "num_input_tokens_seen": 130678360, "router_z_loss_clip": 1.80078125, "router_z_loss_mlp": 0.14019775, "step": 6083, "time_per_iteration": 2.622713088989258 }, { "auxiliary_loss_clip": 0.06362525, "auxiliary_loss_mlp": 0.01270929, "balance_loss_clip": 0.06280789, "balance_loss_mlp": 0.01267668, "epoch": 0.36578986923192547, "flos": 67410566231040.0, "grad_norm": 0.8934842893161195, "language_loss": 0.59325892, "learning_rate": 2.9292739140859125e-06, "loss": 0.66959345, "num_input_tokens_seen": 130742110, "router_z_loss_clip": 0.81640625, "router_z_loss_mlp": 0.03265381, "step": 6084, "time_per_iteration": 3.2944114208221436 }, { "auxiliary_loss_clip": 0.06472233, "auxiliary_loss_mlp": 0.01276904, "balance_loss_clip": 0.06293054, "balance_loss_mlp": 0.01261412, "epoch": 0.36584999248459343, "flos": 20233302387840.0, "grad_norm": 1.6660760774642716, "language_loss": 0.73360997, "learning_rate": 2.9289290276556767e-06, "loss": 0.81110144, "num_input_tokens_seen": 130759870, "router_z_loss_clip": 1.79199219, "router_z_loss_mlp": 0.15466309, "step": 6085, "time_per_iteration": 2.578228712081909 }, { "auxiliary_loss_clip": 0.06479771, "auxiliary_loss_mlp": 0.01280076, "balance_loss_clip": 0.06297404, "balance_loss_mlp": 0.01264424, "epoch": 0.3659101157372614, "flos": 19068475253760.0, "grad_norm": 1.7060131378011563, "language_loss": 0.7857886, "learning_rate": 2.9285841060003604e-06, "loss": 0.86338705, "num_input_tokens_seen": 130778510, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.15643311, "step": 6086, "time_per_iteration": 2.614882230758667 }, { "auxiliary_loss_clip": 0.06469028, "auxiliary_loss_mlp": 0.01288388, "balance_loss_clip": 0.062918, "balance_loss_mlp": 0.01273773, "epoch": 0.36597023898992936, "flos": 30818658758400.0, "grad_norm": 1.9828249293256877, "language_loss": 0.77404994, "learning_rate": 2.9282391491330416e-06, "loss": 0.85162413, "num_input_tokens_seen": 130798535, "router_z_loss_clip": 1.77050781, "router_z_loss_mlp": 0.14624023, "step": 6087, "time_per_iteration": 2.6316001415252686 }, { "auxiliary_loss_clip": 0.06481588, "auxiliary_loss_mlp": 0.01283804, "balance_loss_clip": 0.06296775, "balance_loss_mlp": 0.01268706, "epoch": 0.36603036224259733, "flos": 20528041524480.0, "grad_norm": 1.8461099757769095, "language_loss": 0.71176231, "learning_rate": 2.9278941570668002e-06, "loss": 0.78941619, "num_input_tokens_seen": 130816655, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.15106201, "step": 6088, "time_per_iteration": 2.563488245010376 }, { "auxiliary_loss_clip": 0.06491656, "auxiliary_loss_mlp": 0.01284583, "balance_loss_clip": 0.06298133, "balance_loss_mlp": 0.01266797, "epoch": 0.3660904854952653, "flos": 38339043356160.0, "grad_norm": 3.567543872844227, "language_loss": 0.80048686, "learning_rate": 2.92754912981472e-06, "loss": 0.87824929, "num_input_tokens_seen": 130841225, "router_z_loss_clip": 1.93359375, "router_z_loss_mlp": 0.17773438, "step": 6089, "time_per_iteration": 2.7226099967956543 }, { "auxiliary_loss_clip": 0.0647503, "auxiliary_loss_mlp": 0.01278084, "balance_loss_clip": 0.06294888, "balance_loss_mlp": 0.01263671, "epoch": 0.36615060874793326, "flos": 21842062053120.0, "grad_norm": 2.7534714953290065, "language_loss": 0.72284544, "learning_rate": 2.927204067389884e-06, "loss": 0.80037659, "num_input_tokens_seen": 130861050, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.14416504, "step": 6090, "time_per_iteration": 2.5781331062316895 }, { "auxiliary_loss_clip": 0.06475659, "auxiliary_loss_mlp": 0.01280021, "balance_loss_clip": 0.06297776, "balance_loss_mlp": 0.01264691, "epoch": 0.3662107320006012, "flos": 16587153895680.0, "grad_norm": 1.6148253084281026, "language_loss": 0.74417317, "learning_rate": 2.9268589698053763e-06, "loss": 0.82173002, "num_input_tokens_seen": 130879775, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.15319824, "step": 6091, "time_per_iteration": 2.5713517665863037 }, { "auxiliary_loss_clip": 0.06480989, "auxiliary_loss_mlp": 0.01277435, "balance_loss_clip": 0.06299829, "balance_loss_mlp": 0.01262295, "epoch": 0.3662708552532692, "flos": 20964469115520.0, "grad_norm": 1.793926219092581, "language_loss": 0.72831744, "learning_rate": 2.926513837074284e-06, "loss": 0.80590165, "num_input_tokens_seen": 130898070, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.15136719, "step": 6092, "time_per_iteration": 2.569967746734619 }, { "auxiliary_loss_clip": 0.06486009, "auxiliary_loss_mlp": 0.01283693, "balance_loss_clip": 0.06300961, "balance_loss_mlp": 0.01268011, "epoch": 0.36633097850593715, "flos": 21908252378880.0, "grad_norm": 2.3382067504584234, "language_loss": 0.79155815, "learning_rate": 2.9261686692096942e-06, "loss": 0.86925519, "num_input_tokens_seen": 130915250, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.15692139, "step": 6093, "time_per_iteration": 2.6247975826263428 }, { "auxiliary_loss_clip": 0.06479411, "auxiliary_loss_mlp": 0.01281436, "balance_loss_clip": 0.06295931, "balance_loss_mlp": 0.01266416, "epoch": 0.3663911017586051, "flos": 32862462422400.0, "grad_norm": 1.7713535845736081, "language_loss": 0.7442441, "learning_rate": 2.925823466224696e-06, "loss": 0.82185251, "num_input_tokens_seen": 130936995, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.15039062, "step": 6094, "time_per_iteration": 4.1295225620269775 }, { "auxiliary_loss_clip": 0.06490353, "auxiliary_loss_mlp": 0.01284806, "balance_loss_clip": 0.06307326, "balance_loss_mlp": 0.01269809, "epoch": 0.3664512250112731, "flos": 27279132986880.0, "grad_norm": 17.80339033545407, "language_loss": 0.79826474, "learning_rate": 2.9254782281323785e-06, "loss": 0.87601626, "num_input_tokens_seen": 130957970, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.15002441, "step": 6095, "time_per_iteration": 2.6623475551605225 }, { "auxiliary_loss_clip": 0.06492177, "auxiliary_loss_mlp": 0.01281313, "balance_loss_clip": 0.06305858, "balance_loss_mlp": 0.01265113, "epoch": 0.3665113482639411, "flos": 17790065510400.0, "grad_norm": 2.4574164193038874, "language_loss": 0.74023199, "learning_rate": 2.925132954945834e-06, "loss": 0.81796694, "num_input_tokens_seen": 130974915, "router_z_loss_clip": 1.86425781, "router_z_loss_mlp": 0.1619873, "step": 6096, "time_per_iteration": 2.560029983520508 }, { "auxiliary_loss_clip": 0.06490474, "auxiliary_loss_mlp": 0.01280379, "balance_loss_clip": 0.06303182, "balance_loss_mlp": 0.01265287, "epoch": 0.36657147151660907, "flos": 27861944860800.0, "grad_norm": 5.171732889025455, "language_loss": 0.67614985, "learning_rate": 2.924787646678155e-06, "loss": 0.75385833, "num_input_tokens_seen": 130995745, "router_z_loss_clip": 1.87304688, "router_z_loss_mlp": 0.15112305, "step": 6097, "time_per_iteration": 2.6453728675842285 }, { "auxiliary_loss_clip": 0.06492388, "auxiliary_loss_mlp": 0.01279536, "balance_loss_clip": 0.06308512, "balance_loss_mlp": 0.0126374, "epoch": 0.36663159476927704, "flos": 25381000846080.0, "grad_norm": 1.4215984779634476, "language_loss": 0.78032482, "learning_rate": 2.9244423033424365e-06, "loss": 0.85804403, "num_input_tokens_seen": 131015545, "router_z_loss_clip": 1.83789062, "router_z_loss_mlp": 0.15795898, "step": 6098, "time_per_iteration": 2.594106912612915 }, { "auxiliary_loss_clip": 0.06486318, "auxiliary_loss_mlp": 0.01272087, "balance_loss_clip": 0.06304335, "balance_loss_mlp": 0.01257239, "epoch": 0.366691718021945, "flos": 21362979934080.0, "grad_norm": 2.6861307793871503, "language_loss": 0.73804426, "learning_rate": 2.9240969249517723e-06, "loss": 0.81562829, "num_input_tokens_seen": 131033990, "router_z_loss_clip": 1.81933594, "router_z_loss_mlp": 0.14831543, "step": 6099, "time_per_iteration": 3.991466522216797 }, { "auxiliary_loss_clip": 0.06474146, "auxiliary_loss_mlp": 0.01275046, "balance_loss_clip": 0.06296401, "balance_loss_mlp": 0.01260794, "epoch": 0.36675184127461297, "flos": 16806017560320.0, "grad_norm": 7.001842441478088, "language_loss": 0.8522377, "learning_rate": 2.9237515115192602e-06, "loss": 0.92972964, "num_input_tokens_seen": 131050710, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.14239502, "step": 6100, "time_per_iteration": 2.546600580215454 }, { "auxiliary_loss_clip": 0.0649085, "auxiliary_loss_mlp": 0.01271605, "balance_loss_clip": 0.06302393, "balance_loss_mlp": 0.01256108, "epoch": 0.36681196452728093, "flos": 21912696645120.0, "grad_norm": 11.41005108161385, "language_loss": 0.7147218, "learning_rate": 2.9234060630579992e-06, "loss": 0.79234636, "num_input_tokens_seen": 131071435, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.1550293, "step": 6101, "time_per_iteration": 2.596757173538208 }, { "auxiliary_loss_clip": 0.06484558, "auxiliary_loss_mlp": 0.01277455, "balance_loss_clip": 0.06300329, "balance_loss_mlp": 0.01260957, "epoch": 0.3668720877799489, "flos": 17718215034240.0, "grad_norm": 2.150019792056376, "language_loss": 0.76611769, "learning_rate": 2.9230605795810865e-06, "loss": 0.84373778, "num_input_tokens_seen": 131088775, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.16497803, "step": 6102, "time_per_iteration": 2.52601957321167 }, { "auxiliary_loss_clip": 0.06496267, "auxiliary_loss_mlp": 0.01282598, "balance_loss_clip": 0.06307407, "balance_loss_mlp": 0.01266564, "epoch": 0.36693221103261686, "flos": 47055882804480.0, "grad_norm": 1.5958680184759915, "language_loss": 0.70740765, "learning_rate": 2.922715061101625e-06, "loss": 0.7851963, "num_input_tokens_seen": 131112800, "router_z_loss_clip": 1.88671875, "router_z_loss_mlp": 0.16040039, "step": 6103, "time_per_iteration": 2.795664072036743 }, { "auxiliary_loss_clip": 0.06494799, "auxiliary_loss_mlp": 0.01274663, "balance_loss_clip": 0.06309611, "balance_loss_mlp": 0.0125916, "epoch": 0.3669923342852848, "flos": 15966383322240.0, "grad_norm": 1.8068470687275553, "language_loss": 0.72843981, "learning_rate": 2.922369507632716e-06, "loss": 0.80613446, "num_input_tokens_seen": 131131150, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.15509033, "step": 6104, "time_per_iteration": 2.555314779281616 }, { "auxiliary_loss_clip": 0.06490999, "auxiliary_loss_mlp": 0.01273, "balance_loss_clip": 0.06308188, "balance_loss_mlp": 0.01256931, "epoch": 0.3670524575379528, "flos": 19980630800640.0, "grad_norm": 1.8932600806493591, "language_loss": 0.82099247, "learning_rate": 2.9220239191874617e-06, "loss": 0.89863247, "num_input_tokens_seen": 131150365, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.1607666, "step": 6105, "time_per_iteration": 3.9813592433929443 }, { "auxiliary_loss_clip": 0.06497871, "auxiliary_loss_mlp": 0.01276752, "balance_loss_clip": 0.06306779, "balance_loss_mlp": 0.01260289, "epoch": 0.36711258079062076, "flos": 25710092956800.0, "grad_norm": 29.760287578751687, "language_loss": 0.81838357, "learning_rate": 2.9216782957789692e-06, "loss": 0.89612985, "num_input_tokens_seen": 131169310, "router_z_loss_clip": 1.91308594, "router_z_loss_mlp": 0.16455078, "step": 6106, "time_per_iteration": 2.613744020462036 }, { "auxiliary_loss_clip": 0.06387828, "auxiliary_loss_mlp": 0.01261516, "balance_loss_clip": 0.06304506, "balance_loss_mlp": 0.01256471, "epoch": 0.3671727040432887, "flos": 60793014648960.0, "grad_norm": 0.672278097816103, "language_loss": 0.59173775, "learning_rate": 2.9213326374203426e-06, "loss": 0.66823125, "num_input_tokens_seen": 131232900, "router_z_loss_clip": 0.83251953, "router_z_loss_mlp": 0.05044556, "step": 6107, "time_per_iteration": 3.28471040725708 }, { "auxiliary_loss_clip": 0.06485987, "auxiliary_loss_mlp": 0.01273711, "balance_loss_clip": 0.06305148, "balance_loss_mlp": 0.01258786, "epoch": 0.3672328272959567, "flos": 18667281104640.0, "grad_norm": 5.442051144164675, "language_loss": 0.74825466, "learning_rate": 2.92098694412469e-06, "loss": 0.82585162, "num_input_tokens_seen": 131250920, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.14916992, "step": 6108, "time_per_iteration": 2.5904104709625244 }, { "auxiliary_loss_clip": 0.06496686, "auxiliary_loss_mlp": 0.01272805, "balance_loss_clip": 0.06310014, "balance_loss_mlp": 0.01257439, "epoch": 0.3672929505486247, "flos": 15054395483520.0, "grad_norm": 1.9945016646435714, "language_loss": 0.74277258, "learning_rate": 2.9206412159051213e-06, "loss": 0.82046747, "num_input_tokens_seen": 131267910, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.15380859, "step": 6109, "time_per_iteration": 2.578437089920044 }, { "auxiliary_loss_clip": 0.064852, "auxiliary_loss_mlp": 0.01274869, "balance_loss_clip": 0.06303772, "balance_loss_mlp": 0.0125886, "epoch": 0.3673530738012927, "flos": 20594693047680.0, "grad_norm": 1.7630032404483522, "language_loss": 0.5376724, "learning_rate": 2.920295452774744e-06, "loss": 0.61527306, "num_input_tokens_seen": 131287150, "router_z_loss_clip": 1.81445312, "router_z_loss_mlp": 0.16015625, "step": 6110, "time_per_iteration": 4.006082534790039 }, { "auxiliary_loss_clip": 0.06489402, "auxiliary_loss_mlp": 0.01275395, "balance_loss_clip": 0.0630911, "balance_loss_mlp": 0.01259397, "epoch": 0.36741319705396064, "flos": 21696348602880.0, "grad_norm": 1.6769282189249741, "language_loss": 0.80819881, "learning_rate": 2.919949654746672e-06, "loss": 0.88584679, "num_input_tokens_seen": 131308225, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.15991211, "step": 6111, "time_per_iteration": 2.622488021850586 }, { "auxiliary_loss_clip": 0.06495061, "auxiliary_loss_mlp": 0.01272623, "balance_loss_clip": 0.06314114, "balance_loss_mlp": 0.01256244, "epoch": 0.3674733203066286, "flos": 29870011958400.0, "grad_norm": 3.4347914239243202, "language_loss": 0.72514808, "learning_rate": 2.9196038218340163e-06, "loss": 0.80282485, "num_input_tokens_seen": 131332115, "router_z_loss_clip": 1.80957031, "router_z_loss_mlp": 0.16381836, "step": 6112, "time_per_iteration": 2.6426074504852295 }, { "auxiliary_loss_clip": 0.06494431, "auxiliary_loss_mlp": 0.01276031, "balance_loss_clip": 0.0631584, "balance_loss_mlp": 0.01260331, "epoch": 0.36753344355929657, "flos": 18262439303040.0, "grad_norm": 1.6227241888290698, "language_loss": 0.85723114, "learning_rate": 2.919257954049892e-06, "loss": 0.93493581, "num_input_tokens_seen": 131351885, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.15679932, "step": 6113, "time_per_iteration": 2.624680280685425 }, { "auxiliary_loss_clip": 0.06496675, "auxiliary_loss_mlp": 0.01277166, "balance_loss_clip": 0.06309839, "balance_loss_mlp": 0.01260131, "epoch": 0.36759356681196453, "flos": 25308144120960.0, "grad_norm": 9.298717682123508, "language_loss": 0.79313207, "learning_rate": 2.918912051407413e-06, "loss": 0.87087047, "num_input_tokens_seen": 131370245, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.17028809, "step": 6114, "time_per_iteration": 2.6096816062927246 }, { "auxiliary_loss_clip": 0.06502444, "auxiliary_loss_mlp": 0.012768, "balance_loss_clip": 0.06315868, "balance_loss_mlp": 0.01259253, "epoch": 0.3676536900646325, "flos": 21039338338560.0, "grad_norm": 2.5471895483804574, "language_loss": 0.6802423, "learning_rate": 2.918566113919698e-06, "loss": 0.75803471, "num_input_tokens_seen": 131388115, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.17541504, "step": 6115, "time_per_iteration": 2.610102891921997 }, { "auxiliary_loss_clip": 0.06487802, "auxiliary_loss_mlp": 0.01277671, "balance_loss_clip": 0.06308267, "balance_loss_mlp": 0.01261816, "epoch": 0.36771381331730046, "flos": 16293882205440.0, "grad_norm": 2.365856822065875, "language_loss": 0.77488756, "learning_rate": 2.9182201415998636e-06, "loss": 0.85254228, "num_input_tokens_seen": 131404595, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.15869141, "step": 6116, "time_per_iteration": 2.672393798828125 }, { "auxiliary_loss_clip": 0.06498848, "auxiliary_loss_mlp": 0.01279694, "balance_loss_clip": 0.06316607, "balance_loss_mlp": 0.01264137, "epoch": 0.36777393656996843, "flos": 22316574124800.0, "grad_norm": 2.3237280978708976, "language_loss": 0.63162506, "learning_rate": 2.9178741344610286e-06, "loss": 0.70941043, "num_input_tokens_seen": 131423760, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.15563965, "step": 6117, "time_per_iteration": 2.6762096881866455 }, { "auxiliary_loss_clip": 0.06494778, "auxiliary_loss_mlp": 0.01277979, "balance_loss_clip": 0.06314038, "balance_loss_mlp": 0.01262529, "epoch": 0.3678340598226364, "flos": 26841405657600.0, "grad_norm": 2.5428935157729904, "language_loss": 0.74176097, "learning_rate": 2.9175280925163156e-06, "loss": 0.81948853, "num_input_tokens_seen": 131444955, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.15441895, "step": 6118, "time_per_iteration": 2.6185109615325928 }, { "auxiliary_loss_clip": 0.06506766, "auxiliary_loss_mlp": 0.01276259, "balance_loss_clip": 0.06318774, "balance_loss_mlp": 0.01259629, "epoch": 0.36789418307530436, "flos": 21768073297920.0, "grad_norm": 2.588981440254637, "language_loss": 0.73049027, "learning_rate": 2.9171820157788445e-06, "loss": 0.80832052, "num_input_tokens_seen": 131465720, "router_z_loss_clip": 1.88085938, "router_z_loss_mlp": 0.1661377, "step": 6119, "time_per_iteration": 2.613281011581421 }, { "auxiliary_loss_clip": 0.06497292, "auxiliary_loss_mlp": 0.01280728, "balance_loss_clip": 0.06316194, "balance_loss_mlp": 0.01265058, "epoch": 0.3679543063279723, "flos": 15929598579840.0, "grad_norm": 2.2318921160258904, "language_loss": 0.80409759, "learning_rate": 2.9168359042617404e-06, "loss": 0.88187784, "num_input_tokens_seen": 131483080, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.15679932, "step": 6120, "time_per_iteration": 2.5855441093444824 }, { "auxiliary_loss_clip": 0.06502973, "auxiliary_loss_mlp": 0.01277921, "balance_loss_clip": 0.06321116, "balance_loss_mlp": 0.01262448, "epoch": 0.3680144295806403, "flos": 24281693205120.0, "grad_norm": 4.661818013868724, "language_loss": 0.65133882, "learning_rate": 2.916489757978126e-06, "loss": 0.72914779, "num_input_tokens_seen": 131502545, "router_z_loss_clip": 1.81933594, "router_z_loss_mlp": 0.15478516, "step": 6121, "time_per_iteration": 2.683589458465576 }, { "auxiliary_loss_clip": 0.06503157, "auxiliary_loss_mlp": 0.01273443, "balance_loss_clip": 0.06321345, "balance_loss_mlp": 0.01258775, "epoch": 0.36807455283330826, "flos": 26111329032960.0, "grad_norm": 13.342397081970313, "language_loss": 0.71858764, "learning_rate": 2.9161435769411286e-06, "loss": 0.79635358, "num_input_tokens_seen": 131522155, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.14666748, "step": 6122, "time_per_iteration": 2.833885908126831 }, { "auxiliary_loss_clip": 0.06494822, "auxiliary_loss_mlp": 0.01273203, "balance_loss_clip": 0.06318003, "balance_loss_mlp": 0.01257908, "epoch": 0.3681346760859763, "flos": 24651972397440.0, "grad_norm": 1.85130099490502, "language_loss": 0.70114487, "learning_rate": 2.915797361163875e-06, "loss": 0.7788251, "num_input_tokens_seen": 131543865, "router_z_loss_clip": 1.76953125, "router_z_loss_mlp": 0.1529541, "step": 6123, "time_per_iteration": 2.660829782485962 }, { "auxiliary_loss_clip": 0.06506653, "auxiliary_loss_mlp": 0.01274762, "balance_loss_clip": 0.06318749, "balance_loss_mlp": 0.01258275, "epoch": 0.36819479933864424, "flos": 23885152957440.0, "grad_norm": 4.902354133425583, "language_loss": 0.75034857, "learning_rate": 2.9154511106594933e-06, "loss": 0.82816267, "num_input_tokens_seen": 131562155, "router_z_loss_clip": 1.87792969, "router_z_loss_mlp": 0.16479492, "step": 6124, "time_per_iteration": 2.5989062786102295 }, { "auxiliary_loss_clip": 0.06499273, "auxiliary_loss_mlp": 0.01276639, "balance_loss_clip": 0.06317206, "balance_loss_mlp": 0.01259163, "epoch": 0.3682549225913122, "flos": 25560606072960.0, "grad_norm": 2.2476086488555014, "language_loss": 0.74934083, "learning_rate": 2.915104825441114e-06, "loss": 0.82710004, "num_input_tokens_seen": 131581695, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.17480469, "step": 6125, "time_per_iteration": 2.659667730331421 }, { "auxiliary_loss_clip": 0.06509449, "auxiliary_loss_mlp": 0.01270475, "balance_loss_clip": 0.06321515, "balance_loss_mlp": 0.01253488, "epoch": 0.36831504584398017, "flos": 16952317989120.0, "grad_norm": 3.8196226300877445, "language_loss": 0.78470635, "learning_rate": 2.9147585055218686e-06, "loss": 0.86250556, "num_input_tokens_seen": 131599465, "router_z_loss_clip": 1.88085938, "router_z_loss_mlp": 0.1697998, "step": 6126, "time_per_iteration": 2.546173572540283 }, { "auxiliary_loss_clip": 0.06512852, "auxiliary_loss_mlp": 0.01274571, "balance_loss_clip": 0.06323095, "balance_loss_mlp": 0.01257321, "epoch": 0.36837516909664814, "flos": 19871198968320.0, "grad_norm": 2.3819275913664657, "language_loss": 0.66738927, "learning_rate": 2.914412150914888e-06, "loss": 0.74526346, "num_input_tokens_seen": 131618330, "router_z_loss_clip": 1.89550781, "router_z_loss_mlp": 0.17248535, "step": 6127, "time_per_iteration": 2.581191062927246 }, { "auxiliary_loss_clip": 0.06508656, "auxiliary_loss_mlp": 0.01274528, "balance_loss_clip": 0.06320763, "balance_loss_mlp": 0.01258817, "epoch": 0.3684352923493161, "flos": 37634976224640.0, "grad_norm": 27.988026311120883, "language_loss": 0.70527345, "learning_rate": 2.9140657616333074e-06, "loss": 0.78310531, "num_input_tokens_seen": 131638960, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.15722656, "step": 6128, "time_per_iteration": 2.7149288654327393 }, { "auxiliary_loss_clip": 0.06506565, "auxiliary_loss_mlp": 0.01276874, "balance_loss_clip": 0.06323671, "balance_loss_mlp": 0.01260793, "epoch": 0.36849541560198407, "flos": 14470786995840.0, "grad_norm": 1.6920798624635247, "language_loss": 0.75589585, "learning_rate": 2.9137193376902614e-06, "loss": 0.83373022, "num_input_tokens_seen": 131657440, "router_z_loss_clip": 1.82910156, "router_z_loss_mlp": 0.1607666, "step": 6129, "time_per_iteration": 2.5827486515045166 }, { "auxiliary_loss_clip": 0.0650703, "auxiliary_loss_mlp": 0.01274318, "balance_loss_clip": 0.06322327, "balance_loss_mlp": 0.01258595, "epoch": 0.36855553885465203, "flos": 25777037969280.0, "grad_norm": 1.7451634505054976, "language_loss": 0.85381114, "learning_rate": 2.9133728790988868e-06, "loss": 0.93162459, "num_input_tokens_seen": 131678035, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.15722656, "step": 6130, "time_per_iteration": 2.635236978530884 }, { "auxiliary_loss_clip": 0.06432545, "auxiliary_loss_mlp": 0.01287017, "balance_loss_clip": 0.06347673, "balance_loss_mlp": 0.01283608, "epoch": 0.36861566210732, "flos": 65071715212800.0, "grad_norm": 0.7982163459799322, "language_loss": 0.60291952, "learning_rate": 2.913026385872321e-06, "loss": 0.6801151, "num_input_tokens_seen": 131742470, "router_z_loss_clip": 0.84716797, "router_z_loss_mlp": 0.03414917, "step": 6131, "time_per_iteration": 3.30208158493042 }, { "auxiliary_loss_clip": 0.06500782, "auxiliary_loss_mlp": 0.01286388, "balance_loss_clip": 0.06321014, "balance_loss_mlp": 0.01269866, "epoch": 0.36867578535998796, "flos": 30962108148480.0, "grad_norm": 1.8610812043230243, "language_loss": 0.73721635, "learning_rate": 2.9126798580237034e-06, "loss": 0.81508803, "num_input_tokens_seen": 131764570, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.16522217, "step": 6132, "time_per_iteration": 2.64534068107605 }, { "auxiliary_loss_clip": 0.06517324, "auxiliary_loss_mlp": 0.01285098, "balance_loss_clip": 0.06327048, "balance_loss_mlp": 0.01267837, "epoch": 0.3687359086126559, "flos": 28845154270080.0, "grad_norm": 1.7002082011527133, "language_loss": 0.74290478, "learning_rate": 2.9123332955661736e-06, "loss": 0.82092893, "num_input_tokens_seen": 131785720, "router_z_loss_clip": 1.90136719, "router_z_loss_mlp": 0.17248535, "step": 6133, "time_per_iteration": 2.6933231353759766 }, { "auxiliary_loss_clip": 0.0650634, "auxiliary_loss_mlp": 0.01285036, "balance_loss_clip": 0.06326565, "balance_loss_mlp": 0.01268084, "epoch": 0.3687960318653239, "flos": 21403076912640.0, "grad_norm": 41.750866079868494, "language_loss": 0.71449137, "learning_rate": 2.911986698512874e-06, "loss": 0.79240513, "num_input_tokens_seen": 131804430, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.16955566, "step": 6134, "time_per_iteration": 3.9149439334869385 }, { "auxiliary_loss_clip": 0.06502809, "auxiliary_loss_mlp": 0.01289184, "balance_loss_clip": 0.06320541, "balance_loss_mlp": 0.01271363, "epoch": 0.36885615511799186, "flos": 20272183482240.0, "grad_norm": 1.6964708129078956, "language_loss": 0.75463581, "learning_rate": 2.9116400668769477e-06, "loss": 0.83255577, "num_input_tokens_seen": 131822060, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.17822266, "step": 6135, "time_per_iteration": 2.6072347164154053 }, { "auxiliary_loss_clip": 0.06433653, "auxiliary_loss_mlp": 0.01279475, "balance_loss_clip": 0.06349356, "balance_loss_mlp": 0.01276268, "epoch": 0.3689162783706599, "flos": 63106317371520.0, "grad_norm": 0.7799303527240311, "language_loss": 0.58577448, "learning_rate": 2.9112934006715376e-06, "loss": 0.66290581, "num_input_tokens_seen": 131880715, "router_z_loss_clip": 0.84375, "router_z_loss_mlp": 0.03210449, "step": 6136, "time_per_iteration": 3.157303810119629 }, { "auxiliary_loss_clip": 0.06506771, "auxiliary_loss_mlp": 0.0128764, "balance_loss_clip": 0.06325272, "balance_loss_mlp": 0.01271046, "epoch": 0.36897640162332784, "flos": 10966536593280.0, "grad_norm": 2.1322963286345638, "language_loss": 0.79786801, "learning_rate": 2.9109466999097918e-06, "loss": 0.87581211, "num_input_tokens_seen": 131895850, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.16577148, "step": 6137, "time_per_iteration": 2.571336269378662 }, { "auxiliary_loss_clip": 0.06507559, "auxiliary_loss_mlp": 0.01282784, "balance_loss_clip": 0.06326006, "balance_loss_mlp": 0.01267025, "epoch": 0.3690365248759958, "flos": 20710581644160.0, "grad_norm": 2.389998047486444, "language_loss": 0.74731708, "learning_rate": 2.9105999646048552e-06, "loss": 0.82522053, "num_input_tokens_seen": 131915775, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.15759277, "step": 6138, "time_per_iteration": 4.083872556686401 }, { "auxiliary_loss_clip": 0.06514136, "auxiliary_loss_mlp": 0.01282713, "balance_loss_clip": 0.06328042, "balance_loss_mlp": 0.01265988, "epoch": 0.3690966481286638, "flos": 31833495884160.0, "grad_norm": 2.3479561056899385, "language_loss": 0.65336263, "learning_rate": 2.9102531947698764e-06, "loss": 0.73133117, "num_input_tokens_seen": 131935715, "router_z_loss_clip": 1.859375, "router_z_loss_mlp": 0.16723633, "step": 6139, "time_per_iteration": 2.691209077835083 }, { "auxiliary_loss_clip": 0.06495893, "auxiliary_loss_mlp": 0.01284356, "balance_loss_clip": 0.06319134, "balance_loss_mlp": 0.01268227, "epoch": 0.36915677138133174, "flos": 13119897870720.0, "grad_norm": 1.9714419938573167, "language_loss": 0.72043526, "learning_rate": 2.909906390418006e-06, "loss": 0.79823774, "num_input_tokens_seen": 131954120, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.16125488, "step": 6140, "time_per_iteration": 2.563746690750122 }, { "auxiliary_loss_clip": 0.06414386, "auxiliary_loss_mlp": 0.01273991, "balance_loss_clip": 0.06331624, "balance_loss_mlp": 0.01270722, "epoch": 0.3692168946339997, "flos": 68707926996480.0, "grad_norm": 0.7298337994315435, "language_loss": 0.59143907, "learning_rate": 2.9095595515623934e-06, "loss": 0.6683228, "num_input_tokens_seen": 132017485, "router_z_loss_clip": 0.828125, "router_z_loss_mlp": 0.03274536, "step": 6141, "time_per_iteration": 3.29567813873291 }, { "auxiliary_loss_clip": 0.06498614, "auxiliary_loss_mlp": 0.01291501, "balance_loss_clip": 0.06316538, "balance_loss_mlp": 0.01274275, "epoch": 0.36927701788666767, "flos": 22024392537600.0, "grad_norm": 2.3662590995838064, "language_loss": 0.75704503, "learning_rate": 2.909212678216192e-06, "loss": 0.83494616, "num_input_tokens_seen": 132036760, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 0.17236328, "step": 6142, "time_per_iteration": 2.5866780281066895 }, { "auxiliary_loss_clip": 0.06497505, "auxiliary_loss_mlp": 0.01292939, "balance_loss_clip": 0.0631814, "balance_loss_mlp": 0.01277823, "epoch": 0.36933714113933563, "flos": 21842103980160.0, "grad_norm": 1.6948213330635886, "language_loss": 0.77229613, "learning_rate": 2.908865770392555e-06, "loss": 0.85020053, "num_input_tokens_seen": 132056935, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.15112305, "step": 6143, "time_per_iteration": 2.594722270965576 }, { "auxiliary_loss_clip": 0.06499276, "auxiliary_loss_mlp": 0.0128933, "balance_loss_clip": 0.06321466, "balance_loss_mlp": 0.01273749, "epoch": 0.3693972643920036, "flos": 23697749301120.0, "grad_norm": 1.6725146363253682, "language_loss": 0.82199335, "learning_rate": 2.9085188281046364e-06, "loss": 0.8998794, "num_input_tokens_seen": 132077285, "router_z_loss_clip": 1.77441406, "router_z_loss_mlp": 0.15557861, "step": 6144, "time_per_iteration": 4.0216004848480225 }, { "auxiliary_loss_clip": 0.06500562, "auxiliary_loss_mlp": 0.01287464, "balance_loss_clip": 0.063191, "balance_loss_mlp": 0.0127149, "epoch": 0.36945738764467156, "flos": 22863355943040.0, "grad_norm": 2.553807554996398, "language_loss": 0.78200364, "learning_rate": 2.908171851365593e-06, "loss": 0.85988402, "num_input_tokens_seen": 132095520, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.15979004, "step": 6145, "time_per_iteration": 2.5776212215423584 }, { "auxiliary_loss_clip": 0.06511243, "auxiliary_loss_mlp": 0.01292522, "balance_loss_clip": 0.06327341, "balance_loss_mlp": 0.01275522, "epoch": 0.36951751089733953, "flos": 16621213380480.0, "grad_norm": 7.717231626649835, "language_loss": 0.77638543, "learning_rate": 2.9078248401885815e-06, "loss": 0.85442305, "num_input_tokens_seen": 132112810, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.16992188, "step": 6146, "time_per_iteration": 2.5657925605773926 }, { "auxiliary_loss_clip": 0.06504588, "auxiliary_loss_mlp": 0.01284779, "balance_loss_clip": 0.06321899, "balance_loss_mlp": 0.01267589, "epoch": 0.3695776341500075, "flos": 18920204254080.0, "grad_norm": 1.8825260329986033, "language_loss": 0.8099311, "learning_rate": 2.907477794586761e-06, "loss": 0.88782477, "num_input_tokens_seen": 132131615, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.171875, "step": 6147, "time_per_iteration": 2.5590291023254395 }, { "auxiliary_loss_clip": 0.06511279, "auxiliary_loss_mlp": 0.01289513, "balance_loss_clip": 0.06327663, "balance_loss_mlp": 0.01272955, "epoch": 0.36963775740267546, "flos": 20813892128640.0, "grad_norm": 1.7720095095850372, "language_loss": 0.83547592, "learning_rate": 2.9071307145732926e-06, "loss": 0.91348386, "num_input_tokens_seen": 132149585, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.16552734, "step": 6148, "time_per_iteration": 2.5870251655578613 }, { "auxiliary_loss_clip": 0.06498594, "auxiliary_loss_mlp": 0.01285155, "balance_loss_clip": 0.06317574, "balance_loss_mlp": 0.01268978, "epoch": 0.3696978806553435, "flos": 26068087526400.0, "grad_norm": 1.8849821738933883, "language_loss": 0.74794501, "learning_rate": 2.9067836001613357e-06, "loss": 0.82578254, "num_input_tokens_seen": 132165555, "router_z_loss_clip": 1.80859375, "router_z_loss_mlp": 0.16186523, "step": 6149, "time_per_iteration": 4.130134105682373 }, { "auxiliary_loss_clip": 0.06514404, "auxiliary_loss_mlp": 0.01287532, "balance_loss_clip": 0.06331494, "balance_loss_mlp": 0.01270175, "epoch": 0.36975800390801145, "flos": 26841237949440.0, "grad_norm": 2.4749481864293754, "language_loss": 0.71341616, "learning_rate": 2.906436451364054e-06, "loss": 0.79143548, "num_input_tokens_seen": 132185100, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.17358398, "step": 6150, "time_per_iteration": 2.626279592514038 }, { "auxiliary_loss_clip": 0.06506828, "auxiliary_loss_mlp": 0.01287872, "balance_loss_clip": 0.0632702, "balance_loss_mlp": 0.01271862, "epoch": 0.3698181271606794, "flos": 21149063660160.0, "grad_norm": 1.460471554860805, "language_loss": 0.81891966, "learning_rate": 2.906089268194611e-06, "loss": 0.89686668, "num_input_tokens_seen": 132203930, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.16015625, "step": 6151, "time_per_iteration": 2.6267948150634766 }, { "auxiliary_loss_clip": 0.06430347, "auxiliary_loss_mlp": 0.01259697, "balance_loss_clip": 0.06348622, "balance_loss_mlp": 0.01256794, "epoch": 0.3698782504133474, "flos": 66761605958400.0, "grad_norm": 0.7674648433522748, "language_loss": 0.63058621, "learning_rate": 2.9057420506661726e-06, "loss": 0.70748663, "num_input_tokens_seen": 132263845, "router_z_loss_clip": 0.81689453, "router_z_loss_mlp": 0.0289917, "step": 6152, "time_per_iteration": 3.328402042388916 }, { "auxiliary_loss_clip": 0.06497535, "auxiliary_loss_mlp": 0.01280416, "balance_loss_clip": 0.06323151, "balance_loss_mlp": 0.01265383, "epoch": 0.36993837366601534, "flos": 24317597479680.0, "grad_norm": 14.487211929267211, "language_loss": 0.70337152, "learning_rate": 2.9053947987919044e-06, "loss": 0.78115106, "num_input_tokens_seen": 132282350, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.15045166, "step": 6153, "time_per_iteration": 2.6069698333740234 }, { "auxiliary_loss_clip": 0.06508767, "auxiliary_loss_mlp": 0.01277819, "balance_loss_clip": 0.0632498, "balance_loss_mlp": 0.01261511, "epoch": 0.3699984969186833, "flos": 24355472325120.0, "grad_norm": 1.9664515850729312, "language_loss": 0.72946554, "learning_rate": 2.9050475125849755e-06, "loss": 0.80733138, "num_input_tokens_seen": 132301930, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.16320801, "step": 6154, "time_per_iteration": 2.6159722805023193 }, { "auxiliary_loss_clip": 0.06498461, "auxiliary_loss_mlp": 0.01279417, "balance_loss_clip": 0.06316052, "balance_loss_mlp": 0.01264087, "epoch": 0.37005862017135127, "flos": 19835378547840.0, "grad_norm": 1.8863388250284665, "language_loss": 0.68655652, "learning_rate": 2.9047001920585534e-06, "loss": 0.76433527, "num_input_tokens_seen": 132320915, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.15332031, "step": 6155, "time_per_iteration": 2.6093194484710693 }, { "auxiliary_loss_clip": 0.06494714, "auxiliary_loss_mlp": 0.01276897, "balance_loss_clip": 0.06315359, "balance_loss_mlp": 0.01261757, "epoch": 0.37011874342401924, "flos": 19579981703040.0, "grad_norm": 1.9723805640124428, "language_loss": 0.6790334, "learning_rate": 2.9043528372258097e-06, "loss": 0.75674963, "num_input_tokens_seen": 132340415, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.15136719, "step": 6156, "time_per_iteration": 2.628859758377075 }, { "auxiliary_loss_clip": 0.06489048, "auxiliary_loss_mlp": 0.01278073, "balance_loss_clip": 0.06312744, "balance_loss_mlp": 0.01263101, "epoch": 0.3701788666766872, "flos": 20380315576320.0, "grad_norm": 1.8776957859699515, "language_loss": 0.82504976, "learning_rate": 2.904005448099916e-06, "loss": 0.90272093, "num_input_tokens_seen": 132358600, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.14978027, "step": 6157, "time_per_iteration": 2.601776123046875 }, { "auxiliary_loss_clip": 0.06507637, "auxiliary_loss_mlp": 0.01279043, "balance_loss_clip": 0.06322163, "balance_loss_mlp": 0.01262962, "epoch": 0.37023898992935517, "flos": 15346325508480.0, "grad_norm": 2.049250193091022, "language_loss": 0.76812285, "learning_rate": 2.9036580246940444e-06, "loss": 0.84598958, "num_input_tokens_seen": 132373160, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.1607666, "step": 6158, "time_per_iteration": 2.57297420501709 }, { "auxiliary_loss_clip": 0.06510726, "auxiliary_loss_mlp": 0.01275247, "balance_loss_clip": 0.06323697, "balance_loss_mlp": 0.01259345, "epoch": 0.37029911318202313, "flos": 19580149411200.0, "grad_norm": 2.10755244671835, "language_loss": 0.68854618, "learning_rate": 2.9033105670213708e-06, "loss": 0.76640588, "num_input_tokens_seen": 132392345, "router_z_loss_clip": 1.87109375, "router_z_loss_mlp": 0.15911865, "step": 6159, "time_per_iteration": 2.6621766090393066 }, { "auxiliary_loss_clip": 0.06498636, "auxiliary_loss_mlp": 0.01276567, "balance_loss_clip": 0.06318826, "balance_loss_mlp": 0.01262375, "epoch": 0.3703592364346911, "flos": 26220509303040.0, "grad_norm": 2.6193089486306445, "language_loss": 0.7119537, "learning_rate": 2.9029630750950697e-06, "loss": 0.78970575, "num_input_tokens_seen": 132412620, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.14196777, "step": 6160, "time_per_iteration": 2.6275715827941895 }, { "auxiliary_loss_clip": 0.06498729, "auxiliary_loss_mlp": 0.01273077, "balance_loss_clip": 0.06320365, "balance_loss_mlp": 0.01258641, "epoch": 0.37041935968735906, "flos": 20054619555840.0, "grad_norm": 1.5538306698182272, "language_loss": 0.79723704, "learning_rate": 2.9026155489283176e-06, "loss": 0.87495506, "num_input_tokens_seen": 132431570, "router_z_loss_clip": 1.78222656, "router_z_loss_mlp": 0.14440918, "step": 6161, "time_per_iteration": 2.5617918968200684 }, { "auxiliary_loss_clip": 0.06495899, "auxiliary_loss_mlp": 0.01275099, "balance_loss_clip": 0.06314506, "balance_loss_mlp": 0.01258624, "epoch": 0.3704794829400271, "flos": 24140633656320.0, "grad_norm": 1.9818952552637517, "language_loss": 0.79762793, "learning_rate": 2.902267988534295e-06, "loss": 0.8753379, "num_input_tokens_seen": 132451525, "router_z_loss_clip": 1.81445312, "router_z_loss_mlp": 0.16461182, "step": 6162, "time_per_iteration": 2.62105393409729 }, { "auxiliary_loss_clip": 0.06505622, "auxiliary_loss_mlp": 0.01270723, "balance_loss_clip": 0.06324986, "balance_loss_mlp": 0.01255285, "epoch": 0.37053960619269505, "flos": 14872232707200.0, "grad_norm": 3.19956918058755, "language_loss": 0.79890347, "learning_rate": 2.9019203939261783e-06, "loss": 0.8766669, "num_input_tokens_seen": 132469875, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.15435791, "step": 6163, "time_per_iteration": 2.7133636474609375 }, { "auxiliary_loss_clip": 0.06494199, "auxiliary_loss_mlp": 0.01282689, "balance_loss_clip": 0.06312808, "balance_loss_mlp": 0.01267704, "epoch": 0.370599729445363, "flos": 21367969251840.0, "grad_norm": 2.178476202272676, "language_loss": 0.68572748, "learning_rate": 2.9015727651171507e-06, "loss": 0.76349634, "num_input_tokens_seen": 132488360, "router_z_loss_clip": 1.81445312, "router_z_loss_mlp": 0.14978027, "step": 6164, "time_per_iteration": 2.6063594818115234 }, { "auxiliary_loss_clip": 0.06501299, "auxiliary_loss_mlp": 0.01278803, "balance_loss_clip": 0.06319094, "balance_loss_mlp": 0.01262733, "epoch": 0.370659852698031, "flos": 26835535872000.0, "grad_norm": 2.0323754320470933, "language_loss": 0.83103907, "learning_rate": 2.9012251021203935e-06, "loss": 0.90884006, "num_input_tokens_seen": 132508630, "router_z_loss_clip": 1.82324219, "router_z_loss_mlp": 0.16064453, "step": 6165, "time_per_iteration": 2.6154513359069824 }, { "auxiliary_loss_clip": 0.06506044, "auxiliary_loss_mlp": 0.01278828, "balance_loss_clip": 0.06319778, "balance_loss_mlp": 0.01261614, "epoch": 0.37071997595069894, "flos": 19105050360960.0, "grad_norm": 3.816365152595886, "language_loss": 0.70201671, "learning_rate": 2.9008774049490896e-06, "loss": 0.77986538, "num_input_tokens_seen": 132527465, "router_z_loss_clip": 1.86328125, "router_z_loss_mlp": 0.17211914, "step": 6166, "time_per_iteration": 2.619824171066284 }, { "auxiliary_loss_clip": 0.0638407, "auxiliary_loss_mlp": 0.01273543, "balance_loss_clip": 0.06302115, "balance_loss_mlp": 0.01270122, "epoch": 0.3707800992033669, "flos": 52193839461120.0, "grad_norm": 0.7757726867309764, "language_loss": 0.56120539, "learning_rate": 2.9005296736164244e-06, "loss": 0.6377815, "num_input_tokens_seen": 132579940, "router_z_loss_clip": 0.8203125, "router_z_loss_mlp": 0.03430176, "step": 6167, "time_per_iteration": 3.093780040740967 }, { "auxiliary_loss_clip": 0.06501413, "auxiliary_loss_mlp": 0.01274958, "balance_loss_clip": 0.06321687, "balance_loss_mlp": 0.01259866, "epoch": 0.3708402224560349, "flos": 19908025637760.0, "grad_norm": 3.2297282260890077, "language_loss": 0.7569629, "learning_rate": 2.900181908135584e-06, "loss": 0.83472663, "num_input_tokens_seen": 132598390, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.15087891, "step": 6168, "time_per_iteration": 2.735985040664673 }, { "auxiliary_loss_clip": 0.06493716, "auxiliary_loss_mlp": 0.01270279, "balance_loss_clip": 0.06312983, "balance_loss_mlp": 0.01255139, "epoch": 0.37090034570870284, "flos": 20013222839040.0, "grad_norm": 3.7712442214458894, "language_loss": 0.74997312, "learning_rate": 2.899834108519755e-06, "loss": 0.82761306, "num_input_tokens_seen": 132616920, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.15136719, "step": 6169, "time_per_iteration": 2.6923396587371826 }, { "auxiliary_loss_clip": 0.06492253, "auxiliary_loss_mlp": 0.01279787, "balance_loss_clip": 0.06314667, "balance_loss_mlp": 0.01264755, "epoch": 0.3709604689613708, "flos": 24141681832320.0, "grad_norm": 2.389687560314229, "language_loss": 0.8007524, "learning_rate": 2.899486274782127e-06, "loss": 0.87847281, "num_input_tokens_seen": 132637660, "router_z_loss_clip": 1.77636719, "router_z_loss_mlp": 0.15039062, "step": 6170, "time_per_iteration": 2.633578300476074 }, { "auxiliary_loss_clip": 0.0648946, "auxiliary_loss_mlp": 0.01272982, "balance_loss_clip": 0.0630956, "balance_loss_mlp": 0.01257115, "epoch": 0.37102059221403877, "flos": 23882469626880.0, "grad_norm": 1.7160350706056668, "language_loss": 0.77000439, "learning_rate": 2.8991384069358885e-06, "loss": 0.84762883, "num_input_tokens_seen": 132657635, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.15863037, "step": 6171, "time_per_iteration": 2.6992435455322266 }, { "auxiliary_loss_clip": 0.06498548, "auxiliary_loss_mlp": 0.01271005, "balance_loss_clip": 0.06317922, "balance_loss_mlp": 0.01255758, "epoch": 0.37108071546670673, "flos": 14506439708160.0, "grad_norm": 2.8093377644548028, "language_loss": 0.8070699, "learning_rate": 2.898790504994232e-06, "loss": 0.88476539, "num_input_tokens_seen": 132674455, "router_z_loss_clip": 1.80859375, "router_z_loss_mlp": 0.15258789, "step": 6172, "time_per_iteration": 2.6612660884857178 }, { "auxiliary_loss_clip": 0.06506179, "auxiliary_loss_mlp": 0.01275982, "balance_loss_clip": 0.06322286, "balance_loss_mlp": 0.01259424, "epoch": 0.3711408387193747, "flos": 34570172160000.0, "grad_norm": 2.116216070978181, "language_loss": 0.59702504, "learning_rate": 2.89844256897035e-06, "loss": 0.67484665, "num_input_tokens_seen": 132695140, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.16564941, "step": 6173, "time_per_iteration": 4.1304895877838135 }, { "auxiliary_loss_clip": 0.06498791, "auxiliary_loss_mlp": 0.01271195, "balance_loss_clip": 0.06317602, "balance_loss_mlp": 0.01254803, "epoch": 0.37120096197204266, "flos": 17316350052480.0, "grad_norm": 2.253157883945253, "language_loss": 0.81448221, "learning_rate": 2.898094598877435e-06, "loss": 0.89218211, "num_input_tokens_seen": 132712470, "router_z_loss_clip": 1.81347656, "router_z_loss_mlp": 0.16394043, "step": 6174, "time_per_iteration": 2.556842565536499 }, { "auxiliary_loss_clip": 0.06485899, "auxiliary_loss_mlp": 0.01266767, "balance_loss_clip": 0.06310926, "balance_loss_mlp": 0.01251878, "epoch": 0.37126108522471063, "flos": 30671855205120.0, "grad_norm": 2.018015277252822, "language_loss": 0.80610698, "learning_rate": 2.8977465947286826e-06, "loss": 0.88363367, "num_input_tokens_seen": 132732945, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.14904785, "step": 6175, "time_per_iteration": 2.6623005867004395 }, { "auxiliary_loss_clip": 0.0649403, "auxiliary_loss_mlp": 0.01269217, "balance_loss_clip": 0.06314897, "balance_loss_mlp": 0.0125347, "epoch": 0.37132120847737865, "flos": 25162682232960.0, "grad_norm": 1.7988356094068219, "language_loss": 0.88872063, "learning_rate": 2.89739855653729e-06, "loss": 0.96635306, "num_input_tokens_seen": 132752470, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.1574707, "step": 6176, "time_per_iteration": 2.618525981903076 }, { "auxiliary_loss_clip": 0.06495267, "auxiliary_loss_mlp": 0.01271424, "balance_loss_clip": 0.06314284, "balance_loss_mlp": 0.01255676, "epoch": 0.3713813317300466, "flos": 21219572471040.0, "grad_norm": 1.6154192542432653, "language_loss": 0.73861498, "learning_rate": 2.8970504843164546e-06, "loss": 0.81628186, "num_input_tokens_seen": 132771485, "router_z_loss_clip": 1.80859375, "router_z_loss_mlp": 0.1574707, "step": 6177, "time_per_iteration": 2.6156678199768066 }, { "auxiliary_loss_clip": 0.06495034, "auxiliary_loss_mlp": 0.01275854, "balance_loss_clip": 0.06314975, "balance_loss_mlp": 0.0125988, "epoch": 0.3714414549827146, "flos": 21623114534400.0, "grad_norm": 1.9776868769749525, "language_loss": 0.75901014, "learning_rate": 2.896702378079374e-06, "loss": 0.83671904, "num_input_tokens_seen": 132791465, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.15979004, "step": 6178, "time_per_iteration": 3.985743522644043 }, { "auxiliary_loss_clip": 0.06491145, "auxiliary_loss_mlp": 0.01269597, "balance_loss_clip": 0.06313293, "balance_loss_mlp": 0.01254535, "epoch": 0.37150157823538255, "flos": 19978073251200.0, "grad_norm": 2.194252355127539, "language_loss": 0.72964597, "learning_rate": 2.8963542378392502e-06, "loss": 0.80725336, "num_input_tokens_seen": 132810160, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.1506958, "step": 6179, "time_per_iteration": 2.603337049484253 }, { "auxiliary_loss_clip": 0.06494897, "auxiliary_loss_mlp": 0.01276824, "balance_loss_clip": 0.0631267, "balance_loss_mlp": 0.01260337, "epoch": 0.3715617014880505, "flos": 24867020701440.0, "grad_norm": 1.6092514490389878, "language_loss": 0.70217431, "learning_rate": 2.896006063609283e-06, "loss": 0.77989149, "num_input_tokens_seen": 132831265, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.16479492, "step": 6180, "time_per_iteration": 2.6315407752990723 }, { "auxiliary_loss_clip": 0.06483948, "auxiliary_loss_mlp": 0.01274758, "balance_loss_clip": 0.06305164, "balance_loss_mlp": 0.01258874, "epoch": 0.3716218247407185, "flos": 20455352507520.0, "grad_norm": 1.6723490523813629, "language_loss": 0.78133225, "learning_rate": 2.8956578554026767e-06, "loss": 0.85891932, "num_input_tokens_seen": 132850005, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.15887451, "step": 6181, "time_per_iteration": 2.668586254119873 }, { "auxiliary_loss_clip": 0.06487587, "auxiliary_loss_mlp": 0.01270796, "balance_loss_clip": 0.06309143, "balance_loss_mlp": 0.01254417, "epoch": 0.37168194799338644, "flos": 24140256312960.0, "grad_norm": 1.9460843611668919, "language_loss": 0.78969061, "learning_rate": 2.8953096132326343e-06, "loss": 0.8672744, "num_input_tokens_seen": 132865790, "router_z_loss_clip": 1.78417969, "router_z_loss_mlp": 0.16381836, "step": 6182, "time_per_iteration": 2.6522135734558105 }, { "auxiliary_loss_clip": 0.06399348, "auxiliary_loss_mlp": 0.01254418, "balance_loss_clip": 0.06316456, "balance_loss_mlp": 0.01250785, "epoch": 0.3717420712460544, "flos": 67429601107200.0, "grad_norm": 0.786359634355967, "language_loss": 0.57273507, "learning_rate": 2.894961337112362e-06, "loss": 0.64927268, "num_input_tokens_seen": 132921775, "router_z_loss_clip": 0.828125, "router_z_loss_mlp": 0.03625488, "step": 6183, "time_per_iteration": 3.1907906532287598 }, { "auxiliary_loss_clip": 0.06489436, "auxiliary_loss_mlp": 0.012788, "balance_loss_clip": 0.06303225, "balance_loss_mlp": 0.0126192, "epoch": 0.37180219449872237, "flos": 22382512888320.0, "grad_norm": 62.54764949278488, "language_loss": 0.7720812, "learning_rate": 2.894613027055066e-06, "loss": 0.84976351, "num_input_tokens_seen": 132941060, "router_z_loss_clip": 1.86523438, "router_z_loss_mlp": 0.16894531, "step": 6184, "time_per_iteration": 4.014302968978882 }, { "auxiliary_loss_clip": 0.06478932, "auxiliary_loss_mlp": 0.01282894, "balance_loss_clip": 0.06303459, "balance_loss_mlp": 0.01266562, "epoch": 0.37186231775139034, "flos": 21876037683840.0, "grad_norm": 1.7087024621423292, "language_loss": 0.72325099, "learning_rate": 2.894264683073954e-06, "loss": 0.80086923, "num_input_tokens_seen": 132961850, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.16333008, "step": 6185, "time_per_iteration": 2.597243070602417 }, { "auxiliary_loss_clip": 0.06472392, "auxiliary_loss_mlp": 0.0127877, "balance_loss_clip": 0.06297826, "balance_loss_mlp": 0.01262737, "epoch": 0.3719224410040583, "flos": 22421142420480.0, "grad_norm": 1.44616316884563, "language_loss": 0.77132285, "learning_rate": 2.8939163051822363e-06, "loss": 0.84883446, "num_input_tokens_seen": 132981625, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.16052246, "step": 6186, "time_per_iteration": 2.586449384689331 }, { "auxiliary_loss_clip": 0.06488146, "auxiliary_loss_mlp": 0.01287009, "balance_loss_clip": 0.06303558, "balance_loss_mlp": 0.01269366, "epoch": 0.37198256425672627, "flos": 25157525207040.0, "grad_norm": 1.8589746866663557, "language_loss": 0.84004557, "learning_rate": 2.8935678933931224e-06, "loss": 0.91779709, "num_input_tokens_seen": 133001225, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.1763916, "step": 6187, "time_per_iteration": 2.596621036529541 }, { "auxiliary_loss_clip": 0.06475472, "auxiliary_loss_mlp": 0.01282254, "balance_loss_clip": 0.06298351, "balance_loss_mlp": 0.01266888, "epoch": 0.37204268750939423, "flos": 21144032415360.0, "grad_norm": 1.979590680461896, "language_loss": 0.85331833, "learning_rate": 2.893219447719824e-06, "loss": 0.93089557, "num_input_tokens_seen": 133018820, "router_z_loss_clip": 1.77050781, "router_z_loss_mlp": 0.15380859, "step": 6188, "time_per_iteration": 2.5440523624420166 }, { "auxiliary_loss_clip": 0.06473975, "auxiliary_loss_mlp": 0.01278156, "balance_loss_clip": 0.06295404, "balance_loss_mlp": 0.01261943, "epoch": 0.37210281076206225, "flos": 21513221504640.0, "grad_norm": 18.798982886486968, "language_loss": 0.6561389, "learning_rate": 2.8928709681755548e-06, "loss": 0.7336601, "num_input_tokens_seen": 133040205, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.1619873, "step": 6189, "time_per_iteration": 3.9906673431396484 }, { "auxiliary_loss_clip": 0.06476299, "auxiliary_loss_mlp": 0.01276056, "balance_loss_clip": 0.06296709, "balance_loss_mlp": 0.01258682, "epoch": 0.3721629340147302, "flos": 17353595992320.0, "grad_norm": 1.6902628635630397, "language_loss": 0.84283429, "learning_rate": 2.8925224547735293e-06, "loss": 0.92035782, "num_input_tokens_seen": 133058095, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.17376709, "step": 6190, "time_per_iteration": 2.546647071838379 }, { "auxiliary_loss_clip": 0.06480049, "auxiliary_loss_mlp": 0.01281354, "balance_loss_clip": 0.06296074, "balance_loss_mlp": 0.01265141, "epoch": 0.3722230572673982, "flos": 16437457376640.0, "grad_norm": 2.5449738807420594, "language_loss": 0.88654876, "learning_rate": 2.8921739075269633e-06, "loss": 0.96416283, "num_input_tokens_seen": 133071530, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.16210938, "step": 6191, "time_per_iteration": 2.505739450454712 }, { "auxiliary_loss_clip": 0.06480631, "auxiliary_loss_mlp": 0.01281677, "balance_loss_clip": 0.06296321, "balance_loss_mlp": 0.01263927, "epoch": 0.37228318052006615, "flos": 22681360874880.0, "grad_norm": 2.7599117349319653, "language_loss": 0.74037665, "learning_rate": 2.891825326449073e-06, "loss": 0.81799978, "num_input_tokens_seen": 133091410, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.17749023, "step": 6192, "time_per_iteration": 2.600011110305786 }, { "auxiliary_loss_clip": 0.06485838, "auxiliary_loss_mlp": 0.01277212, "balance_loss_clip": 0.06304064, "balance_loss_mlp": 0.01262013, "epoch": 0.3723433037727341, "flos": 25272617189760.0, "grad_norm": 2.487529036057932, "language_loss": 0.79933667, "learning_rate": 2.8914767115530766e-06, "loss": 0.87696713, "num_input_tokens_seen": 133110365, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 0.15185547, "step": 6193, "time_per_iteration": 2.598602771759033 }, { "auxiliary_loss_clip": 0.06484455, "auxiliary_loss_mlp": 0.0127928, "balance_loss_clip": 0.06301405, "balance_loss_mlp": 0.01262829, "epoch": 0.3724034270254021, "flos": 10529228534400.0, "grad_norm": 1.8798148026152135, "language_loss": 0.84272432, "learning_rate": 2.891128062852194e-06, "loss": 0.9203617, "num_input_tokens_seen": 133128255, "router_z_loss_clip": 1.83007812, "router_z_loss_mlp": 0.16442871, "step": 6194, "time_per_iteration": 2.5883142948150635 }, { "auxiliary_loss_clip": 0.0647641, "auxiliary_loss_mlp": 0.01278932, "balance_loss_clip": 0.06296952, "balance_loss_mlp": 0.01263852, "epoch": 0.37246355027807004, "flos": 20272393117440.0, "grad_norm": 2.740702831860902, "language_loss": 0.77459228, "learning_rate": 2.890779380359646e-06, "loss": 0.85214567, "num_input_tokens_seen": 133143975, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.15075684, "step": 6195, "time_per_iteration": 2.555284261703491 }, { "auxiliary_loss_clip": 0.06473336, "auxiliary_loss_mlp": 0.01286565, "balance_loss_clip": 0.06297201, "balance_loss_mlp": 0.01270019, "epoch": 0.372523673530738, "flos": 19506705707520.0, "grad_norm": 1.7987670342197177, "language_loss": 0.80061764, "learning_rate": 2.890430664088655e-06, "loss": 0.87821662, "num_input_tokens_seen": 133162935, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.16552734, "step": 6196, "time_per_iteration": 2.600076675415039 }, { "auxiliary_loss_clip": 0.0647168, "auxiliary_loss_mlp": 0.01275019, "balance_loss_clip": 0.06295819, "balance_loss_mlp": 0.01259438, "epoch": 0.372583796783406, "flos": 16769945577600.0, "grad_norm": 2.0905250509010194, "language_loss": 0.83758247, "learning_rate": 2.890081914052443e-06, "loss": 0.91504943, "num_input_tokens_seen": 133181180, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.15576172, "step": 6197, "time_per_iteration": 2.563755750656128 }, { "auxiliary_loss_clip": 0.06468558, "auxiliary_loss_mlp": 0.01281155, "balance_loss_clip": 0.06294378, "balance_loss_mlp": 0.01265014, "epoch": 0.37264392003607394, "flos": 22644576132480.0, "grad_norm": 1.6178345412210007, "language_loss": 0.65144306, "learning_rate": 2.889733130264237e-06, "loss": 0.72894019, "num_input_tokens_seen": 133199615, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.16137695, "step": 6198, "time_per_iteration": 2.6131529808044434 }, { "auxiliary_loss_clip": 0.06468293, "auxiliary_loss_mlp": 0.01275841, "balance_loss_clip": 0.06292402, "balance_loss_mlp": 0.01260272, "epoch": 0.3727040432887419, "flos": 19979037573120.0, "grad_norm": 1.3917234206781297, "language_loss": 0.74560344, "learning_rate": 2.889384312737261e-06, "loss": 0.82304478, "num_input_tokens_seen": 133219650, "router_z_loss_clip": 1.76074219, "router_z_loss_mlp": 0.15551758, "step": 6199, "time_per_iteration": 2.6225335597991943 }, { "auxiliary_loss_clip": 0.06474212, "auxiliary_loss_mlp": 0.01277326, "balance_loss_clip": 0.06296907, "balance_loss_mlp": 0.01262878, "epoch": 0.37276416654140987, "flos": 63911906853120.0, "grad_norm": 1.5956695057009236, "language_loss": 0.81069064, "learning_rate": 2.889035461484742e-06, "loss": 0.88820601, "num_input_tokens_seen": 133245675, "router_z_loss_clip": 1.77246094, "router_z_loss_mlp": 0.14465332, "step": 6200, "time_per_iteration": 2.9903171062469482 }, { "auxiliary_loss_clip": 0.06470216, "auxiliary_loss_mlp": 0.01275163, "balance_loss_clip": 0.06293662, "balance_loss_mlp": 0.01259653, "epoch": 0.37282428979407783, "flos": 39795381244800.0, "grad_norm": 2.343841555831302, "language_loss": 0.60738021, "learning_rate": 2.88868657651991e-06, "loss": 0.684834, "num_input_tokens_seen": 133266905, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.1550293, "step": 6201, "time_per_iteration": 2.763310194015503 }, { "auxiliary_loss_clip": 0.06477105, "auxiliary_loss_mlp": 0.01277106, "balance_loss_clip": 0.0629501, "balance_loss_mlp": 0.01261204, "epoch": 0.37288441304674586, "flos": 22715336505600.0, "grad_norm": 1.7115217572402257, "language_loss": 0.72985059, "learning_rate": 2.8883376578559934e-06, "loss": 0.80739266, "num_input_tokens_seen": 133286865, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.15905762, "step": 6202, "time_per_iteration": 2.6593756675720215 }, { "auxiliary_loss_clip": 0.06471723, "auxiliary_loss_mlp": 0.01276957, "balance_loss_clip": 0.06293572, "balance_loss_mlp": 0.01260983, "epoch": 0.3729445362994138, "flos": 18776209812480.0, "grad_norm": 6.1983049646265185, "language_loss": 0.73770797, "learning_rate": 2.8879887055062243e-06, "loss": 0.81519473, "num_input_tokens_seen": 133305295, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.15966797, "step": 6203, "time_per_iteration": 2.5700883865356445 }, { "auxiliary_loss_clip": 0.0646548, "auxiliary_loss_mlp": 0.01272005, "balance_loss_clip": 0.06290998, "balance_loss_mlp": 0.01258165, "epoch": 0.3730046595520818, "flos": 22462874553600.0, "grad_norm": 1.6429204372987247, "language_loss": 0.82018828, "learning_rate": 2.8876397194838353e-06, "loss": 0.8975631, "num_input_tokens_seen": 133324625, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.13842773, "step": 6204, "time_per_iteration": 2.657144069671631 }, { "auxiliary_loss_clip": 0.06472898, "auxiliary_loss_mlp": 0.01272602, "balance_loss_clip": 0.06292671, "balance_loss_mlp": 0.01256008, "epoch": 0.37306478280474975, "flos": 24323257630080.0, "grad_norm": 1.6562232073011318, "language_loss": 0.75861496, "learning_rate": 2.8872906998020577e-06, "loss": 0.83606994, "num_input_tokens_seen": 133344625, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.16601562, "step": 6205, "time_per_iteration": 2.629443883895874 }, { "auxiliary_loss_clip": 0.06466714, "auxiliary_loss_mlp": 0.01275004, "balance_loss_clip": 0.06291174, "balance_loss_mlp": 0.01259244, "epoch": 0.3731249060574177, "flos": 15820627944960.0, "grad_norm": 11.064618326518707, "language_loss": 0.78507245, "learning_rate": 2.886941646474128e-06, "loss": 0.86248958, "num_input_tokens_seen": 133363605, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.1574707, "step": 6206, "time_per_iteration": 2.599867105484009 }, { "auxiliary_loss_clip": 0.06476405, "auxiliary_loss_mlp": 0.01269695, "balance_loss_clip": 0.06297961, "balance_loss_mlp": 0.01253888, "epoch": 0.3731850293100857, "flos": 19834120736640.0, "grad_norm": 5.444227465537884, "language_loss": 0.93508202, "learning_rate": 2.886592559513283e-06, "loss": 1.01254296, "num_input_tokens_seen": 133379405, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.15814209, "step": 6207, "time_per_iteration": 2.5970518589019775 }, { "auxiliary_loss_clip": 0.06479881, "auxiliary_loss_mlp": 0.01271745, "balance_loss_clip": 0.06298483, "balance_loss_mlp": 0.01256212, "epoch": 0.37324515256275365, "flos": 19068349472640.0, "grad_norm": 2.5644738271776566, "language_loss": 0.8262915, "learning_rate": 2.886243438932759e-06, "loss": 0.90380776, "num_input_tokens_seen": 133397585, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.15539551, "step": 6208, "time_per_iteration": 2.5932083129882812 }, { "auxiliary_loss_clip": 0.06477801, "auxiliary_loss_mlp": 0.01272527, "balance_loss_clip": 0.06296951, "balance_loss_mlp": 0.01255433, "epoch": 0.3733052758154216, "flos": 20710623571200.0, "grad_norm": 13.178327921988808, "language_loss": 0.74023443, "learning_rate": 2.8858942847457953e-06, "loss": 0.8177377, "num_input_tokens_seen": 133415365, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.17102051, "step": 6209, "time_per_iteration": 2.6075973510742188 }, { "auxiliary_loss_clip": 0.06473485, "auxiliary_loss_mlp": 0.01276418, "balance_loss_clip": 0.06296055, "balance_loss_mlp": 0.01259002, "epoch": 0.3733653990680896, "flos": 20199704100480.0, "grad_norm": 1.577424444301989, "language_loss": 0.70763755, "learning_rate": 2.8855450969656305e-06, "loss": 0.7851367, "num_input_tokens_seen": 133435700, "router_z_loss_clip": 1.77246094, "router_z_loss_mlp": 0.17407227, "step": 6210, "time_per_iteration": 2.7035865783691406 }, { "auxiliary_loss_clip": 0.06482714, "auxiliary_loss_mlp": 0.01273929, "balance_loss_clip": 0.06302042, "balance_loss_mlp": 0.01257192, "epoch": 0.37342552232075754, "flos": 20345920675200.0, "grad_norm": 1.8347307287938817, "language_loss": 0.78431177, "learning_rate": 2.8851958756055073e-06, "loss": 0.86187816, "num_input_tokens_seen": 133455180, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.1673584, "step": 6211, "time_per_iteration": 2.6249284744262695 }, { "auxiliary_loss_clip": 0.06482647, "auxiliary_loss_mlp": 0.01269507, "balance_loss_clip": 0.06300414, "balance_loss_mlp": 0.01253735, "epoch": 0.3734856455734255, "flos": 35526701243520.0, "grad_norm": 1.4409375627700616, "language_loss": 0.73856837, "learning_rate": 2.884846620678668e-06, "loss": 0.81608993, "num_input_tokens_seen": 133476715, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.15771484, "step": 6212, "time_per_iteration": 2.7164957523345947 }, { "auxiliary_loss_clip": 0.06496765, "auxiliary_loss_mlp": 0.01282135, "balance_loss_clip": 0.06305613, "balance_loss_mlp": 0.01265315, "epoch": 0.37354576882609347, "flos": 21148686316800.0, "grad_norm": 2.5717247121070717, "language_loss": 0.82645631, "learning_rate": 2.884497332198356e-06, "loss": 0.90424532, "num_input_tokens_seen": 133494550, "router_z_loss_clip": 1.91308594, "router_z_loss_mlp": 0.16833496, "step": 6213, "time_per_iteration": 4.047536611557007 }, { "auxiliary_loss_clip": 0.06483728, "auxiliary_loss_mlp": 0.01281071, "balance_loss_clip": 0.06302583, "balance_loss_mlp": 0.01265026, "epoch": 0.37360589207876144, "flos": 21513179577600.0, "grad_norm": 2.285676844455817, "language_loss": 0.79071885, "learning_rate": 2.8841480101778167e-06, "loss": 0.86836684, "num_input_tokens_seen": 133512640, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.16027832, "step": 6214, "time_per_iteration": 2.6066691875457764 }, { "auxiliary_loss_clip": 0.06483865, "auxiliary_loss_mlp": 0.0127757, "balance_loss_clip": 0.06306148, "balance_loss_mlp": 0.01263485, "epoch": 0.37366601533142946, "flos": 38444953317120.0, "grad_norm": 1.7791399537594235, "language_loss": 0.8504436, "learning_rate": 2.883798654630296e-06, "loss": 0.92805791, "num_input_tokens_seen": 133535540, "router_z_loss_clip": 1.77539062, "router_z_loss_mlp": 0.14074707, "step": 6215, "time_per_iteration": 2.8524818420410156 }, { "auxiliary_loss_clip": 0.06491891, "auxiliary_loss_mlp": 0.01280098, "balance_loss_clip": 0.06307156, "balance_loss_mlp": 0.01263552, "epoch": 0.3737261385840974, "flos": 18446908066560.0, "grad_norm": 2.15032379659747, "language_loss": 0.67985469, "learning_rate": 2.8834492655690423e-06, "loss": 0.75757456, "num_input_tokens_seen": 133555795, "router_z_loss_clip": 1.84765625, "router_z_loss_mlp": 0.16540527, "step": 6216, "time_per_iteration": 2.6991543769836426 }, { "auxiliary_loss_clip": 0.064854, "auxiliary_loss_mlp": 0.01278887, "balance_loss_clip": 0.06304298, "balance_loss_mlp": 0.01262901, "epoch": 0.3737862618367654, "flos": 22936506157440.0, "grad_norm": 2.2228231537947307, "language_loss": 0.6654368, "learning_rate": 2.883099843007303e-06, "loss": 0.74307966, "num_input_tokens_seen": 133575905, "router_z_loss_clip": 1.80957031, "router_z_loss_mlp": 0.15991211, "step": 6217, "time_per_iteration": 4.083714962005615 }, { "auxiliary_loss_clip": 0.0649029, "auxiliary_loss_mlp": 0.0127831, "balance_loss_clip": 0.06307852, "balance_loss_mlp": 0.01261764, "epoch": 0.37384638508943335, "flos": 15414360624000.0, "grad_norm": 7.778910948384062, "language_loss": 0.8087399, "learning_rate": 2.88275038695833e-06, "loss": 0.88642597, "num_input_tokens_seen": 133592585, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.16534424, "step": 6218, "time_per_iteration": 2.545785903930664 }, { "auxiliary_loss_clip": 0.06484573, "auxiliary_loss_mlp": 0.01287029, "balance_loss_clip": 0.06309168, "balance_loss_mlp": 0.01271246, "epoch": 0.3739065083421013, "flos": 24287856480000.0, "grad_norm": 1.4974978493493958, "language_loss": 0.78715205, "learning_rate": 2.8824008974353736e-06, "loss": 0.86486804, "num_input_tokens_seen": 133615070, "router_z_loss_clip": 1.75390625, "router_z_loss_mlp": 0.15777588, "step": 6219, "time_per_iteration": 2.6650490760803223 }, { "auxiliary_loss_clip": 0.06482326, "auxiliary_loss_mlp": 0.01279541, "balance_loss_clip": 0.06305966, "balance_loss_mlp": 0.01264222, "epoch": 0.3739666315947693, "flos": 23009488663680.0, "grad_norm": 3.186298963693123, "language_loss": 0.77705634, "learning_rate": 2.8820513744516866e-06, "loss": 0.85467494, "num_input_tokens_seen": 133633490, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.15332031, "step": 6220, "time_per_iteration": 2.598867654800415 }, { "auxiliary_loss_clip": 0.06491712, "auxiliary_loss_mlp": 0.01273941, "balance_loss_clip": 0.0630912, "balance_loss_mlp": 0.01258312, "epoch": 0.37402675484743725, "flos": 19397231948160.0, "grad_norm": 1.7593735832679818, "language_loss": 0.83406436, "learning_rate": 2.8817018180205235e-06, "loss": 0.91172087, "num_input_tokens_seen": 133653425, "router_z_loss_clip": 1.82324219, "router_z_loss_mlp": 0.15637207, "step": 6221, "time_per_iteration": 2.5936317443847656 }, { "auxiliary_loss_clip": 0.06489973, "auxiliary_loss_mlp": 0.01277354, "balance_loss_clip": 0.06311145, "balance_loss_mlp": 0.01262012, "epoch": 0.3740868781001052, "flos": 17131420091520.0, "grad_norm": 1.825629128676879, "language_loss": 0.76700163, "learning_rate": 2.8813522281551387e-06, "loss": 0.84467483, "num_input_tokens_seen": 133670220, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.15344238, "step": 6222, "time_per_iteration": 2.5327765941619873 }, { "auxiliary_loss_clip": 0.06491987, "auxiliary_loss_mlp": 0.01276337, "balance_loss_clip": 0.06312399, "balance_loss_mlp": 0.01261138, "epoch": 0.3741470013527732, "flos": 20049001332480.0, "grad_norm": 1.6171850149812186, "language_loss": 0.70870602, "learning_rate": 2.881002604868789e-06, "loss": 0.78638929, "num_input_tokens_seen": 133688910, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.15185547, "step": 6223, "time_per_iteration": 3.9640395641326904 }, { "auxiliary_loss_clip": 0.06485641, "auxiliary_loss_mlp": 0.01272051, "balance_loss_clip": 0.06307016, "balance_loss_mlp": 0.01256876, "epoch": 0.37420712460544114, "flos": 36905151162240.0, "grad_norm": 1.6550091462757746, "language_loss": 0.69142789, "learning_rate": 2.8806529481747325e-06, "loss": 0.76900482, "num_input_tokens_seen": 133708690, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.15167236, "step": 6224, "time_per_iteration": 2.7865095138549805 }, { "auxiliary_loss_clip": 0.06479553, "auxiliary_loss_mlp": 0.01274516, "balance_loss_clip": 0.0630388, "balance_loss_mlp": 0.01258709, "epoch": 0.3742672478581091, "flos": 22207896979200.0, "grad_norm": 1.6344469947823006, "language_loss": 0.70418191, "learning_rate": 2.880303258086228e-06, "loss": 0.78172255, "num_input_tokens_seen": 133728095, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.15808105, "step": 6225, "time_per_iteration": 2.644641160964966 }, { "auxiliary_loss_clip": 0.06488562, "auxiliary_loss_mlp": 0.01276939, "balance_loss_clip": 0.06312342, "balance_loss_mlp": 0.01260524, "epoch": 0.3743273711107771, "flos": 24688547504640.0, "grad_norm": 1.9860452958336803, "language_loss": 0.79351664, "learning_rate": 2.879953534616536e-06, "loss": 0.87117159, "num_input_tokens_seen": 133745590, "router_z_loss_clip": 1.76269531, "router_z_loss_mlp": 0.16418457, "step": 6226, "time_per_iteration": 2.6320624351501465 }, { "auxiliary_loss_clip": 0.0648932, "auxiliary_loss_mlp": 0.01271603, "balance_loss_clip": 0.06309429, "balance_loss_mlp": 0.01256237, "epoch": 0.37438749436344504, "flos": 24466078114560.0, "grad_norm": 1.8963221159194306, "language_loss": 0.6805979, "learning_rate": 2.879603777778917e-06, "loss": 0.75820714, "num_input_tokens_seen": 133766155, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.15368652, "step": 6227, "time_per_iteration": 2.621920585632324 }, { "auxiliary_loss_clip": 0.06484833, "auxiliary_loss_mlp": 0.01273298, "balance_loss_clip": 0.06308785, "balance_loss_mlp": 0.01258647, "epoch": 0.374447617616113, "flos": 21805193456640.0, "grad_norm": 1.6593373907673492, "language_loss": 0.83453035, "learning_rate": 2.879253987586635e-06, "loss": 0.91211164, "num_input_tokens_seen": 133783185, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.14672852, "step": 6228, "time_per_iteration": 2.583064317703247 }, { "auxiliary_loss_clip": 0.06489685, "auxiliary_loss_mlp": 0.01269597, "balance_loss_clip": 0.0631106, "balance_loss_mlp": 0.01254505, "epoch": 0.374507740868781, "flos": 17974073076480.0, "grad_norm": 1.6979269602459282, "language_loss": 0.75001162, "learning_rate": 2.8789041640529535e-06, "loss": 0.82760441, "num_input_tokens_seen": 133800975, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.15087891, "step": 6229, "time_per_iteration": 3.9988579750061035 }, { "auxiliary_loss_clip": 0.06486079, "auxiliary_loss_mlp": 0.01271936, "balance_loss_clip": 0.06305937, "balance_loss_mlp": 0.01255903, "epoch": 0.374567864121449, "flos": 16111132450560.0, "grad_norm": 1.7870960253056096, "language_loss": 0.83366412, "learning_rate": 2.8785543071911383e-06, "loss": 0.91124427, "num_input_tokens_seen": 133818020, "router_z_loss_clip": 1.80078125, "router_z_loss_mlp": 0.16040039, "step": 6230, "time_per_iteration": 2.58752703666687 }, { "auxiliary_loss_clip": 0.0648998, "auxiliary_loss_mlp": 0.01274567, "balance_loss_clip": 0.06309555, "balance_loss_mlp": 0.01258044, "epoch": 0.37462798737411696, "flos": 25779847080960.0, "grad_norm": 1.7951151379550574, "language_loss": 0.73793811, "learning_rate": 2.878204417014456e-06, "loss": 0.81558359, "num_input_tokens_seen": 133840690, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.16516113, "step": 6231, "time_per_iteration": 2.638014793395996 }, { "auxiliary_loss_clip": 0.06488019, "auxiliary_loss_mlp": 0.01274651, "balance_loss_clip": 0.06306218, "balance_loss_mlp": 0.01258212, "epoch": 0.3746881106267849, "flos": 16660136401920.0, "grad_norm": 1.9716265786447673, "language_loss": 0.74027491, "learning_rate": 2.8778544935361735e-06, "loss": 0.81790161, "num_input_tokens_seen": 133858350, "router_z_loss_clip": 1.81738281, "router_z_loss_mlp": 0.16418457, "step": 6232, "time_per_iteration": 2.5642662048339844 }, { "auxiliary_loss_clip": 0.06483105, "auxiliary_loss_mlp": 0.01276285, "balance_loss_clip": 0.06303591, "balance_loss_mlp": 0.01260729, "epoch": 0.3747482338794529, "flos": 26185317788160.0, "grad_norm": 4.472661736969135, "language_loss": 0.77225661, "learning_rate": 2.877504536769561e-06, "loss": 0.84985054, "num_input_tokens_seen": 133879775, "router_z_loss_clip": 1.79394531, "router_z_loss_mlp": 0.15545654, "step": 6233, "time_per_iteration": 2.6584713459014893 }, { "auxiliary_loss_clip": 0.06486075, "auxiliary_loss_mlp": 0.01274234, "balance_loss_clip": 0.06307043, "balance_loss_mlp": 0.01259869, "epoch": 0.37480835713212085, "flos": 12025956890880.0, "grad_norm": 2.099232672312574, "language_loss": 0.6971131, "learning_rate": 2.8771545467278883e-06, "loss": 0.77471614, "num_input_tokens_seen": 133898295, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.14355469, "step": 6234, "time_per_iteration": 2.5477569103240967 }, { "auxiliary_loss_clip": 0.06490259, "auxiliary_loss_mlp": 0.01272069, "balance_loss_clip": 0.06311039, "balance_loss_mlp": 0.01258157, "epoch": 0.3748684803847888, "flos": 19684801560960.0, "grad_norm": 1.8901418509067565, "language_loss": 0.82783496, "learning_rate": 2.8768045234244276e-06, "loss": 0.90545821, "num_input_tokens_seen": 133915230, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.13916016, "step": 6235, "time_per_iteration": 2.5885348320007324 }, { "auxiliary_loss_clip": 0.06483355, "auxiliary_loss_mlp": 0.01270873, "balance_loss_clip": 0.06303221, "balance_loss_mlp": 0.01254827, "epoch": 0.3749286036374568, "flos": 20527328764800.0, "grad_norm": 1.8994766295550545, "language_loss": 0.785712, "learning_rate": 2.8764544668724517e-06, "loss": 0.86325425, "num_input_tokens_seen": 133934110, "router_z_loss_clip": 1.80078125, "router_z_loss_mlp": 0.16052246, "step": 6236, "time_per_iteration": 2.5760834217071533 }, { "auxiliary_loss_clip": 0.0648593, "auxiliary_loss_mlp": 0.01276359, "balance_loss_clip": 0.06302057, "balance_loss_mlp": 0.01258907, "epoch": 0.37498872689012475, "flos": 20710958987520.0, "grad_norm": 3.9525614174087167, "language_loss": 0.73741972, "learning_rate": 2.876104377085234e-06, "loss": 0.81504261, "num_input_tokens_seen": 133952395, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.17456055, "step": 6237, "time_per_iteration": 2.6051688194274902 }, { "auxiliary_loss_clip": 0.06486149, "auxiliary_loss_mlp": 0.01274531, "balance_loss_clip": 0.06303459, "balance_loss_mlp": 0.01258366, "epoch": 0.3750488501427927, "flos": 21580418079360.0, "grad_norm": 2.3920261641055225, "language_loss": 0.93412817, "learning_rate": 2.8757542540760508e-06, "loss": 1.01173496, "num_input_tokens_seen": 133969635, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.16174316, "step": 6238, "time_per_iteration": 2.5671041011810303 }, { "auxiliary_loss_clip": 0.06482419, "auxiliary_loss_mlp": 0.01277756, "balance_loss_clip": 0.06300675, "balance_loss_mlp": 0.01262199, "epoch": 0.3751089733954607, "flos": 15929221236480.0, "grad_norm": 1.8888506464539039, "language_loss": 0.71409208, "learning_rate": 2.8754040978581777e-06, "loss": 0.79169387, "num_input_tokens_seen": 133987215, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.15563965, "step": 6239, "time_per_iteration": 2.543748140335083 }, { "auxiliary_loss_clip": 0.06484122, "auxiliary_loss_mlp": 0.01275014, "balance_loss_clip": 0.06303136, "balance_loss_mlp": 0.01259195, "epoch": 0.37516909664812864, "flos": 36293688391680.0, "grad_norm": 1.5452208299813912, "language_loss": 0.66038769, "learning_rate": 2.875053908444895e-06, "loss": 0.73797905, "num_input_tokens_seen": 134009250, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.15820312, "step": 6240, "time_per_iteration": 2.6938230991363525 }, { "auxiliary_loss_clip": 0.06484954, "auxiliary_loss_mlp": 0.01273815, "balance_loss_clip": 0.06304, "balance_loss_mlp": 0.01258831, "epoch": 0.3752292199007966, "flos": 13520882384640.0, "grad_norm": 2.3636954803318937, "language_loss": 0.75839162, "learning_rate": 2.8747036858494795e-06, "loss": 0.83597934, "num_input_tokens_seen": 134026875, "router_z_loss_clip": 1.80859375, "router_z_loss_mlp": 0.14978027, "step": 6241, "time_per_iteration": 2.5718111991882324 }, { "auxiliary_loss_clip": 0.06483756, "auxiliary_loss_mlp": 0.01276078, "balance_loss_clip": 0.06302035, "balance_loss_mlp": 0.01260247, "epoch": 0.3752893431534646, "flos": 27205353866880.0, "grad_norm": 2.0088412698673426, "language_loss": 0.84692085, "learning_rate": 2.874353430085213e-06, "loss": 0.92451918, "num_input_tokens_seen": 134047185, "router_z_loss_clip": 1.81738281, "router_z_loss_mlp": 0.1583252, "step": 6242, "time_per_iteration": 2.6438803672790527 }, { "auxiliary_loss_clip": 0.06485915, "auxiliary_loss_mlp": 0.01274864, "balance_loss_clip": 0.06304447, "balance_loss_mlp": 0.01259855, "epoch": 0.3753494664061326, "flos": 30015431919360.0, "grad_norm": 2.4001140972745483, "language_loss": 0.68942589, "learning_rate": 2.8740031411653766e-06, "loss": 0.7670337, "num_input_tokens_seen": 134067330, "router_z_loss_clip": 1.81445312, "router_z_loss_mlp": 0.15008545, "step": 6243, "time_per_iteration": 2.663540840148926 }, { "auxiliary_loss_clip": 0.06476243, "auxiliary_loss_mlp": 0.01274868, "balance_loss_clip": 0.06296919, "balance_loss_mlp": 0.01258596, "epoch": 0.37540958965880056, "flos": 24468803372160.0, "grad_norm": 1.9359304341632257, "language_loss": 0.84478092, "learning_rate": 2.8736528191032535e-06, "loss": 0.92229199, "num_input_tokens_seen": 134085525, "router_z_loss_clip": 1.79199219, "router_z_loss_mlp": 0.1628418, "step": 6244, "time_per_iteration": 2.590508222579956 }, { "auxiliary_loss_clip": 0.06474287, "auxiliary_loss_mlp": 0.01277203, "balance_loss_clip": 0.06297594, "balance_loss_mlp": 0.01262874, "epoch": 0.3754697129114685, "flos": 16513961754240.0, "grad_norm": 3.1060508809403746, "language_loss": 0.83149618, "learning_rate": 2.8733024639121277e-06, "loss": 0.90901107, "num_input_tokens_seen": 134101855, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.14331055, "step": 6245, "time_per_iteration": 2.5940778255462646 }, { "auxiliary_loss_clip": 0.06479479, "auxiliary_loss_mlp": 0.01274925, "balance_loss_clip": 0.0630026, "balance_loss_mlp": 0.01259166, "epoch": 0.3755298361641365, "flos": 19396980385920.0, "grad_norm": 2.0696839330902956, "language_loss": 0.64699304, "learning_rate": 2.8729520756052853e-06, "loss": 0.72453713, "num_input_tokens_seen": 134119360, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.15759277, "step": 6246, "time_per_iteration": 2.568490743637085 }, { "auxiliary_loss_clip": 0.06488179, "auxiliary_loss_mlp": 0.01275698, "balance_loss_clip": 0.06303293, "balance_loss_mlp": 0.01260117, "epoch": 0.37558995941680445, "flos": 14725638789120.0, "grad_norm": 1.6795150150666414, "language_loss": 0.75520623, "learning_rate": 2.8726016541960124e-06, "loss": 0.83284497, "num_input_tokens_seen": 134137475, "router_z_loss_clip": 1.84765625, "router_z_loss_mlp": 0.15588379, "step": 6247, "time_per_iteration": 2.556678056716919 }, { "auxiliary_loss_clip": 0.06484373, "auxiliary_loss_mlp": 0.01274238, "balance_loss_clip": 0.06301991, "balance_loss_mlp": 0.01258312, "epoch": 0.3756500826694724, "flos": 21696432456960.0, "grad_norm": 2.5540404026402506, "language_loss": 0.56171417, "learning_rate": 2.872251199697598e-06, "loss": 0.63930023, "num_input_tokens_seen": 134154580, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.15917969, "step": 6248, "time_per_iteration": 2.5683040618896484 }, { "auxiliary_loss_clip": 0.06475464, "auxiliary_loss_mlp": 0.01275942, "balance_loss_clip": 0.06297269, "balance_loss_mlp": 0.01260433, "epoch": 0.3757102059221404, "flos": 26512942452480.0, "grad_norm": 2.1928914560348134, "language_loss": 0.84591281, "learning_rate": 2.8719007121233297e-06, "loss": 0.92342693, "num_input_tokens_seen": 134174285, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.1550293, "step": 6249, "time_per_iteration": 2.6547904014587402 }, { "auxiliary_loss_clip": 0.06485552, "auxiliary_loss_mlp": 0.01272568, "balance_loss_clip": 0.06304738, "balance_loss_mlp": 0.01257559, "epoch": 0.37577032917480835, "flos": 37346526144000.0, "grad_norm": 1.8746487469569617, "language_loss": 0.68620324, "learning_rate": 2.8715501914864993e-06, "loss": 0.76378447, "num_input_tokens_seen": 134195940, "router_z_loss_clip": 1.80859375, "router_z_loss_mlp": 0.15014648, "step": 6250, "time_per_iteration": 2.712646007537842 }, { "auxiliary_loss_clip": 0.06482814, "auxiliary_loss_mlp": 0.01271954, "balance_loss_clip": 0.06302053, "balance_loss_mlp": 0.01257863, "epoch": 0.3758304524274763, "flos": 21915128413440.0, "grad_norm": 1.9748981270328145, "language_loss": 0.78083718, "learning_rate": 2.8711996378003987e-06, "loss": 0.85838485, "num_input_tokens_seen": 134212235, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.14080811, "step": 6251, "time_per_iteration": 2.5672545433044434 }, { "auxiliary_loss_clip": 0.06483023, "auxiliary_loss_mlp": 0.01278971, "balance_loss_clip": 0.06304084, "balance_loss_mlp": 0.0126438, "epoch": 0.3758905756801443, "flos": 36577233008640.0, "grad_norm": 2.0568030248880236, "language_loss": 0.58561426, "learning_rate": 2.8708490510783203e-06, "loss": 0.66323417, "num_input_tokens_seen": 134233810, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.14587402, "step": 6252, "time_per_iteration": 2.6639809608459473 }, { "auxiliary_loss_clip": 0.06482249, "auxiliary_loss_mlp": 0.01275519, "balance_loss_clip": 0.06301232, "balance_loss_mlp": 0.01259855, "epoch": 0.37595069893281224, "flos": 24534616354560.0, "grad_norm": 2.2443202533581768, "language_loss": 0.89406633, "learning_rate": 2.8704984313335584e-06, "loss": 0.97164404, "num_input_tokens_seen": 134252020, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.15661621, "step": 6253, "time_per_iteration": 3.990396022796631 }, { "auxiliary_loss_clip": 0.06488104, "auxiliary_loss_mlp": 0.01274403, "balance_loss_clip": 0.06310974, "balance_loss_mlp": 0.01259442, "epoch": 0.3760108221854802, "flos": 16440518050560.0, "grad_norm": 1.7827856015875074, "language_loss": 0.76918679, "learning_rate": 2.8701477785794097e-06, "loss": 0.84681189, "num_input_tokens_seen": 134269495, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.14978027, "step": 6254, "time_per_iteration": 2.5535011291503906 }, { "auxiliary_loss_clip": 0.06484959, "auxiliary_loss_mlp": 0.01272805, "balance_loss_clip": 0.0630321, "balance_loss_mlp": 0.01256807, "epoch": 0.37607094543814823, "flos": 13776824280960.0, "grad_norm": 2.005044669029225, "language_loss": 0.62634051, "learning_rate": 2.869797092829169e-06, "loss": 0.7039181, "num_input_tokens_seen": 134287035, "router_z_loss_clip": 1.81738281, "router_z_loss_mlp": 0.16003418, "step": 6255, "time_per_iteration": 2.543989419937134 }, { "auxiliary_loss_clip": 0.06486846, "auxiliary_loss_mlp": 0.01269545, "balance_loss_clip": 0.06303947, "balance_loss_mlp": 0.01253511, "epoch": 0.3761310686908162, "flos": 19862855487360.0, "grad_norm": 2.1987428950242425, "language_loss": 0.74321985, "learning_rate": 2.869446374096135e-06, "loss": 0.82078373, "num_input_tokens_seen": 134304840, "router_z_loss_clip": 1.83007812, "router_z_loss_mlp": 0.16027832, "step": 6256, "time_per_iteration": 4.137516975402832 }, { "auxiliary_loss_clip": 0.06484052, "auxiliary_loss_mlp": 0.01272766, "balance_loss_clip": 0.06301194, "balance_loss_mlp": 0.01256757, "epoch": 0.37619119194348416, "flos": 12755823880320.0, "grad_norm": 1.8782292354610781, "language_loss": 0.70917642, "learning_rate": 2.8690956223936088e-06, "loss": 0.78674459, "num_input_tokens_seen": 134323180, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.16003418, "step": 6257, "time_per_iteration": 2.7903385162353516 }, { "auxiliary_loss_clip": 0.06479552, "auxiliary_loss_mlp": 0.01270715, "balance_loss_clip": 0.0630001, "balance_loss_mlp": 0.01255743, "epoch": 0.3762513151961521, "flos": 17536387674240.0, "grad_norm": 1.6036831441098045, "language_loss": 0.85040838, "learning_rate": 2.868744837734889e-06, "loss": 0.92791104, "num_input_tokens_seen": 134341390, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.1496582, "step": 6258, "time_per_iteration": 2.571516752243042 }, { "auxiliary_loss_clip": 0.06486353, "auxiliary_loss_mlp": 0.01269687, "balance_loss_clip": 0.063063, "balance_loss_mlp": 0.01255722, "epoch": 0.3763114384488201, "flos": 23623215494400.0, "grad_norm": 2.021334770447863, "language_loss": 0.81360042, "learning_rate": 2.868394020133277e-06, "loss": 0.89116085, "num_input_tokens_seen": 134360425, "router_z_loss_clip": 1.80078125, "router_z_loss_mlp": 0.13970947, "step": 6259, "time_per_iteration": 2.6172397136688232 }, { "auxiliary_loss_clip": 0.06494047, "auxiliary_loss_mlp": 0.01274299, "balance_loss_clip": 0.06311766, "balance_loss_mlp": 0.01256966, "epoch": 0.37637156170148806, "flos": 25413383249280.0, "grad_norm": 2.535361231488561, "language_loss": 0.71868455, "learning_rate": 2.8680431696020783e-06, "loss": 0.796368, "num_input_tokens_seen": 134379775, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.17333984, "step": 6260, "time_per_iteration": 2.612884044647217 }, { "auxiliary_loss_clip": 0.064895, "auxiliary_loss_mlp": 0.01274186, "balance_loss_clip": 0.06304947, "balance_loss_mlp": 0.01257663, "epoch": 0.376431684954156, "flos": 23447677190400.0, "grad_norm": 1.573955248291773, "language_loss": 0.78916126, "learning_rate": 2.867692286154594e-06, "loss": 0.8667981, "num_input_tokens_seen": 134400315, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.16516113, "step": 6261, "time_per_iteration": 2.600766181945801 }, { "auxiliary_loss_clip": 0.0649039, "auxiliary_loss_mlp": 0.01272896, "balance_loss_clip": 0.06306137, "balance_loss_mlp": 0.01258519, "epoch": 0.376491808206824, "flos": 34213099985280.0, "grad_norm": 1.7764934005592912, "language_loss": 0.8072235, "learning_rate": 2.867341369804132e-06, "loss": 0.88485634, "num_input_tokens_seen": 134422875, "router_z_loss_clip": 1.84179688, "router_z_loss_mlp": 0.1439209, "step": 6262, "time_per_iteration": 4.145408868789673 }, { "auxiliary_loss_clip": 0.06482822, "auxiliary_loss_mlp": 0.01271994, "balance_loss_clip": 0.06303381, "balance_loss_mlp": 0.01257224, "epoch": 0.37655193145949195, "flos": 35193793772160.0, "grad_norm": 1.883391764372412, "language_loss": 0.81092513, "learning_rate": 2.866990420563998e-06, "loss": 0.88847327, "num_input_tokens_seen": 134443025, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.14782715, "step": 6263, "time_per_iteration": 2.73221755027771 }, { "auxiliary_loss_clip": 0.06491485, "auxiliary_loss_mlp": 0.01273715, "balance_loss_clip": 0.06307746, "balance_loss_mlp": 0.01259207, "epoch": 0.3766120547121599, "flos": 16767136465920.0, "grad_norm": 1.6356037954551153, "language_loss": 0.8018114, "learning_rate": 2.866639438447501e-06, "loss": 0.87946337, "num_input_tokens_seen": 134460945, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.14501953, "step": 6264, "time_per_iteration": 2.575636148452759 }, { "auxiliary_loss_clip": 0.06483088, "auxiliary_loss_mlp": 0.01271109, "balance_loss_clip": 0.06304619, "balance_loss_mlp": 0.01256529, "epoch": 0.3766721779648279, "flos": 23557150949760.0, "grad_norm": 1.7579799697276015, "language_loss": 0.73943508, "learning_rate": 2.8662884234679497e-06, "loss": 0.81697702, "num_input_tokens_seen": 134480440, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.14587402, "step": 6265, "time_per_iteration": 2.576298952102661 }, { "auxiliary_loss_clip": 0.064858, "auxiliary_loss_mlp": 0.01275763, "balance_loss_clip": 0.06307544, "balance_loss_mlp": 0.01262293, "epoch": 0.37673230121749585, "flos": 29136329608320.0, "grad_norm": 1.5617388486829065, "language_loss": 0.69081295, "learning_rate": 2.865937375638654e-06, "loss": 0.76842862, "num_input_tokens_seen": 134501110, "router_z_loss_clip": 1.78027344, "router_z_loss_mlp": 0.13470459, "step": 6266, "time_per_iteration": 2.663517475128174 }, { "auxiliary_loss_clip": 0.06488898, "auxiliary_loss_mlp": 0.0127556, "balance_loss_clip": 0.06303571, "balance_loss_mlp": 0.01260384, "epoch": 0.3767924244701638, "flos": 28154210302080.0, "grad_norm": 6.566267560270022, "language_loss": 0.6349355, "learning_rate": 2.8655862949729264e-06, "loss": 0.71258008, "num_input_tokens_seen": 134522460, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.1517334, "step": 6267, "time_per_iteration": 2.6084868907928467 }, { "auxiliary_loss_clip": 0.0640926, "auxiliary_loss_mlp": 0.0126198, "balance_loss_clip": 0.0632863, "balance_loss_mlp": 0.0125877, "epoch": 0.37685254772283183, "flos": 60815460343680.0, "grad_norm": 0.7117909947413058, "language_loss": 0.58950913, "learning_rate": 2.8652351814840795e-06, "loss": 0.66622156, "num_input_tokens_seen": 134589545, "router_z_loss_clip": 0.8046875, "router_z_loss_mlp": 0.03213501, "step": 6268, "time_per_iteration": 4.681051731109619 }, { "auxiliary_loss_clip": 0.06478475, "auxiliary_loss_mlp": 0.01270007, "balance_loss_clip": 0.06299844, "balance_loss_mlp": 0.0125457, "epoch": 0.3769126709754998, "flos": 26039939754240.0, "grad_norm": 1.543815747703326, "language_loss": 0.65685791, "learning_rate": 2.8648840351854283e-06, "loss": 0.73434269, "num_input_tokens_seen": 134610550, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.15454102, "step": 6269, "time_per_iteration": 2.611362934112549 }, { "auxiliary_loss_clip": 0.06483288, "auxiliary_loss_mlp": 0.01272929, "balance_loss_clip": 0.06307116, "balance_loss_mlp": 0.0125776, "epoch": 0.37697279422816776, "flos": 23585508357120.0, "grad_norm": 1.45797795205, "language_loss": 0.70963502, "learning_rate": 2.8645328560902874e-06, "loss": 0.78719723, "num_input_tokens_seen": 134630485, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.15167236, "step": 6270, "time_per_iteration": 2.6206274032592773 }, { "auxiliary_loss_clip": 0.06400092, "auxiliary_loss_mlp": 0.01259968, "balance_loss_clip": 0.06319298, "balance_loss_mlp": 0.01256601, "epoch": 0.3770329174808357, "flos": 64766242753920.0, "grad_norm": 0.7394781898086404, "language_loss": 0.56120789, "learning_rate": 2.8641816442119746e-06, "loss": 0.63780844, "num_input_tokens_seen": 134693510, "router_z_loss_clip": 0.8046875, "router_z_loss_mlp": 0.03372192, "step": 6271, "time_per_iteration": 3.231011152267456 }, { "auxiliary_loss_clip": 0.06482083, "auxiliary_loss_mlp": 0.01270133, "balance_loss_clip": 0.06304276, "balance_loss_mlp": 0.01254159, "epoch": 0.3770930407335037, "flos": 21841768563840.0, "grad_norm": 1.690926446170859, "language_loss": 0.80034041, "learning_rate": 2.8638303995638066e-06, "loss": 0.87786251, "num_input_tokens_seen": 134713115, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.15960693, "step": 6272, "time_per_iteration": 2.6014363765716553 }, { "auxiliary_loss_clip": 0.06475221, "auxiliary_loss_mlp": 0.01273005, "balance_loss_clip": 0.06298316, "balance_loss_mlp": 0.01258998, "epoch": 0.37715316398617166, "flos": 22754594943360.0, "grad_norm": 8.106393279415654, "language_loss": 0.74672711, "learning_rate": 2.863479122159103e-06, "loss": 0.82420933, "num_input_tokens_seen": 134732635, "router_z_loss_clip": 1.76855469, "router_z_loss_mlp": 0.14019775, "step": 6273, "time_per_iteration": 2.5912585258483887 }, { "auxiliary_loss_clip": 0.0647677, "auxiliary_loss_mlp": 0.0126874, "balance_loss_clip": 0.06299005, "balance_loss_mlp": 0.01255007, "epoch": 0.3772132872388396, "flos": 18920246181120.0, "grad_norm": 1.476741709200579, "language_loss": 0.72252452, "learning_rate": 2.8631278120111858e-06, "loss": 0.79997957, "num_input_tokens_seen": 134750695, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.13726807, "step": 6274, "time_per_iteration": 2.5493147373199463 }, { "auxiliary_loss_clip": 0.0647984, "auxiliary_loss_mlp": 0.01272769, "balance_loss_clip": 0.0630092, "balance_loss_mlp": 0.01258845, "epoch": 0.3772734104915076, "flos": 17351709275520.0, "grad_norm": 1.7975418447526308, "language_loss": 0.84817153, "learning_rate": 2.8627764691333742e-06, "loss": 0.92569762, "num_input_tokens_seen": 134768935, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.13922119, "step": 6275, "time_per_iteration": 2.5686092376708984 }, { "auxiliary_loss_clip": 0.06474729, "auxiliary_loss_mlp": 0.01270435, "balance_loss_clip": 0.06300317, "balance_loss_mlp": 0.01257375, "epoch": 0.37733353374417555, "flos": 32350452848640.0, "grad_norm": 1.3364027782104149, "language_loss": 0.75927615, "learning_rate": 2.8624250935389935e-06, "loss": 0.83672774, "num_input_tokens_seen": 134791260, "router_z_loss_clip": 1.74511719, "router_z_loss_mlp": 0.1307373, "step": 6276, "time_per_iteration": 2.6427013874053955 }, { "auxiliary_loss_clip": 0.06479782, "auxiliary_loss_mlp": 0.01271697, "balance_loss_clip": 0.06298801, "balance_loss_mlp": 0.01256671, "epoch": 0.3773936569968435, "flos": 23366225422080.0, "grad_norm": 2.323840505733224, "language_loss": 0.85943389, "learning_rate": 2.862073685241366e-06, "loss": 0.93694866, "num_input_tokens_seen": 134808350, "router_z_loss_clip": 1.80957031, "router_z_loss_mlp": 0.15020752, "step": 6277, "time_per_iteration": 2.555820941925049 }, { "auxiliary_loss_clip": 0.0647196, "auxiliary_loss_mlp": 0.01270732, "balance_loss_clip": 0.06299491, "balance_loss_mlp": 0.0125657, "epoch": 0.3774537802495115, "flos": 21472579474560.0, "grad_norm": 2.034495856006984, "language_loss": 0.78717017, "learning_rate": 2.861722244253818e-06, "loss": 0.86459714, "num_input_tokens_seen": 134826005, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.14160156, "step": 6278, "time_per_iteration": 2.543499708175659 }, { "auxiliary_loss_clip": 0.06483086, "auxiliary_loss_mlp": 0.01268876, "balance_loss_clip": 0.06298018, "balance_loss_mlp": 0.01252973, "epoch": 0.37751390350217945, "flos": 24980812945920.0, "grad_norm": 1.806613172994934, "language_loss": 0.83787692, "learning_rate": 2.8613707705896767e-06, "loss": 0.91539657, "num_input_tokens_seen": 134844995, "router_z_loss_clip": 1.84960938, "router_z_loss_mlp": 0.15905762, "step": 6279, "time_per_iteration": 2.5956387519836426 }, { "auxiliary_loss_clip": 0.06477754, "auxiliary_loss_mlp": 0.01270001, "balance_loss_clip": 0.06297669, "balance_loss_mlp": 0.01255767, "epoch": 0.3775740267548474, "flos": 27826585637760.0, "grad_norm": 1.6849429866443737, "language_loss": 0.75250196, "learning_rate": 2.861019264262269e-06, "loss": 0.82997954, "num_input_tokens_seen": 134865285, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.14245605, "step": 6280, "time_per_iteration": 2.5923447608947754 }, { "auxiliary_loss_clip": 0.06472294, "auxiliary_loss_mlp": 0.01272488, "balance_loss_clip": 0.06297275, "balance_loss_mlp": 0.01258636, "epoch": 0.3776341500075154, "flos": 22571845188480.0, "grad_norm": 2.0999411532998846, "language_loss": 0.76705956, "learning_rate": 2.8606677252849242e-06, "loss": 0.8445074, "num_input_tokens_seen": 134886535, "router_z_loss_clip": 1.74902344, "router_z_loss_mlp": 0.1383667, "step": 6281, "time_per_iteration": 2.565293312072754 }, { "auxiliary_loss_clip": 0.06469421, "auxiliary_loss_mlp": 0.01276354, "balance_loss_clip": 0.0629127, "balance_loss_mlp": 0.01262162, "epoch": 0.3776942732601834, "flos": 23084148251520.0, "grad_norm": 1.3720486409696335, "language_loss": 0.84668195, "learning_rate": 2.860316153670974e-06, "loss": 0.92413974, "num_input_tokens_seen": 134907435, "router_z_loss_clip": 1.78222656, "router_z_loss_mlp": 0.14178467, "step": 6282, "time_per_iteration": 2.573079824447632 }, { "auxiliary_loss_clip": 0.06471479, "auxiliary_loss_mlp": 0.01275835, "balance_loss_clip": 0.06296635, "balance_loss_mlp": 0.01261548, "epoch": 0.37775439651285136, "flos": 21730617722880.0, "grad_norm": 1.8119769775175616, "language_loss": 0.69979799, "learning_rate": 2.8599645494337484e-06, "loss": 0.77727115, "num_input_tokens_seen": 134925360, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.1428833, "step": 6283, "time_per_iteration": 2.5600345134735107 }, { "auxiliary_loss_clip": 0.06473342, "auxiliary_loss_mlp": 0.01280163, "balance_loss_clip": 0.06297785, "balance_loss_mlp": 0.01265167, "epoch": 0.37781451976551933, "flos": 23994542862720.0, "grad_norm": 1.880854899262301, "language_loss": 0.76879966, "learning_rate": 2.859612912586581e-06, "loss": 0.8463347, "num_input_tokens_seen": 134944205, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.14990234, "step": 6284, "time_per_iteration": 2.613530158996582 }, { "auxiliary_loss_clip": 0.064854, "auxiliary_loss_mlp": 0.01276414, "balance_loss_clip": 0.06299448, "balance_loss_mlp": 0.01260535, "epoch": 0.3778746430181873, "flos": 13731821838720.0, "grad_norm": 1.9234576055902775, "language_loss": 0.86522514, "learning_rate": 2.8592612431428055e-06, "loss": 0.9428432, "num_input_tokens_seen": 134960255, "router_z_loss_clip": 1.85742188, "router_z_loss_mlp": 0.15856934, "step": 6285, "time_per_iteration": 2.571988105773926 }, { "auxiliary_loss_clip": 0.0647389, "auxiliary_loss_mlp": 0.01275918, "balance_loss_clip": 0.06292256, "balance_loss_mlp": 0.01260302, "epoch": 0.37793476627085526, "flos": 19466021750400.0, "grad_norm": 1.7279989704048349, "language_loss": 0.84723043, "learning_rate": 2.858909541115758e-06, "loss": 0.92472851, "num_input_tokens_seen": 134978605, "router_z_loss_clip": 1.81738281, "router_z_loss_mlp": 0.15625, "step": 6286, "time_per_iteration": 2.560852527618408 }, { "auxiliary_loss_clip": 0.06472579, "auxiliary_loss_mlp": 0.01273084, "balance_loss_clip": 0.06292064, "balance_loss_mlp": 0.01256824, "epoch": 0.3779948895235232, "flos": 10711600945920.0, "grad_norm": 2.238435167410292, "language_loss": 0.82089847, "learning_rate": 2.858557806518775e-06, "loss": 0.89835513, "num_input_tokens_seen": 134995020, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.16271973, "step": 6287, "time_per_iteration": 2.542518377304077 }, { "auxiliary_loss_clip": 0.06474738, "auxiliary_loss_mlp": 0.01274985, "balance_loss_clip": 0.06294924, "balance_loss_mlp": 0.01258737, "epoch": 0.3780550127761912, "flos": 22316616051840.0, "grad_norm": 2.0780909712979927, "language_loss": 0.73381346, "learning_rate": 2.8582060393651927e-06, "loss": 0.81131065, "num_input_tokens_seen": 135012620, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.16229248, "step": 6288, "time_per_iteration": 2.6279778480529785 }, { "auxiliary_loss_clip": 0.06475964, "auxiliary_loss_mlp": 0.01274862, "balance_loss_clip": 0.06296636, "balance_loss_mlp": 0.01258953, "epoch": 0.37811513602885916, "flos": 28958401463040.0, "grad_norm": 2.197168537268861, "language_loss": 0.76090795, "learning_rate": 2.857854239668352e-06, "loss": 0.83841622, "num_input_tokens_seen": 135033365, "router_z_loss_clip": 1.79199219, "router_z_loss_mlp": 0.15905762, "step": 6289, "time_per_iteration": 2.663389205932617 }, { "auxiliary_loss_clip": 0.06472103, "auxiliary_loss_mlp": 0.01274836, "balance_loss_clip": 0.06293533, "balance_loss_mlp": 0.01259208, "epoch": 0.3781752592815271, "flos": 23119717109760.0, "grad_norm": 3.6139289991745995, "language_loss": 0.74225628, "learning_rate": 2.857502407441593e-06, "loss": 0.81972563, "num_input_tokens_seen": 135052185, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.15612793, "step": 6290, "time_per_iteration": 2.5726537704467773 }, { "auxiliary_loss_clip": 0.06481753, "auxiliary_loss_mlp": 0.01276041, "balance_loss_clip": 0.06296501, "balance_loss_mlp": 0.01259054, "epoch": 0.3782353825341951, "flos": 19762102552320.0, "grad_norm": 2.2022794405384754, "language_loss": 0.79904509, "learning_rate": 2.8571505426982566e-06, "loss": 0.87662309, "num_input_tokens_seen": 135070425, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.1697998, "step": 6291, "time_per_iteration": 2.5419161319732666 }, { "auxiliary_loss_clip": 0.06483731, "auxiliary_loss_mlp": 0.01271504, "balance_loss_clip": 0.06301967, "balance_loss_mlp": 0.01255458, "epoch": 0.37829550578686305, "flos": 22056774940800.0, "grad_norm": 1.8139064934530849, "language_loss": 0.76530296, "learning_rate": 2.8567986454516854e-06, "loss": 0.84285533, "num_input_tokens_seen": 135090525, "router_z_loss_clip": 1.81738281, "router_z_loss_mlp": 0.16040039, "step": 6292, "time_per_iteration": 3.971102714538574 }, { "auxiliary_loss_clip": 0.06475782, "auxiliary_loss_mlp": 0.01270556, "balance_loss_clip": 0.0629494, "balance_loss_mlp": 0.01255787, "epoch": 0.378355629039531, "flos": 16475667638400.0, "grad_norm": 1.9845864729051967, "language_loss": 0.70211178, "learning_rate": 2.856446715715224e-06, "loss": 0.77957511, "num_input_tokens_seen": 135109575, "router_z_loss_clip": 1.80957031, "router_z_loss_mlp": 0.14782715, "step": 6293, "time_per_iteration": 2.534069299697876 }, { "auxiliary_loss_clip": 0.06477405, "auxiliary_loss_mlp": 0.01272956, "balance_loss_clip": 0.06299641, "balance_loss_mlp": 0.01257256, "epoch": 0.378415752292199, "flos": 19981050071040.0, "grad_norm": 3.8623770630658516, "language_loss": 0.71894598, "learning_rate": 2.8560947535022173e-06, "loss": 0.79644954, "num_input_tokens_seen": 135127000, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.15704346, "step": 6294, "time_per_iteration": 2.5441367626190186 }, { "auxiliary_loss_clip": 0.06489552, "auxiliary_loss_mlp": 0.01276342, "balance_loss_clip": 0.06302438, "balance_loss_mlp": 0.01260308, "epoch": 0.378475875544867, "flos": 14652614355840.0, "grad_norm": 2.088170908327833, "language_loss": 0.83013177, "learning_rate": 2.855742758826011e-06, "loss": 0.90779078, "num_input_tokens_seen": 135145285, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.16027832, "step": 6295, "time_per_iteration": 2.5133581161499023 }, { "auxiliary_loss_clip": 0.06487426, "auxiliary_loss_mlp": 0.01268783, "balance_loss_clip": 0.06305762, "balance_loss_mlp": 0.01253744, "epoch": 0.37853599879753497, "flos": 26658194705280.0, "grad_norm": 1.6157070028978298, "language_loss": 0.7170518, "learning_rate": 2.8553907316999547e-06, "loss": 0.7946139, "num_input_tokens_seen": 135165240, "router_z_loss_clip": 1.81445312, "router_z_loss_mlp": 0.15032959, "step": 6296, "time_per_iteration": 4.029380798339844 }, { "auxiliary_loss_clip": 0.06477796, "auxiliary_loss_mlp": 0.01272028, "balance_loss_clip": 0.06303492, "balance_loss_mlp": 0.01258247, "epoch": 0.37859612205020293, "flos": 17317817498880.0, "grad_norm": 1.6994207808026054, "language_loss": 0.77186155, "learning_rate": 2.855038672137396e-06, "loss": 0.84935981, "num_input_tokens_seen": 135184045, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.13793945, "step": 6297, "time_per_iteration": 2.530865430831909 }, { "auxiliary_loss_clip": 0.06487577, "auxiliary_loss_mlp": 0.01273697, "balance_loss_clip": 0.06304886, "balance_loss_mlp": 0.0125895, "epoch": 0.3786562453028709, "flos": 18225780341760.0, "grad_norm": 2.227738567279439, "language_loss": 0.79899675, "learning_rate": 2.854686580151684e-06, "loss": 0.8766095, "num_input_tokens_seen": 135202365, "router_z_loss_clip": 1.82910156, "router_z_loss_mlp": 0.14746094, "step": 6298, "time_per_iteration": 2.515878438949585 }, { "auxiliary_loss_clip": 0.06476485, "auxiliary_loss_mlp": 0.01277044, "balance_loss_clip": 0.06297445, "balance_loss_mlp": 0.01262107, "epoch": 0.37871636855553886, "flos": 21221207625600.0, "grad_norm": 1.9796645787478675, "language_loss": 0.85110033, "learning_rate": 2.8543344557561722e-06, "loss": 0.92863566, "num_input_tokens_seen": 135220955, "router_z_loss_clip": 1.79199219, "router_z_loss_mlp": 0.14941406, "step": 6299, "time_per_iteration": 2.5661864280700684 }, { "auxiliary_loss_clip": 0.0648832, "auxiliary_loss_mlp": 0.01272155, "balance_loss_clip": 0.06307359, "balance_loss_mlp": 0.012568, "epoch": 0.3787764918082068, "flos": 20957886570240.0, "grad_norm": 2.0430527315429328, "language_loss": 0.77030188, "learning_rate": 2.8539822989642116e-06, "loss": 0.84790665, "num_input_tokens_seen": 135239715, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.15356445, "step": 6300, "time_per_iteration": 2.53584885597229 }, { "auxiliary_loss_clip": 0.0649475, "auxiliary_loss_mlp": 0.01275178, "balance_loss_clip": 0.06306089, "balance_loss_mlp": 0.01258798, "epoch": 0.3788366150608748, "flos": 17313205524480.0, "grad_norm": 2.076612595648122, "language_loss": 0.82910335, "learning_rate": 2.8536301097891577e-06, "loss": 0.90680265, "num_input_tokens_seen": 135257035, "router_z_loss_clip": 1.88769531, "router_z_loss_mlp": 0.16357422, "step": 6301, "time_per_iteration": 2.525724172592163 }, { "auxiliary_loss_clip": 0.06489389, "auxiliary_loss_mlp": 0.01276413, "balance_loss_clip": 0.06306238, "balance_loss_mlp": 0.01260582, "epoch": 0.37889673831354276, "flos": 24317094355200.0, "grad_norm": 1.7320062344324383, "language_loss": 0.67941546, "learning_rate": 2.8532778882443636e-06, "loss": 0.75707352, "num_input_tokens_seen": 135275720, "router_z_loss_clip": 1.83007812, "router_z_loss_mlp": 0.1583252, "step": 6302, "time_per_iteration": 3.9349308013916016 }, { "auxiliary_loss_clip": 0.06484827, "auxiliary_loss_mlp": 0.01276861, "balance_loss_clip": 0.06305623, "balance_loss_mlp": 0.01261828, "epoch": 0.3789568615662107, "flos": 26690157838080.0, "grad_norm": 1.777590404561111, "language_loss": 0.68848419, "learning_rate": 2.8529256343431867e-06, "loss": 0.76610112, "num_input_tokens_seen": 135294140, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.15039062, "step": 6303, "time_per_iteration": 2.6012673377990723 }, { "auxiliary_loss_clip": 0.06479686, "auxiliary_loss_mlp": 0.01271563, "balance_loss_clip": 0.06297816, "balance_loss_mlp": 0.01256399, "epoch": 0.3790169848188787, "flos": 23591713559040.0, "grad_norm": 1.541713067319457, "language_loss": 0.77872729, "learning_rate": 2.8525733480989846e-06, "loss": 0.8562398, "num_input_tokens_seen": 135314845, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.15155029, "step": 6304, "time_per_iteration": 2.679760217666626 }, { "auxiliary_loss_clip": 0.06496654, "auxiliary_loss_mlp": 0.01274214, "balance_loss_clip": 0.06308216, "balance_loss_mlp": 0.01257275, "epoch": 0.37907710807154665, "flos": 18442547654400.0, "grad_norm": 1.9486486522635962, "language_loss": 0.80261552, "learning_rate": 2.8522210295251146e-06, "loss": 0.88032424, "num_input_tokens_seen": 135333055, "router_z_loss_clip": 1.88476562, "router_z_loss_mlp": 0.16931152, "step": 6305, "time_per_iteration": 2.5333428382873535 }, { "auxiliary_loss_clip": 0.06405625, "auxiliary_loss_mlp": 0.01265973, "balance_loss_clip": 0.06325226, "balance_loss_mlp": 0.01261684, "epoch": 0.3791372313242146, "flos": 50123690887680.0, "grad_norm": 0.945366648110055, "language_loss": 0.64425135, "learning_rate": 2.8518686786349387e-06, "loss": 0.72096729, "num_input_tokens_seen": 135387865, "router_z_loss_clip": 0.8046875, "router_z_loss_mlp": 0.04293823, "step": 6306, "time_per_iteration": 3.110062599182129 }, { "auxiliary_loss_clip": 0.06485321, "auxiliary_loss_mlp": 0.01279898, "balance_loss_clip": 0.0630165, "balance_loss_mlp": 0.01263828, "epoch": 0.3791973545768826, "flos": 24323467265280.0, "grad_norm": 2.633101893748681, "language_loss": 0.73885363, "learning_rate": 2.851516295441817e-06, "loss": 0.81650579, "num_input_tokens_seen": 135409095, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.16064453, "step": 6307, "time_per_iteration": 4.024107933044434 }, { "auxiliary_loss_clip": 0.06481252, "auxiliary_loss_mlp": 0.01268504, "balance_loss_clip": 0.06295969, "balance_loss_mlp": 0.01253388, "epoch": 0.3792574778295506, "flos": 21586329792000.0, "grad_norm": 1.585345419462225, "language_loss": 0.78743637, "learning_rate": 2.851163879959112e-06, "loss": 0.86493391, "num_input_tokens_seen": 135429585, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.15124512, "step": 6308, "time_per_iteration": 2.5627059936523438 }, { "auxiliary_loss_clip": 0.06480345, "auxiliary_loss_mlp": 0.01271722, "balance_loss_clip": 0.06297289, "balance_loss_mlp": 0.01255653, "epoch": 0.37931760108221857, "flos": 22279202403840.0, "grad_norm": 6.944460378268034, "language_loss": 0.73205423, "learning_rate": 2.8508114322001876e-06, "loss": 0.8095749, "num_input_tokens_seen": 135446320, "router_z_loss_clip": 1.82910156, "router_z_loss_mlp": 0.16064453, "step": 6309, "time_per_iteration": 2.689592123031616 }, { "auxiliary_loss_clip": 0.06473482, "auxiliary_loss_mlp": 0.01275375, "balance_loss_clip": 0.06294823, "balance_loss_mlp": 0.0126076, "epoch": 0.37937772433488653, "flos": 19689161973120.0, "grad_norm": 2.2846954172436558, "language_loss": 0.79424751, "learning_rate": 2.8504589521784083e-06, "loss": 0.87173605, "num_input_tokens_seen": 135465720, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.14611816, "step": 6310, "time_per_iteration": 2.547020196914673 }, { "auxiliary_loss_clip": 0.06477315, "auxiliary_loss_mlp": 0.01272415, "balance_loss_clip": 0.06294855, "balance_loss_mlp": 0.01256858, "epoch": 0.3794378475875545, "flos": 19105469631360.0, "grad_norm": 1.8068362653576304, "language_loss": 0.76755786, "learning_rate": 2.8501064399071403e-06, "loss": 0.8450551, "num_input_tokens_seen": 135485155, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.15551758, "step": 6311, "time_per_iteration": 2.556255340576172 }, { "auxiliary_loss_clip": 0.06480576, "auxiliary_loss_mlp": 0.01280762, "balance_loss_clip": 0.06298929, "balance_loss_mlp": 0.01265771, "epoch": 0.37949797084022246, "flos": 20345920675200.0, "grad_norm": 1.6807069175138183, "language_loss": 0.713678, "learning_rate": 2.8497538953997504e-06, "loss": 0.79129136, "num_input_tokens_seen": 135502675, "router_z_loss_clip": 1.81738281, "router_z_loss_mlp": 0.14984131, "step": 6312, "time_per_iteration": 2.5425610542297363 }, { "auxiliary_loss_clip": 0.06381261, "auxiliary_loss_mlp": 0.01258374, "balance_loss_clip": 0.063012, "balance_loss_mlp": 0.01254363, "epoch": 0.37955809409289043, "flos": 63991121760000.0, "grad_norm": 0.7572267232800105, "language_loss": 0.55923158, "learning_rate": 2.849401318669608e-06, "loss": 0.63562793, "num_input_tokens_seen": 135562005, "router_z_loss_clip": 0.79931641, "router_z_loss_mlp": 0.0401001, "step": 6313, "time_per_iteration": 3.185438394546509 }, { "auxiliary_loss_clip": 0.06476942, "auxiliary_loss_mlp": 0.01270091, "balance_loss_clip": 0.06294118, "balance_loss_mlp": 0.01255762, "epoch": 0.3796182173455584, "flos": 31548777310080.0, "grad_norm": 1.644007469410175, "language_loss": 0.71504837, "learning_rate": 2.849048709730083e-06, "loss": 0.79251873, "num_input_tokens_seen": 135582600, "router_z_loss_clip": 1.83007812, "router_z_loss_mlp": 0.14331055, "step": 6314, "time_per_iteration": 2.633265972137451 }, { "auxiliary_loss_clip": 0.06480955, "auxiliary_loss_mlp": 0.0127505, "balance_loss_clip": 0.06296601, "balance_loss_mlp": 0.01258945, "epoch": 0.37967834059822636, "flos": 12135766066560.0, "grad_norm": 1.8954533851130702, "language_loss": 0.74175894, "learning_rate": 2.848696068594545e-06, "loss": 0.81931901, "num_input_tokens_seen": 135600280, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.16101074, "step": 6315, "time_per_iteration": 2.602987289428711 }, { "auxiliary_loss_clip": 0.06468953, "auxiliary_loss_mlp": 0.01271029, "balance_loss_clip": 0.06288928, "balance_loss_mlp": 0.01255496, "epoch": 0.3797384638508943, "flos": 39357989331840.0, "grad_norm": 2.8322630461450933, "language_loss": 0.71466041, "learning_rate": 2.8483433952763677e-06, "loss": 0.7920602, "num_input_tokens_seen": 135621560, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.15539551, "step": 6316, "time_per_iteration": 2.7317075729370117 }, { "auxiliary_loss_clip": 0.06475257, "auxiliary_loss_mlp": 0.01272667, "balance_loss_clip": 0.06293968, "balance_loss_mlp": 0.01257564, "epoch": 0.3797985871035623, "flos": 34061852165760.0, "grad_norm": 2.0200794704546756, "language_loss": 0.65798271, "learning_rate": 2.847990689788923e-06, "loss": 0.73546195, "num_input_tokens_seen": 135641745, "router_z_loss_clip": 1.81347656, "router_z_loss_mlp": 0.15100098, "step": 6317, "time_per_iteration": 2.6645925045013428 }, { "auxiliary_loss_clip": 0.06465238, "auxiliary_loss_mlp": 0.01274986, "balance_loss_clip": 0.06288657, "balance_loss_mlp": 0.01260645, "epoch": 0.37985871035623026, "flos": 23228939306880.0, "grad_norm": 2.305627937829806, "language_loss": 0.86284161, "learning_rate": 2.8476379521455877e-06, "loss": 0.9402439, "num_input_tokens_seen": 135660650, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.14318848, "step": 6318, "time_per_iteration": 2.603682041168213 }, { "auxiliary_loss_clip": 0.06480621, "auxiliary_loss_mlp": 0.01273303, "balance_loss_clip": 0.0629659, "balance_loss_mlp": 0.01258033, "epoch": 0.3799188336088982, "flos": 18121002410880.0, "grad_norm": 2.939543818696402, "language_loss": 0.76144534, "learning_rate": 2.8472851823597354e-06, "loss": 0.83898461, "num_input_tokens_seen": 135679980, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.152771, "step": 6319, "time_per_iteration": 2.5577313899993896 }, { "auxiliary_loss_clip": 0.06471335, "auxiliary_loss_mlp": 0.01272477, "balance_loss_clip": 0.06291451, "balance_loss_mlp": 0.01257122, "epoch": 0.3799789568615662, "flos": 21878385598080.0, "grad_norm": 1.6882079923900795, "language_loss": 0.64510345, "learning_rate": 2.846932380444744e-06, "loss": 0.72254157, "num_input_tokens_seen": 135699400, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.15356445, "step": 6320, "time_per_iteration": 2.593280792236328 }, { "auxiliary_loss_clip": 0.06466894, "auxiliary_loss_mlp": 0.01274986, "balance_loss_clip": 0.06286612, "balance_loss_mlp": 0.01260097, "epoch": 0.3800390801142342, "flos": 32971181495040.0, "grad_norm": 2.05545353906217, "language_loss": 0.71422923, "learning_rate": 2.846579546413992e-06, "loss": 0.79164803, "num_input_tokens_seen": 135723455, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.14880371, "step": 6321, "time_per_iteration": 2.6739964485168457 }, { "auxiliary_loss_clip": 0.06470338, "auxiliary_loss_mlp": 0.01272861, "balance_loss_clip": 0.06289081, "balance_loss_mlp": 0.01257811, "epoch": 0.38009920336690217, "flos": 26914430090880.0, "grad_norm": 1.6277007183970467, "language_loss": 0.75120991, "learning_rate": 2.846226680280859e-06, "loss": 0.82864189, "num_input_tokens_seen": 135744335, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.15057373, "step": 6322, "time_per_iteration": 2.632674217224121 }, { "auxiliary_loss_clip": 0.0646452, "auxiliary_loss_mlp": 0.01272793, "balance_loss_clip": 0.06286603, "balance_loss_mlp": 0.01258339, "epoch": 0.38015932661957014, "flos": 22494963467520.0, "grad_norm": 1.9459855416699854, "language_loss": 0.85249949, "learning_rate": 2.845873782058725e-06, "loss": 0.92987257, "num_input_tokens_seen": 135761440, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.14447021, "step": 6323, "time_per_iteration": 2.5423424243927 }, { "auxiliary_loss_clip": 0.06467995, "auxiliary_loss_mlp": 0.01276363, "balance_loss_clip": 0.06286021, "balance_loss_mlp": 0.01260901, "epoch": 0.3802194498722381, "flos": 21987440087040.0, "grad_norm": 1.8461668317357518, "language_loss": 0.73441517, "learning_rate": 2.845520851760973e-06, "loss": 0.81185877, "num_input_tokens_seen": 135779955, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.15454102, "step": 6324, "time_per_iteration": 2.578570604324341 }, { "auxiliary_loss_clip": 0.06474161, "auxiliary_loss_mlp": 0.01271651, "balance_loss_clip": 0.06290218, "balance_loss_mlp": 0.01256416, "epoch": 0.38027957312490607, "flos": 21331310290560.0, "grad_norm": 2.5146319398110886, "language_loss": 0.84826082, "learning_rate": 2.8451678894009847e-06, "loss": 0.9257189, "num_input_tokens_seen": 135799840, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.15240479, "step": 6325, "time_per_iteration": 2.5591366291046143 }, { "auxiliary_loss_clip": 0.06470838, "auxiliary_loss_mlp": 0.01268606, "balance_loss_clip": 0.06290531, "balance_loss_mlp": 0.01254611, "epoch": 0.38033969637757403, "flos": 16696921144320.0, "grad_norm": 2.1074027744398807, "language_loss": 0.80151176, "learning_rate": 2.8448148949921465e-06, "loss": 0.87890619, "num_input_tokens_seen": 135817880, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.13989258, "step": 6326, "time_per_iteration": 2.570890426635742 }, { "auxiliary_loss_clip": 0.06465429, "auxiliary_loss_mlp": 0.01275518, "balance_loss_clip": 0.06287502, "balance_loss_mlp": 0.01261005, "epoch": 0.380399819630242, "flos": 36219741563520.0, "grad_norm": 1.7826824273237982, "language_loss": 0.73673487, "learning_rate": 2.844461868547842e-06, "loss": 0.81414431, "num_input_tokens_seen": 135838940, "router_z_loss_clip": 1.77539062, "router_z_loss_mlp": 0.14520264, "step": 6327, "time_per_iteration": 2.661409378051758 }, { "auxiliary_loss_clip": 0.06466573, "auxiliary_loss_mlp": 0.01269218, "balance_loss_clip": 0.06288725, "balance_loss_mlp": 0.01255473, "epoch": 0.38045994288290996, "flos": 21295364088960.0, "grad_norm": 1.6784501822264015, "language_loss": 0.83267015, "learning_rate": 2.844108810081459e-06, "loss": 0.91002804, "num_input_tokens_seen": 135858325, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.13739014, "step": 6328, "time_per_iteration": 2.6039628982543945 }, { "auxiliary_loss_clip": 0.06461404, "auxiliary_loss_mlp": 0.01271566, "balance_loss_clip": 0.06283505, "balance_loss_mlp": 0.0125769, "epoch": 0.38052006613557793, "flos": 20929151819520.0, "grad_norm": 363.8792423584016, "language_loss": 0.61631757, "learning_rate": 2.843755719606385e-06, "loss": 0.69364727, "num_input_tokens_seen": 135878430, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.13903809, "step": 6329, "time_per_iteration": 2.576641082763672 }, { "auxiliary_loss_clip": 0.06467907, "auxiliary_loss_mlp": 0.01272461, "balance_loss_clip": 0.06289197, "balance_loss_mlp": 0.01257357, "epoch": 0.3805801893882459, "flos": 20996138759040.0, "grad_norm": 1.8819277108610548, "language_loss": 0.56307006, "learning_rate": 2.8434025971360104e-06, "loss": 0.64047372, "num_input_tokens_seen": 135894755, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.15112305, "step": 6330, "time_per_iteration": 2.5794837474823 }, { "auxiliary_loss_clip": 0.06460899, "auxiliary_loss_mlp": 0.01272337, "balance_loss_clip": 0.06286708, "balance_loss_mlp": 0.01258688, "epoch": 0.38064031264091386, "flos": 25565972734080.0, "grad_norm": 1.3464414979505985, "language_loss": 0.65686864, "learning_rate": 2.8430494426837243e-06, "loss": 0.73420101, "num_input_tokens_seen": 135918275, "router_z_loss_clip": 1.74023438, "router_z_loss_mlp": 0.13635254, "step": 6331, "time_per_iteration": 2.653607130050659 }, { "auxiliary_loss_clip": 0.06466436, "auxiliary_loss_mlp": 0.01269532, "balance_loss_clip": 0.06288536, "balance_loss_mlp": 0.01253391, "epoch": 0.3807004358935818, "flos": 15091264080000.0, "grad_norm": 1.491640273868615, "language_loss": 0.76245332, "learning_rate": 2.842696256262919e-06, "loss": 0.83981299, "num_input_tokens_seen": 135937430, "router_z_loss_clip": 1.77636719, "router_z_loss_mlp": 0.16137695, "step": 6332, "time_per_iteration": 4.02353310585022 }, { "auxiliary_loss_clip": 0.06471813, "auxiliary_loss_mlp": 0.01272631, "balance_loss_clip": 0.06291094, "balance_loss_mlp": 0.01257849, "epoch": 0.3807605591462498, "flos": 16405033046400.0, "grad_norm": 1.831377065519337, "language_loss": 0.82060766, "learning_rate": 2.842343037886987e-06, "loss": 0.8980521, "num_input_tokens_seen": 135954210, "router_z_loss_clip": 1.80859375, "router_z_loss_mlp": 0.14794922, "step": 6333, "time_per_iteration": 2.5416736602783203 }, { "auxiliary_loss_clip": 0.06473102, "auxiliary_loss_mlp": 0.01269072, "balance_loss_clip": 0.06293958, "balance_loss_mlp": 0.01254868, "epoch": 0.3808206823989178, "flos": 29064353351040.0, "grad_norm": 1.4959103580293913, "language_loss": 0.86442053, "learning_rate": 2.8419897875693226e-06, "loss": 0.94184232, "num_input_tokens_seen": 135974425, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.14202881, "step": 6334, "time_per_iteration": 2.6530168056488037 }, { "auxiliary_loss_clip": 0.06469948, "auxiliary_loss_mlp": 0.0126861, "balance_loss_clip": 0.06290718, "balance_loss_mlp": 0.01254889, "epoch": 0.3808808056515858, "flos": 15711321893760.0, "grad_norm": 1.6787316984782317, "language_loss": 0.79778612, "learning_rate": 2.841636505323321e-06, "loss": 0.87517172, "num_input_tokens_seen": 135991985, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.137146, "step": 6335, "time_per_iteration": 3.970670223236084 }, { "auxiliary_loss_clip": 0.06467265, "auxiliary_loss_mlp": 0.01269854, "balance_loss_clip": 0.06286662, "balance_loss_mlp": 0.01254679, "epoch": 0.38094092890425374, "flos": 20710917060480.0, "grad_norm": 2.017825791992308, "language_loss": 0.72808999, "learning_rate": 2.8412831911623795e-06, "loss": 0.80546117, "num_input_tokens_seen": 136010015, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.15185547, "step": 6336, "time_per_iteration": 2.574496030807495 }, { "auxiliary_loss_clip": 0.0646519, "auxiliary_loss_mlp": 0.01267779, "balance_loss_clip": 0.06288153, "balance_loss_mlp": 0.01253795, "epoch": 0.3810010521569217, "flos": 20674258099200.0, "grad_norm": 3.7477583613023775, "language_loss": 0.70039302, "learning_rate": 2.840929845099894e-06, "loss": 0.77772272, "num_input_tokens_seen": 136028440, "router_z_loss_clip": 1.76855469, "router_z_loss_mlp": 0.13989258, "step": 6337, "time_per_iteration": 2.5638837814331055 }, { "auxiliary_loss_clip": 0.06473081, "auxiliary_loss_mlp": 0.0127208, "balance_loss_clip": 0.06294522, "balance_loss_mlp": 0.01257834, "epoch": 0.38106117540958967, "flos": 31834963330560.0, "grad_norm": 2.797165963125935, "language_loss": 0.63854206, "learning_rate": 2.8405764671492652e-06, "loss": 0.71599364, "num_input_tokens_seen": 136048360, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.14233398, "step": 6338, "time_per_iteration": 2.6591203212738037 }, { "auxiliary_loss_clip": 0.06471688, "auxiliary_loss_mlp": 0.01271101, "balance_loss_clip": 0.06289152, "balance_loss_mlp": 0.01256033, "epoch": 0.38112129866225763, "flos": 16907231692800.0, "grad_norm": 13.296215124540312, "language_loss": 0.69779956, "learning_rate": 2.8402230573238923e-06, "loss": 0.77522749, "num_input_tokens_seen": 136065500, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.15075684, "step": 6339, "time_per_iteration": 2.5259389877319336 }, { "auxiliary_loss_clip": 0.06468302, "auxiliary_loss_mlp": 0.01273576, "balance_loss_clip": 0.06289072, "balance_loss_mlp": 0.01259557, "epoch": 0.3811814219149256, "flos": 20893624888320.0, "grad_norm": 1.9888183371694914, "language_loss": 0.6831609, "learning_rate": 2.839869615637177e-06, "loss": 0.76057965, "num_input_tokens_seen": 136084060, "router_z_loss_clip": 1.79101562, "router_z_loss_mlp": 0.14019775, "step": 6340, "time_per_iteration": 2.557269811630249 }, { "auxiliary_loss_clip": 0.06476302, "auxiliary_loss_mlp": 0.01272761, "balance_loss_clip": 0.06294662, "balance_loss_mlp": 0.01257502, "epoch": 0.38124154516759357, "flos": 16696418019840.0, "grad_norm": 1.891501811672291, "language_loss": 0.89776707, "learning_rate": 2.839516142102522e-06, "loss": 0.97525775, "num_input_tokens_seen": 136102310, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.15252686, "step": 6341, "time_per_iteration": 3.962191104888916 }, { "auxiliary_loss_clip": 0.06479006, "auxiliary_loss_mlp": 0.01275729, "balance_loss_clip": 0.06296582, "balance_loss_mlp": 0.0125954, "epoch": 0.38130166842026153, "flos": 19687946088960.0, "grad_norm": 1.8251449491662786, "language_loss": 0.7525835, "learning_rate": 2.83916263673333e-06, "loss": 0.83013082, "num_input_tokens_seen": 136120725, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.16186523, "step": 6342, "time_per_iteration": 2.5731706619262695 }, { "auxiliary_loss_clip": 0.0647492, "auxiliary_loss_mlp": 0.01272409, "balance_loss_clip": 0.06297815, "balance_loss_mlp": 0.01258343, "epoch": 0.3813617916729295, "flos": 22204668597120.0, "grad_norm": 1.6660639619113098, "language_loss": 0.83846003, "learning_rate": 2.838809099543007e-06, "loss": 0.91593325, "num_input_tokens_seen": 136139105, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.14044189, "step": 6343, "time_per_iteration": 2.5471558570861816 }, { "auxiliary_loss_clip": 0.06474584, "auxiliary_loss_mlp": 0.0127578, "balance_loss_clip": 0.06294047, "balance_loss_mlp": 0.01261004, "epoch": 0.38142191492559746, "flos": 19102576665600.0, "grad_norm": 2.2107361768774423, "language_loss": 0.77301383, "learning_rate": 2.838455530544959e-06, "loss": 0.85051751, "num_input_tokens_seen": 136158265, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.14776611, "step": 6344, "time_per_iteration": 2.584024429321289 }, { "auxiliary_loss_clip": 0.06481566, "auxiliary_loss_mlp": 0.01272009, "balance_loss_clip": 0.06302874, "balance_loss_mlp": 0.01257632, "epoch": 0.3814820381782654, "flos": 24104645527680.0, "grad_norm": 1.8295370114855365, "language_loss": 0.73893487, "learning_rate": 2.838101929752593e-06, "loss": 0.81647062, "num_input_tokens_seen": 136176100, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.1439209, "step": 6345, "time_per_iteration": 2.565638303756714 }, { "auxiliary_loss_clip": 0.06475012, "auxiliary_loss_mlp": 0.01275215, "balance_loss_clip": 0.06297866, "balance_loss_mlp": 0.0126091, "epoch": 0.3815421614309334, "flos": 15783927056640.0, "grad_norm": 2.018746592456695, "language_loss": 0.70114994, "learning_rate": 2.8377482971793187e-06, "loss": 0.77865219, "num_input_tokens_seen": 136195125, "router_z_loss_clip": 1.76953125, "router_z_loss_mlp": 0.14318848, "step": 6346, "time_per_iteration": 2.5709662437438965 }, { "auxiliary_loss_clip": 0.06484661, "auxiliary_loss_mlp": 0.01273126, "balance_loss_clip": 0.06302534, "balance_loss_mlp": 0.01258517, "epoch": 0.38160228468360136, "flos": 19905593869440.0, "grad_norm": 1.6761809083702104, "language_loss": 0.76055717, "learning_rate": 2.8373946328385437e-06, "loss": 0.83813506, "num_input_tokens_seen": 136213885, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.14587402, "step": 6347, "time_per_iteration": 4.020709276199341 }, { "auxiliary_loss_clip": 0.06477453, "auxiliary_loss_mlp": 0.01281244, "balance_loss_clip": 0.0629759, "balance_loss_mlp": 0.01267242, "epoch": 0.3816624079362694, "flos": 19287045429120.0, "grad_norm": 1.588578385016413, "language_loss": 0.7547757, "learning_rate": 2.8370409367436813e-06, "loss": 0.83236265, "num_input_tokens_seen": 136232700, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.14007568, "step": 6348, "time_per_iteration": 2.5933454036712646 }, { "auxiliary_loss_clip": 0.0648157, "auxiliary_loss_mlp": 0.01271578, "balance_loss_clip": 0.06301425, "balance_loss_mlp": 0.01257613, "epoch": 0.38172253118893734, "flos": 21183752050560.0, "grad_norm": 4.337213701858645, "language_loss": 0.88017786, "learning_rate": 2.836687208908142e-06, "loss": 0.95770931, "num_input_tokens_seen": 136248975, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.1395874, "step": 6349, "time_per_iteration": 2.5561625957489014 }, { "auxiliary_loss_clip": 0.0647897, "auxiliary_loss_mlp": 0.01273778, "balance_loss_clip": 0.06300513, "balance_loss_mlp": 0.01258603, "epoch": 0.3817826544416053, "flos": 17534836373760.0, "grad_norm": 1.7188875556512837, "language_loss": 0.77170944, "learning_rate": 2.836333449345341e-06, "loss": 0.84923697, "num_input_tokens_seen": 136266710, "router_z_loss_clip": 1.78417969, "router_z_loss_mlp": 0.1517334, "step": 6350, "time_per_iteration": 2.5739622116088867 }, { "auxiliary_loss_clip": 0.06478064, "auxiliary_loss_mlp": 0.01274408, "balance_loss_clip": 0.06300116, "balance_loss_mlp": 0.01259441, "epoch": 0.38184277769427327, "flos": 16332176321280.0, "grad_norm": 2.7080605060982954, "language_loss": 0.76975936, "learning_rate": 2.8359796580686907e-06, "loss": 0.84728408, "num_input_tokens_seen": 136284445, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.14959717, "step": 6351, "time_per_iteration": 2.581075668334961 }, { "auxiliary_loss_clip": 0.06484547, "auxiliary_loss_mlp": 0.01275425, "balance_loss_clip": 0.0630458, "balance_loss_mlp": 0.01259094, "epoch": 0.38190290094694124, "flos": 30450937115520.0, "grad_norm": 1.6286756553530304, "language_loss": 0.74784648, "learning_rate": 2.8356258350916085e-06, "loss": 0.82544619, "num_input_tokens_seen": 136305730, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.16320801, "step": 6352, "time_per_iteration": 2.70160174369812 }, { "auxiliary_loss_clip": 0.06477468, "auxiliary_loss_mlp": 0.0127519, "balance_loss_clip": 0.06300765, "balance_loss_mlp": 0.01260873, "epoch": 0.3819630241996092, "flos": 14215138588800.0, "grad_norm": 1.8015126100145054, "language_loss": 0.64466262, "learning_rate": 2.8352719804275104e-06, "loss": 0.72218919, "num_input_tokens_seen": 136323850, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.14318848, "step": 6353, "time_per_iteration": 2.536482810974121 }, { "auxiliary_loss_clip": 0.06479818, "auxiliary_loss_mlp": 0.01273939, "balance_loss_clip": 0.06300136, "balance_loss_mlp": 0.01260212, "epoch": 0.38202314745227717, "flos": 25016717220480.0, "grad_norm": 1.7989181463635175, "language_loss": 0.84048128, "learning_rate": 2.834918094089816e-06, "loss": 0.91801876, "num_input_tokens_seen": 136344880, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.1373291, "step": 6354, "time_per_iteration": 2.566011905670166 }, { "auxiliary_loss_clip": 0.06479298, "auxiliary_loss_mlp": 0.01268883, "balance_loss_clip": 0.06305475, "balance_loss_mlp": 0.01255729, "epoch": 0.38208327070494513, "flos": 20820935871360.0, "grad_norm": 1.7009888813676501, "language_loss": 0.81161088, "learning_rate": 2.834564176091943e-06, "loss": 0.88909268, "num_input_tokens_seen": 136366060, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.1315918, "step": 6355, "time_per_iteration": 2.600450277328491 }, { "auxiliary_loss_clip": 0.06475097, "auxiliary_loss_mlp": 0.01270171, "balance_loss_clip": 0.0629793, "balance_loss_mlp": 0.01255055, "epoch": 0.3821433939576131, "flos": 22644282643200.0, "grad_norm": 1.9555853430435257, "language_loss": 0.75916165, "learning_rate": 2.8342102264473125e-06, "loss": 0.83661437, "num_input_tokens_seen": 136385625, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.15112305, "step": 6356, "time_per_iteration": 2.560908317565918 }, { "auxiliary_loss_clip": 0.06479525, "auxiliary_loss_mlp": 0.01269302, "balance_loss_clip": 0.06298781, "balance_loss_mlp": 0.01253721, "epoch": 0.38220351721028106, "flos": 26877100296960.0, "grad_norm": 1.850364129570066, "language_loss": 0.81892586, "learning_rate": 2.833856245169348e-06, "loss": 0.8964141, "num_input_tokens_seen": 136405750, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.15563965, "step": 6357, "time_per_iteration": 2.7663216590881348 }, { "auxiliary_loss_clip": 0.06479493, "auxiliary_loss_mlp": 0.01272463, "balance_loss_clip": 0.06300451, "balance_loss_mlp": 0.01256799, "epoch": 0.38226364046294903, "flos": 23374149632640.0, "grad_norm": 2.867872163011898, "language_loss": 0.7781474, "learning_rate": 2.8335022322714695e-06, "loss": 0.85566694, "num_input_tokens_seen": 136426085, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.15661621, "step": 6358, "time_per_iteration": 2.6453890800476074 }, { "auxiliary_loss_clip": 0.06481181, "auxiliary_loss_mlp": 0.01273457, "balance_loss_clip": 0.06299607, "balance_loss_mlp": 0.0125827, "epoch": 0.382323763715617, "flos": 19652335303680.0, "grad_norm": 9.0645083812043, "language_loss": 0.78808248, "learning_rate": 2.8331481877671036e-06, "loss": 0.8656289, "num_input_tokens_seen": 136442670, "router_z_loss_clip": 1.81347656, "router_z_loss_mlp": 0.15185547, "step": 6359, "time_per_iteration": 2.648613929748535 }, { "auxiliary_loss_clip": 0.06475115, "auxiliary_loss_mlp": 0.01272032, "balance_loss_clip": 0.06299032, "balance_loss_mlp": 0.0125719, "epoch": 0.38238388696828496, "flos": 54136527575040.0, "grad_norm": 4.375528084058969, "language_loss": 0.70284069, "learning_rate": 2.8327941116696754e-06, "loss": 0.78031212, "num_input_tokens_seen": 136465730, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.1484375, "step": 6360, "time_per_iteration": 2.8227996826171875 }, { "auxiliary_loss_clip": 0.06475352, "auxiliary_loss_mlp": 0.0127649, "balance_loss_clip": 0.06299566, "balance_loss_mlp": 0.01260993, "epoch": 0.382444010220953, "flos": 24943105808640.0, "grad_norm": 2.573072937694622, "language_loss": 0.79122186, "learning_rate": 2.83244000399261e-06, "loss": 0.86874032, "num_input_tokens_seen": 136487215, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.1550293, "step": 6361, "time_per_iteration": 2.5823564529418945 }, { "auxiliary_loss_clip": 0.0646949, "auxiliary_loss_mlp": 0.01270136, "balance_loss_clip": 0.06295944, "balance_loss_mlp": 0.01256349, "epoch": 0.38250413347362094, "flos": 42346750216320.0, "grad_norm": 1.4042962923855022, "language_loss": 0.6562168, "learning_rate": 2.832085864749337e-06, "loss": 0.73361301, "num_input_tokens_seen": 136510365, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.13775635, "step": 6362, "time_per_iteration": 2.732422113418579 }, { "auxiliary_loss_clip": 0.06474015, "auxiliary_loss_mlp": 0.01269268, "balance_loss_clip": 0.06294518, "balance_loss_mlp": 0.01254176, "epoch": 0.3825642567262889, "flos": 16294720746240.0, "grad_norm": 2.3222992106761184, "language_loss": 0.82598352, "learning_rate": 2.8317316939532848e-06, "loss": 0.9034164, "num_input_tokens_seen": 136527100, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.15100098, "step": 6363, "time_per_iteration": 2.5200111865997314 }, { "auxiliary_loss_clip": 0.06471594, "auxiliary_loss_mlp": 0.01274148, "balance_loss_clip": 0.06296682, "balance_loss_mlp": 0.01259091, "epoch": 0.3826243799789569, "flos": 45664267795200.0, "grad_norm": 1.7443330264227033, "language_loss": 0.59560263, "learning_rate": 2.8313774916178825e-06, "loss": 0.67306012, "num_input_tokens_seen": 136550870, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.15063477, "step": 6364, "time_per_iteration": 2.779200792312622 }, { "auxiliary_loss_clip": 0.0647849, "auxiliary_loss_mlp": 0.0127702, "balance_loss_clip": 0.06297778, "balance_loss_mlp": 0.01261106, "epoch": 0.38268450323162484, "flos": 25308647245440.0, "grad_norm": 3.1874790536296294, "language_loss": 0.69337738, "learning_rate": 2.8310232577565635e-06, "loss": 0.77093244, "num_input_tokens_seen": 136569895, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.15905762, "step": 6365, "time_per_iteration": 2.584019660949707 }, { "auxiliary_loss_clip": 0.06476261, "auxiliary_loss_mlp": 0.01273489, "balance_loss_clip": 0.06292476, "balance_loss_mlp": 0.01257473, "epoch": 0.3827446264842928, "flos": 21842607104640.0, "grad_norm": 1.9825304212769468, "language_loss": 0.7351858, "learning_rate": 2.830668992382758e-06, "loss": 0.81268328, "num_input_tokens_seen": 136588585, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.16009521, "step": 6366, "time_per_iteration": 2.551110029220581 }, { "auxiliary_loss_clip": 0.06474307, "auxiliary_loss_mlp": 0.01270356, "balance_loss_clip": 0.06296176, "balance_loss_mlp": 0.01256301, "epoch": 0.38280474973696077, "flos": 25740924059520.0, "grad_norm": 2.4391040249499083, "language_loss": 0.68804651, "learning_rate": 2.830314695509902e-06, "loss": 0.76549315, "num_input_tokens_seen": 136606640, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.14050293, "step": 6367, "time_per_iteration": 2.59158992767334 }, { "auxiliary_loss_clip": 0.06470517, "auxiliary_loss_mlp": 0.01278898, "balance_loss_clip": 0.06298923, "balance_loss_mlp": 0.01264474, "epoch": 0.38286487298962874, "flos": 24902212216320.0, "grad_norm": 3.5821005725413224, "language_loss": 0.64370513, "learning_rate": 2.82996036715143e-06, "loss": 0.72119927, "num_input_tokens_seen": 136624940, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.14428711, "step": 6368, "time_per_iteration": 2.569596290588379 }, { "auxiliary_loss_clip": 0.06475138, "auxiliary_loss_mlp": 0.01275119, "balance_loss_clip": 0.06299566, "balance_loss_mlp": 0.01260516, "epoch": 0.3829249962422967, "flos": 28550457060480.0, "grad_norm": 1.3160954866861787, "language_loss": 0.68815899, "learning_rate": 2.8296060073207763e-06, "loss": 0.76566148, "num_input_tokens_seen": 136645540, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.14611816, "step": 6369, "time_per_iteration": 2.610614538192749 }, { "auxiliary_loss_clip": 0.06474319, "auxiliary_loss_mlp": 0.01271388, "balance_loss_clip": 0.06298092, "balance_loss_mlp": 0.01256493, "epoch": 0.38298511949496467, "flos": 21477736500480.0, "grad_norm": 1.6512026378989997, "language_loss": 0.78924584, "learning_rate": 2.8292516160313804e-06, "loss": 0.86670291, "num_input_tokens_seen": 136664530, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.14886475, "step": 6370, "time_per_iteration": 2.553884506225586 }, { "auxiliary_loss_clip": 0.06481662, "auxiliary_loss_mlp": 0.01277338, "balance_loss_clip": 0.06302077, "balance_loss_mlp": 0.01261746, "epoch": 0.38304524274763263, "flos": 31687027747200.0, "grad_norm": 2.959248480775013, "language_loss": 0.6556347, "learning_rate": 2.8288971932966805e-06, "loss": 0.73322469, "num_input_tokens_seen": 136682315, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.15588379, "step": 6371, "time_per_iteration": 4.002042055130005 }, { "auxiliary_loss_clip": 0.06480787, "auxiliary_loss_mlp": 0.01274392, "balance_loss_clip": 0.06298681, "balance_loss_mlp": 0.01258763, "epoch": 0.3831053660003006, "flos": 25082865619200.0, "grad_norm": 2.942956870647282, "language_loss": 0.73271394, "learning_rate": 2.8285427391301155e-06, "loss": 0.81026578, "num_input_tokens_seen": 136701185, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.15612793, "step": 6372, "time_per_iteration": 2.598179578781128 }, { "auxiliary_loss_clip": 0.06476044, "auxiliary_loss_mlp": 0.01271408, "balance_loss_clip": 0.06296553, "balance_loss_mlp": 0.0125634, "epoch": 0.38316548925296856, "flos": 23265849830400.0, "grad_norm": 1.7093855884877707, "language_loss": 0.85168004, "learning_rate": 2.8281882535451266e-06, "loss": 0.92915452, "num_input_tokens_seen": 136721265, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.15063477, "step": 6373, "time_per_iteration": 2.5484743118286133 }, { "auxiliary_loss_clip": 0.06477977, "auxiliary_loss_mlp": 0.01278002, "balance_loss_clip": 0.06297924, "balance_loss_mlp": 0.01262362, "epoch": 0.3832256125056366, "flos": 34432131358080.0, "grad_norm": 1.944288460933275, "language_loss": 0.75213009, "learning_rate": 2.8278337365551567e-06, "loss": 0.82968998, "num_input_tokens_seen": 136741885, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.15649414, "step": 6374, "time_per_iteration": 2.66341233253479 }, { "auxiliary_loss_clip": 0.06481533, "auxiliary_loss_mlp": 0.01273455, "balance_loss_clip": 0.06299451, "balance_loss_mlp": 0.01258268, "epoch": 0.38328573575830455, "flos": 21769289182080.0, "grad_norm": 2.027304077891625, "language_loss": 0.76439357, "learning_rate": 2.8274791881736485e-06, "loss": 0.84194344, "num_input_tokens_seen": 136760905, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.15197754, "step": 6375, "time_per_iteration": 3.971134901046753 }, { "auxiliary_loss_clip": 0.06474571, "auxiliary_loss_mlp": 0.01274383, "balance_loss_clip": 0.0629518, "balance_loss_mlp": 0.01259756, "epoch": 0.3833458590109725, "flos": 17385056000640.0, "grad_norm": 1.9668104366829242, "language_loss": 0.7322824, "learning_rate": 2.8271246084140457e-06, "loss": 0.8097719, "num_input_tokens_seen": 136777240, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.14624023, "step": 6376, "time_per_iteration": 2.516305446624756 }, { "auxiliary_loss_clip": 0.06469041, "auxiliary_loss_mlp": 0.01270139, "balance_loss_clip": 0.06292804, "balance_loss_mlp": 0.01254677, "epoch": 0.3834059822636405, "flos": 29432326556160.0, "grad_norm": 1.5289519253598105, "language_loss": 0.67735136, "learning_rate": 2.826769997289796e-06, "loss": 0.75474322, "num_input_tokens_seen": 136801040, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.15454102, "step": 6377, "time_per_iteration": 2.6324236392974854 }, { "auxiliary_loss_clip": 0.06481228, "auxiliary_loss_mlp": 0.01270528, "balance_loss_clip": 0.06297293, "balance_loss_mlp": 0.01255162, "epoch": 0.38346610551630844, "flos": 21477191448960.0, "grad_norm": 1.8406158607756236, "language_loss": 0.73253894, "learning_rate": 2.826415354814344e-06, "loss": 0.81005651, "num_input_tokens_seen": 136819495, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.15380859, "step": 6378, "time_per_iteration": 2.560469388961792 }, { "auxiliary_loss_clip": 0.06474416, "auxiliary_loss_mlp": 0.01270827, "balance_loss_clip": 0.06294363, "balance_loss_mlp": 0.01256772, "epoch": 0.3835262287689764, "flos": 27568253900160.0, "grad_norm": 1.6223334294028908, "language_loss": 0.69291031, "learning_rate": 2.8260606810011396e-06, "loss": 0.77036273, "num_input_tokens_seen": 136838840, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.14050293, "step": 6379, "time_per_iteration": 2.6207268238067627 }, { "auxiliary_loss_clip": 0.06475005, "auxiliary_loss_mlp": 0.01271654, "balance_loss_clip": 0.06297925, "balance_loss_mlp": 0.01257063, "epoch": 0.3835863520216444, "flos": 15529201044480.0, "grad_norm": 1.7812846329914307, "language_loss": 0.83960682, "learning_rate": 2.8257059758636315e-06, "loss": 0.91707337, "num_input_tokens_seen": 136854425, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.14593506, "step": 6380, "time_per_iteration": 2.5316786766052246 }, { "auxiliary_loss_clip": 0.06475769, "auxiliary_loss_mlp": 0.01270648, "balance_loss_clip": 0.0630051, "balance_loss_mlp": 0.01256081, "epoch": 0.38364647527431234, "flos": 21910851855360.0, "grad_norm": 1.37861968440619, "language_loss": 0.8137992, "learning_rate": 2.8253512394152697e-06, "loss": 0.89126337, "num_input_tokens_seen": 136874355, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.14550781, "step": 6381, "time_per_iteration": 4.005084037780762 }, { "auxiliary_loss_clip": 0.06430729, "auxiliary_loss_mlp": 0.01265182, "balance_loss_clip": 0.0635268, "balance_loss_mlp": 0.01259946, "epoch": 0.3837065985269803, "flos": 65553076120320.0, "grad_norm": 0.7790871919597019, "language_loss": 0.6008147, "learning_rate": 2.8249964716695068e-06, "loss": 0.67777377, "num_input_tokens_seen": 136937475, "router_z_loss_clip": 0.78125, "router_z_loss_mlp": 0.05239868, "step": 6382, "time_per_iteration": 3.1830646991729736 }, { "auxiliary_loss_clip": 0.06482938, "auxiliary_loss_mlp": 0.01269402, "balance_loss_clip": 0.06300346, "balance_loss_mlp": 0.01254549, "epoch": 0.38376672177964827, "flos": 28264103331840.0, "grad_norm": 2.5701562227003674, "language_loss": 0.67087901, "learning_rate": 2.824641672639794e-06, "loss": 0.74840242, "num_input_tokens_seen": 136955805, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.14849854, "step": 6383, "time_per_iteration": 2.590318441390991 }, { "auxiliary_loss_clip": 0.06481358, "auxiliary_loss_mlp": 0.01268218, "balance_loss_clip": 0.06300912, "balance_loss_mlp": 0.01253329, "epoch": 0.38382684503231623, "flos": 20637641064960.0, "grad_norm": 1.7372160208868241, "language_loss": 0.74967504, "learning_rate": 2.824286842339587e-06, "loss": 0.82717079, "num_input_tokens_seen": 136975240, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.14904785, "step": 6384, "time_per_iteration": 2.552732467651367 }, { "auxiliary_loss_clip": 0.06475706, "auxiliary_loss_mlp": 0.01272267, "balance_loss_clip": 0.06300798, "balance_loss_mlp": 0.01258355, "epoch": 0.3838869682849842, "flos": 19611274003200.0, "grad_norm": 1.3499102607103162, "language_loss": 0.76578426, "learning_rate": 2.823931980782341e-06, "loss": 0.84326398, "num_input_tokens_seen": 136994985, "router_z_loss_clip": 1.74804688, "router_z_loss_mlp": 0.13916016, "step": 6385, "time_per_iteration": 2.5491580963134766 }, { "auxiliary_loss_clip": 0.06404404, "auxiliary_loss_mlp": 0.01258144, "balance_loss_clip": 0.06326317, "balance_loss_mlp": 0.01253673, "epoch": 0.38394709153765216, "flos": 56572202856960.0, "grad_norm": 1.0806297157613782, "language_loss": 0.67668056, "learning_rate": 2.82357708798151e-06, "loss": 0.75330603, "num_input_tokens_seen": 137046290, "router_z_loss_clip": 0.78125, "router_z_loss_mlp": 0.04470825, "step": 6386, "time_per_iteration": 4.485408782958984 }, { "auxiliary_loss_clip": 0.06480932, "auxiliary_loss_mlp": 0.01273914, "balance_loss_clip": 0.06305708, "balance_loss_mlp": 0.01259347, "epoch": 0.3840072147903202, "flos": 15894323210880.0, "grad_norm": 1.5671277408851925, "language_loss": 0.7275539, "learning_rate": 2.8232221639505547e-06, "loss": 0.80510235, "num_input_tokens_seen": 137064725, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.14575195, "step": 6387, "time_per_iteration": 2.5346503257751465 }, { "auxiliary_loss_clip": 0.06472757, "auxiliary_loss_mlp": 0.01273674, "balance_loss_clip": 0.06300558, "balance_loss_mlp": 0.01258487, "epoch": 0.38406733804298815, "flos": 28225180310400.0, "grad_norm": 1.6556357417239118, "language_loss": 0.8166666, "learning_rate": 2.822867208702932e-06, "loss": 0.89413095, "num_input_tokens_seen": 137086030, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.15185547, "step": 6388, "time_per_iteration": 2.594639539718628 }, { "auxiliary_loss_clip": 0.06472816, "auxiliary_loss_mlp": 0.01269685, "balance_loss_clip": 0.06297655, "balance_loss_mlp": 0.01254951, "epoch": 0.3841274612956561, "flos": 18229511848320.0, "grad_norm": 1.6918492580169824, "language_loss": 0.76508421, "learning_rate": 2.8225122222521026e-06, "loss": 0.84250921, "num_input_tokens_seen": 137105400, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.14733887, "step": 6389, "time_per_iteration": 2.5407559871673584 }, { "auxiliary_loss_clip": 0.06484482, "auxiliary_loss_mlp": 0.01273685, "balance_loss_clip": 0.06302992, "balance_loss_mlp": 0.01258033, "epoch": 0.3841875845483241, "flos": 19799138856960.0, "grad_norm": 1.7029756433427217, "language_loss": 0.76845706, "learning_rate": 2.8221572046115273e-06, "loss": 0.8460387, "num_input_tokens_seen": 137124985, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.15661621, "step": 6390, "time_per_iteration": 2.564077377319336 }, { "auxiliary_loss_clip": 0.06477935, "auxiliary_loss_mlp": 0.01271356, "balance_loss_clip": 0.06296694, "balance_loss_mlp": 0.01255227, "epoch": 0.38424770780099204, "flos": 29906670919680.0, "grad_norm": 1.6039957531433802, "language_loss": 0.71137077, "learning_rate": 2.821802155794668e-06, "loss": 0.78886372, "num_input_tokens_seen": 137146745, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.16125488, "step": 6391, "time_per_iteration": 2.645691156387329 }, { "auxiliary_loss_clip": 0.06474765, "auxiliary_loss_mlp": 0.0127806, "balance_loss_clip": 0.06297167, "balance_loss_mlp": 0.0126286, "epoch": 0.38430783105366, "flos": 20820013476480.0, "grad_norm": 1.6593070338689677, "language_loss": 0.84195638, "learning_rate": 2.8214470758149884e-06, "loss": 0.91948462, "num_input_tokens_seen": 137163195, "router_z_loss_clip": 1.77441406, "router_z_loss_mlp": 0.15197754, "step": 6392, "time_per_iteration": 2.606187582015991 }, { "auxiliary_loss_clip": 0.0647796, "auxiliary_loss_mlp": 0.01269696, "balance_loss_clip": 0.06298572, "balance_loss_mlp": 0.01255284, "epoch": 0.384367954306328, "flos": 11003153627520.0, "grad_norm": 1.7277636113905683, "language_loss": 0.6161468, "learning_rate": 2.8210919646859536e-06, "loss": 0.69362336, "num_input_tokens_seen": 137179330, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.144104, "step": 6393, "time_per_iteration": 2.524207830429077 }, { "auxiliary_loss_clip": 0.06483467, "auxiliary_loss_mlp": 0.01272772, "balance_loss_clip": 0.06299593, "balance_loss_mlp": 0.01256274, "epoch": 0.38442807755899594, "flos": 25345096571520.0, "grad_norm": 2.1464172565270654, "language_loss": 0.71075726, "learning_rate": 2.820736822421029e-06, "loss": 0.78831971, "num_input_tokens_seen": 137198655, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.16516113, "step": 6394, "time_per_iteration": 2.5995326042175293 }, { "auxiliary_loss_clip": 0.06485178, "auxiliary_loss_mlp": 0.01268797, "balance_loss_clip": 0.0630089, "balance_loss_mlp": 0.01252954, "epoch": 0.3844882008116639, "flos": 21076206935040.0, "grad_norm": 1.8894762751337906, "language_loss": 0.82231444, "learning_rate": 2.8203816490336822e-06, "loss": 0.89985418, "num_input_tokens_seen": 137217120, "router_z_loss_clip": 1.84277344, "router_z_loss_mlp": 0.15844727, "step": 6395, "time_per_iteration": 2.552030324935913 }, { "auxiliary_loss_clip": 0.06478399, "auxiliary_loss_mlp": 0.01273653, "balance_loss_clip": 0.06298092, "balance_loss_mlp": 0.01258776, "epoch": 0.38454832406433187, "flos": 17968287144960.0, "grad_norm": 2.129795070986842, "language_loss": 0.71303695, "learning_rate": 2.8200264445373813e-06, "loss": 0.79055738, "num_input_tokens_seen": 137234410, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.14874268, "step": 6396, "time_per_iteration": 2.50880765914917 }, { "auxiliary_loss_clip": 0.0637988, "auxiliary_loss_mlp": 0.01276215, "balance_loss_clip": 0.06302561, "balance_loss_mlp": 0.01272618, "epoch": 0.38460844731699984, "flos": 67946641925760.0, "grad_norm": 0.869293898318184, "language_loss": 0.59812027, "learning_rate": 2.8196712089455954e-06, "loss": 0.67468125, "num_input_tokens_seen": 137294940, "router_z_loss_clip": 0.7734375, "router_z_loss_mlp": 0.03588867, "step": 6397, "time_per_iteration": 3.244593381881714 }, { "auxiliary_loss_clip": 0.06472977, "auxiliary_loss_mlp": 0.01272169, "balance_loss_clip": 0.06298372, "balance_loss_mlp": 0.0125716, "epoch": 0.3846685705696678, "flos": 25856267604480.0, "grad_norm": 2.190760610588707, "language_loss": 0.85338581, "learning_rate": 2.819315942271794e-06, "loss": 0.93083727, "num_input_tokens_seen": 137315035, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.14990234, "step": 6398, "time_per_iteration": 2.5911037921905518 }, { "auxiliary_loss_clip": 0.06472908, "auxiliary_loss_mlp": 0.01278277, "balance_loss_clip": 0.06297006, "balance_loss_mlp": 0.01263424, "epoch": 0.38472869382233577, "flos": 16295852776320.0, "grad_norm": 1.8276864991564905, "language_loss": 0.80427486, "learning_rate": 2.8189606445294515e-06, "loss": 0.8817867, "num_input_tokens_seen": 137333155, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.14855957, "step": 6399, "time_per_iteration": 2.6514904499053955 }, { "auxiliary_loss_clip": 0.06477328, "auxiliary_loss_mlp": 0.01279204, "balance_loss_clip": 0.06297595, "balance_loss_mlp": 0.01262491, "epoch": 0.38478881707500373, "flos": 19358979759360.0, "grad_norm": 2.147422115578657, "language_loss": 0.67885852, "learning_rate": 2.818605315732038e-06, "loss": 0.75642389, "num_input_tokens_seen": 137351515, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.16711426, "step": 6400, "time_per_iteration": 2.543403148651123 }, { "auxiliary_loss_clip": 0.0648206, "auxiliary_loss_mlp": 0.01273838, "balance_loss_clip": 0.06300624, "balance_loss_mlp": 0.01258615, "epoch": 0.38484894032767175, "flos": 24867356117760.0, "grad_norm": 1.686779425133666, "language_loss": 0.7404021, "learning_rate": 2.81824995589303e-06, "loss": 0.81796104, "num_input_tokens_seen": 137371255, "router_z_loss_clip": 1.81445312, "router_z_loss_mlp": 0.15234375, "step": 6401, "time_per_iteration": 2.597365140914917 }, { "auxiliary_loss_clip": 0.064769, "auxiliary_loss_mlp": 0.01274616, "balance_loss_clip": 0.06297454, "balance_loss_mlp": 0.01258367, "epoch": 0.3849090635803397, "flos": 14507068613760.0, "grad_norm": 2.53182207769655, "language_loss": 0.72699225, "learning_rate": 2.8178945650259012e-06, "loss": 0.80450743, "num_input_tokens_seen": 137388980, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.16235352, "step": 6402, "time_per_iteration": 2.545325517654419 }, { "auxiliary_loss_clip": 0.06472333, "auxiliary_loss_mlp": 0.01268702, "balance_loss_clip": 0.06298424, "balance_loss_mlp": 0.01254194, "epoch": 0.3849691868330077, "flos": 18521903070720.0, "grad_norm": 2.225493648929912, "language_loss": 0.8297869, "learning_rate": 2.817539143144128e-06, "loss": 0.90719724, "num_input_tokens_seen": 137406885, "router_z_loss_clip": 1.74023438, "router_z_loss_mlp": 0.14489746, "step": 6403, "time_per_iteration": 2.5340194702148438 }, { "auxiliary_loss_clip": 0.06474602, "auxiliary_loss_mlp": 0.01269931, "balance_loss_clip": 0.06297544, "balance_loss_mlp": 0.01254136, "epoch": 0.38502931008567565, "flos": 21622821045120.0, "grad_norm": 1.8531159171241414, "language_loss": 0.83754134, "learning_rate": 2.817183690261189e-06, "loss": 0.91498661, "num_input_tokens_seen": 137425535, "router_z_loss_clip": 1.77050781, "router_z_loss_mlp": 0.15795898, "step": 6404, "time_per_iteration": 2.5514352321624756 }, { "auxiliary_loss_clip": 0.06475437, "auxiliary_loss_mlp": 0.01271961, "balance_loss_clip": 0.06296157, "balance_loss_mlp": 0.01257203, "epoch": 0.3850894333383436, "flos": 25423152249600.0, "grad_norm": 2.158681590921324, "language_loss": 0.70178008, "learning_rate": 2.816828206390563e-06, "loss": 0.77925402, "num_input_tokens_seen": 137447700, "router_z_loss_clip": 1.79101562, "router_z_loss_mlp": 0.14770508, "step": 6405, "time_per_iteration": 2.7688117027282715 }, { "auxiliary_loss_clip": 0.06470415, "auxiliary_loss_mlp": 0.01278318, "balance_loss_clip": 0.06296565, "balance_loss_mlp": 0.01263638, "epoch": 0.3851495565910116, "flos": 20233721658240.0, "grad_norm": 2.005543425693999, "language_loss": 0.78901696, "learning_rate": 2.816472691545729e-06, "loss": 0.86650431, "num_input_tokens_seen": 137462245, "router_z_loss_clip": 1.74121094, "router_z_loss_mlp": 0.14678955, "step": 6406, "time_per_iteration": 2.530722141265869 }, { "auxiliary_loss_clip": 0.06477491, "auxiliary_loss_mlp": 0.01272384, "balance_loss_clip": 0.06298037, "balance_loss_mlp": 0.01257232, "epoch": 0.38520967984367954, "flos": 16514045608320.0, "grad_norm": 2.1127172723267464, "language_loss": 0.84455448, "learning_rate": 2.8161171457401694e-06, "loss": 0.92205328, "num_input_tokens_seen": 137476455, "router_z_loss_clip": 1.79394531, "router_z_loss_mlp": 0.15136719, "step": 6407, "time_per_iteration": 2.517641305923462 }, { "auxiliary_loss_clip": 0.06371613, "auxiliary_loss_mlp": 0.01263555, "balance_loss_clip": 0.06293099, "balance_loss_mlp": 0.01259866, "epoch": 0.3852698030963475, "flos": 61333088140800.0, "grad_norm": 0.7509903383361917, "language_loss": 0.6483891, "learning_rate": 2.815761568987365e-06, "loss": 0.72474074, "num_input_tokens_seen": 137539845, "router_z_loss_clip": 0.78369141, "router_z_loss_mlp": 0.0368042, "step": 6408, "time_per_iteration": 3.235504627227783 }, { "auxiliary_loss_clip": 0.06473804, "auxiliary_loss_mlp": 0.01269632, "balance_loss_clip": 0.06293637, "balance_loss_mlp": 0.01253157, "epoch": 0.3853299263490155, "flos": 22899595633920.0, "grad_norm": 1.537913735319972, "language_loss": 0.73740542, "learning_rate": 2.8154059613008e-06, "loss": 0.81483978, "num_input_tokens_seen": 137559880, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.16467285, "step": 6409, "time_per_iteration": 2.5668556690216064 }, { "auxiliary_loss_clip": 0.0648104, "auxiliary_loss_mlp": 0.01274972, "balance_loss_clip": 0.06294297, "balance_loss_mlp": 0.01258854, "epoch": 0.38539004960168344, "flos": 20053655233920.0, "grad_norm": 1.930035939027533, "language_loss": 0.71467412, "learning_rate": 2.81505032269396e-06, "loss": 0.79223418, "num_input_tokens_seen": 137578225, "router_z_loss_clip": 1.86816406, "router_z_loss_mlp": 0.16125488, "step": 6410, "time_per_iteration": 3.977060317993164 }, { "auxiliary_loss_clip": 0.06358077, "auxiliary_loss_mlp": 0.01256102, "balance_loss_clip": 0.06280117, "balance_loss_mlp": 0.01252347, "epoch": 0.3854501728543514, "flos": 68752971365760.0, "grad_norm": 0.6553472297566824, "language_loss": 0.60070163, "learning_rate": 2.81469465318033e-06, "loss": 0.6768434, "num_input_tokens_seen": 137645770, "router_z_loss_clip": 0.78076172, "router_z_loss_mlp": 0.03747559, "step": 6411, "time_per_iteration": 3.295381784439087 }, { "auxiliary_loss_clip": 0.06471702, "auxiliary_loss_mlp": 0.01271306, "balance_loss_clip": 0.06294554, "balance_loss_mlp": 0.01256923, "epoch": 0.38551029610701937, "flos": 20491214855040.0, "grad_norm": 5.644957165339711, "language_loss": 0.78720427, "learning_rate": 2.814338952773397e-06, "loss": 0.86463439, "num_input_tokens_seen": 137664090, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.14385986, "step": 6412, "time_per_iteration": 2.564915180206299 }, { "auxiliary_loss_clip": 0.06477545, "auxiliary_loss_mlp": 0.01273568, "balance_loss_clip": 0.06295491, "balance_loss_mlp": 0.01256855, "epoch": 0.38557041935968733, "flos": 23477627825280.0, "grad_norm": 1.7508033851775542, "language_loss": 0.7804563, "learning_rate": 2.8139832214866493e-06, "loss": 0.85796744, "num_input_tokens_seen": 137683190, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 0.16711426, "step": 6413, "time_per_iteration": 2.5828537940979004 }, { "auxiliary_loss_clip": 0.06352194, "auxiliary_loss_mlp": 0.01252097, "balance_loss_clip": 0.06274683, "balance_loss_mlp": 0.01249048, "epoch": 0.38563054261235535, "flos": 63984623068800.0, "grad_norm": 0.7977320539273576, "language_loss": 0.61278355, "learning_rate": 2.813627459333576e-06, "loss": 0.68882638, "num_input_tokens_seen": 137737315, "router_z_loss_clip": 0.7734375, "router_z_loss_mlp": 0.03044128, "step": 6414, "time_per_iteration": 4.509811878204346 }, { "auxiliary_loss_clip": 0.06478608, "auxiliary_loss_mlp": 0.01275787, "balance_loss_clip": 0.06297974, "balance_loss_mlp": 0.01261332, "epoch": 0.3856906658650233, "flos": 23994584789760.0, "grad_norm": 2.6539315113514848, "language_loss": 0.7735495, "learning_rate": 2.8132716663276685e-06, "loss": 0.85109341, "num_input_tokens_seen": 137753535, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.14459229, "step": 6415, "time_per_iteration": 2.5923848152160645 }, { "auxiliary_loss_clip": 0.06465513, "auxiliary_loss_mlp": 0.01274123, "balance_loss_clip": 0.06294899, "balance_loss_mlp": 0.0126095, "epoch": 0.3857507891176913, "flos": 25014075816960.0, "grad_norm": 1.7173928564698209, "language_loss": 0.8051815, "learning_rate": 2.8129158424824173e-06, "loss": 0.88257778, "num_input_tokens_seen": 137773405, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.13195801, "step": 6416, "time_per_iteration": 2.593626022338867 }, { "auxiliary_loss_clip": 0.06473002, "auxiliary_loss_mlp": 0.01278169, "balance_loss_clip": 0.06297086, "balance_loss_mlp": 0.01263858, "epoch": 0.38581091237035925, "flos": 21542082036480.0, "grad_norm": 1.6730130131294165, "language_loss": 0.79501033, "learning_rate": 2.8125599878113155e-06, "loss": 0.87252206, "num_input_tokens_seen": 137790810, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.14312744, "step": 6417, "time_per_iteration": 2.554164171218872 }, { "auxiliary_loss_clip": 0.06472691, "auxiliary_loss_mlp": 0.01275907, "balance_loss_clip": 0.06295156, "balance_loss_mlp": 0.01260588, "epoch": 0.3858710356230272, "flos": 17389584120960.0, "grad_norm": 1.8337838829745026, "language_loss": 0.80630374, "learning_rate": 2.8122041023278583e-06, "loss": 0.88378972, "num_input_tokens_seen": 137810265, "router_z_loss_clip": 1.77636719, "router_z_loss_mlp": 0.15307617, "step": 6418, "time_per_iteration": 2.551698923110962 }, { "auxiliary_loss_clip": 0.0646676, "auxiliary_loss_mlp": 0.01274778, "balance_loss_clip": 0.06291714, "balance_loss_mlp": 0.01260383, "epoch": 0.3859311588756952, "flos": 20345836821120.0, "grad_norm": 1.656748965931808, "language_loss": 0.79864264, "learning_rate": 2.8118481860455407e-06, "loss": 0.87605798, "num_input_tokens_seen": 137828580, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.14385986, "step": 6419, "time_per_iteration": 2.558356761932373 }, { "auxiliary_loss_clip": 0.06469955, "auxiliary_loss_mlp": 0.01279465, "balance_loss_clip": 0.06295638, "balance_loss_mlp": 0.01263444, "epoch": 0.38599128212836314, "flos": 26328054418560.0, "grad_norm": 1.9278114012341898, "language_loss": 0.67438066, "learning_rate": 2.8114922389778573e-06, "loss": 0.75187492, "num_input_tokens_seen": 137846145, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.16027832, "step": 6420, "time_per_iteration": 4.000306129455566 }, { "auxiliary_loss_clip": 0.06468717, "auxiliary_loss_mlp": 0.01280712, "balance_loss_clip": 0.06297055, "balance_loss_mlp": 0.01266735, "epoch": 0.3860514053810311, "flos": 13559050719360.0, "grad_norm": 2.01828853732226, "language_loss": 0.82141173, "learning_rate": 2.8111362611383076e-06, "loss": 0.89890605, "num_input_tokens_seen": 137863705, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.13970947, "step": 6421, "time_per_iteration": 2.5450830459594727 }, { "auxiliary_loss_clip": 0.06473152, "auxiliary_loss_mlp": 0.01277487, "balance_loss_clip": 0.06294877, "balance_loss_mlp": 0.01262181, "epoch": 0.3861115286336991, "flos": 20959689432960.0, "grad_norm": 2.5455021390204178, "language_loss": 0.72526932, "learning_rate": 2.8107802525403886e-06, "loss": 0.80277568, "num_input_tokens_seen": 137880285, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.15319824, "step": 6422, "time_per_iteration": 2.5286245346069336 }, { "auxiliary_loss_clip": 0.06468511, "auxiliary_loss_mlp": 0.0127783, "balance_loss_clip": 0.06296179, "balance_loss_mlp": 0.01263692, "epoch": 0.38617165188636704, "flos": 16368290231040.0, "grad_norm": 1.7119089188685312, "language_loss": 0.66679722, "learning_rate": 2.8104242131976025e-06, "loss": 0.74426061, "num_input_tokens_seen": 137898335, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.14147949, "step": 6423, "time_per_iteration": 2.5452558994293213 }, { "auxiliary_loss_clip": 0.0647154, "auxiliary_loss_mlp": 0.01277846, "balance_loss_clip": 0.06292754, "balance_loss_mlp": 0.01263028, "epoch": 0.386231775139035, "flos": 34795828005120.0, "grad_norm": 1.8292137427265625, "language_loss": 0.69653553, "learning_rate": 2.810068143123449e-06, "loss": 0.77402937, "num_input_tokens_seen": 137918605, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.14825439, "step": 6424, "time_per_iteration": 2.6547231674194336 }, { "auxiliary_loss_clip": 0.06468284, "auxiliary_loss_mlp": 0.01276205, "balance_loss_clip": 0.06295602, "balance_loss_mlp": 0.01262716, "epoch": 0.38629189839170297, "flos": 21732672147840.0, "grad_norm": 1.6558398470278508, "language_loss": 0.7255981, "learning_rate": 2.809712042331429e-06, "loss": 0.80304301, "num_input_tokens_seen": 137938245, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.13494873, "step": 6425, "time_per_iteration": 2.5770089626312256 }, { "auxiliary_loss_clip": 0.06475513, "auxiliary_loss_mlp": 0.01273516, "balance_loss_clip": 0.06294023, "balance_loss_mlp": 0.01257816, "epoch": 0.38635202164437094, "flos": 27930315392640.0, "grad_norm": 3.514515956105829, "language_loss": 0.80920631, "learning_rate": 2.8093559108350484e-06, "loss": 0.88669658, "num_input_tokens_seen": 137956770, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.15686035, "step": 6426, "time_per_iteration": 4.029763698577881 }, { "auxiliary_loss_clip": 0.06481653, "auxiliary_loss_mlp": 0.01282072, "balance_loss_clip": 0.06300751, "balance_loss_mlp": 0.01266325, "epoch": 0.38641214489703896, "flos": 23593390640640.0, "grad_norm": 2.3055757154857885, "language_loss": 0.75341761, "learning_rate": 2.80899974864781e-06, "loss": 0.83105481, "num_input_tokens_seen": 137977040, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.15759277, "step": 6427, "time_per_iteration": 2.587176561355591 }, { "auxiliary_loss_clip": 0.06475037, "auxiliary_loss_mlp": 0.01271078, "balance_loss_clip": 0.06297195, "balance_loss_mlp": 0.01256344, "epoch": 0.3864722681497069, "flos": 12646224339840.0, "grad_norm": 2.0387663036910912, "language_loss": 0.70789254, "learning_rate": 2.8086435557832203e-06, "loss": 0.78535366, "num_input_tokens_seen": 137993545, "router_z_loss_clip": 1.78027344, "router_z_loss_mlp": 0.1472168, "step": 6428, "time_per_iteration": 2.519092321395874 }, { "auxiliary_loss_clip": 0.06472466, "auxiliary_loss_mlp": 0.01276622, "balance_loss_clip": 0.06293435, "balance_loss_mlp": 0.01261888, "epoch": 0.3865323914023749, "flos": 17604003519360.0, "grad_norm": 1.9578667307262678, "language_loss": 0.84775925, "learning_rate": 2.8082873322547863e-06, "loss": 0.92525005, "num_input_tokens_seen": 138010140, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.14746094, "step": 6429, "time_per_iteration": 2.516204833984375 }, { "auxiliary_loss_clip": 0.06474701, "auxiliary_loss_mlp": 0.01275608, "balance_loss_clip": 0.06296782, "balance_loss_mlp": 0.01260504, "epoch": 0.38659251465504285, "flos": 18484908693120.0, "grad_norm": 2.1129552921613772, "language_loss": 0.81697929, "learning_rate": 2.807931078076015e-06, "loss": 0.89448237, "num_input_tokens_seen": 138028880, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.15100098, "step": 6430, "time_per_iteration": 2.5452792644500732 }, { "auxiliary_loss_clip": 0.06362765, "auxiliary_loss_mlp": 0.01264424, "balance_loss_clip": 0.06286006, "balance_loss_mlp": 0.01259826, "epoch": 0.3866526379077108, "flos": 64186533480960.0, "grad_norm": 0.7095266456939012, "language_loss": 0.58812392, "learning_rate": 2.807574793260416e-06, "loss": 0.66439581, "num_input_tokens_seen": 138098090, "router_z_loss_clip": 0.765625, "router_z_loss_mlp": 0.04598999, "step": 6431, "time_per_iteration": 3.219482898712158 }, { "auxiliary_loss_clip": 0.06488594, "auxiliary_loss_mlp": 0.01272007, "balance_loss_clip": 0.06305932, "balance_loss_mlp": 0.01256021, "epoch": 0.3867127611603788, "flos": 14392857098880.0, "grad_norm": 1.7459960312809393, "language_loss": 0.79253727, "learning_rate": 2.8072184778215004e-06, "loss": 0.87014329, "num_input_tokens_seen": 138114735, "router_z_loss_clip": 1.82910156, "router_z_loss_mlp": 0.16015625, "step": 6432, "time_per_iteration": 2.5336663722991943 }, { "auxiliary_loss_clip": 0.06483305, "auxiliary_loss_mlp": 0.01274681, "balance_loss_clip": 0.06296347, "balance_loss_mlp": 0.0125817, "epoch": 0.38677288441304675, "flos": 20016870491520.0, "grad_norm": 1.9739266014078514, "language_loss": 0.80940938, "learning_rate": 2.806862131772779e-06, "loss": 0.88698918, "num_input_tokens_seen": 138130480, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.16503906, "step": 6433, "time_per_iteration": 2.5502943992614746 }, { "auxiliary_loss_clip": 0.0647327, "auxiliary_loss_mlp": 0.01272234, "balance_loss_clip": 0.06293391, "balance_loss_mlp": 0.01254519, "epoch": 0.3868330076657147, "flos": 22243465837440.0, "grad_norm": 1.8369973575997793, "language_loss": 0.71157378, "learning_rate": 2.806505755127765e-06, "loss": 0.78902876, "num_input_tokens_seen": 138150640, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.17700195, "step": 6434, "time_per_iteration": 2.5756349563598633 }, { "auxiliary_loss_clip": 0.06484073, "auxiliary_loss_mlp": 0.01272188, "balance_loss_clip": 0.06295671, "balance_loss_mlp": 0.01255869, "epoch": 0.3868931309183827, "flos": 16733076981120.0, "grad_norm": 1.6459333235317708, "language_loss": 0.78622818, "learning_rate": 2.806149347899972e-06, "loss": 0.86379075, "num_input_tokens_seen": 138169700, "router_z_loss_clip": 1.88183594, "router_z_loss_mlp": 0.16333008, "step": 6435, "time_per_iteration": 2.5280673503875732 }, { "auxiliary_loss_clip": 0.06472868, "auxiliary_loss_mlp": 0.01274387, "balance_loss_clip": 0.06297843, "balance_loss_mlp": 0.01259092, "epoch": 0.38695325417105064, "flos": 22681360874880.0, "grad_norm": 1.7306750419355275, "language_loss": 0.8031137, "learning_rate": 2.805792910102915e-06, "loss": 0.88058627, "num_input_tokens_seen": 138185835, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.15289307, "step": 6436, "time_per_iteration": 2.5555989742279053 }, { "auxiliary_loss_clip": 0.06472667, "auxiliary_loss_mlp": 0.0127298, "balance_loss_clip": 0.06298669, "balance_loss_mlp": 0.01258031, "epoch": 0.3870133774237186, "flos": 23118668933760.0, "grad_norm": 1.744100796346517, "language_loss": 0.77045274, "learning_rate": 2.8054364417501093e-06, "loss": 0.84790921, "num_input_tokens_seen": 138204080, "router_z_loss_clip": 1.74023438, "router_z_loss_mlp": 0.1496582, "step": 6437, "time_per_iteration": 2.5722239017486572 }, { "auxiliary_loss_clip": 0.06473295, "auxiliary_loss_mlp": 0.01278369, "balance_loss_clip": 0.06297053, "balance_loss_mlp": 0.01263057, "epoch": 0.3870735006763866, "flos": 17681430291840.0, "grad_norm": 2.881467364335243, "language_loss": 0.81687009, "learning_rate": 2.805079942855074e-06, "loss": 0.89438677, "num_input_tokens_seen": 138220710, "router_z_loss_clip": 1.76171875, "router_z_loss_mlp": 0.15307617, "step": 6438, "time_per_iteration": 2.537080764770508 }, { "auxiliary_loss_clip": 0.06474172, "auxiliary_loss_mlp": 0.01269979, "balance_loss_clip": 0.062942, "balance_loss_mlp": 0.01254565, "epoch": 0.38713362392905454, "flos": 23302676499840.0, "grad_norm": 1.4925433186568782, "language_loss": 0.75888681, "learning_rate": 2.804723413431326e-06, "loss": 0.83632833, "num_input_tokens_seen": 138241720, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.1541748, "step": 6439, "time_per_iteration": 2.576687812805176 }, { "auxiliary_loss_clip": 0.06470689, "auxiliary_loss_mlp": 0.01275324, "balance_loss_clip": 0.06298581, "balance_loss_mlp": 0.01261185, "epoch": 0.38719374718172256, "flos": 21037283913600.0, "grad_norm": 1.4909461339802064, "language_loss": 0.74193013, "learning_rate": 2.8043668534923855e-06, "loss": 0.8193903, "num_input_tokens_seen": 138261885, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.14105225, "step": 6440, "time_per_iteration": 2.569132089614868 }, { "auxiliary_loss_clip": 0.06479625, "auxiliary_loss_mlp": 0.01271896, "balance_loss_clip": 0.06295522, "balance_loss_mlp": 0.0125616, "epoch": 0.3872538704343905, "flos": 19615885977600.0, "grad_norm": 1.7432512842675303, "language_loss": 0.81837881, "learning_rate": 2.804010263051774e-06, "loss": 0.89589405, "num_input_tokens_seen": 138280255, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.1574707, "step": 6441, "time_per_iteration": 2.5751140117645264 }, { "auxiliary_loss_clip": 0.06473304, "auxiliary_loss_mlp": 0.01271619, "balance_loss_clip": 0.06297255, "balance_loss_mlp": 0.01257028, "epoch": 0.3873139936870585, "flos": 17535800695680.0, "grad_norm": 2.0495623622087393, "language_loss": 0.81037945, "learning_rate": 2.8036536421230118e-06, "loss": 0.88782871, "num_input_tokens_seen": 138296675, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.14581299, "step": 6442, "time_per_iteration": 2.543363332748413 }, { "auxiliary_loss_clip": 0.06475826, "auxiliary_loss_mlp": 0.0128024, "balance_loss_clip": 0.06299529, "balance_loss_mlp": 0.01265112, "epoch": 0.38737411693972645, "flos": 17792539205760.0, "grad_norm": 1.5425543334601033, "language_loss": 0.84288186, "learning_rate": 2.803296990719624e-06, "loss": 0.92044246, "num_input_tokens_seen": 138314985, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.15124512, "step": 6443, "time_per_iteration": 2.55318021774292 }, { "auxiliary_loss_clip": 0.06368066, "auxiliary_loss_mlp": 0.01284009, "balance_loss_clip": 0.06290352, "balance_loss_mlp": 0.0128091, "epoch": 0.3874342401923944, "flos": 58320554624640.0, "grad_norm": 0.7554496207583145, "language_loss": 0.5026024, "learning_rate": 2.8029403088551327e-06, "loss": 0.57912314, "num_input_tokens_seen": 138373275, "router_z_loss_clip": 0.77636719, "router_z_loss_mlp": 0.03096008, "step": 6444, "time_per_iteration": 3.191754102706909 }, { "auxiliary_loss_clip": 0.06462062, "auxiliary_loss_mlp": 0.01272899, "balance_loss_clip": 0.0629105, "balance_loss_mlp": 0.01257891, "epoch": 0.3874943634450624, "flos": 17717628055680.0, "grad_norm": 1.6010401898290822, "language_loss": 0.79167086, "learning_rate": 2.802583596543065e-06, "loss": 0.86902046, "num_input_tokens_seen": 138391145, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.15014648, "step": 6445, "time_per_iteration": 2.525414228439331 }, { "auxiliary_loss_clip": 0.06465276, "auxiliary_loss_mlp": 0.01281605, "balance_loss_clip": 0.06292038, "balance_loss_mlp": 0.01267264, "epoch": 0.38755448669773035, "flos": 19250889592320.0, "grad_norm": 1.746992387626611, "language_loss": 0.8112973, "learning_rate": 2.8022268537969474e-06, "loss": 0.88876617, "num_input_tokens_seen": 138409875, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.14349365, "step": 6446, "time_per_iteration": 2.5364179611206055 }, { "auxiliary_loss_clip": 0.06465295, "auxiliary_loss_mlp": 0.0127881, "balance_loss_clip": 0.06290263, "balance_loss_mlp": 0.01263527, "epoch": 0.3876146099503983, "flos": 20600437052160.0, "grad_norm": 1.9788575368522507, "language_loss": 0.77035236, "learning_rate": 2.801870080630306e-06, "loss": 0.8477934, "num_input_tokens_seen": 138428965, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.15283203, "step": 6447, "time_per_iteration": 2.7306711673736572 }, { "auxiliary_loss_clip": 0.06465669, "auxiliary_loss_mlp": 0.01274674, "balance_loss_clip": 0.06293215, "balance_loss_mlp": 0.01259647, "epoch": 0.3876747332030663, "flos": 19287129283200.0, "grad_norm": 1.5425316809252547, "language_loss": 0.76145202, "learning_rate": 2.801513277056671e-06, "loss": 0.83885545, "num_input_tokens_seen": 138448090, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.15020752, "step": 6448, "time_per_iteration": 2.5741126537323 }, { "auxiliary_loss_clip": 0.06468866, "auxiliary_loss_mlp": 0.01273618, "balance_loss_clip": 0.0629499, "balance_loss_mlp": 0.01259522, "epoch": 0.38773485645573424, "flos": 18950699940480.0, "grad_norm": 1.5586795254178445, "language_loss": 0.76530516, "learning_rate": 2.8011564430895725e-06, "loss": 0.84272999, "num_input_tokens_seen": 138466105, "router_z_loss_clip": 1.74023438, "router_z_loss_mlp": 0.14093018, "step": 6449, "time_per_iteration": 2.5409128665924072 }, { "auxiliary_loss_clip": 0.0647059, "auxiliary_loss_mlp": 0.01272922, "balance_loss_clip": 0.06291666, "balance_loss_mlp": 0.01257651, "epoch": 0.3877949797084022, "flos": 23077272216960.0, "grad_norm": 1.701042901045688, "language_loss": 0.78950787, "learning_rate": 2.800799578742542e-06, "loss": 0.866943, "num_input_tokens_seen": 138485160, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.15258789, "step": 6450, "time_per_iteration": 4.0986127853393555 }, { "auxiliary_loss_clip": 0.06475297, "auxiliary_loss_mlp": 0.01272043, "balance_loss_clip": 0.06290345, "balance_loss_mlp": 0.01257869, "epoch": 0.3878551029610702, "flos": 29103150591360.0, "grad_norm": 2.5808711447515673, "language_loss": 0.78246152, "learning_rate": 2.8004426840291106e-06, "loss": 0.85993493, "num_input_tokens_seen": 138504135, "router_z_loss_clip": 1.84960938, "router_z_loss_mlp": 0.1418457, "step": 6451, "time_per_iteration": 2.7130961418151855 }, { "auxiliary_loss_clip": 0.06465013, "auxiliary_loss_mlp": 0.01272242, "balance_loss_clip": 0.06293789, "balance_loss_mlp": 0.01257985, "epoch": 0.38791522621373814, "flos": 21002763231360.0, "grad_norm": 1.8579031849629166, "language_loss": 0.76856101, "learning_rate": 2.800085758962812e-06, "loss": 0.84593356, "num_input_tokens_seen": 138523955, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.14245605, "step": 6452, "time_per_iteration": 2.7369842529296875 }, { "auxiliary_loss_clip": 0.06464354, "auxiliary_loss_mlp": 0.0127188, "balance_loss_clip": 0.0628904, "balance_loss_mlp": 0.01256634, "epoch": 0.3879753494664061, "flos": 15492248593920.0, "grad_norm": 1.593993579029583, "language_loss": 0.80176181, "learning_rate": 2.799728803557182e-06, "loss": 0.8791241, "num_input_tokens_seen": 138541655, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.15246582, "step": 6453, "time_per_iteration": 2.5706424713134766 }, { "auxiliary_loss_clip": 0.06473303, "auxiliary_loss_mlp": 0.01270482, "balance_loss_clip": 0.06290373, "balance_loss_mlp": 0.01254484, "epoch": 0.3880354727190741, "flos": 22060422593280.0, "grad_norm": 1.6630286058989199, "language_loss": 0.71436, "learning_rate": 2.7993718178257555e-06, "loss": 0.79179788, "num_input_tokens_seen": 138560860, "router_z_loss_clip": 1.82910156, "router_z_loss_mlp": 0.16003418, "step": 6454, "time_per_iteration": 4.092878103256226 }, { "auxiliary_loss_clip": 0.06476082, "auxiliary_loss_mlp": 0.01278337, "balance_loss_clip": 0.06293654, "balance_loss_mlp": 0.01262339, "epoch": 0.3880955959717421, "flos": 20346675361920.0, "grad_norm": 1.936893299367098, "language_loss": 0.77888227, "learning_rate": 2.7990148017820694e-06, "loss": 0.85642648, "num_input_tokens_seen": 138580200, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.16003418, "step": 6455, "time_per_iteration": 2.550347089767456 }, { "auxiliary_loss_clip": 0.06468979, "auxiliary_loss_mlp": 0.01270553, "balance_loss_clip": 0.06294399, "balance_loss_mlp": 0.01255712, "epoch": 0.38815571922441006, "flos": 23082009972480.0, "grad_norm": 5.217391070806311, "language_loss": 0.75607771, "learning_rate": 2.798657755439662e-06, "loss": 0.83347309, "num_input_tokens_seen": 138598315, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.14831543, "step": 6456, "time_per_iteration": 2.5685336589813232 }, { "auxiliary_loss_clip": 0.06476137, "auxiliary_loss_mlp": 0.01269993, "balance_loss_clip": 0.06295869, "balance_loss_mlp": 0.01255676, "epoch": 0.388215842477078, "flos": 20783186807040.0, "grad_norm": 2.702322259090837, "language_loss": 0.60940218, "learning_rate": 2.7983006788120726e-06, "loss": 0.68686342, "num_input_tokens_seen": 138615695, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.14331055, "step": 6457, "time_per_iteration": 2.5618042945861816 }, { "auxiliary_loss_clip": 0.06470753, "auxiliary_loss_mlp": 0.01274841, "balance_loss_clip": 0.06290396, "balance_loss_mlp": 0.01259284, "epoch": 0.388275965729746, "flos": 20454304331520.0, "grad_norm": 2.8687424512922757, "language_loss": 0.80218852, "learning_rate": 2.797943571912841e-06, "loss": 0.87964439, "num_input_tokens_seen": 138633180, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.15563965, "step": 6458, "time_per_iteration": 2.5374512672424316 }, { "auxiliary_loss_clip": 0.06467228, "auxiliary_loss_mlp": 0.01270741, "balance_loss_clip": 0.0629078, "balance_loss_mlp": 0.01255578, "epoch": 0.38833608898241395, "flos": 27899945487360.0, "grad_norm": 2.050992845591479, "language_loss": 0.82350278, "learning_rate": 2.797586434755509e-06, "loss": 0.90088242, "num_input_tokens_seen": 138654785, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.1517334, "step": 6459, "time_per_iteration": 2.604555368423462 }, { "auxiliary_loss_clip": 0.06463966, "auxiliary_loss_mlp": 0.01272768, "balance_loss_clip": 0.06293176, "balance_loss_mlp": 0.01258772, "epoch": 0.3883962122350819, "flos": 18082079389440.0, "grad_norm": 1.9939139806070636, "language_loss": 0.62898707, "learning_rate": 2.7972292673536202e-06, "loss": 0.70635444, "num_input_tokens_seen": 138673330, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.14001465, "step": 6460, "time_per_iteration": 3.9522931575775146 }, { "auxiliary_loss_clip": 0.06466469, "auxiliary_loss_mlp": 0.01268532, "balance_loss_clip": 0.06293629, "balance_loss_mlp": 0.01254293, "epoch": 0.3884563354877499, "flos": 23628875644800.0, "grad_norm": 1.645571995496207, "language_loss": 0.86426109, "learning_rate": 2.796872069720717e-06, "loss": 0.94161111, "num_input_tokens_seen": 138694185, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.14245605, "step": 6461, "time_per_iteration": 2.5930442810058594 }, { "auxiliary_loss_clip": 0.06465884, "auxiliary_loss_mlp": 0.01272276, "balance_loss_clip": 0.06287367, "balance_loss_mlp": 0.01257131, "epoch": 0.38851645874041785, "flos": 27460834565760.0, "grad_norm": 2.372281756064515, "language_loss": 0.71921444, "learning_rate": 2.7965148418703456e-06, "loss": 0.79659605, "num_input_tokens_seen": 138714625, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.15130615, "step": 6462, "time_per_iteration": 2.595949411392212 }, { "auxiliary_loss_clip": 0.06461471, "auxiliary_loss_mlp": 0.01271633, "balance_loss_clip": 0.06287429, "balance_loss_mlp": 0.01256738, "epoch": 0.3885765819930858, "flos": 25235035833600.0, "grad_norm": 2.086678097192693, "language_loss": 0.77208954, "learning_rate": 2.796157583816052e-06, "loss": 0.84942061, "num_input_tokens_seen": 138733585, "router_z_loss_clip": 1.74121094, "router_z_loss_mlp": 0.14886475, "step": 6463, "time_per_iteration": 2.564800262451172 }, { "auxiliary_loss_clip": 0.06476178, "auxiliary_loss_mlp": 0.012779, "balance_loss_clip": 0.0629548, "balance_loss_mlp": 0.01262439, "epoch": 0.3886367052457538, "flos": 16952317989120.0, "grad_norm": 2.5034122881204968, "language_loss": 0.71005988, "learning_rate": 2.795800295571382e-06, "loss": 0.78760064, "num_input_tokens_seen": 138752335, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.15466309, "step": 6464, "time_per_iteration": 2.53501296043396 }, { "auxiliary_loss_clip": 0.06464399, "auxiliary_loss_mlp": 0.01270772, "balance_loss_clip": 0.06290907, "balance_loss_mlp": 0.01256139, "epoch": 0.38869682849842174, "flos": 27160141789440.0, "grad_norm": 2.054711747386823, "language_loss": 0.69687688, "learning_rate": 2.7954429771498858e-06, "loss": 0.77422863, "num_input_tokens_seen": 138768450, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.14630127, "step": 6465, "time_per_iteration": 3.9944875240325928 }, { "auxiliary_loss_clip": 0.06465729, "auxiliary_loss_mlp": 0.01272824, "balance_loss_clip": 0.06289346, "balance_loss_mlp": 0.01257894, "epoch": 0.3887569517510897, "flos": 21069037411200.0, "grad_norm": 1.8980515382862497, "language_loss": 0.78006458, "learning_rate": 2.7950856285651117e-06, "loss": 0.85745013, "num_input_tokens_seen": 138786775, "router_z_loss_clip": 1.76269531, "router_z_loss_mlp": 0.14923096, "step": 6466, "time_per_iteration": 2.5527477264404297 }, { "auxiliary_loss_clip": 0.06470713, "auxiliary_loss_mlp": 0.01273583, "balance_loss_clip": 0.06292124, "balance_loss_mlp": 0.01258778, "epoch": 0.38881707500375773, "flos": 29505141354240.0, "grad_norm": 1.5097990358578963, "language_loss": 0.69314981, "learning_rate": 2.794728249830611e-06, "loss": 0.77059275, "num_input_tokens_seen": 138810100, "router_z_loss_clip": 1.78222656, "router_z_loss_mlp": 0.14807129, "step": 6467, "time_per_iteration": 2.6438629627227783 }, { "auxiliary_loss_clip": 0.0647198, "auxiliary_loss_mlp": 0.01280942, "balance_loss_clip": 0.06294522, "balance_loss_mlp": 0.01265302, "epoch": 0.3888771982564257, "flos": 17493146167680.0, "grad_norm": 4.670625880898588, "language_loss": 0.83961427, "learning_rate": 2.794370840959936e-06, "loss": 0.9171434, "num_input_tokens_seen": 138825140, "router_z_loss_clip": 1.77441406, "router_z_loss_mlp": 0.15649414, "step": 6468, "time_per_iteration": 2.51501202583313 }, { "auxiliary_loss_clip": 0.0646044, "auxiliary_loss_mlp": 0.01270403, "balance_loss_clip": 0.06287348, "balance_loss_mlp": 0.01257022, "epoch": 0.38893732150909366, "flos": 21948517065600.0, "grad_norm": 3.287137923921757, "language_loss": 0.85019279, "learning_rate": 2.7940134019666383e-06, "loss": 0.92750114, "num_input_tokens_seen": 138844115, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.13391113, "step": 6469, "time_per_iteration": 2.586040496826172 }, { "auxiliary_loss_clip": 0.0646352, "auxiliary_loss_mlp": 0.01273872, "balance_loss_clip": 0.06289491, "balance_loss_mlp": 0.01259149, "epoch": 0.3889974447617616, "flos": 24282657527040.0, "grad_norm": 1.6738721365800329, "language_loss": 0.7520256, "learning_rate": 2.793655932864273e-06, "loss": 0.82939959, "num_input_tokens_seen": 138860860, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.14733887, "step": 6470, "time_per_iteration": 2.5873117446899414 }, { "auxiliary_loss_clip": 0.06470066, "auxiliary_loss_mlp": 0.01279975, "balance_loss_clip": 0.06294365, "balance_loss_mlp": 0.01264597, "epoch": 0.3890575680144296, "flos": 25674356390400.0, "grad_norm": 1.5911653105060304, "language_loss": 0.74803704, "learning_rate": 2.7932984336663953e-06, "loss": 0.82553744, "num_input_tokens_seen": 138881910, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.15380859, "step": 6471, "time_per_iteration": 2.6296749114990234 }, { "auxiliary_loss_clip": 0.06462382, "auxiliary_loss_mlp": 0.0127676, "balance_loss_clip": 0.06286994, "balance_loss_mlp": 0.01261644, "epoch": 0.38911769126709755, "flos": 22861636934400.0, "grad_norm": 1.5621504086603173, "language_loss": 0.68311697, "learning_rate": 2.792940904386562e-06, "loss": 0.76050842, "num_input_tokens_seen": 138900975, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.15112305, "step": 6472, "time_per_iteration": 2.57519268989563 }, { "auxiliary_loss_clip": 0.0646459, "auxiliary_loss_mlp": 0.0127777, "balance_loss_clip": 0.06287635, "balance_loss_mlp": 0.01262887, "epoch": 0.3891778145197655, "flos": 25454612257920.0, "grad_norm": 1.8555140791969942, "language_loss": 0.76950741, "learning_rate": 2.7925833450383293e-06, "loss": 0.84693098, "num_input_tokens_seen": 138920795, "router_z_loss_clip": 1.76757812, "router_z_loss_mlp": 0.14880371, "step": 6473, "time_per_iteration": 2.5976457595825195 }, { "auxiliary_loss_clip": 0.06470046, "auxiliary_loss_mlp": 0.01274244, "balance_loss_clip": 0.06292648, "balance_loss_mlp": 0.01258866, "epoch": 0.3892379377724335, "flos": 14033227374720.0, "grad_norm": 1.8821080232504726, "language_loss": 0.71675819, "learning_rate": 2.792225755635257e-06, "loss": 0.79420108, "num_input_tokens_seen": 138938770, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.15374756, "step": 6474, "time_per_iteration": 2.517395257949829 }, { "auxiliary_loss_clip": 0.06468867, "auxiliary_loss_mlp": 0.01270706, "balance_loss_clip": 0.06290475, "balance_loss_mlp": 0.01256782, "epoch": 0.38929806102510145, "flos": 20163715971840.0, "grad_norm": 1.5972569355791937, "language_loss": 0.69027936, "learning_rate": 2.7918681361909046e-06, "loss": 0.76767504, "num_input_tokens_seen": 138958880, "router_z_loss_clip": 1.78222656, "router_z_loss_mlp": 0.13916016, "step": 6475, "time_per_iteration": 2.5509605407714844 }, { "auxiliary_loss_clip": 0.06479952, "auxiliary_loss_mlp": 0.01271948, "balance_loss_clip": 0.06295149, "balance_loss_mlp": 0.01256344, "epoch": 0.3893581842777694, "flos": 22170525258240.0, "grad_norm": 2.0714765392945917, "language_loss": 0.75752807, "learning_rate": 2.7915104867188332e-06, "loss": 0.83504713, "num_input_tokens_seen": 138977240, "router_z_loss_clip": 1.84765625, "router_z_loss_mlp": 0.15612793, "step": 6476, "time_per_iteration": 2.5574262142181396 }, { "auxiliary_loss_clip": 0.0638322, "auxiliary_loss_mlp": 0.01264482, "balance_loss_clip": 0.06306609, "balance_loss_mlp": 0.01260342, "epoch": 0.3894183075304374, "flos": 67322936459520.0, "grad_norm": 0.7701639349369779, "language_loss": 0.58183742, "learning_rate": 2.7911528072326055e-06, "loss": 0.65831447, "num_input_tokens_seen": 139039035, "router_z_loss_clip": 0.765625, "router_z_loss_mlp": 0.04141235, "step": 6477, "time_per_iteration": 3.180872917175293 }, { "auxiliary_loss_clip": 0.06471908, "auxiliary_loss_mlp": 0.01275236, "balance_loss_clip": 0.06293602, "balance_loss_mlp": 0.01259714, "epoch": 0.38947843078310534, "flos": 18552734173440.0, "grad_norm": 2.1165581822641015, "language_loss": 0.78642619, "learning_rate": 2.7907950977457832e-06, "loss": 0.86389756, "num_input_tokens_seen": 139055560, "router_z_loss_clip": 1.78417969, "router_z_loss_mlp": 0.1552124, "step": 6478, "time_per_iteration": 2.5403666496276855 }, { "auxiliary_loss_clip": 0.0646497, "auxiliary_loss_mlp": 0.01275332, "balance_loss_clip": 0.06289786, "balance_loss_mlp": 0.01261135, "epoch": 0.3895385540357733, "flos": 14610253317120.0, "grad_norm": 1.9966153495393328, "language_loss": 0.83059776, "learning_rate": 2.7904373582719317e-06, "loss": 0.90800071, "num_input_tokens_seen": 139071865, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.14190674, "step": 6479, "time_per_iteration": 2.523393392562866 }, { "auxiliary_loss_clip": 0.0646435, "auxiliary_loss_mlp": 0.01266754, "balance_loss_clip": 0.062903, "balance_loss_mlp": 0.01251954, "epoch": 0.38959867728844133, "flos": 19981469341440.0, "grad_norm": 1.6375045784546083, "language_loss": 0.80680346, "learning_rate": 2.790079588824617e-06, "loss": 0.8841145, "num_input_tokens_seen": 139089640, "router_z_loss_clip": 1.74023438, "router_z_loss_mlp": 0.14801025, "step": 6480, "time_per_iteration": 2.5404515266418457 }, { "auxiliary_loss_clip": 0.06463944, "auxiliary_loss_mlp": 0.01269814, "balance_loss_clip": 0.06290431, "balance_loss_mlp": 0.01255366, "epoch": 0.3896588005411093, "flos": 22678342128000.0, "grad_norm": 1.5483568496599998, "language_loss": 0.832573, "learning_rate": 2.7897217894174038e-06, "loss": 0.90991062, "num_input_tokens_seen": 139109365, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.14453125, "step": 6481, "time_per_iteration": 2.56129789352417 }, { "auxiliary_loss_clip": 0.06457236, "auxiliary_loss_mlp": 0.012719, "balance_loss_clip": 0.0628739, "balance_loss_mlp": 0.01258293, "epoch": 0.38971892379377726, "flos": 21002343960960.0, "grad_norm": 1.5591687317838383, "language_loss": 0.75912797, "learning_rate": 2.789363960063863e-06, "loss": 0.83641934, "num_input_tokens_seen": 139128260, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.13623047, "step": 6482, "time_per_iteration": 2.539914608001709 }, { "auxiliary_loss_clip": 0.06467196, "auxiliary_loss_mlp": 0.01269717, "balance_loss_clip": 0.06290563, "balance_loss_mlp": 0.01255793, "epoch": 0.3897790470464452, "flos": 22535060446080.0, "grad_norm": 1.9796747318491144, "language_loss": 0.79453552, "learning_rate": 2.78900610077756e-06, "loss": 0.87190467, "num_input_tokens_seen": 139147315, "router_z_loss_clip": 1.76757812, "router_z_loss_mlp": 0.13916016, "step": 6483, "time_per_iteration": 2.5705184936523438 }, { "auxiliary_loss_clip": 0.06467111, "auxiliary_loss_mlp": 0.01272213, "balance_loss_clip": 0.06291269, "balance_loss_mlp": 0.01256752, "epoch": 0.3898391702991132, "flos": 26216484307200.0, "grad_norm": 1.5036573174340204, "language_loss": 0.8040086, "learning_rate": 2.788648211572067e-06, "loss": 0.88140184, "num_input_tokens_seen": 139167270, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.15454102, "step": 6484, "time_per_iteration": 2.5931451320648193 }, { "auxiliary_loss_clip": 0.06472124, "auxiliary_loss_mlp": 0.01269377, "balance_loss_clip": 0.06296412, "balance_loss_mlp": 0.01254237, "epoch": 0.38989929355178116, "flos": 21071301471360.0, "grad_norm": 1.672031177516944, "language_loss": 0.77944851, "learning_rate": 2.7882902924609557e-06, "loss": 0.8568635, "num_input_tokens_seen": 139185970, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.15118408, "step": 6485, "time_per_iteration": 2.5797672271728516 }, { "auxiliary_loss_clip": 0.06471612, "auxiliary_loss_mlp": 0.01273793, "balance_loss_clip": 0.06292931, "balance_loss_mlp": 0.01258093, "epoch": 0.3899594168044491, "flos": 25491229292160.0, "grad_norm": 2.441060057379656, "language_loss": 0.85576314, "learning_rate": 2.7879323434577965e-06, "loss": 0.93321723, "num_input_tokens_seen": 139203730, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.15710449, "step": 6486, "time_per_iteration": 2.60613751411438 }, { "auxiliary_loss_clip": 0.06475265, "auxiliary_loss_mlp": 0.01272391, "balance_loss_clip": 0.0629489, "balance_loss_mlp": 0.01256816, "epoch": 0.3900195400571171, "flos": 31147415452800.0, "grad_norm": 1.809426464775039, "language_loss": 0.855995, "learning_rate": 2.7875743645761645e-06, "loss": 0.93347156, "num_input_tokens_seen": 139222560, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.15576172, "step": 6487, "time_per_iteration": 2.6510908603668213 }, { "auxiliary_loss_clip": 0.06465957, "auxiliary_loss_mlp": 0.01279497, "balance_loss_clip": 0.06289487, "balance_loss_mlp": 0.01264036, "epoch": 0.39007966330978505, "flos": 20236111499520.0, "grad_norm": 1.560059631853144, "language_loss": 0.73008466, "learning_rate": 2.787216355829633e-06, "loss": 0.80753922, "num_input_tokens_seen": 139242165, "router_z_loss_clip": 1.76171875, "router_z_loss_mlp": 0.15454102, "step": 6488, "time_per_iteration": 2.562199831008911 }, { "auxiliary_loss_clip": 0.06478259, "auxiliary_loss_mlp": 0.01276167, "balance_loss_clip": 0.06298468, "balance_loss_mlp": 0.01259711, "epoch": 0.390139786562453, "flos": 22535353935360.0, "grad_norm": 3.327762249932594, "language_loss": 0.68594015, "learning_rate": 2.786858317231779e-06, "loss": 0.76348436, "num_input_tokens_seen": 139262525, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.16448975, "step": 6489, "time_per_iteration": 2.5746421813964844 }, { "auxiliary_loss_clip": 0.06469114, "auxiliary_loss_mlp": 0.01276466, "balance_loss_clip": 0.06297006, "balance_loss_mlp": 0.01261964, "epoch": 0.390199909815121, "flos": 26440211508480.0, "grad_norm": 2.036699621399225, "language_loss": 0.81210274, "learning_rate": 2.7865002487961788e-06, "loss": 0.88955855, "num_input_tokens_seen": 139282835, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.14489746, "step": 6490, "time_per_iteration": 3.821241855621338 }, { "auxiliary_loss_clip": 0.06471006, "auxiliary_loss_mlp": 0.0127174, "balance_loss_clip": 0.06294344, "balance_loss_mlp": 0.01256195, "epoch": 0.39026003306778895, "flos": 17280278069760.0, "grad_norm": 1.8307738512133414, "language_loss": 0.90286291, "learning_rate": 2.7861421505364104e-06, "loss": 0.98029029, "num_input_tokens_seen": 139299490, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.15563965, "step": 6491, "time_per_iteration": 2.5157642364501953 }, { "auxiliary_loss_clip": 0.06477229, "auxiliary_loss_mlp": 0.01271759, "balance_loss_clip": 0.06298554, "balance_loss_mlp": 0.01256643, "epoch": 0.3903201563204569, "flos": 24539354110080.0, "grad_norm": 1.7483224533636856, "language_loss": 0.78972137, "learning_rate": 2.7857840224660523e-06, "loss": 0.86721134, "num_input_tokens_seen": 139317865, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.15112305, "step": 6492, "time_per_iteration": 2.5734105110168457 }, { "auxiliary_loss_clip": 0.06476125, "auxiliary_loss_mlp": 0.01273024, "balance_loss_clip": 0.06300282, "balance_loss_mlp": 0.01258361, "epoch": 0.39038027957312493, "flos": 23774547168000.0, "grad_norm": 1.7348717436606855, "language_loss": 0.74294943, "learning_rate": 2.7854258645986857e-06, "loss": 0.82044089, "num_input_tokens_seen": 139339840, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.14660645, "step": 6493, "time_per_iteration": 4.103617429733276 }, { "auxiliary_loss_clip": 0.06492272, "auxiliary_loss_mlp": 0.01275904, "balance_loss_clip": 0.06307901, "balance_loss_mlp": 0.01259227, "epoch": 0.3904404028257929, "flos": 14105832537600.0, "grad_norm": 3.147955099311454, "language_loss": 0.76743388, "learning_rate": 2.7850676769478916e-06, "loss": 0.84511566, "num_input_tokens_seen": 139357555, "router_z_loss_clip": 1.84277344, "router_z_loss_mlp": 0.16674805, "step": 6494, "time_per_iteration": 2.692445755004883 }, { "auxiliary_loss_clip": 0.06486256, "auxiliary_loss_mlp": 0.01277289, "balance_loss_clip": 0.06297093, "balance_loss_mlp": 0.01259741, "epoch": 0.39050052607846086, "flos": 16915742881920.0, "grad_norm": 1.9316866591263375, "language_loss": 0.75110352, "learning_rate": 2.7847094595272525e-06, "loss": 0.82873899, "num_input_tokens_seen": 139374455, "router_z_loss_clip": 1.89160156, "router_z_loss_mlp": 0.17541504, "step": 6495, "time_per_iteration": 2.5483992099761963 }, { "auxiliary_loss_clip": 0.06480514, "auxiliary_loss_mlp": 0.01277975, "balance_loss_clip": 0.06307191, "balance_loss_mlp": 0.01261393, "epoch": 0.39056064933112883, "flos": 25921912878720.0, "grad_norm": 1.4816272625842308, "language_loss": 0.6813432, "learning_rate": 2.784351212350352e-06, "loss": 0.75892812, "num_input_tokens_seen": 139394770, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.16589355, "step": 6496, "time_per_iteration": 2.6143486499786377 }, { "auxiliary_loss_clip": 0.06391441, "auxiliary_loss_mlp": 0.01256666, "balance_loss_clip": 0.06315163, "balance_loss_mlp": 0.01252688, "epoch": 0.3906207725837968, "flos": 60046125281280.0, "grad_norm": 0.6593210455656323, "language_loss": 0.53725141, "learning_rate": 2.783992935430775e-06, "loss": 0.61373252, "num_input_tokens_seen": 139454760, "router_z_loss_clip": 0.76220703, "router_z_loss_mlp": 0.0397644, "step": 6497, "time_per_iteration": 3.2959063053131104 }, { "auxiliary_loss_clip": 0.06475803, "auxiliary_loss_mlp": 0.01274583, "balance_loss_clip": 0.06298149, "balance_loss_mlp": 0.01259509, "epoch": 0.39068089583646476, "flos": 21074949123840.0, "grad_norm": 4.6839840877741015, "language_loss": 0.69589084, "learning_rate": 2.7836346287821068e-06, "loss": 0.7733947, "num_input_tokens_seen": 139472645, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.15075684, "step": 6498, "time_per_iteration": 2.5822160243988037 }, { "auxiliary_loss_clip": 0.06393933, "auxiliary_loss_mlp": 0.01256179, "balance_loss_clip": 0.0631773, "balance_loss_mlp": 0.01252397, "epoch": 0.3907410190891327, "flos": 70468269897600.0, "grad_norm": 0.7729999836036475, "language_loss": 0.51780242, "learning_rate": 2.783276292417936e-06, "loss": 0.59430361, "num_input_tokens_seen": 139536730, "router_z_loss_clip": 0.76171875, "router_z_loss_mlp": 0.03775024, "step": 6499, "time_per_iteration": 4.685570240020752 }, { "auxiliary_loss_clip": 0.06484846, "auxiliary_loss_mlp": 0.01273202, "balance_loss_clip": 0.06303287, "balance_loss_mlp": 0.01256238, "epoch": 0.3908011423418007, "flos": 27969531903360.0, "grad_norm": 1.6686853737724325, "language_loss": 0.74328113, "learning_rate": 2.7829179263518487e-06, "loss": 0.82086164, "num_input_tokens_seen": 139557540, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.1696167, "step": 6500, "time_per_iteration": 2.6457407474517822 }, { "auxiliary_loss_clip": 0.06484228, "auxiliary_loss_mlp": 0.0127546, "balance_loss_clip": 0.06304562, "balance_loss_mlp": 0.01260082, "epoch": 0.39086126559446865, "flos": 24468971080320.0, "grad_norm": 2.4887884978485797, "language_loss": 0.69162488, "learning_rate": 2.7825595305974354e-06, "loss": 0.76922178, "num_input_tokens_seen": 139576875, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.15380859, "step": 6501, "time_per_iteration": 2.6199164390563965 }, { "auxiliary_loss_clip": 0.06466551, "auxiliary_loss_mlp": 0.01273335, "balance_loss_clip": 0.06292685, "balance_loss_mlp": 0.01257766, "epoch": 0.3909213888471366, "flos": 16946406276480.0, "grad_norm": 1.5861308282188495, "language_loss": 0.7908709, "learning_rate": 2.782201105168287e-06, "loss": 0.8682698, "num_input_tokens_seen": 139594295, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.15551758, "step": 6502, "time_per_iteration": 2.546189308166504 }, { "auxiliary_loss_clip": 0.06472834, "auxiliary_loss_mlp": 0.01272721, "balance_loss_clip": 0.06302901, "balance_loss_mlp": 0.01258922, "epoch": 0.3909815120998046, "flos": 29286109981440.0, "grad_norm": 4.335240355420341, "language_loss": 0.80637097, "learning_rate": 2.7818426500779932e-06, "loss": 0.88382649, "num_input_tokens_seen": 139614080, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.13800049, "step": 6503, "time_per_iteration": 2.623750686645508 }, { "auxiliary_loss_clip": 0.06465366, "auxiliary_loss_mlp": 0.01270308, "balance_loss_clip": 0.06293119, "balance_loss_mlp": 0.01255228, "epoch": 0.39104163535247255, "flos": 18956947069440.0, "grad_norm": 1.6801368291648775, "language_loss": 0.72431219, "learning_rate": 2.7814841653401485e-06, "loss": 0.80166894, "num_input_tokens_seen": 139632755, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.15087891, "step": 6504, "time_per_iteration": 4.007492303848267 }, { "auxiliary_loss_clip": 0.06468549, "auxiliary_loss_mlp": 0.01272834, "balance_loss_clip": 0.06293796, "balance_loss_mlp": 0.01258207, "epoch": 0.3911017586051405, "flos": 26330611968000.0, "grad_norm": 1.4432465373425096, "language_loss": 0.83432508, "learning_rate": 2.7811256509683454e-06, "loss": 0.91173899, "num_input_tokens_seen": 139654205, "router_z_loss_clip": 1.74804688, "router_z_loss_mlp": 0.14630127, "step": 6505, "time_per_iteration": 2.604710340499878 }, { "auxiliary_loss_clip": 0.06463003, "auxiliary_loss_mlp": 0.01270427, "balance_loss_clip": 0.06290169, "balance_loss_mlp": 0.01254668, "epoch": 0.3911618818578085, "flos": 21842313615360.0, "grad_norm": 2.2642275909694334, "language_loss": 0.7195071, "learning_rate": 2.7807671069761797e-06, "loss": 0.79684138, "num_input_tokens_seen": 139673595, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.15771484, "step": 6506, "time_per_iteration": 2.5581798553466797 }, { "auxiliary_loss_clip": 0.0646158, "auxiliary_loss_mlp": 0.01273139, "balance_loss_clip": 0.06291332, "balance_loss_mlp": 0.0125915, "epoch": 0.3912220051104765, "flos": 16364768359680.0, "grad_norm": 1.7683667880624154, "language_loss": 0.75711638, "learning_rate": 2.7804085333772477e-06, "loss": 0.83446348, "num_input_tokens_seen": 139690565, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.13983154, "step": 6507, "time_per_iteration": 2.5249857902526855 }, { "auxiliary_loss_clip": 0.06381582, "auxiliary_loss_mlp": 0.01262711, "balance_loss_clip": 0.0630487, "balance_loss_mlp": 0.01259266, "epoch": 0.39128212836314447, "flos": 71071179552000.0, "grad_norm": 0.748908376023126, "language_loss": 0.56584716, "learning_rate": 2.7800499301851446e-06, "loss": 0.64229012, "num_input_tokens_seen": 139749420, "router_z_loss_clip": 0.765625, "router_z_loss_mlp": 0.03451538, "step": 6508, "time_per_iteration": 3.343287467956543 }, { "auxiliary_loss_clip": 0.06471968, "auxiliary_loss_mlp": 0.0127182, "balance_loss_clip": 0.06296901, "balance_loss_mlp": 0.01257134, "epoch": 0.39134225161581243, "flos": 20336948288640.0, "grad_norm": 1.89227950145983, "language_loss": 0.7705586, "learning_rate": 2.779691297413471e-06, "loss": 0.84799647, "num_input_tokens_seen": 139766265, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.14685059, "step": 6509, "time_per_iteration": 2.5578041076660156 }, { "auxiliary_loss_clip": 0.06472141, "auxiliary_loss_mlp": 0.01272749, "balance_loss_clip": 0.06298508, "balance_loss_mlp": 0.01256751, "epoch": 0.3914023748684804, "flos": 17023916903040.0, "grad_norm": 2.5711565035182122, "language_loss": 0.83643556, "learning_rate": 2.779332635075825e-06, "loss": 0.91388446, "num_input_tokens_seen": 139782400, "router_z_loss_clip": 1.73730469, "router_z_loss_mlp": 0.16003418, "step": 6510, "time_per_iteration": 2.5318610668182373 }, { "auxiliary_loss_clip": 0.06475578, "auxiliary_loss_mlp": 0.01271539, "balance_loss_clip": 0.06297702, "balance_loss_mlp": 0.01256936, "epoch": 0.39146249812114836, "flos": 18411045719040.0, "grad_norm": 1.7414087424323703, "language_loss": 0.77525705, "learning_rate": 2.7789739431858073e-06, "loss": 0.85272825, "num_input_tokens_seen": 139801435, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.14611816, "step": 6511, "time_per_iteration": 2.5527753829956055 }, { "auxiliary_loss_clip": 0.06387582, "auxiliary_loss_mlp": 0.01261488, "balance_loss_clip": 0.06311103, "balance_loss_mlp": 0.0125855, "epoch": 0.3915226213738163, "flos": 67659659291520.0, "grad_norm": 0.7088799863813983, "language_loss": 0.57739836, "learning_rate": 2.7786152217570196e-06, "loss": 0.65388906, "num_input_tokens_seen": 139869700, "router_z_loss_clip": 0.765625, "router_z_loss_mlp": 0.02935791, "step": 6512, "time_per_iteration": 3.309943914413452 }, { "auxiliary_loss_clip": 0.06463053, "auxiliary_loss_mlp": 0.01276097, "balance_loss_clip": 0.06287381, "balance_loss_mlp": 0.01259765, "epoch": 0.3915827446264843, "flos": 26366516242560.0, "grad_norm": 1.7548159494263549, "language_loss": 0.69511497, "learning_rate": 2.7782564708030647e-06, "loss": 0.77250648, "num_input_tokens_seen": 139890140, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.16333008, "step": 6513, "time_per_iteration": 2.620110511779785 }, { "auxiliary_loss_clip": 0.06471699, "auxiliary_loss_mlp": 0.01276294, "balance_loss_clip": 0.06291997, "balance_loss_mlp": 0.01261077, "epoch": 0.39164286787915226, "flos": 21950236074240.0, "grad_norm": 2.8054893205812634, "language_loss": 0.76804775, "learning_rate": 2.7778976903375464e-06, "loss": 0.84552771, "num_input_tokens_seen": 139908020, "router_z_loss_clip": 1.79394531, "router_z_loss_mlp": 0.15216064, "step": 6514, "time_per_iteration": 2.5558369159698486 }, { "auxiliary_loss_clip": 0.06465326, "auxiliary_loss_mlp": 0.01270211, "balance_loss_clip": 0.0629065, "balance_loss_mlp": 0.01255739, "epoch": 0.3917029911318202, "flos": 16405536170880.0, "grad_norm": 1.6518308139383104, "language_loss": 0.77625453, "learning_rate": 2.7775388803740693e-06, "loss": 0.85360992, "num_input_tokens_seen": 139926180, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.14459229, "step": 6515, "time_per_iteration": 2.5542831420898438 }, { "auxiliary_loss_clip": 0.06462134, "auxiliary_loss_mlp": 0.01271048, "balance_loss_clip": 0.06289984, "balance_loss_mlp": 0.01257148, "epoch": 0.3917631143844882, "flos": 26218580659200.0, "grad_norm": 1.3227298378394912, "language_loss": 0.79975319, "learning_rate": 2.7771800409262406e-06, "loss": 0.87708503, "num_input_tokens_seen": 139947420, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.13903809, "step": 6516, "time_per_iteration": 2.6392478942871094 }, { "auxiliary_loss_clip": 0.06464334, "auxiliary_loss_mlp": 0.01274964, "balance_loss_clip": 0.06289699, "balance_loss_mlp": 0.01260385, "epoch": 0.39182323763715615, "flos": 18553740422400.0, "grad_norm": 1.7579550338715197, "language_loss": 0.70504403, "learning_rate": 2.7768211720076665e-06, "loss": 0.78243703, "num_input_tokens_seen": 139965800, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.14581299, "step": 6517, "time_per_iteration": 2.569417953491211 }, { "auxiliary_loss_clip": 0.0646617, "auxiliary_loss_mlp": 0.0126928, "balance_loss_clip": 0.06291063, "balance_loss_mlp": 0.01254259, "epoch": 0.3918833608898241, "flos": 34322112547200.0, "grad_norm": 1.807744589724856, "language_loss": 0.72471786, "learning_rate": 2.776462273631956e-06, "loss": 0.80207241, "num_input_tokens_seen": 139988140, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.15014648, "step": 6518, "time_per_iteration": 2.6544811725616455 }, { "auxiliary_loss_clip": 0.06466978, "auxiliary_loss_mlp": 0.01272581, "balance_loss_clip": 0.06289844, "balance_loss_mlp": 0.01257078, "epoch": 0.3919434841424921, "flos": 36948434595840.0, "grad_norm": 2.7570474017161586, "language_loss": 0.61992157, "learning_rate": 2.7761033458127177e-06, "loss": 0.69731724, "num_input_tokens_seen": 140010060, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.15496826, "step": 6519, "time_per_iteration": 2.6998074054718018 }, { "auxiliary_loss_clip": 0.06473879, "auxiliary_loss_mlp": 0.01270103, "balance_loss_clip": 0.0629075, "balance_loss_mlp": 0.01253629, "epoch": 0.3920036073951601, "flos": 23514915692160.0, "grad_norm": 2.049925414158837, "language_loss": 0.67975456, "learning_rate": 2.775744388563563e-06, "loss": 0.7571944, "num_input_tokens_seen": 140029400, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.16455078, "step": 6520, "time_per_iteration": 2.5670464038848877 }, { "auxiliary_loss_clip": 0.06464586, "auxiliary_loss_mlp": 0.01271909, "balance_loss_clip": 0.06290452, "balance_loss_mlp": 0.01257706, "epoch": 0.39206373064782807, "flos": 18412051968000.0, "grad_norm": 2.053444432917538, "language_loss": 0.79159915, "learning_rate": 2.775385401898104e-06, "loss": 0.86896408, "num_input_tokens_seen": 140048940, "router_z_loss_clip": 1.74121094, "router_z_loss_mlp": 0.14208984, "step": 6521, "time_per_iteration": 2.520650625228882 }, { "auxiliary_loss_clip": 0.06477685, "auxiliary_loss_mlp": 0.01273955, "balance_loss_clip": 0.06295177, "balance_loss_mlp": 0.01256289, "epoch": 0.39212385390049603, "flos": 12318012696960.0, "grad_norm": 2.147953860607244, "language_loss": 0.70877528, "learning_rate": 2.775026385829952e-06, "loss": 0.78629166, "num_input_tokens_seen": 140066380, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.17675781, "step": 6522, "time_per_iteration": 2.512725353240967 }, { "auxiliary_loss_clip": 0.06467029, "auxiliary_loss_mlp": 0.01268273, "balance_loss_clip": 0.06288755, "balance_loss_mlp": 0.01253241, "epoch": 0.392183977153164, "flos": 19725275882880.0, "grad_norm": 2.023544961667575, "language_loss": 0.77064502, "learning_rate": 2.774667340372722e-06, "loss": 0.84799802, "num_input_tokens_seen": 140085275, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.15032959, "step": 6523, "time_per_iteration": 2.546870708465576 }, { "auxiliary_loss_clip": 0.06470917, "auxiliary_loss_mlp": 0.01273618, "balance_loss_clip": 0.06293121, "balance_loss_mlp": 0.01259551, "epoch": 0.39224410040583196, "flos": 33153092709120.0, "grad_norm": 3.064583847462686, "language_loss": 0.62469184, "learning_rate": 2.7743082655400293e-06, "loss": 0.70213723, "num_input_tokens_seen": 140105105, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.140625, "step": 6524, "time_per_iteration": 2.657179355621338 }, { "auxiliary_loss_clip": 0.064651, "auxiliary_loss_mlp": 0.01270334, "balance_loss_clip": 0.06289083, "balance_loss_mlp": 0.01255117, "epoch": 0.39230422365849993, "flos": 27789884749440.0, "grad_norm": 1.5268266000571926, "language_loss": 0.74327099, "learning_rate": 2.773949161345489e-06, "loss": 0.82062542, "num_input_tokens_seen": 140125645, "router_z_loss_clip": 1.76074219, "router_z_loss_mlp": 0.15216064, "step": 6525, "time_per_iteration": 2.607771396636963 }, { "auxiliary_loss_clip": 0.06472005, "auxiliary_loss_mlp": 0.01273375, "balance_loss_clip": 0.06293683, "balance_loss_mlp": 0.01258665, "epoch": 0.3923643469111679, "flos": 17937497969280.0, "grad_norm": 1.930911237862843, "language_loss": 0.81546938, "learning_rate": 2.773590027802719e-06, "loss": 0.89292324, "num_input_tokens_seen": 140141925, "router_z_loss_clip": 1.78222656, "router_z_loss_mlp": 0.14697266, "step": 6526, "time_per_iteration": 2.5325827598571777 }, { "auxiliary_loss_clip": 0.06468645, "auxiliary_loss_mlp": 0.01270498, "balance_loss_clip": 0.06291406, "balance_loss_mlp": 0.01255538, "epoch": 0.39242447016383586, "flos": 24066141776640.0, "grad_norm": 1.724087291479139, "language_loss": 0.70496356, "learning_rate": 2.7732308649253383e-06, "loss": 0.78235495, "num_input_tokens_seen": 140160965, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.1494751, "step": 6527, "time_per_iteration": 2.5810534954071045 }, { "auxiliary_loss_clip": 0.06469392, "auxiliary_loss_mlp": 0.01270824, "balance_loss_clip": 0.06293459, "balance_loss_mlp": 0.01255994, "epoch": 0.3924845934165038, "flos": 10667562825600.0, "grad_norm": 2.3544226773836607, "language_loss": 0.83299088, "learning_rate": 2.772871672726965e-06, "loss": 0.910393, "num_input_tokens_seen": 140177780, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.1484375, "step": 6528, "time_per_iteration": 2.5233166217803955 }, { "auxiliary_loss_clip": 0.06466423, "auxiliary_loss_mlp": 0.01272641, "balance_loss_clip": 0.06294325, "balance_loss_mlp": 0.01258414, "epoch": 0.3925447166691718, "flos": 31253493121920.0, "grad_norm": 1.8009510333471188, "language_loss": 0.68748999, "learning_rate": 2.7725124512212205e-06, "loss": 0.76488066, "num_input_tokens_seen": 140201660, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.14227295, "step": 6529, "time_per_iteration": 4.096567153930664 }, { "auxiliary_loss_clip": 0.06475715, "auxiliary_loss_mlp": 0.01268424, "balance_loss_clip": 0.06296115, "balance_loss_mlp": 0.01252474, "epoch": 0.39260483992183975, "flos": 29421215890560.0, "grad_norm": 3.8744836786855426, "language_loss": 0.79717505, "learning_rate": 2.7721532004217267e-06, "loss": 0.87461638, "num_input_tokens_seen": 140218585, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.1595459, "step": 6530, "time_per_iteration": 2.58012318611145 }, { "auxiliary_loss_clip": 0.06463628, "auxiliary_loss_mlp": 0.01272416, "balance_loss_clip": 0.06289371, "balance_loss_mlp": 0.01257813, "epoch": 0.3926649631745077, "flos": 22864571827200.0, "grad_norm": 1.4331456326744014, "language_loss": 0.7569412, "learning_rate": 2.7717939203421063e-06, "loss": 0.83430159, "num_input_tokens_seen": 140239905, "router_z_loss_clip": 1.74121094, "router_z_loss_mlp": 0.14611816, "step": 6531, "time_per_iteration": 2.5665700435638428 }, { "auxiliary_loss_clip": 0.06363243, "auxiliary_loss_mlp": 0.01279909, "balance_loss_clip": 0.06286731, "balance_loss_mlp": 0.01277088, "epoch": 0.3927250864271757, "flos": 63911892124800.0, "grad_norm": 0.8049413791258488, "language_loss": 0.60232282, "learning_rate": 2.7714346109959822e-06, "loss": 0.67875433, "num_input_tokens_seen": 140293820, "router_z_loss_clip": 0.765625, "router_z_loss_mlp": 0.02818298, "step": 6532, "time_per_iteration": 3.0410637855529785 }, { "auxiliary_loss_clip": 0.06360857, "auxiliary_loss_mlp": 0.01281499, "balance_loss_clip": 0.06283495, "balance_loss_mlp": 0.01277959, "epoch": 0.3927852096798437, "flos": 68931486489600.0, "grad_norm": 0.772825773732952, "language_loss": 0.55449617, "learning_rate": 2.771075272396981e-06, "loss": 0.63091969, "num_input_tokens_seen": 140360420, "router_z_loss_clip": 0.7734375, "router_z_loss_mlp": 0.03543091, "step": 6533, "time_per_iteration": 4.708561897277832 }, { "auxiliary_loss_clip": 0.06467484, "auxiliary_loss_mlp": 0.0127498, "balance_loss_clip": 0.06288243, "balance_loss_mlp": 0.01259257, "epoch": 0.39284533293251167, "flos": 29723711529600.0, "grad_norm": 3.661921237820719, "language_loss": 0.76612949, "learning_rate": 2.7707159045587284e-06, "loss": 0.84355414, "num_input_tokens_seen": 140381950, "router_z_loss_clip": 1.79199219, "router_z_loss_mlp": 0.15722656, "step": 6534, "time_per_iteration": 2.6234230995178223 }, { "auxiliary_loss_clip": 0.06472618, "auxiliary_loss_mlp": 0.0127116, "balance_loss_clip": 0.06292208, "balance_loss_mlp": 0.01256127, "epoch": 0.39290545618517964, "flos": 18558016980480.0, "grad_norm": 1.991754536773328, "language_loss": 0.78571707, "learning_rate": 2.770356507494851e-06, "loss": 0.86315489, "num_input_tokens_seen": 140399410, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.15026855, "step": 6535, "time_per_iteration": 2.5097804069519043 }, { "auxiliary_loss_clip": 0.0646438, "auxiliary_loss_mlp": 0.01268696, "balance_loss_clip": 0.06292827, "balance_loss_mlp": 0.01255493, "epoch": 0.3929655794378476, "flos": 26256581285760.0, "grad_norm": 2.932506012814915, "language_loss": 0.69324082, "learning_rate": 2.769997081218978e-06, "loss": 0.77057159, "num_input_tokens_seen": 140419055, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.13195801, "step": 6536, "time_per_iteration": 2.587951421737671 }, { "auxiliary_loss_clip": 0.06462365, "auxiliary_loss_mlp": 0.01274952, "balance_loss_clip": 0.06293432, "balance_loss_mlp": 0.01260927, "epoch": 0.39302570269051557, "flos": 29285564929920.0, "grad_norm": 2.0621595690854715, "language_loss": 0.69893974, "learning_rate": 2.769637625744738e-06, "loss": 0.77631295, "num_input_tokens_seen": 140438800, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.14013672, "step": 6537, "time_per_iteration": 2.592186689376831 }, { "auxiliary_loss_clip": 0.06471238, "auxiliary_loss_mlp": 0.01269299, "balance_loss_clip": 0.06295872, "balance_loss_mlp": 0.01255065, "epoch": 0.39308582594318353, "flos": 17353134794880.0, "grad_norm": 1.6555376028302164, "language_loss": 0.79505932, "learning_rate": 2.769278141085763e-06, "loss": 0.87246466, "num_input_tokens_seen": 140456880, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.14227295, "step": 6538, "time_per_iteration": 2.5392563343048096 }, { "auxiliary_loss_clip": 0.06357487, "auxiliary_loss_mlp": 0.01254975, "balance_loss_clip": 0.06281065, "balance_loss_mlp": 0.01251387, "epoch": 0.3931459491958515, "flos": 61023884175360.0, "grad_norm": 0.7906759640693892, "language_loss": 0.61754096, "learning_rate": 2.768918627255683e-06, "loss": 0.69366556, "num_input_tokens_seen": 140507510, "router_z_loss_clip": 0.76318359, "router_z_loss_mlp": 0.03585815, "step": 6539, "time_per_iteration": 4.318203449249268 }, { "auxiliary_loss_clip": 0.0647044, "auxiliary_loss_mlp": 0.01270764, "balance_loss_clip": 0.06296512, "balance_loss_mlp": 0.01256197, "epoch": 0.39320607244851946, "flos": 39024662590080.0, "grad_norm": 2.252125422769885, "language_loss": 0.67936128, "learning_rate": 2.7685590842681315e-06, "loss": 0.75677329, "num_input_tokens_seen": 140528740, "router_z_loss_clip": 1.73730469, "router_z_loss_mlp": 0.14544678, "step": 6540, "time_per_iteration": 2.7415075302124023 }, { "auxiliary_loss_clip": 0.06468831, "auxiliary_loss_mlp": 0.01271725, "balance_loss_clip": 0.06295255, "balance_loss_mlp": 0.01256668, "epoch": 0.3932661957011874, "flos": 24686451152640.0, "grad_norm": 1.6066496211295054, "language_loss": 0.72815204, "learning_rate": 2.7681995121367433e-06, "loss": 0.80555761, "num_input_tokens_seen": 140547560, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.1505127, "step": 6541, "time_per_iteration": 2.6194567680358887 }, { "auxiliary_loss_clip": 0.06354367, "auxiliary_loss_mlp": 0.01267425, "balance_loss_clip": 0.06278044, "balance_loss_mlp": 0.01264164, "epoch": 0.3933263189538554, "flos": 70115614790400.0, "grad_norm": 0.8077815582810193, "language_loss": 0.60358852, "learning_rate": 2.7678399108751516e-06, "loss": 0.67980647, "num_input_tokens_seen": 140601175, "router_z_loss_clip": 0.76171875, "router_z_loss_mlp": 0.03265381, "step": 6542, "time_per_iteration": 3.0338473320007324 }, { "auxiliary_loss_clip": 0.06469254, "auxiliary_loss_mlp": 0.0127744, "balance_loss_clip": 0.06293648, "balance_loss_mlp": 0.01263475, "epoch": 0.39338644220652336, "flos": 22935583762560.0, "grad_norm": 1.5189348324664902, "language_loss": 0.82769239, "learning_rate": 2.7674802804969947e-06, "loss": 0.90515929, "num_input_tokens_seen": 140622200, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.13970947, "step": 6543, "time_per_iteration": 2.5741662979125977 }, { "auxiliary_loss_clip": 0.06470205, "auxiliary_loss_mlp": 0.01273571, "balance_loss_clip": 0.06296606, "balance_loss_mlp": 0.01259147, "epoch": 0.3934465654591913, "flos": 30856282041600.0, "grad_norm": 1.5702892291350072, "language_loss": 0.69007504, "learning_rate": 2.767120621015908e-06, "loss": 0.7675128, "num_input_tokens_seen": 140643125, "router_z_loss_clip": 1.73730469, "router_z_loss_mlp": 0.14440918, "step": 6544, "time_per_iteration": 4.097050905227661 }, { "auxiliary_loss_clip": 0.06478105, "auxiliary_loss_mlp": 0.0127703, "balance_loss_clip": 0.06298532, "balance_loss_mlp": 0.01260913, "epoch": 0.3935066887118593, "flos": 29243329672320.0, "grad_norm": 2.0081610493391655, "language_loss": 0.75881875, "learning_rate": 2.76676093244553e-06, "loss": 0.83637011, "num_input_tokens_seen": 140662500, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.16113281, "step": 6545, "time_per_iteration": 2.71901798248291 }, { "auxiliary_loss_clip": 0.06456158, "auxiliary_loss_mlp": 0.01271581, "balance_loss_clip": 0.06291258, "balance_loss_mlp": 0.01258385, "epoch": 0.3935668119645273, "flos": 19141290051840.0, "grad_norm": 1.6080855899443482, "language_loss": 0.74919081, "learning_rate": 2.7664012147995015e-06, "loss": 0.82646823, "num_input_tokens_seen": 140681960, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.13201904, "step": 6546, "time_per_iteration": 2.5578572750091553 }, { "auxiliary_loss_clip": 0.06480284, "auxiliary_loss_mlp": 0.01269638, "balance_loss_clip": 0.06297939, "balance_loss_mlp": 0.01254546, "epoch": 0.3936269352171953, "flos": 18522196560000.0, "grad_norm": 1.9446089559170225, "language_loss": 0.82287276, "learning_rate": 2.7660414680914617e-06, "loss": 0.90037197, "num_input_tokens_seen": 140699170, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.15075684, "step": 6547, "time_per_iteration": 2.5334601402282715 }, { "auxiliary_loss_clip": 0.06469487, "auxiliary_loss_mlp": 0.01271663, "balance_loss_clip": 0.06294668, "balance_loss_mlp": 0.01257936, "epoch": 0.39368705846986324, "flos": 15638255533440.0, "grad_norm": 1.8968101625146179, "language_loss": 0.84535384, "learning_rate": 2.7656816923350525e-06, "loss": 0.92276531, "num_input_tokens_seen": 140714920, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.137146, "step": 6548, "time_per_iteration": 2.5282115936279297 }, { "auxiliary_loss_clip": 0.06462725, "auxiliary_loss_mlp": 0.0127796, "balance_loss_clip": 0.0629219, "balance_loss_mlp": 0.01263906, "epoch": 0.3937471817225312, "flos": 21332442320640.0, "grad_norm": 1.475509642460697, "language_loss": 0.73010498, "learning_rate": 2.7653218875439174e-06, "loss": 0.80751181, "num_input_tokens_seen": 140734595, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.14031982, "step": 6549, "time_per_iteration": 2.5638298988342285 }, { "auxiliary_loss_clip": 0.06466807, "auxiliary_loss_mlp": 0.01273968, "balance_loss_clip": 0.0629216, "balance_loss_mlp": 0.01258268, "epoch": 0.39380730497519917, "flos": 20782893317760.0, "grad_norm": 1.6103260648506335, "language_loss": 0.78135645, "learning_rate": 2.764962053731699e-06, "loss": 0.85876423, "num_input_tokens_seen": 140754050, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.15710449, "step": 6550, "time_per_iteration": 2.579869508743286 }, { "auxiliary_loss_clip": 0.06464683, "auxiliary_loss_mlp": 0.01272778, "balance_loss_clip": 0.06290667, "balance_loss_mlp": 0.01258938, "epoch": 0.39386742822786713, "flos": 21615106469760.0, "grad_norm": 1.7344294356578165, "language_loss": 0.81605029, "learning_rate": 2.7646021909120434e-06, "loss": 0.89342487, "num_input_tokens_seen": 140771440, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.13867188, "step": 6551, "time_per_iteration": 2.5553743839263916 }, { "auxiliary_loss_clip": 0.06469314, "auxiliary_loss_mlp": 0.01271557, "balance_loss_clip": 0.06293687, "balance_loss_mlp": 0.01257222, "epoch": 0.3939275514805351, "flos": 12418304434560.0, "grad_norm": 2.0284388363035024, "language_loss": 0.80863392, "learning_rate": 2.764242299098596e-06, "loss": 0.88604259, "num_input_tokens_seen": 140786715, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.14331055, "step": 6552, "time_per_iteration": 2.5302891731262207 }, { "auxiliary_loss_clip": 0.06472848, "auxiliary_loss_mlp": 0.01270002, "balance_loss_clip": 0.0629455, "balance_loss_mlp": 0.0125491, "epoch": 0.39398767473320306, "flos": 18558016980480.0, "grad_norm": 2.8625817202727286, "language_loss": 0.71603686, "learning_rate": 2.763882378305003e-06, "loss": 0.79346538, "num_input_tokens_seen": 140804950, "router_z_loss_clip": 1.78417969, "router_z_loss_mlp": 0.15087891, "step": 6553, "time_per_iteration": 2.5478098392486572 }, { "auxiliary_loss_clip": 0.06467324, "auxiliary_loss_mlp": 0.01274265, "balance_loss_clip": 0.06295623, "balance_loss_mlp": 0.01259734, "epoch": 0.39404779798587103, "flos": 29315599418880.0, "grad_norm": 1.6421638529154814, "language_loss": 0.6433807, "learning_rate": 2.7635224285449144e-06, "loss": 0.72079659, "num_input_tokens_seen": 140822800, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.14526367, "step": 6554, "time_per_iteration": 2.621081590652466 }, { "auxiliary_loss_clip": 0.0646589, "auxiliary_loss_mlp": 0.01274478, "balance_loss_clip": 0.06291839, "balance_loss_mlp": 0.01259982, "epoch": 0.394107921238539, "flos": 34905679107840.0, "grad_norm": 1.8381571554978038, "language_loss": 0.79332459, "learning_rate": 2.7631624498319796e-06, "loss": 0.87072825, "num_input_tokens_seen": 140842940, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.14489746, "step": 6555, "time_per_iteration": 2.6634278297424316 }, { "auxiliary_loss_clip": 0.06472169, "auxiliary_loss_mlp": 0.01274437, "balance_loss_clip": 0.06295613, "balance_loss_mlp": 0.01259416, "epoch": 0.39416804449120696, "flos": 25088232280320.0, "grad_norm": 1.7349945333218637, "language_loss": 0.71881807, "learning_rate": 2.7628024421798473e-06, "loss": 0.79628408, "num_input_tokens_seen": 140863060, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.15026855, "step": 6556, "time_per_iteration": 2.593109607696533 }, { "auxiliary_loss_clip": 0.0646475, "auxiliary_loss_mlp": 0.01271235, "balance_loss_clip": 0.06290232, "balance_loss_mlp": 0.01256537, "epoch": 0.3942281677438749, "flos": 32314842063360.0, "grad_norm": 1.9695029780717637, "language_loss": 0.8351419, "learning_rate": 2.7624424056021705e-06, "loss": 0.91250175, "num_input_tokens_seen": 140883795, "router_z_loss_clip": 1.74414062, "router_z_loss_mlp": 0.14709473, "step": 6557, "time_per_iteration": 2.6675593852996826 }, { "auxiliary_loss_clip": 0.0646082, "auxiliary_loss_mlp": 0.01270503, "balance_loss_clip": 0.06288739, "balance_loss_mlp": 0.01256221, "epoch": 0.3942882909965429, "flos": 24943608933120.0, "grad_norm": 3.445505159759472, "language_loss": 0.80637991, "learning_rate": 2.7620823401126004e-06, "loss": 0.88369322, "num_input_tokens_seen": 140903055, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.14263916, "step": 6558, "time_per_iteration": 2.723607063293457 }, { "auxiliary_loss_clip": 0.06464005, "auxiliary_loss_mlp": 0.01269837, "balance_loss_clip": 0.06292351, "balance_loss_mlp": 0.01256646, "epoch": 0.39434841424921085, "flos": 11879614535040.0, "grad_norm": 1.796178230572194, "language_loss": 0.71874136, "learning_rate": 2.761722245724792e-06, "loss": 0.79607975, "num_input_tokens_seen": 140920685, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.13189697, "step": 6559, "time_per_iteration": 2.5387604236602783 }, { "auxiliary_loss_clip": 0.06480406, "auxiliary_loss_mlp": 0.01273944, "balance_loss_clip": 0.06300048, "balance_loss_mlp": 0.01259299, "epoch": 0.3944085375018789, "flos": 16367032419840.0, "grad_norm": 2.1164121148698807, "language_loss": 0.80974501, "learning_rate": 2.7613621224524003e-06, "loss": 0.88728857, "num_input_tokens_seen": 140937320, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.1463623, "step": 6560, "time_per_iteration": 2.5445046424865723 }, { "auxiliary_loss_clip": 0.06473662, "auxiliary_loss_mlp": 0.01271832, "balance_loss_clip": 0.0629743, "balance_loss_mlp": 0.0125714, "epoch": 0.39446866075454684, "flos": 10637821825920.0, "grad_norm": 3.002525290994051, "language_loss": 0.83055449, "learning_rate": 2.7610019703090803e-06, "loss": 0.90800935, "num_input_tokens_seen": 140954855, "router_z_loss_clip": 1.76269531, "router_z_loss_mlp": 0.14678955, "step": 6561, "time_per_iteration": 2.585005283355713 }, { "auxiliary_loss_clip": 0.06466885, "auxiliary_loss_mlp": 0.01273673, "balance_loss_clip": 0.06293905, "balance_loss_mlp": 0.0126006, "epoch": 0.3945287840072148, "flos": 18193481792640.0, "grad_norm": 1.9977641621433904, "language_loss": 0.80204368, "learning_rate": 2.7606417893084887e-06, "loss": 0.87944925, "num_input_tokens_seen": 140973250, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.13616943, "step": 6562, "time_per_iteration": 2.540339708328247 }, { "auxiliary_loss_clip": 0.06464472, "auxiliary_loss_mlp": 0.01273104, "balance_loss_clip": 0.06292585, "balance_loss_mlp": 0.01258691, "epoch": 0.39458890725988277, "flos": 23046650749440.0, "grad_norm": 1.5356931620698269, "language_loss": 0.81834501, "learning_rate": 2.7602815794642853e-06, "loss": 0.89572072, "num_input_tokens_seen": 140993050, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.144104, "step": 6563, "time_per_iteration": 2.593371868133545 }, { "auxiliary_loss_clip": 0.06476884, "auxiliary_loss_mlp": 0.01270336, "balance_loss_clip": 0.06301285, "balance_loss_mlp": 0.01256382, "epoch": 0.39464903051255074, "flos": 17163718640640.0, "grad_norm": 1.9403600674983048, "language_loss": 0.70110077, "learning_rate": 2.759921340790127e-06, "loss": 0.77857298, "num_input_tokens_seen": 141010815, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.13946533, "step": 6564, "time_per_iteration": 2.5667757987976074 }, { "auxiliary_loss_clip": 0.06470895, "auxiliary_loss_mlp": 0.01270957, "balance_loss_clip": 0.06294987, "balance_loss_mlp": 0.01256688, "epoch": 0.3947091537652187, "flos": 15894616700160.0, "grad_norm": 2.0948989551960597, "language_loss": 0.83130592, "learning_rate": 2.759561073299676e-06, "loss": 0.90872443, "num_input_tokens_seen": 141028720, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.14257812, "step": 6565, "time_per_iteration": 2.532411813735962 }, { "auxiliary_loss_clip": 0.06471225, "auxiliary_loss_mlp": 0.01270257, "balance_loss_clip": 0.06297447, "balance_loss_mlp": 0.01256452, "epoch": 0.39476927701788667, "flos": 18550386259200.0, "grad_norm": 1.7837350156431733, "language_loss": 0.83816445, "learning_rate": 2.7592007770065937e-06, "loss": 0.9155792, "num_input_tokens_seen": 141046025, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.13812256, "step": 6566, "time_per_iteration": 2.526761293411255 }, { "auxiliary_loss_clip": 0.06480436, "auxiliary_loss_mlp": 0.01269832, "balance_loss_clip": 0.06298289, "balance_loss_mlp": 0.01254949, "epoch": 0.39482940027055463, "flos": 22282682348160.0, "grad_norm": 1.8530710325620663, "language_loss": 0.77416879, "learning_rate": 2.7588404519245403e-06, "loss": 0.85167146, "num_input_tokens_seen": 141066865, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.14874268, "step": 6567, "time_per_iteration": 2.5929815769195557 }, { "auxiliary_loss_clip": 0.06466939, "auxiliary_loss_mlp": 0.01271283, "balance_loss_clip": 0.06298164, "balance_loss_mlp": 0.01257801, "epoch": 0.3948895235232226, "flos": 14763010510080.0, "grad_norm": 1.8740130648627669, "language_loss": 0.8082518, "learning_rate": 2.758480098067182e-06, "loss": 0.88563401, "num_input_tokens_seen": 141084210, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.1348877, "step": 6568, "time_per_iteration": 3.9675676822662354 }, { "auxiliary_loss_clip": 0.06472635, "auxiliary_loss_mlp": 0.01274193, "balance_loss_clip": 0.06298627, "balance_loss_mlp": 0.01259197, "epoch": 0.39494964677589056, "flos": 22572474094080.0, "grad_norm": 1.587557581617473, "language_loss": 0.85321391, "learning_rate": 2.7581197154481816e-06, "loss": 0.93068218, "num_input_tokens_seen": 141103895, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.14978027, "step": 6569, "time_per_iteration": 2.5716068744659424 }, { "auxiliary_loss_clip": 0.06463183, "auxiliary_loss_mlp": 0.01272514, "balance_loss_clip": 0.06293222, "balance_loss_mlp": 0.01258566, "epoch": 0.3950097700285585, "flos": 22969307831040.0, "grad_norm": 1.880145847158742, "language_loss": 0.74828863, "learning_rate": 2.7577593040812066e-06, "loss": 0.82564557, "num_input_tokens_seen": 141124000, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.13934326, "step": 6570, "time_per_iteration": 2.5769236087799072 }, { "auxiliary_loss_clip": 0.06470969, "auxiliary_loss_mlp": 0.01269936, "balance_loss_clip": 0.06294674, "balance_loss_mlp": 0.01255476, "epoch": 0.3950698932812265, "flos": 20601569082240.0, "grad_norm": 1.5324132981353245, "language_loss": 0.80046606, "learning_rate": 2.757398863979922e-06, "loss": 0.87787521, "num_input_tokens_seen": 141142535, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.14477539, "step": 6571, "time_per_iteration": 2.5357108116149902 }, { "auxiliary_loss_clip": 0.06474578, "auxiliary_loss_mlp": 0.0127528, "balance_loss_clip": 0.06299534, "balance_loss_mlp": 0.01261404, "epoch": 0.39513001653389446, "flos": 20381992657920.0, "grad_norm": 1.682892258749633, "language_loss": 0.78224754, "learning_rate": 2.757038395157997e-06, "loss": 0.8597461, "num_input_tokens_seen": 141161575, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.13879395, "step": 6572, "time_per_iteration": 3.9876925945281982 }, { "auxiliary_loss_clip": 0.06475143, "auxiliary_loss_mlp": 0.01272134, "balance_loss_clip": 0.06298868, "balance_loss_mlp": 0.01258032, "epoch": 0.3951901397865625, "flos": 26469994435200.0, "grad_norm": 1.9624476434353677, "language_loss": 0.75553447, "learning_rate": 2.7566778976291002e-06, "loss": 0.83300722, "num_input_tokens_seen": 141181150, "router_z_loss_clip": 1.76171875, "router_z_loss_mlp": 0.14099121, "step": 6573, "time_per_iteration": 2.5972635746002197 }, { "auxiliary_loss_clip": 0.06467913, "auxiliary_loss_mlp": 0.01266719, "balance_loss_clip": 0.06296305, "balance_loss_mlp": 0.01253982, "epoch": 0.39525026303923044, "flos": 43848845233920.0, "grad_norm": 1.463680348328596, "language_loss": 0.67921805, "learning_rate": 2.7563173714069017e-06, "loss": 0.75656444, "num_input_tokens_seen": 141206310, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.12756348, "step": 6574, "time_per_iteration": 2.7607903480529785 }, { "auxiliary_loss_clip": 0.06469862, "auxiliary_loss_mlp": 0.01269645, "balance_loss_clip": 0.06295514, "balance_loss_mlp": 0.01255763, "epoch": 0.3953103862918984, "flos": 18046636312320.0, "grad_norm": 3.654518383214396, "language_loss": 0.72048318, "learning_rate": 2.755956816505072e-06, "loss": 0.79787827, "num_input_tokens_seen": 141223925, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.13891602, "step": 6575, "time_per_iteration": 2.5301897525787354 }, { "auxiliary_loss_clip": 0.0647531, "auxiliary_loss_mlp": 0.01269741, "balance_loss_clip": 0.06297769, "balance_loss_mlp": 0.01255561, "epoch": 0.3953705095445664, "flos": 16980549615360.0, "grad_norm": 2.031816511382794, "language_loss": 0.74542284, "learning_rate": 2.7555962329372845e-06, "loss": 0.82287335, "num_input_tokens_seen": 141239010, "router_z_loss_clip": 1.77441406, "router_z_loss_mlp": 0.1418457, "step": 6576, "time_per_iteration": 2.542025327682495 }, { "auxiliary_loss_clip": 0.06474072, "auxiliary_loss_mlp": 0.01271022, "balance_loss_clip": 0.06298648, "balance_loss_mlp": 0.0125773, "epoch": 0.39543063279723434, "flos": 17415300124800.0, "grad_norm": 2.571517359073795, "language_loss": 0.83944917, "learning_rate": 2.7552356207172124e-06, "loss": 0.9169001, "num_input_tokens_seen": 141252255, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.13305664, "step": 6577, "time_per_iteration": 2.524827480316162 }, { "auxiliary_loss_clip": 0.06468305, "auxiliary_loss_mlp": 0.01268687, "balance_loss_clip": 0.06297283, "balance_loss_mlp": 0.01255371, "epoch": 0.3954907560499023, "flos": 22790876561280.0, "grad_norm": 2.643558990276545, "language_loss": 0.90247273, "learning_rate": 2.75487497985853e-06, "loss": 0.97984266, "num_input_tokens_seen": 141269325, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.13317871, "step": 6578, "time_per_iteration": 2.5614829063415527 }, { "auxiliary_loss_clip": 0.06470542, "auxiliary_loss_mlp": 0.01268906, "balance_loss_clip": 0.06293105, "balance_loss_mlp": 0.01254017, "epoch": 0.39555087930257027, "flos": 21950823052800.0, "grad_norm": 2.0519419957948655, "language_loss": 0.78906095, "learning_rate": 2.7545143103749117e-06, "loss": 0.86645544, "num_input_tokens_seen": 141288505, "router_z_loss_clip": 1.77539062, "router_z_loss_mlp": 0.14880371, "step": 6579, "time_per_iteration": 3.9577343463897705 }, { "auxiliary_loss_clip": 0.06477676, "auxiliary_loss_mlp": 0.01270347, "balance_loss_clip": 0.06298858, "balance_loss_mlp": 0.01255476, "epoch": 0.39561100255523823, "flos": 20409553451520.0, "grad_norm": 2.1808925319633574, "language_loss": 0.6827724, "learning_rate": 2.754153612280037e-06, "loss": 0.76025259, "num_input_tokens_seen": 141303680, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.14862061, "step": 6580, "time_per_iteration": 2.5381019115448 }, { "auxiliary_loss_clip": 0.06465679, "auxiliary_loss_mlp": 0.01273282, "balance_loss_clip": 0.06292877, "balance_loss_mlp": 0.0125946, "epoch": 0.3956711258079062, "flos": 27972005598720.0, "grad_norm": 1.853720722903731, "language_loss": 0.5892446, "learning_rate": 2.7537928855875797e-06, "loss": 0.66663426, "num_input_tokens_seen": 141324090, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.1383667, "step": 6581, "time_per_iteration": 2.5844409465789795 }, { "auxiliary_loss_clip": 0.06467487, "auxiliary_loss_mlp": 0.01271183, "balance_loss_clip": 0.06293221, "balance_loss_mlp": 0.01257093, "epoch": 0.39573124906057416, "flos": 14433457201920.0, "grad_norm": 1.8462551317118279, "language_loss": 0.69772506, "learning_rate": 2.7534321303112224e-06, "loss": 0.77511179, "num_input_tokens_seen": 141342235, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.14080811, "step": 6582, "time_per_iteration": 2.5351932048797607 }, { "auxiliary_loss_clip": 0.06473047, "auxiliary_loss_mlp": 0.0127096, "balance_loss_clip": 0.06297081, "balance_loss_mlp": 0.0125659, "epoch": 0.39579137231324213, "flos": 18739592778240.0, "grad_norm": 2.3448251349816354, "language_loss": 0.76759899, "learning_rate": 2.753071346464642e-06, "loss": 0.84503901, "num_input_tokens_seen": 141361195, "router_z_loss_clip": 1.76074219, "router_z_loss_mlp": 0.14367676, "step": 6583, "time_per_iteration": 4.05335545539856 }, { "auxiliary_loss_clip": 0.0647047, "auxiliary_loss_mlp": 0.01270943, "balance_loss_clip": 0.06294373, "balance_loss_mlp": 0.01257436, "epoch": 0.3958514955659101, "flos": 17682268832640.0, "grad_norm": 1.4734128062848837, "language_loss": 0.66497636, "learning_rate": 2.7527105340615207e-06, "loss": 0.74239045, "num_input_tokens_seen": 141378275, "router_z_loss_clip": 1.76074219, "router_z_loss_mlp": 0.13482666, "step": 6584, "time_per_iteration": 2.5320885181427 }, { "auxiliary_loss_clip": 0.06471124, "auxiliary_loss_mlp": 0.01270848, "balance_loss_clip": 0.06292412, "balance_loss_mlp": 0.01256853, "epoch": 0.39591161881857806, "flos": 29315850981120.0, "grad_norm": 2.521518907111458, "language_loss": 0.72737259, "learning_rate": 2.7523496931155413e-06, "loss": 0.80479234, "num_input_tokens_seen": 141396960, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.14001465, "step": 6585, "time_per_iteration": 2.6119725704193115 }, { "auxiliary_loss_clip": 0.06467672, "auxiliary_loss_mlp": 0.01275004, "balance_loss_clip": 0.06291102, "balance_loss_mlp": 0.01260675, "epoch": 0.3959717420712461, "flos": 25778295780480.0, "grad_norm": 2.7062535853638625, "language_loss": 0.73238301, "learning_rate": 2.7519888236403856e-06, "loss": 0.8098098, "num_input_tokens_seen": 141417320, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.14337158, "step": 6586, "time_per_iteration": 2.580242395401001 }, { "auxiliary_loss_clip": 0.06471957, "auxiliary_loss_mlp": 0.01269126, "balance_loss_clip": 0.06294543, "balance_loss_mlp": 0.01255482, "epoch": 0.39603186532391405, "flos": 20930199995520.0, "grad_norm": 1.71365156273302, "language_loss": 0.71452045, "learning_rate": 2.7516279256497382e-06, "loss": 0.79193127, "num_input_tokens_seen": 141435985, "router_z_loss_clip": 1.77441406, "router_z_loss_mlp": 0.13659668, "step": 6587, "time_per_iteration": 2.5611608028411865 }, { "auxiliary_loss_clip": 0.06356113, "auxiliary_loss_mlp": 0.01258831, "balance_loss_clip": 0.06280006, "balance_loss_mlp": 0.01255348, "epoch": 0.396091988576582, "flos": 54897336720000.0, "grad_norm": 0.8313277273335107, "language_loss": 0.60796851, "learning_rate": 2.751266999157285e-06, "loss": 0.68411797, "num_input_tokens_seen": 141486075, "router_z_loss_clip": 0.7578125, "router_z_loss_mlp": 0.03491211, "step": 6588, "time_per_iteration": 3.0323638916015625 }, { "auxiliary_loss_clip": 0.06474675, "auxiliary_loss_mlp": 0.01266486, "balance_loss_clip": 0.06296788, "balance_loss_mlp": 0.01252444, "epoch": 0.39615211182925, "flos": 20708946489600.0, "grad_norm": 1.5606249413480482, "language_loss": 0.81361949, "learning_rate": 2.7509060441767115e-06, "loss": 0.89103115, "num_input_tokens_seen": 141505280, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.14044189, "step": 6589, "time_per_iteration": 2.695925712585449 }, { "auxiliary_loss_clip": 0.06470668, "auxiliary_loss_mlp": 0.01268772, "balance_loss_clip": 0.06293601, "balance_loss_mlp": 0.01253525, "epoch": 0.39621223508191794, "flos": 21000331463040.0, "grad_norm": 2.13080569570879, "language_loss": 0.71216512, "learning_rate": 2.7505450607217057e-06, "loss": 0.78955948, "num_input_tokens_seen": 141523930, "router_z_loss_clip": 1.77050781, "router_z_loss_mlp": 0.15234375, "step": 6590, "time_per_iteration": 2.5498111248016357 }, { "auxiliary_loss_clip": 0.0646878, "auxiliary_loss_mlp": 0.01268712, "balance_loss_clip": 0.06293176, "balance_loss_mlp": 0.01255075, "epoch": 0.3962723583345859, "flos": 23375742860160.0, "grad_norm": 1.6736191067847872, "language_loss": 0.76279843, "learning_rate": 2.750184048805956e-06, "loss": 0.84017342, "num_input_tokens_seen": 141541320, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.13647461, "step": 6591, "time_per_iteration": 2.5582237243652344 }, { "auxiliary_loss_clip": 0.06466137, "auxiliary_loss_mlp": 0.01270884, "balance_loss_clip": 0.06291712, "balance_loss_mlp": 0.01256704, "epoch": 0.39633248158725387, "flos": 25122040202880.0, "grad_norm": 1.6398978957834047, "language_loss": 0.78647912, "learning_rate": 2.749823008443152e-06, "loss": 0.86384928, "num_input_tokens_seen": 141561880, "router_z_loss_clip": 1.74414062, "router_z_loss_mlp": 0.14178467, "step": 6592, "time_per_iteration": 2.571467399597168 }, { "auxiliary_loss_clip": 0.06463438, "auxiliary_loss_mlp": 0.01271397, "balance_loss_clip": 0.06293916, "balance_loss_mlp": 0.01256454, "epoch": 0.39639260483992184, "flos": 39797309888640.0, "grad_norm": 1.8207468035446661, "language_loss": 0.70041603, "learning_rate": 2.7494619396469843e-06, "loss": 0.77776444, "num_input_tokens_seen": 141586460, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.14929199, "step": 6593, "time_per_iteration": 2.823099136352539 }, { "auxiliary_loss_clip": 0.06470474, "auxiliary_loss_mlp": 0.0127035, "balance_loss_clip": 0.06292969, "balance_loss_mlp": 0.01255807, "epoch": 0.3964527280925898, "flos": 17352673597440.0, "grad_norm": 1.569716112088801, "language_loss": 0.78333688, "learning_rate": 2.7491008424311452e-06, "loss": 0.86074513, "num_input_tokens_seen": 141605955, "router_z_loss_clip": 1.77441406, "router_z_loss_mlp": 0.14538574, "step": 6594, "time_per_iteration": 2.6664676666259766 }, { "auxiliary_loss_clip": 0.06354689, "auxiliary_loss_mlp": 0.01261886, "balance_loss_clip": 0.06278493, "balance_loss_mlp": 0.01258524, "epoch": 0.39651285134525777, "flos": 71739845533440.0, "grad_norm": 0.9336532925925675, "language_loss": 0.62846017, "learning_rate": 2.7487397168093265e-06, "loss": 0.7046259, "num_input_tokens_seen": 141673140, "router_z_loss_clip": 0.76123047, "router_z_loss_mlp": 0.03369141, "step": 6595, "time_per_iteration": 3.225059986114502 }, { "auxiliary_loss_clip": 0.06469172, "auxiliary_loss_mlp": 0.01273758, "balance_loss_clip": 0.06289686, "balance_loss_mlp": 0.01258451, "epoch": 0.39657297459792573, "flos": 25782823900800.0, "grad_norm": 3.454636405060215, "language_loss": 0.63540125, "learning_rate": 2.748378562795223e-06, "loss": 0.71283054, "num_input_tokens_seen": 141692955, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.15301514, "step": 6596, "time_per_iteration": 2.572693109512329 }, { "auxiliary_loss_clip": 0.06460267, "auxiliary_loss_mlp": 0.01269269, "balance_loss_clip": 0.06291111, "balance_loss_mlp": 0.01254678, "epoch": 0.3966330978505937, "flos": 20272267336320.0, "grad_norm": 2.647444821864755, "language_loss": 0.79151571, "learning_rate": 2.7480173804025293e-06, "loss": 0.86881107, "num_input_tokens_seen": 141710680, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.14593506, "step": 6597, "time_per_iteration": 2.54830002784729 }, { "auxiliary_loss_clip": 0.0647452, "auxiliary_loss_mlp": 0.01271363, "balance_loss_clip": 0.06294386, "balance_loss_mlp": 0.01256956, "epoch": 0.39669322110326166, "flos": 20637431429760.0, "grad_norm": 1.9929963673088769, "language_loss": 0.67877305, "learning_rate": 2.747656169644941e-06, "loss": 0.75623184, "num_input_tokens_seen": 141729860, "router_z_loss_clip": 1.80078125, "router_z_loss_mlp": 0.144104, "step": 6598, "time_per_iteration": 2.533273935317993 }, { "auxiliary_loss_clip": 0.06467632, "auxiliary_loss_mlp": 0.01275412, "balance_loss_clip": 0.06290941, "balance_loss_mlp": 0.01262108, "epoch": 0.3967533443559297, "flos": 21732546366720.0, "grad_norm": 2.7551833935201238, "language_loss": 0.78859639, "learning_rate": 2.747294930536157e-06, "loss": 0.86602688, "num_input_tokens_seen": 141749060, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.13299561, "step": 6599, "time_per_iteration": 2.5744268894195557 }, { "auxiliary_loss_clip": 0.0646479, "auxiliary_loss_mlp": 0.01268775, "balance_loss_clip": 0.06289057, "balance_loss_mlp": 0.01253785, "epoch": 0.39681346760859765, "flos": 25491271219200.0, "grad_norm": 1.6779731058833591, "language_loss": 0.73486632, "learning_rate": 2.7469336630898737e-06, "loss": 0.81220198, "num_input_tokens_seen": 141769860, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.14990234, "step": 6600, "time_per_iteration": 2.598587989807129 }, { "auxiliary_loss_clip": 0.06467386, "auxiliary_loss_mlp": 0.01273537, "balance_loss_clip": 0.06290872, "balance_loss_mlp": 0.01260084, "epoch": 0.3968735908612656, "flos": 20965894634880.0, "grad_norm": 2.0911834382368055, "language_loss": 0.86252713, "learning_rate": 2.746572367319791e-06, "loss": 0.9399364, "num_input_tokens_seen": 141788465, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.13464355, "step": 6601, "time_per_iteration": 2.5308732986450195 }, { "auxiliary_loss_clip": 0.06476108, "auxiliary_loss_mlp": 0.01272792, "balance_loss_clip": 0.06293699, "balance_loss_mlp": 0.01257288, "epoch": 0.3969337141139336, "flos": 10711684800000.0, "grad_norm": 2.397822098204827, "language_loss": 0.70395947, "learning_rate": 2.7462110432396095e-06, "loss": 0.78144848, "num_input_tokens_seen": 141804955, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.15496826, "step": 6602, "time_per_iteration": 2.546520233154297 }, { "auxiliary_loss_clip": 0.06474161, "auxiliary_loss_mlp": 0.01273422, "balance_loss_clip": 0.06297679, "balance_loss_mlp": 0.01259051, "epoch": 0.39699383736660154, "flos": 17597924098560.0, "grad_norm": 2.4724972675553807, "language_loss": 0.83832026, "learning_rate": 2.7458496908630305e-06, "loss": 0.9157961, "num_input_tokens_seen": 141820025, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.14361572, "step": 6603, "time_per_iteration": 2.530125379562378 }, { "auxiliary_loss_clip": 0.064722, "auxiliary_loss_mlp": 0.01273525, "balance_loss_clip": 0.06298202, "balance_loss_mlp": 0.01259959, "epoch": 0.3970539606192695, "flos": 17791826446080.0, "grad_norm": 1.512613526540637, "language_loss": 0.73521471, "learning_rate": 2.7454883102037563e-06, "loss": 0.81267202, "num_input_tokens_seen": 141838735, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.13574219, "step": 6604, "time_per_iteration": 2.56437349319458 }, { "auxiliary_loss_clip": 0.06462702, "auxiliary_loss_mlp": 0.0127224, "balance_loss_clip": 0.06293701, "balance_loss_mlp": 0.01258745, "epoch": 0.3971140838719375, "flos": 24796260328320.0, "grad_norm": 1.4801086898581934, "language_loss": 0.82837725, "learning_rate": 2.745126901275491e-06, "loss": 0.90572661, "num_input_tokens_seen": 141858090, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.13494873, "step": 6605, "time_per_iteration": 2.607682704925537 }, { "auxiliary_loss_clip": 0.06467542, "auxiliary_loss_mlp": 0.01271827, "balance_loss_clip": 0.0629575, "balance_loss_mlp": 0.01259214, "epoch": 0.39717420712460544, "flos": 24250484759040.0, "grad_norm": 1.4780104008191197, "language_loss": 0.74439865, "learning_rate": 2.7447654640919383e-06, "loss": 0.82179236, "num_input_tokens_seen": 141877540, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.12634277, "step": 6606, "time_per_iteration": 2.5871777534484863 }, { "auxiliary_loss_clip": 0.06482941, "auxiliary_loss_mlp": 0.01270431, "balance_loss_clip": 0.06303195, "balance_loss_mlp": 0.01256972, "epoch": 0.3972343303772734, "flos": 25891752608640.0, "grad_norm": 1.7184139885531449, "language_loss": 0.74592394, "learning_rate": 2.744403998666805e-06, "loss": 0.82345772, "num_input_tokens_seen": 141897315, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.13464355, "step": 6607, "time_per_iteration": 2.5962185859680176 }, { "auxiliary_loss_clip": 0.0648168, "auxiliary_loss_mlp": 0.01274015, "balance_loss_clip": 0.06305398, "balance_loss_mlp": 0.01260043, "epoch": 0.39729445362994137, "flos": 45634107525120.0, "grad_norm": 257.0744505936446, "language_loss": 0.68181467, "learning_rate": 2.744042505013797e-06, "loss": 0.75937164, "num_input_tokens_seen": 141919580, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.13989258, "step": 6608, "time_per_iteration": 4.200479745864868 }, { "auxiliary_loss_clip": 0.0648423, "auxiliary_loss_mlp": 0.01274501, "balance_loss_clip": 0.06304653, "balance_loss_mlp": 0.01259368, "epoch": 0.39735457688260933, "flos": 20200249152000.0, "grad_norm": 2.9711156450534633, "language_loss": 0.74547702, "learning_rate": 2.7436809831466233e-06, "loss": 0.82306433, "num_input_tokens_seen": 141937045, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.15118408, "step": 6609, "time_per_iteration": 2.5453941822052 }, { "auxiliary_loss_clip": 0.06478597, "auxiliary_loss_mlp": 0.01274092, "balance_loss_clip": 0.06302541, "balance_loss_mlp": 0.01259859, "epoch": 0.3974147001352773, "flos": 23337868014720.0, "grad_norm": 1.6263463620537102, "language_loss": 0.71834773, "learning_rate": 2.7433194330789927e-06, "loss": 0.7958746, "num_input_tokens_seen": 141956695, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.14221191, "step": 6610, "time_per_iteration": 2.5660812854766846 }, { "auxiliary_loss_clip": 0.06473716, "auxiliary_loss_mlp": 0.01271316, "balance_loss_clip": 0.06302443, "balance_loss_mlp": 0.01257989, "epoch": 0.39747482338794526, "flos": 21694965010560.0, "grad_norm": 1.5783301705245119, "language_loss": 0.79120475, "learning_rate": 2.7429578548246133e-06, "loss": 0.86865509, "num_input_tokens_seen": 141975935, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.13323975, "step": 6611, "time_per_iteration": 2.5618984699249268 }, { "auxiliary_loss_clip": 0.06479943, "auxiliary_loss_mlp": 0.01269897, "balance_loss_clip": 0.06305383, "balance_loss_mlp": 0.01256349, "epoch": 0.3975349466406133, "flos": 30995957998080.0, "grad_norm": 1.915937822310538, "language_loss": 0.79380381, "learning_rate": 2.7425962483971985e-06, "loss": 0.87130225, "num_input_tokens_seen": 141995750, "router_z_loss_clip": 1.74414062, "router_z_loss_mlp": 0.13555908, "step": 6612, "time_per_iteration": 4.084220886230469 }, { "auxiliary_loss_clip": 0.06370267, "auxiliary_loss_mlp": 0.01263481, "balance_loss_clip": 0.06295487, "balance_loss_mlp": 0.01260358, "epoch": 0.39759506989328125, "flos": 63703426366080.0, "grad_norm": 0.8189103865337973, "language_loss": 0.64699322, "learning_rate": 2.742234613810459e-06, "loss": 0.72333074, "num_input_tokens_seen": 142057655, "router_z_loss_clip": 0.74951172, "router_z_loss_mlp": 0.03120422, "step": 6613, "time_per_iteration": 3.082829475402832 }, { "auxiliary_loss_clip": 0.06478571, "auxiliary_loss_mlp": 0.01277114, "balance_loss_clip": 0.06302868, "balance_loss_mlp": 0.01261558, "epoch": 0.3976551931459492, "flos": 23702570910720.0, "grad_norm": 2.6518434636561525, "language_loss": 0.72101939, "learning_rate": 2.741872951078109e-06, "loss": 0.79857624, "num_input_tokens_seen": 142076020, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.15539551, "step": 6614, "time_per_iteration": 2.558629035949707 }, { "auxiliary_loss_clip": 0.06476133, "auxiliary_loss_mlp": 0.01270388, "balance_loss_clip": 0.06302322, "balance_loss_mlp": 0.01256071, "epoch": 0.3977153163986172, "flos": 15675166056960.0, "grad_norm": 2.189536987315121, "language_loss": 0.82352221, "learning_rate": 2.741511260213862e-06, "loss": 0.90098739, "num_input_tokens_seen": 142093790, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.14294434, "step": 6615, "time_per_iteration": 2.5334582328796387 }, { "auxiliary_loss_clip": 0.06481016, "auxiliary_loss_mlp": 0.01268078, "balance_loss_clip": 0.06305576, "balance_loss_mlp": 0.01254965, "epoch": 0.39777543965128515, "flos": 14070012117120.0, "grad_norm": 1.854367432847999, "language_loss": 0.68020535, "learning_rate": 2.741149541231434e-06, "loss": 0.75769633, "num_input_tokens_seen": 142110545, "router_z_loss_clip": 1.75390625, "router_z_loss_mlp": 0.13110352, "step": 6616, "time_per_iteration": 2.5325000286102295 }, { "auxiliary_loss_clip": 0.06476265, "auxiliary_loss_mlp": 0.01272429, "balance_loss_clip": 0.06297266, "balance_loss_mlp": 0.01257074, "epoch": 0.3978355629039531, "flos": 23374149632640.0, "grad_norm": 1.8787983451582333, "language_loss": 0.840505, "learning_rate": 2.740787794144541e-06, "loss": 0.917992, "num_input_tokens_seen": 142128695, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.15356445, "step": 6617, "time_per_iteration": 2.6342527866363525 }, { "auxiliary_loss_clip": 0.06475124, "auxiliary_loss_mlp": 0.01271958, "balance_loss_clip": 0.06307004, "balance_loss_mlp": 0.0125897, "epoch": 0.3978956861566211, "flos": 19068852597120.0, "grad_norm": 1.6080196150324693, "language_loss": 0.72421718, "learning_rate": 2.7404260189669e-06, "loss": 0.80168802, "num_input_tokens_seen": 142148375, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.12994385, "step": 6618, "time_per_iteration": 4.0439722537994385 }, { "auxiliary_loss_clip": 0.0647992, "auxiliary_loss_mlp": 0.01279228, "balance_loss_clip": 0.06305186, "balance_loss_mlp": 0.01263224, "epoch": 0.39795580940928904, "flos": 30235679176320.0, "grad_norm": 1.745917546829554, "language_loss": 0.65766853, "learning_rate": 2.740064215712231e-06, "loss": 0.73526001, "num_input_tokens_seen": 142169735, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.15991211, "step": 6619, "time_per_iteration": 2.6568377017974854 }, { "auxiliary_loss_clip": 0.06368463, "auxiliary_loss_mlp": 0.01252798, "balance_loss_clip": 0.0629296, "balance_loss_mlp": 0.01249704, "epoch": 0.398015932661957, "flos": 69867261688320.0, "grad_norm": 0.7563047850935213, "language_loss": 0.58115387, "learning_rate": 2.7397023843942527e-06, "loss": 0.65736639, "num_input_tokens_seen": 142229520, "router_z_loss_clip": 0.75390625, "router_z_loss_mlp": 0.03092957, "step": 6620, "time_per_iteration": 3.1631860733032227 }, { "auxiliary_loss_clip": 0.06469789, "auxiliary_loss_mlp": 0.01267745, "balance_loss_clip": 0.0629627, "balance_loss_mlp": 0.01254477, "epoch": 0.39807605591462497, "flos": 20164093315200.0, "grad_norm": 2.0562417074274126, "language_loss": 0.79835224, "learning_rate": 2.739340525026686e-06, "loss": 0.87572759, "num_input_tokens_seen": 142247660, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.13262939, "step": 6621, "time_per_iteration": 2.5623669624328613 }, { "auxiliary_loss_clip": 0.06465896, "auxiliary_loss_mlp": 0.01273031, "balance_loss_clip": 0.06293465, "balance_loss_mlp": 0.01258714, "epoch": 0.39813617916729294, "flos": 21148057411200.0, "grad_norm": 2.036163114139181, "language_loss": 0.78329885, "learning_rate": 2.738978637623252e-06, "loss": 0.86068815, "num_input_tokens_seen": 142266990, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.14318848, "step": 6622, "time_per_iteration": 2.592494487762451 }, { "auxiliary_loss_clip": 0.06467125, "auxiliary_loss_mlp": 0.01272832, "balance_loss_clip": 0.06295285, "balance_loss_mlp": 0.01257979, "epoch": 0.3981963024199609, "flos": 18994318790400.0, "grad_norm": 1.62291362483186, "language_loss": 0.75730276, "learning_rate": 2.738616722197674e-06, "loss": 0.83470231, "num_input_tokens_seen": 142287170, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.14837646, "step": 6623, "time_per_iteration": 4.004153490066528 }, { "auxiliary_loss_clip": 0.0647182, "auxiliary_loss_mlp": 0.01275795, "balance_loss_clip": 0.06298779, "balance_loss_mlp": 0.01261395, "epoch": 0.39825642567262887, "flos": 16579648955520.0, "grad_norm": 1.7069631705590576, "language_loss": 0.8035444, "learning_rate": 2.7382547787636766e-06, "loss": 0.88102061, "num_input_tokens_seen": 142305405, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.1439209, "step": 6624, "time_per_iteration": 2.542466878890991 }, { "auxiliary_loss_clip": 0.06476012, "auxiliary_loss_mlp": 0.01272385, "balance_loss_clip": 0.06295951, "balance_loss_mlp": 0.01256924, "epoch": 0.39831654892529683, "flos": 22206303751680.0, "grad_norm": 2.2745222709900204, "language_loss": 0.8383646, "learning_rate": 2.7378928073349832e-06, "loss": 0.91584861, "num_input_tokens_seen": 142322710, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.15441895, "step": 6625, "time_per_iteration": 2.5965232849121094 }, { "auxiliary_loss_clip": 0.06471565, "auxiliary_loss_mlp": 0.01271029, "balance_loss_clip": 0.06298439, "balance_loss_mlp": 0.01257958, "epoch": 0.39837667217796485, "flos": 10492485719040.0, "grad_norm": 2.16489896520953, "language_loss": 0.87201297, "learning_rate": 2.737530807925321e-06, "loss": 0.94943893, "num_input_tokens_seen": 142338535, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.13079834, "step": 6626, "time_per_iteration": 2.5687484741210938 }, { "auxiliary_loss_clip": 0.06467205, "auxiliary_loss_mlp": 0.01271198, "balance_loss_clip": 0.06294172, "balance_loss_mlp": 0.01256535, "epoch": 0.3984367954306328, "flos": 17970676986240.0, "grad_norm": 2.2609197499352374, "language_loss": 0.8424508, "learning_rate": 2.737168780548417e-06, "loss": 0.91983479, "num_input_tokens_seen": 142354570, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.14672852, "step": 6627, "time_per_iteration": 2.642117500305176 }, { "auxiliary_loss_clip": 0.06468256, "auxiliary_loss_mlp": 0.01272218, "balance_loss_clip": 0.06295772, "balance_loss_mlp": 0.01259051, "epoch": 0.3984969186833008, "flos": 22717684419840.0, "grad_norm": 2.0753446325911713, "language_loss": 0.83249229, "learning_rate": 2.736806725217998e-06, "loss": 0.90989709, "num_input_tokens_seen": 142374395, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.13171387, "step": 6628, "time_per_iteration": 2.585463047027588 }, { "auxiliary_loss_clip": 0.06473993, "auxiliary_loss_mlp": 0.01272271, "balance_loss_clip": 0.06296712, "balance_loss_mlp": 0.012583, "epoch": 0.39855704193596875, "flos": 23412779164800.0, "grad_norm": 2.2565911915918897, "language_loss": 0.71772254, "learning_rate": 2.7364446419477945e-06, "loss": 0.79518521, "num_input_tokens_seen": 142396040, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.13970947, "step": 6629, "time_per_iteration": 2.619805335998535 }, { "auxiliary_loss_clip": 0.06468183, "auxiliary_loss_mlp": 0.01269768, "balance_loss_clip": 0.06299645, "balance_loss_mlp": 0.0125607, "epoch": 0.3986171651886367, "flos": 21258369711360.0, "grad_norm": 2.188440183638482, "language_loss": 0.81151873, "learning_rate": 2.7360825307515366e-06, "loss": 0.88889825, "num_input_tokens_seen": 142415495, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.13696289, "step": 6630, "time_per_iteration": 2.573334217071533 }, { "auxiliary_loss_clip": 0.06470478, "auxiliary_loss_mlp": 0.01270313, "balance_loss_clip": 0.06295347, "balance_loss_mlp": 0.01255919, "epoch": 0.3986772884413047, "flos": 12463642293120.0, "grad_norm": 1.8051144090850835, "language_loss": 0.74868679, "learning_rate": 2.7357203916429555e-06, "loss": 0.82609475, "num_input_tokens_seen": 142431865, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.14379883, "step": 6631, "time_per_iteration": 2.5387933254241943 }, { "auxiliary_loss_clip": 0.06474163, "auxiliary_loss_mlp": 0.01269635, "balance_loss_clip": 0.06298263, "balance_loss_mlp": 0.01255628, "epoch": 0.39873741169397264, "flos": 19652209522560.0, "grad_norm": 1.9464040285977353, "language_loss": 0.72026032, "learning_rate": 2.735358224635783e-06, "loss": 0.79769832, "num_input_tokens_seen": 142450595, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.13995361, "step": 6632, "time_per_iteration": 2.545959234237671 }, { "auxiliary_loss_clip": 0.06470577, "auxiliary_loss_mlp": 0.01268267, "balance_loss_clip": 0.0629804, "balance_loss_mlp": 0.01255249, "epoch": 0.3987975349466406, "flos": 21690436890240.0, "grad_norm": 1.8448134238509262, "language_loss": 0.7546351, "learning_rate": 2.7349960297437533e-06, "loss": 0.83202362, "num_input_tokens_seen": 142466650, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.13018799, "step": 6633, "time_per_iteration": 2.5343966484069824 }, { "auxiliary_loss_clip": 0.06472482, "auxiliary_loss_mlp": 0.01270607, "balance_loss_clip": 0.06296796, "balance_loss_mlp": 0.01256827, "epoch": 0.3988576581993086, "flos": 23920721815680.0, "grad_norm": 1.6872298441964049, "language_loss": 0.81434715, "learning_rate": 2.7346338069806e-06, "loss": 0.89177805, "num_input_tokens_seen": 142486165, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.13775635, "step": 6634, "time_per_iteration": 2.5776009559631348 }, { "auxiliary_loss_clip": 0.06478747, "auxiliary_loss_mlp": 0.01268689, "balance_loss_clip": 0.06303078, "balance_loss_mlp": 0.01254741, "epoch": 0.39891778145197654, "flos": 18155690801280.0, "grad_norm": 1.789863904003046, "language_loss": 0.74718666, "learning_rate": 2.7342715563600597e-06, "loss": 0.82466102, "num_input_tokens_seen": 142505035, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.13952637, "step": 6635, "time_per_iteration": 2.647371768951416 }, { "auxiliary_loss_clip": 0.06493308, "auxiliary_loss_mlp": 0.01274386, "balance_loss_clip": 0.06308255, "balance_loss_mlp": 0.01258496, "epoch": 0.3989779047046445, "flos": 22600831501440.0, "grad_norm": 1.9011384384379857, "language_loss": 0.66757363, "learning_rate": 2.733909277895868e-06, "loss": 0.74525058, "num_input_tokens_seen": 142521870, "router_z_loss_clip": 1.84960938, "router_z_loss_mlp": 0.15887451, "step": 6636, "time_per_iteration": 2.671555757522583 }, { "auxiliary_loss_clip": 0.06478319, "auxiliary_loss_mlp": 0.01268812, "balance_loss_clip": 0.06304832, "balance_loss_mlp": 0.0125496, "epoch": 0.39903802795731247, "flos": 18083043711360.0, "grad_norm": 1.6484713931990211, "language_loss": 0.81724167, "learning_rate": 2.733546971601763e-06, "loss": 0.89471292, "num_input_tokens_seen": 142540455, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.1385498, "step": 6637, "time_per_iteration": 2.5947225093841553 }, { "auxiliary_loss_clip": 0.06365575, "auxiliary_loss_mlp": 0.01255002, "balance_loss_clip": 0.06288904, "balance_loss_mlp": 0.01251661, "epoch": 0.39909815120998043, "flos": 70463238652800.0, "grad_norm": 0.7010619156576711, "language_loss": 0.53179085, "learning_rate": 2.733184637491484e-06, "loss": 0.60799664, "num_input_tokens_seen": 142599665, "router_z_loss_clip": 0.765625, "router_z_loss_mlp": 0.03347778, "step": 6638, "time_per_iteration": 3.231275796890259 }, { "auxiliary_loss_clip": 0.06474307, "auxiliary_loss_mlp": 0.01278406, "balance_loss_clip": 0.06296103, "balance_loss_mlp": 0.01264149, "epoch": 0.39915827446264845, "flos": 18554788598400.0, "grad_norm": 1.3770789394055551, "language_loss": 0.75273347, "learning_rate": 2.732822275578769e-06, "loss": 0.83026063, "num_input_tokens_seen": 142618845, "router_z_loss_clip": 1.78222656, "router_z_loss_mlp": 0.14233398, "step": 6639, "time_per_iteration": 2.5773353576660156 }, { "auxiliary_loss_clip": 0.06465921, "auxiliary_loss_mlp": 0.01270469, "balance_loss_clip": 0.06294967, "balance_loss_mlp": 0.01256939, "epoch": 0.3992183977153164, "flos": 29904826129920.0, "grad_norm": 1.6333247276484333, "language_loss": 0.75997066, "learning_rate": 2.7324598858773603e-06, "loss": 0.83733451, "num_input_tokens_seen": 142640885, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.13525391, "step": 6640, "time_per_iteration": 2.681825637817383 }, { "auxiliary_loss_clip": 0.06470414, "auxiliary_loss_mlp": 0.01271337, "balance_loss_clip": 0.06294234, "balance_loss_mlp": 0.0125699, "epoch": 0.3992785209679844, "flos": 22571677480320.0, "grad_norm": 2.230846128286925, "language_loss": 0.82611018, "learning_rate": 2.7320974684009996e-06, "loss": 0.90352768, "num_input_tokens_seen": 142659340, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.14349365, "step": 6641, "time_per_iteration": 2.711385488510132 }, { "auxiliary_loss_clip": 0.0647895, "auxiliary_loss_mlp": 0.01273457, "balance_loss_clip": 0.06301139, "balance_loss_mlp": 0.01258699, "epoch": 0.39933864422065235, "flos": 19688784629760.0, "grad_norm": 2.023751108019174, "language_loss": 0.77333623, "learning_rate": 2.7317350231634288e-06, "loss": 0.85086036, "num_input_tokens_seen": 142677085, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.14758301, "step": 6642, "time_per_iteration": 2.587911367416382 }, { "auxiliary_loss_clip": 0.06469175, "auxiliary_loss_mlp": 0.01271223, "balance_loss_clip": 0.06292368, "balance_loss_mlp": 0.012573, "epoch": 0.3993987674733203, "flos": 23045015594880.0, "grad_norm": 2.08051056923896, "language_loss": 0.73214549, "learning_rate": 2.731372550178393e-06, "loss": 0.80954945, "num_input_tokens_seen": 142694595, "router_z_loss_clip": 1.76757812, "router_z_loss_mlp": 0.13928223, "step": 6643, "time_per_iteration": 2.58430552482605 }, { "auxiliary_loss_clip": 0.06472205, "auxiliary_loss_mlp": 0.01271747, "balance_loss_clip": 0.06295584, "balance_loss_mlp": 0.01257728, "epoch": 0.3994588907259883, "flos": 19396896531840.0, "grad_norm": 1.5795498480996832, "language_loss": 0.67429745, "learning_rate": 2.7310100494596375e-06, "loss": 0.751737, "num_input_tokens_seen": 142714175, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.14038086, "step": 6644, "time_per_iteration": 2.5793585777282715 }, { "auxiliary_loss_clip": 0.06467706, "auxiliary_loss_mlp": 0.01275238, "balance_loss_clip": 0.06290375, "balance_loss_mlp": 0.01260814, "epoch": 0.39951901397865625, "flos": 13739326778880.0, "grad_norm": 2.5021690723074173, "language_loss": 0.78682983, "learning_rate": 2.730647521020907e-06, "loss": 0.86425924, "num_input_tokens_seen": 142730955, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.14434814, "step": 6645, "time_per_iteration": 2.5300650596618652 }, { "auxiliary_loss_clip": 0.06472379, "auxiliary_loss_mlp": 0.01272425, "balance_loss_clip": 0.06293324, "balance_loss_mlp": 0.01257953, "epoch": 0.3995791372313242, "flos": 23593181005440.0, "grad_norm": 1.4875982731683952, "language_loss": 0.70332462, "learning_rate": 2.73028496487595e-06, "loss": 0.78077269, "num_input_tokens_seen": 142751200, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.14465332, "step": 6646, "time_per_iteration": 2.59257173538208 }, { "auxiliary_loss_clip": 0.0647055, "auxiliary_loss_mlp": 0.01273212, "balance_loss_clip": 0.06293193, "balance_loss_mlp": 0.01258382, "epoch": 0.3996392604839922, "flos": 21361428633600.0, "grad_norm": 1.8385089093923246, "language_loss": 0.7220403, "learning_rate": 2.729922381038513e-06, "loss": 0.79947788, "num_input_tokens_seen": 142770170, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.14837646, "step": 6647, "time_per_iteration": 4.035264015197754 }, { "auxiliary_loss_clip": 0.06461921, "auxiliary_loss_mlp": 0.01272488, "balance_loss_clip": 0.06291357, "balance_loss_mlp": 0.01257885, "epoch": 0.39969938373666014, "flos": 26039604337920.0, "grad_norm": 1.5960613051961838, "language_loss": 0.74652815, "learning_rate": 2.7295597695223463e-06, "loss": 0.82387227, "num_input_tokens_seen": 142792680, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.14605713, "step": 6648, "time_per_iteration": 2.633315086364746 }, { "auxiliary_loss_clip": 0.06467395, "auxiliary_loss_mlp": 0.01271233, "balance_loss_clip": 0.06292018, "balance_loss_mlp": 0.01257101, "epoch": 0.3997595069893281, "flos": 20121858057600.0, "grad_norm": 2.0689862756388377, "language_loss": 0.65845323, "learning_rate": 2.7291971303412006e-06, "loss": 0.73583949, "num_input_tokens_seen": 142810510, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.14141846, "step": 6649, "time_per_iteration": 2.5718092918395996 }, { "auxiliary_loss_clip": 0.06467268, "auxiliary_loss_mlp": 0.01276625, "balance_loss_clip": 0.06290759, "balance_loss_mlp": 0.0126173, "epoch": 0.39981963024199607, "flos": 27791016779520.0, "grad_norm": 1.8688129033804355, "language_loss": 0.75164294, "learning_rate": 2.728834463508826e-06, "loss": 0.82908189, "num_input_tokens_seen": 142832455, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.14910889, "step": 6650, "time_per_iteration": 2.6699740886688232 }, { "auxiliary_loss_clip": 0.06462628, "auxiliary_loss_mlp": 0.01274264, "balance_loss_clip": 0.06286104, "balance_loss_mlp": 0.01260185, "epoch": 0.39987975349466404, "flos": 21950864979840.0, "grad_norm": 2.450338120544869, "language_loss": 0.71929783, "learning_rate": 2.728471769038975e-06, "loss": 0.79666674, "num_input_tokens_seen": 142852590, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.14099121, "step": 6651, "time_per_iteration": 2.5587189197540283 }, { "auxiliary_loss_clip": 0.06464539, "auxiliary_loss_mlp": 0.01269772, "balance_loss_clip": 0.06286933, "balance_loss_mlp": 0.01256015, "epoch": 0.39993987674733206, "flos": 20710707425280.0, "grad_norm": 1.83571610877162, "language_loss": 0.73319781, "learning_rate": 2.728109046945403e-06, "loss": 0.81054091, "num_input_tokens_seen": 142870595, "router_z_loss_clip": 1.77441406, "router_z_loss_mlp": 0.13769531, "step": 6652, "time_per_iteration": 3.9976296424865723 }, { "auxiliary_loss_clip": 0.06348884, "auxiliary_loss_mlp": 0.01263066, "balance_loss_clip": 0.06272092, "balance_loss_mlp": 0.01259749, "epoch": 0.4, "flos": 61543566397440.0, "grad_norm": 0.8173047475882783, "language_loss": 0.6045382, "learning_rate": 2.727746297241862e-06, "loss": 0.68065774, "num_input_tokens_seen": 142925805, "router_z_loss_clip": 0.765625, "router_z_loss_mlp": 0.03323364, "step": 6653, "time_per_iteration": 3.079706907272339 }, { "auxiliary_loss_clip": 0.06453034, "auxiliary_loss_mlp": 0.01270213, "balance_loss_clip": 0.06285338, "balance_loss_mlp": 0.01256141, "epoch": 0.400060123252668, "flos": 14507655592320.0, "grad_norm": 1.9278743008370929, "language_loss": 0.67548597, "learning_rate": 2.7273835199421085e-06, "loss": 0.75271845, "num_input_tokens_seen": 142943145, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.14086914, "step": 6654, "time_per_iteration": 2.5255980491638184 }, { "auxiliary_loss_clip": 0.06462455, "auxiliary_loss_mlp": 0.01270944, "balance_loss_clip": 0.06288602, "balance_loss_mlp": 0.01257867, "epoch": 0.40012024650533595, "flos": 19098383961600.0, "grad_norm": 1.983400699916361, "language_loss": 0.90518641, "learning_rate": 2.7270207150599e-06, "loss": 0.98252046, "num_input_tokens_seen": 142956925, "router_z_loss_clip": 1.73730469, "router_z_loss_mlp": 0.13079834, "step": 6655, "time_per_iteration": 2.5235977172851562 }, { "auxiliary_loss_clip": 0.06452361, "auxiliary_loss_mlp": 0.01270136, "balance_loss_clip": 0.06284694, "balance_loss_mlp": 0.01257774, "epoch": 0.4001803697580039, "flos": 29358673217280.0, "grad_norm": 1.6808730365827087, "language_loss": 0.73518085, "learning_rate": 2.7266578826089917e-06, "loss": 0.81240582, "num_input_tokens_seen": 142978040, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.12365723, "step": 6656, "time_per_iteration": 2.632246494293213 }, { "auxiliary_loss_clip": 0.06460167, "auxiliary_loss_mlp": 0.01272822, "balance_loss_clip": 0.06284764, "balance_loss_mlp": 0.0125863, "epoch": 0.4002404930106719, "flos": 20925839583360.0, "grad_norm": 1.40230230714536, "language_loss": 0.73860902, "learning_rate": 2.726295022603144e-06, "loss": 0.81593889, "num_input_tokens_seen": 142998390, "router_z_loss_clip": 1.75390625, "router_z_loss_mlp": 0.14178467, "step": 6657, "time_per_iteration": 2.582846164703369 }, { "auxiliary_loss_clip": 0.06465816, "auxiliary_loss_mlp": 0.01268899, "balance_loss_clip": 0.06290583, "balance_loss_mlp": 0.01254344, "epoch": 0.40030061626333985, "flos": 28413799850880.0, "grad_norm": 1.5113381812288982, "language_loss": 0.79594493, "learning_rate": 2.725932135056117e-06, "loss": 0.87329209, "num_input_tokens_seen": 143021505, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.14538574, "step": 6658, "time_per_iteration": 4.170760631561279 }, { "auxiliary_loss_clip": 0.06457439, "auxiliary_loss_mlp": 0.01278953, "balance_loss_clip": 0.06283697, "balance_loss_mlp": 0.01264964, "epoch": 0.4003607395160078, "flos": 25928746986240.0, "grad_norm": 3.680012292957788, "language_loss": 0.7759499, "learning_rate": 2.72556921998167e-06, "loss": 0.8533138, "num_input_tokens_seen": 143041375, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.13983154, "step": 6659, "time_per_iteration": 2.6135191917419434 }, { "auxiliary_loss_clip": 0.06447863, "auxiliary_loss_mlp": 0.01269895, "balance_loss_clip": 0.06285068, "balance_loss_mlp": 0.01257676, "epoch": 0.4004208627686758, "flos": 20773501660800.0, "grad_norm": 1.7890865297219294, "language_loss": 0.73343319, "learning_rate": 2.7252062773935662e-06, "loss": 0.81061077, "num_input_tokens_seen": 143058725, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.12231445, "step": 6660, "time_per_iteration": 2.5566964149475098 }, { "auxiliary_loss_clip": 0.06453866, "auxiliary_loss_mlp": 0.01270741, "balance_loss_clip": 0.06280627, "balance_loss_mlp": 0.01257902, "epoch": 0.40048098602134374, "flos": 24688170161280.0, "grad_norm": 1.8461044731162706, "language_loss": 0.71542263, "learning_rate": 2.7248433073055674e-06, "loss": 0.7926687, "num_input_tokens_seen": 143076995, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.128479, "step": 6661, "time_per_iteration": 4.034600019454956 }, { "auxiliary_loss_clip": 0.0646012, "auxiliary_loss_mlp": 0.01268999, "balance_loss_clip": 0.06285976, "balance_loss_mlp": 0.01255612, "epoch": 0.4005411092740117, "flos": 23192448053760.0, "grad_norm": 1.8173558918405694, "language_loss": 0.75591588, "learning_rate": 2.724480309731437e-06, "loss": 0.83320707, "num_input_tokens_seen": 143096780, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.13397217, "step": 6662, "time_per_iteration": 2.562559127807617 }, { "auxiliary_loss_clip": 0.06466739, "auxiliary_loss_mlp": 0.01270159, "balance_loss_clip": 0.06288861, "balance_loss_mlp": 0.0125589, "epoch": 0.4006012325266797, "flos": 17526786382080.0, "grad_norm": 1.7984061647166592, "language_loss": 0.665187, "learning_rate": 2.7241172846849417e-06, "loss": 0.74255604, "num_input_tokens_seen": 143112590, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.1427002, "step": 6663, "time_per_iteration": 2.5046675205230713 }, { "auxiliary_loss_clip": 0.06461784, "auxiliary_loss_mlp": 0.01270908, "balance_loss_clip": 0.06286493, "balance_loss_mlp": 0.01257145, "epoch": 0.40066135577934764, "flos": 19862016946560.0, "grad_norm": 2.0957541240362496, "language_loss": 0.86816776, "learning_rate": 2.7237542321798455e-06, "loss": 0.94549477, "num_input_tokens_seen": 143130220, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.13757324, "step": 6664, "time_per_iteration": 2.5545191764831543 }, { "auxiliary_loss_clip": 0.06462331, "auxiliary_loss_mlp": 0.01274146, "balance_loss_clip": 0.06286086, "balance_loss_mlp": 0.0126002, "epoch": 0.40072147903201566, "flos": 18155816582400.0, "grad_norm": 2.003293189918483, "language_loss": 0.84806079, "learning_rate": 2.723391152229917e-06, "loss": 0.92542553, "num_input_tokens_seen": 143147160, "router_z_loss_clip": 1.76269531, "router_z_loss_mlp": 0.14135742, "step": 6665, "time_per_iteration": 2.5259721279144287 }, { "auxiliary_loss_clip": 0.06456266, "auxiliary_loss_mlp": 0.01270022, "balance_loss_clip": 0.06282364, "balance_loss_mlp": 0.01256122, "epoch": 0.4007816022846836, "flos": 18667239177600.0, "grad_norm": 1.6796356497580218, "language_loss": 0.78854936, "learning_rate": 2.7230280448489236e-06, "loss": 0.86581218, "num_input_tokens_seen": 143164605, "router_z_loss_clip": 1.74023438, "router_z_loss_mlp": 0.13916016, "step": 6666, "time_per_iteration": 2.547142267227173 }, { "auxiliary_loss_clip": 0.06461076, "auxiliary_loss_mlp": 0.01270898, "balance_loss_clip": 0.06285775, "balance_loss_mlp": 0.012567, "epoch": 0.4008417255373516, "flos": 25710344519040.0, "grad_norm": 4.034601739778633, "language_loss": 0.74131358, "learning_rate": 2.7226649100506333e-06, "loss": 0.81863332, "num_input_tokens_seen": 143183965, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.14202881, "step": 6667, "time_per_iteration": 2.592743158340454 }, { "auxiliary_loss_clip": 0.06459466, "auxiliary_loss_mlp": 0.01275377, "balance_loss_clip": 0.06282717, "balance_loss_mlp": 0.01260124, "epoch": 0.40090184879001955, "flos": 22865536149120.0, "grad_norm": 1.3594754475821818, "language_loss": 0.76131779, "learning_rate": 2.7223017478488183e-06, "loss": 0.8386662, "num_input_tokens_seen": 143204965, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.15240479, "step": 6668, "time_per_iteration": 2.590646743774414 }, { "auxiliary_loss_clip": 0.064532, "auxiliary_loss_mlp": 0.01269556, "balance_loss_clip": 0.0628396, "balance_loss_mlp": 0.0125652, "epoch": 0.4009619720426875, "flos": 29067581733120.0, "grad_norm": 1.9877397487860173, "language_loss": 0.82745957, "learning_rate": 2.721938558257248e-06, "loss": 0.90468717, "num_input_tokens_seen": 143225015, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.13031006, "step": 6669, "time_per_iteration": 2.643665075302124 }, { "auxiliary_loss_clip": 0.06335597, "auxiliary_loss_mlp": 0.01254269, "balance_loss_clip": 0.06258342, "balance_loss_mlp": 0.01251069, "epoch": 0.4010220952953555, "flos": 66080347136640.0, "grad_norm": 0.7095740867684659, "language_loss": 0.53262246, "learning_rate": 2.721575341289695e-06, "loss": 0.6085211, "num_input_tokens_seen": 143294925, "router_z_loss_clip": 0.7734375, "router_z_loss_mlp": 0.03204346, "step": 6670, "time_per_iteration": 3.3566298484802246 }, { "auxiliary_loss_clip": 0.06452497, "auxiliary_loss_mlp": 0.01270134, "balance_loss_clip": 0.06281705, "balance_loss_mlp": 0.01256574, "epoch": 0.40108221854802345, "flos": 29650519388160.0, "grad_norm": 1.8003300763779277, "language_loss": 0.88709438, "learning_rate": 2.7212120969599333e-06, "loss": 0.96432066, "num_input_tokens_seen": 143314170, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.13562012, "step": 6671, "time_per_iteration": 2.672757387161255 }, { "auxiliary_loss_clip": 0.06459142, "auxiliary_loss_mlp": 0.0126918, "balance_loss_clip": 0.06284817, "balance_loss_mlp": 0.01254798, "epoch": 0.4011423418006914, "flos": 19934286693120.0, "grad_norm": 1.6988394989449946, "language_loss": 0.79028243, "learning_rate": 2.720848825281736e-06, "loss": 0.86756563, "num_input_tokens_seen": 143330050, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.14398193, "step": 6672, "time_per_iteration": 2.5631723403930664 }, { "auxiliary_loss_clip": 0.06457314, "auxiliary_loss_mlp": 0.01272549, "balance_loss_clip": 0.06285967, "balance_loss_mlp": 0.0125893, "epoch": 0.4012024650533594, "flos": 20090523830400.0, "grad_norm": 1.9760129378613023, "language_loss": 0.63500929, "learning_rate": 2.72048552626888e-06, "loss": 0.71230793, "num_input_tokens_seen": 143348650, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.1361084, "step": 6673, "time_per_iteration": 2.6162829399108887 }, { "auxiliary_loss_clip": 0.06451601, "auxiliary_loss_mlp": 0.01269928, "balance_loss_clip": 0.06279124, "balance_loss_mlp": 0.01256607, "epoch": 0.40126258830602735, "flos": 21703224637440.0, "grad_norm": 1.79236746890571, "language_loss": 0.8064574, "learning_rate": 2.7201221999351402e-06, "loss": 0.88367271, "num_input_tokens_seen": 143370275, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.13323975, "step": 6674, "time_per_iteration": 2.590758800506592 }, { "auxiliary_loss_clip": 0.06465371, "auxiliary_loss_mlp": 0.01274767, "balance_loss_clip": 0.06286569, "balance_loss_mlp": 0.01261219, "epoch": 0.4013227115586953, "flos": 12025160277120.0, "grad_norm": 2.048385451483193, "language_loss": 0.82936817, "learning_rate": 2.719758846294294e-06, "loss": 0.90676963, "num_input_tokens_seen": 143385390, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.13555908, "step": 6675, "time_per_iteration": 2.5665533542633057 }, { "auxiliary_loss_clip": 0.06457977, "auxiliary_loss_mlp": 0.01269618, "balance_loss_clip": 0.0628439, "balance_loss_mlp": 0.01255956, "epoch": 0.4013828348113633, "flos": 25454612257920.0, "grad_norm": 1.7054728457976098, "language_loss": 0.9355377, "learning_rate": 2.71939546536012e-06, "loss": 1.01281369, "num_input_tokens_seen": 143404215, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.13671875, "step": 6676, "time_per_iteration": 2.6056456565856934 }, { "auxiliary_loss_clip": 0.06468569, "auxiliary_loss_mlp": 0.01272837, "balance_loss_clip": 0.06287925, "balance_loss_mlp": 0.01257358, "epoch": 0.40144295806403124, "flos": 18588009542400.0, "grad_norm": 2.363359801278079, "language_loss": 0.79585928, "learning_rate": 2.719032057146399e-06, "loss": 0.87327337, "num_input_tokens_seen": 143422245, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.15496826, "step": 6677, "time_per_iteration": 2.6116557121276855 }, { "auxiliary_loss_clip": 0.0645434, "auxiliary_loss_mlp": 0.01272697, "balance_loss_clip": 0.06281383, "balance_loss_mlp": 0.01258732, "epoch": 0.4015030813166992, "flos": 22936925427840.0, "grad_norm": 1.9442829738786886, "language_loss": 0.83922827, "learning_rate": 2.71866862166691e-06, "loss": 0.91649866, "num_input_tokens_seen": 143443130, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.13970947, "step": 6678, "time_per_iteration": 2.570415496826172 }, { "auxiliary_loss_clip": 0.06456555, "auxiliary_loss_mlp": 0.01272753, "balance_loss_clip": 0.06283607, "balance_loss_mlp": 0.01258055, "epoch": 0.4015632045693672, "flos": 20601359447040.0, "grad_norm": 2.2411280915412344, "language_loss": 0.64011133, "learning_rate": 2.718305158935434e-06, "loss": 0.71740437, "num_input_tokens_seen": 143461385, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.14685059, "step": 6679, "time_per_iteration": 2.564030408859253 }, { "auxiliary_loss_clip": 0.06450175, "auxiliary_loss_mlp": 0.01270071, "balance_loss_clip": 0.06278419, "balance_loss_mlp": 0.01257137, "epoch": 0.4016233278220352, "flos": 23445371203200.0, "grad_norm": 1.384928901765345, "language_loss": 0.79019982, "learning_rate": 2.7179416689657554e-06, "loss": 0.86740232, "num_input_tokens_seen": 143481750, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.1293335, "step": 6680, "time_per_iteration": 2.5765411853790283 }, { "auxiliary_loss_clip": 0.06465209, "auxiliary_loss_mlp": 0.01272038, "balance_loss_clip": 0.06284492, "balance_loss_mlp": 0.01256577, "epoch": 0.40168345107470316, "flos": 21436968689280.0, "grad_norm": 1.55605546008107, "language_loss": 0.75711596, "learning_rate": 2.7175781517716556e-06, "loss": 0.83448845, "num_input_tokens_seen": 143501540, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.15454102, "step": 6681, "time_per_iteration": 2.6375741958618164 }, { "auxiliary_loss_clip": 0.06464218, "auxiliary_loss_mlp": 0.01269689, "balance_loss_clip": 0.06288213, "balance_loss_mlp": 0.01255301, "epoch": 0.4017435743273711, "flos": 22863900994560.0, "grad_norm": 2.0112859547766524, "language_loss": 0.64715666, "learning_rate": 2.7172146073669213e-06, "loss": 0.72449571, "num_input_tokens_seen": 143520530, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.14379883, "step": 6682, "time_per_iteration": 2.5671985149383545 }, { "auxiliary_loss_clip": 0.06458351, "auxiliary_loss_mlp": 0.01269817, "balance_loss_clip": 0.06281487, "balance_loss_mlp": 0.0125646, "epoch": 0.4018036975800391, "flos": 28630022112000.0, "grad_norm": 1.9226484672830004, "language_loss": 0.7292937, "learning_rate": 2.716851035765337e-06, "loss": 0.80657542, "num_input_tokens_seen": 143540210, "router_z_loss_clip": 1.76855469, "router_z_loss_mlp": 0.13366699, "step": 6683, "time_per_iteration": 2.6444625854492188 }, { "auxiliary_loss_clip": 0.06461877, "auxiliary_loss_mlp": 0.0127133, "balance_loss_clip": 0.06287172, "balance_loss_mlp": 0.0125789, "epoch": 0.40186382083270705, "flos": 26658446267520.0, "grad_norm": 1.6248634972741458, "language_loss": 0.73425901, "learning_rate": 2.7164874369806896e-06, "loss": 0.81159109, "num_input_tokens_seen": 143560940, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.13458252, "step": 6684, "time_per_iteration": 2.635192394256592 }, { "auxiliary_loss_clip": 0.06341444, "auxiliary_loss_mlp": 0.01262898, "balance_loss_clip": 0.06264669, "balance_loss_mlp": 0.01259507, "epoch": 0.401923944085375, "flos": 59277167562240.0, "grad_norm": 0.8111197908402574, "language_loss": 0.60405374, "learning_rate": 2.716123811026767e-06, "loss": 0.68009722, "num_input_tokens_seen": 143624015, "router_z_loss_clip": 0.765625, "router_z_loss_mlp": 0.03399658, "step": 6685, "time_per_iteration": 3.3374481201171875 }, { "auxiliary_loss_clip": 0.0646475, "auxiliary_loss_mlp": 0.0126962, "balance_loss_clip": 0.06283933, "balance_loss_mlp": 0.01255005, "epoch": 0.401984067338043, "flos": 16988473825920.0, "grad_norm": 1.955117088162109, "language_loss": 0.70658553, "learning_rate": 2.715760157917357e-06, "loss": 0.78392923, "num_input_tokens_seen": 143642750, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.14611816, "step": 6686, "time_per_iteration": 3.974674940109253 }, { "auxiliary_loss_clip": 0.06455879, "auxiliary_loss_mlp": 0.01276003, "balance_loss_clip": 0.06282511, "balance_loss_mlp": 0.01261954, "epoch": 0.40204419059071095, "flos": 24979387426560.0, "grad_norm": 1.436529686641861, "language_loss": 0.7496888, "learning_rate": 2.7153964776662504e-06, "loss": 0.82700765, "num_input_tokens_seen": 143664515, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.14038086, "step": 6687, "time_per_iteration": 2.5855486392974854 }, { "auxiliary_loss_clip": 0.06460959, "auxiliary_loss_mlp": 0.01272893, "balance_loss_clip": 0.06287698, "balance_loss_mlp": 0.01259398, "epoch": 0.4021043138433789, "flos": 23484252297600.0, "grad_norm": 1.968118438227427, "language_loss": 0.71316952, "learning_rate": 2.7150327702872385e-06, "loss": 0.79050803, "num_input_tokens_seen": 143683135, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.13519287, "step": 6688, "time_per_iteration": 2.603870153427124 }, { "auxiliary_loss_clip": 0.06460474, "auxiliary_loss_mlp": 0.01274639, "balance_loss_clip": 0.06279171, "balance_loss_mlp": 0.01258831, "epoch": 0.4021644370960469, "flos": 26003155011840.0, "grad_norm": 2.6300546865369836, "language_loss": 0.64781201, "learning_rate": 2.7146690357941112e-06, "loss": 0.7251631, "num_input_tokens_seen": 143703985, "router_z_loss_clip": 1.81347656, "router_z_loss_mlp": 0.15808105, "step": 6689, "time_per_iteration": 2.594245195388794 }, { "auxiliary_loss_clip": 0.06461394, "auxiliary_loss_mlp": 0.01269064, "balance_loss_clip": 0.06284371, "balance_loss_mlp": 0.0125529, "epoch": 0.40222456034871484, "flos": 13592816714880.0, "grad_norm": 2.0057498528555273, "language_loss": 0.73919046, "learning_rate": 2.7143052742006632e-06, "loss": 0.816495, "num_input_tokens_seen": 143719245, "router_z_loss_clip": 1.76855469, "router_z_loss_mlp": 0.13775635, "step": 6690, "time_per_iteration": 2.5352859497070312 }, { "auxiliary_loss_clip": 0.06460378, "auxiliary_loss_mlp": 0.01272033, "balance_loss_clip": 0.06286297, "balance_loss_mlp": 0.01257883, "epoch": 0.4022846836013828, "flos": 24284586170880.0, "grad_norm": 1.4942621658658626, "language_loss": 0.74840313, "learning_rate": 2.7139414855206872e-06, "loss": 0.82572722, "num_input_tokens_seen": 143739575, "router_z_loss_clip": 1.74121094, "router_z_loss_mlp": 0.14129639, "step": 6691, "time_per_iteration": 4.121461629867554 }, { "auxiliary_loss_clip": 0.06462988, "auxiliary_loss_mlp": 0.01274122, "balance_loss_clip": 0.06287055, "balance_loss_mlp": 0.01259232, "epoch": 0.40234480685405083, "flos": 20156881864320.0, "grad_norm": 1.6388992786882108, "language_loss": 0.72679651, "learning_rate": 2.7135776697679785e-06, "loss": 0.80416763, "num_input_tokens_seen": 143758515, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.14904785, "step": 6692, "time_per_iteration": 2.622809410095215 }, { "auxiliary_loss_clip": 0.06455074, "auxiliary_loss_mlp": 0.01277149, "balance_loss_clip": 0.06280699, "balance_loss_mlp": 0.01262856, "epoch": 0.4024049301067188, "flos": 22936925427840.0, "grad_norm": 1.6757144560356452, "language_loss": 0.84190524, "learning_rate": 2.7132138269563333e-06, "loss": 0.91922748, "num_input_tokens_seen": 143776770, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.14306641, "step": 6693, "time_per_iteration": 2.571481466293335 }, { "auxiliary_loss_clip": 0.06462049, "auxiliary_loss_mlp": 0.01268709, "balance_loss_clip": 0.06288476, "balance_loss_mlp": 0.0125534, "epoch": 0.40246505335938676, "flos": 36037285297920.0, "grad_norm": 2.279487830366581, "language_loss": 0.71043086, "learning_rate": 2.7128499570995483e-06, "loss": 0.78773844, "num_input_tokens_seen": 143798450, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.13354492, "step": 6694, "time_per_iteration": 2.8095810413360596 }, { "auxiliary_loss_clip": 0.06454958, "auxiliary_loss_mlp": 0.01279066, "balance_loss_clip": 0.06281193, "balance_loss_mlp": 0.01264486, "epoch": 0.4025251766120547, "flos": 20600478979200.0, "grad_norm": 2.196885052473854, "language_loss": 0.68069792, "learning_rate": 2.7124860602114212e-06, "loss": 0.75803822, "num_input_tokens_seen": 143816995, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.14581299, "step": 6695, "time_per_iteration": 2.566904306411743 }, { "auxiliary_loss_clip": 0.06458673, "auxiliary_loss_mlp": 0.01271526, "balance_loss_clip": 0.06284076, "balance_loss_mlp": 0.01257256, "epoch": 0.4025852998647227, "flos": 64537582890240.0, "grad_norm": 3.9890935494302897, "language_loss": 0.79092491, "learning_rate": 2.7121221363057515e-06, "loss": 0.86822689, "num_input_tokens_seen": 143842090, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.1427002, "step": 6696, "time_per_iteration": 2.9717206954956055 }, { "auxiliary_loss_clip": 0.06471203, "auxiliary_loss_mlp": 0.0127134, "balance_loss_clip": 0.0629319, "balance_loss_mlp": 0.01257172, "epoch": 0.40264542311739066, "flos": 20892534785280.0, "grad_norm": 1.831866289172836, "language_loss": 0.71755099, "learning_rate": 2.7117581853963393e-06, "loss": 0.79497635, "num_input_tokens_seen": 143860800, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.14154053, "step": 6697, "time_per_iteration": 4.126566410064697 }, { "auxiliary_loss_clip": 0.06465194, "auxiliary_loss_mlp": 0.01270689, "balance_loss_clip": 0.06292047, "balance_loss_mlp": 0.0125692, "epoch": 0.4027055463700586, "flos": 26257419826560.0, "grad_norm": 1.9928333660629964, "language_loss": 0.61535776, "learning_rate": 2.711394207496984e-06, "loss": 0.6927166, "num_input_tokens_seen": 143878950, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.13775635, "step": 6698, "time_per_iteration": 2.7638115882873535 }, { "auxiliary_loss_clip": 0.06461044, "auxiliary_loss_mlp": 0.01268165, "balance_loss_clip": 0.06284332, "balance_loss_mlp": 0.0125408, "epoch": 0.4027656696227266, "flos": 20637682992000.0, "grad_norm": 2.223237261790526, "language_loss": 0.77019483, "learning_rate": 2.711030202621491e-06, "loss": 0.84748685, "num_input_tokens_seen": 143898385, "router_z_loss_clip": 1.76757812, "router_z_loss_mlp": 0.14080811, "step": 6699, "time_per_iteration": 2.580967903137207 }, { "auxiliary_loss_clip": 0.06458066, "auxiliary_loss_mlp": 0.01268865, "balance_loss_clip": 0.06287605, "balance_loss_mlp": 0.01255287, "epoch": 0.40282579287539455, "flos": 22352855742720.0, "grad_norm": 1.9841517216299123, "language_loss": 0.8065182, "learning_rate": 2.7106661707836605e-06, "loss": 0.88378751, "num_input_tokens_seen": 143918795, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.13598633, "step": 6700, "time_per_iteration": 4.041242361068726 }, { "auxiliary_loss_clip": 0.06473915, "auxiliary_loss_mlp": 0.01271068, "balance_loss_clip": 0.06292855, "balance_loss_mlp": 0.01255368, "epoch": 0.4028859161280625, "flos": 29282126912640.0, "grad_norm": 1.9101249757401468, "language_loss": 0.75079733, "learning_rate": 2.7103021119972977e-06, "loss": 0.82824719, "num_input_tokens_seen": 143938245, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.15686035, "step": 6701, "time_per_iteration": 2.6332151889801025 }, { "auxiliary_loss_clip": 0.06468648, "auxiliary_loss_mlp": 0.01274495, "balance_loss_clip": 0.06293984, "balance_loss_mlp": 0.01260666, "epoch": 0.4029460393807305, "flos": 28630022112000.0, "grad_norm": 2.1391077973665724, "language_loss": 0.66436678, "learning_rate": 2.709938026276208e-06, "loss": 0.74179822, "num_input_tokens_seen": 143960995, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.1385498, "step": 6702, "time_per_iteration": 2.6181745529174805 }, { "auxiliary_loss_clip": 0.06469998, "auxiliary_loss_mlp": 0.01271953, "balance_loss_clip": 0.06292242, "balance_loss_mlp": 0.01257099, "epoch": 0.40300616263339845, "flos": 22608588003840.0, "grad_norm": 2.7459268598029483, "language_loss": 0.66681564, "learning_rate": 2.7095739136341964e-06, "loss": 0.74423516, "num_input_tokens_seen": 143979910, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.14868164, "step": 6703, "time_per_iteration": 2.56160044670105 }, { "auxiliary_loss_clip": 0.06467296, "auxiliary_loss_mlp": 0.01272992, "balance_loss_clip": 0.06289466, "balance_loss_mlp": 0.01257876, "epoch": 0.4030662858860664, "flos": 25527385128960.0, "grad_norm": 1.8286019753839051, "language_loss": 0.822137, "learning_rate": 2.709209774085071e-06, "loss": 0.89953995, "num_input_tokens_seen": 144000095, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.15106201, "step": 6704, "time_per_iteration": 2.5736629962921143 }, { "auxiliary_loss_clip": 0.06472179, "auxiliary_loss_mlp": 0.01272917, "balance_loss_clip": 0.06293543, "balance_loss_mlp": 0.01258683, "epoch": 0.40312640913873443, "flos": 23593474494720.0, "grad_norm": 1.6535073670428913, "language_loss": 0.73548484, "learning_rate": 2.7088456076426407e-06, "loss": 0.81293583, "num_input_tokens_seen": 144019695, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.14227295, "step": 6705, "time_per_iteration": 2.5776193141937256 }, { "auxiliary_loss_clip": 0.06463631, "auxiliary_loss_mlp": 0.01268312, "balance_loss_clip": 0.06292392, "balance_loss_mlp": 0.0125449, "epoch": 0.4031865323914024, "flos": 20017205907840.0, "grad_norm": 2.2593543422216418, "language_loss": 0.66790664, "learning_rate": 2.708481414320713e-06, "loss": 0.74522603, "num_input_tokens_seen": 144038525, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.13806152, "step": 6706, "time_per_iteration": 2.5456607341766357 }, { "auxiliary_loss_clip": 0.06468017, "auxiliary_loss_mlp": 0.01271581, "balance_loss_clip": 0.06293362, "balance_loss_mlp": 0.01257371, "epoch": 0.40324665564407036, "flos": 21877840546560.0, "grad_norm": 1.3903941081721272, "language_loss": 0.71572894, "learning_rate": 2.7081171941330992e-06, "loss": 0.79312491, "num_input_tokens_seen": 144059485, "router_z_loss_clip": 1.74804688, "router_z_loss_mlp": 0.14202881, "step": 6707, "time_per_iteration": 2.5732648372650146 }, { "auxiliary_loss_clip": 0.06457114, "auxiliary_loss_mlp": 0.0128078, "balance_loss_clip": 0.06290179, "balance_loss_mlp": 0.01266969, "epoch": 0.4033067788967383, "flos": 23885572227840.0, "grad_norm": 2.507086033723678, "language_loss": 0.80075967, "learning_rate": 2.707752947093611e-06, "loss": 0.87813866, "num_input_tokens_seen": 144080265, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.13793945, "step": 6708, "time_per_iteration": 2.575692892074585 }, { "auxiliary_loss_clip": 0.06469372, "auxiliary_loss_mlp": 0.01271929, "balance_loss_clip": 0.06288563, "balance_loss_mlp": 0.0125673, "epoch": 0.4033669021494063, "flos": 17425530322560.0, "grad_norm": 1.9530268315667167, "language_loss": 0.83293581, "learning_rate": 2.70738867321606e-06, "loss": 0.91034883, "num_input_tokens_seen": 144098040, "router_z_loss_clip": 1.80859375, "router_z_loss_mlp": 0.15185547, "step": 6709, "time_per_iteration": 2.5537750720977783 }, { "auxiliary_loss_clip": 0.06471077, "auxiliary_loss_mlp": 0.01272514, "balance_loss_clip": 0.0629342, "balance_loss_mlp": 0.01257041, "epoch": 0.40342702540207426, "flos": 29607277881600.0, "grad_norm": 4.031892954519921, "language_loss": 0.71553993, "learning_rate": 2.70702437251426e-06, "loss": 0.79297584, "num_input_tokens_seen": 144118265, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.15478516, "step": 6710, "time_per_iteration": 2.637185573577881 }, { "auxiliary_loss_clip": 0.0646614, "auxiliary_loss_mlp": 0.01270632, "balance_loss_clip": 0.0629267, "balance_loss_mlp": 0.01255623, "epoch": 0.4034871486547422, "flos": 11288249544960.0, "grad_norm": 2.4789389342832377, "language_loss": 0.85442913, "learning_rate": 2.7066600450020236e-06, "loss": 0.93179691, "num_input_tokens_seen": 144133865, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.15014648, "step": 6711, "time_per_iteration": 2.538133382797241 }, { "auxiliary_loss_clip": 0.06472467, "auxiliary_loss_mlp": 0.01274833, "balance_loss_clip": 0.06296775, "balance_loss_mlp": 0.01259265, "epoch": 0.4035472719074102, "flos": 15557097254400.0, "grad_norm": 3.0649995548232156, "language_loss": 0.76583701, "learning_rate": 2.706295690693168e-06, "loss": 0.84331, "num_input_tokens_seen": 144150125, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.15551758, "step": 6712, "time_per_iteration": 2.5334622859954834 }, { "auxiliary_loss_clip": 0.06468509, "auxiliary_loss_mlp": 0.01271165, "balance_loss_clip": 0.06293176, "balance_loss_mlp": 0.01256025, "epoch": 0.40360739516007815, "flos": 24680162096640.0, "grad_norm": 1.9651829961448632, "language_loss": 0.79746079, "learning_rate": 2.7059313096015096e-06, "loss": 0.87485754, "num_input_tokens_seen": 144169295, "router_z_loss_clip": 1.75390625, "router_z_loss_mlp": 0.15130615, "step": 6713, "time_per_iteration": 2.623739719390869 }, { "auxiliary_loss_clip": 0.06472898, "auxiliary_loss_mlp": 0.01271609, "balance_loss_clip": 0.06293361, "balance_loss_mlp": 0.01256934, "epoch": 0.4036675184127461, "flos": 17308635477120.0, "grad_norm": 1.8370116719236265, "language_loss": 0.88323939, "learning_rate": 2.705566901740865e-06, "loss": 0.96068448, "num_input_tokens_seen": 144185790, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.14672852, "step": 6714, "time_per_iteration": 2.542569637298584 }, { "auxiliary_loss_clip": 0.06468916, "auxiliary_loss_mlp": 0.01269774, "balance_loss_clip": 0.06292169, "balance_loss_mlp": 0.01254944, "epoch": 0.4037276416654141, "flos": 19869983084160.0, "grad_norm": 1.5446313558863574, "language_loss": 0.69272172, "learning_rate": 2.7052024671250527e-06, "loss": 0.77010864, "num_input_tokens_seen": 144205190, "router_z_loss_clip": 1.76855469, "router_z_loss_mlp": 0.14831543, "step": 6715, "time_per_iteration": 2.580686330795288 }, { "auxiliary_loss_clip": 0.06474083, "auxiliary_loss_mlp": 0.01272437, "balance_loss_clip": 0.06292197, "balance_loss_mlp": 0.01256546, "epoch": 0.40378776491808205, "flos": 18302158938240.0, "grad_norm": 4.729500323563287, "language_loss": 0.78031719, "learning_rate": 2.704838005767892e-06, "loss": 0.85778248, "num_input_tokens_seen": 144222705, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 0.15869141, "step": 6716, "time_per_iteration": 2.5686402320861816 }, { "auxiliary_loss_clip": 0.06462252, "auxiliary_loss_mlp": 0.01276242, "balance_loss_clip": 0.06292467, "balance_loss_mlp": 0.01262366, "epoch": 0.40384788817075, "flos": 15054772826880.0, "grad_norm": 2.0154354895213085, "language_loss": 0.76697427, "learning_rate": 2.7044735176832037e-06, "loss": 0.84435916, "num_input_tokens_seen": 144239545, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.13891602, "step": 6717, "time_per_iteration": 2.54426646232605 }, { "auxiliary_loss_clip": 0.06351024, "auxiliary_loss_mlp": 0.01255475, "balance_loss_clip": 0.06275184, "balance_loss_mlp": 0.01250921, "epoch": 0.40390801142341803, "flos": 61948659761280.0, "grad_norm": 0.8964518952792284, "language_loss": 0.60207069, "learning_rate": 2.7041090028848084e-06, "loss": 0.67813569, "num_input_tokens_seen": 144288145, "router_z_loss_clip": 0.7578125, "router_z_loss_mlp": 0.04556274, "step": 6718, "time_per_iteration": 3.0487968921661377 }, { "auxiliary_loss_clip": 0.06474768, "auxiliary_loss_mlp": 0.01274104, "balance_loss_clip": 0.06291726, "balance_loss_mlp": 0.0125788, "epoch": 0.403968134676086, "flos": 22743945475200.0, "grad_norm": 2.0184109434580892, "language_loss": 0.74983299, "learning_rate": 2.7037444613865306e-06, "loss": 0.82732165, "num_input_tokens_seen": 144302315, "router_z_loss_clip": 1.83007812, "router_z_loss_mlp": 0.16223145, "step": 6719, "time_per_iteration": 2.560502767562866 }, { "auxiliary_loss_clip": 0.0647357, "auxiliary_loss_mlp": 0.0127443, "balance_loss_clip": 0.06298125, "balance_loss_mlp": 0.01259327, "epoch": 0.40402825792875396, "flos": 19789244075520.0, "grad_norm": 2.1048910951589286, "language_loss": 0.81643689, "learning_rate": 2.7033798932021906e-06, "loss": 0.8939169, "num_input_tokens_seen": 144318990, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.15112305, "step": 6720, "time_per_iteration": 2.566922903060913 }, { "auxiliary_loss_clip": 0.06472044, "auxiliary_loss_mlp": 0.01269059, "balance_loss_clip": 0.06293824, "balance_loss_mlp": 0.01254324, "epoch": 0.40408838118142193, "flos": 19615298999040.0, "grad_norm": 2.199168078420762, "language_loss": 0.77043372, "learning_rate": 2.7030152983456153e-06, "loss": 0.84784472, "num_input_tokens_seen": 144335765, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.14733887, "step": 6721, "time_per_iteration": 2.6084227561950684 }, { "auxiliary_loss_clip": 0.06458929, "auxiliary_loss_mlp": 0.01271479, "balance_loss_clip": 0.06286731, "balance_loss_mlp": 0.01257282, "epoch": 0.4041485044340899, "flos": 24432982951680.0, "grad_norm": 2.7725766038340502, "language_loss": 0.73069006, "learning_rate": 2.7026506768306304e-06, "loss": 0.80799419, "num_input_tokens_seen": 144355825, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.14208984, "step": 6722, "time_per_iteration": 2.589702606201172 }, { "auxiliary_loss_clip": 0.06462228, "auxiliary_loss_mlp": 0.0127108, "balance_loss_clip": 0.06287541, "balance_loss_mlp": 0.01256834, "epoch": 0.40420862768675786, "flos": 16765207822080.0, "grad_norm": 3.52409389614127, "language_loss": 0.65307713, "learning_rate": 2.7022860286710602e-06, "loss": 0.73041022, "num_input_tokens_seen": 144374320, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.14251709, "step": 6723, "time_per_iteration": 2.5573201179504395 }, { "auxiliary_loss_clip": 0.06475042, "auxiliary_loss_mlp": 0.01275513, "balance_loss_clip": 0.06294116, "balance_loss_mlp": 0.0126032, "epoch": 0.4042687509394258, "flos": 22498066068480.0, "grad_norm": 1.4190959623277388, "language_loss": 0.74045599, "learning_rate": 2.701921353880734e-06, "loss": 0.81796157, "num_input_tokens_seen": 144394325, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.15197754, "step": 6724, "time_per_iteration": 2.5933797359466553 }, { "auxiliary_loss_clip": 0.06457087, "auxiliary_loss_mlp": 0.01270919, "balance_loss_clip": 0.06287058, "balance_loss_mlp": 0.01256924, "epoch": 0.4043288741920938, "flos": 30343978978560.0, "grad_norm": 1.8220431149004483, "language_loss": 0.75235868, "learning_rate": 2.7015566524734787e-06, "loss": 0.82963872, "num_input_tokens_seen": 144412765, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.14013672, "step": 6725, "time_per_iteration": 4.031014680862427 }, { "auxiliary_loss_clip": 0.06466124, "auxiliary_loss_mlp": 0.01271112, "balance_loss_clip": 0.0629161, "balance_loss_mlp": 0.0125577, "epoch": 0.40438899744476176, "flos": 46357978947840.0, "grad_norm": 1.9338691630761284, "language_loss": 0.77240741, "learning_rate": 2.701191924463126e-06, "loss": 0.84977984, "num_input_tokens_seen": 144435400, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.15356445, "step": 6726, "time_per_iteration": 2.764108180999756 }, { "auxiliary_loss_clip": 0.06462995, "auxiliary_loss_mlp": 0.01273147, "balance_loss_clip": 0.06285217, "balance_loss_mlp": 0.01257638, "epoch": 0.4044491206974297, "flos": 13338468046080.0, "grad_norm": 2.8925953479607256, "language_loss": 0.82587212, "learning_rate": 2.7008271698635054e-06, "loss": 0.90323353, "num_input_tokens_seen": 144452925, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.15515137, "step": 6727, "time_per_iteration": 2.5431177616119385 }, { "auxiliary_loss_clip": 0.06461309, "auxiliary_loss_mlp": 0.01271587, "balance_loss_clip": 0.06286911, "balance_loss_mlp": 0.01257139, "epoch": 0.4045092439500977, "flos": 12098603980800.0, "grad_norm": 2.4253208706264107, "language_loss": 0.85712266, "learning_rate": 2.700462388688447e-06, "loss": 0.93445158, "num_input_tokens_seen": 144470195, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.14434814, "step": 6728, "time_per_iteration": 2.522890090942383 }, { "auxiliary_loss_clip": 0.0645493, "auxiliary_loss_mlp": 0.01272973, "balance_loss_clip": 0.06283006, "balance_loss_mlp": 0.01257732, "epoch": 0.40456936720276565, "flos": 21186225745920.0, "grad_norm": 1.657586045664663, "language_loss": 0.82468557, "learning_rate": 2.700097580951786e-06, "loss": 0.9019646, "num_input_tokens_seen": 144490320, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.15240479, "step": 6729, "time_per_iteration": 2.567728042602539 }, { "auxiliary_loss_clip": 0.06461753, "auxiliary_loss_mlp": 0.01272926, "balance_loss_clip": 0.06285073, "balance_loss_mlp": 0.01257286, "epoch": 0.4046294904554336, "flos": 23922147335040.0, "grad_norm": 3.697575262843075, "language_loss": 0.73984849, "learning_rate": 2.6997327466673533e-06, "loss": 0.8171953, "num_input_tokens_seen": 144508990, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.15649414, "step": 6730, "time_per_iteration": 3.994880199432373 }, { "auxiliary_loss_clip": 0.06455643, "auxiliary_loss_mlp": 0.01268442, "balance_loss_clip": 0.06281458, "balance_loss_mlp": 0.01254458, "epoch": 0.4046896137081016, "flos": 38080376202240.0, "grad_norm": 2.776080713164482, "language_loss": 0.67998475, "learning_rate": 2.699367885848985e-06, "loss": 0.75722563, "num_input_tokens_seen": 144529550, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.13989258, "step": 6731, "time_per_iteration": 2.70048451423645 }, { "auxiliary_loss_clip": 0.06463137, "auxiliary_loss_mlp": 0.01272013, "balance_loss_clip": 0.06288029, "balance_loss_mlp": 0.01256778, "epoch": 0.4047497369607696, "flos": 23623047786240.0, "grad_norm": 1.4711470774984252, "language_loss": 0.74539244, "learning_rate": 2.699002998510517e-06, "loss": 0.82274389, "num_input_tokens_seen": 144549310, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.15234375, "step": 6732, "time_per_iteration": 2.5942630767822266 }, { "auxiliary_loss_clip": 0.06457819, "auxiliary_loss_mlp": 0.01272666, "balance_loss_clip": 0.06285875, "balance_loss_mlp": 0.01258415, "epoch": 0.40480986021343757, "flos": 12828596751360.0, "grad_norm": 1.851894135353113, "language_loss": 0.77614152, "learning_rate": 2.6986380846657852e-06, "loss": 0.85344625, "num_input_tokens_seen": 144567430, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.14251709, "step": 6733, "time_per_iteration": 2.5684425830841064 }, { "auxiliary_loss_clip": 0.06465727, "auxiliary_loss_mlp": 0.01270861, "balance_loss_clip": 0.06285308, "balance_loss_mlp": 0.01254696, "epoch": 0.40486998346610553, "flos": 23775511489920.0, "grad_norm": 1.842934711937025, "language_loss": 0.76660311, "learning_rate": 2.698273144328627e-06, "loss": 0.84396899, "num_input_tokens_seen": 144585975, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.16174316, "step": 6734, "time_per_iteration": 2.595956563949585 }, { "auxiliary_loss_clip": 0.06471379, "auxiliary_loss_mlp": 0.01268674, "balance_loss_clip": 0.06290545, "balance_loss_mlp": 0.01253987, "epoch": 0.4049301067187735, "flos": 22863439797120.0, "grad_norm": 2.36710709912721, "language_loss": 0.65481853, "learning_rate": 2.6979081775128805e-06, "loss": 0.7322191, "num_input_tokens_seen": 144605225, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.14685059, "step": 6735, "time_per_iteration": 2.566826820373535 }, { "auxiliary_loss_clip": 0.06465963, "auxiliary_loss_mlp": 0.0126978, "balance_loss_clip": 0.06292609, "balance_loss_mlp": 0.0125563, "epoch": 0.40499022997144146, "flos": 22790624999040.0, "grad_norm": 2.345266035874642, "language_loss": 0.8347919, "learning_rate": 2.697543184232387e-06, "loss": 0.91214931, "num_input_tokens_seen": 144624145, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.1416626, "step": 6736, "time_per_iteration": 3.923220634460449 }, { "auxiliary_loss_clip": 0.06465683, "auxiliary_loss_mlp": 0.01275461, "balance_loss_clip": 0.06288249, "balance_loss_mlp": 0.01259791, "epoch": 0.4050503532241094, "flos": 23046021843840.0, "grad_norm": 1.718366026952594, "language_loss": 0.7546711, "learning_rate": 2.6971781645009863e-06, "loss": 0.83208251, "num_input_tokens_seen": 144644470, "router_z_loss_clip": 1.77246094, "router_z_loss_mlp": 0.15679932, "step": 6737, "time_per_iteration": 2.5820484161376953 }, { "auxiliary_loss_clip": 0.06464028, "auxiliary_loss_mlp": 0.0127464, "balance_loss_clip": 0.0629091, "balance_loss_mlp": 0.01260102, "epoch": 0.4051104764767774, "flos": 16652254118400.0, "grad_norm": 3.2367236682069027, "language_loss": 0.72225922, "learning_rate": 2.696813118332519e-06, "loss": 0.7996459, "num_input_tokens_seen": 144661055, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.14532471, "step": 6738, "time_per_iteration": 2.5393784046173096 }, { "auxiliary_loss_clip": 0.0647014, "auxiliary_loss_mlp": 0.01270442, "balance_loss_clip": 0.06296978, "balance_loss_mlp": 0.0125597, "epoch": 0.40517059972944536, "flos": 16363929818880.0, "grad_norm": 3.6994988899285057, "language_loss": 0.7561307, "learning_rate": 2.696448045740828e-06, "loss": 0.83353651, "num_input_tokens_seen": 144677935, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.14465332, "step": 6739, "time_per_iteration": 2.5919413566589355 }, { "auxiliary_loss_clip": 0.06472072, "auxiliary_loss_mlp": 0.01275918, "balance_loss_clip": 0.0629681, "balance_loss_mlp": 0.01260588, "epoch": 0.4052307229821133, "flos": 28810885150080.0, "grad_norm": 1.911340966926702, "language_loss": 0.74583822, "learning_rate": 2.6960829467397576e-06, "loss": 0.82331812, "num_input_tokens_seen": 144697725, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.15344238, "step": 6740, "time_per_iteration": 4.20359468460083 }, { "auxiliary_loss_clip": 0.06466226, "auxiliary_loss_mlp": 0.01275828, "balance_loss_clip": 0.0629553, "balance_loss_mlp": 0.0126045, "epoch": 0.4052908462347813, "flos": 21404334723840.0, "grad_norm": 1.6181199648049667, "language_loss": 0.77750331, "learning_rate": 2.695717821343153e-06, "loss": 0.85492384, "num_input_tokens_seen": 144718805, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.15374756, "step": 6741, "time_per_iteration": 2.628502368927002 }, { "auxiliary_loss_clip": 0.06471007, "auxiliary_loss_mlp": 0.01274481, "balance_loss_clip": 0.06295925, "balance_loss_mlp": 0.01258876, "epoch": 0.40535096948744925, "flos": 22425628613760.0, "grad_norm": 1.6938580867789783, "language_loss": 0.71939701, "learning_rate": 2.6953526695648577e-06, "loss": 0.79685187, "num_input_tokens_seen": 144737105, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.15588379, "step": 6742, "time_per_iteration": 2.6131906509399414 }, { "auxiliary_loss_clip": 0.06469473, "auxiliary_loss_mlp": 0.01273138, "balance_loss_clip": 0.06294622, "balance_loss_mlp": 0.01257009, "epoch": 0.4054110927401172, "flos": 17015028370560.0, "grad_norm": 2.378001391665026, "language_loss": 0.7310518, "learning_rate": 2.6949874914187202e-06, "loss": 0.80847794, "num_input_tokens_seen": 144751350, "router_z_loss_clip": 1.74902344, "router_z_loss_mlp": 0.16137695, "step": 6743, "time_per_iteration": 2.560652732849121 }, { "auxiliary_loss_clip": 0.06470265, "auxiliary_loss_mlp": 0.01281806, "balance_loss_clip": 0.06292719, "balance_loss_mlp": 0.01266225, "epoch": 0.4054712159927852, "flos": 21621018182400.0, "grad_norm": 1.9450382021623405, "language_loss": 0.71457994, "learning_rate": 2.694622286918588e-06, "loss": 0.79210061, "num_input_tokens_seen": 144770030, "router_z_loss_clip": 1.77539062, "router_z_loss_mlp": 0.15588379, "step": 6744, "time_per_iteration": 2.5539205074310303 }, { "auxiliary_loss_clip": 0.06468426, "auxiliary_loss_mlp": 0.01275287, "balance_loss_clip": 0.06297222, "balance_loss_mlp": 0.01260803, "epoch": 0.4055313392454532, "flos": 25819734424320.0, "grad_norm": 1.897121014981497, "language_loss": 0.7997328, "learning_rate": 2.6942570560783076e-06, "loss": 0.87716991, "num_input_tokens_seen": 144790965, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.14477539, "step": 6745, "time_per_iteration": 2.7185606956481934 }, { "auxiliary_loss_clip": 0.06471764, "auxiliary_loss_mlp": 0.01275907, "balance_loss_clip": 0.06299661, "balance_loss_mlp": 0.01260547, "epoch": 0.40559146249812117, "flos": 14142323790720.0, "grad_norm": 1.8373306916418084, "language_loss": 0.6654399, "learning_rate": 2.693891798911731e-06, "loss": 0.74291658, "num_input_tokens_seen": 144807755, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.15362549, "step": 6746, "time_per_iteration": 2.549147129058838 }, { "auxiliary_loss_clip": 0.06463759, "auxiliary_loss_mlp": 0.01268122, "balance_loss_clip": 0.06290534, "balance_loss_mlp": 0.01254318, "epoch": 0.40565158575078913, "flos": 41365259815680.0, "grad_norm": 1.4512687639598898, "language_loss": 0.57315987, "learning_rate": 2.6935265154327075e-06, "loss": 0.65047866, "num_input_tokens_seen": 144832405, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.13800049, "step": 6747, "time_per_iteration": 2.7549562454223633 }, { "auxiliary_loss_clip": 0.06469329, "auxiliary_loss_mlp": 0.0126719, "balance_loss_clip": 0.06293961, "balance_loss_mlp": 0.01253088, "epoch": 0.4057117090034571, "flos": 28551421382400.0, "grad_norm": 2.165774785327281, "language_loss": 0.84511864, "learning_rate": 2.693161205655089e-06, "loss": 0.92248386, "num_input_tokens_seen": 144853890, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.14105225, "step": 6748, "time_per_iteration": 2.6383163928985596 }, { "auxiliary_loss_clip": 0.06469759, "auxiliary_loss_mlp": 0.01269593, "balance_loss_clip": 0.06293453, "balance_loss_mlp": 0.01254703, "epoch": 0.40577183225612506, "flos": 18009851569920.0, "grad_norm": 2.0337413314237276, "language_loss": 0.81883591, "learning_rate": 2.6927958695927287e-06, "loss": 0.89622939, "num_input_tokens_seen": 144871395, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.14880371, "step": 6749, "time_per_iteration": 2.6586616039276123 }, { "auxiliary_loss_clip": 0.06465326, "auxiliary_loss_mlp": 0.0127035, "balance_loss_clip": 0.06292796, "balance_loss_mlp": 0.0125558, "epoch": 0.40583195550879303, "flos": 19542819617280.0, "grad_norm": 3.932419643963564, "language_loss": 0.7565217, "learning_rate": 2.6924305072594784e-06, "loss": 0.8338784, "num_input_tokens_seen": 144890975, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.14782715, "step": 6750, "time_per_iteration": 2.5949432849884033 }, { "auxiliary_loss_clip": 0.06471889, "auxiliary_loss_mlp": 0.01273201, "balance_loss_clip": 0.06290691, "balance_loss_mlp": 0.01258574, "epoch": 0.405892078761461, "flos": 22315987146240.0, "grad_norm": 2.4360819207049134, "language_loss": 0.74345779, "learning_rate": 2.692065118669195e-06, "loss": 0.82090873, "num_input_tokens_seen": 144908170, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.14630127, "step": 6751, "time_per_iteration": 2.577066421508789 }, { "auxiliary_loss_clip": 0.06468211, "auxiliary_loss_mlp": 0.01272254, "balance_loss_clip": 0.06291571, "balance_loss_mlp": 0.01257544, "epoch": 0.40595220201412896, "flos": 25491564708480.0, "grad_norm": 5.21053905053476, "language_loss": 0.67114931, "learning_rate": 2.6916997038357326e-06, "loss": 0.74855399, "num_input_tokens_seen": 144928020, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.14715576, "step": 6752, "time_per_iteration": 2.6278328895568848 }, { "auxiliary_loss_clip": 0.06465247, "auxiliary_loss_mlp": 0.01274586, "balance_loss_clip": 0.06286353, "balance_loss_mlp": 0.01259196, "epoch": 0.4060123252667969, "flos": 49867092887040.0, "grad_norm": 1.6733731560091007, "language_loss": 0.7125203, "learning_rate": 2.691334262772948e-06, "loss": 0.78991872, "num_input_tokens_seen": 144951240, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.15393066, "step": 6753, "time_per_iteration": 2.8264527320861816 }, { "auxiliary_loss_clip": 0.06464618, "auxiliary_loss_mlp": 0.01271105, "balance_loss_clip": 0.06286655, "balance_loss_mlp": 0.01256984, "epoch": 0.4060724485194649, "flos": 21140720179200.0, "grad_norm": 2.1055290070116057, "language_loss": 0.72581321, "learning_rate": 2.690968795494699e-06, "loss": 0.80317044, "num_input_tokens_seen": 144969100, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.14129639, "step": 6754, "time_per_iteration": 2.5707528591156006 }, { "auxiliary_loss_clip": 0.0646662, "auxiliary_loss_mlp": 0.0127538, "balance_loss_clip": 0.06289466, "balance_loss_mlp": 0.01261188, "epoch": 0.40613257177213286, "flos": 21763796739840.0, "grad_norm": 1.7139206851806537, "language_loss": 0.8325901, "learning_rate": 2.690603302014844e-06, "loss": 0.9100101, "num_input_tokens_seen": 144987065, "router_z_loss_clip": 1.77050781, "router_z_loss_mlp": 0.14196777, "step": 6755, "time_per_iteration": 2.576695203781128 }, { "auxiliary_loss_clip": 0.06476243, "auxiliary_loss_mlp": 0.01271592, "balance_loss_clip": 0.06294224, "balance_loss_mlp": 0.01256035, "epoch": 0.4061926950248008, "flos": 25561863884160.0, "grad_norm": 1.5948971827315683, "language_loss": 0.71249568, "learning_rate": 2.6902377823472426e-06, "loss": 0.78997397, "num_input_tokens_seen": 145007310, "router_z_loss_clip": 1.81738281, "router_z_loss_mlp": 0.15551758, "step": 6756, "time_per_iteration": 2.650613307952881 }, { "auxiliary_loss_clip": 0.06468458, "auxiliary_loss_mlp": 0.01274479, "balance_loss_clip": 0.06287926, "balance_loss_mlp": 0.01259685, "epoch": 0.4062528182774688, "flos": 23702528983680.0, "grad_norm": 1.8071160078634843, "language_loss": 0.79062998, "learning_rate": 2.689872236505755e-06, "loss": 0.8680594, "num_input_tokens_seen": 145026210, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.14807129, "step": 6757, "time_per_iteration": 2.6096620559692383 }, { "auxiliary_loss_clip": 0.06464764, "auxiliary_loss_mlp": 0.0127415, "balance_loss_clip": 0.06288788, "balance_loss_mlp": 0.01258891, "epoch": 0.4063129415301368, "flos": 21732504439680.0, "grad_norm": 2.4614244095685938, "language_loss": 0.78547835, "learning_rate": 2.6895066645042437e-06, "loss": 0.86286747, "num_input_tokens_seen": 145045475, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.15246582, "step": 6758, "time_per_iteration": 2.6058695316314697 }, { "auxiliary_loss_clip": 0.06466328, "auxiliary_loss_mlp": 0.01271371, "balance_loss_clip": 0.06295508, "balance_loss_mlp": 0.01257614, "epoch": 0.40637306478280477, "flos": 12792650549760.0, "grad_norm": 2.0109027072684693, "language_loss": 0.89087343, "learning_rate": 2.6891410663565703e-06, "loss": 0.96825039, "num_input_tokens_seen": 145062260, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.13757324, "step": 6759, "time_per_iteration": 2.5949251651763916 }, { "auxiliary_loss_clip": 0.06465904, "auxiliary_loss_mlp": 0.01274713, "balance_loss_clip": 0.06290156, "balance_loss_mlp": 0.01260241, "epoch": 0.40643318803547274, "flos": 24031327605120.0, "grad_norm": 2.011795107009417, "language_loss": 0.64575487, "learning_rate": 2.688775442076598e-06, "loss": 0.72316104, "num_input_tokens_seen": 145082470, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.14471436, "step": 6760, "time_per_iteration": 2.645096778869629 }, { "auxiliary_loss_clip": 0.0646856, "auxiliary_loss_mlp": 0.01274894, "balance_loss_clip": 0.06290165, "balance_loss_mlp": 0.01260291, "epoch": 0.4064933112881407, "flos": 25599361386240.0, "grad_norm": 2.185045912905982, "language_loss": 0.753883, "learning_rate": 2.688409791678193e-06, "loss": 0.83131748, "num_input_tokens_seen": 145105685, "router_z_loss_clip": 1.78417969, "router_z_loss_mlp": 0.14599609, "step": 6761, "time_per_iteration": 2.6489675045013428 }, { "auxiliary_loss_clip": 0.06463116, "auxiliary_loss_mlp": 0.01275022, "balance_loss_clip": 0.06295428, "balance_loss_mlp": 0.01261516, "epoch": 0.40655343454080867, "flos": 22060841863680.0, "grad_norm": 1.378748808043542, "language_loss": 0.7007755, "learning_rate": 2.6880441151752185e-06, "loss": 0.77815694, "num_input_tokens_seen": 145125590, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.13513184, "step": 6762, "time_per_iteration": 2.6499106884002686 }, { "auxiliary_loss_clip": 0.06466369, "auxiliary_loss_mlp": 0.01273026, "balance_loss_clip": 0.06292047, "balance_loss_mlp": 0.01259508, "epoch": 0.40661355779347663, "flos": 26476115783040.0, "grad_norm": 1.4626989801812376, "language_loss": 0.74174857, "learning_rate": 2.6876784125815433e-06, "loss": 0.81914252, "num_input_tokens_seen": 145146810, "router_z_loss_clip": 1.74414062, "router_z_loss_mlp": 0.13531494, "step": 6763, "time_per_iteration": 2.6416749954223633 }, { "auxiliary_loss_clip": 0.06476916, "auxiliary_loss_mlp": 0.0127523, "balance_loss_clip": 0.06299391, "balance_loss_mlp": 0.01260186, "epoch": 0.4066736810461446, "flos": 13266156372480.0, "grad_norm": 1.8634842529486717, "language_loss": 0.69209534, "learning_rate": 2.687312683911033e-06, "loss": 0.76961672, "num_input_tokens_seen": 145163130, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.15063477, "step": 6764, "time_per_iteration": 2.6039071083068848 }, { "auxiliary_loss_clip": 0.06478262, "auxiliary_loss_mlp": 0.01271452, "balance_loss_clip": 0.0629762, "balance_loss_mlp": 0.01255597, "epoch": 0.40673380429881256, "flos": 28811178639360.0, "grad_norm": 2.166659753351662, "language_loss": 0.91629946, "learning_rate": 2.686946929177557e-06, "loss": 0.99379659, "num_input_tokens_seen": 145181420, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.15838623, "step": 6765, "time_per_iteration": 4.0116283893585205 }, { "auxiliary_loss_clip": 0.06474861, "auxiliary_loss_mlp": 0.01274844, "balance_loss_clip": 0.0629501, "balance_loss_mlp": 0.01260396, "epoch": 0.4067939275514805, "flos": 12500301254400.0, "grad_norm": 2.6490019521073442, "language_loss": 0.80000496, "learning_rate": 2.6865811483949855e-06, "loss": 0.87750208, "num_input_tokens_seen": 145198545, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.14459229, "step": 6766, "time_per_iteration": 2.585218906402588 }, { "auxiliary_loss_clip": 0.06471923, "auxiliary_loss_mlp": 0.01274541, "balance_loss_clip": 0.06294449, "balance_loss_mlp": 0.01259777, "epoch": 0.4068540508041485, "flos": 18776461374720.0, "grad_norm": 1.817919212235201, "language_loss": 0.76602411, "learning_rate": 2.6862153415771867e-06, "loss": 0.84348875, "num_input_tokens_seen": 145215835, "router_z_loss_clip": 1.77246094, "router_z_loss_mlp": 0.14764404, "step": 6767, "time_per_iteration": 2.5354952812194824 }, { "auxiliary_loss_clip": 0.06472171, "auxiliary_loss_mlp": 0.01277367, "balance_loss_clip": 0.06298502, "balance_loss_mlp": 0.01262693, "epoch": 0.40691417405681646, "flos": 28520506425600.0, "grad_norm": 2.256791267918135, "language_loss": 0.77657378, "learning_rate": 2.685849508738034e-06, "loss": 0.85406917, "num_input_tokens_seen": 145236555, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.14678955, "step": 6768, "time_per_iteration": 2.6527445316314697 }, { "auxiliary_loss_clip": 0.06465469, "auxiliary_loss_mlp": 0.01274352, "balance_loss_clip": 0.06292339, "balance_loss_mlp": 0.01260839, "epoch": 0.4069742973094844, "flos": 20820390819840.0, "grad_norm": 1.9621467063418072, "language_loss": 0.87527698, "learning_rate": 2.6854836498913995e-06, "loss": 0.9526751, "num_input_tokens_seen": 145254595, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.1350708, "step": 6769, "time_per_iteration": 2.5709776878356934 }, { "auxiliary_loss_clip": 0.06467508, "auxiliary_loss_mlp": 0.01270925, "balance_loss_clip": 0.06298229, "balance_loss_mlp": 0.01257675, "epoch": 0.4070344205621524, "flos": 21476646397440.0, "grad_norm": 2.2350737662296303, "language_loss": 0.81107843, "learning_rate": 2.685117765051156e-06, "loss": 0.88846278, "num_input_tokens_seen": 145274005, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.13238525, "step": 6770, "time_per_iteration": 4.013275861740112 }, { "auxiliary_loss_clip": 0.06471369, "auxiliary_loss_mlp": 0.01268486, "balance_loss_clip": 0.0629327, "balance_loss_mlp": 0.01253561, "epoch": 0.4070945438148204, "flos": 26836709829120.0, "grad_norm": 1.7458262450765603, "language_loss": 0.80212075, "learning_rate": 2.6847518542311783e-06, "loss": 0.87951934, "num_input_tokens_seen": 145294850, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.14904785, "step": 6771, "time_per_iteration": 2.6072018146514893 }, { "auxiliary_loss_clip": 0.06466151, "auxiliary_loss_mlp": 0.01274322, "balance_loss_clip": 0.06292487, "balance_loss_mlp": 0.01260803, "epoch": 0.4071546670674884, "flos": 26360478748800.0, "grad_norm": 1.3729292164750084, "language_loss": 0.76115024, "learning_rate": 2.6843859174453417e-06, "loss": 0.83855504, "num_input_tokens_seen": 145317050, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.13525391, "step": 6772, "time_per_iteration": 2.6432971954345703 }, { "auxiliary_loss_clip": 0.06465845, "auxiliary_loss_mlp": 0.01271472, "balance_loss_clip": 0.06290398, "balance_loss_mlp": 0.01256648, "epoch": 0.40721479032015634, "flos": 17901300205440.0, "grad_norm": 2.0404161085611303, "language_loss": 0.81436467, "learning_rate": 2.6840199547075218e-06, "loss": 0.89173788, "num_input_tokens_seen": 145334480, "router_z_loss_clip": 1.75390625, "router_z_loss_mlp": 0.14825439, "step": 6773, "time_per_iteration": 2.552910804748535 }, { "auxiliary_loss_clip": 0.06364428, "auxiliary_loss_mlp": 0.01273148, "balance_loss_clip": 0.06288394, "balance_loss_mlp": 0.01269349, "epoch": 0.4072749135728243, "flos": 49871522424960.0, "grad_norm": 0.8869525653371005, "language_loss": 0.64262474, "learning_rate": 2.683653966031597e-06, "loss": 0.71900046, "num_input_tokens_seen": 145388695, "router_z_loss_clip": 0.75830078, "router_z_loss_mlp": 0.03793335, "step": 6774, "time_per_iteration": 3.125117778778076 }, { "auxiliary_loss_clip": 0.06465209, "auxiliary_loss_mlp": 0.01270071, "balance_loss_clip": 0.06288201, "balance_loss_mlp": 0.01255926, "epoch": 0.40733503682549227, "flos": 27571063011840.0, "grad_norm": 1.7682176235405855, "language_loss": 0.73006868, "learning_rate": 2.683287951431446e-06, "loss": 0.80742145, "num_input_tokens_seen": 145408240, "router_z_loss_clip": 1.77050781, "router_z_loss_mlp": 0.14147949, "step": 6775, "time_per_iteration": 2.676158905029297 }, { "auxiliary_loss_clip": 0.06467533, "auxiliary_loss_mlp": 0.01274141, "balance_loss_clip": 0.06292516, "balance_loss_mlp": 0.01259198, "epoch": 0.40739516007816023, "flos": 22133447026560.0, "grad_norm": 1.8512246681524849, "language_loss": 0.77979445, "learning_rate": 2.6829219109209474e-06, "loss": 0.85721111, "num_input_tokens_seen": 145428395, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.14959717, "step": 6776, "time_per_iteration": 4.043287038803101 }, { "auxiliary_loss_clip": 0.06468713, "auxiliary_loss_mlp": 0.01270821, "balance_loss_clip": 0.06290457, "balance_loss_mlp": 0.01256838, "epoch": 0.4074552833308282, "flos": 23849080974720.0, "grad_norm": 2.5340265447196115, "language_loss": 0.79324365, "learning_rate": 2.682555844513981e-06, "loss": 0.87063897, "num_input_tokens_seen": 145448290, "router_z_loss_clip": 1.78417969, "router_z_loss_mlp": 0.13983154, "step": 6777, "time_per_iteration": 2.6118876934051514 }, { "auxiliary_loss_clip": 0.06356864, "auxiliary_loss_mlp": 0.01257399, "balance_loss_clip": 0.0628072, "balance_loss_mlp": 0.01253888, "epoch": 0.40751540658349616, "flos": 58019847120000.0, "grad_norm": 0.6721864620813237, "language_loss": 0.52951849, "learning_rate": 2.6821897522244286e-06, "loss": 0.60566109, "num_input_tokens_seen": 145509785, "router_z_loss_clip": 0.76220703, "router_z_loss_mlp": 0.03515625, "step": 6778, "time_per_iteration": 3.294189453125 }, { "auxiliary_loss_clip": 0.06467082, "auxiliary_loss_mlp": 0.01271757, "balance_loss_clip": 0.06292339, "balance_loss_mlp": 0.01257356, "epoch": 0.40757552983616413, "flos": 21220956063360.0, "grad_norm": 2.2718764872634427, "language_loss": 0.83529019, "learning_rate": 2.6818236340661718e-06, "loss": 0.9126786, "num_input_tokens_seen": 145528620, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.14404297, "step": 6779, "time_per_iteration": 4.043463230133057 }, { "auxiliary_loss_clip": 0.06462613, "auxiliary_loss_mlp": 0.01269207, "balance_loss_clip": 0.06288697, "balance_loss_mlp": 0.01254341, "epoch": 0.4076356530888321, "flos": 26840776752000.0, "grad_norm": 2.227689227048513, "language_loss": 0.76497436, "learning_rate": 2.6814574900530957e-06, "loss": 0.84229261, "num_input_tokens_seen": 145547775, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.14862061, "step": 6780, "time_per_iteration": 2.6140074729919434 }, { "auxiliary_loss_clip": 0.06458645, "auxiliary_loss_mlp": 0.01269869, "balance_loss_clip": 0.06288647, "balance_loss_mlp": 0.01256994, "epoch": 0.40769577634150006, "flos": 12207868104960.0, "grad_norm": 2.630524742327823, "language_loss": 0.66699934, "learning_rate": 2.6810913201990827e-06, "loss": 0.74428451, "num_input_tokens_seen": 145564465, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.12884521, "step": 6781, "time_per_iteration": 2.545132637023926 }, { "auxiliary_loss_clip": 0.06463178, "auxiliary_loss_mlp": 0.01276967, "balance_loss_clip": 0.06289776, "balance_loss_mlp": 0.01262489, "epoch": 0.407755899594168, "flos": 33663467128320.0, "grad_norm": 1.7098929784372252, "language_loss": 0.71722507, "learning_rate": 2.6807251245180183e-06, "loss": 0.79462653, "num_input_tokens_seen": 145585965, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.14459229, "step": 6782, "time_per_iteration": 2.6824634075164795 }, { "auxiliary_loss_clip": 0.06462392, "auxiliary_loss_mlp": 0.0127818, "balance_loss_clip": 0.06287029, "balance_loss_mlp": 0.01263738, "epoch": 0.407816022846836, "flos": 20163590190720.0, "grad_norm": 1.7522908581298628, "language_loss": 0.81953764, "learning_rate": 2.6803589030237897e-06, "loss": 0.89694339, "num_input_tokens_seen": 145605000, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.14422607, "step": 6783, "time_per_iteration": 2.589784860610962 }, { "auxiliary_loss_clip": 0.06465968, "auxiliary_loss_mlp": 0.01271738, "balance_loss_clip": 0.06290135, "balance_loss_mlp": 0.01257725, "epoch": 0.40787614609950396, "flos": 21185219496960.0, "grad_norm": 1.6021930365875094, "language_loss": 0.80716938, "learning_rate": 2.679992655730283e-06, "loss": 0.88454646, "num_input_tokens_seen": 145623740, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.14007568, "step": 6784, "time_per_iteration": 2.585503578186035 }, { "auxiliary_loss_clip": 0.06467621, "auxiliary_loss_mlp": 0.01279235, "balance_loss_clip": 0.06286117, "balance_loss_mlp": 0.01263189, "epoch": 0.407936269352172, "flos": 20526699859200.0, "grad_norm": 2.1441518255081653, "language_loss": 0.6586073, "learning_rate": 2.679626382651386e-06, "loss": 0.73607588, "num_input_tokens_seen": 145643515, "router_z_loss_clip": 1.81347656, "router_z_loss_mlp": 0.16052246, "step": 6785, "time_per_iteration": 2.612643241882324 }, { "auxiliary_loss_clip": 0.06460992, "auxiliary_loss_mlp": 0.01277264, "balance_loss_clip": 0.0628711, "balance_loss_mlp": 0.01263043, "epoch": 0.40799639260483994, "flos": 20124709096320.0, "grad_norm": 1.9211897586331592, "language_loss": 0.80570686, "learning_rate": 2.679260083800989e-06, "loss": 0.88308942, "num_input_tokens_seen": 145660890, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.14215088, "step": 6786, "time_per_iteration": 2.736201286315918 }, { "auxiliary_loss_clip": 0.06473152, "auxiliary_loss_mlp": 0.01277029, "balance_loss_clip": 0.06298676, "balance_loss_mlp": 0.0126264, "epoch": 0.4080565158575079, "flos": 21003853334400.0, "grad_norm": 2.0544755416268288, "language_loss": 0.81810021, "learning_rate": 2.678893759192982e-06, "loss": 0.89560205, "num_input_tokens_seen": 145680070, "router_z_loss_clip": 1.74414062, "router_z_loss_mlp": 0.14398193, "step": 6787, "time_per_iteration": 2.6140172481536865 }, { "auxiliary_loss_clip": 0.06464326, "auxiliary_loss_mlp": 0.01274719, "balance_loss_clip": 0.06291306, "balance_loss_mlp": 0.01260939, "epoch": 0.40811663911017587, "flos": 19323746317440.0, "grad_norm": 1.758507101822191, "language_loss": 0.68521917, "learning_rate": 2.678527408841255e-06, "loss": 0.7626096, "num_input_tokens_seen": 145698010, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.13775635, "step": 6788, "time_per_iteration": 2.6107239723205566 }, { "auxiliary_loss_clip": 0.06471401, "auxiliary_loss_mlp": 0.0127525, "balance_loss_clip": 0.0629776, "balance_loss_mlp": 0.01260456, "epoch": 0.40817676236284384, "flos": 40634973555840.0, "grad_norm": 1.8350291605239524, "language_loss": 0.66148162, "learning_rate": 2.678161032759701e-06, "loss": 0.73894817, "num_input_tokens_seen": 145722215, "router_z_loss_clip": 1.73730469, "router_z_loss_mlp": 0.14788818, "step": 6789, "time_per_iteration": 2.7965009212493896 }, { "auxiliary_loss_clip": 0.0646816, "auxiliary_loss_mlp": 0.01276505, "balance_loss_clip": 0.06294487, "balance_loss_mlp": 0.0126133, "epoch": 0.4082368856155118, "flos": 20528376940800.0, "grad_norm": 1.652355980715805, "language_loss": 0.60729629, "learning_rate": 2.6777946309622123e-06, "loss": 0.68474299, "num_input_tokens_seen": 145741090, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.15155029, "step": 6790, "time_per_iteration": 2.5688679218292236 }, { "auxiliary_loss_clip": 0.06464767, "auxiliary_loss_mlp": 0.0127124, "balance_loss_clip": 0.06293287, "balance_loss_mlp": 0.01256935, "epoch": 0.40829700886817977, "flos": 11430944248320.0, "grad_norm": 2.9844907214072296, "language_loss": 0.70445597, "learning_rate": 2.677428203462683e-06, "loss": 0.78181601, "num_input_tokens_seen": 145754985, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.14318848, "step": 6791, "time_per_iteration": 2.508904457092285 }, { "auxiliary_loss_clip": 0.06374758, "auxiliary_loss_mlp": 0.01270504, "balance_loss_clip": 0.06299826, "balance_loss_mlp": 0.01267092, "epoch": 0.40835713212084773, "flos": 67350455326080.0, "grad_norm": 0.7298036110511159, "language_loss": 0.59376329, "learning_rate": 2.6770617502750093e-06, "loss": 0.67021585, "num_input_tokens_seen": 145815260, "router_z_loss_clip": 0.75, "router_z_loss_mlp": 0.03421021, "step": 6792, "time_per_iteration": 3.2842442989349365 }, { "auxiliary_loss_clip": 0.06472963, "auxiliary_loss_mlp": 0.01273616, "balance_loss_clip": 0.06297784, "balance_loss_mlp": 0.01258417, "epoch": 0.4084172553735157, "flos": 21768408714240.0, "grad_norm": 1.788507898106745, "language_loss": 0.80373478, "learning_rate": 2.6766952714130857e-06, "loss": 0.88120055, "num_input_tokens_seen": 145832665, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.15185547, "step": 6793, "time_per_iteration": 2.7273740768432617 }, { "auxiliary_loss_clip": 0.06474014, "auxiliary_loss_mlp": 0.01273196, "balance_loss_clip": 0.06297588, "balance_loss_mlp": 0.01258259, "epoch": 0.40847737862618366, "flos": 27424594874880.0, "grad_norm": 1.7267611544778243, "language_loss": 0.85336506, "learning_rate": 2.6763287668908094e-06, "loss": 0.93083715, "num_input_tokens_seen": 145850240, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.14929199, "step": 6794, "time_per_iteration": 2.6345176696777344 }, { "auxiliary_loss_clip": 0.06469819, "auxiliary_loss_mlp": 0.01278668, "balance_loss_clip": 0.06295986, "balance_loss_mlp": 0.01262849, "epoch": 0.4085375018788516, "flos": 18593040787200.0, "grad_norm": 3.69384419715464, "language_loss": 0.79729044, "learning_rate": 2.6759622367220788e-06, "loss": 0.87477529, "num_input_tokens_seen": 145869545, "router_z_loss_clip": 1.74023438, "router_z_loss_mlp": 0.15795898, "step": 6795, "time_per_iteration": 2.5642435550689697 }, { "auxiliary_loss_clip": 0.06480312, "auxiliary_loss_mlp": 0.01275513, "balance_loss_clip": 0.0630001, "balance_loss_mlp": 0.01258501, "epoch": 0.4085976251315196, "flos": 15416834319360.0, "grad_norm": 2.761326086694989, "language_loss": 0.69948119, "learning_rate": 2.675595680920792e-06, "loss": 0.77703941, "num_input_tokens_seen": 145884025, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.17004395, "step": 6796, "time_per_iteration": 2.5200107097625732 }, { "auxiliary_loss_clip": 0.0646801, "auxiliary_loss_mlp": 0.01276431, "balance_loss_clip": 0.06293306, "balance_loss_mlp": 0.01261565, "epoch": 0.40865774838418756, "flos": 21258705127680.0, "grad_norm": 1.665160385883943, "language_loss": 0.77735484, "learning_rate": 2.6752290995008498e-06, "loss": 0.85479933, "num_input_tokens_seen": 145903210, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.14868164, "step": 6797, "time_per_iteration": 2.572023391723633 }, { "auxiliary_loss_clip": 0.06464843, "auxiliary_loss_mlp": 0.01274297, "balance_loss_clip": 0.06290533, "balance_loss_mlp": 0.01260046, "epoch": 0.4087178716368556, "flos": 13777411259520.0, "grad_norm": 2.029539405382293, "language_loss": 0.86041254, "learning_rate": 2.6748624924761523e-06, "loss": 0.93780392, "num_input_tokens_seen": 145920985, "router_z_loss_clip": 1.74414062, "router_z_loss_mlp": 0.14263916, "step": 6798, "time_per_iteration": 2.5470314025878906 }, { "auxiliary_loss_clip": 0.06468184, "auxiliary_loss_mlp": 0.01276734, "balance_loss_clip": 0.06295189, "balance_loss_mlp": 0.01263084, "epoch": 0.40877799488952354, "flos": 23628288666240.0, "grad_norm": 1.4110816936929937, "language_loss": 0.8426742, "learning_rate": 2.674495859860601e-06, "loss": 0.9201234, "num_input_tokens_seen": 145940350, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.13647461, "step": 6799, "time_per_iteration": 2.5962893962860107 }, { "auxiliary_loss_clip": 0.0646889, "auxiliary_loss_mlp": 0.01273813, "balance_loss_clip": 0.06294131, "balance_loss_mlp": 0.01258649, "epoch": 0.4088381181421915, "flos": 20924372136960.0, "grad_norm": 1.9265807713221967, "language_loss": 0.83561683, "learning_rate": 2.6741292016681e-06, "loss": 0.91304386, "num_input_tokens_seen": 145957460, "router_z_loss_clip": 1.74804688, "router_z_loss_mlp": 0.15167236, "step": 6800, "time_per_iteration": 2.5866172313690186 }, { "auxiliary_loss_clip": 0.06470625, "auxiliary_loss_mlp": 0.01275747, "balance_loss_clip": 0.06292921, "balance_loss_mlp": 0.01260554, "epoch": 0.4088982413948595, "flos": 13302605698560.0, "grad_norm": 1.7630931126042697, "language_loss": 0.74181175, "learning_rate": 2.6737625179125514e-06, "loss": 0.8192755, "num_input_tokens_seen": 145975285, "router_z_loss_clip": 1.77441406, "router_z_loss_mlp": 0.15179443, "step": 6801, "time_per_iteration": 2.558852195739746 }, { "auxiliary_loss_clip": 0.0646419, "auxiliary_loss_mlp": 0.01273294, "balance_loss_clip": 0.06288411, "balance_loss_mlp": 0.01259055, "epoch": 0.40895836464752744, "flos": 15273007585920.0, "grad_norm": 5.272863307224225, "language_loss": 0.80285919, "learning_rate": 2.673395808607861e-06, "loss": 0.88023412, "num_input_tokens_seen": 145989150, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.14227295, "step": 6802, "time_per_iteration": 2.55448579788208 }, { "auxiliary_loss_clip": 0.06471835, "auxiliary_loss_mlp": 0.01275445, "balance_loss_clip": 0.06291796, "balance_loss_mlp": 0.0125968, "epoch": 0.4090184879001954, "flos": 14506607416320.0, "grad_norm": 2.153677087793437, "language_loss": 0.75946069, "learning_rate": 2.673029073767934e-06, "loss": 0.83693349, "num_input_tokens_seen": 146006980, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.15753174, "step": 6803, "time_per_iteration": 2.58747935295105 }, { "auxiliary_loss_clip": 0.06466343, "auxiliary_loss_mlp": 0.01271438, "balance_loss_clip": 0.06291465, "balance_loss_mlp": 0.01256924, "epoch": 0.40907861115286337, "flos": 13886759237760.0, "grad_norm": 1.8125629544778945, "language_loss": 0.78789043, "learning_rate": 2.6726623134066764e-06, "loss": 0.86526823, "num_input_tokens_seen": 146025125, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.14508057, "step": 6804, "time_per_iteration": 3.9448399543762207 }, { "auxiliary_loss_clip": 0.06471771, "auxiliary_loss_mlp": 0.01269441, "balance_loss_clip": 0.06293911, "balance_loss_mlp": 0.01255511, "epoch": 0.40913873440553133, "flos": 28045071959040.0, "grad_norm": 1.9317647221650915, "language_loss": 0.75326997, "learning_rate": 2.672295527537998e-06, "loss": 0.8306821, "num_input_tokens_seen": 146044990, "router_z_loss_clip": 1.77636719, "router_z_loss_mlp": 0.13934326, "step": 6805, "time_per_iteration": 2.6040232181549072 }, { "auxiliary_loss_clip": 0.06473279, "auxiliary_loss_mlp": 0.01273932, "balance_loss_clip": 0.06294493, "balance_loss_mlp": 0.01258685, "epoch": 0.4091988576581993, "flos": 21624917397120.0, "grad_norm": 1.975528878871602, "language_loss": 0.79552484, "learning_rate": 2.671928716175804e-06, "loss": 0.87299705, "num_input_tokens_seen": 146066045, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.15234375, "step": 6806, "time_per_iteration": 2.6190404891967773 }, { "auxiliary_loss_clip": 0.06470041, "auxiliary_loss_mlp": 0.01275425, "balance_loss_clip": 0.06292309, "balance_loss_mlp": 0.01260667, "epoch": 0.40925898091086726, "flos": 25230381932160.0, "grad_norm": 2.400239163387908, "language_loss": 0.72224605, "learning_rate": 2.671561879334007e-06, "loss": 0.79970068, "num_input_tokens_seen": 146086280, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.14758301, "step": 6807, "time_per_iteration": 2.5821714401245117 }, { "auxiliary_loss_clip": 0.06367353, "auxiliary_loss_mlp": 0.01261953, "balance_loss_clip": 0.06292634, "balance_loss_mlp": 0.01258983, "epoch": 0.40931910416353523, "flos": 68949697553280.0, "grad_norm": 0.8102555960782515, "language_loss": 0.58816952, "learning_rate": 2.6711950170265155e-06, "loss": 0.66446257, "num_input_tokens_seen": 146148840, "router_z_loss_clip": 0.74755859, "router_z_loss_mlp": 0.02967834, "step": 6808, "time_per_iteration": 3.297649383544922 }, { "auxiliary_loss_clip": 0.06470633, "auxiliary_loss_mlp": 0.01274694, "balance_loss_clip": 0.06297665, "balance_loss_mlp": 0.01260448, "epoch": 0.4093792274162032, "flos": 20195092126080.0, "grad_norm": 1.716739875484271, "language_loss": 0.54946417, "learning_rate": 2.670828129267242e-06, "loss": 0.62691742, "num_input_tokens_seen": 146166195, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.1427002, "step": 6809, "time_per_iteration": 4.006935119628906 }, { "auxiliary_loss_clip": 0.06468114, "auxiliary_loss_mlp": 0.0127136, "balance_loss_clip": 0.06295322, "balance_loss_mlp": 0.012569, "epoch": 0.40943935066887116, "flos": 25235832447360.0, "grad_norm": 1.6991298486192217, "language_loss": 0.83617616, "learning_rate": 2.6704612160700983e-06, "loss": 0.91357088, "num_input_tokens_seen": 146185045, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.14465332, "step": 6810, "time_per_iteration": 2.6253201961517334 }, { "auxiliary_loss_clip": 0.06475179, "auxiliary_loss_mlp": 0.01275094, "balance_loss_clip": 0.06298755, "balance_loss_mlp": 0.01259906, "epoch": 0.4094994739215392, "flos": 23261531345280.0, "grad_norm": 2.34881047967935, "language_loss": 0.77101642, "learning_rate": 2.670094277448999e-06, "loss": 0.84851921, "num_input_tokens_seen": 146204655, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.15185547, "step": 6811, "time_per_iteration": 2.565091609954834 }, { "auxiliary_loss_clip": 0.06472032, "auxiliary_loss_mlp": 0.01269747, "balance_loss_clip": 0.06296402, "balance_loss_mlp": 0.01254262, "epoch": 0.40955959717420715, "flos": 17387571623040.0, "grad_norm": 1.6507918739987026, "language_loss": 0.70916033, "learning_rate": 2.669727313417857e-06, "loss": 0.78657818, "num_input_tokens_seen": 146222000, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.15478516, "step": 6812, "time_per_iteration": 2.5530011653900146 }, { "auxiliary_loss_clip": 0.06463772, "auxiliary_loss_mlp": 0.01272899, "balance_loss_clip": 0.06290269, "balance_loss_mlp": 0.01258106, "epoch": 0.4096197204268751, "flos": 25089406237440.0, "grad_norm": 1.6158665852977787, "language_loss": 0.67101169, "learning_rate": 2.6693603239905872e-06, "loss": 0.7483784, "num_input_tokens_seen": 146242630, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.14807129, "step": 6813, "time_per_iteration": 2.5897140502929688 }, { "auxiliary_loss_clip": 0.06461466, "auxiliary_loss_mlp": 0.01271184, "balance_loss_clip": 0.06287896, "balance_loss_mlp": 0.01257141, "epoch": 0.4096798436795431, "flos": 30593841454080.0, "grad_norm": 2.041674839013729, "language_loss": 0.7413795, "learning_rate": 2.6689933091811087e-06, "loss": 0.81870592, "num_input_tokens_seen": 146263070, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.140625, "step": 6814, "time_per_iteration": 2.6550347805023193 }, { "auxiliary_loss_clip": 0.06473995, "auxiliary_loss_mlp": 0.01270015, "balance_loss_clip": 0.06294961, "balance_loss_mlp": 0.01255329, "epoch": 0.40973996693221104, "flos": 24140424021120.0, "grad_norm": 2.043680161978394, "language_loss": 0.66022462, "learning_rate": 2.6686262690033357e-06, "loss": 0.7376647, "num_input_tokens_seen": 146282890, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.14685059, "step": 6815, "time_per_iteration": 4.030013084411621 }, { "auxiliary_loss_clip": 0.064619, "auxiliary_loss_mlp": 0.01275112, "balance_loss_clip": 0.06293654, "balance_loss_mlp": 0.0126126, "epoch": 0.409800090184879, "flos": 23995968382080.0, "grad_norm": 1.6635800328424533, "language_loss": 0.76735568, "learning_rate": 2.668259203471188e-06, "loss": 0.84472579, "num_input_tokens_seen": 146301755, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.1385498, "step": 6816, "time_per_iteration": 2.58840012550354 }, { "auxiliary_loss_clip": 0.06460679, "auxiliary_loss_mlp": 0.0126897, "balance_loss_clip": 0.0628807, "balance_loss_mlp": 0.01255511, "epoch": 0.40986021343754697, "flos": 16149216931200.0, "grad_norm": 2.0482107764421325, "language_loss": 0.82239616, "learning_rate": 2.6678921125985843e-06, "loss": 0.89969271, "num_input_tokens_seen": 146316835, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.13470459, "step": 6817, "time_per_iteration": 2.5757548809051514 }, { "auxiliary_loss_clip": 0.06472079, "auxiliary_loss_mlp": 0.01271776, "balance_loss_clip": 0.0628984, "balance_loss_mlp": 0.01254443, "epoch": 0.40992033669021494, "flos": 24797811628800.0, "grad_norm": 6.450710324028164, "language_loss": 0.80367053, "learning_rate": 2.667524996399444e-06, "loss": 0.88110912, "num_input_tokens_seen": 146336650, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.17333984, "step": 6818, "time_per_iteration": 2.593942880630493 }, { "auxiliary_loss_clip": 0.06463747, "auxiliary_loss_mlp": 0.01265789, "balance_loss_clip": 0.06291964, "balance_loss_mlp": 0.01252294, "epoch": 0.4099804599428829, "flos": 29649429285120.0, "grad_norm": 1.760676648022776, "language_loss": 0.66496801, "learning_rate": 2.66715785488769e-06, "loss": 0.74226338, "num_input_tokens_seen": 146357640, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.13494873, "step": 6819, "time_per_iteration": 4.087445259094238 }, { "auxiliary_loss_clip": 0.0647141, "auxiliary_loss_mlp": 0.01271619, "balance_loss_clip": 0.06289794, "balance_loss_mlp": 0.01256515, "epoch": 0.41004058319555087, "flos": 24833464341120.0, "grad_norm": 2.5931713465349757, "language_loss": 0.85518169, "learning_rate": 2.6667906880772428e-06, "loss": 0.932612, "num_input_tokens_seen": 146379325, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.15087891, "step": 6820, "time_per_iteration": 2.626514196395874 }, { "auxiliary_loss_clip": 0.06464766, "auxiliary_loss_mlp": 0.01271524, "balance_loss_clip": 0.06294166, "balance_loss_mlp": 0.01258149, "epoch": 0.41010070644821883, "flos": 25744278222720.0, "grad_norm": 8.243734726153164, "language_loss": 0.71749616, "learning_rate": 2.6664234959820256e-06, "loss": 0.79485905, "num_input_tokens_seen": 146398635, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.13378906, "step": 6821, "time_per_iteration": 2.6064555644989014 }, { "auxiliary_loss_clip": 0.06459633, "auxiliary_loss_mlp": 0.0127507, "balance_loss_clip": 0.06286796, "balance_loss_mlp": 0.01261671, "epoch": 0.4101608297008868, "flos": 22352604180480.0, "grad_norm": 2.254781382058682, "language_loss": 0.75166535, "learning_rate": 2.6660562786159634e-06, "loss": 0.82901239, "num_input_tokens_seen": 146417585, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.1338501, "step": 6822, "time_per_iteration": 2.592313766479492 }, { "auxiliary_loss_clip": 0.06462801, "auxiliary_loss_mlp": 0.01269493, "balance_loss_clip": 0.06288627, "balance_loss_mlp": 0.01254937, "epoch": 0.41022095295355476, "flos": 21951619666560.0, "grad_norm": 2.1449071218386626, "language_loss": 0.76444709, "learning_rate": 2.6656890359929796e-06, "loss": 0.84177005, "num_input_tokens_seen": 146437035, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.14562988, "step": 6823, "time_per_iteration": 2.5582966804504395 }, { "auxiliary_loss_clip": 0.0646587, "auxiliary_loss_mlp": 0.012777, "balance_loss_clip": 0.06285377, "balance_loss_mlp": 0.01262084, "epoch": 0.4102810762062228, "flos": 27457312694400.0, "grad_norm": 1.7965572187606516, "language_loss": 0.7367816, "learning_rate": 2.665321768127001e-06, "loss": 0.81421733, "num_input_tokens_seen": 146457370, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.15600586, "step": 6824, "time_per_iteration": 2.6186764240264893 }, { "auxiliary_loss_clip": 0.06469078, "auxiliary_loss_mlp": 0.01270445, "balance_loss_clip": 0.06289166, "balance_loss_mlp": 0.0125546, "epoch": 0.41034119945889075, "flos": 24506258947200.0, "grad_norm": 1.8802292101588045, "language_loss": 0.72646904, "learning_rate": 2.6649544750319548e-06, "loss": 0.8038643, "num_input_tokens_seen": 146478105, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.1496582, "step": 6825, "time_per_iteration": 2.5849204063415527 }, { "auxiliary_loss_clip": 0.06466614, "auxiliary_loss_mlp": 0.01270568, "balance_loss_clip": 0.06292199, "balance_loss_mlp": 0.0125612, "epoch": 0.4104013227115587, "flos": 24359497320960.0, "grad_norm": 2.8513183291614537, "language_loss": 0.85709459, "learning_rate": 2.664587156721768e-06, "loss": 0.93446636, "num_input_tokens_seen": 146497835, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.14440918, "step": 6826, "time_per_iteration": 2.5936598777770996 }, { "auxiliary_loss_clip": 0.06460638, "auxiliary_loss_mlp": 0.01276143, "balance_loss_clip": 0.06290636, "balance_loss_mlp": 0.01260979, "epoch": 0.4104614459642267, "flos": 23735582219520.0, "grad_norm": 1.7855280599585543, "language_loss": 0.66917956, "learning_rate": 2.6642198132103696e-06, "loss": 0.74654734, "num_input_tokens_seen": 146517735, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.15148926, "step": 6827, "time_per_iteration": 2.6033272743225098 }, { "auxiliary_loss_clip": 0.06465954, "auxiliary_loss_mlp": 0.01270213, "balance_loss_clip": 0.06293455, "balance_loss_mlp": 0.01255395, "epoch": 0.41052156921689464, "flos": 22134620983680.0, "grad_norm": 1.277176211633425, "language_loss": 0.72060323, "learning_rate": 2.663852444511689e-06, "loss": 0.79796493, "num_input_tokens_seen": 146537640, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.14831543, "step": 6828, "time_per_iteration": 2.571962833404541 }, { "auxiliary_loss_clip": 0.06473257, "auxiliary_loss_mlp": 0.0127397, "balance_loss_clip": 0.06292663, "balance_loss_mlp": 0.01258092, "epoch": 0.4105816924695626, "flos": 20090607684480.0, "grad_norm": 1.6704986343810686, "language_loss": 0.83780622, "learning_rate": 2.6634850506396574e-06, "loss": 0.91527849, "num_input_tokens_seen": 146554695, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.15881348, "step": 6829, "time_per_iteration": 2.5447404384613037 }, { "auxiliary_loss_clip": 0.06468559, "auxiliary_loss_mlp": 0.01274539, "balance_loss_clip": 0.06295028, "balance_loss_mlp": 0.01260055, "epoch": 0.4106418157222306, "flos": 18082540586880.0, "grad_norm": 1.6353058299168497, "language_loss": 0.90526402, "learning_rate": 2.663117631608206e-06, "loss": 0.98269498, "num_input_tokens_seen": 146573740, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.14477539, "step": 6830, "time_per_iteration": 2.593193531036377 }, { "auxiliary_loss_clip": 0.06476296, "auxiliary_loss_mlp": 0.01269535, "balance_loss_clip": 0.06301572, "balance_loss_mlp": 0.01255892, "epoch": 0.41070193897489854, "flos": 21653442512640.0, "grad_norm": 2.051080205783239, "language_loss": 0.6639905, "learning_rate": 2.662750187431268e-06, "loss": 0.74144882, "num_input_tokens_seen": 146592885, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.13653564, "step": 6831, "time_per_iteration": 2.5526175498962402 }, { "auxiliary_loss_clip": 0.06470899, "auxiliary_loss_mlp": 0.01273947, "balance_loss_clip": 0.06297997, "balance_loss_mlp": 0.01259457, "epoch": 0.4107620622275665, "flos": 26654924396160.0, "grad_norm": 1.630524235470784, "language_loss": 0.69983125, "learning_rate": 2.662382718122776e-06, "loss": 0.77727973, "num_input_tokens_seen": 146611995, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.14477539, "step": 6832, "time_per_iteration": 2.6026132106781006 }, { "auxiliary_loss_clip": 0.06466354, "auxiliary_loss_mlp": 0.01272663, "balance_loss_clip": 0.06293875, "balance_loss_mlp": 0.01259043, "epoch": 0.41082218548023447, "flos": 18740305537920.0, "grad_norm": 2.2030739241609014, "language_loss": 0.74118245, "learning_rate": 2.662015223696666e-06, "loss": 0.81857264, "num_input_tokens_seen": 146628045, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.1362915, "step": 6833, "time_per_iteration": 2.5276613235473633 }, { "auxiliary_loss_clip": 0.06473076, "auxiliary_loss_mlp": 0.01272012, "balance_loss_clip": 0.06297726, "balance_loss_mlp": 0.01256902, "epoch": 0.41088230873290243, "flos": 22900476101760.0, "grad_norm": 1.5994033584391714, "language_loss": 0.73120964, "learning_rate": 2.6616477041668713e-06, "loss": 0.80866057, "num_input_tokens_seen": 146648355, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.15106201, "step": 6834, "time_per_iteration": 2.716017246246338 }, { "auxiliary_loss_clip": 0.06472709, "auxiliary_loss_mlp": 0.01272418, "balance_loss_clip": 0.06293835, "balance_loss_mlp": 0.01257743, "epoch": 0.4109424319855704, "flos": 24283370286720.0, "grad_norm": 1.7429379426910638, "language_loss": 0.71586084, "learning_rate": 2.661280159547329e-06, "loss": 0.79331219, "num_input_tokens_seen": 146668370, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.14672852, "step": 6835, "time_per_iteration": 2.6136653423309326 }, { "auxiliary_loss_clip": 0.0647804, "auxiliary_loss_mlp": 0.01269961, "balance_loss_clip": 0.06302932, "balance_loss_mlp": 0.01255054, "epoch": 0.41100255523823837, "flos": 12974100566400.0, "grad_norm": 1.7989236099848709, "language_loss": 0.87342805, "learning_rate": 2.660912589851978e-06, "loss": 0.95090806, "num_input_tokens_seen": 146686665, "router_z_loss_clip": 1.74902344, "router_z_loss_mlp": 0.14886475, "step": 6836, "time_per_iteration": 2.5237936973571777 }, { "auxiliary_loss_clip": 0.06472163, "auxiliary_loss_mlp": 0.01273576, "balance_loss_clip": 0.06301583, "balance_loss_mlp": 0.01259647, "epoch": 0.4110626784909064, "flos": 23151806023680.0, "grad_norm": 1.8919238843900177, "language_loss": 0.69408643, "learning_rate": 2.6605449950947547e-06, "loss": 0.77154386, "num_input_tokens_seen": 146706570, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.1394043, "step": 6837, "time_per_iteration": 2.552974224090576 }, { "auxiliary_loss_clip": 0.064775, "auxiliary_loss_mlp": 0.01274182, "balance_loss_clip": 0.06300782, "balance_loss_mlp": 0.01258529, "epoch": 0.41112280174357435, "flos": 22754007964800.0, "grad_norm": 2.3512972694406056, "language_loss": 0.75388062, "learning_rate": 2.660177375289599e-06, "loss": 0.83139741, "num_input_tokens_seen": 146723425, "router_z_loss_clip": 1.76757812, "router_z_loss_mlp": 0.15661621, "step": 6838, "time_per_iteration": 2.5504648685455322 }, { "auxiliary_loss_clip": 0.0647409, "auxiliary_loss_mlp": 0.01270778, "balance_loss_clip": 0.0630216, "balance_loss_mlp": 0.01257224, "epoch": 0.4111829249962423, "flos": 21108211994880.0, "grad_norm": 2.171789505509234, "language_loss": 0.82248139, "learning_rate": 2.659809730450451e-06, "loss": 0.89993006, "num_input_tokens_seen": 146741640, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.13549805, "step": 6839, "time_per_iteration": 2.5531108379364014 }, { "auxiliary_loss_clip": 0.06472239, "auxiliary_loss_mlp": 0.01269818, "balance_loss_clip": 0.06299032, "balance_loss_mlp": 0.01255954, "epoch": 0.4112430482489103, "flos": 21512005620480.0, "grad_norm": 7.1763875944503015, "language_loss": 0.80998158, "learning_rate": 2.6594420605912523e-06, "loss": 0.88740206, "num_input_tokens_seen": 146759195, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.13867188, "step": 6840, "time_per_iteration": 2.6648054122924805 }, { "auxiliary_loss_clip": 0.0646821, "auxiliary_loss_mlp": 0.01275792, "balance_loss_clip": 0.06297727, "balance_loss_mlp": 0.01262226, "epoch": 0.41130317150157825, "flos": 19575579363840.0, "grad_norm": 2.153599751483624, "language_loss": 0.68086481, "learning_rate": 2.6590743657259442e-06, "loss": 0.75830483, "num_input_tokens_seen": 146774990, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.13574219, "step": 6841, "time_per_iteration": 2.605757713317871 }, { "auxiliary_loss_clip": 0.06397706, "auxiliary_loss_mlp": 0.0126203, "balance_loss_clip": 0.06321472, "balance_loss_mlp": 0.01258776, "epoch": 0.4113632947542462, "flos": 62404541498880.0, "grad_norm": 0.748675566894786, "language_loss": 0.59497267, "learning_rate": 2.65870664586847e-06, "loss": 0.67157, "num_input_tokens_seen": 146839610, "router_z_loss_clip": 0.76318359, "router_z_loss_mlp": 0.03259277, "step": 6842, "time_per_iteration": 3.2739195823669434 }, { "auxiliary_loss_clip": 0.06473601, "auxiliary_loss_mlp": 0.01273518, "balance_loss_clip": 0.06305832, "balance_loss_mlp": 0.01259511, "epoch": 0.4114234180069142, "flos": 13923879396480.0, "grad_norm": 2.6319053950943414, "language_loss": 0.69681621, "learning_rate": 2.6583389010327742e-06, "loss": 0.77428734, "num_input_tokens_seen": 146857360, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.14007568, "step": 6843, "time_per_iteration": 2.532315492630005 }, { "auxiliary_loss_clip": 0.06390728, "auxiliary_loss_mlp": 0.01256431, "balance_loss_clip": 0.06314515, "balance_loss_mlp": 0.01253492, "epoch": 0.41148354125958214, "flos": 64948866727680.0, "grad_norm": 0.6940668990211724, "language_loss": 0.53574884, "learning_rate": 2.6579711312328013e-06, "loss": 0.61222041, "num_input_tokens_seen": 146917055, "router_z_loss_clip": 0.76220703, "router_z_loss_mlp": 0.02935791, "step": 6844, "time_per_iteration": 4.5133373737335205 }, { "auxiliary_loss_clip": 0.06474029, "auxiliary_loss_mlp": 0.01270603, "balance_loss_clip": 0.06303525, "balance_loss_mlp": 0.01256787, "epoch": 0.4115436645122501, "flos": 18733848773760.0, "grad_norm": 1.8422099216202104, "language_loss": 0.66542029, "learning_rate": 2.6576033364824967e-06, "loss": 0.74286664, "num_input_tokens_seen": 146935215, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.13830566, "step": 6845, "time_per_iteration": 2.553161144256592 }, { "auxiliary_loss_clip": 0.06475665, "auxiliary_loss_mlp": 0.01271597, "balance_loss_clip": 0.0630637, "balance_loss_mlp": 0.01257405, "epoch": 0.41160378776491807, "flos": 16258439128320.0, "grad_norm": 3.6185272235713613, "language_loss": 0.70239758, "learning_rate": 2.657235516795808e-06, "loss": 0.77987027, "num_input_tokens_seen": 146951970, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.14178467, "step": 6846, "time_per_iteration": 2.5436882972717285 }, { "auxiliary_loss_clip": 0.06474176, "auxiliary_loss_mlp": 0.01270965, "balance_loss_clip": 0.06302769, "balance_loss_mlp": 0.01257238, "epoch": 0.41166391101758604, "flos": 27978378508800.0, "grad_norm": 1.4722811048227329, "language_loss": 0.65377295, "learning_rate": 2.6568676721866826e-06, "loss": 0.73122442, "num_input_tokens_seen": 146975615, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.13739014, "step": 6847, "time_per_iteration": 2.654531240463257 }, { "auxiliary_loss_clip": 0.06472848, "auxiliary_loss_mlp": 0.01272476, "balance_loss_clip": 0.06303144, "balance_loss_mlp": 0.01258695, "epoch": 0.411724034270254, "flos": 34139865916800.0, "grad_norm": 1.423216000851037, "language_loss": 0.71015942, "learning_rate": 2.656499802669069e-06, "loss": 0.78761268, "num_input_tokens_seen": 146998855, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.13775635, "step": 6848, "time_per_iteration": 4.150612831115723 }, { "auxiliary_loss_clip": 0.06371969, "auxiliary_loss_mlp": 0.0125325, "balance_loss_clip": 0.06296349, "balance_loss_mlp": 0.01250586, "epoch": 0.41178415752292197, "flos": 67945090625280.0, "grad_norm": 0.8745234331047887, "language_loss": 0.5624671, "learning_rate": 2.6561319082569174e-06, "loss": 0.63871932, "num_input_tokens_seen": 147062710, "router_z_loss_clip": 0.7578125, "router_z_loss_mlp": 0.02667236, "step": 6849, "time_per_iteration": 3.2925803661346436 }, { "auxiliary_loss_clip": 0.0646616, "auxiliary_loss_mlp": 0.01271985, "balance_loss_clip": 0.06296811, "balance_loss_mlp": 0.01257841, "epoch": 0.41184428077558993, "flos": 34322573744640.0, "grad_norm": 2.056258970051142, "language_loss": 0.76157987, "learning_rate": 2.6557639889641783e-06, "loss": 0.8389613, "num_input_tokens_seen": 147086075, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.14129639, "step": 6850, "time_per_iteration": 2.7010364532470703 }, { "auxiliary_loss_clip": 0.06471282, "auxiliary_loss_mlp": 0.01270529, "balance_loss_clip": 0.06300929, "balance_loss_mlp": 0.01256379, "epoch": 0.41190440402825795, "flos": 35452796342400.0, "grad_norm": 1.4692454281080904, "language_loss": 0.68331826, "learning_rate": 2.6553960448048025e-06, "loss": 0.76073635, "num_input_tokens_seen": 147107590, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.14135742, "step": 6851, "time_per_iteration": 2.7149484157562256 }, { "auxiliary_loss_clip": 0.06469936, "auxiliary_loss_mlp": 0.01273795, "balance_loss_clip": 0.06295623, "balance_loss_mlp": 0.01257744, "epoch": 0.4119645272809259, "flos": 20856127386240.0, "grad_norm": 2.935215811057708, "language_loss": 0.79745626, "learning_rate": 2.655028075792743e-06, "loss": 0.87489355, "num_input_tokens_seen": 147123715, "router_z_loss_clip": 1.74414062, "router_z_loss_mlp": 0.16046143, "step": 6852, "time_per_iteration": 2.569143295288086 }, { "auxiliary_loss_clip": 0.06474338, "auxiliary_loss_mlp": 0.01273006, "balance_loss_clip": 0.06296848, "balance_loss_mlp": 0.01259065, "epoch": 0.4120246505335939, "flos": 27569218222080.0, "grad_norm": 1.861595522511851, "language_loss": 0.77461702, "learning_rate": 2.6546600819419537e-06, "loss": 0.85209042, "num_input_tokens_seen": 147144290, "router_z_loss_clip": 1.77246094, "router_z_loss_mlp": 0.1395874, "step": 6853, "time_per_iteration": 2.6169278621673584 }, { "auxiliary_loss_clip": 0.06474276, "auxiliary_loss_mlp": 0.01275454, "balance_loss_clip": 0.06294316, "balance_loss_mlp": 0.01259778, "epoch": 0.41208477378626185, "flos": 37824476232960.0, "grad_norm": 1.56435884201084, "language_loss": 0.66253436, "learning_rate": 2.6542920632663883e-06, "loss": 0.74003166, "num_input_tokens_seen": 147166340, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.15673828, "step": 6854, "time_per_iteration": 2.7098307609558105 }, { "auxiliary_loss_clip": 0.06465252, "auxiliary_loss_mlp": 0.01271606, "balance_loss_clip": 0.06294037, "balance_loss_mlp": 0.01259023, "epoch": 0.4121448970389298, "flos": 23447509482240.0, "grad_norm": 3.7032132144337595, "language_loss": 0.8418045, "learning_rate": 2.6539240197800023e-06, "loss": 0.91917312, "num_input_tokens_seen": 147184025, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.12585449, "step": 6855, "time_per_iteration": 3.9661076068878174 }, { "auxiliary_loss_clip": 0.06471927, "auxiliary_loss_mlp": 0.01274023, "balance_loss_clip": 0.06304214, "balance_loss_mlp": 0.01260296, "epoch": 0.4122050202915978, "flos": 21331813415040.0, "grad_norm": 1.7379883816228225, "language_loss": 0.79450142, "learning_rate": 2.6535559514967517e-06, "loss": 0.87196088, "num_input_tokens_seen": 147202730, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.13745117, "step": 6856, "time_per_iteration": 2.547919988632202 }, { "auxiliary_loss_clip": 0.06475314, "auxiliary_loss_mlp": 0.01270829, "balance_loss_clip": 0.06304272, "balance_loss_mlp": 0.01257167, "epoch": 0.41226514354426574, "flos": 17311193026560.0, "grad_norm": 3.4432278840029023, "language_loss": 0.79910213, "learning_rate": 2.6531878584305935e-06, "loss": 0.87656355, "num_input_tokens_seen": 147215315, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.13653564, "step": 6857, "time_per_iteration": 2.509129524230957 }, { "auxiliary_loss_clip": 0.06467369, "auxiliary_loss_mlp": 0.01272546, "balance_loss_clip": 0.06292313, "balance_loss_mlp": 0.01258354, "epoch": 0.4123252667969337, "flos": 17644519768320.0, "grad_norm": 1.9680840025265554, "language_loss": 0.70888776, "learning_rate": 2.6528197405954873e-06, "loss": 0.78628695, "num_input_tokens_seen": 147233330, "router_z_loss_clip": 1.74902344, "router_z_loss_mlp": 0.14190674, "step": 6858, "time_per_iteration": 3.9774394035339355 }, { "auxiliary_loss_clip": 0.06467125, "auxiliary_loss_mlp": 0.01270942, "balance_loss_clip": 0.06297619, "balance_loss_mlp": 0.01256947, "epoch": 0.4123853900496017, "flos": 46435070304000.0, "grad_norm": 1.411425380673024, "language_loss": 0.59869272, "learning_rate": 2.652451598005391e-06, "loss": 0.67607337, "num_input_tokens_seen": 147257780, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.13983154, "step": 6859, "time_per_iteration": 2.757614850997925 }, { "auxiliary_loss_clip": 0.06472534, "auxiliary_loss_mlp": 0.01274136, "balance_loss_clip": 0.06296952, "balance_loss_mlp": 0.01260522, "epoch": 0.41244551330226964, "flos": 17680801386240.0, "grad_norm": 2.5793008732917015, "language_loss": 0.74286145, "learning_rate": 2.652083430674264e-06, "loss": 0.82032818, "num_input_tokens_seen": 147276055, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.13616943, "step": 6860, "time_per_iteration": 2.5342154502868652 }, { "auxiliary_loss_clip": 0.06461652, "auxiliary_loss_mlp": 0.01270819, "balance_loss_clip": 0.06291069, "balance_loss_mlp": 0.012581, "epoch": 0.4125056365549376, "flos": 18699034602240.0, "grad_norm": 1.5545545697883418, "language_loss": 0.74450076, "learning_rate": 2.651715238616068e-06, "loss": 0.8218255, "num_input_tokens_seen": 147293200, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.12719727, "step": 6861, "time_per_iteration": 2.5345840454101562 }, { "auxiliary_loss_clip": 0.06461596, "auxiliary_loss_mlp": 0.01268554, "balance_loss_clip": 0.06293127, "balance_loss_mlp": 0.01255488, "epoch": 0.41256575980760557, "flos": 17901174424320.0, "grad_norm": 1.9078977558075783, "language_loss": 0.80209184, "learning_rate": 2.651347021844765e-06, "loss": 0.87939334, "num_input_tokens_seen": 147310640, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.13061523, "step": 6862, "time_per_iteration": 2.553100824356079 }, { "auxiliary_loss_clip": 0.0646861, "auxiliary_loss_mlp": 0.01269994, "balance_loss_clip": 0.06296743, "balance_loss_mlp": 0.01256595, "epoch": 0.41262588306027354, "flos": 21987817430400.0, "grad_norm": 1.7414255131596115, "language_loss": 0.76704335, "learning_rate": 2.650978780374318e-06, "loss": 0.84442943, "num_input_tokens_seen": 147329435, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.13391113, "step": 6863, "time_per_iteration": 2.5621187686920166 }, { "auxiliary_loss_clip": 0.06364527, "auxiliary_loss_mlp": 0.01257702, "balance_loss_clip": 0.0628899, "balance_loss_mlp": 0.01255056, "epoch": 0.41268600631294156, "flos": 53366339243520.0, "grad_norm": 0.8845040465906225, "language_loss": 0.52539802, "learning_rate": 2.650610514218691e-06, "loss": 0.60162032, "num_input_tokens_seen": 147385805, "router_z_loss_clip": 0.75732422, "router_z_loss_mlp": 0.02648926, "step": 6864, "time_per_iteration": 3.1366355419158936 }, { "auxiliary_loss_clip": 0.06469612, "auxiliary_loss_mlp": 0.01270923, "balance_loss_clip": 0.06292592, "balance_loss_mlp": 0.01256761, "epoch": 0.4127461295656095, "flos": 24391586234880.0, "grad_norm": 1.7915975366500172, "language_loss": 0.73036909, "learning_rate": 2.6502422233918468e-06, "loss": 0.80777442, "num_input_tokens_seen": 147405160, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.14160156, "step": 6865, "time_per_iteration": 2.603036880493164 }, { "auxiliary_loss_clip": 0.06365516, "auxiliary_loss_mlp": 0.0125447, "balance_loss_clip": 0.06290501, "balance_loss_mlp": 0.01251883, "epoch": 0.4128062528182775, "flos": 71725129142400.0, "grad_norm": 0.9075591233132555, "language_loss": 0.66407704, "learning_rate": 2.649873907907753e-06, "loss": 0.74027687, "num_input_tokens_seen": 147460245, "router_z_loss_clip": 0.75, "router_z_loss_mlp": 0.02587891, "step": 6866, "time_per_iteration": 3.0900535583496094 }, { "auxiliary_loss_clip": 0.06464471, "auxiliary_loss_mlp": 0.01268588, "balance_loss_clip": 0.06292921, "balance_loss_mlp": 0.01254974, "epoch": 0.41286637607094545, "flos": 17853362870400.0, "grad_norm": 2.0808030093432435, "language_loss": 0.81517804, "learning_rate": 2.649505567780375e-06, "loss": 0.89250863, "num_input_tokens_seen": 147476200, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.1362915, "step": 6867, "time_per_iteration": 2.5391063690185547 }, { "auxiliary_loss_clip": 0.06466865, "auxiliary_loss_mlp": 0.01272166, "balance_loss_clip": 0.06291988, "balance_loss_mlp": 0.01257611, "epoch": 0.4129264993236134, "flos": 25555407120000.0, "grad_norm": 8.576348760263581, "language_loss": 0.78442436, "learning_rate": 2.6491372030236815e-06, "loss": 0.86181468, "num_input_tokens_seen": 147494315, "router_z_loss_clip": 1.74804688, "router_z_loss_mlp": 0.14562988, "step": 6868, "time_per_iteration": 2.6037309169769287 }, { "auxiliary_loss_clip": 0.06367366, "auxiliary_loss_mlp": 0.01251646, "balance_loss_clip": 0.06292216, "balance_loss_mlp": 0.01249037, "epoch": 0.4129866225762814, "flos": 65430730759680.0, "grad_norm": 0.8442922343761158, "language_loss": 0.57586455, "learning_rate": 2.64876881365164e-06, "loss": 0.65205467, "num_input_tokens_seen": 147543665, "router_z_loss_clip": 0.75, "router_z_loss_mlp": 0.02610779, "step": 6869, "time_per_iteration": 2.981989860534668 }, { "auxiliary_loss_clip": 0.06463926, "auxiliary_loss_mlp": 0.01273235, "balance_loss_clip": 0.06297319, "balance_loss_mlp": 0.01259037, "epoch": 0.41304674582894935, "flos": 28884622343040.0, "grad_norm": 1.783694373356443, "language_loss": 0.75550258, "learning_rate": 2.64840039967822e-06, "loss": 0.83287412, "num_input_tokens_seen": 147564870, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.14208984, "step": 6870, "time_per_iteration": 2.6248040199279785 }, { "auxiliary_loss_clip": 0.06467072, "auxiliary_loss_mlp": 0.01272424, "balance_loss_clip": 0.06293756, "balance_loss_mlp": 0.01258667, "epoch": 0.4131068690816173, "flos": 22898379749760.0, "grad_norm": 1.6383596902682516, "language_loss": 0.83899057, "learning_rate": 2.6480319611173912e-06, "loss": 0.91638553, "num_input_tokens_seen": 147584840, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.13757324, "step": 6871, "time_per_iteration": 2.5722575187683105 }, { "auxiliary_loss_clip": 0.06470239, "auxiliary_loss_mlp": 0.01273014, "balance_loss_clip": 0.06297419, "balance_loss_mlp": 0.01258507, "epoch": 0.4131669923342853, "flos": 26071944814080.0, "grad_norm": 2.2965301986879427, "language_loss": 0.69070792, "learning_rate": 2.6476634979831263e-06, "loss": 0.76814049, "num_input_tokens_seen": 147604635, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.14520264, "step": 6872, "time_per_iteration": 2.609813928604126 }, { "auxiliary_loss_clip": 0.06465907, "auxiliary_loss_mlp": 0.01269335, "balance_loss_clip": 0.06292702, "balance_loss_mlp": 0.01256192, "epoch": 0.41322711558695324, "flos": 19250554176000.0, "grad_norm": 2.05172396582582, "language_loss": 0.76551658, "learning_rate": 2.6472950102893964e-06, "loss": 0.84286904, "num_input_tokens_seen": 147620700, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.13146973, "step": 6873, "time_per_iteration": 2.570024251937866 }, { "auxiliary_loss_clip": 0.06472437, "auxiliary_loss_mlp": 0.0127402, "balance_loss_clip": 0.06296919, "balance_loss_mlp": 0.01260031, "epoch": 0.4132872388396212, "flos": 22681067385600.0, "grad_norm": 2.2299539714018928, "language_loss": 0.83382303, "learning_rate": 2.6469264980501746e-06, "loss": 0.91128755, "num_input_tokens_seen": 147639490, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.13977051, "step": 6874, "time_per_iteration": 2.557194709777832 }, { "auxiliary_loss_clip": 0.06467558, "auxiliary_loss_mlp": 0.01272145, "balance_loss_clip": 0.06293772, "balance_loss_mlp": 0.01257947, "epoch": 0.4133473620922892, "flos": 20155246709760.0, "grad_norm": 1.9040041561177024, "language_loss": 0.71669817, "learning_rate": 2.646557961279436e-06, "loss": 0.79409522, "num_input_tokens_seen": 147657205, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.14190674, "step": 6875, "time_per_iteration": 2.5714588165283203 }, { "auxiliary_loss_clip": 0.06454913, "auxiliary_loss_mlp": 0.01268393, "balance_loss_clip": 0.06291123, "balance_loss_mlp": 0.01255536, "epoch": 0.41340748534495714, "flos": 24249520437120.0, "grad_norm": 1.6457590789852234, "language_loss": 0.82587248, "learning_rate": 2.646189399991154e-06, "loss": 0.9031055, "num_input_tokens_seen": 147677005, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.12860107, "step": 6876, "time_per_iteration": 2.5752179622650146 }, { "auxiliary_loss_clip": 0.06466592, "auxiliary_loss_mlp": 0.01270442, "balance_loss_clip": 0.06291451, "balance_loss_mlp": 0.01256006, "epoch": 0.41346760859762516, "flos": 14397385219200.0, "grad_norm": 2.820622594156227, "language_loss": 0.66105723, "learning_rate": 2.6458208141993048e-06, "loss": 0.73842764, "num_input_tokens_seen": 147693435, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.14440918, "step": 6877, "time_per_iteration": 2.5638301372528076 }, { "auxiliary_loss_clip": 0.06462754, "auxiliary_loss_mlp": 0.01276001, "balance_loss_clip": 0.06293096, "balance_loss_mlp": 0.01262018, "epoch": 0.4135277318502931, "flos": 22498569192960.0, "grad_norm": 1.8758825649616993, "language_loss": 0.77157605, "learning_rate": 2.6454522039178668e-06, "loss": 0.84896356, "num_input_tokens_seen": 147714000, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.13977051, "step": 6878, "time_per_iteration": 2.568833112716675 }, { "auxiliary_loss_clip": 0.06466207, "auxiliary_loss_mlp": 0.0127515, "balance_loss_clip": 0.0629393, "balance_loss_mlp": 0.01261984, "epoch": 0.4135878551029611, "flos": 22425251270400.0, "grad_norm": 2.043574076844615, "language_loss": 0.80572695, "learning_rate": 2.6450835691608154e-06, "loss": 0.88314056, "num_input_tokens_seen": 147731010, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.1317749, "step": 6879, "time_per_iteration": 2.5943620204925537 }, { "auxiliary_loss_clip": 0.06464867, "auxiliary_loss_mlp": 0.01277745, "balance_loss_clip": 0.06293496, "balance_loss_mlp": 0.01264012, "epoch": 0.41364797835562905, "flos": 27060646665600.0, "grad_norm": 1.6293729360168165, "language_loss": 0.84951061, "learning_rate": 2.6447149099421315e-06, "loss": 0.92693675, "num_input_tokens_seen": 147750880, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.13739014, "step": 6880, "time_per_iteration": 2.619915008544922 }, { "auxiliary_loss_clip": 0.06462247, "auxiliary_loss_mlp": 0.0127219, "balance_loss_clip": 0.06289029, "balance_loss_mlp": 0.01258874, "epoch": 0.413708101608297, "flos": 22974464856960.0, "grad_norm": 2.3422545725622315, "language_loss": 0.7075932, "learning_rate": 2.6443462262757927e-06, "loss": 0.78493762, "num_input_tokens_seen": 147771360, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.13323975, "step": 6881, "time_per_iteration": 2.689455509185791 }, { "auxiliary_loss_clip": 0.06452738, "auxiliary_loss_mlp": 0.01270169, "balance_loss_clip": 0.06287655, "balance_loss_mlp": 0.01257187, "epoch": 0.413768224860965, "flos": 13339013097600.0, "grad_norm": 2.3785186977402395, "language_loss": 0.81370002, "learning_rate": 2.6439775181757805e-06, "loss": 0.8909291, "num_input_tokens_seen": 147787440, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.12976074, "step": 6882, "time_per_iteration": 2.519258499145508 }, { "auxiliary_loss_clip": 0.06472169, "auxiliary_loss_mlp": 0.01269805, "balance_loss_clip": 0.06296878, "balance_loss_mlp": 0.01253867, "epoch": 0.41382834811363295, "flos": 20820306965760.0, "grad_norm": 2.360125898679366, "language_loss": 0.70082366, "learning_rate": 2.643608785656077e-06, "loss": 0.77824342, "num_input_tokens_seen": 147805720, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.1595459, "step": 6883, "time_per_iteration": 3.9932913780212402 }, { "auxiliary_loss_clip": 0.064641, "auxiliary_loss_mlp": 0.01271426, "balance_loss_clip": 0.0629179, "balance_loss_mlp": 0.01258003, "epoch": 0.4138884713663009, "flos": 20673293777280.0, "grad_norm": 4.354366209816431, "language_loss": 0.75491524, "learning_rate": 2.643240028730663e-06, "loss": 0.83227038, "num_input_tokens_seen": 147824605, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.13427734, "step": 6884, "time_per_iteration": 2.5749528408050537 }, { "auxiliary_loss_clip": 0.06463444, "auxiliary_loss_mlp": 0.01269828, "balance_loss_clip": 0.06288526, "balance_loss_mlp": 0.01255797, "epoch": 0.4139485946189689, "flos": 29063808299520.0, "grad_norm": 1.4367205881872758, "language_loss": 0.76299953, "learning_rate": 2.642871247413523e-06, "loss": 0.84033227, "num_input_tokens_seen": 147845445, "router_z_loss_clip": 1.74902344, "router_z_loss_mlp": 0.14050293, "step": 6885, "time_per_iteration": 2.6340386867523193 }, { "auxiliary_loss_clip": 0.06466223, "auxiliary_loss_mlp": 0.0127454, "balance_loss_clip": 0.06291914, "balance_loss_mlp": 0.01260008, "epoch": 0.41400871787163684, "flos": 24432605608320.0, "grad_norm": 1.7979209622207268, "language_loss": 0.69923747, "learning_rate": 2.6425024417186414e-06, "loss": 0.77664518, "num_input_tokens_seen": 147865580, "router_z_loss_clip": 1.74414062, "router_z_loss_mlp": 0.14526367, "step": 6886, "time_per_iteration": 2.5982353687286377 }, { "auxiliary_loss_clip": 0.06469404, "auxiliary_loss_mlp": 0.01273371, "balance_loss_clip": 0.06292561, "balance_loss_mlp": 0.01258911, "epoch": 0.4140688411243048, "flos": 19470172527360.0, "grad_norm": 1.6348470993975464, "language_loss": 0.75318599, "learning_rate": 2.642133611660002e-06, "loss": 0.83061373, "num_input_tokens_seen": 147885230, "router_z_loss_clip": 1.76757812, "router_z_loss_mlp": 0.14453125, "step": 6887, "time_per_iteration": 4.1822450160980225 }, { "auxiliary_loss_clip": 0.06459373, "auxiliary_loss_mlp": 0.01273132, "balance_loss_clip": 0.0628933, "balance_loss_mlp": 0.01259297, "epoch": 0.4141289643769728, "flos": 19319008561920.0, "grad_norm": 1.9602441081723128, "language_loss": 0.70256722, "learning_rate": 2.641764757251592e-06, "loss": 0.77989221, "num_input_tokens_seen": 147903035, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.13842773, "step": 6888, "time_per_iteration": 2.5570363998413086 }, { "auxiliary_loss_clip": 0.06457132, "auxiliary_loss_mlp": 0.01274719, "balance_loss_clip": 0.06287366, "balance_loss_mlp": 0.01260897, "epoch": 0.41418908762964074, "flos": 16732448075520.0, "grad_norm": 1.819588914544575, "language_loss": 0.76762897, "learning_rate": 2.6413958785073976e-06, "loss": 0.84494752, "num_input_tokens_seen": 147918745, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.1383667, "step": 6889, "time_per_iteration": 2.5473666191101074 }, { "auxiliary_loss_clip": 0.06464055, "auxiliary_loss_mlp": 0.01279931, "balance_loss_clip": 0.062951, "balance_loss_mlp": 0.01265983, "epoch": 0.41424921088230876, "flos": 25303112876160.0, "grad_norm": 1.5526701314833522, "language_loss": 0.80378866, "learning_rate": 2.6410269754414074e-06, "loss": 0.88122857, "num_input_tokens_seen": 147938265, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.13934326, "step": 6890, "time_per_iteration": 2.5919058322906494 }, { "auxiliary_loss_clip": 0.0645729, "auxiliary_loss_mlp": 0.0127592, "balance_loss_clip": 0.06288867, "balance_loss_mlp": 0.01260268, "epoch": 0.4143093341349767, "flos": 20966984737920.0, "grad_norm": 3.122098806452869, "language_loss": 0.74871391, "learning_rate": 2.6406580480676113e-06, "loss": 0.82604605, "num_input_tokens_seen": 147957320, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.15667725, "step": 6891, "time_per_iteration": 2.567795991897583 }, { "auxiliary_loss_clip": 0.06470463, "auxiliary_loss_mlp": 0.01271054, "balance_loss_clip": 0.06294259, "balance_loss_mlp": 0.01255676, "epoch": 0.4143694573876447, "flos": 22024182902400.0, "grad_norm": 1.6360770999156236, "language_loss": 0.84977984, "learning_rate": 2.6402890963999963e-06, "loss": 0.92719501, "num_input_tokens_seen": 147977045, "router_z_loss_clip": 1.76171875, "router_z_loss_mlp": 0.1539917, "step": 6892, "time_per_iteration": 2.602025032043457 }, { "auxiliary_loss_clip": 0.06458105, "auxiliary_loss_mlp": 0.0127354, "balance_loss_clip": 0.06290007, "balance_loss_mlp": 0.01260659, "epoch": 0.41442958064031266, "flos": 35705761418880.0, "grad_norm": 1.7783633631317637, "language_loss": 0.70831144, "learning_rate": 2.6399201204525554e-06, "loss": 0.78562784, "num_input_tokens_seen": 147996905, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.12884521, "step": 6893, "time_per_iteration": 2.6902453899383545 }, { "auxiliary_loss_clip": 0.0645878, "auxiliary_loss_mlp": 0.01267219, "balance_loss_clip": 0.06287812, "balance_loss_mlp": 0.01254863, "epoch": 0.4144897038929806, "flos": 28301391198720.0, "grad_norm": 1.329059198959299, "language_loss": 0.73331487, "learning_rate": 2.639551120239279e-06, "loss": 0.81057489, "num_input_tokens_seen": 148017875, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.12353516, "step": 6894, "time_per_iteration": 2.626323938369751 }, { "auxiliary_loss_clip": 0.06465897, "auxiliary_loss_mlp": 0.01273305, "balance_loss_clip": 0.06293188, "balance_loss_mlp": 0.01259864, "epoch": 0.4145498271456486, "flos": 11651568848640.0, "grad_norm": 3.214759559927943, "language_loss": 0.6288507, "learning_rate": 2.63918209577416e-06, "loss": 0.70624268, "num_input_tokens_seen": 148032300, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.13433838, "step": 6895, "time_per_iteration": 3.949155330657959 }, { "auxiliary_loss_clip": 0.06456646, "auxiliary_loss_mlp": 0.01272926, "balance_loss_clip": 0.06288862, "balance_loss_mlp": 0.0125942, "epoch": 0.41460995039831655, "flos": 27243061004160.0, "grad_norm": 1.6290512563195925, "language_loss": 0.70747954, "learning_rate": 2.638813047071192e-06, "loss": 0.78477526, "num_input_tokens_seen": 148053260, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.13513184, "step": 6896, "time_per_iteration": 2.614079236984253 }, { "auxiliary_loss_clip": 0.06466591, "auxiliary_loss_mlp": 0.01274861, "balance_loss_clip": 0.06294622, "balance_loss_mlp": 0.01260073, "epoch": 0.4146700736509845, "flos": 25929627454080.0, "grad_norm": 1.8129924133142528, "language_loss": 0.73374736, "learning_rate": 2.6384439741443696e-06, "loss": 0.81116188, "num_input_tokens_seen": 148072965, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.14794922, "step": 6897, "time_per_iteration": 4.047922134399414 }, { "auxiliary_loss_clip": 0.06461816, "auxiliary_loss_mlp": 0.01269177, "balance_loss_clip": 0.06292407, "balance_loss_mlp": 0.01255718, "epoch": 0.4147301969036525, "flos": 26840441335680.0, "grad_norm": 1.765915171170989, "language_loss": 0.84929413, "learning_rate": 2.6380748770076873e-06, "loss": 0.92660409, "num_input_tokens_seen": 148093240, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.13452148, "step": 6898, "time_per_iteration": 2.5977582931518555 }, { "auxiliary_loss_clip": 0.06467131, "auxiliary_loss_mlp": 0.01273291, "balance_loss_clip": 0.06293947, "balance_loss_mlp": 0.01260256, "epoch": 0.41479032015632045, "flos": 20303727344640.0, "grad_norm": 1.6163084385929147, "language_loss": 0.74880922, "learning_rate": 2.6377057556751416e-06, "loss": 0.82621348, "num_input_tokens_seen": 148110925, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.13037109, "step": 6899, "time_per_iteration": 2.5499014854431152 }, { "auxiliary_loss_clip": 0.06470602, "auxiliary_loss_mlp": 0.01271362, "balance_loss_clip": 0.06293844, "balance_loss_mlp": 0.01256545, "epoch": 0.4148504434089884, "flos": 25272030211200.0, "grad_norm": 5.759684647527488, "language_loss": 0.75904381, "learning_rate": 2.6373366101607306e-06, "loss": 0.83646339, "num_input_tokens_seen": 148130670, "router_z_loss_clip": 1.76855469, "router_z_loss_mlp": 0.14819336, "step": 6900, "time_per_iteration": 2.59039306640625 }, { "auxiliary_loss_clip": 0.06463926, "auxiliary_loss_mlp": 0.01271553, "balance_loss_clip": 0.06294886, "balance_loss_mlp": 0.0125649, "epoch": 0.4149105666616564, "flos": 12827087377920.0, "grad_norm": 1.9538586201125543, "language_loss": 0.80337572, "learning_rate": 2.6369674404784503e-06, "loss": 0.88073051, "num_input_tokens_seen": 148148350, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.1506958, "step": 6901, "time_per_iteration": 2.5211997032165527 }, { "auxiliary_loss_clip": 0.06456812, "auxiliary_loss_mlp": 0.01270777, "balance_loss_clip": 0.06287298, "balance_loss_mlp": 0.01256919, "epoch": 0.41497068991432434, "flos": 16769526307200.0, "grad_norm": 1.656018242546646, "language_loss": 0.70755887, "learning_rate": 2.6365982466423014e-06, "loss": 0.7848348, "num_input_tokens_seen": 148167550, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.1385498, "step": 6902, "time_per_iteration": 2.5309839248657227 }, { "auxiliary_loss_clip": 0.06454267, "auxiliary_loss_mlp": 0.01274639, "balance_loss_clip": 0.06288701, "balance_loss_mlp": 0.01261448, "epoch": 0.4150308131669923, "flos": 18006161990400.0, "grad_norm": 1.9756334315318904, "language_loss": 0.84282988, "learning_rate": 2.6362290286662834e-06, "loss": 0.92011893, "num_input_tokens_seen": 148184740, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.13195801, "step": 6903, "time_per_iteration": 2.5198800563812256 }, { "auxiliary_loss_clip": 0.06473652, "auxiliary_loss_mlp": 0.0127363, "balance_loss_clip": 0.06296898, "balance_loss_mlp": 0.01256798, "epoch": 0.41509093641966033, "flos": 30052635932160.0, "grad_norm": 3.1830621955516993, "language_loss": 0.68205392, "learning_rate": 2.6358597865643968e-06, "loss": 0.75952673, "num_input_tokens_seen": 148204605, "router_z_loss_clip": 1.76855469, "router_z_loss_mlp": 0.16821289, "step": 6904, "time_per_iteration": 2.6079859733581543 }, { "auxiliary_loss_clip": 0.06470566, "auxiliary_loss_mlp": 0.0127021, "balance_loss_clip": 0.06295757, "balance_loss_mlp": 0.01256787, "epoch": 0.4151510596723283, "flos": 24286892158080.0, "grad_norm": 1.687502702839731, "language_loss": 0.77818257, "learning_rate": 2.635490520350643e-06, "loss": 0.85559034, "num_input_tokens_seen": 148224675, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.13415527, "step": 6905, "time_per_iteration": 2.5692033767700195 }, { "auxiliary_loss_clip": 0.06467679, "auxiliary_loss_mlp": 0.01271592, "balance_loss_clip": 0.06296115, "balance_loss_mlp": 0.01258395, "epoch": 0.41521118292499626, "flos": 23482784851200.0, "grad_norm": 1.5106842482535725, "language_loss": 0.68958378, "learning_rate": 2.635121230039025e-06, "loss": 0.76697648, "num_input_tokens_seen": 148243375, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.13189697, "step": 6906, "time_per_iteration": 2.5658764839172363 }, { "auxiliary_loss_clip": 0.06466831, "auxiliary_loss_mlp": 0.01270001, "balance_loss_clip": 0.06297401, "balance_loss_mlp": 0.01257079, "epoch": 0.4152713061776642, "flos": 22131728017920.0, "grad_norm": 2.4754399808504433, "language_loss": 0.68688762, "learning_rate": 2.6347519156435467e-06, "loss": 0.76425594, "num_input_tokens_seen": 148261140, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.12927246, "step": 6907, "time_per_iteration": 2.578925371170044 }, { "auxiliary_loss_clip": 0.0646855, "auxiliary_loss_mlp": 0.01272189, "balance_loss_clip": 0.06297405, "balance_loss_mlp": 0.01259451, "epoch": 0.4153314294303322, "flos": 21257740805760.0, "grad_norm": 1.6708362975922342, "language_loss": 0.76702791, "learning_rate": 2.6343825771782123e-06, "loss": 0.84443527, "num_input_tokens_seen": 148279655, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.12738037, "step": 6908, "time_per_iteration": 2.5517468452453613 }, { "auxiliary_loss_clip": 0.063566, "auxiliary_loss_mlp": 0.01262669, "balance_loss_clip": 0.06282297, "balance_loss_mlp": 0.01259765, "epoch": 0.41539155268300015, "flos": 57939443527680.0, "grad_norm": 0.761449667589949, "language_loss": 0.64775705, "learning_rate": 2.634013214657026e-06, "loss": 0.72394973, "num_input_tokens_seen": 148339005, "router_z_loss_clip": 0.7421875, "router_z_loss_mlp": 0.02900696, "step": 6909, "time_per_iteration": 3.158278465270996 }, { "auxiliary_loss_clip": 0.06460127, "auxiliary_loss_mlp": 0.01271785, "balance_loss_clip": 0.06292672, "balance_loss_mlp": 0.01259012, "epoch": 0.4154516759356681, "flos": 21909384408960.0, "grad_norm": 1.7142836599653963, "language_loss": 0.86870182, "learning_rate": 2.633643828093996e-06, "loss": 0.94602096, "num_input_tokens_seen": 148358715, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.12774658, "step": 6910, "time_per_iteration": 2.568721294403076 }, { "auxiliary_loss_clip": 0.06347616, "auxiliary_loss_mlp": 0.01256311, "balance_loss_clip": 0.06273381, "balance_loss_mlp": 0.0125397, "epoch": 0.4155117991883361, "flos": 67852234702080.0, "grad_norm": 0.7841411758686749, "language_loss": 0.62074286, "learning_rate": 2.633274417503128e-06, "loss": 0.69678211, "num_input_tokens_seen": 148417280, "router_z_loss_clip": 0.7421875, "router_z_loss_mlp": 0.02336121, "step": 6911, "time_per_iteration": 3.2027246952056885 }, { "auxiliary_loss_clip": 0.06476508, "auxiliary_loss_mlp": 0.01269677, "balance_loss_clip": 0.06298462, "balance_loss_mlp": 0.01255348, "epoch": 0.41557192244100405, "flos": 14287869532800.0, "grad_norm": 2.263350909076231, "language_loss": 0.88477528, "learning_rate": 2.6329049828984312e-06, "loss": 0.96223712, "num_input_tokens_seen": 148432610, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.14318848, "step": 6912, "time_per_iteration": 2.5400919914245605 }, { "auxiliary_loss_clip": 0.06466568, "auxiliary_loss_mlp": 0.01268389, "balance_loss_clip": 0.06295894, "balance_loss_mlp": 0.01254996, "epoch": 0.415632045693672, "flos": 24468803372160.0, "grad_norm": 2.4160444377342336, "language_loss": 0.63664287, "learning_rate": 2.632535524293914e-06, "loss": 0.71399248, "num_input_tokens_seen": 148451510, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.13397217, "step": 6913, "time_per_iteration": 2.612769365310669 }, { "auxiliary_loss_clip": 0.06461514, "auxiliary_loss_mlp": 0.01267538, "balance_loss_clip": 0.06293219, "balance_loss_mlp": 0.01255242, "epoch": 0.41569216894634, "flos": 20120600246400.0, "grad_norm": 1.7561770752021313, "language_loss": 0.7556327, "learning_rate": 2.632166041703586e-06, "loss": 0.83292323, "num_input_tokens_seen": 148469945, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.12298584, "step": 6914, "time_per_iteration": 2.553765058517456 }, { "auxiliary_loss_clip": 0.06469025, "auxiliary_loss_mlp": 0.01274717, "balance_loss_clip": 0.06296855, "balance_loss_mlp": 0.01259613, "epoch": 0.41575229219900794, "flos": 23804497802880.0, "grad_norm": 1.8593074287783122, "language_loss": 0.8755151, "learning_rate": 2.631796535141458e-06, "loss": 0.95295244, "num_input_tokens_seen": 148486655, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.15087891, "step": 6915, "time_per_iteration": 2.5833823680877686 }, { "auxiliary_loss_clip": 0.06462457, "auxiliary_loss_mlp": 0.01270218, "balance_loss_clip": 0.06293114, "balance_loss_mlp": 0.0125745, "epoch": 0.4158124154516759, "flos": 23114224667520.0, "grad_norm": 2.0720060179900814, "language_loss": 0.71427155, "learning_rate": 2.6314270046215426e-06, "loss": 0.79159832, "num_input_tokens_seen": 148505035, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.12780762, "step": 6916, "time_per_iteration": 2.5713300704956055 }, { "auxiliary_loss_clip": 0.0646082, "auxiliary_loss_mlp": 0.01270821, "balance_loss_clip": 0.06286612, "balance_loss_mlp": 0.01256588, "epoch": 0.41587253870434393, "flos": 24249771999360.0, "grad_norm": 1.50113926339311, "language_loss": 0.72400224, "learning_rate": 2.631057450157852e-06, "loss": 0.80131865, "num_input_tokens_seen": 148525575, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.14233398, "step": 6917, "time_per_iteration": 2.577477216720581 }, { "auxiliary_loss_clip": 0.06454207, "auxiliary_loss_mlp": 0.01266618, "balance_loss_clip": 0.06285027, "balance_loss_mlp": 0.0125332, "epoch": 0.4159326619570119, "flos": 23888926391040.0, "grad_norm": 1.381829127468376, "language_loss": 0.81196624, "learning_rate": 2.6306878717643988e-06, "loss": 0.88917446, "num_input_tokens_seen": 148547270, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.13287354, "step": 6918, "time_per_iteration": 2.599499225616455 }, { "auxiliary_loss_clip": 0.06467676, "auxiliary_loss_mlp": 0.0127564, "balance_loss_clip": 0.0629277, "balance_loss_mlp": 0.01260834, "epoch": 0.41599278520967986, "flos": 40636315221120.0, "grad_norm": 3.379685701880234, "language_loss": 0.70717609, "learning_rate": 2.6303182694551995e-06, "loss": 0.78460932, "num_input_tokens_seen": 148572100, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.14794922, "step": 6919, "time_per_iteration": 2.7479591369628906 }, { "auxiliary_loss_clip": 0.06461734, "auxiliary_loss_mlp": 0.01275773, "balance_loss_clip": 0.06290196, "balance_loss_mlp": 0.0126157, "epoch": 0.4160529084623478, "flos": 18228757161600.0, "grad_norm": 2.011883614907053, "language_loss": 0.82359648, "learning_rate": 2.6299486432442677e-06, "loss": 0.90097153, "num_input_tokens_seen": 148591245, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.14196777, "step": 6920, "time_per_iteration": 2.534618854522705 }, { "auxiliary_loss_clip": 0.06460294, "auxiliary_loss_mlp": 0.01270159, "balance_loss_clip": 0.0628621, "balance_loss_mlp": 0.01256092, "epoch": 0.4161130317150158, "flos": 13666973178240.0, "grad_norm": 2.7690009686244346, "language_loss": 0.65129685, "learning_rate": 2.6295789931456195e-06, "loss": 0.72860134, "num_input_tokens_seen": 148607980, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.14074707, "step": 6921, "time_per_iteration": 2.5184271335601807 }, { "auxiliary_loss_clip": 0.06462266, "auxiliary_loss_mlp": 0.01276688, "balance_loss_clip": 0.06292212, "balance_loss_mlp": 0.01262693, "epoch": 0.41617315496768376, "flos": 16183779540480.0, "grad_norm": 2.234622329589186, "language_loss": 0.80740607, "learning_rate": 2.629209319173274e-06, "loss": 0.88479561, "num_input_tokens_seen": 148624490, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.14001465, "step": 6922, "time_per_iteration": 2.529602289199829 }, { "auxiliary_loss_clip": 0.06467633, "auxiliary_loss_mlp": 0.01271034, "balance_loss_clip": 0.06292431, "balance_loss_mlp": 0.01256854, "epoch": 0.4162332782203517, "flos": 26220467376000.0, "grad_norm": 1.4893294027645783, "language_loss": 0.67736799, "learning_rate": 2.628839621341247e-06, "loss": 0.75475466, "num_input_tokens_seen": 148646490, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.14178467, "step": 6923, "time_per_iteration": 4.014873743057251 }, { "auxiliary_loss_clip": 0.06461991, "auxiliary_loss_mlp": 0.01273261, "balance_loss_clip": 0.06290476, "balance_loss_mlp": 0.0125861, "epoch": 0.4162934014730197, "flos": 28191540096000.0, "grad_norm": 1.8910086727938937, "language_loss": 0.75985301, "learning_rate": 2.6284698996635593e-06, "loss": 0.83720559, "num_input_tokens_seen": 148668580, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.14672852, "step": 6924, "time_per_iteration": 2.617839813232422 }, { "auxiliary_loss_clip": 0.06462647, "auxiliary_loss_mlp": 0.01271958, "balance_loss_clip": 0.06290521, "balance_loss_mlp": 0.01258422, "epoch": 0.41635352472568765, "flos": 19871492457600.0, "grad_norm": 1.6019013332989767, "language_loss": 0.73272848, "learning_rate": 2.62810015415423e-06, "loss": 0.81007457, "num_input_tokens_seen": 148688410, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.13525391, "step": 6925, "time_per_iteration": 2.544116735458374 }, { "auxiliary_loss_clip": 0.06465028, "auxiliary_loss_mlp": 0.01272213, "balance_loss_clip": 0.06293403, "balance_loss_mlp": 0.01258533, "epoch": 0.4164136479783556, "flos": 14939974333440.0, "grad_norm": 2.0206546056694816, "language_loss": 0.84683144, "learning_rate": 2.6277303848272792e-06, "loss": 0.92420387, "num_input_tokens_seen": 148704855, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.13677979, "step": 6926, "time_per_iteration": 2.54872465133667 }, { "auxiliary_loss_clip": 0.06463553, "auxiliary_loss_mlp": 0.01270685, "balance_loss_clip": 0.06295423, "balance_loss_mlp": 0.01257328, "epoch": 0.4164737712310236, "flos": 21763251688320.0, "grad_norm": 1.549057077473634, "language_loss": 0.8670525, "learning_rate": 2.6273605916967302e-06, "loss": 0.94439489, "num_input_tokens_seen": 148723065, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.13354492, "step": 6927, "time_per_iteration": 4.008452892303467 }, { "auxiliary_loss_clip": 0.06468598, "auxiliary_loss_mlp": 0.01276586, "balance_loss_clip": 0.06298556, "balance_loss_mlp": 0.0126274, "epoch": 0.41653389448369155, "flos": 20746318210560.0, "grad_norm": 2.545756938005702, "language_loss": 0.72995162, "learning_rate": 2.626990774776604e-06, "loss": 0.8074035, "num_input_tokens_seen": 148741780, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.13848877, "step": 6928, "time_per_iteration": 2.6771678924560547 }, { "auxiliary_loss_clip": 0.06463817, "auxiliary_loss_mlp": 0.01274466, "balance_loss_clip": 0.06292395, "balance_loss_mlp": 0.01260822, "epoch": 0.4165940177363595, "flos": 24979848624000.0, "grad_norm": 1.8252718681018858, "language_loss": 0.78988153, "learning_rate": 2.6266209340809254e-06, "loss": 0.86726433, "num_input_tokens_seen": 148759795, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.13653564, "step": 6929, "time_per_iteration": 2.6146833896636963 }, { "auxiliary_loss_clip": 0.0646429, "auxiliary_loss_mlp": 0.0126746, "balance_loss_clip": 0.06294069, "balance_loss_mlp": 0.01254711, "epoch": 0.41665414098902753, "flos": 20527957670400.0, "grad_norm": 2.02143975487831, "language_loss": 0.71174008, "learning_rate": 2.6262510696237182e-06, "loss": 0.78905761, "num_input_tokens_seen": 148778680, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.12744141, "step": 6930, "time_per_iteration": 2.531770706176758 }, { "auxiliary_loss_clip": 0.06467664, "auxiliary_loss_mlp": 0.01274499, "balance_loss_clip": 0.06294177, "balance_loss_mlp": 0.01260814, "epoch": 0.4167142642416955, "flos": 19689078119040.0, "grad_norm": 1.7397337756522062, "language_loss": 0.81194937, "learning_rate": 2.625881181419007e-06, "loss": 0.88937092, "num_input_tokens_seen": 148796470, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.13684082, "step": 6931, "time_per_iteration": 2.5404486656188965 }, { "auxiliary_loss_clip": 0.06463474, "auxiliary_loss_mlp": 0.01275648, "balance_loss_clip": 0.06293543, "balance_loss_mlp": 0.01260664, "epoch": 0.41677438749436346, "flos": 23769641704320.0, "grad_norm": 1.9649291811524259, "language_loss": 0.79354823, "learning_rate": 2.6255112694808193e-06, "loss": 0.87093949, "num_input_tokens_seen": 148815300, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.14978027, "step": 6932, "time_per_iteration": 2.779885768890381 }, { "auxiliary_loss_clip": 0.06472474, "auxiliary_loss_mlp": 0.01271131, "balance_loss_clip": 0.0630033, "balance_loss_mlp": 0.01256927, "epoch": 0.41683451074703143, "flos": 30418051587840.0, "grad_norm": 2.2193778700673286, "language_loss": 0.82691264, "learning_rate": 2.6251413338231813e-06, "loss": 0.90434873, "num_input_tokens_seen": 148834315, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.14196777, "step": 6933, "time_per_iteration": 2.6374645233154297 }, { "auxiliary_loss_clip": 0.06466988, "auxiliary_loss_mlp": 0.01273051, "balance_loss_clip": 0.06290132, "balance_loss_mlp": 0.01259074, "epoch": 0.4168946339996994, "flos": 21513137650560.0, "grad_norm": 1.6523161221325027, "language_loss": 0.76996565, "learning_rate": 2.624771374460121e-06, "loss": 0.84736603, "num_input_tokens_seen": 148852420, "router_z_loss_clip": 1.76855469, "router_z_loss_mlp": 0.13970947, "step": 6934, "time_per_iteration": 2.563096284866333 }, { "auxiliary_loss_clip": 0.06479777, "auxiliary_loss_mlp": 0.01267477, "balance_loss_clip": 0.06305108, "balance_loss_mlp": 0.01253905, "epoch": 0.41695475725236736, "flos": 17644310133120.0, "grad_norm": 1.7338715471539792, "language_loss": 0.67630148, "learning_rate": 2.624401391405668e-06, "loss": 0.75377405, "num_input_tokens_seen": 148869305, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.13598633, "step": 6935, "time_per_iteration": 3.871863603591919 }, { "auxiliary_loss_clip": 0.06481415, "auxiliary_loss_mlp": 0.01273446, "balance_loss_clip": 0.06308826, "balance_loss_mlp": 0.0125858, "epoch": 0.4170148805050353, "flos": 15674285589120.0, "grad_norm": 2.1989427964675228, "language_loss": 0.73259735, "learning_rate": 2.6240313846738513e-06, "loss": 0.81014597, "num_input_tokens_seen": 148886395, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.14868164, "step": 6936, "time_per_iteration": 2.521930456161499 }, { "auxiliary_loss_clip": 0.0646701, "auxiliary_loss_mlp": 0.01273879, "balance_loss_clip": 0.06298621, "balance_loss_mlp": 0.01260754, "epoch": 0.4170750037577033, "flos": 15164623929600.0, "grad_norm": 2.2863313913314025, "language_loss": 0.74427354, "learning_rate": 2.6236613542787024e-06, "loss": 0.82168245, "num_input_tokens_seen": 148905235, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.13116455, "step": 6937, "time_per_iteration": 3.991180658340454 }, { "auxiliary_loss_clip": 0.06476066, "auxiliary_loss_mlp": 0.01277214, "balance_loss_clip": 0.06306341, "balance_loss_mlp": 0.01263422, "epoch": 0.41713512701037125, "flos": 28776029051520.0, "grad_norm": 1.5541358319225236, "language_loss": 0.84792328, "learning_rate": 2.6232913002342518e-06, "loss": 0.92545605, "num_input_tokens_seen": 148928130, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.13818359, "step": 6938, "time_per_iteration": 2.6540205478668213 }, { "auxiliary_loss_clip": 0.06478956, "auxiliary_loss_mlp": 0.01274443, "balance_loss_clip": 0.06302811, "balance_loss_mlp": 0.01258684, "epoch": 0.4171952502630392, "flos": 28264564529280.0, "grad_norm": 1.705278887540132, "language_loss": 0.75090158, "learning_rate": 2.6229212225545334e-06, "loss": 0.82843554, "num_input_tokens_seen": 148948790, "router_z_loss_clip": 1.76074219, "router_z_loss_mlp": 0.15759277, "step": 6939, "time_per_iteration": 2.6282784938812256 }, { "auxiliary_loss_clip": 0.06474929, "auxiliary_loss_mlp": 0.01269714, "balance_loss_clip": 0.06304862, "balance_loss_mlp": 0.01255916, "epoch": 0.4172553735157072, "flos": 24578612547840.0, "grad_norm": 1.5970570888937254, "language_loss": 0.75507712, "learning_rate": 2.622551121253579e-06, "loss": 0.83252352, "num_input_tokens_seen": 148967690, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.13800049, "step": 6940, "time_per_iteration": 2.5948486328125 }, { "auxiliary_loss_clip": 0.06477873, "auxiliary_loss_mlp": 0.01272806, "balance_loss_clip": 0.06303281, "balance_loss_mlp": 0.01258942, "epoch": 0.41731549676837515, "flos": 27051967768320.0, "grad_norm": 1.6200733264849188, "language_loss": 0.72076368, "learning_rate": 2.622180996345424e-06, "loss": 0.79827046, "num_input_tokens_seen": 148987150, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.13879395, "step": 6941, "time_per_iteration": 2.620478630065918 }, { "auxiliary_loss_clip": 0.06473029, "auxiliary_loss_mlp": 0.0127249, "balance_loss_clip": 0.06297019, "balance_loss_mlp": 0.01257834, "epoch": 0.4173756200210431, "flos": 28400173562880.0, "grad_norm": 2.073927259588408, "language_loss": 0.7355485, "learning_rate": 2.621810847844104e-06, "loss": 0.81300366, "num_input_tokens_seen": 149004895, "router_z_loss_clip": 1.76074219, "router_z_loss_mlp": 0.14660645, "step": 6942, "time_per_iteration": 2.624075174331665 }, { "auxiliary_loss_clip": 0.06479827, "auxiliary_loss_mlp": 0.012726, "balance_loss_clip": 0.06303325, "balance_loss_mlp": 0.01257425, "epoch": 0.41743574327371114, "flos": 22526968527360.0, "grad_norm": 3.544171007303408, "language_loss": 0.73518729, "learning_rate": 2.6214406757636534e-06, "loss": 0.8127116, "num_input_tokens_seen": 149020970, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.15185547, "step": 6943, "time_per_iteration": 2.5646896362304688 }, { "auxiliary_loss_clip": 0.06475326, "auxiliary_loss_mlp": 0.01273169, "balance_loss_clip": 0.0630048, "balance_loss_mlp": 0.01259097, "epoch": 0.4174958665263791, "flos": 30120587193600.0, "grad_norm": 1.6610837429070782, "language_loss": 0.64151263, "learning_rate": 2.621070480118111e-06, "loss": 0.71899754, "num_input_tokens_seen": 149041795, "router_z_loss_clip": 1.74804688, "router_z_loss_mlp": 0.14068604, "step": 6944, "time_per_iteration": 2.6369097232818604 }, { "auxiliary_loss_clip": 0.06475331, "auxiliary_loss_mlp": 0.0127614, "balance_loss_clip": 0.06301532, "balance_loss_mlp": 0.01261168, "epoch": 0.41755598977904707, "flos": 25270227348480.0, "grad_norm": 1.4372648886669288, "language_loss": 0.70479202, "learning_rate": 2.620700260921513e-06, "loss": 0.78230667, "num_input_tokens_seen": 149063700, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.1496582, "step": 6945, "time_per_iteration": 2.6051273345947266 }, { "auxiliary_loss_clip": 0.06469211, "auxiliary_loss_mlp": 0.01271761, "balance_loss_clip": 0.06296839, "balance_loss_mlp": 0.01256627, "epoch": 0.41761611303171503, "flos": 19834707715200.0, "grad_norm": 1.6700851636962555, "language_loss": 0.80906653, "learning_rate": 2.620330018187899e-06, "loss": 0.88647628, "num_input_tokens_seen": 149082410, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.15130615, "step": 6946, "time_per_iteration": 2.5563204288482666 }, { "auxiliary_loss_clip": 0.06471992, "auxiliary_loss_mlp": 0.01270718, "balance_loss_clip": 0.06300806, "balance_loss_mlp": 0.0125655, "epoch": 0.417676236284383, "flos": 15528655992960.0, "grad_norm": 2.0485961566436135, "language_loss": 0.77225995, "learning_rate": 2.6199597519313086e-06, "loss": 0.84968704, "num_input_tokens_seen": 149098745, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.14160156, "step": 6947, "time_per_iteration": 2.5373282432556152 }, { "auxiliary_loss_clip": 0.0647452, "auxiliary_loss_mlp": 0.01270033, "balance_loss_clip": 0.06301342, "balance_loss_mlp": 0.01255406, "epoch": 0.41773635953705096, "flos": 32532531770880.0, "grad_norm": 1.9308116147391414, "language_loss": 0.71838903, "learning_rate": 2.6195894621657825e-06, "loss": 0.79583454, "num_input_tokens_seen": 149122255, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.1461792, "step": 6948, "time_per_iteration": 2.6468114852905273 }, { "auxiliary_loss_clip": 0.06467989, "auxiliary_loss_mlp": 0.01269312, "balance_loss_clip": 0.06297858, "balance_loss_mlp": 0.01254834, "epoch": 0.4177964827897189, "flos": 23447719117440.0, "grad_norm": 1.5417494478228158, "language_loss": 0.77351367, "learning_rate": 2.619219148905362e-06, "loss": 0.85088664, "num_input_tokens_seen": 149142845, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.14459229, "step": 6949, "time_per_iteration": 2.58178448677063 }, { "auxiliary_loss_clip": 0.06475741, "auxiliary_loss_mlp": 0.01273618, "balance_loss_clip": 0.06299858, "balance_loss_mlp": 0.01258401, "epoch": 0.4178566060423869, "flos": 22755768900480.0, "grad_norm": 1.5641161511343649, "language_loss": 0.82195365, "learning_rate": 2.6188488121640888e-06, "loss": 0.89944726, "num_input_tokens_seen": 149163375, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.15222168, "step": 6950, "time_per_iteration": 2.582953453063965 }, { "auxiliary_loss_clip": 0.06462278, "auxiliary_loss_mlp": 0.01269246, "balance_loss_clip": 0.06296842, "balance_loss_mlp": 0.01255924, "epoch": 0.41791672929505486, "flos": 26040233243520.0, "grad_norm": 1.262303379588907, "language_loss": 0.76165694, "learning_rate": 2.618478451956007e-06, "loss": 0.83897221, "num_input_tokens_seen": 149185610, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.13323975, "step": 6951, "time_per_iteration": 2.6109619140625 }, { "auxiliary_loss_clip": 0.06480066, "auxiliary_loss_mlp": 0.01271714, "balance_loss_clip": 0.06300564, "balance_loss_mlp": 0.01256443, "epoch": 0.4179768525477228, "flos": 19574028063360.0, "grad_norm": 1.7392107460874608, "language_loss": 0.73348796, "learning_rate": 2.61810806829516e-06, "loss": 0.81100577, "num_input_tokens_seen": 149203990, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.152771, "step": 6952, "time_per_iteration": 2.5466885566711426 }, { "auxiliary_loss_clip": 0.0647404, "auxiliary_loss_mlp": 0.0127135, "balance_loss_clip": 0.06299929, "balance_loss_mlp": 0.01257427, "epoch": 0.4180369758003908, "flos": 17789352750720.0, "grad_norm": 5.141967465498643, "language_loss": 0.72231388, "learning_rate": 2.617737661195593e-06, "loss": 0.79976773, "num_input_tokens_seen": 149221385, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.1394043, "step": 6953, "time_per_iteration": 2.53261661529541 }, { "auxiliary_loss_clip": 0.06468578, "auxiliary_loss_mlp": 0.01271994, "balance_loss_clip": 0.06300622, "balance_loss_mlp": 0.01257474, "epoch": 0.41809709905305875, "flos": 20967152446080.0, "grad_norm": 2.019431991949277, "language_loss": 0.765517, "learning_rate": 2.617367230671353e-06, "loss": 0.84292275, "num_input_tokens_seen": 149241175, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.14520264, "step": 6954, "time_per_iteration": 2.569288969039917 }, { "auxiliary_loss_clip": 0.06463298, "auxiliary_loss_mlp": 0.01274064, "balance_loss_clip": 0.06292564, "balance_loss_mlp": 0.01258281, "epoch": 0.4181572223057267, "flos": 22024099048320.0, "grad_norm": 2.038315732116982, "language_loss": 0.85049188, "learning_rate": 2.616996776736485e-06, "loss": 0.92786551, "num_input_tokens_seen": 149259115, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.15808105, "step": 6955, "time_per_iteration": 2.561657428741455 }, { "auxiliary_loss_clip": 0.06469531, "auxiliary_loss_mlp": 0.01272755, "balance_loss_clip": 0.06298703, "balance_loss_mlp": 0.01257866, "epoch": 0.4182173455583947, "flos": 26251969311360.0, "grad_norm": 1.5684449762813961, "language_loss": 0.83421975, "learning_rate": 2.616626299405037e-06, "loss": 0.91164255, "num_input_tokens_seen": 149278705, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.14874268, "step": 6956, "time_per_iteration": 2.6093733310699463 }, { "auxiliary_loss_clip": 0.06477696, "auxiliary_loss_mlp": 0.01275592, "balance_loss_clip": 0.06301058, "balance_loss_mlp": 0.01260274, "epoch": 0.4182774688110627, "flos": 14796566870400.0, "grad_norm": 1.8375603464740795, "language_loss": 0.72453833, "learning_rate": 2.616255798691059e-06, "loss": 0.80207121, "num_input_tokens_seen": 149294040, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.15307617, "step": 6957, "time_per_iteration": 2.659233570098877 }, { "auxiliary_loss_clip": 0.06470069, "auxiliary_loss_mlp": 0.01270812, "balance_loss_clip": 0.06297484, "balance_loss_mlp": 0.01257783, "epoch": 0.41833759206373067, "flos": 20418190421760.0, "grad_norm": 1.885947729641732, "language_loss": 0.75547063, "learning_rate": 2.6158852746085982e-06, "loss": 0.83287942, "num_input_tokens_seen": 149310385, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.13024902, "step": 6958, "time_per_iteration": 2.5792760848999023 }, { "auxiliary_loss_clip": 0.06467288, "auxiliary_loss_mlp": 0.01271913, "balance_loss_clip": 0.0629564, "balance_loss_mlp": 0.01257477, "epoch": 0.41839771531639863, "flos": 23662557786240.0, "grad_norm": 2.5891809331632585, "language_loss": 0.77484614, "learning_rate": 2.6155147271717066e-06, "loss": 0.85223818, "num_input_tokens_seen": 149328235, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.14422607, "step": 6959, "time_per_iteration": 2.5723111629486084 }, { "auxiliary_loss_clip": 0.06466764, "auxiliary_loss_mlp": 0.01277269, "balance_loss_clip": 0.06295107, "balance_loss_mlp": 0.01260586, "epoch": 0.4184578385690666, "flos": 19760006200320.0, "grad_norm": 1.5729616720377406, "language_loss": 0.76855421, "learning_rate": 2.6151441563944347e-06, "loss": 0.84599447, "num_input_tokens_seen": 149347465, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.16674805, "step": 6960, "time_per_iteration": 2.566213607788086 }, { "auxiliary_loss_clip": 0.06461142, "auxiliary_loss_mlp": 0.01266942, "balance_loss_clip": 0.06296968, "balance_loss_mlp": 0.01254175, "epoch": 0.41851796182173456, "flos": 20199578319360.0, "grad_norm": 1.639387165389999, "language_loss": 0.76378745, "learning_rate": 2.614773562290835e-06, "loss": 0.84106833, "num_input_tokens_seen": 149366685, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.12774658, "step": 6961, "time_per_iteration": 2.5731828212738037 }, { "auxiliary_loss_clip": 0.06353405, "auxiliary_loss_mlp": 0.01252905, "balance_loss_clip": 0.06278527, "balance_loss_mlp": 0.01249984, "epoch": 0.41857808507440253, "flos": 59038331898240.0, "grad_norm": 0.7615690764170492, "language_loss": 0.54722762, "learning_rate": 2.61440294487496e-06, "loss": 0.62329078, "num_input_tokens_seen": 149422925, "router_z_loss_clip": 0.75, "router_z_loss_mlp": 0.0291748, "step": 6962, "time_per_iteration": 4.486556768417358 }, { "auxiliary_loss_clip": 0.06471389, "auxiliary_loss_mlp": 0.01275413, "balance_loss_clip": 0.06296104, "balance_loss_mlp": 0.01261066, "epoch": 0.4186382083270705, "flos": 18484740984960.0, "grad_norm": 2.062388617362537, "language_loss": 0.85499972, "learning_rate": 2.614032304160864e-06, "loss": 0.9324677, "num_input_tokens_seen": 149440820, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.14349365, "step": 6963, "time_per_iteration": 2.530250072479248 }, { "auxiliary_loss_clip": 0.06472962, "auxiliary_loss_mlp": 0.01270556, "balance_loss_clip": 0.0630001, "balance_loss_mlp": 0.01256245, "epoch": 0.41869833157973846, "flos": 21584988126720.0, "grad_norm": 1.5343439613242846, "language_loss": 0.70575321, "learning_rate": 2.6136616401626014e-06, "loss": 0.78318846, "num_input_tokens_seen": 149461060, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.14312744, "step": 6964, "time_per_iteration": 2.563286781311035 }, { "auxiliary_loss_clip": 0.06472158, "auxiliary_loss_mlp": 0.01272626, "balance_loss_clip": 0.0630293, "balance_loss_mlp": 0.01259823, "epoch": 0.4187584548324064, "flos": 35526156192000.0, "grad_norm": 2.1727018432897935, "language_loss": 0.71144831, "learning_rate": 2.6132909528942273e-06, "loss": 0.78889608, "num_input_tokens_seen": 149483115, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.12799072, "step": 6965, "time_per_iteration": 2.6885671615600586 }, { "auxiliary_loss_clip": 0.0645998, "auxiliary_loss_mlp": 0.01270431, "balance_loss_clip": 0.06293532, "balance_loss_mlp": 0.01258594, "epoch": 0.4188185780850744, "flos": 18660950121600.0, "grad_norm": 1.522908868863189, "language_loss": 0.71831995, "learning_rate": 2.6129202423697997e-06, "loss": 0.79562408, "num_input_tokens_seen": 149501495, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.11828613, "step": 6966, "time_per_iteration": 4.083525896072388 }, { "auxiliary_loss_clip": 0.06473973, "auxiliary_loss_mlp": 0.01274914, "balance_loss_clip": 0.0629618, "balance_loss_mlp": 0.01260567, "epoch": 0.41887870133774235, "flos": 40342959676800.0, "grad_norm": 1.8625482493232837, "language_loss": 0.71365523, "learning_rate": 2.612549508603375e-06, "loss": 0.79114413, "num_input_tokens_seen": 149523170, "router_z_loss_clip": 1.78027344, "router_z_loss_mlp": 0.14343262, "step": 6967, "time_per_iteration": 2.7243809700012207 }, { "auxiliary_loss_clip": 0.06358943, "auxiliary_loss_mlp": 0.01258552, "balance_loss_clip": 0.06283988, "balance_loss_mlp": 0.01255476, "epoch": 0.4189388245904103, "flos": 61388083946880.0, "grad_norm": 0.6552131188707987, "language_loss": 0.46022272, "learning_rate": 2.612178751609011e-06, "loss": 0.5363977, "num_input_tokens_seen": 149583955, "router_z_loss_clip": 0.75, "router_z_loss_mlp": 0.03071594, "step": 6968, "time_per_iteration": 3.1941144466400146 }, { "auxiliary_loss_clip": 0.06465179, "auxiliary_loss_mlp": 0.01269878, "balance_loss_clip": 0.06290055, "balance_loss_mlp": 0.01255144, "epoch": 0.4189989478430783, "flos": 28222371198720.0, "grad_norm": 1.6253952426641785, "language_loss": 0.75745893, "learning_rate": 2.6118079714007685e-06, "loss": 0.83480954, "num_input_tokens_seen": 149604440, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.14746094, "step": 6969, "time_per_iteration": 2.6149346828460693 }, { "auxiliary_loss_clip": 0.06463821, "auxiliary_loss_mlp": 0.01270117, "balance_loss_clip": 0.0629386, "balance_loss_mlp": 0.01256611, "epoch": 0.4190590710957463, "flos": 24571820367360.0, "grad_norm": 2.3271984481369907, "language_loss": 0.81254864, "learning_rate": 2.611437167992705e-06, "loss": 0.88988805, "num_input_tokens_seen": 149623745, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.13525391, "step": 6970, "time_per_iteration": 2.586690664291382 }, { "auxiliary_loss_clip": 0.06464978, "auxiliary_loss_mlp": 0.01271616, "balance_loss_clip": 0.0629642, "balance_loss_mlp": 0.01258062, "epoch": 0.41911919434841427, "flos": 21732504439680.0, "grad_norm": 1.6675826585233684, "language_loss": 0.84054077, "learning_rate": 2.6110663413988835e-06, "loss": 0.91790676, "num_input_tokens_seen": 149643025, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.13549805, "step": 6971, "time_per_iteration": 2.577756643295288 }, { "auxiliary_loss_clip": 0.06460361, "auxiliary_loss_mlp": 0.01277298, "balance_loss_clip": 0.06294268, "balance_loss_mlp": 0.01262349, "epoch": 0.41917931760108224, "flos": 17607064193280.0, "grad_norm": 1.9138087161549713, "language_loss": 0.75032628, "learning_rate": 2.6106954916333648e-06, "loss": 0.82770288, "num_input_tokens_seen": 149660695, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.14953613, "step": 6972, "time_per_iteration": 2.5523078441619873 }, { "auxiliary_loss_clip": 0.06463607, "auxiliary_loss_mlp": 0.01268423, "balance_loss_clip": 0.06292748, "balance_loss_mlp": 0.01254988, "epoch": 0.4192394408537502, "flos": 37825943679360.0, "grad_norm": 1.5678789992169917, "language_loss": 0.73498189, "learning_rate": 2.610324618710212e-06, "loss": 0.81230223, "num_input_tokens_seen": 149682040, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.13439941, "step": 6973, "time_per_iteration": 2.6934707164764404 }, { "auxiliary_loss_clip": 0.0648008, "auxiliary_loss_mlp": 0.01274392, "balance_loss_clip": 0.06301595, "balance_loss_mlp": 0.01260659, "epoch": 0.41929956410641817, "flos": 23113637688960.0, "grad_norm": 2.4878557277456834, "language_loss": 0.74078393, "learning_rate": 2.609953722643489e-06, "loss": 0.81832862, "num_input_tokens_seen": 149700855, "router_z_loss_clip": 1.78417969, "router_z_loss_mlp": 0.1373291, "step": 6974, "time_per_iteration": 3.9173216819763184 }, { "auxiliary_loss_clip": 0.06459543, "auxiliary_loss_mlp": 0.01269958, "balance_loss_clip": 0.06289367, "balance_loss_mlp": 0.0125632, "epoch": 0.41935968735908613, "flos": 22530448471680.0, "grad_norm": 1.7753535034351902, "language_loss": 0.73429435, "learning_rate": 2.609582803447259e-06, "loss": 0.81158936, "num_input_tokens_seen": 149717360, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.13647461, "step": 6975, "time_per_iteration": 2.63026762008667 }, { "auxiliary_loss_clip": 0.06464896, "auxiliary_loss_mlp": 0.0126955, "balance_loss_clip": 0.06297261, "balance_loss_mlp": 0.01254952, "epoch": 0.4194198106117541, "flos": 26877771129600.0, "grad_norm": 1.4956032373556454, "language_loss": 0.81324172, "learning_rate": 2.6092118611355885e-06, "loss": 0.89058614, "num_input_tokens_seen": 149738975, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.14605713, "step": 6976, "time_per_iteration": 4.059283256530762 }, { "auxiliary_loss_clip": 0.06467363, "auxiliary_loss_mlp": 0.01265044, "balance_loss_clip": 0.06295818, "balance_loss_mlp": 0.012522, "epoch": 0.41947993386442206, "flos": 19908696470400.0, "grad_norm": 1.6932034616305285, "language_loss": 0.67505717, "learning_rate": 2.6088408957225425e-06, "loss": 0.75238132, "num_input_tokens_seen": 149757055, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.12835693, "step": 6977, "time_per_iteration": 2.542165994644165 }, { "auxiliary_loss_clip": 0.06472044, "auxiliary_loss_mlp": 0.01268571, "balance_loss_clip": 0.06298183, "balance_loss_mlp": 0.01255589, "epoch": 0.41954005711709, "flos": 17389584120960.0, "grad_norm": 2.1426791213626593, "language_loss": 0.81290436, "learning_rate": 2.6084699072221898e-06, "loss": 0.89031053, "num_input_tokens_seen": 149772885, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.12976074, "step": 6978, "time_per_iteration": 2.554593086242676 }, { "auxiliary_loss_clip": 0.06467635, "auxiliary_loss_mlp": 0.01272809, "balance_loss_clip": 0.06291935, "balance_loss_mlp": 0.01259297, "epoch": 0.419600180369758, "flos": 25009254207360.0, "grad_norm": 1.9388101178361643, "language_loss": 0.83189178, "learning_rate": 2.6080988956485964e-06, "loss": 0.90929621, "num_input_tokens_seen": 149791515, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.13519287, "step": 6979, "time_per_iteration": 2.589118480682373 }, { "auxiliary_loss_clip": 0.06468275, "auxiliary_loss_mlp": 0.01266703, "balance_loss_clip": 0.06298688, "balance_loss_mlp": 0.0125384, "epoch": 0.41966030362242596, "flos": 17389458339840.0, "grad_norm": 2.0132847419166215, "language_loss": 0.83872002, "learning_rate": 2.6077278610158325e-06, "loss": 0.91606975, "num_input_tokens_seen": 149807250, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.12860107, "step": 6980, "time_per_iteration": 2.5209765434265137 }, { "auxiliary_loss_clip": 0.06470437, "auxiliary_loss_mlp": 0.01272402, "balance_loss_clip": 0.06295729, "balance_loss_mlp": 0.01259182, "epoch": 0.4197204268750939, "flos": 22161427090560.0, "grad_norm": 2.3636928842476985, "language_loss": 0.79176199, "learning_rate": 2.6073568033379665e-06, "loss": 0.86919045, "num_input_tokens_seen": 149821640, "router_z_loss_clip": 1.74511719, "router_z_loss_mlp": 0.13226318, "step": 6981, "time_per_iteration": 2.5447347164154053 }, { "auxiliary_loss_clip": 0.06466605, "auxiliary_loss_mlp": 0.01271367, "balance_loss_clip": 0.06300751, "balance_loss_mlp": 0.01258719, "epoch": 0.4197805501277619, "flos": 22089534687360.0, "grad_norm": 1.596916698735347, "language_loss": 0.84621733, "learning_rate": 2.6069857226290696e-06, "loss": 0.9235971, "num_input_tokens_seen": 149840545, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.12634277, "step": 6982, "time_per_iteration": 2.569683790206909 }, { "auxiliary_loss_clip": 0.064702, "auxiliary_loss_mlp": 0.01275972, "balance_loss_clip": 0.06297306, "balance_loss_mlp": 0.01262472, "epoch": 0.4198406733804299, "flos": 26439372967680.0, "grad_norm": 2.3675151506243863, "language_loss": 0.57665807, "learning_rate": 2.606614618903214e-06, "loss": 0.65411979, "num_input_tokens_seen": 149860375, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.13494873, "step": 6983, "time_per_iteration": 2.5929627418518066 }, { "auxiliary_loss_clip": 0.0646809, "auxiliary_loss_mlp": 0.01268259, "balance_loss_clip": 0.0630099, "balance_loss_mlp": 0.01256046, "epoch": 0.4199007966330979, "flos": 12535870112640.0, "grad_norm": 2.167830831745686, "language_loss": 0.83042085, "learning_rate": 2.606243492174471e-06, "loss": 0.90778434, "num_input_tokens_seen": 149877850, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.12219238, "step": 6984, "time_per_iteration": 2.533973217010498 }, { "auxiliary_loss_clip": 0.06472513, "auxiliary_loss_mlp": 0.01268704, "balance_loss_clip": 0.06302557, "balance_loss_mlp": 0.01256622, "epoch": 0.41996091988576584, "flos": 21769498817280.0, "grad_norm": 1.4682112949183868, "language_loss": 0.78632009, "learning_rate": 2.605872342456914e-06, "loss": 0.86373222, "num_input_tokens_seen": 149896110, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.12078857, "step": 6985, "time_per_iteration": 2.572613000869751 }, { "auxiliary_loss_clip": 0.06482378, "auxiliary_loss_mlp": 0.01267631, "balance_loss_clip": 0.06304882, "balance_loss_mlp": 0.01253182, "epoch": 0.4200210431384338, "flos": 26549182143360.0, "grad_norm": 1.5936008985514034, "language_loss": 0.78346467, "learning_rate": 2.6055011697646173e-06, "loss": 0.86096478, "num_input_tokens_seen": 149916495, "router_z_loss_clip": 1.77636719, "router_z_loss_mlp": 0.14453125, "step": 6986, "time_per_iteration": 2.617225170135498 }, { "auxiliary_loss_clip": 0.06463405, "auxiliary_loss_mlp": 0.01268236, "balance_loss_clip": 0.06298508, "balance_loss_mlp": 0.01256184, "epoch": 0.42008116639110177, "flos": 26802859979520.0, "grad_norm": 1.4940679626325732, "language_loss": 0.72550344, "learning_rate": 2.605129974111655e-06, "loss": 0.80281985, "num_input_tokens_seen": 149936445, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.1206665, "step": 6987, "time_per_iteration": 2.603672981262207 }, { "auxiliary_loss_clip": 0.06469081, "auxiliary_loss_mlp": 0.01273802, "balance_loss_clip": 0.06298221, "balance_loss_mlp": 0.01260701, "epoch": 0.42014128964376973, "flos": 32095433347200.0, "grad_norm": 1.4302813263672334, "language_loss": 0.75222874, "learning_rate": 2.604758755512104e-06, "loss": 0.82965755, "num_input_tokens_seen": 149959430, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.13110352, "step": 6988, "time_per_iteration": 2.640251636505127 }, { "auxiliary_loss_clip": 0.06474573, "auxiliary_loss_mlp": 0.01268874, "balance_loss_clip": 0.06300697, "balance_loss_mlp": 0.01255106, "epoch": 0.4202014128964377, "flos": 26474061358080.0, "grad_norm": 1.534941547180269, "language_loss": 0.74412298, "learning_rate": 2.60438751398004e-06, "loss": 0.82155746, "num_input_tokens_seen": 149980365, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.13769531, "step": 6989, "time_per_iteration": 2.598891258239746 }, { "auxiliary_loss_clip": 0.06470972, "auxiliary_loss_mlp": 0.01269827, "balance_loss_clip": 0.06298609, "balance_loss_mlp": 0.01256368, "epoch": 0.42026153614910566, "flos": 13405287277440.0, "grad_norm": 2.2914369025567045, "language_loss": 0.71410966, "learning_rate": 2.6040162495295404e-06, "loss": 0.79151767, "num_input_tokens_seen": 149997375, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.13458252, "step": 6990, "time_per_iteration": 2.5272574424743652 }, { "auxiliary_loss_clip": 0.06370977, "auxiliary_loss_mlp": 0.01263971, "balance_loss_clip": 0.06296623, "balance_loss_mlp": 0.01260693, "epoch": 0.42032165940177363, "flos": 60268720452480.0, "grad_norm": 0.8190936449660472, "language_loss": 0.60319316, "learning_rate": 2.603644962174685e-06, "loss": 0.67954266, "num_input_tokens_seen": 150051230, "router_z_loss_clip": 0.7421875, "router_z_loss_mlp": 0.03283691, "step": 6991, "time_per_iteration": 3.0814857482910156 }, { "auxiliary_loss_clip": 0.06469792, "auxiliary_loss_mlp": 0.01271537, "balance_loss_clip": 0.06298652, "balance_loss_mlp": 0.01257423, "epoch": 0.4203817826544416, "flos": 24542121294720.0, "grad_norm": 1.5435488195341256, "language_loss": 0.83498323, "learning_rate": 2.6032736519295517e-06, "loss": 0.91239655, "num_input_tokens_seen": 150071135, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.14111328, "step": 6992, "time_per_iteration": 2.596576690673828 }, { "auxiliary_loss_clip": 0.06370571, "auxiliary_loss_mlp": 0.0126743, "balance_loss_clip": 0.06296436, "balance_loss_mlp": 0.01264463, "epoch": 0.42044190590710956, "flos": 58837679297280.0, "grad_norm": 0.8031926762071825, "language_loss": 0.6525836, "learning_rate": 2.6029023188082217e-06, "loss": 0.72896361, "num_input_tokens_seen": 150125220, "router_z_loss_clip": 0.7421875, "router_z_loss_mlp": 0.02964783, "step": 6993, "time_per_iteration": 3.1632540225982666 }, { "auxiliary_loss_clip": 0.06472756, "auxiliary_loss_mlp": 0.01274848, "balance_loss_clip": 0.06295371, "balance_loss_mlp": 0.01259649, "epoch": 0.4205020291597775, "flos": 16441733934720.0, "grad_norm": 3.3844347108473305, "language_loss": 0.83870745, "learning_rate": 2.6025309628247746e-06, "loss": 0.91618353, "num_input_tokens_seen": 150142300, "router_z_loss_clip": 1.77246094, "router_z_loss_mlp": 0.15197754, "step": 6994, "time_per_iteration": 2.5198988914489746 }, { "auxiliary_loss_clip": 0.06463042, "auxiliary_loss_mlp": 0.01271338, "balance_loss_clip": 0.06296853, "balance_loss_mlp": 0.01257986, "epoch": 0.4205621524124455, "flos": 18411548843520.0, "grad_norm": 1.6129015115268124, "language_loss": 0.79357588, "learning_rate": 2.6021595839932934e-06, "loss": 0.8709197, "num_input_tokens_seen": 150161345, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.13323975, "step": 6995, "time_per_iteration": 2.5568795204162598 }, { "auxiliary_loss_clip": 0.06457105, "auxiliary_loss_mlp": 0.01269145, "balance_loss_clip": 0.06290887, "balance_loss_mlp": 0.01256634, "epoch": 0.4206222756651135, "flos": 25527133566720.0, "grad_norm": 1.3265659087183512, "language_loss": 0.80591428, "learning_rate": 2.60178818232786e-06, "loss": 0.88317668, "num_input_tokens_seen": 150182420, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.12524414, "step": 6996, "time_per_iteration": 2.6330952644348145 }, { "auxiliary_loss_clip": 0.0645985, "auxiliary_loss_mlp": 0.01267874, "balance_loss_clip": 0.06288882, "balance_loss_mlp": 0.01255089, "epoch": 0.4206823989177815, "flos": 15309708474240.0, "grad_norm": 2.5091959191859536, "language_loss": 0.75796366, "learning_rate": 2.601416757842559e-06, "loss": 0.83524096, "num_input_tokens_seen": 150200175, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.12774658, "step": 6997, "time_per_iteration": 2.5444817543029785 }, { "auxiliary_loss_clip": 0.06458817, "auxiliary_loss_mlp": 0.01270355, "balance_loss_clip": 0.06287729, "balance_loss_mlp": 0.01257456, "epoch": 0.42074252217044944, "flos": 15558564700800.0, "grad_norm": 1.7624893799235382, "language_loss": 0.76211953, "learning_rate": 2.6010453105514743e-06, "loss": 0.83941126, "num_input_tokens_seen": 150217100, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.12908936, "step": 6998, "time_per_iteration": 2.5412209033966064 }, { "auxiliary_loss_clip": 0.06464262, "auxiliary_loss_mlp": 0.01276617, "balance_loss_clip": 0.06289963, "balance_loss_mlp": 0.01263248, "epoch": 0.4208026454231174, "flos": 26153941633920.0, "grad_norm": 1.583625998512818, "language_loss": 0.76446366, "learning_rate": 2.60067384046869e-06, "loss": 0.84187245, "num_input_tokens_seen": 150239830, "router_z_loss_clip": 1.74121094, "router_z_loss_mlp": 0.13366699, "step": 6999, "time_per_iteration": 2.653050184249878 }, { "auxiliary_loss_clip": 0.06456883, "auxiliary_loss_mlp": 0.01267126, "balance_loss_clip": 0.06289427, "balance_loss_mlp": 0.01253817, "epoch": 0.42086276867578537, "flos": 23556857460480.0, "grad_norm": 1.873380879452436, "language_loss": 0.64513111, "learning_rate": 2.600302347608295e-06, "loss": 0.72237122, "num_input_tokens_seen": 150260690, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.13305664, "step": 7000, "time_per_iteration": 2.596665382385254 }, { "auxiliary_loss_clip": 0.06459615, "auxiliary_loss_mlp": 0.01271123, "balance_loss_clip": 0.06288084, "balance_loss_mlp": 0.01257688, "epoch": 0.42092289192845334, "flos": 18119199548160.0, "grad_norm": 1.4939084132292435, "language_loss": 0.76643586, "learning_rate": 2.5999308319843743e-06, "loss": 0.84374321, "num_input_tokens_seen": 150279885, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.13458252, "step": 7001, "time_per_iteration": 3.982708692550659 }, { "auxiliary_loss_clip": 0.06452405, "auxiliary_loss_mlp": 0.01269739, "balance_loss_clip": 0.06285295, "balance_loss_mlp": 0.01257085, "epoch": 0.4209830151811213, "flos": 20012006954880.0, "grad_norm": 1.5107031864015217, "language_loss": 0.86817545, "learning_rate": 2.5995592936110154e-06, "loss": 0.9453969, "num_input_tokens_seen": 150297390, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.12652588, "step": 7002, "time_per_iteration": 2.5559206008911133 }, { "auxiliary_loss_clip": 0.06452305, "auxiliary_loss_mlp": 0.01267695, "balance_loss_clip": 0.06284595, "balance_loss_mlp": 0.01255518, "epoch": 0.42104313843378927, "flos": 21985050245760.0, "grad_norm": 4.086143728322972, "language_loss": 0.68243349, "learning_rate": 2.5991877325023096e-06, "loss": 0.75963342, "num_input_tokens_seen": 150317390, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.12188721, "step": 7003, "time_per_iteration": 2.557227611541748 }, { "auxiliary_loss_clip": 0.06457625, "auxiliary_loss_mlp": 0.0126922, "balance_loss_clip": 0.06285034, "balance_loss_mlp": 0.01255046, "epoch": 0.42110326168645723, "flos": 25450461480960.0, "grad_norm": 1.796190499399389, "language_loss": 0.7767548, "learning_rate": 2.598816148672344e-06, "loss": 0.85402322, "num_input_tokens_seen": 150337455, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.14154053, "step": 7004, "time_per_iteration": 2.7561049461364746 }, { "auxiliary_loss_clip": 0.06451201, "auxiliary_loss_mlp": 0.01269919, "balance_loss_clip": 0.06285513, "balance_loss_mlp": 0.01256461, "epoch": 0.4211633849391252, "flos": 17828485407360.0, "grad_norm": 1.757441207204908, "language_loss": 0.68655741, "learning_rate": 2.59844454213521e-06, "loss": 0.76376855, "num_input_tokens_seen": 150355385, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.13470459, "step": 7005, "time_per_iteration": 2.6348302364349365 }, { "auxiliary_loss_clip": 0.06459948, "auxiliary_loss_mlp": 0.01270257, "balance_loss_clip": 0.06287713, "balance_loss_mlp": 0.01257155, "epoch": 0.42122350819179316, "flos": 16286796535680.0, "grad_norm": 1.6792872883850076, "language_loss": 0.73074251, "learning_rate": 2.5980729129049994e-06, "loss": 0.80804455, "num_input_tokens_seen": 150371750, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.13116455, "step": 7006, "time_per_iteration": 4.077740907669067 }, { "auxiliary_loss_clip": 0.06456894, "auxiliary_loss_mlp": 0.01268781, "balance_loss_clip": 0.06284707, "balance_loss_mlp": 0.01254804, "epoch": 0.4212836314444611, "flos": 19651916033280.0, "grad_norm": 1.597436098698458, "language_loss": 0.71602494, "learning_rate": 2.5977012609958033e-06, "loss": 0.79328167, "num_input_tokens_seen": 150389955, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.13964844, "step": 7007, "time_per_iteration": 2.5383057594299316 }, { "auxiliary_loss_clip": 0.06455474, "auxiliary_loss_mlp": 0.01270631, "balance_loss_clip": 0.06283748, "balance_loss_mlp": 0.01257309, "epoch": 0.4213437546971291, "flos": 18374889882240.0, "grad_norm": 1.7745389635625264, "language_loss": 0.82970512, "learning_rate": 2.5973295864217166e-06, "loss": 0.90696621, "num_input_tokens_seen": 150405780, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.13317871, "step": 7008, "time_per_iteration": 2.5320627689361572 }, { "auxiliary_loss_clip": 0.06460162, "auxiliary_loss_mlp": 0.01267798, "balance_loss_clip": 0.06289319, "balance_loss_mlp": 0.0125509, "epoch": 0.42140387794979706, "flos": 27711116311680.0, "grad_norm": 2.2910617028721183, "language_loss": 0.7258594, "learning_rate": 2.596957889196831e-06, "loss": 0.80313897, "num_input_tokens_seen": 150425615, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.12689209, "step": 7009, "time_per_iteration": 2.606198787689209 }, { "auxiliary_loss_clip": 0.06462383, "auxiliary_loss_mlp": 0.01268937, "balance_loss_clip": 0.06287162, "balance_loss_mlp": 0.01255091, "epoch": 0.4214640012024651, "flos": 28154545718400.0, "grad_norm": 2.127655407961532, "language_loss": 0.66729695, "learning_rate": 2.596586169335243e-06, "loss": 0.74461019, "num_input_tokens_seen": 150445765, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.13867188, "step": 7010, "time_per_iteration": 2.6060640811920166 }, { "auxiliary_loss_clip": 0.06455168, "auxiliary_loss_mlp": 0.01273966, "balance_loss_clip": 0.06285107, "balance_loss_mlp": 0.01260388, "epoch": 0.42152412445513304, "flos": 23002989972480.0, "grad_norm": 1.5248603087103534, "language_loss": 0.72649932, "learning_rate": 2.5962144268510477e-06, "loss": 0.80379069, "num_input_tokens_seen": 150464405, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.13586426, "step": 7011, "time_per_iteration": 2.551940441131592 }, { "auxiliary_loss_clip": 0.06347367, "auxiliary_loss_mlp": 0.01257552, "balance_loss_clip": 0.0627269, "balance_loss_mlp": 0.01254345, "epoch": 0.421584247707801, "flos": 63767855756160.0, "grad_norm": 0.7839673839582972, "language_loss": 0.54325205, "learning_rate": 2.5958426617583417e-06, "loss": 0.61930126, "num_input_tokens_seen": 150520430, "router_z_loss_clip": 0.74707031, "router_z_loss_mlp": 0.03208923, "step": 7012, "time_per_iteration": 3.0794713497161865 }, { "auxiliary_loss_clip": 0.06463571, "auxiliary_loss_mlp": 0.01270542, "balance_loss_clip": 0.06289583, "balance_loss_mlp": 0.01255223, "epoch": 0.421644370960469, "flos": 24321203205120.0, "grad_norm": 1.2978428310158527, "language_loss": 0.78913951, "learning_rate": 2.5954708740712215e-06, "loss": 0.86648059, "num_input_tokens_seen": 150542610, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.15319824, "step": 7013, "time_per_iteration": 2.6028387546539307 }, { "auxiliary_loss_clip": 0.06461833, "auxiliary_loss_mlp": 0.01273103, "balance_loss_clip": 0.0628867, "balance_loss_mlp": 0.01258071, "epoch": 0.42170449421313694, "flos": 23447425628160.0, "grad_norm": 1.8583151452476567, "language_loss": 0.81680244, "learning_rate": 2.595099063803787e-06, "loss": 0.89415181, "num_input_tokens_seen": 150560970, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.15026855, "step": 7014, "time_per_iteration": 3.9509785175323486 }, { "auxiliary_loss_clip": 0.06463172, "auxiliary_loss_mlp": 0.01274266, "balance_loss_clip": 0.06289551, "balance_loss_mlp": 0.012607, "epoch": 0.4217646174658049, "flos": 23702151640320.0, "grad_norm": 1.5568019552417416, "language_loss": 0.78202248, "learning_rate": 2.5947272309701354e-06, "loss": 0.85939682, "num_input_tokens_seen": 150582615, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.13555908, "step": 7015, "time_per_iteration": 2.607698917388916 }, { "auxiliary_loss_clip": 0.06461295, "auxiliary_loss_mlp": 0.01273753, "balance_loss_clip": 0.06286839, "balance_loss_mlp": 0.01259448, "epoch": 0.42182474071847287, "flos": 24978297323520.0, "grad_norm": 1.323408123096498, "language_loss": 0.82522428, "learning_rate": 2.594355375584368e-06, "loss": 0.90257478, "num_input_tokens_seen": 150603640, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.14306641, "step": 7016, "time_per_iteration": 4.047183275222778 }, { "auxiliary_loss_clip": 0.06466216, "auxiliary_loss_mlp": 0.01269818, "balance_loss_clip": 0.06294374, "balance_loss_mlp": 0.01256001, "epoch": 0.42188486397114083, "flos": 22863230161920.0, "grad_norm": 2.945550094039944, "language_loss": 0.68286633, "learning_rate": 2.593983497660586e-06, "loss": 0.76022673, "num_input_tokens_seen": 150622490, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.13824463, "step": 7017, "time_per_iteration": 2.584306001663208 }, { "auxiliary_loss_clip": 0.06353992, "auxiliary_loss_mlp": 0.01252961, "balance_loss_clip": 0.06278738, "balance_loss_mlp": 0.01249872, "epoch": 0.4219449872238088, "flos": 66997072730880.0, "grad_norm": 0.6800822497726996, "language_loss": 0.59473562, "learning_rate": 2.5936115972128895e-06, "loss": 0.67080516, "num_input_tokens_seen": 150689545, "router_z_loss_clip": 0.75292969, "router_z_loss_mlp": 0.03086853, "step": 7018, "time_per_iteration": 3.23689341545105 }, { "auxiliary_loss_clip": 0.06471856, "auxiliary_loss_mlp": 0.01267434, "balance_loss_clip": 0.06295731, "balance_loss_mlp": 0.0125413, "epoch": 0.42200511047647676, "flos": 13120400995200.0, "grad_norm": 1.6845678847252916, "language_loss": 0.75561082, "learning_rate": 2.593239674255382e-06, "loss": 0.8330037, "num_input_tokens_seen": 150707610, "router_z_loss_clip": 1.76074219, "router_z_loss_mlp": 0.13299561, "step": 7019, "time_per_iteration": 2.544419765472412 }, { "auxiliary_loss_clip": 0.06464943, "auxiliary_loss_mlp": 0.01271038, "balance_loss_clip": 0.06292205, "balance_loss_mlp": 0.01256589, "epoch": 0.42206523372914473, "flos": 13996400705280.0, "grad_norm": 1.8972632727093461, "language_loss": 0.69252014, "learning_rate": 2.592867728802166e-06, "loss": 0.76987994, "num_input_tokens_seen": 150724530, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.14434814, "step": 7020, "time_per_iteration": 2.544605255126953 }, { "auxiliary_loss_clip": 0.06461845, "auxiliary_loss_mlp": 0.0127094, "balance_loss_clip": 0.06295912, "balance_loss_mlp": 0.01257993, "epoch": 0.4221253569818127, "flos": 21948391284480.0, "grad_norm": 1.7337548681984447, "language_loss": 0.81346035, "learning_rate": 2.592495760867347e-06, "loss": 0.89078814, "num_input_tokens_seen": 150742870, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.1295166, "step": 7021, "time_per_iteration": 2.5618250370025635 }, { "auxiliary_loss_clip": 0.06465397, "auxiliary_loss_mlp": 0.01268796, "balance_loss_clip": 0.06292558, "balance_loss_mlp": 0.01254944, "epoch": 0.42218548023448066, "flos": 32200001642880.0, "grad_norm": 1.5204377264151834, "language_loss": 0.70359522, "learning_rate": 2.5921237704650293e-06, "loss": 0.78093719, "num_input_tokens_seen": 150765500, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.1385498, "step": 7022, "time_per_iteration": 2.7921226024627686 }, { "auxiliary_loss_clip": 0.06455804, "auxiliary_loss_mlp": 0.01272281, "balance_loss_clip": 0.06291343, "balance_loss_mlp": 0.01258881, "epoch": 0.4222456034871487, "flos": 30127043957760.0, "grad_norm": 1.704515681281772, "language_loss": 0.6773023, "learning_rate": 2.5917517576093188e-06, "loss": 0.75458312, "num_input_tokens_seen": 150784945, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.13397217, "step": 7023, "time_per_iteration": 2.6854424476623535 }, { "auxiliary_loss_clip": 0.06459701, "auxiliary_loss_mlp": 0.01273181, "balance_loss_clip": 0.06294191, "balance_loss_mlp": 0.01259102, "epoch": 0.42230572673981664, "flos": 22134537129600.0, "grad_norm": 2.645506858883467, "language_loss": 0.69828498, "learning_rate": 2.591379722314322e-06, "loss": 0.77561378, "num_input_tokens_seen": 150803120, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.14086914, "step": 7024, "time_per_iteration": 2.5776233673095703 }, { "auxiliary_loss_clip": 0.0646145, "auxiliary_loss_mlp": 0.01269675, "balance_loss_clip": 0.06289484, "balance_loss_mlp": 0.01256276, "epoch": 0.4223658499924846, "flos": 22061722331520.0, "grad_norm": 1.4593937572296976, "language_loss": 0.77065611, "learning_rate": 2.591007664594147e-06, "loss": 0.84796739, "num_input_tokens_seen": 150823135, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.1340332, "step": 7025, "time_per_iteration": 2.5713682174682617 }, { "auxiliary_loss_clip": 0.06457864, "auxiliary_loss_mlp": 0.01272083, "balance_loss_clip": 0.0629113, "balance_loss_mlp": 0.01259101, "epoch": 0.4224259732451526, "flos": 20416681048320.0, "grad_norm": 1.5805766651673059, "language_loss": 0.80271512, "learning_rate": 2.5906355844629024e-06, "loss": 0.88001454, "num_input_tokens_seen": 150842070, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.12994385, "step": 7026, "time_per_iteration": 2.5835537910461426 }, { "auxiliary_loss_clip": 0.06359783, "auxiliary_loss_mlp": 0.01255716, "balance_loss_clip": 0.06285262, "balance_loss_mlp": 0.0125307, "epoch": 0.42248609649782054, "flos": 62866307750400.0, "grad_norm": 0.7083054723353507, "language_loss": 0.61878157, "learning_rate": 2.5902634819346966e-06, "loss": 0.69493657, "num_input_tokens_seen": 150907450, "router_z_loss_clip": 0.74316406, "router_z_loss_mlp": 0.02648926, "step": 7027, "time_per_iteration": 3.265181064605713 }, { "auxiliary_loss_clip": 0.06460655, "auxiliary_loss_mlp": 0.01273305, "balance_loss_clip": 0.06293811, "balance_loss_mlp": 0.01259459, "epoch": 0.4225462197504885, "flos": 26257126337280.0, "grad_norm": 3.393776746925224, "language_loss": 0.71187943, "learning_rate": 2.5898913570236414e-06, "loss": 0.78921902, "num_input_tokens_seen": 150928040, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.13842773, "step": 7028, "time_per_iteration": 2.6241748332977295 }, { "auxiliary_loss_clip": 0.06467355, "auxiliary_loss_mlp": 0.01276486, "balance_loss_clip": 0.06294262, "balance_loss_mlp": 0.01261817, "epoch": 0.42260634300315647, "flos": 20528209232640.0, "grad_norm": 1.78728532163544, "language_loss": 0.82815975, "learning_rate": 2.589519209743846e-06, "loss": 0.90559816, "num_input_tokens_seen": 150945760, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.14660645, "step": 7029, "time_per_iteration": 2.571432113647461 }, { "auxiliary_loss_clip": 0.06468042, "auxiliary_loss_mlp": 0.01277557, "balance_loss_clip": 0.0629135, "balance_loss_mlp": 0.01262673, "epoch": 0.42266646625582444, "flos": 24323676900480.0, "grad_norm": 5.09264828703895, "language_loss": 0.74852693, "learning_rate": 2.589147040109424e-06, "loss": 0.82598293, "num_input_tokens_seen": 150965665, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.14904785, "step": 7030, "time_per_iteration": 2.616318941116333 }, { "auxiliary_loss_clip": 0.06463057, "auxiliary_loss_mlp": 0.01276061, "balance_loss_clip": 0.06292552, "balance_loss_mlp": 0.01261243, "epoch": 0.4227265895084924, "flos": 24210555488640.0, "grad_norm": 2.2698703334840413, "language_loss": 0.87190521, "learning_rate": 2.588774848134486e-06, "loss": 0.94929641, "num_input_tokens_seen": 150982260, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.14831543, "step": 7031, "time_per_iteration": 2.583832025527954 }, { "auxiliary_loss_clip": 0.06455972, "auxiliary_loss_mlp": 0.01278403, "balance_loss_clip": 0.06285501, "balance_loss_mlp": 0.01263657, "epoch": 0.42278671276116037, "flos": 16915407465600.0, "grad_norm": 1.9875312849593465, "language_loss": 0.73534799, "learning_rate": 2.5884026338331473e-06, "loss": 0.81269169, "num_input_tokens_seen": 150999990, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.14758301, "step": 7032, "time_per_iteration": 2.5438156127929688 }, { "auxiliary_loss_clip": 0.06465049, "auxiliary_loss_mlp": 0.01281005, "balance_loss_clip": 0.06291866, "balance_loss_mlp": 0.01266527, "epoch": 0.42284683601382833, "flos": 25418162931840.0, "grad_norm": 1.7323198867168403, "language_loss": 0.70693707, "learning_rate": 2.5880303972195222e-06, "loss": 0.7843976, "num_input_tokens_seen": 151021105, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.14483643, "step": 7033, "time_per_iteration": 2.6019277572631836 }, { "auxiliary_loss_clip": 0.06463115, "auxiliary_loss_mlp": 0.01276449, "balance_loss_clip": 0.06291395, "balance_loss_mlp": 0.01261936, "epoch": 0.4229069592664963, "flos": 23047153873920.0, "grad_norm": 1.8897131311753992, "language_loss": 0.90903544, "learning_rate": 2.5876581383077256e-06, "loss": 0.98643112, "num_input_tokens_seen": 151040665, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.14520264, "step": 7034, "time_per_iteration": 2.5819592475891113 }, { "auxiliary_loss_clip": 0.06453172, "auxiliary_loss_mlp": 0.01276369, "balance_loss_clip": 0.06284313, "balance_loss_mlp": 0.01263071, "epoch": 0.42296708251916426, "flos": 26074586217600.0, "grad_norm": 1.67491839703738, "language_loss": 0.77443534, "learning_rate": 2.5872858571118723e-06, "loss": 0.85173076, "num_input_tokens_seen": 151061240, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.13299561, "step": 7035, "time_per_iteration": 2.5921449661254883 }, { "auxiliary_loss_clip": 0.06464391, "auxiliary_loss_mlp": 0.01278135, "balance_loss_clip": 0.06289846, "balance_loss_mlp": 0.01263388, "epoch": 0.4230272057718323, "flos": 19463548055040.0, "grad_norm": 1.9170677971127694, "language_loss": 0.82919312, "learning_rate": 2.5869135536460817e-06, "loss": 0.90661836, "num_input_tokens_seen": 151076870, "router_z_loss_clip": 1.74414062, "router_z_loss_mlp": 0.1473999, "step": 7036, "time_per_iteration": 2.5422475337982178 }, { "auxiliary_loss_clip": 0.06462474, "auxiliary_loss_mlp": 0.01271575, "balance_loss_clip": 0.06296109, "balance_loss_mlp": 0.01258045, "epoch": 0.42308732902450025, "flos": 22389975901440.0, "grad_norm": 1.6500279154596387, "language_loss": 0.70356786, "learning_rate": 2.58654122792447e-06, "loss": 0.78090835, "num_input_tokens_seen": 151095110, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.13537598, "step": 7037, "time_per_iteration": 2.586318254470825 }, { "auxiliary_loss_clip": 0.06459614, "auxiliary_loss_mlp": 0.01275558, "balance_loss_clip": 0.06288726, "balance_loss_mlp": 0.01260299, "epoch": 0.4231474522771682, "flos": 21001631201280.0, "grad_norm": 1.5185105916661028, "language_loss": 0.78123349, "learning_rate": 2.586168879961155e-06, "loss": 0.85858518, "num_input_tokens_seen": 151114355, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.15270996, "step": 7038, "time_per_iteration": 2.553647756576538 }, { "auxiliary_loss_clip": 0.06466246, "auxiliary_loss_mlp": 0.01275206, "balance_loss_clip": 0.0629134, "balance_loss_mlp": 0.01259447, "epoch": 0.4232075755298362, "flos": 14981161415040.0, "grad_norm": 3.118531735719529, "language_loss": 0.67324042, "learning_rate": 2.585796509770259e-06, "loss": 0.75065494, "num_input_tokens_seen": 151131505, "router_z_loss_clip": 1.74804688, "router_z_loss_mlp": 0.15771484, "step": 7039, "time_per_iteration": 2.536659002304077 }, { "auxiliary_loss_clip": 0.06473087, "auxiliary_loss_mlp": 0.0127926, "balance_loss_clip": 0.06294189, "balance_loss_mlp": 0.01263846, "epoch": 0.42326769878250414, "flos": 24539144474880.0, "grad_norm": 1.5768081020929106, "language_loss": 0.75907385, "learning_rate": 2.5854241173658996e-06, "loss": 0.83659732, "num_input_tokens_seen": 151151555, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.15429688, "step": 7040, "time_per_iteration": 3.9945333003997803 }, { "auxiliary_loss_clip": 0.0646108, "auxiliary_loss_mlp": 0.01272111, "balance_loss_clip": 0.06289241, "balance_loss_mlp": 0.01257568, "epoch": 0.4233278220351721, "flos": 26877603421440.0, "grad_norm": 1.7035542253290656, "language_loss": 0.66035807, "learning_rate": 2.5850517027621996e-06, "loss": 0.73769003, "num_input_tokens_seen": 151172385, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.14550781, "step": 7041, "time_per_iteration": 2.6075751781463623 }, { "auxiliary_loss_clip": 0.06468068, "auxiliary_loss_mlp": 0.01272363, "balance_loss_clip": 0.06293348, "balance_loss_mlp": 0.01257789, "epoch": 0.4233879452878401, "flos": 42824951867520.0, "grad_norm": 1.8844500948279648, "language_loss": 0.74399203, "learning_rate": 2.5846792659732803e-06, "loss": 0.82139635, "num_input_tokens_seen": 151194930, "router_z_loss_clip": 1.74511719, "router_z_loss_mlp": 0.14556885, "step": 7042, "time_per_iteration": 2.73494029045105 }, { "auxiliary_loss_clip": 0.06463568, "auxiliary_loss_mlp": 0.01270376, "balance_loss_clip": 0.06295674, "balance_loss_mlp": 0.0125647, "epoch": 0.42344806854050804, "flos": 25236125936640.0, "grad_norm": 1.2471480334353886, "language_loss": 0.8245455, "learning_rate": 2.5843068070132643e-06, "loss": 0.90188497, "num_input_tokens_seen": 151217905, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.13916016, "step": 7043, "time_per_iteration": 2.6212282180786133 }, { "auxiliary_loss_clip": 0.06470105, "auxiliary_loss_mlp": 0.01275149, "balance_loss_clip": 0.06302101, "balance_loss_mlp": 0.01259729, "epoch": 0.423508191793176, "flos": 22784587505280.0, "grad_norm": 2.1474332636923066, "language_loss": 0.65839016, "learning_rate": 2.5839343258962763e-06, "loss": 0.73584265, "num_input_tokens_seen": 151234580, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.15423584, "step": 7044, "time_per_iteration": 2.5407614707946777 }, { "auxiliary_loss_clip": 0.06474494, "auxiliary_loss_mlp": 0.01273609, "balance_loss_clip": 0.0629947, "balance_loss_mlp": 0.01258994, "epoch": 0.42356831504584397, "flos": 34645376799360.0, "grad_norm": 6.353030002060416, "language_loss": 0.75617254, "learning_rate": 2.5835618226364393e-06, "loss": 0.83365363, "num_input_tokens_seen": 151254765, "router_z_loss_clip": 1.74804688, "router_z_loss_mlp": 0.1461792, "step": 7045, "time_per_iteration": 4.140377759933472 }, { "auxiliary_loss_clip": 0.06468805, "auxiliary_loss_mlp": 0.01276711, "balance_loss_clip": 0.06301504, "balance_loss_mlp": 0.0126243, "epoch": 0.42362843829851193, "flos": 17601487896960.0, "grad_norm": 2.08245995557683, "language_loss": 0.81280768, "learning_rate": 2.5831892972478797e-06, "loss": 0.89026284, "num_input_tokens_seen": 151269045, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.14294434, "step": 7046, "time_per_iteration": 2.533665180206299 }, { "auxiliary_loss_clip": 0.06475452, "auxiliary_loss_mlp": 0.01270663, "balance_loss_clip": 0.06301528, "balance_loss_mlp": 0.01256555, "epoch": 0.4236885615511799, "flos": 22572390240000.0, "grad_norm": 1.792407164250789, "language_loss": 0.77440083, "learning_rate": 2.5828167497447242e-06, "loss": 0.85186195, "num_input_tokens_seen": 151287530, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.14105225, "step": 7047, "time_per_iteration": 2.5712530612945557 }, { "auxiliary_loss_clip": 0.06465186, "auxiliary_loss_mlp": 0.01274378, "balance_loss_clip": 0.06298791, "balance_loss_mlp": 0.01260311, "epoch": 0.42374868480384786, "flos": 26476493126400.0, "grad_norm": 1.7984222003537225, "language_loss": 0.68121052, "learning_rate": 2.582444180141098e-06, "loss": 0.75860614, "num_input_tokens_seen": 151308905, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.140625, "step": 7048, "time_per_iteration": 2.6315340995788574 }, { "auxiliary_loss_clip": 0.06469721, "auxiliary_loss_mlp": 0.0127132, "balance_loss_clip": 0.06299935, "balance_loss_mlp": 0.0125602, "epoch": 0.4238088080565159, "flos": 20375493966720.0, "grad_norm": 1.6483238935306435, "language_loss": 0.77802193, "learning_rate": 2.5820715884511307e-06, "loss": 0.85543233, "num_input_tokens_seen": 151326525, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.15301514, "step": 7049, "time_per_iteration": 2.5742533206939697 }, { "auxiliary_loss_clip": 0.06473655, "auxiliary_loss_mlp": 0.01272384, "balance_loss_clip": 0.06300765, "balance_loss_mlp": 0.01257823, "epoch": 0.42386893130918385, "flos": 21177379140480.0, "grad_norm": 2.1442317965971536, "language_loss": 0.83482391, "learning_rate": 2.5816989746889504e-06, "loss": 0.91228431, "num_input_tokens_seen": 151344675, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.14575195, "step": 7050, "time_per_iteration": 2.5596399307250977 }, { "auxiliary_loss_clip": 0.0646179, "auxiliary_loss_mlp": 0.01273374, "balance_loss_clip": 0.06289489, "balance_loss_mlp": 0.01259034, "epoch": 0.4239290545618518, "flos": 17681346437760.0, "grad_norm": 2.195269021894881, "language_loss": 0.73138833, "learning_rate": 2.581326338868687e-06, "loss": 0.80874002, "num_input_tokens_seen": 151360730, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.14343262, "step": 7051, "time_per_iteration": 2.6431286334991455 }, { "auxiliary_loss_clip": 0.06467637, "auxiliary_loss_mlp": 0.0126909, "balance_loss_clip": 0.06297678, "balance_loss_mlp": 0.01254987, "epoch": 0.4239891778145198, "flos": 24321077424000.0, "grad_norm": 1.4996194394614835, "language_loss": 0.86664134, "learning_rate": 2.5809536810044706e-06, "loss": 0.94400859, "num_input_tokens_seen": 151380445, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.14099121, "step": 7052, "time_per_iteration": 2.605787754058838 }, { "auxiliary_loss_clip": 0.06468512, "auxiliary_loss_mlp": 0.01274452, "balance_loss_clip": 0.06296256, "balance_loss_mlp": 0.01260713, "epoch": 0.42404930106718774, "flos": 20564700485760.0, "grad_norm": 1.436185406735563, "language_loss": 0.72791493, "learning_rate": 2.5805810011104323e-06, "loss": 0.80534452, "num_input_tokens_seen": 151399325, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.13745117, "step": 7053, "time_per_iteration": 3.994854688644409 }, { "auxiliary_loss_clip": 0.06463981, "auxiliary_loss_mlp": 0.01270745, "balance_loss_clip": 0.06293459, "balance_loss_mlp": 0.01256494, "epoch": 0.4241094243198557, "flos": 22314351991680.0, "grad_norm": 1.6102764229930824, "language_loss": 0.82294118, "learning_rate": 2.580208299200704e-06, "loss": 0.90028846, "num_input_tokens_seen": 151417240, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.14245605, "step": 7054, "time_per_iteration": 2.5548036098480225 }, { "auxiliary_loss_clip": 0.06358986, "auxiliary_loss_mlp": 0.01278045, "balance_loss_clip": 0.06285129, "balance_loss_mlp": 0.01275248, "epoch": 0.4241695475725237, "flos": 70632445973760.0, "grad_norm": 0.7754624458115909, "language_loss": 0.60227263, "learning_rate": 2.5798355752894183e-06, "loss": 0.67864299, "num_input_tokens_seen": 151476015, "router_z_loss_clip": 0.73974609, "router_z_loss_mlp": 0.02796936, "step": 7055, "time_per_iteration": 4.5543372631073 }, { "auxiliary_loss_clip": 0.06467745, "auxiliary_loss_mlp": 0.0126864, "balance_loss_clip": 0.06294107, "balance_loss_mlp": 0.01254031, "epoch": 0.42422967082519164, "flos": 14032640396160.0, "grad_norm": 3.8130815864019008, "language_loss": 0.77145863, "learning_rate": 2.5794628293907107e-06, "loss": 0.84882247, "num_input_tokens_seen": 151492035, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.14593506, "step": 7056, "time_per_iteration": 2.539238452911377 }, { "auxiliary_loss_clip": 0.0647137, "auxiliary_loss_mlp": 0.01273737, "balance_loss_clip": 0.06295165, "balance_loss_mlp": 0.01257525, "epoch": 0.4242897940778596, "flos": 22351975274880.0, "grad_norm": 1.9510686089936626, "language_loss": 0.84134895, "learning_rate": 2.579090061518714e-06, "loss": 0.9188, "num_input_tokens_seen": 151508970, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.16210938, "step": 7057, "time_per_iteration": 2.571727991104126 }, { "auxiliary_loss_clip": 0.06463505, "auxiliary_loss_mlp": 0.01273125, "balance_loss_clip": 0.06288923, "balance_loss_mlp": 0.01258486, "epoch": 0.42434991733052757, "flos": 22601502334080.0, "grad_norm": 2.0690797184967495, "language_loss": 0.8354193, "learning_rate": 2.5787172716875642e-06, "loss": 0.91278559, "num_input_tokens_seen": 151525295, "router_z_loss_clip": 1.74414062, "router_z_loss_mlp": 0.1463623, "step": 7058, "time_per_iteration": 2.545975923538208 }, { "auxiliary_loss_clip": 0.06459084, "auxiliary_loss_mlp": 0.01269478, "balance_loss_clip": 0.06292607, "balance_loss_mlp": 0.01256305, "epoch": 0.42441004058319554, "flos": 20017667105280.0, "grad_norm": 1.598983006736148, "language_loss": 0.80683827, "learning_rate": 2.5783444599113973e-06, "loss": 0.88412386, "num_input_tokens_seen": 151544435, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.13165283, "step": 7059, "time_per_iteration": 2.5677998065948486 }, { "auxiliary_loss_clip": 0.0646307, "auxiliary_loss_mlp": 0.0127608, "balance_loss_clip": 0.06287944, "balance_loss_mlp": 0.0126007, "epoch": 0.4244701638358635, "flos": 11149663691520.0, "grad_norm": 3.3698609654529625, "language_loss": 0.70724928, "learning_rate": 2.57797162620435e-06, "loss": 0.78464079, "num_input_tokens_seen": 151559520, "router_z_loss_clip": 1.74902344, "router_z_loss_mlp": 0.16015625, "step": 7060, "time_per_iteration": 2.5299932956695557 }, { "auxiliary_loss_clip": 0.06463861, "auxiliary_loss_mlp": 0.01275998, "balance_loss_clip": 0.06291588, "balance_loss_mlp": 0.01261502, "epoch": 0.42453028708853147, "flos": 23994542862720.0, "grad_norm": 1.5733047202716712, "language_loss": 0.76174724, "learning_rate": 2.577598770580562e-06, "loss": 0.83914584, "num_input_tokens_seen": 151579790, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.14489746, "step": 7061, "time_per_iteration": 2.578035354614258 }, { "auxiliary_loss_clip": 0.06460621, "auxiliary_loss_mlp": 0.01273124, "balance_loss_clip": 0.06285669, "balance_loss_mlp": 0.01258568, "epoch": 0.42459041034119943, "flos": 18412345457280.0, "grad_norm": 2.3141539361713725, "language_loss": 0.73946738, "learning_rate": 2.5772258930541693e-06, "loss": 0.81680477, "num_input_tokens_seen": 151598285, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.14562988, "step": 7062, "time_per_iteration": 2.5397706031799316 }, { "auxiliary_loss_clip": 0.06456079, "auxiliary_loss_mlp": 0.01272581, "balance_loss_clip": 0.06284384, "balance_loss_mlp": 0.01257656, "epoch": 0.42465053359386745, "flos": 20964049845120.0, "grad_norm": 1.6309444287185113, "language_loss": 0.66688442, "learning_rate": 2.5768529936393137e-06, "loss": 0.74417102, "num_input_tokens_seen": 151615430, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.14929199, "step": 7063, "time_per_iteration": 2.5434813499450684 }, { "auxiliary_loss_clip": 0.06448789, "auxiliary_loss_mlp": 0.01269504, "balance_loss_clip": 0.06280891, "balance_loss_mlp": 0.01256104, "epoch": 0.4247106568465354, "flos": 33114001979520.0, "grad_norm": 1.540214242517479, "language_loss": 0.79204071, "learning_rate": 2.5764800723501354e-06, "loss": 0.86922365, "num_input_tokens_seen": 151637030, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.13427734, "step": 7064, "time_per_iteration": 2.675496816635132 }, { "auxiliary_loss_clip": 0.0646162, "auxiliary_loss_mlp": 0.01277156, "balance_loss_clip": 0.06287391, "balance_loss_mlp": 0.01261385, "epoch": 0.4247707800992034, "flos": 20052984401280.0, "grad_norm": 2.1259643174483016, "language_loss": 0.75722122, "learning_rate": 2.5761071292007736e-06, "loss": 0.83460897, "num_input_tokens_seen": 151655745, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.15771484, "step": 7065, "time_per_iteration": 2.56075382232666 }, { "auxiliary_loss_clip": 0.06458785, "auxiliary_loss_mlp": 0.01271829, "balance_loss_clip": 0.06288987, "balance_loss_mlp": 0.01257071, "epoch": 0.42483090335187135, "flos": 22392114180480.0, "grad_norm": 1.7133992519727137, "language_loss": 0.72438264, "learning_rate": 2.5757341642053725e-06, "loss": 0.80168885, "num_input_tokens_seen": 151678040, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.14758301, "step": 7066, "time_per_iteration": 2.597600221633911 }, { "auxiliary_loss_clip": 0.06459316, "auxiliary_loss_mlp": 0.01273032, "balance_loss_clip": 0.06286424, "balance_loss_mlp": 0.01257267, "epoch": 0.4248910266045393, "flos": 21362518736640.0, "grad_norm": 1.9136214785120031, "language_loss": 0.80732918, "learning_rate": 2.5753611773780745e-06, "loss": 0.88465267, "num_input_tokens_seen": 151696410, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.15765381, "step": 7067, "time_per_iteration": 2.545304536819458 }, { "auxiliary_loss_clip": 0.06345169, "auxiliary_loss_mlp": 0.01275975, "balance_loss_clip": 0.06272606, "balance_loss_mlp": 0.01273271, "epoch": 0.4249511498572073, "flos": 64026942180480.0, "grad_norm": 0.9114116903702437, "language_loss": 0.63377172, "learning_rate": 2.574988168733022e-06, "loss": 0.70998317, "num_input_tokens_seen": 151756365, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 0.02706909, "step": 7068, "time_per_iteration": 3.1044023036956787 }, { "auxiliary_loss_clip": 0.06459805, "auxiliary_loss_mlp": 0.01278029, "balance_loss_clip": 0.06286682, "balance_loss_mlp": 0.01262043, "epoch": 0.42501127310987524, "flos": 19612699522560.0, "grad_norm": 5.903611325091604, "language_loss": 0.72605205, "learning_rate": 2.574615138284361e-06, "loss": 0.80343038, "num_input_tokens_seen": 151775165, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.159729, "step": 7069, "time_per_iteration": 2.5449740886688232 }, { "auxiliary_loss_clip": 0.06463552, "auxiliary_loss_mlp": 0.01275889, "balance_loss_clip": 0.06291199, "balance_loss_mlp": 0.01259521, "epoch": 0.4250713963625432, "flos": 19468160029440.0, "grad_norm": 2.0482834884675136, "language_loss": 0.79589355, "learning_rate": 2.5742420860462364e-06, "loss": 0.87328798, "num_input_tokens_seen": 151792620, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.16357422, "step": 7070, "time_per_iteration": 2.6114728450775146 }, { "auxiliary_loss_clip": 0.06455657, "auxiliary_loss_mlp": 0.01277459, "balance_loss_clip": 0.06283677, "balance_loss_mlp": 0.01261164, "epoch": 0.4251315196152112, "flos": 25344719228160.0, "grad_norm": 1.734650551270787, "language_loss": 0.71332061, "learning_rate": 2.573869012032795e-06, "loss": 0.7906518, "num_input_tokens_seen": 151812850, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.16296387, "step": 7071, "time_per_iteration": 2.613185167312622 }, { "auxiliary_loss_clip": 0.06458299, "auxiliary_loss_mlp": 0.01278111, "balance_loss_clip": 0.06286554, "balance_loss_mlp": 0.01262197, "epoch": 0.42519164286787914, "flos": 26366348534400.0, "grad_norm": 2.0054265946624423, "language_loss": 0.70846617, "learning_rate": 2.5734959162581824e-06, "loss": 0.78583032, "num_input_tokens_seen": 151831785, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.15917969, "step": 7072, "time_per_iteration": 2.588710308074951 }, { "auxiliary_loss_clip": 0.06469184, "auxiliary_loss_mlp": 0.01273189, "balance_loss_clip": 0.06292643, "balance_loss_mlp": 0.01258258, "epoch": 0.4252517661205471, "flos": 26038220745600.0, "grad_norm": 1.5062962645615374, "language_loss": 0.81806171, "learning_rate": 2.5731227987365475e-06, "loss": 0.89548546, "num_input_tokens_seen": 151853885, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.14935303, "step": 7073, "time_per_iteration": 2.6267802715301514 }, { "auxiliary_loss_clip": 0.06459285, "auxiliary_loss_mlp": 0.01282952, "balance_loss_clip": 0.06291054, "balance_loss_mlp": 0.01268736, "epoch": 0.42531188937321507, "flos": 12718536013440.0, "grad_norm": 2.303105854224711, "language_loss": 0.91147625, "learning_rate": 2.5727496594820386e-06, "loss": 0.98889863, "num_input_tokens_seen": 151871780, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.14208984, "step": 7074, "time_per_iteration": 2.545750379562378 }, { "auxiliary_loss_clip": 0.06464087, "auxiliary_loss_mlp": 0.01282211, "balance_loss_clip": 0.0628971, "balance_loss_mlp": 0.01266308, "epoch": 0.42537201262588303, "flos": 22098339365760.0, "grad_norm": 1.6270026801995272, "language_loss": 0.64555001, "learning_rate": 2.572376498508805e-06, "loss": 0.72301292, "num_input_tokens_seen": 151891600, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.15905762, "step": 7075, "time_per_iteration": 2.61226224899292 }, { "auxiliary_loss_clip": 0.06448679, "auxiliary_loss_mlp": 0.01280469, "balance_loss_clip": 0.06282663, "balance_loss_mlp": 0.01265877, "epoch": 0.42543213587855105, "flos": 23009824080000.0, "grad_norm": 1.5167515034788406, "language_loss": 0.7388742, "learning_rate": 2.5720033158309973e-06, "loss": 0.81616569, "num_input_tokens_seen": 151911330, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.14581299, "step": 7076, "time_per_iteration": 2.589230537414551 }, { "auxiliary_loss_clip": 0.06455328, "auxiliary_loss_mlp": 0.012723, "balance_loss_clip": 0.06281462, "balance_loss_mlp": 0.01257697, "epoch": 0.425492259131219, "flos": 25089448164480.0, "grad_norm": 2.8253027592477653, "language_loss": 0.79760098, "learning_rate": 2.571630111462766e-06, "loss": 0.87487727, "num_input_tokens_seen": 151930355, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.14599609, "step": 7077, "time_per_iteration": 2.595348596572876 }, { "auxiliary_loss_clip": 0.06446767, "auxiliary_loss_mlp": 0.01273924, "balance_loss_clip": 0.06284414, "balance_loss_mlp": 0.01259989, "epoch": 0.425552382383887, "flos": 22822881621120.0, "grad_norm": 2.3734433861260187, "language_loss": 0.73196048, "learning_rate": 2.571256885418265e-06, "loss": 0.80916739, "num_input_tokens_seen": 151949695, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.1394043, "step": 7078, "time_per_iteration": 2.560576915740967 }, { "auxiliary_loss_clip": 0.06451847, "auxiliary_loss_mlp": 0.01275806, "balance_loss_clip": 0.06287368, "balance_loss_mlp": 0.01262401, "epoch": 0.42561250563655495, "flos": 13558757230080.0, "grad_norm": 1.6513163338678065, "language_loss": 0.80608487, "learning_rate": 2.5708836377116445e-06, "loss": 0.88336134, "num_input_tokens_seen": 151967640, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.13421631, "step": 7079, "time_per_iteration": 2.5430431365966797 }, { "auxiliary_loss_clip": 0.06452845, "auxiliary_loss_mlp": 0.01281581, "balance_loss_clip": 0.06285227, "balance_loss_mlp": 0.01267687, "epoch": 0.4256726288892229, "flos": 46989692478720.0, "grad_norm": 1.408788069324693, "language_loss": 0.72499335, "learning_rate": 2.5705103683570592e-06, "loss": 0.80233759, "num_input_tokens_seen": 151994020, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.13916016, "step": 7080, "time_per_iteration": 4.2442309856414795 }, { "auxiliary_loss_clip": 0.06449738, "auxiliary_loss_mlp": 0.01273675, "balance_loss_clip": 0.06281861, "balance_loss_mlp": 0.01260276, "epoch": 0.4257327521418909, "flos": 23593181005440.0, "grad_norm": 1.9839528899024286, "language_loss": 0.79936731, "learning_rate": 2.5701370773686646e-06, "loss": 0.87660146, "num_input_tokens_seen": 152013415, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.13397217, "step": 7081, "time_per_iteration": 2.629777669906616 }, { "auxiliary_loss_clip": 0.06437273, "auxiliary_loss_mlp": 0.01274058, "balance_loss_clip": 0.06276011, "balance_loss_mlp": 0.01260599, "epoch": 0.42579287539455885, "flos": 18996079726080.0, "grad_norm": 1.6468594945731947, "language_loss": 0.81871033, "learning_rate": 2.5697637647606138e-06, "loss": 0.8958236, "num_input_tokens_seen": 152030860, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.13464355, "step": 7082, "time_per_iteration": 2.5370166301727295 }, { "auxiliary_loss_clip": 0.06452292, "auxiliary_loss_mlp": 0.01276605, "balance_loss_clip": 0.06282558, "balance_loss_mlp": 0.01261799, "epoch": 0.4258529986472268, "flos": 25198921923840.0, "grad_norm": 1.7662358346352134, "language_loss": 0.703578, "learning_rate": 2.569390430547065e-06, "loss": 0.78086698, "num_input_tokens_seen": 152050395, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.14807129, "step": 7083, "time_per_iteration": 2.6386828422546387 }, { "auxiliary_loss_clip": 0.06346714, "auxiliary_loss_mlp": 0.01265137, "balance_loss_clip": 0.06273704, "balance_loss_mlp": 0.01262534, "epoch": 0.4259131218998948, "flos": 69990277881600.0, "grad_norm": 0.8611657476207957, "language_loss": 0.67026627, "learning_rate": 2.569017074742173e-06, "loss": 0.7463848, "num_input_tokens_seen": 152113555, "router_z_loss_clip": 0.73046875, "router_z_loss_mlp": 0.02604675, "step": 7084, "time_per_iteration": 3.265805244445801 }, { "auxiliary_loss_clip": 0.06446017, "auxiliary_loss_mlp": 0.01282492, "balance_loss_clip": 0.06279325, "balance_loss_mlp": 0.01267698, "epoch": 0.42597324515256274, "flos": 18010899745920.0, "grad_norm": 1.8636684610116543, "language_loss": 0.78961277, "learning_rate": 2.5686436973600964e-06, "loss": 0.86689782, "num_input_tokens_seen": 152131575, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.14788818, "step": 7085, "time_per_iteration": 3.996978759765625 }, { "auxiliary_loss_clip": 0.06461209, "auxiliary_loss_mlp": 0.01277404, "balance_loss_clip": 0.06286341, "balance_loss_mlp": 0.01262061, "epoch": 0.4260333684052307, "flos": 15164204659200.0, "grad_norm": 1.9749783305915318, "language_loss": 0.76513004, "learning_rate": 2.568270298414995e-06, "loss": 0.84251612, "num_input_tokens_seen": 152149435, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.15338135, "step": 7086, "time_per_iteration": 2.5606024265289307 }, { "auxiliary_loss_clip": 0.0644457, "auxiliary_loss_mlp": 0.01276525, "balance_loss_clip": 0.06275928, "balance_loss_mlp": 0.01261451, "epoch": 0.42609349165789867, "flos": 14944628234880.0, "grad_norm": 2.083826487033001, "language_loss": 0.80504632, "learning_rate": 2.5678968779210255e-06, "loss": 0.88225728, "num_input_tokens_seen": 152166860, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.15075684, "step": 7087, "time_per_iteration": 2.579137086868286 }, { "auxiliary_loss_clip": 0.06449416, "auxiliary_loss_mlp": 0.01271926, "balance_loss_clip": 0.06280968, "balance_loss_mlp": 0.01258116, "epoch": 0.42615361491056664, "flos": 23738642893440.0, "grad_norm": 1.531841819857943, "language_loss": 0.6646226, "learning_rate": 2.5675234358923505e-06, "loss": 0.74183595, "num_input_tokens_seen": 152187475, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.13818359, "step": 7088, "time_per_iteration": 2.6036572456359863 }, { "auxiliary_loss_clip": 0.06456889, "auxiliary_loss_mlp": 0.01272458, "balance_loss_clip": 0.06284033, "balance_loss_mlp": 0.01258034, "epoch": 0.42621373816323466, "flos": 24943399297920.0, "grad_norm": 1.8917512506311986, "language_loss": 0.68997413, "learning_rate": 2.56714997234313e-06, "loss": 0.76726758, "num_input_tokens_seen": 152207235, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.14422607, "step": 7089, "time_per_iteration": 2.6195762157440186 }, { "auxiliary_loss_clip": 0.06453022, "auxiliary_loss_mlp": 0.01270964, "balance_loss_clip": 0.06281212, "balance_loss_mlp": 0.01256409, "epoch": 0.4262738614159026, "flos": 13558044470400.0, "grad_norm": 2.5333510059961295, "language_loss": 0.73996854, "learning_rate": 2.566776487287525e-06, "loss": 0.81720841, "num_input_tokens_seen": 152224240, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.14556885, "step": 7090, "time_per_iteration": 2.5337588787078857 }, { "auxiliary_loss_clip": 0.06454138, "auxiliary_loss_mlp": 0.01274215, "balance_loss_clip": 0.06280714, "balance_loss_mlp": 0.01260041, "epoch": 0.4263339846685706, "flos": 29755926224640.0, "grad_norm": 2.126400510528295, "language_loss": 0.7540921, "learning_rate": 2.5664029807396994e-06, "loss": 0.8313756, "num_input_tokens_seen": 152242595, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.1416626, "step": 7091, "time_per_iteration": 2.6306629180908203 }, { "auxiliary_loss_clip": 0.0644491, "auxiliary_loss_mlp": 0.01270692, "balance_loss_clip": 0.06280437, "balance_loss_mlp": 0.01258259, "epoch": 0.42639410792123855, "flos": 16839406212480.0, "grad_norm": 2.088361567370709, "language_loss": 0.8273617, "learning_rate": 2.5660294527138156e-06, "loss": 0.90451777, "num_input_tokens_seen": 152260840, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.12420654, "step": 7092, "time_per_iteration": 2.5568721294403076 }, { "auxiliary_loss_clip": 0.06458519, "auxiliary_loss_mlp": 0.01276573, "balance_loss_clip": 0.06282453, "balance_loss_mlp": 0.01261887, "epoch": 0.4264542311739065, "flos": 28769991557760.0, "grad_norm": 1.645501698941552, "language_loss": 0.74067163, "learning_rate": 2.565655903224038e-06, "loss": 0.81802261, "num_input_tokens_seen": 152280580, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.14678955, "step": 7093, "time_per_iteration": 4.047697067260742 }, { "auxiliary_loss_clip": 0.06450871, "auxiliary_loss_mlp": 0.01272069, "balance_loss_clip": 0.06281757, "balance_loss_mlp": 0.01257215, "epoch": 0.4265143544265745, "flos": 24719881731840.0, "grad_norm": 2.8839000624261946, "language_loss": 0.70389068, "learning_rate": 2.565282332284532e-06, "loss": 0.78112012, "num_input_tokens_seen": 152298455, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.14868164, "step": 7094, "time_per_iteration": 4.02937912940979 }, { "auxiliary_loss_clip": 0.06451188, "auxiliary_loss_mlp": 0.01269675, "balance_loss_clip": 0.06280318, "balance_loss_mlp": 0.01255668, "epoch": 0.42657447767924245, "flos": 21871467636480.0, "grad_norm": 1.5172887254083123, "language_loss": 0.81537378, "learning_rate": 2.564908739909464e-06, "loss": 0.89258248, "num_input_tokens_seen": 152316995, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.14001465, "step": 7095, "time_per_iteration": 2.5911643505096436 }, { "auxiliary_loss_clip": 0.0645766, "auxiliary_loss_mlp": 0.01273317, "balance_loss_clip": 0.06284735, "balance_loss_mlp": 0.01258553, "epoch": 0.4266346009319104, "flos": 21476604470400.0, "grad_norm": 1.793655855553736, "language_loss": 0.81037027, "learning_rate": 2.5645351261129996e-06, "loss": 0.88768005, "num_input_tokens_seen": 152334800, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.14770508, "step": 7096, "time_per_iteration": 2.585883378982544 }, { "auxiliary_loss_clip": 0.06463121, "auxiliary_loss_mlp": 0.01272254, "balance_loss_clip": 0.06286399, "balance_loss_mlp": 0.01257657, "epoch": 0.4266947241845784, "flos": 25526295025920.0, "grad_norm": 2.4217452602170404, "language_loss": 0.6618427, "learning_rate": 2.5641614909093066e-06, "loss": 0.73919642, "num_input_tokens_seen": 152355175, "router_z_loss_clip": 1.76757812, "router_z_loss_mlp": 0.14593506, "step": 7097, "time_per_iteration": 2.6438345909118652 }, { "auxiliary_loss_clip": 0.06452517, "auxiliary_loss_mlp": 0.01268344, "balance_loss_clip": 0.06285714, "balance_loss_mlp": 0.01254295, "epoch": 0.42675484743724634, "flos": 26548343602560.0, "grad_norm": 1.6790027029727488, "language_loss": 0.74913692, "learning_rate": 2.5637878343125535e-06, "loss": 0.82634556, "num_input_tokens_seen": 152377245, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.14044189, "step": 7098, "time_per_iteration": 2.7411346435546875 }, { "auxiliary_loss_clip": 0.06452137, "auxiliary_loss_mlp": 0.01271599, "balance_loss_clip": 0.06287093, "balance_loss_mlp": 0.01257503, "epoch": 0.4268149706899143, "flos": 23119465547520.0, "grad_norm": 1.5763383894784422, "language_loss": 0.75219858, "learning_rate": 2.5634141563369086e-06, "loss": 0.82943594, "num_input_tokens_seen": 152396985, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.14093018, "step": 7099, "time_per_iteration": 2.60675048828125 }, { "auxiliary_loss_clip": 0.06459205, "auxiliary_loss_mlp": 0.01274658, "balance_loss_clip": 0.06285842, "balance_loss_mlp": 0.01259656, "epoch": 0.4268750939425823, "flos": 22712401612800.0, "grad_norm": 2.1155359704011993, "language_loss": 0.83033168, "learning_rate": 2.5630404569965432e-06, "loss": 0.90767026, "num_input_tokens_seen": 152415590, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.15008545, "step": 7100, "time_per_iteration": 2.5661978721618652 }, { "auxiliary_loss_clip": 0.06453805, "auxiliary_loss_mlp": 0.01271257, "balance_loss_clip": 0.06282, "balance_loss_mlp": 0.01257637, "epoch": 0.42693521719525024, "flos": 25382007095040.0, "grad_norm": 1.3492526528641489, "language_loss": 0.82649839, "learning_rate": 2.562666736305627e-06, "loss": 0.90374899, "num_input_tokens_seen": 152436735, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.13623047, "step": 7101, "time_per_iteration": 2.630589246749878 }, { "auxiliary_loss_clip": 0.06460276, "auxiliary_loss_mlp": 0.01270654, "balance_loss_clip": 0.06284872, "balance_loss_mlp": 0.01255437, "epoch": 0.42699534044791826, "flos": 18156613196160.0, "grad_norm": 1.7708495389182046, "language_loss": 0.73396885, "learning_rate": 2.5622929942783314e-06, "loss": 0.81127816, "num_input_tokens_seen": 152455685, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.15216064, "step": 7102, "time_per_iteration": 2.5459070205688477 }, { "auxiliary_loss_clip": 0.06451941, "auxiliary_loss_mlp": 0.01272239, "balance_loss_clip": 0.06286359, "balance_loss_mlp": 0.01258011, "epoch": 0.4270554637005862, "flos": 13703422504320.0, "grad_norm": 1.7314368738931902, "language_loss": 0.83769315, "learning_rate": 2.5619192309288297e-06, "loss": 0.91493493, "num_input_tokens_seen": 152473500, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.14227295, "step": 7103, "time_per_iteration": 2.5596892833709717 }, { "auxiliary_loss_clip": 0.06458875, "auxiliary_loss_mlp": 0.01271427, "balance_loss_clip": 0.06286207, "balance_loss_mlp": 0.01257134, "epoch": 0.4271155869532542, "flos": 17499351369600.0, "grad_norm": 2.378582809041811, "language_loss": 0.7408396, "learning_rate": 2.561545446271294e-06, "loss": 0.81814265, "num_input_tokens_seen": 152491320, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.14300537, "step": 7104, "time_per_iteration": 2.5356552600860596 }, { "auxiliary_loss_clip": 0.06452318, "auxiliary_loss_mlp": 0.01269427, "balance_loss_clip": 0.06282634, "balance_loss_mlp": 0.01255945, "epoch": 0.42717571020592215, "flos": 32460471659520.0, "grad_norm": 2.5425272088546986, "language_loss": 0.75464118, "learning_rate": 2.5611716403198987e-06, "loss": 0.83185863, "num_input_tokens_seen": 152511970, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.13476562, "step": 7105, "time_per_iteration": 2.662161350250244 }, { "auxiliary_loss_clip": 0.06454717, "auxiliary_loss_mlp": 0.01270927, "balance_loss_clip": 0.06284338, "balance_loss_mlp": 0.01257122, "epoch": 0.4272358334585901, "flos": 16258606836480.0, "grad_norm": 4.619999404104513, "language_loss": 0.77344126, "learning_rate": 2.560797813088819e-06, "loss": 0.8506977, "num_input_tokens_seen": 152530515, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.13806152, "step": 7106, "time_per_iteration": 2.540745258331299 }, { "auxiliary_loss_clip": 0.06455034, "auxiliary_loss_mlp": 0.01271364, "balance_loss_clip": 0.06284977, "balance_loss_mlp": 0.01257321, "epoch": 0.4272959567112581, "flos": 24205817733120.0, "grad_norm": 1.7992991121358954, "language_loss": 0.80542499, "learning_rate": 2.560423964592229e-06, "loss": 0.882689, "num_input_tokens_seen": 152549295, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.14044189, "step": 7107, "time_per_iteration": 2.621640205383301 }, { "auxiliary_loss_clip": 0.06448399, "auxiliary_loss_mlp": 0.01269944, "balance_loss_clip": 0.06281724, "balance_loss_mlp": 0.01255358, "epoch": 0.42735607996392605, "flos": 27970747787520.0, "grad_norm": 1.4551443111807092, "language_loss": 0.68354839, "learning_rate": 2.5600500948443075e-06, "loss": 0.76073182, "num_input_tokens_seen": 152570725, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.14575195, "step": 7108, "time_per_iteration": 2.635829448699951 }, { "auxiliary_loss_clip": 0.06455395, "auxiliary_loss_mlp": 0.01267368, "balance_loss_clip": 0.06286767, "balance_loss_mlp": 0.01253575, "epoch": 0.427416203216594, "flos": 20300582816640.0, "grad_norm": 1.6330514222821388, "language_loss": 0.72194803, "learning_rate": 2.5596762038592294e-06, "loss": 0.79917562, "num_input_tokens_seen": 152588950, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.13806152, "step": 7109, "time_per_iteration": 2.558168411254883 }, { "auxiliary_loss_clip": 0.06458129, "auxiliary_loss_mlp": 0.0127334, "balance_loss_clip": 0.06286802, "balance_loss_mlp": 0.01257915, "epoch": 0.427476326469262, "flos": 26951382541440.0, "grad_norm": 1.8329580963185703, "language_loss": 0.64783019, "learning_rate": 2.559302291651174e-06, "loss": 0.72514486, "num_input_tokens_seen": 152608965, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.1541748, "step": 7110, "time_per_iteration": 2.609429359436035 }, { "auxiliary_loss_clip": 0.06454937, "auxiliary_loss_mlp": 0.01268505, "balance_loss_clip": 0.06286496, "balance_loss_mlp": 0.01254808, "epoch": 0.42753644972192995, "flos": 25709967175680.0, "grad_norm": 1.7799784091982431, "language_loss": 0.76890856, "learning_rate": 2.5589283582343197e-06, "loss": 0.84614301, "num_input_tokens_seen": 152630220, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.13696289, "step": 7111, "time_per_iteration": 2.5949368476867676 }, { "auxiliary_loss_clip": 0.06454676, "auxiliary_loss_mlp": 0.01266463, "balance_loss_clip": 0.06282935, "balance_loss_mlp": 0.01253022, "epoch": 0.4275965729745979, "flos": 18772855649280.0, "grad_norm": 2.0527689879485513, "language_loss": 0.73846346, "learning_rate": 2.558554403622845e-06, "loss": 0.81567484, "num_input_tokens_seen": 152648835, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.13433838, "step": 7112, "time_per_iteration": 2.553748846054077 }, { "auxiliary_loss_clip": 0.06451234, "auxiliary_loss_mlp": 0.01270604, "balance_loss_clip": 0.06284502, "balance_loss_mlp": 0.01257449, "epoch": 0.4276566962272659, "flos": 23770438318080.0, "grad_norm": 1.5219194832621905, "language_loss": 0.71692753, "learning_rate": 2.5581804278309323e-06, "loss": 0.79414594, "num_input_tokens_seen": 152668375, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.1315918, "step": 7113, "time_per_iteration": 2.625553846359253 }, { "auxiliary_loss_clip": 0.06458975, "auxiliary_loss_mlp": 0.01273331, "balance_loss_clip": 0.06287298, "balance_loss_mlp": 0.0125961, "epoch": 0.42771681947993384, "flos": 22499156171520.0, "grad_norm": 1.5736072843795923, "language_loss": 0.61947179, "learning_rate": 2.5578064308727617e-06, "loss": 0.69679487, "num_input_tokens_seen": 152689725, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.13720703, "step": 7114, "time_per_iteration": 2.602226734161377 }, { "auxiliary_loss_clip": 0.06465556, "auxiliary_loss_mlp": 0.0127686, "balance_loss_clip": 0.06288894, "balance_loss_mlp": 0.0126079, "epoch": 0.42777694273260186, "flos": 25051489464960.0, "grad_norm": 1.708240218288264, "language_loss": 0.64909446, "learning_rate": 2.5574324127625153e-06, "loss": 0.72651863, "num_input_tokens_seen": 152709375, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.16064453, "step": 7115, "time_per_iteration": 2.6115405559539795 }, { "auxiliary_loss_clip": 0.06456447, "auxiliary_loss_mlp": 0.01270305, "balance_loss_clip": 0.06284533, "balance_loss_mlp": 0.01256936, "epoch": 0.4278370659852698, "flos": 18667532666880.0, "grad_norm": 2.2815111656226934, "language_loss": 0.73968387, "learning_rate": 2.5570583735143753e-06, "loss": 0.81695139, "num_input_tokens_seen": 152727510, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.1338501, "step": 7116, "time_per_iteration": 2.5532467365264893 }, { "auxiliary_loss_clip": 0.06447206, "auxiliary_loss_mlp": 0.01271818, "balance_loss_clip": 0.06283584, "balance_loss_mlp": 0.01258961, "epoch": 0.4278971892379378, "flos": 27315666167040.0, "grad_norm": 1.8916482920938353, "language_loss": 0.69681287, "learning_rate": 2.5566843131425275e-06, "loss": 0.77400315, "num_input_tokens_seen": 152746670, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.12860107, "step": 7117, "time_per_iteration": 2.6326489448547363 }, { "auxiliary_loss_clip": 0.06454094, "auxiliary_loss_mlp": 0.01274988, "balance_loss_clip": 0.06287086, "balance_loss_mlp": 0.01261029, "epoch": 0.42795731249060576, "flos": 12892397235840.0, "grad_norm": 2.300137533537503, "language_loss": 0.69628006, "learning_rate": 2.5563102316611536e-06, "loss": 0.7735709, "num_input_tokens_seen": 152760545, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.13952637, "step": 7118, "time_per_iteration": 2.682844638824463 }, { "auxiliary_loss_clip": 0.06458023, "auxiliary_loss_mlp": 0.01280207, "balance_loss_clip": 0.06291288, "balance_loss_mlp": 0.01265318, "epoch": 0.4280174357432737, "flos": 33409873146240.0, "grad_norm": 1.9802070430929977, "language_loss": 0.75174952, "learning_rate": 2.55593612908444e-06, "loss": 0.82913178, "num_input_tokens_seen": 152780970, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.14904785, "step": 7119, "time_per_iteration": 2.8101766109466553 }, { "auxiliary_loss_clip": 0.06450275, "auxiliary_loss_mlp": 0.01272459, "balance_loss_clip": 0.06283094, "balance_loss_mlp": 0.01258475, "epoch": 0.4280775589959417, "flos": 18264871071360.0, "grad_norm": 2.344674407998421, "language_loss": 0.75394475, "learning_rate": 2.555562005426573e-06, "loss": 0.83117205, "num_input_tokens_seen": 152798475, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.13995361, "step": 7120, "time_per_iteration": 4.007999420166016 }, { "auxiliary_loss_clip": 0.06462738, "auxiliary_loss_mlp": 0.01269819, "balance_loss_clip": 0.06294608, "balance_loss_mlp": 0.01256908, "epoch": 0.42813768224860965, "flos": 21477820354560.0, "grad_norm": 1.8627019667224245, "language_loss": 0.77332079, "learning_rate": 2.5551878607017385e-06, "loss": 0.85064626, "num_input_tokens_seen": 152817555, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.12921143, "step": 7121, "time_per_iteration": 2.580465793609619 }, { "auxiliary_loss_clip": 0.06450193, "auxiliary_loss_mlp": 0.01274003, "balance_loss_clip": 0.06285046, "balance_loss_mlp": 0.01261098, "epoch": 0.4281978055012776, "flos": 15674704859520.0, "grad_norm": 1.7482908843766956, "language_loss": 0.8596251, "learning_rate": 2.554813694924126e-06, "loss": 0.936867, "num_input_tokens_seen": 152836295, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.12902832, "step": 7122, "time_per_iteration": 2.5466175079345703 }, { "auxiliary_loss_clip": 0.06451032, "auxiliary_loss_mlp": 0.01272945, "balance_loss_clip": 0.0628666, "balance_loss_mlp": 0.01259129, "epoch": 0.4282579287539456, "flos": 17717711909760.0, "grad_norm": 1.7815170108954723, "language_loss": 0.81208181, "learning_rate": 2.554439508107921e-06, "loss": 0.88932157, "num_input_tokens_seen": 152854950, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.13812256, "step": 7123, "time_per_iteration": 2.6205804347991943 }, { "auxiliary_loss_clip": 0.06450036, "auxiliary_loss_mlp": 0.01273768, "balance_loss_clip": 0.06285107, "balance_loss_mlp": 0.01260071, "epoch": 0.42831805200661355, "flos": 19287171210240.0, "grad_norm": 1.4900285683378396, "language_loss": 0.81080198, "learning_rate": 2.5540653002673153e-06, "loss": 0.88804007, "num_input_tokens_seen": 152873995, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.13690186, "step": 7124, "time_per_iteration": 3.976513624191284 }, { "auxiliary_loss_clip": 0.06455076, "auxiliary_loss_mlp": 0.01270331, "balance_loss_clip": 0.06287417, "balance_loss_mlp": 0.01255328, "epoch": 0.4283781752592815, "flos": 19798845367680.0, "grad_norm": 2.4812981451490077, "language_loss": 0.80983937, "learning_rate": 2.553691071416498e-06, "loss": 0.88709342, "num_input_tokens_seen": 152892925, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.15008545, "step": 7125, "time_per_iteration": 2.596513271331787 }, { "auxiliary_loss_clip": 0.06453192, "auxiliary_loss_mlp": 0.01270959, "balance_loss_clip": 0.06287359, "balance_loss_mlp": 0.0125747, "epoch": 0.4284382985119495, "flos": 16513584410880.0, "grad_norm": 1.764255431103595, "language_loss": 0.75359023, "learning_rate": 2.553316821569659e-06, "loss": 0.83083177, "num_input_tokens_seen": 152910935, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.13476562, "step": 7126, "time_per_iteration": 2.547595262527466 }, { "auxiliary_loss_clip": 0.06458082, "auxiliary_loss_mlp": 0.01268425, "balance_loss_clip": 0.06287661, "balance_loss_mlp": 0.01254835, "epoch": 0.42849842176461744, "flos": 23337406817280.0, "grad_norm": 1.6737152849599537, "language_loss": 0.82058072, "learning_rate": 2.5529425507409913e-06, "loss": 0.89784575, "num_input_tokens_seen": 152931030, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.13568115, "step": 7127, "time_per_iteration": 2.653775691986084 }, { "auxiliary_loss_clip": 0.06459348, "auxiliary_loss_mlp": 0.01270639, "balance_loss_clip": 0.06290862, "balance_loss_mlp": 0.01256227, "epoch": 0.4285585450172854, "flos": 17280110361600.0, "grad_norm": 1.7907737403566153, "language_loss": 0.76586336, "learning_rate": 2.5525682589446867e-06, "loss": 0.84316319, "num_input_tokens_seen": 152948085, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.14404297, "step": 7128, "time_per_iteration": 2.55658221244812 }, { "auxiliary_loss_clip": 0.06462109, "auxiliary_loss_mlp": 0.01273728, "balance_loss_clip": 0.06290545, "balance_loss_mlp": 0.01259149, "epoch": 0.42861866826995343, "flos": 24286430960640.0, "grad_norm": 1.8556794883728864, "language_loss": 0.7460866, "learning_rate": 2.552193946194937e-06, "loss": 0.82344502, "num_input_tokens_seen": 152966265, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.14556885, "step": 7129, "time_per_iteration": 2.616910934448242 }, { "auxiliary_loss_clip": 0.06462087, "auxiliary_loss_mlp": 0.01276274, "balance_loss_clip": 0.06293374, "balance_loss_mlp": 0.01261921, "epoch": 0.4286787915226214, "flos": 24360042372480.0, "grad_norm": 1.6658321581418756, "language_loss": 0.78704298, "learning_rate": 2.5518196125059394e-06, "loss": 0.86442655, "num_input_tokens_seen": 152986775, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.14337158, "step": 7130, "time_per_iteration": 2.6303768157958984 }, { "auxiliary_loss_clip": 0.06462224, "auxiliary_loss_mlp": 0.01273548, "balance_loss_clip": 0.06291837, "balance_loss_mlp": 0.01258671, "epoch": 0.42873891477528936, "flos": 15455338070400.0, "grad_norm": 2.2252304290999705, "language_loss": 0.74336267, "learning_rate": 2.551445257891886e-06, "loss": 0.82072037, "num_input_tokens_seen": 153003595, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.14886475, "step": 7131, "time_per_iteration": 2.6533477306365967 }, { "auxiliary_loss_clip": 0.06457072, "auxiliary_loss_mlp": 0.01278063, "balance_loss_clip": 0.06288329, "balance_loss_mlp": 0.01263191, "epoch": 0.4287990380279573, "flos": 17645358309120.0, "grad_norm": 2.0040347063902186, "language_loss": 0.77367747, "learning_rate": 2.551070882366973e-06, "loss": 0.85102874, "num_input_tokens_seen": 153021960, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.14874268, "step": 7132, "time_per_iteration": 2.5488102436065674 }, { "auxiliary_loss_clip": 0.06463151, "auxiliary_loss_mlp": 0.01268596, "balance_loss_clip": 0.06293237, "balance_loss_mlp": 0.01254231, "epoch": 0.4288591612806253, "flos": 27169701154560.0, "grad_norm": 1.5481176636344935, "language_loss": 0.78748107, "learning_rate": 2.550696485945397e-06, "loss": 0.86479855, "num_input_tokens_seen": 153042110, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.14361572, "step": 7133, "time_per_iteration": 4.068859815597534 }, { "auxiliary_loss_clip": 0.06457758, "auxiliary_loss_mlp": 0.01271287, "balance_loss_clip": 0.06288844, "balance_loss_mlp": 0.01257607, "epoch": 0.42891928453329325, "flos": 17168540250240.0, "grad_norm": 1.8860882334239184, "language_loss": 0.75096762, "learning_rate": 2.550322068641355e-06, "loss": 0.8282581, "num_input_tokens_seen": 153058925, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.13684082, "step": 7134, "time_per_iteration": 3.984539270401001 }, { "auxiliary_loss_clip": 0.0645749, "auxiliary_loss_mlp": 0.01271461, "balance_loss_clip": 0.06292084, "balance_loss_mlp": 0.01257556, "epoch": 0.4289794077859612, "flos": 18192936741120.0, "grad_norm": 1.7859818740999882, "language_loss": 0.84085625, "learning_rate": 2.5499476304690455e-06, "loss": 0.91814578, "num_input_tokens_seen": 153078070, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.13909912, "step": 7135, "time_per_iteration": 2.5741987228393555 }, { "auxiliary_loss_clip": 0.06455537, "auxiliary_loss_mlp": 0.01266982, "balance_loss_clip": 0.06291157, "balance_loss_mlp": 0.01252677, "epoch": 0.4290395310386292, "flos": 28264438748160.0, "grad_norm": 2.9128213112291155, "language_loss": 0.7555815, "learning_rate": 2.549573171442666e-06, "loss": 0.83280671, "num_input_tokens_seen": 153096680, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.14306641, "step": 7136, "time_per_iteration": 2.6153721809387207 }, { "auxiliary_loss_clip": 0.06464063, "auxiliary_loss_mlp": 0.01275238, "balance_loss_clip": 0.06292417, "balance_loss_mlp": 0.01260921, "epoch": 0.42909965429129715, "flos": 16221528604800.0, "grad_norm": 1.8651763035722049, "language_loss": 0.79487574, "learning_rate": 2.5491986915764175e-06, "loss": 0.87226874, "num_input_tokens_seen": 153113305, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.14306641, "step": 7137, "time_per_iteration": 2.6096506118774414 }, { "auxiliary_loss_clip": 0.06464596, "auxiliary_loss_mlp": 0.01269544, "balance_loss_clip": 0.06295201, "balance_loss_mlp": 0.01255269, "epoch": 0.4291597775439651, "flos": 23119633255680.0, "grad_norm": 1.7587829825757098, "language_loss": 0.76458573, "learning_rate": 2.548824190884499e-06, "loss": 0.84192711, "num_input_tokens_seen": 153132735, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.14276123, "step": 7138, "time_per_iteration": 2.5906763076782227 }, { "auxiliary_loss_clip": 0.06351048, "auxiliary_loss_mlp": 0.01253831, "balance_loss_clip": 0.06278536, "balance_loss_mlp": 0.01251671, "epoch": 0.4292199007966331, "flos": 67565461703040.0, "grad_norm": 0.7525280016999596, "language_loss": 0.56104124, "learning_rate": 2.548449669381113e-06, "loss": 0.63709009, "num_input_tokens_seen": 153187925, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 0.0216217, "step": 7139, "time_per_iteration": 3.1039183139801025 }, { "auxiliary_loss_clip": 0.06451719, "auxiliary_loss_mlp": 0.01269167, "balance_loss_clip": 0.06290165, "balance_loss_mlp": 0.01257163, "epoch": 0.42928002404930105, "flos": 23006008719360.0, "grad_norm": 1.5664317522224143, "language_loss": 0.81587154, "learning_rate": 2.5480751270804595e-06, "loss": 0.89308041, "num_input_tokens_seen": 153206990, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.12011719, "step": 7140, "time_per_iteration": 2.6558775901794434 }, { "auxiliary_loss_clip": 0.06463359, "auxiliary_loss_mlp": 0.0126787, "balance_loss_clip": 0.06294499, "balance_loss_mlp": 0.01254155, "epoch": 0.429340147301969, "flos": 11549432321280.0, "grad_norm": 1.8778334478938516, "language_loss": 0.82392192, "learning_rate": 2.5477005639967424e-06, "loss": 0.90123421, "num_input_tokens_seen": 153222345, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.137146, "step": 7141, "time_per_iteration": 2.5959677696228027 }, { "auxiliary_loss_clip": 0.06468855, "auxiliary_loss_mlp": 0.01274492, "balance_loss_clip": 0.06296893, "balance_loss_mlp": 0.01260038, "epoch": 0.42940027055463703, "flos": 25272030211200.0, "grad_norm": 1.642566878120891, "language_loss": 0.86778432, "learning_rate": 2.547325980144166e-06, "loss": 0.94521785, "num_input_tokens_seen": 153240570, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.14447021, "step": 7142, "time_per_iteration": 2.6502959728240967 }, { "auxiliary_loss_clip": 0.06460492, "auxiliary_loss_mlp": 0.01267498, "balance_loss_clip": 0.06298391, "balance_loss_mlp": 0.01254892, "epoch": 0.429460393807305, "flos": 23811709253760.0, "grad_norm": 2.2933494672447705, "language_loss": 0.78466129, "learning_rate": 2.5469513755369323e-06, "loss": 0.86194122, "num_input_tokens_seen": 153259575, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.12609863, "step": 7143, "time_per_iteration": 2.703368902206421 }, { "auxiliary_loss_clip": 0.0646786, "auxiliary_loss_mlp": 0.01273839, "balance_loss_clip": 0.06300328, "balance_loss_mlp": 0.01260696, "epoch": 0.42952051705997296, "flos": 13923502053120.0, "grad_norm": 2.0180375745357937, "language_loss": 0.77115637, "learning_rate": 2.5465767501892484e-06, "loss": 0.84857339, "num_input_tokens_seen": 153276650, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.13153076, "step": 7144, "time_per_iteration": 2.6314845085144043 }, { "auxiliary_loss_clip": 0.06465548, "auxiliary_loss_mlp": 0.01270492, "balance_loss_clip": 0.06295897, "balance_loss_mlp": 0.01257051, "epoch": 0.4295806403126409, "flos": 26767584610560.0, "grad_norm": 1.5740467678957608, "language_loss": 0.74631774, "learning_rate": 2.54620210411532e-06, "loss": 0.8236782, "num_input_tokens_seen": 153298025, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.13439941, "step": 7145, "time_per_iteration": 2.7262914180755615 }, { "auxiliary_loss_clip": 0.06466029, "auxiliary_loss_mlp": 0.01272304, "balance_loss_clip": 0.06295168, "balance_loss_mlp": 0.01257677, "epoch": 0.4296407635653089, "flos": 20957760789120.0, "grad_norm": 2.1348050921621007, "language_loss": 0.79218543, "learning_rate": 2.545827437329352e-06, "loss": 0.86956882, "num_input_tokens_seen": 153315775, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.1463623, "step": 7146, "time_per_iteration": 2.5932490825653076 }, { "auxiliary_loss_clip": 0.06460205, "auxiliary_loss_mlp": 0.01266124, "balance_loss_clip": 0.06294981, "balance_loss_mlp": 0.0125372, "epoch": 0.42970088681797686, "flos": 15857915811840.0, "grad_norm": 2.2760704543687473, "language_loss": 0.83554572, "learning_rate": 2.5454527498455532e-06, "loss": 0.91280901, "num_input_tokens_seen": 153332765, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.12414551, "step": 7147, "time_per_iteration": 2.5625174045562744 }, { "auxiliary_loss_clip": 0.06461768, "auxiliary_loss_mlp": 0.01274568, "balance_loss_clip": 0.06294287, "balance_loss_mlp": 0.01259035, "epoch": 0.4297610100706448, "flos": 22389179287680.0, "grad_norm": 2.0813916102015604, "language_loss": 0.8754344, "learning_rate": 2.545078041678131e-06, "loss": 0.95279777, "num_input_tokens_seen": 153350760, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.15533447, "step": 7148, "time_per_iteration": 2.5730512142181396 }, { "auxiliary_loss_clip": 0.06464174, "auxiliary_loss_mlp": 0.01271543, "balance_loss_clip": 0.06297082, "balance_loss_mlp": 0.01257804, "epoch": 0.4298211333233128, "flos": 27932705233920.0, "grad_norm": 1.523572870226018, "language_loss": 0.78206539, "learning_rate": 2.5447033128412957e-06, "loss": 0.85942256, "num_input_tokens_seen": 153370765, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.13739014, "step": 7149, "time_per_iteration": 2.6254401206970215 }, { "auxiliary_loss_clip": 0.06460936, "auxiliary_loss_mlp": 0.0127037, "balance_loss_clip": 0.0629645, "balance_loss_mlp": 0.01256005, "epoch": 0.42988125657598075, "flos": 24432479827200.0, "grad_norm": 1.554708722318767, "language_loss": 0.79756856, "learning_rate": 2.544328563349256e-06, "loss": 0.87488163, "num_input_tokens_seen": 153390725, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.14361572, "step": 7150, "time_per_iteration": 2.5863659381866455 }, { "auxiliary_loss_clip": 0.06471689, "auxiliary_loss_mlp": 0.01271784, "balance_loss_clip": 0.06297623, "balance_loss_mlp": 0.01256347, "epoch": 0.4299413798286487, "flos": 15855400189440.0, "grad_norm": 1.6809215374864583, "language_loss": 0.75446582, "learning_rate": 2.5439537932162222e-06, "loss": 0.8319006, "num_input_tokens_seen": 153408010, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.15435791, "step": 7151, "time_per_iteration": 2.5410380363464355 }, { "auxiliary_loss_clip": 0.06470235, "auxiliary_loss_mlp": 0.01269909, "balance_loss_clip": 0.06296653, "balance_loss_mlp": 0.01255925, "epoch": 0.4300015030813167, "flos": 22316029073280.0, "grad_norm": 2.0766078955051746, "language_loss": 0.70885682, "learning_rate": 2.543579002456406e-06, "loss": 0.78625828, "num_input_tokens_seen": 153426865, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.13995361, "step": 7152, "time_per_iteration": 2.5747029781341553 }, { "auxiliary_loss_clip": 0.06461999, "auxiliary_loss_mlp": 0.01269085, "balance_loss_clip": 0.06292325, "balance_loss_mlp": 0.01255221, "epoch": 0.43006162633398465, "flos": 34906391867520.0, "grad_norm": 1.839976590813382, "language_loss": 0.71146727, "learning_rate": 2.54320419108402e-06, "loss": 0.78877813, "num_input_tokens_seen": 153449410, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.13867188, "step": 7153, "time_per_iteration": 2.6985902786254883 }, { "auxiliary_loss_clip": 0.06459993, "auxiliary_loss_mlp": 0.01269484, "balance_loss_clip": 0.0629122, "balance_loss_mlp": 0.01255947, "epoch": 0.4301217495866526, "flos": 15967138008960.0, "grad_norm": 2.0119909120383577, "language_loss": 0.78576338, "learning_rate": 2.542829359113276e-06, "loss": 0.86305815, "num_input_tokens_seen": 153467910, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.13531494, "step": 7154, "time_per_iteration": 2.52665114402771 }, { "auxiliary_loss_clip": 0.06459379, "auxiliary_loss_mlp": 0.01269474, "balance_loss_clip": 0.06293218, "balance_loss_mlp": 0.01256486, "epoch": 0.43018187283932063, "flos": 18776293666560.0, "grad_norm": 1.586628587171588, "language_loss": 0.79089612, "learning_rate": 2.542454506558389e-06, "loss": 0.86818469, "num_input_tokens_seen": 153487100, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.12969971, "step": 7155, "time_per_iteration": 2.567758560180664 }, { "auxiliary_loss_clip": 0.06455737, "auxiliary_loss_mlp": 0.01266618, "balance_loss_clip": 0.06291655, "balance_loss_mlp": 0.01253672, "epoch": 0.4302419960919886, "flos": 20157007645440.0, "grad_norm": 1.8052272672778937, "language_loss": 0.89160395, "learning_rate": 2.5420796334335723e-06, "loss": 0.96882749, "num_input_tokens_seen": 153505565, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.1295166, "step": 7156, "time_per_iteration": 2.5449986457824707 }, { "auxiliary_loss_clip": 0.06465154, "auxiliary_loss_mlp": 0.01272585, "balance_loss_clip": 0.06293785, "balance_loss_mlp": 0.01258626, "epoch": 0.43030211934465656, "flos": 26440001873280.0, "grad_norm": 1.7035886610760087, "language_loss": 0.83482695, "learning_rate": 2.541704739753042e-06, "loss": 0.91220433, "num_input_tokens_seen": 153526130, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.13952637, "step": 7157, "time_per_iteration": 2.5909132957458496 }, { "auxiliary_loss_clip": 0.06468669, "auxiliary_loss_mlp": 0.01276126, "balance_loss_clip": 0.06296971, "balance_loss_mlp": 0.01261296, "epoch": 0.43036224259732453, "flos": 24396114355200.0, "grad_norm": 1.6719278357320886, "language_loss": 0.72252381, "learning_rate": 2.5413298255310132e-06, "loss": 0.79997182, "num_input_tokens_seen": 153546370, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.14819336, "step": 7158, "time_per_iteration": 2.5742568969726562 }, { "auxiliary_loss_clip": 0.06462234, "auxiliary_loss_mlp": 0.01271751, "balance_loss_clip": 0.06293992, "balance_loss_mlp": 0.01257959, "epoch": 0.4304223658499925, "flos": 17207421344640.0, "grad_norm": 2.112375111881081, "language_loss": 0.83657873, "learning_rate": 2.5409548907817034e-06, "loss": 0.91391855, "num_input_tokens_seen": 153562800, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.13800049, "step": 7159, "time_per_iteration": 2.511408567428589 }, { "auxiliary_loss_clip": 0.06470908, "auxiliary_loss_mlp": 0.01267261, "balance_loss_clip": 0.06301305, "balance_loss_mlp": 0.01254189, "epoch": 0.43048248910266046, "flos": 14908304689920.0, "grad_norm": 2.0170713287861988, "language_loss": 0.83111358, "learning_rate": 2.54057993551933e-06, "loss": 0.90849525, "num_input_tokens_seen": 153578395, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.1307373, "step": 7160, "time_per_iteration": 3.9742352962493896 }, { "auxiliary_loss_clip": 0.06474696, "auxiliary_loss_mlp": 0.01272029, "balance_loss_clip": 0.06301938, "balance_loss_mlp": 0.01257253, "epoch": 0.4305426123553284, "flos": 21586245937920.0, "grad_norm": 2.0298443708020466, "language_loss": 0.77179945, "learning_rate": 2.5402049597581116e-06, "loss": 0.84926671, "num_input_tokens_seen": 153596880, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.14788818, "step": 7161, "time_per_iteration": 2.574319362640381 }, { "auxiliary_loss_clip": 0.06471145, "auxiliary_loss_mlp": 0.01271858, "balance_loss_clip": 0.06301668, "balance_loss_mlp": 0.01258036, "epoch": 0.4306027356079964, "flos": 22607833317120.0, "grad_norm": 2.0667307560380044, "language_loss": 0.73545372, "learning_rate": 2.5398299635122662e-06, "loss": 0.81288373, "num_input_tokens_seen": 153616570, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.13818359, "step": 7162, "time_per_iteration": 2.576876640319824 }, { "auxiliary_loss_clip": 0.06377655, "auxiliary_loss_mlp": 0.0125565, "balance_loss_clip": 0.06305088, "balance_loss_mlp": 0.01252807, "epoch": 0.43066285886066435, "flos": 70689873548160.0, "grad_norm": 0.7729711470934978, "language_loss": 0.58734822, "learning_rate": 2.5394549467960147e-06, "loss": 0.66368127, "num_input_tokens_seen": 153671450, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 0.02838135, "step": 7163, "time_per_iteration": 3.0745747089385986 }, { "auxiliary_loss_clip": 0.0646489, "auxiliary_loss_mlp": 0.01269892, "balance_loss_clip": 0.06298167, "balance_loss_mlp": 0.01256755, "epoch": 0.4307229821133323, "flos": 26727236069760.0, "grad_norm": 1.7189862216167362, "language_loss": 0.79632032, "learning_rate": 2.5390799096235783e-06, "loss": 0.87366807, "num_input_tokens_seen": 153691405, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.13146973, "step": 7164, "time_per_iteration": 4.091130495071411 }, { "auxiliary_loss_clip": 0.06477237, "auxiliary_loss_mlp": 0.01268451, "balance_loss_clip": 0.06303068, "balance_loss_mlp": 0.0125423, "epoch": 0.4307831053660003, "flos": 26184311539200.0, "grad_norm": 1.7668879094959677, "language_loss": 0.68145323, "learning_rate": 2.538704852009177e-06, "loss": 0.75891006, "num_input_tokens_seen": 153711555, "router_z_loss_clip": 1.74023438, "router_z_loss_mlp": 0.14208984, "step": 7165, "time_per_iteration": 2.7848215103149414 }, { "auxiliary_loss_clip": 0.0647082, "auxiliary_loss_mlp": 0.01271894, "balance_loss_clip": 0.0629984, "balance_loss_mlp": 0.01257118, "epoch": 0.43084322861866825, "flos": 18915298790400.0, "grad_norm": 1.929304796935034, "language_loss": 0.74693811, "learning_rate": 2.538329773967034e-06, "loss": 0.82436526, "num_input_tokens_seen": 153730095, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.14770508, "step": 7166, "time_per_iteration": 2.5779895782470703 }, { "auxiliary_loss_clip": 0.06462394, "auxiliary_loss_mlp": 0.01269533, "balance_loss_clip": 0.06295574, "balance_loss_mlp": 0.01256098, "epoch": 0.4309033518713362, "flos": 26440211508480.0, "grad_norm": 1.7911151385278994, "language_loss": 0.72233498, "learning_rate": 2.537954675511372e-06, "loss": 0.79965425, "num_input_tokens_seen": 153749320, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.13427734, "step": 7167, "time_per_iteration": 2.6066815853118896 }, { "auxiliary_loss_clip": 0.06461611, "auxiliary_loss_mlp": 0.01268959, "balance_loss_clip": 0.0629722, "balance_loss_mlp": 0.01254815, "epoch": 0.43096347512400424, "flos": 21219362835840.0, "grad_norm": 1.5249265429638599, "language_loss": 0.7895323, "learning_rate": 2.537579556656414e-06, "loss": 0.86683798, "num_input_tokens_seen": 153767825, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.14147949, "step": 7168, "time_per_iteration": 2.571700096130371 }, { "auxiliary_loss_clip": 0.06469097, "auxiliary_loss_mlp": 0.01274653, "balance_loss_clip": 0.06299748, "balance_loss_mlp": 0.01260252, "epoch": 0.4310235983766722, "flos": 16544918638080.0, "grad_norm": 2.153604430738578, "language_loss": 0.82496673, "learning_rate": 2.537204417416387e-06, "loss": 0.90240431, "num_input_tokens_seen": 153785350, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.14404297, "step": 7169, "time_per_iteration": 2.5479092597961426 }, { "auxiliary_loss_clip": 0.06362125, "auxiliary_loss_mlp": 0.01256554, "balance_loss_clip": 0.06289864, "balance_loss_mlp": 0.01253942, "epoch": 0.43108372162934017, "flos": 64794893650560.0, "grad_norm": 0.6679878657966836, "language_loss": 0.60893101, "learning_rate": 2.5368292578055132e-06, "loss": 0.68511784, "num_input_tokens_seen": 153856400, "router_z_loss_clip": 0.72216797, "router_z_loss_mlp": 0.02613831, "step": 7170, "time_per_iteration": 3.325556755065918 }, { "auxiliary_loss_clip": 0.06464307, "auxiliary_loss_mlp": 0.01267259, "balance_loss_clip": 0.06296436, "balance_loss_mlp": 0.01253312, "epoch": 0.43114384488200813, "flos": 13449241543680.0, "grad_norm": 1.7463439907897143, "language_loss": 0.76425928, "learning_rate": 2.536454077838021e-06, "loss": 0.84157491, "num_input_tokens_seen": 153875230, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.13964844, "step": 7171, "time_per_iteration": 2.552457809448242 }, { "auxiliary_loss_clip": 0.06464774, "auxiliary_loss_mlp": 0.01268311, "balance_loss_clip": 0.06296543, "balance_loss_mlp": 0.01254847, "epoch": 0.4312039681346761, "flos": 26293911079680.0, "grad_norm": 1.6473965063533196, "language_loss": 0.77875423, "learning_rate": 2.5360788775281357e-06, "loss": 0.85608506, "num_input_tokens_seen": 153894740, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.13464355, "step": 7172, "time_per_iteration": 2.6011271476745605 }, { "auxiliary_loss_clip": 0.06463554, "auxiliary_loss_mlp": 0.01269048, "balance_loss_clip": 0.0629189, "balance_loss_mlp": 0.01254046, "epoch": 0.43126409138734406, "flos": 20383040833920.0, "grad_norm": 1.5924114288141955, "language_loss": 0.77561927, "learning_rate": 2.535703656890086e-06, "loss": 0.85294521, "num_input_tokens_seen": 153913230, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.14996338, "step": 7173, "time_per_iteration": 5.392456531524658 }, { "auxiliary_loss_clip": 0.06464645, "auxiliary_loss_mlp": 0.01267612, "balance_loss_clip": 0.06297067, "balance_loss_mlp": 0.01253843, "epoch": 0.431324214640012, "flos": 22128918906240.0, "grad_norm": 1.4843884736836384, "language_loss": 0.77070606, "learning_rate": 2.5353284159381e-06, "loss": 0.84802866, "num_input_tokens_seen": 153933250, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.13757324, "step": 7174, "time_per_iteration": 2.569845676422119 }, { "auxiliary_loss_clip": 0.06463519, "auxiliary_loss_mlp": 0.01271949, "balance_loss_clip": 0.06293038, "balance_loss_mlp": 0.01258133, "epoch": 0.43138433789268, "flos": 15236306697600.0, "grad_norm": 1.5671462902493594, "language_loss": 0.82573915, "learning_rate": 2.534953154686407e-06, "loss": 0.90309381, "num_input_tokens_seen": 153951325, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.13830566, "step": 7175, "time_per_iteration": 2.549816846847534 }, { "auxiliary_loss_clip": 0.06472428, "auxiliary_loss_mlp": 0.0127253, "balance_loss_clip": 0.06296918, "balance_loss_mlp": 0.01256401, "epoch": 0.43144446114534796, "flos": 18156151998720.0, "grad_norm": 2.02967312017345, "language_loss": 0.74642563, "learning_rate": 2.5345778731492366e-06, "loss": 0.82387531, "num_input_tokens_seen": 153966975, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.16119385, "step": 7176, "time_per_iteration": 2.5258896350860596 }, { "auxiliary_loss_clip": 0.06466712, "auxiliary_loss_mlp": 0.01270217, "balance_loss_clip": 0.06293439, "balance_loss_mlp": 0.01256066, "epoch": 0.4315045843980159, "flos": 22936506157440.0, "grad_norm": 1.4963363048535478, "language_loss": 0.73835027, "learning_rate": 2.534202571340819e-06, "loss": 0.8157196, "num_input_tokens_seen": 153986695, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.14160156, "step": 7177, "time_per_iteration": 2.6080167293548584 }, { "auxiliary_loss_clip": 0.06474895, "auxiliary_loss_mlp": 0.01271111, "balance_loss_clip": 0.06292659, "balance_loss_mlp": 0.01254589, "epoch": 0.4315647076506839, "flos": 22133321245440.0, "grad_norm": 1.7068393161106028, "language_loss": 0.81760871, "learning_rate": 2.533827249275387e-06, "loss": 0.89506882, "num_input_tokens_seen": 154004710, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.1652832, "step": 7178, "time_per_iteration": 2.6123557090759277 }, { "auxiliary_loss_clip": 0.06453317, "auxiliary_loss_mlp": 0.01268687, "balance_loss_clip": 0.06291272, "balance_loss_mlp": 0.01255449, "epoch": 0.43162483090335185, "flos": 26878567743360.0, "grad_norm": 2.1905605630275193, "language_loss": 0.8445195, "learning_rate": 2.5334519069671725e-06, "loss": 0.92173952, "num_input_tokens_seen": 154024320, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.13244629, "step": 7179, "time_per_iteration": 2.635309934616089 }, { "auxiliary_loss_clip": 0.06457604, "auxiliary_loss_mlp": 0.01270649, "balance_loss_clip": 0.06289511, "balance_loss_mlp": 0.01256463, "epoch": 0.4316849541560198, "flos": 13917464559360.0, "grad_norm": 1.6164897733673997, "language_loss": 0.75746226, "learning_rate": 2.5330765444304075e-06, "loss": 0.83474481, "num_input_tokens_seen": 154041755, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.1418457, "step": 7180, "time_per_iteration": 2.577565908432007 }, { "auxiliary_loss_clip": 0.06465858, "auxiliary_loss_mlp": 0.01268138, "balance_loss_clip": 0.06291716, "balance_loss_mlp": 0.01252587, "epoch": 0.4317450774086878, "flos": 16440685758720.0, "grad_norm": 1.5933496181265223, "language_loss": 0.81788915, "learning_rate": 2.5327011616793274e-06, "loss": 0.8952291, "num_input_tokens_seen": 154056775, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.15557861, "step": 7181, "time_per_iteration": 2.6120150089263916 }, { "auxiliary_loss_clip": 0.06464754, "auxiliary_loss_mlp": 0.01269321, "balance_loss_clip": 0.06290789, "balance_loss_mlp": 0.0125442, "epoch": 0.4318052006613558, "flos": 20560675489920.0, "grad_norm": 1.5460916547129429, "language_loss": 0.89156353, "learning_rate": 2.532325758728165e-06, "loss": 0.96890426, "num_input_tokens_seen": 154075015, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.14910889, "step": 7182, "time_per_iteration": 2.5995421409606934 }, { "auxiliary_loss_clip": 0.06459682, "auxiliary_loss_mlp": 0.01268934, "balance_loss_clip": 0.06293476, "balance_loss_mlp": 0.01255719, "epoch": 0.43186532391402377, "flos": 22826613127680.0, "grad_norm": 1.6427419827052339, "language_loss": 0.75461316, "learning_rate": 2.5319503355911566e-06, "loss": 0.83189929, "num_input_tokens_seen": 154095170, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.13208008, "step": 7183, "time_per_iteration": 2.660198926925659 }, { "auxiliary_loss_clip": 0.06461491, "auxiliary_loss_mlp": 0.01271402, "balance_loss_clip": 0.06289922, "balance_loss_mlp": 0.01257586, "epoch": 0.43192544716669173, "flos": 25563624819840.0, "grad_norm": 1.5548253990682368, "language_loss": 0.77967429, "learning_rate": 2.5315748922825393e-06, "loss": 0.85700321, "num_input_tokens_seen": 154116895, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.13818359, "step": 7184, "time_per_iteration": 2.592271327972412 }, { "auxiliary_loss_clip": 0.06450985, "auxiliary_loss_mlp": 0.012691, "balance_loss_clip": 0.06288381, "balance_loss_mlp": 0.01256678, "epoch": 0.4319855704193597, "flos": 30962317783680.0, "grad_norm": 1.5350206398336206, "language_loss": 0.73828268, "learning_rate": 2.5311994288165474e-06, "loss": 0.81548351, "num_input_tokens_seen": 154138395, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.12438965, "step": 7185, "time_per_iteration": 2.6846632957458496 }, { "auxiliary_loss_clip": 0.06466434, "auxiliary_loss_mlp": 0.01273783, "balance_loss_clip": 0.06291883, "balance_loss_mlp": 0.01259859, "epoch": 0.43204569367202766, "flos": 24244824608640.0, "grad_norm": 2.2742231243931372, "language_loss": 0.7620672, "learning_rate": 2.530823945207421e-06, "loss": 0.83946937, "num_input_tokens_seen": 154156775, "router_z_loss_clip": 1.74511719, "router_z_loss_mlp": 0.13928223, "step": 7186, "time_per_iteration": 2.611828327178955 }, { "auxiliary_loss_clip": 0.06455479, "auxiliary_loss_mlp": 0.01270737, "balance_loss_clip": 0.06285384, "balance_loss_mlp": 0.01257385, "epoch": 0.43210581692469563, "flos": 18413058216960.0, "grad_norm": 1.987602071677173, "language_loss": 0.7721802, "learning_rate": 2.5304484414693962e-06, "loss": 0.84944236, "num_input_tokens_seen": 154177500, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.13330078, "step": 7187, "time_per_iteration": 2.641434907913208 }, { "auxiliary_loss_clip": 0.06343907, "auxiliary_loss_mlp": 0.01254344, "balance_loss_clip": 0.06271558, "balance_loss_mlp": 0.0125167, "epoch": 0.4321659401773636, "flos": 49851718133760.0, "grad_norm": 0.8341435471645933, "language_loss": 0.68038595, "learning_rate": 2.530072917616714e-06, "loss": 0.7563684, "num_input_tokens_seen": 154237110, "router_z_loss_clip": 0.72314453, "router_z_loss_mlp": 0.02677917, "step": 7188, "time_per_iteration": 3.256927967071533 }, { "auxiliary_loss_clip": 0.0645581, "auxiliary_loss_mlp": 0.01272585, "balance_loss_clip": 0.06292287, "balance_loss_mlp": 0.01260133, "epoch": 0.43222606343003156, "flos": 17134229203200.0, "grad_norm": 1.9054806243111078, "language_loss": 0.7839067, "learning_rate": 2.529697373663614e-06, "loss": 0.86119062, "num_input_tokens_seen": 154253910, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.12457275, "step": 7189, "time_per_iteration": 2.5733256340026855 }, { "auxiliary_loss_clip": 0.06469467, "auxiliary_loss_mlp": 0.01272841, "balance_loss_clip": 0.06294026, "balance_loss_mlp": 0.0125875, "epoch": 0.4322861866826995, "flos": 22756984784640.0, "grad_norm": 2.27929463196943, "language_loss": 0.71507245, "learning_rate": 2.5293218096243364e-06, "loss": 0.79249555, "num_input_tokens_seen": 154274770, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.14086914, "step": 7190, "time_per_iteration": 2.6045382022857666 }, { "auxiliary_loss_clip": 0.06453362, "auxiliary_loss_mlp": 0.01273513, "balance_loss_clip": 0.06287906, "balance_loss_mlp": 0.01260001, "epoch": 0.4323463099353675, "flos": 27899400435840.0, "grad_norm": 1.330639501427529, "language_loss": 0.7993893, "learning_rate": 2.5289462255131223e-06, "loss": 0.87665808, "num_input_tokens_seen": 154295035, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.13519287, "step": 7191, "time_per_iteration": 2.623983860015869 }, { "auxiliary_loss_clip": 0.06454011, "auxiliary_loss_mlp": 0.01269337, "balance_loss_clip": 0.06287482, "balance_loss_mlp": 0.01255783, "epoch": 0.43240643318803546, "flos": 21620892401280.0, "grad_norm": 1.5484261199502956, "language_loss": 0.7530663, "learning_rate": 2.5285706213442146e-06, "loss": 0.83029974, "num_input_tokens_seen": 154314905, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.13543701, "step": 7192, "time_per_iteration": 2.585287570953369 }, { "auxiliary_loss_clip": 0.06459215, "auxiliary_loss_mlp": 0.01274269, "balance_loss_clip": 0.06290178, "balance_loss_mlp": 0.01259922, "epoch": 0.4324665564407034, "flos": 17562774510720.0, "grad_norm": 2.941240853049478, "language_loss": 0.79456407, "learning_rate": 2.5281949971318557e-06, "loss": 0.87189889, "num_input_tokens_seen": 154331740, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.14337158, "step": 7193, "time_per_iteration": 2.5599472522735596 }, { "auxiliary_loss_clip": 0.06457174, "auxiliary_loss_mlp": 0.01275617, "balance_loss_clip": 0.06288575, "balance_loss_mlp": 0.0126161, "epoch": 0.4325266796933714, "flos": 18407775409920.0, "grad_norm": 2.3556506861140827, "language_loss": 0.7570442, "learning_rate": 2.5278193528902897e-06, "loss": 0.83437216, "num_input_tokens_seen": 154348740, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.14013672, "step": 7194, "time_per_iteration": 2.752955913543701 }, { "auxiliary_loss_clip": 0.0646126, "auxiliary_loss_mlp": 0.01276917, "balance_loss_clip": 0.06291209, "balance_loss_mlp": 0.01262773, "epoch": 0.4325868029460394, "flos": 22571342064000.0, "grad_norm": 1.8996234313679874, "language_loss": 0.60310602, "learning_rate": 2.5274436886337613e-06, "loss": 0.68048787, "num_input_tokens_seen": 154368835, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.14129639, "step": 7195, "time_per_iteration": 2.7872653007507324 }, { "auxiliary_loss_clip": 0.06466338, "auxiliary_loss_mlp": 0.0127783, "balance_loss_clip": 0.06292982, "balance_loss_mlp": 0.0126365, "epoch": 0.43264692619870737, "flos": 14609834046720.0, "grad_norm": 2.775390133235936, "language_loss": 0.65468866, "learning_rate": 2.527068004376515e-06, "loss": 0.73213029, "num_input_tokens_seen": 154384620, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.1418457, "step": 7196, "time_per_iteration": 2.592822313308716 }, { "auxiliary_loss_clip": 0.06464073, "auxiliary_loss_mlp": 0.01271471, "balance_loss_clip": 0.06290288, "balance_loss_mlp": 0.01257076, "epoch": 0.43270704945137534, "flos": 21507184010880.0, "grad_norm": 2.1265114160094516, "language_loss": 0.72725034, "learning_rate": 2.526692300132797e-06, "loss": 0.80460572, "num_input_tokens_seen": 154402865, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.14385986, "step": 7197, "time_per_iteration": 2.5528926849365234 }, { "auxiliary_loss_clip": 0.06459092, "auxiliary_loss_mlp": 0.01276086, "balance_loss_clip": 0.06293596, "balance_loss_mlp": 0.01262669, "epoch": 0.4327671727040433, "flos": 25162975722240.0, "grad_norm": 1.5204659859632565, "language_loss": 0.72968352, "learning_rate": 2.5263165759168547e-06, "loss": 0.80703527, "num_input_tokens_seen": 154423625, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.13415527, "step": 7198, "time_per_iteration": 2.608029842376709 }, { "auxiliary_loss_clip": 0.06454256, "auxiliary_loss_mlp": 0.01272372, "balance_loss_clip": 0.06287801, "balance_loss_mlp": 0.01258997, "epoch": 0.43282729595671127, "flos": 25454192987520.0, "grad_norm": 1.2622739366105207, "language_loss": 0.81394923, "learning_rate": 2.525940831742934e-06, "loss": 0.8912155, "num_input_tokens_seen": 154444775, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.13378906, "step": 7199, "time_per_iteration": 2.624058246612549 }, { "auxiliary_loss_clip": 0.06457306, "auxiliary_loss_mlp": 0.01273585, "balance_loss_clip": 0.06288949, "balance_loss_mlp": 0.01260979, "epoch": 0.43288741920937923, "flos": 24131661269760.0, "grad_norm": 2.1074218934823272, "language_loss": 0.69172204, "learning_rate": 2.525565067625286e-06, "loss": 0.76903087, "num_input_tokens_seen": 154460815, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.12597656, "step": 7200, "time_per_iteration": 4.028451681137085 }, { "auxiliary_loss_clip": 0.06456423, "auxiliary_loss_mlp": 0.01273673, "balance_loss_clip": 0.06287736, "balance_loss_mlp": 0.0125923, "epoch": 0.4329475424620472, "flos": 19210415270400.0, "grad_norm": 1.782475561398576, "language_loss": 0.87402916, "learning_rate": 2.525189283578157e-06, "loss": 0.95133013, "num_input_tokens_seen": 154479145, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.14447021, "step": 7201, "time_per_iteration": 2.5761213302612305 }, { "auxiliary_loss_clip": 0.06468306, "auxiliary_loss_mlp": 0.0127333, "balance_loss_clip": 0.06292717, "balance_loss_mlp": 0.01257547, "epoch": 0.43300766571471516, "flos": 22645037329920.0, "grad_norm": 1.941638831653177, "language_loss": 0.64665687, "learning_rate": 2.5248134796157974e-06, "loss": 0.72407317, "num_input_tokens_seen": 154498905, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.15777588, "step": 7202, "time_per_iteration": 2.5883352756500244 }, { "auxiliary_loss_clip": 0.06455045, "auxiliary_loss_mlp": 0.01268827, "balance_loss_clip": 0.06287733, "balance_loss_mlp": 0.01256036, "epoch": 0.4330677889673831, "flos": 22126570992000.0, "grad_norm": 1.7663747240386192, "language_loss": 0.82557881, "learning_rate": 2.5244376557524586e-06, "loss": 0.90281749, "num_input_tokens_seen": 154517270, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.12792969, "step": 7203, "time_per_iteration": 4.006067276000977 }, { "auxiliary_loss_clip": 0.06464688, "auxiliary_loss_mlp": 0.01272168, "balance_loss_clip": 0.06290768, "balance_loss_mlp": 0.01257065, "epoch": 0.4331279122200511, "flos": 23228184620160.0, "grad_norm": 2.2407921715502255, "language_loss": 0.8173238, "learning_rate": 2.5240618120023912e-06, "loss": 0.89469242, "num_input_tokens_seen": 154535945, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.15124512, "step": 7204, "time_per_iteration": 2.574754476547241 }, { "auxiliary_loss_clip": 0.06458725, "auxiliary_loss_mlp": 0.01273526, "balance_loss_clip": 0.06291202, "balance_loss_mlp": 0.01260515, "epoch": 0.43318803547271906, "flos": 18265625758080.0, "grad_norm": 8.278271674254862, "language_loss": 0.74470997, "learning_rate": 2.5236859483798468e-06, "loss": 0.82203245, "num_input_tokens_seen": 154554935, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.13012695, "step": 7205, "time_per_iteration": 2.582864999771118 }, { "auxiliary_loss_clip": 0.06455021, "auxiliary_loss_mlp": 0.01268655, "balance_loss_clip": 0.06291875, "balance_loss_mlp": 0.01255769, "epoch": 0.433248158725387, "flos": 27425936540160.0, "grad_norm": 1.9424823667770255, "language_loss": 0.74935031, "learning_rate": 2.5233100648990803e-06, "loss": 0.82658708, "num_input_tokens_seen": 154576065, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.12908936, "step": 7206, "time_per_iteration": 2.664659023284912 }, { "auxiliary_loss_clip": 0.06459592, "auxiliary_loss_mlp": 0.01268204, "balance_loss_clip": 0.06291465, "balance_loss_mlp": 0.01253625, "epoch": 0.433308281978055, "flos": 23224075770240.0, "grad_norm": 1.7318433706886773, "language_loss": 0.78987378, "learning_rate": 2.522934161574342e-06, "loss": 0.8671518, "num_input_tokens_seen": 154595110, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.14575195, "step": 7207, "time_per_iteration": 2.578430652618408 }, { "auxiliary_loss_clip": 0.06464142, "auxiliary_loss_mlp": 0.01269633, "balance_loss_clip": 0.06290385, "balance_loss_mlp": 0.01254827, "epoch": 0.433368405230723, "flos": 15857999665920.0, "grad_norm": 1.920412673582735, "language_loss": 0.8145175, "learning_rate": 2.5225582384198888e-06, "loss": 0.89185524, "num_input_tokens_seen": 154612255, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.14807129, "step": 7208, "time_per_iteration": 2.5223047733306885 }, { "auxiliary_loss_clip": 0.0646009, "auxiliary_loss_mlp": 0.01271319, "balance_loss_clip": 0.06290042, "balance_loss_mlp": 0.01256406, "epoch": 0.433428528483391, "flos": 19032109781760.0, "grad_norm": 2.159442067783631, "language_loss": 0.70723152, "learning_rate": 2.5221822954499744e-06, "loss": 0.78454554, "num_input_tokens_seen": 154630440, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.14904785, "step": 7209, "time_per_iteration": 2.615530014038086 }, { "auxiliary_loss_clip": 0.06461261, "auxiliary_loss_mlp": 0.01275437, "balance_loss_clip": 0.06294292, "balance_loss_mlp": 0.01260631, "epoch": 0.43348865173605894, "flos": 24725290320000.0, "grad_norm": 1.522018282501447, "language_loss": 0.81322241, "learning_rate": 2.5218063326788557e-06, "loss": 0.89058936, "num_input_tokens_seen": 154652515, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.14807129, "step": 7210, "time_per_iteration": 2.605271339416504 }, { "auxiliary_loss_clip": 0.06461102, "auxiliary_loss_mlp": 0.01274339, "balance_loss_clip": 0.06292286, "balance_loss_mlp": 0.01261614, "epoch": 0.4335487749887269, "flos": 22097165408640.0, "grad_norm": 1.6186411383919066, "language_loss": 0.82206881, "learning_rate": 2.5214303501207885e-06, "loss": 0.89942318, "num_input_tokens_seen": 154670965, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.12731934, "step": 7211, "time_per_iteration": 2.572317600250244 }, { "auxiliary_loss_clip": 0.0645905, "auxiliary_loss_mlp": 0.01271692, "balance_loss_clip": 0.06289101, "balance_loss_mlp": 0.01258657, "epoch": 0.43360889824139487, "flos": 22389556631040.0, "grad_norm": 1.8746769708789346, "language_loss": 0.75584567, "learning_rate": 2.521054347790029e-06, "loss": 0.83315307, "num_input_tokens_seen": 154689980, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.13049316, "step": 7212, "time_per_iteration": 2.6573283672332764 }, { "auxiliary_loss_clip": 0.0645671, "auxiliary_loss_mlp": 0.01270294, "balance_loss_clip": 0.06288254, "balance_loss_mlp": 0.01257866, "epoch": 0.43366902149406283, "flos": 17533746270720.0, "grad_norm": 1.5916264226254078, "language_loss": 0.76584572, "learning_rate": 2.5206783257008375e-06, "loss": 0.84311575, "num_input_tokens_seen": 154706570, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.12432861, "step": 7213, "time_per_iteration": 5.397972106933594 }, { "auxiliary_loss_clip": 0.0645747, "auxiliary_loss_mlp": 0.01269277, "balance_loss_clip": 0.06287399, "balance_loss_mlp": 0.01256533, "epoch": 0.4337291447467308, "flos": 19028126712960.0, "grad_norm": 1.495150055843949, "language_loss": 0.65265733, "learning_rate": 2.520302283867471e-06, "loss": 0.7299248, "num_input_tokens_seen": 154725210, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.12744141, "step": 7214, "time_per_iteration": 2.5527992248535156 }, { "auxiliary_loss_clip": 0.06451389, "auxiliary_loss_mlp": 0.01266774, "balance_loss_clip": 0.06287214, "balance_loss_mlp": 0.01254531, "epoch": 0.43378926799939876, "flos": 27241216214400.0, "grad_norm": 1.6645547465981323, "language_loss": 0.71512961, "learning_rate": 2.519926222304191e-06, "loss": 0.79231131, "num_input_tokens_seen": 154745945, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.12237549, "step": 7215, "time_per_iteration": 2.5769364833831787 }, { "auxiliary_loss_clip": 0.06458367, "auxiliary_loss_mlp": 0.01268543, "balance_loss_clip": 0.06292976, "balance_loss_mlp": 0.01255507, "epoch": 0.43384939125206673, "flos": 15966592957440.0, "grad_norm": 1.8859120055663678, "language_loss": 0.75031191, "learning_rate": 2.519550141025255e-06, "loss": 0.82758093, "num_input_tokens_seen": 154763580, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.13031006, "step": 7216, "time_per_iteration": 2.525128126144409 }, { "auxiliary_loss_clip": 0.06469964, "auxiliary_loss_mlp": 0.01270921, "balance_loss_clip": 0.06290581, "balance_loss_mlp": 0.01255686, "epoch": 0.4339095145047347, "flos": 21798736692480.0, "grad_norm": 2.551018727005917, "language_loss": 0.75868511, "learning_rate": 2.519174040044927e-06, "loss": 0.83609396, "num_input_tokens_seen": 154776825, "router_z_loss_clip": 1.79199219, "router_z_loss_mlp": 0.15234375, "step": 7217, "time_per_iteration": 2.534602642059326 }, { "auxiliary_loss_clip": 0.06460159, "auxiliary_loss_mlp": 0.01272663, "balance_loss_clip": 0.06289612, "balance_loss_mlp": 0.0125896, "epoch": 0.43396963775740266, "flos": 14215054734720.0, "grad_norm": 2.601093670593395, "language_loss": 0.74038327, "learning_rate": 2.5187979193774664e-06, "loss": 0.81771147, "num_input_tokens_seen": 154794025, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.13708496, "step": 7218, "time_per_iteration": 2.524768590927124 }, { "auxiliary_loss_clip": 0.06459375, "auxiliary_loss_mlp": 0.01271293, "balance_loss_clip": 0.06286828, "balance_loss_mlp": 0.01257185, "epoch": 0.4340297610100706, "flos": 19725150101760.0, "grad_norm": 1.5920798849184077, "language_loss": 0.6967665, "learning_rate": 2.5184217790371367e-06, "loss": 0.77407312, "num_input_tokens_seen": 154813105, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.14105225, "step": 7219, "time_per_iteration": 2.5390233993530273 }, { "auxiliary_loss_clip": 0.06455512, "auxiliary_loss_mlp": 0.01270908, "balance_loss_clip": 0.06286918, "balance_loss_mlp": 0.01257443, "epoch": 0.4340898842627386, "flos": 18959588472960.0, "grad_norm": 1.4603215346594143, "language_loss": 0.77341914, "learning_rate": 2.518045619038202e-06, "loss": 0.85068333, "num_input_tokens_seen": 154833525, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.13464355, "step": 7220, "time_per_iteration": 2.5925416946411133 }, { "auxiliary_loss_clip": 0.06458003, "auxiliary_loss_mlp": 0.01272998, "balance_loss_clip": 0.06289715, "balance_loss_mlp": 0.01259772, "epoch": 0.4341500075154066, "flos": 22024895662080.0, "grad_norm": 1.7859719039841755, "language_loss": 0.69829512, "learning_rate": 2.5176694393949243e-06, "loss": 0.7756052, "num_input_tokens_seen": 154853090, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.13208008, "step": 7221, "time_per_iteration": 2.5451042652130127 }, { "auxiliary_loss_clip": 0.06460147, "auxiliary_loss_mlp": 0.01270217, "balance_loss_clip": 0.0628811, "balance_loss_mlp": 0.01256621, "epoch": 0.4342101307680746, "flos": 23588527104000.0, "grad_norm": 1.843140451530268, "language_loss": 0.65370965, "learning_rate": 2.51729324012157e-06, "loss": 0.73101324, "num_input_tokens_seen": 154872055, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.13616943, "step": 7222, "time_per_iteration": 2.566987991333008 }, { "auxiliary_loss_clip": 0.0645202, "auxiliary_loss_mlp": 0.0126871, "balance_loss_clip": 0.06284074, "balance_loss_mlp": 0.01254894, "epoch": 0.43427025402074254, "flos": 17973821514240.0, "grad_norm": 2.156337883110414, "language_loss": 0.73658556, "learning_rate": 2.5169170212324053e-06, "loss": 0.81379288, "num_input_tokens_seen": 154886645, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.13806152, "step": 7223, "time_per_iteration": 2.533918857574463 }, { "auxiliary_loss_clip": 0.06460851, "auxiliary_loss_mlp": 0.01269112, "balance_loss_clip": 0.06287113, "balance_loss_mlp": 0.01255951, "epoch": 0.4343303772734105, "flos": 26293575663360.0, "grad_norm": 1.7897005163581037, "language_loss": 0.93994761, "learning_rate": 2.516540782741694e-06, "loss": 1.0172472, "num_input_tokens_seen": 154906775, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.1315918, "step": 7224, "time_per_iteration": 2.5962483882904053 }, { "auxiliary_loss_clip": 0.06452079, "auxiliary_loss_mlp": 0.01269213, "balance_loss_clip": 0.06284073, "balance_loss_mlp": 0.01255844, "epoch": 0.43439050052607847, "flos": 26841279876480.0, "grad_norm": 1.4060056802537435, "language_loss": 0.61711913, "learning_rate": 2.5161645246637056e-06, "loss": 0.694332, "num_input_tokens_seen": 154926990, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.13372803, "step": 7225, "time_per_iteration": 2.593630313873291 }, { "auxiliary_loss_clip": 0.06454866, "auxiliary_loss_mlp": 0.01269, "balance_loss_clip": 0.06285776, "balance_loss_mlp": 0.01256197, "epoch": 0.43445062377874644, "flos": 21404083161600.0, "grad_norm": 2.0196420989618677, "language_loss": 0.77729952, "learning_rate": 2.5157882470127054e-06, "loss": 0.8545382, "num_input_tokens_seen": 154946210, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.12799072, "step": 7226, "time_per_iteration": 2.5519418716430664 }, { "auxiliary_loss_clip": 0.06453703, "auxiliary_loss_mlp": 0.01271749, "balance_loss_clip": 0.06290422, "balance_loss_mlp": 0.01259077, "epoch": 0.4345107470314144, "flos": 19908151418880.0, "grad_norm": 1.6100480705684328, "language_loss": 0.85053504, "learning_rate": 2.515411949802964e-06, "loss": 0.92778957, "num_input_tokens_seen": 154964995, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.12677002, "step": 7227, "time_per_iteration": 2.560525894165039 }, { "auxiliary_loss_clip": 0.06455006, "auxiliary_loss_mlp": 0.01273179, "balance_loss_clip": 0.0628718, "balance_loss_mlp": 0.01258886, "epoch": 0.43457087028408237, "flos": 26439876092160.0, "grad_norm": 2.2186328213579647, "language_loss": 0.77502614, "learning_rate": 2.5150356330487498e-06, "loss": 0.85230803, "num_input_tokens_seen": 154984775, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.14294434, "step": 7228, "time_per_iteration": 2.5931060314178467 }, { "auxiliary_loss_clip": 0.0645003, "auxiliary_loss_mlp": 0.01269991, "balance_loss_clip": 0.06283761, "balance_loss_mlp": 0.0125624, "epoch": 0.43463099353675033, "flos": 31876947025920.0, "grad_norm": 1.4604597960933479, "language_loss": 0.8089608, "learning_rate": 2.5146592967643324e-06, "loss": 0.88616097, "num_input_tokens_seen": 155008125, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.13751221, "step": 7229, "time_per_iteration": 2.6688477993011475 }, { "auxiliary_loss_clip": 0.06458566, "auxiliary_loss_mlp": 0.01270702, "balance_loss_clip": 0.06287224, "balance_loss_mlp": 0.0125676, "epoch": 0.4346911167894183, "flos": 24578109423360.0, "grad_norm": 2.390839127000691, "language_loss": 0.82836056, "learning_rate": 2.5142829409639834e-06, "loss": 0.90565324, "num_input_tokens_seen": 155027885, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.1394043, "step": 7230, "time_per_iteration": 2.596118211746216 }, { "auxiliary_loss_clip": 0.06467783, "auxiliary_loss_mlp": 0.01267732, "balance_loss_clip": 0.06293772, "balance_loss_mlp": 0.01254374, "epoch": 0.43475124004208626, "flos": 17096102795520.0, "grad_norm": 4.59365165817866, "language_loss": 0.7689172, "learning_rate": 2.513906565661973e-06, "loss": 0.84627235, "num_input_tokens_seen": 155043375, "router_z_loss_clip": 1.74121094, "router_z_loss_mlp": 0.13354492, "step": 7231, "time_per_iteration": 2.530728340148926 }, { "auxiliary_loss_clip": 0.06457663, "auxiliary_loss_mlp": 0.01271191, "balance_loss_clip": 0.06290855, "balance_loss_mlp": 0.01258906, "epoch": 0.4348113632947542, "flos": 26111874084480.0, "grad_norm": 1.3731625647043457, "language_loss": 0.68701434, "learning_rate": 2.513530170872575e-06, "loss": 0.76430285, "num_input_tokens_seen": 155062930, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.1229248, "step": 7232, "time_per_iteration": 2.6069111824035645 }, { "auxiliary_loss_clip": 0.06462429, "auxiliary_loss_mlp": 0.0127086, "balance_loss_clip": 0.06289981, "balance_loss_mlp": 0.01256448, "epoch": 0.4348714865474222, "flos": 34208446083840.0, "grad_norm": 1.700190794005678, "language_loss": 0.72378039, "learning_rate": 2.5131537566100605e-06, "loss": 0.80111325, "num_input_tokens_seen": 155084980, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.14404297, "step": 7233, "time_per_iteration": 2.6615631580352783 }, { "auxiliary_loss_clip": 0.06461759, "auxiliary_loss_mlp": 0.01272671, "balance_loss_clip": 0.06287751, "balance_loss_mlp": 0.01258986, "epoch": 0.43493160980009016, "flos": 31545045803520.0, "grad_norm": 1.4699148147076597, "language_loss": 0.74764252, "learning_rate": 2.5127773228887053e-06, "loss": 0.82498676, "num_input_tokens_seen": 155107260, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.13684082, "step": 7234, "time_per_iteration": 2.6168100833892822 }, { "auxiliary_loss_clip": 0.06473299, "auxiliary_loss_mlp": 0.01274853, "balance_loss_clip": 0.06295611, "balance_loss_mlp": 0.01260285, "epoch": 0.4349917330527582, "flos": 24068238128640.0, "grad_norm": 2.399635354563728, "language_loss": 0.595469, "learning_rate": 2.512400869722782e-06, "loss": 0.67295051, "num_input_tokens_seen": 155126720, "router_z_loss_clip": 1.77539062, "router_z_loss_mlp": 0.14569092, "step": 7235, "time_per_iteration": 2.55924916267395 }, { "auxiliary_loss_clip": 0.06460533, "auxiliary_loss_mlp": 0.01276912, "balance_loss_clip": 0.06288859, "balance_loss_mlp": 0.01264073, "epoch": 0.43505185630542614, "flos": 30527315712000.0, "grad_norm": 2.3158768018661395, "language_loss": 0.77331179, "learning_rate": 2.512024397126566e-06, "loss": 0.85068625, "num_input_tokens_seen": 155148640, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.12854004, "step": 7236, "time_per_iteration": 2.620577573776245 }, { "auxiliary_loss_clip": 0.06457898, "auxiliary_loss_mlp": 0.01272782, "balance_loss_clip": 0.06290955, "balance_loss_mlp": 0.01259019, "epoch": 0.4351119795580941, "flos": 15739427738880.0, "grad_norm": 1.5547085922779156, "language_loss": 0.8119179, "learning_rate": 2.5116479051143345e-06, "loss": 0.88922471, "num_input_tokens_seen": 155165870, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.13763428, "step": 7237, "time_per_iteration": 2.5313639640808105 }, { "auxiliary_loss_clip": 0.0645608, "auxiliary_loss_mlp": 0.01275234, "balance_loss_clip": 0.062868, "balance_loss_mlp": 0.01260559, "epoch": 0.4351721028107621, "flos": 18737328718080.0, "grad_norm": 1.7339673153363064, "language_loss": 0.63236403, "learning_rate": 2.5112713937003623e-06, "loss": 0.70967716, "num_input_tokens_seen": 155185315, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.14660645, "step": 7238, "time_per_iteration": 2.540964365005493 }, { "auxiliary_loss_clip": 0.06457576, "auxiliary_loss_mlp": 0.01276322, "balance_loss_clip": 0.0629172, "balance_loss_mlp": 0.0126306, "epoch": 0.43523222606343004, "flos": 25233652241280.0, "grad_norm": 1.5989181997111126, "language_loss": 0.85697693, "learning_rate": 2.510894862898928e-06, "loss": 0.93431592, "num_input_tokens_seen": 155205790, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.13262939, "step": 7239, "time_per_iteration": 4.01233983039856 }, { "auxiliary_loss_clip": 0.06467267, "auxiliary_loss_mlp": 0.01268208, "balance_loss_clip": 0.06295449, "balance_loss_mlp": 0.01254874, "epoch": 0.435292349316098, "flos": 22715504213760.0, "grad_norm": 1.4971999844945587, "language_loss": 0.72997975, "learning_rate": 2.510518312724309e-06, "loss": 0.80733442, "num_input_tokens_seen": 155226475, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.13323975, "step": 7240, "time_per_iteration": 2.5645318031311035 }, { "auxiliary_loss_clip": 0.06464304, "auxiliary_loss_mlp": 0.01272012, "balance_loss_clip": 0.0629103, "balance_loss_mlp": 0.01257724, "epoch": 0.43535247256876597, "flos": 25783033536000.0, "grad_norm": 1.8166852055709262, "language_loss": 0.82180178, "learning_rate": 2.5101417431907842e-06, "loss": 0.89916492, "num_input_tokens_seen": 155247110, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.1428833, "step": 7241, "time_per_iteration": 2.594714403152466 }, { "auxiliary_loss_clip": 0.0647065, "auxiliary_loss_mlp": 0.01277932, "balance_loss_clip": 0.06292996, "balance_loss_mlp": 0.01263352, "epoch": 0.43541259582143393, "flos": 17533578562560.0, "grad_norm": 2.3331379915800174, "language_loss": 0.79690033, "learning_rate": 2.5097651543126345e-06, "loss": 0.87438619, "num_input_tokens_seen": 155261335, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.14587402, "step": 7242, "time_per_iteration": 2.6170034408569336 }, { "auxiliary_loss_clip": 0.06460539, "auxiliary_loss_mlp": 0.01273705, "balance_loss_clip": 0.06284874, "balance_loss_mlp": 0.01259865, "epoch": 0.4354727190741019, "flos": 15200612058240.0, "grad_norm": 2.0603206248242274, "language_loss": 0.68956196, "learning_rate": 2.509388546104138e-06, "loss": 0.76690441, "num_input_tokens_seen": 155278510, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.1385498, "step": 7243, "time_per_iteration": 4.0855796337127686 }, { "auxiliary_loss_clip": 0.06455097, "auxiliary_loss_mlp": 0.01270496, "balance_loss_clip": 0.06289995, "balance_loss_mlp": 0.01258104, "epoch": 0.43553284232676986, "flos": 16654015054080.0, "grad_norm": 1.9041940422522141, "language_loss": 0.81604254, "learning_rate": 2.5090119185795766e-06, "loss": 0.89329845, "num_input_tokens_seen": 155296450, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.1239624, "step": 7244, "time_per_iteration": 2.5508878231048584 }, { "auxiliary_loss_clip": 0.06454381, "auxiliary_loss_mlp": 0.01268628, "balance_loss_clip": 0.06286132, "balance_loss_mlp": 0.01255783, "epoch": 0.43559296557943783, "flos": 23407035160320.0, "grad_norm": 1.639228162259656, "language_loss": 0.73864788, "learning_rate": 2.508635271753234e-06, "loss": 0.81587791, "num_input_tokens_seen": 155316080, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.12841797, "step": 7245, "time_per_iteration": 2.5704612731933594 }, { "auxiliary_loss_clip": 0.06458774, "auxiliary_loss_mlp": 0.01271987, "balance_loss_clip": 0.06288875, "balance_loss_mlp": 0.01258856, "epoch": 0.4356530888321058, "flos": 22425628613760.0, "grad_norm": 1.7043776202392311, "language_loss": 0.77284139, "learning_rate": 2.508258605639389e-06, "loss": 0.85014904, "num_input_tokens_seen": 155336765, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.13134766, "step": 7246, "time_per_iteration": 2.5765154361724854 }, { "auxiliary_loss_clip": 0.06459121, "auxiliary_loss_mlp": 0.01270522, "balance_loss_clip": 0.06289738, "balance_loss_mlp": 0.01256938, "epoch": 0.43571321208477376, "flos": 21622527555840.0, "grad_norm": 1.7303549589061482, "language_loss": 0.85391903, "learning_rate": 2.5078819202523275e-06, "loss": 0.93121547, "num_input_tokens_seen": 155356440, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.13586426, "step": 7247, "time_per_iteration": 2.558433771133423 }, { "auxiliary_loss_clip": 0.06457882, "auxiliary_loss_mlp": 0.01269373, "balance_loss_clip": 0.06288204, "balance_loss_mlp": 0.01256737, "epoch": 0.4357733353374418, "flos": 23994081665280.0, "grad_norm": 1.5758996549201174, "language_loss": 0.73136425, "learning_rate": 2.507505215606333e-06, "loss": 0.80863678, "num_input_tokens_seen": 155377070, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.12634277, "step": 7248, "time_per_iteration": 2.570457935333252 }, { "auxiliary_loss_clip": 0.06456134, "auxiliary_loss_mlp": 0.01268407, "balance_loss_clip": 0.06286307, "balance_loss_mlp": 0.01255061, "epoch": 0.43583345859010975, "flos": 25271736721920.0, "grad_norm": 1.9394818465775283, "language_loss": 0.87197012, "learning_rate": 2.5071284917156893e-06, "loss": 0.94921553, "num_input_tokens_seen": 155398415, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.13342285, "step": 7249, "time_per_iteration": 2.59407901763916 }, { "auxiliary_loss_clip": 0.06459427, "auxiliary_loss_mlp": 0.0126897, "balance_loss_clip": 0.06284505, "balance_loss_mlp": 0.0125656, "epoch": 0.4358935818427777, "flos": 23703115962240.0, "grad_norm": 1.8424583540426762, "language_loss": 0.8213408, "learning_rate": 2.506751748594683e-06, "loss": 0.89862478, "num_input_tokens_seen": 155415625, "router_z_loss_clip": 1.74902344, "router_z_loss_mlp": 0.12402344, "step": 7250, "time_per_iteration": 2.558206081390381 }, { "auxiliary_loss_clip": 0.06464177, "auxiliary_loss_mlp": 0.01270139, "balance_loss_clip": 0.06294824, "balance_loss_mlp": 0.01256996, "epoch": 0.4359537050954457, "flos": 29540416723200.0, "grad_norm": 1.982551799293422, "language_loss": 0.85312188, "learning_rate": 2.5063749862575988e-06, "loss": 0.93046498, "num_input_tokens_seen": 155435505, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.13153076, "step": 7251, "time_per_iteration": 2.6179280281066895 }, { "auxiliary_loss_clip": 0.06458895, "auxiliary_loss_mlp": 0.01272912, "balance_loss_clip": 0.06289593, "balance_loss_mlp": 0.0125934, "epoch": 0.43601382834811364, "flos": 22717935982080.0, "grad_norm": 1.4637662025354197, "language_loss": 0.69475538, "learning_rate": 2.5059982047187245e-06, "loss": 0.77207345, "num_input_tokens_seen": 155455425, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.13574219, "step": 7252, "time_per_iteration": 3.8828847408294678 }, { "auxiliary_loss_clip": 0.06452942, "auxiliary_loss_mlp": 0.01268014, "balance_loss_clip": 0.06288254, "balance_loss_mlp": 0.01255581, "epoch": 0.4360739516007816, "flos": 19104714944640.0, "grad_norm": 1.5671813018849878, "language_loss": 0.83672118, "learning_rate": 2.505621403992348e-06, "loss": 0.91393077, "num_input_tokens_seen": 155474250, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.12432861, "step": 7253, "time_per_iteration": 3.983341932296753 }, { "auxiliary_loss_clip": 0.0646096, "auxiliary_loss_mlp": 0.0127328, "balance_loss_clip": 0.06290845, "balance_loss_mlp": 0.01258987, "epoch": 0.43613407485344957, "flos": 23411185937280.0, "grad_norm": 182.63984533969435, "language_loss": 0.70554972, "learning_rate": 2.505244584092757e-06, "loss": 0.78289211, "num_input_tokens_seen": 155494685, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.14312744, "step": 7254, "time_per_iteration": 2.6374013423919678 }, { "auxiliary_loss_clip": 0.06452768, "auxiliary_loss_mlp": 0.01270666, "balance_loss_clip": 0.06286135, "balance_loss_mlp": 0.0125716, "epoch": 0.43619419810611754, "flos": 22644366497280.0, "grad_norm": 1.7772032424098971, "language_loss": 0.81780636, "learning_rate": 2.5048677450342406e-06, "loss": 0.89504063, "num_input_tokens_seen": 155513040, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.13500977, "step": 7255, "time_per_iteration": 2.5318117141723633 }, { "auxiliary_loss_clip": 0.064575, "auxiliary_loss_mlp": 0.01267804, "balance_loss_clip": 0.06287681, "balance_loss_mlp": 0.01254709, "epoch": 0.4362543213587855, "flos": 20054200285440.0, "grad_norm": 1.6692498765175858, "language_loss": 0.78265929, "learning_rate": 2.504490886831089e-06, "loss": 0.8599124, "num_input_tokens_seen": 155530100, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.13085938, "step": 7256, "time_per_iteration": 2.5355288982391357 }, { "auxiliary_loss_clip": 0.06455518, "auxiliary_loss_mlp": 0.01270391, "balance_loss_clip": 0.06289031, "balance_loss_mlp": 0.01258321, "epoch": 0.43631444461145347, "flos": 21367759616640.0, "grad_norm": 1.4827049913499646, "language_loss": 0.76334715, "learning_rate": 2.5041140094975922e-06, "loss": 0.84060621, "num_input_tokens_seen": 155549375, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.12072754, "step": 7257, "time_per_iteration": 2.568654775619507 }, { "auxiliary_loss_clip": 0.06465343, "auxiliary_loss_mlp": 0.01270666, "balance_loss_clip": 0.06294448, "balance_loss_mlp": 0.01256981, "epoch": 0.43637456786412143, "flos": 22424999708160.0, "grad_norm": 1.8341113148187214, "language_loss": 0.73639935, "learning_rate": 2.5037371130480417e-06, "loss": 0.81375945, "num_input_tokens_seen": 155569395, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.13671875, "step": 7258, "time_per_iteration": 2.5497000217437744 }, { "auxiliary_loss_clip": 0.06461155, "auxiliary_loss_mlp": 0.012731, "balance_loss_clip": 0.06288097, "balance_loss_mlp": 0.01260398, "epoch": 0.4364346911167894, "flos": 28556452627200.0, "grad_norm": 2.1780269342402026, "language_loss": 0.77065951, "learning_rate": 2.5033601974967297e-06, "loss": 0.84800208, "num_input_tokens_seen": 155589090, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.1270752, "step": 7259, "time_per_iteration": 2.5994462966918945 }, { "auxiliary_loss_clip": 0.06339581, "auxiliary_loss_mlp": 0.01271567, "balance_loss_clip": 0.06266913, "balance_loss_mlp": 0.01268718, "epoch": 0.43649481436945736, "flos": 62678149407360.0, "grad_norm": 0.7401859716796122, "language_loss": 0.56897026, "learning_rate": 2.5029832628579483e-06, "loss": 0.64508176, "num_input_tokens_seen": 155648660, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 0.02845764, "step": 7260, "time_per_iteration": 3.171586036682129 }, { "auxiliary_loss_clip": 0.0646695, "auxiliary_loss_mlp": 0.01277937, "balance_loss_clip": 0.06296143, "balance_loss_mlp": 0.01263644, "epoch": 0.4365549376221254, "flos": 30600088583040.0, "grad_norm": 2.4261932263134973, "language_loss": 0.71223438, "learning_rate": 2.5026063091459907e-06, "loss": 0.78968334, "num_input_tokens_seen": 155669945, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.14294434, "step": 7261, "time_per_iteration": 2.6344351768493652 }, { "auxiliary_loss_clip": 0.06467009, "auxiliary_loss_mlp": 0.01277495, "balance_loss_clip": 0.06297117, "balance_loss_mlp": 0.01262951, "epoch": 0.43661506087479335, "flos": 17171684778240.0, "grad_norm": 1.7694534771882406, "language_loss": 0.70118916, "learning_rate": 2.5022293363751522e-06, "loss": 0.77863419, "num_input_tokens_seen": 155688555, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.14550781, "step": 7262, "time_per_iteration": 2.534539222717285 }, { "auxiliary_loss_clip": 0.06461349, "auxiliary_loss_mlp": 0.01265731, "balance_loss_clip": 0.06298628, "balance_loss_mlp": 0.01253768, "epoch": 0.4366751841274613, "flos": 22052875726080.0, "grad_norm": 1.587306760867306, "language_loss": 0.80014479, "learning_rate": 2.501852344559726e-06, "loss": 0.87741554, "num_input_tokens_seen": 155705370, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.11950684, "step": 7263, "time_per_iteration": 2.562947988510132 }, { "auxiliary_loss_clip": 0.06463829, "auxiliary_loss_mlp": 0.01274334, "balance_loss_clip": 0.06295806, "balance_loss_mlp": 0.01260982, "epoch": 0.4367353073801293, "flos": 16002748794240.0, "grad_norm": 1.7188379886812337, "language_loss": 0.75759584, "learning_rate": 2.50147533371401e-06, "loss": 0.83497739, "num_input_tokens_seen": 155721890, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.13366699, "step": 7264, "time_per_iteration": 2.5218610763549805 }, { "auxiliary_loss_clip": 0.0646212, "auxiliary_loss_mlp": 0.01272041, "balance_loss_clip": 0.06294956, "balance_loss_mlp": 0.0125879, "epoch": 0.43679543063279724, "flos": 38226760485120.0, "grad_norm": 2.0413478646400556, "language_loss": 0.6130687, "learning_rate": 2.501098303852298e-06, "loss": 0.69041026, "num_input_tokens_seen": 155743970, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.13250732, "step": 7265, "time_per_iteration": 2.6898787021636963 }, { "auxiliary_loss_clip": 0.06459492, "auxiliary_loss_mlp": 0.01271917, "balance_loss_clip": 0.06295583, "balance_loss_mlp": 0.01259102, "epoch": 0.4368555538854652, "flos": 15198306071040.0, "grad_norm": 1.8272582671610473, "language_loss": 0.72860181, "learning_rate": 2.5007212549888884e-06, "loss": 0.80591583, "num_input_tokens_seen": 155761830, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.12817383, "step": 7266, "time_per_iteration": 2.5083017349243164 }, { "auxiliary_loss_clip": 0.06462586, "auxiliary_loss_mlp": 0.01278715, "balance_loss_clip": 0.06293318, "balance_loss_mlp": 0.01266079, "epoch": 0.4369156771381332, "flos": 23074630813440.0, "grad_norm": 1.9834179273137984, "language_loss": 0.82952189, "learning_rate": 2.5003441871380794e-06, "loss": 0.90693486, "num_input_tokens_seen": 155779610, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.12634277, "step": 7267, "time_per_iteration": 2.5556509494781494 }, { "auxiliary_loss_clip": 0.06459087, "auxiliary_loss_mlp": 0.01272268, "balance_loss_clip": 0.06293511, "balance_loss_mlp": 0.01259393, "epoch": 0.43697580039080114, "flos": 23447886825600.0, "grad_norm": 1.8232131358209962, "language_loss": 0.74710262, "learning_rate": 2.4999671003141674e-06, "loss": 0.82441616, "num_input_tokens_seen": 155798765, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.12866211, "step": 7268, "time_per_iteration": 2.563185214996338 }, { "auxiliary_loss_clip": 0.06467578, "auxiliary_loss_mlp": 0.01268716, "balance_loss_clip": 0.06293873, "balance_loss_mlp": 0.01255358, "epoch": 0.4370359236434691, "flos": 18520519478400.0, "grad_norm": 1.9683501765936038, "language_loss": 0.79953837, "learning_rate": 2.499589994531454e-06, "loss": 0.87690139, "num_input_tokens_seen": 155817750, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.13360596, "step": 7269, "time_per_iteration": 2.5301029682159424 }, { "auxiliary_loss_clip": 0.06464449, "auxiliary_loss_mlp": 0.01272209, "balance_loss_clip": 0.0629755, "balance_loss_mlp": 0.01259072, "epoch": 0.43709604689613707, "flos": 23229316650240.0, "grad_norm": 1.811530257609154, "language_loss": 0.75225681, "learning_rate": 2.499212869804237e-06, "loss": 0.82962334, "num_input_tokens_seen": 155836490, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.13140869, "step": 7270, "time_per_iteration": 2.573374032974243 }, { "auxiliary_loss_clip": 0.06459606, "auxiliary_loss_mlp": 0.01268749, "balance_loss_clip": 0.06290586, "balance_loss_mlp": 0.01256101, "epoch": 0.43715617014880503, "flos": 23810199880320.0, "grad_norm": 2.7786769169670222, "language_loss": 0.79642844, "learning_rate": 2.4988357261468182e-06, "loss": 0.873712, "num_input_tokens_seen": 155856225, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.12646484, "step": 7271, "time_per_iteration": 2.5845367908477783 }, { "auxiliary_loss_clip": 0.06337108, "auxiliary_loss_mlp": 0.01259859, "balance_loss_clip": 0.06263933, "balance_loss_mlp": 0.01257023, "epoch": 0.437216293401473, "flos": 61961824851840.0, "grad_norm": 0.668925089202642, "language_loss": 0.54703689, "learning_rate": 2.4984585635734993e-06, "loss": 0.62300652, "num_input_tokens_seen": 155916770, "router_z_loss_clip": 0.73388672, "router_z_loss_mlp": 0.02833557, "step": 7272, "time_per_iteration": 3.2733941078186035 }, { "auxiliary_loss_clip": 0.06464974, "auxiliary_loss_mlp": 0.01275388, "balance_loss_clip": 0.06292258, "balance_loss_mlp": 0.01261333, "epoch": 0.43727641665414096, "flos": 21988907533440.0, "grad_norm": 1.6263673409563977, "language_loss": 0.70682478, "learning_rate": 2.498081382098581e-06, "loss": 0.78422844, "num_input_tokens_seen": 155936490, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.14044189, "step": 7273, "time_per_iteration": 2.6169958114624023 }, { "auxiliary_loss_clip": 0.0646401, "auxiliary_loss_mlp": 0.01267471, "balance_loss_clip": 0.06292008, "balance_loss_mlp": 0.01253714, "epoch": 0.437336539906809, "flos": 39540277889280.0, "grad_norm": 1.8600350746433831, "language_loss": 0.76209635, "learning_rate": 2.497704181736367e-06, "loss": 0.83941114, "num_input_tokens_seen": 155957595, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.13751221, "step": 7274, "time_per_iteration": 2.7676870822906494 }, { "auxiliary_loss_clip": 0.06456757, "auxiliary_loss_mlp": 0.0126832, "balance_loss_clip": 0.06290533, "balance_loss_mlp": 0.0125594, "epoch": 0.43739666315947695, "flos": 17462902043520.0, "grad_norm": 1.610841382684934, "language_loss": 0.80727428, "learning_rate": 2.49732696250116e-06, "loss": 0.88452506, "num_input_tokens_seen": 155975710, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.1237793, "step": 7275, "time_per_iteration": 2.5575172901153564 }, { "auxiliary_loss_clip": 0.06466582, "auxiliary_loss_mlp": 0.01277947, "balance_loss_clip": 0.06297375, "balance_loss_mlp": 0.0126403, "epoch": 0.4374567864121449, "flos": 16363678256640.0, "grad_norm": 1.9573615487117961, "language_loss": 0.81141222, "learning_rate": 2.496949724407266e-06, "loss": 0.88885748, "num_input_tokens_seen": 155993090, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.13928223, "step": 7276, "time_per_iteration": 2.559612274169922 }, { "auxiliary_loss_clip": 0.06467518, "auxiliary_loss_mlp": 0.01272035, "balance_loss_clip": 0.06291449, "balance_loss_mlp": 0.01258356, "epoch": 0.4375169096648129, "flos": 30594721921920.0, "grad_norm": 1.8287390650474697, "language_loss": 0.73554826, "learning_rate": 2.496572467468988e-06, "loss": 0.81294376, "num_input_tokens_seen": 156013685, "router_z_loss_clip": 1.76074219, "router_z_loss_mlp": 0.13684082, "step": 7277, "time_per_iteration": 2.651700735092163 }, { "auxiliary_loss_clip": 0.06458005, "auxiliary_loss_mlp": 0.01277216, "balance_loss_clip": 0.06290769, "balance_loss_mlp": 0.01263477, "epoch": 0.43757703291748085, "flos": 30563555402880.0, "grad_norm": 2.021073099834978, "language_loss": 0.72892386, "learning_rate": 2.4961951917006317e-06, "loss": 0.80627608, "num_input_tokens_seen": 156034300, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.13745117, "step": 7278, "time_per_iteration": 2.654195547103882 }, { "auxiliary_loss_clip": 0.06456051, "auxiliary_loss_mlp": 0.01271089, "balance_loss_clip": 0.06291254, "balance_loss_mlp": 0.01258637, "epoch": 0.4376371561701488, "flos": 21403747745280.0, "grad_norm": 1.4809547167507249, "language_loss": 0.66142029, "learning_rate": 2.4958178971165046e-06, "loss": 0.73869169, "num_input_tokens_seen": 156053805, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.12451172, "step": 7279, "time_per_iteration": 3.9825661182403564 }, { "auxiliary_loss_clip": 0.06466424, "auxiliary_loss_mlp": 0.01273301, "balance_loss_clip": 0.06290475, "balance_loss_mlp": 0.01259395, "epoch": 0.4376972794228168, "flos": 23411144010240.0, "grad_norm": 2.253533720620074, "language_loss": 0.82320631, "learning_rate": 2.4954405837309126e-06, "loss": 0.90060365, "num_input_tokens_seen": 156073295, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.13903809, "step": 7280, "time_per_iteration": 2.5745818614959717 }, { "auxiliary_loss_clip": 0.06451161, "auxiliary_loss_mlp": 0.01277944, "balance_loss_clip": 0.06288482, "balance_loss_mlp": 0.01264598, "epoch": 0.43775740267548474, "flos": 22899511779840.0, "grad_norm": 1.4357506216750004, "language_loss": 0.77440959, "learning_rate": 2.4950632515581653e-06, "loss": 0.85170066, "num_input_tokens_seen": 156094540, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.13354492, "step": 7281, "time_per_iteration": 2.5835154056549072 }, { "auxiliary_loss_clip": 0.06459315, "auxiliary_loss_mlp": 0.01272048, "balance_loss_clip": 0.06289536, "balance_loss_mlp": 0.01259847, "epoch": 0.4378175259281527, "flos": 23301041345280.0, "grad_norm": 1.768208089423153, "language_loss": 0.76515055, "learning_rate": 2.494685900612569e-06, "loss": 0.84246421, "num_input_tokens_seen": 156114070, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.12200928, "step": 7282, "time_per_iteration": 2.5563809871673584 }, { "auxiliary_loss_clip": 0.06460858, "auxiliary_loss_mlp": 0.0126866, "balance_loss_clip": 0.06290387, "balance_loss_mlp": 0.01256048, "epoch": 0.43787764918082067, "flos": 23883433948800.0, "grad_norm": 1.6091197126941161, "language_loss": 0.85266501, "learning_rate": 2.4943085309084333e-06, "loss": 0.92996019, "num_input_tokens_seen": 156132130, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.12615967, "step": 7283, "time_per_iteration": 3.9968514442443848 }, { "auxiliary_loss_clip": 0.06462453, "auxiliary_loss_mlp": 0.01270579, "balance_loss_clip": 0.0628887, "balance_loss_mlp": 0.01256489, "epoch": 0.43793777243348864, "flos": 23995004060160.0, "grad_norm": 1.669719076455912, "language_loss": 0.81133825, "learning_rate": 2.49393114246007e-06, "loss": 0.88866854, "num_input_tokens_seen": 156150820, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.14086914, "step": 7284, "time_per_iteration": 2.568631172180176 }, { "auxiliary_loss_clip": 0.06454891, "auxiliary_loss_mlp": 0.01272715, "balance_loss_clip": 0.0628711, "balance_loss_mlp": 0.01259817, "epoch": 0.4379978956861566, "flos": 18629909383680.0, "grad_norm": 1.5852673166061453, "language_loss": 0.80698895, "learning_rate": 2.493553735281787e-06, "loss": 0.88426507, "num_input_tokens_seen": 156170125, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.12902832, "step": 7285, "time_per_iteration": 2.5534749031066895 }, { "auxiliary_loss_clip": 0.06454839, "auxiliary_loss_mlp": 0.01269582, "balance_loss_clip": 0.06285839, "balance_loss_mlp": 0.01256242, "epoch": 0.43805801893882457, "flos": 21987901284480.0, "grad_norm": 2.033618844773789, "language_loss": 0.74710155, "learning_rate": 2.493176309387897e-06, "loss": 0.82434571, "num_input_tokens_seen": 156187320, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.13348389, "step": 7286, "time_per_iteration": 2.54091739654541 }, { "auxiliary_loss_clip": 0.06458624, "auxiliary_loss_mlp": 0.01269298, "balance_loss_clip": 0.06286545, "balance_loss_mlp": 0.01256769, "epoch": 0.43811814219149253, "flos": 26400114529920.0, "grad_norm": 1.436198752247577, "language_loss": 0.7384491, "learning_rate": 2.492798864792712e-06, "loss": 0.81572843, "num_input_tokens_seen": 156207455, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.12542725, "step": 7287, "time_per_iteration": 2.6387765407562256 }, { "auxiliary_loss_clip": 0.06462572, "auxiliary_loss_mlp": 0.01269852, "balance_loss_clip": 0.06291248, "balance_loss_mlp": 0.01256143, "epoch": 0.43817826544416055, "flos": 17499015953280.0, "grad_norm": 1.6508334404165046, "language_loss": 0.82515115, "learning_rate": 2.492421401510545e-06, "loss": 0.90247536, "num_input_tokens_seen": 156226560, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.137146, "step": 7288, "time_per_iteration": 2.5321147441864014 }, { "auxiliary_loss_clip": 0.06462636, "auxiliary_loss_mlp": 0.01270353, "balance_loss_clip": 0.06288241, "balance_loss_mlp": 0.01256787, "epoch": 0.4382383886968285, "flos": 21587629530240.0, "grad_norm": 1.3176275616593107, "language_loss": 0.84626079, "learning_rate": 2.4920439195557093e-06, "loss": 0.92359066, "num_input_tokens_seen": 156246740, "router_z_loss_clip": 1.74121094, "router_z_loss_mlp": 0.13568115, "step": 7289, "time_per_iteration": 2.5929064750671387 }, { "auxiliary_loss_clip": 0.0646162, "auxiliary_loss_mlp": 0.01270212, "balance_loss_clip": 0.062846, "balance_loss_mlp": 0.01256402, "epoch": 0.4382985119494965, "flos": 27930441173760.0, "grad_norm": 1.7344293429173903, "language_loss": 0.78769135, "learning_rate": 2.4916664189425183e-06, "loss": 0.86500967, "num_input_tokens_seen": 156266440, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.13824463, "step": 7290, "time_per_iteration": 2.694528579711914 }, { "auxiliary_loss_clip": 0.06456912, "auxiliary_loss_mlp": 0.01268391, "balance_loss_clip": 0.06288039, "balance_loss_mlp": 0.01255713, "epoch": 0.43835863520216445, "flos": 24943860495360.0, "grad_norm": 1.5735684285743012, "language_loss": 0.78370154, "learning_rate": 2.491288899685288e-06, "loss": 0.86095452, "num_input_tokens_seen": 156286900, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.12683105, "step": 7291, "time_per_iteration": 2.6295974254608154 }, { "auxiliary_loss_clip": 0.06460606, "auxiliary_loss_mlp": 0.01269805, "balance_loss_clip": 0.06289259, "balance_loss_mlp": 0.01256984, "epoch": 0.4384187584548324, "flos": 33518634145920.0, "grad_norm": 1.6025136489302574, "language_loss": 0.65004045, "learning_rate": 2.4909113617983325e-06, "loss": 0.72734451, "num_input_tokens_seen": 156307690, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.12817383, "step": 7292, "time_per_iteration": 5.475728273391724 }, { "auxiliary_loss_clip": 0.06459275, "auxiliary_loss_mlp": 0.01268362, "balance_loss_clip": 0.06286686, "balance_loss_mlp": 0.01255481, "epoch": 0.4384788817075004, "flos": 23957800047360.0, "grad_norm": 1.5536731531892238, "language_loss": 0.74395216, "learning_rate": 2.49053380529597e-06, "loss": 0.8212285, "num_input_tokens_seen": 156326620, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.12890625, "step": 7293, "time_per_iteration": 2.569016456604004 }, { "auxiliary_loss_clip": 0.06459588, "auxiliary_loss_mlp": 0.01267502, "balance_loss_clip": 0.06290083, "balance_loss_mlp": 0.01254044, "epoch": 0.43853900496016834, "flos": 19104463382400.0, "grad_norm": 1.6996653298456295, "language_loss": 0.7854926, "learning_rate": 2.490156230192516e-06, "loss": 0.86276352, "num_input_tokens_seen": 156345495, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.13458252, "step": 7294, "time_per_iteration": 2.559920310974121 }, { "auxiliary_loss_clip": 0.06467819, "auxiliary_loss_mlp": 0.01272368, "balance_loss_clip": 0.06295332, "balance_loss_mlp": 0.01259476, "epoch": 0.4385991282128363, "flos": 13230503660160.0, "grad_norm": 1.5497446192024709, "language_loss": 0.73219937, "learning_rate": 2.4897786365022883e-06, "loss": 0.80960125, "num_input_tokens_seen": 156363155, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.12896729, "step": 7295, "time_per_iteration": 2.5159168243408203 }, { "auxiliary_loss_clip": 0.06465231, "auxiliary_loss_mlp": 0.01269394, "balance_loss_clip": 0.06292509, "balance_loss_mlp": 0.01255005, "epoch": 0.4386592514655043, "flos": 14325199326720.0, "grad_norm": 1.5859675914945168, "language_loss": 0.75088787, "learning_rate": 2.4894010242396063e-06, "loss": 0.82823408, "num_input_tokens_seen": 156380940, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.1439209, "step": 7296, "time_per_iteration": 2.5581815242767334 }, { "auxiliary_loss_clip": 0.06460553, "auxiliary_loss_mlp": 0.01269344, "balance_loss_clip": 0.0629236, "balance_loss_mlp": 0.0125697, "epoch": 0.43871937471817224, "flos": 22791128123520.0, "grad_norm": 1.6590677657832766, "language_loss": 0.6984657, "learning_rate": 2.4890233934187873e-06, "loss": 0.7757647, "num_input_tokens_seen": 156400415, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.12359619, "step": 7297, "time_per_iteration": 2.56528902053833 }, { "auxiliary_loss_clip": 0.06466698, "auxiliary_loss_mlp": 0.0127003, "balance_loss_clip": 0.06296141, "balance_loss_mlp": 0.01257238, "epoch": 0.4387794979708402, "flos": 28079466860160.0, "grad_norm": 1.396526477669435, "language_loss": 0.70501512, "learning_rate": 2.4886457440541535e-06, "loss": 0.78238243, "num_input_tokens_seen": 156421120, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.12792969, "step": 7298, "time_per_iteration": 2.632455825805664 }, { "auxiliary_loss_clip": 0.06457566, "auxiliary_loss_mlp": 0.01267689, "balance_loss_clip": 0.06291536, "balance_loss_mlp": 0.01255184, "epoch": 0.43883962122350817, "flos": 26256665139840.0, "grad_norm": 1.5842455766165826, "language_loss": 0.72744429, "learning_rate": 2.4882680761600238e-06, "loss": 0.8046968, "num_input_tokens_seen": 156441535, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.125, "step": 7299, "time_per_iteration": 2.604163885116577 }, { "auxiliary_loss_clip": 0.06462833, "auxiliary_loss_mlp": 0.01275491, "balance_loss_clip": 0.06290692, "balance_loss_mlp": 0.0126134, "epoch": 0.43889974447617613, "flos": 25890662505600.0, "grad_norm": 1.6993638276541003, "language_loss": 0.76745999, "learning_rate": 2.487890389750719e-06, "loss": 0.84484327, "num_input_tokens_seen": 156462015, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.14147949, "step": 7300, "time_per_iteration": 2.6331610679626465 }, { "auxiliary_loss_clip": 0.06462483, "auxiliary_loss_mlp": 0.01268429, "balance_loss_clip": 0.06291662, "balance_loss_mlp": 0.01255411, "epoch": 0.43895986772884416, "flos": 25053711598080.0, "grad_norm": 1.6849370472137828, "language_loss": 0.71515995, "learning_rate": 2.4875126848405626e-06, "loss": 0.79246908, "num_input_tokens_seen": 156482165, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.13031006, "step": 7301, "time_per_iteration": 2.5980019569396973 }, { "auxiliary_loss_clip": 0.06464092, "auxiliary_loss_mlp": 0.0126993, "balance_loss_clip": 0.06290988, "balance_loss_mlp": 0.01255553, "epoch": 0.4390199909815121, "flos": 26001729492480.0, "grad_norm": 1.8463851029138603, "language_loss": 0.71334684, "learning_rate": 2.4871349614438757e-06, "loss": 0.79068702, "num_input_tokens_seen": 156503170, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.14367676, "step": 7302, "time_per_iteration": 2.619490146636963 }, { "auxiliary_loss_clip": 0.06462687, "auxiliary_loss_mlp": 0.01269042, "balance_loss_clip": 0.06294148, "balance_loss_mlp": 0.01255608, "epoch": 0.4390801142341801, "flos": 29029790741760.0, "grad_norm": 1.5740336840149245, "language_loss": 0.82716513, "learning_rate": 2.486757219574983e-06, "loss": 0.90448236, "num_input_tokens_seen": 156523005, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.13415527, "step": 7303, "time_per_iteration": 2.610145092010498 }, { "auxiliary_loss_clip": 0.06476458, "auxiliary_loss_mlp": 0.012697, "balance_loss_clip": 0.0630033, "balance_loss_mlp": 0.01255806, "epoch": 0.43914023748684805, "flos": 33447077159040.0, "grad_norm": 2.0578194975633197, "language_loss": 0.68873405, "learning_rate": 2.4863794592482067e-06, "loss": 0.76619565, "num_input_tokens_seen": 156544440, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.13916016, "step": 7304, "time_per_iteration": 2.678163528442383 }, { "auxiliary_loss_clip": 0.06456177, "auxiliary_loss_mlp": 0.01268664, "balance_loss_clip": 0.06291021, "balance_loss_mlp": 0.012562, "epoch": 0.439200360739516, "flos": 34540347306240.0, "grad_norm": 1.5422970492791603, "language_loss": 0.78509724, "learning_rate": 2.486001680477873e-06, "loss": 0.86234564, "num_input_tokens_seen": 156565410, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.12469482, "step": 7305, "time_per_iteration": 2.6630661487579346 }, { "auxiliary_loss_clip": 0.06460831, "auxiliary_loss_mlp": 0.01271004, "balance_loss_clip": 0.06291061, "balance_loss_mlp": 0.01258118, "epoch": 0.439260483992184, "flos": 21914247945600.0, "grad_norm": 2.2934440969853336, "language_loss": 0.69141161, "learning_rate": 2.485623883278308e-06, "loss": 0.76872998, "num_input_tokens_seen": 156584210, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.12872314, "step": 7306, "time_per_iteration": 2.550147771835327 }, { "auxiliary_loss_clip": 0.06463469, "auxiliary_loss_mlp": 0.01273539, "balance_loss_clip": 0.06292994, "balance_loss_mlp": 0.01260247, "epoch": 0.43932060724485195, "flos": 21002805158400.0, "grad_norm": 1.555724936788572, "language_loss": 0.63132548, "learning_rate": 2.4852460676638344e-06, "loss": 0.70869553, "num_input_tokens_seen": 156602730, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.13287354, "step": 7307, "time_per_iteration": 2.6013143062591553 }, { "auxiliary_loss_clip": 0.0646292, "auxiliary_loss_mlp": 0.01266755, "balance_loss_clip": 0.06288402, "balance_loss_mlp": 0.01254018, "epoch": 0.4393807304975199, "flos": 17752526081280.0, "grad_norm": 2.075012591667618, "language_loss": 0.72461236, "learning_rate": 2.4848682336487828e-06, "loss": 0.80190915, "num_input_tokens_seen": 156619405, "router_z_loss_clip": 1.74414062, "router_z_loss_mlp": 0.12750244, "step": 7308, "time_per_iteration": 2.6135149002075195 }, { "auxiliary_loss_clip": 0.06464021, "auxiliary_loss_mlp": 0.01270044, "balance_loss_clip": 0.06289901, "balance_loss_mlp": 0.01257014, "epoch": 0.4394408537501879, "flos": 22535102373120.0, "grad_norm": 1.6705795728667743, "language_loss": 0.77616942, "learning_rate": 2.4844903812474787e-06, "loss": 0.85351002, "num_input_tokens_seen": 156638165, "router_z_loss_clip": 1.74121094, "router_z_loss_mlp": 0.13037109, "step": 7309, "time_per_iteration": 2.5594327449798584 }, { "auxiliary_loss_clip": 0.06452562, "auxiliary_loss_mlp": 0.01268927, "balance_loss_clip": 0.06287144, "balance_loss_mlp": 0.01257436, "epoch": 0.43950097700285584, "flos": 23447383701120.0, "grad_norm": 1.9061313318189976, "language_loss": 0.71509099, "learning_rate": 2.484112510474251e-06, "loss": 0.79230589, "num_input_tokens_seen": 156658845, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.11486816, "step": 7310, "time_per_iteration": 2.564760446548462 }, { "auxiliary_loss_clip": 0.06463802, "auxiliary_loss_mlp": 0.01266582, "balance_loss_clip": 0.06290582, "balance_loss_mlp": 0.01253928, "epoch": 0.4395611002555238, "flos": 23186620195200.0, "grad_norm": 2.0210548650800284, "language_loss": 0.75664449, "learning_rate": 2.483734621343429e-06, "loss": 0.83394825, "num_input_tokens_seen": 156677275, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.12664795, "step": 7311, "time_per_iteration": 2.5426294803619385 }, { "auxiliary_loss_clip": 0.06464079, "auxiliary_loss_mlp": 0.01274631, "balance_loss_clip": 0.062897, "balance_loss_mlp": 0.01261965, "epoch": 0.43962122350819177, "flos": 22133908224000.0, "grad_norm": 1.9771406705191619, "language_loss": 0.82192701, "learning_rate": 2.483356713869341e-06, "loss": 0.89931417, "num_input_tokens_seen": 156695815, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.12677002, "step": 7312, "time_per_iteration": 2.544865131378174 }, { "auxiliary_loss_clip": 0.06456816, "auxiliary_loss_mlp": 0.0127028, "balance_loss_clip": 0.06288617, "balance_loss_mlp": 0.01258025, "epoch": 0.43968134676085974, "flos": 17426285009280.0, "grad_norm": 1.8193146615805982, "language_loss": 0.85871243, "learning_rate": 2.482978788066318e-06, "loss": 0.93598342, "num_input_tokens_seen": 156714385, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.12261963, "step": 7313, "time_per_iteration": 2.528317928314209 }, { "auxiliary_loss_clip": 0.06461664, "auxiliary_loss_mlp": 0.01274904, "balance_loss_clip": 0.06288252, "balance_loss_mlp": 0.01262178, "epoch": 0.43974147001352776, "flos": 18958582224000.0, "grad_norm": 1.6903141417863965, "language_loss": 0.68042487, "learning_rate": 2.4826008439486904e-06, "loss": 0.75779057, "num_input_tokens_seen": 156732615, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.12738037, "step": 7314, "time_per_iteration": 2.53798246383667 }, { "auxiliary_loss_clip": 0.06464618, "auxiliary_loss_mlp": 0.01270089, "balance_loss_clip": 0.06292127, "balance_loss_mlp": 0.01257441, "epoch": 0.4398015932661957, "flos": 18959588472960.0, "grad_norm": 1.747775093065162, "language_loss": 0.77004731, "learning_rate": 2.4822228815307915e-06, "loss": 0.84739441, "num_input_tokens_seen": 156750920, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.12646484, "step": 7315, "time_per_iteration": 2.534324884414673 }, { "auxiliary_loss_clip": 0.06457208, "auxiliary_loss_mlp": 0.01268483, "balance_loss_clip": 0.06287915, "balance_loss_mlp": 0.01256037, "epoch": 0.4398617165188637, "flos": 24205608097920.0, "grad_norm": 2.0092510784069875, "language_loss": 0.74309266, "learning_rate": 2.4818449008269523e-06, "loss": 0.82034957, "num_input_tokens_seen": 156768520, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.12451172, "step": 7316, "time_per_iteration": 2.5667474269866943 }, { "auxiliary_loss_clip": 0.06458241, "auxiliary_loss_mlp": 0.01267211, "balance_loss_clip": 0.06290409, "balance_loss_mlp": 0.01254611, "epoch": 0.43992183977153165, "flos": 22243214275200.0, "grad_norm": 2.640260072343669, "language_loss": 0.64606845, "learning_rate": 2.481466901851506e-06, "loss": 0.72332299, "num_input_tokens_seen": 156788700, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.12597656, "step": 7317, "time_per_iteration": 2.5669772624969482 }, { "auxiliary_loss_clip": 0.06464157, "auxiliary_loss_mlp": 0.01269373, "balance_loss_clip": 0.062917, "balance_loss_mlp": 0.01255872, "epoch": 0.4399819630241996, "flos": 18703395014400.0, "grad_norm": 1.6914176716190494, "language_loss": 0.80256307, "learning_rate": 2.4810888846187865e-06, "loss": 0.87989831, "num_input_tokens_seen": 156806470, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.13500977, "step": 7318, "time_per_iteration": 4.0290563106536865 }, { "auxiliary_loss_clip": 0.06471479, "auxiliary_loss_mlp": 0.01271294, "balance_loss_clip": 0.06296279, "balance_loss_mlp": 0.01258366, "epoch": 0.4400420862768676, "flos": 23886326914560.0, "grad_norm": 1.5744743163315638, "language_loss": 0.8023749, "learning_rate": 2.4807108491431283e-06, "loss": 0.87980258, "num_input_tokens_seen": 156825895, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.12939453, "step": 7319, "time_per_iteration": 2.586468458175659 }, { "auxiliary_loss_clip": 0.06463006, "auxiliary_loss_mlp": 0.01274571, "balance_loss_clip": 0.0629203, "balance_loss_mlp": 0.01260808, "epoch": 0.44010220952953555, "flos": 28045071959040.0, "grad_norm": 1.6194499951067411, "language_loss": 0.79878068, "learning_rate": 2.4803327954388667e-06, "loss": 0.87615645, "num_input_tokens_seen": 156845990, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.13751221, "step": 7320, "time_per_iteration": 2.600175142288208 }, { "auxiliary_loss_clip": 0.06465187, "auxiliary_loss_mlp": 0.01269754, "balance_loss_clip": 0.06294884, "balance_loss_mlp": 0.01257231, "epoch": 0.4401623327822035, "flos": 23775763052160.0, "grad_norm": 1.6840692891083238, "language_loss": 0.7012434, "learning_rate": 2.4799547235203376e-06, "loss": 0.77859271, "num_input_tokens_seen": 156866685, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.12512207, "step": 7321, "time_per_iteration": 2.577301025390625 }, { "auxiliary_loss_clip": 0.06353181, "auxiliary_loss_mlp": 0.01254813, "balance_loss_clip": 0.06279898, "balance_loss_mlp": 0.01251897, "epoch": 0.4402224560348715, "flos": 70797320081280.0, "grad_norm": 0.8645911862285979, "language_loss": 0.57035536, "learning_rate": 2.4795766334018763e-06, "loss": 0.64643526, "num_input_tokens_seen": 156923450, "router_z_loss_clip": 0.734375, "router_z_loss_mlp": 0.02912903, "step": 7322, "time_per_iteration": 4.683446168899536 }, { "auxiliary_loss_clip": 0.06460597, "auxiliary_loss_mlp": 0.01271852, "balance_loss_clip": 0.06291249, "balance_loss_mlp": 0.01259735, "epoch": 0.44028257928753944, "flos": 22898170114560.0, "grad_norm": 1.3815328575397998, "language_loss": 0.76359588, "learning_rate": 2.479198525097822e-06, "loss": 0.84092039, "num_input_tokens_seen": 156944795, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.12127686, "step": 7323, "time_per_iteration": 2.6149234771728516 }, { "auxiliary_loss_clip": 0.0646798, "auxiliary_loss_mlp": 0.01272048, "balance_loss_clip": 0.06296392, "balance_loss_mlp": 0.01258798, "epoch": 0.4403427025402074, "flos": 17901719475840.0, "grad_norm": 1.4711308059226962, "language_loss": 0.80766237, "learning_rate": 2.478820398622511e-06, "loss": 0.88506269, "num_input_tokens_seen": 156962755, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.13238525, "step": 7324, "time_per_iteration": 2.528538227081299 }, { "auxiliary_loss_clip": 0.06356443, "auxiliary_loss_mlp": 0.01256077, "balance_loss_clip": 0.06283489, "balance_loss_mlp": 0.0125316, "epoch": 0.4404028257928754, "flos": 69583717071360.0, "grad_norm": 0.6529391203644522, "language_loss": 0.54506809, "learning_rate": 2.478442253990283e-06, "loss": 0.62119329, "num_input_tokens_seen": 157028095, "router_z_loss_clip": 0.72753906, "router_z_loss_mlp": 0.02912903, "step": 7325, "time_per_iteration": 3.196458578109741 }, { "auxiliary_loss_clip": 0.0646069, "auxiliary_loss_mlp": 0.0126693, "balance_loss_clip": 0.06293003, "balance_loss_mlp": 0.01255069, "epoch": 0.44046294904554334, "flos": 20930074214400.0, "grad_norm": 1.9973477204709857, "language_loss": 0.69874102, "learning_rate": 2.4780640912154766e-06, "loss": 0.77601725, "num_input_tokens_seen": 157048365, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.11859131, "step": 7326, "time_per_iteration": 2.563612461090088 }, { "auxiliary_loss_clip": 0.0646105, "auxiliary_loss_mlp": 0.01269846, "balance_loss_clip": 0.06291805, "balance_loss_mlp": 0.01257549, "epoch": 0.44052307229821136, "flos": 23630301164160.0, "grad_norm": 1.417993560358505, "language_loss": 0.76458144, "learning_rate": 2.477685910312432e-06, "loss": 0.84189039, "num_input_tokens_seen": 157069130, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.12298584, "step": 7327, "time_per_iteration": 2.573803186416626 }, { "auxiliary_loss_clip": 0.0645914, "auxiliary_loss_mlp": 0.01268586, "balance_loss_clip": 0.06292491, "balance_loss_mlp": 0.01255866, "epoch": 0.4405831955508793, "flos": 17602536072960.0, "grad_norm": 1.7819936631854143, "language_loss": 0.84249234, "learning_rate": 2.4773077112954897e-06, "loss": 0.91976964, "num_input_tokens_seen": 157084940, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.12731934, "step": 7328, "time_per_iteration": 2.509162425994873 }, { "auxiliary_loss_clip": 0.06465641, "auxiliary_loss_mlp": 0.01270991, "balance_loss_clip": 0.06298836, "balance_loss_mlp": 0.01258951, "epoch": 0.4406433188035473, "flos": 21468596405760.0, "grad_norm": 2.133604386896505, "language_loss": 0.7783947, "learning_rate": 2.4769294941789908e-06, "loss": 0.85576093, "num_input_tokens_seen": 157102770, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.12042236, "step": 7329, "time_per_iteration": 2.573117733001709 }, { "auxiliary_loss_clip": 0.06467029, "auxiliary_loss_mlp": 0.01270945, "balance_loss_clip": 0.06294391, "balance_loss_mlp": 0.01257921, "epoch": 0.44070344205621526, "flos": 22680019209600.0, "grad_norm": 1.8087215514782928, "language_loss": 0.74102831, "learning_rate": 2.476551258977278e-06, "loss": 0.81840813, "num_input_tokens_seen": 157122035, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.13031006, "step": 7330, "time_per_iteration": 2.5963988304138184 }, { "auxiliary_loss_clip": 0.06466419, "auxiliary_loss_mlp": 0.01267061, "balance_loss_clip": 0.06296851, "balance_loss_mlp": 0.01254467, "epoch": 0.4407635653088832, "flos": 23448012606720.0, "grad_norm": 2.9716368897859886, "language_loss": 0.74367809, "learning_rate": 2.4761730057046936e-06, "loss": 0.82101291, "num_input_tokens_seen": 157142800, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.12615967, "step": 7331, "time_per_iteration": 4.0507283210754395 }, { "auxiliary_loss_clip": 0.06463122, "auxiliary_loss_mlp": 0.0126646, "balance_loss_clip": 0.06295723, "balance_loss_mlp": 0.01254182, "epoch": 0.4408236885615512, "flos": 24027596098560.0, "grad_norm": 1.4805698076413405, "language_loss": 0.76213861, "learning_rate": 2.475794734375581e-06, "loss": 0.83943439, "num_input_tokens_seen": 157163295, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.1227417, "step": 7332, "time_per_iteration": 3.9945220947265625 }, { "auxiliary_loss_clip": 0.06456083, "auxiliary_loss_mlp": 0.01272968, "balance_loss_clip": 0.06287281, "balance_loss_mlp": 0.01260302, "epoch": 0.44088381181421915, "flos": 12681667416960.0, "grad_norm": 1.7722305213438578, "language_loss": 0.73559284, "learning_rate": 2.475416445004285e-06, "loss": 0.81288338, "num_input_tokens_seen": 157180890, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.12664795, "step": 7333, "time_per_iteration": 2.5186572074890137 }, { "auxiliary_loss_clip": 0.06459935, "auxiliary_loss_mlp": 0.01267061, "balance_loss_clip": 0.06296422, "balance_loss_mlp": 0.01255533, "epoch": 0.4409439350668871, "flos": 24576474268800.0, "grad_norm": 2.000843019990094, "language_loss": 0.79816735, "learning_rate": 2.4750381376051493e-06, "loss": 0.87543726, "num_input_tokens_seen": 157200580, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.11523438, "step": 7334, "time_per_iteration": 2.586540460586548 }, { "auxiliary_loss_clip": 0.06478725, "auxiliary_loss_mlp": 0.01270457, "balance_loss_clip": 0.06298964, "balance_loss_mlp": 0.01255812, "epoch": 0.4410040583195551, "flos": 22674191351040.0, "grad_norm": 2.217694045041613, "language_loss": 0.75564009, "learning_rate": 2.47465981219252e-06, "loss": 0.83313191, "num_input_tokens_seen": 157218345, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.1463623, "step": 7335, "time_per_iteration": 2.5760743618011475 }, { "auxiliary_loss_clip": 0.06463206, "auxiliary_loss_mlp": 0.01270116, "balance_loss_clip": 0.06294128, "balance_loss_mlp": 0.01256442, "epoch": 0.44106418157222305, "flos": 10857062833920.0, "grad_norm": 2.039317242184241, "language_loss": 0.72893953, "learning_rate": 2.4742814687807423e-06, "loss": 0.80627275, "num_input_tokens_seen": 157234395, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.13684082, "step": 7336, "time_per_iteration": 2.5314841270446777 }, { "auxiliary_loss_clip": 0.06472537, "auxiliary_loss_mlp": 0.01276341, "balance_loss_clip": 0.06298853, "balance_loss_mlp": 0.01263055, "epoch": 0.441124304824891, "flos": 21733301053440.0, "grad_norm": 2.1322889781864878, "language_loss": 0.64230973, "learning_rate": 2.473903107384165e-06, "loss": 0.71979845, "num_input_tokens_seen": 157254805, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.13293457, "step": 7337, "time_per_iteration": 2.7296650409698486 }, { "auxiliary_loss_clip": 0.06350781, "auxiliary_loss_mlp": 0.01263986, "balance_loss_clip": 0.06278182, "balance_loss_mlp": 0.01261535, "epoch": 0.441184428077559, "flos": 63241702041600.0, "grad_norm": 0.7286106489232543, "language_loss": 0.52675647, "learning_rate": 2.473524728017134e-06, "loss": 0.60290414, "num_input_tokens_seen": 157317870, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 0.02449036, "step": 7338, "time_per_iteration": 3.2544093132019043 }, { "auxiliary_loss_clip": 0.0647488, "auxiliary_loss_mlp": 0.01270298, "balance_loss_clip": 0.06298862, "balance_loss_mlp": 0.01256499, "epoch": 0.44124455133022694, "flos": 21184213248000.0, "grad_norm": 2.0130973956082245, "language_loss": 0.71340346, "learning_rate": 2.473146330693997e-06, "loss": 0.79085523, "num_input_tokens_seen": 157336505, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.13800049, "step": 7339, "time_per_iteration": 2.5667948722839355 }, { "auxiliary_loss_clip": 0.06458029, "auxiliary_loss_mlp": 0.01270103, "balance_loss_clip": 0.06295469, "balance_loss_mlp": 0.01257705, "epoch": 0.4413046745828949, "flos": 17463740584320.0, "grad_norm": 1.6082553054073039, "language_loss": 0.69794536, "learning_rate": 2.472767915429105e-06, "loss": 0.77522671, "num_input_tokens_seen": 157354995, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.12408447, "step": 7340, "time_per_iteration": 2.5326082706451416 }, { "auxiliary_loss_clip": 0.06348611, "auxiliary_loss_mlp": 0.01264617, "balance_loss_clip": 0.06276256, "balance_loss_mlp": 0.01262147, "epoch": 0.4413647978355629, "flos": 61602251783040.0, "grad_norm": 0.8847608649727474, "language_loss": 0.63981736, "learning_rate": 2.4723894822368054e-06, "loss": 0.71594965, "num_input_tokens_seen": 157404260, "router_z_loss_clip": 0.72412109, "router_z_loss_mlp": 0.02468872, "step": 7341, "time_per_iteration": 3.0070934295654297 }, { "auxiliary_loss_clip": 0.06463262, "auxiliary_loss_mlp": 0.012725, "balance_loss_clip": 0.06293936, "balance_loss_mlp": 0.01259649, "epoch": 0.4414249210882309, "flos": 27534404050560.0, "grad_norm": 1.8988849410029567, "language_loss": 0.74154091, "learning_rate": 2.47201103113145e-06, "loss": 0.8188985, "num_input_tokens_seen": 157423045, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.128479, "step": 7342, "time_per_iteration": 2.6258363723754883 }, { "auxiliary_loss_clip": 0.06458981, "auxiliary_loss_mlp": 0.01277596, "balance_loss_clip": 0.0629165, "balance_loss_mlp": 0.01264238, "epoch": 0.44148504434089886, "flos": 23520785477760.0, "grad_norm": 1.8662216817613582, "language_loss": 0.8025279, "learning_rate": 2.4716325621273886e-06, "loss": 0.87989366, "num_input_tokens_seen": 157441815, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.13342285, "step": 7343, "time_per_iteration": 2.614619255065918 }, { "auxiliary_loss_clip": 0.06460863, "auxiliary_loss_mlp": 0.01274608, "balance_loss_clip": 0.06293471, "balance_loss_mlp": 0.01261989, "epoch": 0.4415451675935668, "flos": 21587126405760.0, "grad_norm": 1.5169331426801949, "language_loss": 0.77156174, "learning_rate": 2.4712540752389725e-06, "loss": 0.84891641, "num_input_tokens_seen": 157460470, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.12615967, "step": 7344, "time_per_iteration": 2.576876640319824 }, { "auxiliary_loss_clip": 0.06347707, "auxiliary_loss_mlp": 0.01263078, "balance_loss_clip": 0.06275064, "balance_loss_mlp": 0.01260496, "epoch": 0.4416052908462348, "flos": 59023825142400.0, "grad_norm": 0.7847426276979128, "language_loss": 0.63772821, "learning_rate": 2.470875570480556e-06, "loss": 0.71383601, "num_input_tokens_seen": 157512655, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 0.02583313, "step": 7345, "time_per_iteration": 3.01469349861145 }, { "auxiliary_loss_clip": 0.06461722, "auxiliary_loss_mlp": 0.01270384, "balance_loss_clip": 0.06291747, "balance_loss_mlp": 0.01257187, "epoch": 0.44166541409890275, "flos": 26364545671680.0, "grad_norm": 1.5634103128170533, "language_loss": 0.86007226, "learning_rate": 2.470497047866489e-06, "loss": 0.93739337, "num_input_tokens_seen": 157533700, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.13195801, "step": 7346, "time_per_iteration": 2.615394353866577 }, { "auxiliary_loss_clip": 0.064603, "auxiliary_loss_mlp": 0.01267643, "balance_loss_clip": 0.06292051, "balance_loss_mlp": 0.01254232, "epoch": 0.4417255373515707, "flos": 20198739778560.0, "grad_norm": 1.7274219816881706, "language_loss": 0.8038255, "learning_rate": 2.470118507411128e-06, "loss": 0.88110489, "num_input_tokens_seen": 157551105, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.1340332, "step": 7347, "time_per_iteration": 2.665398597717285 }, { "auxiliary_loss_clip": 0.06463328, "auxiliary_loss_mlp": 0.01268004, "balance_loss_clip": 0.06293922, "balance_loss_mlp": 0.01254492, "epoch": 0.4417856606042387, "flos": 17892537454080.0, "grad_norm": 1.7234540736821817, "language_loss": 0.83406019, "learning_rate": 2.4697399491288263e-06, "loss": 0.9113735, "num_input_tokens_seen": 157568285, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.13513184, "step": 7348, "time_per_iteration": 2.539236307144165 }, { "auxiliary_loss_clip": 0.06464262, "auxiliary_loss_mlp": 0.01269818, "balance_loss_clip": 0.06293087, "balance_loss_mlp": 0.01257188, "epoch": 0.44184578385690665, "flos": 27971376693120.0, "grad_norm": 1.5539101753883369, "language_loss": 0.70667821, "learning_rate": 2.469361373033938e-06, "loss": 0.78401899, "num_input_tokens_seen": 157590405, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.12646484, "step": 7349, "time_per_iteration": 2.6142947673797607 }, { "auxiliary_loss_clip": 0.06456229, "auxiliary_loss_mlp": 0.01267929, "balance_loss_clip": 0.06286261, "balance_loss_mlp": 0.01254757, "epoch": 0.4419059071095746, "flos": 23374652757120.0, "grad_norm": 3.8023677607946484, "language_loss": 0.75042045, "learning_rate": 2.468982779140819e-06, "loss": 0.82766199, "num_input_tokens_seen": 157607420, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.13165283, "step": 7350, "time_per_iteration": 2.561997175216675 }, { "auxiliary_loss_clip": 0.06459715, "auxiliary_loss_mlp": 0.01265964, "balance_loss_clip": 0.06291888, "balance_loss_mlp": 0.01253578, "epoch": 0.4419660303622426, "flos": 15017443032960.0, "grad_norm": 2.8224960379446955, "language_loss": 0.81352901, "learning_rate": 2.468604167463827e-06, "loss": 0.89078587, "num_input_tokens_seen": 157624990, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.12384033, "step": 7351, "time_per_iteration": 2.542774200439453 }, { "auxiliary_loss_clip": 0.06450202, "auxiliary_loss_mlp": 0.01268014, "balance_loss_clip": 0.06287476, "balance_loss_mlp": 0.01256891, "epoch": 0.44202615361491054, "flos": 25378359442560.0, "grad_norm": 2.8765857869255664, "language_loss": 0.73633301, "learning_rate": 2.4682255380173176e-06, "loss": 0.81351507, "num_input_tokens_seen": 157645300, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.11120605, "step": 7352, "time_per_iteration": 2.615483522415161 }, { "auxiliary_loss_clip": 0.06458723, "auxiliary_loss_mlp": 0.01268061, "balance_loss_clip": 0.06290971, "balance_loss_mlp": 0.01254989, "epoch": 0.4420862768675785, "flos": 24688044380160.0, "grad_norm": 1.864746118161397, "language_loss": 0.87486136, "learning_rate": 2.467846890815649e-06, "loss": 0.95212919, "num_input_tokens_seen": 157664060, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.13079834, "step": 7353, "time_per_iteration": 2.604477643966675 }, { "auxiliary_loss_clip": 0.0645524, "auxiliary_loss_mlp": 0.01269727, "balance_loss_clip": 0.0628662, "balance_loss_mlp": 0.01257884, "epoch": 0.44214640012024653, "flos": 19533134471040.0, "grad_norm": 1.909605567488904, "language_loss": 0.75721115, "learning_rate": 2.4674682258731795e-06, "loss": 0.83446085, "num_input_tokens_seen": 157680905, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.11834717, "step": 7354, "time_per_iteration": 2.5348317623138428 }, { "auxiliary_loss_clip": 0.06451537, "auxiliary_loss_mlp": 0.01267461, "balance_loss_clip": 0.06287886, "balance_loss_mlp": 0.0125644, "epoch": 0.4422065233729145, "flos": 47568143940480.0, "grad_norm": 1.6674787357313994, "language_loss": 0.64528733, "learning_rate": 2.467089543204268e-06, "loss": 0.72247732, "num_input_tokens_seen": 157701980, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.11010742, "step": 7355, "time_per_iteration": 2.8662078380584717 }, { "auxiliary_loss_clip": 0.06463006, "auxiliary_loss_mlp": 0.01270383, "balance_loss_clip": 0.06290437, "balance_loss_mlp": 0.0125811, "epoch": 0.44226664662558246, "flos": 19287045429120.0, "grad_norm": 1.9119800863231957, "language_loss": 0.78564602, "learning_rate": 2.466710842823274e-06, "loss": 0.86297995, "num_input_tokens_seen": 157720555, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.12286377, "step": 7356, "time_per_iteration": 2.5765929222106934 }, { "auxiliary_loss_clip": 0.06461523, "auxiliary_loss_mlp": 0.01267162, "balance_loss_clip": 0.06290561, "balance_loss_mlp": 0.01254335, "epoch": 0.4423267698782504, "flos": 17827604939520.0, "grad_norm": 1.5338191795258354, "language_loss": 0.77366477, "learning_rate": 2.4663321247445577e-06, "loss": 0.85095167, "num_input_tokens_seen": 157739160, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.12835693, "step": 7357, "time_per_iteration": 2.5288383960723877 }, { "auxiliary_loss_clip": 0.06460988, "auxiliary_loss_mlp": 0.01270092, "balance_loss_clip": 0.06293602, "balance_loss_mlp": 0.01257039, "epoch": 0.4423868931309184, "flos": 29211953518080.0, "grad_norm": 1.915447339363803, "language_loss": 0.73496294, "learning_rate": 2.465953388982481e-06, "loss": 0.81227374, "num_input_tokens_seen": 157760020, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.13049316, "step": 7358, "time_per_iteration": 4.094668865203857 }, { "auxiliary_loss_clip": 0.06461539, "auxiliary_loss_mlp": 0.01269292, "balance_loss_clip": 0.06291809, "balance_loss_mlp": 0.0125727, "epoch": 0.44244701638358636, "flos": 29720399293440.0, "grad_norm": 1.5928934554090632, "language_loss": 0.75599557, "learning_rate": 2.465574635551405e-06, "loss": 0.83330393, "num_input_tokens_seen": 157780435, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.12030029, "step": 7359, "time_per_iteration": 2.6321182250976562 }, { "auxiliary_loss_clip": 0.06462443, "auxiliary_loss_mlp": 0.01271326, "balance_loss_clip": 0.06293698, "balance_loss_mlp": 0.01258941, "epoch": 0.4425071396362543, "flos": 22936715792640.0, "grad_norm": 1.6827011104304854, "language_loss": 0.70346475, "learning_rate": 2.4651958644656923e-06, "loss": 0.78080249, "num_input_tokens_seen": 157799420, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.1237793, "step": 7360, "time_per_iteration": 2.5670740604400635 }, { "auxiliary_loss_clip": 0.06463256, "auxiliary_loss_mlp": 0.01272409, "balance_loss_clip": 0.0629327, "balance_loss_mlp": 0.01260101, "epoch": 0.4425672628889223, "flos": 19798509951360.0, "grad_norm": 2.242906445185961, "language_loss": 0.69712979, "learning_rate": 2.4648170757397053e-06, "loss": 0.77448642, "num_input_tokens_seen": 157817025, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.12322998, "step": 7361, "time_per_iteration": 2.5406136512756348 }, { "auxiliary_loss_clip": 0.06465958, "auxiliary_loss_mlp": 0.01270014, "balance_loss_clip": 0.06296612, "balance_loss_mlp": 0.01257503, "epoch": 0.44262738614159025, "flos": 13667266667520.0, "grad_norm": 2.1831143025541278, "language_loss": 0.82846236, "learning_rate": 2.464438269387809e-06, "loss": 0.90582204, "num_input_tokens_seen": 157834345, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.12506104, "step": 7362, "time_per_iteration": 3.979768991470337 }, { "auxiliary_loss_clip": 0.06468003, "auxiliary_loss_mlp": 0.01274036, "balance_loss_clip": 0.06294478, "balance_loss_mlp": 0.01260732, "epoch": 0.4426875093942582, "flos": 14215474005120.0, "grad_norm": 1.8024128077548216, "language_loss": 0.75197196, "learning_rate": 2.464059445424366e-06, "loss": 0.82939231, "num_input_tokens_seen": 157852290, "router_z_loss_clip": 1.73730469, "router_z_loss_mlp": 0.13311768, "step": 7363, "time_per_iteration": 2.568667411804199 }, { "auxiliary_loss_clip": 0.06353831, "auxiliary_loss_mlp": 0.01267518, "balance_loss_clip": 0.06280844, "balance_loss_mlp": 0.0126398, "epoch": 0.4427476326469262, "flos": 70140100181760.0, "grad_norm": 0.6656507765058317, "language_loss": 0.55573195, "learning_rate": 2.463680603863743e-06, "loss": 0.63194543, "num_input_tokens_seen": 157923060, "router_z_loss_clip": 0.73095703, "router_z_loss_mlp": 0.03536987, "step": 7364, "time_per_iteration": 3.319045066833496 }, { "auxiliary_loss_clip": 0.06461582, "auxiliary_loss_mlp": 0.01267506, "balance_loss_clip": 0.06296103, "balance_loss_mlp": 0.01255943, "epoch": 0.44280775589959415, "flos": 25451761219200.0, "grad_norm": 1.612477886522757, "language_loss": 0.74912328, "learning_rate": 2.463301744720305e-06, "loss": 0.82641417, "num_input_tokens_seen": 157944110, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.11560059, "step": 7365, "time_per_iteration": 2.6876583099365234 }, { "auxiliary_loss_clip": 0.06459251, "auxiliary_loss_mlp": 0.01267095, "balance_loss_clip": 0.06291935, "balance_loss_mlp": 0.01254554, "epoch": 0.4428678791522621, "flos": 22863900994560.0, "grad_norm": 1.625256733142878, "language_loss": 0.74784207, "learning_rate": 2.4629228680084184e-06, "loss": 0.82510543, "num_input_tokens_seen": 157964295, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.12536621, "step": 7366, "time_per_iteration": 2.5637450218200684 }, { "auxiliary_loss_clip": 0.06462921, "auxiliary_loss_mlp": 0.01268857, "balance_loss_clip": 0.06296632, "balance_loss_mlp": 0.01256114, "epoch": 0.44292800240493013, "flos": 25819608643200.0, "grad_norm": 1.9405189946107573, "language_loss": 0.73869151, "learning_rate": 2.46254397374245e-06, "loss": 0.81600928, "num_input_tokens_seen": 157983970, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.12756348, "step": 7367, "time_per_iteration": 2.6043832302093506 }, { "auxiliary_loss_clip": 0.06463559, "auxiliary_loss_mlp": 0.01271076, "balance_loss_clip": 0.06297047, "balance_loss_mlp": 0.01258589, "epoch": 0.4429881256575981, "flos": 32425238217600.0, "grad_norm": 1.536750710098706, "language_loss": 0.74399787, "learning_rate": 2.4621650619367677e-06, "loss": 0.82134426, "num_input_tokens_seen": 158006515, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.12506104, "step": 7368, "time_per_iteration": 2.6813230514526367 }, { "auxiliary_loss_clip": 0.06461294, "auxiliary_loss_mlp": 0.01267243, "balance_loss_clip": 0.06296018, "balance_loss_mlp": 0.01255113, "epoch": 0.44304824891026606, "flos": 22170231768960.0, "grad_norm": 2.43721619400034, "language_loss": 0.79923272, "learning_rate": 2.4617861326057403e-06, "loss": 0.87651813, "num_input_tokens_seen": 158025565, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.12133789, "step": 7369, "time_per_iteration": 2.5677130222320557 }, { "auxiliary_loss_clip": 0.06462022, "auxiliary_loss_mlp": 0.01267723, "balance_loss_clip": 0.06297603, "balance_loss_mlp": 0.01255736, "epoch": 0.443108372162934, "flos": 25345725477120.0, "grad_norm": 2.145246427258888, "language_loss": 0.72582555, "learning_rate": 2.461407185763737e-06, "loss": 0.803123, "num_input_tokens_seen": 158045620, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.11987305, "step": 7370, "time_per_iteration": 2.5993757247924805 }, { "auxiliary_loss_clip": 0.06454723, "auxiliary_loss_mlp": 0.01268077, "balance_loss_clip": 0.06289418, "balance_loss_mlp": 0.01255435, "epoch": 0.443168495415602, "flos": 23337616452480.0, "grad_norm": 1.8006887903958302, "language_loss": 0.70331502, "learning_rate": 2.461028221425126e-06, "loss": 0.78054297, "num_input_tokens_seen": 158063505, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.12646484, "step": 7371, "time_per_iteration": 3.97057843208313 }, { "auxiliary_loss_clip": 0.06459865, "auxiliary_loss_mlp": 0.01266243, "balance_loss_clip": 0.0629586, "balance_loss_mlp": 0.01255157, "epoch": 0.44322861866826996, "flos": 21877924400640.0, "grad_norm": 2.1732264711576477, "language_loss": 0.68410164, "learning_rate": 2.4606492396042786e-06, "loss": 0.76136267, "num_input_tokens_seen": 158080335, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.11083984, "step": 7372, "time_per_iteration": 4.0186851024627686 }, { "auxiliary_loss_clip": 0.06462931, "auxiliary_loss_mlp": 0.01270384, "balance_loss_clip": 0.06292968, "balance_loss_mlp": 0.01257283, "epoch": 0.4432887419209379, "flos": 20090649611520.0, "grad_norm": 1.7249209673267207, "language_loss": 0.83806944, "learning_rate": 2.4602702403155664e-06, "loss": 0.91540259, "num_input_tokens_seen": 158098955, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.13104248, "step": 7373, "time_per_iteration": 2.572145700454712 }, { "auxiliary_loss_clip": 0.06363669, "auxiliary_loss_mlp": 0.01256166, "balance_loss_clip": 0.06291129, "balance_loss_mlp": 0.0125314, "epoch": 0.4433488651736059, "flos": 70056593988480.0, "grad_norm": 0.7473305708833623, "language_loss": 0.55192935, "learning_rate": 2.4598912235733604e-06, "loss": 0.62812769, "num_input_tokens_seen": 158164110, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 0.03022766, "step": 7374, "time_per_iteration": 3.251563787460327 }, { "auxiliary_loss_clip": 0.06459352, "auxiliary_loss_mlp": 0.01270928, "balance_loss_clip": 0.06294806, "balance_loss_mlp": 0.0125766, "epoch": 0.44340898842627385, "flos": 16286838462720.0, "grad_norm": 2.7937566427833675, "language_loss": 0.83949447, "learning_rate": 2.4595121893920327e-06, "loss": 0.91679728, "num_input_tokens_seen": 158179850, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.13275146, "step": 7375, "time_per_iteration": 2.5261220932006836 }, { "auxiliary_loss_clip": 0.06458155, "auxiliary_loss_mlp": 0.01268073, "balance_loss_clip": 0.06289093, "balance_loss_mlp": 0.01256063, "epoch": 0.4434691116789418, "flos": 16616601406080.0, "grad_norm": 1.8557600897898787, "language_loss": 0.84222007, "learning_rate": 2.4591331377859578e-06, "loss": 0.91948241, "num_input_tokens_seen": 158196590, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.12011719, "step": 7376, "time_per_iteration": 2.5182011127471924 }, { "auxiliary_loss_clip": 0.06456442, "auxiliary_loss_mlp": 0.01268124, "balance_loss_clip": 0.06290729, "balance_loss_mlp": 0.0125634, "epoch": 0.4435292349316098, "flos": 19069397648640.0, "grad_norm": 1.7736452322238405, "language_loss": 0.77617967, "learning_rate": 2.4587540687695077e-06, "loss": 0.85342538, "num_input_tokens_seen": 158216355, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.11785889, "step": 7377, "time_per_iteration": 2.549605369567871 }, { "auxiliary_loss_clip": 0.06452405, "auxiliary_loss_mlp": 0.01269651, "balance_loss_clip": 0.06289755, "balance_loss_mlp": 0.01257605, "epoch": 0.44358935818427775, "flos": 21257656951680.0, "grad_norm": 1.8753708595953806, "language_loss": 0.76572227, "learning_rate": 2.458374982357057e-06, "loss": 0.84294289, "num_input_tokens_seen": 158235825, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.12042236, "step": 7378, "time_per_iteration": 2.578648090362549 }, { "auxiliary_loss_clip": 0.06451135, "auxiliary_loss_mlp": 0.01267381, "balance_loss_clip": 0.06284474, "balance_loss_mlp": 0.01254738, "epoch": 0.4436494814369457, "flos": 12500259327360.0, "grad_norm": 2.008075802134182, "language_loss": 0.69630599, "learning_rate": 2.457995878562982e-06, "loss": 0.77349114, "num_input_tokens_seen": 158254230, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.12640381, "step": 7379, "time_per_iteration": 2.531872034072876 }, { "auxiliary_loss_clip": 0.06455941, "auxiliary_loss_mlp": 0.01268935, "balance_loss_clip": 0.06287132, "balance_loss_mlp": 0.01256561, "epoch": 0.44370960468961373, "flos": 23666666636160.0, "grad_norm": 1.785284824765058, "language_loss": 0.7357223, "learning_rate": 2.457616757401656e-06, "loss": 0.81297106, "num_input_tokens_seen": 158273400, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.12371826, "step": 7380, "time_per_iteration": 2.5929338932037354 }, { "auxiliary_loss_clip": 0.0645717, "auxiliary_loss_mlp": 0.01269069, "balance_loss_clip": 0.06289466, "balance_loss_mlp": 0.01256874, "epoch": 0.4437697279422817, "flos": 32425196290560.0, "grad_norm": 1.5317755049023805, "language_loss": 0.65160948, "learning_rate": 2.457237618887458e-06, "loss": 0.72887188, "num_input_tokens_seen": 158296840, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.12200928, "step": 7381, "time_per_iteration": 2.660933256149292 }, { "auxiliary_loss_clip": 0.06459832, "auxiliary_loss_mlp": 0.0127437, "balance_loss_clip": 0.0629027, "balance_loss_mlp": 0.01261639, "epoch": 0.44382985119494966, "flos": 18118570642560.0, "grad_norm": 1.9876936744041345, "language_loss": 0.80729735, "learning_rate": 2.456858463034763e-06, "loss": 0.88463938, "num_input_tokens_seen": 158314935, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.1272583, "step": 7382, "time_per_iteration": 2.557023286819458 }, { "auxiliary_loss_clip": 0.06453096, "auxiliary_loss_mlp": 0.01269942, "balance_loss_clip": 0.06286423, "balance_loss_mlp": 0.01257598, "epoch": 0.44388997444761763, "flos": 30782083651200.0, "grad_norm": 2.243792442066407, "language_loss": 0.65569615, "learning_rate": 2.456479289857949e-06, "loss": 0.73292649, "num_input_tokens_seen": 158334620, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.12335205, "step": 7383, "time_per_iteration": 2.68583607673645 }, { "auxiliary_loss_clip": 0.0646076, "auxiliary_loss_mlp": 0.01270999, "balance_loss_clip": 0.06289802, "balance_loss_mlp": 0.01257147, "epoch": 0.4439500977002856, "flos": 20345333696640.0, "grad_norm": 3.6643470438695203, "language_loss": 0.76751971, "learning_rate": 2.4561000993713953e-06, "loss": 0.84483731, "num_input_tokens_seen": 158350550, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.1385498, "step": 7384, "time_per_iteration": 2.662708282470703 }, { "auxiliary_loss_clip": 0.06461388, "auxiliary_loss_mlp": 0.01267657, "balance_loss_clip": 0.06291178, "balance_loss_mlp": 0.01254342, "epoch": 0.44401022095295356, "flos": 20376667923840.0, "grad_norm": 1.6174389792454458, "language_loss": 0.81583691, "learning_rate": 2.4557208915894796e-06, "loss": 0.89312738, "num_input_tokens_seen": 158369555, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.13311768, "step": 7385, "time_per_iteration": 2.56575083732605 }, { "auxiliary_loss_clip": 0.06456009, "auxiliary_loss_mlp": 0.01273578, "balance_loss_clip": 0.06288631, "balance_loss_mlp": 0.01259219, "epoch": 0.4440703442056215, "flos": 20236950040320.0, "grad_norm": 1.585381165086698, "language_loss": 0.81994462, "learning_rate": 2.455341666526582e-06, "loss": 0.89724052, "num_input_tokens_seen": 158388045, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.14355469, "step": 7386, "time_per_iteration": 2.5568909645080566 }, { "auxiliary_loss_clip": 0.06464988, "auxiliary_loss_mlp": 0.01268799, "balance_loss_clip": 0.06289101, "balance_loss_mlp": 0.01254875, "epoch": 0.4441304674582895, "flos": 39504163979520.0, "grad_norm": 1.8086747832728238, "language_loss": 0.70570213, "learning_rate": 2.4549624241970832e-06, "loss": 0.78303999, "num_input_tokens_seen": 158410115, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.13922119, "step": 7387, "time_per_iteration": 2.6981701850891113 }, { "auxiliary_loss_clip": 0.06455378, "auxiliary_loss_mlp": 0.01272048, "balance_loss_clip": 0.06286313, "balance_loss_mlp": 0.01258566, "epoch": 0.44419059071095746, "flos": 14834902913280.0, "grad_norm": 1.8849523378757664, "language_loss": 0.7217291, "learning_rate": 2.4545831646153628e-06, "loss": 0.79900336, "num_input_tokens_seen": 158427765, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.13476562, "step": 7388, "time_per_iteration": 2.537247896194458 }, { "auxiliary_loss_clip": 0.06465232, "auxiliary_loss_mlp": 0.01268636, "balance_loss_clip": 0.06292976, "balance_loss_mlp": 0.01255344, "epoch": 0.4442507139636254, "flos": 22644408424320.0, "grad_norm": 1.6320754541850602, "language_loss": 0.69483042, "learning_rate": 2.4542038877958044e-06, "loss": 0.77216911, "num_input_tokens_seen": 158446375, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.13299561, "step": 7389, "time_per_iteration": 2.6185965538024902 }, { "auxiliary_loss_clip": 0.06458879, "auxiliary_loss_mlp": 0.01271144, "balance_loss_clip": 0.0628853, "balance_loss_mlp": 0.01258001, "epoch": 0.4443108372162934, "flos": 38299994553600.0, "grad_norm": 1.6548250695868905, "language_loss": 0.74987054, "learning_rate": 2.453824593752788e-06, "loss": 0.82717073, "num_input_tokens_seen": 158467260, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.13153076, "step": 7390, "time_per_iteration": 2.7215683460235596 }, { "auxiliary_loss_clip": 0.06455828, "auxiliary_loss_mlp": 0.01270138, "balance_loss_clip": 0.06289116, "balance_loss_mlp": 0.01258104, "epoch": 0.44437096046896135, "flos": 17754790141440.0, "grad_norm": 2.471298218829116, "language_loss": 0.82248861, "learning_rate": 2.4534452825006988e-06, "loss": 0.89974827, "num_input_tokens_seen": 158486720, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.12036133, "step": 7391, "time_per_iteration": 2.560443878173828 }, { "auxiliary_loss_clip": 0.0645759, "auxiliary_loss_mlp": 0.01267775, "balance_loss_clip": 0.06290455, "balance_loss_mlp": 0.01254739, "epoch": 0.4444310837216293, "flos": 13736936937600.0, "grad_norm": 1.7229350567245634, "language_loss": 0.73902106, "learning_rate": 2.4530659540539185e-06, "loss": 0.81627476, "num_input_tokens_seen": 158502530, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.13037109, "step": 7392, "time_per_iteration": 2.532090425491333 }, { "auxiliary_loss_clip": 0.06451969, "auxiliary_loss_mlp": 0.01268131, "balance_loss_clip": 0.06284712, "balance_loss_mlp": 0.0125655, "epoch": 0.44449120697429734, "flos": 25017346126080.0, "grad_norm": 1.4242991167862142, "language_loss": 0.7974453, "learning_rate": 2.4526866084268313e-06, "loss": 0.87464631, "num_input_tokens_seen": 158522715, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.11590576, "step": 7393, "time_per_iteration": 2.585292339324951 }, { "auxiliary_loss_clip": 0.06464405, "auxiliary_loss_mlp": 0.01267509, "balance_loss_clip": 0.06291054, "balance_loss_mlp": 0.01253836, "epoch": 0.4445513302269653, "flos": 32680006156800.0, "grad_norm": 2.0154684627012585, "language_loss": 0.80980378, "learning_rate": 2.4523072456338226e-06, "loss": 0.88712293, "num_input_tokens_seen": 158543615, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.13671875, "step": 7394, "time_per_iteration": 2.6646132469177246 }, { "auxiliary_loss_clip": 0.06450868, "auxiliary_loss_mlp": 0.01268907, "balance_loss_clip": 0.06284943, "balance_loss_mlp": 0.01257112, "epoch": 0.44461145347963327, "flos": 11660583162240.0, "grad_norm": 1.866796827365268, "language_loss": 0.79882294, "learning_rate": 2.4519278656892785e-06, "loss": 0.87602073, "num_input_tokens_seen": 158560330, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.11791992, "step": 7395, "time_per_iteration": 2.639667510986328 }, { "auxiliary_loss_clip": 0.06459111, "auxiliary_loss_mlp": 0.01269376, "balance_loss_clip": 0.06291431, "balance_loss_mlp": 0.01256847, "epoch": 0.44467157673230123, "flos": 20893079836800.0, "grad_norm": 1.9068354065996995, "language_loss": 0.69057238, "learning_rate": 2.451548468607584e-06, "loss": 0.76785725, "num_input_tokens_seen": 158579735, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.12524414, "step": 7396, "time_per_iteration": 2.5676708221435547 }, { "auxiliary_loss_clip": 0.06461109, "auxiliary_loss_mlp": 0.01269592, "balance_loss_clip": 0.06289391, "balance_loss_mlp": 0.01256503, "epoch": 0.4447316999849692, "flos": 18551140945920.0, "grad_norm": 1.6090036120065099, "language_loss": 0.81073809, "learning_rate": 2.451169054403126e-06, "loss": 0.88804513, "num_input_tokens_seen": 158597075, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.13104248, "step": 7397, "time_per_iteration": 3.95893931388855 }, { "auxiliary_loss_clip": 0.0645958, "auxiliary_loss_mlp": 0.01266359, "balance_loss_clip": 0.0629225, "balance_loss_mlp": 0.01254545, "epoch": 0.44479182323763716, "flos": 23775846906240.0, "grad_norm": 1.8779624103290817, "language_loss": 0.68373972, "learning_rate": 2.450789623090293e-06, "loss": 0.76099908, "num_input_tokens_seen": 158616650, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.11810303, "step": 7398, "time_per_iteration": 2.575775384902954 }, { "auxiliary_loss_clip": 0.06456389, "auxiliary_loss_mlp": 0.01270774, "balance_loss_clip": 0.0628971, "balance_loss_mlp": 0.01258418, "epoch": 0.44485194649030513, "flos": 16549237123200.0, "grad_norm": 1.6419662314250152, "language_loss": 0.70458406, "learning_rate": 2.450410174683472e-06, "loss": 0.7818557, "num_input_tokens_seen": 158634515, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.12365723, "step": 7399, "time_per_iteration": 2.532658576965332 }, { "auxiliary_loss_clip": 0.06452651, "auxiliary_loss_mlp": 0.01267964, "balance_loss_clip": 0.06287348, "balance_loss_mlp": 0.01255316, "epoch": 0.4449120697429731, "flos": 22607455973760.0, "grad_norm": 1.6669162567839444, "language_loss": 0.72780728, "learning_rate": 2.4500307091970514e-06, "loss": 0.80501342, "num_input_tokens_seen": 158653760, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.12652588, "step": 7400, "time_per_iteration": 2.550079584121704 }, { "auxiliary_loss_clip": 0.06454857, "auxiliary_loss_mlp": 0.01268558, "balance_loss_clip": 0.06287965, "balance_loss_mlp": 0.01256393, "epoch": 0.44497219299564106, "flos": 20009994456960.0, "grad_norm": 1.549552020953795, "language_loss": 0.85441709, "learning_rate": 2.449651226645422e-06, "loss": 0.93165123, "num_input_tokens_seen": 158672190, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.121521, "step": 7401, "time_per_iteration": 3.9880001544952393 }, { "auxiliary_loss_clip": 0.06446699, "auxiliary_loss_mlp": 0.01265634, "balance_loss_clip": 0.06282695, "balance_loss_mlp": 0.01254291, "epoch": 0.445032316248309, "flos": 25601499665280.0, "grad_norm": 1.5989914284233528, "language_loss": 0.83413893, "learning_rate": 2.449271727042973e-06, "loss": 0.91126227, "num_input_tokens_seen": 158694115, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.11346436, "step": 7402, "time_per_iteration": 2.7209203243255615 }, { "auxiliary_loss_clip": 0.06456325, "auxiliary_loss_mlp": 0.01270295, "balance_loss_clip": 0.06286823, "balance_loss_mlp": 0.01257408, "epoch": 0.445092439500977, "flos": 21256608775680.0, "grad_norm": 1.8017529167748028, "language_loss": 0.76974922, "learning_rate": 2.4488922104040947e-06, "loss": 0.84701538, "num_input_tokens_seen": 158711000, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.12896729, "step": 7403, "time_per_iteration": 2.538252353668213 }, { "auxiliary_loss_clip": 0.06350092, "auxiliary_loss_mlp": 0.01258988, "balance_loss_clip": 0.06277658, "balance_loss_mlp": 0.01256394, "epoch": 0.44515256275364495, "flos": 57781990506240.0, "grad_norm": 0.741521920685432, "language_loss": 0.60127729, "learning_rate": 2.4485126767431793e-06, "loss": 0.6773681, "num_input_tokens_seen": 158769675, "router_z_loss_clip": 0.72558594, "router_z_loss_mlp": 0.0259552, "step": 7404, "time_per_iteration": 3.179699182510376 }, { "auxiliary_loss_clip": 0.06458802, "auxiliary_loss_mlp": 0.0126984, "balance_loss_clip": 0.06287178, "balance_loss_mlp": 0.01256589, "epoch": 0.4452126860063129, "flos": 15601386936960.0, "grad_norm": 1.5398528344474256, "language_loss": 0.82388657, "learning_rate": 2.4481331260746177e-06, "loss": 0.901173, "num_input_tokens_seen": 158788215, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.13250732, "step": 7405, "time_per_iteration": 2.5446858406066895 }, { "auxiliary_loss_clip": 0.06455382, "auxiliary_loss_mlp": 0.01266783, "balance_loss_clip": 0.06287225, "balance_loss_mlp": 0.01254736, "epoch": 0.4452728092589809, "flos": 21623995002240.0, "grad_norm": 1.7126892077748466, "language_loss": 0.75584906, "learning_rate": 2.4477535584128036e-06, "loss": 0.8330707, "num_input_tokens_seen": 158809090, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.12060547, "step": 7406, "time_per_iteration": 2.630840539932251 }, { "auxiliary_loss_clip": 0.06447116, "auxiliary_loss_mlp": 0.01269872, "balance_loss_clip": 0.06284809, "balance_loss_mlp": 0.01258464, "epoch": 0.4453329325116489, "flos": 29505267135360.0, "grad_norm": 1.899163647367895, "language_loss": 0.6597892, "learning_rate": 2.447373973772129e-06, "loss": 0.7369591, "num_input_tokens_seen": 158828320, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.11413574, "step": 7407, "time_per_iteration": 2.6433212757110596 }, { "auxiliary_loss_clip": 0.06461412, "auxiliary_loss_mlp": 0.01269233, "balance_loss_clip": 0.06290688, "balance_loss_mlp": 0.01256817, "epoch": 0.44539305576431687, "flos": 21367549981440.0, "grad_norm": 1.4626862043965816, "language_loss": 0.68256885, "learning_rate": 2.4469943721669887e-06, "loss": 0.75987524, "num_input_tokens_seen": 158847040, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.12408447, "step": 7408, "time_per_iteration": 2.63647723197937 }, { "auxiliary_loss_clip": 0.06455221, "auxiliary_loss_mlp": 0.01272252, "balance_loss_clip": 0.06287623, "balance_loss_mlp": 0.01259496, "epoch": 0.44545317901698483, "flos": 41437278000000.0, "grad_norm": 1.585652290835855, "language_loss": 0.72085172, "learning_rate": 2.4466147536117776e-06, "loss": 0.79812646, "num_input_tokens_seen": 158870490, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.12756348, "step": 7409, "time_per_iteration": 2.7420921325683594 }, { "auxiliary_loss_clip": 0.06459856, "auxiliary_loss_mlp": 0.01272217, "balance_loss_clip": 0.06290299, "balance_loss_mlp": 0.01259408, "epoch": 0.4455133022696528, "flos": 22061638477440.0, "grad_norm": 1.896883795570133, "language_loss": 0.65400916, "learning_rate": 2.4462351181208895e-06, "loss": 0.73132992, "num_input_tokens_seen": 158889920, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.12823486, "step": 7410, "time_per_iteration": 2.593773365020752 }, { "auxiliary_loss_clip": 0.06467978, "auxiliary_loss_mlp": 0.01271554, "balance_loss_clip": 0.06291763, "balance_loss_mlp": 0.01258793, "epoch": 0.44557342552232077, "flos": 23483665319040.0, "grad_norm": 2.0007635000563675, "language_loss": 0.74442768, "learning_rate": 2.4458554657087217e-06, "loss": 0.82182294, "num_input_tokens_seen": 158909580, "router_z_loss_clip": 1.76074219, "router_z_loss_mlp": 0.12762451, "step": 7411, "time_per_iteration": 5.523190498352051 }, { "auxiliary_loss_clip": 0.06455898, "auxiliary_loss_mlp": 0.01267178, "balance_loss_clip": 0.06292321, "balance_loss_mlp": 0.01256223, "epoch": 0.44563354877498873, "flos": 19140577292160.0, "grad_norm": 1.6466968893598177, "language_loss": 0.79396766, "learning_rate": 2.4454757963896695e-06, "loss": 0.87119842, "num_input_tokens_seen": 158924600, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.10949707, "step": 7412, "time_per_iteration": 2.5499675273895264 }, { "auxiliary_loss_clip": 0.0645758, "auxiliary_loss_mlp": 0.01269587, "balance_loss_clip": 0.06284474, "balance_loss_mlp": 0.01257368, "epoch": 0.4456936720276567, "flos": 13625744169600.0, "grad_norm": 2.3293201677358666, "language_loss": 0.80327964, "learning_rate": 2.4450961101781304e-06, "loss": 0.88055122, "num_input_tokens_seen": 158939345, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.12219238, "step": 7413, "time_per_iteration": 2.50238037109375 }, { "auxiliary_loss_clip": 0.06457485, "auxiliary_loss_mlp": 0.01267025, "balance_loss_clip": 0.06290802, "balance_loss_mlp": 0.01255133, "epoch": 0.44575379528032466, "flos": 14717840359680.0, "grad_norm": 2.552607918756875, "language_loss": 0.76676011, "learning_rate": 2.4447164070885026e-06, "loss": 0.84400523, "num_input_tokens_seen": 158955855, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.11889648, "step": 7414, "time_per_iteration": 2.535508632659912 }, { "auxiliary_loss_clip": 0.06456472, "auxiliary_loss_mlp": 0.01269157, "balance_loss_clip": 0.06289124, "balance_loss_mlp": 0.01257051, "epoch": 0.4458139185329926, "flos": 24177586106880.0, "grad_norm": 1.8792733883618065, "language_loss": 0.83346009, "learning_rate": 2.4443366871351837e-06, "loss": 0.91071641, "num_input_tokens_seen": 158976315, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.12109375, "step": 7415, "time_per_iteration": 2.567006826400757 }, { "auxiliary_loss_clip": 0.0645915, "auxiliary_loss_mlp": 0.01267073, "balance_loss_clip": 0.06292366, "balance_loss_mlp": 0.01255063, "epoch": 0.4458740417856606, "flos": 21768660276480.0, "grad_norm": 1.5701973180650228, "language_loss": 0.84445524, "learning_rate": 2.4439569503325732e-06, "loss": 0.92171752, "num_input_tokens_seen": 158996725, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.12011719, "step": 7416, "time_per_iteration": 2.5962741374969482 }, { "auxiliary_loss_clip": 0.06466657, "auxiliary_loss_mlp": 0.01268898, "balance_loss_clip": 0.06295, "balance_loss_mlp": 0.01256435, "epoch": 0.44593416503832856, "flos": 21075074904960.0, "grad_norm": 1.52275554587676, "language_loss": 0.8172363, "learning_rate": 2.4435771966950706e-06, "loss": 0.89459181, "num_input_tokens_seen": 159017255, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.12463379, "step": 7417, "time_per_iteration": 2.5873830318450928 }, { "auxiliary_loss_clip": 0.06465842, "auxiliary_loss_mlp": 0.0126654, "balance_loss_clip": 0.06294245, "balance_loss_mlp": 0.01254464, "epoch": 0.4459942882909965, "flos": 22606910922240.0, "grad_norm": 1.995533266192369, "language_loss": 0.81620419, "learning_rate": 2.443197426237077e-06, "loss": 0.89352798, "num_input_tokens_seen": 159035010, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.1206665, "step": 7418, "time_per_iteration": 2.608118772506714 }, { "auxiliary_loss_clip": 0.06469988, "auxiliary_loss_mlp": 0.01270194, "balance_loss_clip": 0.06299415, "balance_loss_mlp": 0.01257271, "epoch": 0.4460544115436645, "flos": 26512732817280.0, "grad_norm": 1.665629209639333, "language_loss": 0.77763665, "learning_rate": 2.442817638972991e-06, "loss": 0.85503846, "num_input_tokens_seen": 159055345, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.12908936, "step": 7419, "time_per_iteration": 2.5971786975860596 }, { "auxiliary_loss_clip": 0.06469058, "auxiliary_loss_mlp": 0.01275009, "balance_loss_clip": 0.06300637, "balance_loss_mlp": 0.01262957, "epoch": 0.4461145347963325, "flos": 17609957159040.0, "grad_norm": 1.520184490139664, "language_loss": 0.72824889, "learning_rate": 2.4424378349172176e-06, "loss": 0.80568957, "num_input_tokens_seen": 159074225, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.1204834, "step": 7420, "time_per_iteration": 2.594860076904297 }, { "auxiliary_loss_clip": 0.06457299, "auxiliary_loss_mlp": 0.01267577, "balance_loss_clip": 0.06293234, "balance_loss_mlp": 0.01255215, "epoch": 0.44617465804900047, "flos": 27274982209920.0, "grad_norm": 1.4830540390072504, "language_loss": 0.7535854, "learning_rate": 2.442058014084156e-06, "loss": 0.83083415, "num_input_tokens_seen": 159095415, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.12365723, "step": 7421, "time_per_iteration": 2.610365390777588 }, { "auxiliary_loss_clip": 0.06456721, "auxiliary_loss_mlp": 0.01266552, "balance_loss_clip": 0.06293263, "balance_loss_mlp": 0.01254994, "epoch": 0.44623478130166844, "flos": 17792371497600.0, "grad_norm": 1.9634701425636991, "language_loss": 0.75746202, "learning_rate": 2.44167817648821e-06, "loss": 0.83469486, "num_input_tokens_seen": 159114615, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.11560059, "step": 7422, "time_per_iteration": 2.5637688636779785 }, { "auxiliary_loss_clip": 0.06466476, "auxiliary_loss_mlp": 0.01267098, "balance_loss_clip": 0.06297244, "balance_loss_mlp": 0.01255064, "epoch": 0.4462949045543364, "flos": 23009698298880.0, "grad_norm": 1.4224826231610268, "language_loss": 0.6567955, "learning_rate": 2.441298322143784e-06, "loss": 0.73413122, "num_input_tokens_seen": 159134370, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.12030029, "step": 7423, "time_per_iteration": 2.5512866973876953 }, { "auxiliary_loss_clip": 0.06455371, "auxiliary_loss_mlp": 0.01268131, "balance_loss_clip": 0.06291588, "balance_loss_mlp": 0.01256425, "epoch": 0.44635502780700437, "flos": 17825592441600.0, "grad_norm": 1.490994649707887, "language_loss": 0.7972765, "learning_rate": 2.4409184510652807e-06, "loss": 0.8745116, "num_input_tokens_seen": 159152540, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.11706543, "step": 7424, "time_per_iteration": 2.5631301403045654 }, { "auxiliary_loss_clip": 0.06450696, "auxiliary_loss_mlp": 0.01267, "balance_loss_clip": 0.06288545, "balance_loss_mlp": 0.01255741, "epoch": 0.44641515105967233, "flos": 26695314864000.0, "grad_norm": 1.3681604748045, "language_loss": 0.80473769, "learning_rate": 2.4405385632671063e-06, "loss": 0.88191462, "num_input_tokens_seen": 159173425, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.11260986, "step": 7425, "time_per_iteration": 2.592798948287964 }, { "auxiliary_loss_clip": 0.06456941, "auxiliary_loss_mlp": 0.01269581, "balance_loss_clip": 0.06290136, "balance_loss_mlp": 0.01258435, "epoch": 0.4464752743123403, "flos": 18918778734720.0, "grad_norm": 1.5150970494778613, "language_loss": 0.78119695, "learning_rate": 2.4401586587636655e-06, "loss": 0.85846221, "num_input_tokens_seen": 159191210, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.11151123, "step": 7426, "time_per_iteration": 2.5949435234069824 }, { "auxiliary_loss_clip": 0.0646396, "auxiliary_loss_mlp": 0.01265965, "balance_loss_clip": 0.06293488, "balance_loss_mlp": 0.01253699, "epoch": 0.44653539756500826, "flos": 29578081933440.0, "grad_norm": 2.7337468112306897, "language_loss": 0.64813006, "learning_rate": 2.4397787375693634e-06, "loss": 0.7254293, "num_input_tokens_seen": 159211755, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.12261963, "step": 7427, "time_per_iteration": 2.6181201934814453 }, { "auxiliary_loss_clip": 0.06465487, "auxiliary_loss_mlp": 0.01269512, "balance_loss_clip": 0.0630163, "balance_loss_mlp": 0.01257156, "epoch": 0.44659552081767623, "flos": 21475137024000.0, "grad_norm": 1.8765826988282301, "language_loss": 0.75632834, "learning_rate": 2.439398799698608e-06, "loss": 0.83367836, "num_input_tokens_seen": 159230315, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.12347412, "step": 7428, "time_per_iteration": 2.6605277061462402 }, { "auxiliary_loss_clip": 0.06461963, "auxiliary_loss_mlp": 0.01266987, "balance_loss_clip": 0.06295013, "balance_loss_mlp": 0.01255191, "epoch": 0.4466556440703442, "flos": 17937791458560.0, "grad_norm": 1.8163268130286734, "language_loss": 0.78231043, "learning_rate": 2.439018845165806e-06, "loss": 0.85959995, "num_input_tokens_seen": 159249810, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.11804199, "step": 7429, "time_per_iteration": 2.532107353210449 }, { "auxiliary_loss_clip": 0.06464359, "auxiliary_loss_mlp": 0.01270152, "balance_loss_clip": 0.06293688, "balance_loss_mlp": 0.01257671, "epoch": 0.44671576732301216, "flos": 21114081780480.0, "grad_norm": 1.9546151903553628, "language_loss": 0.91216409, "learning_rate": 2.438638873985366e-06, "loss": 0.98950928, "num_input_tokens_seen": 159271715, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.12475586, "step": 7430, "time_per_iteration": 2.5936410427093506 }, { "auxiliary_loss_clip": 0.06472772, "auxiliary_loss_mlp": 0.0126984, "balance_loss_clip": 0.06297147, "balance_loss_mlp": 0.0125743, "epoch": 0.4467758905756801, "flos": 23514873765120.0, "grad_norm": 2.0695303332291113, "language_loss": 0.80123174, "learning_rate": 2.4382588861716954e-06, "loss": 0.87865782, "num_input_tokens_seen": 159290690, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.12414551, "step": 7431, "time_per_iteration": 2.677757501602173 }, { "auxiliary_loss_clip": 0.06467041, "auxiliary_loss_mlp": 0.01270045, "balance_loss_clip": 0.06297161, "balance_loss_mlp": 0.01257498, "epoch": 0.4468360138283481, "flos": 18739970121600.0, "grad_norm": 1.856709721808895, "language_loss": 0.80627823, "learning_rate": 2.437878881739204e-06, "loss": 0.88364911, "num_input_tokens_seen": 159309400, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.12548828, "step": 7432, "time_per_iteration": 2.724703311920166 }, { "auxiliary_loss_clip": 0.06471613, "auxiliary_loss_mlp": 0.01269702, "balance_loss_clip": 0.0629899, "balance_loss_mlp": 0.01256953, "epoch": 0.4468961370810161, "flos": 23483874954240.0, "grad_norm": 1.7606758396087858, "language_loss": 0.77351868, "learning_rate": 2.437498860702301e-06, "loss": 0.85093194, "num_input_tokens_seen": 159327425, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.12750244, "step": 7433, "time_per_iteration": 2.6540048122406006 }, { "auxiliary_loss_clip": 0.06454286, "auxiliary_loss_mlp": 0.01269534, "balance_loss_clip": 0.06291467, "balance_loss_mlp": 0.01258298, "epoch": 0.4469562603336841, "flos": 30081873807360.0, "grad_norm": 1.6288244358953203, "language_loss": 0.77498877, "learning_rate": 2.437118823075398e-06, "loss": 0.85222691, "num_input_tokens_seen": 159345805, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.11242676, "step": 7434, "time_per_iteration": 2.662341594696045 }, { "auxiliary_loss_clip": 0.06471579, "auxiliary_loss_mlp": 0.01269984, "balance_loss_clip": 0.06299932, "balance_loss_mlp": 0.01257801, "epoch": 0.44701638358635204, "flos": 22463126115840.0, "grad_norm": 1.7845630592146866, "language_loss": 0.64976227, "learning_rate": 2.436738768872905e-06, "loss": 0.72717798, "num_input_tokens_seen": 159364595, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.12176514, "step": 7435, "time_per_iteration": 2.6091537475585938 }, { "auxiliary_loss_clip": 0.06474435, "auxiliary_loss_mlp": 0.0126952, "balance_loss_clip": 0.06304913, "balance_loss_mlp": 0.01256532, "epoch": 0.44707650683902, "flos": 24064171205760.0, "grad_norm": 1.5488161757030576, "language_loss": 0.836182, "learning_rate": 2.4363586981092346e-06, "loss": 0.91362154, "num_input_tokens_seen": 159385265, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.12982178, "step": 7436, "time_per_iteration": 2.6436407566070557 }, { "auxiliary_loss_clip": 0.06478631, "auxiliary_loss_mlp": 0.01272014, "balance_loss_clip": 0.06306697, "balance_loss_mlp": 0.01258293, "epoch": 0.44713663009168797, "flos": 23773373210880.0, "grad_norm": 1.6480219540204633, "language_loss": 0.79890394, "learning_rate": 2.435978610798798e-06, "loss": 0.87641042, "num_input_tokens_seen": 159405080, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.137146, "step": 7437, "time_per_iteration": 4.004343509674072 }, { "auxiliary_loss_clip": 0.0647601, "auxiliary_loss_mlp": 0.01268736, "balance_loss_clip": 0.06304503, "balance_loss_mlp": 0.01256541, "epoch": 0.44719675334435594, "flos": 24506258947200.0, "grad_norm": 1.5048623493755895, "language_loss": 0.72053647, "learning_rate": 2.435598506956009e-06, "loss": 0.797984, "num_input_tokens_seen": 159424595, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.12188721, "step": 7438, "time_per_iteration": 2.6137242317199707 }, { "auxiliary_loss_clip": 0.06477994, "auxiliary_loss_mlp": 0.01268674, "balance_loss_clip": 0.0630632, "balance_loss_mlp": 0.01255859, "epoch": 0.4472568765970239, "flos": 29788308627840.0, "grad_norm": 1.7935711166870105, "language_loss": 0.67569005, "learning_rate": 2.4352183865952808e-06, "loss": 0.75315666, "num_input_tokens_seen": 159443865, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.12811279, "step": 7439, "time_per_iteration": 2.6278584003448486 }, { "auxiliary_loss_clip": 0.06477725, "auxiliary_loss_mlp": 0.01269726, "balance_loss_clip": 0.06304827, "balance_loss_mlp": 0.01256893, "epoch": 0.44731699984969187, "flos": 24649792191360.0, "grad_norm": 1.5309252609212487, "language_loss": 0.74191397, "learning_rate": 2.4348382497310285e-06, "loss": 0.81938851, "num_input_tokens_seen": 159464525, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.12841797, "step": 7440, "time_per_iteration": 4.1087353229522705 }, { "auxiliary_loss_clip": 0.0646817, "auxiliary_loss_mlp": 0.01268651, "balance_loss_clip": 0.06301221, "balance_loss_mlp": 0.01257189, "epoch": 0.44737712310235983, "flos": 29462570680320.0, "grad_norm": 1.6449649014218606, "language_loss": 0.74467981, "learning_rate": 2.4344580963776655e-06, "loss": 0.82204801, "num_input_tokens_seen": 159486385, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.11456299, "step": 7441, "time_per_iteration": 2.632859468460083 }, { "auxiliary_loss_clip": 0.06471801, "auxiliary_loss_mlp": 0.01269208, "balance_loss_clip": 0.06300521, "balance_loss_mlp": 0.01256607, "epoch": 0.4474372463550278, "flos": 24903260392320.0, "grad_norm": 1.699565506689287, "language_loss": 0.7570523, "learning_rate": 2.4340779265496082e-06, "loss": 0.8344624, "num_input_tokens_seen": 159503880, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.1260376, "step": 7442, "time_per_iteration": 2.604518413543701 }, { "auxiliary_loss_clip": 0.0648134, "auxiliary_loss_mlp": 0.01269804, "balance_loss_clip": 0.06306413, "balance_loss_mlp": 0.01257007, "epoch": 0.44749736960769576, "flos": 33189835524480.0, "grad_norm": 1.7952192407986198, "language_loss": 0.74519724, "learning_rate": 2.433697740261273e-06, "loss": 0.82270867, "num_input_tokens_seen": 159522980, "router_z_loss_clip": 1.74902344, "router_z_loss_mlp": 0.12786865, "step": 7443, "time_per_iteration": 2.6596181392669678 }, { "auxiliary_loss_clip": 0.06464395, "auxiliary_loss_mlp": 0.01266732, "balance_loss_clip": 0.06294984, "balance_loss_mlp": 0.01254024, "epoch": 0.4475574928603637, "flos": 21078596776320.0, "grad_norm": 1.7236000048161526, "language_loss": 0.77918297, "learning_rate": 2.4333175375270748e-06, "loss": 0.85649431, "num_input_tokens_seen": 159543340, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.1272583, "step": 7444, "time_per_iteration": 2.5887584686279297 }, { "auxiliary_loss_clip": 0.06462951, "auxiliary_loss_mlp": 0.01274762, "balance_loss_clip": 0.0629731, "balance_loss_mlp": 0.01262668, "epoch": 0.4476176161130317, "flos": 21867442640640.0, "grad_norm": 1.8884245118770886, "language_loss": 0.84875751, "learning_rate": 2.4329373183614333e-06, "loss": 0.92613459, "num_input_tokens_seen": 159558210, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.12091064, "step": 7445, "time_per_iteration": 2.524138927459717 }, { "auxiliary_loss_clip": 0.06469038, "auxiliary_loss_mlp": 0.01267942, "balance_loss_clip": 0.06298952, "balance_loss_mlp": 0.01254495, "epoch": 0.4476777393656997, "flos": 22535270081280.0, "grad_norm": 4.8775363996034935, "language_loss": 0.65151727, "learning_rate": 2.432557082778765e-06, "loss": 0.72888708, "num_input_tokens_seen": 159577920, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.13458252, "step": 7446, "time_per_iteration": 2.5960161685943604 }, { "auxiliary_loss_clip": 0.06380865, "auxiliary_loss_mlp": 0.01275125, "balance_loss_clip": 0.06306255, "balance_loss_mlp": 0.01271957, "epoch": 0.4477378626183677, "flos": 49034236101120.0, "grad_norm": 0.7431947691971178, "language_loss": 0.50336647, "learning_rate": 2.4321768307934884e-06, "loss": 0.57992637, "num_input_tokens_seen": 159632295, "router_z_loss_clip": 0.74560547, "router_z_loss_mlp": 0.0316925, "step": 7447, "time_per_iteration": 3.0535967350006104 }, { "auxiliary_loss_clip": 0.06370678, "auxiliary_loss_mlp": 0.01269964, "balance_loss_clip": 0.06296334, "balance_loss_mlp": 0.0126651, "epoch": 0.44779798587103564, "flos": 56562041784960.0, "grad_norm": 0.7788258205507257, "language_loss": 0.59346873, "learning_rate": 2.4317965624200235e-06, "loss": 0.66987514, "num_input_tokens_seen": 159698435, "router_z_loss_clip": 0.7421875, "router_z_loss_mlp": 0.03463745, "step": 7448, "time_per_iteration": 3.278895378112793 }, { "auxiliary_loss_clip": 0.0645982, "auxiliary_loss_mlp": 0.01266423, "balance_loss_clip": 0.06290646, "balance_loss_mlp": 0.01254902, "epoch": 0.4478581091237036, "flos": 46508933278080.0, "grad_norm": 1.4120669841161493, "language_loss": 0.58845013, "learning_rate": 2.431416277672789e-06, "loss": 0.66571259, "num_input_tokens_seen": 159722150, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.11529541, "step": 7449, "time_per_iteration": 2.9705936908721924 }, { "auxiliary_loss_clip": 0.06460452, "auxiliary_loss_mlp": 0.01268342, "balance_loss_clip": 0.0629199, "balance_loss_mlp": 0.01256605, "epoch": 0.4479182323763716, "flos": 20820768163200.0, "grad_norm": 2.143527663481145, "language_loss": 0.80831069, "learning_rate": 2.4310359765662065e-06, "loss": 0.88559866, "num_input_tokens_seen": 159740550, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.11755371, "step": 7450, "time_per_iteration": 3.859462261199951 }, { "auxiliary_loss_clip": 0.06461593, "auxiliary_loss_mlp": 0.01267355, "balance_loss_clip": 0.06293389, "balance_loss_mlp": 0.01255339, "epoch": 0.44797835562903954, "flos": 14251126717440.0, "grad_norm": 1.9634986575613542, "language_loss": 0.79817677, "learning_rate": 2.430655659114697e-06, "loss": 0.87546623, "num_input_tokens_seen": 159758245, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.12017822, "step": 7451, "time_per_iteration": 4.114990472793579 }, { "auxiliary_loss_clip": 0.06356058, "auxiliary_loss_mlp": 0.0125387, "balance_loss_clip": 0.06283028, "balance_loss_mlp": 0.01250804, "epoch": 0.4480384788817075, "flos": 63553436357760.0, "grad_norm": 0.809142910570192, "language_loss": 0.62737858, "learning_rate": 2.430275325332681e-06, "loss": 0.70347786, "num_input_tokens_seen": 159826790, "router_z_loss_clip": 0.72851562, "router_z_loss_mlp": 0.03062439, "step": 7452, "time_per_iteration": 3.311391830444336 }, { "auxiliary_loss_clip": 0.06456022, "auxiliary_loss_mlp": 0.01271973, "balance_loss_clip": 0.06287873, "balance_loss_mlp": 0.01259683, "epoch": 0.44809860213437547, "flos": 21659018808960.0, "grad_norm": 1.898783335021195, "language_loss": 0.63496161, "learning_rate": 2.429894975234582e-06, "loss": 0.71224159, "num_input_tokens_seen": 159845805, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.12286377, "step": 7453, "time_per_iteration": 2.563361406326294 }, { "auxiliary_loss_clip": 0.06351226, "auxiliary_loss_mlp": 0.01254129, "balance_loss_clip": 0.06277697, "balance_loss_mlp": 0.0125102, "epoch": 0.44815872538704343, "flos": 69210586840320.0, "grad_norm": 0.7455307461860354, "language_loss": 0.56981778, "learning_rate": 2.4295146088348224e-06, "loss": 0.64587134, "num_input_tokens_seen": 159898860, "router_z_loss_clip": 0.734375, "router_z_loss_mlp": 0.03109741, "step": 7454, "time_per_iteration": 3.1263339519500732 }, { "auxiliary_loss_clip": 0.06458364, "auxiliary_loss_mlp": 0.01268795, "balance_loss_clip": 0.06288201, "balance_loss_mlp": 0.01256785, "epoch": 0.4482188486397114, "flos": 12602186219520.0, "grad_norm": 2.1509777407809194, "language_loss": 0.75824803, "learning_rate": 2.4291342261478255e-06, "loss": 0.83551967, "num_input_tokens_seen": 159911555, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.12023926, "step": 7455, "time_per_iteration": 2.4989166259765625 }, { "auxiliary_loss_clip": 0.06453336, "auxiliary_loss_mlp": 0.01269391, "balance_loss_clip": 0.06286669, "balance_loss_mlp": 0.01257279, "epoch": 0.44827897189237936, "flos": 34066715702400.0, "grad_norm": 1.8593699076661403, "language_loss": 0.76371801, "learning_rate": 2.428753827188016e-06, "loss": 0.84094524, "num_input_tokens_seen": 159931470, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.12127686, "step": 7456, "time_per_iteration": 2.7287709712982178 }, { "auxiliary_loss_clip": 0.06449994, "auxiliary_loss_mlp": 0.01271071, "balance_loss_clip": 0.06286666, "balance_loss_mlp": 0.01259525, "epoch": 0.44833909514504733, "flos": 25153080940800.0, "grad_norm": 2.0600673486869203, "language_loss": 0.76821864, "learning_rate": 2.428373411969818e-06, "loss": 0.8454293, "num_input_tokens_seen": 159946115, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.11535645, "step": 7457, "time_per_iteration": 2.5740416049957275 }, { "auxiliary_loss_clip": 0.06461956, "auxiliary_loss_mlp": 0.01265733, "balance_loss_clip": 0.06290235, "balance_loss_mlp": 0.01253567, "epoch": 0.4483992183977153, "flos": 16185498549120.0, "grad_norm": 2.0955784732526532, "language_loss": 0.68886703, "learning_rate": 2.4279929805076576e-06, "loss": 0.76614392, "num_input_tokens_seen": 159963915, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.12176514, "step": 7458, "time_per_iteration": 2.5535976886749268 }, { "auxiliary_loss_clip": 0.06466757, "auxiliary_loss_mlp": 0.01273292, "balance_loss_clip": 0.06294349, "balance_loss_mlp": 0.01260245, "epoch": 0.44845934165038326, "flos": 17751352124160.0, "grad_norm": 1.5493093819901917, "language_loss": 0.72198868, "learning_rate": 2.427612532815961e-06, "loss": 0.79938912, "num_input_tokens_seen": 159982140, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.13049316, "step": 7459, "time_per_iteration": 2.6104369163513184 }, { "auxiliary_loss_clip": 0.06458889, "auxiliary_loss_mlp": 0.01273024, "balance_loss_clip": 0.06290866, "balance_loss_mlp": 0.01260674, "epoch": 0.4485194649030513, "flos": 21842481323520.0, "grad_norm": 2.500330768624522, "language_loss": 0.7016775, "learning_rate": 2.427232068909154e-06, "loss": 0.77899659, "num_input_tokens_seen": 160002280, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.12359619, "step": 7460, "time_per_iteration": 2.6749212741851807 }, { "auxiliary_loss_clip": 0.06461857, "auxiliary_loss_mlp": 0.01269601, "balance_loss_clip": 0.06292801, "balance_loss_mlp": 0.01257125, "epoch": 0.44857958815571924, "flos": 20090775392640.0, "grad_norm": 1.936007666131563, "language_loss": 0.77848506, "learning_rate": 2.4268515888016635e-06, "loss": 0.85579962, "num_input_tokens_seen": 160020260, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.12469482, "step": 7461, "time_per_iteration": 2.5715489387512207 }, { "auxiliary_loss_clip": 0.06462929, "auxiliary_loss_mlp": 0.01265721, "balance_loss_clip": 0.06293812, "balance_loss_mlp": 0.01253312, "epoch": 0.4486397114083872, "flos": 27060982081920.0, "grad_norm": 1.6447926131243025, "language_loss": 0.67907351, "learning_rate": 2.4264710925079184e-06, "loss": 0.75636005, "num_input_tokens_seen": 160040240, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.12420654, "step": 7462, "time_per_iteration": 2.628363847732544 }, { "auxiliary_loss_clip": 0.06349938, "auxiliary_loss_mlp": 0.01267839, "balance_loss_clip": 0.06276682, "balance_loss_mlp": 0.01264835, "epoch": 0.4486998346610552, "flos": 67339386587520.0, "grad_norm": 0.7349308862649201, "language_loss": 0.54471874, "learning_rate": 2.4260905800423462e-06, "loss": 0.62089652, "num_input_tokens_seen": 160093865, "router_z_loss_clip": 0.734375, "router_z_loss_mlp": 0.0300293, "step": 7463, "time_per_iteration": 3.178967237472534 }, { "auxiliary_loss_clip": 0.06460243, "auxiliary_loss_mlp": 0.01272952, "balance_loss_clip": 0.06292994, "balance_loss_mlp": 0.01260446, "epoch": 0.44875995791372314, "flos": 27644297080320.0, "grad_norm": 1.6817190514974873, "language_loss": 0.76074636, "learning_rate": 2.4257100514193775e-06, "loss": 0.83807826, "num_input_tokens_seen": 160113590, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.12506104, "step": 7464, "time_per_iteration": 2.6347053050994873 }, { "auxiliary_loss_clip": 0.06458418, "auxiliary_loss_mlp": 0.01272208, "balance_loss_clip": 0.06293541, "balance_loss_mlp": 0.01260663, "epoch": 0.4488200811663911, "flos": 13010969162880.0, "grad_norm": 1.8664823113070168, "language_loss": 0.74735808, "learning_rate": 2.425329506653441e-06, "loss": 0.82466429, "num_input_tokens_seen": 160131795, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.11547852, "step": 7465, "time_per_iteration": 2.5480782985687256 }, { "auxiliary_loss_clip": 0.06467666, "auxiliary_loss_mlp": 0.01271603, "balance_loss_clip": 0.06291464, "balance_loss_mlp": 0.01257864, "epoch": 0.44888020441905907, "flos": 27497283891840.0, "grad_norm": 1.9445567750888668, "language_loss": 0.80268264, "learning_rate": 2.424948945758966e-06, "loss": 0.88007534, "num_input_tokens_seen": 160150635, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.1373291, "step": 7466, "time_per_iteration": 2.6336114406585693 }, { "auxiliary_loss_clip": 0.0645674, "auxiliary_loss_mlp": 0.01271224, "balance_loss_clip": 0.0628578, "balance_loss_mlp": 0.01259077, "epoch": 0.44894032767172704, "flos": 18265541904000.0, "grad_norm": 4.903333041337602, "language_loss": 0.80728376, "learning_rate": 2.4245683687503844e-06, "loss": 0.88456345, "num_input_tokens_seen": 160168615, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.12164307, "step": 7467, "time_per_iteration": 2.546602487564087 }, { "auxiliary_loss_clip": 0.06449541, "auxiliary_loss_mlp": 0.01271039, "balance_loss_clip": 0.06287803, "balance_loss_mlp": 0.01259315, "epoch": 0.449000450924395, "flos": 21586245937920.0, "grad_norm": 1.90152854153489, "language_loss": 0.75810349, "learning_rate": 2.424187775642129e-06, "loss": 0.83530927, "num_input_tokens_seen": 160187295, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.11724854, "step": 7468, "time_per_iteration": 2.572890520095825 }, { "auxiliary_loss_clip": 0.06463216, "auxiliary_loss_mlp": 0.01267872, "balance_loss_clip": 0.06295907, "balance_loss_mlp": 0.01257257, "epoch": 0.44906057417706297, "flos": 17973737660160.0, "grad_norm": 1.6706769580436942, "language_loss": 0.70980501, "learning_rate": 2.4238071664486297e-06, "loss": 0.78711593, "num_input_tokens_seen": 160205115, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.10614014, "step": 7469, "time_per_iteration": 2.555241107940674 }, { "auxiliary_loss_clip": 0.06464691, "auxiliary_loss_mlp": 0.01268049, "balance_loss_clip": 0.06294749, "balance_loss_mlp": 0.01256099, "epoch": 0.44912069742973093, "flos": 20053487525760.0, "grad_norm": 1.6270726737533998, "language_loss": 0.72707713, "learning_rate": 2.4234265411843203e-06, "loss": 0.8044045, "num_input_tokens_seen": 160222580, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.11956787, "step": 7470, "time_per_iteration": 2.584005355834961 }, { "auxiliary_loss_clip": 0.06464612, "auxiliary_loss_mlp": 0.01269954, "balance_loss_clip": 0.06293772, "balance_loss_mlp": 0.01257449, "epoch": 0.4491808206823989, "flos": 21040009171200.0, "grad_norm": 1.83588003193707, "language_loss": 0.77441758, "learning_rate": 2.423045899863634e-06, "loss": 0.85176325, "num_input_tokens_seen": 160241520, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.125, "step": 7471, "time_per_iteration": 2.570760488510132 }, { "auxiliary_loss_clip": 0.06461342, "auxiliary_loss_mlp": 0.01270736, "balance_loss_clip": 0.06296007, "balance_loss_mlp": 0.01258105, "epoch": 0.44924094393506686, "flos": 22973919805440.0, "grad_norm": 1.607705941922957, "language_loss": 0.70702118, "learning_rate": 2.4226652425010048e-06, "loss": 0.78434193, "num_input_tokens_seen": 160261815, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.12646484, "step": 7472, "time_per_iteration": 2.6188879013061523 }, { "auxiliary_loss_clip": 0.06357542, "auxiliary_loss_mlp": 0.01269455, "balance_loss_clip": 0.062846, "balance_loss_mlp": 0.01266996, "epoch": 0.4493010671877349, "flos": 59252332026240.0, "grad_norm": 0.7261129913229183, "language_loss": 0.61680317, "learning_rate": 2.4222845691108676e-06, "loss": 0.69307315, "num_input_tokens_seen": 160317070, "router_z_loss_clip": 0.72753906, "router_z_loss_mlp": 0.02456665, "step": 7473, "time_per_iteration": 3.1493067741394043 }, { "auxiliary_loss_clip": 0.06463876, "auxiliary_loss_mlp": 0.01267394, "balance_loss_clip": 0.06296066, "balance_loss_mlp": 0.01255324, "epoch": 0.44936119044040285, "flos": 18010815891840.0, "grad_norm": 1.9333391570582572, "language_loss": 0.78447634, "learning_rate": 2.421903879707657e-06, "loss": 0.86178911, "num_input_tokens_seen": 160334980, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.12054443, "step": 7474, "time_per_iteration": 2.572350263595581 }, { "auxiliary_loss_clip": 0.06454234, "auxiliary_loss_mlp": 0.01272842, "balance_loss_clip": 0.06291319, "balance_loss_mlp": 0.01261231, "epoch": 0.4494213136930708, "flos": 21258243930240.0, "grad_norm": 2.1051181743453435, "language_loss": 0.72312129, "learning_rate": 2.4215231743058086e-06, "loss": 0.80039203, "num_input_tokens_seen": 160354500, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.11602783, "step": 7475, "time_per_iteration": 2.571723461151123 }, { "auxiliary_loss_clip": 0.06454186, "auxiliary_loss_mlp": 0.01269101, "balance_loss_clip": 0.06284288, "balance_loss_mlp": 0.01257508, "epoch": 0.4494814369457388, "flos": 27426271956480.0, "grad_norm": 1.6648375943459757, "language_loss": 0.76935333, "learning_rate": 2.4211424529197594e-06, "loss": 0.84658623, "num_input_tokens_seen": 160373650, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.1159668, "step": 7476, "time_per_iteration": 4.13580846786499 }, { "auxiliary_loss_clip": 0.06464486, "auxiliary_loss_mlp": 0.0127275, "balance_loss_clip": 0.0629081, "balance_loss_mlp": 0.01259691, "epoch": 0.44954156019840674, "flos": 22860211415040.0, "grad_norm": 1.7317320229198236, "language_loss": 0.72050154, "learning_rate": 2.4207617155639464e-06, "loss": 0.79787385, "num_input_tokens_seen": 160393430, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.1305542, "step": 7477, "time_per_iteration": 2.5758466720581055 }, { "auxiliary_loss_clip": 0.06467685, "auxiliary_loss_mlp": 0.01273378, "balance_loss_clip": 0.06294052, "balance_loss_mlp": 0.01260313, "epoch": 0.4496016834510747, "flos": 17207253636480.0, "grad_norm": 1.7954276830818292, "language_loss": 0.6815334, "learning_rate": 2.4203809622528062e-06, "loss": 0.75894403, "num_input_tokens_seen": 160410545, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.13061523, "step": 7478, "time_per_iteration": 2.7240817546844482 }, { "auxiliary_loss_clip": 0.0645059, "auxiliary_loss_mlp": 0.01271777, "balance_loss_clip": 0.06286336, "balance_loss_mlp": 0.01260166, "epoch": 0.4496618067037427, "flos": 18922636022400.0, "grad_norm": 2.0560179694085137, "language_loss": 0.90094852, "learning_rate": 2.420000193000779e-06, "loss": 0.97817218, "num_input_tokens_seen": 160428105, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.11602783, "step": 7479, "time_per_iteration": 2.5630650520324707 }, { "auxiliary_loss_clip": 0.06458034, "auxiliary_loss_mlp": 0.01267181, "balance_loss_clip": 0.06289332, "balance_loss_mlp": 0.01254467, "epoch": 0.44972192995641064, "flos": 21037828965120.0, "grad_norm": 1.9452299427087474, "language_loss": 0.75762928, "learning_rate": 2.419619407822302e-06, "loss": 0.83488142, "num_input_tokens_seen": 160448815, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.12701416, "step": 7480, "time_per_iteration": 4.129171371459961 }, { "auxiliary_loss_clip": 0.0645885, "auxiliary_loss_mlp": 0.01270931, "balance_loss_clip": 0.06288388, "balance_loss_mlp": 0.01257901, "epoch": 0.4497820532090786, "flos": 20783354515200.0, "grad_norm": 4.922501499030746, "language_loss": 0.79955024, "learning_rate": 2.419238606731815e-06, "loss": 0.87684804, "num_input_tokens_seen": 160465940, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.13037109, "step": 7481, "time_per_iteration": 2.539026975631714 }, { "auxiliary_loss_clip": 0.06445262, "auxiliary_loss_mlp": 0.01270804, "balance_loss_clip": 0.06284049, "balance_loss_mlp": 0.01258156, "epoch": 0.44984217646174657, "flos": 33811067295360.0, "grad_norm": 1.559665687133308, "language_loss": 0.68576914, "learning_rate": 2.418857789743758e-06, "loss": 0.7629298, "num_input_tokens_seen": 160486710, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.12652588, "step": 7482, "time_per_iteration": 2.6748721599578857 }, { "auxiliary_loss_clip": 0.0645899, "auxiliary_loss_mlp": 0.01271176, "balance_loss_clip": 0.06288666, "balance_loss_mlp": 0.01258087, "epoch": 0.44990229971441453, "flos": 15522953915520.0, "grad_norm": 1.960610309000273, "language_loss": 0.85234302, "learning_rate": 2.418476956872571e-06, "loss": 0.9296447, "num_input_tokens_seen": 160503405, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.13079834, "step": 7483, "time_per_iteration": 2.531219005584717 }, { "auxiliary_loss_clip": 0.06458113, "auxiliary_loss_mlp": 0.01269395, "balance_loss_clip": 0.06286458, "balance_loss_mlp": 0.01257617, "epoch": 0.4499624229670825, "flos": 29869676542080.0, "grad_norm": 1.9067062553098157, "language_loss": 0.81168652, "learning_rate": 2.4180961081326967e-06, "loss": 0.88896155, "num_input_tokens_seen": 160525080, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.11779785, "step": 7484, "time_per_iteration": 2.6680357456207275 }, { "auxiliary_loss_clip": 0.06466077, "auxiliary_loss_mlp": 0.012686, "balance_loss_clip": 0.06289559, "balance_loss_mlp": 0.01255601, "epoch": 0.45002254621975046, "flos": 18519345521280.0, "grad_norm": 2.858164793985406, "language_loss": 0.75736177, "learning_rate": 2.4177152435385754e-06, "loss": 0.83470851, "num_input_tokens_seen": 160540895, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.12994385, "step": 7485, "time_per_iteration": 2.5465633869171143 }, { "auxiliary_loss_clip": 0.06352733, "auxiliary_loss_mlp": 0.01255143, "balance_loss_clip": 0.06280649, "balance_loss_mlp": 0.01252726, "epoch": 0.4500826694724185, "flos": 70438753261440.0, "grad_norm": 0.77255607297136, "language_loss": 0.58162582, "learning_rate": 2.4173343631046504e-06, "loss": 0.65770459, "num_input_tokens_seen": 160598270, "router_z_loss_clip": 0.71972656, "router_z_loss_mlp": 0.0241394, "step": 7486, "time_per_iteration": 3.218106985092163 }, { "auxiliary_loss_clip": 0.06455389, "auxiliary_loss_mlp": 0.01270092, "balance_loss_clip": 0.06286561, "balance_loss_mlp": 0.01257695, "epoch": 0.45014279272508645, "flos": 15784388254080.0, "grad_norm": 1.959796872445101, "language_loss": 0.8422699, "learning_rate": 2.4169534668453654e-06, "loss": 0.91952473, "num_input_tokens_seen": 160614720, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.1239624, "step": 7487, "time_per_iteration": 2.5330960750579834 }, { "auxiliary_loss_clip": 0.0645131, "auxiliary_loss_mlp": 0.01270049, "balance_loss_clip": 0.06285167, "balance_loss_mlp": 0.01257222, "epoch": 0.4502029159777544, "flos": 21806157778560.0, "grad_norm": 1.4690251274127701, "language_loss": 0.77419806, "learning_rate": 2.4165725547751622e-06, "loss": 0.85141158, "num_input_tokens_seen": 160635170, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.1282959, "step": 7488, "time_per_iteration": 2.6163992881774902 }, { "auxiliary_loss_clip": 0.06469837, "auxiliary_loss_mlp": 0.0126931, "balance_loss_clip": 0.06292587, "balance_loss_mlp": 0.01255506, "epoch": 0.4502630392304224, "flos": 28775651708160.0, "grad_norm": 2.5318904160649183, "language_loss": 0.72647488, "learning_rate": 2.4161916269084858e-06, "loss": 0.80386639, "num_input_tokens_seen": 160654490, "router_z_loss_clip": 1.77050781, "router_z_loss_mlp": 0.13800049, "step": 7489, "time_per_iteration": 2.604417562484741 }, { "auxiliary_loss_clip": 0.06459686, "auxiliary_loss_mlp": 0.01273201, "balance_loss_clip": 0.06288379, "balance_loss_mlp": 0.01259575, "epoch": 0.45032316248309034, "flos": 15848398373760.0, "grad_norm": 2.3303527820525223, "language_loss": 0.69948441, "learning_rate": 2.4158106832597817e-06, "loss": 0.77681327, "num_input_tokens_seen": 160669400, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.1362915, "step": 7490, "time_per_iteration": 3.958749532699585 }, { "auxiliary_loss_clip": 0.06342202, "auxiliary_loss_mlp": 0.01252563, "balance_loss_clip": 0.06271151, "balance_loss_mlp": 0.01250128, "epoch": 0.4503832857357583, "flos": 57873337056000.0, "grad_norm": 0.7731809244280324, "language_loss": 0.56628722, "learning_rate": 2.415429723843495e-06, "loss": 0.64223486, "num_input_tokens_seen": 160733820, "router_z_loss_clip": 0.7109375, "router_z_loss_mlp": 0.02432251, "step": 7491, "time_per_iteration": 4.604488849639893 }, { "auxiliary_loss_clip": 0.06452446, "auxiliary_loss_mlp": 0.01269121, "balance_loss_clip": 0.06286405, "balance_loss_mlp": 0.01256943, "epoch": 0.4504434089884263, "flos": 23884817541120.0, "grad_norm": 1.6742649811482033, "language_loss": 0.79670858, "learning_rate": 2.4150487486740713e-06, "loss": 0.87392426, "num_input_tokens_seen": 160753175, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.12182617, "step": 7492, "time_per_iteration": 2.6116724014282227 }, { "auxiliary_loss_clip": 0.06469783, "auxiliary_loss_mlp": 0.01270892, "balance_loss_clip": 0.06294949, "balance_loss_mlp": 0.01257033, "epoch": 0.45050353224109424, "flos": 17790820197120.0, "grad_norm": 2.0403659705046797, "language_loss": 0.93484652, "learning_rate": 2.4146677577659573e-06, "loss": 1.01225328, "num_input_tokens_seen": 160768310, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.13848877, "step": 7493, "time_per_iteration": 2.5384485721588135 }, { "auxiliary_loss_clip": 0.0635175, "auxiliary_loss_mlp": 0.01252493, "balance_loss_clip": 0.06280977, "balance_loss_mlp": 0.01250271, "epoch": 0.4505636554937622, "flos": 65081960138880.0, "grad_norm": 0.8356573324966552, "language_loss": 0.62778485, "learning_rate": 2.4142867511336e-06, "loss": 0.70382726, "num_input_tokens_seen": 160827370, "router_z_loss_clip": 0.70800781, "router_z_loss_mlp": 0.02224731, "step": 7494, "time_per_iteration": 3.308661699295044 }, { "auxiliary_loss_clip": 0.06453891, "auxiliary_loss_mlp": 0.01268241, "balance_loss_clip": 0.06287893, "balance_loss_mlp": 0.01256559, "epoch": 0.45062377874643017, "flos": 22206597240960.0, "grad_norm": 1.4180700036724982, "language_loss": 0.8222096, "learning_rate": 2.4139057287914484e-06, "loss": 0.89943087, "num_input_tokens_seen": 160849140, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.11682129, "step": 7495, "time_per_iteration": 2.623213768005371 }, { "auxiliary_loss_clip": 0.06459033, "auxiliary_loss_mlp": 0.01270541, "balance_loss_clip": 0.06289096, "balance_loss_mlp": 0.01257643, "epoch": 0.45068390199909814, "flos": 37679433615360.0, "grad_norm": 1.6591139897247762, "language_loss": 0.85938185, "learning_rate": 2.41352469075395e-06, "loss": 0.93667758, "num_input_tokens_seen": 160871280, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.12896729, "step": 7496, "time_per_iteration": 2.744900703430176 }, { "auxiliary_loss_clip": 0.06461753, "auxiliary_loss_mlp": 0.01269534, "balance_loss_clip": 0.06290026, "balance_loss_mlp": 0.01256355, "epoch": 0.4507440252517661, "flos": 22307853300480.0, "grad_norm": 1.8399594521989409, "language_loss": 0.76519597, "learning_rate": 2.4131436370355534e-06, "loss": 0.84250879, "num_input_tokens_seen": 160888625, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.1317749, "step": 7497, "time_per_iteration": 2.590078353881836 }, { "auxiliary_loss_clip": 0.06458622, "auxiliary_loss_mlp": 0.01269365, "balance_loss_clip": 0.06285381, "balance_loss_mlp": 0.01256109, "epoch": 0.45080414850443407, "flos": 13193425428480.0, "grad_norm": 1.865374827201723, "language_loss": 0.74868357, "learning_rate": 2.4127625676507088e-06, "loss": 0.8259635, "num_input_tokens_seen": 160907040, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.13262939, "step": 7498, "time_per_iteration": 2.566241502761841 }, { "auxiliary_loss_clip": 0.06460711, "auxiliary_loss_mlp": 0.01272444, "balance_loss_clip": 0.06288221, "balance_loss_mlp": 0.01259027, "epoch": 0.4508642717571021, "flos": 21951451958400.0, "grad_norm": 1.9165501294345693, "language_loss": 0.70282495, "learning_rate": 2.4123814826138663e-06, "loss": 0.78015649, "num_input_tokens_seen": 160927115, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.13415527, "step": 7499, "time_per_iteration": 2.5833945274353027 }, { "auxiliary_loss_clip": 0.06465864, "auxiliary_loss_mlp": 0.01268742, "balance_loss_clip": 0.06291439, "balance_loss_mlp": 0.01255658, "epoch": 0.45092439500977005, "flos": 23374149632640.0, "grad_norm": 2.659089828888229, "language_loss": 0.77081704, "learning_rate": 2.412000381939477e-06, "loss": 0.84816313, "num_input_tokens_seen": 160944405, "router_z_loss_clip": 1.74414062, "router_z_loss_mlp": 0.13092041, "step": 7500, "time_per_iteration": 2.5754036903381348 }, { "auxiliary_loss_clip": 0.06453235, "auxiliary_loss_mlp": 0.01270309, "balance_loss_clip": 0.06285769, "balance_loss_mlp": 0.01258018, "epoch": 0.450984518262438, "flos": 20778532905600.0, "grad_norm": 2.069948272193674, "language_loss": 0.63092744, "learning_rate": 2.411619265641992e-06, "loss": 0.7081629, "num_input_tokens_seen": 160961345, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.1229248, "step": 7501, "time_per_iteration": 2.531083345413208 }, { "auxiliary_loss_clip": 0.06459117, "auxiliary_loss_mlp": 0.0126663, "balance_loss_clip": 0.06286297, "balance_loss_mlp": 0.01253362, "epoch": 0.451044641515106, "flos": 17712303321600.0, "grad_norm": 1.8192160512732973, "language_loss": 0.85237974, "learning_rate": 2.411238133735863e-06, "loss": 0.92963719, "num_input_tokens_seen": 160977330, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.13269043, "step": 7502, "time_per_iteration": 2.5307414531707764 }, { "auxiliary_loss_clip": 0.0645253, "auxiliary_loss_mlp": 0.01269358, "balance_loss_clip": 0.06285566, "balance_loss_mlp": 0.01257228, "epoch": 0.45110476476777395, "flos": 20600940176640.0, "grad_norm": 1.5038896897940053, "language_loss": 0.79960513, "learning_rate": 2.4108569862355418e-06, "loss": 0.87682402, "num_input_tokens_seen": 160997280, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.12121582, "step": 7503, "time_per_iteration": 2.546298027038574 }, { "auxiliary_loss_clip": 0.06454912, "auxiliary_loss_mlp": 0.01269201, "balance_loss_clip": 0.06289122, "balance_loss_mlp": 0.01256731, "epoch": 0.4511648880204419, "flos": 16039533536640.0, "grad_norm": 1.8718172544093061, "language_loss": 0.81804371, "learning_rate": 2.410475823155484e-06, "loss": 0.89528489, "num_input_tokens_seen": 161014235, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.12469482, "step": 7504, "time_per_iteration": 2.524975299835205 }, { "auxiliary_loss_clip": 0.06448311, "auxiliary_loss_mlp": 0.01268392, "balance_loss_clip": 0.06281738, "balance_loss_mlp": 0.01255935, "epoch": 0.4512250112731099, "flos": 23984103029760.0, "grad_norm": 1.667586674794725, "language_loss": 0.63881254, "learning_rate": 2.4100946445101405e-06, "loss": 0.71597958, "num_input_tokens_seen": 161032360, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.12457275, "step": 7505, "time_per_iteration": 2.548219919204712 }, { "auxiliary_loss_clip": 0.06331891, "auxiliary_loss_mlp": 0.01257544, "balance_loss_clip": 0.06260313, "balance_loss_mlp": 0.01255259, "epoch": 0.45128513452577784, "flos": 71484239053440.0, "grad_norm": 0.8183225409624039, "language_loss": 0.58698803, "learning_rate": 2.409713450313968e-06, "loss": 0.66288239, "num_input_tokens_seen": 161091360, "router_z_loss_clip": 0.71826172, "router_z_loss_mlp": 0.02285767, "step": 7506, "time_per_iteration": 3.228156805038452 }, { "auxiliary_loss_clip": 0.06448952, "auxiliary_loss_mlp": 0.01267322, "balance_loss_clip": 0.0628353, "balance_loss_mlp": 0.0125512, "epoch": 0.4513452577784458, "flos": 22097375043840.0, "grad_norm": 1.5444956963959953, "language_loss": 0.79490167, "learning_rate": 2.40933224058142e-06, "loss": 0.87206435, "num_input_tokens_seen": 161110825, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.12194824, "step": 7507, "time_per_iteration": 2.558554172515869 }, { "auxiliary_loss_clip": 0.06455363, "auxiliary_loss_mlp": 0.01269368, "balance_loss_clip": 0.0628539, "balance_loss_mlp": 0.01256321, "epoch": 0.4514053810311138, "flos": 24282699454080.0, "grad_norm": 1.4240803088276628, "language_loss": 0.73934466, "learning_rate": 2.4089510153269526e-06, "loss": 0.81659198, "num_input_tokens_seen": 161130685, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.1305542, "step": 7508, "time_per_iteration": 2.60237717628479 }, { "auxiliary_loss_clip": 0.0644898, "auxiliary_loss_mlp": 0.01267965, "balance_loss_clip": 0.06284951, "balance_loss_mlp": 0.0125608, "epoch": 0.45146550428378174, "flos": 17891237715840.0, "grad_norm": 2.1420984135269907, "language_loss": 0.79809344, "learning_rate": 2.4085697745650217e-06, "loss": 0.87526286, "num_input_tokens_seen": 161147555, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.11889648, "step": 7509, "time_per_iteration": 2.5849361419677734 }, { "auxiliary_loss_clip": 0.06454478, "auxiliary_loss_mlp": 0.01270682, "balance_loss_clip": 0.06288097, "balance_loss_mlp": 0.01259083, "epoch": 0.4515256275364497, "flos": 24250317050880.0, "grad_norm": 1.7520290053827912, "language_loss": 0.73678213, "learning_rate": 2.4081885183100837e-06, "loss": 0.81403375, "num_input_tokens_seen": 161166255, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.11590576, "step": 7510, "time_per_iteration": 2.6177070140838623 }, { "auxiliary_loss_clip": 0.06453782, "auxiliary_loss_mlp": 0.01270694, "balance_loss_clip": 0.06283704, "balance_loss_mlp": 0.01258213, "epoch": 0.45158575078911767, "flos": 20637263721600.0, "grad_norm": 3.979993115743476, "language_loss": 0.76795661, "learning_rate": 2.4078072465765964e-06, "loss": 0.84520137, "num_input_tokens_seen": 161184720, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.12487793, "step": 7511, "time_per_iteration": 2.5765907764434814 }, { "auxiliary_loss_clip": 0.06458184, "auxiliary_loss_mlp": 0.01272747, "balance_loss_clip": 0.06286679, "balance_loss_mlp": 0.01258525, "epoch": 0.45164587404178563, "flos": 23333884945920.0, "grad_norm": 1.5916604681825395, "language_loss": 0.79153252, "learning_rate": 2.4074259593790174e-06, "loss": 0.86884177, "num_input_tokens_seen": 161204360, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.14208984, "step": 7512, "time_per_iteration": 2.5933308601379395 }, { "auxiliary_loss_clip": 0.06465807, "auxiliary_loss_mlp": 0.0127077, "balance_loss_clip": 0.06288409, "balance_loss_mlp": 0.01256817, "epoch": 0.45170599729445365, "flos": 23812841283840.0, "grad_norm": 26.483608449042585, "language_loss": 0.87991381, "learning_rate": 2.4070446567318053e-06, "loss": 0.95727956, "num_input_tokens_seen": 161223575, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.13952637, "step": 7513, "time_per_iteration": 2.6494104862213135 }, { "auxiliary_loss_clip": 0.06440409, "auxiliary_loss_mlp": 0.01267925, "balance_loss_clip": 0.06279783, "balance_loss_mlp": 0.01256279, "epoch": 0.4517661205471216, "flos": 23519569593600.0, "grad_norm": 1.6714980262664394, "language_loss": 0.67467672, "learning_rate": 2.406663338649419e-06, "loss": 0.75176007, "num_input_tokens_seen": 161243805, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.11651611, "step": 7514, "time_per_iteration": 2.6035118103027344 }, { "auxiliary_loss_clip": 0.06457981, "auxiliary_loss_mlp": 0.01273581, "balance_loss_clip": 0.06286439, "balance_loss_mlp": 0.0125983, "epoch": 0.4518262437997896, "flos": 23520743550720.0, "grad_norm": 1.9249306852454913, "language_loss": 0.70466459, "learning_rate": 2.406282005146318e-06, "loss": 0.78198016, "num_input_tokens_seen": 161261450, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.13751221, "step": 7515, "time_per_iteration": 2.6158266067504883 }, { "auxiliary_loss_clip": 0.06457922, "auxiliary_loss_mlp": 0.01271978, "balance_loss_clip": 0.06285317, "balance_loss_mlp": 0.01258269, "epoch": 0.45188636705245755, "flos": 14572210763520.0, "grad_norm": 2.293812131795127, "language_loss": 0.81794614, "learning_rate": 2.405900656236963e-06, "loss": 0.89524513, "num_input_tokens_seen": 161276965, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.137146, "step": 7516, "time_per_iteration": 4.024701356887817 }, { "auxiliary_loss_clip": 0.0644844, "auxiliary_loss_mlp": 0.0127208, "balance_loss_clip": 0.06283872, "balance_loss_mlp": 0.01259849, "epoch": 0.4519464903051255, "flos": 19907690221440.0, "grad_norm": 1.7557995334095096, "language_loss": 0.65999848, "learning_rate": 2.4055192919358137e-06, "loss": 0.73720372, "num_input_tokens_seen": 161295375, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.12231445, "step": 7517, "time_per_iteration": 2.5922205448150635 }, { "auxiliary_loss_clip": 0.06449576, "auxiliary_loss_mlp": 0.01270792, "balance_loss_clip": 0.06284203, "balance_loss_mlp": 0.0125939, "epoch": 0.4520066135577935, "flos": 18850492056960.0, "grad_norm": 1.9692332786080804, "language_loss": 0.63187301, "learning_rate": 2.405137912257333e-06, "loss": 0.70907676, "num_input_tokens_seen": 161313010, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.11407471, "step": 7518, "time_per_iteration": 2.5456812381744385 }, { "auxiliary_loss_clip": 0.06456663, "auxiliary_loss_mlp": 0.01268827, "balance_loss_clip": 0.06286715, "balance_loss_mlp": 0.01256632, "epoch": 0.45206673681046144, "flos": 48225279985920.0, "grad_norm": 1.4094962657601569, "language_loss": 0.59629029, "learning_rate": 2.404756517215982e-06, "loss": 0.67354524, "num_input_tokens_seen": 161336690, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.12207031, "step": 7519, "time_per_iteration": 2.807891607284546 }, { "auxiliary_loss_clip": 0.06455468, "auxiliary_loss_mlp": 0.01273033, "balance_loss_clip": 0.06284134, "balance_loss_mlp": 0.01259938, "epoch": 0.4521268600631294, "flos": 23848997120640.0, "grad_norm": 1.2733242741806479, "language_loss": 0.72294587, "learning_rate": 2.404375106826223e-06, "loss": 0.80023086, "num_input_tokens_seen": 161357845, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.13079834, "step": 7520, "time_per_iteration": 4.05877423286438 }, { "auxiliary_loss_clip": 0.06450291, "auxiliary_loss_mlp": 0.01270085, "balance_loss_clip": 0.06282619, "balance_loss_mlp": 0.01257068, "epoch": 0.4521869833157974, "flos": 18849611589120.0, "grad_norm": 1.853291640914444, "language_loss": 0.75741255, "learning_rate": 2.4039936811025194e-06, "loss": 0.8346163, "num_input_tokens_seen": 161375160, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.13018799, "step": 7521, "time_per_iteration": 2.605217695236206 }, { "auxiliary_loss_clip": 0.06462497, "auxiliary_loss_mlp": 0.01267632, "balance_loss_clip": 0.06287189, "balance_loss_mlp": 0.01255144, "epoch": 0.45224710656846534, "flos": 19793520633600.0, "grad_norm": 1.7327202994950237, "language_loss": 0.67868829, "learning_rate": 2.4036122400593343e-06, "loss": 0.75598961, "num_input_tokens_seen": 161393690, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.12481689, "step": 7522, "time_per_iteration": 2.5449981689453125 }, { "auxiliary_loss_clip": 0.06448419, "auxiliary_loss_mlp": 0.01267445, "balance_loss_clip": 0.06281938, "balance_loss_mlp": 0.01255256, "epoch": 0.4523072298211333, "flos": 28263558280320.0, "grad_norm": 1.4818331816037622, "language_loss": 0.6117152, "learning_rate": 2.403230783711134e-06, "loss": 0.68887377, "num_input_tokens_seen": 161415015, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.12188721, "step": 7523, "time_per_iteration": 2.641782760620117 }, { "auxiliary_loss_clip": 0.06457464, "auxiliary_loss_mlp": 0.01274253, "balance_loss_clip": 0.06284419, "balance_loss_mlp": 0.01261194, "epoch": 0.45236735307380127, "flos": 11185651820160.0, "grad_norm": 1.7981366133890737, "language_loss": 0.7864114, "learning_rate": 2.4028493120723813e-06, "loss": 0.86372858, "num_input_tokens_seen": 161432940, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.1307373, "step": 7524, "time_per_iteration": 2.6458699703216553 }, { "auxiliary_loss_clip": 0.06453313, "auxiliary_loss_mlp": 0.01270543, "balance_loss_clip": 0.06284654, "balance_loss_mlp": 0.01258574, "epoch": 0.45242747632646924, "flos": 22607959098240.0, "grad_norm": 1.839625178246755, "language_loss": 0.63783562, "learning_rate": 2.4024678251575417e-06, "loss": 0.71507418, "num_input_tokens_seen": 161452215, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.11968994, "step": 7525, "time_per_iteration": 2.842648983001709 }, { "auxiliary_loss_clip": 0.06450383, "auxiliary_loss_mlp": 0.01273915, "balance_loss_clip": 0.06283839, "balance_loss_mlp": 0.01261994, "epoch": 0.45248759957913726, "flos": 18261558835200.0, "grad_norm": 1.4943501599708267, "language_loss": 0.79567623, "learning_rate": 2.402086322981083e-06, "loss": 0.8729192, "num_input_tokens_seen": 161469520, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.1192627, "step": 7526, "time_per_iteration": 2.607247829437256 }, { "auxiliary_loss_clip": 0.06448915, "auxiliary_loss_mlp": 0.01271824, "balance_loss_clip": 0.06281494, "balance_loss_mlp": 0.01259688, "epoch": 0.4525477228318052, "flos": 22455746956800.0, "grad_norm": 1.5838804384648917, "language_loss": 0.81732076, "learning_rate": 2.40170480555747e-06, "loss": 0.89452815, "num_input_tokens_seen": 161487335, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.12139893, "step": 7527, "time_per_iteration": 2.568469762802124 }, { "auxiliary_loss_clip": 0.06454755, "auxiliary_loss_mlp": 0.01266729, "balance_loss_clip": 0.06287557, "balance_loss_mlp": 0.0125485, "epoch": 0.4526078460844732, "flos": 29652909229440.0, "grad_norm": 1.6551442194241832, "language_loss": 0.65674639, "learning_rate": 2.4013232729011706e-06, "loss": 0.73396122, "num_input_tokens_seen": 161510095, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.11871338, "step": 7528, "time_per_iteration": 2.6488518714904785 }, { "auxiliary_loss_clip": 0.06449482, "auxiliary_loss_mlp": 0.0127245, "balance_loss_clip": 0.06283993, "balance_loss_mlp": 0.01259266, "epoch": 0.45266796933714115, "flos": 23046483041280.0, "grad_norm": 1.482479161969816, "language_loss": 0.75343013, "learning_rate": 2.4009417250266525e-06, "loss": 0.83064944, "num_input_tokens_seen": 161528725, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.1317749, "step": 7529, "time_per_iteration": 4.016216993331909 }, { "auxiliary_loss_clip": 0.06453016, "auxiliary_loss_mlp": 0.01268256, "balance_loss_clip": 0.06283836, "balance_loss_mlp": 0.01255334, "epoch": 0.4527280925898091, "flos": 14433582983040.0, "grad_norm": 1.8302483489172452, "language_loss": 0.73456728, "learning_rate": 2.400560161948384e-06, "loss": 0.81177992, "num_input_tokens_seen": 161547195, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.12921143, "step": 7530, "time_per_iteration": 4.014037132263184 }, { "auxiliary_loss_clip": 0.0645399, "auxiliary_loss_mlp": 0.01273271, "balance_loss_clip": 0.06285667, "balance_loss_mlp": 0.01260301, "epoch": 0.4527882158424771, "flos": 22931432985600.0, "grad_norm": 1.6120699567509371, "language_loss": 0.76309484, "learning_rate": 2.400178583680834e-06, "loss": 0.84036744, "num_input_tokens_seen": 161565565, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.12982178, "step": 7531, "time_per_iteration": 2.564812421798706 }, { "auxiliary_loss_clip": 0.06446583, "auxiliary_loss_mlp": 0.01269842, "balance_loss_clip": 0.06284353, "balance_loss_mlp": 0.01258154, "epoch": 0.45284833909514505, "flos": 25562157373440.0, "grad_norm": 1.4231590641365233, "language_loss": 0.67310411, "learning_rate": 2.3997969902384717e-06, "loss": 0.75026834, "num_input_tokens_seen": 161586630, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.11694336, "step": 7532, "time_per_iteration": 2.613288402557373 }, { "auxiliary_loss_clip": 0.06449883, "auxiliary_loss_mlp": 0.01266088, "balance_loss_clip": 0.06284022, "balance_loss_mlp": 0.01254221, "epoch": 0.452908462347813, "flos": 18155816582400.0, "grad_norm": 1.9861466923605506, "language_loss": 0.78399557, "learning_rate": 2.399415381635768e-06, "loss": 0.86115527, "num_input_tokens_seen": 161603815, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.11877441, "step": 7533, "time_per_iteration": 2.52628493309021 }, { "auxiliary_loss_clip": 0.06468086, "auxiliary_loss_mlp": 0.0127192, "balance_loss_clip": 0.06288241, "balance_loss_mlp": 0.01257985, "epoch": 0.452968585600481, "flos": 19068810670080.0, "grad_norm": 1.9072049753699782, "language_loss": 0.83603442, "learning_rate": 2.3990337578871927e-06, "loss": 0.91343451, "num_input_tokens_seen": 161622900, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.13934326, "step": 7534, "time_per_iteration": 2.5779149532318115 }, { "auxiliary_loss_clip": 0.0645964, "auxiliary_loss_mlp": 0.01272172, "balance_loss_clip": 0.06286824, "balance_loss_mlp": 0.01258809, "epoch": 0.45302870885314894, "flos": 22057823116800.0, "grad_norm": 1.533836926187115, "language_loss": 0.76965219, "learning_rate": 2.3986521190072176e-06, "loss": 0.84697026, "num_input_tokens_seen": 161641700, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.13342285, "step": 7535, "time_per_iteration": 2.5695226192474365 }, { "auxiliary_loss_clip": 0.06455246, "auxiliary_loss_mlp": 0.01270238, "balance_loss_clip": 0.0628782, "balance_loss_mlp": 0.01258365, "epoch": 0.4530888321058169, "flos": 20382495782400.0, "grad_norm": 1.4830497314340862, "language_loss": 0.80850899, "learning_rate": 2.3982704650103138e-06, "loss": 0.88576376, "num_input_tokens_seen": 161661955, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.11883545, "step": 7536, "time_per_iteration": 2.556231737136841 }, { "auxiliary_loss_clip": 0.06458278, "auxiliary_loss_mlp": 0.01272134, "balance_loss_clip": 0.06286642, "balance_loss_mlp": 0.01259867, "epoch": 0.4531489553584849, "flos": 14835783381120.0, "grad_norm": 1.5297882108466765, "language_loss": 0.76268202, "learning_rate": 2.3978887959109544e-06, "loss": 0.83998615, "num_input_tokens_seen": 161679245, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.1227417, "step": 7537, "time_per_iteration": 2.535526752471924 }, { "auxiliary_loss_clip": 0.06461424, "auxiliary_loss_mlp": 0.01267356, "balance_loss_clip": 0.06288746, "balance_loss_mlp": 0.0125432, "epoch": 0.45320907861115284, "flos": 21951493885440.0, "grad_norm": 1.8817870574839786, "language_loss": 0.75972748, "learning_rate": 2.3975071117236118e-06, "loss": 0.83701527, "num_input_tokens_seen": 161698795, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.13043213, "step": 7538, "time_per_iteration": 2.595313310623169 }, { "auxiliary_loss_clip": 0.06323749, "auxiliary_loss_mlp": 0.01254293, "balance_loss_clip": 0.06252979, "balance_loss_mlp": 0.0125198, "epoch": 0.45326920186382086, "flos": 66273620578560.0, "grad_norm": 0.7897919774154503, "language_loss": 0.6235714, "learning_rate": 2.3971254124627593e-06, "loss": 0.69935179, "num_input_tokens_seen": 161761980, "router_z_loss_clip": 0.70751953, "router_z_loss_mlp": 0.02310181, "step": 7539, "time_per_iteration": 3.230377674102783 }, { "auxiliary_loss_clip": 0.06452399, "auxiliary_loss_mlp": 0.01270311, "balance_loss_clip": 0.06284937, "balance_loss_mlp": 0.01257991, "epoch": 0.4533293251164888, "flos": 14689524879360.0, "grad_norm": 1.9504573128745752, "language_loss": 0.6637212, "learning_rate": 2.396743698142872e-06, "loss": 0.74094832, "num_input_tokens_seen": 161779455, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.12310791, "step": 7540, "time_per_iteration": 2.620105743408203 }, { "auxiliary_loss_clip": 0.06462558, "auxiliary_loss_mlp": 0.0127356, "balance_loss_clip": 0.06287809, "balance_loss_mlp": 0.01259482, "epoch": 0.4533894483691568, "flos": 22607749463040.0, "grad_norm": 1.8040939239696525, "language_loss": 0.85174787, "learning_rate": 2.396361968778424e-06, "loss": 0.9291091, "num_input_tokens_seen": 161798980, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.14074707, "step": 7541, "time_per_iteration": 2.607726812362671 }, { "auxiliary_loss_clip": 0.06450659, "auxiliary_loss_mlp": 0.01269196, "balance_loss_clip": 0.06280883, "balance_loss_mlp": 0.01256882, "epoch": 0.45344957162182475, "flos": 34760301073920.0, "grad_norm": 1.951258489063777, "language_loss": 0.76533568, "learning_rate": 2.395980224383889e-06, "loss": 0.84253418, "num_input_tokens_seen": 161819745, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.12322998, "step": 7542, "time_per_iteration": 2.9141652584075928 }, { "auxiliary_loss_clip": 0.06452735, "auxiliary_loss_mlp": 0.01269339, "balance_loss_clip": 0.06280873, "balance_loss_mlp": 0.01256697, "epoch": 0.4535096948744927, "flos": 23556983241600.0, "grad_norm": 1.4487150225700092, "language_loss": 0.80704927, "learning_rate": 2.395598464973746e-06, "loss": 0.88426995, "num_input_tokens_seen": 161838575, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.12646484, "step": 7543, "time_per_iteration": 2.6501286029815674 }, { "auxiliary_loss_clip": 0.06455173, "auxiliary_loss_mlp": 0.01267617, "balance_loss_clip": 0.06284167, "balance_loss_mlp": 0.0125516, "epoch": 0.4535698181271607, "flos": 25564756849920.0, "grad_norm": 1.5202620625669374, "language_loss": 0.76619232, "learning_rate": 2.395216690562469e-06, "loss": 0.84342015, "num_input_tokens_seen": 161858590, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.12457275, "step": 7544, "time_per_iteration": 2.6089863777160645 }, { "auxiliary_loss_clip": 0.06457958, "auxiliary_loss_mlp": 0.01269487, "balance_loss_clip": 0.06284702, "balance_loss_mlp": 0.01257274, "epoch": 0.45362994137982865, "flos": 24871171478400.0, "grad_norm": 2.984879752057116, "language_loss": 0.75648391, "learning_rate": 2.3948349011645355e-06, "loss": 0.83375841, "num_input_tokens_seen": 161878390, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.12213135, "step": 7545, "time_per_iteration": 2.5816948413848877 }, { "auxiliary_loss_clip": 0.06451263, "auxiliary_loss_mlp": 0.01269782, "balance_loss_clip": 0.06280131, "balance_loss_mlp": 0.01257652, "epoch": 0.4536900646324966, "flos": 30814088711040.0, "grad_norm": 1.7274061843918096, "language_loss": 0.72510791, "learning_rate": 2.394453096794423e-06, "loss": 0.80231833, "num_input_tokens_seen": 161898610, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.12133789, "step": 7546, "time_per_iteration": 2.6308021545410156 }, { "auxiliary_loss_clip": 0.06462821, "auxiliary_loss_mlp": 0.01276744, "balance_loss_clip": 0.06284274, "balance_loss_mlp": 0.0126341, "epoch": 0.4537501878851646, "flos": 23411060156160.0, "grad_norm": 1.4951005427911588, "language_loss": 0.76410574, "learning_rate": 2.394071277466609e-06, "loss": 0.84150136, "num_input_tokens_seen": 161918210, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.13336182, "step": 7547, "time_per_iteration": 2.5815320014953613 }, { "auxiliary_loss_clip": 0.06457852, "auxiliary_loss_mlp": 0.0126862, "balance_loss_clip": 0.06283822, "balance_loss_mlp": 0.01256437, "epoch": 0.45381031113783254, "flos": 18154978041600.0, "grad_norm": 1.9513714997024225, "language_loss": 0.69938016, "learning_rate": 2.393689443195573e-06, "loss": 0.77664495, "num_input_tokens_seen": 161936950, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.12182617, "step": 7548, "time_per_iteration": 2.571096420288086 }, { "auxiliary_loss_clip": 0.06449118, "auxiliary_loss_mlp": 0.01269332, "balance_loss_clip": 0.06279005, "balance_loss_mlp": 0.01256529, "epoch": 0.4538704343905005, "flos": 25343503344000.0, "grad_norm": 1.7859607594213034, "language_loss": 0.73168379, "learning_rate": 2.393307593995794e-06, "loss": 0.80886829, "num_input_tokens_seen": 161955550, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.12817383, "step": 7549, "time_per_iteration": 2.6021807193756104 }, { "auxiliary_loss_clip": 0.06448838, "auxiliary_loss_mlp": 0.01266906, "balance_loss_clip": 0.06280681, "balance_loss_mlp": 0.01254997, "epoch": 0.4539305576431685, "flos": 28739118528000.0, "grad_norm": 1.879282730104143, "language_loss": 0.65484917, "learning_rate": 2.392925729881751e-06, "loss": 0.73200667, "num_input_tokens_seen": 161976760, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.11901855, "step": 7550, "time_per_iteration": 2.6266348361968994 }, { "auxiliary_loss_clip": 0.06450012, "auxiliary_loss_mlp": 0.01269378, "balance_loss_clip": 0.06284283, "balance_loss_mlp": 0.01257744, "epoch": 0.45399068089583644, "flos": 22499030390400.0, "grad_norm": 2.7145586498568277, "language_loss": 0.69050765, "learning_rate": 2.3925438508679263e-06, "loss": 0.76770151, "num_input_tokens_seen": 161996120, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.11639404, "step": 7551, "time_per_iteration": 2.5692830085754395 }, { "auxiliary_loss_clip": 0.06450785, "auxiliary_loss_mlp": 0.01268334, "balance_loss_clip": 0.06277578, "balance_loss_mlp": 0.01255042, "epoch": 0.45405080414850446, "flos": 12897889678080.0, "grad_norm": 1.8623246085149663, "language_loss": 0.79598296, "learning_rate": 2.392161956968798e-06, "loss": 0.87317407, "num_input_tokens_seen": 162011125, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.13299561, "step": 7552, "time_per_iteration": 2.5084075927734375 }, { "auxiliary_loss_clip": 0.0634767, "auxiliary_loss_mlp": 0.01250445, "balance_loss_clip": 0.06276324, "balance_loss_mlp": 0.01247916, "epoch": 0.4541109274011724, "flos": 59783558912640.0, "grad_norm": 0.8003219522663703, "language_loss": 0.57945234, "learning_rate": 2.39178004819885e-06, "loss": 0.65543354, "num_input_tokens_seen": 162068705, "router_z_loss_clip": 0.71142578, "router_z_loss_mlp": 0.02528381, "step": 7553, "time_per_iteration": 3.1523971557617188 }, { "auxiliary_loss_clip": 0.06448802, "auxiliary_loss_mlp": 0.01270815, "balance_loss_clip": 0.06280497, "balance_loss_mlp": 0.01258841, "epoch": 0.4541710506538404, "flos": 28519248614400.0, "grad_norm": 1.301048426810034, "language_loss": 0.77026749, "learning_rate": 2.3913981245725626e-06, "loss": 0.84746373, "num_input_tokens_seen": 162089655, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.11968994, "step": 7554, "time_per_iteration": 2.6221611499786377 }, { "auxiliary_loss_clip": 0.06458429, "auxiliary_loss_mlp": 0.0126812, "balance_loss_clip": 0.06287237, "balance_loss_mlp": 0.01254834, "epoch": 0.45423117390650836, "flos": 17681304510720.0, "grad_norm": 25.26027874842635, "language_loss": 0.76925576, "learning_rate": 2.3910161861044194e-06, "loss": 0.8465212, "num_input_tokens_seen": 162108465, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.1328125, "step": 7555, "time_per_iteration": 2.5621414184570312 }, { "auxiliary_loss_clip": 0.06452656, "auxiliary_loss_mlp": 0.01269353, "balance_loss_clip": 0.06283308, "balance_loss_mlp": 0.01256293, "epoch": 0.4542912971591763, "flos": 28079760349440.0, "grad_norm": 1.3621788579311582, "language_loss": 0.72770762, "learning_rate": 2.390634232808903e-06, "loss": 0.80492771, "num_input_tokens_seen": 162129910, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.13049316, "step": 7556, "time_per_iteration": 4.073064088821411 }, { "auxiliary_loss_clip": 0.06462931, "auxiliary_loss_mlp": 0.01269859, "balance_loss_clip": 0.06287065, "balance_loss_mlp": 0.01256787, "epoch": 0.4543514204118443, "flos": 22677922857600.0, "grad_norm": 2.05331022344562, "language_loss": 0.63096499, "learning_rate": 2.3902522647004982e-06, "loss": 0.7082929, "num_input_tokens_seen": 162148840, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.13085938, "step": 7557, "time_per_iteration": 2.5799808502197266 }, { "auxiliary_loss_clip": 0.06347521, "auxiliary_loss_mlp": 0.01252455, "balance_loss_clip": 0.06276228, "balance_loss_mlp": 0.01249858, "epoch": 0.45441154366451225, "flos": 58236027454080.0, "grad_norm": 0.668585055939492, "language_loss": 0.57606143, "learning_rate": 2.3898702817936875e-06, "loss": 0.65206122, "num_input_tokens_seen": 162208500, "router_z_loss_clip": 0.71142578, "router_z_loss_mlp": 0.02598572, "step": 7558, "time_per_iteration": 3.125145673751831 }, { "auxiliary_loss_clip": 0.06462501, "auxiliary_loss_mlp": 0.01268628, "balance_loss_clip": 0.0628814, "balance_loss_mlp": 0.01254996, "epoch": 0.4544716669171802, "flos": 16769987504640.0, "grad_norm": 2.6568441771547078, "language_loss": 0.56686282, "learning_rate": 2.3894882841029573e-06, "loss": 0.64417416, "num_input_tokens_seen": 162224650, "router_z_loss_clip": 1.74023438, "router_z_loss_mlp": 0.13635254, "step": 7559, "time_per_iteration": 3.981628179550171 }, { "auxiliary_loss_clip": 0.06452887, "auxiliary_loss_mlp": 0.01269126, "balance_loss_clip": 0.06283731, "balance_loss_mlp": 0.01256901, "epoch": 0.4545317901698482, "flos": 15930814464000.0, "grad_norm": 2.016205079289817, "language_loss": 0.72064561, "learning_rate": 2.389106271642792e-06, "loss": 0.79786575, "num_input_tokens_seen": 162242930, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.12231445, "step": 7560, "time_per_iteration": 2.54011869430542 }, { "auxiliary_loss_clip": 0.06465689, "auxiliary_loss_mlp": 0.0127135, "balance_loss_clip": 0.06290159, "balance_loss_mlp": 0.01258398, "epoch": 0.45459191342251615, "flos": 17645567944320.0, "grad_norm": 2.840599922998114, "language_loss": 0.69748724, "learning_rate": 2.3887242444276775e-06, "loss": 0.77485764, "num_input_tokens_seen": 162261455, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.1295166, "step": 7561, "time_per_iteration": 2.5380799770355225 }, { "auxiliary_loss_clip": 0.06446634, "auxiliary_loss_mlp": 0.01267753, "balance_loss_clip": 0.06282775, "balance_loss_mlp": 0.01256041, "epoch": 0.4546520366751841, "flos": 16181557407360.0, "grad_norm": 1.5443314582815135, "language_loss": 0.85160995, "learning_rate": 2.3883422024721015e-06, "loss": 0.92875385, "num_input_tokens_seen": 162279725, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.1171875, "step": 7562, "time_per_iteration": 2.5562386512756348 }, { "auxiliary_loss_clip": 0.06451804, "auxiliary_loss_mlp": 0.0127024, "balance_loss_clip": 0.06285739, "balance_loss_mlp": 0.01257336, "epoch": 0.4547121599278521, "flos": 19756861672320.0, "grad_norm": 1.721217849654244, "language_loss": 0.90174282, "learning_rate": 2.38796014579055e-06, "loss": 0.97896338, "num_input_tokens_seen": 162297865, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.12927246, "step": 7563, "time_per_iteration": 2.5792787075042725 }, { "auxiliary_loss_clip": 0.06458826, "auxiliary_loss_mlp": 0.01273188, "balance_loss_clip": 0.06287455, "balance_loss_mlp": 0.01259807, "epoch": 0.45477228318052004, "flos": 19943510641920.0, "grad_norm": 2.00215985886423, "language_loss": 0.72227257, "learning_rate": 2.3875780743975097e-06, "loss": 0.79959273, "num_input_tokens_seen": 162316010, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.13378906, "step": 7564, "time_per_iteration": 2.5546252727508545 }, { "auxiliary_loss_clip": 0.06460367, "auxiliary_loss_mlp": 0.01268525, "balance_loss_clip": 0.06287041, "balance_loss_mlp": 0.01255179, "epoch": 0.454832406433188, "flos": 21294735183360.0, "grad_norm": 2.242520399887956, "language_loss": 0.68780422, "learning_rate": 2.3871959883074713e-06, "loss": 0.76509321, "num_input_tokens_seen": 162336115, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.13354492, "step": 7565, "time_per_iteration": 2.5707755088806152 }, { "auxiliary_loss_clip": 0.0645172, "auxiliary_loss_mlp": 0.01271275, "balance_loss_clip": 0.06281245, "balance_loss_mlp": 0.01259062, "epoch": 0.45489252968585603, "flos": 24505630041600.0, "grad_norm": 1.9615063249456541, "language_loss": 0.80249596, "learning_rate": 2.386813887534922e-06, "loss": 0.87972587, "num_input_tokens_seen": 162355705, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.12219238, "step": 7566, "time_per_iteration": 2.579674243927002 }, { "auxiliary_loss_clip": 0.0646033, "auxiliary_loss_mlp": 0.0127306, "balance_loss_clip": 0.06288742, "balance_loss_mlp": 0.01259476, "epoch": 0.454952652938524, "flos": 17098199147520.0, "grad_norm": 1.5482748503741344, "language_loss": 0.74097949, "learning_rate": 2.3864317720943508e-06, "loss": 0.81831336, "num_input_tokens_seen": 162374055, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.13586426, "step": 7567, "time_per_iteration": 2.527191162109375 }, { "auxiliary_loss_clip": 0.06459738, "auxiliary_loss_mlp": 0.01271871, "balance_loss_clip": 0.06285984, "balance_loss_mlp": 0.012597, "epoch": 0.45501277619119196, "flos": 27636792140160.0, "grad_norm": 1.4401782612024112, "language_loss": 0.8110919, "learning_rate": 2.386049642000249e-06, "loss": 0.88840801, "num_input_tokens_seen": 162393560, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.12158203, "step": 7568, "time_per_iteration": 2.609400749206543 }, { "auxiliary_loss_clip": 0.06467447, "auxiliary_loss_mlp": 0.01275618, "balance_loss_clip": 0.06290972, "balance_loss_mlp": 0.01260693, "epoch": 0.4550728994438599, "flos": 19980840435840.0, "grad_norm": 4.264168206616597, "language_loss": 0.79999971, "learning_rate": 2.3856674972671055e-06, "loss": 0.87743038, "num_input_tokens_seen": 162413170, "router_z_loss_clip": 1.76269531, "router_z_loss_mlp": 0.14929199, "step": 7569, "time_per_iteration": 5.429542303085327 }, { "auxiliary_loss_clip": 0.06462754, "auxiliary_loss_mlp": 0.0127045, "balance_loss_clip": 0.06287986, "balance_loss_mlp": 0.01257408, "epoch": 0.4551330226965279, "flos": 26073915384960.0, "grad_norm": 1.3373823586561202, "language_loss": 0.75402015, "learning_rate": 2.385285337909412e-06, "loss": 0.83135223, "num_input_tokens_seen": 162434080, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.1305542, "step": 7570, "time_per_iteration": 2.6211514472961426 }, { "auxiliary_loss_clip": 0.06459253, "auxiliary_loss_mlp": 0.01277504, "balance_loss_clip": 0.06290788, "balance_loss_mlp": 0.01264433, "epoch": 0.45519314594919585, "flos": 32789396062080.0, "grad_norm": 1.893561885877096, "language_loss": 0.75006562, "learning_rate": 2.3849031639416596e-06, "loss": 0.82743311, "num_input_tokens_seen": 162455445, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.13061523, "step": 7571, "time_per_iteration": 2.8386220932006836 }, { "auxiliary_loss_clip": 0.06452717, "auxiliary_loss_mlp": 0.01278916, "balance_loss_clip": 0.06288673, "balance_loss_mlp": 0.01266637, "epoch": 0.4552532692018638, "flos": 19178829480960.0, "grad_norm": 1.7774688347877063, "language_loss": 0.8153581, "learning_rate": 2.3845209753783414e-06, "loss": 0.89267445, "num_input_tokens_seen": 162474940, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.12286377, "step": 7572, "time_per_iteration": 2.6324076652526855 }, { "auxiliary_loss_clip": 0.06465657, "auxiliary_loss_mlp": 0.01272385, "balance_loss_clip": 0.06290318, "balance_loss_mlp": 0.01258092, "epoch": 0.4553133924545318, "flos": 26033650698240.0, "grad_norm": 1.7660179082374692, "language_loss": 0.73190343, "learning_rate": 2.3841387722339486e-06, "loss": 0.80928385, "num_input_tokens_seen": 162493340, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.1428833, "step": 7573, "time_per_iteration": 2.613365888595581 }, { "auxiliary_loss_clip": 0.06469751, "auxiliary_loss_mlp": 0.01274346, "balance_loss_clip": 0.06295371, "balance_loss_mlp": 0.01259379, "epoch": 0.45537351570719975, "flos": 30668920312320.0, "grad_norm": 1.8373799646421776, "language_loss": 0.74647713, "learning_rate": 2.3837565545229748e-06, "loss": 0.8239181, "num_input_tokens_seen": 162514360, "router_z_loss_clip": 1.74121094, "router_z_loss_mlp": 0.1496582, "step": 7574, "time_per_iteration": 2.6576321125030518 }, { "auxiliary_loss_clip": 0.06466184, "auxiliary_loss_mlp": 0.0127255, "balance_loss_clip": 0.06291394, "balance_loss_mlp": 0.01259514, "epoch": 0.4554336389598677, "flos": 24360377788800.0, "grad_norm": 1.5925881256531236, "language_loss": 0.71569204, "learning_rate": 2.383374322259915e-06, "loss": 0.79307938, "num_input_tokens_seen": 162535240, "router_z_loss_clip": 1.74902344, "router_z_loss_mlp": 0.13037109, "step": 7575, "time_per_iteration": 2.6223132610321045 }, { "auxiliary_loss_clip": 0.06460056, "auxiliary_loss_mlp": 0.01269641, "balance_loss_clip": 0.06287441, "balance_loss_mlp": 0.01256439, "epoch": 0.4554937622125357, "flos": 20564113507200.0, "grad_norm": 2.0009881187741416, "language_loss": 0.7369895, "learning_rate": 2.3829920754592617e-06, "loss": 0.81428641, "num_input_tokens_seen": 162553880, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.13201904, "step": 7576, "time_per_iteration": 2.574157476425171 }, { "auxiliary_loss_clip": 0.06457589, "auxiliary_loss_mlp": 0.01279662, "balance_loss_clip": 0.06290544, "balance_loss_mlp": 0.01267055, "epoch": 0.45555388546520365, "flos": 22827451668480.0, "grad_norm": 1.7915797367647257, "language_loss": 0.66661143, "learning_rate": 2.382609814135511e-06, "loss": 0.74398392, "num_input_tokens_seen": 162574485, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.1262207, "step": 7577, "time_per_iteration": 2.5710322856903076 }, { "auxiliary_loss_clip": 0.06457984, "auxiliary_loss_mlp": 0.01271728, "balance_loss_clip": 0.06287745, "balance_loss_mlp": 0.01257155, "epoch": 0.4556140087178716, "flos": 21732462512640.0, "grad_norm": 1.699876754225604, "language_loss": 0.74785113, "learning_rate": 2.382227538303157e-06, "loss": 0.82514822, "num_input_tokens_seen": 162595130, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.14575195, "step": 7578, "time_per_iteration": 2.58803391456604 }, { "auxiliary_loss_clip": 0.06462601, "auxiliary_loss_mlp": 0.01271552, "balance_loss_clip": 0.06292717, "balance_loss_mlp": 0.01258755, "epoch": 0.45567413197053963, "flos": 26001645638400.0, "grad_norm": 1.7295883126371012, "language_loss": 0.70817518, "learning_rate": 2.381845247976697e-06, "loss": 0.78551674, "num_input_tokens_seen": 162615720, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.12805176, "step": 7579, "time_per_iteration": 2.602792263031006 }, { "auxiliary_loss_clip": 0.06452112, "auxiliary_loss_mlp": 0.01271058, "balance_loss_clip": 0.06284724, "balance_loss_mlp": 0.01259166, "epoch": 0.4557342552232076, "flos": 21543046358400.0, "grad_norm": 1.6108396274819543, "language_loss": 0.78863657, "learning_rate": 2.381462943170627e-06, "loss": 0.86586821, "num_input_tokens_seen": 162635825, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.11901855, "step": 7580, "time_per_iteration": 2.5839579105377197 }, { "auxiliary_loss_clip": 0.06457646, "auxiliary_loss_mlp": 0.01272472, "balance_loss_clip": 0.06289355, "balance_loss_mlp": 0.0125949, "epoch": 0.45579437847587556, "flos": 40010932673280.0, "grad_norm": 1.543638122364142, "language_loss": 0.69025713, "learning_rate": 2.381080623899444e-06, "loss": 0.76755828, "num_input_tokens_seen": 162659130, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.12976074, "step": 7581, "time_per_iteration": 2.740442991256714 }, { "auxiliary_loss_clip": 0.06443305, "auxiliary_loss_mlp": 0.01272887, "balance_loss_clip": 0.06279543, "balance_loss_mlp": 0.01259887, "epoch": 0.4558545017285435, "flos": 31146409203840.0, "grad_norm": 1.514889455670074, "language_loss": 0.73144996, "learning_rate": 2.3806982901776455e-06, "loss": 0.80861193, "num_input_tokens_seen": 162681665, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.12994385, "step": 7582, "time_per_iteration": 2.6565284729003906 }, { "auxiliary_loss_clip": 0.06463415, "auxiliary_loss_mlp": 0.01275156, "balance_loss_clip": 0.06287994, "balance_loss_mlp": 0.01260958, "epoch": 0.4559146249812115, "flos": 21732210950400.0, "grad_norm": 1.9681324102153188, "language_loss": 0.72831941, "learning_rate": 2.380315942019729e-06, "loss": 0.80570513, "num_input_tokens_seen": 162702040, "router_z_loss_clip": 1.75390625, "router_z_loss_mlp": 0.14221191, "step": 7583, "time_per_iteration": 2.6604206562042236 }, { "auxiliary_loss_clip": 0.0646264, "auxiliary_loss_mlp": 0.0127091, "balance_loss_clip": 0.06288638, "balance_loss_mlp": 0.01256587, "epoch": 0.45597474823387946, "flos": 23812841283840.0, "grad_norm": 1.6451883674710432, "language_loss": 0.73315072, "learning_rate": 2.379933579440195e-06, "loss": 0.8104862, "num_input_tokens_seen": 162722375, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.14337158, "step": 7584, "time_per_iteration": 2.6215012073516846 }, { "auxiliary_loss_clip": 0.0645517, "auxiliary_loss_mlp": 0.0127013, "balance_loss_clip": 0.06285851, "balance_loss_mlp": 0.01258019, "epoch": 0.4560348714865474, "flos": 31913857549440.0, "grad_norm": 1.4779115722113847, "language_loss": 0.68147016, "learning_rate": 2.379551202453541e-06, "loss": 0.75872314, "num_input_tokens_seen": 162746095, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.12109375, "step": 7585, "time_per_iteration": 2.659856081008911 }, { "auxiliary_loss_clip": 0.06453468, "auxiliary_loss_mlp": 0.01267077, "balance_loss_clip": 0.06282235, "balance_loss_mlp": 0.01253982, "epoch": 0.4560949947392154, "flos": 22054427026560.0, "grad_norm": 1.6060533233049117, "language_loss": 0.7650013, "learning_rate": 2.379168811074267e-06, "loss": 0.84220672, "num_input_tokens_seen": 162766330, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.13104248, "step": 7586, "time_per_iteration": 2.5875580310821533 }, { "auxiliary_loss_clip": 0.06457493, "auxiliary_loss_mlp": 0.01266519, "balance_loss_clip": 0.06288782, "balance_loss_mlp": 0.01254491, "epoch": 0.45615511799188335, "flos": 24578738328960.0, "grad_norm": 1.7944842235597023, "language_loss": 0.78581774, "learning_rate": 2.3787864053168747e-06, "loss": 0.86305785, "num_input_tokens_seen": 162784755, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.12023926, "step": 7587, "time_per_iteration": 2.6094186305999756 }, { "auxiliary_loss_clip": 0.06467285, "auxiliary_loss_mlp": 0.01274425, "balance_loss_clip": 0.06286936, "balance_loss_mlp": 0.01260614, "epoch": 0.4562152412445513, "flos": 18336260350080.0, "grad_norm": 1.9735838999235482, "language_loss": 0.69755793, "learning_rate": 2.378403985195863e-06, "loss": 0.77497506, "num_input_tokens_seen": 162803850, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.13818359, "step": 7588, "time_per_iteration": 2.5787951946258545 }, { "auxiliary_loss_clip": 0.0645548, "auxiliary_loss_mlp": 0.01270138, "balance_loss_clip": 0.06288096, "balance_loss_mlp": 0.01257776, "epoch": 0.4562753644972193, "flos": 13521595144320.0, "grad_norm": 1.8008913570892404, "language_loss": 0.80049694, "learning_rate": 2.378021550725735e-06, "loss": 0.87775314, "num_input_tokens_seen": 162820775, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.1237793, "step": 7589, "time_per_iteration": 2.5542690753936768 }, { "auxiliary_loss_clip": 0.06460995, "auxiliary_loss_mlp": 0.01271373, "balance_loss_clip": 0.06290904, "balance_loss_mlp": 0.01258534, "epoch": 0.45633548774988725, "flos": 29646871735680.0, "grad_norm": 1.8761510105291752, "language_loss": 0.63084239, "learning_rate": 2.377639101920992e-06, "loss": 0.70816606, "num_input_tokens_seen": 162839695, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.128479, "step": 7590, "time_per_iteration": 2.679229497909546 }, { "auxiliary_loss_clip": 0.06454643, "auxiliary_loss_mlp": 0.01268843, "balance_loss_clip": 0.06283153, "balance_loss_mlp": 0.01256326, "epoch": 0.4563956110025552, "flos": 22239398914560.0, "grad_norm": 1.7835084918068522, "language_loss": 0.7326864, "learning_rate": 2.377256638796135e-06, "loss": 0.80992126, "num_input_tokens_seen": 162856095, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.12518311, "step": 7591, "time_per_iteration": 2.531216621398926 }, { "auxiliary_loss_clip": 0.06458623, "auxiliary_loss_mlp": 0.01271878, "balance_loss_clip": 0.06285951, "balance_loss_mlp": 0.01257835, "epoch": 0.45645573425522323, "flos": 17097696023040.0, "grad_norm": 3.1509019690342264, "language_loss": 0.77129596, "learning_rate": 2.3768741613656695e-06, "loss": 0.84860098, "num_input_tokens_seen": 162874070, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.14031982, "step": 7592, "time_per_iteration": 2.5378007888793945 }, { "auxiliary_loss_clip": 0.06458239, "auxiliary_loss_mlp": 0.01269298, "balance_loss_clip": 0.06286588, "balance_loss_mlp": 0.0125724, "epoch": 0.4565158575078912, "flos": 20337367559040.0, "grad_norm": 2.362423326637095, "language_loss": 0.6969291, "learning_rate": 2.376491669644098e-06, "loss": 0.77420449, "num_input_tokens_seen": 162891000, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.12054443, "step": 7593, "time_per_iteration": 2.560882568359375 }, { "auxiliary_loss_clip": 0.06448128, "auxiliary_loss_mlp": 0.0126613, "balance_loss_clip": 0.06282803, "balance_loss_mlp": 0.01254614, "epoch": 0.45657598076055916, "flos": 23989008493440.0, "grad_norm": 1.7413855697345784, "language_loss": 0.84553814, "learning_rate": 2.3761091636459248e-06, "loss": 0.92268074, "num_input_tokens_seen": 162910120, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.1151123, "step": 7594, "time_per_iteration": 2.579482316970825 }, { "auxiliary_loss_clip": 0.06344287, "auxiliary_loss_mlp": 0.01272096, "balance_loss_clip": 0.06272298, "balance_loss_mlp": 0.01269001, "epoch": 0.45663610401322713, "flos": 69382812908160.0, "grad_norm": 1.1645408020775931, "language_loss": 0.52650017, "learning_rate": 2.375726643385654e-06, "loss": 0.60266399, "num_input_tokens_seen": 162963720, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 0.03088379, "step": 7595, "time_per_iteration": 4.641978025436401 }, { "auxiliary_loss_clip": 0.06462801, "auxiliary_loss_mlp": 0.01269634, "balance_loss_clip": 0.0628661, "balance_loss_mlp": 0.01256163, "epoch": 0.4566962272658951, "flos": 15152884358400.0, "grad_norm": 4.0583019746379385, "language_loss": 0.87407756, "learning_rate": 2.3753441088777915e-06, "loss": 0.95140189, "num_input_tokens_seen": 162975760, "router_z_loss_clip": 1.76171875, "router_z_loss_mlp": 0.13482666, "step": 7596, "time_per_iteration": 2.5268282890319824 }, { "auxiliary_loss_clip": 0.06456859, "auxiliary_loss_mlp": 0.01268977, "balance_loss_clip": 0.06284578, "balance_loss_mlp": 0.01256174, "epoch": 0.45675635051856306, "flos": 18703395014400.0, "grad_norm": 1.5438686544541236, "language_loss": 0.77865553, "learning_rate": 2.374961560136843e-06, "loss": 0.85591388, "num_input_tokens_seen": 162994865, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.12805176, "step": 7597, "time_per_iteration": 2.5469021797180176 }, { "auxiliary_loss_clip": 0.06452826, "auxiliary_loss_mlp": 0.01270521, "balance_loss_clip": 0.062812, "balance_loss_mlp": 0.01256889, "epoch": 0.456816473771231, "flos": 19104211820160.0, "grad_norm": 1.5853151388335558, "language_loss": 0.78313375, "learning_rate": 2.374578997177314e-06, "loss": 0.86036724, "num_input_tokens_seen": 163014730, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.1362915, "step": 7598, "time_per_iteration": 2.561800003051758 }, { "auxiliary_loss_clip": 0.06454961, "auxiliary_loss_mlp": 0.01267884, "balance_loss_clip": 0.06286369, "balance_loss_mlp": 0.01255021, "epoch": 0.456876597023899, "flos": 28957730630400.0, "grad_norm": 5.329731153070066, "language_loss": 0.72114855, "learning_rate": 2.374196420013712e-06, "loss": 0.79837698, "num_input_tokens_seen": 163033405, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.12872314, "step": 7599, "time_per_iteration": 4.1342432498931885 }, { "auxiliary_loss_clip": 0.0645186, "auxiliary_loss_mlp": 0.01269998, "balance_loss_clip": 0.06282688, "balance_loss_mlp": 0.01257326, "epoch": 0.45693672027656695, "flos": 23295297340800.0, "grad_norm": 1.7984311516511415, "language_loss": 0.70202971, "learning_rate": 2.373813828660544e-06, "loss": 0.7792483, "num_input_tokens_seen": 163051400, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.12683105, "step": 7600, "time_per_iteration": 2.568171262741089 }, { "auxiliary_loss_clip": 0.06455858, "auxiliary_loss_mlp": 0.01271176, "balance_loss_clip": 0.06284343, "balance_loss_mlp": 0.01258612, "epoch": 0.4569968435292349, "flos": 20564448923520.0, "grad_norm": 1.705880858367069, "language_loss": 0.79526472, "learning_rate": 2.373431223132319e-06, "loss": 0.87253499, "num_input_tokens_seen": 163069250, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.12567139, "step": 7601, "time_per_iteration": 2.575735330581665 }, { "auxiliary_loss_clip": 0.06458814, "auxiliary_loss_mlp": 0.01272809, "balance_loss_clip": 0.06285684, "balance_loss_mlp": 0.01260852, "epoch": 0.4570569667819029, "flos": 41292403090560.0, "grad_norm": 1.8580444845997672, "language_loss": 0.72215319, "learning_rate": 2.3730486034435448e-06, "loss": 0.79946941, "num_input_tokens_seen": 163091755, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.11956787, "step": 7602, "time_per_iteration": 2.7469427585601807 }, { "auxiliary_loss_clip": 0.06455284, "auxiliary_loss_mlp": 0.01272057, "balance_loss_clip": 0.06282726, "balance_loss_mlp": 0.01257168, "epoch": 0.45711709003457085, "flos": 26038807724160.0, "grad_norm": 1.673345966447782, "language_loss": 0.73891866, "learning_rate": 2.372665969608729e-06, "loss": 0.81619203, "num_input_tokens_seen": 163111600, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.14892578, "step": 7603, "time_per_iteration": 2.595341444015503 }, { "auxiliary_loss_clip": 0.06455013, "auxiliary_loss_mlp": 0.01270538, "balance_loss_clip": 0.06285481, "balance_loss_mlp": 0.01256358, "epoch": 0.4571772132872388, "flos": 22163649223680.0, "grad_norm": 1.7671383988836185, "language_loss": 0.83375078, "learning_rate": 2.372283321642383e-06, "loss": 0.91100627, "num_input_tokens_seen": 163127350, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.14160156, "step": 7604, "time_per_iteration": 2.5510354042053223 }, { "auxiliary_loss_clip": 0.06466272, "auxiliary_loss_mlp": 0.01273354, "balance_loss_clip": 0.06287531, "balance_loss_mlp": 0.01259633, "epoch": 0.45723733653990684, "flos": 23885739936000.0, "grad_norm": 2.05546581105731, "language_loss": 0.86556798, "learning_rate": 2.371900659559016e-06, "loss": 0.94296432, "num_input_tokens_seen": 163145855, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.1373291, "step": 7605, "time_per_iteration": 2.6038782596588135 }, { "auxiliary_loss_clip": 0.06464148, "auxiliary_loss_mlp": 0.01267453, "balance_loss_clip": 0.06285994, "balance_loss_mlp": 0.01253977, "epoch": 0.4572974597925748, "flos": 16877197203840.0, "grad_norm": 1.7079093449780824, "language_loss": 0.7402339, "learning_rate": 2.371517983373138e-06, "loss": 0.81754994, "num_input_tokens_seen": 163163830, "router_z_loss_clip": 1.78222656, "router_z_loss_mlp": 0.13470459, "step": 7606, "time_per_iteration": 2.5222294330596924 }, { "auxiliary_loss_clip": 0.06464493, "auxiliary_loss_mlp": 0.01272387, "balance_loss_clip": 0.0628825, "balance_loss_mlp": 0.01258386, "epoch": 0.45735758304524277, "flos": 13776530791680.0, "grad_norm": 2.0350654191051993, "language_loss": 0.80115885, "learning_rate": 2.371135293099262e-06, "loss": 0.87852764, "num_input_tokens_seen": 163180700, "router_z_loss_clip": 1.76269531, "router_z_loss_mlp": 0.13983154, "step": 7607, "time_per_iteration": 2.534003973007202 }, { "auxiliary_loss_clip": 0.06461753, "auxiliary_loss_mlp": 0.01270985, "balance_loss_clip": 0.06289804, "balance_loss_mlp": 0.01257485, "epoch": 0.45741770629791073, "flos": 21106283351040.0, "grad_norm": 1.6716668371471797, "language_loss": 0.8096441, "learning_rate": 2.3707525887518982e-06, "loss": 0.88697147, "num_input_tokens_seen": 163199450, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.1350708, "step": 7608, "time_per_iteration": 3.970085859298706 }, { "auxiliary_loss_clip": 0.06456564, "auxiliary_loss_mlp": 0.01267437, "balance_loss_clip": 0.0628631, "balance_loss_mlp": 0.01254508, "epoch": 0.4574778295505787, "flos": 23119675182720.0, "grad_norm": 1.5547372636652912, "language_loss": 0.68481714, "learning_rate": 2.370369870345559e-06, "loss": 0.76205713, "num_input_tokens_seen": 163217875, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.12915039, "step": 7609, "time_per_iteration": 4.078218460083008 }, { "auxiliary_loss_clip": 0.06457556, "auxiliary_loss_mlp": 0.01270987, "balance_loss_clip": 0.06285779, "balance_loss_mlp": 0.01257927, "epoch": 0.45753795280324666, "flos": 24359832737280.0, "grad_norm": 1.7285547257603837, "language_loss": 0.80970389, "learning_rate": 2.369987137894757e-06, "loss": 0.8869893, "num_input_tokens_seen": 163237430, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.13061523, "step": 7610, "time_per_iteration": 2.6021580696105957 }, { "auxiliary_loss_clip": 0.06469572, "auxiliary_loss_mlp": 0.0126978, "balance_loss_clip": 0.06292073, "balance_loss_mlp": 0.01256596, "epoch": 0.4575980760559146, "flos": 16659297861120.0, "grad_norm": 1.9425940704969955, "language_loss": 0.82391834, "learning_rate": 2.3696043914140057e-06, "loss": 0.90131181, "num_input_tokens_seen": 163253905, "router_z_loss_clip": 1.77539062, "router_z_loss_mlp": 0.1317749, "step": 7611, "time_per_iteration": 2.555384397506714 }, { "auxiliary_loss_clip": 0.06456657, "auxiliary_loss_mlp": 0.01270887, "balance_loss_clip": 0.06286409, "balance_loss_mlp": 0.01257404, "epoch": 0.4576581993085826, "flos": 35919006860160.0, "grad_norm": 1.6181219036855552, "language_loss": 0.74274457, "learning_rate": 2.369221630917819e-06, "loss": 0.82001996, "num_input_tokens_seen": 163274285, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.13482666, "step": 7612, "time_per_iteration": 2.678192138671875 }, { "auxiliary_loss_clip": 0.06456057, "auxiliary_loss_mlp": 0.01268973, "balance_loss_clip": 0.06286793, "balance_loss_mlp": 0.01255479, "epoch": 0.45771832256125056, "flos": 20085995710080.0, "grad_norm": 1.6552171447272326, "language_loss": 0.85085678, "learning_rate": 2.368838856420711e-06, "loss": 0.92810702, "num_input_tokens_seen": 163293150, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.13500977, "step": 7613, "time_per_iteration": 2.580158233642578 }, { "auxiliary_loss_clip": 0.06457804, "auxiliary_loss_mlp": 0.0127392, "balance_loss_clip": 0.06285816, "balance_loss_mlp": 0.01260097, "epoch": 0.4577784458139185, "flos": 10749056520960.0, "grad_norm": 2.1335601227562697, "language_loss": 0.7558682, "learning_rate": 2.3684560679371965e-06, "loss": 0.83318543, "num_input_tokens_seen": 163310065, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.13830566, "step": 7614, "time_per_iteration": 2.5293381214141846 }, { "auxiliary_loss_clip": 0.06456588, "auxiliary_loss_mlp": 0.01272753, "balance_loss_clip": 0.06287532, "balance_loss_mlp": 0.01259634, "epoch": 0.4578385690665865, "flos": 21913577112960.0, "grad_norm": 2.003648162827131, "language_loss": 0.74757016, "learning_rate": 2.368073265481791e-06, "loss": 0.82486361, "num_input_tokens_seen": 163329415, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.13128662, "step": 7615, "time_per_iteration": 2.56270170211792 }, { "auxiliary_loss_clip": 0.06335883, "auxiliary_loss_mlp": 0.01255015, "balance_loss_clip": 0.06264865, "balance_loss_mlp": 0.01252719, "epoch": 0.45789869231925445, "flos": 64774559036160.0, "grad_norm": 0.7496852602645614, "language_loss": 0.575948, "learning_rate": 2.3676904490690105e-06, "loss": 0.65185702, "num_input_tokens_seen": 163385875, "router_z_loss_clip": 0.7109375, "router_z_loss_mlp": 0.02296448, "step": 7616, "time_per_iteration": 3.145582437515259 }, { "auxiliary_loss_clip": 0.06458528, "auxiliary_loss_mlp": 0.01273346, "balance_loss_clip": 0.0628984, "balance_loss_mlp": 0.01259434, "epoch": 0.4579588155719224, "flos": 16149594274560.0, "grad_norm": 1.6042152187076972, "language_loss": 0.71502084, "learning_rate": 2.3673076187133704e-06, "loss": 0.79233956, "num_input_tokens_seen": 163405170, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.13903809, "step": 7617, "time_per_iteration": 2.5554535388946533 }, { "auxiliary_loss_clip": 0.06461018, "auxiliary_loss_mlp": 0.01271203, "balance_loss_clip": 0.06288474, "balance_loss_mlp": 0.01257112, "epoch": 0.45801893882459044, "flos": 21401609466240.0, "grad_norm": 1.9995024693738797, "language_loss": 0.76192737, "learning_rate": 2.36692477442939e-06, "loss": 0.83924961, "num_input_tokens_seen": 163423155, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.14099121, "step": 7618, "time_per_iteration": 2.562081813812256 }, { "auxiliary_loss_clip": 0.06459647, "auxiliary_loss_mlp": 0.0127148, "balance_loss_clip": 0.06287578, "balance_loss_mlp": 0.01258576, "epoch": 0.4580790620772584, "flos": 19542609982080.0, "grad_norm": 2.8986841986823806, "language_loss": 0.76760709, "learning_rate": 2.366541916231585e-06, "loss": 0.84491837, "num_input_tokens_seen": 163442450, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.12896729, "step": 7619, "time_per_iteration": 2.658968687057495 }, { "auxiliary_loss_clip": 0.0645093, "auxiliary_loss_mlp": 0.01272491, "balance_loss_clip": 0.06282894, "balance_loss_mlp": 0.01260141, "epoch": 0.45813918532992637, "flos": 16586608844160.0, "grad_norm": 1.8162977291157199, "language_loss": 0.7241255, "learning_rate": 2.366159044134473e-06, "loss": 0.80135965, "num_input_tokens_seen": 163459810, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.12347412, "step": 7620, "time_per_iteration": 2.5670583248138428 }, { "auxiliary_loss_clip": 0.06452797, "auxiliary_loss_mlp": 0.01270954, "balance_loss_clip": 0.06284702, "balance_loss_mlp": 0.01258777, "epoch": 0.45819930858259433, "flos": 42240085568640.0, "grad_norm": 1.7193688513303025, "language_loss": 0.7846157, "learning_rate": 2.3657761581525748e-06, "loss": 0.86185324, "num_input_tokens_seen": 163482970, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.12182617, "step": 7621, "time_per_iteration": 2.7355709075927734 }, { "auxiliary_loss_clip": 0.06341091, "auxiliary_loss_mlp": 0.01253822, "balance_loss_clip": 0.06269707, "balance_loss_mlp": 0.01251417, "epoch": 0.4582594318352623, "flos": 63733335073920.0, "grad_norm": 0.750967553372707, "language_loss": 0.64733446, "learning_rate": 2.3653932583004063e-06, "loss": 0.72328359, "num_input_tokens_seen": 163545330, "router_z_loss_clip": 0.71191406, "router_z_loss_mlp": 0.02401733, "step": 7622, "time_per_iteration": 3.2140088081359863 }, { "auxiliary_loss_clip": 0.06461957, "auxiliary_loss_mlp": 0.01276716, "balance_loss_clip": 0.06289338, "balance_loss_mlp": 0.01262411, "epoch": 0.45831955508793026, "flos": 26877226078080.0, "grad_norm": 1.6823745321887966, "language_loss": 0.8026793, "learning_rate": 2.3650103445924903e-06, "loss": 0.88006604, "num_input_tokens_seen": 163564620, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.14306641, "step": 7623, "time_per_iteration": 2.596104621887207 }, { "auxiliary_loss_clip": 0.06469223, "auxiliary_loss_mlp": 0.01272781, "balance_loss_clip": 0.0629383, "balance_loss_mlp": 0.01259364, "epoch": 0.45837967834059823, "flos": 18739886267520.0, "grad_norm": 1.821724481024648, "language_loss": 0.70682669, "learning_rate": 2.3646274170433452e-06, "loss": 0.78424668, "num_input_tokens_seen": 163581010, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.13421631, "step": 7624, "time_per_iteration": 2.575225830078125 }, { "auxiliary_loss_clip": 0.06464781, "auxiliary_loss_mlp": 0.01274027, "balance_loss_clip": 0.06291962, "balance_loss_mlp": 0.01261141, "epoch": 0.4584398015932662, "flos": 21184380956160.0, "grad_norm": 1.808819258693687, "language_loss": 0.73626542, "learning_rate": 2.364244475667491e-06, "loss": 0.81365347, "num_input_tokens_seen": 163599955, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.12908936, "step": 7625, "time_per_iteration": 2.580902338027954 }, { "auxiliary_loss_clip": 0.06461051, "auxiliary_loss_mlp": 0.01271492, "balance_loss_clip": 0.06291258, "balance_loss_mlp": 0.0125894, "epoch": 0.45849992484593416, "flos": 19795826620800.0, "grad_norm": 2.0582404904013014, "language_loss": 0.78923404, "learning_rate": 2.363861520479451e-06, "loss": 0.86655951, "num_input_tokens_seen": 163618545, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.12542725, "step": 7626, "time_per_iteration": 2.58606219291687 }, { "auxiliary_loss_clip": 0.06467532, "auxiliary_loss_mlp": 0.01273015, "balance_loss_clip": 0.06291856, "balance_loss_mlp": 0.01259717, "epoch": 0.4585600480986021, "flos": 18229134504960.0, "grad_norm": 1.8993356235723486, "language_loss": 0.85295558, "learning_rate": 2.3634785514937445e-06, "loss": 0.93036109, "num_input_tokens_seen": 163636055, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.13305664, "step": 7627, "time_per_iteration": 2.5853350162506104 }, { "auxiliary_loss_clip": 0.06470594, "auxiliary_loss_mlp": 0.01269972, "balance_loss_clip": 0.06293575, "balance_loss_mlp": 0.01255541, "epoch": 0.4586201713512701, "flos": 29029748814720.0, "grad_norm": 1.5882156951519921, "language_loss": 0.70119631, "learning_rate": 2.3630955687248953e-06, "loss": 0.77860188, "num_input_tokens_seen": 163657485, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.14416504, "step": 7628, "time_per_iteration": 2.6478402614593506 }, { "auxiliary_loss_clip": 0.06461503, "auxiliary_loss_mlp": 0.01270214, "balance_loss_clip": 0.06292088, "balance_loss_mlp": 0.01256863, "epoch": 0.45868029460393805, "flos": 23411395572480.0, "grad_norm": 1.530947156157187, "language_loss": 0.7887854, "learning_rate": 2.3627125721874265e-06, "loss": 0.86610258, "num_input_tokens_seen": 163676030, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.13360596, "step": 7629, "time_per_iteration": 2.5889902114868164 }, { "auxiliary_loss_clip": 0.06467562, "auxiliary_loss_mlp": 0.01273601, "balance_loss_clip": 0.06289398, "balance_loss_mlp": 0.01259064, "epoch": 0.458740417856606, "flos": 18227625131520.0, "grad_norm": 1.9443914256491044, "language_loss": 0.79922605, "learning_rate": 2.3623295618958595e-06, "loss": 0.87663764, "num_input_tokens_seen": 163694490, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.14520264, "step": 7630, "time_per_iteration": 2.584712266921997 }, { "auxiliary_loss_clip": 0.06471275, "auxiliary_loss_mlp": 0.01272796, "balance_loss_clip": 0.06295417, "balance_loss_mlp": 0.01259278, "epoch": 0.458800541109274, "flos": 34577341683840.0, "grad_norm": 2.658916305048605, "language_loss": 0.72385144, "learning_rate": 2.3619465378647198e-06, "loss": 0.80129218, "num_input_tokens_seen": 163717035, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.13513184, "step": 7631, "time_per_iteration": 2.696261167526245 }, { "auxiliary_loss_clip": 0.06466545, "auxiliary_loss_mlp": 0.01269036, "balance_loss_clip": 0.06294569, "balance_loss_mlp": 0.01255339, "epoch": 0.458860664361942, "flos": 17717837690880.0, "grad_norm": 2.338437826381958, "language_loss": 0.719868, "learning_rate": 2.361563500108531e-06, "loss": 0.79722381, "num_input_tokens_seen": 163734525, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.13696289, "step": 7632, "time_per_iteration": 2.5794320106506348 }, { "auxiliary_loss_clip": 0.06473083, "auxiliary_loss_mlp": 0.01273154, "balance_loss_clip": 0.06295769, "balance_loss_mlp": 0.01258789, "epoch": 0.45892078761460997, "flos": 18447746607360.0, "grad_norm": 2.699045315085646, "language_loss": 0.69770348, "learning_rate": 2.3611804486418178e-06, "loss": 0.7751658, "num_input_tokens_seen": 163752860, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.14379883, "step": 7633, "time_per_iteration": 2.5717599391937256 }, { "auxiliary_loss_clip": 0.06464963, "auxiliary_loss_mlp": 0.01272918, "balance_loss_clip": 0.06292187, "balance_loss_mlp": 0.01259506, "epoch": 0.45898091086727794, "flos": 22679306449920.0, "grad_norm": 1.442568901649273, "language_loss": 0.8123678, "learning_rate": 2.3607973834791062e-06, "loss": 0.88974661, "num_input_tokens_seen": 163772495, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.13409424, "step": 7634, "time_per_iteration": 2.5769546031951904 }, { "auxiliary_loss_clip": 0.06476215, "auxiliary_loss_mlp": 0.01277944, "balance_loss_clip": 0.06296708, "balance_loss_mlp": 0.01263407, "epoch": 0.4590410341199459, "flos": 21659396152320.0, "grad_norm": 1.6131226903131861, "language_loss": 0.81672889, "learning_rate": 2.3604143046349216e-06, "loss": 0.89427054, "num_input_tokens_seen": 163791475, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.14532471, "step": 7635, "time_per_iteration": 4.020402908325195 }, { "auxiliary_loss_clip": 0.06463003, "auxiliary_loss_mlp": 0.01278462, "balance_loss_clip": 0.06292294, "balance_loss_mlp": 0.01265641, "epoch": 0.45910115737261387, "flos": 36543676648320.0, "grad_norm": 1.4057340060093995, "language_loss": 0.64789486, "learning_rate": 2.3600312121237905e-06, "loss": 0.72530949, "num_input_tokens_seen": 163812995, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.12835693, "step": 7636, "time_per_iteration": 2.7290828227996826 }, { "auxiliary_loss_clip": 0.06456192, "auxiliary_loss_mlp": 0.01270247, "balance_loss_clip": 0.06288644, "balance_loss_mlp": 0.01256932, "epoch": 0.45916128062528183, "flos": 24425771500800.0, "grad_norm": 1.883786025888886, "language_loss": 0.80535829, "learning_rate": 2.3596481059602395e-06, "loss": 0.88262266, "num_input_tokens_seen": 163833945, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.13330078, "step": 7637, "time_per_iteration": 2.753634214401245 }, { "auxiliary_loss_clip": 0.06467338, "auxiliary_loss_mlp": 0.01275467, "balance_loss_clip": 0.06291569, "balance_loss_mlp": 0.01260089, "epoch": 0.4592214038779498, "flos": 23228687744640.0, "grad_norm": 1.497370745955865, "language_loss": 0.75257576, "learning_rate": 2.3592649861587965e-06, "loss": 0.83000374, "num_input_tokens_seen": 163853885, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.15374756, "step": 7638, "time_per_iteration": 4.0221288204193115 }, { "auxiliary_loss_clip": 0.06460033, "auxiliary_loss_mlp": 0.01267793, "balance_loss_clip": 0.06291885, "balance_loss_mlp": 0.01254531, "epoch": 0.45928152713061776, "flos": 19178200575360.0, "grad_norm": 1.6829994477559962, "language_loss": 0.74115592, "learning_rate": 2.358881852733989e-06, "loss": 0.81843412, "num_input_tokens_seen": 163871855, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.13244629, "step": 7639, "time_per_iteration": 2.565018653869629 }, { "auxiliary_loss_clip": 0.06467459, "auxiliary_loss_mlp": 0.01271091, "balance_loss_clip": 0.06293847, "balance_loss_mlp": 0.01257293, "epoch": 0.4593416503832857, "flos": 22420513514880.0, "grad_norm": 1.7847310231217728, "language_loss": 0.68404877, "learning_rate": 2.358498705700346e-06, "loss": 0.7614342, "num_input_tokens_seen": 163891450, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.13800049, "step": 7640, "time_per_iteration": 2.5933034420013428 }, { "auxiliary_loss_clip": 0.06465831, "auxiliary_loss_mlp": 0.01276333, "balance_loss_clip": 0.06289478, "balance_loss_mlp": 0.01262945, "epoch": 0.4594017736359537, "flos": 18886228623360.0, "grad_norm": 1.6623893211236065, "language_loss": 0.75923258, "learning_rate": 2.3581155450723958e-06, "loss": 0.83665419, "num_input_tokens_seen": 163909345, "router_z_loss_clip": 1.76171875, "router_z_loss_mlp": 0.13409424, "step": 7641, "time_per_iteration": 2.5703787803649902 }, { "auxiliary_loss_clip": 0.06466466, "auxiliary_loss_mlp": 0.01271195, "balance_loss_clip": 0.06290952, "balance_loss_mlp": 0.01257164, "epoch": 0.45946189688862166, "flos": 20524268090880.0, "grad_norm": 1.7630707937338956, "language_loss": 0.7478832, "learning_rate": 2.357732370864668e-06, "loss": 0.8252598, "num_input_tokens_seen": 163926940, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.14031982, "step": 7642, "time_per_iteration": 2.5548696517944336 }, { "auxiliary_loss_clip": 0.06351062, "auxiliary_loss_mlp": 0.01263188, "balance_loss_clip": 0.06279276, "balance_loss_mlp": 0.01260433, "epoch": 0.4595220201412896, "flos": 61422436920960.0, "grad_norm": 0.799324330481736, "language_loss": 0.58210307, "learning_rate": 2.357349183091694e-06, "loss": 0.65824556, "num_input_tokens_seen": 163977785, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 0.02757263, "step": 7643, "time_per_iteration": 2.962514638900757 }, { "auxiliary_loss_clip": 0.06471653, "auxiliary_loss_mlp": 0.01274199, "balance_loss_clip": 0.06291446, "balance_loss_mlp": 0.01260228, "epoch": 0.4595821433939576, "flos": 23337616452480.0, "grad_norm": 1.4830948336780259, "language_loss": 0.93055296, "learning_rate": 2.3569659817680016e-06, "loss": 1.00801134, "num_input_tokens_seen": 163996630, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.13977051, "step": 7644, "time_per_iteration": 2.5966315269470215 }, { "auxiliary_loss_clip": 0.06468883, "auxiliary_loss_mlp": 0.01270306, "balance_loss_clip": 0.0629412, "balance_loss_mlp": 0.0125612, "epoch": 0.4596422666466256, "flos": 14287492189440.0, "grad_norm": 2.0319837423728626, "language_loss": 0.83020723, "learning_rate": 2.3565827669081243e-06, "loss": 0.90759909, "num_input_tokens_seen": 164013190, "router_z_loss_clip": 1.74804688, "router_z_loss_mlp": 0.14196777, "step": 7645, "time_per_iteration": 2.5539400577545166 }, { "auxiliary_loss_clip": 0.0634493, "auxiliary_loss_mlp": 0.01254842, "balance_loss_clip": 0.0627297, "balance_loss_mlp": 0.01252614, "epoch": 0.4597023898992936, "flos": 65747188103040.0, "grad_norm": 0.7448845113805578, "language_loss": 0.59839272, "learning_rate": 2.356199538526593e-06, "loss": 0.67439044, "num_input_tokens_seen": 164074030, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 0.02229309, "step": 7646, "time_per_iteration": 3.148505926132202 }, { "auxiliary_loss_clip": 0.06464665, "auxiliary_loss_mlp": 0.01271188, "balance_loss_clip": 0.06291567, "balance_loss_mlp": 0.01258075, "epoch": 0.45976251315196154, "flos": 26914430090880.0, "grad_norm": 1.4885176390143624, "language_loss": 0.72567213, "learning_rate": 2.355816296637939e-06, "loss": 0.80303061, "num_input_tokens_seen": 164095515, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.13104248, "step": 7647, "time_per_iteration": 2.622023582458496 }, { "auxiliary_loss_clip": 0.06466489, "auxiliary_loss_mlp": 0.0127469, "balance_loss_clip": 0.06289456, "balance_loss_mlp": 0.01260707, "epoch": 0.4598226364046295, "flos": 26625854229120.0, "grad_norm": 1.5650456051667387, "language_loss": 0.6691364, "learning_rate": 2.3554330412566957e-06, "loss": 0.74654818, "num_input_tokens_seen": 164117270, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.13970947, "step": 7648, "time_per_iteration": 3.9934587478637695 }, { "auxiliary_loss_clip": 0.06462634, "auxiliary_loss_mlp": 0.01273053, "balance_loss_clip": 0.0628994, "balance_loss_mlp": 0.01259767, "epoch": 0.45988275965729747, "flos": 24394395346560.0, "grad_norm": 1.397663361130266, "language_loss": 0.79203701, "learning_rate": 2.3550497723973953e-06, "loss": 0.86939389, "num_input_tokens_seen": 164137850, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.1328125, "step": 7649, "time_per_iteration": 4.056555986404419 }, { "auxiliary_loss_clip": 0.06460096, "auxiliary_loss_mlp": 0.01271551, "balance_loss_clip": 0.06287681, "balance_loss_mlp": 0.01257872, "epoch": 0.45994288290996543, "flos": 24542834054400.0, "grad_norm": 1.9316041394322332, "language_loss": 0.70086479, "learning_rate": 2.3546664900745726e-06, "loss": 0.7781812, "num_input_tokens_seen": 164157960, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.13671875, "step": 7650, "time_per_iteration": 2.6158359050750732 }, { "auxiliary_loss_clip": 0.06464157, "auxiliary_loss_mlp": 0.01273918, "balance_loss_clip": 0.06284948, "balance_loss_mlp": 0.01258505, "epoch": 0.4600030061626334, "flos": 14835573745920.0, "grad_norm": 1.824169870027379, "language_loss": 0.83864522, "learning_rate": 2.354283194302761e-06, "loss": 0.916026, "num_input_tokens_seen": 164174590, "router_z_loss_clip": 1.79199219, "router_z_loss_mlp": 0.15386963, "step": 7651, "time_per_iteration": 2.554525136947632 }, { "auxiliary_loss_clip": 0.06456776, "auxiliary_loss_mlp": 0.01269159, "balance_loss_clip": 0.0628618, "balance_loss_mlp": 0.01255724, "epoch": 0.46006312941530136, "flos": 18119702672640.0, "grad_norm": 1.7748020780090217, "language_loss": 0.75693202, "learning_rate": 2.3538998850964948e-06, "loss": 0.83419138, "num_input_tokens_seen": 164192935, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.13427734, "step": 7652, "time_per_iteration": 2.53411865234375 }, { "auxiliary_loss_clip": 0.06466442, "auxiliary_loss_mlp": 0.01269933, "balance_loss_clip": 0.06290356, "balance_loss_mlp": 0.01256612, "epoch": 0.46012325266796933, "flos": 21982157280000.0, "grad_norm": 1.562622217999814, "language_loss": 0.75667, "learning_rate": 2.3535165624703097e-06, "loss": 0.83403373, "num_input_tokens_seen": 164213160, "router_z_loss_clip": 1.76074219, "router_z_loss_mlp": 0.13330078, "step": 7653, "time_per_iteration": 2.6447513103485107 }, { "auxiliary_loss_clip": 0.06477061, "auxiliary_loss_mlp": 0.01277946, "balance_loss_clip": 0.06293216, "balance_loss_mlp": 0.01261901, "epoch": 0.4601833759206373, "flos": 15273468783360.0, "grad_norm": 3.242568141653326, "language_loss": 0.66218984, "learning_rate": 2.353133226438741e-06, "loss": 0.73973989, "num_input_tokens_seen": 164229330, "router_z_loss_clip": 1.83789062, "router_z_loss_mlp": 0.16052246, "step": 7654, "time_per_iteration": 2.5381276607513428 }, { "auxiliary_loss_clip": 0.06457289, "auxiliary_loss_mlp": 0.01274165, "balance_loss_clip": 0.06286146, "balance_loss_mlp": 0.01261475, "epoch": 0.46024349917330526, "flos": 27096299377920.0, "grad_norm": 1.7390187337558494, "language_loss": 0.79618126, "learning_rate": 2.3527498770163248e-06, "loss": 0.87349582, "num_input_tokens_seen": 164248240, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.12683105, "step": 7655, "time_per_iteration": 2.596998691558838 }, { "auxiliary_loss_clip": 0.06453677, "auxiliary_loss_mlp": 0.01269816, "balance_loss_clip": 0.06284632, "balance_loss_mlp": 0.01256649, "epoch": 0.4603036224259732, "flos": 24469935402240.0, "grad_norm": 1.8444879920135697, "language_loss": 0.68401492, "learning_rate": 2.3523665142175985e-06, "loss": 0.76124978, "num_input_tokens_seen": 164268020, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.13189697, "step": 7656, "time_per_iteration": 2.600710868835449 }, { "auxiliary_loss_clip": 0.06461737, "auxiliary_loss_mlp": 0.01268655, "balance_loss_clip": 0.0628778, "balance_loss_mlp": 0.01254964, "epoch": 0.4603637456786412, "flos": 28116545091840.0, "grad_norm": 1.6730997654912416, "language_loss": 0.81433743, "learning_rate": 2.351983138057098e-06, "loss": 0.89164132, "num_input_tokens_seen": 164287305, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.13690186, "step": 7657, "time_per_iteration": 2.622616767883301 }, { "auxiliary_loss_clip": 0.06461843, "auxiliary_loss_mlp": 0.01271039, "balance_loss_clip": 0.06287906, "balance_loss_mlp": 0.01256597, "epoch": 0.4604238689313092, "flos": 24355178835840.0, "grad_norm": 2.333226113809268, "language_loss": 0.71179992, "learning_rate": 2.3515997485493623e-06, "loss": 0.78912878, "num_input_tokens_seen": 164306835, "router_z_loss_clip": 1.74121094, "router_z_loss_mlp": 0.14440918, "step": 7658, "time_per_iteration": 2.6221201419830322 }, { "auxiliary_loss_clip": 0.06338799, "auxiliary_loss_mlp": 0.01252959, "balance_loss_clip": 0.06266706, "balance_loss_mlp": 0.01250312, "epoch": 0.4604839921839772, "flos": 53622742337280.0, "grad_norm": 0.9207346325489741, "language_loss": 0.61847925, "learning_rate": 2.351216345708928e-06, "loss": 0.69439685, "num_input_tokens_seen": 164367095, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 0.02648926, "step": 7659, "time_per_iteration": 3.264713764190674 }, { "auxiliary_loss_clip": 0.06457874, "auxiliary_loss_mlp": 0.01272262, "balance_loss_clip": 0.06287419, "balance_loss_mlp": 0.01257456, "epoch": 0.46054411543664514, "flos": 31256428014720.0, "grad_norm": 1.7361417916371584, "language_loss": 0.68582416, "learning_rate": 2.350832929550336e-06, "loss": 0.76312554, "num_input_tokens_seen": 164388895, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.14807129, "step": 7660, "time_per_iteration": 2.6695573329925537 }, { "auxiliary_loss_clip": 0.06460075, "auxiliary_loss_mlp": 0.01270509, "balance_loss_clip": 0.06286282, "balance_loss_mlp": 0.01256442, "epoch": 0.4606042386893131, "flos": 24098943450240.0, "grad_norm": 1.7320168184392482, "language_loss": 0.77121782, "learning_rate": 2.3504495000881227e-06, "loss": 0.84852374, "num_input_tokens_seen": 164409080, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.140625, "step": 7661, "time_per_iteration": 2.588669538497925 }, { "auxiliary_loss_clip": 0.06452115, "auxiliary_loss_mlp": 0.01274558, "balance_loss_clip": 0.06284094, "balance_loss_mlp": 0.01261034, "epoch": 0.46066436194198107, "flos": 26585715323520.0, "grad_norm": 1.748836555009877, "language_loss": 0.75498676, "learning_rate": 2.3500660573368305e-06, "loss": 0.83225346, "num_input_tokens_seen": 164427585, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.13525391, "step": 7662, "time_per_iteration": 2.6114611625671387 }, { "auxiliary_loss_clip": 0.06471287, "auxiliary_loss_mlp": 0.0127437, "balance_loss_clip": 0.06288956, "balance_loss_mlp": 0.01258694, "epoch": 0.46072448519464904, "flos": 17779751458560.0, "grad_norm": 3.393015935404727, "language_loss": 0.79777801, "learning_rate": 2.349682601310998e-06, "loss": 0.87523472, "num_input_tokens_seen": 164438455, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.15710449, "step": 7663, "time_per_iteration": 2.5154342651367188 }, { "auxiliary_loss_clip": 0.06457408, "auxiliary_loss_mlp": 0.01273061, "balance_loss_clip": 0.06288119, "balance_loss_mlp": 0.01259674, "epoch": 0.460784608447317, "flos": 15091557569280.0, "grad_norm": 1.9089817283573889, "language_loss": 0.73453474, "learning_rate": 2.3492991320251653e-06, "loss": 0.81183946, "num_input_tokens_seen": 164456830, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.13378906, "step": 7664, "time_per_iteration": 2.5534262657165527 }, { "auxiliary_loss_clip": 0.06456052, "auxiliary_loss_mlp": 0.01268198, "balance_loss_clip": 0.06283727, "balance_loss_mlp": 0.01254554, "epoch": 0.46084473169998497, "flos": 18594214744320.0, "grad_norm": 1.6359479627367315, "language_loss": 0.72868997, "learning_rate": 2.3489156494938753e-06, "loss": 0.8059324, "num_input_tokens_seen": 164475375, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.13647461, "step": 7665, "time_per_iteration": 2.534116268157959 }, { "auxiliary_loss_clip": 0.06458861, "auxiliary_loss_mlp": 0.01273206, "balance_loss_clip": 0.06285989, "balance_loss_mlp": 0.01259211, "epoch": 0.46090485495265293, "flos": 19499955454080.0, "grad_norm": 1.9769863964982552, "language_loss": 0.78318167, "learning_rate": 2.348532153731669e-06, "loss": 0.8605023, "num_input_tokens_seen": 164492040, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.14001465, "step": 7666, "time_per_iteration": 2.732870101928711 }, { "auxiliary_loss_clip": 0.06460745, "auxiliary_loss_mlp": 0.01285224, "balance_loss_clip": 0.0628881, "balance_loss_mlp": 0.01270072, "epoch": 0.4609649782053209, "flos": 33373339966080.0, "grad_norm": 1.3374947096127883, "language_loss": 0.74003935, "learning_rate": 2.348148644753088e-06, "loss": 0.8174991, "num_input_tokens_seen": 164513665, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.15148926, "step": 7667, "time_per_iteration": 2.7672276496887207 }, { "auxiliary_loss_clip": 0.0645307, "auxiliary_loss_mlp": 0.01269409, "balance_loss_clip": 0.0628133, "balance_loss_mlp": 0.01255563, "epoch": 0.46102510145798886, "flos": 23775972687360.0, "grad_norm": 1.451982681181523, "language_loss": 0.76288843, "learning_rate": 2.347765122572676e-06, "loss": 0.84011322, "num_input_tokens_seen": 164533890, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.1385498, "step": 7668, "time_per_iteration": 2.6154191493988037 }, { "auxiliary_loss_clip": 0.06449543, "auxiliary_loss_mlp": 0.01273856, "balance_loss_clip": 0.06283221, "balance_loss_mlp": 0.01260785, "epoch": 0.4610852247106568, "flos": 23301544469760.0, "grad_norm": 1.6361095435699795, "language_loss": 0.78170741, "learning_rate": 2.347381587204975e-06, "loss": 0.85894144, "num_input_tokens_seen": 164553815, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.13079834, "step": 7669, "time_per_iteration": 2.572399616241455 }, { "auxiliary_loss_clip": 0.06455562, "auxiliary_loss_mlp": 0.01270459, "balance_loss_clip": 0.06282885, "balance_loss_mlp": 0.01256637, "epoch": 0.4611453479633248, "flos": 25454528403840.0, "grad_norm": 1.5924580572751845, "language_loss": 0.82991165, "learning_rate": 2.34699803866453e-06, "loss": 0.90717185, "num_input_tokens_seen": 164573125, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.13812256, "step": 7670, "time_per_iteration": 2.621339797973633 }, { "auxiliary_loss_clip": 0.06457235, "auxiliary_loss_mlp": 0.01273335, "balance_loss_clip": 0.06287955, "balance_loss_mlp": 0.01260222, "epoch": 0.4612054712159928, "flos": 21145541788800.0, "grad_norm": 1.9337475009189238, "language_loss": 0.64503503, "learning_rate": 2.3466144769658845e-06, "loss": 0.72234076, "num_input_tokens_seen": 164592575, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.13110352, "step": 7671, "time_per_iteration": 2.5653133392333984 }, { "auxiliary_loss_clip": 0.06351174, "auxiliary_loss_mlp": 0.01264283, "balance_loss_clip": 0.06280649, "balance_loss_mlp": 0.01261902, "epoch": 0.4612655944686608, "flos": 69979754194560.0, "grad_norm": 0.684231026260448, "language_loss": 0.55847639, "learning_rate": 2.346230902123583e-06, "loss": 0.63463098, "num_input_tokens_seen": 164659795, "router_z_loss_clip": 0.70458984, "router_z_loss_mlp": 0.02377319, "step": 7672, "time_per_iteration": 3.2641899585723877 }, { "auxiliary_loss_clip": 0.06459518, "auxiliary_loss_mlp": 0.01269775, "balance_loss_clip": 0.06285453, "balance_loss_mlp": 0.01255708, "epoch": 0.46132571772132874, "flos": 16842844229760.0, "grad_norm": 1.779018730593725, "language_loss": 0.7153964, "learning_rate": 2.3458473141521715e-06, "loss": 0.79268932, "num_input_tokens_seen": 164678735, "router_z_loss_clip": 1.73730469, "router_z_loss_mlp": 0.140625, "step": 7673, "time_per_iteration": 2.5582196712493896 }, { "auxiliary_loss_clip": 0.06455315, "auxiliary_loss_mlp": 0.01268651, "balance_loss_clip": 0.06285694, "balance_loss_mlp": 0.01255079, "epoch": 0.4613858409739967, "flos": 35817666946560.0, "grad_norm": 1.9001087570451283, "language_loss": 0.71264583, "learning_rate": 2.345463713066195e-06, "loss": 0.78988546, "num_input_tokens_seen": 164700885, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.13580322, "step": 7674, "time_per_iteration": 2.6915645599365234 }, { "auxiliary_loss_clip": 0.06455991, "auxiliary_loss_mlp": 0.01276343, "balance_loss_clip": 0.06284542, "balance_loss_mlp": 0.01262789, "epoch": 0.4614459642266647, "flos": 35276251789440.0, "grad_norm": 1.3147979943852792, "language_loss": 0.65483177, "learning_rate": 2.3450800988801996e-06, "loss": 0.73215508, "num_input_tokens_seen": 164726960, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.13555908, "step": 7675, "time_per_iteration": 4.17140007019043 }, { "auxiliary_loss_clip": 0.06355563, "auxiliary_loss_mlp": 0.01267746, "balance_loss_clip": 0.06284915, "balance_loss_mlp": 0.01265199, "epoch": 0.46150608747933264, "flos": 66723311842560.0, "grad_norm": 0.7921946021004265, "language_loss": 0.58542752, "learning_rate": 2.3446964716087327e-06, "loss": 0.66166061, "num_input_tokens_seen": 164788525, "router_z_loss_clip": 0.70703125, "router_z_loss_mlp": 0.02546692, "step": 7676, "time_per_iteration": 3.213620185852051 }, { "auxiliary_loss_clip": 0.0634512, "auxiliary_loss_mlp": 0.01260243, "balance_loss_clip": 0.06274877, "balance_loss_mlp": 0.01257908, "epoch": 0.4615662107320006, "flos": 55846780133760.0, "grad_norm": 0.7704301841653421, "language_loss": 0.62703013, "learning_rate": 2.344312831266341e-06, "loss": 0.70308381, "num_input_tokens_seen": 164843525, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 0.02333069, "step": 7677, "time_per_iteration": 4.517465829849243 }, { "auxiliary_loss_clip": 0.06462646, "auxiliary_loss_mlp": 0.01272194, "balance_loss_clip": 0.06293371, "balance_loss_mlp": 0.01259254, "epoch": 0.46162633398466857, "flos": 15488055889920.0, "grad_norm": 2.3701925518611677, "language_loss": 0.76712573, "learning_rate": 2.3439291778675718e-06, "loss": 0.84447408, "num_input_tokens_seen": 164859895, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.12927246, "step": 7678, "time_per_iteration": 2.5434176921844482 }, { "auxiliary_loss_clip": 0.0646279, "auxiliary_loss_mlp": 0.0126943, "balance_loss_clip": 0.06288706, "balance_loss_mlp": 0.01254857, "epoch": 0.46168645723733653, "flos": 20017667105280.0, "grad_norm": 2.4723980324047887, "language_loss": 0.66977143, "learning_rate": 2.343545511426974e-06, "loss": 0.74709356, "num_input_tokens_seen": 164878030, "router_z_loss_clip": 1.74121094, "router_z_loss_mlp": 0.14569092, "step": 7679, "time_per_iteration": 2.551819324493408 }, { "auxiliary_loss_clip": 0.06462676, "auxiliary_loss_mlp": 0.01271308, "balance_loss_clip": 0.06291836, "balance_loss_mlp": 0.01257945, "epoch": 0.4617465804900045, "flos": 20304020833920.0, "grad_norm": 1.936783986930272, "language_loss": 0.69536787, "learning_rate": 2.3431618319590963e-06, "loss": 0.7727077, "num_input_tokens_seen": 164895710, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.13372803, "step": 7680, "time_per_iteration": 2.554202079772949 }, { "auxiliary_loss_clip": 0.06468087, "auxiliary_loss_mlp": 0.01274366, "balance_loss_clip": 0.06292088, "balance_loss_mlp": 0.01260222, "epoch": 0.46180670374267246, "flos": 22352897669760.0, "grad_norm": 1.5813033200861308, "language_loss": 0.63995695, "learning_rate": 2.342778139478487e-06, "loss": 0.71738148, "num_input_tokens_seen": 164913365, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.14147949, "step": 7681, "time_per_iteration": 2.5786054134368896 }, { "auxiliary_loss_clip": 0.06450533, "auxiliary_loss_mlp": 0.01268489, "balance_loss_clip": 0.0628314, "balance_loss_mlp": 0.01255751, "epoch": 0.46186682699534043, "flos": 19900856113920.0, "grad_norm": 1.4147289010171638, "language_loss": 0.67655599, "learning_rate": 2.342394433999697e-06, "loss": 0.75374621, "num_input_tokens_seen": 164931620, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.12738037, "step": 7682, "time_per_iteration": 2.5727696418762207 }, { "auxiliary_loss_clip": 0.06458705, "auxiliary_loss_mlp": 0.01270441, "balance_loss_clip": 0.06286956, "balance_loss_mlp": 0.01257179, "epoch": 0.4619269502480084, "flos": 31511573297280.0, "grad_norm": 2.0746649523343548, "language_loss": 0.74446821, "learning_rate": 2.342010715537275e-06, "loss": 0.82175964, "num_input_tokens_seen": 164950905, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.13262939, "step": 7683, "time_per_iteration": 2.6942505836486816 }, { "auxiliary_loss_clip": 0.06453589, "auxiliary_loss_mlp": 0.01272926, "balance_loss_clip": 0.06285068, "balance_loss_mlp": 0.01259944, "epoch": 0.46198707350067636, "flos": 25016465658240.0, "grad_norm": 1.7496908665170923, "language_loss": 0.77120173, "learning_rate": 2.3416269841057726e-06, "loss": 0.84846681, "num_input_tokens_seen": 164970950, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.12988281, "step": 7684, "time_per_iteration": 2.590156316757202 }, { "auxiliary_loss_clip": 0.06471731, "auxiliary_loss_mlp": 0.01270673, "balance_loss_clip": 0.06292852, "balance_loss_mlp": 0.0125576, "epoch": 0.4620471967533444, "flos": 18297588890880.0, "grad_norm": 1.5836631279742703, "language_loss": 0.79675174, "learning_rate": 2.3412432397197412e-06, "loss": 0.87417573, "num_input_tokens_seen": 164989855, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.14904785, "step": 7685, "time_per_iteration": 2.5490522384643555 }, { "auxiliary_loss_clip": 0.06453834, "auxiliary_loss_mlp": 0.01268536, "balance_loss_clip": 0.06287749, "balance_loss_mlp": 0.0125435, "epoch": 0.46210732000601235, "flos": 33993607415040.0, "grad_norm": 1.997706009974922, "language_loss": 0.66754085, "learning_rate": 2.340859482393731e-06, "loss": 0.74476457, "num_input_tokens_seen": 165012290, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.14178467, "step": 7686, "time_per_iteration": 2.658726453781128 }, { "auxiliary_loss_clip": 0.064583, "auxiliary_loss_mlp": 0.01272589, "balance_loss_clip": 0.06285509, "balance_loss_mlp": 0.01258713, "epoch": 0.4621674432586803, "flos": 25016381804160.0, "grad_norm": 1.8714816014936453, "language_loss": 0.74242795, "learning_rate": 2.340475712142296e-06, "loss": 0.81973684, "num_input_tokens_seen": 165030810, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.13867188, "step": 7687, "time_per_iteration": 2.584761142730713 }, { "auxiliary_loss_clip": 0.06454162, "auxiliary_loss_mlp": 0.01269301, "balance_loss_clip": 0.0628565, "balance_loss_mlp": 0.01255944, "epoch": 0.4622275665113483, "flos": 22019906344320.0, "grad_norm": 2.5493475822182456, "language_loss": 0.75356448, "learning_rate": 2.3400919289799873e-06, "loss": 0.8307991, "num_input_tokens_seen": 165050205, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.13372803, "step": 7688, "time_per_iteration": 5.374546051025391 }, { "auxiliary_loss_clip": 0.06453928, "auxiliary_loss_mlp": 0.01269194, "balance_loss_clip": 0.06285594, "balance_loss_mlp": 0.01255318, "epoch": 0.46228768976401624, "flos": 24065303235840.0, "grad_norm": 1.7895829994213388, "language_loss": 0.7915895, "learning_rate": 2.3397081329213585e-06, "loss": 0.86882067, "num_input_tokens_seen": 165069370, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.13879395, "step": 7689, "time_per_iteration": 2.593928337097168 }, { "auxiliary_loss_clip": 0.06456263, "auxiliary_loss_mlp": 0.01272235, "balance_loss_clip": 0.06283229, "balance_loss_mlp": 0.0125765, "epoch": 0.4623478130166842, "flos": 26658655902720.0, "grad_norm": 2.0133644309050047, "language_loss": 0.57669908, "learning_rate": 2.339324323980964e-06, "loss": 0.65398407, "num_input_tokens_seen": 165089610, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.14581299, "step": 7690, "time_per_iteration": 2.609503984451294 }, { "auxiliary_loss_clip": 0.06461194, "auxiliary_loss_mlp": 0.01273155, "balance_loss_clip": 0.06287481, "balance_loss_mlp": 0.0125882, "epoch": 0.46240793626935217, "flos": 20564700485760.0, "grad_norm": 3.0045469580932145, "language_loss": 0.83607626, "learning_rate": 2.3389405021733562e-06, "loss": 0.91341972, "num_input_tokens_seen": 165109050, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.14337158, "step": 7691, "time_per_iteration": 2.5668792724609375 }, { "auxiliary_loss_clip": 0.06456485, "auxiliary_loss_mlp": 0.01268119, "balance_loss_clip": 0.06284633, "balance_loss_mlp": 0.01254326, "epoch": 0.46246805952202014, "flos": 22462706845440.0, "grad_norm": 1.3731598768885676, "language_loss": 0.75440514, "learning_rate": 2.338556667513091e-06, "loss": 0.83165121, "num_input_tokens_seen": 165130130, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.13800049, "step": 7692, "time_per_iteration": 2.6091740131378174 }, { "auxiliary_loss_clip": 0.06456872, "auxiliary_loss_mlp": 0.01272812, "balance_loss_clip": 0.06282545, "balance_loss_mlp": 0.01259133, "epoch": 0.4625281827746881, "flos": 35049673549440.0, "grad_norm": 1.5895260951630503, "language_loss": 0.74155176, "learning_rate": 2.338172820014723e-06, "loss": 0.81884861, "num_input_tokens_seen": 165152685, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.13677979, "step": 7693, "time_per_iteration": 2.7126054763793945 }, { "auxiliary_loss_clip": 0.06454818, "auxiliary_loss_mlp": 0.01272145, "balance_loss_clip": 0.06284612, "balance_loss_mlp": 0.01258532, "epoch": 0.46258830602735607, "flos": 21074907196800.0, "grad_norm": 1.4932651471147342, "language_loss": 0.85791445, "learning_rate": 2.337788959692808e-06, "loss": 0.93518412, "num_input_tokens_seen": 165173315, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.13604736, "step": 7694, "time_per_iteration": 2.6075356006622314 }, { "auxiliary_loss_clip": 0.064574, "auxiliary_loss_mlp": 0.01274156, "balance_loss_clip": 0.06283622, "balance_loss_mlp": 0.01259934, "epoch": 0.46264842928002403, "flos": 26184437320320.0, "grad_norm": 5.489289379901082, "language_loss": 0.79359114, "learning_rate": 2.337405086561902e-06, "loss": 0.87090671, "num_input_tokens_seen": 165192395, "router_z_loss_clip": 1.73730469, "router_z_loss_mlp": 0.14215088, "step": 7695, "time_per_iteration": 2.608637809753418 }, { "auxiliary_loss_clip": 0.06449552, "auxiliary_loss_mlp": 0.01267953, "balance_loss_clip": 0.06282431, "balance_loss_mlp": 0.0125537, "epoch": 0.462708552532692, "flos": 16769903650560.0, "grad_norm": 1.590633540592256, "language_loss": 0.72570288, "learning_rate": 2.3370212006365606e-06, "loss": 0.8028779, "num_input_tokens_seen": 165211355, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.12591553, "step": 7696, "time_per_iteration": 2.555910348892212 }, { "auxiliary_loss_clip": 0.06456953, "auxiliary_loss_mlp": 0.01274249, "balance_loss_clip": 0.06285369, "balance_loss_mlp": 0.01261541, "epoch": 0.46276867578535996, "flos": 15565985786880.0, "grad_norm": 1.581169058536318, "language_loss": 0.69841754, "learning_rate": 2.3366373019313423e-06, "loss": 0.7757296, "num_input_tokens_seen": 165229380, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.12713623, "step": 7697, "time_per_iteration": 2.5540409088134766 }, { "auxiliary_loss_clip": 0.06452695, "auxiliary_loss_mlp": 0.01270343, "balance_loss_clip": 0.06281807, "balance_loss_mlp": 0.01257409, "epoch": 0.462828799038028, "flos": 22421352055680.0, "grad_norm": 1.756933289859923, "language_loss": 0.85261083, "learning_rate": 2.3362533904608025e-06, "loss": 0.92984116, "num_input_tokens_seen": 165247200, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.12945557, "step": 7698, "time_per_iteration": 2.570430278778076 }, { "auxiliary_loss_clip": 0.06453961, "auxiliary_loss_mlp": 0.01270774, "balance_loss_clip": 0.06283552, "balance_loss_mlp": 0.01257297, "epoch": 0.46288892229069595, "flos": 21075997299840.0, "grad_norm": 1.7787366564951634, "language_loss": 0.71759617, "learning_rate": 2.335869466239502e-06, "loss": 0.79484344, "num_input_tokens_seen": 165265825, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.13464355, "step": 7699, "time_per_iteration": 2.5598223209381104 }, { "auxiliary_loss_clip": 0.0646178, "auxiliary_loss_mlp": 0.01268303, "balance_loss_clip": 0.06285191, "balance_loss_mlp": 0.01254374, "epoch": 0.4629490455433639, "flos": 23192448053760.0, "grad_norm": 2.075492897996747, "language_loss": 0.71928495, "learning_rate": 2.335485529281996e-06, "loss": 0.79658574, "num_input_tokens_seen": 165284380, "router_z_loss_clip": 1.76757812, "router_z_loss_mlp": 0.13934326, "step": 7700, "time_per_iteration": 2.57163405418396 }, { "auxiliary_loss_clip": 0.06447504, "auxiliary_loss_mlp": 0.01273594, "balance_loss_clip": 0.06280089, "balance_loss_mlp": 0.01260844, "epoch": 0.4630091687960319, "flos": 18840178005120.0, "grad_norm": 1.8611499220278287, "language_loss": 0.72545153, "learning_rate": 2.3351015796028467e-06, "loss": 0.80266249, "num_input_tokens_seen": 165300320, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.12756348, "step": 7701, "time_per_iteration": 2.529639720916748 }, { "auxiliary_loss_clip": 0.06461671, "auxiliary_loss_mlp": 0.01275355, "balance_loss_clip": 0.06285925, "balance_loss_mlp": 0.01262212, "epoch": 0.46306929204869984, "flos": 38915733882240.0, "grad_norm": 2.4756045670111617, "language_loss": 0.65250981, "learning_rate": 2.3347176172166114e-06, "loss": 0.72988003, "num_input_tokens_seen": 165318130, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.13153076, "step": 7702, "time_per_iteration": 2.692732810974121 }, { "auxiliary_loss_clip": 0.06449997, "auxiliary_loss_mlp": 0.01268404, "balance_loss_clip": 0.06281988, "balance_loss_mlp": 0.01255494, "epoch": 0.4631294153013678, "flos": 19649945462400.0, "grad_norm": 3.3314885826518084, "language_loss": 0.73551428, "learning_rate": 2.33433364213785e-06, "loss": 0.8126983, "num_input_tokens_seen": 165336225, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.12902832, "step": 7703, "time_per_iteration": 2.5790207386016846 }, { "auxiliary_loss_clip": 0.06459931, "auxiliary_loss_mlp": 0.01275197, "balance_loss_clip": 0.06285075, "balance_loss_mlp": 0.01260242, "epoch": 0.4631895385540358, "flos": 24615187655040.0, "grad_norm": 1.895682944632068, "language_loss": 0.69384801, "learning_rate": 2.3339496543811243e-06, "loss": 0.77119935, "num_input_tokens_seen": 165355005, "router_z_loss_clip": 1.74804688, "router_z_loss_mlp": 0.14953613, "step": 7704, "time_per_iteration": 2.58282732963562 }, { "auxiliary_loss_clip": 0.06458024, "auxiliary_loss_mlp": 0.01273847, "balance_loss_clip": 0.06284226, "balance_loss_mlp": 0.01259626, "epoch": 0.46324966180670374, "flos": 26326838534400.0, "grad_norm": 3.227753239282123, "language_loss": 0.8161363, "learning_rate": 2.3335656539609934e-06, "loss": 0.89345503, "num_input_tokens_seen": 165374910, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.14233398, "step": 7705, "time_per_iteration": 2.593503952026367 }, { "auxiliary_loss_clip": 0.06462528, "auxiliary_loss_mlp": 0.01276882, "balance_loss_clip": 0.06285869, "balance_loss_mlp": 0.01262666, "epoch": 0.4633097850593717, "flos": 19245816420480.0, "grad_norm": 1.7297297821966355, "language_loss": 0.77876186, "learning_rate": 2.3331816408920196e-06, "loss": 0.85615599, "num_input_tokens_seen": 165392590, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.14215088, "step": 7706, "time_per_iteration": 2.5404677391052246 }, { "auxiliary_loss_clip": 0.06445235, "auxiliary_loss_mlp": 0.01271144, "balance_loss_clip": 0.06279212, "balance_loss_mlp": 0.01257947, "epoch": 0.46336990831203967, "flos": 22789660677120.0, "grad_norm": 1.7314773655346725, "language_loss": 0.70739609, "learning_rate": 2.3327976151887654e-06, "loss": 0.78455985, "num_input_tokens_seen": 165411195, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.13195801, "step": 7707, "time_per_iteration": 2.5827300548553467 }, { "auxiliary_loss_clip": 0.06458557, "auxiliary_loss_mlp": 0.01270635, "balance_loss_clip": 0.06282384, "balance_loss_mlp": 0.01256527, "epoch": 0.46343003156470763, "flos": 38218668566400.0, "grad_norm": 1.8891112601937314, "language_loss": 0.6073308, "learning_rate": 2.332413576865791e-06, "loss": 0.6846227, "num_input_tokens_seen": 165430150, "router_z_loss_clip": 1.76269531, "router_z_loss_mlp": 0.14093018, "step": 7708, "time_per_iteration": 2.705357074737549 }, { "auxiliary_loss_clip": 0.06452295, "auxiliary_loss_mlp": 0.01268664, "balance_loss_clip": 0.0628121, "balance_loss_mlp": 0.01254776, "epoch": 0.4634901548173756, "flos": 31946156098560.0, "grad_norm": 2.3356013178751267, "language_loss": 0.77297354, "learning_rate": 2.3320295259376614e-06, "loss": 0.85018313, "num_input_tokens_seen": 165450595, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.13879395, "step": 7709, "time_per_iteration": 2.641387939453125 }, { "auxiliary_loss_clip": 0.0645666, "auxiliary_loss_mlp": 0.01272694, "balance_loss_clip": 0.06284765, "balance_loss_mlp": 0.01257876, "epoch": 0.46355027807004356, "flos": 20088469405440.0, "grad_norm": 1.7839542962876396, "language_loss": 0.77258837, "learning_rate": 2.3316454624189385e-06, "loss": 0.84988183, "num_input_tokens_seen": 165469515, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.14813232, "step": 7710, "time_per_iteration": 2.5541090965270996 }, { "auxiliary_loss_clip": 0.06457746, "auxiliary_loss_mlp": 0.01271529, "balance_loss_clip": 0.06282284, "balance_loss_mlp": 0.01256855, "epoch": 0.4636104013227116, "flos": 24068280055680.0, "grad_norm": 10.529444625536616, "language_loss": 0.73507583, "learning_rate": 2.3312613863241865e-06, "loss": 0.81236857, "num_input_tokens_seen": 165488125, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.14685059, "step": 7711, "time_per_iteration": 2.5598034858703613 }, { "auxiliary_loss_clip": 0.06452867, "auxiliary_loss_mlp": 0.01272379, "balance_loss_clip": 0.06282014, "balance_loss_mlp": 0.01257579, "epoch": 0.46367052457537955, "flos": 23921392648320.0, "grad_norm": 2.958853979763496, "language_loss": 0.71631062, "learning_rate": 2.33087729766797e-06, "loss": 0.79356307, "num_input_tokens_seen": 165509225, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.14782715, "step": 7712, "time_per_iteration": 2.5731141567230225 }, { "auxiliary_loss_clip": 0.06465385, "auxiliary_loss_mlp": 0.01273462, "balance_loss_clip": 0.06285635, "balance_loss_mlp": 0.01257619, "epoch": 0.4637306478280475, "flos": 26403846036480.0, "grad_norm": 2.187889976832755, "language_loss": 0.73577714, "learning_rate": 2.3304931964648524e-06, "loss": 0.81316561, "num_input_tokens_seen": 165529945, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.15844727, "step": 7713, "time_per_iteration": 2.650787830352783 }, { "auxiliary_loss_clip": 0.06465359, "auxiliary_loss_mlp": 0.01271811, "balance_loss_clip": 0.06286786, "balance_loss_mlp": 0.01256958, "epoch": 0.4637907710807155, "flos": 21987104670720.0, "grad_norm": 2.27583679674077, "language_loss": 0.58994764, "learning_rate": 2.3301090827294e-06, "loss": 0.66731936, "num_input_tokens_seen": 165550690, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.14874268, "step": 7714, "time_per_iteration": 4.056482553482056 }, { "auxiliary_loss_clip": 0.06453899, "auxiliary_loss_mlp": 0.01273705, "balance_loss_clip": 0.06283393, "balance_loss_mlp": 0.01259686, "epoch": 0.46385089433338345, "flos": 12427234894080.0, "grad_norm": 1.7373427836482045, "language_loss": 0.70297772, "learning_rate": 2.3297249564761784e-06, "loss": 0.78025377, "num_input_tokens_seen": 165567775, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.14007568, "step": 7715, "time_per_iteration": 2.5233705043792725 }, { "auxiliary_loss_clip": 0.06464494, "auxiliary_loss_mlp": 0.01272315, "balance_loss_clip": 0.06285249, "balance_loss_mlp": 0.01257998, "epoch": 0.4639110175860514, "flos": 23922692386560.0, "grad_norm": 9.191448077631625, "language_loss": 0.68816984, "learning_rate": 2.3293408177197527e-06, "loss": 0.76553786, "num_input_tokens_seen": 165587010, "router_z_loss_clip": 1.79199219, "router_z_loss_mlp": 0.14318848, "step": 7716, "time_per_iteration": 2.584197998046875 }, { "auxiliary_loss_clip": 0.06457436, "auxiliary_loss_mlp": 0.01272137, "balance_loss_clip": 0.06282231, "balance_loss_mlp": 0.01257105, "epoch": 0.4639711408387194, "flos": 25307263653120.0, "grad_norm": 1.7123482449557352, "language_loss": 0.81273639, "learning_rate": 2.328956666474691e-06, "loss": 0.89003211, "num_input_tokens_seen": 165607850, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.15026855, "step": 7717, "time_per_iteration": 4.035799503326416 }, { "auxiliary_loss_clip": 0.06458411, "auxiliary_loss_mlp": 0.01273083, "balance_loss_clip": 0.06282182, "balance_loss_mlp": 0.01258897, "epoch": 0.46403126409138734, "flos": 21217643827200.0, "grad_norm": 3.347645625031813, "language_loss": 0.72790849, "learning_rate": 2.3285725027555593e-06, "loss": 0.80522335, "num_input_tokens_seen": 165627175, "router_z_loss_clip": 1.76269531, "router_z_loss_mlp": 0.14196777, "step": 7718, "time_per_iteration": 2.565427541732788 }, { "auxiliary_loss_clip": 0.06460047, "auxiliary_loss_mlp": 0.01271757, "balance_loss_clip": 0.06288953, "balance_loss_mlp": 0.0125713, "epoch": 0.4640913873440553, "flos": 35854325907840.0, "grad_norm": 1.616729158662629, "language_loss": 0.71164465, "learning_rate": 2.3281883265769254e-06, "loss": 0.78896272, "num_input_tokens_seen": 165648340, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.14624023, "step": 7719, "time_per_iteration": 2.697608709335327 }, { "auxiliary_loss_clip": 0.06466922, "auxiliary_loss_mlp": 0.01273694, "balance_loss_clip": 0.06289526, "balance_loss_mlp": 0.01259401, "epoch": 0.46415151059672327, "flos": 19171282613760.0, "grad_norm": 1.6223330743831088, "language_loss": 0.86847544, "learning_rate": 2.327804137953357e-06, "loss": 0.94588161, "num_input_tokens_seen": 165667195, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.14294434, "step": 7720, "time_per_iteration": 2.5607361793518066 }, { "auxiliary_loss_clip": 0.0633813, "auxiliary_loss_mlp": 0.01250661, "balance_loss_clip": 0.06268228, "balance_loss_mlp": 0.01248341, "epoch": 0.46421163384939124, "flos": 58932841207680.0, "grad_norm": 0.7147020014983311, "language_loss": 0.5502274, "learning_rate": 2.3274199368994226e-06, "loss": 0.62611532, "num_input_tokens_seen": 165726760, "router_z_loss_clip": 0.69921875, "router_z_loss_mlp": 0.02319336, "step": 7721, "time_per_iteration": 3.221376419067383 }, { "auxiliary_loss_clip": 0.06455058, "auxiliary_loss_mlp": 0.01273159, "balance_loss_clip": 0.06284863, "balance_loss_mlp": 0.01258574, "epoch": 0.4642717571020592, "flos": 20163590190720.0, "grad_norm": 2.0703831196939326, "language_loss": 0.80772686, "learning_rate": 2.3270357234296918e-06, "loss": 0.88500905, "num_input_tokens_seen": 165745005, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.14581299, "step": 7722, "time_per_iteration": 2.5620527267456055 }, { "auxiliary_loss_clip": 0.06458472, "auxiliary_loss_mlp": 0.01270699, "balance_loss_clip": 0.06283138, "balance_loss_mlp": 0.01256644, "epoch": 0.46433188035472717, "flos": 25053208473600.0, "grad_norm": 30.521150905126525, "language_loss": 0.78591746, "learning_rate": 2.3266514975587332e-06, "loss": 0.86320913, "num_input_tokens_seen": 165765750, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.140625, "step": 7723, "time_per_iteration": 2.586597442626953 }, { "auxiliary_loss_clip": 0.06451575, "auxiliary_loss_mlp": 0.01272518, "balance_loss_clip": 0.0628069, "balance_loss_mlp": 0.01258725, "epoch": 0.4643920036073952, "flos": 28083366074880.0, "grad_norm": 1.4988552840548537, "language_loss": 0.68905205, "learning_rate": 2.326267259301118e-06, "loss": 0.76629293, "num_input_tokens_seen": 165787515, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.13806152, "step": 7724, "time_per_iteration": 2.617587089538574 }, { "auxiliary_loss_clip": 0.06459497, "auxiliary_loss_mlp": 0.01275949, "balance_loss_clip": 0.06286988, "balance_loss_mlp": 0.01261727, "epoch": 0.46445212686006315, "flos": 18375267225600.0, "grad_norm": 2.3194155000212406, "language_loss": 0.68058515, "learning_rate": 2.325883008671415e-06, "loss": 0.75793964, "num_input_tokens_seen": 165806675, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.14208984, "step": 7725, "time_per_iteration": 2.5533409118652344 }, { "auxiliary_loss_clip": 0.06452646, "auxiliary_loss_mlp": 0.0127101, "balance_loss_clip": 0.0628583, "balance_loss_mlp": 0.01258403, "epoch": 0.4645122501127311, "flos": 31729514567040.0, "grad_norm": 2.0379042936368674, "language_loss": 0.65059054, "learning_rate": 2.3254987456841955e-06, "loss": 0.72782713, "num_input_tokens_seen": 165829835, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.12609863, "step": 7726, "time_per_iteration": 2.6401689052581787 }, { "auxiliary_loss_clip": 0.06463607, "auxiliary_loss_mlp": 0.01269779, "balance_loss_clip": 0.0629098, "balance_loss_mlp": 0.01255861, "epoch": 0.4645723733653991, "flos": 23775553416960.0, "grad_norm": 1.817134204623174, "language_loss": 0.75339299, "learning_rate": 2.3251144703540307e-06, "loss": 0.83072686, "num_input_tokens_seen": 165849380, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.13916016, "step": 7727, "time_per_iteration": 3.9583280086517334 }, { "auxiliary_loss_clip": 0.06454863, "auxiliary_loss_mlp": 0.01273528, "balance_loss_clip": 0.06281681, "balance_loss_mlp": 0.01259687, "epoch": 0.46463249661806705, "flos": 33153805468800.0, "grad_norm": 1.8954373597106207, "language_loss": 0.78956127, "learning_rate": 2.3247301826954936e-06, "loss": 0.86684513, "num_input_tokens_seen": 165868620, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.1385498, "step": 7728, "time_per_iteration": 4.038671493530273 }, { "auxiliary_loss_clip": 0.06460144, "auxiliary_loss_mlp": 0.01274387, "balance_loss_clip": 0.06287251, "balance_loss_mlp": 0.01260499, "epoch": 0.464692619870735, "flos": 18301865448960.0, "grad_norm": 1.929256134764102, "language_loss": 0.76209784, "learning_rate": 2.324345882723155e-06, "loss": 0.83944315, "num_input_tokens_seen": 165885915, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.13885498, "step": 7729, "time_per_iteration": 2.5147764682769775 }, { "auxiliary_loss_clip": 0.06460364, "auxiliary_loss_mlp": 0.01271881, "balance_loss_clip": 0.06288604, "balance_loss_mlp": 0.01258518, "epoch": 0.464752743123403, "flos": 22644659986560.0, "grad_norm": 1.812337092177159, "language_loss": 0.79942, "learning_rate": 2.323961570451588e-06, "loss": 0.87674248, "num_input_tokens_seen": 165905465, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.13366699, "step": 7730, "time_per_iteration": 2.6847643852233887 }, { "auxiliary_loss_clip": 0.0645457, "auxiliary_loss_mlp": 0.01270779, "balance_loss_clip": 0.0628393, "balance_loss_mlp": 0.01256635, "epoch": 0.46481286637607094, "flos": 20418316202880.0, "grad_norm": 1.9775388020977984, "language_loss": 0.77258348, "learning_rate": 2.3235772458953655e-06, "loss": 0.84983695, "num_input_tokens_seen": 165924640, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.14141846, "step": 7731, "time_per_iteration": 2.5615835189819336 }, { "auxiliary_loss_clip": 0.06452426, "auxiliary_loss_mlp": 0.01269266, "balance_loss_clip": 0.06282587, "balance_loss_mlp": 0.01255366, "epoch": 0.4648729896287389, "flos": 34283692650240.0, "grad_norm": 1.576980754810676, "language_loss": 0.65949476, "learning_rate": 2.323192909069061e-06, "loss": 0.73671162, "num_input_tokens_seen": 165945765, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.13903809, "step": 7732, "time_per_iteration": 2.6664466857910156 }, { "auxiliary_loss_clip": 0.06463936, "auxiliary_loss_mlp": 0.01272931, "balance_loss_clip": 0.0628509, "balance_loss_mlp": 0.01257303, "epoch": 0.4649331128814069, "flos": 21327704565120.0, "grad_norm": 2.760368999089724, "language_loss": 0.73196721, "learning_rate": 2.32280855998725e-06, "loss": 0.80933589, "num_input_tokens_seen": 165964025, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.15612793, "step": 7733, "time_per_iteration": 2.565416097640991 }, { "auxiliary_loss_clip": 0.06346814, "auxiliary_loss_mlp": 0.01251251, "balance_loss_clip": 0.06276961, "balance_loss_mlp": 0.01248902, "epoch": 0.46499323613407484, "flos": 58325082744960.0, "grad_norm": 1.402673185550079, "language_loss": 0.5214293, "learning_rate": 2.3224241986645057e-06, "loss": 0.5974099, "num_input_tokens_seen": 166021950, "router_z_loss_clip": 0.69824219, "router_z_loss_mlp": 0.02345276, "step": 7734, "time_per_iteration": 3.14825701713562 }, { "auxiliary_loss_clip": 0.06455714, "auxiliary_loss_mlp": 0.01273734, "balance_loss_clip": 0.06285188, "balance_loss_mlp": 0.01259692, "epoch": 0.4650533593867428, "flos": 10894308773760.0, "grad_norm": 1.9210267540673196, "language_loss": 0.75722337, "learning_rate": 2.3220398251154035e-06, "loss": 0.8345179, "num_input_tokens_seen": 166039675, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.14044189, "step": 7735, "time_per_iteration": 2.541224241256714 }, { "auxiliary_loss_clip": 0.06451732, "auxiliary_loss_mlp": 0.01273634, "balance_loss_clip": 0.0628325, "balance_loss_mlp": 0.01258214, "epoch": 0.46511348263941077, "flos": 19980756581760.0, "grad_norm": 1.714849793906724, "language_loss": 0.69609624, "learning_rate": 2.321655439354519e-06, "loss": 0.77334988, "num_input_tokens_seen": 166057745, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.1541748, "step": 7736, "time_per_iteration": 2.564579963684082 }, { "auxiliary_loss_clip": 0.06453837, "auxiliary_loss_mlp": 0.01272811, "balance_loss_clip": 0.06287937, "balance_loss_mlp": 0.01259782, "epoch": 0.46517360589207873, "flos": 19683795312000.0, "grad_norm": 1.6720239981440626, "language_loss": 0.72522998, "learning_rate": 2.321271041396427e-06, "loss": 0.80249655, "num_input_tokens_seen": 166076440, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.13024902, "step": 7737, "time_per_iteration": 2.562129020690918 }, { "auxiliary_loss_clip": 0.06459858, "auxiliary_loss_mlp": 0.01271107, "balance_loss_clip": 0.06285921, "balance_loss_mlp": 0.01256277, "epoch": 0.46523372914474675, "flos": 16878203452800.0, "grad_norm": 2.1442028047832196, "language_loss": 0.84209597, "learning_rate": 2.3208866312557065e-06, "loss": 0.91940564, "num_input_tokens_seen": 166092520, "router_z_loss_clip": 1.74121094, "router_z_loss_mlp": 0.14819336, "step": 7738, "time_per_iteration": 2.548919439315796 }, { "auxiliary_loss_clip": 0.06356867, "auxiliary_loss_mlp": 0.01256752, "balance_loss_clip": 0.06286777, "balance_loss_mlp": 0.01254191, "epoch": 0.4652938523974147, "flos": 53458188917760.0, "grad_norm": 0.7323683243632868, "language_loss": 0.5760774, "learning_rate": 2.320502208946932e-06, "loss": 0.65221363, "num_input_tokens_seen": 166156285, "router_z_loss_clip": 0.70263672, "router_z_loss_mlp": 0.02561951, "step": 7739, "time_per_iteration": 3.24859619140625 }, { "auxiliary_loss_clip": 0.06464544, "auxiliary_loss_mlp": 0.01276402, "balance_loss_clip": 0.06292355, "balance_loss_mlp": 0.01261584, "epoch": 0.4653539756500827, "flos": 15236642113920.0, "grad_norm": 2.2283253092057445, "language_loss": 0.85180122, "learning_rate": 2.3201177744846815e-06, "loss": 0.92921066, "num_input_tokens_seen": 166173455, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.14819336, "step": 7740, "time_per_iteration": 2.527219772338867 }, { "auxiliary_loss_clip": 0.06457161, "auxiliary_loss_mlp": 0.01276511, "balance_loss_clip": 0.06286608, "balance_loss_mlp": 0.01261991, "epoch": 0.46541409890275065, "flos": 23738978309760.0, "grad_norm": 1.787777840618515, "language_loss": 0.76054096, "learning_rate": 2.3197333278835327e-06, "loss": 0.83787763, "num_input_tokens_seen": 166194370, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.14538574, "step": 7741, "time_per_iteration": 2.5772507190704346 }, { "auxiliary_loss_clip": 0.06466536, "auxiliary_loss_mlp": 0.01268375, "balance_loss_clip": 0.06288517, "balance_loss_mlp": 0.01253843, "epoch": 0.4654742221554186, "flos": 20853150566400.0, "grad_norm": 1.6535388383724945, "language_loss": 0.81127924, "learning_rate": 2.319348869158064e-06, "loss": 0.88862824, "num_input_tokens_seen": 166213195, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.1451416, "step": 7742, "time_per_iteration": 2.56355619430542 }, { "auxiliary_loss_clip": 0.0646599, "auxiliary_loss_mlp": 0.01269759, "balance_loss_clip": 0.0628974, "balance_loss_mlp": 0.01254583, "epoch": 0.4655343454080866, "flos": 20711210549760.0, "grad_norm": 1.673892641287674, "language_loss": 0.73059785, "learning_rate": 2.3189643983228555e-06, "loss": 0.80795527, "num_input_tokens_seen": 166231350, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.15185547, "step": 7743, "time_per_iteration": 2.5705628395080566 }, { "auxiliary_loss_clip": 0.06458402, "auxiliary_loss_mlp": 0.01271692, "balance_loss_clip": 0.06286407, "balance_loss_mlp": 0.0125716, "epoch": 0.46559446866075455, "flos": 18995912017920.0, "grad_norm": 1.7519583804735168, "language_loss": 0.7178092, "learning_rate": 2.318579915392483e-06, "loss": 0.79511011, "num_input_tokens_seen": 166250530, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.14532471, "step": 7744, "time_per_iteration": 2.539668560028076 }, { "auxiliary_loss_clip": 0.06459634, "auxiliary_loss_mlp": 0.01269656, "balance_loss_clip": 0.06289659, "balance_loss_mlp": 0.01255845, "epoch": 0.4656545919134225, "flos": 34505030010240.0, "grad_norm": 1.947821286219566, "language_loss": 0.85110247, "learning_rate": 2.31819542038153e-06, "loss": 0.92839533, "num_input_tokens_seen": 166272545, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.13800049, "step": 7745, "time_per_iteration": 2.66528582572937 }, { "auxiliary_loss_clip": 0.06457188, "auxiliary_loss_mlp": 0.01273854, "balance_loss_clip": 0.06289198, "balance_loss_mlp": 0.012603, "epoch": 0.4657147151660905, "flos": 24316465449600.0, "grad_norm": 2.45847605166381, "language_loss": 0.7358858, "learning_rate": 2.317810913304574e-06, "loss": 0.81319618, "num_input_tokens_seen": 166292135, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.13555908, "step": 7746, "time_per_iteration": 2.5773990154266357 }, { "auxiliary_loss_clip": 0.06457526, "auxiliary_loss_mlp": 0.01276249, "balance_loss_clip": 0.06289327, "balance_loss_mlp": 0.01262534, "epoch": 0.46577483841875844, "flos": 58807743390720.0, "grad_norm": 1.6417169065431299, "language_loss": 0.70086652, "learning_rate": 2.3174263941761963e-06, "loss": 0.77820432, "num_input_tokens_seen": 166316710, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.13720703, "step": 7747, "time_per_iteration": 2.8997414112091064 }, { "auxiliary_loss_clip": 0.06453225, "auxiliary_loss_mlp": 0.01272, "balance_loss_clip": 0.06283782, "balance_loss_mlp": 0.0125782, "epoch": 0.4658349616714264, "flos": 31330081353600.0, "grad_norm": 2.523277430290763, "language_loss": 0.67730987, "learning_rate": 2.317041863010978e-06, "loss": 0.75456214, "num_input_tokens_seen": 166338535, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.14172363, "step": 7748, "time_per_iteration": 2.657968282699585 }, { "auxiliary_loss_clip": 0.06459417, "auxiliary_loss_mlp": 0.01273328, "balance_loss_clip": 0.06283927, "balance_loss_mlp": 0.0125854, "epoch": 0.46589508492409437, "flos": 14864601985920.0, "grad_norm": 1.9512486454478843, "language_loss": 0.64957368, "learning_rate": 2.3166573198235007e-06, "loss": 0.72690117, "num_input_tokens_seen": 166355540, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.14807129, "step": 7749, "time_per_iteration": 2.5276260375976562 }, { "auxiliary_loss_clip": 0.06466635, "auxiliary_loss_mlp": 0.01272396, "balance_loss_clip": 0.0629214, "balance_loss_mlp": 0.01257536, "epoch": 0.46595520817676234, "flos": 12900908424960.0, "grad_norm": 2.31978032962688, "language_loss": 0.74657261, "learning_rate": 2.3162727646283456e-06, "loss": 0.82396293, "num_input_tokens_seen": 166372635, "router_z_loss_clip": 1.74511719, "router_z_loss_mlp": 0.14855957, "step": 7750, "time_per_iteration": 2.5233309268951416 }, { "auxiliary_loss_clip": 0.06460878, "auxiliary_loss_mlp": 0.01273444, "balance_loss_clip": 0.06286909, "balance_loss_mlp": 0.0125865, "epoch": 0.46601533142943036, "flos": 32862504349440.0, "grad_norm": 1.8942597076357266, "language_loss": 0.7477417, "learning_rate": 2.3158881974400963e-06, "loss": 0.82508492, "num_input_tokens_seen": 166393175, "router_z_loss_clip": 1.73730469, "router_z_loss_mlp": 0.14788818, "step": 7751, "time_per_iteration": 2.6531615257263184 }, { "auxiliary_loss_clip": 0.06466139, "auxiliary_loss_mlp": 0.01268158, "balance_loss_clip": 0.06290635, "balance_loss_mlp": 0.01253377, "epoch": 0.4660754546820983, "flos": 19972496954880.0, "grad_norm": 2.251948812428874, "language_loss": 0.74097264, "learning_rate": 2.3155036182733345e-06, "loss": 0.81831563, "num_input_tokens_seen": 166408630, "router_z_loss_clip": 1.75390625, "router_z_loss_mlp": 0.14782715, "step": 7752, "time_per_iteration": 2.5487351417541504 }, { "auxiliary_loss_clip": 0.0646679, "auxiliary_loss_mlp": 0.01272025, "balance_loss_clip": 0.06290112, "balance_loss_mlp": 0.01257029, "epoch": 0.4661355779347663, "flos": 26695482572160.0, "grad_norm": 2.630649711569767, "language_loss": 0.69772607, "learning_rate": 2.315119027142644e-06, "loss": 0.7751143, "num_input_tokens_seen": 166428170, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.15002441, "step": 7753, "time_per_iteration": 2.6003940105438232 }, { "auxiliary_loss_clip": 0.06454651, "auxiliary_loss_mlp": 0.01270873, "balance_loss_clip": 0.06287032, "balance_loss_mlp": 0.01256783, "epoch": 0.46619570118743425, "flos": 20965726926720.0, "grad_norm": 1.7639210277867379, "language_loss": 0.73109454, "learning_rate": 2.3147344240626076e-06, "loss": 0.80834973, "num_input_tokens_seen": 166446705, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.14074707, "step": 7754, "time_per_iteration": 4.004580974578857 }, { "auxiliary_loss_clip": 0.06461182, "auxiliary_loss_mlp": 0.01274211, "balance_loss_clip": 0.06287631, "balance_loss_mlp": 0.0125782, "epoch": 0.4662558244401022, "flos": 24433024878720.0, "grad_norm": 1.3963341159639342, "language_loss": 0.79563028, "learning_rate": 2.3143498090478114e-06, "loss": 0.87298423, "num_input_tokens_seen": 166466750, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.16394043, "step": 7755, "time_per_iteration": 2.5770764350891113 }, { "auxiliary_loss_clip": 0.06450014, "auxiliary_loss_mlp": 0.01270291, "balance_loss_clip": 0.06283247, "balance_loss_mlp": 0.01257225, "epoch": 0.4663159476927702, "flos": 20601820644480.0, "grad_norm": 1.7842724115814745, "language_loss": 0.71994996, "learning_rate": 2.3139651821128382e-06, "loss": 0.797153, "num_input_tokens_seen": 166485400, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.13061523, "step": 7756, "time_per_iteration": 4.015011310577393 }, { "auxiliary_loss_clip": 0.06449981, "auxiliary_loss_mlp": 0.01268598, "balance_loss_clip": 0.06283052, "balance_loss_mlp": 0.01255533, "epoch": 0.46637607094543815, "flos": 25668235042560.0, "grad_norm": 1.6769386599168417, "language_loss": 0.78653705, "learning_rate": 2.313580543272274e-06, "loss": 0.8637228, "num_input_tokens_seen": 166505730, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.1307373, "step": 7757, "time_per_iteration": 2.5914883613586426 }, { "auxiliary_loss_clip": 0.06454287, "auxiliary_loss_mlp": 0.01270829, "balance_loss_clip": 0.0628418, "balance_loss_mlp": 0.01257442, "epoch": 0.4664361941981061, "flos": 24279722634240.0, "grad_norm": 1.7297526894142872, "language_loss": 0.66811109, "learning_rate": 2.313195892540705e-06, "loss": 0.74536228, "num_input_tokens_seen": 166523770, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.13391113, "step": 7758, "time_per_iteration": 2.567483425140381 }, { "auxiliary_loss_clip": 0.06456141, "auxiliary_loss_mlp": 0.01271862, "balance_loss_clip": 0.06287833, "balance_loss_mlp": 0.0125851, "epoch": 0.4664963174507741, "flos": 18411800405760.0, "grad_norm": 1.9514414818607817, "language_loss": 0.75013971, "learning_rate": 2.3128112299327147e-06, "loss": 0.82741976, "num_input_tokens_seen": 166542935, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.13336182, "step": 7759, "time_per_iteration": 2.5532004833221436 }, { "auxiliary_loss_clip": 0.06453434, "auxiliary_loss_mlp": 0.01272117, "balance_loss_clip": 0.06285964, "balance_loss_mlp": 0.01259338, "epoch": 0.46655644070344204, "flos": 22461616742400.0, "grad_norm": 1.3968927929417376, "language_loss": 0.77969712, "learning_rate": 2.312426555462893e-06, "loss": 0.85695261, "num_input_tokens_seen": 166563935, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.12780762, "step": 7760, "time_per_iteration": 2.7048189640045166 }, { "auxiliary_loss_clip": 0.06448977, "auxiliary_loss_mlp": 0.01265865, "balance_loss_clip": 0.06283298, "balance_loss_mlp": 0.01252347, "epoch": 0.46661656395611, "flos": 13813525169280.0, "grad_norm": 1.5706528768403094, "language_loss": 0.74254501, "learning_rate": 2.3120418691458237e-06, "loss": 0.81969339, "num_input_tokens_seen": 166582175, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.13525391, "step": 7761, "time_per_iteration": 2.6442995071411133 }, { "auxiliary_loss_clip": 0.06457025, "auxiliary_loss_mlp": 0.01273832, "balance_loss_clip": 0.0628572, "balance_loss_mlp": 0.01258311, "epoch": 0.466676687208778, "flos": 21658473757440.0, "grad_norm": 1.5349576233663564, "language_loss": 0.79180819, "learning_rate": 2.3116571709960956e-06, "loss": 0.86911678, "num_input_tokens_seen": 166601870, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.15515137, "step": 7762, "time_per_iteration": 2.589177131652832 }, { "auxiliary_loss_clip": 0.06360576, "auxiliary_loss_mlp": 0.01254578, "balance_loss_clip": 0.06292257, "balance_loss_mlp": 0.01252252, "epoch": 0.46673681046144594, "flos": 68554163554560.0, "grad_norm": 0.7775032090084532, "language_loss": 0.59669924, "learning_rate": 2.311272461028297e-06, "loss": 0.67285073, "num_input_tokens_seen": 166668960, "router_z_loss_clip": 0.68359375, "router_z_loss_mlp": 0.02325439, "step": 7763, "time_per_iteration": 3.2582948207855225 }, { "auxiliary_loss_clip": 0.06462081, "auxiliary_loss_mlp": 0.01271139, "balance_loss_clip": 0.06288414, "balance_loss_mlp": 0.01256345, "epoch": 0.46679693371411396, "flos": 15819789404160.0, "grad_norm": 2.026147267484705, "language_loss": 0.78790742, "learning_rate": 2.3108877392570146e-06, "loss": 0.86523962, "num_input_tokens_seen": 166686110, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.14794922, "step": 7764, "time_per_iteration": 2.5540928840637207 }, { "auxiliary_loss_clip": 0.06453373, "auxiliary_loss_mlp": 0.01273546, "balance_loss_clip": 0.06287565, "balance_loss_mlp": 0.0126057, "epoch": 0.4668570569667819, "flos": 18520393697280.0, "grad_norm": 1.7253537677725632, "language_loss": 0.72159576, "learning_rate": 2.310503005696839e-06, "loss": 0.79886496, "num_input_tokens_seen": 166703930, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.12976074, "step": 7765, "time_per_iteration": 2.5454957485198975 }, { "auxiliary_loss_clip": 0.0645874, "auxiliary_loss_mlp": 0.01273041, "balance_loss_clip": 0.06287052, "balance_loss_mlp": 0.01258897, "epoch": 0.4669171802194499, "flos": 19212385841280.0, "grad_norm": 1.877765957780897, "language_loss": 0.77866042, "learning_rate": 2.3101182603623576e-06, "loss": 0.85597831, "num_input_tokens_seen": 166719940, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.14154053, "step": 7766, "time_per_iteration": 3.943392515182495 }, { "auxiliary_loss_clip": 0.06452334, "auxiliary_loss_mlp": 0.01273266, "balance_loss_clip": 0.06285138, "balance_loss_mlp": 0.01259378, "epoch": 0.46697730347211786, "flos": 12281018319360.0, "grad_norm": 1.9831978736189189, "language_loss": 0.6587137, "learning_rate": 2.3097335032681607e-06, "loss": 0.73596966, "num_input_tokens_seen": 166738285, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.13885498, "step": 7767, "time_per_iteration": 3.9579432010650635 }, { "auxiliary_loss_clip": 0.06455957, "auxiliary_loss_mlp": 0.01271426, "balance_loss_clip": 0.0628731, "balance_loss_mlp": 0.01257353, "epoch": 0.4670374267247858, "flos": 23593516421760.0, "grad_norm": 1.9753974888950214, "language_loss": 0.74958414, "learning_rate": 2.3093487344288393e-06, "loss": 0.82685792, "num_input_tokens_seen": 166758170, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.14050293, "step": 7768, "time_per_iteration": 2.5842411518096924 }, { "auxiliary_loss_clip": 0.06455588, "auxiliary_loss_mlp": 0.01269426, "balance_loss_clip": 0.06289148, "balance_loss_mlp": 0.01256104, "epoch": 0.4670975499774538, "flos": 15995495416320.0, "grad_norm": 1.6719569155380332, "language_loss": 0.71127278, "learning_rate": 2.308963953858982e-06, "loss": 0.78852296, "num_input_tokens_seen": 166775750, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.13317871, "step": 7769, "time_per_iteration": 2.5387885570526123 }, { "auxiliary_loss_clip": 0.06455064, "auxiliary_loss_mlp": 0.01269061, "balance_loss_clip": 0.06286915, "balance_loss_mlp": 0.01255393, "epoch": 0.46715767323012175, "flos": 15383026396800.0, "grad_norm": 1.929270322933231, "language_loss": 0.81834406, "learning_rate": 2.3085791615731803e-06, "loss": 0.8955853, "num_input_tokens_seen": 166791720, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.13690186, "step": 7770, "time_per_iteration": 2.5489630699157715 }, { "auxiliary_loss_clip": 0.06348307, "auxiliary_loss_mlp": 0.0125295, "balance_loss_clip": 0.06279667, "balance_loss_mlp": 0.01250573, "epoch": 0.4672177964827897, "flos": 60270774877440.0, "grad_norm": 0.7850979418249429, "language_loss": 0.55606937, "learning_rate": 2.3081943575860265e-06, "loss": 0.63208193, "num_input_tokens_seen": 166856360, "router_z_loss_clip": 0.6875, "router_z_loss_mlp": 0.02372742, "step": 7771, "time_per_iteration": 3.199626922607422 }, { "auxiliary_loss_clip": 0.06456731, "auxiliary_loss_mlp": 0.01267117, "balance_loss_clip": 0.06288742, "balance_loss_mlp": 0.01253402, "epoch": 0.4672779197354577, "flos": 27643500466560.0, "grad_norm": 1.9751731800828933, "language_loss": 0.6629523, "learning_rate": 2.3078095419121117e-06, "loss": 0.74019086, "num_input_tokens_seen": 166875925, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.13726807, "step": 7772, "time_per_iteration": 2.6027915477752686 }, { "auxiliary_loss_clip": 0.06454319, "auxiliary_loss_mlp": 0.01268968, "balance_loss_clip": 0.06288542, "balance_loss_mlp": 0.01256296, "epoch": 0.46733804298812565, "flos": 31402267246080.0, "grad_norm": 1.9058791778920887, "language_loss": 0.64007807, "learning_rate": 2.3074247145660283e-06, "loss": 0.71731097, "num_input_tokens_seen": 166896520, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.12664795, "step": 7773, "time_per_iteration": 2.6291561126708984 }, { "auxiliary_loss_clip": 0.06454776, "auxiliary_loss_mlp": 0.01270085, "balance_loss_clip": 0.06287204, "balance_loss_mlp": 0.01256585, "epoch": 0.4673981662407936, "flos": 19506747634560.0, "grad_norm": 1.7750221969533146, "language_loss": 0.80332989, "learning_rate": 2.3070398755623685e-06, "loss": 0.88057846, "num_input_tokens_seen": 166915370, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.13500977, "step": 7774, "time_per_iteration": 2.5448129177093506 }, { "auxiliary_loss_clip": 0.06460559, "auxiliary_loss_mlp": 0.0127066, "balance_loss_clip": 0.06289164, "balance_loss_mlp": 0.01256855, "epoch": 0.4674582894934616, "flos": 20528083451520.0, "grad_norm": 1.5653908385185145, "language_loss": 0.78102589, "learning_rate": 2.306655024915726e-06, "loss": 0.858338, "num_input_tokens_seen": 166934875, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.13818359, "step": 7775, "time_per_iteration": 2.5522327423095703 }, { "auxiliary_loss_clip": 0.06456086, "auxiliary_loss_mlp": 0.01271805, "balance_loss_clip": 0.06289877, "balance_loss_mlp": 0.01259014, "epoch": 0.46751841274612954, "flos": 22097500824960.0, "grad_norm": 1.8965738178608098, "language_loss": 0.70039362, "learning_rate": 2.306270162640694e-06, "loss": 0.77767253, "num_input_tokens_seen": 166954285, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.12786865, "step": 7776, "time_per_iteration": 2.5628130435943604 }, { "auxiliary_loss_clip": 0.06450695, "auxiliary_loss_mlp": 0.01268989, "balance_loss_clip": 0.06285147, "balance_loss_mlp": 0.01256311, "epoch": 0.46757853599879756, "flos": 26987454524160.0, "grad_norm": 1.3722289155613376, "language_loss": 0.74357992, "learning_rate": 2.3058852887518678e-06, "loss": 0.82077676, "num_input_tokens_seen": 166975975, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.12677002, "step": 7777, "time_per_iteration": 2.751805543899536 }, { "auxiliary_loss_clip": 0.06461263, "auxiliary_loss_mlp": 0.01270141, "balance_loss_clip": 0.06293309, "balance_loss_mlp": 0.01257344, "epoch": 0.4676386592514655, "flos": 24140927145600.0, "grad_norm": 2.140911208588168, "language_loss": 0.70482641, "learning_rate": 2.3055004032638394e-06, "loss": 0.78214049, "num_input_tokens_seen": 166996140, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.12786865, "step": 7778, "time_per_iteration": 2.6497044563293457 }, { "auxiliary_loss_clip": 0.06462975, "auxiliary_loss_mlp": 0.0126963, "balance_loss_clip": 0.06290943, "balance_loss_mlp": 0.01255629, "epoch": 0.4676987825041335, "flos": 25490768094720.0, "grad_norm": 1.9062719451875587, "language_loss": 0.74001646, "learning_rate": 2.305115506191206e-06, "loss": 0.8173424, "num_input_tokens_seen": 167016105, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.13989258, "step": 7779, "time_per_iteration": 2.636904001235962 }, { "auxiliary_loss_clip": 0.06453363, "auxiliary_loss_mlp": 0.01266383, "balance_loss_clip": 0.06289025, "balance_loss_mlp": 0.01253884, "epoch": 0.46775890575680146, "flos": 21951871228800.0, "grad_norm": 1.4514905734524488, "language_loss": 0.72654253, "learning_rate": 2.304730597548562e-06, "loss": 0.80373996, "num_input_tokens_seen": 167036185, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.125, "step": 7780, "time_per_iteration": 2.608304023742676 }, { "auxiliary_loss_clip": 0.06459189, "auxiliary_loss_mlp": 0.01270538, "balance_loss_clip": 0.0628574, "balance_loss_mlp": 0.01256984, "epoch": 0.4678190290094694, "flos": 25235413176960.0, "grad_norm": 1.9039005506772957, "language_loss": 0.74520695, "learning_rate": 2.3043456773505023e-06, "loss": 0.82250422, "num_input_tokens_seen": 167054515, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.13562012, "step": 7781, "time_per_iteration": 2.620007276535034 }, { "auxiliary_loss_clip": 0.06459018, "auxiliary_loss_mlp": 0.01275122, "balance_loss_clip": 0.06287205, "balance_loss_mlp": 0.01261193, "epoch": 0.4678791522621374, "flos": 32276254458240.0, "grad_norm": 1.8156695386871795, "language_loss": 0.63407528, "learning_rate": 2.3039607456116252e-06, "loss": 0.71141672, "num_input_tokens_seen": 167077245, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.13934326, "step": 7782, "time_per_iteration": 2.687086820602417 }, { "auxiliary_loss_clip": 0.06458797, "auxiliary_loss_mlp": 0.01267232, "balance_loss_clip": 0.06288514, "balance_loss_mlp": 0.01254417, "epoch": 0.46793927551480535, "flos": 27052764382080.0, "grad_norm": 2.0625440715011494, "language_loss": 0.63691765, "learning_rate": 2.3035758023465254e-06, "loss": 0.71417791, "num_input_tokens_seen": 167097235, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.12823486, "step": 7783, "time_per_iteration": 2.631066083908081 }, { "auxiliary_loss_clip": 0.06462982, "auxiliary_loss_mlp": 0.01271102, "balance_loss_clip": 0.06288, "balance_loss_mlp": 0.01256415, "epoch": 0.4679993987674733, "flos": 17463195532800.0, "grad_norm": 2.0702056687221346, "language_loss": 0.68306208, "learning_rate": 2.303190847569801e-06, "loss": 0.76040286, "num_input_tokens_seen": 167113155, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.14685059, "step": 7784, "time_per_iteration": 2.5519795417785645 }, { "auxiliary_loss_clip": 0.06450765, "auxiliary_loss_mlp": 0.01268779, "balance_loss_clip": 0.06282981, "balance_loss_mlp": 0.01255726, "epoch": 0.4680595220201413, "flos": 17170804310400.0, "grad_norm": 1.818103613730289, "language_loss": 0.84879088, "learning_rate": 2.3028058812960497e-06, "loss": 0.92598629, "num_input_tokens_seen": 167131765, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.13037109, "step": 7785, "time_per_iteration": 2.52583384513855 }, { "auxiliary_loss_clip": 0.06452549, "auxiliary_loss_mlp": 0.01267333, "balance_loss_clip": 0.06284912, "balance_loss_mlp": 0.01254261, "epoch": 0.46811964527280925, "flos": 11332329592320.0, "grad_norm": 1.7974421733190253, "language_loss": 0.77642483, "learning_rate": 2.3024209035398678e-06, "loss": 0.85362369, "num_input_tokens_seen": 167149030, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.13079834, "step": 7786, "time_per_iteration": 2.540233850479126 }, { "auxiliary_loss_clip": 0.06450374, "auxiliary_loss_mlp": 0.0126871, "balance_loss_clip": 0.06287359, "balance_loss_mlp": 0.01257134, "epoch": 0.4681797685254772, "flos": 24285508565760.0, "grad_norm": 9.295623624444053, "language_loss": 0.74412107, "learning_rate": 2.302035914315856e-06, "loss": 0.82131189, "num_input_tokens_seen": 167167375, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.11578369, "step": 7787, "time_per_iteration": 2.598790407180786 }, { "auxiliary_loss_clip": 0.06451014, "auxiliary_loss_mlp": 0.01271301, "balance_loss_clip": 0.06284125, "balance_loss_mlp": 0.0125842, "epoch": 0.4682398917781452, "flos": 31658544558720.0, "grad_norm": 1.766441848905322, "language_loss": 0.6630801, "learning_rate": 2.3016509136386116e-06, "loss": 0.74030328, "num_input_tokens_seen": 167188065, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.12878418, "step": 7788, "time_per_iteration": 2.6585278511047363 }, { "auxiliary_loss_clip": 0.06458052, "auxiliary_loss_mlp": 0.01269307, "balance_loss_clip": 0.06291177, "balance_loss_mlp": 0.01257714, "epoch": 0.46830001503081314, "flos": 28118264100480.0, "grad_norm": 2.010600012405644, "language_loss": 0.64455748, "learning_rate": 2.3012659015227343e-06, "loss": 0.72183108, "num_input_tokens_seen": 167209675, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.11584473, "step": 7789, "time_per_iteration": 2.6066153049468994 }, { "auxiliary_loss_clip": 0.06352378, "auxiliary_loss_mlp": 0.01255002, "balance_loss_clip": 0.06284814, "balance_loss_mlp": 0.01252665, "epoch": 0.4683601382834811, "flos": 57900059308800.0, "grad_norm": 0.6930892044185704, "language_loss": 0.61818194, "learning_rate": 2.300880877982825e-06, "loss": 0.69425571, "num_input_tokens_seen": 167273940, "router_z_loss_clip": 0.67578125, "router_z_loss_mlp": 0.02331543, "step": 7790, "time_per_iteration": 3.264639377593994 }, { "auxiliary_loss_clip": 0.06450378, "auxiliary_loss_mlp": 0.01269346, "balance_loss_clip": 0.06286203, "balance_loss_mlp": 0.01256835, "epoch": 0.46842026153614913, "flos": 21878427525120.0, "grad_norm": 1.5364766630201223, "language_loss": 0.79679489, "learning_rate": 2.3004958430334808e-06, "loss": 0.87399209, "num_input_tokens_seen": 167292730, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.12518311, "step": 7791, "time_per_iteration": 2.5708513259887695 }, { "auxiliary_loss_clip": 0.0644953, "auxiliary_loss_mlp": 0.01267668, "balance_loss_clip": 0.06285992, "balance_loss_mlp": 0.01255294, "epoch": 0.4684803847888171, "flos": 24907914293760.0, "grad_norm": 1.5083699568096287, "language_loss": 0.75181323, "learning_rate": 2.3001107966893052e-06, "loss": 0.82898521, "num_input_tokens_seen": 167313460, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.12390137, "step": 7792, "time_per_iteration": 2.6262660026550293 }, { "auxiliary_loss_clip": 0.0644225, "auxiliary_loss_mlp": 0.01268971, "balance_loss_clip": 0.06280594, "balance_loss_mlp": 0.01257211, "epoch": 0.46854050804148506, "flos": 26259138835200.0, "grad_norm": 2.131554951538024, "language_loss": 0.6845572, "learning_rate": 2.299725738964898e-06, "loss": 0.7616694, "num_input_tokens_seen": 167335385, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.11755371, "step": 7793, "time_per_iteration": 2.6286911964416504 }, { "auxiliary_loss_clip": 0.06448987, "auxiliary_loss_mlp": 0.01272128, "balance_loss_clip": 0.0628681, "balance_loss_mlp": 0.01260374, "epoch": 0.468600631294153, "flos": 21586204010880.0, "grad_norm": 1.8645785879038048, "language_loss": 0.74244052, "learning_rate": 2.2993406698748607e-06, "loss": 0.81965172, "num_input_tokens_seen": 167353625, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.11755371, "step": 7794, "time_per_iteration": 4.042956829071045 }, { "auxiliary_loss_clip": 0.06456482, "auxiliary_loss_mlp": 0.01268845, "balance_loss_clip": 0.06290857, "balance_loss_mlp": 0.01255857, "epoch": 0.468660754546821, "flos": 25892842711680.0, "grad_norm": 1.5268237899281976, "language_loss": 0.64107251, "learning_rate": 2.2989555894337953e-06, "loss": 0.71832585, "num_input_tokens_seen": 167374565, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.12994385, "step": 7795, "time_per_iteration": 2.6145031452178955 }, { "auxiliary_loss_clip": 0.0645122, "auxiliary_loss_mlp": 0.01269831, "balance_loss_clip": 0.06289569, "balance_loss_mlp": 0.01256796, "epoch": 0.46872087779948896, "flos": 35482746977280.0, "grad_norm": 4.272229159853904, "language_loss": 0.68479073, "learning_rate": 2.298570497656304e-06, "loss": 0.76200122, "num_input_tokens_seen": 167395010, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.13043213, "step": 7796, "time_per_iteration": 4.1361777782440186 }, { "auxiliary_loss_clip": 0.06449992, "auxiliary_loss_mlp": 0.01268839, "balance_loss_clip": 0.06283978, "balance_loss_mlp": 0.01256138, "epoch": 0.4687810010521569, "flos": 26403720255360.0, "grad_norm": 1.6996712688582776, "language_loss": 0.705917, "learning_rate": 2.2981853945569894e-06, "loss": 0.78310525, "num_input_tokens_seen": 167415285, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.12695312, "step": 7797, "time_per_iteration": 2.597804069519043 }, { "auxiliary_loss_clip": 0.06455523, "auxiliary_loss_mlp": 0.01269695, "balance_loss_clip": 0.06288395, "balance_loss_mlp": 0.01255622, "epoch": 0.4688411243048249, "flos": 19978618302720.0, "grad_norm": 4.020398798171029, "language_loss": 0.66952407, "learning_rate": 2.297800280150454e-06, "loss": 0.74677628, "num_input_tokens_seen": 167432405, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.14080811, "step": 7798, "time_per_iteration": 2.541191816329956 }, { "auxiliary_loss_clip": 0.06355362, "auxiliary_loss_mlp": 0.01255255, "balance_loss_clip": 0.06287557, "balance_loss_mlp": 0.0125282, "epoch": 0.46890124755749285, "flos": 63996739983360.0, "grad_norm": 0.9167700117977595, "language_loss": 0.64491463, "learning_rate": 2.2974151544513033e-06, "loss": 0.72102082, "num_input_tokens_seen": 167499365, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 0.02432251, "step": 7799, "time_per_iteration": 3.3340015411376953 }, { "auxiliary_loss_clip": 0.06448943, "auxiliary_loss_mlp": 0.01271268, "balance_loss_clip": 0.06285091, "balance_loss_mlp": 0.01258506, "epoch": 0.4689613708101608, "flos": 23775763052160.0, "grad_norm": 1.3429224253265413, "language_loss": 0.72477102, "learning_rate": 2.2970300174741395e-06, "loss": 0.8019731, "num_input_tokens_seen": 167520390, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.12756348, "step": 7800, "time_per_iteration": 2.5837085247039795 }, { "auxiliary_loss_clip": 0.06448808, "auxiliary_loss_mlp": 0.01271913, "balance_loss_clip": 0.06287342, "balance_loss_mlp": 0.01260075, "epoch": 0.4690214940628288, "flos": 24795337933440.0, "grad_norm": 1.9427339695507173, "language_loss": 0.72538733, "learning_rate": 2.296644869233568e-06, "loss": 0.80259454, "num_input_tokens_seen": 167539865, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.11834717, "step": 7801, "time_per_iteration": 2.5808980464935303 }, { "auxiliary_loss_clip": 0.06461719, "auxiliary_loss_mlp": 0.01272172, "balance_loss_clip": 0.0629089, "balance_loss_mlp": 0.01258469, "epoch": 0.46908161731549675, "flos": 18083169492480.0, "grad_norm": 1.7723867399724014, "language_loss": 0.62999785, "learning_rate": 2.2962597097441936e-06, "loss": 0.70733672, "num_input_tokens_seen": 167558190, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.13708496, "step": 7802, "time_per_iteration": 2.5356523990631104 }, { "auxiliary_loss_clip": 0.06449014, "auxiliary_loss_mlp": 0.0126944, "balance_loss_clip": 0.06282115, "balance_loss_mlp": 0.01256756, "epoch": 0.4691417405681647, "flos": 25710554154240.0, "grad_norm": 1.9385183522801464, "language_loss": 0.74041134, "learning_rate": 2.2958745390206206e-06, "loss": 0.81759584, "num_input_tokens_seen": 167577685, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.12689209, "step": 7803, "time_per_iteration": 2.589425563812256 }, { "auxiliary_loss_clip": 0.06451146, "auxiliary_loss_mlp": 0.01273675, "balance_loss_clip": 0.06287783, "balance_loss_mlp": 0.01261653, "epoch": 0.46920186382083273, "flos": 17462776262400.0, "grad_norm": 1.6501273618461214, "language_loss": 0.77997696, "learning_rate": 2.2954893570774558e-06, "loss": 0.85722512, "num_input_tokens_seen": 167596390, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.12030029, "step": 7804, "time_per_iteration": 2.5239920616149902 }, { "auxiliary_loss_clip": 0.06450127, "auxiliary_loss_mlp": 0.01266156, "balance_loss_clip": 0.06286892, "balance_loss_mlp": 0.01253645, "epoch": 0.4692619870735007, "flos": 20345669112960.0, "grad_norm": 1.6116439070667445, "language_loss": 0.7739675, "learning_rate": 2.295104163929305e-06, "loss": 0.85113037, "num_input_tokens_seen": 167614980, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.125, "step": 7805, "time_per_iteration": 2.5356194972991943 }, { "auxiliary_loss_clip": 0.0645975, "auxiliary_loss_mlp": 0.01271726, "balance_loss_clip": 0.06288462, "balance_loss_mlp": 0.01258542, "epoch": 0.46932211032616866, "flos": 29504177032320.0, "grad_norm": 1.5479043925648428, "language_loss": 0.82542318, "learning_rate": 2.2947189595907742e-06, "loss": 0.90273792, "num_input_tokens_seen": 167635895, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.13183594, "step": 7806, "time_per_iteration": 5.479264736175537 }, { "auxiliary_loss_clip": 0.0644875, "auxiliary_loss_mlp": 0.01269012, "balance_loss_clip": 0.06280522, "balance_loss_mlp": 0.01256483, "epoch": 0.4693822335788366, "flos": 36220202760960.0, "grad_norm": 1.5764085029236916, "language_loss": 0.77389216, "learning_rate": 2.294333744076472e-06, "loss": 0.85106981, "num_input_tokens_seen": 167657440, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.12530518, "step": 7807, "time_per_iteration": 2.800682306289673 }, { "auxiliary_loss_clip": 0.06455641, "auxiliary_loss_mlp": 0.01272357, "balance_loss_clip": 0.0628865, "balance_loss_mlp": 0.01259262, "epoch": 0.4694423568315046, "flos": 20345124061440.0, "grad_norm": 1.8057723657501674, "language_loss": 0.51717895, "learning_rate": 2.2939485174010035e-06, "loss": 0.59445894, "num_input_tokens_seen": 167675025, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.13092041, "step": 7808, "time_per_iteration": 2.594379186630249 }, { "auxiliary_loss_clip": 0.06354819, "auxiliary_loss_mlp": 0.01253451, "balance_loss_clip": 0.06286398, "balance_loss_mlp": 0.01251122, "epoch": 0.46950248008417256, "flos": 64343540033280.0, "grad_norm": 0.7824980031553183, "language_loss": 0.57707548, "learning_rate": 2.293563279578978e-06, "loss": 0.65315819, "num_input_tokens_seen": 167729635, "router_z_loss_clip": 0.68359375, "router_z_loss_mlp": 0.02325439, "step": 7809, "time_per_iteration": 3.053394079208374 }, { "auxiliary_loss_clip": 0.06447405, "auxiliary_loss_mlp": 0.01267807, "balance_loss_clip": 0.06280968, "balance_loss_mlp": 0.01255022, "epoch": 0.4695626033368405, "flos": 19204755120000.0, "grad_norm": 2.084683365830919, "language_loss": 0.7216835, "learning_rate": 2.2931780306250045e-06, "loss": 0.79883564, "num_input_tokens_seen": 167745135, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.12780762, "step": 7810, "time_per_iteration": 2.540799856185913 }, { "auxiliary_loss_clip": 0.06450751, "auxiliary_loss_mlp": 0.01270411, "balance_loss_clip": 0.0628529, "balance_loss_mlp": 0.01257507, "epoch": 0.4696227265895085, "flos": 23009027466240.0, "grad_norm": 2.066338662349419, "language_loss": 0.81690371, "learning_rate": 2.29279277055369e-06, "loss": 0.89411527, "num_input_tokens_seen": 167763875, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.12915039, "step": 7811, "time_per_iteration": 2.5716936588287354 }, { "auxiliary_loss_clip": 0.06449046, "auxiliary_loss_mlp": 0.01269198, "balance_loss_clip": 0.06281979, "balance_loss_mlp": 0.01255316, "epoch": 0.46968284984217645, "flos": 21877169713920.0, "grad_norm": 1.4542121031834303, "language_loss": 0.80369365, "learning_rate": 2.292407499379644e-06, "loss": 0.88087606, "num_input_tokens_seen": 167784895, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.13885498, "step": 7812, "time_per_iteration": 2.5969393253326416 }, { "auxiliary_loss_clip": 0.06444632, "auxiliary_loss_mlp": 0.01270723, "balance_loss_clip": 0.06282444, "balance_loss_mlp": 0.01258993, "epoch": 0.4697429730948444, "flos": 19981217779200.0, "grad_norm": 2.010324269507895, "language_loss": 0.74512845, "learning_rate": 2.292022217117477e-06, "loss": 0.82228196, "num_input_tokens_seen": 167803185, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.1171875, "step": 7813, "time_per_iteration": 2.5738961696624756 }, { "auxiliary_loss_clip": 0.06451487, "auxiliary_loss_mlp": 0.01273375, "balance_loss_clip": 0.06286339, "balance_loss_mlp": 0.01260364, "epoch": 0.4698030963475124, "flos": 15161185912320.0, "grad_norm": 2.1535300026933935, "language_loss": 0.84988087, "learning_rate": 2.291636923781798e-06, "loss": 0.92712951, "num_input_tokens_seen": 167816550, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.13006592, "step": 7814, "time_per_iteration": 2.5302770137786865 }, { "auxiliary_loss_clip": 0.06442672, "auxiliary_loss_mlp": 0.01265865, "balance_loss_clip": 0.06280817, "balance_loss_mlp": 0.01253348, "epoch": 0.46986321960018035, "flos": 15155316126720.0, "grad_norm": 1.7627203072622444, "language_loss": 0.81722784, "learning_rate": 2.291251619387217e-06, "loss": 0.89431322, "num_input_tokens_seen": 167831845, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.12518311, "step": 7815, "time_per_iteration": 2.5120835304260254 }, { "auxiliary_loss_clip": 0.06446631, "auxiliary_loss_mlp": 0.01275609, "balance_loss_clip": 0.06281592, "balance_loss_mlp": 0.01262526, "epoch": 0.4699233428528483, "flos": 23115021281280.0, "grad_norm": 2.0736293525251637, "language_loss": 0.78068817, "learning_rate": 2.2908663039483468e-06, "loss": 0.85791051, "num_input_tokens_seen": 167850360, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.13092041, "step": 7816, "time_per_iteration": 2.5811407566070557 }, { "auxiliary_loss_clip": 0.06351097, "auxiliary_loss_mlp": 0.0125871, "balance_loss_clip": 0.0628348, "balance_loss_mlp": 0.0125629, "epoch": 0.46998346610551633, "flos": 68126917985280.0, "grad_norm": 0.8159833542505099, "language_loss": 0.58645856, "learning_rate": 2.290480977479796e-06, "loss": 0.66255665, "num_input_tokens_seen": 167908660, "router_z_loss_clip": 0.67578125, "router_z_loss_mlp": 0.02416992, "step": 7817, "time_per_iteration": 3.1758458614349365 }, { "auxiliary_loss_clip": 0.06444906, "auxiliary_loss_mlp": 0.01270157, "balance_loss_clip": 0.06285263, "balance_loss_mlp": 0.0125792, "epoch": 0.4700435893581843, "flos": 24135560484480.0, "grad_norm": 1.6695092120499373, "language_loss": 0.80047095, "learning_rate": 2.2900956399961775e-06, "loss": 0.87762153, "num_input_tokens_seen": 167927905, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.12237549, "step": 7818, "time_per_iteration": 2.61149001121521 }, { "auxiliary_loss_clip": 0.06446167, "auxiliary_loss_mlp": 0.01271227, "balance_loss_clip": 0.06281248, "balance_loss_mlp": 0.01258776, "epoch": 0.47010371261085226, "flos": 20155624053120.0, "grad_norm": 1.6359857650537055, "language_loss": 0.84331262, "learning_rate": 2.289710291512104e-06, "loss": 0.92048663, "num_input_tokens_seen": 167945995, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.12457275, "step": 7819, "time_per_iteration": 2.5546724796295166 }, { "auxiliary_loss_clip": 0.06453604, "auxiliary_loss_mlp": 0.01269491, "balance_loss_clip": 0.06284644, "balance_loss_mlp": 0.01256122, "epoch": 0.47016383586352023, "flos": 15127587624960.0, "grad_norm": 2.2557968129652206, "language_loss": 0.76708221, "learning_rate": 2.289324932042186e-06, "loss": 0.84431314, "num_input_tokens_seen": 167963380, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.1338501, "step": 7820, "time_per_iteration": 2.5219478607177734 }, { "auxiliary_loss_clip": 0.06447349, "auxiliary_loss_mlp": 0.01268809, "balance_loss_clip": 0.0628581, "balance_loss_mlp": 0.01256203, "epoch": 0.4702239591161882, "flos": 13558044470400.0, "grad_norm": 2.052938082605329, "language_loss": 0.74000323, "learning_rate": 2.288939561601039e-06, "loss": 0.81716478, "num_input_tokens_seen": 167981740, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.12609863, "step": 7821, "time_per_iteration": 2.52451491355896 }, { "auxiliary_loss_clip": 0.06444333, "auxiliary_loss_mlp": 0.0127249, "balance_loss_clip": 0.0628258, "balance_loss_mlp": 0.01260307, "epoch": 0.47028408236885616, "flos": 24282825235200.0, "grad_norm": 1.6461783113224064, "language_loss": 0.88831425, "learning_rate": 2.2885541802032746e-06, "loss": 0.96548247, "num_input_tokens_seen": 167999380, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.12188721, "step": 7822, "time_per_iteration": 2.60353684425354 }, { "auxiliary_loss_clip": 0.0644118, "auxiliary_loss_mlp": 0.01268673, "balance_loss_clip": 0.06279579, "balance_loss_mlp": 0.01256251, "epoch": 0.4703442056215241, "flos": 22863565578240.0, "grad_norm": 1.4478773491313557, "language_loss": 0.80031997, "learning_rate": 2.2881687878635055e-06, "loss": 0.87741852, "num_input_tokens_seen": 168018395, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.12408447, "step": 7823, "time_per_iteration": 2.5588061809539795 }, { "auxiliary_loss_clip": 0.06346256, "auxiliary_loss_mlp": 0.01252863, "balance_loss_clip": 0.0627871, "balance_loss_mlp": 0.01250941, "epoch": 0.4704043288741921, "flos": 69262381463040.0, "grad_norm": 0.6783262238339323, "language_loss": 0.56515747, "learning_rate": 2.2877833845963487e-06, "loss": 0.64114869, "num_input_tokens_seen": 168084080, "router_z_loss_clip": 0.67333984, "router_z_loss_mlp": 0.01919556, "step": 7824, "time_per_iteration": 3.3340988159179688 }, { "auxiliary_loss_clip": 0.06449873, "auxiliary_loss_mlp": 0.01271245, "balance_loss_clip": 0.0628408, "balance_loss_mlp": 0.01258251, "epoch": 0.47046445212686006, "flos": 18046971728640.0, "grad_norm": 1.6408358879412313, "language_loss": 0.81716907, "learning_rate": 2.2873979704164157e-06, "loss": 0.89438021, "num_input_tokens_seen": 168101555, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.12994385, "step": 7825, "time_per_iteration": 2.650322675704956 }, { "auxiliary_loss_clip": 0.06454621, "auxiliary_loss_mlp": 0.01271825, "balance_loss_clip": 0.06289002, "balance_loss_mlp": 0.01259296, "epoch": 0.470524575379528, "flos": 23958261244800.0, "grad_norm": 1.8156077966670146, "language_loss": 0.67409384, "learning_rate": 2.287012545338324e-06, "loss": 0.75135827, "num_input_tokens_seen": 168121530, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.12530518, "step": 7826, "time_per_iteration": 2.6228561401367188 }, { "auxiliary_loss_clip": 0.064523, "auxiliary_loss_mlp": 0.01269848, "balance_loss_clip": 0.06285838, "balance_loss_mlp": 0.01257092, "epoch": 0.470584698632196, "flos": 18119367256320.0, "grad_norm": 1.6565330681713728, "language_loss": 0.8411479, "learning_rate": 2.2866271093766877e-06, "loss": 0.91836941, "num_input_tokens_seen": 168140335, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.12756348, "step": 7827, "time_per_iteration": 2.579676866531372 }, { "auxiliary_loss_clip": 0.06340232, "auxiliary_loss_mlp": 0.01252363, "balance_loss_clip": 0.06272489, "balance_loss_mlp": 0.01250253, "epoch": 0.47064482188486395, "flos": 57268555413120.0, "grad_norm": 0.7880516426283687, "language_loss": 0.55654466, "learning_rate": 2.286241662546122e-06, "loss": 0.63247055, "num_input_tokens_seen": 168200535, "router_z_loss_clip": 0.67919922, "router_z_loss_mlp": 0.02111816, "step": 7828, "time_per_iteration": 3.2083232402801514 }, { "auxiliary_loss_clip": 0.0644632, "auxiliary_loss_mlp": 0.01271142, "balance_loss_clip": 0.06282955, "balance_loss_mlp": 0.01258177, "epoch": 0.4707049451375319, "flos": 17900922862080.0, "grad_norm": 1.8543238497521048, "language_loss": 0.81578356, "learning_rate": 2.285856204861245e-06, "loss": 0.89295816, "num_input_tokens_seen": 168219610, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.12957764, "step": 7829, "time_per_iteration": 2.560086488723755 }, { "auxiliary_loss_clip": 0.06449655, "auxiliary_loss_mlp": 0.01272327, "balance_loss_clip": 0.06286906, "balance_loss_mlp": 0.0126074, "epoch": 0.47076506839019994, "flos": 25240402494720.0, "grad_norm": 1.2709158464180175, "language_loss": 0.75932682, "learning_rate": 2.2854707363366703e-06, "loss": 0.83654666, "num_input_tokens_seen": 168242505, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.11584473, "step": 7830, "time_per_iteration": 2.6171631813049316 }, { "auxiliary_loss_clip": 0.06453584, "auxiliary_loss_mlp": 0.01269805, "balance_loss_clip": 0.06291635, "balance_loss_mlp": 0.01257211, "epoch": 0.4708251916428679, "flos": 13484684620800.0, "grad_norm": 1.8984961428786322, "language_loss": 0.7923826, "learning_rate": 2.2850852569870177e-06, "loss": 0.86961651, "num_input_tokens_seen": 168260220, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.12591553, "step": 7831, "time_per_iteration": 2.5318500995635986 }, { "auxiliary_loss_clip": 0.06460151, "auxiliary_loss_mlp": 0.01271334, "balance_loss_clip": 0.06287429, "balance_loss_mlp": 0.01257899, "epoch": 0.47088531489553587, "flos": 30154646678400.0, "grad_norm": 1.8975605588304574, "language_loss": 0.75621635, "learning_rate": 2.2846997668269033e-06, "loss": 0.8335312, "num_input_tokens_seen": 168277360, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.13433838, "step": 7832, "time_per_iteration": 2.633793830871582 }, { "auxiliary_loss_clip": 0.06446213, "auxiliary_loss_mlp": 0.012689, "balance_loss_clip": 0.06284912, "balance_loss_mlp": 0.01257855, "epoch": 0.47094543814820383, "flos": 21804648405120.0, "grad_norm": 1.2700894636732054, "language_loss": 0.7472499, "learning_rate": 2.2843142658709454e-06, "loss": 0.82440102, "num_input_tokens_seen": 168296605, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.11047363, "step": 7833, "time_per_iteration": 2.5582847595214844 }, { "auxiliary_loss_clip": 0.06453299, "auxiliary_loss_mlp": 0.01269898, "balance_loss_clip": 0.06289158, "balance_loss_mlp": 0.01257589, "epoch": 0.4710055614008718, "flos": 23009698298880.0, "grad_norm": 1.4707604674490675, "language_loss": 0.75913805, "learning_rate": 2.283928754133762e-06, "loss": 0.83636999, "num_input_tokens_seen": 168316205, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.12304688, "step": 7834, "time_per_iteration": 3.997138500213623 }, { "auxiliary_loss_clip": 0.06448978, "auxiliary_loss_mlp": 0.01265693, "balance_loss_clip": 0.06287432, "balance_loss_mlp": 0.01253516, "epoch": 0.47106568465353976, "flos": 42751256601600.0, "grad_norm": 1.4035393137894496, "language_loss": 0.66806722, "learning_rate": 2.283543231629972e-06, "loss": 0.74521393, "num_input_tokens_seen": 168338935, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.12188721, "step": 7835, "time_per_iteration": 4.259016275405884 }, { "auxiliary_loss_clip": 0.06342556, "auxiliary_loss_mlp": 0.01259457, "balance_loss_clip": 0.06274725, "balance_loss_mlp": 0.01257262, "epoch": 0.4711258079062077, "flos": 68571116807040.0, "grad_norm": 0.8637807470255854, "language_loss": 0.62229735, "learning_rate": 2.283157698374194e-06, "loss": 0.69831753, "num_input_tokens_seen": 168392800, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 0.02198792, "step": 7836, "time_per_iteration": 3.1727640628814697 }, { "auxiliary_loss_clip": 0.0645863, "auxiliary_loss_mlp": 0.01269634, "balance_loss_clip": 0.0628814, "balance_loss_mlp": 0.01256771, "epoch": 0.4711859311588757, "flos": 25453522154880.0, "grad_norm": 1.6658937640491935, "language_loss": 0.69922554, "learning_rate": 2.2827721543810475e-06, "loss": 0.77650821, "num_input_tokens_seen": 168412940, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.12854004, "step": 7837, "time_per_iteration": 2.623128890991211 }, { "auxiliary_loss_clip": 0.06453523, "auxiliary_loss_mlp": 0.01270253, "balance_loss_clip": 0.06289078, "balance_loss_mlp": 0.01257027, "epoch": 0.47124605441154366, "flos": 21988488263040.0, "grad_norm": 1.9207709844342227, "language_loss": 0.66805726, "learning_rate": 2.282386599665153e-06, "loss": 0.74529505, "num_input_tokens_seen": 168431995, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.13220215, "step": 7838, "time_per_iteration": 2.6005942821502686 }, { "auxiliary_loss_clip": 0.06459168, "auxiliary_loss_mlp": 0.01270698, "balance_loss_clip": 0.06289816, "balance_loss_mlp": 0.01258026, "epoch": 0.4713061776642116, "flos": 25420049648640.0, "grad_norm": 1.8656395357743556, "language_loss": 0.77993464, "learning_rate": 2.2820010342411304e-06, "loss": 0.85723329, "num_input_tokens_seen": 168454585, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.12670898, "step": 7839, "time_per_iteration": 2.6355671882629395 }, { "auxiliary_loss_clip": 0.06456012, "auxiliary_loss_mlp": 0.01275037, "balance_loss_clip": 0.06296353, "balance_loss_mlp": 0.01262592, "epoch": 0.4713663009168796, "flos": 26549559486720.0, "grad_norm": 1.9331718180901094, "language_loss": 0.72927481, "learning_rate": 2.2816154581235993e-06, "loss": 0.80658531, "num_input_tokens_seen": 168471265, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.12451172, "step": 7840, "time_per_iteration": 2.595536470413208 }, { "auxiliary_loss_clip": 0.06451137, "auxiliary_loss_mlp": 0.01267488, "balance_loss_clip": 0.06287477, "balance_loss_mlp": 0.01255328, "epoch": 0.47142642416954755, "flos": 23630426945280.0, "grad_norm": 1.5572182594532726, "language_loss": 0.75698817, "learning_rate": 2.2812298713271833e-06, "loss": 0.83417445, "num_input_tokens_seen": 168491360, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.12176514, "step": 7841, "time_per_iteration": 2.6057467460632324 }, { "auxiliary_loss_clip": 0.06458244, "auxiliary_loss_mlp": 0.01271397, "balance_loss_clip": 0.06292609, "balance_loss_mlp": 0.01258552, "epoch": 0.4714865474222155, "flos": 22316783760000.0, "grad_norm": 1.7621970189177791, "language_loss": 0.70609593, "learning_rate": 2.280844273866501e-06, "loss": 0.78339237, "num_input_tokens_seen": 168511335, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.12841797, "step": 7842, "time_per_iteration": 2.5722615718841553 }, { "auxiliary_loss_clip": 0.06461517, "auxiliary_loss_mlp": 0.01271864, "balance_loss_clip": 0.06297513, "balance_loss_mlp": 0.01258757, "epoch": 0.4715466706748835, "flos": 17828317699200.0, "grad_norm": 2.059267563081997, "language_loss": 0.7952376, "learning_rate": 2.280458665756177e-06, "loss": 0.87257135, "num_input_tokens_seen": 168529920, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.13098145, "step": 7843, "time_per_iteration": 2.546020984649658 }, { "auxiliary_loss_clip": 0.06450173, "auxiliary_loss_mlp": 0.01268355, "balance_loss_clip": 0.06287408, "balance_loss_mlp": 0.01256654, "epoch": 0.4716067939275515, "flos": 23666289292800.0, "grad_norm": 1.683342335123677, "language_loss": 0.74777889, "learning_rate": 2.280073047010832e-06, "loss": 0.82496417, "num_input_tokens_seen": 168550595, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.11700439, "step": 7844, "time_per_iteration": 3.9461781978607178 }, { "auxiliary_loss_clip": 0.0644744, "auxiliary_loss_mlp": 0.01273933, "balance_loss_clip": 0.06285569, "balance_loss_mlp": 0.0126082, "epoch": 0.47166691718021947, "flos": 17935778960640.0, "grad_norm": 1.4301293893857048, "language_loss": 0.79089797, "learning_rate": 2.279687417645088e-06, "loss": 0.86811173, "num_input_tokens_seen": 168569765, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.13092041, "step": 7845, "time_per_iteration": 2.541698455810547 }, { "auxiliary_loss_clip": 0.0645328, "auxiliary_loss_mlp": 0.01267514, "balance_loss_clip": 0.06292801, "balance_loss_mlp": 0.01256105, "epoch": 0.47172704043288743, "flos": 26621787306240.0, "grad_norm": 6.206543019657005, "language_loss": 0.7321915, "learning_rate": 2.2793017776735703e-06, "loss": 0.80939937, "num_input_tokens_seen": 168591525, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.11413574, "step": 7846, "time_per_iteration": 4.056241512298584 }, { "auxiliary_loss_clip": 0.06446194, "auxiliary_loss_mlp": 0.01272621, "balance_loss_clip": 0.06288613, "balance_loss_mlp": 0.01260658, "epoch": 0.4717871636855554, "flos": 27929225289600.0, "grad_norm": 1.3498882994421466, "language_loss": 0.74215913, "learning_rate": 2.2789161271109e-06, "loss": 0.81934726, "num_input_tokens_seen": 168611235, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.11962891, "step": 7847, "time_per_iteration": 2.6131374835968018 }, { "auxiliary_loss_clip": 0.06453241, "auxiliary_loss_mlp": 0.01270905, "balance_loss_clip": 0.06289682, "balance_loss_mlp": 0.01259347, "epoch": 0.47184728693822336, "flos": 14507571738240.0, "grad_norm": 1.790780874182849, "language_loss": 0.81194997, "learning_rate": 2.278530465971703e-06, "loss": 0.88919139, "num_input_tokens_seen": 168628710, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.11566162, "step": 7848, "time_per_iteration": 2.5624003410339355 }, { "auxiliary_loss_clip": 0.06455867, "auxiliary_loss_mlp": 0.01267479, "balance_loss_clip": 0.06290688, "balance_loss_mlp": 0.01254569, "epoch": 0.47190741019089133, "flos": 17862041767680.0, "grad_norm": 2.1035183021974193, "language_loss": 0.70313609, "learning_rate": 2.2781447942706032e-06, "loss": 0.78036952, "num_input_tokens_seen": 168645645, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.12908936, "step": 7849, "time_per_iteration": 2.5324149131774902 }, { "auxiliary_loss_clip": 0.0646126, "auxiliary_loss_mlp": 0.01267439, "balance_loss_clip": 0.06291391, "balance_loss_mlp": 0.01253778, "epoch": 0.4719675334435593, "flos": 17901384059520.0, "grad_norm": 2.0404525480152182, "language_loss": 0.70054746, "learning_rate": 2.277759112022224e-06, "loss": 0.77783442, "num_input_tokens_seen": 168664165, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.13677979, "step": 7850, "time_per_iteration": 2.5456795692443848 }, { "auxiliary_loss_clip": 0.06453161, "auxiliary_loss_mlp": 0.01268533, "balance_loss_clip": 0.06285381, "balance_loss_mlp": 0.01255909, "epoch": 0.47202765669622726, "flos": 20710665498240.0, "grad_norm": 1.7711985647638746, "language_loss": 0.75478339, "learning_rate": 2.2773734192411916e-06, "loss": 0.83200032, "num_input_tokens_seen": 168681940, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.12628174, "step": 7851, "time_per_iteration": 2.5721616744995117 }, { "auxiliary_loss_clip": 0.06453264, "auxiliary_loss_mlp": 0.01269191, "balance_loss_clip": 0.06286266, "balance_loss_mlp": 0.01255631, "epoch": 0.4720877799488952, "flos": 16365439192320.0, "grad_norm": 1.6999004510657065, "language_loss": 0.76977193, "learning_rate": 2.276987715942132e-06, "loss": 0.84699643, "num_input_tokens_seen": 168698830, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.13537598, "step": 7852, "time_per_iteration": 2.5798182487487793 }, { "auxiliary_loss_clip": 0.06449158, "auxiliary_loss_mlp": 0.01269201, "balance_loss_clip": 0.06285734, "balance_loss_mlp": 0.0125669, "epoch": 0.4721479032015632, "flos": 20674509661440.0, "grad_norm": 1.7718584623706246, "language_loss": 0.69397235, "learning_rate": 2.2766020021396696e-06, "loss": 0.77115595, "num_input_tokens_seen": 168718305, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.12524414, "step": 7853, "time_per_iteration": 2.6984148025512695 }, { "auxiliary_loss_clip": 0.06346244, "auxiliary_loss_mlp": 0.01258229, "balance_loss_clip": 0.06278497, "balance_loss_mlp": 0.01255991, "epoch": 0.47220802645423116, "flos": 67773367681920.0, "grad_norm": 0.687129697052906, "language_loss": 0.50147039, "learning_rate": 2.276216277848432e-06, "loss": 0.57751513, "num_input_tokens_seen": 168782365, "router_z_loss_clip": 0.67675781, "router_z_loss_mlp": 0.02241516, "step": 7854, "time_per_iteration": 3.369438648223877 }, { "auxiliary_loss_clip": 0.06449932, "auxiliary_loss_mlp": 0.01269826, "balance_loss_clip": 0.06284887, "balance_loss_mlp": 0.01257106, "epoch": 0.4722681497068991, "flos": 20927474737920.0, "grad_norm": 2.2857718275455126, "language_loss": 0.63777554, "learning_rate": 2.2758305430830455e-06, "loss": 0.71497309, "num_input_tokens_seen": 168800485, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.12731934, "step": 7855, "time_per_iteration": 2.633025646209717 }, { "auxiliary_loss_clip": 0.06454419, "auxiliary_loss_mlp": 0.01269681, "balance_loss_clip": 0.06288508, "balance_loss_mlp": 0.01257468, "epoch": 0.4723282729595671, "flos": 28300594584960.0, "grad_norm": 1.81568115150513, "language_loss": 0.76041496, "learning_rate": 2.2754447978581376e-06, "loss": 0.8376559, "num_input_tokens_seen": 168818965, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.12225342, "step": 7856, "time_per_iteration": 2.609815835952759 }, { "auxiliary_loss_clip": 0.06448762, "auxiliary_loss_mlp": 0.01271537, "balance_loss_clip": 0.06286461, "balance_loss_mlp": 0.0125995, "epoch": 0.4723883962122351, "flos": 27132287506560.0, "grad_norm": 1.7205167499887206, "language_loss": 0.75378156, "learning_rate": 2.2750590421883347e-06, "loss": 0.83098453, "num_input_tokens_seen": 168840355, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.1159668, "step": 7857, "time_per_iteration": 2.6201298236846924 }, { "auxiliary_loss_clip": 0.06448004, "auxiliary_loss_mlp": 0.0127234, "balance_loss_clip": 0.06288655, "balance_loss_mlp": 0.01261022, "epoch": 0.47244851946490307, "flos": 31544794241280.0, "grad_norm": 1.413776487232651, "language_loss": 0.64712775, "learning_rate": 2.2746732760882655e-06, "loss": 0.72433126, "num_input_tokens_seen": 168861765, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.11315918, "step": 7858, "time_per_iteration": 2.639646291732788 }, { "auxiliary_loss_clip": 0.06440598, "auxiliary_loss_mlp": 0.01270674, "balance_loss_clip": 0.06282426, "balance_loss_mlp": 0.0125908, "epoch": 0.47250864271757104, "flos": 20892828274560.0, "grad_norm": 1.8054917182072205, "language_loss": 0.70842761, "learning_rate": 2.2742874995725575e-06, "loss": 0.78554034, "num_input_tokens_seen": 168881310, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.1159668, "step": 7859, "time_per_iteration": 2.5648674964904785 }, { "auxiliary_loss_clip": 0.06458644, "auxiliary_loss_mlp": 0.01270862, "balance_loss_clip": 0.06288101, "balance_loss_mlp": 0.01258447, "epoch": 0.472568765970239, "flos": 20528376940800.0, "grad_norm": 2.026223561899404, "language_loss": 0.62612194, "learning_rate": 2.2739017126558413e-06, "loss": 0.70341706, "num_input_tokens_seen": 168899470, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.12420654, "step": 7860, "time_per_iteration": 2.5511510372161865 }, { "auxiliary_loss_clip": 0.06455574, "auxiliary_loss_mlp": 0.01269211, "balance_loss_clip": 0.06289066, "balance_loss_mlp": 0.01256122, "epoch": 0.47262888922290697, "flos": 35813306534400.0, "grad_norm": 2.048624710656322, "language_loss": 0.71703649, "learning_rate": 2.2735159153527445e-06, "loss": 0.79428434, "num_input_tokens_seen": 168921495, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.13085938, "step": 7861, "time_per_iteration": 2.6929433345794678 }, { "auxiliary_loss_clip": 0.06452808, "auxiliary_loss_mlp": 0.01268958, "balance_loss_clip": 0.06288865, "balance_loss_mlp": 0.0125569, "epoch": 0.47268901247557493, "flos": 20674006536960.0, "grad_norm": 1.8833967343578777, "language_loss": 0.85531223, "learning_rate": 2.273130107677896e-06, "loss": 0.93252993, "num_input_tokens_seen": 168940515, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.13256836, "step": 7862, "time_per_iteration": 2.5621962547302246 }, { "auxiliary_loss_clip": 0.06456025, "auxiliary_loss_mlp": 0.01273108, "balance_loss_clip": 0.0628975, "balance_loss_mlp": 0.01260501, "epoch": 0.4727491357282429, "flos": 19579394724480.0, "grad_norm": 1.722484170999923, "language_loss": 0.8451972, "learning_rate": 2.272744289645927e-06, "loss": 0.92248857, "num_input_tokens_seen": 168958340, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.12609863, "step": 7863, "time_per_iteration": 2.5628294944763184 }, { "auxiliary_loss_clip": 0.06449382, "auxiliary_loss_mlp": 0.01270351, "balance_loss_clip": 0.06286349, "balance_loss_mlp": 0.01257185, "epoch": 0.47280925898091086, "flos": 18222090762240.0, "grad_norm": 1.9049735621638084, "language_loss": 0.65915889, "learning_rate": 2.272358461271467e-06, "loss": 0.7363562, "num_input_tokens_seen": 168974850, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.13183594, "step": 7864, "time_per_iteration": 2.5510880947113037 }, { "auxiliary_loss_clip": 0.06452052, "auxiliary_loss_mlp": 0.01274821, "balance_loss_clip": 0.06286389, "balance_loss_mlp": 0.01262483, "epoch": 0.4728693822335788, "flos": 17827604939520.0, "grad_norm": 1.9699171863811538, "language_loss": 0.66295028, "learning_rate": 2.271972622569147e-06, "loss": 0.740219, "num_input_tokens_seen": 168992860, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.12341309, "step": 7865, "time_per_iteration": 2.556072235107422 }, { "auxiliary_loss_clip": 0.06445517, "auxiliary_loss_mlp": 0.01276228, "balance_loss_clip": 0.06285755, "balance_loss_mlp": 0.0126333, "epoch": 0.4729295054862468, "flos": 20601359447040.0, "grad_norm": 1.6568853523998788, "language_loss": 0.74194551, "learning_rate": 2.2715867735535976e-06, "loss": 0.81916296, "num_input_tokens_seen": 169010325, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.12890625, "step": 7866, "time_per_iteration": 2.567326068878174 }, { "auxiliary_loss_clip": 0.0645121, "auxiliary_loss_mlp": 0.01272346, "balance_loss_clip": 0.06284197, "balance_loss_mlp": 0.01259715, "epoch": 0.47298962873891476, "flos": 23374862392320.0, "grad_norm": 2.3146766243727166, "language_loss": 0.83949763, "learning_rate": 2.271200914239451e-06, "loss": 0.91673315, "num_input_tokens_seen": 169029840, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.12634277, "step": 7867, "time_per_iteration": 2.5912163257598877 }, { "auxiliary_loss_clip": 0.06444348, "auxiliary_loss_mlp": 0.0126974, "balance_loss_clip": 0.06283344, "balance_loss_mlp": 0.01258242, "epoch": 0.4730497519915827, "flos": 22058410095360.0, "grad_norm": 1.7117614733509823, "language_loss": 0.80073345, "learning_rate": 2.2708150446413385e-06, "loss": 0.87787437, "num_input_tokens_seen": 169049975, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.11505127, "step": 7868, "time_per_iteration": 2.6039535999298096 }, { "auxiliary_loss_clip": 0.06459747, "auxiliary_loss_mlp": 0.01270339, "balance_loss_clip": 0.06289995, "balance_loss_mlp": 0.01257584, "epoch": 0.4731098752442507, "flos": 21076165008000.0, "grad_norm": 2.4948159045149523, "language_loss": 0.75505692, "learning_rate": 2.2704291647738915e-06, "loss": 0.83235782, "num_input_tokens_seen": 169069540, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.12762451, "step": 7869, "time_per_iteration": 2.565980911254883 }, { "auxiliary_loss_clip": 0.0645353, "auxiliary_loss_mlp": 0.01283086, "balance_loss_clip": 0.06288444, "balance_loss_mlp": 0.01268948, "epoch": 0.4731699984969187, "flos": 22535395862400.0, "grad_norm": 2.5635431408791747, "language_loss": 0.73741812, "learning_rate": 2.2700432746517443e-06, "loss": 0.81478429, "num_input_tokens_seen": 169089940, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.14147949, "step": 7870, "time_per_iteration": 2.5755040645599365 }, { "auxiliary_loss_clip": 0.06465378, "auxiliary_loss_mlp": 0.01277841, "balance_loss_clip": 0.0629379, "balance_loss_mlp": 0.01264019, "epoch": 0.4732301217495867, "flos": 24904769765760.0, "grad_norm": 2.2516033068095687, "language_loss": 0.81616163, "learning_rate": 2.2696573742895292e-06, "loss": 0.89359379, "num_input_tokens_seen": 169109650, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.13830566, "step": 7871, "time_per_iteration": 2.60378360748291 }, { "auxiliary_loss_clip": 0.06454881, "auxiliary_loss_mlp": 0.01273216, "balance_loss_clip": 0.06288909, "balance_loss_mlp": 0.01259424, "epoch": 0.47329024500225464, "flos": 22791128123520.0, "grad_norm": 1.612259435406869, "language_loss": 0.76245427, "learning_rate": 2.269271463701879e-06, "loss": 0.83973527, "num_input_tokens_seen": 169128990, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.13793945, "step": 7872, "time_per_iteration": 2.705693483352661 }, { "auxiliary_loss_clip": 0.06455296, "auxiliary_loss_mlp": 0.01271484, "balance_loss_clip": 0.06286895, "balance_loss_mlp": 0.01258192, "epoch": 0.4733503682549226, "flos": 38705884531200.0, "grad_norm": 1.779306183657531, "language_loss": 0.67643631, "learning_rate": 2.268885542903428e-06, "loss": 0.75370407, "num_input_tokens_seen": 169154645, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.1328125, "step": 7873, "time_per_iteration": 2.726778984069824 }, { "auxiliary_loss_clip": 0.06450437, "auxiliary_loss_mlp": 0.01272529, "balance_loss_clip": 0.06286645, "balance_loss_mlp": 0.01259654, "epoch": 0.47341049150759057, "flos": 22973584389120.0, "grad_norm": 1.3976979669101892, "language_loss": 0.72553921, "learning_rate": 2.26849961190881e-06, "loss": 0.80276883, "num_input_tokens_seen": 169174995, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.12878418, "step": 7874, "time_per_iteration": 4.018702745437622 }, { "auxiliary_loss_clip": 0.0645886, "auxiliary_loss_mlp": 0.0127279, "balance_loss_clip": 0.06291649, "balance_loss_mlp": 0.01260285, "epoch": 0.47347061476025853, "flos": 14543769502080.0, "grad_norm": 2.262836052464273, "language_loss": 0.65753061, "learning_rate": 2.26811367073266e-06, "loss": 0.73484707, "num_input_tokens_seen": 169191815, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.12506104, "step": 7875, "time_per_iteration": 3.9815306663513184 }, { "auxiliary_loss_clip": 0.06458253, "auxiliary_loss_mlp": 0.01272761, "balance_loss_clip": 0.06292488, "balance_loss_mlp": 0.01259517, "epoch": 0.4735307380129265, "flos": 30271080326400.0, "grad_norm": 2.1284270063593262, "language_loss": 0.81334609, "learning_rate": 2.2677277193896125e-06, "loss": 0.89065629, "num_input_tokens_seen": 169210430, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.13244629, "step": 7876, "time_per_iteration": 2.6123435497283936 }, { "auxiliary_loss_clip": 0.06453971, "auxiliary_loss_mlp": 0.01274367, "balance_loss_clip": 0.06288999, "balance_loss_mlp": 0.01261468, "epoch": 0.47359086126559446, "flos": 19397148094080.0, "grad_norm": 1.9438477768983, "language_loss": 0.79480183, "learning_rate": 2.267341757894304e-06, "loss": 0.87208521, "num_input_tokens_seen": 169229295, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.12890625, "step": 7877, "time_per_iteration": 2.5318963527679443 }, { "auxiliary_loss_clip": 0.06447646, "auxiliary_loss_mlp": 0.01276898, "balance_loss_clip": 0.06284116, "balance_loss_mlp": 0.01264089, "epoch": 0.47365098451826243, "flos": 21944995194240.0, "grad_norm": 1.7407058636593074, "language_loss": 0.71279788, "learning_rate": 2.2669557862613685e-06, "loss": 0.79004335, "num_input_tokens_seen": 169247855, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.12811279, "step": 7878, "time_per_iteration": 2.5564188957214355 }, { "auxiliary_loss_clip": 0.06447187, "auxiliary_loss_mlp": 0.01274853, "balance_loss_clip": 0.06286053, "balance_loss_mlp": 0.01262283, "epoch": 0.4737111077709304, "flos": 25851571776000.0, "grad_norm": 1.5742911191841118, "language_loss": 0.75444728, "learning_rate": 2.2665698045054425e-06, "loss": 0.83166766, "num_input_tokens_seen": 169268860, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.12567139, "step": 7879, "time_per_iteration": 2.6043894290924072 }, { "auxiliary_loss_clip": 0.06330428, "auxiliary_loss_mlp": 0.01254349, "balance_loss_clip": 0.06261611, "balance_loss_mlp": 0.01252014, "epoch": 0.47377123102359836, "flos": 67779461831040.0, "grad_norm": 0.7159467612498707, "language_loss": 0.61280853, "learning_rate": 2.266183812641164e-06, "loss": 0.68865633, "num_input_tokens_seen": 169331855, "router_z_loss_clip": 0.6875, "router_z_loss_mlp": 0.02333069, "step": 7880, "time_per_iteration": 3.2007384300231934 }, { "auxiliary_loss_clip": 0.06449576, "auxiliary_loss_mlp": 0.01268404, "balance_loss_clip": 0.06286728, "balance_loss_mlp": 0.01255399, "epoch": 0.4738313542762663, "flos": 24322796432640.0, "grad_norm": 1.7583528638604295, "language_loss": 0.6841194, "learning_rate": 2.2657978106831675e-06, "loss": 0.76129919, "num_input_tokens_seen": 169352175, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.13012695, "step": 7881, "time_per_iteration": 2.5973079204559326 }, { "auxiliary_loss_clip": 0.06447649, "auxiliary_loss_mlp": 0.01268052, "balance_loss_clip": 0.06284982, "balance_loss_mlp": 0.01255767, "epoch": 0.4738914775289343, "flos": 20711797528320.0, "grad_norm": 1.8383347625753903, "language_loss": 0.77601707, "learning_rate": 2.265411798646092e-06, "loss": 0.85317409, "num_input_tokens_seen": 169371215, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.12280273, "step": 7882, "time_per_iteration": 2.556271553039551 }, { "auxiliary_loss_clip": 0.06448943, "auxiliary_loss_mlp": 0.01271669, "balance_loss_clip": 0.06286165, "balance_loss_mlp": 0.01258598, "epoch": 0.4739516007816023, "flos": 25453228665600.0, "grad_norm": 1.531889622091483, "language_loss": 0.7654984, "learning_rate": 2.2650257765445747e-06, "loss": 0.84270447, "num_input_tokens_seen": 169391745, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.1307373, "step": 7883, "time_per_iteration": 2.597294807434082 }, { "auxiliary_loss_clip": 0.06443071, "auxiliary_loss_mlp": 0.01270678, "balance_loss_clip": 0.06281304, "balance_loss_mlp": 0.01258679, "epoch": 0.4740117240342703, "flos": 19980463092480.0, "grad_norm": 1.6569398699707798, "language_loss": 0.73040247, "learning_rate": 2.2646397443932525e-06, "loss": 0.80753994, "num_input_tokens_seen": 169409845, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.11993408, "step": 7884, "time_per_iteration": 4.005574703216553 }, { "auxiliary_loss_clip": 0.06457655, "auxiliary_loss_mlp": 0.01267543, "balance_loss_clip": 0.06287099, "balance_loss_mlp": 0.01254042, "epoch": 0.47407184728693824, "flos": 15665229348480.0, "grad_norm": 1.8884578990284597, "language_loss": 0.82236576, "learning_rate": 2.2642537022067655e-06, "loss": 0.89961779, "num_input_tokens_seen": 169426085, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.13513184, "step": 7885, "time_per_iteration": 3.976118564605713 }, { "auxiliary_loss_clip": 0.06448542, "auxiliary_loss_mlp": 0.01273703, "balance_loss_clip": 0.06284361, "balance_loss_mlp": 0.01261526, "epoch": 0.4741319705396062, "flos": 18594843649920.0, "grad_norm": 2.0559475989252096, "language_loss": 0.73608905, "learning_rate": 2.263867649999751e-06, "loss": 0.81331146, "num_input_tokens_seen": 169444705, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.12176514, "step": 7886, "time_per_iteration": 2.5302929878234863 }, { "auxiliary_loss_clip": 0.06464018, "auxiliary_loss_mlp": 0.01270964, "balance_loss_clip": 0.06291598, "balance_loss_mlp": 0.01256939, "epoch": 0.47419209379227417, "flos": 13266114445440.0, "grad_norm": 2.2857959312037517, "language_loss": 0.73931122, "learning_rate": 2.263481587786849e-06, "loss": 0.816661, "num_input_tokens_seen": 169460850, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.14025879, "step": 7887, "time_per_iteration": 2.5207881927490234 }, { "auxiliary_loss_clip": 0.0644633, "auxiliary_loss_mlp": 0.01269632, "balance_loss_clip": 0.06283477, "balance_loss_mlp": 0.01257782, "epoch": 0.47425221704494214, "flos": 20049630238080.0, "grad_norm": 1.5681655421109173, "language_loss": 0.77398658, "learning_rate": 2.2630955155826993e-06, "loss": 0.85114616, "num_input_tokens_seen": 169478890, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.11853027, "step": 7888, "time_per_iteration": 2.525801658630371 }, { "auxiliary_loss_clip": 0.06451864, "auxiliary_loss_mlp": 0.01269553, "balance_loss_clip": 0.06285913, "balance_loss_mlp": 0.01257179, "epoch": 0.4743123402976101, "flos": 27279300695040.0, "grad_norm": 1.8134686186461304, "language_loss": 0.73082012, "learning_rate": 2.2627094334019406e-06, "loss": 0.8080343, "num_input_tokens_seen": 169499690, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.12371826, "step": 7889, "time_per_iteration": 2.587163209915161 }, { "auxiliary_loss_clip": 0.06333554, "auxiliary_loss_mlp": 0.01256984, "balance_loss_clip": 0.06265117, "balance_loss_mlp": 0.01254868, "epoch": 0.47437246355027807, "flos": 55410771813120.0, "grad_norm": 0.7088842927393578, "language_loss": 0.55664057, "learning_rate": 2.262323341259214e-06, "loss": 0.63254589, "num_input_tokens_seen": 169560475, "router_z_loss_clip": 0.68359375, "router_z_loss_mlp": 0.0211792, "step": 7890, "time_per_iteration": 3.240697145462036 }, { "auxiliary_loss_clip": 0.06454511, "auxiliary_loss_mlp": 0.01271627, "balance_loss_clip": 0.06288351, "balance_loss_mlp": 0.01258294, "epoch": 0.47443258680294603, "flos": 23885278738560.0, "grad_norm": 1.8878352817606654, "language_loss": 0.66417289, "learning_rate": 2.2619372391691605e-06, "loss": 0.74143428, "num_input_tokens_seen": 169580110, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.13336182, "step": 7891, "time_per_iteration": 2.5954110622406006 }, { "auxiliary_loss_clip": 0.0646098, "auxiliary_loss_mlp": 0.01271016, "balance_loss_clip": 0.06288339, "balance_loss_mlp": 0.01256466, "epoch": 0.474492710055614, "flos": 21983666653440.0, "grad_norm": 5.811911863035165, "language_loss": 0.70483005, "learning_rate": 2.26155112714642e-06, "loss": 0.78215003, "num_input_tokens_seen": 169597510, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.14538574, "step": 7892, "time_per_iteration": 2.5464417934417725 }, { "auxiliary_loss_clip": 0.06333864, "auxiliary_loss_mlp": 0.01254209, "balance_loss_clip": 0.06265916, "balance_loss_mlp": 0.01252164, "epoch": 0.47455283330828196, "flos": 62577186837120.0, "grad_norm": 0.7954462278983698, "language_loss": 0.58578146, "learning_rate": 2.2611650052056355e-06, "loss": 0.66166222, "num_input_tokens_seen": 169660010, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 0.02044678, "step": 7893, "time_per_iteration": 3.2905521392822266 }, { "auxiliary_loss_clip": 0.06449968, "auxiliary_loss_mlp": 0.01267841, "balance_loss_clip": 0.06285907, "balance_loss_mlp": 0.01256075, "epoch": 0.47461295656094993, "flos": 12098478199680.0, "grad_norm": 2.456617412071897, "language_loss": 0.7793538, "learning_rate": 2.2607788733614463e-06, "loss": 0.85653186, "num_input_tokens_seen": 169678485, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.11767578, "step": 7894, "time_per_iteration": 2.5483593940734863 }, { "auxiliary_loss_clip": 0.06451253, "auxiliary_loss_mlp": 0.01266341, "balance_loss_clip": 0.06285025, "balance_loss_mlp": 0.01254074, "epoch": 0.4746730798136179, "flos": 20890522287360.0, "grad_norm": 1.7333534324645612, "language_loss": 0.74949574, "learning_rate": 2.260392731628497e-06, "loss": 0.82667172, "num_input_tokens_seen": 169697335, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.12255859, "step": 7895, "time_per_iteration": 2.5772581100463867 }, { "auxiliary_loss_clip": 0.06451783, "auxiliary_loss_mlp": 0.01266663, "balance_loss_clip": 0.06287382, "balance_loss_mlp": 0.01254069, "epoch": 0.4747332030662859, "flos": 19981008144000.0, "grad_norm": 2.448216873108612, "language_loss": 0.82730025, "learning_rate": 2.260006580021429e-06, "loss": 0.90448475, "num_input_tokens_seen": 169715395, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.12591553, "step": 7896, "time_per_iteration": 2.54034423828125 }, { "auxiliary_loss_clip": 0.06450936, "auxiliary_loss_mlp": 0.01267916, "balance_loss_clip": 0.06286409, "balance_loss_mlp": 0.01255256, "epoch": 0.4747933263189539, "flos": 16039701244800.0, "grad_norm": 1.899205326339501, "language_loss": 0.75949347, "learning_rate": 2.259620418554886e-06, "loss": 0.83668208, "num_input_tokens_seen": 169733755, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.12658691, "step": 7897, "time_per_iteration": 2.5349907875061035 }, { "auxiliary_loss_clip": 0.06457524, "auxiliary_loss_mlp": 0.01268118, "balance_loss_clip": 0.06286393, "balance_loss_mlp": 0.01254898, "epoch": 0.47485344957162184, "flos": 13960370649600.0, "grad_norm": 2.0879859369610547, "language_loss": 0.64647025, "learning_rate": 2.25923424724351e-06, "loss": 0.72372663, "num_input_tokens_seen": 169751390, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.13232422, "step": 7898, "time_per_iteration": 2.5378365516662598 }, { "auxiliary_loss_clip": 0.06455393, "auxiliary_loss_mlp": 0.01271287, "balance_loss_clip": 0.06289472, "balance_loss_mlp": 0.01257596, "epoch": 0.4749135728242898, "flos": 20455352507520.0, "grad_norm": 2.802189318383411, "language_loss": 0.7038371, "learning_rate": 2.258848066101946e-06, "loss": 0.78110385, "num_input_tokens_seen": 169769500, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.13690186, "step": 7899, "time_per_iteration": 2.55194354057312 }, { "auxiliary_loss_clip": 0.06452774, "auxiliary_loss_mlp": 0.0126812, "balance_loss_clip": 0.06286941, "balance_loss_mlp": 0.01255228, "epoch": 0.4749736960769578, "flos": 28957604849280.0, "grad_norm": 2.502548093584722, "language_loss": 0.69007289, "learning_rate": 2.258461875144837e-06, "loss": 0.76728189, "num_input_tokens_seen": 169789215, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.12890625, "step": 7900, "time_per_iteration": 2.6336660385131836 }, { "auxiliary_loss_clip": 0.06451498, "auxiliary_loss_mlp": 0.01270001, "balance_loss_clip": 0.06286171, "balance_loss_mlp": 0.0125631, "epoch": 0.47503381932962574, "flos": 31946407660800.0, "grad_norm": 1.9163154069343709, "language_loss": 0.70670378, "learning_rate": 2.2580756743868273e-06, "loss": 0.78391874, "num_input_tokens_seen": 169808825, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.13690186, "step": 7901, "time_per_iteration": 2.744819164276123 }, { "auxiliary_loss_clip": 0.06454119, "auxiliary_loss_mlp": 0.01271865, "balance_loss_clip": 0.06289064, "balance_loss_mlp": 0.01259229, "epoch": 0.4750939425822937, "flos": 22133782442880.0, "grad_norm": 1.9022927233625915, "language_loss": 0.73976964, "learning_rate": 2.2576894638425636e-06, "loss": 0.81702948, "num_input_tokens_seen": 169827590, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.12646484, "step": 7902, "time_per_iteration": 2.627988338470459 }, { "auxiliary_loss_clip": 0.06451501, "auxiliary_loss_mlp": 0.01270059, "balance_loss_clip": 0.06287691, "balance_loss_mlp": 0.01258108, "epoch": 0.47515406583496167, "flos": 20856378948480.0, "grad_norm": 1.8787416935289631, "language_loss": 0.68974173, "learning_rate": 2.257303243526688e-06, "loss": 0.76695728, "num_input_tokens_seen": 169844925, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.11950684, "step": 7903, "time_per_iteration": 2.560182571411133 }, { "auxiliary_loss_clip": 0.06447867, "auxiliary_loss_mlp": 0.01270796, "balance_loss_clip": 0.06284909, "balance_loss_mlp": 0.01259245, "epoch": 0.47521418908762963, "flos": 17529679347840.0, "grad_norm": 1.5001429267843758, "language_loss": 0.72238755, "learning_rate": 2.256917013453848e-06, "loss": 0.7995742, "num_input_tokens_seen": 169862705, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.11541748, "step": 7904, "time_per_iteration": 2.547013998031616 }, { "auxiliary_loss_clip": 0.06448465, "auxiliary_loss_mlp": 0.01268505, "balance_loss_clip": 0.06286687, "balance_loss_mlp": 0.01257061, "epoch": 0.4752743123402976, "flos": 20565874442880.0, "grad_norm": 1.5177861961812626, "language_loss": 0.86243606, "learning_rate": 2.25653077363869e-06, "loss": 0.93960577, "num_input_tokens_seen": 169880155, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.11437988, "step": 7905, "time_per_iteration": 2.548051595687866 }, { "auxiliary_loss_clip": 0.06446274, "auxiliary_loss_mlp": 0.01272155, "balance_loss_clip": 0.06287056, "balance_loss_mlp": 0.01259906, "epoch": 0.47533443559296557, "flos": 26368025616000.0, "grad_norm": 1.6908174165033163, "language_loss": 0.82365513, "learning_rate": 2.2561445240958583e-06, "loss": 0.90083945, "num_input_tokens_seen": 169901525, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.12249756, "step": 7906, "time_per_iteration": 2.599658489227295 }, { "auxiliary_loss_clip": 0.06355584, "auxiliary_loss_mlp": 0.01253298, "balance_loss_clip": 0.0628584, "balance_loss_mlp": 0.01251182, "epoch": 0.47539455884563353, "flos": 65970118690560.0, "grad_norm": 0.6487539653368762, "language_loss": 0.58964247, "learning_rate": 2.255758264840002e-06, "loss": 0.66573125, "num_input_tokens_seen": 169970345, "router_z_loss_clip": 0.69580078, "router_z_loss_mlp": 0.0211792, "step": 7907, "time_per_iteration": 3.2884838581085205 }, { "auxiliary_loss_clip": 0.06449692, "auxiliary_loss_mlp": 0.01275253, "balance_loss_clip": 0.06286125, "balance_loss_mlp": 0.01262629, "epoch": 0.4754546820983015, "flos": 17243828743680.0, "grad_norm": 1.8833531288806273, "language_loss": 0.8121711, "learning_rate": 2.255371995885765e-06, "loss": 0.88942051, "num_input_tokens_seen": 169986440, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.1262207, "step": 7908, "time_per_iteration": 2.5306999683380127 }, { "auxiliary_loss_clip": 0.0645522, "auxiliary_loss_mlp": 0.01271396, "balance_loss_clip": 0.06290415, "balance_loss_mlp": 0.01258199, "epoch": 0.47551480535096946, "flos": 19831563187200.0, "grad_norm": 1.6252106126175587, "language_loss": 0.74387002, "learning_rate": 2.254985717247797e-06, "loss": 0.82113624, "num_input_tokens_seen": 170005705, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.13189697, "step": 7909, "time_per_iteration": 2.5725369453430176 }, { "auxiliary_loss_clip": 0.06455415, "auxiliary_loss_mlp": 0.01272152, "balance_loss_clip": 0.06289214, "balance_loss_mlp": 0.01259296, "epoch": 0.4755749286036375, "flos": 22170525258240.0, "grad_norm": 1.4707915747571898, "language_loss": 0.75373423, "learning_rate": 2.2545994289407457e-06, "loss": 0.83100992, "num_input_tokens_seen": 170023415, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.12866211, "step": 7910, "time_per_iteration": 2.5723421573638916 }, { "auxiliary_loss_clip": 0.06458396, "auxiliary_loss_mlp": 0.01270555, "balance_loss_clip": 0.06294072, "balance_loss_mlp": 0.01258449, "epoch": 0.47563505185630545, "flos": 21653945637120.0, "grad_norm": 1.6622905373730872, "language_loss": 0.7914083, "learning_rate": 2.2542131309792577e-06, "loss": 0.86869788, "num_input_tokens_seen": 170042395, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.12103271, "step": 7911, "time_per_iteration": 2.5473995208740234 }, { "auxiliary_loss_clip": 0.06463122, "auxiliary_loss_mlp": 0.01271532, "balance_loss_clip": 0.06293064, "balance_loss_mlp": 0.0125715, "epoch": 0.4756951751089734, "flos": 20634622318080.0, "grad_norm": 1.7377861279429059, "language_loss": 0.76385784, "learning_rate": 2.253826823377983e-06, "loss": 0.8412044, "num_input_tokens_seen": 170061610, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.14373779, "step": 7912, "time_per_iteration": 2.574610471725464 }, { "auxiliary_loss_clip": 0.06453215, "auxiliary_loss_mlp": 0.01273273, "balance_loss_clip": 0.06289394, "balance_loss_mlp": 0.01260512, "epoch": 0.4757552983616414, "flos": 25855932188160.0, "grad_norm": 1.4031553402950647, "language_loss": 0.74480593, "learning_rate": 2.253440506151569e-06, "loss": 0.82207084, "num_input_tokens_seen": 170083505, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.12756348, "step": 7913, "time_per_iteration": 4.07947564125061 }, { "auxiliary_loss_clip": 0.06456092, "auxiliary_loss_mlp": 0.01267439, "balance_loss_clip": 0.06291915, "balance_loss_mlp": 0.0125553, "epoch": 0.47581542161430934, "flos": 18228841015680.0, "grad_norm": 1.9765362221513993, "language_loss": 0.7202273, "learning_rate": 2.253054179314666e-06, "loss": 0.79746258, "num_input_tokens_seen": 170100690, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.11907959, "step": 7914, "time_per_iteration": 4.041851758956909 }, { "auxiliary_loss_clip": 0.06461221, "auxiliary_loss_mlp": 0.01274479, "balance_loss_clip": 0.06295, "balance_loss_mlp": 0.01261795, "epoch": 0.4758755448669773, "flos": 21586162083840.0, "grad_norm": 2.034083343945712, "language_loss": 0.65234256, "learning_rate": 2.2526678428819227e-06, "loss": 0.72969949, "num_input_tokens_seen": 170119240, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.12670898, "step": 7915, "time_per_iteration": 2.569554567337036 }, { "auxiliary_loss_clip": 0.06447407, "auxiliary_loss_mlp": 0.01268882, "balance_loss_clip": 0.06286729, "balance_loss_mlp": 0.01256622, "epoch": 0.47593566811964527, "flos": 15236474405760.0, "grad_norm": 1.9537317052374283, "language_loss": 0.77414978, "learning_rate": 2.2522814968679896e-06, "loss": 0.8513127, "num_input_tokens_seen": 170136450, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.12255859, "step": 7916, "time_per_iteration": 2.544001579284668 }, { "auxiliary_loss_clip": 0.06451632, "auxiliary_loss_mlp": 0.01270799, "balance_loss_clip": 0.06290846, "balance_loss_mlp": 0.01258509, "epoch": 0.47599579137231324, "flos": 21549628903680.0, "grad_norm": 1.7699577509948354, "language_loss": 0.64679599, "learning_rate": 2.2518951412875173e-06, "loss": 0.7240203, "num_input_tokens_seen": 170155295, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.1229248, "step": 7917, "time_per_iteration": 2.6004879474639893 }, { "auxiliary_loss_clip": 0.0635359, "auxiliary_loss_mlp": 0.01255839, "balance_loss_clip": 0.06284234, "balance_loss_mlp": 0.01253286, "epoch": 0.4760559146249812, "flos": 64573388582400.0, "grad_norm": 0.8148262034117553, "language_loss": 0.65451586, "learning_rate": 2.2515087761551557e-06, "loss": 0.73061013, "num_input_tokens_seen": 170222325, "router_z_loss_clip": 0.69482422, "router_z_loss_mlp": 0.02552795, "step": 7918, "time_per_iteration": 3.2728004455566406 }, { "auxiliary_loss_clip": 0.06451659, "auxiliary_loss_mlp": 0.01272424, "balance_loss_clip": 0.06285159, "balance_loss_mlp": 0.01260348, "epoch": 0.47611603787764917, "flos": 22239943966080.0, "grad_norm": 1.7817652956917258, "language_loss": 0.6933434, "learning_rate": 2.2511224014855563e-06, "loss": 0.77058423, "num_input_tokens_seen": 170241625, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.12072754, "step": 7919, "time_per_iteration": 2.5599536895751953 }, { "auxiliary_loss_clip": 0.06463817, "auxiliary_loss_mlp": 0.01268436, "balance_loss_clip": 0.06294437, "balance_loss_mlp": 0.01256032, "epoch": 0.47617616113031713, "flos": 22785971097600.0, "grad_norm": 1.590864833436317, "language_loss": 0.75344181, "learning_rate": 2.2507360172933694e-06, "loss": 0.83076435, "num_input_tokens_seen": 170262470, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.12402344, "step": 7920, "time_per_iteration": 2.6553077697753906 }, { "auxiliary_loss_clip": 0.06465959, "auxiliary_loss_mlp": 0.01268254, "balance_loss_clip": 0.06294974, "balance_loss_mlp": 0.01254253, "epoch": 0.4762362843829851, "flos": 24140633656320.0, "grad_norm": 1.411308677131275, "language_loss": 0.77471685, "learning_rate": 2.2503496235932487e-06, "loss": 0.85205901, "num_input_tokens_seen": 170283460, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.14001465, "step": 7921, "time_per_iteration": 2.61824893951416 }, { "auxiliary_loss_clip": 0.06455106, "auxiliary_loss_mlp": 0.01271835, "balance_loss_clip": 0.06287546, "balance_loss_mlp": 0.01257959, "epoch": 0.47629640763565306, "flos": 22458052944000.0, "grad_norm": 1.605198758571352, "language_loss": 0.78136539, "learning_rate": 2.249963220399845e-06, "loss": 0.85863477, "num_input_tokens_seen": 170304225, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.13867188, "step": 7922, "time_per_iteration": 2.6253042221069336 }, { "auxiliary_loss_clip": 0.06455456, "auxiliary_loss_mlp": 0.01266511, "balance_loss_clip": 0.06286034, "balance_loss_mlp": 0.01252802, "epoch": 0.4763565308883211, "flos": 11186071090560.0, "grad_norm": 2.0825352849445284, "language_loss": 0.73160303, "learning_rate": 2.2495768077278104e-06, "loss": 0.80882275, "num_input_tokens_seen": 170322110, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.13708496, "step": 7923, "time_per_iteration": 2.518547773361206 }, { "auxiliary_loss_clip": 0.06454001, "auxiliary_loss_mlp": 0.0126951, "balance_loss_clip": 0.06287885, "balance_loss_mlp": 0.01257559, "epoch": 0.47641665414098905, "flos": 22388634236160.0, "grad_norm": 2.2439621104049454, "language_loss": 0.81883395, "learning_rate": 2.2491903855917992e-06, "loss": 0.89606905, "num_input_tokens_seen": 170340700, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.11938477, "step": 7924, "time_per_iteration": 5.3796045780181885 }, { "auxiliary_loss_clip": 0.0646459, "auxiliary_loss_mlp": 0.01272769, "balance_loss_clip": 0.06291819, "balance_loss_mlp": 0.01257672, "epoch": 0.476476777393657, "flos": 25053166546560.0, "grad_norm": 1.736297367095354, "language_loss": 0.80922043, "learning_rate": 2.2488039540064626e-06, "loss": 0.88659394, "num_input_tokens_seen": 170359780, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.15100098, "step": 7925, "time_per_iteration": 2.6138253211975098 }, { "auxiliary_loss_clip": 0.0644765, "auxiliary_loss_mlp": 0.01270892, "balance_loss_clip": 0.06281678, "balance_loss_mlp": 0.01257576, "epoch": 0.476536900646325, "flos": 27276994707840.0, "grad_norm": 1.8429811379929637, "language_loss": 0.72384381, "learning_rate": 2.2484175129864558e-06, "loss": 0.80102926, "num_input_tokens_seen": 170381260, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.13323975, "step": 7926, "time_per_iteration": 2.64347767829895 }, { "auxiliary_loss_clip": 0.06467237, "auxiliary_loss_mlp": 0.0127229, "balance_loss_clip": 0.06295328, "balance_loss_mlp": 0.01258295, "epoch": 0.47659702389899294, "flos": 25308437610240.0, "grad_norm": 2.2458290663956872, "language_loss": 0.69105554, "learning_rate": 2.248031062546432e-06, "loss": 0.7684508, "num_input_tokens_seen": 170400595, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.13995361, "step": 7927, "time_per_iteration": 2.6620986461639404 }, { "auxiliary_loss_clip": 0.06446095, "auxiliary_loss_mlp": 0.01274603, "balance_loss_clip": 0.06283082, "balance_loss_mlp": 0.01262748, "epoch": 0.4766571471516609, "flos": 25999716994560.0, "grad_norm": 1.596520336707321, "language_loss": 0.68170214, "learning_rate": 2.247644602701045e-06, "loss": 0.75890917, "num_input_tokens_seen": 170421110, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.11871338, "step": 7928, "time_per_iteration": 2.654599189758301 }, { "auxiliary_loss_clip": 0.06454432, "auxiliary_loss_mlp": 0.01268303, "balance_loss_clip": 0.06286643, "balance_loss_mlp": 0.01255851, "epoch": 0.4767172704043289, "flos": 16037395257600.0, "grad_norm": 2.3717156293681163, "language_loss": 0.78996652, "learning_rate": 2.2472581334649496e-06, "loss": 0.86719382, "num_input_tokens_seen": 170436700, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.12457275, "step": 7929, "time_per_iteration": 2.5527400970458984 }, { "auxiliary_loss_clip": 0.06449239, "auxiliary_loss_mlp": 0.01269488, "balance_loss_clip": 0.0628499, "balance_loss_mlp": 0.0125836, "epoch": 0.47677739365699684, "flos": 39244113233280.0, "grad_norm": 1.7290343902353162, "language_loss": 0.66512346, "learning_rate": 2.2468716548528016e-06, "loss": 0.74231076, "num_input_tokens_seen": 170459555, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.11126709, "step": 7930, "time_per_iteration": 2.767446994781494 }, { "auxiliary_loss_clip": 0.0645709, "auxiliary_loss_mlp": 0.0126994, "balance_loss_clip": 0.06290743, "balance_loss_mlp": 0.01258061, "epoch": 0.4768375169096648, "flos": 24724745268480.0, "grad_norm": 1.9768285790272457, "language_loss": 0.80017805, "learning_rate": 2.2464851668792555e-06, "loss": 0.87744832, "num_input_tokens_seen": 170479175, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.11877441, "step": 7931, "time_per_iteration": 2.6183700561523438 }, { "auxiliary_loss_clip": 0.06449582, "auxiliary_loss_mlp": 0.01269817, "balance_loss_clip": 0.06282263, "balance_loss_mlp": 0.01257342, "epoch": 0.47689764016233277, "flos": 22535270081280.0, "grad_norm": 1.904250420016928, "language_loss": 0.76503372, "learning_rate": 2.2460986695589678e-06, "loss": 0.8422277, "num_input_tokens_seen": 170498450, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.12469482, "step": 7932, "time_per_iteration": 2.589245557785034 }, { "auxiliary_loss_clip": 0.06445609, "auxiliary_loss_mlp": 0.01273319, "balance_loss_clip": 0.06284671, "balance_loss_mlp": 0.01260659, "epoch": 0.47695776341500074, "flos": 15125742835200.0, "grad_norm": 1.669022109400499, "language_loss": 0.79667056, "learning_rate": 2.245712162906593e-06, "loss": 0.87385976, "num_input_tokens_seen": 170516255, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.12658691, "step": 7933, "time_per_iteration": 2.542668581008911 }, { "auxiliary_loss_clip": 0.064577, "auxiliary_loss_mlp": 0.01269602, "balance_loss_clip": 0.06285144, "balance_loss_mlp": 0.01255022, "epoch": 0.4770178866676687, "flos": 14683319677440.0, "grad_norm": 1.9683925804702074, "language_loss": 0.73987859, "learning_rate": 2.2453256469367888e-06, "loss": 0.81715161, "num_input_tokens_seen": 170532705, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.14581299, "step": 7934, "time_per_iteration": 2.55151629447937 }, { "auxiliary_loss_clip": 0.06454434, "auxiliary_loss_mlp": 0.01268491, "balance_loss_clip": 0.06287204, "balance_loss_mlp": 0.01255569, "epoch": 0.47707800992033667, "flos": 22572264458880.0, "grad_norm": 1.6786066085469575, "language_loss": 0.80371487, "learning_rate": 2.244939121664211e-06, "loss": 0.88094413, "num_input_tokens_seen": 170551925, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.12927246, "step": 7935, "time_per_iteration": 2.570788621902466 }, { "auxiliary_loss_clip": 0.06457947, "auxiliary_loss_mlp": 0.01272673, "balance_loss_clip": 0.062843, "balance_loss_mlp": 0.01259172, "epoch": 0.4771381331730047, "flos": 30925868457600.0, "grad_norm": 1.6721185821636975, "language_loss": 0.71135932, "learning_rate": 2.2445525871035177e-06, "loss": 0.78866553, "num_input_tokens_seen": 170572320, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.13494873, "step": 7936, "time_per_iteration": 2.70041823387146 }, { "auxiliary_loss_clip": 0.06456956, "auxiliary_loss_mlp": 0.01268036, "balance_loss_clip": 0.06286606, "balance_loss_mlp": 0.01255466, "epoch": 0.47719825642567265, "flos": 25745955304320.0, "grad_norm": 2.4116276873637017, "language_loss": 0.68087214, "learning_rate": 2.2441660432693656e-06, "loss": 0.75812209, "num_input_tokens_seen": 170589470, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.12567139, "step": 7937, "time_per_iteration": 2.590146780014038 }, { "auxiliary_loss_clip": 0.06332777, "auxiliary_loss_mlp": 0.0125169, "balance_loss_clip": 0.06265101, "balance_loss_mlp": 0.01249572, "epoch": 0.4772583796783406, "flos": 66376344084480.0, "grad_norm": 0.7111098040385262, "language_loss": 0.56265384, "learning_rate": 2.2437794901764128e-06, "loss": 0.63849849, "num_input_tokens_seen": 170662265, "router_z_loss_clip": 0.67871094, "router_z_loss_mlp": 0.02119446, "step": 7938, "time_per_iteration": 3.3810782432556152 }, { "auxiliary_loss_clip": 0.06448717, "auxiliary_loss_mlp": 0.01268497, "balance_loss_clip": 0.06283761, "balance_loss_mlp": 0.01254323, "epoch": 0.4773185029310086, "flos": 22057068430080.0, "grad_norm": 1.59529901992414, "language_loss": 0.8906759, "learning_rate": 2.243392927839317e-06, "loss": 0.967848, "num_input_tokens_seen": 170679680, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.14154053, "step": 7939, "time_per_iteration": 2.5798158645629883 }, { "auxiliary_loss_clip": 0.06447092, "auxiliary_loss_mlp": 0.01270111, "balance_loss_clip": 0.06282276, "balance_loss_mlp": 0.01257558, "epoch": 0.47737862618367655, "flos": 16733496251520.0, "grad_norm": 2.638940241363285, "language_loss": 0.77775735, "learning_rate": 2.2430063562727367e-06, "loss": 0.85492939, "num_input_tokens_seen": 170697340, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.12561035, "step": 7940, "time_per_iteration": 2.582197427749634 }, { "auxiliary_loss_clip": 0.06445093, "auxiliary_loss_mlp": 0.0127059, "balance_loss_clip": 0.06284477, "balance_loss_mlp": 0.01258597, "epoch": 0.4774387494363445, "flos": 19615508634240.0, "grad_norm": 1.6224475959704527, "language_loss": 0.85219014, "learning_rate": 2.2426197754913322e-06, "loss": 0.92934704, "num_input_tokens_seen": 170714905, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.11999512, "step": 7941, "time_per_iteration": 2.5427818298339844 }, { "auxiliary_loss_clip": 0.06452556, "auxiliary_loss_mlp": 0.0127372, "balance_loss_clip": 0.06284674, "balance_loss_mlp": 0.01261305, "epoch": 0.4774988726890125, "flos": 16659507496320.0, "grad_norm": 1.8788082102656123, "language_loss": 0.76239347, "learning_rate": 2.24223318550976e-06, "loss": 0.83965617, "num_input_tokens_seen": 170731810, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.12408447, "step": 7942, "time_per_iteration": 2.589092493057251 }, { "auxiliary_loss_clip": 0.06454688, "auxiliary_loss_mlp": 0.01268878, "balance_loss_clip": 0.06288643, "balance_loss_mlp": 0.01255956, "epoch": 0.47755899594168044, "flos": 20491843760640.0, "grad_norm": 1.7272810974667525, "language_loss": 0.64940262, "learning_rate": 2.241846586342682e-06, "loss": 0.72663832, "num_input_tokens_seen": 170750270, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.12921143, "step": 7943, "time_per_iteration": 2.570570945739746 }, { "auxiliary_loss_clip": 0.06453211, "auxiliary_loss_mlp": 0.01268777, "balance_loss_clip": 0.0628321, "balance_loss_mlp": 0.01254626, "epoch": 0.4776191191943484, "flos": 21659228444160.0, "grad_norm": 1.9371911342018244, "language_loss": 0.73898077, "learning_rate": 2.2414599780047577e-06, "loss": 0.81620061, "num_input_tokens_seen": 170769015, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.14147949, "step": 7944, "time_per_iteration": 2.5985910892486572 }, { "auxiliary_loss_clip": 0.06457345, "auxiliary_loss_mlp": 0.01273459, "balance_loss_clip": 0.06290948, "balance_loss_mlp": 0.01260143, "epoch": 0.4776792424470164, "flos": 18776125958400.0, "grad_norm": 1.940553673976017, "language_loss": 0.68614125, "learning_rate": 2.2410733605106456e-06, "loss": 0.76344931, "num_input_tokens_seen": 170785725, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.13330078, "step": 7945, "time_per_iteration": 2.5646722316741943 }, { "auxiliary_loss_clip": 0.06450279, "auxiliary_loss_mlp": 0.01272496, "balance_loss_clip": 0.06283396, "balance_loss_mlp": 0.01260558, "epoch": 0.47773936569968434, "flos": 29723543821440.0, "grad_norm": 1.8661012593887873, "language_loss": 0.75523686, "learning_rate": 2.240686733875009e-06, "loss": 0.83246458, "num_input_tokens_seen": 170804600, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.1194458, "step": 7946, "time_per_iteration": 2.6437950134277344 }, { "auxiliary_loss_clip": 0.06459431, "auxiliary_loss_mlp": 0.01266821, "balance_loss_clip": 0.06290236, "balance_loss_mlp": 0.01254089, "epoch": 0.4777994889523523, "flos": 24798650169600.0, "grad_norm": 1.9108628121119513, "language_loss": 0.79587698, "learning_rate": 2.240300098112506e-06, "loss": 0.8731395, "num_input_tokens_seen": 170824230, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.12738037, "step": 7947, "time_per_iteration": 2.604238986968994 }, { "auxiliary_loss_clip": 0.06447323, "auxiliary_loss_mlp": 0.01267868, "balance_loss_clip": 0.06285502, "balance_loss_mlp": 0.0125507, "epoch": 0.47785961220502027, "flos": 17863928484480.0, "grad_norm": 1.9217611767331642, "language_loss": 0.73987883, "learning_rate": 2.2399134532377998e-06, "loss": 0.81703073, "num_input_tokens_seen": 170843365, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.12786865, "step": 7948, "time_per_iteration": 2.6998960971832275 }, { "auxiliary_loss_clip": 0.0645394, "auxiliary_loss_mlp": 0.01270672, "balance_loss_clip": 0.06286722, "balance_loss_mlp": 0.01256587, "epoch": 0.4779197354576883, "flos": 20272770460800.0, "grad_norm": 1.4972058495544165, "language_loss": 0.78017139, "learning_rate": 2.2395267992655514e-06, "loss": 0.85741746, "num_input_tokens_seen": 170863515, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.14105225, "step": 7949, "time_per_iteration": 2.577305793762207 }, { "auxiliary_loss_clip": 0.06452646, "auxiliary_loss_mlp": 0.01269754, "balance_loss_clip": 0.06289412, "balance_loss_mlp": 0.01257422, "epoch": 0.47797985871035625, "flos": 17062420654080.0, "grad_norm": 2.6132156487937754, "language_loss": 0.75070024, "learning_rate": 2.2391401362104227e-06, "loss": 0.82792425, "num_input_tokens_seen": 170881245, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.12322998, "step": 7950, "time_per_iteration": 2.588397979736328 }, { "auxiliary_loss_clip": 0.06451108, "auxiliary_loss_mlp": 0.01269968, "balance_loss_clip": 0.06286967, "balance_loss_mlp": 0.0125695, "epoch": 0.4780399819630242, "flos": 31366530679680.0, "grad_norm": 2.3051234821220024, "language_loss": 0.74323153, "learning_rate": 2.2387534640870756e-06, "loss": 0.82044226, "num_input_tokens_seen": 170901285, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.13000488, "step": 7951, "time_per_iteration": 2.6649296283721924 }, { "auxiliary_loss_clip": 0.06459223, "auxiliary_loss_mlp": 0.01267297, "balance_loss_clip": 0.06290618, "balance_loss_mlp": 0.01253987, "epoch": 0.4781001052156922, "flos": 24906488774400.0, "grad_norm": 2.0477843184937723, "language_loss": 0.7999239, "learning_rate": 2.238366782910174e-06, "loss": 0.87718904, "num_input_tokens_seen": 170919740, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.13311768, "step": 7952, "time_per_iteration": 2.612657070159912 }, { "auxiliary_loss_clip": 0.0645678, "auxiliary_loss_mlp": 0.01276028, "balance_loss_clip": 0.06289095, "balance_loss_mlp": 0.01262325, "epoch": 0.47816022846836015, "flos": 18703688503680.0, "grad_norm": 1.782707036473258, "language_loss": 0.78979075, "learning_rate": 2.23798009269438e-06, "loss": 0.86711884, "num_input_tokens_seen": 170938510, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.13690186, "step": 7953, "time_per_iteration": 4.000590085983276 }, { "auxiliary_loss_clip": 0.06454986, "auxiliary_loss_mlp": 0.01271564, "balance_loss_clip": 0.06285966, "balance_loss_mlp": 0.01258391, "epoch": 0.4782203517210281, "flos": 11981289864960.0, "grad_norm": 2.762647140384183, "language_loss": 0.8409732, "learning_rate": 2.2375933934543566e-06, "loss": 0.91823876, "num_input_tokens_seen": 170951170, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.1317749, "step": 7954, "time_per_iteration": 3.9619836807250977 }, { "auxiliary_loss_clip": 0.06451699, "auxiliary_loss_mlp": 0.01268081, "balance_loss_clip": 0.06287585, "balance_loss_mlp": 0.01255707, "epoch": 0.4782804749736961, "flos": 20819761914240.0, "grad_norm": 1.4258693402633718, "language_loss": 0.70727015, "learning_rate": 2.237206685204768e-06, "loss": 0.78446794, "num_input_tokens_seen": 170970990, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.12390137, "step": 7955, "time_per_iteration": 2.54781174659729 }, { "auxiliary_loss_clip": 0.06462263, "auxiliary_loss_mlp": 0.01271638, "balance_loss_clip": 0.06295243, "balance_loss_mlp": 0.01258304, "epoch": 0.47834059822636404, "flos": 23846816914560.0, "grad_norm": 1.5688244876115383, "language_loss": 0.82392848, "learning_rate": 2.2368199679602787e-06, "loss": 0.90126747, "num_input_tokens_seen": 170991215, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.13317871, "step": 7956, "time_per_iteration": 2.6261024475097656 }, { "auxiliary_loss_clip": 0.06460369, "auxiliary_loss_mlp": 0.01268369, "balance_loss_clip": 0.06294736, "balance_loss_mlp": 0.01254219, "epoch": 0.478400721479032, "flos": 22639670668800.0, "grad_norm": 1.991431393219749, "language_loss": 0.8516894, "learning_rate": 2.2364332417355516e-06, "loss": 0.92897677, "num_input_tokens_seen": 171007325, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.14141846, "step": 7957, "time_per_iteration": 2.558272123336792 }, { "auxiliary_loss_clip": 0.06461488, "auxiliary_loss_mlp": 0.01272646, "balance_loss_clip": 0.0629722, "balance_loss_mlp": 0.0125973, "epoch": 0.4784608447317, "flos": 19361118038400.0, "grad_norm": 1.5638315261802829, "language_loss": 0.80072993, "learning_rate": 2.2360465065452527e-06, "loss": 0.87807131, "num_input_tokens_seen": 171025650, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.12908936, "step": 7958, "time_per_iteration": 2.570878744125366 }, { "auxiliary_loss_clip": 0.06464486, "auxiliary_loss_mlp": 0.01269515, "balance_loss_clip": 0.06298936, "balance_loss_mlp": 0.01256021, "epoch": 0.47852096798436794, "flos": 24027386463360.0, "grad_norm": 1.7812874494623598, "language_loss": 0.83157915, "learning_rate": 2.235659762404047e-06, "loss": 0.90891922, "num_input_tokens_seen": 171045045, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.13494873, "step": 7959, "time_per_iteration": 2.581660747528076 }, { "auxiliary_loss_clip": 0.06462386, "auxiliary_loss_mlp": 0.01271458, "balance_loss_clip": 0.06300301, "balance_loss_mlp": 0.01259853, "epoch": 0.4785810912370359, "flos": 25673559776640.0, "grad_norm": 2.8054513415158473, "language_loss": 0.73421317, "learning_rate": 2.235273009326599e-06, "loss": 0.81155163, "num_input_tokens_seen": 171062910, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.11602783, "step": 7960, "time_per_iteration": 2.636582612991333 }, { "auxiliary_loss_clip": 0.06456584, "auxiliary_loss_mlp": 0.01276609, "balance_loss_clip": 0.06293802, "balance_loss_mlp": 0.01264128, "epoch": 0.47864121448970387, "flos": 21438226500480.0, "grad_norm": 1.760523111395736, "language_loss": 0.77463096, "learning_rate": 2.2348862473275745e-06, "loss": 0.85196292, "num_input_tokens_seen": 171080875, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.12481689, "step": 7961, "time_per_iteration": 2.5661261081695557 }, { "auxiliary_loss_clip": 0.06461515, "auxiliary_loss_mlp": 0.0127058, "balance_loss_clip": 0.0629736, "balance_loss_mlp": 0.01257407, "epoch": 0.47870133774237184, "flos": 16149468493440.0, "grad_norm": 1.869602285354519, "language_loss": 0.78402412, "learning_rate": 2.2344994764216405e-06, "loss": 0.86134505, "num_input_tokens_seen": 171099190, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.13165283, "step": 7962, "time_per_iteration": 2.611381769180298 }, { "auxiliary_loss_clip": 0.06467404, "auxiliary_loss_mlp": 0.01277896, "balance_loss_clip": 0.06300934, "balance_loss_mlp": 0.01265314, "epoch": 0.47876146099503986, "flos": 26914094674560.0, "grad_norm": 2.1298100312847135, "language_loss": 0.65200877, "learning_rate": 2.2341126966234635e-06, "loss": 0.72946185, "num_input_tokens_seen": 171119060, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.12585449, "step": 7963, "time_per_iteration": 5.494138717651367 }, { "auxiliary_loss_clip": 0.06466139, "auxiliary_loss_mlp": 0.01269967, "balance_loss_clip": 0.06299442, "balance_loss_mlp": 0.012577, "epoch": 0.4788215842477078, "flos": 45342470989440.0, "grad_norm": 1.9649783837835302, "language_loss": 0.78155208, "learning_rate": 2.2337259079477083e-06, "loss": 0.85891318, "num_input_tokens_seen": 171141900, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.12268066, "step": 7964, "time_per_iteration": 2.7931771278381348 }, { "auxiliary_loss_clip": 0.06476964, "auxiliary_loss_mlp": 0.01272792, "balance_loss_clip": 0.06303802, "balance_loss_mlp": 0.01258212, "epoch": 0.4788817075003758, "flos": 22243801253760.0, "grad_norm": 1.9338862439107807, "language_loss": 0.76914334, "learning_rate": 2.233339110409044e-06, "loss": 0.84664094, "num_input_tokens_seen": 171161045, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.14569092, "step": 7965, "time_per_iteration": 2.6664884090423584 }, { "auxiliary_loss_clip": 0.06462123, "auxiliary_loss_mlp": 0.01268138, "balance_loss_clip": 0.06294227, "balance_loss_mlp": 0.01256211, "epoch": 0.47894183075304375, "flos": 16476631960320.0, "grad_norm": 1.506467717917894, "language_loss": 0.7504797, "learning_rate": 2.232952304022137e-06, "loss": 0.82778221, "num_input_tokens_seen": 171179675, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.1192627, "step": 7966, "time_per_iteration": 2.5960371494293213 }, { "auxiliary_loss_clip": 0.06462239, "auxiliary_loss_mlp": 0.01267626, "balance_loss_clip": 0.06297477, "balance_loss_mlp": 0.0125527, "epoch": 0.4790019540057117, "flos": 24290036686080.0, "grad_norm": 1.573589000385092, "language_loss": 0.72937322, "learning_rate": 2.232565488801655e-06, "loss": 0.80667186, "num_input_tokens_seen": 171201175, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.12359619, "step": 7967, "time_per_iteration": 2.600935935974121 }, { "auxiliary_loss_clip": 0.06451069, "auxiliary_loss_mlp": 0.01267784, "balance_loss_clip": 0.06293499, "balance_loss_mlp": 0.01254939, "epoch": 0.4790620772583797, "flos": 25673601703680.0, "grad_norm": 1.6206384587263742, "language_loss": 0.7941466, "learning_rate": 2.232178664762267e-06, "loss": 0.87133515, "num_input_tokens_seen": 171221750, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.12835693, "step": 7968, "time_per_iteration": 2.6647324562072754 }, { "auxiliary_loss_clip": 0.06344677, "auxiliary_loss_mlp": 0.01263897, "balance_loss_clip": 0.0627528, "balance_loss_mlp": 0.01261622, "epoch": 0.47912220051104765, "flos": 69451168711680.0, "grad_norm": 0.7392960038113542, "language_loss": 0.62167406, "learning_rate": 2.2317918319186408e-06, "loss": 0.69775981, "num_input_tokens_seen": 171292235, "router_z_loss_clip": 0.69433594, "router_z_loss_mlp": 0.02276611, "step": 7969, "time_per_iteration": 3.32180118560791 }, { "auxiliary_loss_clip": 0.06454036, "auxiliary_loss_mlp": 0.01270447, "balance_loss_clip": 0.06292391, "balance_loss_mlp": 0.01257602, "epoch": 0.4791823237637156, "flos": 24175531681920.0, "grad_norm": 1.528735101940771, "language_loss": 0.7745049, "learning_rate": 2.2314049902854446e-06, "loss": 0.85174966, "num_input_tokens_seen": 171312215, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.12835693, "step": 7970, "time_per_iteration": 2.5940160751342773 }, { "auxiliary_loss_clip": 0.0645107, "auxiliary_loss_mlp": 0.01268646, "balance_loss_clip": 0.06286316, "balance_loss_mlp": 0.01255568, "epoch": 0.4792424470163836, "flos": 24757966212480.0, "grad_norm": 1.6410134599762531, "language_loss": 0.70875907, "learning_rate": 2.231018139877349e-06, "loss": 0.7859562, "num_input_tokens_seen": 171332975, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.13085938, "step": 7971, "time_per_iteration": 2.604349374771118 }, { "auxiliary_loss_clip": 0.06451546, "auxiliary_loss_mlp": 0.01272386, "balance_loss_clip": 0.06288989, "balance_loss_mlp": 0.01259618, "epoch": 0.47930257026905154, "flos": 23264550092160.0, "grad_norm": 1.3815999725240018, "language_loss": 0.80028564, "learning_rate": 2.230631280709021e-06, "loss": 0.87752497, "num_input_tokens_seen": 171353880, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.12774658, "step": 7972, "time_per_iteration": 2.626377820968628 }, { "auxiliary_loss_clip": 0.06458507, "auxiliary_loss_mlp": 0.01268822, "balance_loss_clip": 0.06290165, "balance_loss_mlp": 0.0125559, "epoch": 0.4793626935217195, "flos": 14069299357440.0, "grad_norm": 2.0716091991524594, "language_loss": 0.70235622, "learning_rate": 2.2302444127951327e-06, "loss": 0.77962953, "num_input_tokens_seen": 171370930, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.13238525, "step": 7973, "time_per_iteration": 2.5337910652160645 }, { "auxiliary_loss_clip": 0.06454603, "auxiliary_loss_mlp": 0.01270057, "balance_loss_clip": 0.06294234, "balance_loss_mlp": 0.01257767, "epoch": 0.4794228167743875, "flos": 21805319237760.0, "grad_norm": 2.0056778334885625, "language_loss": 0.79218411, "learning_rate": 2.2298575361503523e-06, "loss": 0.86943078, "num_input_tokens_seen": 171387575, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.1229248, "step": 7974, "time_per_iteration": 2.6319310665130615 }, { "auxiliary_loss_clip": 0.06340657, "auxiliary_loss_mlp": 0.01255247, "balance_loss_clip": 0.06272194, "balance_loss_mlp": 0.01252779, "epoch": 0.47948294002705544, "flos": 66989022739200.0, "grad_norm": 0.7611583333554838, "language_loss": 0.53907251, "learning_rate": 2.2294706507893517e-06, "loss": 0.6150316, "num_input_tokens_seen": 171449980, "router_z_loss_clip": 0.68505859, "router_z_loss_mlp": 0.0246582, "step": 7975, "time_per_iteration": 3.2168772220611572 }, { "auxiliary_loss_clip": 0.06468348, "auxiliary_loss_mlp": 0.01271957, "balance_loss_clip": 0.0629632, "balance_loss_mlp": 0.01257151, "epoch": 0.47954306327972346, "flos": 12427444529280.0, "grad_norm": 2.266494561778939, "language_loss": 0.90213108, "learning_rate": 2.2290837567268008e-06, "loss": 0.97953415, "num_input_tokens_seen": 171465290, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.14788818, "step": 7976, "time_per_iteration": 2.56132435798645 }, { "auxiliary_loss_clip": 0.06464424, "auxiliary_loss_mlp": 0.01270193, "balance_loss_clip": 0.06292588, "balance_loss_mlp": 0.01256049, "epoch": 0.4796031865323914, "flos": 18366630255360.0, "grad_norm": 2.5056551990998566, "language_loss": 0.74424005, "learning_rate": 2.2286968539773713e-06, "loss": 0.82158625, "num_input_tokens_seen": 171481130, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.14147949, "step": 7977, "time_per_iteration": 2.5575926303863525 }, { "auxiliary_loss_clip": 0.06446245, "auxiliary_loss_mlp": 0.01266703, "balance_loss_clip": 0.06285074, "balance_loss_mlp": 0.01254627, "epoch": 0.4796633097850594, "flos": 21841517001600.0, "grad_norm": 1.598216011154131, "language_loss": 0.7863577, "learning_rate": 2.228309942555734e-06, "loss": 0.86348724, "num_input_tokens_seen": 171501140, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.12072754, "step": 7978, "time_per_iteration": 2.608781099319458 }, { "auxiliary_loss_clip": 0.06450979, "auxiliary_loss_mlp": 0.01272854, "balance_loss_clip": 0.06285187, "balance_loss_mlp": 0.01259628, "epoch": 0.47972343303772735, "flos": 23443526413440.0, "grad_norm": 1.8219306471235799, "language_loss": 0.89571208, "learning_rate": 2.22792302247656e-06, "loss": 0.9729504, "num_input_tokens_seen": 171519835, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.13220215, "step": 7979, "time_per_iteration": 2.587339162826538 }, { "auxiliary_loss_clip": 0.06458129, "auxiliary_loss_mlp": 0.01272982, "balance_loss_clip": 0.06289982, "balance_loss_mlp": 0.01259392, "epoch": 0.4797835562903953, "flos": 24906698409600.0, "grad_norm": 1.491653870611273, "language_loss": 0.7691521, "learning_rate": 2.227536093754523e-06, "loss": 0.8464632, "num_input_tokens_seen": 171540980, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.13598633, "step": 7980, "time_per_iteration": 2.629566192626953 }, { "auxiliary_loss_clip": 0.06465205, "auxiliary_loss_mlp": 0.01275752, "balance_loss_clip": 0.06292579, "balance_loss_mlp": 0.01261107, "epoch": 0.4798436795430633, "flos": 35051644120320.0, "grad_norm": 10.40554508207482, "language_loss": 0.71694994, "learning_rate": 2.227149156404295e-06, "loss": 0.79435956, "num_input_tokens_seen": 171563600, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.14642334, "step": 7981, "time_per_iteration": 2.736785650253296 }, { "auxiliary_loss_clip": 0.06457273, "auxiliary_loss_mlp": 0.01271592, "balance_loss_clip": 0.06294388, "balance_loss_mlp": 0.012583, "epoch": 0.47990380279573125, "flos": 20595699296640.0, "grad_norm": 1.7446886499434744, "language_loss": 0.70661539, "learning_rate": 2.2267622104405473e-06, "loss": 0.78390408, "num_input_tokens_seen": 171580700, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.13293457, "step": 7982, "time_per_iteration": 2.563850164413452 }, { "auxiliary_loss_clip": 0.06447753, "auxiliary_loss_mlp": 0.01269447, "balance_loss_clip": 0.06288929, "balance_loss_mlp": 0.01258051, "epoch": 0.4799639260483992, "flos": 26366600096640.0, "grad_norm": 1.529121870386396, "language_loss": 0.70870543, "learning_rate": 2.2263752558779544e-06, "loss": 0.78587747, "num_input_tokens_seen": 171602035, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.11401367, "step": 7983, "time_per_iteration": 2.611234664916992 }, { "auxiliary_loss_clip": 0.06344629, "auxiliary_loss_mlp": 0.01257117, "balance_loss_clip": 0.06275927, "balance_loss_mlp": 0.01254735, "epoch": 0.4800240493010672, "flos": 70999371002880.0, "grad_norm": 0.7860128503609313, "language_loss": 0.59271026, "learning_rate": 2.2259882927311883e-06, "loss": 0.66872776, "num_input_tokens_seen": 171659215, "router_z_loss_clip": 0.6875, "router_z_loss_mlp": 0.02378845, "step": 7984, "time_per_iteration": 3.12739896774292 }, { "auxiliary_loss_clip": 0.06451374, "auxiliary_loss_mlp": 0.0127612, "balance_loss_clip": 0.06288438, "balance_loss_mlp": 0.01263836, "epoch": 0.48008417255373514, "flos": 17091406967040.0, "grad_norm": 2.4136259938537616, "language_loss": 0.67353177, "learning_rate": 2.2256013210149247e-06, "loss": 0.75080669, "num_input_tokens_seen": 171675710, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.1229248, "step": 7985, "time_per_iteration": 2.547137498855591 }, { "auxiliary_loss_clip": 0.06456519, "auxiliary_loss_mlp": 0.01270898, "balance_loss_clip": 0.06286716, "balance_loss_mlp": 0.01256921, "epoch": 0.4801442958064031, "flos": 15418762963200.0, "grad_norm": 1.648746388527835, "language_loss": 0.70673585, "learning_rate": 2.225214340743835e-06, "loss": 0.78400993, "num_input_tokens_seen": 171692510, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.13970947, "step": 7986, "time_per_iteration": 2.543524980545044 }, { "auxiliary_loss_clip": 0.06459793, "auxiliary_loss_mlp": 0.01274997, "balance_loss_clip": 0.06290323, "balance_loss_mlp": 0.01261437, "epoch": 0.4802044190590711, "flos": 11478546167040.0, "grad_norm": 2.009798681132341, "language_loss": 0.79408413, "learning_rate": 2.2248273519325956e-06, "loss": 0.87143207, "num_input_tokens_seen": 171710235, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.13555908, "step": 7987, "time_per_iteration": 2.5523531436920166 }, { "auxiliary_loss_clip": 0.06459221, "auxiliary_loss_mlp": 0.0127397, "balance_loss_clip": 0.06292558, "balance_loss_mlp": 0.01261423, "epoch": 0.48026454231173904, "flos": 20955874072320.0, "grad_norm": 2.0275115278259146, "language_loss": 0.75463796, "learning_rate": 2.2244403545958812e-06, "loss": 0.83196992, "num_input_tokens_seen": 171726715, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.12548828, "step": 7988, "time_per_iteration": 2.5773518085479736 }, { "auxiliary_loss_clip": 0.06454148, "auxiliary_loss_mlp": 0.01266096, "balance_loss_clip": 0.06288867, "balance_loss_mlp": 0.01253764, "epoch": 0.48032466556440706, "flos": 20454220477440.0, "grad_norm": 1.923391757890596, "language_loss": 0.79573548, "learning_rate": 2.224053348748365e-06, "loss": 0.87293792, "num_input_tokens_seen": 171743605, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.12322998, "step": 7989, "time_per_iteration": 2.5671331882476807 }, { "auxiliary_loss_clip": 0.06460454, "auxiliary_loss_mlp": 0.01270761, "balance_loss_clip": 0.06288498, "balance_loss_mlp": 0.01257034, "epoch": 0.480384788817075, "flos": 37129507269120.0, "grad_norm": 1.6088366771317177, "language_loss": 0.73551559, "learning_rate": 2.223666334404724e-06, "loss": 0.81282777, "num_input_tokens_seen": 171765445, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.13726807, "step": 7990, "time_per_iteration": 2.748809814453125 }, { "auxiliary_loss_clip": 0.06343245, "auxiliary_loss_mlp": 0.01254166, "balance_loss_clip": 0.06275713, "balance_loss_mlp": 0.01251611, "epoch": 0.480444912069743, "flos": 69572103281280.0, "grad_norm": 0.861266531437708, "language_loss": 0.59019119, "learning_rate": 2.223279311579633e-06, "loss": 0.66616529, "num_input_tokens_seen": 171830115, "router_z_loss_clip": 0.67529297, "router_z_loss_mlp": 0.02555847, "step": 7991, "time_per_iteration": 3.263059377670288 }, { "auxiliary_loss_clip": 0.06456065, "auxiliary_loss_mlp": 0.01271702, "balance_loss_clip": 0.06291223, "balance_loss_mlp": 0.01258577, "epoch": 0.48050503532241096, "flos": 29829453782400.0, "grad_norm": 2.4031474453931865, "language_loss": 0.6765461, "learning_rate": 2.222892280287768e-06, "loss": 0.75382376, "num_input_tokens_seen": 171849135, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.13134766, "step": 7992, "time_per_iteration": 4.151284694671631 }, { "auxiliary_loss_clip": 0.06454146, "auxiliary_loss_mlp": 0.01272746, "balance_loss_clip": 0.06286651, "balance_loss_mlp": 0.01260122, "epoch": 0.4805651585750789, "flos": 23954865154560.0, "grad_norm": 1.749180275221703, "language_loss": 0.76542342, "learning_rate": 2.2225052405438056e-06, "loss": 0.84269238, "num_input_tokens_seen": 171868880, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.12628174, "step": 7993, "time_per_iteration": 4.0672430992126465 }, { "auxiliary_loss_clip": 0.06450851, "auxiliary_loss_mlp": 0.01267125, "balance_loss_clip": 0.06289735, "balance_loss_mlp": 0.0125487, "epoch": 0.4806252818277469, "flos": 25672385819520.0, "grad_norm": 1.7296883235390768, "language_loss": 0.78839695, "learning_rate": 2.222118192362422e-06, "loss": 0.86557668, "num_input_tokens_seen": 171889455, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.12268066, "step": 7994, "time_per_iteration": 2.728633403778076 }, { "auxiliary_loss_clip": 0.06453437, "auxiliary_loss_mlp": 0.01267901, "balance_loss_clip": 0.06288654, "balance_loss_mlp": 0.01254943, "epoch": 0.48068540508041485, "flos": 13157059956480.0, "grad_norm": 1.7898115900879323, "language_loss": 0.79695076, "learning_rate": 2.2217311357582946e-06, "loss": 0.8741641, "num_input_tokens_seen": 171906070, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.1295166, "step": 7995, "time_per_iteration": 2.637951374053955 }, { "auxiliary_loss_clip": 0.06448068, "auxiliary_loss_mlp": 0.01271919, "balance_loss_clip": 0.06286247, "balance_loss_mlp": 0.0125908, "epoch": 0.4807455283330828, "flos": 21182787728640.0, "grad_norm": 1.8831294931661775, "language_loss": 0.8313576, "learning_rate": 2.2213440707461e-06, "loss": 0.90855753, "num_input_tokens_seen": 171926515, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.128479, "step": 7996, "time_per_iteration": 2.618993043899536 }, { "auxiliary_loss_clip": 0.06447092, "auxiliary_loss_mlp": 0.01273877, "balance_loss_clip": 0.06288239, "balance_loss_mlp": 0.01261492, "epoch": 0.4808056515857508, "flos": 12280850611200.0, "grad_norm": 1.5473661687055755, "language_loss": 0.80632955, "learning_rate": 2.220956997340516e-06, "loss": 0.88353926, "num_input_tokens_seen": 171943845, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.1239624, "step": 7997, "time_per_iteration": 2.572049617767334 }, { "auxiliary_loss_clip": 0.06457793, "auxiliary_loss_mlp": 0.0127051, "balance_loss_clip": 0.06291252, "balance_loss_mlp": 0.01257302, "epoch": 0.48086577483841875, "flos": 24832835435520.0, "grad_norm": 1.7259048119661953, "language_loss": 0.7266413, "learning_rate": 2.220569915556221e-06, "loss": 0.80392432, "num_input_tokens_seen": 171964970, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.13208008, "step": 7998, "time_per_iteration": 2.645185947418213 }, { "auxiliary_loss_clip": 0.06446582, "auxiliary_loss_mlp": 0.0127013, "balance_loss_clip": 0.06284644, "balance_loss_mlp": 0.01257655, "epoch": 0.4809258980910867, "flos": 24472786440960.0, "grad_norm": 1.62172978210238, "language_loss": 0.70697916, "learning_rate": 2.220182825407892e-06, "loss": 0.78414637, "num_input_tokens_seen": 171986340, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.12475586, "step": 7999, "time_per_iteration": 2.6235697269439697 }, { "auxiliary_loss_clip": 0.06457897, "auxiliary_loss_mlp": 0.01267613, "balance_loss_clip": 0.06289673, "balance_loss_mlp": 0.01254608, "epoch": 0.4809860213437547, "flos": 21222465436800.0, "grad_norm": 1.465432735462145, "language_loss": 0.7206409, "learning_rate": 2.2197957269102083e-06, "loss": 0.79789597, "num_input_tokens_seen": 172007300, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.13024902, "step": 8000, "time_per_iteration": 2.6400113105773926 }, { "auxiliary_loss_clip": 0.06457955, "auxiliary_loss_mlp": 0.01269022, "balance_loss_clip": 0.06293939, "balance_loss_mlp": 0.01256463, "epoch": 0.48104614459642264, "flos": 37640929864320.0, "grad_norm": 1.3420313253915515, "language_loss": 0.75026673, "learning_rate": 2.2194086200778485e-06, "loss": 0.82753646, "num_input_tokens_seen": 172029585, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.12548828, "step": 8001, "time_per_iteration": 2.7028298377990723 }, { "auxiliary_loss_clip": 0.06453814, "auxiliary_loss_mlp": 0.01270445, "balance_loss_clip": 0.06286764, "balance_loss_mlp": 0.01257571, "epoch": 0.48110626784909066, "flos": 18412093895040.0, "grad_norm": 1.623007403905374, "language_loss": 0.81761801, "learning_rate": 2.219021504925493e-06, "loss": 0.89486063, "num_input_tokens_seen": 172047495, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.12878418, "step": 8002, "time_per_iteration": 2.6256611347198486 }, { "auxiliary_loss_clip": 0.06457414, "auxiliary_loss_mlp": 0.01267921, "balance_loss_clip": 0.06288361, "balance_loss_mlp": 0.01254486, "epoch": 0.48116639110175863, "flos": 28447481992320.0, "grad_norm": 1.6088143712400333, "language_loss": 0.7145471, "learning_rate": 2.218634381467819e-06, "loss": 0.79180044, "num_input_tokens_seen": 172067625, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.13433838, "step": 8003, "time_per_iteration": 4.08315110206604 }, { "auxiliary_loss_clip": 0.06450307, "auxiliary_loss_mlp": 0.01267277, "balance_loss_clip": 0.06290783, "balance_loss_mlp": 0.01255833, "epoch": 0.4812265143544266, "flos": 21731582044800.0, "grad_norm": 1.5495155231618576, "language_loss": 0.82539129, "learning_rate": 2.218247249719507e-06, "loss": 0.90256715, "num_input_tokens_seen": 172087885, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.11456299, "step": 8004, "time_per_iteration": 2.6335256099700928 }, { "auxiliary_loss_clip": 0.06473213, "auxiliary_loss_mlp": 0.01277456, "balance_loss_clip": 0.0629846, "balance_loss_mlp": 0.01262447, "epoch": 0.48128663760709456, "flos": 13229707046400.0, "grad_norm": 6.317963005097503, "language_loss": 0.78035825, "learning_rate": 2.217860109695239e-06, "loss": 0.85786498, "num_input_tokens_seen": 172105815, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.15008545, "step": 8005, "time_per_iteration": 2.5532867908477783 }, { "auxiliary_loss_clip": 0.06458502, "auxiliary_loss_mlp": 0.01266191, "balance_loss_clip": 0.06289709, "balance_loss_mlp": 0.01254032, "epoch": 0.4813467608597625, "flos": 24250317050880.0, "grad_norm": 2.590043724101717, "language_loss": 0.7129997, "learning_rate": 2.217472961409692e-06, "loss": 0.79024661, "num_input_tokens_seen": 172126125, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.12145996, "step": 8006, "time_per_iteration": 2.6461570262908936 }, { "auxiliary_loss_clip": 0.06458669, "auxiliary_loss_mlp": 0.01269802, "balance_loss_clip": 0.06292356, "balance_loss_mlp": 0.01256552, "epoch": 0.4814068841124305, "flos": 27486131299200.0, "grad_norm": 1.8259589101446818, "language_loss": 0.71287, "learning_rate": 2.2170858048775495e-06, "loss": 0.7901547, "num_input_tokens_seen": 172141945, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.13244629, "step": 8007, "time_per_iteration": 2.6086738109588623 }, { "auxiliary_loss_clip": 0.06456898, "auxiliary_loss_mlp": 0.0127237, "balance_loss_clip": 0.06288264, "balance_loss_mlp": 0.01259424, "epoch": 0.48146700736509845, "flos": 19578933527040.0, "grad_norm": 1.9479024945219903, "language_loss": 0.7219277, "learning_rate": 2.2166986401134914e-06, "loss": 0.79922038, "num_input_tokens_seen": 172161095, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.1295166, "step": 8008, "time_per_iteration": 2.5972931385040283 }, { "auxiliary_loss_clip": 0.06452988, "auxiliary_loss_mlp": 0.01269714, "balance_loss_clip": 0.06287306, "balance_loss_mlp": 0.01256511, "epoch": 0.4815271306177664, "flos": 20633448360960.0, "grad_norm": 2.538212607219672, "language_loss": 0.61391962, "learning_rate": 2.216311467132199e-06, "loss": 0.69114667, "num_input_tokens_seen": 172178750, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.13195801, "step": 8009, "time_per_iteration": 2.560185670852661 }, { "auxiliary_loss_clip": 0.06344771, "auxiliary_loss_mlp": 0.0125467, "balance_loss_clip": 0.0627771, "balance_loss_mlp": 0.01252362, "epoch": 0.4815872538704344, "flos": 67710168904320.0, "grad_norm": 0.8535749750164566, "language_loss": 0.61131275, "learning_rate": 2.2159242859483547e-06, "loss": 0.68730718, "num_input_tokens_seen": 172240235, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 0.02305603, "step": 8010, "time_per_iteration": 3.2195072174072266 }, { "auxiliary_loss_clip": 0.06451444, "auxiliary_loss_mlp": 0.0126813, "balance_loss_clip": 0.0628694, "balance_loss_mlp": 0.0125488, "epoch": 0.48164737712310235, "flos": 22827451668480.0, "grad_norm": 1.7074626441883287, "language_loss": 0.73864138, "learning_rate": 2.215537096576639e-06, "loss": 0.81583714, "num_input_tokens_seen": 172259875, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.13244629, "step": 8011, "time_per_iteration": 2.6089248657226562 }, { "auxiliary_loss_clip": 0.06446607, "auxiliary_loss_mlp": 0.01270923, "balance_loss_clip": 0.06287627, "balance_loss_mlp": 0.01259139, "epoch": 0.4817075003757703, "flos": 23740865026560.0, "grad_norm": 1.6172109336761529, "language_loss": 0.79427946, "learning_rate": 2.2151498990317354e-06, "loss": 0.87145483, "num_input_tokens_seen": 172280150, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.11785889, "step": 8012, "time_per_iteration": 2.708083152770996 }, { "auxiliary_loss_clip": 0.06449411, "auxiliary_loss_mlp": 0.01273335, "balance_loss_clip": 0.06286611, "balance_loss_mlp": 0.01259804, "epoch": 0.4817676236284383, "flos": 28190282284800.0, "grad_norm": 1.593909327611315, "language_loss": 0.74348271, "learning_rate": 2.214762693328326e-06, "loss": 0.82071018, "num_input_tokens_seen": 172300810, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.13531494, "step": 8013, "time_per_iteration": 2.668891429901123 }, { "auxiliary_loss_clip": 0.06445782, "auxiliary_loss_mlp": 0.01265601, "balance_loss_clip": 0.06285959, "balance_loss_mlp": 0.01253924, "epoch": 0.48182774688110624, "flos": 17097360606720.0, "grad_norm": 2.1297548097493, "language_loss": 0.90930068, "learning_rate": 2.214375479481094e-06, "loss": 0.98641455, "num_input_tokens_seen": 172317930, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.11676025, "step": 8014, "time_per_iteration": 2.566425085067749 }, { "auxiliary_loss_clip": 0.06453837, "auxiliary_loss_mlp": 0.01271644, "balance_loss_clip": 0.06285338, "balance_loss_mlp": 0.01258251, "epoch": 0.4818878701337742, "flos": 12572780636160.0, "grad_norm": 2.0523459072861328, "language_loss": 0.75142717, "learning_rate": 2.213988257504722e-06, "loss": 0.82868207, "num_input_tokens_seen": 172336340, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.1340332, "step": 8015, "time_per_iteration": 2.550701141357422 }, { "auxiliary_loss_clip": 0.06457065, "auxiliary_loss_mlp": 0.01268597, "balance_loss_clip": 0.06286782, "balance_loss_mlp": 0.01255365, "epoch": 0.48194799338644223, "flos": 24615481144320.0, "grad_norm": 2.191008502482909, "language_loss": 0.80564082, "learning_rate": 2.213601027413894e-06, "loss": 0.88289744, "num_input_tokens_seen": 172354315, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.13244629, "step": 8016, "time_per_iteration": 2.608806610107422 }, { "auxiliary_loss_clip": 0.06442698, "auxiliary_loss_mlp": 0.01268099, "balance_loss_clip": 0.06285462, "balance_loss_mlp": 0.01257126, "epoch": 0.4820081166391102, "flos": 21111482304000.0, "grad_norm": 1.9596914348756749, "language_loss": 0.77778858, "learning_rate": 2.2132137892232933e-06, "loss": 0.85489655, "num_input_tokens_seen": 172372695, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.10961914, "step": 8017, "time_per_iteration": 2.5727272033691406 }, { "auxiliary_loss_clip": 0.0644251, "auxiliary_loss_mlp": 0.01269197, "balance_loss_clip": 0.06282975, "balance_loss_mlp": 0.0125674, "epoch": 0.48206823989177816, "flos": 25271569013760.0, "grad_norm": 1.7914963248408915, "language_loss": 0.79915047, "learning_rate": 2.2128265429476043e-06, "loss": 0.87626755, "num_input_tokens_seen": 172390905, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.12457275, "step": 8018, "time_per_iteration": 2.5974338054656982 }, { "auxiliary_loss_clip": 0.06442858, "auxiliary_loss_mlp": 0.01269838, "balance_loss_clip": 0.06277692, "balance_loss_mlp": 0.01257237, "epoch": 0.4821283631444461, "flos": 24652056251520.0, "grad_norm": 1.624851752167559, "language_loss": 0.76557118, "learning_rate": 2.2124392886015124e-06, "loss": 0.84269816, "num_input_tokens_seen": 172412295, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.12597656, "step": 8019, "time_per_iteration": 2.605975866317749 }, { "auxiliary_loss_clip": 0.06439625, "auxiliary_loss_mlp": 0.01270921, "balance_loss_clip": 0.06276203, "balance_loss_mlp": 0.01258702, "epoch": 0.4821884863971141, "flos": 23959015931520.0, "grad_norm": 2.0980520684494066, "language_loss": 0.79552436, "learning_rate": 2.212052026199701e-06, "loss": 0.87262988, "num_input_tokens_seen": 172432625, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.12200928, "step": 8020, "time_per_iteration": 2.6268835067749023 }, { "auxiliary_loss_clip": 0.06434622, "auxiliary_loss_mlp": 0.01267487, "balance_loss_clip": 0.06276453, "balance_loss_mlp": 0.01254451, "epoch": 0.48224860964978206, "flos": 17165605357440.0, "grad_norm": 1.9493128741970394, "language_loss": 0.70095736, "learning_rate": 2.211664755756855e-06, "loss": 0.77797848, "num_input_tokens_seen": 172450010, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.13049316, "step": 8021, "time_per_iteration": 2.5720274448394775 }, { "auxiliary_loss_clip": 0.06450893, "auxiliary_loss_mlp": 0.01269493, "balance_loss_clip": 0.06282733, "balance_loss_mlp": 0.01256517, "epoch": 0.48230873290245, "flos": 23082513096960.0, "grad_norm": 1.8558958522684381, "language_loss": 0.63001442, "learning_rate": 2.2112774772876603e-06, "loss": 0.70721829, "num_input_tokens_seen": 172469080, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.12976074, "step": 8022, "time_per_iteration": 2.584900379180908 }, { "auxiliary_loss_clip": 0.0643581, "auxiliary_loss_mlp": 0.01269972, "balance_loss_clip": 0.06277104, "balance_loss_mlp": 0.01258129, "epoch": 0.482368856155118, "flos": 19359440956800.0, "grad_norm": 2.1638401888077268, "language_loss": 0.66796249, "learning_rate": 2.2108901908068028e-06, "loss": 0.74502027, "num_input_tokens_seen": 172484850, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.11853027, "step": 8023, "time_per_iteration": 2.5741777420043945 }, { "auxiliary_loss_clip": 0.0644109, "auxiliary_loss_mlp": 0.0127113, "balance_loss_clip": 0.06278864, "balance_loss_mlp": 0.01258756, "epoch": 0.48242897940778595, "flos": 20084318628480.0, "grad_norm": 1.625550029523146, "language_loss": 0.7673555, "learning_rate": 2.2105028963289683e-06, "loss": 0.84447765, "num_input_tokens_seen": 172503525, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.12371826, "step": 8024, "time_per_iteration": 2.588839292526245 }, { "auxiliary_loss_clip": 0.06443136, "auxiliary_loss_mlp": 0.01268927, "balance_loss_clip": 0.062795, "balance_loss_mlp": 0.01256529, "epoch": 0.4824891026604539, "flos": 23410682812800.0, "grad_norm": 1.4802150193869799, "language_loss": 0.75445473, "learning_rate": 2.2101155938688423e-06, "loss": 0.83157533, "num_input_tokens_seen": 172524360, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.12408447, "step": 8025, "time_per_iteration": 2.600520372390747 }, { "auxiliary_loss_clip": 0.06445811, "auxiliary_loss_mlp": 0.01267219, "balance_loss_clip": 0.06281027, "balance_loss_mlp": 0.01254648, "epoch": 0.4825492259131219, "flos": 20373691104000.0, "grad_norm": 1.7710738628187237, "language_loss": 0.71197093, "learning_rate": 2.209728283441112e-06, "loss": 0.78910124, "num_input_tokens_seen": 172541480, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.12567139, "step": 8026, "time_per_iteration": 2.5825307369232178 }, { "auxiliary_loss_clip": 0.06447469, "auxiliary_loss_mlp": 0.01269648, "balance_loss_clip": 0.06280746, "balance_loss_mlp": 0.01256517, "epoch": 0.48260934916578985, "flos": 14324193077760.0, "grad_norm": 1.958432946131694, "language_loss": 0.75026631, "learning_rate": 2.209340965060465e-06, "loss": 0.82743752, "num_input_tokens_seen": 172559005, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.13146973, "step": 8027, "time_per_iteration": 2.5411384105682373 }, { "auxiliary_loss_clip": 0.06443827, "auxiliary_loss_mlp": 0.01269896, "balance_loss_clip": 0.06278031, "balance_loss_mlp": 0.0125748, "epoch": 0.4826694724184578, "flos": 22126654846080.0, "grad_norm": 2.3759929854342547, "language_loss": 0.67696267, "learning_rate": 2.2089536387415868e-06, "loss": 0.75409985, "num_input_tokens_seen": 172578435, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.12420654, "step": 8028, "time_per_iteration": 2.63169264793396 }, { "auxiliary_loss_clip": 0.06445314, "auxiliary_loss_mlp": 0.01267293, "balance_loss_clip": 0.06282283, "balance_loss_mlp": 0.01254454, "epoch": 0.48272959567112583, "flos": 16186882141440.0, "grad_norm": 1.4246067197240517, "language_loss": 0.72710383, "learning_rate": 2.2085663044991655e-06, "loss": 0.80422986, "num_input_tokens_seen": 172596095, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.12841797, "step": 8029, "time_per_iteration": 2.5227599143981934 }, { "auxiliary_loss_clip": 0.06443248, "auxiliary_loss_mlp": 0.01269473, "balance_loss_clip": 0.06277956, "balance_loss_mlp": 0.01256646, "epoch": 0.4827897189237938, "flos": 23186326705920.0, "grad_norm": 2.385335544217769, "language_loss": 0.84603405, "learning_rate": 2.2081789623478896e-06, "loss": 0.92316127, "num_input_tokens_seen": 172615255, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.12835693, "step": 8030, "time_per_iteration": 2.61490797996521 }, { "auxiliary_loss_clip": 0.06444032, "auxiliary_loss_mlp": 0.01266428, "balance_loss_clip": 0.06280306, "balance_loss_mlp": 0.01254055, "epoch": 0.48284984217646176, "flos": 21659018808960.0, "grad_norm": 1.6619461162272438, "language_loss": 0.73517978, "learning_rate": 2.2077916123024466e-06, "loss": 0.81228435, "num_input_tokens_seen": 172633185, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.12371826, "step": 8031, "time_per_iteration": 2.5514681339263916 }, { "auxiliary_loss_clip": 0.06448483, "auxiliary_loss_mlp": 0.01272178, "balance_loss_clip": 0.06277843, "balance_loss_mlp": 0.01258606, "epoch": 0.48290996542912973, "flos": 31475501314560.0, "grad_norm": 2.628578945981558, "language_loss": 0.72136652, "learning_rate": 2.2074042543775245e-06, "loss": 0.79857308, "num_input_tokens_seen": 172654280, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.13580322, "step": 8032, "time_per_iteration": 4.000203847885132 }, { "auxiliary_loss_clip": 0.06436886, "auxiliary_loss_mlp": 0.01273382, "balance_loss_clip": 0.06274801, "balance_loss_mlp": 0.01261139, "epoch": 0.4829700886817977, "flos": 24468803372160.0, "grad_norm": 1.6740028030452145, "language_loss": 0.74416298, "learning_rate": 2.2070168885878126e-06, "loss": 0.82126564, "num_input_tokens_seen": 172675545, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.12237549, "step": 8033, "time_per_iteration": 4.155442476272583 }, { "auxiliary_loss_clip": 0.06456111, "auxiliary_loss_mlp": 0.01272192, "balance_loss_clip": 0.06284689, "balance_loss_mlp": 0.01258995, "epoch": 0.48303021193446566, "flos": 25709170561920.0, "grad_norm": 1.4875392833301595, "language_loss": 0.83709502, "learning_rate": 2.2066295149479996e-06, "loss": 0.91437805, "num_input_tokens_seen": 172696455, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.13208008, "step": 8034, "time_per_iteration": 2.6336007118225098 }, { "auxiliary_loss_clip": 0.06436155, "auxiliary_loss_mlp": 0.01267054, "balance_loss_clip": 0.06274984, "balance_loss_mlp": 0.01254812, "epoch": 0.4830903351871336, "flos": 20091613933440.0, "grad_norm": 1.4824336374589646, "language_loss": 0.79502738, "learning_rate": 2.2062421334727744e-06, "loss": 0.87205946, "num_input_tokens_seen": 172716720, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.12249756, "step": 8035, "time_per_iteration": 2.6144137382507324 }, { "auxiliary_loss_clip": 0.06441268, "auxiliary_loss_mlp": 0.01270366, "balance_loss_clip": 0.06276259, "balance_loss_mlp": 0.01256782, "epoch": 0.4831504584398016, "flos": 39460670910720.0, "grad_norm": 3.0644527038740486, "language_loss": 0.69579607, "learning_rate": 2.2058547441768267e-06, "loss": 0.77291238, "num_input_tokens_seen": 172737435, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.13568115, "step": 8036, "time_per_iteration": 2.7316176891326904 }, { "auxiliary_loss_clip": 0.06439759, "auxiliary_loss_mlp": 0.01268396, "balance_loss_clip": 0.06276168, "balance_loss_mlp": 0.01255998, "epoch": 0.48321058169246955, "flos": 20012006954880.0, "grad_norm": 1.7744222771684206, "language_loss": 0.73040658, "learning_rate": 2.205467347074847e-06, "loss": 0.80748814, "num_input_tokens_seen": 172755700, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.12408447, "step": 8037, "time_per_iteration": 2.5491015911102295 }, { "auxiliary_loss_clip": 0.06453694, "auxiliary_loss_mlp": 0.01267928, "balance_loss_clip": 0.0628067, "balance_loss_mlp": 0.01254481, "epoch": 0.4832707049451375, "flos": 20747869511040.0, "grad_norm": 2.4025206762973776, "language_loss": 0.69416082, "learning_rate": 2.205079942181525e-06, "loss": 0.77137697, "num_input_tokens_seen": 172775185, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.13446045, "step": 8038, "time_per_iteration": 2.5708487033843994 }, { "auxiliary_loss_clip": 0.06438902, "auxiliary_loss_mlp": 0.01266863, "balance_loss_clip": 0.06275427, "balance_loss_mlp": 0.0125381, "epoch": 0.4833308281978055, "flos": 33153889322880.0, "grad_norm": 1.4831324306906828, "language_loss": 0.79161382, "learning_rate": 2.20469252951155e-06, "loss": 0.86867148, "num_input_tokens_seen": 172796990, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.13061523, "step": 8039, "time_per_iteration": 2.650620222091675 }, { "auxiliary_loss_clip": 0.06441743, "auxiliary_loss_mlp": 0.01272082, "balance_loss_clip": 0.06277886, "balance_loss_mlp": 0.01259708, "epoch": 0.48339095145047345, "flos": 19105301923200.0, "grad_norm": 1.5547111053359661, "language_loss": 0.77585459, "learning_rate": 2.2043051090796143e-06, "loss": 0.85299283, "num_input_tokens_seen": 172814915, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.1237793, "step": 8040, "time_per_iteration": 2.5669291019439697 }, { "auxiliary_loss_clip": 0.06446026, "auxiliary_loss_mlp": 0.01272385, "balance_loss_clip": 0.06280263, "balance_loss_mlp": 0.01259058, "epoch": 0.4834510747031414, "flos": 34468035632640.0, "grad_norm": 1.507884277468774, "language_loss": 0.75981474, "learning_rate": 2.203917680900409e-06, "loss": 0.83699882, "num_input_tokens_seen": 172837060, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.13336182, "step": 8041, "time_per_iteration": 2.7031285762786865 }, { "auxiliary_loss_clip": 0.06442735, "auxiliary_loss_mlp": 0.01279415, "balance_loss_clip": 0.06282678, "balance_loss_mlp": 0.01266749, "epoch": 0.48351119795580944, "flos": 27388187475840.0, "grad_norm": 1.7429882544680428, "language_loss": 0.66593421, "learning_rate": 2.203530244988624e-06, "loss": 0.74315572, "num_input_tokens_seen": 172856545, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.12658691, "step": 8042, "time_per_iteration": 4.203121185302734 }, { "auxiliary_loss_clip": 0.0632555, "auxiliary_loss_mlp": 0.01253127, "balance_loss_clip": 0.06258218, "balance_loss_mlp": 0.01250872, "epoch": 0.4835713212084774, "flos": 67162967815680.0, "grad_norm": 0.6838377943678894, "language_loss": 0.58426321, "learning_rate": 2.2031428013589517e-06, "loss": 0.66005003, "num_input_tokens_seen": 172923055, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 0.02258301, "step": 8043, "time_per_iteration": 4.67888879776001 }, { "auxiliary_loss_clip": 0.06448644, "auxiliary_loss_mlp": 0.01275606, "balance_loss_clip": 0.06281015, "balance_loss_mlp": 0.01261771, "epoch": 0.48363144446114537, "flos": 17973234535680.0, "grad_norm": 1.9685224574805238, "language_loss": 0.72353768, "learning_rate": 2.2027553500260847e-06, "loss": 0.80078018, "num_input_tokens_seen": 172940700, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.13848877, "step": 8044, "time_per_iteration": 2.5575244426727295 }, { "auxiliary_loss_clip": 0.06440653, "auxiliary_loss_mlp": 0.01271833, "balance_loss_clip": 0.06277682, "balance_loss_mlp": 0.0125779, "epoch": 0.48369156771381333, "flos": 20599556584320.0, "grad_norm": 1.423778895023687, "language_loss": 0.75916541, "learning_rate": 2.202367891004714e-06, "loss": 0.8362903, "num_input_tokens_seen": 172961125, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.14050293, "step": 8045, "time_per_iteration": 2.6554348468780518 }, { "auxiliary_loss_clip": 0.06450595, "auxiliary_loss_mlp": 0.01273642, "balance_loss_clip": 0.06283035, "balance_loss_mlp": 0.01260797, "epoch": 0.4837516909664813, "flos": 22681780145280.0, "grad_norm": 2.2520139369980443, "language_loss": 0.69706273, "learning_rate": 2.201980424309533e-06, "loss": 0.77430511, "num_input_tokens_seen": 172980405, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.12841797, "step": 8046, "time_per_iteration": 2.5975329875946045 }, { "auxiliary_loss_clip": 0.06444825, "auxiliary_loss_mlp": 0.01272177, "balance_loss_clip": 0.06279961, "balance_loss_mlp": 0.0125947, "epoch": 0.48381181421914926, "flos": 25525414558080.0, "grad_norm": 1.8879647894023748, "language_loss": 0.82326305, "learning_rate": 2.2015929499552337e-06, "loss": 0.90043306, "num_input_tokens_seen": 172999105, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.12713623, "step": 8047, "time_per_iteration": 2.656459093093872 }, { "auxiliary_loss_clip": 0.06441353, "auxiliary_loss_mlp": 0.01272823, "balance_loss_clip": 0.06280155, "balance_loss_mlp": 0.01260317, "epoch": 0.4838719374718172, "flos": 24214454703360.0, "grad_norm": 1.5812451240078038, "language_loss": 0.80878651, "learning_rate": 2.2012054679565092e-06, "loss": 0.88592827, "num_input_tokens_seen": 173019935, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.12493896, "step": 8048, "time_per_iteration": 2.638082981109619 }, { "auxiliary_loss_clip": 0.0645253, "auxiliary_loss_mlp": 0.012729, "balance_loss_clip": 0.06283835, "balance_loss_mlp": 0.01259453, "epoch": 0.4839320607244852, "flos": 26731889971200.0, "grad_norm": 1.5571354924683625, "language_loss": 0.81975853, "learning_rate": 2.200817978328054e-06, "loss": 0.89701283, "num_input_tokens_seen": 173039700, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.13452148, "step": 8049, "time_per_iteration": 2.646379232406616 }, { "auxiliary_loss_clip": 0.0643864, "auxiliary_loss_mlp": 0.01267855, "balance_loss_clip": 0.06281048, "balance_loss_mlp": 0.012565, "epoch": 0.48399218397715316, "flos": 20455142872320.0, "grad_norm": 1.642962139097509, "language_loss": 0.73296905, "learning_rate": 2.2004304810845602e-06, "loss": 0.81003398, "num_input_tokens_seen": 173059170, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.11352539, "step": 8050, "time_per_iteration": 2.5868146419525146 }, { "auxiliary_loss_clip": 0.06316428, "auxiliary_loss_mlp": 0.01255448, "balance_loss_clip": 0.06249306, "balance_loss_mlp": 0.01252598, "epoch": 0.4840523072298211, "flos": 67199626776960.0, "grad_norm": 0.6861979739348519, "language_loss": 0.56340587, "learning_rate": 2.200042976240723e-06, "loss": 0.63912463, "num_input_tokens_seen": 173119000, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 0.0284729, "step": 8051, "time_per_iteration": 3.236651659011841 }, { "auxiliary_loss_clip": 0.06441861, "auxiliary_loss_mlp": 0.01268381, "balance_loss_clip": 0.06276616, "balance_loss_mlp": 0.01255638, "epoch": 0.4841124304824891, "flos": 22416782008320.0, "grad_norm": 2.2527030873073524, "language_loss": 0.7530852, "learning_rate": 2.199655463811236e-06, "loss": 0.83018762, "num_input_tokens_seen": 173137570, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.12744141, "step": 8052, "time_per_iteration": 2.636168956756592 }, { "auxiliary_loss_clip": 0.06448351, "auxiliary_loss_mlp": 0.01268677, "balance_loss_clip": 0.0628337, "balance_loss_mlp": 0.01256482, "epoch": 0.48417255373515705, "flos": 13848926319360.0, "grad_norm": 2.0571276817276094, "language_loss": 0.65778089, "learning_rate": 2.1992679438107936e-06, "loss": 0.73495114, "num_input_tokens_seen": 173154355, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.12213135, "step": 8053, "time_per_iteration": 2.5767245292663574 }, { "auxiliary_loss_clip": 0.064338, "auxiliary_loss_mlp": 0.01271198, "balance_loss_clip": 0.06273834, "balance_loss_mlp": 0.01259128, "epoch": 0.484232676987825, "flos": 31657747944960.0, "grad_norm": 1.8090381983389237, "language_loss": 0.69972414, "learning_rate": 2.198880416254091e-06, "loss": 0.77677417, "num_input_tokens_seen": 173174845, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.12060547, "step": 8054, "time_per_iteration": 2.697206974029541 }, { "auxiliary_loss_clip": 0.06437419, "auxiliary_loss_mlp": 0.01267374, "balance_loss_clip": 0.06275115, "balance_loss_mlp": 0.01255518, "epoch": 0.48429280024049304, "flos": 24101878343040.0, "grad_norm": 1.6095122109552875, "language_loss": 0.7006408, "learning_rate": 2.1984928811558233e-06, "loss": 0.77768874, "num_input_tokens_seen": 173195025, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.11846924, "step": 8055, "time_per_iteration": 2.582357883453369 }, { "auxiliary_loss_clip": 0.06439158, "auxiliary_loss_mlp": 0.01268665, "balance_loss_clip": 0.06275836, "balance_loss_mlp": 0.01256297, "epoch": 0.484352923493161, "flos": 17535842622720.0, "grad_norm": 3.0364170621618958, "language_loss": 0.64240348, "learning_rate": 2.198105338530685e-06, "loss": 0.71948177, "num_input_tokens_seen": 173213065, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.12371826, "step": 8056, "time_per_iteration": 2.588434934616089 }, { "auxiliary_loss_clip": 0.06442954, "auxiliary_loss_mlp": 0.01267728, "balance_loss_clip": 0.06278062, "balance_loss_mlp": 0.01254734, "epoch": 0.48441304674582897, "flos": 29174204453760.0, "grad_norm": 1.7689714175469882, "language_loss": 0.67513692, "learning_rate": 2.1977177883933726e-06, "loss": 0.75224376, "num_input_tokens_seen": 173234545, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.12988281, "step": 8057, "time_per_iteration": 2.629654884338379 }, { "auxiliary_loss_clip": 0.06435189, "auxiliary_loss_mlp": 0.01273249, "balance_loss_clip": 0.06274675, "balance_loss_mlp": 0.01261096, "epoch": 0.48447316999849693, "flos": 15891933369600.0, "grad_norm": 1.5823855000086184, "language_loss": 0.82221353, "learning_rate": 2.1973302307585827e-06, "loss": 0.89929795, "num_input_tokens_seen": 173252175, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.12158203, "step": 8058, "time_per_iteration": 2.587141513824463 }, { "auxiliary_loss_clip": 0.06445973, "auxiliary_loss_mlp": 0.01271696, "balance_loss_clip": 0.06278837, "balance_loss_mlp": 0.01258476, "epoch": 0.4845332932511649, "flos": 24386974260480.0, "grad_norm": 1.8641260953051242, "language_loss": 0.79700518, "learning_rate": 2.1969426656410097e-06, "loss": 0.87418193, "num_input_tokens_seen": 173268790, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.13208008, "step": 8059, "time_per_iteration": 2.5753564834594727 }, { "auxiliary_loss_clip": 0.06447512, "auxiliary_loss_mlp": 0.01271478, "balance_loss_clip": 0.06277878, "balance_loss_mlp": 0.01257984, "epoch": 0.48459341650383286, "flos": 37124434097280.0, "grad_norm": 3.8825385563157955, "language_loss": 0.67246068, "learning_rate": 2.196555093055352e-06, "loss": 0.7496506, "num_input_tokens_seen": 173288030, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.13494873, "step": 8060, "time_per_iteration": 2.8275513648986816 }, { "auxiliary_loss_clip": 0.06442917, "auxiliary_loss_mlp": 0.01268902, "balance_loss_clip": 0.06278498, "balance_loss_mlp": 0.01255658, "epoch": 0.48465353975650083, "flos": 22973500535040.0, "grad_norm": 2.087247413104877, "language_loss": 0.67149335, "learning_rate": 2.1961675130163046e-06, "loss": 0.74861157, "num_input_tokens_seen": 173305965, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.13256836, "step": 8061, "time_per_iteration": 2.5905063152313232 }, { "auxiliary_loss_clip": 0.0644137, "auxiliary_loss_mlp": 0.01273986, "balance_loss_clip": 0.06278412, "balance_loss_mlp": 0.01260319, "epoch": 0.4847136630091688, "flos": 17712680664960.0, "grad_norm": 2.074072025953764, "language_loss": 0.83075631, "learning_rate": 2.1957799255385653e-06, "loss": 0.90790987, "num_input_tokens_seen": 173321985, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.13690186, "step": 8062, "time_per_iteration": 2.598416805267334 }, { "auxiliary_loss_clip": 0.06437889, "auxiliary_loss_mlp": 0.01268579, "balance_loss_clip": 0.06277689, "balance_loss_mlp": 0.01256699, "epoch": 0.48477378626183676, "flos": 22024853735040.0, "grad_norm": 1.6205893448911937, "language_loss": 0.74306703, "learning_rate": 2.1953923306368325e-06, "loss": 0.82013172, "num_input_tokens_seen": 173341315, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.11883545, "step": 8063, "time_per_iteration": 2.569624185562134 }, { "auxiliary_loss_clip": 0.06441398, "auxiliary_loss_mlp": 0.01269355, "balance_loss_clip": 0.06277538, "balance_loss_mlp": 0.01257262, "epoch": 0.4848339095145047, "flos": 27970118881920.0, "grad_norm": 1.6235104109855552, "language_loss": 0.78889143, "learning_rate": 2.1950047283258023e-06, "loss": 0.86599892, "num_input_tokens_seen": 173361055, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.12103271, "step": 8064, "time_per_iteration": 2.6284687519073486 }, { "auxiliary_loss_clip": 0.06439423, "auxiliary_loss_mlp": 0.01267316, "balance_loss_clip": 0.06281946, "balance_loss_mlp": 0.01255675, "epoch": 0.4848940327671727, "flos": 21695090791680.0, "grad_norm": 1.7137252280803863, "language_loss": 0.79609692, "learning_rate": 2.194617118620173e-06, "loss": 0.87316424, "num_input_tokens_seen": 173379255, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.11645508, "step": 8065, "time_per_iteration": 2.5622804164886475 }, { "auxiliary_loss_clip": 0.0643504, "auxiliary_loss_mlp": 0.01270031, "balance_loss_clip": 0.06279264, "balance_loss_mlp": 0.01258605, "epoch": 0.48495415601984065, "flos": 20637892627200.0, "grad_norm": 1.5403652077361656, "language_loss": 0.76252174, "learning_rate": 2.194229501534644e-06, "loss": 0.83957243, "num_input_tokens_seen": 173398370, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.11419678, "step": 8066, "time_per_iteration": 2.5862252712249756 }, { "auxiliary_loss_clip": 0.06436943, "auxiliary_loss_mlp": 0.01268205, "balance_loss_clip": 0.0627718, "balance_loss_mlp": 0.01256934, "epoch": 0.4850142792725086, "flos": 25634972171520.0, "grad_norm": 1.2628990176398802, "language_loss": 0.72123134, "learning_rate": 2.193841877083912e-06, "loss": 0.79828286, "num_input_tokens_seen": 173419595, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.1126709, "step": 8067, "time_per_iteration": 2.615518569946289 }, { "auxiliary_loss_clip": 0.06441843, "auxiliary_loss_mlp": 0.01269693, "balance_loss_clip": 0.06279269, "balance_loss_mlp": 0.01258154, "epoch": 0.4850744025251766, "flos": 13777075843200.0, "grad_norm": 2.065313332726358, "language_loss": 0.79132944, "learning_rate": 2.1934542452826767e-06, "loss": 0.8684448, "num_input_tokens_seen": 173435390, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.11547852, "step": 8068, "time_per_iteration": 2.5932626724243164 }, { "auxiliary_loss_clip": 0.06433058, "auxiliary_loss_mlp": 0.01268771, "balance_loss_clip": 0.06272778, "balance_loss_mlp": 0.01257386, "epoch": 0.4851345257778446, "flos": 20266691040000.0, "grad_norm": 1.4789506368082497, "language_loss": 0.84708548, "learning_rate": 2.193066606145638e-06, "loss": 0.92410374, "num_input_tokens_seen": 173454095, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.1138916, "step": 8069, "time_per_iteration": 2.5557796955108643 }, { "auxiliary_loss_clip": 0.06435756, "auxiliary_loss_mlp": 0.01265378, "balance_loss_clip": 0.06275968, "balance_loss_mlp": 0.01254256, "epoch": 0.48519464903051257, "flos": 27097095991680.0, "grad_norm": 1.5785210395836924, "language_loss": 0.78601801, "learning_rate": 2.192678959687493e-06, "loss": 0.8630293, "num_input_tokens_seen": 173475300, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.11126709, "step": 8070, "time_per_iteration": 2.6124119758605957 }, { "auxiliary_loss_clip": 0.06432381, "auxiliary_loss_mlp": 0.01267959, "balance_loss_clip": 0.0627294, "balance_loss_mlp": 0.01255186, "epoch": 0.48525477228318054, "flos": 17132677902720.0, "grad_norm": 1.7948537261511228, "language_loss": 0.77743673, "learning_rate": 2.192291305922943e-06, "loss": 0.85444015, "num_input_tokens_seen": 173492005, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.12774658, "step": 8071, "time_per_iteration": 2.5237998962402344 }, { "auxiliary_loss_clip": 0.06443863, "auxiliary_loss_mlp": 0.01268257, "balance_loss_clip": 0.06280358, "balance_loss_mlp": 0.0125546, "epoch": 0.4853148955358485, "flos": 28187263537920.0, "grad_norm": 2.055815134530604, "language_loss": 0.72361249, "learning_rate": 2.1919036448666873e-06, "loss": 0.80073369, "num_input_tokens_seen": 173511995, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.12799072, "step": 8072, "time_per_iteration": 5.587508201599121 }, { "auxiliary_loss_clip": 0.06440353, "auxiliary_loss_mlp": 0.01265615, "balance_loss_clip": 0.06277749, "balance_loss_mlp": 0.01252961, "epoch": 0.48537501878851647, "flos": 17499015953280.0, "grad_norm": 1.9242864374739403, "language_loss": 0.87854499, "learning_rate": 2.1915159765334262e-06, "loss": 0.95560473, "num_input_tokens_seen": 173530215, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.12646484, "step": 8073, "time_per_iteration": 2.5254275798797607 }, { "auxiliary_loss_clip": 0.0643097, "auxiliary_loss_mlp": 0.01270827, "balance_loss_clip": 0.06275781, "balance_loss_mlp": 0.01259604, "epoch": 0.48543514204118443, "flos": 28592398828800.0, "grad_norm": 1.8023063458795898, "language_loss": 0.61097169, "learning_rate": 2.19112830093786e-06, "loss": 0.68798959, "num_input_tokens_seen": 173550920, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.11224365, "step": 8074, "time_per_iteration": 2.6479880809783936 }, { "auxiliary_loss_clip": 0.06441957, "auxiliary_loss_mlp": 0.01270443, "balance_loss_clip": 0.06276807, "balance_loss_mlp": 0.01258385, "epoch": 0.4854952652938524, "flos": 20966355832320.0, "grad_norm": 1.5667453968078962, "language_loss": 0.73506939, "learning_rate": 2.19074061809469e-06, "loss": 0.81219339, "num_input_tokens_seen": 173569065, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.12054443, "step": 8075, "time_per_iteration": 2.547957420349121 }, { "auxiliary_loss_clip": 0.06433577, "auxiliary_loss_mlp": 0.01267763, "balance_loss_clip": 0.06277779, "balance_loss_mlp": 0.01256373, "epoch": 0.48555538854652036, "flos": 66543344000640.0, "grad_norm": 1.4526207888559308, "language_loss": 0.81721222, "learning_rate": 2.1903529280186163e-06, "loss": 0.8942256, "num_input_tokens_seen": 173596085, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.11401367, "step": 8076, "time_per_iteration": 2.9995174407958984 }, { "auxiliary_loss_clip": 0.06441737, "auxiliary_loss_mlp": 0.0127453, "balance_loss_clip": 0.06280188, "balance_loss_mlp": 0.01260714, "epoch": 0.4856155117991883, "flos": 15930520974720.0, "grad_norm": 2.370495827322158, "language_loss": 0.8725785, "learning_rate": 2.1899652307243407e-06, "loss": 0.94974118, "num_input_tokens_seen": 173613900, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.13818359, "step": 8077, "time_per_iteration": 2.5517513751983643 }, { "auxiliary_loss_clip": 0.0632706, "auxiliary_loss_mlp": 0.0125494, "balance_loss_clip": 0.06260607, "balance_loss_mlp": 0.01252944, "epoch": 0.4856756350518563, "flos": 71066986848000.0, "grad_norm": 1.0825769716377716, "language_loss": 0.58409536, "learning_rate": 2.189577526226564e-06, "loss": 0.65991539, "num_input_tokens_seen": 173671305, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 0.0199585, "step": 8078, "time_per_iteration": 3.2210302352905273 }, { "auxiliary_loss_clip": 0.06449385, "auxiliary_loss_mlp": 0.01272597, "balance_loss_clip": 0.06283498, "balance_loss_mlp": 0.01260587, "epoch": 0.48573575830452426, "flos": 29833478778240.0, "grad_norm": 1.5843479520403254, "language_loss": 0.72808719, "learning_rate": 2.1891898145399884e-06, "loss": 0.80530703, "num_input_tokens_seen": 173692070, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.12011719, "step": 8079, "time_per_iteration": 2.634953260421753 }, { "auxiliary_loss_clip": 0.06445076, "auxiliary_loss_mlp": 0.01267832, "balance_loss_clip": 0.06282046, "balance_loss_mlp": 0.01256698, "epoch": 0.4857958815571922, "flos": 17645274455040.0, "grad_norm": 1.9709444608354878, "language_loss": 0.79804862, "learning_rate": 2.1888020956793172e-06, "loss": 0.87517768, "num_input_tokens_seen": 173709785, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.11138916, "step": 8080, "time_per_iteration": 2.5770018100738525 }, { "auxiliary_loss_clip": 0.06443487, "auxiliary_loss_mlp": 0.01266796, "balance_loss_clip": 0.06280443, "balance_loss_mlp": 0.01254654, "epoch": 0.4858560048098602, "flos": 21111817720320.0, "grad_norm": 1.837285509316202, "language_loss": 0.83884346, "learning_rate": 2.188414369659251e-06, "loss": 0.9159463, "num_input_tokens_seen": 173728770, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.12133789, "step": 8081, "time_per_iteration": 2.553269863128662 }, { "auxiliary_loss_clip": 0.06441852, "auxiliary_loss_mlp": 0.0126849, "balance_loss_clip": 0.06279354, "balance_loss_mlp": 0.01255329, "epoch": 0.4859161280625282, "flos": 22097375043840.0, "grad_norm": 1.5596200720623383, "language_loss": 0.83297032, "learning_rate": 2.1880266364944924e-06, "loss": 0.91007376, "num_input_tokens_seen": 173747355, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.1317749, "step": 8082, "time_per_iteration": 5.541442155838013 }, { "auxiliary_loss_clip": 0.06441471, "auxiliary_loss_mlp": 0.01265988, "balance_loss_clip": 0.06285188, "balance_loss_mlp": 0.01254282, "epoch": 0.4859762513151962, "flos": 17499183661440.0, "grad_norm": 1.9263157578053114, "language_loss": 0.87408715, "learning_rate": 2.187638896199746e-06, "loss": 0.95116174, "num_input_tokens_seen": 173764825, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.11706543, "step": 8083, "time_per_iteration": 2.5561161041259766 }, { "auxiliary_loss_clip": 0.06441973, "auxiliary_loss_mlp": 0.01269513, "balance_loss_clip": 0.06283844, "balance_loss_mlp": 0.01257712, "epoch": 0.48603637456786414, "flos": 18010061205120.0, "grad_norm": 1.5711559673148474, "language_loss": 0.81203389, "learning_rate": 2.1872511487897126e-06, "loss": 0.88914877, "num_input_tokens_seen": 173783215, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.11791992, "step": 8084, "time_per_iteration": 2.592435836791992 }, { "auxiliary_loss_clip": 0.06449284, "auxiliary_loss_mlp": 0.01270611, "balance_loss_clip": 0.06286708, "balance_loss_mlp": 0.01258636, "epoch": 0.4860964978205321, "flos": 22498611120000.0, "grad_norm": 2.6550135438846634, "language_loss": 0.68643701, "learning_rate": 2.186863394279098e-06, "loss": 0.76363587, "num_input_tokens_seen": 173801905, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.11968994, "step": 8085, "time_per_iteration": 2.5646090507507324 }, { "auxiliary_loss_clip": 0.06444323, "auxiliary_loss_mlp": 0.01269398, "balance_loss_clip": 0.06282546, "balance_loss_mlp": 0.01257042, "epoch": 0.48615662107320007, "flos": 23380061345280.0, "grad_norm": 1.4239626208082745, "language_loss": 0.77564156, "learning_rate": 2.1864756326826046e-06, "loss": 0.85277879, "num_input_tokens_seen": 173824690, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.12359619, "step": 8086, "time_per_iteration": 2.71531343460083 }, { "auxiliary_loss_clip": 0.06446327, "auxiliary_loss_mlp": 0.01267435, "balance_loss_clip": 0.06285374, "balance_loss_mlp": 0.01255746, "epoch": 0.48621674432586803, "flos": 34426722769920.0, "grad_norm": 1.959367693650671, "language_loss": 0.70317221, "learning_rate": 2.1860878640149355e-06, "loss": 0.78030986, "num_input_tokens_seen": 173844450, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.11688232, "step": 8087, "time_per_iteration": 2.657135486602783 }, { "auxiliary_loss_clip": 0.06455205, "auxiliary_loss_mlp": 0.01267859, "balance_loss_clip": 0.06284499, "balance_loss_mlp": 0.01254591, "epoch": 0.486276867578536, "flos": 33115595207040.0, "grad_norm": 1.9518836124710568, "language_loss": 0.73471642, "learning_rate": 2.1857000882907974e-06, "loss": 0.81194705, "num_input_tokens_seen": 173864975, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.13262939, "step": 8088, "time_per_iteration": 2.795131206512451 }, { "auxiliary_loss_clip": 0.06448143, "auxiliary_loss_mlp": 0.01273412, "balance_loss_clip": 0.06285785, "balance_loss_mlp": 0.01261313, "epoch": 0.48633699083120396, "flos": 21477149521920.0, "grad_norm": 1.5048596256646953, "language_loss": 0.75571895, "learning_rate": 2.185312305524892e-06, "loss": 0.8329345, "num_input_tokens_seen": 173883805, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.12091064, "step": 8089, "time_per_iteration": 2.725905179977417 }, { "auxiliary_loss_clip": 0.06446664, "auxiliary_loss_mlp": 0.01265493, "balance_loss_clip": 0.06282732, "balance_loss_mlp": 0.01253494, "epoch": 0.48639711408387193, "flos": 20090565757440.0, "grad_norm": 1.5978808234830715, "language_loss": 0.84340703, "learning_rate": 2.184924515731926e-06, "loss": 0.92052859, "num_input_tokens_seen": 173903520, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.11999512, "step": 8090, "time_per_iteration": 2.6307835578918457 }, { "auxiliary_loss_clip": 0.06439673, "auxiliary_loss_mlp": 0.0127117, "balance_loss_clip": 0.06283706, "balance_loss_mlp": 0.01259452, "epoch": 0.4864572373365399, "flos": 20785450867200.0, "grad_norm": 1.415586729068185, "language_loss": 0.75961518, "learning_rate": 2.1845367189266045e-06, "loss": 0.83672363, "num_input_tokens_seen": 173924255, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.11724854, "step": 8091, "time_per_iteration": 2.5829648971557617 }, { "auxiliary_loss_clip": 0.06446145, "auxiliary_loss_mlp": 0.01265901, "balance_loss_clip": 0.06284028, "balance_loss_mlp": 0.01254088, "epoch": 0.48651736058920786, "flos": 26031554346240.0, "grad_norm": 1.3944548878389222, "language_loss": 0.80830508, "learning_rate": 2.184148915123631e-06, "loss": 0.88542551, "num_input_tokens_seen": 173943285, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.11828613, "step": 8092, "time_per_iteration": 2.635490655899048 }, { "auxiliary_loss_clip": 0.06446138, "auxiliary_loss_mlp": 0.01268405, "balance_loss_clip": 0.06282227, "balance_loss_mlp": 0.01255411, "epoch": 0.4865774838418758, "flos": 20491885687680.0, "grad_norm": 1.409398351929087, "language_loss": 0.72029901, "learning_rate": 2.1837611043377126e-06, "loss": 0.79744446, "num_input_tokens_seen": 173962205, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.12988281, "step": 8093, "time_per_iteration": 2.5529086589813232 }, { "auxiliary_loss_clip": 0.06445205, "auxiliary_loss_mlp": 0.01267504, "balance_loss_clip": 0.06283928, "balance_loss_mlp": 0.01255851, "epoch": 0.4866376070945438, "flos": 23554048348800.0, "grad_norm": 2.4160864123231627, "language_loss": 0.68453997, "learning_rate": 2.1833732865835545e-06, "loss": 0.76166701, "num_input_tokens_seen": 173980945, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.11657715, "step": 8094, "time_per_iteration": 2.623669385910034 }, { "auxiliary_loss_clip": 0.06458429, "auxiliary_loss_mlp": 0.01273522, "balance_loss_clip": 0.06291315, "balance_loss_mlp": 0.01259939, "epoch": 0.4866977303472118, "flos": 16696166457600.0, "grad_norm": 1.9483229013657801, "language_loss": 0.66911405, "learning_rate": 2.1829854618758636e-06, "loss": 0.74643362, "num_input_tokens_seen": 173998860, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.13592529, "step": 8095, "time_per_iteration": 2.5426371097564697 }, { "auxiliary_loss_clip": 0.06450093, "auxiliary_loss_mlp": 0.01267087, "balance_loss_clip": 0.06286259, "balance_loss_mlp": 0.01254707, "epoch": 0.4867578535998798, "flos": 17902012965120.0, "grad_norm": 1.7747150729195467, "language_loss": 0.79057348, "learning_rate": 2.182597630229345e-06, "loss": 0.86774534, "num_input_tokens_seen": 174016665, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.12384033, "step": 8096, "time_per_iteration": 2.610928773880005 }, { "auxiliary_loss_clip": 0.06442644, "auxiliary_loss_mlp": 0.01270299, "balance_loss_clip": 0.06281415, "balance_loss_mlp": 0.01258492, "epoch": 0.48681797685254774, "flos": 22644366497280.0, "grad_norm": 1.7211059517528224, "language_loss": 0.68125963, "learning_rate": 2.1822097916587067e-06, "loss": 0.75838912, "num_input_tokens_seen": 174034800, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.11804199, "step": 8097, "time_per_iteration": 2.5698914527893066 }, { "auxiliary_loss_clip": 0.0644514, "auxiliary_loss_mlp": 0.01268412, "balance_loss_clip": 0.06283846, "balance_loss_mlp": 0.01256682, "epoch": 0.4868781001052157, "flos": 20892283223040.0, "grad_norm": 1.728380373745615, "language_loss": 0.71517855, "learning_rate": 2.1818219461786543e-06, "loss": 0.79231405, "num_input_tokens_seen": 174054445, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.11737061, "step": 8098, "time_per_iteration": 2.6133601665496826 }, { "auxiliary_loss_clip": 0.06455344, "auxiliary_loss_mlp": 0.01266968, "balance_loss_clip": 0.062861, "balance_loss_mlp": 0.01253825, "epoch": 0.48693822335788367, "flos": 41984688723840.0, "grad_norm": 1.9869268318724473, "language_loss": 0.66322559, "learning_rate": 2.1814340938038956e-06, "loss": 0.74044871, "num_input_tokens_seen": 174077890, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.13146973, "step": 8099, "time_per_iteration": 2.782050371170044 }, { "auxiliary_loss_clip": 0.06453104, "auxiliary_loss_mlp": 0.01267448, "balance_loss_clip": 0.06288802, "balance_loss_mlp": 0.01256082, "epoch": 0.48699834661055164, "flos": 24250149342720.0, "grad_norm": 1.6524165953851422, "language_loss": 0.67543292, "learning_rate": 2.181046234549138e-06, "loss": 0.7526384, "num_input_tokens_seen": 174097460, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.11364746, "step": 8100, "time_per_iteration": 2.6194512844085693 }, { "auxiliary_loss_clip": 0.06443909, "auxiliary_loss_mlp": 0.01269095, "balance_loss_clip": 0.06284052, "balance_loss_mlp": 0.01257383, "epoch": 0.4870584698632196, "flos": 25931388389760.0, "grad_norm": 1.3355032781606166, "language_loss": 0.76835805, "learning_rate": 2.180658368429088e-06, "loss": 0.84548807, "num_input_tokens_seen": 174120775, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.11700439, "step": 8101, "time_per_iteration": 2.67695689201355 }, { "auxiliary_loss_clip": 0.0632696, "auxiliary_loss_mlp": 0.01252115, "balance_loss_clip": 0.06259897, "balance_loss_mlp": 0.01249914, "epoch": 0.48711859311588757, "flos": 70232006511360.0, "grad_norm": 0.6741217036506223, "language_loss": 0.52282727, "learning_rate": 2.1802704954584565e-06, "loss": 0.59861803, "num_input_tokens_seen": 174189135, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 0.02204895, "step": 8102, "time_per_iteration": 3.3118109703063965 }, { "auxiliary_loss_clip": 0.06453276, "auxiliary_loss_mlp": 0.01266709, "balance_loss_clip": 0.06290517, "balance_loss_mlp": 0.01254616, "epoch": 0.48717871636855553, "flos": 12346831301760.0, "grad_norm": 1.8216469373708388, "language_loss": 0.73750961, "learning_rate": 2.1798826156519484e-06, "loss": 0.81470948, "num_input_tokens_seen": 174203250, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.12097168, "step": 8103, "time_per_iteration": 2.5748178958892822 }, { "auxiliary_loss_clip": 0.06449556, "auxiliary_loss_mlp": 0.01270129, "balance_loss_clip": 0.06284921, "balance_loss_mlp": 0.01256748, "epoch": 0.4872388396212235, "flos": 23483874954240.0, "grad_norm": 1.5401578820325772, "language_loss": 0.63463223, "learning_rate": 2.1794947290242737e-06, "loss": 0.71182913, "num_input_tokens_seen": 174224145, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.1338501, "step": 8104, "time_per_iteration": 2.5721309185028076 }, { "auxiliary_loss_clip": 0.06444779, "auxiliary_loss_mlp": 0.01269908, "balance_loss_clip": 0.06283908, "balance_loss_mlp": 0.01257403, "epoch": 0.48729896287389146, "flos": 31435068919680.0, "grad_norm": 2.1716756662223915, "language_loss": 0.6946615, "learning_rate": 2.1791068355901413e-06, "loss": 0.77180839, "num_input_tokens_seen": 174244435, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.12506104, "step": 8105, "time_per_iteration": 2.683392286300659 }, { "auxiliary_loss_clip": 0.06445737, "auxiliary_loss_mlp": 0.01269326, "balance_loss_clip": 0.062853, "balance_loss_mlp": 0.01257643, "epoch": 0.4873590861265594, "flos": 19063192446720.0, "grad_norm": 1.7262127766459567, "language_loss": 0.73993146, "learning_rate": 2.178718935364259e-06, "loss": 0.81708217, "num_input_tokens_seen": 174262710, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.11682129, "step": 8106, "time_per_iteration": 2.554696559906006 }, { "auxiliary_loss_clip": 0.06460384, "auxiliary_loss_mlp": 0.01269845, "balance_loss_clip": 0.06292918, "balance_loss_mlp": 0.01256976, "epoch": 0.4874192093792274, "flos": 24354424149120.0, "grad_norm": 1.6392574003693297, "language_loss": 0.77095783, "learning_rate": 2.1783310283613373e-06, "loss": 0.84826005, "num_input_tokens_seen": 174281545, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.12872314, "step": 8107, "time_per_iteration": 2.722351551055908 }, { "auxiliary_loss_clip": 0.06451789, "auxiliary_loss_mlp": 0.01266788, "balance_loss_clip": 0.06292369, "balance_loss_mlp": 0.01255153, "epoch": 0.4874793326318954, "flos": 23119339766400.0, "grad_norm": 1.6540411802687207, "language_loss": 0.75296879, "learning_rate": 2.1779431145960853e-06, "loss": 0.8301546, "num_input_tokens_seen": 174300290, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.11633301, "step": 8108, "time_per_iteration": 2.6314127445220947 }, { "auxiliary_loss_clip": 0.06447248, "auxiliary_loss_mlp": 0.0126754, "balance_loss_clip": 0.06287143, "balance_loss_mlp": 0.01257122, "epoch": 0.4875394558845634, "flos": 19032193635840.0, "grad_norm": 1.6788593788970845, "language_loss": 0.73696399, "learning_rate": 2.177555194083212e-06, "loss": 0.81411189, "num_input_tokens_seen": 174318490, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.10412598, "step": 8109, "time_per_iteration": 2.594355821609497 }, { "auxiliary_loss_clip": 0.06443819, "auxiliary_loss_mlp": 0.0126688, "balance_loss_clip": 0.0628666, "balance_loss_mlp": 0.01255328, "epoch": 0.48759957913723134, "flos": 21439945509120.0, "grad_norm": 1.7931685126887833, "language_loss": 0.78711319, "learning_rate": 2.177167266837428e-06, "loss": 0.8642202, "num_input_tokens_seen": 174335505, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.11553955, "step": 8110, "time_per_iteration": 2.5626728534698486 }, { "auxiliary_loss_clip": 0.06455019, "auxiliary_loss_mlp": 0.01268073, "balance_loss_clip": 0.06292359, "balance_loss_mlp": 0.01256147, "epoch": 0.4876597023898993, "flos": 17754412798080.0, "grad_norm": 1.7682764672482545, "language_loss": 0.72378337, "learning_rate": 2.176779332873444e-06, "loss": 0.8010143, "num_input_tokens_seen": 174353990, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.11920166, "step": 8111, "time_per_iteration": 3.982109546661377 }, { "auxiliary_loss_clip": 0.0644502, "auxiliary_loss_mlp": 0.01269553, "balance_loss_clip": 0.06284401, "balance_loss_mlp": 0.01257185, "epoch": 0.4877198256425673, "flos": 17025384349440.0, "grad_norm": 2.042014167538824, "language_loss": 0.76431584, "learning_rate": 2.17639139220597e-06, "loss": 0.8414616, "num_input_tokens_seen": 174373425, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.12371826, "step": 8112, "time_per_iteration": 4.0403289794921875 }, { "auxiliary_loss_clip": 0.06464799, "auxiliary_loss_mlp": 0.01269622, "balance_loss_clip": 0.06294786, "balance_loss_mlp": 0.01257093, "epoch": 0.48777994889523524, "flos": 22390898296320.0, "grad_norm": 1.5136922625141938, "language_loss": 0.75106668, "learning_rate": 2.1760034448497166e-06, "loss": 0.82841086, "num_input_tokens_seen": 174393070, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.12536621, "step": 8113, "time_per_iteration": 2.626965045928955 }, { "auxiliary_loss_clip": 0.06336284, "auxiliary_loss_mlp": 0.01251557, "balance_loss_clip": 0.0626897, "balance_loss_mlp": 0.0124962, "epoch": 0.4878400721479032, "flos": 61261237664640.0, "grad_norm": 0.7786281640856106, "language_loss": 0.48787391, "learning_rate": 2.1756154908193943e-06, "loss": 0.56375235, "num_input_tokens_seen": 174446880, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 0.01934814, "step": 8114, "time_per_iteration": 3.0582568645477295 }, { "auxiliary_loss_clip": 0.06446142, "auxiliary_loss_mlp": 0.01270422, "balance_loss_clip": 0.06281652, "balance_loss_mlp": 0.01257953, "epoch": 0.48790019540057117, "flos": 24543756449280.0, "grad_norm": 1.3529069074139952, "language_loss": 0.77269661, "learning_rate": 2.1752275301297155e-06, "loss": 0.84986222, "num_input_tokens_seen": 174468485, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.12463379, "step": 8115, "time_per_iteration": 2.6330416202545166 }, { "auxiliary_loss_clip": 0.06451681, "auxiliary_loss_mlp": 0.01273283, "balance_loss_clip": 0.06285164, "balance_loss_mlp": 0.01260575, "epoch": 0.48796031865323913, "flos": 21840175336320.0, "grad_norm": 2.0733867791204745, "language_loss": 0.72516525, "learning_rate": 2.1748395627953915e-06, "loss": 0.80241489, "num_input_tokens_seen": 174486360, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.12689209, "step": 8116, "time_per_iteration": 2.5519254207611084 }, { "auxiliary_loss_clip": 0.06441397, "auxiliary_loss_mlp": 0.01266317, "balance_loss_clip": 0.062815, "balance_loss_mlp": 0.01253883, "epoch": 0.4880204419059071, "flos": 18594969431040.0, "grad_norm": 2.214315995449571, "language_loss": 0.63259077, "learning_rate": 2.1744515888311335e-06, "loss": 0.70966792, "num_input_tokens_seen": 174505075, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.12438965, "step": 8117, "time_per_iteration": 2.582850217819214 }, { "auxiliary_loss_clip": 0.06448413, "auxiliary_loss_mlp": 0.01270507, "balance_loss_clip": 0.06285435, "balance_loss_mlp": 0.01259021, "epoch": 0.48808056515857506, "flos": 19178242502400.0, "grad_norm": 3.9528076445603944, "language_loss": 0.79438353, "learning_rate": 2.1740636082516533e-06, "loss": 0.87157273, "num_input_tokens_seen": 174523385, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.1149292, "step": 8118, "time_per_iteration": 2.5326857566833496 }, { "auxiliary_loss_clip": 0.06451668, "auxiliary_loss_mlp": 0.01270548, "balance_loss_clip": 0.06286757, "balance_loss_mlp": 0.01258287, "epoch": 0.48814068841124303, "flos": 20126679667200.0, "grad_norm": 3.2237986271309262, "language_loss": 0.64023286, "learning_rate": 2.1736756210716645e-06, "loss": 0.71745503, "num_input_tokens_seen": 174542200, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.12255859, "step": 8119, "time_per_iteration": 2.620307445526123 }, { "auxiliary_loss_clip": 0.06445645, "auxiliary_loss_mlp": 0.01265278, "balance_loss_clip": 0.06282009, "balance_loss_mlp": 0.01253607, "epoch": 0.488200811663911, "flos": 22972116942720.0, "grad_norm": 2.048135597081916, "language_loss": 0.72241884, "learning_rate": 2.173287627305878e-06, "loss": 0.79952806, "num_input_tokens_seen": 174563620, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.11663818, "step": 8120, "time_per_iteration": 2.5826117992401123 }, { "auxiliary_loss_clip": 0.0645542, "auxiliary_loss_mlp": 0.01266829, "balance_loss_clip": 0.06287858, "balance_loss_mlp": 0.01254265, "epoch": 0.48826093491657896, "flos": 33918947827200.0, "grad_norm": 1.509596695440239, "language_loss": 0.64398456, "learning_rate": 2.1728996269690075e-06, "loss": 0.72120714, "num_input_tokens_seen": 174586465, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.12561035, "step": 8121, "time_per_iteration": 4.181307554244995 }, { "auxiliary_loss_clip": 0.06457093, "auxiliary_loss_mlp": 0.0126697, "balance_loss_clip": 0.06287218, "balance_loss_mlp": 0.01253702, "epoch": 0.488321058169247, "flos": 23076056332800.0, "grad_norm": 1.963073121305537, "language_loss": 0.83045375, "learning_rate": 2.1725116200757664e-06, "loss": 0.9076944, "num_input_tokens_seen": 174604035, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.13269043, "step": 8122, "time_per_iteration": 4.096210241317749 }, { "auxiliary_loss_clip": 0.06452139, "auxiliary_loss_mlp": 0.0127081, "balance_loss_clip": 0.06284019, "balance_loss_mlp": 0.01258174, "epoch": 0.48838118142191494, "flos": 19323746317440.0, "grad_norm": 1.6055067906206337, "language_loss": 0.85573769, "learning_rate": 2.172123606640866e-06, "loss": 0.93296719, "num_input_tokens_seen": 174621715, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.12634277, "step": 8123, "time_per_iteration": 2.581048011779785 }, { "auxiliary_loss_clip": 0.0644971, "auxiliary_loss_mlp": 0.01272281, "balance_loss_clip": 0.06282186, "balance_loss_mlp": 0.01260289, "epoch": 0.4884413046745829, "flos": 25417701734400.0, "grad_norm": 1.4080618545384753, "language_loss": 0.85603774, "learning_rate": 2.1717355866790227e-06, "loss": 0.93325758, "num_input_tokens_seen": 174643835, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.12005615, "step": 8124, "time_per_iteration": 2.655742883682251 }, { "auxiliary_loss_clip": 0.0645321, "auxiliary_loss_mlp": 0.01268759, "balance_loss_clip": 0.06287287, "balance_loss_mlp": 0.01256605, "epoch": 0.4885014279272509, "flos": 20997103080960.0, "grad_norm": 1.8830682640724545, "language_loss": 0.79903728, "learning_rate": 2.171347560204948e-06, "loss": 0.87625694, "num_input_tokens_seen": 174660955, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.12164307, "step": 8125, "time_per_iteration": 2.5877909660339355 }, { "auxiliary_loss_clip": 0.0644193, "auxiliary_loss_mlp": 0.01268638, "balance_loss_clip": 0.06279308, "balance_loss_mlp": 0.01256187, "epoch": 0.48856155117991884, "flos": 13776656572800.0, "grad_norm": 2.0897065989877563, "language_loss": 0.72630787, "learning_rate": 2.170959527233356e-06, "loss": 0.80341351, "num_input_tokens_seen": 174678270, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.12438965, "step": 8126, "time_per_iteration": 2.5749757289886475 }, { "auxiliary_loss_clip": 0.06453557, "auxiliary_loss_mlp": 0.01270413, "balance_loss_clip": 0.06284716, "balance_loss_mlp": 0.01257908, "epoch": 0.4886216744325868, "flos": 32095936471680.0, "grad_norm": 1.7326867563838952, "language_loss": 0.68935776, "learning_rate": 2.1705714877789633e-06, "loss": 0.76659751, "num_input_tokens_seen": 174698360, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.12506104, "step": 8127, "time_per_iteration": 2.714421033859253 }, { "auxiliary_loss_clip": 0.06455126, "auxiliary_loss_mlp": 0.01270262, "balance_loss_clip": 0.0628606, "balance_loss_mlp": 0.01257155, "epoch": 0.48868179768525477, "flos": 19616221393920.0, "grad_norm": 1.6134686301530148, "language_loss": 0.76533473, "learning_rate": 2.170183441856481e-06, "loss": 0.84258866, "num_input_tokens_seen": 174716755, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.13110352, "step": 8128, "time_per_iteration": 2.5531251430511475 }, { "auxiliary_loss_clip": 0.06457995, "auxiliary_loss_mlp": 0.01270702, "balance_loss_clip": 0.06289533, "balance_loss_mlp": 0.01257678, "epoch": 0.48874192093792274, "flos": 21293100028800.0, "grad_norm": 1.5542753972347705, "language_loss": 0.76556492, "learning_rate": 2.1697953894806265e-06, "loss": 0.84285188, "num_input_tokens_seen": 174735560, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.13018799, "step": 8129, "time_per_iteration": 2.662034034729004 }, { "auxiliary_loss_clip": 0.06450497, "auxiliary_loss_mlp": 0.01270658, "balance_loss_clip": 0.06285095, "balance_loss_mlp": 0.01257963, "epoch": 0.4888020441905907, "flos": 14178647335680.0, "grad_norm": 2.3018694294108153, "language_loss": 0.6510973, "learning_rate": 2.169407330666114e-06, "loss": 0.72830886, "num_input_tokens_seen": 174752730, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.12695312, "step": 8130, "time_per_iteration": 2.549663782119751 }, { "auxiliary_loss_clip": 0.06446856, "auxiliary_loss_mlp": 0.01269801, "balance_loss_clip": 0.06283226, "balance_loss_mlp": 0.01258411, "epoch": 0.48886216744325867, "flos": 24104813235840.0, "grad_norm": 1.63664867596237, "language_loss": 0.73120123, "learning_rate": 2.169019265427658e-06, "loss": 0.80836785, "num_input_tokens_seen": 174772520, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.1138916, "step": 8131, "time_per_iteration": 2.6610591411590576 }, { "auxiliary_loss_clip": 0.06458, "auxiliary_loss_mlp": 0.01274072, "balance_loss_clip": 0.06288045, "balance_loss_mlp": 0.01261334, "epoch": 0.48892229069592663, "flos": 38439838218240.0, "grad_norm": 1.5225233165646337, "language_loss": 0.69777167, "learning_rate": 2.1686311937799745e-06, "loss": 0.77509236, "num_input_tokens_seen": 174796540, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.12744141, "step": 8132, "time_per_iteration": 2.696347236633301 }, { "auxiliary_loss_clip": 0.06444949, "auxiliary_loss_mlp": 0.01269657, "balance_loss_clip": 0.06282849, "balance_loss_mlp": 0.01257694, "epoch": 0.4889824139485946, "flos": 23850338785920.0, "grad_norm": 1.411319929808209, "language_loss": 0.70203549, "learning_rate": 2.1682431157377797e-06, "loss": 0.7791816, "num_input_tokens_seen": 174817840, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.11981201, "step": 8133, "time_per_iteration": 2.6669809818267822 }, { "auxiliary_loss_clip": 0.06444639, "auxiliary_loss_mlp": 0.01271191, "balance_loss_clip": 0.0628242, "balance_loss_mlp": 0.01259032, "epoch": 0.48904253720126256, "flos": 24432731389440.0, "grad_norm": 1.7418265395040737, "language_loss": 0.71350318, "learning_rate": 2.1678550313157883e-06, "loss": 0.79066145, "num_input_tokens_seen": 174837885, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.12158203, "step": 8134, "time_per_iteration": 2.5940916538238525 }, { "auxiliary_loss_clip": 0.06456673, "auxiliary_loss_mlp": 0.012699, "balance_loss_clip": 0.06285721, "balance_loss_mlp": 0.01256853, "epoch": 0.4891026604539306, "flos": 24177586106880.0, "grad_norm": 1.8004146935496055, "language_loss": 0.80606431, "learning_rate": 2.167466940528718e-06, "loss": 0.88333011, "num_input_tokens_seen": 174855240, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.13061523, "step": 8135, "time_per_iteration": 2.695634126663208 }, { "auxiliary_loss_clip": 0.06448913, "auxiliary_loss_mlp": 0.01269064, "balance_loss_clip": 0.06285539, "balance_loss_mlp": 0.012574, "epoch": 0.48916278370659855, "flos": 21477443011200.0, "grad_norm": 1.6548119622382367, "language_loss": 0.74570417, "learning_rate": 2.1670788433912843e-06, "loss": 0.82288396, "num_input_tokens_seen": 174875145, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.11657715, "step": 8136, "time_per_iteration": 2.6408302783966064 }, { "auxiliary_loss_clip": 0.06447789, "auxiliary_loss_mlp": 0.01268177, "balance_loss_clip": 0.06284644, "balance_loss_mlp": 0.01255976, "epoch": 0.4892229069592665, "flos": 22316322562560.0, "grad_norm": 2.5275700413216646, "language_loss": 0.73925227, "learning_rate": 2.166690739918204e-06, "loss": 0.81641191, "num_input_tokens_seen": 174894770, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.12207031, "step": 8137, "time_per_iteration": 2.6288373470306396 }, { "auxiliary_loss_clip": 0.06452599, "auxiliary_loss_mlp": 0.01269998, "balance_loss_clip": 0.06286588, "balance_loss_mlp": 0.01257511, "epoch": 0.4892830302119345, "flos": 12791812008960.0, "grad_norm": 2.183419861180001, "language_loss": 0.75392222, "learning_rate": 2.1663026301241944e-06, "loss": 0.83114821, "num_input_tokens_seen": 174912780, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.12493896, "step": 8138, "time_per_iteration": 2.5416247844696045 }, { "auxiliary_loss_clip": 0.0645065, "auxiliary_loss_mlp": 0.01266669, "balance_loss_clip": 0.06286215, "balance_loss_mlp": 0.01254861, "epoch": 0.48934315346460244, "flos": 20820223111680.0, "grad_norm": 1.5157452805777758, "language_loss": 0.74296409, "learning_rate": 2.165914514023972e-06, "loss": 0.82013726, "num_input_tokens_seen": 174931250, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.11810303, "step": 8139, "time_per_iteration": 2.6607484817504883 }, { "auxiliary_loss_clip": 0.06449513, "auxiliary_loss_mlp": 0.01269124, "balance_loss_clip": 0.06282842, "balance_loss_mlp": 0.01256964, "epoch": 0.4894032767172704, "flos": 19761641354880.0, "grad_norm": 1.710118063022852, "language_loss": 0.62135828, "learning_rate": 2.165526391632255e-06, "loss": 0.69854468, "num_input_tokens_seen": 174951105, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.12158203, "step": 8140, "time_per_iteration": 2.5730128288269043 }, { "auxiliary_loss_clip": 0.06453752, "auxiliary_loss_mlp": 0.01271526, "balance_loss_clip": 0.0628639, "balance_loss_mlp": 0.01258919, "epoch": 0.4894633999699384, "flos": 17824292703360.0, "grad_norm": 1.6886602647391906, "language_loss": 0.82292366, "learning_rate": 2.1651382629637608e-06, "loss": 0.90017641, "num_input_tokens_seen": 174969120, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.1260376, "step": 8141, "time_per_iteration": 2.6377112865448 }, { "auxiliary_loss_clip": 0.06454138, "auxiliary_loss_mlp": 0.01270658, "balance_loss_clip": 0.06286259, "balance_loss_mlp": 0.01257843, "epoch": 0.48952352322260634, "flos": 25530781219200.0, "grad_norm": 1.491626816009275, "language_loss": 0.72619057, "learning_rate": 2.1647501280332066e-06, "loss": 0.80343854, "num_input_tokens_seen": 174991295, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.12811279, "step": 8142, "time_per_iteration": 2.615569591522217 }, { "auxiliary_loss_clip": 0.06445982, "auxiliary_loss_mlp": 0.01267952, "balance_loss_clip": 0.0628299, "balance_loss_mlp": 0.01255709, "epoch": 0.4895836464752743, "flos": 29062508561280.0, "grad_norm": 1.4625053873514877, "language_loss": 0.66889197, "learning_rate": 2.1643619868553105e-06, "loss": 0.74603128, "num_input_tokens_seen": 175012830, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.12249756, "step": 8143, "time_per_iteration": 2.709130048751831 }, { "auxiliary_loss_clip": 0.0644718, "auxiliary_loss_mlp": 0.01265859, "balance_loss_clip": 0.06284392, "balance_loss_mlp": 0.01254242, "epoch": 0.48964376972794227, "flos": 33555335034240.0, "grad_norm": 1.4946856463215026, "language_loss": 0.75370336, "learning_rate": 2.163973839444793e-06, "loss": 0.83083367, "num_input_tokens_seen": 175035695, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.11621094, "step": 8144, "time_per_iteration": 2.6935694217681885 }, { "auxiliary_loss_clip": 0.06449502, "auxiliary_loss_mlp": 0.01268119, "balance_loss_clip": 0.06284806, "balance_loss_mlp": 0.01256001, "epoch": 0.48970389298061023, "flos": 22060506447360.0, "grad_norm": 1.7815394441045413, "language_loss": 0.75934052, "learning_rate": 2.1635856858163695e-06, "loss": 0.8365168, "num_input_tokens_seen": 175056425, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.12115479, "step": 8145, "time_per_iteration": 2.6571059226989746 }, { "auxiliary_loss_clip": 0.064487, "auxiliary_loss_mlp": 0.01268238, "balance_loss_clip": 0.06283223, "balance_loss_mlp": 0.01255316, "epoch": 0.4897640162332782, "flos": 20090523830400.0, "grad_norm": 1.669087561780802, "language_loss": 0.8073951, "learning_rate": 2.163197525984761e-06, "loss": 0.88456446, "num_input_tokens_seen": 175074800, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.1293335, "step": 8146, "time_per_iteration": 2.584062337875366 }, { "auxiliary_loss_clip": 0.06442529, "auxiliary_loss_mlp": 0.01272649, "balance_loss_clip": 0.06281976, "balance_loss_mlp": 0.01261193, "epoch": 0.48982413948594616, "flos": 23813134773120.0, "grad_norm": 1.4722489009093818, "language_loss": 0.74852622, "learning_rate": 2.162809359964687e-06, "loss": 0.82567805, "num_input_tokens_seen": 175094500, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.11450195, "step": 8147, "time_per_iteration": 2.598330020904541 }, { "auxiliary_loss_clip": 0.06446633, "auxiliary_loss_mlp": 0.01270437, "balance_loss_clip": 0.0628236, "balance_loss_mlp": 0.01258058, "epoch": 0.4898842627386142, "flos": 17645442163200.0, "grad_norm": 2.0780524507813873, "language_loss": 0.82997817, "learning_rate": 2.162421187770864e-06, "loss": 0.90714884, "num_input_tokens_seen": 175112920, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.12384033, "step": 8148, "time_per_iteration": 2.5911200046539307 }, { "auxiliary_loss_clip": 0.06443365, "auxiliary_loss_mlp": 0.01266441, "balance_loss_clip": 0.06284262, "balance_loss_mlp": 0.01255122, "epoch": 0.48994438599128215, "flos": 16623519367680.0, "grad_norm": 1.6963887052154638, "language_loss": 0.74222791, "learning_rate": 2.162033009418015e-06, "loss": 0.81932592, "num_input_tokens_seen": 175129910, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.11303711, "step": 8149, "time_per_iteration": 2.5317745208740234 }, { "auxiliary_loss_clip": 0.06457012, "auxiliary_loss_mlp": 0.01270013, "balance_loss_clip": 0.06286971, "balance_loss_mlp": 0.01257132, "epoch": 0.4900045092439501, "flos": 26622080795520.0, "grad_norm": 1.7029606952991012, "language_loss": 0.76291108, "learning_rate": 2.1616448249208567e-06, "loss": 0.84018135, "num_input_tokens_seen": 175148705, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.12884521, "step": 8150, "time_per_iteration": 2.7260897159576416 }, { "auxiliary_loss_clip": 0.0644764, "auxiliary_loss_mlp": 0.01268461, "balance_loss_clip": 0.06281145, "balance_loss_mlp": 0.01255646, "epoch": 0.4900646324966181, "flos": 19908361054080.0, "grad_norm": 1.7781236089794847, "language_loss": 0.72797102, "learning_rate": 2.1612566342941106e-06, "loss": 0.80513203, "num_input_tokens_seen": 175167425, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.12817383, "step": 8151, "time_per_iteration": 5.4654107093811035 }, { "auxiliary_loss_clip": 0.06333766, "auxiliary_loss_mlp": 0.01252428, "balance_loss_clip": 0.06266843, "balance_loss_mlp": 0.01250069, "epoch": 0.49012475574928605, "flos": 59207245729920.0, "grad_norm": 0.8038836143187064, "language_loss": 0.5404253, "learning_rate": 2.1608684375524977e-06, "loss": 0.61628723, "num_input_tokens_seen": 175227985, "router_z_loss_clip": 0.66992188, "router_z_loss_mlp": 0.02354431, "step": 8152, "time_per_iteration": 3.1800200939178467 }, { "auxiliary_loss_clip": 0.06453663, "auxiliary_loss_mlp": 0.01270205, "balance_loss_clip": 0.0628306, "balance_loss_mlp": 0.01258105, "epoch": 0.490184879001954, "flos": 45270285096960.0, "grad_norm": 1.832563967759215, "language_loss": 0.61395335, "learning_rate": 2.1604802347107364e-06, "loss": 0.69119203, "num_input_tokens_seen": 175251895, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.12097168, "step": 8153, "time_per_iteration": 2.8624110221862793 }, { "auxiliary_loss_clip": 0.06445884, "auxiliary_loss_mlp": 0.01269058, "balance_loss_clip": 0.0628131, "balance_loss_mlp": 0.01256177, "epoch": 0.490245002254622, "flos": 28009754663040.0, "grad_norm": 1.648976939914876, "language_loss": 0.77348906, "learning_rate": 2.160092025783549e-06, "loss": 0.85063851, "num_input_tokens_seen": 175272770, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.12890625, "step": 8154, "time_per_iteration": 2.87636661529541 }, { "auxiliary_loss_clip": 0.06335361, "auxiliary_loss_mlp": 0.01251829, "balance_loss_clip": 0.06267987, "balance_loss_mlp": 0.01249443, "epoch": 0.49030512550728994, "flos": 58971764229120.0, "grad_norm": 0.921945962816049, "language_loss": 0.66910577, "learning_rate": 2.1597038107856564e-06, "loss": 0.74497771, "num_input_tokens_seen": 175336320, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 0.02381897, "step": 8155, "time_per_iteration": 3.3406662940979004 }, { "auxiliary_loss_clip": 0.06451521, "auxiliary_loss_mlp": 0.01271603, "balance_loss_clip": 0.0628623, "balance_loss_mlp": 0.01259914, "epoch": 0.4903652487599579, "flos": 19797922972800.0, "grad_norm": 1.9816998048450445, "language_loss": 0.76579666, "learning_rate": 2.1593155897317784e-06, "loss": 0.84302795, "num_input_tokens_seen": 175353540, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.11688232, "step": 8156, "time_per_iteration": 2.617483377456665 }, { "auxiliary_loss_clip": 0.06450742, "auxiliary_loss_mlp": 0.01274132, "balance_loss_clip": 0.06286138, "balance_loss_mlp": 0.01260524, "epoch": 0.49042537201262587, "flos": 21768492568320.0, "grad_norm": 1.986398469539905, "language_loss": 0.84139931, "learning_rate": 2.1589273626366377e-06, "loss": 0.918648, "num_input_tokens_seen": 175370445, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.13623047, "step": 8157, "time_per_iteration": 2.5990681648254395 }, { "auxiliary_loss_clip": 0.06447399, "auxiliary_loss_mlp": 0.01267794, "balance_loss_clip": 0.06281614, "balance_loss_mlp": 0.01255766, "epoch": 0.49048549526529384, "flos": 18959043421440.0, "grad_norm": 2.2806177908589795, "language_loss": 0.79984665, "learning_rate": 2.158539129514956e-06, "loss": 0.8769986, "num_input_tokens_seen": 175389020, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.12036133, "step": 8158, "time_per_iteration": 2.5918662548065186 }, { "auxiliary_loss_clip": 0.06456901, "auxiliary_loss_mlp": 0.01272437, "balance_loss_clip": 0.06287574, "balance_loss_mlp": 0.01259515, "epoch": 0.4905456185179618, "flos": 26913633477120.0, "grad_norm": 1.4202125215696104, "language_loss": 0.6947521, "learning_rate": 2.158150890381454e-06, "loss": 0.77204543, "num_input_tokens_seen": 175409545, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.12927246, "step": 8159, "time_per_iteration": 2.631063938140869 }, { "auxiliary_loss_clip": 0.06446815, "auxiliary_loss_mlp": 0.01269553, "balance_loss_clip": 0.06283845, "balance_loss_mlp": 0.01257054, "epoch": 0.49060574177062977, "flos": 20418567765120.0, "grad_norm": 1.8081227066111234, "language_loss": 0.73683369, "learning_rate": 2.157762645250854e-06, "loss": 0.81399739, "num_input_tokens_seen": 175429335, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.125, "step": 8160, "time_per_iteration": 2.608261823654175 }, { "auxiliary_loss_clip": 0.06454918, "auxiliary_loss_mlp": 0.01272633, "balance_loss_clip": 0.0628434, "balance_loss_mlp": 0.01258996, "epoch": 0.4906658650232978, "flos": 17499477150720.0, "grad_norm": 1.8112364512783488, "language_loss": 0.71926534, "learning_rate": 2.1573743941378796e-06, "loss": 0.79654092, "num_input_tokens_seen": 175446955, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.13635254, "step": 8161, "time_per_iteration": 5.44926118850708 }, { "auxiliary_loss_clip": 0.06451005, "auxiliary_loss_mlp": 0.01269596, "balance_loss_clip": 0.06286525, "balance_loss_mlp": 0.01257562, "epoch": 0.49072598827596575, "flos": 26621619598080.0, "grad_norm": 1.5243061110090765, "language_loss": 0.68625331, "learning_rate": 2.1569861370572517e-06, "loss": 0.76345932, "num_input_tokens_seen": 175468195, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.12042236, "step": 8162, "time_per_iteration": 2.5770976543426514 }, { "auxiliary_loss_clip": 0.06456868, "auxiliary_loss_mlp": 0.01269533, "balance_loss_clip": 0.06285796, "balance_loss_mlp": 0.0125583, "epoch": 0.4907861115286337, "flos": 20418861254400.0, "grad_norm": 1.7102591135303715, "language_loss": 0.63920987, "learning_rate": 2.1565978740236944e-06, "loss": 0.71647382, "num_input_tokens_seen": 175487455, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.13696289, "step": 8163, "time_per_iteration": 2.6000025272369385 }, { "auxiliary_loss_clip": 0.06447328, "auxiliary_loss_mlp": 0.01271428, "balance_loss_clip": 0.062883, "balance_loss_mlp": 0.01259287, "epoch": 0.4908462347813017, "flos": 14069508992640.0, "grad_norm": 1.9988348897561217, "language_loss": 0.77607131, "learning_rate": 2.1562096050519293e-06, "loss": 0.85325891, "num_input_tokens_seen": 175504450, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.12158203, "step": 8164, "time_per_iteration": 2.5361154079437256 }, { "auxiliary_loss_clip": 0.06459525, "auxiliary_loss_mlp": 0.01271292, "balance_loss_clip": 0.06291191, "balance_loss_mlp": 0.01258143, "epoch": 0.49090635803396965, "flos": 18741227932800.0, "grad_norm": 1.628901969472021, "language_loss": 0.76731747, "learning_rate": 2.1558213301566806e-06, "loss": 0.84462565, "num_input_tokens_seen": 175523600, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.13140869, "step": 8165, "time_per_iteration": 2.6483304500579834 }, { "auxiliary_loss_clip": 0.06446182, "auxiliary_loss_mlp": 0.01270245, "balance_loss_clip": 0.06283817, "balance_loss_mlp": 0.01256995, "epoch": 0.4909664812866376, "flos": 20564784339840.0, "grad_norm": 1.7343623227452598, "language_loss": 0.77534419, "learning_rate": 2.1554330493526716e-06, "loss": 0.85250843, "num_input_tokens_seen": 175542720, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.13244629, "step": 8166, "time_per_iteration": 2.5950331687927246 }, { "auxiliary_loss_clip": 0.06341459, "auxiliary_loss_mlp": 0.01256695, "balance_loss_clip": 0.06274459, "balance_loss_mlp": 0.01254513, "epoch": 0.4910266045393056, "flos": 54704006622720.0, "grad_norm": 0.7829847228197975, "language_loss": 0.54216814, "learning_rate": 2.1550447626546253e-06, "loss": 0.61814964, "num_input_tokens_seen": 175598640, "router_z_loss_clip": 0.67138672, "router_z_loss_mlp": 0.02183533, "step": 8167, "time_per_iteration": 3.2561376094818115 }, { "auxiliary_loss_clip": 0.06447186, "auxiliary_loss_mlp": 0.01269981, "balance_loss_clip": 0.06284214, "balance_loss_mlp": 0.01257828, "epoch": 0.49108672779197354, "flos": 16250892261120.0, "grad_norm": 1.709883589316082, "language_loss": 0.85975182, "learning_rate": 2.1546564700772665e-06, "loss": 0.9369235, "num_input_tokens_seen": 175615675, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.12164307, "step": 8168, "time_per_iteration": 2.5842645168304443 }, { "auxiliary_loss_clip": 0.06451066, "auxiliary_loss_mlp": 0.01272471, "balance_loss_clip": 0.06289662, "balance_loss_mlp": 0.01259936, "epoch": 0.4911468510446415, "flos": 19831018135680.0, "grad_norm": 1.6513239446653458, "language_loss": 0.73532712, "learning_rate": 2.1542681716353193e-06, "loss": 0.81256247, "num_input_tokens_seen": 175632255, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.12530518, "step": 8169, "time_per_iteration": 2.581801652908325 }, { "auxiliary_loss_clip": 0.06447191, "auxiliary_loss_mlp": 0.01268273, "balance_loss_clip": 0.06284681, "balance_loss_mlp": 0.01255631, "epoch": 0.4912069742973095, "flos": 21218650076160.0, "grad_norm": 1.40116968723525, "language_loss": 0.78306592, "learning_rate": 2.1538798673435068e-06, "loss": 0.86022055, "num_input_tokens_seen": 175651625, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.12646484, "step": 8170, "time_per_iteration": 2.6297659873962402 }, { "auxiliary_loss_clip": 0.06463977, "auxiliary_loss_mlp": 0.0126743, "balance_loss_clip": 0.06297611, "balance_loss_mlp": 0.012548, "epoch": 0.49126709754997744, "flos": 19543280814720.0, "grad_norm": 2.032876318310132, "language_loss": 0.76030284, "learning_rate": 2.1534915572165545e-06, "loss": 0.83761692, "num_input_tokens_seen": 175669265, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.12640381, "step": 8171, "time_per_iteration": 2.592982292175293 }, { "auxiliary_loss_clip": 0.064589, "auxiliary_loss_mlp": 0.01269546, "balance_loss_clip": 0.06290632, "balance_loss_mlp": 0.01257762, "epoch": 0.4913272208026454, "flos": 12244568993280.0, "grad_norm": 1.812498373756898, "language_loss": 0.81850141, "learning_rate": 2.1531032412691875e-06, "loss": 0.89578581, "num_input_tokens_seen": 175686065, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.11779785, "step": 8172, "time_per_iteration": 2.5944864749908447 }, { "auxiliary_loss_clip": 0.06336901, "auxiliary_loss_mlp": 0.01258902, "balance_loss_clip": 0.06270619, "balance_loss_mlp": 0.01256787, "epoch": 0.49138734405531337, "flos": 65484663661440.0, "grad_norm": 0.6712533698778982, "language_loss": 0.53275895, "learning_rate": 2.1527149195161295e-06, "loss": 0.60871696, "num_input_tokens_seen": 175748595, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 0.02116394, "step": 8173, "time_per_iteration": 3.206261157989502 }, { "auxiliary_loss_clip": 0.06450303, "auxiliary_loss_mlp": 0.01266639, "balance_loss_clip": 0.06283663, "balance_loss_mlp": 0.0125352, "epoch": 0.4914474673079814, "flos": 18444434371200.0, "grad_norm": 1.8308902903765973, "language_loss": 0.63785988, "learning_rate": 2.152326591972107e-06, "loss": 0.7150293, "num_input_tokens_seen": 175766770, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.13122559, "step": 8174, "time_per_iteration": 2.5598788261413574 }, { "auxiliary_loss_clip": 0.06449711, "auxiliary_loss_mlp": 0.01273132, "balance_loss_clip": 0.0628342, "balance_loss_mlp": 0.01261104, "epoch": 0.49150759056064935, "flos": 21690772306560.0, "grad_norm": 1.7146563025920125, "language_loss": 0.69690847, "learning_rate": 2.1519382586518445e-06, "loss": 0.77413696, "num_input_tokens_seen": 175783605, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.12030029, "step": 8175, "time_per_iteration": 2.5619888305664062 }, { "auxiliary_loss_clip": 0.06451221, "auxiliary_loss_mlp": 0.01270977, "balance_loss_clip": 0.06287324, "balance_loss_mlp": 0.01258711, "epoch": 0.4915677138133173, "flos": 22388969652480.0, "grad_norm": 1.7327624600603024, "language_loss": 0.74693763, "learning_rate": 2.151549919570068e-06, "loss": 0.82415962, "num_input_tokens_seen": 175801390, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.12255859, "step": 8176, "time_per_iteration": 2.586760997772217 }, { "auxiliary_loss_clip": 0.06449977, "auxiliary_loss_mlp": 0.01274709, "balance_loss_clip": 0.0628386, "balance_loss_mlp": 0.01261703, "epoch": 0.4916278370659853, "flos": 18408320461440.0, "grad_norm": 1.8520043345563981, "language_loss": 0.7026757, "learning_rate": 2.1511615747415036e-06, "loss": 0.7799226, "num_input_tokens_seen": 175819830, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.12994385, "step": 8177, "time_per_iteration": 2.598055839538574 }, { "auxiliary_loss_clip": 0.06332683, "auxiliary_loss_mlp": 0.01258102, "balance_loss_clip": 0.06265609, "balance_loss_mlp": 0.01256089, "epoch": 0.49168796031865325, "flos": 66630147701760.0, "grad_norm": 0.6730173839732073, "language_loss": 0.46049625, "learning_rate": 2.150773224180877e-06, "loss": 0.53640413, "num_input_tokens_seen": 175881765, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 0.02012634, "step": 8178, "time_per_iteration": 3.2296810150146484 }, { "auxiliary_loss_clip": 0.0645413, "auxiliary_loss_mlp": 0.01271823, "balance_loss_clip": 0.06284493, "balance_loss_mlp": 0.01259091, "epoch": 0.4917480835713212, "flos": 20965601145600.0, "grad_norm": 4.394227417702771, "language_loss": 0.65982401, "learning_rate": 2.1503848679029147e-06, "loss": 0.73708355, "num_input_tokens_seen": 175901795, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.1272583, "step": 8179, "time_per_iteration": 2.627737522125244 }, { "auxiliary_loss_clip": 0.0645963, "auxiliary_loss_mlp": 0.01271049, "balance_loss_clip": 0.06288368, "balance_loss_mlp": 0.01258305, "epoch": 0.4918082068239892, "flos": 15777386438400.0, "grad_norm": 1.9117540767583219, "language_loss": 0.70594501, "learning_rate": 2.149996505922343e-06, "loss": 0.78325176, "num_input_tokens_seen": 175917770, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.12738037, "step": 8180, "time_per_iteration": 2.543726682662964 }, { "auxiliary_loss_clip": 0.06448267, "auxiliary_loss_mlp": 0.0126727, "balance_loss_clip": 0.06285825, "balance_loss_mlp": 0.01254598, "epoch": 0.49186833007665715, "flos": 24611162659200.0, "grad_norm": 1.594608173921003, "language_loss": 0.8449477, "learning_rate": 2.1496081382538895e-06, "loss": 0.92210305, "num_input_tokens_seen": 175937000, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.12677002, "step": 8181, "time_per_iteration": 2.6720142364501953 }, { "auxiliary_loss_clip": 0.06444854, "auxiliary_loss_mlp": 0.01269551, "balance_loss_clip": 0.06286219, "balance_loss_mlp": 0.01257857, "epoch": 0.4919284533293251, "flos": 22097039627520.0, "grad_norm": 2.0918899258152543, "language_loss": 0.73701715, "learning_rate": 2.1492197649122793e-06, "loss": 0.81416124, "num_input_tokens_seen": 175955170, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.11700439, "step": 8182, "time_per_iteration": 2.725649118423462 }, { "auxiliary_loss_clip": 0.06447284, "auxiliary_loss_mlp": 0.01270722, "balance_loss_clip": 0.06283288, "balance_loss_mlp": 0.01259427, "epoch": 0.4919885765819931, "flos": 23374820465280.0, "grad_norm": 1.7510820508318352, "language_loss": 0.72846627, "learning_rate": 2.1488313859122412e-06, "loss": 0.80564636, "num_input_tokens_seen": 175973725, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.11291504, "step": 8183, "time_per_iteration": 2.7884762287139893 }, { "auxiliary_loss_clip": 0.06456946, "auxiliary_loss_mlp": 0.01268668, "balance_loss_clip": 0.06285174, "balance_loss_mlp": 0.01255465, "epoch": 0.49204869983466104, "flos": 21366795294720.0, "grad_norm": 1.7114791944452479, "language_loss": 0.77147985, "learning_rate": 2.1484430012685015e-06, "loss": 0.84873605, "num_input_tokens_seen": 175993885, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.13226318, "step": 8184, "time_per_iteration": 2.616798162460327 }, { "auxiliary_loss_clip": 0.06443354, "auxiliary_loss_mlp": 0.01268648, "balance_loss_clip": 0.06280085, "balance_loss_mlp": 0.01256501, "epoch": 0.492108823087329, "flos": 21149147514240.0, "grad_norm": 1.5419384550531574, "language_loss": 0.71285087, "learning_rate": 2.148054610995789e-06, "loss": 0.78997093, "num_input_tokens_seen": 176014210, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.12139893, "step": 8185, "time_per_iteration": 2.6431663036346436 }, { "auxiliary_loss_clip": 0.06450115, "auxiliary_loss_mlp": 0.01269134, "balance_loss_clip": 0.06279644, "balance_loss_mlp": 0.01255568, "epoch": 0.49216894633999697, "flos": 25123214160000.0, "grad_norm": 1.6943819882876052, "language_loss": 0.75406337, "learning_rate": 2.147666215108831e-06, "loss": 0.83125585, "num_input_tokens_seen": 176033890, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.13580322, "step": 8186, "time_per_iteration": 2.576763391494751 }, { "auxiliary_loss_clip": 0.06445079, "auxiliary_loss_mlp": 0.01269882, "balance_loss_clip": 0.06281722, "balance_loss_mlp": 0.01257413, "epoch": 0.49222906959266494, "flos": 22644534205440.0, "grad_norm": 2.3158916447740694, "language_loss": 0.68390536, "learning_rate": 2.1472778136223545e-06, "loss": 0.76105493, "num_input_tokens_seen": 176052720, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.12475586, "step": 8187, "time_per_iteration": 2.6395516395568848 }, { "auxiliary_loss_clip": 0.06443131, "auxiliary_loss_mlp": 0.01274697, "balance_loss_clip": 0.06280683, "balance_loss_mlp": 0.01262759, "epoch": 0.49228919284533296, "flos": 20416471413120.0, "grad_norm": 1.5807805892906004, "language_loss": 0.67400038, "learning_rate": 2.1468894065510894e-06, "loss": 0.75117862, "num_input_tokens_seen": 176072545, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.1194458, "step": 8188, "time_per_iteration": 2.544981002807617 }, { "auxiliary_loss_clip": 0.06445502, "auxiliary_loss_mlp": 0.0126813, "balance_loss_clip": 0.062806, "balance_loss_mlp": 0.01256578, "epoch": 0.4923493160980009, "flos": 27129142978560.0, "grad_norm": 2.0879305676902833, "language_loss": 0.74976671, "learning_rate": 2.1465009939097623e-06, "loss": 0.82690305, "num_input_tokens_seen": 176091490, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.11560059, "step": 8189, "time_per_iteration": 2.625689744949341 }, { "auxiliary_loss_clip": 0.06440623, "auxiliary_loss_mlp": 0.01269625, "balance_loss_clip": 0.06279151, "balance_loss_mlp": 0.01257758, "epoch": 0.4924094393506689, "flos": 35745522981120.0, "grad_norm": 1.610171828798239, "language_loss": 0.64661241, "learning_rate": 2.146112575713104e-06, "loss": 0.72371495, "num_input_tokens_seen": 176113200, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.11871338, "step": 8190, "time_per_iteration": 2.669403314590454 }, { "auxiliary_loss_clip": 0.06446043, "auxiliary_loss_mlp": 0.01269081, "balance_loss_clip": 0.06282856, "balance_loss_mlp": 0.01257405, "epoch": 0.49246956260333685, "flos": 20418735473280.0, "grad_norm": 1.8503109230911845, "language_loss": 0.72120947, "learning_rate": 2.1457241519758413e-06, "loss": 0.79836071, "num_input_tokens_seen": 176132485, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.11694336, "step": 8191, "time_per_iteration": 5.521266937255859 }, { "auxiliary_loss_clip": 0.06443261, "auxiliary_loss_mlp": 0.01266127, "balance_loss_clip": 0.06279002, "balance_loss_mlp": 0.01254569, "epoch": 0.4925296858560048, "flos": 38985152590080.0, "grad_norm": 1.536930388314393, "language_loss": 0.72536403, "learning_rate": 2.1453357227127043e-06, "loss": 0.80245781, "num_input_tokens_seen": 176155755, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.11560059, "step": 8192, "time_per_iteration": 2.7018649578094482 }, { "auxiliary_loss_clip": 0.06339139, "auxiliary_loss_mlp": 0.012523, "balance_loss_clip": 0.06273589, "balance_loss_mlp": 0.01250309, "epoch": 0.4925898091086728, "flos": 64300367652480.0, "grad_norm": 0.7431361348113638, "language_loss": 0.51956958, "learning_rate": 2.1449472879384224e-06, "loss": 0.5954839, "num_input_tokens_seen": 176216295, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 0.01989746, "step": 8193, "time_per_iteration": 3.242563009262085 }, { "auxiliary_loss_clip": 0.06438424, "auxiliary_loss_mlp": 0.0126963, "balance_loss_clip": 0.06277236, "balance_loss_mlp": 0.01257358, "epoch": 0.49264993236134075, "flos": 23042541899520.0, "grad_norm": 1.320558484575612, "language_loss": 0.77172828, "learning_rate": 2.1445588476677246e-06, "loss": 0.84880888, "num_input_tokens_seen": 176235925, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.1227417, "step": 8194, "time_per_iteration": 2.570016622543335 }, { "auxiliary_loss_clip": 0.06442003, "auxiliary_loss_mlp": 0.01267995, "balance_loss_clip": 0.06278548, "balance_loss_mlp": 0.01255574, "epoch": 0.4927100556140087, "flos": 24725248392960.0, "grad_norm": 4.16212499464594, "language_loss": 0.70543158, "learning_rate": 2.144170401915341e-06, "loss": 0.78253156, "num_input_tokens_seen": 176253865, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.12432861, "step": 8195, "time_per_iteration": 2.6679317951202393 }, { "auxiliary_loss_clip": 0.06446445, "auxiliary_loss_mlp": 0.0126663, "balance_loss_clip": 0.06282419, "balance_loss_mlp": 0.01254667, "epoch": 0.4927701788666767, "flos": 23510932623360.0, "grad_norm": 1.8531695673736648, "language_loss": 0.80838859, "learning_rate": 2.143781950696001e-06, "loss": 0.88551939, "num_input_tokens_seen": 176271525, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.11968994, "step": 8196, "time_per_iteration": 2.575066089630127 }, { "auxiliary_loss_clip": 0.06445368, "auxiliary_loss_mlp": 0.01269569, "balance_loss_clip": 0.0627778, "balance_loss_mlp": 0.01257505, "epoch": 0.49283030211934464, "flos": 22935374127360.0, "grad_norm": 2.4273071788035785, "language_loss": 0.7087267, "learning_rate": 2.1433934940244356e-06, "loss": 0.78587604, "num_input_tokens_seen": 176290810, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.12072754, "step": 8197, "time_per_iteration": 2.6557953357696533 }, { "auxiliary_loss_clip": 0.064418, "auxiliary_loss_mlp": 0.01266591, "balance_loss_clip": 0.06280428, "balance_loss_mlp": 0.01254836, "epoch": 0.4928904253720126, "flos": 16878622723200.0, "grad_norm": 2.324118068453479, "language_loss": 0.84709013, "learning_rate": 2.143005031915374e-06, "loss": 0.92417395, "num_input_tokens_seen": 176309165, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.11749268, "step": 8198, "time_per_iteration": 2.53755259513855 }, { "auxiliary_loss_clip": 0.06448607, "auxiliary_loss_mlp": 0.01267408, "balance_loss_clip": 0.06283607, "balance_loss_mlp": 0.01254515, "epoch": 0.4929505486246806, "flos": 14871855363840.0, "grad_norm": 1.821943982725454, "language_loss": 0.76367414, "learning_rate": 2.1426165643835467e-06, "loss": 0.84083426, "num_input_tokens_seen": 176324960, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.12896729, "step": 8199, "time_per_iteration": 2.6263089179992676 }, { "auxiliary_loss_clip": 0.06448039, "auxiliary_loss_mlp": 0.01267818, "balance_loss_clip": 0.06279425, "balance_loss_mlp": 0.01254258, "epoch": 0.49301067187734854, "flos": 23849206755840.0, "grad_norm": 1.7826256587243323, "language_loss": 0.60101992, "learning_rate": 2.1422280914436864e-06, "loss": 0.67817855, "num_input_tokens_seen": 176346195, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.13562012, "step": 8200, "time_per_iteration": 4.098783254623413 }, { "auxiliary_loss_clip": 0.06436133, "auxiliary_loss_mlp": 0.01270946, "balance_loss_clip": 0.06280077, "balance_loss_mlp": 0.01259478, "epoch": 0.49307079513001656, "flos": 22497730652160.0, "grad_norm": 1.3644742913790635, "language_loss": 0.79042625, "learning_rate": 2.1418396131105213e-06, "loss": 0.86749703, "num_input_tokens_seen": 176366735, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.11468506, "step": 8201, "time_per_iteration": 4.078781366348267 }, { "auxiliary_loss_clip": 0.0645811, "auxiliary_loss_mlp": 0.01273139, "balance_loss_clip": 0.06287332, "balance_loss_mlp": 0.01259537, "epoch": 0.4931309183826845, "flos": 15930059777280.0, "grad_norm": 2.0409508618244208, "language_loss": 0.68001711, "learning_rate": 2.141451129398785e-06, "loss": 0.75732958, "num_input_tokens_seen": 176384475, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.1361084, "step": 8202, "time_per_iteration": 2.563694477081299 }, { "auxiliary_loss_clip": 0.06441263, "auxiliary_loss_mlp": 0.01268913, "balance_loss_clip": 0.06278764, "balance_loss_mlp": 0.01256944, "epoch": 0.4931910416353525, "flos": 27316588561920.0, "grad_norm": 4.377689452107694, "language_loss": 0.758295, "learning_rate": 2.1410626403232076e-06, "loss": 0.83539677, "num_input_tokens_seen": 176402645, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.11981201, "step": 8203, "time_per_iteration": 2.682185649871826 }, { "auxiliary_loss_clip": 0.06450073, "auxiliary_loss_mlp": 0.01266507, "balance_loss_clip": 0.06285368, "balance_loss_mlp": 0.01254014, "epoch": 0.49325116488802045, "flos": 20811166871040.0, "grad_norm": 3.450446361335989, "language_loss": 0.80873942, "learning_rate": 2.1406741458985197e-06, "loss": 0.88590527, "num_input_tokens_seen": 176416715, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.125, "step": 8204, "time_per_iteration": 2.5401663780212402 }, { "auxiliary_loss_clip": 0.06444015, "auxiliary_loss_mlp": 0.01270347, "balance_loss_clip": 0.06281886, "balance_loss_mlp": 0.01258396, "epoch": 0.4933112881406884, "flos": 19872247144320.0, "grad_norm": 1.8344815754357664, "language_loss": 0.65816987, "learning_rate": 2.140285646139455e-06, "loss": 0.73531353, "num_input_tokens_seen": 176435755, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.11950684, "step": 8205, "time_per_iteration": 2.5907793045043945 }, { "auxiliary_loss_clip": 0.0645784, "auxiliary_loss_mlp": 0.01277214, "balance_loss_clip": 0.06286115, "balance_loss_mlp": 0.01262331, "epoch": 0.4933714113933564, "flos": 21833215447680.0, "grad_norm": 1.9246807876652174, "language_loss": 0.66596204, "learning_rate": 2.139897141060744e-06, "loss": 0.7433126, "num_input_tokens_seen": 176453915, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.14886475, "step": 8206, "time_per_iteration": 2.5714845657348633 }, { "auxiliary_loss_clip": 0.06443413, "auxiliary_loss_mlp": 0.01273661, "balance_loss_clip": 0.06279176, "balance_loss_mlp": 0.01261836, "epoch": 0.49343153464602435, "flos": 27897304083840.0, "grad_norm": 1.58155063655693, "language_loss": 0.76627028, "learning_rate": 2.1395086306771196e-06, "loss": 0.84344101, "num_input_tokens_seen": 176475175, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.11828613, "step": 8207, "time_per_iteration": 2.6754298210144043 }, { "auxiliary_loss_clip": 0.06449642, "auxiliary_loss_mlp": 0.01274135, "balance_loss_clip": 0.06286267, "balance_loss_mlp": 0.01261446, "epoch": 0.4934916578986923, "flos": 24688002453120.0, "grad_norm": 2.338316292275115, "language_loss": 0.60893422, "learning_rate": 2.1391201150033147e-06, "loss": 0.68617201, "num_input_tokens_seen": 176494250, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.12689209, "step": 8208, "time_per_iteration": 2.5943305492401123 }, { "auxiliary_loss_clip": 0.06443261, "auxiliary_loss_mlp": 0.0127421, "balance_loss_clip": 0.06280341, "balance_loss_mlp": 0.01261413, "epoch": 0.4935517811513603, "flos": 23412024478080.0, "grad_norm": 1.8141309762758242, "language_loss": 0.78677762, "learning_rate": 2.1387315940540598e-06, "loss": 0.86395234, "num_input_tokens_seen": 176513325, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.12786865, "step": 8209, "time_per_iteration": 2.638458490371704 }, { "auxiliary_loss_clip": 0.06443657, "auxiliary_loss_mlp": 0.01274281, "balance_loss_clip": 0.06281766, "balance_loss_mlp": 0.01261895, "epoch": 0.49361190440402825, "flos": 21950948833920.0, "grad_norm": 2.1229718483898528, "language_loss": 0.78862166, "learning_rate": 2.138343067844089e-06, "loss": 0.8658011, "num_input_tokens_seen": 176532915, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.12390137, "step": 8210, "time_per_iteration": 2.5645763874053955 }, { "auxiliary_loss_clip": 0.06453973, "auxiliary_loss_mlp": 0.01272853, "balance_loss_clip": 0.06285574, "balance_loss_mlp": 0.01260622, "epoch": 0.4936720276566962, "flos": 25122124056960.0, "grad_norm": 1.6913873819957495, "language_loss": 0.81444538, "learning_rate": 2.1379545363881363e-06, "loss": 0.89171362, "num_input_tokens_seen": 176552775, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.12225342, "step": 8211, "time_per_iteration": 2.622286558151245 }, { "auxiliary_loss_clip": 0.06446654, "auxiliary_loss_mlp": 0.01274657, "balance_loss_clip": 0.0628126, "balance_loss_mlp": 0.01262021, "epoch": 0.4937321509093642, "flos": 26366055045120.0, "grad_norm": 2.792396244113451, "language_loss": 0.91635364, "learning_rate": 2.137565999700933e-06, "loss": 0.99356675, "num_input_tokens_seen": 176572185, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.12646484, "step": 8212, "time_per_iteration": 2.584929943084717 }, { "auxiliary_loss_clip": 0.06448994, "auxiliary_loss_mlp": 0.01271463, "balance_loss_clip": 0.06285051, "balance_loss_mlp": 0.01259989, "epoch": 0.49379227416203214, "flos": 22967211479040.0, "grad_norm": 1.6320770810949043, "language_loss": 0.6523025, "learning_rate": 2.1371774577972138e-06, "loss": 0.72950697, "num_input_tokens_seen": 176591490, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.11480713, "step": 8213, "time_per_iteration": 2.653207302093506 }, { "auxiliary_loss_clip": 0.06445439, "auxiliary_loss_mlp": 0.01268705, "balance_loss_clip": 0.06281657, "balance_loss_mlp": 0.01256778, "epoch": 0.49385239741470016, "flos": 32497340256000.0, "grad_norm": 1.7732864322024413, "language_loss": 0.76258266, "learning_rate": 2.136788910691711e-06, "loss": 0.83972406, "num_input_tokens_seen": 176612715, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.1192627, "step": 8214, "time_per_iteration": 2.6438300609588623 }, { "auxiliary_loss_clip": 0.06444521, "auxiliary_loss_mlp": 0.01269178, "balance_loss_clip": 0.06280349, "balance_loss_mlp": 0.01256327, "epoch": 0.4939125206673681, "flos": 22499575441920.0, "grad_norm": 1.6081524699781313, "language_loss": 0.84547907, "learning_rate": 2.1364003583991594e-06, "loss": 0.922616, "num_input_tokens_seen": 176631950, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.12854004, "step": 8215, "time_per_iteration": 2.6662981510162354 }, { "auxiliary_loss_clip": 0.06434759, "auxiliary_loss_mlp": 0.01270917, "balance_loss_clip": 0.06278814, "balance_loss_mlp": 0.01259771, "epoch": 0.4939726439200361, "flos": 31184493684480.0, "grad_norm": 1.7883709357961097, "language_loss": 0.8386569, "learning_rate": 2.136011800934292e-06, "loss": 0.91571367, "num_input_tokens_seen": 176653060, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.1114502, "step": 8216, "time_per_iteration": 2.624309778213501 }, { "auxiliary_loss_clip": 0.0644132, "auxiliary_loss_mlp": 0.01271374, "balance_loss_clip": 0.06282341, "balance_loss_mlp": 0.01259358, "epoch": 0.49403276717270406, "flos": 22680773896320.0, "grad_norm": 1.4243727944601972, "language_loss": 0.75368059, "learning_rate": 2.1356232383118442e-06, "loss": 0.83080757, "num_input_tokens_seen": 176673895, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.12017822, "step": 8217, "time_per_iteration": 2.6219165325164795 }, { "auxiliary_loss_clip": 0.06439236, "auxiliary_loss_mlp": 0.01275103, "balance_loss_clip": 0.06280699, "balance_loss_mlp": 0.01261752, "epoch": 0.494092890425372, "flos": 20747408313600.0, "grad_norm": 1.6200880527969566, "language_loss": 0.78720307, "learning_rate": 2.1352346705465494e-06, "loss": 0.8643465, "num_input_tokens_seen": 176692550, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.13342285, "step": 8218, "time_per_iteration": 2.5505244731903076 }, { "auxiliary_loss_clip": 0.06435018, "auxiliary_loss_mlp": 0.01269624, "balance_loss_clip": 0.06276962, "balance_loss_mlp": 0.01258353, "epoch": 0.49415301367804, "flos": 18374889882240.0, "grad_norm": 2.005801760207512, "language_loss": 0.76820767, "learning_rate": 2.134846097653142e-06, "loss": 0.84525406, "num_input_tokens_seen": 176709335, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.1126709, "step": 8219, "time_per_iteration": 2.5665321350097656 }, { "auxiliary_loss_clip": 0.06441802, "auxiliary_loss_mlp": 0.01269791, "balance_loss_clip": 0.06279255, "balance_loss_mlp": 0.01258114, "epoch": 0.49421313693070795, "flos": 17536471528320.0, "grad_norm": 1.6575115219512664, "language_loss": 0.62995082, "learning_rate": 2.134457519646357e-06, "loss": 0.70706677, "num_input_tokens_seen": 176727715, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.11676025, "step": 8220, "time_per_iteration": 2.519775152206421 }, { "auxiliary_loss_clip": 0.06441128, "auxiliary_loss_mlp": 0.01269115, "balance_loss_clip": 0.06280006, "balance_loss_mlp": 0.01257712, "epoch": 0.4942732601833759, "flos": 20818210613760.0, "grad_norm": 2.8991568596257022, "language_loss": 0.7247979, "learning_rate": 2.1340689365409296e-06, "loss": 0.80190039, "num_input_tokens_seen": 176747530, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.11401367, "step": 8221, "time_per_iteration": 2.6023597717285156 }, { "auxiliary_loss_clip": 0.06444643, "auxiliary_loss_mlp": 0.01271406, "balance_loss_clip": 0.06285915, "balance_loss_mlp": 0.01260224, "epoch": 0.4943333834360439, "flos": 15054269702400.0, "grad_norm": 1.4820098462836395, "language_loss": 0.78990328, "learning_rate": 2.133680348351595e-06, "loss": 0.86706376, "num_input_tokens_seen": 176765260, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.11181641, "step": 8222, "time_per_iteration": 2.5383925437927246 }, { "auxiliary_loss_clip": 0.06442507, "auxiliary_loss_mlp": 0.01269435, "balance_loss_clip": 0.06280632, "balance_loss_mlp": 0.01256954, "epoch": 0.49439350668871185, "flos": 16075899008640.0, "grad_norm": 2.5813288663955345, "language_loss": 0.73021668, "learning_rate": 2.133291755093088e-06, "loss": 0.80733609, "num_input_tokens_seen": 176781770, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.12481689, "step": 8223, "time_per_iteration": 2.549431562423706 }, { "auxiliary_loss_clip": 0.06451255, "auxiliary_loss_mlp": 0.01269316, "balance_loss_clip": 0.06285559, "balance_loss_mlp": 0.01256698, "epoch": 0.4944536299413798, "flos": 20885281407360.0, "grad_norm": 1.5428365241318318, "language_loss": 0.7534802, "learning_rate": 2.132903156780144e-06, "loss": 0.83068585, "num_input_tokens_seen": 176800655, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.12634277, "step": 8224, "time_per_iteration": 2.582278251647949 }, { "auxiliary_loss_clip": 0.0644846, "auxiliary_loss_mlp": 0.01266836, "balance_loss_clip": 0.06285152, "balance_loss_mlp": 0.01254212, "epoch": 0.4945137531940478, "flos": 26615162833920.0, "grad_norm": 2.4195238224614157, "language_loss": 0.64036798, "learning_rate": 2.1325145534274997e-06, "loss": 0.71752089, "num_input_tokens_seen": 176820610, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.12634277, "step": 8225, "time_per_iteration": 2.6747069358825684 }, { "auxiliary_loss_clip": 0.06444663, "auxiliary_loss_mlp": 0.01267371, "balance_loss_clip": 0.06282713, "balance_loss_mlp": 0.01255855, "epoch": 0.49457387644671574, "flos": 23995004060160.0, "grad_norm": 2.382880216967814, "language_loss": 0.76710308, "learning_rate": 2.1321259450498893e-06, "loss": 0.8442235, "num_input_tokens_seen": 176840520, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.11529541, "step": 8226, "time_per_iteration": 2.6699624061584473 }, { "auxiliary_loss_clip": 0.06442855, "auxiliary_loss_mlp": 0.012703, "balance_loss_clip": 0.06276856, "balance_loss_mlp": 0.0125786, "epoch": 0.49463399969938376, "flos": 26983387601280.0, "grad_norm": 1.5958401672507192, "language_loss": 0.71771336, "learning_rate": 2.131737331662051e-06, "loss": 0.79484493, "num_input_tokens_seen": 176860265, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.12432861, "step": 8227, "time_per_iteration": 2.608288049697876 }, { "auxiliary_loss_clip": 0.06448309, "auxiliary_loss_mlp": 0.01266114, "balance_loss_clip": 0.06279238, "balance_loss_mlp": 0.01254128, "epoch": 0.49469412295205173, "flos": 29689610117760.0, "grad_norm": 1.4644051287625177, "language_loss": 0.71953696, "learning_rate": 2.131348713278718e-06, "loss": 0.79668123, "num_input_tokens_seen": 176882910, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.11993408, "step": 8228, "time_per_iteration": 2.73557448387146 }, { "auxiliary_loss_clip": 0.06437398, "auxiliary_loss_mlp": 0.01269352, "balance_loss_clip": 0.06279641, "balance_loss_mlp": 0.01258671, "epoch": 0.4947542462047197, "flos": 24138285742080.0, "grad_norm": 1.3750361350269427, "language_loss": 0.84190989, "learning_rate": 2.1309600899146304e-06, "loss": 0.91897738, "num_input_tokens_seen": 176903030, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.10687256, "step": 8229, "time_per_iteration": 2.740598678588867 }, { "auxiliary_loss_clip": 0.06453235, "auxiliary_loss_mlp": 0.01269495, "balance_loss_clip": 0.0628635, "balance_loss_mlp": 0.0125665, "epoch": 0.49481436945738766, "flos": 20050804195200.0, "grad_norm": 1.7087291604019532, "language_loss": 0.74778605, "learning_rate": 2.1305714615845227e-06, "loss": 0.8250134, "num_input_tokens_seen": 176919025, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.128479, "step": 8230, "time_per_iteration": 5.616842031478882 }, { "auxiliary_loss_clip": 0.06446044, "auxiliary_loss_mlp": 0.01270422, "balance_loss_clip": 0.06283382, "balance_loss_mlp": 0.01258417, "epoch": 0.4948744927100556, "flos": 15675040275840.0, "grad_norm": 2.07971419254203, "language_loss": 0.79876512, "learning_rate": 2.1301828283031314e-06, "loss": 0.87592977, "num_input_tokens_seen": 176937945, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.11999512, "step": 8231, "time_per_iteration": 2.610461711883545 }, { "auxiliary_loss_clip": 0.06354462, "auxiliary_loss_mlp": 0.01252798, "balance_loss_clip": 0.06287043, "balance_loss_mlp": 0.01250433, "epoch": 0.4949346159627236, "flos": 68893611644160.0, "grad_norm": 0.7609051913464466, "language_loss": 0.59972805, "learning_rate": 2.1297941900851944e-06, "loss": 0.67580056, "num_input_tokens_seen": 177004575, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 0.02360535, "step": 8232, "time_per_iteration": 3.278813362121582 }, { "auxiliary_loss_clip": 0.06450568, "auxiliary_loss_mlp": 0.01269054, "balance_loss_clip": 0.06281677, "balance_loss_mlp": 0.01256239, "epoch": 0.49499473921539155, "flos": 24797182723200.0, "grad_norm": 1.6713807765500968, "language_loss": 0.69345474, "learning_rate": 2.1294055469454496e-06, "loss": 0.77065086, "num_input_tokens_seen": 177024155, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.12811279, "step": 8233, "time_per_iteration": 2.595463991165161 }, { "auxiliary_loss_clip": 0.06437771, "auxiliary_loss_mlp": 0.01268614, "balance_loss_clip": 0.06278165, "balance_loss_mlp": 0.01256842, "epoch": 0.4950548624680595, "flos": 32716161993600.0, "grad_norm": 1.918198882326877, "language_loss": 0.6669364, "learning_rate": 2.129016898898633e-06, "loss": 0.7440002, "num_input_tokens_seen": 177046185, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.11779785, "step": 8234, "time_per_iteration": 2.6556484699249268 }, { "auxiliary_loss_clip": 0.06350002, "auxiliary_loss_mlp": 0.01251326, "balance_loss_clip": 0.062829, "balance_loss_mlp": 0.01249062, "epoch": 0.4951149857207275, "flos": 50100616287360.0, "grad_norm": 0.7968748441608003, "language_loss": 0.57998246, "learning_rate": 2.128628245959482e-06, "loss": 0.65599573, "num_input_tokens_seen": 177099025, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 0.0226593, "step": 8235, "time_per_iteration": 3.1278603076934814 }, { "auxiliary_loss_clip": 0.06446802, "auxiliary_loss_mlp": 0.01271804, "balance_loss_clip": 0.06281313, "balance_loss_mlp": 0.01258643, "epoch": 0.49517510897339545, "flos": 22243340056320.0, "grad_norm": 1.5713425347690133, "language_loss": 0.77184689, "learning_rate": 2.1282395881427355e-06, "loss": 0.84903294, "num_input_tokens_seen": 177118365, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.13165283, "step": 8236, "time_per_iteration": 2.594543218612671 }, { "auxiliary_loss_clip": 0.06439775, "auxiliary_loss_mlp": 0.01267563, "balance_loss_clip": 0.06279109, "balance_loss_mlp": 0.01256274, "epoch": 0.4952352322260634, "flos": 25381126627200.0, "grad_norm": 1.8947768004073442, "language_loss": 0.7281664, "learning_rate": 2.1278509254631315e-06, "loss": 0.8052398, "num_input_tokens_seen": 177136415, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.112854, "step": 8237, "time_per_iteration": 2.571279525756836 }, { "auxiliary_loss_clip": 0.06445383, "auxiliary_loss_mlp": 0.01270307, "balance_loss_clip": 0.06286585, "balance_loss_mlp": 0.01258231, "epoch": 0.4952953554787314, "flos": 24615732706560.0, "grad_norm": 1.7666846178952247, "language_loss": 0.75847119, "learning_rate": 2.127462257935406e-06, "loss": 0.83562809, "num_input_tokens_seen": 177155690, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.12072754, "step": 8238, "time_per_iteration": 2.6290714740753174 }, { "auxiliary_loss_clip": 0.06445667, "auxiliary_loss_mlp": 0.01272156, "balance_loss_clip": 0.0628233, "balance_loss_mlp": 0.01259663, "epoch": 0.49535547873139935, "flos": 17317020885120.0, "grad_norm": 2.236192941645426, "language_loss": 0.74634457, "learning_rate": 2.1270735855743008e-06, "loss": 0.82352281, "num_input_tokens_seen": 177173350, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.12506104, "step": 8239, "time_per_iteration": 2.5176444053649902 }, { "auxiliary_loss_clip": 0.0644621, "auxiliary_loss_mlp": 0.01271333, "balance_loss_clip": 0.06279828, "balance_loss_mlp": 0.01258113, "epoch": 0.4954156019840673, "flos": 20746527845760.0, "grad_norm": 3.000479946335793, "language_loss": 0.78992593, "learning_rate": 2.126684908394552e-06, "loss": 0.86710137, "num_input_tokens_seen": 177191115, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.13220215, "step": 8240, "time_per_iteration": 4.100542783737183 }, { "auxiliary_loss_clip": 0.06437767, "auxiliary_loss_mlp": 0.01265091, "balance_loss_clip": 0.06279512, "balance_loss_mlp": 0.0125429, "epoch": 0.49547572523673533, "flos": 12825200661120.0, "grad_norm": 1.9749053398323948, "language_loss": 0.85719883, "learning_rate": 2.126296226410898e-06, "loss": 0.93422735, "num_input_tokens_seen": 177206155, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.10803223, "step": 8241, "time_per_iteration": 2.564497709274292 }, { "auxiliary_loss_clip": 0.06436998, "auxiliary_loss_mlp": 0.01272053, "balance_loss_clip": 0.06280151, "balance_loss_mlp": 0.01261188, "epoch": 0.4955358484894033, "flos": 15602602821120.0, "grad_norm": 1.6238347757974487, "language_loss": 0.77654904, "learning_rate": 2.1259075396380794e-06, "loss": 0.85363954, "num_input_tokens_seen": 177224815, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.10858154, "step": 8242, "time_per_iteration": 2.5761117935180664 }, { "auxiliary_loss_clip": 0.06450841, "auxiliary_loss_mlp": 0.01268997, "balance_loss_clip": 0.06290142, "balance_loss_mlp": 0.01256599, "epoch": 0.49559597174207126, "flos": 26470832976000.0, "grad_norm": 1.754376634176094, "language_loss": 0.67523378, "learning_rate": 2.125518848090833e-06, "loss": 0.75243211, "num_input_tokens_seen": 177244490, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.1239624, "step": 8243, "time_per_iteration": 2.7052934169769287 }, { "auxiliary_loss_clip": 0.0644513, "auxiliary_loss_mlp": 0.01267361, "balance_loss_clip": 0.06283817, "balance_loss_mlp": 0.01254885, "epoch": 0.4956560949947392, "flos": 23154824770560.0, "grad_norm": 1.5674612683025178, "language_loss": 0.68224537, "learning_rate": 2.125130151783901e-06, "loss": 0.75937027, "num_input_tokens_seen": 177264340, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.12481689, "step": 8244, "time_per_iteration": 2.5776991844177246 }, { "auxiliary_loss_clip": 0.06447188, "auxiliary_loss_mlp": 0.01268455, "balance_loss_clip": 0.06283464, "balance_loss_mlp": 0.01256862, "epoch": 0.4957162182474072, "flos": 20779119884160.0, "grad_norm": 1.9063137599792634, "language_loss": 0.75352645, "learning_rate": 2.12474145073202e-06, "loss": 0.83068287, "num_input_tokens_seen": 177283055, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.1159668, "step": 8245, "time_per_iteration": 2.605079412460327 }, { "auxiliary_loss_clip": 0.06444199, "auxiliary_loss_mlp": 0.01266457, "balance_loss_clip": 0.06287244, "balance_loss_mlp": 0.01255048, "epoch": 0.49577634150007516, "flos": 18740179756800.0, "grad_norm": 1.8090300219077475, "language_loss": 0.81862354, "learning_rate": 2.1243527449499306e-06, "loss": 0.89573008, "num_input_tokens_seen": 177301140, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.11407471, "step": 8246, "time_per_iteration": 2.5392913818359375 }, { "auxiliary_loss_clip": 0.06452991, "auxiliary_loss_mlp": 0.01272123, "balance_loss_clip": 0.06287003, "balance_loss_mlp": 0.01260148, "epoch": 0.4958364647527431, "flos": 25560815708160.0, "grad_norm": 1.5563633052792076, "language_loss": 0.8466652, "learning_rate": 2.1239640344523733e-06, "loss": 0.92391634, "num_input_tokens_seen": 177323095, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.11962891, "step": 8247, "time_per_iteration": 2.8070287704467773 }, { "auxiliary_loss_clip": 0.06450739, "auxiliary_loss_mlp": 0.01268868, "balance_loss_clip": 0.06287529, "balance_loss_mlp": 0.01257478, "epoch": 0.4958965880054111, "flos": 24432144410880.0, "grad_norm": 1.7984700829985811, "language_loss": 0.84088159, "learning_rate": 2.123575319254087e-06, "loss": 0.91807771, "num_input_tokens_seen": 177339845, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.11376953, "step": 8248, "time_per_iteration": 2.620465040206909 }, { "auxiliary_loss_clip": 0.06448782, "auxiliary_loss_mlp": 0.01266159, "balance_loss_clip": 0.0628476, "balance_loss_mlp": 0.01254172, "epoch": 0.49595671125807905, "flos": 25090622121600.0, "grad_norm": 1.7361897383354963, "language_loss": 0.7363584, "learning_rate": 2.123186599369812e-06, "loss": 0.8135078, "num_input_tokens_seen": 177359980, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.11987305, "step": 8249, "time_per_iteration": 2.663966655731201 }, { "auxiliary_loss_clip": 0.06453176, "auxiliary_loss_mlp": 0.01274038, "balance_loss_clip": 0.06285664, "balance_loss_mlp": 0.01261688, "epoch": 0.496016834510747, "flos": 16441524299520.0, "grad_norm": 1.6687335268153072, "language_loss": 0.76148862, "learning_rate": 2.122797874814289e-06, "loss": 0.83876073, "num_input_tokens_seen": 177378580, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.12347412, "step": 8250, "time_per_iteration": 2.5334408283233643 }, { "auxiliary_loss_clip": 0.06454585, "auxiliary_loss_mlp": 0.01274246, "balance_loss_clip": 0.06289129, "balance_loss_mlp": 0.01262218, "epoch": 0.496076957763415, "flos": 23444197246080.0, "grad_norm": 1.5930278119832726, "language_loss": 0.70173174, "learning_rate": 2.1224091456022585e-06, "loss": 0.77902007, "num_input_tokens_seen": 177398790, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.12023926, "step": 8251, "time_per_iteration": 2.660249710083008 }, { "auxiliary_loss_clip": 0.06451754, "auxiliary_loss_mlp": 0.01269281, "balance_loss_clip": 0.06289621, "balance_loss_mlp": 0.01257903, "epoch": 0.49613708101608295, "flos": 16915113976320.0, "grad_norm": 1.7056267141039756, "language_loss": 0.8028037, "learning_rate": 2.122020411748461e-06, "loss": 0.88001406, "num_input_tokens_seen": 177416515, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.11376953, "step": 8252, "time_per_iteration": 2.5219178199768066 }, { "auxiliary_loss_clip": 0.06447089, "auxiliary_loss_mlp": 0.01267129, "balance_loss_clip": 0.06284762, "balance_loss_mlp": 0.01252383, "epoch": 0.4961972042687509, "flos": 16623729002880.0, "grad_norm": 1.746910577352744, "language_loss": 0.81288189, "learning_rate": 2.1216316732676363e-06, "loss": 0.89002407, "num_input_tokens_seen": 177434425, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.1473999, "step": 8253, "time_per_iteration": 2.6024115085601807 }, { "auxiliary_loss_clip": 0.06442015, "auxiliary_loss_mlp": 0.01264266, "balance_loss_clip": 0.06281624, "balance_loss_mlp": 0.01253174, "epoch": 0.49625732752141893, "flos": 28965529059840.0, "grad_norm": 1.4816973338484436, "language_loss": 0.67567086, "learning_rate": 2.1212429301745275e-06, "loss": 0.75273359, "num_input_tokens_seen": 177459675, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.11096191, "step": 8254, "time_per_iteration": 2.6719765663146973 }, { "auxiliary_loss_clip": 0.06447396, "auxiliary_loss_mlp": 0.01273616, "balance_loss_clip": 0.06282286, "balance_loss_mlp": 0.0126061, "epoch": 0.4963174507740869, "flos": 23119046277120.0, "grad_norm": 1.7816677970577883, "language_loss": 0.74357092, "learning_rate": 2.1208541824838743e-06, "loss": 0.82078099, "num_input_tokens_seen": 177478895, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.13006592, "step": 8255, "time_per_iteration": 2.595820665359497 }, { "auxiliary_loss_clip": 0.06446393, "auxiliary_loss_mlp": 0.01268738, "balance_loss_clip": 0.06285763, "balance_loss_mlp": 0.0125699, "epoch": 0.49637757402675486, "flos": 13922998928640.0, "grad_norm": 1.6950273773428328, "language_loss": 0.81935227, "learning_rate": 2.1204654302104183e-06, "loss": 0.89650363, "num_input_tokens_seen": 177494920, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.11749268, "step": 8256, "time_per_iteration": 2.538367748260498 }, { "auxiliary_loss_clip": 0.06443399, "auxiliary_loss_mlp": 0.01267616, "balance_loss_clip": 0.06282871, "balance_loss_mlp": 0.01256417, "epoch": 0.49643769727942283, "flos": 22315442094720.0, "grad_norm": 11.7570487329384, "language_loss": 0.81399357, "learning_rate": 2.120076673368901e-06, "loss": 0.89110368, "num_input_tokens_seen": 177515455, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.11193848, "step": 8257, "time_per_iteration": 2.631290912628174 }, { "auxiliary_loss_clip": 0.06454295, "auxiliary_loss_mlp": 0.01268716, "balance_loss_clip": 0.06284265, "balance_loss_mlp": 0.01256426, "epoch": 0.4964978205320908, "flos": 19506328364160.0, "grad_norm": 2.4414675437541145, "language_loss": 0.66535223, "learning_rate": 2.1196879119740647e-06, "loss": 0.74258232, "num_input_tokens_seen": 177534040, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.12298584, "step": 8258, "time_per_iteration": 2.5475401878356934 }, { "auxiliary_loss_clip": 0.0643867, "auxiliary_loss_mlp": 0.01269043, "balance_loss_clip": 0.06280065, "balance_loss_mlp": 0.01258189, "epoch": 0.49655794378475876, "flos": 23442562091520.0, "grad_norm": 1.371500208339238, "language_loss": 0.7788375, "learning_rate": 2.1192991460406502e-06, "loss": 0.85591459, "num_input_tokens_seen": 177554510, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.10852051, "step": 8259, "time_per_iteration": 2.6400790214538574 }, { "auxiliary_loss_clip": 0.06445095, "auxiliary_loss_mlp": 0.01267701, "balance_loss_clip": 0.06284744, "balance_loss_mlp": 0.01255393, "epoch": 0.4966180670374267, "flos": 26837967640320.0, "grad_norm": 1.8560593658675055, "language_loss": 0.7869482, "learning_rate": 2.1189103755834e-06, "loss": 0.86407614, "num_input_tokens_seen": 177575780, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.12316895, "step": 8260, "time_per_iteration": 2.5913450717926025 }, { "auxiliary_loss_clip": 0.06450275, "auxiliary_loss_mlp": 0.01269445, "balance_loss_clip": 0.06286341, "balance_loss_mlp": 0.01257029, "epoch": 0.4966781902900947, "flos": 22014413902080.0, "grad_norm": 2.7522825884099547, "language_loss": 0.77568758, "learning_rate": 2.1185216006170573e-06, "loss": 0.85288483, "num_input_tokens_seen": 177588965, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.12420654, "step": 8261, "time_per_iteration": 2.6029837131500244 }, { "auxiliary_loss_clip": 0.06440946, "auxiliary_loss_mlp": 0.01266466, "balance_loss_clip": 0.06282788, "balance_loss_mlp": 0.01254867, "epoch": 0.49673831354276266, "flos": 26220509303040.0, "grad_norm": 1.6983681673017883, "language_loss": 0.89926863, "learning_rate": 2.1181328211563627e-06, "loss": 0.97634286, "num_input_tokens_seen": 177608425, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.11584473, "step": 8262, "time_per_iteration": 2.617502212524414 }, { "auxiliary_loss_clip": 0.06444822, "auxiliary_loss_mlp": 0.01272107, "balance_loss_clip": 0.06286357, "balance_loss_mlp": 0.01260669, "epoch": 0.4967984367954306, "flos": 23188464984960.0, "grad_norm": 1.470581474669343, "language_loss": 0.7400403, "learning_rate": 2.11774403721606e-06, "loss": 0.8172096, "num_input_tokens_seen": 177628240, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.11431885, "step": 8263, "time_per_iteration": 2.609636068344116 }, { "auxiliary_loss_clip": 0.06452563, "auxiliary_loss_mlp": 0.01269124, "balance_loss_clip": 0.06286347, "balance_loss_mlp": 0.01255075, "epoch": 0.4968585600480986, "flos": 19287506626560.0, "grad_norm": 1.850076632265977, "language_loss": 0.69736731, "learning_rate": 2.1173552488108923e-06, "loss": 0.77458417, "num_input_tokens_seen": 177645920, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.140625, "step": 8264, "time_per_iteration": 2.6069533824920654 }, { "auxiliary_loss_clip": 0.06450147, "auxiliary_loss_mlp": 0.01267533, "balance_loss_clip": 0.06284212, "balance_loss_mlp": 0.01255231, "epoch": 0.49691868330076655, "flos": 22535312008320.0, "grad_norm": 1.3721792462912832, "language_loss": 0.64890647, "learning_rate": 2.1169664559556007e-06, "loss": 0.72608328, "num_input_tokens_seen": 177667185, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.12310791, "step": 8265, "time_per_iteration": 2.5945827960968018 }, { "auxiliary_loss_clip": 0.06354019, "auxiliary_loss_mlp": 0.01271123, "balance_loss_clip": 0.0628791, "balance_loss_mlp": 0.01268857, "epoch": 0.4969788065534345, "flos": 66598897328640.0, "grad_norm": 0.9258396995995472, "language_loss": 0.5335694, "learning_rate": 2.1165776586649304e-06, "loss": 0.60982084, "num_input_tokens_seen": 177733020, "router_z_loss_clip": 0.66015625, "router_z_loss_mlp": 0.02267456, "step": 8266, "time_per_iteration": 3.241305351257324 }, { "auxiliary_loss_clip": 0.06437363, "auxiliary_loss_mlp": 0.01269398, "balance_loss_clip": 0.06281623, "balance_loss_mlp": 0.0125737, "epoch": 0.49703892980610254, "flos": 24066099849600.0, "grad_norm": 1.4988010461113968, "language_loss": 0.79878008, "learning_rate": 2.1161888569536223e-06, "loss": 0.8758477, "num_input_tokens_seen": 177753370, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.12036133, "step": 8267, "time_per_iteration": 2.5706799030303955 }, { "auxiliary_loss_clip": 0.06444392, "auxiliary_loss_mlp": 0.01269645, "balance_loss_clip": 0.06281169, "balance_loss_mlp": 0.01256562, "epoch": 0.4970990530587705, "flos": 29132807736960.0, "grad_norm": 2.118054176267662, "language_loss": 0.7477529, "learning_rate": 2.1158000508364223e-06, "loss": 0.82489324, "num_input_tokens_seen": 177771530, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.13092041, "step": 8268, "time_per_iteration": 2.6929666996002197 }, { "auxiliary_loss_clip": 0.06444591, "auxiliary_loss_mlp": 0.01267323, "balance_loss_clip": 0.06282458, "balance_loss_mlp": 0.01254389, "epoch": 0.49715917631143847, "flos": 46036811047680.0, "grad_norm": 1.473144409474461, "language_loss": 0.67966127, "learning_rate": 2.115411240328073e-06, "loss": 0.75678039, "num_input_tokens_seen": 177796355, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.1293335, "step": 8269, "time_per_iteration": 2.791630506515503 }, { "auxiliary_loss_clip": 0.06441347, "auxiliary_loss_mlp": 0.01266052, "balance_loss_clip": 0.0628399, "balance_loss_mlp": 0.01254119, "epoch": 0.49721929956410643, "flos": 20197104624000.0, "grad_norm": 1.478800481839805, "language_loss": 0.85943389, "learning_rate": 2.1150224254433167e-06, "loss": 0.93650782, "num_input_tokens_seen": 177814300, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.11938477, "step": 8270, "time_per_iteration": 5.476109266281128 }, { "auxiliary_loss_clip": 0.06448052, "auxiliary_loss_mlp": 0.01270528, "balance_loss_clip": 0.06282777, "balance_loss_mlp": 0.01259472, "epoch": 0.4972794228167744, "flos": 21660108912000.0, "grad_norm": 1.6447525043653792, "language_loss": 0.71276623, "learning_rate": 2.114633606196899e-06, "loss": 0.78995204, "num_input_tokens_seen": 177833615, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.1105957, "step": 8271, "time_per_iteration": 2.635180950164795 }, { "auxiliary_loss_clip": 0.06448573, "auxiliary_loss_mlp": 0.0126757, "balance_loss_clip": 0.06286136, "balance_loss_mlp": 0.01254559, "epoch": 0.49733954606944236, "flos": 24286598668800.0, "grad_norm": 1.2770701997382277, "language_loss": 0.78506827, "learning_rate": 2.1142447826035635e-06, "loss": 0.86222965, "num_input_tokens_seen": 177855315, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.13031006, "step": 8272, "time_per_iteration": 2.5978503227233887 }, { "auxiliary_loss_clip": 0.06448995, "auxiliary_loss_mlp": 0.01266473, "balance_loss_clip": 0.06287558, "balance_loss_mlp": 0.01253879, "epoch": 0.4973996693221103, "flos": 37861722172800.0, "grad_norm": 1.6675819438788257, "language_loss": 0.66650128, "learning_rate": 2.1138559546780544e-06, "loss": 0.74365592, "num_input_tokens_seen": 177875590, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.1260376, "step": 8273, "time_per_iteration": 2.777271032333374 }, { "auxiliary_loss_clip": 0.06447235, "auxiliary_loss_mlp": 0.01267848, "balance_loss_clip": 0.06287019, "balance_loss_mlp": 0.01256261, "epoch": 0.4974597925747783, "flos": 21367885397760.0, "grad_norm": 1.697547238870654, "language_loss": 0.78558767, "learning_rate": 2.1134671224351163e-06, "loss": 0.86273849, "num_input_tokens_seen": 177894175, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.11590576, "step": 8274, "time_per_iteration": 2.5661497116088867 }, { "auxiliary_loss_clip": 0.0645088, "auxiliary_loss_mlp": 0.01269422, "balance_loss_clip": 0.06284419, "balance_loss_mlp": 0.0125644, "epoch": 0.49751991582744626, "flos": 30746137449600.0, "grad_norm": 2.264812264804198, "language_loss": 0.75916857, "learning_rate": 2.113078285889493e-06, "loss": 0.83637154, "num_input_tokens_seen": 177913920, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.12982178, "step": 8275, "time_per_iteration": 2.6472222805023193 }, { "auxiliary_loss_clip": 0.06454559, "auxiliary_loss_mlp": 0.01267793, "balance_loss_clip": 0.06288496, "balance_loss_mlp": 0.01254411, "epoch": 0.4975800390801142, "flos": 14105748683520.0, "grad_norm": 2.9168227283297377, "language_loss": 0.84463221, "learning_rate": 2.1126894450559303e-06, "loss": 0.92185569, "num_input_tokens_seen": 177930425, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.13391113, "step": 8276, "time_per_iteration": 2.7478525638580322 }, { "auxiliary_loss_clip": 0.06436829, "auxiliary_loss_mlp": 0.01270849, "balance_loss_clip": 0.0628141, "balance_loss_mlp": 0.01259601, "epoch": 0.4976401623327822, "flos": 24214203141120.0, "grad_norm": 1.2738664782489661, "language_loss": 0.70252168, "learning_rate": 2.112300599949172e-06, "loss": 0.77959847, "num_input_tokens_seen": 177949885, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.11248779, "step": 8277, "time_per_iteration": 2.7384307384490967 }, { "auxiliary_loss_clip": 0.06435116, "auxiliary_loss_mlp": 0.01270095, "balance_loss_clip": 0.06278851, "balance_loss_mlp": 0.01257751, "epoch": 0.49770028558545015, "flos": 21142229552640.0, "grad_norm": 2.053421804373145, "language_loss": 0.81889427, "learning_rate": 2.111911750583964e-06, "loss": 0.89594638, "num_input_tokens_seen": 177965720, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.12335205, "step": 8278, "time_per_iteration": 2.5991830825805664 }, { "auxiliary_loss_clip": 0.06448463, "auxiliary_loss_mlp": 0.01267143, "balance_loss_clip": 0.06283004, "balance_loss_mlp": 0.01254596, "epoch": 0.4977604088381181, "flos": 16769568234240.0, "grad_norm": 2.001369632940533, "language_loss": 0.68555105, "learning_rate": 2.111522896975052e-06, "loss": 0.76270711, "num_input_tokens_seen": 177983190, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.12542725, "step": 8279, "time_per_iteration": 3.949068546295166 }, { "auxiliary_loss_clip": 0.06445638, "auxiliary_loss_mlp": 0.01270576, "balance_loss_clip": 0.06282133, "balance_loss_mlp": 0.01256867, "epoch": 0.49782053209078614, "flos": 15708596636160.0, "grad_norm": 2.1567705391339094, "language_loss": 0.71154845, "learning_rate": 2.1111340391371794e-06, "loss": 0.78871059, "num_input_tokens_seen": 178000155, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.13702393, "step": 8280, "time_per_iteration": 4.004194498062134 }, { "auxiliary_loss_clip": 0.0644512, "auxiliary_loss_mlp": 0.01271379, "balance_loss_clip": 0.06283218, "balance_loss_mlp": 0.01258433, "epoch": 0.4978806553434541, "flos": 24760565688960.0, "grad_norm": 1.5140462111443969, "language_loss": 0.65010214, "learning_rate": 2.1107451770850936e-06, "loss": 0.72726715, "num_input_tokens_seen": 178021060, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.12945557, "step": 8281, "time_per_iteration": 2.6809518337249756 }, { "auxiliary_loss_clip": 0.06446652, "auxiliary_loss_mlp": 0.01267061, "balance_loss_clip": 0.06282456, "balance_loss_mlp": 0.01254252, "epoch": 0.49794077859612207, "flos": 13120820265600.0, "grad_norm": 2.092074606610791, "language_loss": 0.73667097, "learning_rate": 2.1103563108335387e-06, "loss": 0.81380814, "num_input_tokens_seen": 178038180, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.12817383, "step": 8282, "time_per_iteration": 2.5407447814941406 }, { "auxiliary_loss_clip": 0.06447215, "auxiliary_loss_mlp": 0.01270469, "balance_loss_clip": 0.06286885, "balance_loss_mlp": 0.01259097, "epoch": 0.49800090184879003, "flos": 27532223844480.0, "grad_norm": 1.4518965393091476, "language_loss": 0.73287636, "learning_rate": 2.109967440397263e-06, "loss": 0.81005323, "num_input_tokens_seen": 178057565, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.11383057, "step": 8283, "time_per_iteration": 2.6358773708343506 }, { "auxiliary_loss_clip": 0.0644802, "auxiliary_loss_mlp": 0.01269158, "balance_loss_clip": 0.06288444, "balance_loss_mlp": 0.01256212, "epoch": 0.498061025101458, "flos": 19798677659520.0, "grad_norm": 1.4479070843527597, "language_loss": 0.78572619, "learning_rate": 2.1095785657910095e-06, "loss": 0.86289799, "num_input_tokens_seen": 178076965, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.12945557, "step": 8284, "time_per_iteration": 2.531036376953125 }, { "auxiliary_loss_clip": 0.06452773, "auxiliary_loss_mlp": 0.01268588, "balance_loss_clip": 0.06285296, "balance_loss_mlp": 0.01255237, "epoch": 0.49812114835412596, "flos": 29900926915200.0, "grad_norm": 1.4706331933058021, "language_loss": 0.73137462, "learning_rate": 2.109189687029526e-06, "loss": 0.80858827, "num_input_tokens_seen": 178095105, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.13366699, "step": 8285, "time_per_iteration": 2.6582677364349365 }, { "auxiliary_loss_clip": 0.06443693, "auxiliary_loss_mlp": 0.01270127, "balance_loss_clip": 0.06282122, "balance_loss_mlp": 0.01257294, "epoch": 0.49818127160679393, "flos": 23153441178240.0, "grad_norm": 1.549714997185512, "language_loss": 0.74467552, "learning_rate": 2.1088008041275598e-06, "loss": 0.8218137, "num_input_tokens_seen": 178114505, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.128479, "step": 8286, "time_per_iteration": 2.5708038806915283 }, { "auxiliary_loss_clip": 0.06449093, "auxiliary_loss_mlp": 0.01266542, "balance_loss_clip": 0.06285253, "balance_loss_mlp": 0.01254192, "epoch": 0.4982413948594619, "flos": 21659228444160.0, "grad_norm": 1.6441490840817898, "language_loss": 0.85540253, "learning_rate": 2.1084119170998545e-06, "loss": 0.93255889, "num_input_tokens_seen": 178131595, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.12359619, "step": 8287, "time_per_iteration": 2.6438753604888916 }, { "auxiliary_loss_clip": 0.06449288, "auxiliary_loss_mlp": 0.01269415, "balance_loss_clip": 0.06283978, "balance_loss_mlp": 0.01256421, "epoch": 0.49830151811212986, "flos": 32494866560640.0, "grad_norm": 1.6458617347017732, "language_loss": 0.72711873, "learning_rate": 2.108023025961159e-06, "loss": 0.80430579, "num_input_tokens_seen": 178152055, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.13006592, "step": 8288, "time_per_iteration": 2.6586620807647705 }, { "auxiliary_loss_clip": 0.06456672, "auxiliary_loss_mlp": 0.01268624, "balance_loss_clip": 0.06289162, "balance_loss_mlp": 0.01254742, "epoch": 0.4983616413647978, "flos": 18146886122880.0, "grad_norm": 2.5719135301556424, "language_loss": 0.80071163, "learning_rate": 2.10763413072622e-06, "loss": 0.87796462, "num_input_tokens_seen": 178168150, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.13885498, "step": 8289, "time_per_iteration": 2.5905048847198486 }, { "auxiliary_loss_clip": 0.06452788, "auxiliary_loss_mlp": 0.01269509, "balance_loss_clip": 0.06290558, "balance_loss_mlp": 0.01256759, "epoch": 0.4984217646174658, "flos": 19724898539520.0, "grad_norm": 2.1156929913338725, "language_loss": 0.73499286, "learning_rate": 2.107245231409784e-06, "loss": 0.81221586, "num_input_tokens_seen": 178186150, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.12744141, "step": 8290, "time_per_iteration": 2.5511250495910645 }, { "auxiliary_loss_clip": 0.06449835, "auxiliary_loss_mlp": 0.012724, "balance_loss_clip": 0.06286211, "balance_loss_mlp": 0.01258738, "epoch": 0.49848188787013376, "flos": 24943525079040.0, "grad_norm": 1.4347405266550275, "language_loss": 0.84616321, "learning_rate": 2.106856328026598e-06, "loss": 0.92338556, "num_input_tokens_seen": 178207665, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.13671875, "step": 8291, "time_per_iteration": 2.615466594696045 }, { "auxiliary_loss_clip": 0.06460987, "auxiliary_loss_mlp": 0.01272032, "balance_loss_clip": 0.06290556, "balance_loss_mlp": 0.01258877, "epoch": 0.4985420111228017, "flos": 22388969652480.0, "grad_norm": 1.7050239280471897, "language_loss": 0.67743409, "learning_rate": 2.106467420591409e-06, "loss": 0.75476432, "num_input_tokens_seen": 178226325, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.13146973, "step": 8292, "time_per_iteration": 2.6132261753082275 }, { "auxiliary_loss_clip": 0.06442776, "auxiliary_loss_mlp": 0.01268245, "balance_loss_clip": 0.06281446, "balance_loss_mlp": 0.01257093, "epoch": 0.4986021343754697, "flos": 16221989802240.0, "grad_norm": 1.564650273299512, "language_loss": 0.67092603, "learning_rate": 2.106078509118965e-06, "loss": 0.74803627, "num_input_tokens_seen": 178244960, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.11151123, "step": 8293, "time_per_iteration": 2.589778184890747 }, { "auxiliary_loss_clip": 0.06450116, "auxiliary_loss_mlp": 0.01266078, "balance_loss_clip": 0.06285553, "balance_loss_mlp": 0.01254514, "epoch": 0.4986622576281377, "flos": 23410221615360.0, "grad_norm": 1.8556205590849082, "language_loss": 0.82687658, "learning_rate": 2.1056895936240133e-06, "loss": 0.90403849, "num_input_tokens_seen": 178265400, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.11566162, "step": 8294, "time_per_iteration": 2.6571414470672607 }, { "auxiliary_loss_clip": 0.06449011, "auxiliary_loss_mlp": 0.01273513, "balance_loss_clip": 0.06285994, "balance_loss_mlp": 0.01261181, "epoch": 0.49872238088080567, "flos": 19980714654720.0, "grad_norm": 2.11990592470691, "language_loss": 0.73992169, "learning_rate": 2.1053006741213016e-06, "loss": 0.8171469, "num_input_tokens_seen": 178284535, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.12335205, "step": 8295, "time_per_iteration": 2.596514940261841 }, { "auxiliary_loss_clip": 0.06445549, "auxiliary_loss_mlp": 0.01268191, "balance_loss_clip": 0.06283614, "balance_loss_mlp": 0.01255608, "epoch": 0.49878250413347364, "flos": 22899595633920.0, "grad_norm": 1.770136584383455, "language_loss": 0.67932224, "learning_rate": 2.1049117506255775e-06, "loss": 0.75645959, "num_input_tokens_seen": 178302425, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.12591553, "step": 8296, "time_per_iteration": 2.5666258335113525 }, { "auxiliary_loss_clip": 0.06453829, "auxiliary_loss_mlp": 0.01269784, "balance_loss_clip": 0.06288031, "balance_loss_mlp": 0.01256927, "epoch": 0.4988426273861416, "flos": 32606688234240.0, "grad_norm": 2.0900850624102834, "language_loss": 0.64494371, "learning_rate": 2.1045228231515895e-06, "loss": 0.72217989, "num_input_tokens_seen": 178323065, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.12860107, "step": 8297, "time_per_iteration": 2.646047353744507 }, { "auxiliary_loss_clip": 0.0644127, "auxiliary_loss_mlp": 0.01270515, "balance_loss_clip": 0.0628247, "balance_loss_mlp": 0.01258957, "epoch": 0.49890275063880957, "flos": 20929990360320.0, "grad_norm": 1.807626195455512, "language_loss": 0.69442499, "learning_rate": 2.1041338917140857e-06, "loss": 0.77154291, "num_input_tokens_seen": 178343985, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.11566162, "step": 8298, "time_per_iteration": 2.583669424057007 }, { "auxiliary_loss_clip": 0.06445816, "auxiliary_loss_mlp": 0.01267675, "balance_loss_clip": 0.0628515, "balance_loss_mlp": 0.01255319, "epoch": 0.49896287389147753, "flos": 18630370581120.0, "grad_norm": 1.8847875742260232, "language_loss": 0.84562528, "learning_rate": 2.103744956327814e-06, "loss": 0.92276019, "num_input_tokens_seen": 178362345, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.12353516, "step": 8299, "time_per_iteration": 2.550977945327759 }, { "auxiliary_loss_clip": 0.06447299, "auxiliary_loss_mlp": 0.0126839, "balance_loss_clip": 0.06282835, "balance_loss_mlp": 0.01256011, "epoch": 0.4990229971441455, "flos": 24833422414080.0, "grad_norm": 2.338383227739521, "language_loss": 0.68965381, "learning_rate": 2.1033560170075234e-06, "loss": 0.76681072, "num_input_tokens_seen": 178383190, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.1237793, "step": 8300, "time_per_iteration": 2.636467933654785 }, { "auxiliary_loss_clip": 0.06348559, "auxiliary_loss_mlp": 0.01259949, "balance_loss_clip": 0.06281886, "balance_loss_mlp": 0.0125778, "epoch": 0.49908312039681346, "flos": 71405638323840.0, "grad_norm": 0.7348758886065331, "language_loss": 0.51055658, "learning_rate": 2.1029670737679623e-06, "loss": 0.58664167, "num_input_tokens_seen": 178444250, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 0.02172852, "step": 8301, "time_per_iteration": 3.256481647491455 }, { "auxiliary_loss_clip": 0.06441922, "auxiliary_loss_mlp": 0.01270307, "balance_loss_clip": 0.06284537, "balance_loss_mlp": 0.01258166, "epoch": 0.4991432436494814, "flos": 19834791569280.0, "grad_norm": 1.6423160352060733, "language_loss": 0.84875596, "learning_rate": 2.102578126623879e-06, "loss": 0.92587829, "num_input_tokens_seen": 178463250, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.121521, "step": 8302, "time_per_iteration": 2.5553600788116455 }, { "auxiliary_loss_clip": 0.06439476, "auxiliary_loss_mlp": 0.01268197, "balance_loss_clip": 0.06281476, "balance_loss_mlp": 0.01257325, "epoch": 0.4992033669021494, "flos": 15127252208640.0, "grad_norm": 2.5649941820658793, "language_loss": 0.69754112, "learning_rate": 2.102189175590024e-06, "loss": 0.77461779, "num_input_tokens_seen": 178481340, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.10870361, "step": 8303, "time_per_iteration": 2.531755208969116 }, { "auxiliary_loss_clip": 0.06444444, "auxiliary_loss_mlp": 0.01268787, "balance_loss_clip": 0.06279269, "balance_loss_mlp": 0.01256747, "epoch": 0.49926349015481736, "flos": 31215282860160.0, "grad_norm": 1.7574193997951308, "language_loss": 0.73005903, "learning_rate": 2.101800220681144e-06, "loss": 0.80719125, "num_input_tokens_seen": 178501545, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.12042236, "step": 8304, "time_per_iteration": 2.661844491958618 }, { "auxiliary_loss_clip": 0.06438978, "auxiliary_loss_mlp": 0.0127068, "balance_loss_clip": 0.06279508, "balance_loss_mlp": 0.01259164, "epoch": 0.4993236134074853, "flos": 24907201534080.0, "grad_norm": 1.8223825137197505, "language_loss": 0.81011719, "learning_rate": 2.10141126191199e-06, "loss": 0.88721371, "num_input_tokens_seen": 178519700, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.1151123, "step": 8305, "time_per_iteration": 2.5963196754455566 }, { "auxiliary_loss_clip": 0.06335758, "auxiliary_loss_mlp": 0.01253934, "balance_loss_clip": 0.06269854, "balance_loss_mlp": 0.01251916, "epoch": 0.4993837366601533, "flos": 70438962896640.0, "grad_norm": 0.6971993947045096, "language_loss": 0.56917024, "learning_rate": 2.1010222992973107e-06, "loss": 0.64506721, "num_input_tokens_seen": 178576740, "router_z_loss_clip": 0.65966797, "router_z_loss_mlp": 0.02017212, "step": 8306, "time_per_iteration": 3.3274853229522705 }, { "auxiliary_loss_clip": 0.0644379, "auxiliary_loss_mlp": 0.01268252, "balance_loss_clip": 0.06284204, "balance_loss_mlp": 0.0125567, "epoch": 0.4994438599128213, "flos": 15966718738560.0, "grad_norm": 1.7072245084528481, "language_loss": 0.82719213, "learning_rate": 2.1006333328518556e-06, "loss": 0.90431249, "num_input_tokens_seen": 178594745, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.12591553, "step": 8307, "time_per_iteration": 2.5898022651672363 }, { "auxiliary_loss_clip": 0.06441513, "auxiliary_loss_mlp": 0.01269996, "balance_loss_clip": 0.06282144, "balance_loss_mlp": 0.01257438, "epoch": 0.4995039831654893, "flos": 27935765907840.0, "grad_norm": 1.9341366610046704, "language_loss": 0.61074567, "learning_rate": 2.1002443625903748e-06, "loss": 0.68786073, "num_input_tokens_seen": 178614110, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.12554932, "step": 8308, "time_per_iteration": 2.602647066116333 }, { "auxiliary_loss_clip": 0.06438577, "auxiliary_loss_mlp": 0.01268733, "balance_loss_clip": 0.06280382, "balance_loss_mlp": 0.01257825, "epoch": 0.49956410641815724, "flos": 24211310175360.0, "grad_norm": 1.600301924098387, "language_loss": 0.74943566, "learning_rate": 2.0998553885276168e-06, "loss": 0.82650882, "num_input_tokens_seen": 178634170, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.10900879, "step": 8309, "time_per_iteration": 5.442386150360107 }, { "auxiliary_loss_clip": 0.06444441, "auxiliary_loss_mlp": 0.0126686, "balance_loss_clip": 0.06281959, "balance_loss_mlp": 0.01255261, "epoch": 0.4996242296708252, "flos": 16185666257280.0, "grad_norm": 1.968451594268774, "language_loss": 0.79562569, "learning_rate": 2.0994664106783335e-06, "loss": 0.87273872, "num_input_tokens_seen": 178651775, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.1159668, "step": 8310, "time_per_iteration": 2.569972515106201 }, { "auxiliary_loss_clip": 0.06444699, "auxiliary_loss_mlp": 0.0126958, "balance_loss_clip": 0.06280439, "balance_loss_mlp": 0.01257623, "epoch": 0.49968435292349317, "flos": 16879209701760.0, "grad_norm": 1.6270482244149196, "language_loss": 0.7161032, "learning_rate": 2.0990774290572735e-06, "loss": 0.79324603, "num_input_tokens_seen": 178669720, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.11956787, "step": 8311, "time_per_iteration": 2.5373220443725586 }, { "auxiliary_loss_clip": 0.06444126, "auxiliary_loss_mlp": 0.01266346, "balance_loss_clip": 0.06283539, "balance_loss_mlp": 0.01254843, "epoch": 0.49974447617616113, "flos": 14944837870080.0, "grad_norm": 1.7453095829955816, "language_loss": 0.77167773, "learning_rate": 2.098688443679187e-06, "loss": 0.84878242, "num_input_tokens_seen": 178686765, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.1149292, "step": 8312, "time_per_iteration": 2.605367660522461 }, { "auxiliary_loss_clip": 0.06443501, "auxiliary_loss_mlp": 0.01267545, "balance_loss_clip": 0.06282713, "balance_loss_mlp": 0.01255618, "epoch": 0.4998045994288291, "flos": 26658823610880.0, "grad_norm": 2.179402777670418, "language_loss": 0.84627819, "learning_rate": 2.0982994545588256e-06, "loss": 0.92338866, "num_input_tokens_seen": 178705845, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.11932373, "step": 8313, "time_per_iteration": 2.566174268722534 }, { "auxiliary_loss_clip": 0.06446679, "auxiliary_loss_mlp": 0.01271361, "balance_loss_clip": 0.06283875, "balance_loss_mlp": 0.01259273, "epoch": 0.49986472268149706, "flos": 20959102454400.0, "grad_norm": 1.7073237385210318, "language_loss": 0.80683899, "learning_rate": 2.097910461710939e-06, "loss": 0.88401949, "num_input_tokens_seen": 178723410, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.12091064, "step": 8314, "time_per_iteration": 2.6231112480163574 }, { "auxiliary_loss_clip": 0.06447242, "auxiliary_loss_mlp": 0.01272284, "balance_loss_clip": 0.06285419, "balance_loss_mlp": 0.01259863, "epoch": 0.49992484593416503, "flos": 22790499217920.0, "grad_norm": 1.667684818649048, "language_loss": 0.79641116, "learning_rate": 2.0975214651502773e-06, "loss": 0.87360632, "num_input_tokens_seen": 178743560, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.12408447, "step": 8315, "time_per_iteration": 2.582381010055542 }, { "auxiliary_loss_clip": 0.06440383, "auxiliary_loss_mlp": 0.01269225, "balance_loss_clip": 0.06280334, "balance_loss_mlp": 0.01257703, "epoch": 0.499984969186833, "flos": 46796838307200.0, "grad_norm": 1.5721164834670982, "language_loss": 0.74838203, "learning_rate": 2.0971324648915926e-06, "loss": 0.82547808, "num_input_tokens_seen": 178767225, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.11523438, "step": 8316, "time_per_iteration": 2.8235023021698 }, { "auxiliary_loss_clip": 0.06438377, "auxiliary_loss_mlp": 0.01269128, "balance_loss_clip": 0.06282414, "balance_loss_mlp": 0.01258131, "epoch": 0.500045092439501, "flos": 25564086017280.0, "grad_norm": 1.4459578320965873, "language_loss": 0.81401742, "learning_rate": 2.0967434609496343e-06, "loss": 0.89109242, "num_input_tokens_seen": 178786810, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.10998535, "step": 8317, "time_per_iteration": 2.5942254066467285 }, { "auxiliary_loss_clip": 0.06440914, "auxiliary_loss_mlp": 0.01269693, "balance_loss_clip": 0.06279503, "balance_loss_mlp": 0.01257343, "epoch": 0.5001052156921689, "flos": 20711126695680.0, "grad_norm": 1.8344543412494707, "language_loss": 0.83656722, "learning_rate": 2.0963544533391548e-06, "loss": 0.91367328, "num_input_tokens_seen": 178805660, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.12347412, "step": 8318, "time_per_iteration": 2.6112875938415527 }, { "auxiliary_loss_clip": 0.06443365, "auxiliary_loss_mlp": 0.01271359, "balance_loss_clip": 0.06283233, "balance_loss_mlp": 0.01259861, "epoch": 0.500165338944837, "flos": 21257405389440.0, "grad_norm": 1.8544112722896684, "language_loss": 0.81915498, "learning_rate": 2.0959654420749045e-06, "loss": 0.89630222, "num_input_tokens_seen": 178824780, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.1151123, "step": 8319, "time_per_iteration": 5.511195182800293 }, { "auxiliary_loss_clip": 0.06445113, "auxiliary_loss_mlp": 0.01265229, "balance_loss_clip": 0.06284702, "balance_loss_mlp": 0.01254774, "epoch": 0.5002254621975049, "flos": 27861693298560.0, "grad_norm": 1.5854581555415448, "language_loss": 0.71692359, "learning_rate": 2.095576427171635e-06, "loss": 0.79402709, "num_input_tokens_seen": 178845640, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.10461426, "step": 8320, "time_per_iteration": 2.715240955352783 }, { "auxiliary_loss_clip": 0.06458092, "auxiliary_loss_mlp": 0.01274382, "balance_loss_clip": 0.0628539, "balance_loss_mlp": 0.01260881, "epoch": 0.5002855854501729, "flos": 15556049078400.0, "grad_norm": 3.01251410020235, "language_loss": 0.77391338, "learning_rate": 2.0951874086440978e-06, "loss": 0.85123813, "num_input_tokens_seen": 178862290, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.13513184, "step": 8321, "time_per_iteration": 2.5197575092315674 }, { "auxiliary_loss_clip": 0.06450167, "auxiliary_loss_mlp": 0.01272983, "balance_loss_clip": 0.06288473, "balance_loss_mlp": 0.01259601, "epoch": 0.5003457087028408, "flos": 16112977240320.0, "grad_norm": 1.6442685977671465, "language_loss": 0.83187687, "learning_rate": 2.0947983865070455e-06, "loss": 0.9091084, "num_input_tokens_seen": 178879805, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.1338501, "step": 8322, "time_per_iteration": 2.582157850265503 }, { "auxiliary_loss_clip": 0.06452055, "auxiliary_loss_mlp": 0.01273834, "balance_loss_clip": 0.06288317, "balance_loss_mlp": 0.01261455, "epoch": 0.5004058319555088, "flos": 22717055514240.0, "grad_norm": 2.398843908850606, "language_loss": 0.74612427, "learning_rate": 2.094409360775228e-06, "loss": 0.82338321, "num_input_tokens_seen": 178896985, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.12384033, "step": 8323, "time_per_iteration": 2.6105499267578125 }, { "auxiliary_loss_clip": 0.06443332, "auxiliary_loss_mlp": 0.01268181, "balance_loss_clip": 0.06281601, "balance_loss_mlp": 0.01255724, "epoch": 0.5004659552081767, "flos": 30125870000640.0, "grad_norm": 1.4716072439962455, "language_loss": 0.69801158, "learning_rate": 2.0940203314633977e-06, "loss": 0.7751267, "num_input_tokens_seen": 178920605, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.12457275, "step": 8324, "time_per_iteration": 2.6384575366973877 }, { "auxiliary_loss_clip": 0.0644753, "auxiliary_loss_mlp": 0.01272755, "balance_loss_clip": 0.06285349, "balance_loss_mlp": 0.01260596, "epoch": 0.5005260784608447, "flos": 18630664070400.0, "grad_norm": 1.8086956548728956, "language_loss": 0.7250765, "learning_rate": 2.0936312985863077e-06, "loss": 0.80227935, "num_input_tokens_seen": 178937760, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.121521, "step": 8325, "time_per_iteration": 2.5756642818450928 }, { "auxiliary_loss_clip": 0.06454901, "auxiliary_loss_mlp": 0.01268756, "balance_loss_clip": 0.06291827, "balance_loss_mlp": 0.01255994, "epoch": 0.5005862017135126, "flos": 24866349868800.0, "grad_norm": 1.6036089682149612, "language_loss": 0.73299897, "learning_rate": 2.093242262158709e-06, "loss": 0.8102355, "num_input_tokens_seen": 178957985, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.12768555, "step": 8326, "time_per_iteration": 2.62744402885437 }, { "auxiliary_loss_clip": 0.06445146, "auxiliary_loss_mlp": 0.0126827, "balance_loss_clip": 0.06285787, "balance_loss_mlp": 0.01257201, "epoch": 0.5006463249661807, "flos": 18740389392000.0, "grad_norm": 1.4905902003166134, "language_loss": 0.78240347, "learning_rate": 2.0928532221953544e-06, "loss": 0.85953772, "num_input_tokens_seen": 178977070, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.11077881, "step": 8327, "time_per_iteration": 2.5391647815704346 }, { "auxiliary_loss_clip": 0.0645417, "auxiliary_loss_mlp": 0.01274024, "balance_loss_clip": 0.06291845, "balance_loss_mlp": 0.01261382, "epoch": 0.5007064482188487, "flos": 13047124999680.0, "grad_norm": 2.2254885423958326, "language_loss": 0.88101113, "learning_rate": 2.092464178710997e-06, "loss": 0.95829308, "num_input_tokens_seen": 178994175, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.12646484, "step": 8328, "time_per_iteration": 2.564546585083008 }, { "auxiliary_loss_clip": 0.0645367, "auxiliary_loss_mlp": 0.01272227, "balance_loss_clip": 0.06286296, "balance_loss_mlp": 0.01258911, "epoch": 0.5007665714715166, "flos": 21295154453760.0, "grad_norm": 1.9807754517046112, "language_loss": 0.74450672, "learning_rate": 2.092075131720388e-06, "loss": 0.82176566, "num_input_tokens_seen": 179013710, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.13323975, "step": 8329, "time_per_iteration": 2.5703372955322266 }, { "auxiliary_loss_clip": 0.0644694, "auxiliary_loss_mlp": 0.01275164, "balance_loss_clip": 0.06286986, "balance_loss_mlp": 0.012635, "epoch": 0.5008266947241846, "flos": 29762676478080.0, "grad_norm": 1.5301452638483164, "language_loss": 0.80049026, "learning_rate": 2.091686081238281e-06, "loss": 0.8777113, "num_input_tokens_seen": 179035255, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.11663818, "step": 8330, "time_per_iteration": 2.671457529067993 }, { "auxiliary_loss_clip": 0.06345116, "auxiliary_loss_mlp": 0.01252664, "balance_loss_clip": 0.06279001, "balance_loss_mlp": 0.01250745, "epoch": 0.5008868179768525, "flos": 63574498460160.0, "grad_norm": 0.7104775082857941, "language_loss": 0.5596813, "learning_rate": 2.0912970272794282e-06, "loss": 0.6356591, "num_input_tokens_seen": 179090915, "router_z_loss_clip": 0.66015625, "router_z_loss_mlp": 0.01916504, "step": 8331, "time_per_iteration": 3.0018696784973145 }, { "auxiliary_loss_clip": 0.06448126, "auxiliary_loss_mlp": 0.01270293, "balance_loss_clip": 0.06290462, "balance_loss_mlp": 0.01258551, "epoch": 0.5009469412295205, "flos": 27382108055040.0, "grad_norm": 1.944778663677315, "language_loss": 0.65172213, "learning_rate": 2.0909079698585833e-06, "loss": 0.72890633, "num_input_tokens_seen": 179109160, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.11737061, "step": 8332, "time_per_iteration": 2.6447525024414062 }, { "auxiliary_loss_clip": 0.06445229, "auxiliary_loss_mlp": 0.01268897, "balance_loss_clip": 0.06287032, "balance_loss_mlp": 0.0125694, "epoch": 0.5010070644821885, "flos": 27385839561600.0, "grad_norm": 1.4186034283748379, "language_loss": 0.74948025, "learning_rate": 2.0905189089904993e-06, "loss": 0.82662153, "num_input_tokens_seen": 179130610, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.11956787, "step": 8333, "time_per_iteration": 2.616284132003784 }, { "auxiliary_loss_clip": 0.06452106, "auxiliary_loss_mlp": 0.01265727, "balance_loss_clip": 0.06289674, "balance_loss_mlp": 0.01253991, "epoch": 0.5010671877348565, "flos": 20668178678400.0, "grad_norm": 1.8017184851682322, "language_loss": 0.80540711, "learning_rate": 2.090129844689929e-06, "loss": 0.88258547, "num_input_tokens_seen": 179147860, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.11743164, "step": 8334, "time_per_iteration": 2.6428849697113037 }, { "auxiliary_loss_clip": 0.06346469, "auxiliary_loss_mlp": 0.01251457, "balance_loss_clip": 0.06280331, "balance_loss_mlp": 0.01249326, "epoch": 0.5011273109875244, "flos": 59148266855040.0, "grad_norm": 0.9619166842650554, "language_loss": 0.62730598, "learning_rate": 2.089740776971626e-06, "loss": 0.70328516, "num_input_tokens_seen": 179210490, "router_z_loss_clip": 0.66210938, "router_z_loss_mlp": 0.02133179, "step": 8335, "time_per_iteration": 3.1652791500091553 }, { "auxiliary_loss_clip": 0.06439267, "auxiliary_loss_mlp": 0.01266656, "balance_loss_clip": 0.06282363, "balance_loss_mlp": 0.01255706, "epoch": 0.5011874342401924, "flos": 25343126000640.0, "grad_norm": 1.4312315148729366, "language_loss": 0.80274093, "learning_rate": 2.0893517058503435e-06, "loss": 0.8798002, "num_input_tokens_seen": 179231360, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.10949707, "step": 8336, "time_per_iteration": 2.662538528442383 }, { "auxiliary_loss_clip": 0.06446338, "auxiliary_loss_mlp": 0.01265437, "balance_loss_clip": 0.06286266, "balance_loss_mlp": 0.01253504, "epoch": 0.5012475574928603, "flos": 20236153426560.0, "grad_norm": 1.6364728090151979, "language_loss": 0.80440903, "learning_rate": 2.088962631340836e-06, "loss": 0.88152671, "num_input_tokens_seen": 179250625, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.1194458, "step": 8337, "time_per_iteration": 2.577974319458008 }, { "auxiliary_loss_clip": 0.06453948, "auxiliary_loss_mlp": 0.01269269, "balance_loss_clip": 0.06286687, "balance_loss_mlp": 0.01257086, "epoch": 0.5013076807455283, "flos": 22716594316800.0, "grad_norm": 1.8132903653484846, "language_loss": 0.79036582, "learning_rate": 2.0885735534578555e-06, "loss": 0.86759806, "num_input_tokens_seen": 179267360, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.12182617, "step": 8338, "time_per_iteration": 2.565307378768921 }, { "auxiliary_loss_clip": 0.06447528, "auxiliary_loss_mlp": 0.01264658, "balance_loss_clip": 0.06284463, "balance_loss_mlp": 0.01252695, "epoch": 0.5013678039981962, "flos": 24252329548800.0, "grad_norm": 1.7870539370368976, "language_loss": 0.85279137, "learning_rate": 2.0881844722161583e-06, "loss": 0.92991316, "num_input_tokens_seen": 179289810, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.11956787, "step": 8339, "time_per_iteration": 2.59928297996521 }, { "auxiliary_loss_clip": 0.06442961, "auxiliary_loss_mlp": 0.01266988, "balance_loss_clip": 0.06282264, "balance_loss_mlp": 0.01255878, "epoch": 0.5014279272508643, "flos": 26183808414720.0, "grad_norm": 1.6918306001671952, "language_loss": 0.71268451, "learning_rate": 2.0877953876304962e-06, "loss": 0.78978395, "num_input_tokens_seen": 179310620, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.11108398, "step": 8340, "time_per_iteration": 2.612668037414551 }, { "auxiliary_loss_clip": 0.06453598, "auxiliary_loss_mlp": 0.01267721, "balance_loss_clip": 0.06287047, "balance_loss_mlp": 0.01254429, "epoch": 0.5014880505035323, "flos": 21436255929600.0, "grad_norm": 1.887021926063266, "language_loss": 0.78653789, "learning_rate": 2.0874062997156245e-06, "loss": 0.86375105, "num_input_tokens_seen": 179329005, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.13287354, "step": 8341, "time_per_iteration": 2.5769600868225098 }, { "auxiliary_loss_clip": 0.06457404, "auxiliary_loss_mlp": 0.01269989, "balance_loss_clip": 0.06287964, "balance_loss_mlp": 0.01257031, "epoch": 0.5015481737562002, "flos": 15774870816000.0, "grad_norm": 2.672358628831606, "language_loss": 0.89342666, "learning_rate": 2.0870172084862975e-06, "loss": 0.97070056, "num_input_tokens_seen": 179343785, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.12963867, "step": 8342, "time_per_iteration": 2.6697981357574463 }, { "auxiliary_loss_clip": 0.06441224, "auxiliary_loss_mlp": 0.01266286, "balance_loss_clip": 0.06280719, "balance_loss_mlp": 0.01253996, "epoch": 0.5016082970088682, "flos": 26837590296960.0, "grad_norm": 1.6991869848024834, "language_loss": 0.77303743, "learning_rate": 2.0866281139572682e-06, "loss": 0.85011256, "num_input_tokens_seen": 179364070, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.1229248, "step": 8343, "time_per_iteration": 2.6175966262817383 }, { "auxiliary_loss_clip": 0.06440222, "auxiliary_loss_mlp": 0.01265763, "balance_loss_clip": 0.06282175, "balance_loss_mlp": 0.01255201, "epoch": 0.5016684202615361, "flos": 21477023740800.0, "grad_norm": 2.202710697539619, "language_loss": 0.6808517, "learning_rate": 2.086239016143293e-06, "loss": 0.75791156, "num_input_tokens_seen": 179384225, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.10559082, "step": 8344, "time_per_iteration": 2.5476138591766357 }, { "auxiliary_loss_clip": 0.06447801, "auxiliary_loss_mlp": 0.01269101, "balance_loss_clip": 0.06285089, "balance_loss_mlp": 0.01257323, "epoch": 0.5017285435142042, "flos": 26253478684800.0, "grad_norm": 2.0449481625726245, "language_loss": 0.75491536, "learning_rate": 2.0858499150591258e-06, "loss": 0.83208442, "num_input_tokens_seen": 179402595, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.11779785, "step": 8345, "time_per_iteration": 2.5928215980529785 }, { "auxiliary_loss_clip": 0.06444181, "auxiliary_loss_mlp": 0.01269411, "balance_loss_clip": 0.06282027, "balance_loss_mlp": 0.0125541, "epoch": 0.5017886667668721, "flos": 20783899566720.0, "grad_norm": 1.9684665134801895, "language_loss": 0.78930503, "learning_rate": 2.0854608107195203e-06, "loss": 0.86644101, "num_input_tokens_seen": 179419635, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.14007568, "step": 8346, "time_per_iteration": 2.565453052520752 }, { "auxiliary_loss_clip": 0.06448194, "auxiliary_loss_mlp": 0.0126733, "balance_loss_clip": 0.06286, "balance_loss_mlp": 0.01255879, "epoch": 0.5018487900195401, "flos": 20162500087680.0, "grad_norm": 1.4311881951935495, "language_loss": 0.69447184, "learning_rate": 2.0850717031392333e-06, "loss": 0.77162707, "num_input_tokens_seen": 179438770, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.11456299, "step": 8347, "time_per_iteration": 2.5642545223236084 }, { "auxiliary_loss_clip": 0.06452105, "auxiliary_loss_mlp": 0.01266328, "balance_loss_clip": 0.06286904, "balance_loss_mlp": 0.01254419, "epoch": 0.501908913272208, "flos": 18156613196160.0, "grad_norm": 2.5494890138484108, "language_loss": 0.725124, "learning_rate": 2.0846825923330174e-06, "loss": 0.80230832, "num_input_tokens_seen": 179457475, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.11914062, "step": 8348, "time_per_iteration": 4.054907560348511 }, { "auxiliary_loss_clip": 0.06442846, "auxiliary_loss_mlp": 0.01267634, "balance_loss_clip": 0.0628579, "balance_loss_mlp": 0.01256005, "epoch": 0.501969036524876, "flos": 23118962423040.0, "grad_norm": 1.4354869830088863, "language_loss": 0.74590671, "learning_rate": 2.0842934783156303e-06, "loss": 0.82301146, "num_input_tokens_seen": 179478140, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.11633301, "step": 8349, "time_per_iteration": 4.018021106719971 }, { "auxiliary_loss_clip": 0.06450561, "auxiliary_loss_mlp": 0.01272067, "balance_loss_clip": 0.0628638, "balance_loss_mlp": 0.01259043, "epoch": 0.5020291597775439, "flos": 11367814596480.0, "grad_norm": 2.0016329185540442, "language_loss": 0.63751501, "learning_rate": 2.0839043611018266e-06, "loss": 0.71474135, "num_input_tokens_seen": 179494325, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.13006592, "step": 8350, "time_per_iteration": 2.57204270362854 }, { "auxiliary_loss_clip": 0.06347014, "auxiliary_loss_mlp": 0.01257822, "balance_loss_clip": 0.06279774, "balance_loss_mlp": 0.01255785, "epoch": 0.5020892830302119, "flos": 64030422124800.0, "grad_norm": 0.7573222729459814, "language_loss": 0.59894359, "learning_rate": 2.0835152407063597e-06, "loss": 0.67499197, "num_input_tokens_seen": 179553545, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 0.02037048, "step": 8351, "time_per_iteration": 3.3124215602874756 }, { "auxiliary_loss_clip": 0.06452302, "auxiliary_loss_mlp": 0.01272684, "balance_loss_clip": 0.0628825, "balance_loss_mlp": 0.01260763, "epoch": 0.5021494062828799, "flos": 23739691069440.0, "grad_norm": 1.680795846855272, "language_loss": 0.7569015, "learning_rate": 2.0831261171439873e-06, "loss": 0.83415139, "num_input_tokens_seen": 179573645, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.11920166, "step": 8352, "time_per_iteration": 2.581231117248535 }, { "auxiliary_loss_clip": 0.06443349, "auxiliary_loss_mlp": 0.01271349, "balance_loss_clip": 0.06281354, "balance_loss_mlp": 0.01258987, "epoch": 0.5022095295355479, "flos": 21582640212480.0, "grad_norm": 1.8778602899402157, "language_loss": 0.72117591, "learning_rate": 2.082736990429464e-06, "loss": 0.79832292, "num_input_tokens_seen": 179591435, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.12359619, "step": 8353, "time_per_iteration": 2.56473970413208 }, { "auxiliary_loss_clip": 0.06455849, "auxiliary_loss_mlp": 0.01271023, "balance_loss_clip": 0.06292606, "balance_loss_mlp": 0.01257862, "epoch": 0.5022696527882159, "flos": 21403580037120.0, "grad_norm": 1.537229885506184, "language_loss": 0.7427904, "learning_rate": 2.0823478605775455e-06, "loss": 0.82005906, "num_input_tokens_seen": 179609955, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.1317749, "step": 8354, "time_per_iteration": 2.5774381160736084 }, { "auxiliary_loss_clip": 0.06448214, "auxiliary_loss_mlp": 0.01268237, "balance_loss_clip": 0.06286877, "balance_loss_mlp": 0.01255899, "epoch": 0.5023297760408838, "flos": 27167814437760.0, "grad_norm": 1.531013948918086, "language_loss": 0.72993135, "learning_rate": 2.0819587276029884e-06, "loss": 0.80709589, "num_input_tokens_seen": 179630875, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.12341309, "step": 8355, "time_per_iteration": 2.6146607398986816 }, { "auxiliary_loss_clip": 0.06459151, "auxiliary_loss_mlp": 0.01270187, "balance_loss_clip": 0.06291331, "balance_loss_mlp": 0.01256466, "epoch": 0.5023898992935518, "flos": 26221054354560.0, "grad_norm": 1.5061802650107, "language_loss": 0.81156385, "learning_rate": 2.081569591520548e-06, "loss": 0.88885725, "num_input_tokens_seen": 179649835, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.137146, "step": 8356, "time_per_iteration": 2.615186929702759 }, { "auxiliary_loss_clip": 0.06456391, "auxiliary_loss_mlp": 0.01271015, "balance_loss_clip": 0.06286363, "balance_loss_mlp": 0.01257527, "epoch": 0.5024500225462197, "flos": 13444839204480.0, "grad_norm": 2.041641040503241, "language_loss": 0.76861262, "learning_rate": 2.0811804523449803e-06, "loss": 0.84588659, "num_input_tokens_seen": 179667605, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.1348877, "step": 8357, "time_per_iteration": 2.548219919204712 }, { "auxiliary_loss_clip": 0.064501, "auxiliary_loss_mlp": 0.01272159, "balance_loss_clip": 0.06284827, "balance_loss_mlp": 0.01258223, "epoch": 0.5025101457988878, "flos": 21585952448640.0, "grad_norm": 1.6247178212557007, "language_loss": 0.76473248, "learning_rate": 2.0807913100910417e-06, "loss": 0.84195507, "num_input_tokens_seen": 179686910, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.13952637, "step": 8358, "time_per_iteration": 4.057314872741699 }, { "auxiliary_loss_clip": 0.06447463, "auxiliary_loss_mlp": 0.01267492, "balance_loss_clip": 0.06283882, "balance_loss_mlp": 0.01254909, "epoch": 0.5025702690515557, "flos": 24652140105600.0, "grad_norm": 2.242274487633572, "language_loss": 0.72211927, "learning_rate": 2.0804021647734887e-06, "loss": 0.79926884, "num_input_tokens_seen": 179706395, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.12579346, "step": 8359, "time_per_iteration": 4.050787925720215 }, { "auxiliary_loss_clip": 0.06448023, "auxiliary_loss_mlp": 0.01266969, "balance_loss_clip": 0.06287792, "balance_loss_mlp": 0.01254667, "epoch": 0.5026303923042237, "flos": 22096578430080.0, "grad_norm": 1.6118178378396435, "language_loss": 0.77033114, "learning_rate": 2.080013016407077e-06, "loss": 0.84748107, "num_input_tokens_seen": 179725735, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.12316895, "step": 8360, "time_per_iteration": 2.5894670486450195 }, { "auxiliary_loss_clip": 0.06448059, "auxiliary_loss_mlp": 0.0126827, "balance_loss_clip": 0.06287023, "balance_loss_mlp": 0.01256558, "epoch": 0.5026905155568916, "flos": 23704164138240.0, "grad_norm": 2.7892322176963393, "language_loss": 0.77409065, "learning_rate": 2.0796238650065645e-06, "loss": 0.85125399, "num_input_tokens_seen": 179746150, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.1171875, "step": 8361, "time_per_iteration": 2.5884408950805664 }, { "auxiliary_loss_clip": 0.06454638, "auxiliary_loss_mlp": 0.01271295, "balance_loss_clip": 0.06286871, "balance_loss_mlp": 0.01258081, "epoch": 0.5027506388095596, "flos": 25819566716160.0, "grad_norm": 1.5133458864200249, "language_loss": 0.85220706, "learning_rate": 2.0792347105867065e-06, "loss": 0.92946643, "num_input_tokens_seen": 179767550, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.13220215, "step": 8362, "time_per_iteration": 2.596435546875 }, { "auxiliary_loss_clip": 0.06446895, "auxiliary_loss_mlp": 0.01266879, "balance_loss_clip": 0.06283344, "balance_loss_mlp": 0.01254792, "epoch": 0.5028107620622275, "flos": 27533942853120.0, "grad_norm": 1.5994543190184594, "language_loss": 0.79480684, "learning_rate": 2.0788455531622605e-06, "loss": 0.87194461, "num_input_tokens_seen": 179790075, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.12084961, "step": 8363, "time_per_iteration": 2.612579822540283 }, { "auxiliary_loss_clip": 0.06438813, "auxiliary_loss_mlp": 0.01273704, "balance_loss_clip": 0.06282511, "balance_loss_mlp": 0.01260865, "epoch": 0.5028708853148955, "flos": 24541031191680.0, "grad_norm": 1.9589004718524303, "language_loss": 0.76329839, "learning_rate": 2.0784563927479838e-06, "loss": 0.84042358, "num_input_tokens_seen": 179806515, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.128479, "step": 8364, "time_per_iteration": 2.5946478843688965 }, { "auxiliary_loss_clip": 0.06448053, "auxiliary_loss_mlp": 0.01266697, "balance_loss_clip": 0.06287219, "balance_loss_mlp": 0.0125561, "epoch": 0.5029310085675635, "flos": 20819887695360.0, "grad_norm": 3.753477151606629, "language_loss": 0.69596028, "learning_rate": 2.0780672293586317e-06, "loss": 0.77310777, "num_input_tokens_seen": 179826450, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.11102295, "step": 8365, "time_per_iteration": 2.551330327987671 }, { "auxiliary_loss_clip": 0.06455516, "auxiliary_loss_mlp": 0.01270644, "balance_loss_clip": 0.06288135, "balance_loss_mlp": 0.01257686, "epoch": 0.5029911318202315, "flos": 22348411476480.0, "grad_norm": 1.4020180421551791, "language_loss": 0.73562539, "learning_rate": 2.0776780630089635e-06, "loss": 0.81288695, "num_input_tokens_seen": 179846770, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.12963867, "step": 8366, "time_per_iteration": 2.6222355365753174 }, { "auxiliary_loss_clip": 0.06451389, "auxiliary_loss_mlp": 0.01270383, "balance_loss_clip": 0.06290327, "balance_loss_mlp": 0.01257699, "epoch": 0.5030512550728995, "flos": 24359581175040.0, "grad_norm": 1.4762435953796804, "language_loss": 0.78626549, "learning_rate": 2.077288893713735e-06, "loss": 0.86348319, "num_input_tokens_seen": 179866585, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.12683105, "step": 8367, "time_per_iteration": 2.591937303543091 }, { "auxiliary_loss_clip": 0.06446996, "auxiliary_loss_mlp": 0.01268937, "balance_loss_clip": 0.06285112, "balance_loss_mlp": 0.01256927, "epoch": 0.5031113783255674, "flos": 18265835393280.0, "grad_norm": 1.709082497043141, "language_loss": 0.70400077, "learning_rate": 2.0768997214877035e-06, "loss": 0.78116012, "num_input_tokens_seen": 179885575, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.12023926, "step": 8368, "time_per_iteration": 2.550076484680176 }, { "auxiliary_loss_clip": 0.06344952, "auxiliary_loss_mlp": 0.01257438, "balance_loss_clip": 0.06278576, "balance_loss_mlp": 0.01255196, "epoch": 0.5031715015782354, "flos": 57270022859520.0, "grad_norm": 0.863539018418763, "language_loss": 0.63356042, "learning_rate": 2.0765105463456274e-06, "loss": 0.7095843, "num_input_tokens_seen": 179939650, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 0.0224762, "step": 8369, "time_per_iteration": 3.101585626602173 }, { "auxiliary_loss_clip": 0.06443591, "auxiliary_loss_mlp": 0.01271623, "balance_loss_clip": 0.06283492, "balance_loss_mlp": 0.01260453, "epoch": 0.5032316248309033, "flos": 27534823320960.0, "grad_norm": 8.667083702061827, "language_loss": 0.60910362, "learning_rate": 2.076121368302263e-06, "loss": 0.68625581, "num_input_tokens_seen": 179961765, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.1116333, "step": 8370, "time_per_iteration": 2.783029079437256 }, { "auxiliary_loss_clip": 0.06451047, "auxiliary_loss_mlp": 0.01270563, "balance_loss_clip": 0.06285125, "balance_loss_mlp": 0.01257116, "epoch": 0.5032917480835714, "flos": 34504401104640.0, "grad_norm": 1.5713021555815065, "language_loss": 0.68640792, "learning_rate": 2.0757321873723695e-06, "loss": 0.76362401, "num_input_tokens_seen": 179983015, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.13446045, "step": 8371, "time_per_iteration": 2.783459186553955 }, { "auxiliary_loss_clip": 0.0644803, "auxiliary_loss_mlp": 0.01271861, "balance_loss_clip": 0.06285504, "balance_loss_mlp": 0.01259046, "epoch": 0.5033518713362393, "flos": 33665228064000.0, "grad_norm": 1.7353997264979628, "language_loss": 0.68498838, "learning_rate": 2.0753430035707042e-06, "loss": 0.76218736, "num_input_tokens_seen": 180003210, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.1282959, "step": 8372, "time_per_iteration": 2.708864450454712 }, { "auxiliary_loss_clip": 0.06451133, "auxiliary_loss_mlp": 0.01271777, "balance_loss_clip": 0.0628732, "balance_loss_mlp": 0.01258229, "epoch": 0.5034119945889073, "flos": 28193301031680.0, "grad_norm": 1.447622958022933, "language_loss": 0.67086291, "learning_rate": 2.0749538169120235e-06, "loss": 0.74809194, "num_input_tokens_seen": 180025530, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.13537598, "step": 8373, "time_per_iteration": 2.659677505493164 }, { "auxiliary_loss_clip": 0.06446342, "auxiliary_loss_mlp": 0.01274735, "balance_loss_clip": 0.06283966, "balance_loss_mlp": 0.01262022, "epoch": 0.5034721178415752, "flos": 21364698942720.0, "grad_norm": 1.5122711203030819, "language_loss": 0.74885857, "learning_rate": 2.0745646274110872e-06, "loss": 0.82606941, "num_input_tokens_seen": 180043180, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.12719727, "step": 8374, "time_per_iteration": 2.56195330619812 }, { "auxiliary_loss_clip": 0.06452565, "auxiliary_loss_mlp": 0.01270131, "balance_loss_clip": 0.06287459, "balance_loss_mlp": 0.0125703, "epoch": 0.5035322410942432, "flos": 22681486656000.0, "grad_norm": 1.5588801073572203, "language_loss": 0.68471587, "learning_rate": 2.0741754350826525e-06, "loss": 0.76194286, "num_input_tokens_seen": 180062905, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.13110352, "step": 8375, "time_per_iteration": 2.5728962421417236 }, { "auxiliary_loss_clip": 0.0646175, "auxiliary_loss_mlp": 0.01272992, "balance_loss_clip": 0.06291181, "balance_loss_mlp": 0.01258853, "epoch": 0.5035923643469111, "flos": 19834875423360.0, "grad_norm": 1.789951918679851, "language_loss": 0.78997993, "learning_rate": 2.0737862399414777e-06, "loss": 0.86732733, "num_input_tokens_seen": 180082000, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.14160156, "step": 8376, "time_per_iteration": 2.5743446350097656 }, { "auxiliary_loss_clip": 0.06457107, "auxiliary_loss_mlp": 0.01267043, "balance_loss_clip": 0.06288695, "balance_loss_mlp": 0.01254163, "epoch": 0.5036524875995791, "flos": 30521823269760.0, "grad_norm": 1.8036289015223443, "language_loss": 0.59310782, "learning_rate": 2.0733970420023213e-06, "loss": 0.67034924, "num_input_tokens_seen": 180101340, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.12878418, "step": 8377, "time_per_iteration": 2.645993709564209 }, { "auxiliary_loss_clip": 0.0644919, "auxiliary_loss_mlp": 0.01271665, "balance_loss_clip": 0.06285829, "balance_loss_mlp": 0.01258093, "epoch": 0.5037126108522471, "flos": 14725848424320.0, "grad_norm": 1.7683910261338316, "language_loss": 0.76518768, "learning_rate": 2.0730078412799425e-06, "loss": 0.84239626, "num_input_tokens_seen": 180119160, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.13580322, "step": 8378, "time_per_iteration": 2.561323642730713 }, { "auxiliary_loss_clip": 0.06455938, "auxiliary_loss_mlp": 0.01269755, "balance_loss_clip": 0.0629173, "balance_loss_mlp": 0.01257584, "epoch": 0.5037727341049151, "flos": 25304119125120.0, "grad_norm": 1.6548279348730723, "language_loss": 0.74684298, "learning_rate": 2.0726186377890985e-06, "loss": 0.8240999, "num_input_tokens_seen": 180138730, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.1217041, "step": 8379, "time_per_iteration": 2.6678109169006348 }, { "auxiliary_loss_clip": 0.06450418, "auxiliary_loss_mlp": 0.01274865, "balance_loss_clip": 0.06289061, "balance_loss_mlp": 0.01261741, "epoch": 0.5038328573575831, "flos": 28548193000320.0, "grad_norm": 2.084070266660844, "language_loss": 0.66969478, "learning_rate": 2.072229431544548e-06, "loss": 0.74694765, "num_input_tokens_seen": 180158810, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.13128662, "step": 8380, "time_per_iteration": 2.6547842025756836 }, { "auxiliary_loss_clip": 0.06445224, "auxiliary_loss_mlp": 0.0126673, "balance_loss_clip": 0.06285787, "balance_loss_mlp": 0.01254929, "epoch": 0.503892980610251, "flos": 31657957580160.0, "grad_norm": 3.2544510824520474, "language_loss": 0.63562918, "learning_rate": 2.071840222561051e-06, "loss": 0.71274871, "num_input_tokens_seen": 180179700, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.11791992, "step": 8381, "time_per_iteration": 2.6866514682769775 }, { "auxiliary_loss_clip": 0.06448525, "auxiliary_loss_mlp": 0.0127071, "balance_loss_clip": 0.06288065, "balance_loss_mlp": 0.01259266, "epoch": 0.503953103862919, "flos": 27096718648320.0, "grad_norm": 1.8084729464377887, "language_loss": 0.67851126, "learning_rate": 2.071451010853365e-06, "loss": 0.75570369, "num_input_tokens_seen": 180199890, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.11444092, "step": 8382, "time_per_iteration": 2.64314866065979 }, { "auxiliary_loss_clip": 0.06471154, "auxiliary_loss_mlp": 0.01272681, "balance_loss_clip": 0.06296261, "balance_loss_mlp": 0.01259156, "epoch": 0.5040132271155869, "flos": 15638423241600.0, "grad_norm": 1.787401655434667, "language_loss": 0.62540245, "learning_rate": 2.0710617964362506e-06, "loss": 0.70284081, "num_input_tokens_seen": 180217840, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.13531494, "step": 8383, "time_per_iteration": 2.5341579914093018 }, { "auxiliary_loss_clip": 0.06446363, "auxiliary_loss_mlp": 0.01268318, "balance_loss_clip": 0.06287633, "balance_loss_mlp": 0.01256176, "epoch": 0.504073350368255, "flos": 13595290410240.0, "grad_norm": 1.7340285040121344, "language_loss": 0.66942948, "learning_rate": 2.070672579324465e-06, "loss": 0.74657625, "num_input_tokens_seen": 180236465, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.12139893, "step": 8384, "time_per_iteration": 2.567624568939209 }, { "auxiliary_loss_clip": 0.06455471, "auxiliary_loss_mlp": 0.01266577, "balance_loss_clip": 0.0629293, "balance_loss_mlp": 0.01254435, "epoch": 0.5041334736209229, "flos": 29065611162240.0, "grad_norm": 7.740197354971744, "language_loss": 0.7169193, "learning_rate": 2.0702833595327674e-06, "loss": 0.79413974, "num_input_tokens_seen": 180258025, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.12145996, "step": 8385, "time_per_iteration": 2.6191837787628174 }, { "auxiliary_loss_clip": 0.06441116, "auxiliary_loss_mlp": 0.01269089, "balance_loss_clip": 0.06284437, "balance_loss_mlp": 0.01257806, "epoch": 0.5041935968735909, "flos": 24615313436160.0, "grad_norm": 2.918438015749269, "language_loss": 0.83816195, "learning_rate": 2.069894137075919e-06, "loss": 0.91526401, "num_input_tokens_seen": 180277825, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.11279297, "step": 8386, "time_per_iteration": 2.568528413772583 }, { "auxiliary_loss_clip": 0.06453684, "auxiliary_loss_mlp": 0.01267198, "balance_loss_clip": 0.0628996, "balance_loss_mlp": 0.01254651, "epoch": 0.5042537201262588, "flos": 26294204568960.0, "grad_norm": 6.12981224759675, "language_loss": 0.66616178, "learning_rate": 2.0695049119686766e-06, "loss": 0.74337053, "num_input_tokens_seen": 180300465, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.12554932, "step": 8387, "time_per_iteration": 2.61985445022583 }, { "auxiliary_loss_clip": 0.06452494, "auxiliary_loss_mlp": 0.01269618, "balance_loss_clip": 0.06291032, "balance_loss_mlp": 0.01257638, "epoch": 0.5043138433789268, "flos": 22023805559040.0, "grad_norm": 1.3750065389978112, "language_loss": 0.80776262, "learning_rate": 2.0691156842258016e-06, "loss": 0.88498378, "num_input_tokens_seen": 180321050, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.11993408, "step": 8388, "time_per_iteration": 5.489417552947998 }, { "auxiliary_loss_clip": 0.06453651, "auxiliary_loss_mlp": 0.01272362, "balance_loss_clip": 0.06291523, "balance_loss_mlp": 0.0125963, "epoch": 0.5043739666315947, "flos": 28774645459200.0, "grad_norm": 2.342492603574854, "language_loss": 0.70950633, "learning_rate": 2.0687264538620537e-06, "loss": 0.78676647, "num_input_tokens_seen": 180338870, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.12731934, "step": 8389, "time_per_iteration": 2.7709150314331055 }, { "auxiliary_loss_clip": 0.06457728, "auxiliary_loss_mlp": 0.01270973, "balance_loss_clip": 0.06292945, "balance_loss_mlp": 0.01258123, "epoch": 0.5044340898842627, "flos": 27606548016000.0, "grad_norm": 2.8721528430488608, "language_loss": 0.69738376, "learning_rate": 2.068337220892191e-06, "loss": 0.77467078, "num_input_tokens_seen": 180361285, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.12860107, "step": 8390, "time_per_iteration": 2.639569044113159 }, { "auxiliary_loss_clip": 0.06356898, "auxiliary_loss_mlp": 0.01256267, "balance_loss_clip": 0.06290269, "balance_loss_mlp": 0.01253634, "epoch": 0.5044942131369307, "flos": 67474744058880.0, "grad_norm": 0.7964994822547729, "language_loss": 0.52931625, "learning_rate": 2.067947985330974e-06, "loss": 0.60544789, "num_input_tokens_seen": 180415170, "router_z_loss_clip": 0.66455078, "router_z_loss_mlp": 0.02635193, "step": 8391, "time_per_iteration": 2.9834306240081787 }, { "auxiliary_loss_clip": 0.06353731, "auxiliary_loss_mlp": 0.01259512, "balance_loss_clip": 0.0628708, "balance_loss_mlp": 0.01257233, "epoch": 0.5045543363895987, "flos": 58646460280320.0, "grad_norm": 0.8257306685229797, "language_loss": 0.60660744, "learning_rate": 2.0675587471931628e-06, "loss": 0.68273985, "num_input_tokens_seen": 180468060, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 0.02281189, "step": 8392, "time_per_iteration": 3.049292802810669 }, { "auxiliary_loss_clip": 0.06451346, "auxiliary_loss_mlp": 0.01268018, "balance_loss_clip": 0.06292252, "balance_loss_mlp": 0.01255275, "epoch": 0.5046144596422667, "flos": 22532880240000.0, "grad_norm": 1.483390707783583, "language_loss": 0.85115677, "learning_rate": 2.067169506493517e-06, "loss": 0.92835033, "num_input_tokens_seen": 180486610, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.12762451, "step": 8393, "time_per_iteration": 2.605571746826172 }, { "auxiliary_loss_clip": 0.06457818, "auxiliary_loss_mlp": 0.01272789, "balance_loss_clip": 0.06296105, "balance_loss_mlp": 0.01261297, "epoch": 0.5046745828949346, "flos": 27461673106560.0, "grad_norm": 1.7816591233562766, "language_loss": 0.51206428, "learning_rate": 2.0667802632467974e-06, "loss": 0.58937037, "num_input_tokens_seen": 180508135, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.1149292, "step": 8394, "time_per_iteration": 2.6380269527435303 }, { "auxiliary_loss_clip": 0.06458334, "auxiliary_loss_mlp": 0.01271039, "balance_loss_clip": 0.06294326, "balance_loss_mlp": 0.01258432, "epoch": 0.5047347061476026, "flos": 17280236142720.0, "grad_norm": 1.5930595934724423, "language_loss": 0.75711215, "learning_rate": 2.0663910174677627e-06, "loss": 0.83440584, "num_input_tokens_seen": 180527000, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.12615967, "step": 8395, "time_per_iteration": 2.5797548294067383 }, { "auxiliary_loss_clip": 0.06455828, "auxiliary_loss_mlp": 0.01266641, "balance_loss_clip": 0.06292307, "balance_loss_mlp": 0.01254726, "epoch": 0.5047948294002705, "flos": 16654308543360.0, "grad_norm": 2.1011214193536856, "language_loss": 0.68498987, "learning_rate": 2.0660017691711737e-06, "loss": 0.76221454, "num_input_tokens_seen": 180544715, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.11914062, "step": 8396, "time_per_iteration": 2.566859245300293 }, { "auxiliary_loss_clip": 0.06461559, "auxiliary_loss_mlp": 0.01267102, "balance_loss_clip": 0.062997, "balance_loss_mlp": 0.01255705, "epoch": 0.5048549526529386, "flos": 26872236760320.0, "grad_norm": 1.5734661647836385, "language_loss": 0.78663886, "learning_rate": 2.065612518371792e-06, "loss": 0.86392552, "num_input_tokens_seen": 180565365, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.11401367, "step": 8397, "time_per_iteration": 2.5949819087982178 }, { "auxiliary_loss_clip": 0.06459042, "auxiliary_loss_mlp": 0.01271184, "balance_loss_clip": 0.06297742, "balance_loss_mlp": 0.01259388, "epoch": 0.5049150759056065, "flos": 21840175336320.0, "grad_norm": 1.4526202456703383, "language_loss": 0.66484439, "learning_rate": 2.065223265084376e-06, "loss": 0.74214661, "num_input_tokens_seen": 180586670, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.11798096, "step": 8398, "time_per_iteration": 4.057760715484619 }, { "auxiliary_loss_clip": 0.06457332, "auxiliary_loss_mlp": 0.01274218, "balance_loss_clip": 0.06296099, "balance_loss_mlp": 0.01261349, "epoch": 0.5049751991582745, "flos": 21691652774400.0, "grad_norm": 1.5154640030876227, "language_loss": 0.71829766, "learning_rate": 2.064834009323688e-06, "loss": 0.79561317, "num_input_tokens_seen": 180605085, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.12872314, "step": 8399, "time_per_iteration": 4.025506019592285 }, { "auxiliary_loss_clip": 0.06461018, "auxiliary_loss_mlp": 0.01273076, "balance_loss_clip": 0.06295824, "balance_loss_mlp": 0.01260434, "epoch": 0.5050353224109424, "flos": 21365495556480.0, "grad_norm": 1.70042581550944, "language_loss": 0.81722212, "learning_rate": 2.0644447511044878e-06, "loss": 0.89456302, "num_input_tokens_seen": 180624370, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.12634277, "step": 8400, "time_per_iteration": 2.552168130874634 }, { "auxiliary_loss_clip": 0.06455347, "auxiliary_loss_mlp": 0.01267683, "balance_loss_clip": 0.06294034, "balance_loss_mlp": 0.01255756, "epoch": 0.5050954456636104, "flos": 22826655054720.0, "grad_norm": 1.8440911727281855, "language_loss": 0.79192996, "learning_rate": 2.0640554904415362e-06, "loss": 0.86916023, "num_input_tokens_seen": 180642450, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.11914062, "step": 8401, "time_per_iteration": 2.5735292434692383 }, { "auxiliary_loss_clip": 0.06458458, "auxiliary_loss_mlp": 0.01266363, "balance_loss_clip": 0.06292346, "balance_loss_mlp": 0.01254347, "epoch": 0.5051555689162783, "flos": 30456513411840.0, "grad_norm": 1.465691546275332, "language_loss": 0.70157772, "learning_rate": 2.063666227349593e-06, "loss": 0.778826, "num_input_tokens_seen": 180665250, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.12017822, "step": 8402, "time_per_iteration": 2.648660182952881 }, { "auxiliary_loss_clip": 0.06454927, "auxiliary_loss_mlp": 0.01267699, "balance_loss_clip": 0.06291057, "balance_loss_mlp": 0.01255408, "epoch": 0.5052156921689464, "flos": 21294315912960.0, "grad_norm": 1.5380355865234825, "language_loss": 0.69860244, "learning_rate": 2.063276961843422e-06, "loss": 0.77582872, "num_input_tokens_seen": 180687425, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.12280273, "step": 8403, "time_per_iteration": 2.643679618835449 }, { "auxiliary_loss_clip": 0.06450908, "auxiliary_loss_mlp": 0.01268067, "balance_loss_clip": 0.06292288, "balance_loss_mlp": 0.01256611, "epoch": 0.5052758154216143, "flos": 25088106499200.0, "grad_norm": 1.3988114160116445, "language_loss": 0.85879618, "learning_rate": 2.062887693937781e-06, "loss": 0.93598592, "num_input_tokens_seen": 180708725, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.11462402, "step": 8404, "time_per_iteration": 2.621450662612915 }, { "auxiliary_loss_clip": 0.06449222, "auxiliary_loss_mlp": 0.01269894, "balance_loss_clip": 0.06287786, "balance_loss_mlp": 0.01258301, "epoch": 0.5053359386742823, "flos": 20891612390400.0, "grad_norm": 1.5312832944841819, "language_loss": 0.75753212, "learning_rate": 2.0624984236474322e-06, "loss": 0.83472323, "num_input_tokens_seen": 180727990, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.11602783, "step": 8405, "time_per_iteration": 2.5590713024139404 }, { "auxiliary_loss_clip": 0.06456357, "auxiliary_loss_mlp": 0.01266108, "balance_loss_clip": 0.06292011, "balance_loss_mlp": 0.01253162, "epoch": 0.5053960619269503, "flos": 37752499975680.0, "grad_norm": 1.588596831586711, "language_loss": 0.7354306, "learning_rate": 2.0621091509871378e-06, "loss": 0.81265521, "num_input_tokens_seen": 180749765, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.12945557, "step": 8406, "time_per_iteration": 2.7003653049468994 }, { "auxiliary_loss_clip": 0.06447046, "auxiliary_loss_mlp": 0.01267115, "balance_loss_clip": 0.06288159, "balance_loss_mlp": 0.01254729, "epoch": 0.5054561851796182, "flos": 23520617769600.0, "grad_norm": 1.717047358801635, "language_loss": 0.76740569, "learning_rate": 2.0617198759716568e-06, "loss": 0.84454727, "num_input_tokens_seen": 180769580, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.12402344, "step": 8407, "time_per_iteration": 2.606954336166382 }, { "auxiliary_loss_clip": 0.06458129, "auxiliary_loss_mlp": 0.01268237, "balance_loss_clip": 0.06294189, "balance_loss_mlp": 0.0125677, "epoch": 0.5055163084322862, "flos": 30418261223040.0, "grad_norm": 1.8454052065112152, "language_loss": 0.6281274, "learning_rate": 2.0613305986157535e-06, "loss": 0.70539105, "num_input_tokens_seen": 180790295, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.11468506, "step": 8408, "time_per_iteration": 2.661191463470459 }, { "auxiliary_loss_clip": 0.06451549, "auxiliary_loss_mlp": 0.01267457, "balance_loss_clip": 0.06289232, "balance_loss_mlp": 0.01253968, "epoch": 0.5055764316849541, "flos": 20264720469120.0, "grad_norm": 1.7159729433396052, "language_loss": 0.64064002, "learning_rate": 2.0609413189341865e-06, "loss": 0.71783006, "num_input_tokens_seen": 180807875, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.13482666, "step": 8409, "time_per_iteration": 2.5790412425994873 }, { "auxiliary_loss_clip": 0.06446132, "auxiliary_loss_mlp": 0.01265629, "balance_loss_clip": 0.06286311, "balance_loss_mlp": 0.01254066, "epoch": 0.5056365549376222, "flos": 26078611213440.0, "grad_norm": 1.29141787265621, "language_loss": 0.71269059, "learning_rate": 2.0605520369417193e-06, "loss": 0.78980821, "num_input_tokens_seen": 180831300, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.11572266, "step": 8410, "time_per_iteration": 2.62994122505188 }, { "auxiliary_loss_clip": 0.06453329, "auxiliary_loss_mlp": 0.01269777, "balance_loss_clip": 0.0629049, "balance_loss_mlp": 0.0125667, "epoch": 0.5056966781902901, "flos": 19284739441920.0, "grad_norm": 2.4076536762100327, "language_loss": 0.792436, "learning_rate": 2.060162752653113e-06, "loss": 0.86966705, "num_input_tokens_seen": 180849055, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.13104248, "step": 8411, "time_per_iteration": 2.5885202884674072 }, { "auxiliary_loss_clip": 0.06452893, "auxiliary_loss_mlp": 0.01270096, "balance_loss_clip": 0.06287883, "balance_loss_mlp": 0.01256959, "epoch": 0.5057568014429581, "flos": 21329507427840.0, "grad_norm": 1.6022668686471717, "language_loss": 0.81750321, "learning_rate": 2.0597734660831285e-06, "loss": 0.89473307, "num_input_tokens_seen": 180867395, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.13134766, "step": 8412, "time_per_iteration": 2.561046838760376 }, { "auxiliary_loss_clip": 0.06447845, "auxiliary_loss_mlp": 0.01271152, "balance_loss_clip": 0.06287928, "balance_loss_mlp": 0.01258635, "epoch": 0.505816924695626, "flos": 17499351369600.0, "grad_norm": 1.7158384485762839, "language_loss": 0.80932319, "learning_rate": 2.0593841772465283e-06, "loss": 0.88651317, "num_input_tokens_seen": 180886670, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.12518311, "step": 8413, "time_per_iteration": 2.586049795150757 }, { "auxiliary_loss_clip": 0.06451187, "auxiliary_loss_mlp": 0.01276325, "balance_loss_clip": 0.06286904, "balance_loss_mlp": 0.01262872, "epoch": 0.505877047948294, "flos": 21148434754560.0, "grad_norm": 2.0589279370626747, "language_loss": 0.80659848, "learning_rate": 2.0589948861580737e-06, "loss": 0.88387358, "num_input_tokens_seen": 180904645, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.13439941, "step": 8414, "time_per_iteration": 2.559377670288086 }, { "auxiliary_loss_clip": 0.06451597, "auxiliary_loss_mlp": 0.0127028, "balance_loss_clip": 0.06288011, "balance_loss_mlp": 0.01258413, "epoch": 0.5059371712009619, "flos": 36357824292480.0, "grad_norm": 2.9123809493584463, "language_loss": 0.62551928, "learning_rate": 2.058605592832528e-06, "loss": 0.70273805, "num_input_tokens_seen": 180922340, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.11853027, "step": 8415, "time_per_iteration": 2.6773600578308105 }, { "auxiliary_loss_clip": 0.06449983, "auxiliary_loss_mlp": 0.01274085, "balance_loss_clip": 0.06287789, "balance_loss_mlp": 0.01261789, "epoch": 0.50599729445363, "flos": 22679809574400.0, "grad_norm": 1.4310343240422698, "language_loss": 0.82284236, "learning_rate": 2.0582162972846515e-06, "loss": 0.90008307, "num_input_tokens_seen": 180941350, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.12298584, "step": 8416, "time_per_iteration": 2.5738770961761475 }, { "auxiliary_loss_clip": 0.06448918, "auxiliary_loss_mlp": 0.01266791, "balance_loss_clip": 0.06289463, "balance_loss_mlp": 0.0125571, "epoch": 0.5060574177062979, "flos": 22754553016320.0, "grad_norm": 1.584155184262108, "language_loss": 0.79705715, "learning_rate": 2.0578269995292078e-06, "loss": 0.87421429, "num_input_tokens_seen": 180960720, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.11077881, "step": 8417, "time_per_iteration": 2.6747708320617676 }, { "auxiliary_loss_clip": 0.06445435, "auxiliary_loss_mlp": 0.01272096, "balance_loss_clip": 0.06288376, "balance_loss_mlp": 0.0125983, "epoch": 0.5061175409589659, "flos": 21659689641600.0, "grad_norm": 2.3916077560707243, "language_loss": 0.63232589, "learning_rate": 2.0574376995809588e-06, "loss": 0.70950115, "num_input_tokens_seen": 180979725, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.12280273, "step": 8418, "time_per_iteration": 2.771278142929077 }, { "auxiliary_loss_clip": 0.06454188, "auxiliary_loss_mlp": 0.01270456, "balance_loss_clip": 0.06288548, "balance_loss_mlp": 0.01258136, "epoch": 0.5061776642116339, "flos": 21622653336960.0, "grad_norm": 1.9149544026142251, "language_loss": 0.78252989, "learning_rate": 2.0570483974546653e-06, "loss": 0.85977638, "num_input_tokens_seen": 180998980, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.12316895, "step": 8419, "time_per_iteration": 2.599388599395752 }, { "auxiliary_loss_clip": 0.0645048, "auxiliary_loss_mlp": 0.01268105, "balance_loss_clip": 0.06284469, "balance_loss_mlp": 0.01254813, "epoch": 0.5062377874643018, "flos": 24433276440960.0, "grad_norm": 1.6706700444801132, "language_loss": 0.77659231, "learning_rate": 2.0566590931650917e-06, "loss": 0.85377812, "num_input_tokens_seen": 181019165, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.13299561, "step": 8420, "time_per_iteration": 2.6229989528656006 }, { "auxiliary_loss_clip": 0.06452386, "auxiliary_loss_mlp": 0.01271834, "balance_loss_clip": 0.06289372, "balance_loss_mlp": 0.01258828, "epoch": 0.5062979107169698, "flos": 22530322690560.0, "grad_norm": 1.8800715140288915, "language_loss": 0.77706414, "learning_rate": 2.056269786726999e-06, "loss": 0.85430634, "num_input_tokens_seen": 181037110, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.13012695, "step": 8421, "time_per_iteration": 2.5623607635498047 }, { "auxiliary_loss_clip": 0.06445511, "auxiliary_loss_mlp": 0.01269876, "balance_loss_clip": 0.06282803, "balance_loss_mlp": 0.0125786, "epoch": 0.5063580339696377, "flos": 24578947964160.0, "grad_norm": 1.3945815962322372, "language_loss": 0.67048573, "learning_rate": 2.0558804781551512e-06, "loss": 0.7476396, "num_input_tokens_seen": 181057775, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.12017822, "step": 8422, "time_per_iteration": 2.62825345993042 }, { "auxiliary_loss_clip": 0.06445698, "auxiliary_loss_mlp": 0.01267116, "balance_loss_clip": 0.06285807, "balance_loss_mlp": 0.01254921, "epoch": 0.5064181572223058, "flos": 22601837750400.0, "grad_norm": 1.7492687128887179, "language_loss": 0.81826317, "learning_rate": 2.05549116746431e-06, "loss": 0.89539129, "num_input_tokens_seen": 181078260, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.12194824, "step": 8423, "time_per_iteration": 2.6647791862487793 }, { "auxiliary_loss_clip": 0.06452031, "auxiliary_loss_mlp": 0.01266417, "balance_loss_clip": 0.0628681, "balance_loss_mlp": 0.01253876, "epoch": 0.5064782804749737, "flos": 26002148762880.0, "grad_norm": 1.9132609326052297, "language_loss": 0.74635351, "learning_rate": 2.055101854669237e-06, "loss": 0.82353795, "num_input_tokens_seen": 181098755, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.12542725, "step": 8424, "time_per_iteration": 2.6185944080352783 }, { "auxiliary_loss_clip": 0.06446162, "auxiliary_loss_mlp": 0.01271143, "balance_loss_clip": 0.0628767, "balance_loss_mlp": 0.01259067, "epoch": 0.5065384037276417, "flos": 28561358090880.0, "grad_norm": 1.5381557205814518, "language_loss": 0.71674371, "learning_rate": 2.0547125397846975e-06, "loss": 0.79391682, "num_input_tokens_seen": 181121570, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.12078857, "step": 8425, "time_per_iteration": 2.700521469116211 }, { "auxiliary_loss_clip": 0.06447621, "auxiliary_loss_mlp": 0.01266948, "balance_loss_clip": 0.06286869, "balance_loss_mlp": 0.01254669, "epoch": 0.5065985269803096, "flos": 22972620067200.0, "grad_norm": 1.7175170654374545, "language_loss": 0.79057348, "learning_rate": 2.0543232228254524e-06, "loss": 0.86771917, "num_input_tokens_seen": 181140240, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.1229248, "step": 8426, "time_per_iteration": 2.5981364250183105 }, { "auxiliary_loss_clip": 0.06451936, "auxiliary_loss_mlp": 0.01268963, "balance_loss_clip": 0.06289697, "balance_loss_mlp": 0.01257483, "epoch": 0.5066586502329776, "flos": 21613680950400.0, "grad_norm": 2.176157756511963, "language_loss": 0.78290331, "learning_rate": 2.053933903806265e-06, "loss": 0.86011237, "num_input_tokens_seen": 181158630, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.11480713, "step": 8427, "time_per_iteration": 4.052650451660156 }, { "auxiliary_loss_clip": 0.06443088, "auxiliary_loss_mlp": 0.01266254, "balance_loss_clip": 0.06284009, "balance_loss_mlp": 0.01254232, "epoch": 0.5067187734856455, "flos": 20346214164480.0, "grad_norm": 1.715207099376553, "language_loss": 0.72159028, "learning_rate": 2.0535445827418997e-06, "loss": 0.79868364, "num_input_tokens_seen": 181176405, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.12017822, "step": 8428, "time_per_iteration": 3.974260091781616 }, { "auxiliary_loss_clip": 0.06443647, "auxiliary_loss_mlp": 0.01268453, "balance_loss_clip": 0.06283754, "balance_loss_mlp": 0.01257981, "epoch": 0.5067788967383136, "flos": 28848801922560.0, "grad_norm": 1.64766671550468, "language_loss": 0.83362746, "learning_rate": 2.0531552596471168e-06, "loss": 0.91074848, "num_input_tokens_seen": 181197595, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.10467529, "step": 8429, "time_per_iteration": 2.6402876377105713 }, { "auxiliary_loss_clip": 0.06454261, "auxiliary_loss_mlp": 0.01268484, "balance_loss_clip": 0.06286561, "balance_loss_mlp": 0.01255377, "epoch": 0.5068390199909815, "flos": 32457997964160.0, "grad_norm": 1.7510373795281768, "language_loss": 0.73306787, "learning_rate": 2.052765934536682e-06, "loss": 0.81029534, "num_input_tokens_seen": 181218560, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.13110352, "step": 8430, "time_per_iteration": 2.6294949054718018 }, { "auxiliary_loss_clip": 0.06447633, "auxiliary_loss_mlp": 0.01268017, "balance_loss_clip": 0.06285013, "balance_loss_mlp": 0.01256269, "epoch": 0.5068991432436495, "flos": 23152896126720.0, "grad_norm": 1.4865182967759054, "language_loss": 0.76795477, "learning_rate": 2.0523766074253575e-06, "loss": 0.84511125, "num_input_tokens_seen": 181237095, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.11761475, "step": 8431, "time_per_iteration": 2.602705717086792 }, { "auxiliary_loss_clip": 0.06441873, "auxiliary_loss_mlp": 0.01268319, "balance_loss_clip": 0.06282872, "balance_loss_mlp": 0.0125632, "epoch": 0.5069592664963174, "flos": 19941917414400.0, "grad_norm": 1.4650362008637405, "language_loss": 0.73050022, "learning_rate": 2.0519872783279074e-06, "loss": 0.80760217, "num_input_tokens_seen": 181255940, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.11999512, "step": 8432, "time_per_iteration": 2.547867774963379 }, { "auxiliary_loss_clip": 0.06357226, "auxiliary_loss_mlp": 0.01254359, "balance_loss_clip": 0.06291424, "balance_loss_mlp": 0.01252275, "epoch": 0.5070193897489854, "flos": 65812539888000.0, "grad_norm": 0.7343630877803897, "language_loss": 0.63302612, "learning_rate": 2.0515979472590945e-06, "loss": 0.70914197, "num_input_tokens_seen": 181316945, "router_z_loss_clip": 0.65673828, "router_z_loss_mlp": 0.02085876, "step": 8433, "time_per_iteration": 3.2189579010009766 }, { "auxiliary_loss_clip": 0.06449827, "auxiliary_loss_mlp": 0.01270924, "balance_loss_clip": 0.06288738, "balance_loss_mlp": 0.01258598, "epoch": 0.5070795130016534, "flos": 17281158537600.0, "grad_norm": 1.8006688465259315, "language_loss": 0.78076851, "learning_rate": 2.051208614233681e-06, "loss": 0.85797596, "num_input_tokens_seen": 181335555, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.12316895, "step": 8434, "time_per_iteration": 2.551301956176758 }, { "auxiliary_loss_clip": 0.06455374, "auxiliary_loss_mlp": 0.01268264, "balance_loss_clip": 0.0629006, "balance_loss_mlp": 0.01255932, "epoch": 0.5071396362543213, "flos": 21076416570240.0, "grad_norm": 1.5155445522219717, "language_loss": 0.71277583, "learning_rate": 2.0508192792664326e-06, "loss": 0.79001218, "num_input_tokens_seen": 181354580, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.12353516, "step": 8435, "time_per_iteration": 2.609042167663574 }, { "auxiliary_loss_clip": 0.06449306, "auxiliary_loss_mlp": 0.01270025, "balance_loss_clip": 0.06286927, "balance_loss_mlp": 0.01257186, "epoch": 0.5071997595069894, "flos": 23150841701760.0, "grad_norm": 2.0656721892521515, "language_loss": 0.72744131, "learning_rate": 2.050429942372112e-06, "loss": 0.80463469, "num_input_tokens_seen": 181374320, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.12854004, "step": 8436, "time_per_iteration": 2.6736044883728027 }, { "auxiliary_loss_clip": 0.06448604, "auxiliary_loss_mlp": 0.01270376, "balance_loss_clip": 0.06287494, "balance_loss_mlp": 0.01257698, "epoch": 0.5072598827596573, "flos": 22753756402560.0, "grad_norm": 1.6336004010392036, "language_loss": 0.84244668, "learning_rate": 2.050040603565483e-06, "loss": 0.91963649, "num_input_tokens_seen": 181392190, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.12677002, "step": 8437, "time_per_iteration": 2.602079391479492 }, { "auxiliary_loss_clip": 0.0644404, "auxiliary_loss_mlp": 0.01267345, "balance_loss_clip": 0.06285665, "balance_loss_mlp": 0.01255251, "epoch": 0.5073200060123253, "flos": 22573102999680.0, "grad_norm": 1.302193292539172, "language_loss": 0.80783761, "learning_rate": 2.049651262861309e-06, "loss": 0.88495147, "num_input_tokens_seen": 181413890, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.12097168, "step": 8438, "time_per_iteration": 5.504465341567993 }, { "auxiliary_loss_clip": 0.06449127, "auxiliary_loss_mlp": 0.01270162, "balance_loss_clip": 0.06285838, "balance_loss_mlp": 0.01256983, "epoch": 0.5073801292649932, "flos": 25812481046400.0, "grad_norm": 1.5276262507889928, "language_loss": 0.79814243, "learning_rate": 2.0492619202743543e-06, "loss": 0.87533534, "num_input_tokens_seen": 181433240, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.1317749, "step": 8439, "time_per_iteration": 2.6538236141204834 }, { "auxiliary_loss_clip": 0.06443242, "auxiliary_loss_mlp": 0.01266774, "balance_loss_clip": 0.06284886, "balance_loss_mlp": 0.01254984, "epoch": 0.5074402525176612, "flos": 25380916992000.0, "grad_norm": 1.5687606969934005, "language_loss": 0.71332169, "learning_rate": 2.048872575819383e-06, "loss": 0.79042184, "num_input_tokens_seen": 181453535, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.11791992, "step": 8440, "time_per_iteration": 2.614122152328491 }, { "auxiliary_loss_clip": 0.06446001, "auxiliary_loss_mlp": 0.0126818, "balance_loss_clip": 0.06283985, "balance_loss_mlp": 0.01255985, "epoch": 0.5075003757703291, "flos": 26071064346240.0, "grad_norm": 2.013579520428809, "language_loss": 0.71402973, "learning_rate": 2.048483229511158e-06, "loss": 0.79117155, "num_input_tokens_seen": 181474195, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.12188721, "step": 8441, "time_per_iteration": 2.679955005645752 }, { "auxiliary_loss_clip": 0.06447204, "auxiliary_loss_mlp": 0.01270729, "balance_loss_clip": 0.06282534, "balance_loss_mlp": 0.01257342, "epoch": 0.5075604990229972, "flos": 21841936272000.0, "grad_norm": 1.7411220125876106, "language_loss": 0.64091384, "learning_rate": 2.0480938813644445e-06, "loss": 0.71809316, "num_input_tokens_seen": 181494000, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.1338501, "step": 8442, "time_per_iteration": 2.570911169052124 }, { "auxiliary_loss_clip": 0.06439688, "auxiliary_loss_mlp": 0.01265863, "balance_loss_clip": 0.06284458, "balance_loss_mlp": 0.01254717, "epoch": 0.5076206222756651, "flos": 31986923909760.0, "grad_norm": 1.405927547380824, "language_loss": 0.71289933, "learning_rate": 2.047704531394006e-06, "loss": 0.78995478, "num_input_tokens_seen": 181515955, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.11151123, "step": 8443, "time_per_iteration": 2.6880104541778564 }, { "auxiliary_loss_clip": 0.06450461, "auxiliary_loss_mlp": 0.01271287, "balance_loss_clip": 0.06286944, "balance_loss_mlp": 0.01258723, "epoch": 0.5076807455283331, "flos": 36913033445760.0, "grad_norm": 1.2806735851313964, "language_loss": 0.62600911, "learning_rate": 2.047315179614607e-06, "loss": 0.70322657, "num_input_tokens_seen": 181540225, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.12579346, "step": 8444, "time_per_iteration": 2.6918773651123047 }, { "auxiliary_loss_clip": 0.0644203, "auxiliary_loss_mlp": 0.01264254, "balance_loss_clip": 0.06282163, "balance_loss_mlp": 0.01253388, "epoch": 0.507740868781001, "flos": 29870263520640.0, "grad_norm": 1.7440709730763007, "language_loss": 0.6443851, "learning_rate": 2.046925826041012e-06, "loss": 0.72144794, "num_input_tokens_seen": 181560125, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.10864258, "step": 8445, "time_per_iteration": 2.628279209136963 }, { "auxiliary_loss_clip": 0.06353907, "auxiliary_loss_mlp": 0.01255357, "balance_loss_clip": 0.06288465, "balance_loss_mlp": 0.01253071, "epoch": 0.507800992033669, "flos": 61935872014080.0, "grad_norm": 0.8003428538981824, "language_loss": 0.61627066, "learning_rate": 2.0465364706879845e-06, "loss": 0.69236326, "num_input_tokens_seen": 181618830, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 0.02288818, "step": 8446, "time_per_iteration": 3.218061923980713 }, { "auxiliary_loss_clip": 0.06440784, "auxiliary_loss_mlp": 0.01268241, "balance_loss_clip": 0.06280291, "balance_loss_mlp": 0.01256326, "epoch": 0.507861115286337, "flos": 20706137377920.0, "grad_norm": 1.5802469296435555, "language_loss": 0.80870509, "learning_rate": 2.04614711357029e-06, "loss": 0.88579535, "num_input_tokens_seen": 181637120, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.11914062, "step": 8447, "time_per_iteration": 2.5804576873779297 }, { "auxiliary_loss_clip": 0.06442595, "auxiliary_loss_mlp": 0.01268184, "balance_loss_clip": 0.06284705, "balance_loss_mlp": 0.0125705, "epoch": 0.507921238539005, "flos": 30854982303360.0, "grad_norm": 1.3701823667246413, "language_loss": 0.70610017, "learning_rate": 2.0457577547026916e-06, "loss": 0.78320795, "num_input_tokens_seen": 181659965, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.11138916, "step": 8448, "time_per_iteration": 2.6424195766448975 }, { "auxiliary_loss_clip": 0.06441586, "auxiliary_loss_mlp": 0.01269602, "balance_loss_clip": 0.06285103, "balance_loss_mlp": 0.01258831, "epoch": 0.507981361791673, "flos": 35709031728000.0, "grad_norm": 1.6688096836601005, "language_loss": 0.71988207, "learning_rate": 2.045368394099955e-06, "loss": 0.79699397, "num_input_tokens_seen": 181685290, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.10760498, "step": 8449, "time_per_iteration": 2.7006781101226807 }, { "auxiliary_loss_clip": 0.06443628, "auxiliary_loss_mlp": 0.01268574, "balance_loss_clip": 0.06283395, "balance_loss_mlp": 0.01257148, "epoch": 0.5080414850443409, "flos": 27168694905600.0, "grad_norm": 1.8638610081610731, "language_loss": 0.73234773, "learning_rate": 2.044979031776844e-06, "loss": 0.80946982, "num_input_tokens_seen": 181706080, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.11419678, "step": 8450, "time_per_iteration": 2.620429039001465 }, { "auxiliary_loss_clip": 0.06445256, "auxiliary_loss_mlp": 0.0126695, "balance_loss_clip": 0.06285588, "balance_loss_mlp": 0.01255524, "epoch": 0.5081016082970089, "flos": 27091855111680.0, "grad_norm": 1.530396431278492, "language_loss": 0.77122271, "learning_rate": 2.0445896677481234e-06, "loss": 0.84834468, "num_input_tokens_seen": 181724805, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.11425781, "step": 8451, "time_per_iteration": 2.630699396133423 }, { "auxiliary_loss_clip": 0.06443061, "auxiliary_loss_mlp": 0.01267459, "balance_loss_clip": 0.06282074, "balance_loss_mlp": 0.01254966, "epoch": 0.5081617315496768, "flos": 22863104380800.0, "grad_norm": 1.6033304293403243, "language_loss": 0.85265309, "learning_rate": 2.044200302028559e-06, "loss": 0.92975831, "num_input_tokens_seen": 181743725, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.12487793, "step": 8452, "time_per_iteration": 2.5934367179870605 }, { "auxiliary_loss_clip": 0.06452669, "auxiliary_loss_mlp": 0.01269738, "balance_loss_clip": 0.06287555, "balance_loss_mlp": 0.01256929, "epoch": 0.5082218548023448, "flos": 16286167630080.0, "grad_norm": 2.839593141125524, "language_loss": 0.78528464, "learning_rate": 2.0438109346329143e-06, "loss": 0.86250865, "num_input_tokens_seen": 181757720, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.12792969, "step": 8453, "time_per_iteration": 2.5426273345947266 }, { "auxiliary_loss_clip": 0.06443182, "auxiliary_loss_mlp": 0.012689, "balance_loss_clip": 0.06286664, "balance_loss_mlp": 0.01258267, "epoch": 0.5082819780550127, "flos": 24467419779840.0, "grad_norm": 1.6090036365882885, "language_loss": 0.76700908, "learning_rate": 2.0434215655759544e-06, "loss": 0.84412992, "num_input_tokens_seen": 181778545, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.10632324, "step": 8454, "time_per_iteration": 2.616507053375244 }, { "auxiliary_loss_clip": 0.06448478, "auxiliary_loss_mlp": 0.0127161, "balance_loss_clip": 0.06287575, "balance_loss_mlp": 0.01259295, "epoch": 0.5083421013076808, "flos": 23409844272000.0, "grad_norm": 1.5131909679132167, "language_loss": 0.89994729, "learning_rate": 2.0430321948724446e-06, "loss": 0.97714818, "num_input_tokens_seen": 181799495, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.12316895, "step": 8455, "time_per_iteration": 2.59527850151062 }, { "auxiliary_loss_clip": 0.06453342, "auxiliary_loss_mlp": 0.01272176, "balance_loss_clip": 0.06286745, "balance_loss_mlp": 0.01258735, "epoch": 0.5084022245603487, "flos": 23878528485120.0, "grad_norm": 1.6841408641085032, "language_loss": 0.62863779, "learning_rate": 2.042642822537149e-06, "loss": 0.70589298, "num_input_tokens_seen": 181818400, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.13433838, "step": 8456, "time_per_iteration": 2.5957248210906982 }, { "auxiliary_loss_clip": 0.06342064, "auxiliary_loss_mlp": 0.01252459, "balance_loss_clip": 0.06276043, "balance_loss_mlp": 0.01250136, "epoch": 0.5084623478130167, "flos": 62891352921600.0, "grad_norm": 0.8053847278118527, "language_loss": 0.62391043, "learning_rate": 2.0422534485848343e-06, "loss": 0.69985557, "num_input_tokens_seen": 181875975, "router_z_loss_clip": 0.66015625, "router_z_loss_mlp": 0.02319336, "step": 8457, "time_per_iteration": 3.0928735733032227 }, { "auxiliary_loss_clip": 0.06449576, "auxiliary_loss_mlp": 0.01270344, "balance_loss_clip": 0.06285722, "balance_loss_mlp": 0.01257475, "epoch": 0.5085224710656846, "flos": 22352688034560.0, "grad_norm": 1.5028509690935041, "language_loss": 0.68014348, "learning_rate": 2.0418640730302644e-06, "loss": 0.75734264, "num_input_tokens_seen": 181896450, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.12860107, "step": 8458, "time_per_iteration": 2.5774855613708496 }, { "auxiliary_loss_clip": 0.0644818, "auxiliary_loss_mlp": 0.01271588, "balance_loss_clip": 0.06282975, "balance_loss_mlp": 0.01258928, "epoch": 0.5085825943183526, "flos": 26073202625280.0, "grad_norm": 1.9314614308437976, "language_loss": 0.77631891, "learning_rate": 2.0414746958882043e-06, "loss": 0.85351658, "num_input_tokens_seen": 181916770, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.12652588, "step": 8459, "time_per_iteration": 2.6277520656585693 }, { "auxiliary_loss_clip": 0.06456301, "auxiliary_loss_mlp": 0.01269509, "balance_loss_clip": 0.06288484, "balance_loss_mlp": 0.01256069, "epoch": 0.5086427175710206, "flos": 17426494644480.0, "grad_norm": 1.7791195976247873, "language_loss": 0.80484653, "learning_rate": 2.0410853171734196e-06, "loss": 0.88210464, "num_input_tokens_seen": 181932710, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.13433838, "step": 8460, "time_per_iteration": 2.5409486293792725 }, { "auxiliary_loss_clip": 0.06445914, "auxiliary_loss_mlp": 0.01271353, "balance_loss_clip": 0.0628266, "balance_loss_mlp": 0.01259283, "epoch": 0.5087028408236886, "flos": 20638102262400.0, "grad_norm": 1.6431377879376858, "language_loss": 0.69122064, "learning_rate": 2.0406959369006754e-06, "loss": 0.76839328, "num_input_tokens_seen": 181950665, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.12072754, "step": 8461, "time_per_iteration": 2.5602331161499023 }, { "auxiliary_loss_clip": 0.06438126, "auxiliary_loss_mlp": 0.01272356, "balance_loss_clip": 0.06282015, "balance_loss_mlp": 0.01260292, "epoch": 0.5087629640763566, "flos": 25600996540800.0, "grad_norm": 1.792076057187128, "language_loss": 0.76407838, "learning_rate": 2.0403065550847375e-06, "loss": 0.84118325, "num_input_tokens_seen": 181971270, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.12072754, "step": 8462, "time_per_iteration": 2.599031448364258 }, { "auxiliary_loss_clip": 0.06448528, "auxiliary_loss_mlp": 0.01270263, "balance_loss_clip": 0.06289782, "balance_loss_mlp": 0.01257853, "epoch": 0.5088230873290245, "flos": 13266743351040.0, "grad_norm": 2.0255510734442246, "language_loss": 0.81550694, "learning_rate": 2.0399171717403706e-06, "loss": 0.89269477, "num_input_tokens_seen": 181988410, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.12408447, "step": 8463, "time_per_iteration": 2.5580806732177734 }, { "auxiliary_loss_clip": 0.06445377, "auxiliary_loss_mlp": 0.0126916, "balance_loss_clip": 0.0628599, "balance_loss_mlp": 0.01257382, "epoch": 0.5088832105816925, "flos": 20048959405440.0, "grad_norm": 1.5761671874475838, "language_loss": 0.76332045, "learning_rate": 2.039527786882341e-06, "loss": 0.84046578, "num_input_tokens_seen": 182006530, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.11779785, "step": 8464, "time_per_iteration": 2.6830215454101562 }, { "auxiliary_loss_clip": 0.06350161, "auxiliary_loss_mlp": 0.01254401, "balance_loss_clip": 0.06284116, "balance_loss_mlp": 0.01252141, "epoch": 0.5089433338343604, "flos": 67445072184960.0, "grad_norm": 0.67413125485532, "language_loss": 0.59114134, "learning_rate": 2.0391384005254133e-06, "loss": 0.66718698, "num_input_tokens_seen": 182074240, "router_z_loss_clip": 0.66162109, "router_z_loss_mlp": 0.02264404, "step": 8465, "time_per_iteration": 3.4879963397979736 }, { "auxiliary_loss_clip": 0.06445286, "auxiliary_loss_mlp": 0.01269247, "balance_loss_clip": 0.06285253, "balance_loss_mlp": 0.01256986, "epoch": 0.5090034570870284, "flos": 22716845879040.0, "grad_norm": 2.086036842967225, "language_loss": 0.79994851, "learning_rate": 2.038749012684354e-06, "loss": 0.87709379, "num_input_tokens_seen": 182093360, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.12261963, "step": 8466, "time_per_iteration": 2.5976719856262207 }, { "auxiliary_loss_clip": 0.06441557, "auxiliary_loss_mlp": 0.01264239, "balance_loss_clip": 0.06283577, "balance_loss_mlp": 0.01252771, "epoch": 0.5090635803396963, "flos": 20451537146880.0, "grad_norm": 1.8365084955283715, "language_loss": 0.78412741, "learning_rate": 2.0383596233739286e-06, "loss": 0.86118543, "num_input_tokens_seen": 182110170, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.11468506, "step": 8467, "time_per_iteration": 5.453657388687134 }, { "auxiliary_loss_clip": 0.06440061, "auxiliary_loss_mlp": 0.01269284, "balance_loss_clip": 0.06284995, "balance_loss_mlp": 0.01258191, "epoch": 0.5091237035923644, "flos": 23775637271040.0, "grad_norm": 1.6184580143942497, "language_loss": 0.74712586, "learning_rate": 2.0379702326089013e-06, "loss": 0.82421929, "num_input_tokens_seen": 182129570, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.11096191, "step": 8468, "time_per_iteration": 2.5822300910949707 }, { "auxiliary_loss_clip": 0.06440124, "auxiliary_loss_mlp": 0.01270433, "balance_loss_clip": 0.06281282, "balance_loss_mlp": 0.01258005, "epoch": 0.5091838268450323, "flos": 18332990040960.0, "grad_norm": 2.9627301889258653, "language_loss": 0.78310812, "learning_rate": 2.03758084040404e-06, "loss": 0.86021376, "num_input_tokens_seen": 182147565, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.12432861, "step": 8469, "time_per_iteration": 2.5655012130737305 }, { "auxiliary_loss_clip": 0.06443475, "auxiliary_loss_mlp": 0.01268946, "balance_loss_clip": 0.06284653, "balance_loss_mlp": 0.01256632, "epoch": 0.5092439500977003, "flos": 29064982256640.0, "grad_norm": 1.6300542165457135, "language_loss": 0.6968087, "learning_rate": 2.037191446774109e-06, "loss": 0.77393287, "num_input_tokens_seen": 182169695, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.12335205, "step": 8470, "time_per_iteration": 2.627727508544922 }, { "auxiliary_loss_clip": 0.06445497, "auxiliary_loss_mlp": 0.0127047, "balance_loss_clip": 0.06283379, "balance_loss_mlp": 0.01257882, "epoch": 0.5093040733503682, "flos": 13559134573440.0, "grad_norm": 2.0550068288721803, "language_loss": 0.73591268, "learning_rate": 2.0368020517338745e-06, "loss": 0.81307232, "num_input_tokens_seen": 182186385, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.12597656, "step": 8471, "time_per_iteration": 2.520343542098999 }, { "auxiliary_loss_clip": 0.06352456, "auxiliary_loss_mlp": 0.01254665, "balance_loss_clip": 0.06286603, "balance_loss_mlp": 0.0125271, "epoch": 0.5093641966030362, "flos": 68927838837120.0, "grad_norm": 0.7416061799643918, "language_loss": 0.58038181, "learning_rate": 2.036412655298103e-06, "loss": 0.65645301, "num_input_tokens_seen": 182247095, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 0.01953125, "step": 8472, "time_per_iteration": 3.181975841522217 }, { "auxiliary_loss_clip": 0.0645087, "auxiliary_loss_mlp": 0.01268884, "balance_loss_clip": 0.06289019, "balance_loss_mlp": 0.01257196, "epoch": 0.5094243198557042, "flos": 21587545676160.0, "grad_norm": 1.7397750314357834, "language_loss": 0.69278759, "learning_rate": 2.03602325748156e-06, "loss": 0.76998514, "num_input_tokens_seen": 182266380, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.11700439, "step": 8473, "time_per_iteration": 2.5652616024017334 }, { "auxiliary_loss_clip": 0.06451484, "auxiliary_loss_mlp": 0.01271223, "balance_loss_clip": 0.06291999, "balance_loss_mlp": 0.01258772, "epoch": 0.5094844431083722, "flos": 28848382652160.0, "grad_norm": 2.3815108440279835, "language_loss": 0.8560949, "learning_rate": 2.0356338582990105e-06, "loss": 0.93332201, "num_input_tokens_seen": 182284685, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.12463379, "step": 8474, "time_per_iteration": 2.5944876670837402 }, { "auxiliary_loss_clip": 0.06448276, "auxiliary_loss_mlp": 0.01267601, "balance_loss_clip": 0.06286514, "balance_loss_mlp": 0.01255639, "epoch": 0.5095445663610402, "flos": 14981454904320.0, "grad_norm": 2.017299262057435, "language_loss": 0.65484542, "learning_rate": 2.035244457765222e-06, "loss": 0.73200417, "num_input_tokens_seen": 182301810, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.11968994, "step": 8475, "time_per_iteration": 2.528674364089966 }, { "auxiliary_loss_clip": 0.06457166, "auxiliary_loss_mlp": 0.01268214, "balance_loss_clip": 0.06293199, "balance_loss_mlp": 0.01256126, "epoch": 0.5096046896137081, "flos": 20783354515200.0, "grad_norm": 2.2471405240724738, "language_loss": 0.82330161, "learning_rate": 2.0348550558949605e-06, "loss": 0.90055537, "num_input_tokens_seen": 182320285, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.12084961, "step": 8476, "time_per_iteration": 2.542818307876587 }, { "auxiliary_loss_clip": 0.06452508, "auxiliary_loss_mlp": 0.01266997, "balance_loss_clip": 0.06288977, "balance_loss_mlp": 0.01254152, "epoch": 0.5096648128663761, "flos": 23191735294080.0, "grad_norm": 2.0052974007447126, "language_loss": 0.81033856, "learning_rate": 2.0344656527029917e-06, "loss": 0.88753355, "num_input_tokens_seen": 182339465, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.12860107, "step": 8477, "time_per_iteration": 4.020748853683472 }, { "auxiliary_loss_clip": 0.06446542, "auxiliary_loss_mlp": 0.01270183, "balance_loss_clip": 0.06283328, "balance_loss_mlp": 0.0125713, "epoch": 0.509724936119044, "flos": 22315945219200.0, "grad_norm": 1.8422314889883518, "language_loss": 0.61654544, "learning_rate": 2.034076248204082e-06, "loss": 0.69371265, "num_input_tokens_seen": 182358375, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.13043213, "step": 8478, "time_per_iteration": 3.9509029388427734 }, { "auxiliary_loss_clip": 0.06440778, "auxiliary_loss_mlp": 0.01270337, "balance_loss_clip": 0.06283206, "balance_loss_mlp": 0.01258893, "epoch": 0.509785059371712, "flos": 26294372277120.0, "grad_norm": 1.5303453185527307, "language_loss": 0.6649093, "learning_rate": 2.0336868424129968e-06, "loss": 0.74202049, "num_input_tokens_seen": 182377935, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.11431885, "step": 8479, "time_per_iteration": 2.5913140773773193 }, { "auxiliary_loss_clip": 0.06445812, "auxiliary_loss_mlp": 0.01267249, "balance_loss_clip": 0.06287754, "balance_loss_mlp": 0.01255709, "epoch": 0.50984518262438, "flos": 22970942985600.0, "grad_norm": 1.4725491249135407, "language_loss": 0.69296026, "learning_rate": 2.0332974353445037e-06, "loss": 0.77009094, "num_input_tokens_seen": 182396440, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.11547852, "step": 8480, "time_per_iteration": 2.6072592735290527 }, { "auxiliary_loss_clip": 0.06449753, "auxiliary_loss_mlp": 0.01267104, "balance_loss_clip": 0.06286731, "balance_loss_mlp": 0.01255183, "epoch": 0.509905305877048, "flos": 26220551230080.0, "grad_norm": 1.9127305264460108, "language_loss": 0.79924381, "learning_rate": 2.0329080270133688e-06, "loss": 0.87641239, "num_input_tokens_seen": 182415890, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.1192627, "step": 8481, "time_per_iteration": 2.6590864658355713 }, { "auxiliary_loss_clip": 0.06440538, "auxiliary_loss_mlp": 0.01270516, "balance_loss_clip": 0.06284787, "balance_loss_mlp": 0.01259143, "epoch": 0.5099654291297159, "flos": 20346381872640.0, "grad_norm": 1.5148771934842702, "language_loss": 0.83826792, "learning_rate": 2.0325186174343578e-06, "loss": 0.91537845, "num_input_tokens_seen": 182434235, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.1137085, "step": 8482, "time_per_iteration": 2.6844236850738525 }, { "auxiliary_loss_clip": 0.06451225, "auxiliary_loss_mlp": 0.01268857, "balance_loss_clip": 0.06286722, "balance_loss_mlp": 0.01256781, "epoch": 0.5100255523823839, "flos": 29061711947520.0, "grad_norm": 1.8839292794818845, "language_loss": 0.85723597, "learning_rate": 2.032129206622238e-06, "loss": 0.9344368, "num_input_tokens_seen": 182454360, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.12078857, "step": 8483, "time_per_iteration": 2.6390554904937744 }, { "auxiliary_loss_clip": 0.06446864, "auxiliary_loss_mlp": 0.01267845, "balance_loss_clip": 0.06285185, "balance_loss_mlp": 0.01256568, "epoch": 0.5100856756350518, "flos": 22462539137280.0, "grad_norm": 1.9025209484461465, "language_loss": 0.83112347, "learning_rate": 2.031739794591775e-06, "loss": 0.9082706, "num_input_tokens_seen": 182471940, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.112854, "step": 8484, "time_per_iteration": 2.554272174835205 }, { "auxiliary_loss_clip": 0.06450462, "auxiliary_loss_mlp": 0.01266811, "balance_loss_clip": 0.06290433, "balance_loss_mlp": 0.01254526, "epoch": 0.5101457988877198, "flos": 19176942764160.0, "grad_norm": 2.0828103048745303, "language_loss": 0.81617093, "learning_rate": 2.031350381357736e-06, "loss": 0.89334357, "num_input_tokens_seen": 182490685, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.12280273, "step": 8485, "time_per_iteration": 2.5912482738494873 }, { "auxiliary_loss_clip": 0.06437755, "auxiliary_loss_mlp": 0.01269783, "balance_loss_clip": 0.0628168, "balance_loss_mlp": 0.01258434, "epoch": 0.5102059221403878, "flos": 14871645728640.0, "grad_norm": 1.791003935224409, "language_loss": 0.74193013, "learning_rate": 2.0309609669348874e-06, "loss": 0.81900555, "num_input_tokens_seen": 182508325, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.11346436, "step": 8486, "time_per_iteration": 2.548577070236206 }, { "auxiliary_loss_clip": 0.06449646, "auxiliary_loss_mlp": 0.01266928, "balance_loss_clip": 0.06287018, "balance_loss_mlp": 0.0125462, "epoch": 0.5102660453930558, "flos": 22966876062720.0, "grad_norm": 1.4120873306357933, "language_loss": 0.70081192, "learning_rate": 2.0305715513379953e-06, "loss": 0.77797765, "num_input_tokens_seen": 182527020, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.12304688, "step": 8487, "time_per_iteration": 2.5832369327545166 }, { "auxiliary_loss_clip": 0.06443417, "auxiliary_loss_mlp": 0.01266126, "balance_loss_clip": 0.06286391, "balance_loss_mlp": 0.01253996, "epoch": 0.5103261686457238, "flos": 23156082581760.0, "grad_norm": 2.0886478796018784, "language_loss": 0.72617674, "learning_rate": 2.030182134581827e-06, "loss": 0.80327219, "num_input_tokens_seen": 182543505, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.12127686, "step": 8488, "time_per_iteration": 2.544482946395874 }, { "auxiliary_loss_clip": 0.06452888, "auxiliary_loss_mlp": 0.01271566, "balance_loss_clip": 0.0628921, "balance_loss_mlp": 0.0126017, "epoch": 0.5103862918983917, "flos": 14324444640000.0, "grad_norm": 1.8351403460188838, "language_loss": 0.69954467, "learning_rate": 2.0297927166811503e-06, "loss": 0.77678925, "num_input_tokens_seen": 182562250, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.1138916, "step": 8489, "time_per_iteration": 2.5518367290496826 }, { "auxiliary_loss_clip": 0.06446806, "auxiliary_loss_mlp": 0.0126389, "balance_loss_clip": 0.06284991, "balance_loss_mlp": 0.01252953, "epoch": 0.5104464151510597, "flos": 25855638698880.0, "grad_norm": 1.7798187554397786, "language_loss": 0.72773713, "learning_rate": 2.0294032976507297e-06, "loss": 0.80484402, "num_input_tokens_seen": 182581910, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.109375, "step": 8490, "time_per_iteration": 2.6074378490448 }, { "auxiliary_loss_clip": 0.0644016, "auxiliary_loss_mlp": 0.01272431, "balance_loss_clip": 0.06283319, "balance_loss_mlp": 0.01261476, "epoch": 0.5105065384037276, "flos": 21659354225280.0, "grad_norm": 1.514879027727431, "language_loss": 0.80631208, "learning_rate": 2.0290138775053337e-06, "loss": 0.88343799, "num_input_tokens_seen": 182601350, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.10955811, "step": 8491, "time_per_iteration": 2.6004745960235596 }, { "auxiliary_loss_clip": 0.06440353, "auxiliary_loss_mlp": 0.01268754, "balance_loss_clip": 0.06286431, "balance_loss_mlp": 0.01257382, "epoch": 0.5105666616563956, "flos": 22498066068480.0, "grad_norm": 1.9417605605662243, "language_loss": 0.79359967, "learning_rate": 2.028624456259728e-06, "loss": 0.8706907, "num_input_tokens_seen": 182619660, "router_z_loss_clip": 1.54003906, "router_z_loss_mlp": 0.1137085, "step": 8492, "time_per_iteration": 2.5608365535736084 }, { "auxiliary_loss_clip": 0.06452563, "auxiliary_loss_mlp": 0.01272313, "balance_loss_clip": 0.06289073, "balance_loss_mlp": 0.01259409, "epoch": 0.5106267849090635, "flos": 22462371429120.0, "grad_norm": 1.8026712850143722, "language_loss": 0.78000337, "learning_rate": 2.0282350339286804e-06, "loss": 0.85725212, "num_input_tokens_seen": 182639815, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.12896729, "step": 8493, "time_per_iteration": 2.5726397037506104 }, { "auxiliary_loss_clip": 0.06444672, "auxiliary_loss_mlp": 0.01265416, "balance_loss_clip": 0.06284533, "balance_loss_mlp": 0.01253012, "epoch": 0.5106869081617316, "flos": 23553335589120.0, "grad_norm": 1.7633392735120488, "language_loss": 0.8365342, "learning_rate": 2.0278456105269574e-06, "loss": 0.91363513, "num_input_tokens_seen": 182659655, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.12402344, "step": 8494, "time_per_iteration": 2.5501763820648193 }, { "auxiliary_loss_clip": 0.06447333, "auxiliary_loss_mlp": 0.01267818, "balance_loss_clip": 0.0628534, "balance_loss_mlp": 0.01256631, "epoch": 0.5107470314143995, "flos": 26799547743360.0, "grad_norm": 1.9493839119287752, "language_loss": 0.79901958, "learning_rate": 2.027456186069326e-06, "loss": 0.87617111, "num_input_tokens_seen": 182677075, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.11187744, "step": 8495, "time_per_iteration": 2.6198766231536865 }, { "auxiliary_loss_clip": 0.06441948, "auxiliary_loss_mlp": 0.01268039, "balance_loss_clip": 0.06283548, "balance_loss_mlp": 0.01256154, "epoch": 0.5108071546670675, "flos": 25746877699200.0, "grad_norm": 1.452779708437495, "language_loss": 0.78324151, "learning_rate": 2.0270667605705535e-06, "loss": 0.86034143, "num_input_tokens_seen": 182699625, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.11883545, "step": 8496, "time_per_iteration": 2.622474193572998 }, { "auxiliary_loss_clip": 0.06440219, "auxiliary_loss_mlp": 0.01270535, "balance_loss_clip": 0.06284387, "balance_loss_mlp": 0.01258763, "epoch": 0.5108672779197354, "flos": 18703478868480.0, "grad_norm": 1.9358022521301608, "language_loss": 0.78858423, "learning_rate": 2.0266773340454066e-06, "loss": 0.86569178, "num_input_tokens_seen": 182717020, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.11773682, "step": 8497, "time_per_iteration": 2.6086971759796143 }, { "auxiliary_loss_clip": 0.06443284, "auxiliary_loss_mlp": 0.01265363, "balance_loss_clip": 0.06285826, "balance_loss_mlp": 0.01253651, "epoch": 0.5109274011724034, "flos": 26695482572160.0, "grad_norm": 1.6308989213886929, "language_loss": 0.82163423, "learning_rate": 2.0262879065086525e-06, "loss": 0.89872068, "num_input_tokens_seen": 182736955, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.11712646, "step": 8498, "time_per_iteration": 2.5830867290496826 }, { "auxiliary_loss_clip": 0.06439126, "auxiliary_loss_mlp": 0.01271763, "balance_loss_clip": 0.0628206, "balance_loss_mlp": 0.01259974, "epoch": 0.5109875244250714, "flos": 22790666926080.0, "grad_norm": 1.8923705669032727, "language_loss": 0.71043557, "learning_rate": 2.0258984779750584e-06, "loss": 0.78754443, "num_input_tokens_seen": 182757620, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.11785889, "step": 8499, "time_per_iteration": 2.609435796737671 }, { "auxiliary_loss_clip": 0.06442918, "auxiliary_loss_mlp": 0.01268735, "balance_loss_clip": 0.06283106, "balance_loss_mlp": 0.0125747, "epoch": 0.5110476476777394, "flos": 35596958492160.0, "grad_norm": 1.4727425233603295, "language_loss": 0.72542155, "learning_rate": 2.0255090484593914e-06, "loss": 0.80253804, "num_input_tokens_seen": 182780195, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.1126709, "step": 8500, "time_per_iteration": 2.651292085647583 }, { "auxiliary_loss_clip": 0.06452508, "auxiliary_loss_mlp": 0.01271038, "balance_loss_clip": 0.06285064, "balance_loss_mlp": 0.01257811, "epoch": 0.5111077709304074, "flos": 19286751939840.0, "grad_norm": 2.525256836643123, "language_loss": 0.62806237, "learning_rate": 2.0251196179764183e-06, "loss": 0.70529783, "num_input_tokens_seen": 182795765, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.13226318, "step": 8501, "time_per_iteration": 2.554098129272461 }, { "auxiliary_loss_clip": 0.06448235, "auxiliary_loss_mlp": 0.0127492, "balance_loss_clip": 0.06283344, "balance_loss_mlp": 0.01262933, "epoch": 0.5111678941830753, "flos": 20674551588480.0, "grad_norm": 1.68758510041284, "language_loss": 0.87335002, "learning_rate": 2.024730186540907e-06, "loss": 0.95058155, "num_input_tokens_seen": 182813120, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.11999512, "step": 8502, "time_per_iteration": 2.536116123199463 }, { "auxiliary_loss_clip": 0.06439947, "auxiliary_loss_mlp": 0.0126682, "balance_loss_clip": 0.06281, "balance_loss_mlp": 0.01255704, "epoch": 0.5112280174357433, "flos": 26295336599040.0, "grad_norm": 1.3564494418727022, "language_loss": 0.82468587, "learning_rate": 2.0243407541676253e-06, "loss": 0.90175354, "num_input_tokens_seen": 182835745, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.11114502, "step": 8503, "time_per_iteration": 2.652428388595581 }, { "auxiliary_loss_clip": 0.06336249, "auxiliary_loss_mlp": 0.01257375, "balance_loss_clip": 0.06269949, "balance_loss_mlp": 0.01255438, "epoch": 0.5112881406884112, "flos": 59490706492800.0, "grad_norm": 0.8422434724369294, "language_loss": 0.63758802, "learning_rate": 2.023951320871339e-06, "loss": 0.71352422, "num_input_tokens_seen": 182892540, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 0.01934814, "step": 8504, "time_per_iteration": 3.2236344814300537 }, { "auxiliary_loss_clip": 0.06443302, "auxiliary_loss_mlp": 0.01267415, "balance_loss_clip": 0.06286165, "balance_loss_mlp": 0.01255244, "epoch": 0.5113482639410792, "flos": 26476073856000.0, "grad_norm": 1.6540279827759263, "language_loss": 0.84391576, "learning_rate": 2.023561886666816e-06, "loss": 0.92102301, "num_input_tokens_seen": 182911515, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.1217041, "step": 8505, "time_per_iteration": 2.6275312900543213 }, { "auxiliary_loss_clip": 0.06440463, "auxiliary_loss_mlp": 0.01265761, "balance_loss_clip": 0.06282188, "balance_loss_mlp": 0.01254895, "epoch": 0.5114083871937471, "flos": 29903190975360.0, "grad_norm": 1.8495745807711146, "language_loss": 0.76405752, "learning_rate": 2.0231724515688246e-06, "loss": 0.84111977, "num_input_tokens_seen": 182930860, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.10870361, "step": 8506, "time_per_iteration": 5.499497413635254 }, { "auxiliary_loss_clip": 0.0644253, "auxiliary_loss_mlp": 0.01267269, "balance_loss_clip": 0.06282301, "balance_loss_mlp": 0.01254788, "epoch": 0.5114685104464152, "flos": 24321161278080.0, "grad_norm": 1.9432309362291025, "language_loss": 0.58225179, "learning_rate": 2.022783015592131e-06, "loss": 0.6593498, "num_input_tokens_seen": 182949960, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.12493896, "step": 8507, "time_per_iteration": 2.6178784370422363 }, { "auxiliary_loss_clip": 0.06446081, "auxiliary_loss_mlp": 0.0127172, "balance_loss_clip": 0.06287451, "balance_loss_mlp": 0.01259483, "epoch": 0.5115286336990831, "flos": 17024965079040.0, "grad_norm": 1.6918338833180455, "language_loss": 0.86143905, "learning_rate": 2.022393578751503e-06, "loss": 0.93861705, "num_input_tokens_seen": 182968085, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.12231445, "step": 8508, "time_per_iteration": 2.5604381561279297 }, { "auxiliary_loss_clip": 0.06444921, "auxiliary_loss_mlp": 0.01271679, "balance_loss_clip": 0.0628538, "balance_loss_mlp": 0.01258864, "epoch": 0.5115887569517511, "flos": 23666121584640.0, "grad_norm": 2.1293701186194935, "language_loss": 0.72295392, "learning_rate": 2.022004141061709e-06, "loss": 0.80011988, "num_input_tokens_seen": 182987275, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.12817383, "step": 8509, "time_per_iteration": 2.6030399799346924 }, { "auxiliary_loss_clip": 0.06439044, "auxiliary_loss_mlp": 0.0126629, "balance_loss_clip": 0.06283452, "balance_loss_mlp": 0.01255531, "epoch": 0.511648880204419, "flos": 16112725678080.0, "grad_norm": 1.6557012290056772, "language_loss": 0.76539904, "learning_rate": 2.0216147025375153e-06, "loss": 0.84245241, "num_input_tokens_seen": 183004700, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.10766602, "step": 8510, "time_per_iteration": 2.5497665405273438 }, { "auxiliary_loss_clip": 0.06439435, "auxiliary_loss_mlp": 0.01265683, "balance_loss_clip": 0.06282826, "balance_loss_mlp": 0.01255097, "epoch": 0.511709003457087, "flos": 32643221414400.0, "grad_norm": 1.5177702533436812, "language_loss": 0.71304011, "learning_rate": 2.0212252631936907e-06, "loss": 0.79009134, "num_input_tokens_seen": 183025830, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.105896, "step": 8511, "time_per_iteration": 2.8785760402679443 }, { "auxiliary_loss_clip": 0.06442767, "auxiliary_loss_mlp": 0.01267079, "balance_loss_clip": 0.06285329, "balance_loss_mlp": 0.01255421, "epoch": 0.511769126709755, "flos": 21768492568320.0, "grad_norm": 1.8436534090595853, "language_loss": 0.66544878, "learning_rate": 2.020835823045001e-06, "loss": 0.74254721, "num_input_tokens_seen": 183045140, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.11669922, "step": 8512, "time_per_iteration": 2.6632189750671387 }, { "auxiliary_loss_clip": 0.06442558, "auxiliary_loss_mlp": 0.01267431, "balance_loss_clip": 0.06279968, "balance_loss_mlp": 0.01254413, "epoch": 0.511829249962423, "flos": 23922231189120.0, "grad_norm": 1.6394895044362223, "language_loss": 0.67427158, "learning_rate": 2.0204463821062146e-06, "loss": 0.75137144, "num_input_tokens_seen": 183063935, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.13012695, "step": 8513, "time_per_iteration": 2.6652379035949707 }, { "auxiliary_loss_clip": 0.06439829, "auxiliary_loss_mlp": 0.01271566, "balance_loss_clip": 0.06283374, "balance_loss_mlp": 0.01259341, "epoch": 0.511889373215091, "flos": 23732856961920.0, "grad_norm": 1.8707668170270357, "language_loss": 0.69456017, "learning_rate": 2.0200569403921e-06, "loss": 0.7716741, "num_input_tokens_seen": 183084135, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.12237549, "step": 8514, "time_per_iteration": 2.601682424545288 }, { "auxiliary_loss_clip": 0.06439842, "auxiliary_loss_mlp": 0.01265244, "balance_loss_clip": 0.06282439, "balance_loss_mlp": 0.01254342, "epoch": 0.5119494964677589, "flos": 28119144568320.0, "grad_norm": 1.5163890511447649, "language_loss": 0.66340685, "learning_rate": 2.019667497917424e-06, "loss": 0.74045765, "num_input_tokens_seen": 183104570, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.10900879, "step": 8515, "time_per_iteration": 2.642571449279785 }, { "auxiliary_loss_clip": 0.06434577, "auxiliary_loss_mlp": 0.01267127, "balance_loss_clip": 0.0627912, "balance_loss_mlp": 0.0125638, "epoch": 0.5120096197204269, "flos": 24980225967360.0, "grad_norm": 2.7859021131585875, "language_loss": 0.75639552, "learning_rate": 2.019278054696955e-06, "loss": 0.83341259, "num_input_tokens_seen": 183123850, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.10742188, "step": 8516, "time_per_iteration": 2.5831692218780518 }, { "auxiliary_loss_clip": 0.06444231, "auxiliary_loss_mlp": 0.01269676, "balance_loss_clip": 0.06285301, "balance_loss_mlp": 0.01257874, "epoch": 0.5120697429730948, "flos": 17973863441280.0, "grad_norm": 1.9215865599953261, "language_loss": 0.78096616, "learning_rate": 2.0188886107454595e-06, "loss": 0.8581053, "num_input_tokens_seen": 183141725, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.11804199, "step": 8517, "time_per_iteration": 5.353282451629639 }, { "auxiliary_loss_clip": 0.06451736, "auxiliary_loss_mlp": 0.01270604, "balance_loss_clip": 0.06287201, "balance_loss_mlp": 0.01258105, "epoch": 0.5121298662257628, "flos": 23298651504000.0, "grad_norm": 1.9483958273919257, "language_loss": 0.74475288, "learning_rate": 2.0184991660777063e-06, "loss": 0.8219763, "num_input_tokens_seen": 183161300, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.125, "step": 8518, "time_per_iteration": 2.6079537868499756 }, { "auxiliary_loss_clip": 0.06442877, "auxiliary_loss_mlp": 0.01271659, "balance_loss_clip": 0.06282722, "balance_loss_mlp": 0.01259654, "epoch": 0.5121899894784308, "flos": 17316769322880.0, "grad_norm": 2.8360314256077457, "language_loss": 0.78751719, "learning_rate": 2.0181097207084625e-06, "loss": 0.86466259, "num_input_tokens_seen": 183180495, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.11999512, "step": 8519, "time_per_iteration": 2.555009365081787 }, { "auxiliary_loss_clip": 0.06443918, "auxiliary_loss_mlp": 0.01266073, "balance_loss_clip": 0.06285067, "balance_loss_mlp": 0.01254027, "epoch": 0.5122501127310988, "flos": 24935978211840.0, "grad_norm": 1.7970055219378889, "language_loss": 0.79925501, "learning_rate": 2.017720274652497e-06, "loss": 0.87635493, "num_input_tokens_seen": 183200330, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.12042236, "step": 8520, "time_per_iteration": 2.6473846435546875 }, { "auxiliary_loss_clip": 0.06455019, "auxiliary_loss_mlp": 0.01266785, "balance_loss_clip": 0.06288732, "balance_loss_mlp": 0.01253481, "epoch": 0.5123102359837667, "flos": 18448878637440.0, "grad_norm": 1.7020423435402208, "language_loss": 0.81743217, "learning_rate": 2.0173308279245765e-06, "loss": 0.89465016, "num_input_tokens_seen": 183218230, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.13311768, "step": 8521, "time_per_iteration": 2.5614914894104004 }, { "auxiliary_loss_clip": 0.06442745, "auxiliary_loss_mlp": 0.01265523, "balance_loss_clip": 0.062842, "balance_loss_mlp": 0.01253691, "epoch": 0.5123703592364347, "flos": 26691625284480.0, "grad_norm": 1.758713761328282, "language_loss": 0.6887728, "learning_rate": 2.0169413805394692e-06, "loss": 0.76585543, "num_input_tokens_seen": 183236735, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.11846924, "step": 8522, "time_per_iteration": 2.58378267288208 }, { "auxiliary_loss_clip": 0.06453606, "auxiliary_loss_mlp": 0.01268605, "balance_loss_clip": 0.06287529, "balance_loss_mlp": 0.0125461, "epoch": 0.5124304824891026, "flos": 28811555982720.0, "grad_norm": 1.6579440759016848, "language_loss": 0.61909652, "learning_rate": 2.0165519325119433e-06, "loss": 0.69631863, "num_input_tokens_seen": 183257550, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.14001465, "step": 8523, "time_per_iteration": 2.6314001083374023 }, { "auxiliary_loss_clip": 0.06448899, "auxiliary_loss_mlp": 0.01264161, "balance_loss_clip": 0.06289242, "balance_loss_mlp": 0.01252836, "epoch": 0.5124906057417706, "flos": 21768199079040.0, "grad_norm": 1.979771211235649, "language_loss": 0.77471006, "learning_rate": 2.0161624838567656e-06, "loss": 0.85184062, "num_input_tokens_seen": 183275515, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.11328125, "step": 8524, "time_per_iteration": 2.5447442531585693 }, { "auxiliary_loss_clip": 0.06444514, "auxiliary_loss_mlp": 0.01269342, "balance_loss_clip": 0.06285177, "balance_loss_mlp": 0.01257815, "epoch": 0.5125507289944387, "flos": 18886605966720.0, "grad_norm": 1.7803847301422744, "language_loss": 0.75097907, "learning_rate": 2.015773034588706e-06, "loss": 0.82811761, "num_input_tokens_seen": 183293880, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.11523438, "step": 8525, "time_per_iteration": 2.5978126525878906 }, { "auxiliary_loss_clip": 0.06447437, "auxiliary_loss_mlp": 0.01268984, "balance_loss_clip": 0.06285, "balance_loss_mlp": 0.01256884, "epoch": 0.5126108522471066, "flos": 35636761981440.0, "grad_norm": 2.1910790306662733, "language_loss": 0.75038594, "learning_rate": 2.015383584722531e-06, "loss": 0.82755011, "num_input_tokens_seen": 183315860, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.12097168, "step": 8526, "time_per_iteration": 2.68678617477417 }, { "auxiliary_loss_clip": 0.06451583, "auxiliary_loss_mlp": 0.01265876, "balance_loss_clip": 0.06288786, "balance_loss_mlp": 0.01254241, "epoch": 0.5126709754997746, "flos": 20196685353600.0, "grad_norm": 1.9492311188847673, "language_loss": 0.64809728, "learning_rate": 2.0149941342730088e-06, "loss": 0.72527188, "num_input_tokens_seen": 183335480, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.1161499, "step": 8527, "time_per_iteration": 2.638817071914673 }, { "auxiliary_loss_clip": 0.06440832, "auxiliary_loss_mlp": 0.01269977, "balance_loss_clip": 0.06288463, "balance_loss_mlp": 0.01258962, "epoch": 0.5127310987524425, "flos": 18594550160640.0, "grad_norm": 1.42325653924217, "language_loss": 0.74729675, "learning_rate": 2.014604683254908e-06, "loss": 0.8244049, "num_input_tokens_seen": 183354395, "router_z_loss_clip": 1.52148438, "router_z_loss_mlp": 0.11016846, "step": 8528, "time_per_iteration": 2.5538570880889893 }, { "auxiliary_loss_clip": 0.06445958, "auxiliary_loss_mlp": 0.01270646, "balance_loss_clip": 0.06286614, "balance_loss_mlp": 0.01258547, "epoch": 0.5127912220051105, "flos": 22461113617920.0, "grad_norm": 1.6433133255076318, "language_loss": 0.83340931, "learning_rate": 2.014215231682995e-06, "loss": 0.91057527, "num_input_tokens_seen": 183372980, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.12097168, "step": 8529, "time_per_iteration": 2.7193610668182373 }, { "auxiliary_loss_clip": 0.06445715, "auxiliary_loss_mlp": 0.01266113, "balance_loss_clip": 0.06288868, "balance_loss_mlp": 0.01254431, "epoch": 0.5128513452577784, "flos": 19098845159040.0, "grad_norm": 2.779582042942902, "language_loss": 0.74102312, "learning_rate": 2.01382577957204e-06, "loss": 0.8181414, "num_input_tokens_seen": 183390160, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.11694336, "step": 8530, "time_per_iteration": 2.6429338455200195 }, { "auxiliary_loss_clip": 0.0635834, "auxiliary_loss_mlp": 0.0125419, "balance_loss_clip": 0.06292216, "balance_loss_mlp": 0.01251854, "epoch": 0.5129114685104464, "flos": 67914553011840.0, "grad_norm": 0.736393051213828, "language_loss": 0.60766459, "learning_rate": 2.0134363269368095e-06, "loss": 0.68378991, "num_input_tokens_seen": 183455280, "router_z_loss_clip": 0.66357422, "router_z_loss_mlp": 0.02333069, "step": 8531, "time_per_iteration": 3.3377838134765625 }, { "auxiliary_loss_clip": 0.06451112, "auxiliary_loss_mlp": 0.01271777, "balance_loss_clip": 0.06288779, "balance_loss_mlp": 0.01259403, "epoch": 0.5129715917631144, "flos": 20455436361600.0, "grad_norm": 1.700307756712262, "language_loss": 0.77283382, "learning_rate": 2.0130468737920725e-06, "loss": 0.85006267, "num_input_tokens_seen": 183473955, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.1237793, "step": 8532, "time_per_iteration": 2.538682699203491 }, { "auxiliary_loss_clip": 0.06448962, "auxiliary_loss_mlp": 0.01264529, "balance_loss_clip": 0.0628975, "balance_loss_mlp": 0.0125284, "epoch": 0.5130317150157824, "flos": 35124836261760.0, "grad_norm": 1.8052236069160257, "language_loss": 0.67595398, "learning_rate": 2.012657420152597e-06, "loss": 0.75308883, "num_input_tokens_seen": 183497195, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.11682129, "step": 8533, "time_per_iteration": 2.720296621322632 }, { "auxiliary_loss_clip": 0.06451564, "auxiliary_loss_mlp": 0.01274879, "balance_loss_clip": 0.06288241, "balance_loss_mlp": 0.01262028, "epoch": 0.5130918382684503, "flos": 19797671410560.0, "grad_norm": 1.9945214363954669, "language_loss": 0.82485193, "learning_rate": 2.01226796603315e-06, "loss": 0.90211636, "num_input_tokens_seen": 183513675, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.12854004, "step": 8534, "time_per_iteration": 2.5294737815856934 }, { "auxiliary_loss_clip": 0.06453409, "auxiliary_loss_mlp": 0.01271648, "balance_loss_clip": 0.06290858, "balance_loss_mlp": 0.01258929, "epoch": 0.5131519615211183, "flos": 26330318478720.0, "grad_norm": 1.4121597564766788, "language_loss": 0.63818395, "learning_rate": 2.0118785114485017e-06, "loss": 0.71543455, "num_input_tokens_seen": 183535165, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.12713623, "step": 8535, "time_per_iteration": 2.631434917449951 }, { "auxiliary_loss_clip": 0.06450966, "auxiliary_loss_mlp": 0.01267183, "balance_loss_clip": 0.06292151, "balance_loss_mlp": 0.01255352, "epoch": 0.5132120847737862, "flos": 19177949013120.0, "grad_norm": 2.1112959902587813, "language_loss": 0.69913042, "learning_rate": 2.011489056413418e-06, "loss": 0.77631187, "num_input_tokens_seen": 183553780, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.11846924, "step": 8536, "time_per_iteration": 2.546782970428467 }, { "auxiliary_loss_clip": 0.0645974, "auxiliary_loss_mlp": 0.01272909, "balance_loss_clip": 0.06294723, "balance_loss_mlp": 0.01259695, "epoch": 0.5132722080264542, "flos": 20236698478080.0, "grad_norm": 2.098930268560715, "language_loss": 0.72331238, "learning_rate": 2.011099600942669e-06, "loss": 0.80063891, "num_input_tokens_seen": 183572285, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.13214111, "step": 8537, "time_per_iteration": 2.5796899795532227 }, { "auxiliary_loss_clip": 0.06448755, "auxiliary_loss_mlp": 0.01265379, "balance_loss_clip": 0.06286515, "balance_loss_mlp": 0.01253321, "epoch": 0.5133323312791223, "flos": 16474619462400.0, "grad_norm": 1.9682760064367382, "language_loss": 0.80380464, "learning_rate": 2.0107101450510214e-06, "loss": 0.88094592, "num_input_tokens_seen": 183589330, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.1206665, "step": 8538, "time_per_iteration": 2.523813486099243 }, { "auxiliary_loss_clip": 0.06443262, "auxiliary_loss_mlp": 0.01269446, "balance_loss_clip": 0.06283014, "balance_loss_mlp": 0.0125793, "epoch": 0.5133924545317902, "flos": 26075340904320.0, "grad_norm": 1.7503854203787028, "language_loss": 0.78468013, "learning_rate": 2.0103206887532437e-06, "loss": 0.86180723, "num_input_tokens_seen": 183609205, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.11523438, "step": 8539, "time_per_iteration": 2.6377170085906982 }, { "auxiliary_loss_clip": 0.06448726, "auxiliary_loss_mlp": 0.01270493, "balance_loss_clip": 0.06287944, "balance_loss_mlp": 0.01258042, "epoch": 0.5134525777844582, "flos": 29138467887360.0, "grad_norm": 1.5758925927711316, "language_loss": 0.76142907, "learning_rate": 2.009931232064105e-06, "loss": 0.83862126, "num_input_tokens_seen": 183629985, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.12438965, "step": 8540, "time_per_iteration": 2.629518508911133 }, { "auxiliary_loss_clip": 0.06458008, "auxiliary_loss_mlp": 0.01267737, "balance_loss_clip": 0.0629208, "balance_loss_mlp": 0.01254159, "epoch": 0.5135127010371261, "flos": 17460134858880.0, "grad_norm": 1.6800265744864615, "language_loss": 0.75137889, "learning_rate": 2.0095417749983724e-06, "loss": 0.82863641, "num_input_tokens_seen": 183648220, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.13568115, "step": 8541, "time_per_iteration": 2.642542839050293 }, { "auxiliary_loss_clip": 0.06446099, "auxiliary_loss_mlp": 0.01272164, "balance_loss_clip": 0.06285512, "balance_loss_mlp": 0.01259575, "epoch": 0.5135728242897941, "flos": 21951493885440.0, "grad_norm": 1.4976313011828657, "language_loss": 0.70611119, "learning_rate": 2.0091523175708162e-06, "loss": 0.78329384, "num_input_tokens_seen": 183668230, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.12597656, "step": 8542, "time_per_iteration": 2.6151492595672607 }, { "auxiliary_loss_clip": 0.0644275, "auxiliary_loss_mlp": 0.01265531, "balance_loss_clip": 0.06282929, "balance_loss_mlp": 0.01253288, "epoch": 0.513632947542462, "flos": 22681528583040.0, "grad_norm": 1.9866501887880792, "language_loss": 0.79899466, "learning_rate": 2.0087628597962023e-06, "loss": 0.87607753, "num_input_tokens_seen": 183687800, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.12255859, "step": 8543, "time_per_iteration": 2.632983446121216 }, { "auxiliary_loss_clip": 0.06449823, "auxiliary_loss_mlp": 0.01266742, "balance_loss_clip": 0.06290303, "balance_loss_mlp": 0.01254172, "epoch": 0.51369307079513, "flos": 29464289688960.0, "grad_norm": 1.6240113005839723, "language_loss": 0.68064296, "learning_rate": 2.008373401689299e-06, "loss": 0.75780863, "num_input_tokens_seen": 183709025, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.12579346, "step": 8544, "time_per_iteration": 2.6647162437438965 }, { "auxiliary_loss_clip": 0.06445517, "auxiliary_loss_mlp": 0.01267814, "balance_loss_clip": 0.06282049, "balance_loss_mlp": 0.01256173, "epoch": 0.513753194047798, "flos": 18995325039360.0, "grad_norm": 2.007934289842706, "language_loss": 0.72762448, "learning_rate": 2.0079839432648765e-06, "loss": 0.80475777, "num_input_tokens_seen": 183725740, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.11639404, "step": 8545, "time_per_iteration": 2.574227809906006 }, { "auxiliary_loss_clip": 0.06449614, "auxiliary_loss_mlp": 0.01273518, "balance_loss_clip": 0.06285777, "balance_loss_mlp": 0.01260435, "epoch": 0.513813317300466, "flos": 17827646866560.0, "grad_norm": 1.8610673817761592, "language_loss": 0.82427466, "learning_rate": 2.0075944845377016e-06, "loss": 0.90150601, "num_input_tokens_seen": 183743995, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.13085938, "step": 8546, "time_per_iteration": 4.0033793449401855 }, { "auxiliary_loss_clip": 0.06450115, "auxiliary_loss_mlp": 0.01271034, "balance_loss_clip": 0.06287989, "balance_loss_mlp": 0.01258588, "epoch": 0.5138734405531339, "flos": 24068070420480.0, "grad_norm": 1.7122323588422073, "language_loss": 0.73478508, "learning_rate": 2.007205025522544e-06, "loss": 0.81199658, "num_input_tokens_seen": 183764150, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.12457275, "step": 8547, "time_per_iteration": 2.6192421913146973 }, { "auxiliary_loss_clip": 0.06449689, "auxiliary_loss_mlp": 0.01273995, "balance_loss_clip": 0.06289153, "balance_loss_mlp": 0.01261925, "epoch": 0.5139335638058019, "flos": 26103279041280.0, "grad_norm": 1.8933162781112953, "language_loss": 0.7391603, "learning_rate": 2.0068155662341702e-06, "loss": 0.81639707, "num_input_tokens_seen": 183783280, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.12060547, "step": 8548, "time_per_iteration": 2.5960288047790527 }, { "auxiliary_loss_clip": 0.06450585, "auxiliary_loss_mlp": 0.01267839, "balance_loss_clip": 0.06288722, "balance_loss_mlp": 0.01255632, "epoch": 0.5139936870584698, "flos": 18923181073920.0, "grad_norm": 2.08770212269381, "language_loss": 0.82536137, "learning_rate": 2.0064261066873495e-06, "loss": 0.90254557, "num_input_tokens_seen": 183800725, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.12213135, "step": 8549, "time_per_iteration": 2.592236280441284 }, { "auxiliary_loss_clip": 0.06446877, "auxiliary_loss_mlp": 0.01272226, "balance_loss_clip": 0.06289662, "balance_loss_mlp": 0.0126114, "epoch": 0.5140538103111378, "flos": 16149594274560.0, "grad_norm": 1.8258240801882484, "language_loss": 0.72885299, "learning_rate": 2.0060366468968504e-06, "loss": 0.80604398, "num_input_tokens_seen": 183818735, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.11083984, "step": 8550, "time_per_iteration": 2.5250298976898193 }, { "auxiliary_loss_clip": 0.06456698, "auxiliary_loss_mlp": 0.01267211, "balance_loss_clip": 0.0629036, "balance_loss_mlp": 0.01255302, "epoch": 0.5141139335638057, "flos": 22426886424960.0, "grad_norm": 1.5812545405218543, "language_loss": 0.75797403, "learning_rate": 2.0056471868774408e-06, "loss": 0.83521307, "num_input_tokens_seen": 183840015, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.11901855, "step": 8551, "time_per_iteration": 2.628382682800293 }, { "auxiliary_loss_clip": 0.06450433, "auxiliary_loss_mlp": 0.01267785, "balance_loss_clip": 0.06294633, "balance_loss_mlp": 0.01257128, "epoch": 0.5141740568164738, "flos": 27097054064640.0, "grad_norm": 6.714196070236469, "language_loss": 0.69223291, "learning_rate": 2.0052577266438897e-06, "loss": 0.76941508, "num_input_tokens_seen": 183860145, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.10656738, "step": 8552, "time_per_iteration": 2.659301996231079 }, { "auxiliary_loss_clip": 0.06452997, "auxiliary_loss_mlp": 0.0127172, "balance_loss_clip": 0.06289786, "balance_loss_mlp": 0.01259388, "epoch": 0.5142341800691418, "flos": 24980267894400.0, "grad_norm": 1.782508318215315, "language_loss": 0.75351954, "learning_rate": 2.004868266210965e-06, "loss": 0.83076668, "num_input_tokens_seen": 183880540, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.12335205, "step": 8553, "time_per_iteration": 2.683807611465454 }, { "auxiliary_loss_clip": 0.06447443, "auxiliary_loss_mlp": 0.01266754, "balance_loss_clip": 0.0628726, "balance_loss_mlp": 0.01254768, "epoch": 0.5142943033218097, "flos": 20710833206400.0, "grad_norm": 1.6442440188301048, "language_loss": 0.68197131, "learning_rate": 2.004478805593435e-06, "loss": 0.75911331, "num_input_tokens_seen": 183900895, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.11987305, "step": 8554, "time_per_iteration": 2.5542354583740234 }, { "auxiliary_loss_clip": 0.06455252, "auxiliary_loss_mlp": 0.01273072, "balance_loss_clip": 0.06288978, "balance_loss_mlp": 0.01260061, "epoch": 0.5143544265744777, "flos": 22931391058560.0, "grad_norm": 1.666050125306369, "language_loss": 0.7372067, "learning_rate": 2.004089344806068e-06, "loss": 0.81448996, "num_input_tokens_seen": 183920335, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.13006592, "step": 8555, "time_per_iteration": 2.623793363571167 }, { "auxiliary_loss_clip": 0.06445862, "auxiliary_loss_mlp": 0.01272924, "balance_loss_clip": 0.06284042, "balance_loss_mlp": 0.0126139, "epoch": 0.5144145498271456, "flos": 15926328270720.0, "grad_norm": 2.350496703125521, "language_loss": 0.75207675, "learning_rate": 2.003699883863633e-06, "loss": 0.82926464, "num_input_tokens_seen": 183936220, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.11523438, "step": 8556, "time_per_iteration": 3.994441509246826 }, { "auxiliary_loss_clip": 0.0644121, "auxiliary_loss_mlp": 0.01269026, "balance_loss_clip": 0.06284855, "balance_loss_mlp": 0.0125797, "epoch": 0.5144746730798136, "flos": 19687107548160.0, "grad_norm": 1.69723820724636, "language_loss": 0.86291838, "learning_rate": 2.003310422780898e-06, "loss": 0.9400208, "num_input_tokens_seen": 183953250, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.1105957, "step": 8557, "time_per_iteration": 3.9580464363098145 }, { "auxiliary_loss_clip": 0.06441022, "auxiliary_loss_mlp": 0.01268086, "balance_loss_clip": 0.06285088, "balance_loss_mlp": 0.01256648, "epoch": 0.5145347963324816, "flos": 23921476502400.0, "grad_norm": 1.4310876257486194, "language_loss": 0.89501929, "learning_rate": 2.0029209615726307e-06, "loss": 0.97211039, "num_input_tokens_seen": 183973865, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.11444092, "step": 8558, "time_per_iteration": 2.8650357723236084 }, { "auxiliary_loss_clip": 0.06440202, "auxiliary_loss_mlp": 0.01268508, "balance_loss_clip": 0.06285429, "balance_loss_mlp": 0.01257142, "epoch": 0.5145949195851496, "flos": 18265919247360.0, "grad_norm": 1.8936348877240754, "language_loss": 0.65381473, "learning_rate": 2.002531500253602e-06, "loss": 0.73090178, "num_input_tokens_seen": 183992555, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.11376953, "step": 8559, "time_per_iteration": 2.8100154399871826 }, { "auxiliary_loss_clip": 0.06444792, "auxiliary_loss_mlp": 0.01267306, "balance_loss_clip": 0.06287134, "balance_loss_mlp": 0.01255654, "epoch": 0.5146550428378175, "flos": 26220593157120.0, "grad_norm": 1.7523830301500003, "language_loss": 0.6364463, "learning_rate": 2.002142038838577e-06, "loss": 0.71356726, "num_input_tokens_seen": 184010825, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.11651611, "step": 8560, "time_per_iteration": 2.6253902912139893 }, { "auxiliary_loss_clip": 0.06446525, "auxiliary_loss_mlp": 0.01269438, "balance_loss_clip": 0.06286784, "balance_loss_mlp": 0.01258077, "epoch": 0.5147151660904855, "flos": 22680731969280.0, "grad_norm": 1.5463877302865343, "language_loss": 0.70689261, "learning_rate": 2.0017525773423265e-06, "loss": 0.78405225, "num_input_tokens_seen": 184030155, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.11358643, "step": 8561, "time_per_iteration": 2.597998857498169 }, { "auxiliary_loss_clip": 0.06445233, "auxiliary_loss_mlp": 0.01268978, "balance_loss_clip": 0.0628461, "balance_loss_mlp": 0.0125748, "epoch": 0.5147752893431534, "flos": 24979261645440.0, "grad_norm": 1.593585732862605, "language_loss": 0.66932833, "learning_rate": 2.0013631157796177e-06, "loss": 0.74647045, "num_input_tokens_seen": 184051440, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.1149292, "step": 8562, "time_per_iteration": 2.5894622802734375 }, { "auxiliary_loss_clip": 0.06455381, "auxiliary_loss_mlp": 0.01270237, "balance_loss_clip": 0.06293747, "balance_loss_mlp": 0.01258841, "epoch": 0.5148354125958214, "flos": 22750821509760.0, "grad_norm": 1.5879424255563206, "language_loss": 0.78231049, "learning_rate": 2.0009736541652188e-06, "loss": 0.85956657, "num_input_tokens_seen": 184070205, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.11395264, "step": 8563, "time_per_iteration": 2.6409692764282227 }, { "auxiliary_loss_clip": 0.06454977, "auxiliary_loss_mlp": 0.01270751, "balance_loss_clip": 0.06288524, "balance_loss_mlp": 0.01258484, "epoch": 0.5148955358484893, "flos": 23074253470080.0, "grad_norm": 1.9373110826204043, "language_loss": 0.83038813, "learning_rate": 2.0005841925139e-06, "loss": 0.9076454, "num_input_tokens_seen": 184087345, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.12280273, "step": 8564, "time_per_iteration": 2.547717332839966 }, { "auxiliary_loss_clip": 0.06455943, "auxiliary_loss_mlp": 0.01271846, "balance_loss_clip": 0.06291755, "balance_loss_mlp": 0.01259753, "epoch": 0.5149556591011574, "flos": 20346465726720.0, "grad_norm": 1.6452064004222227, "language_loss": 0.73336297, "learning_rate": 2.0001947308404283e-06, "loss": 0.81064081, "num_input_tokens_seen": 184107110, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.12091064, "step": 8565, "time_per_iteration": 2.5748233795166016 }, { "auxiliary_loss_clip": 0.06453805, "auxiliary_loss_mlp": 0.01270814, "balance_loss_clip": 0.06287038, "balance_loss_mlp": 0.0125819, "epoch": 0.5150157823538254, "flos": 22644869621760.0, "grad_norm": 2.302516910771, "language_loss": 0.6781894, "learning_rate": 1.9998052691595715e-06, "loss": 0.75543559, "num_input_tokens_seen": 184127105, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.1262207, "step": 8566, "time_per_iteration": 2.573840379714966 }, { "auxiliary_loss_clip": 0.06454965, "auxiliary_loss_mlp": 0.01268185, "balance_loss_clip": 0.06287032, "balance_loss_mlp": 0.01256228, "epoch": 0.5150759056064933, "flos": 26074795852800.0, "grad_norm": 1.8347298406885366, "language_loss": 0.7832281, "learning_rate": 1.9994158074861005e-06, "loss": 0.86045957, "num_input_tokens_seen": 184148060, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.11962891, "step": 8567, "time_per_iteration": 2.6105217933654785 }, { "auxiliary_loss_clip": 0.0644327, "auxiliary_loss_mlp": 0.01269868, "balance_loss_clip": 0.06280708, "balance_loss_mlp": 0.01257667, "epoch": 0.5151360288591613, "flos": 25958865329280.0, "grad_norm": 1.8138781295888171, "language_loss": 0.79197776, "learning_rate": 1.9990263458347806e-06, "loss": 0.86910915, "num_input_tokens_seen": 184166175, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.12188721, "step": 8568, "time_per_iteration": 2.5948617458343506 }, { "auxiliary_loss_clip": 0.06439137, "auxiliary_loss_mlp": 0.01269357, "balance_loss_clip": 0.06280912, "balance_loss_mlp": 0.01258515, "epoch": 0.5151961521118292, "flos": 18511840581120.0, "grad_norm": 2.063294242817084, "language_loss": 0.91198492, "learning_rate": 1.9986368842203825e-06, "loss": 0.98906988, "num_input_tokens_seen": 184182600, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.10839844, "step": 8569, "time_per_iteration": 2.56708025932312 }, { "auxiliary_loss_clip": 0.06449485, "auxiliary_loss_mlp": 0.01276794, "balance_loss_clip": 0.06285594, "balance_loss_mlp": 0.01264945, "epoch": 0.5152562753644973, "flos": 22239734330880.0, "grad_norm": 1.743538495156289, "language_loss": 0.77039921, "learning_rate": 1.998247422657674e-06, "loss": 0.84766197, "num_input_tokens_seen": 184202020, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.1184082, "step": 8570, "time_per_iteration": 2.572913408279419 }, { "auxiliary_loss_clip": 0.06444059, "auxiliary_loss_mlp": 0.01272751, "balance_loss_clip": 0.06283765, "balance_loss_mlp": 0.01259859, "epoch": 0.5153163986171652, "flos": 38445833784960.0, "grad_norm": 1.7036920674947946, "language_loss": 0.73763454, "learning_rate": 1.9978579611614227e-06, "loss": 0.81480265, "num_input_tokens_seen": 184224850, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.12890625, "step": 8571, "time_per_iteration": 2.7216475009918213 }, { "auxiliary_loss_clip": 0.06350485, "auxiliary_loss_mlp": 0.01259576, "balance_loss_clip": 0.06285056, "balance_loss_mlp": 0.0125752, "epoch": 0.5153765218698332, "flos": 66404533783680.0, "grad_norm": 0.7864781672077397, "language_loss": 0.52960533, "learning_rate": 1.9974684997463984e-06, "loss": 0.60570586, "num_input_tokens_seen": 184288520, "router_z_loss_clip": 0.65478516, "router_z_loss_mlp": 0.02056885, "step": 8572, "time_per_iteration": 3.2622437477111816 }, { "auxiliary_loss_clip": 0.06440358, "auxiliary_loss_mlp": 0.01270044, "balance_loss_clip": 0.06285019, "balance_loss_mlp": 0.01258451, "epoch": 0.5154366451225011, "flos": 24031537240320.0, "grad_norm": 2.1383814340099, "language_loss": 0.76580489, "learning_rate": 1.9970790384273687e-06, "loss": 0.84290898, "num_input_tokens_seen": 184308565, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.11602783, "step": 8573, "time_per_iteration": 2.5901520252227783 }, { "auxiliary_loss_clip": 0.06441797, "auxiliary_loss_mlp": 0.01268584, "balance_loss_clip": 0.06283304, "balance_loss_mlp": 0.01256752, "epoch": 0.5154967683751691, "flos": 23474189808000.0, "grad_norm": 1.691762981003605, "language_loss": 0.77423477, "learning_rate": 1.996689577219102e-06, "loss": 0.85133857, "num_input_tokens_seen": 184326795, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.11834717, "step": 8574, "time_per_iteration": 2.5726318359375 }, { "auxiliary_loss_clip": 0.06436311, "auxiliary_loss_mlp": 0.01267229, "balance_loss_clip": 0.06279501, "balance_loss_mlp": 0.01256017, "epoch": 0.515556891627837, "flos": 23812463940480.0, "grad_norm": 1.8412670388368855, "language_loss": 0.85555053, "learning_rate": 1.996300116136367e-06, "loss": 0.93258584, "num_input_tokens_seen": 184345990, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.11218262, "step": 8575, "time_per_iteration": 2.6385066509246826 }, { "auxiliary_loss_clip": 0.06445743, "auxiliary_loss_mlp": 0.0126827, "balance_loss_clip": 0.06282936, "balance_loss_mlp": 0.01256248, "epoch": 0.515617014880505, "flos": 19834665788160.0, "grad_norm": 1.5734531662136935, "language_loss": 0.76886547, "learning_rate": 1.995910655193932e-06, "loss": 0.84600556, "num_input_tokens_seen": 184366300, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.12030029, "step": 8576, "time_per_iteration": 2.5917131900787354 }, { "auxiliary_loss_clip": 0.0645857, "auxiliary_loss_mlp": 0.01269276, "balance_loss_clip": 0.06287564, "balance_loss_mlp": 0.01256824, "epoch": 0.515677138133173, "flos": 14251042863360.0, "grad_norm": 2.3559011024690233, "language_loss": 0.75925398, "learning_rate": 1.9955211944065654e-06, "loss": 0.83653247, "num_input_tokens_seen": 184383030, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.12463379, "step": 8577, "time_per_iteration": 2.670912742614746 }, { "auxiliary_loss_clip": 0.0645045, "auxiliary_loss_mlp": 0.01274455, "balance_loss_clip": 0.06285727, "balance_loss_mlp": 0.01260978, "epoch": 0.515737261385841, "flos": 28296653443200.0, "grad_norm": 1.6400567997477027, "language_loss": 0.80904907, "learning_rate": 1.9951317337890353e-06, "loss": 0.88629812, "num_input_tokens_seen": 184403410, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.13470459, "step": 8578, "time_per_iteration": 2.704037666320801 }, { "auxiliary_loss_clip": 0.06446482, "auxiliary_loss_mlp": 0.01268999, "balance_loss_clip": 0.06286211, "balance_loss_mlp": 0.01257215, "epoch": 0.515797384638509, "flos": 27899400435840.0, "grad_norm": 1.7084853655740673, "language_loss": 0.765284, "learning_rate": 1.9947422733561105e-06, "loss": 0.84243882, "num_input_tokens_seen": 184423830, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.11798096, "step": 8579, "time_per_iteration": 2.6254892349243164 }, { "auxiliary_loss_clip": 0.06447878, "auxiliary_loss_mlp": 0.01269703, "balance_loss_clip": 0.06284038, "balance_loss_mlp": 0.01257329, "epoch": 0.5158575078911769, "flos": 23046860384640.0, "grad_norm": 1.5027881446896505, "language_loss": 0.79385233, "learning_rate": 1.994352813122559e-06, "loss": 0.87102813, "num_input_tokens_seen": 184445050, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.12365723, "step": 8580, "time_per_iteration": 2.594214916229248 }, { "auxiliary_loss_clip": 0.0645275, "auxiliary_loss_mlp": 0.0127027, "balance_loss_clip": 0.06287979, "balance_loss_mlp": 0.01256704, "epoch": 0.5159176311438449, "flos": 12646350120960.0, "grad_norm": 2.5062666152760054, "language_loss": 0.73401195, "learning_rate": 1.99396335310315e-06, "loss": 0.8112421, "num_input_tokens_seen": 184460775, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.13562012, "step": 8581, "time_per_iteration": 2.5561411380767822 }, { "auxiliary_loss_clip": 0.06449182, "auxiliary_loss_mlp": 0.01270205, "balance_loss_clip": 0.06288783, "balance_loss_mlp": 0.01258451, "epoch": 0.5159777543965128, "flos": 15563302456320.0, "grad_norm": 2.5841303242895295, "language_loss": 0.75687265, "learning_rate": 1.9935738933126508e-06, "loss": 0.83406651, "num_input_tokens_seen": 184477365, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.11761475, "step": 8582, "time_per_iteration": 2.583796501159668 }, { "auxiliary_loss_clip": 0.06446806, "auxiliary_loss_mlp": 0.01269315, "balance_loss_clip": 0.06286824, "balance_loss_mlp": 0.01258562, "epoch": 0.5160378776491809, "flos": 23228352328320.0, "grad_norm": 1.8723139085777163, "language_loss": 0.65904546, "learning_rate": 1.99318443376583e-06, "loss": 0.73620665, "num_input_tokens_seen": 184497045, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.10760498, "step": 8583, "time_per_iteration": 2.585405111312866 }, { "auxiliary_loss_clip": 0.06452565, "auxiliary_loss_mlp": 0.01272066, "balance_loss_clip": 0.06289262, "balance_loss_mlp": 0.01259662, "epoch": 0.5160980009018488, "flos": 21951074615040.0, "grad_norm": 1.386213066248719, "language_loss": 0.75941306, "learning_rate": 1.9927949744774568e-06, "loss": 0.83665943, "num_input_tokens_seen": 184517675, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.12414551, "step": 8584, "time_per_iteration": 2.567934036254883 }, { "auxiliary_loss_clip": 0.06453141, "auxiliary_loss_mlp": 0.01270826, "balance_loss_clip": 0.06286606, "balance_loss_mlp": 0.01257046, "epoch": 0.5161581241545168, "flos": 22790708853120.0, "grad_norm": 2.080780868047403, "language_loss": 0.78919131, "learning_rate": 1.9924055154622983e-06, "loss": 0.866431, "num_input_tokens_seen": 184537745, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.13769531, "step": 8585, "time_per_iteration": 5.4141035079956055 }, { "auxiliary_loss_clip": 0.06437524, "auxiliary_loss_mlp": 0.01271023, "balance_loss_clip": 0.06282209, "balance_loss_mlp": 0.01259352, "epoch": 0.5162182474071847, "flos": 19680273440640.0, "grad_norm": 2.018532184108511, "language_loss": 0.81152231, "learning_rate": 1.9920160567351238e-06, "loss": 0.8886078, "num_input_tokens_seen": 184553630, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.11682129, "step": 8586, "time_per_iteration": 2.542874336242676 }, { "auxiliary_loss_clip": 0.06449745, "auxiliary_loss_mlp": 0.01276116, "balance_loss_clip": 0.06287767, "balance_loss_mlp": 0.01263265, "epoch": 0.5162783706598527, "flos": 20052145860480.0, "grad_norm": 1.5770088254808141, "language_loss": 0.71951389, "learning_rate": 1.991626598310701e-06, "loss": 0.79677248, "num_input_tokens_seen": 184573530, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.12866211, "step": 8587, "time_per_iteration": 2.5677273273468018 }, { "auxiliary_loss_clip": 0.06334017, "auxiliary_loss_mlp": 0.01253956, "balance_loss_clip": 0.06268013, "balance_loss_mlp": 0.0125199, "epoch": 0.5163384939125206, "flos": 69980089610880.0, "grad_norm": 0.7125425627085146, "language_loss": 0.57672834, "learning_rate": 1.9912371402037984e-06, "loss": 0.65260804, "num_input_tokens_seen": 184637875, "router_z_loss_clip": 0.66015625, "router_z_loss_mlp": 0.01963806, "step": 8588, "time_per_iteration": 3.198838233947754 }, { "auxiliary_loss_clip": 0.06450433, "auxiliary_loss_mlp": 0.01268358, "balance_loss_clip": 0.06287094, "balance_loss_mlp": 0.01255162, "epoch": 0.5163986171651886, "flos": 17422176159360.0, "grad_norm": 1.725132644332885, "language_loss": 0.76017445, "learning_rate": 1.990847682429185e-06, "loss": 0.83736241, "num_input_tokens_seen": 184656125, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.13183594, "step": 8589, "time_per_iteration": 2.5732524394989014 }, { "auxiliary_loss_clip": 0.0645479, "auxiliary_loss_mlp": 0.01269801, "balance_loss_clip": 0.06290327, "balance_loss_mlp": 0.01257838, "epoch": 0.5164587404178566, "flos": 21328752741120.0, "grad_norm": 1.6957399911516946, "language_loss": 0.67787206, "learning_rate": 1.990458225001627e-06, "loss": 0.75511789, "num_input_tokens_seen": 184675920, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.11975098, "step": 8590, "time_per_iteration": 2.583308458328247 }, { "auxiliary_loss_clip": 0.0632858, "auxiliary_loss_mlp": 0.01254501, "balance_loss_clip": 0.06262603, "balance_loss_mlp": 0.01252385, "epoch": 0.5165188636705246, "flos": 68076506954880.0, "grad_norm": 0.7665288116392548, "language_loss": 0.55773264, "learning_rate": 1.990068767935895e-06, "loss": 0.63356352, "num_input_tokens_seen": 184730520, "router_z_loss_clip": 0.65966797, "router_z_loss_mlp": 0.0211792, "step": 8591, "time_per_iteration": 3.147024631500244 }, { "auxiliary_loss_clip": 0.06436766, "auxiliary_loss_mlp": 0.01269348, "balance_loss_clip": 0.06283616, "balance_loss_mlp": 0.01257636, "epoch": 0.5165789869231926, "flos": 19390859038080.0, "grad_norm": 1.5370403209599728, "language_loss": 0.81640363, "learning_rate": 1.9896793112467566e-06, "loss": 0.89346468, "num_input_tokens_seen": 184748340, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.11706543, "step": 8592, "time_per_iteration": 2.5454323291778564 }, { "auxiliary_loss_clip": 0.06441829, "auxiliary_loss_mlp": 0.01264049, "balance_loss_clip": 0.06284939, "balance_loss_mlp": 0.01252623, "epoch": 0.5166391101758605, "flos": 20966607394560.0, "grad_norm": 4.232600889386598, "language_loss": 0.83527541, "learning_rate": 1.989289854948979e-06, "loss": 0.9123342, "num_input_tokens_seen": 184766615, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.11425781, "step": 8593, "time_per_iteration": 2.5853638648986816 }, { "auxiliary_loss_clip": 0.06444287, "auxiliary_loss_mlp": 0.01270357, "balance_loss_clip": 0.06284642, "balance_loss_mlp": 0.01258657, "epoch": 0.5166992334285285, "flos": 29470411036800.0, "grad_norm": 2.12833528089733, "language_loss": 0.69361341, "learning_rate": 1.9889003990573314e-06, "loss": 0.77075994, "num_input_tokens_seen": 184788075, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.11682129, "step": 8594, "time_per_iteration": 2.63944673538208 }, { "auxiliary_loss_clip": 0.06444015, "auxiliary_loss_mlp": 0.01271098, "balance_loss_clip": 0.06284577, "balance_loss_mlp": 0.01259344, "epoch": 0.5167593566811964, "flos": 20310813014400.0, "grad_norm": 1.5222889982459036, "language_loss": 0.77839613, "learning_rate": 1.988510943586582e-06, "loss": 0.85554719, "num_input_tokens_seen": 184808710, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.11755371, "step": 8595, "time_per_iteration": 2.6016018390655518 }, { "auxiliary_loss_clip": 0.06444904, "auxiliary_loss_mlp": 0.0126943, "balance_loss_clip": 0.06283659, "balance_loss_mlp": 0.01257575, "epoch": 0.5168194799338645, "flos": 14616668154240.0, "grad_norm": 1.7721869892795727, "language_loss": 0.65381467, "learning_rate": 1.9881214885514986e-06, "loss": 0.73095804, "num_input_tokens_seen": 184826475, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.11859131, "step": 8596, "time_per_iteration": 4.0848188400268555 }, { "auxiliary_loss_clip": 0.06441701, "auxiliary_loss_mlp": 0.01274633, "balance_loss_clip": 0.0628177, "balance_loss_mlp": 0.01261025, "epoch": 0.5168796031865324, "flos": 25013866181760.0, "grad_norm": 1.5424534344223342, "language_loss": 0.75621855, "learning_rate": 1.9877320339668492e-06, "loss": 0.83338189, "num_input_tokens_seen": 184845245, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.1361084, "step": 8597, "time_per_iteration": 2.600579261779785 }, { "auxiliary_loss_clip": 0.064438, "auxiliary_loss_mlp": 0.01266727, "balance_loss_clip": 0.06283826, "balance_loss_mlp": 0.01255009, "epoch": 0.5169397264392004, "flos": 26946728640000.0, "grad_norm": 1.462756238188393, "language_loss": 0.8201437, "learning_rate": 1.987342579847403e-06, "loss": 0.89724898, "num_input_tokens_seen": 184866605, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.1171875, "step": 8598, "time_per_iteration": 2.6197121143341064 }, { "auxiliary_loss_clip": 0.06442054, "auxiliary_loss_mlp": 0.01270316, "balance_loss_clip": 0.06283408, "balance_loss_mlp": 0.01257852, "epoch": 0.5169998496918683, "flos": 25414347571200.0, "grad_norm": 1.5570682051336964, "language_loss": 0.75654328, "learning_rate": 1.9869531262079273e-06, "loss": 0.83366698, "num_input_tokens_seen": 184886945, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.12475586, "step": 8599, "time_per_iteration": 2.6157753467559814 }, { "auxiliary_loss_clip": 0.06444934, "auxiliary_loss_mlp": 0.01266814, "balance_loss_clip": 0.06285738, "balance_loss_mlp": 0.01255651, "epoch": 0.5170599729445363, "flos": 24687667036800.0, "grad_norm": 3.116077116658881, "language_loss": 0.72502708, "learning_rate": 1.9865636730631904e-06, "loss": 0.80214453, "num_input_tokens_seen": 184905590, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.1116333, "step": 8600, "time_per_iteration": 2.580231189727783 }, { "auxiliary_loss_clip": 0.06440435, "auxiliary_loss_mlp": 0.01269682, "balance_loss_clip": 0.06280716, "balance_loss_mlp": 0.01257826, "epoch": 0.5171200961972042, "flos": 21000499171200.0, "grad_norm": 1.7797276355613116, "language_loss": 0.74929327, "learning_rate": 1.9861742204279602e-06, "loss": 0.82639444, "num_input_tokens_seen": 184925555, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.11859131, "step": 8601, "time_per_iteration": 2.5788369178771973 }, { "auxiliary_loss_clip": 0.0644359, "auxiliary_loss_mlp": 0.01273727, "balance_loss_clip": 0.06281252, "balance_loss_mlp": 0.01260912, "epoch": 0.5171802194498722, "flos": 22751953539840.0, "grad_norm": 2.4609808136049165, "language_loss": 0.83983767, "learning_rate": 1.9857847683170045e-06, "loss": 0.91701078, "num_input_tokens_seen": 184944490, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.12811279, "step": 8602, "time_per_iteration": 2.5612523555755615 }, { "auxiliary_loss_clip": 0.06442508, "auxiliary_loss_mlp": 0.01268221, "balance_loss_clip": 0.06280892, "balance_loss_mlp": 0.01256067, "epoch": 0.5172403427025402, "flos": 28183070833920.0, "grad_norm": 2.105989043109564, "language_loss": 0.74989909, "learning_rate": 1.9853953167450926e-06, "loss": 0.8270064, "num_input_tokens_seen": 184963190, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.12145996, "step": 8603, "time_per_iteration": 2.6234419345855713 }, { "auxiliary_loss_clip": 0.06444204, "auxiliary_loss_mlp": 0.0127012, "balance_loss_clip": 0.06280571, "balance_loss_mlp": 0.01257883, "epoch": 0.5173004659552082, "flos": 20343782396160.0, "grad_norm": 2.015501493529475, "language_loss": 0.73652488, "learning_rate": 1.9850058657269915e-06, "loss": 0.81366813, "num_input_tokens_seen": 184981220, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.12237549, "step": 8604, "time_per_iteration": 2.563798427581787 }, { "auxiliary_loss_clip": 0.06459237, "auxiliary_loss_mlp": 0.0126783, "balance_loss_clip": 0.06286606, "balance_loss_mlp": 0.01254496, "epoch": 0.5173605892078762, "flos": 19069481502720.0, "grad_norm": 1.91056860481716, "language_loss": 0.84967494, "learning_rate": 1.984616415277469e-06, "loss": 0.92694557, "num_input_tokens_seen": 184998810, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.13348389, "step": 8605, "time_per_iteration": 2.564058780670166 }, { "auxiliary_loss_clip": 0.06445386, "auxiliary_loss_mlp": 0.0126765, "balance_loss_clip": 0.06284119, "balance_loss_mlp": 0.01255849, "epoch": 0.5174207124605441, "flos": 28001620817280.0, "grad_norm": 1.4659844436249851, "language_loss": 0.64999801, "learning_rate": 1.984226965411294e-06, "loss": 0.72712839, "num_input_tokens_seen": 185021185, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.11785889, "step": 8606, "time_per_iteration": 2.723727226257324 }, { "auxiliary_loss_clip": 0.06444098, "auxiliary_loss_mlp": 0.01270195, "balance_loss_clip": 0.06283435, "balance_loss_mlp": 0.01257988, "epoch": 0.5174808357132121, "flos": 19502135660160.0, "grad_norm": 2.403581417142145, "language_loss": 0.77632922, "learning_rate": 1.983837516143234e-06, "loss": 0.85347211, "num_input_tokens_seen": 185038465, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.12188721, "step": 8607, "time_per_iteration": 2.571078300476074 }, { "auxiliary_loss_clip": 0.06446651, "auxiliary_loss_mlp": 0.01271637, "balance_loss_clip": 0.06283034, "balance_loss_mlp": 0.01258822, "epoch": 0.51754095896588, "flos": 22790834634240.0, "grad_norm": 1.6447266292800446, "language_loss": 0.71544999, "learning_rate": 1.983448067488057e-06, "loss": 0.79263282, "num_input_tokens_seen": 185057340, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.12811279, "step": 8608, "time_per_iteration": 2.62378191947937 }, { "auxiliary_loss_clip": 0.06454957, "auxiliary_loss_mlp": 0.01272069, "balance_loss_clip": 0.06287353, "balance_loss_mlp": 0.0125991, "epoch": 0.5176010822185481, "flos": 22674987964800.0, "grad_norm": 1.7507907913259406, "language_loss": 0.87179565, "learning_rate": 1.983058619460531e-06, "loss": 0.94906592, "num_input_tokens_seen": 185074935, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.12158203, "step": 8609, "time_per_iteration": 2.5746078491210938 }, { "auxiliary_loss_clip": 0.06448501, "auxiliary_loss_mlp": 0.01268009, "balance_loss_clip": 0.06286599, "balance_loss_mlp": 0.01256696, "epoch": 0.517661205471216, "flos": 23957967755520.0, "grad_norm": 1.4344691703557872, "language_loss": 0.73715985, "learning_rate": 1.9826691720754237e-06, "loss": 0.81432498, "num_input_tokens_seen": 185095050, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.11315918, "step": 8610, "time_per_iteration": 2.5900089740753174 }, { "auxiliary_loss_clip": 0.06450483, "auxiliary_loss_mlp": 0.01268821, "balance_loss_clip": 0.06282331, "balance_loss_mlp": 0.01255582, "epoch": 0.517721328723884, "flos": 15601470791040.0, "grad_norm": 1.6113566988696244, "language_loss": 0.67717743, "learning_rate": 1.9822797253475034e-06, "loss": 0.75437057, "num_input_tokens_seen": 185112275, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.13238525, "step": 8611, "time_per_iteration": 2.5504565238952637 }, { "auxiliary_loss_clip": 0.06442093, "auxiliary_loss_mlp": 0.01267302, "balance_loss_clip": 0.06279489, "balance_loss_mlp": 0.01254791, "epoch": 0.5177814519765519, "flos": 20966607394560.0, "grad_norm": 1.6659480302643301, "language_loss": 0.7724964, "learning_rate": 1.9818902792915373e-06, "loss": 0.84959036, "num_input_tokens_seen": 185132165, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.12518311, "step": 8612, "time_per_iteration": 2.569878339767456 }, { "auxiliary_loss_clip": 0.06446867, "auxiliary_loss_mlp": 0.01267582, "balance_loss_clip": 0.06284101, "balance_loss_mlp": 0.01256043, "epoch": 0.5178415752292199, "flos": 17973653806080.0, "grad_norm": 1.9557815517455621, "language_loss": 0.82381296, "learning_rate": 1.981500833922294e-06, "loss": 0.90095747, "num_input_tokens_seen": 185151025, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.11553955, "step": 8613, "time_per_iteration": 2.5578150749206543 }, { "auxiliary_loss_clip": 0.06446108, "auxiliary_loss_mlp": 0.01269042, "balance_loss_clip": 0.0628196, "balance_loss_mlp": 0.01255285, "epoch": 0.5179016984818878, "flos": 17827227596160.0, "grad_norm": 2.287273099988041, "language_loss": 0.66414809, "learning_rate": 1.981111389254541e-06, "loss": 0.74129951, "num_input_tokens_seen": 185168455, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.13751221, "step": 8614, "time_per_iteration": 2.5155274868011475 }, { "auxiliary_loss_clip": 0.06445746, "auxiliary_loss_mlp": 0.01269235, "balance_loss_clip": 0.06281561, "balance_loss_mlp": 0.01256944, "epoch": 0.5179618217345558, "flos": 17826011712000.0, "grad_norm": 1.9109077022566965, "language_loss": 0.87028456, "learning_rate": 1.9807219453030453e-06, "loss": 0.94743437, "num_input_tokens_seen": 185184415, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.12286377, "step": 8615, "time_per_iteration": 2.536146402359009 }, { "auxiliary_loss_clip": 0.06445238, "auxiliary_loss_mlp": 0.01271972, "balance_loss_clip": 0.06283839, "balance_loss_mlp": 0.01260409, "epoch": 0.5180219449872238, "flos": 22527639360000.0, "grad_norm": 1.5011076800093697, "language_loss": 0.81049788, "learning_rate": 1.9803325020825763e-06, "loss": 0.88766992, "num_input_tokens_seen": 185202910, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.11560059, "step": 8616, "time_per_iteration": 2.5614449977874756 }, { "auxiliary_loss_clip": 0.06449808, "auxiliary_loss_mlp": 0.01271963, "balance_loss_clip": 0.06283766, "balance_loss_mlp": 0.01259255, "epoch": 0.5180820682398918, "flos": 23922356970240.0, "grad_norm": 1.7841053702357672, "language_loss": 0.74998069, "learning_rate": 1.9799430596079e-06, "loss": 0.82719839, "num_input_tokens_seen": 185223085, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.12701416, "step": 8617, "time_per_iteration": 2.5906999111175537 }, { "auxiliary_loss_clip": 0.06448207, "auxiliary_loss_mlp": 0.01269638, "balance_loss_clip": 0.06284189, "balance_loss_mlp": 0.01257056, "epoch": 0.5181421914925598, "flos": 16985119662720.0, "grad_norm": 1.5429068212928767, "language_loss": 0.70123309, "learning_rate": 1.979553617893785e-06, "loss": 0.77841157, "num_input_tokens_seen": 185241295, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.12585449, "step": 8618, "time_per_iteration": 2.5416765213012695 }, { "auxiliary_loss_clip": 0.06335063, "auxiliary_loss_mlp": 0.01259036, "balance_loss_clip": 0.06270733, "balance_loss_mlp": 0.01257183, "epoch": 0.5182023147452277, "flos": 66080472917760.0, "grad_norm": 0.9119639129830281, "language_loss": 0.67258328, "learning_rate": 1.979164176954999e-06, "loss": 0.74852425, "num_input_tokens_seen": 185298295, "router_z_loss_clip": 0.64453125, "router_z_loss_mlp": 0.01847839, "step": 8619, "time_per_iteration": 3.129976749420166 }, { "auxiliary_loss_clip": 0.06441024, "auxiliary_loss_mlp": 0.01271721, "balance_loss_clip": 0.06281929, "balance_loss_mlp": 0.01259883, "epoch": 0.5182624379978957, "flos": 18193775281920.0, "grad_norm": 2.2284136473255116, "language_loss": 0.79605138, "learning_rate": 1.97877473680631e-06, "loss": 0.87317878, "num_input_tokens_seen": 185317000, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.11834717, "step": 8620, "time_per_iteration": 2.5584168434143066 }, { "auxiliary_loss_clip": 0.06439014, "auxiliary_loss_mlp": 0.01266644, "balance_loss_clip": 0.0628102, "balance_loss_mlp": 0.01254812, "epoch": 0.5183225612505636, "flos": 14031759928320.0, "grad_norm": 2.1351687988501618, "language_loss": 0.82405758, "learning_rate": 1.9783852974624846e-06, "loss": 0.90111411, "num_input_tokens_seen": 185331185, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.1182251, "step": 8621, "time_per_iteration": 2.5688116550445557 }, { "auxiliary_loss_clip": 0.06440218, "auxiliary_loss_mlp": 0.0127188, "balance_loss_clip": 0.06279655, "balance_loss_mlp": 0.01259518, "epoch": 0.5183826845032317, "flos": 23666582782080.0, "grad_norm": 2.0091841123481813, "language_loss": 0.65114182, "learning_rate": 1.9779958589382905e-06, "loss": 0.72826278, "num_input_tokens_seen": 185348955, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.12359619, "step": 8622, "time_per_iteration": 2.59255051612854 }, { "auxiliary_loss_clip": 0.06450406, "auxiliary_loss_mlp": 0.01271883, "balance_loss_clip": 0.06283641, "balance_loss_mlp": 0.01259909, "epoch": 0.5184428077558996, "flos": 15894155502720.0, "grad_norm": 1.8003748484278272, "language_loss": 0.61065352, "learning_rate": 1.977606421248497e-06, "loss": 0.68787646, "num_input_tokens_seen": 185367330, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.11975098, "step": 8623, "time_per_iteration": 2.549551010131836 }, { "auxiliary_loss_clip": 0.06439727, "auxiliary_loss_mlp": 0.01268203, "balance_loss_clip": 0.06277634, "balance_loss_mlp": 0.01256366, "epoch": 0.5185029310085676, "flos": 21036864643200.0, "grad_norm": 1.7261789828534677, "language_loss": 0.76005036, "learning_rate": 1.9772169844078685e-06, "loss": 0.83712965, "num_input_tokens_seen": 185385060, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.11834717, "step": 8624, "time_per_iteration": 3.9267241954803467 }, { "auxiliary_loss_clip": 0.06436457, "auxiliary_loss_mlp": 0.01267565, "balance_loss_clip": 0.06274879, "balance_loss_mlp": 0.0125515, "epoch": 0.5185630542612355, "flos": 26550062611200.0, "grad_norm": 2.4535844662280177, "language_loss": 0.71804583, "learning_rate": 1.9768275484311756e-06, "loss": 0.79508609, "num_input_tokens_seen": 185403745, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.12408447, "step": 8625, "time_per_iteration": 4.095398664474487 }, { "auxiliary_loss_clip": 0.06438215, "auxiliary_loss_mlp": 0.01268435, "balance_loss_clip": 0.06278095, "balance_loss_mlp": 0.01256991, "epoch": 0.5186231775139035, "flos": 20674803150720.0, "grad_norm": 1.8511962358243323, "language_loss": 0.68115139, "learning_rate": 1.976438113333184e-06, "loss": 0.75821793, "num_input_tokens_seen": 185422620, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.11431885, "step": 8626, "time_per_iteration": 2.5672318935394287 }, { "auxiliary_loss_clip": 0.06440058, "auxiliary_loss_mlp": 0.012712, "balance_loss_clip": 0.06279603, "balance_loss_mlp": 0.01258385, "epoch": 0.5186833007665714, "flos": 20891612390400.0, "grad_norm": 1.8526920001919875, "language_loss": 0.70561087, "learning_rate": 1.9760486791286612e-06, "loss": 0.78272343, "num_input_tokens_seen": 185439380, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.12817383, "step": 8627, "time_per_iteration": 2.5322811603546143 }, { "auxiliary_loss_clip": 0.06443825, "auxiliary_loss_mlp": 0.01267367, "balance_loss_clip": 0.06278558, "balance_loss_mlp": 0.01254892, "epoch": 0.5187434240192395, "flos": 20893247544960.0, "grad_norm": 1.9152999634533938, "language_loss": 0.7372613, "learning_rate": 1.9756592458323753e-06, "loss": 0.8143732, "num_input_tokens_seen": 185458830, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.12463379, "step": 8628, "time_per_iteration": 2.565835952758789 }, { "auxiliary_loss_clip": 0.06436801, "auxiliary_loss_mlp": 0.01266883, "balance_loss_clip": 0.06275909, "balance_loss_mlp": 0.01255427, "epoch": 0.5188035472719074, "flos": 19865203401600.0, "grad_norm": 1.6484545134443724, "language_loss": 0.77471733, "learning_rate": 1.9752698134590927e-06, "loss": 0.85175419, "num_input_tokens_seen": 185477270, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.11450195, "step": 8629, "time_per_iteration": 2.5569229125976562 }, { "auxiliary_loss_clip": 0.06440371, "auxiliary_loss_mlp": 0.01269413, "balance_loss_clip": 0.06276651, "balance_loss_mlp": 0.01257171, "epoch": 0.5188636705245754, "flos": 21144032415360.0, "grad_norm": 2.0460004944987715, "language_loss": 0.75069916, "learning_rate": 1.9748803820235815e-06, "loss": 0.827797, "num_input_tokens_seen": 185495795, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.12249756, "step": 8630, "time_per_iteration": 2.5475480556488037 }, { "auxiliary_loss_clip": 0.06440969, "auxiliary_loss_mlp": 0.01267442, "balance_loss_clip": 0.06278012, "balance_loss_mlp": 0.01254901, "epoch": 0.5189237937772434, "flos": 22426467154560.0, "grad_norm": 1.753216872442529, "language_loss": 0.80650461, "learning_rate": 1.9744909515406093e-06, "loss": 0.88358873, "num_input_tokens_seen": 185514885, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.12548828, "step": 8631, "time_per_iteration": 2.6007280349731445 }, { "auxiliary_loss_clip": 0.06441454, "auxiliary_loss_mlp": 0.01265655, "balance_loss_clip": 0.06277266, "balance_loss_mlp": 0.01252953, "epoch": 0.5189839170299113, "flos": 25453647936000.0, "grad_norm": 1.5343193180282786, "language_loss": 0.75017673, "learning_rate": 1.974101522024942e-06, "loss": 0.8272478, "num_input_tokens_seen": 185537155, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.12701416, "step": 8632, "time_per_iteration": 2.607931613922119 }, { "auxiliary_loss_clip": 0.06432162, "auxiliary_loss_mlp": 0.01269597, "balance_loss_clip": 0.06276024, "balance_loss_mlp": 0.01257199, "epoch": 0.5190440402825793, "flos": 18593585838720.0, "grad_norm": 2.095548744384796, "language_loss": 0.79273701, "learning_rate": 1.9737120934913477e-06, "loss": 0.86975455, "num_input_tokens_seen": 185555520, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.12402344, "step": 8633, "time_per_iteration": 2.5571231842041016 }, { "auxiliary_loss_clip": 0.064414, "auxiliary_loss_mlp": 0.01266545, "balance_loss_clip": 0.06279588, "balance_loss_mlp": 0.01254964, "epoch": 0.5191041635352472, "flos": 21915170340480.0, "grad_norm": 1.6657571235885973, "language_loss": 0.80918527, "learning_rate": 1.9733226659545936e-06, "loss": 0.88626474, "num_input_tokens_seen": 185573855, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.11572266, "step": 8634, "time_per_iteration": 2.5504045486450195 }, { "auxiliary_loss_clip": 0.06437667, "auxiliary_loss_mlp": 0.01268258, "balance_loss_clip": 0.06278787, "balance_loss_mlp": 0.01257285, "epoch": 0.5191642867879153, "flos": 27535536080640.0, "grad_norm": 1.656532127777153, "language_loss": 0.69323862, "learning_rate": 1.9729332394294467e-06, "loss": 0.77029788, "num_input_tokens_seen": 185595145, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.10980225, "step": 8635, "time_per_iteration": 4.044910430908203 }, { "auxiliary_loss_clip": 0.06444526, "auxiliary_loss_mlp": 0.01269895, "balance_loss_clip": 0.06279758, "balance_loss_mlp": 0.01257992, "epoch": 0.5192244100405832, "flos": 15711489601920.0, "grad_norm": 1.606418998645363, "language_loss": 0.776981, "learning_rate": 1.9725438139306742e-06, "loss": 0.85412526, "num_input_tokens_seen": 185613320, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.11907959, "step": 8636, "time_per_iteration": 3.9713633060455322 }, { "auxiliary_loss_clip": 0.06445286, "auxiliary_loss_mlp": 0.01268722, "balance_loss_clip": 0.06279834, "balance_loss_mlp": 0.01256527, "epoch": 0.5192845332932512, "flos": 12061903092480.0, "grad_norm": 2.2267140917594457, "language_loss": 0.72312474, "learning_rate": 1.9721543894730425e-06, "loss": 0.80026484, "num_input_tokens_seen": 185630730, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.12194824, "step": 8637, "time_per_iteration": 2.5420093536376953 }, { "auxiliary_loss_clip": 0.06435874, "auxiliary_loss_mlp": 0.01269088, "balance_loss_clip": 0.06277838, "balance_loss_mlp": 0.01257268, "epoch": 0.5193446565459191, "flos": 18959211129600.0, "grad_norm": 2.2017140978223932, "language_loss": 0.76459694, "learning_rate": 1.9717649660713194e-06, "loss": 0.84164655, "num_input_tokens_seen": 185648515, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.11816406, "step": 8638, "time_per_iteration": 2.5758395195007324 }, { "auxiliary_loss_clip": 0.06437124, "auxiliary_loss_mlp": 0.01273025, "balance_loss_clip": 0.06277442, "balance_loss_mlp": 0.01261551, "epoch": 0.5194047797985871, "flos": 20381028336000.0, "grad_norm": 2.1309385564030037, "language_loss": 0.75177652, "learning_rate": 1.971375543740272e-06, "loss": 0.82887805, "num_input_tokens_seen": 185665220, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.11474609, "step": 8639, "time_per_iteration": 2.541969060897827 }, { "auxiliary_loss_clip": 0.06442901, "auxiliary_loss_mlp": 0.01271846, "balance_loss_clip": 0.06281283, "balance_loss_mlp": 0.01259651, "epoch": 0.519464903051255, "flos": 24359916591360.0, "grad_norm": 1.8328263231821127, "language_loss": 0.78225416, "learning_rate": 1.9709861224946665e-06, "loss": 0.85940158, "num_input_tokens_seen": 185683750, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.12200928, "step": 8640, "time_per_iteration": 2.6247715950012207 }, { "auxiliary_loss_clip": 0.06440414, "auxiliary_loss_mlp": 0.01268488, "balance_loss_clip": 0.06280875, "balance_loss_mlp": 0.01257122, "epoch": 0.519525026303923, "flos": 14066657953920.0, "grad_norm": 3.3265998049366217, "language_loss": 0.6619826, "learning_rate": 1.97059670234927e-06, "loss": 0.73907161, "num_input_tokens_seen": 185700625, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.11364746, "step": 8641, "time_per_iteration": 2.562246799468994 }, { "auxiliary_loss_clip": 0.06438001, "auxiliary_loss_mlp": 0.01267553, "balance_loss_clip": 0.06279619, "balance_loss_mlp": 0.01256252, "epoch": 0.519585149556591, "flos": 28842722501760.0, "grad_norm": 2.868826886647137, "language_loss": 0.77006674, "learning_rate": 1.97020728331885e-06, "loss": 0.84712225, "num_input_tokens_seen": 185721155, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.11309814, "step": 8642, "time_per_iteration": 2.6510746479034424 }, { "auxiliary_loss_clip": 0.06436409, "auxiliary_loss_mlp": 0.01270541, "balance_loss_clip": 0.06278311, "balance_loss_mlp": 0.01258614, "epoch": 0.519645272809259, "flos": 25379826888960.0, "grad_norm": 1.9050615548393495, "language_loss": 0.83254743, "learning_rate": 1.9698178654181726e-06, "loss": 0.90961695, "num_input_tokens_seen": 185740990, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.1194458, "step": 8643, "time_per_iteration": 2.5949814319610596 }, { "auxiliary_loss_clip": 0.06447376, "auxiliary_loss_mlp": 0.01268524, "balance_loss_clip": 0.06282109, "balance_loss_mlp": 0.01256073, "epoch": 0.519705396061927, "flos": 25379659180800.0, "grad_norm": 2.2577410375374183, "language_loss": 0.70788568, "learning_rate": 1.969428448662004e-06, "loss": 0.78504467, "num_input_tokens_seen": 185762235, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.12445068, "step": 8644, "time_per_iteration": 2.6058480739593506 }, { "auxiliary_loss_clip": 0.06440309, "auxiliary_loss_mlp": 0.01266794, "balance_loss_clip": 0.06278513, "balance_loss_mlp": 0.01254677, "epoch": 0.5197655193145949, "flos": 28483889391360.0, "grad_norm": 1.5706511522426385, "language_loss": 0.80168986, "learning_rate": 1.9690390330651133e-06, "loss": 0.87876093, "num_input_tokens_seen": 185783415, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.12115479, "step": 8645, "time_per_iteration": 2.6374058723449707 }, { "auxiliary_loss_clip": 0.06439954, "auxiliary_loss_mlp": 0.01275049, "balance_loss_clip": 0.06279702, "balance_loss_mlp": 0.01262431, "epoch": 0.5198256425672629, "flos": 20014983774720.0, "grad_norm": 1.6467163328865622, "language_loss": 0.78274298, "learning_rate": 1.968649618642264e-06, "loss": 0.85989308, "num_input_tokens_seen": 185801345, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.1262207, "step": 8646, "time_per_iteration": 2.5315208435058594 }, { "auxiliary_loss_clip": 0.06444746, "auxiliary_loss_mlp": 0.01271953, "balance_loss_clip": 0.06283027, "balance_loss_mlp": 0.01259985, "epoch": 0.5198857658199308, "flos": 19835043131520.0, "grad_norm": 1.9621176128654108, "language_loss": 0.66075295, "learning_rate": 1.9682602054082252e-06, "loss": 0.73791993, "num_input_tokens_seen": 185820815, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.11950684, "step": 8647, "time_per_iteration": 2.7232282161712646 }, { "auxiliary_loss_clip": 0.06448753, "auxiliary_loss_mlp": 0.01269497, "balance_loss_clip": 0.0628254, "balance_loss_mlp": 0.01256307, "epoch": 0.5199458890725989, "flos": 24468761445120.0, "grad_norm": 1.748115578079654, "language_loss": 0.71444154, "learning_rate": 1.967870793377763e-06, "loss": 0.79162401, "num_input_tokens_seen": 185841450, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.13201904, "step": 8648, "time_per_iteration": 2.5816893577575684 }, { "auxiliary_loss_clip": 0.06451562, "auxiliary_loss_mlp": 0.01272826, "balance_loss_clip": 0.06288616, "balance_loss_mlp": 0.01260446, "epoch": 0.5200060123252668, "flos": 23411605207680.0, "grad_norm": 2.1854407195416052, "language_loss": 0.65366161, "learning_rate": 1.967481382565642e-06, "loss": 0.73090541, "num_input_tokens_seen": 185859935, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.12371826, "step": 8649, "time_per_iteration": 2.6036648750305176 }, { "auxiliary_loss_clip": 0.06459081, "auxiliary_loss_mlp": 0.01270892, "balance_loss_clip": 0.06288704, "balance_loss_mlp": 0.01257887, "epoch": 0.5200661355779348, "flos": 17207002074240.0, "grad_norm": 2.0680309921672393, "language_loss": 0.71106035, "learning_rate": 1.9670919729866315e-06, "loss": 0.78836006, "num_input_tokens_seen": 185876795, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.13006592, "step": 8650, "time_per_iteration": 2.584986925125122 }, { "auxiliary_loss_clip": 0.06444196, "auxiliary_loss_mlp": 0.01267459, "balance_loss_clip": 0.06281624, "balance_loss_mlp": 0.01255258, "epoch": 0.5201262588306027, "flos": 18520980675840.0, "grad_norm": 1.9030755600590628, "language_loss": 0.78521621, "learning_rate": 1.966702564655496e-06, "loss": 0.86233282, "num_input_tokens_seen": 185895570, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.12188721, "step": 8651, "time_per_iteration": 2.5526974201202393 }, { "auxiliary_loss_clip": 0.06455186, "auxiliary_loss_mlp": 0.01268038, "balance_loss_clip": 0.06290093, "balance_loss_mlp": 0.01254919, "epoch": 0.5201863820832707, "flos": 18624458868480.0, "grad_norm": 1.6477636892649763, "language_loss": 0.78768843, "learning_rate": 1.966313157587003e-06, "loss": 0.86492068, "num_input_tokens_seen": 185913700, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.13110352, "step": 8652, "time_per_iteration": 2.5325188636779785 }, { "auxiliary_loss_clip": 0.06454888, "auxiliary_loss_mlp": 0.01268582, "balance_loss_clip": 0.06291965, "balance_loss_mlp": 0.01255523, "epoch": 0.5202465053359386, "flos": 22863817140480.0, "grad_norm": 2.209594178268574, "language_loss": 0.7068969, "learning_rate": 1.9659237517959187e-06, "loss": 0.78413165, "num_input_tokens_seen": 185932460, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.13049316, "step": 8653, "time_per_iteration": 2.5682647228240967 }, { "auxiliary_loss_clip": 0.0645726, "auxiliary_loss_mlp": 0.01270715, "balance_loss_clip": 0.06289674, "balance_loss_mlp": 0.01257161, "epoch": 0.5203066285886067, "flos": 21988068992640.0, "grad_norm": 1.5853837428576134, "language_loss": 0.78883654, "learning_rate": 1.965534347297008e-06, "loss": 0.86611629, "num_input_tokens_seen": 185952030, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.13537598, "step": 8654, "time_per_iteration": 2.5962705612182617 }, { "auxiliary_loss_clip": 0.06468104, "auxiliary_loss_mlp": 0.01272783, "balance_loss_clip": 0.06296827, "balance_loss_mlp": 0.01259711, "epoch": 0.5203667518412746, "flos": 20240094568320.0, "grad_norm": 2.3150059691823404, "language_loss": 0.84244347, "learning_rate": 1.9651449441050393e-06, "loss": 0.91985238, "num_input_tokens_seen": 185973130, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.1305542, "step": 8655, "time_per_iteration": 2.6969501972198486 }, { "auxiliary_loss_clip": 0.06450228, "auxiliary_loss_mlp": 0.01267129, "balance_loss_clip": 0.06290787, "balance_loss_mlp": 0.0125466, "epoch": 0.5204268750939426, "flos": 15710860696320.0, "grad_norm": 2.2507482336180784, "language_loss": 0.65696073, "learning_rate": 1.9647555422347777e-06, "loss": 0.73413432, "num_input_tokens_seen": 185990200, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.12469482, "step": 8656, "time_per_iteration": 2.5204362869262695 }, { "auxiliary_loss_clip": 0.06455155, "auxiliary_loss_mlp": 0.01269006, "balance_loss_clip": 0.0629113, "balance_loss_mlp": 0.01256245, "epoch": 0.5204869983466105, "flos": 27456096810240.0, "grad_norm": 1.7962253728764677, "language_loss": 0.73449153, "learning_rate": 1.9643661417009893e-06, "loss": 0.81173307, "num_input_tokens_seen": 186009880, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.12768555, "step": 8657, "time_per_iteration": 2.5865163803100586 }, { "auxiliary_loss_clip": 0.06455295, "auxiliary_loss_mlp": 0.01269847, "balance_loss_clip": 0.06293165, "balance_loss_mlp": 0.0125668, "epoch": 0.5205471215992785, "flos": 20601820644480.0, "grad_norm": 1.911523348294137, "language_loss": 0.71749747, "learning_rate": 1.9639767425184408e-06, "loss": 0.7947489, "num_input_tokens_seen": 186026680, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.13171387, "step": 8658, "time_per_iteration": 2.5307273864746094 }, { "auxiliary_loss_clip": 0.06454796, "auxiliary_loss_mlp": 0.01272277, "balance_loss_clip": 0.06292077, "balance_loss_mlp": 0.01259063, "epoch": 0.5206072448519465, "flos": 22134537129600.0, "grad_norm": 2.2408311315444474, "language_loss": 0.83487165, "learning_rate": 1.963587344701897e-06, "loss": 0.9121424, "num_input_tokens_seen": 186046920, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.13208008, "step": 8659, "time_per_iteration": 2.5574610233306885 }, { "auxiliary_loss_clip": 0.0646782, "auxiliary_loss_mlp": 0.0127455, "balance_loss_clip": 0.06295033, "balance_loss_mlp": 0.01260376, "epoch": 0.5206673681046144, "flos": 18335924933760.0, "grad_norm": 1.9441595591793397, "language_loss": 0.75543427, "learning_rate": 1.9631979482661253e-06, "loss": 0.83285797, "num_input_tokens_seen": 186062090, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.14178467, "step": 8660, "time_per_iteration": 2.5121681690216064 }, { "auxiliary_loss_clip": 0.06454162, "auxiliary_loss_mlp": 0.01270114, "balance_loss_clip": 0.06293052, "balance_loss_mlp": 0.0125824, "epoch": 0.5207274913572825, "flos": 20236488842880.0, "grad_norm": 1.7980293074198952, "language_loss": 0.78088629, "learning_rate": 1.9628085532258906e-06, "loss": 0.85812902, "num_input_tokens_seen": 186081135, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.11877441, "step": 8661, "time_per_iteration": 2.5767440795898438 }, { "auxiliary_loss_clip": 0.06459495, "auxiliary_loss_mlp": 0.01266099, "balance_loss_clip": 0.06294432, "balance_loss_mlp": 0.01254214, "epoch": 0.5207876146099504, "flos": 22133530880640.0, "grad_norm": 1.6618440278669482, "language_loss": 0.70698345, "learning_rate": 1.9624191595959603e-06, "loss": 0.78423941, "num_input_tokens_seen": 186099700, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.11889648, "step": 8662, "time_per_iteration": 2.591928482055664 }, { "auxiliary_loss_clip": 0.06445682, "auxiliary_loss_mlp": 0.01269647, "balance_loss_clip": 0.06285664, "balance_loss_mlp": 0.01257535, "epoch": 0.5208477378626184, "flos": 23885781863040.0, "grad_norm": 2.4952736298665035, "language_loss": 0.69513607, "learning_rate": 1.962029767391098e-06, "loss": 0.77228934, "num_input_tokens_seen": 186119740, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.12115479, "step": 8663, "time_per_iteration": 2.5744338035583496 }, { "auxiliary_loss_clip": 0.06462328, "auxiliary_loss_mlp": 0.01272966, "balance_loss_clip": 0.06299394, "balance_loss_mlp": 0.01260538, "epoch": 0.5209078611152863, "flos": 20968158695040.0, "grad_norm": 1.4383431684458878, "language_loss": 0.76963127, "learning_rate": 1.961640376626072e-06, "loss": 0.84698415, "num_input_tokens_seen": 186140645, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.12432861, "step": 8664, "time_per_iteration": 5.499131202697754 }, { "auxiliary_loss_clip": 0.06455131, "auxiliary_loss_mlp": 0.01274876, "balance_loss_clip": 0.06291068, "balance_loss_mlp": 0.01261876, "epoch": 0.5209679843679543, "flos": 20674006536960.0, "grad_norm": 1.892334209913994, "language_loss": 0.76486027, "learning_rate": 1.961250987315646e-06, "loss": 0.84216034, "num_input_tokens_seen": 186160130, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.12994385, "step": 8665, "time_per_iteration": 2.559441089630127 }, { "auxiliary_loss_clip": 0.06455305, "auxiliary_loss_mlp": 0.01266992, "balance_loss_clip": 0.06293301, "balance_loss_mlp": 0.01254785, "epoch": 0.5210281076206222, "flos": 20233050825600.0, "grad_norm": 1.7170597123843145, "language_loss": 0.72381926, "learning_rate": 1.960861599474586e-06, "loss": 0.80104226, "num_input_tokens_seen": 186179485, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.12225342, "step": 8666, "time_per_iteration": 2.555535316467285 }, { "auxiliary_loss_clip": 0.06471416, "auxiliary_loss_mlp": 0.01268078, "balance_loss_clip": 0.06297921, "balance_loss_mlp": 0.01253689, "epoch": 0.5210882308732903, "flos": 16075395884160.0, "grad_norm": 2.031897129017308, "language_loss": 0.68711686, "learning_rate": 1.9604722131176592e-06, "loss": 0.76451182, "num_input_tokens_seen": 186197140, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.14385986, "step": 8667, "time_per_iteration": 2.5400545597076416 }, { "auxiliary_loss_clip": 0.06452365, "auxiliary_loss_mlp": 0.01269226, "balance_loss_clip": 0.06294338, "balance_loss_mlp": 0.01257621, "epoch": 0.5211483541259582, "flos": 24831954967680.0, "grad_norm": 1.4780415792437605, "language_loss": 0.81254452, "learning_rate": 1.960082828259629e-06, "loss": 0.88976043, "num_input_tokens_seen": 186216800, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.1159668, "step": 8668, "time_per_iteration": 2.587067127227783 }, { "auxiliary_loss_clip": 0.06454855, "auxiliary_loss_mlp": 0.01266283, "balance_loss_clip": 0.06291653, "balance_loss_mlp": 0.0125404, "epoch": 0.5212084773786262, "flos": 20375997091200.0, "grad_norm": 2.086558855312347, "language_loss": 0.63875937, "learning_rate": 1.9596934449152623e-06, "loss": 0.71597075, "num_input_tokens_seen": 186235320, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.12231445, "step": 8669, "time_per_iteration": 2.532960891723633 }, { "auxiliary_loss_clip": 0.06455056, "auxiliary_loss_mlp": 0.01269777, "balance_loss_clip": 0.06292602, "balance_loss_mlp": 0.01257749, "epoch": 0.5212686006312941, "flos": 23151596388480.0, "grad_norm": 1.8138326412863088, "language_loss": 0.6663425, "learning_rate": 1.959304063099325e-06, "loss": 0.74359077, "num_input_tokens_seen": 186254460, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.12017822, "step": 8670, "time_per_iteration": 2.567356586456299 }, { "auxiliary_loss_clip": 0.06448983, "auxiliary_loss_mlp": 0.01272587, "balance_loss_clip": 0.06291325, "balance_loss_mlp": 0.01261024, "epoch": 0.5213287238839621, "flos": 27780073822080.0, "grad_norm": 1.9075508048857712, "language_loss": 0.76031566, "learning_rate": 1.9589146828265806e-06, "loss": 0.83753127, "num_input_tokens_seen": 186269465, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.11572266, "step": 8671, "time_per_iteration": 2.585026741027832 }, { "auxiliary_loss_clip": 0.06461209, "auxiliary_loss_mlp": 0.01270834, "balance_loss_clip": 0.06296047, "balance_loss_mlp": 0.01257912, "epoch": 0.5213888471366301, "flos": 19943762204160.0, "grad_norm": 1.8115449818793066, "language_loss": 0.7842325, "learning_rate": 1.958525304111796e-06, "loss": 0.86155295, "num_input_tokens_seen": 186288660, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.12927246, "step": 8672, "time_per_iteration": 2.534823179244995 }, { "auxiliary_loss_clip": 0.06453002, "auxiliary_loss_mlp": 0.0126601, "balance_loss_clip": 0.06294394, "balance_loss_mlp": 0.01253875, "epoch": 0.521448970389298, "flos": 16988389971840.0, "grad_norm": 1.8331532708626632, "language_loss": 0.71456301, "learning_rate": 1.958135926969736e-06, "loss": 0.79175317, "num_input_tokens_seen": 186305760, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.12133789, "step": 8673, "time_per_iteration": 2.667585849761963 }, { "auxiliary_loss_clip": 0.06452186, "auxiliary_loss_mlp": 0.01268447, "balance_loss_clip": 0.0629034, "balance_loss_mlp": 0.01256037, "epoch": 0.5215090936419661, "flos": 18995744309760.0, "grad_norm": 1.4308108927045329, "language_loss": 0.74843985, "learning_rate": 1.957746551415166e-06, "loss": 0.82564616, "num_input_tokens_seen": 186324135, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.12408447, "step": 8674, "time_per_iteration": 2.5178656578063965 }, { "auxiliary_loss_clip": 0.06454131, "auxiliary_loss_mlp": 0.01272859, "balance_loss_clip": 0.06289473, "balance_loss_mlp": 0.01259191, "epoch": 0.521569216894634, "flos": 16148923441920.0, "grad_norm": 2.6743909977374916, "language_loss": 0.86388153, "learning_rate": 1.9573571774628506e-06, "loss": 0.94115144, "num_input_tokens_seen": 186340205, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.13677979, "step": 8675, "time_per_iteration": 3.9564075469970703 }, { "auxiliary_loss_clip": 0.06366517, "auxiliary_loss_mlp": 0.012588, "balance_loss_clip": 0.06301376, "balance_loss_mlp": 0.01256547, "epoch": 0.521629340147302, "flos": 57596054296320.0, "grad_norm": 0.8525124291603349, "language_loss": 0.62801516, "learning_rate": 1.9569678051275556e-06, "loss": 0.7042684, "num_input_tokens_seen": 186396940, "router_z_loss_clip": 0.65332031, "router_z_loss_mlp": 0.02258301, "step": 8676, "time_per_iteration": 4.562709808349609 }, { "auxiliary_loss_clip": 0.06450682, "auxiliary_loss_mlp": 0.01265226, "balance_loss_clip": 0.06288781, "balance_loss_mlp": 0.01253621, "epoch": 0.5216894633999699, "flos": 26804117790720.0, "grad_norm": 1.5339475210253253, "language_loss": 0.69156098, "learning_rate": 1.956578434424046e-06, "loss": 0.76872015, "num_input_tokens_seen": 186418680, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.11602783, "step": 8677, "time_per_iteration": 2.6002275943756104 }, { "auxiliary_loss_clip": 0.06448083, "auxiliary_loss_mlp": 0.01267916, "balance_loss_clip": 0.06288891, "balance_loss_mlp": 0.01256061, "epoch": 0.5217495866526379, "flos": 26365803482880.0, "grad_norm": 1.5771515957936575, "language_loss": 0.65463436, "learning_rate": 1.956189065367086e-06, "loss": 0.73179436, "num_input_tokens_seen": 186438265, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.11853027, "step": 8678, "time_per_iteration": 2.609038829803467 }, { "auxiliary_loss_clip": 0.06456315, "auxiliary_loss_mlp": 0.01271249, "balance_loss_clip": 0.06289968, "balance_loss_mlp": 0.01257808, "epoch": 0.5218097099053058, "flos": 23590329966720.0, "grad_norm": 2.094498770943995, "language_loss": 0.68293297, "learning_rate": 1.9557996979714414e-06, "loss": 0.76020861, "num_input_tokens_seen": 186456870, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.13433838, "step": 8679, "time_per_iteration": 2.564047336578369 }, { "auxiliary_loss_clip": 0.06453057, "auxiliary_loss_mlp": 0.01267956, "balance_loss_clip": 0.06288905, "balance_loss_mlp": 0.01255743, "epoch": 0.5218698331579739, "flos": 18083253346560.0, "grad_norm": 1.8042633509400057, "language_loss": 0.66829765, "learning_rate": 1.9554103322518764e-06, "loss": 0.74550784, "num_input_tokens_seen": 186476425, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.12219238, "step": 8680, "time_per_iteration": 2.5636062622070312 }, { "auxiliary_loss_clip": 0.06448758, "auxiliary_loss_mlp": 0.01269443, "balance_loss_clip": 0.06285554, "balance_loss_mlp": 0.01257087, "epoch": 0.5219299564106418, "flos": 19287129283200.0, "grad_norm": 2.023516150547864, "language_loss": 0.83517385, "learning_rate": 1.955020968223156e-06, "loss": 0.9123559, "num_input_tokens_seen": 186492555, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.12359619, "step": 8681, "time_per_iteration": 2.5428452491760254 }, { "auxiliary_loss_clip": 0.06443174, "auxiliary_loss_mlp": 0.0126856, "balance_loss_clip": 0.06281751, "balance_loss_mlp": 0.01256865, "epoch": 0.5219900796633098, "flos": 26658613975680.0, "grad_norm": 1.637922833380836, "language_loss": 0.7796725, "learning_rate": 1.9546316059000454e-06, "loss": 0.85678989, "num_input_tokens_seen": 186513190, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.11694336, "step": 8682, "time_per_iteration": 2.6245570182800293 }, { "auxiliary_loss_clip": 0.06442701, "auxiliary_loss_mlp": 0.01266518, "balance_loss_clip": 0.06283976, "balance_loss_mlp": 0.01254776, "epoch": 0.5220502029159777, "flos": 34321148225280.0, "grad_norm": 1.3924690485429243, "language_loss": 0.69436848, "learning_rate": 1.9542422452973082e-06, "loss": 0.77146071, "num_input_tokens_seen": 186534830, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.11737061, "step": 8683, "time_per_iteration": 2.6631851196289062 }, { "auxiliary_loss_clip": 0.06448506, "auxiliary_loss_mlp": 0.01270308, "balance_loss_clip": 0.06285129, "balance_loss_mlp": 0.01257207, "epoch": 0.5221103261686457, "flos": 22161804433920.0, "grad_norm": 1.6434795792744132, "language_loss": 0.76569736, "learning_rate": 1.9538528864297104e-06, "loss": 0.84288549, "num_input_tokens_seen": 186554390, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.13104248, "step": 8684, "time_per_iteration": 2.58833909034729 }, { "auxiliary_loss_clip": 0.06439026, "auxiliary_loss_mlp": 0.01270083, "balance_loss_clip": 0.06281412, "balance_loss_mlp": 0.01257822, "epoch": 0.5221704494213137, "flos": 19214440266240.0, "grad_norm": 8.386791889594425, "language_loss": 0.75479817, "learning_rate": 1.9534635293120153e-06, "loss": 0.83188927, "num_input_tokens_seen": 186572360, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.12261963, "step": 8685, "time_per_iteration": 2.55611515045166 }, { "auxiliary_loss_clip": 0.0644691, "auxiliary_loss_mlp": 0.01267469, "balance_loss_clip": 0.0628382, "balance_loss_mlp": 0.01254952, "epoch": 0.5222305726739817, "flos": 19360069862400.0, "grad_norm": 2.3330373279665095, "language_loss": 0.81084514, "learning_rate": 1.9530741739589876e-06, "loss": 0.88798892, "num_input_tokens_seen": 186590655, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.12512207, "step": 8686, "time_per_iteration": 2.5395076274871826 }, { "auxiliary_loss_clip": 0.06432524, "auxiliary_loss_mlp": 0.01268335, "balance_loss_clip": 0.0627901, "balance_loss_mlp": 0.01256605, "epoch": 0.5222906959266497, "flos": 27821554392960.0, "grad_norm": 1.8078804092549425, "language_loss": 0.70396769, "learning_rate": 1.9526848203853927e-06, "loss": 0.78097624, "num_input_tokens_seen": 186610345, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.11730957, "step": 8687, "time_per_iteration": 2.6328587532043457 }, { "auxiliary_loss_clip": 0.06434286, "auxiliary_loss_mlp": 0.01265448, "balance_loss_clip": 0.062791, "balance_loss_mlp": 0.0125398, "epoch": 0.5223508191793176, "flos": 12717781326720.0, "grad_norm": 1.9867921496745276, "language_loss": 0.8273809, "learning_rate": 1.9522954686059936e-06, "loss": 0.90437818, "num_input_tokens_seen": 186624360, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.11468506, "step": 8688, "time_per_iteration": 2.522648811340332 }, { "auxiliary_loss_clip": 0.06442047, "auxiliary_loss_mlp": 0.01271616, "balance_loss_clip": 0.06283119, "balance_loss_mlp": 0.01259015, "epoch": 0.5224109424319856, "flos": 15637584700800.0, "grad_norm": 2.370145873029258, "language_loss": 0.74002516, "learning_rate": 1.9519061186355558e-06, "loss": 0.8171618, "num_input_tokens_seen": 186638680, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.12609863, "step": 8689, "time_per_iteration": 2.5322954654693604 }, { "auxiliary_loss_clip": 0.06439029, "auxiliary_loss_mlp": 0.01264636, "balance_loss_clip": 0.06282523, "balance_loss_mlp": 0.01252936, "epoch": 0.5224710656846535, "flos": 15747687365760.0, "grad_norm": 2.0129052379169545, "language_loss": 0.82858944, "learning_rate": 1.9515167704888417e-06, "loss": 0.90562618, "num_input_tokens_seen": 186655840, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.11700439, "step": 8690, "time_per_iteration": 2.5365726947784424 }, { "auxiliary_loss_clip": 0.06441084, "auxiliary_loss_mlp": 0.01266014, "balance_loss_clip": 0.06280793, "balance_loss_mlp": 0.0125364, "epoch": 0.5225311889373215, "flos": 26038136891520.0, "grad_norm": 1.9154368008905729, "language_loss": 0.79090196, "learning_rate": 1.9511274241806173e-06, "loss": 0.86797297, "num_input_tokens_seen": 186674150, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.12347412, "step": 8691, "time_per_iteration": 2.5939176082611084 }, { "auxiliary_loss_clip": 0.06443527, "auxiliary_loss_mlp": 0.01270898, "balance_loss_clip": 0.06279987, "balance_loss_mlp": 0.01258017, "epoch": 0.5225913121899894, "flos": 18375183371520.0, "grad_norm": 2.14268841364714, "language_loss": 0.77028191, "learning_rate": 1.950738079725646e-06, "loss": 0.84742618, "num_input_tokens_seen": 186690675, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.12884521, "step": 8692, "time_per_iteration": 2.534248113632202 }, { "auxiliary_loss_clip": 0.06439151, "auxiliary_loss_mlp": 0.01268261, "balance_loss_clip": 0.06284722, "balance_loss_mlp": 0.01256334, "epoch": 0.5226514354426575, "flos": 29280407904000.0, "grad_norm": 1.734856337557442, "language_loss": 0.72687745, "learning_rate": 1.950348737138691e-06, "loss": 0.80395162, "num_input_tokens_seen": 186710380, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.11920166, "step": 8693, "time_per_iteration": 2.6302731037139893 }, { "auxiliary_loss_clip": 0.06448534, "auxiliary_loss_mlp": 0.01267736, "balance_loss_clip": 0.06282033, "balance_loss_mlp": 0.01253455, "epoch": 0.5227115586953254, "flos": 22859330947200.0, "grad_norm": 2.6244395170458468, "language_loss": 0.82467949, "learning_rate": 1.949959396434517e-06, "loss": 0.90184218, "num_input_tokens_seen": 186729135, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.14276123, "step": 8694, "time_per_iteration": 2.586535930633545 }, { "auxiliary_loss_clip": 0.06358758, "auxiliary_loss_mlp": 0.01252244, "balance_loss_clip": 0.06294703, "balance_loss_mlp": 0.01249671, "epoch": 0.5227716819479934, "flos": 57491695635840.0, "grad_norm": 0.7385640778930845, "language_loss": 0.55677038, "learning_rate": 1.949570057627888e-06, "loss": 0.63288045, "num_input_tokens_seen": 186791115, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 0.02574158, "step": 8695, "time_per_iteration": 3.242931842803955 }, { "auxiliary_loss_clip": 0.06442334, "auxiliary_loss_mlp": 0.01266709, "balance_loss_clip": 0.06281563, "balance_loss_mlp": 0.01254341, "epoch": 0.5228318052006613, "flos": 13813357461120.0, "grad_norm": 1.8011069742428474, "language_loss": 0.73590708, "learning_rate": 1.9491807207335672e-06, "loss": 0.81299746, "num_input_tokens_seen": 186808660, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.12365723, "step": 8696, "time_per_iteration": 2.547196388244629 }, { "auxiliary_loss_clip": 0.06447487, "auxiliary_loss_mlp": 0.01270957, "balance_loss_clip": 0.06286065, "balance_loss_mlp": 0.0125813, "epoch": 0.5228919284533293, "flos": 15601596572160.0, "grad_norm": 2.0529638484877273, "language_loss": 0.71950746, "learning_rate": 1.948791385766319e-06, "loss": 0.79669189, "num_input_tokens_seen": 186825900, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.12835693, "step": 8697, "time_per_iteration": 2.535383701324463 }, { "auxiliary_loss_clip": 0.06438342, "auxiliary_loss_mlp": 0.01265383, "balance_loss_clip": 0.0628066, "balance_loss_mlp": 0.01253498, "epoch": 0.5229520517059973, "flos": 22497982214400.0, "grad_norm": 1.9620351824116693, "language_loss": 0.80587852, "learning_rate": 1.948402052740906e-06, "loss": 0.88291574, "num_input_tokens_seen": 186843735, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.11901855, "step": 8698, "time_per_iteration": 2.560842275619507 }, { "auxiliary_loss_clip": 0.06443366, "auxiliary_loss_mlp": 0.01269364, "balance_loss_clip": 0.0628516, "balance_loss_mlp": 0.0125733, "epoch": 0.5230121749586653, "flos": 22097416970880.0, "grad_norm": 1.9132589009718768, "language_loss": 0.74553144, "learning_rate": 1.948012721672093e-06, "loss": 0.82265866, "num_input_tokens_seen": 186862440, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.12030029, "step": 8699, "time_per_iteration": 2.55959153175354 }, { "auxiliary_loss_clip": 0.06451804, "auxiliary_loss_mlp": 0.01273575, "balance_loss_clip": 0.06283208, "balance_loss_mlp": 0.01260594, "epoch": 0.5230722982113333, "flos": 22133656661760.0, "grad_norm": 2.2249908417436464, "language_loss": 0.73912883, "learning_rate": 1.947623392574642e-06, "loss": 0.81638265, "num_input_tokens_seen": 186880940, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.12982178, "step": 8700, "time_per_iteration": 2.5504090785980225 }, { "auxiliary_loss_clip": 0.06448491, "auxiliary_loss_mlp": 0.01275583, "balance_loss_clip": 0.06283355, "balance_loss_mlp": 0.01262279, "epoch": 0.5231324214640012, "flos": 25016214096000.0, "grad_norm": 1.7527558172988715, "language_loss": 0.67708468, "learning_rate": 1.947234065463318e-06, "loss": 0.75432539, "num_input_tokens_seen": 186900785, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.1328125, "step": 8701, "time_per_iteration": 2.5826823711395264 }, { "auxiliary_loss_clip": 0.06439403, "auxiliary_loss_mlp": 0.01268054, "balance_loss_clip": 0.06279974, "balance_loss_mlp": 0.01255657, "epoch": 0.5231925447166692, "flos": 25747842021120.0, "grad_norm": 1.6818086446852603, "language_loss": 0.66867518, "learning_rate": 1.9468447403528826e-06, "loss": 0.74574971, "num_input_tokens_seen": 186920895, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.12408447, "step": 8702, "time_per_iteration": 2.6086649894714355 }, { "auxiliary_loss_clip": 0.06440499, "auxiliary_loss_mlp": 0.0126809, "balance_loss_clip": 0.06281051, "balance_loss_mlp": 0.01255376, "epoch": 0.5232526679693371, "flos": 21440322852480.0, "grad_norm": 2.0016799028308876, "language_loss": 0.76450109, "learning_rate": 1.946455417258101e-06, "loss": 0.84158695, "num_input_tokens_seen": 186940605, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.1270752, "step": 8703, "time_per_iteration": 4.010331630706787 }, { "auxiliary_loss_clip": 0.06450912, "auxiliary_loss_mlp": 0.012729, "balance_loss_clip": 0.06282093, "balance_loss_mlp": 0.01258816, "epoch": 0.5233127912220051, "flos": 35307082892160.0, "grad_norm": 2.0913167324308937, "language_loss": 0.77544349, "learning_rate": 1.9460660961937348e-06, "loss": 0.85268158, "num_input_tokens_seen": 186960820, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.14099121, "step": 8704, "time_per_iteration": 4.104556560516357 }, { "auxiliary_loss_clip": 0.06443571, "auxiliary_loss_mlp": 0.01281035, "balance_loss_clip": 0.06284378, "balance_loss_mlp": 0.01268948, "epoch": 0.523372914474673, "flos": 17056257379200.0, "grad_norm": 1.6790916317085762, "language_loss": 0.78334701, "learning_rate": 1.9456767771745474e-06, "loss": 0.86059308, "num_input_tokens_seen": 186976240, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.12091064, "step": 8705, "time_per_iteration": 2.5063979625701904 }, { "auxiliary_loss_clip": 0.06450035, "auxiliary_loss_mlp": 0.01268983, "balance_loss_clip": 0.0628411, "balance_loss_mlp": 0.01255745, "epoch": 0.5234330377273411, "flos": 18412303530240.0, "grad_norm": 1.9107869781208058, "language_loss": 0.69697291, "learning_rate": 1.9452874602153027e-06, "loss": 0.77416313, "num_input_tokens_seen": 186992855, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.13250732, "step": 8706, "time_per_iteration": 2.5515730381011963 }, { "auxiliary_loss_clip": 0.06348041, "auxiliary_loss_mlp": 0.01254278, "balance_loss_clip": 0.06283525, "balance_loss_mlp": 0.0125215, "epoch": 0.523493160980009, "flos": 65872426429440.0, "grad_norm": 0.6590103833888878, "language_loss": 0.52319229, "learning_rate": 1.9448981453307623e-06, "loss": 0.59921551, "num_input_tokens_seen": 187051205, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 0.02130127, "step": 8707, "time_per_iteration": 3.2449398040771484 }, { "auxiliary_loss_clip": 0.06443788, "auxiliary_loss_mlp": 0.01268769, "balance_loss_clip": 0.06283837, "balance_loss_mlp": 0.01256842, "epoch": 0.523553284232677, "flos": 21878595233280.0, "grad_norm": 1.6206452333432835, "language_loss": 0.75182307, "learning_rate": 1.9445088325356904e-06, "loss": 0.82894862, "num_input_tokens_seen": 187070540, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.11938477, "step": 8708, "time_per_iteration": 2.5829102993011475 }, { "auxiliary_loss_clip": 0.06440236, "auxiliary_loss_mlp": 0.01269201, "balance_loss_clip": 0.06283401, "balance_loss_mlp": 0.01257959, "epoch": 0.5236134074853449, "flos": 20854156815360.0, "grad_norm": 1.601911746997418, "language_loss": 0.77673876, "learning_rate": 1.944119521844849e-06, "loss": 0.85383308, "num_input_tokens_seen": 187089975, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.11248779, "step": 8709, "time_per_iteration": 2.5954442024230957 }, { "auxiliary_loss_clip": 0.06450649, "auxiliary_loss_mlp": 0.01271047, "balance_loss_clip": 0.06282628, "balance_loss_mlp": 0.01257523, "epoch": 0.5236735307380129, "flos": 25527510910080.0, "grad_norm": 2.0149388395972614, "language_loss": 0.84003234, "learning_rate": 1.9437302132730003e-06, "loss": 0.91724932, "num_input_tokens_seen": 187108775, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.13537598, "step": 8710, "time_per_iteration": 2.6118552684783936 }, { "auxiliary_loss_clip": 0.06442829, "auxiliary_loss_mlp": 0.01271457, "balance_loss_clip": 0.06285623, "balance_loss_mlp": 0.01259965, "epoch": 0.523733653990681, "flos": 23589281790720.0, "grad_norm": 1.8658740661994806, "language_loss": 0.70071638, "learning_rate": 1.943340906834908e-06, "loss": 0.77785921, "num_input_tokens_seen": 187128830, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.11499023, "step": 8711, "time_per_iteration": 2.586482286453247 }, { "auxiliary_loss_clip": 0.06443678, "auxiliary_loss_mlp": 0.01269507, "balance_loss_clip": 0.06284354, "balance_loss_mlp": 0.01257658, "epoch": 0.5237937772433489, "flos": 21112698188160.0, "grad_norm": 2.2410387768650004, "language_loss": 0.83399642, "learning_rate": 1.9429516025453345e-06, "loss": 0.91112828, "num_input_tokens_seen": 187149570, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.11846924, "step": 8712, "time_per_iteration": 2.5854556560516357 }, { "auxiliary_loss_clip": 0.06445932, "auxiliary_loss_mlp": 0.0127103, "balance_loss_clip": 0.0628352, "balance_loss_mlp": 0.01258311, "epoch": 0.5238539004960169, "flos": 19179081043200.0, "grad_norm": 1.5940282139969395, "language_loss": 0.69808078, "learning_rate": 1.9425623004190415e-06, "loss": 0.77525043, "num_input_tokens_seen": 187170575, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.12719727, "step": 8713, "time_per_iteration": 2.5938432216644287 }, { "auxiliary_loss_clip": 0.06450568, "auxiliary_loss_mlp": 0.01268724, "balance_loss_clip": 0.06284592, "balance_loss_mlp": 0.01254872, "epoch": 0.5239140237486848, "flos": 17892914797440.0, "grad_norm": 3.034360853275486, "language_loss": 0.76490587, "learning_rate": 1.9421730004707925e-06, "loss": 0.84209883, "num_input_tokens_seen": 187187190, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.1383667, "step": 8714, "time_per_iteration": 2.569509267807007 }, { "auxiliary_loss_clip": 0.06448717, "auxiliary_loss_mlp": 0.01269139, "balance_loss_clip": 0.0628579, "balance_loss_mlp": 0.01255979, "epoch": 0.5239741470013528, "flos": 17936072449920.0, "grad_norm": 5.725702793117526, "language_loss": 0.76450771, "learning_rate": 1.9417837027153483e-06, "loss": 0.84168631, "num_input_tokens_seen": 187204350, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.1315918, "step": 8715, "time_per_iteration": 5.363412141799927 }, { "auxiliary_loss_clip": 0.06443337, "auxiliary_loss_mlp": 0.01265678, "balance_loss_clip": 0.0628473, "balance_loss_mlp": 0.01254014, "epoch": 0.5240342702540207, "flos": 31001408513280.0, "grad_norm": 1.4091728480498436, "language_loss": 0.71309972, "learning_rate": 1.9413944071674723e-06, "loss": 0.79018986, "num_input_tokens_seen": 187225605, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.11663818, "step": 8716, "time_per_iteration": 2.680579900741577 }, { "auxiliary_loss_clip": 0.0644725, "auxiliary_loss_mlp": 0.01266837, "balance_loss_clip": 0.06285484, "balance_loss_mlp": 0.01255768, "epoch": 0.5240943935066887, "flos": 25011308632320.0, "grad_norm": 1.8396511192804517, "language_loss": 0.87157214, "learning_rate": 1.941005113841926e-06, "loss": 0.94871294, "num_input_tokens_seen": 187241335, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.11071777, "step": 8717, "time_per_iteration": 2.5538272857666016 }, { "auxiliary_loss_clip": 0.06445344, "auxiliary_loss_mlp": 0.01274761, "balance_loss_clip": 0.06283043, "balance_loss_mlp": 0.01261982, "epoch": 0.5241545167593566, "flos": 23665786168320.0, "grad_norm": 2.105060275097621, "language_loss": 0.61259609, "learning_rate": 1.9406158227534723e-06, "loss": 0.68979716, "num_input_tokens_seen": 187259925, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.12792969, "step": 8718, "time_per_iteration": 2.5815887451171875 }, { "auxiliary_loss_clip": 0.06456706, "auxiliary_loss_mlp": 0.01270198, "balance_loss_clip": 0.06292193, "balance_loss_mlp": 0.01257616, "epoch": 0.5242146400120247, "flos": 23406490108800.0, "grad_norm": 1.6103535661611088, "language_loss": 0.72209054, "learning_rate": 1.940226533916872e-06, "loss": 0.79935962, "num_input_tokens_seen": 187279035, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.12585449, "step": 8719, "time_per_iteration": 2.5527379512786865 }, { "auxiliary_loss_clip": 0.06442644, "auxiliary_loss_mlp": 0.0126832, "balance_loss_clip": 0.06285274, "balance_loss_mlp": 0.01257162, "epoch": 0.5242747632646926, "flos": 17754873995520.0, "grad_norm": 2.330141789827675, "language_loss": 0.73428291, "learning_rate": 1.9398372473468877e-06, "loss": 0.81139255, "num_input_tokens_seen": 187297555, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.11157227, "step": 8720, "time_per_iteration": 2.7219486236572266 }, { "auxiliary_loss_clip": 0.06446069, "auxiliary_loss_mlp": 0.01269182, "balance_loss_clip": 0.06284623, "balance_loss_mlp": 0.01256629, "epoch": 0.5243348865173606, "flos": 32605849693440.0, "grad_norm": 1.8304962496402708, "language_loss": 0.70655668, "learning_rate": 1.939447963058281e-06, "loss": 0.78370923, "num_input_tokens_seen": 187320265, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.12561035, "step": 8721, "time_per_iteration": 2.648582696914673 }, { "auxiliary_loss_clip": 0.0644957, "auxiliary_loss_mlp": 0.01272251, "balance_loss_clip": 0.06289496, "balance_loss_mlp": 0.01259948, "epoch": 0.5243950097700285, "flos": 25491229292160.0, "grad_norm": 1.8390370558333935, "language_loss": 0.86436057, "learning_rate": 1.939058681065813e-06, "loss": 0.94157875, "num_input_tokens_seen": 187338045, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.12304688, "step": 8722, "time_per_iteration": 2.597555160522461 }, { "auxiliary_loss_clip": 0.06436808, "auxiliary_loss_mlp": 0.01274032, "balance_loss_clip": 0.06278484, "balance_loss_mlp": 0.01262052, "epoch": 0.5244551330226965, "flos": 15273846126720.0, "grad_norm": 1.6429268202442018, "language_loss": 0.80193186, "learning_rate": 1.938669401384247e-06, "loss": 0.87904024, "num_input_tokens_seen": 187356040, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.11993408, "step": 8723, "time_per_iteration": 2.524616003036499 }, { "auxiliary_loss_clip": 0.06452952, "auxiliary_loss_mlp": 0.01271938, "balance_loss_clip": 0.0628908, "balance_loss_mlp": 0.01259218, "epoch": 0.5245152562753645, "flos": 22243717399680.0, "grad_norm": 1.8721155811218757, "language_loss": 0.7496419, "learning_rate": 1.9382801240283426e-06, "loss": 0.82689077, "num_input_tokens_seen": 187374185, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.1272583, "step": 8724, "time_per_iteration": 2.5801734924316406 }, { "auxiliary_loss_clip": 0.06458154, "auxiliary_loss_mlp": 0.0127052, "balance_loss_clip": 0.062877, "balance_loss_mlp": 0.01256674, "epoch": 0.5245753795280325, "flos": 29434548689280.0, "grad_norm": 1.6519414282924745, "language_loss": 0.70487607, "learning_rate": 1.9378908490128625e-06, "loss": 0.78216285, "num_input_tokens_seen": 187396640, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.1385498, "step": 8725, "time_per_iteration": 2.635711193084717 }, { "auxiliary_loss_clip": 0.06342599, "auxiliary_loss_mlp": 0.01256711, "balance_loss_clip": 0.06278326, "balance_loss_mlp": 0.01254721, "epoch": 0.5246355027807005, "flos": 58853569645440.0, "grad_norm": 0.7374990488664755, "language_loss": 0.55604553, "learning_rate": 1.937501576352568e-06, "loss": 0.63203859, "num_input_tokens_seen": 187455945, "router_z_loss_clip": 0.64257812, "router_z_loss_mlp": 0.01989746, "step": 8726, "time_per_iteration": 3.1988706588745117 }, { "auxiliary_loss_clip": 0.06341806, "auxiliary_loss_mlp": 0.01267231, "balance_loss_clip": 0.06277115, "balance_loss_mlp": 0.01265068, "epoch": 0.5246956260333684, "flos": 64546792110720.0, "grad_norm": 0.7837363757104584, "language_loss": 0.58426607, "learning_rate": 1.937112306062219e-06, "loss": 0.6603564, "num_input_tokens_seen": 187519975, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 0.02166748, "step": 8727, "time_per_iteration": 3.1992883682250977 }, { "auxiliary_loss_clip": 0.06452371, "auxiliary_loss_mlp": 0.01272552, "balance_loss_clip": 0.06287955, "balance_loss_mlp": 0.01259928, "epoch": 0.5247557492860364, "flos": 24540276504960.0, "grad_norm": 1.3963367562424502, "language_loss": 0.70586526, "learning_rate": 1.9367230381565786e-06, "loss": 0.78311449, "num_input_tokens_seen": 187541775, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.12640381, "step": 8728, "time_per_iteration": 2.6163697242736816 }, { "auxiliary_loss_clip": 0.06443807, "auxiliary_loss_mlp": 0.0126938, "balance_loss_clip": 0.06282599, "balance_loss_mlp": 0.01257447, "epoch": 0.5248158725387043, "flos": 18811946378880.0, "grad_norm": 1.436363796917257, "language_loss": 0.70083082, "learning_rate": 1.9363337726504062e-06, "loss": 0.77796268, "num_input_tokens_seen": 187560425, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.1194458, "step": 8729, "time_per_iteration": 2.5540475845336914 }, { "auxiliary_loss_clip": 0.06448498, "auxiliary_loss_mlp": 0.01272166, "balance_loss_clip": 0.06284668, "balance_loss_mlp": 0.01259762, "epoch": 0.5248759957913723, "flos": 20961534222720.0, "grad_norm": 1.6696098393693644, "language_loss": 0.84539121, "learning_rate": 1.935944509558464e-06, "loss": 0.92259783, "num_input_tokens_seen": 187579930, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.12402344, "step": 8730, "time_per_iteration": 2.5898444652557373 }, { "auxiliary_loss_clip": 0.06440577, "auxiliary_loss_mlp": 0.0126648, "balance_loss_clip": 0.06279809, "balance_loss_mlp": 0.01254481, "epoch": 0.5249361190440403, "flos": 18666903761280.0, "grad_norm": 1.9988295645389687, "language_loss": 0.79802191, "learning_rate": 1.9355552488955125e-06, "loss": 0.87509251, "num_input_tokens_seen": 187595365, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.11999512, "step": 8731, "time_per_iteration": 2.5746278762817383 }, { "auxiliary_loss_clip": 0.0643857, "auxiliary_loss_mlp": 0.01271526, "balance_loss_clip": 0.06282978, "balance_loss_mlp": 0.01259772, "epoch": 0.5249962422967083, "flos": 24870249083520.0, "grad_norm": 1.6632710750744246, "language_loss": 0.83353949, "learning_rate": 1.935165990676312e-06, "loss": 0.91064048, "num_input_tokens_seen": 187614715, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.11761475, "step": 8732, "time_per_iteration": 2.600942611694336 }, { "auxiliary_loss_clip": 0.06441464, "auxiliary_loss_mlp": 0.01271246, "balance_loss_clip": 0.06283198, "balance_loss_mlp": 0.01259218, "epoch": 0.5250563655493762, "flos": 15267179727360.0, "grad_norm": 1.502948336402697, "language_loss": 0.78199518, "learning_rate": 1.9347767349156237e-06, "loss": 0.85912234, "num_input_tokens_seen": 187630745, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.12042236, "step": 8733, "time_per_iteration": 2.511507511138916 }, { "auxiliary_loss_clip": 0.0645067, "auxiliary_loss_mlp": 0.01266749, "balance_loss_clip": 0.06286152, "balance_loss_mlp": 0.01254107, "epoch": 0.5251164888020442, "flos": 18631209121920.0, "grad_norm": 3.1847335083687125, "language_loss": 0.81756365, "learning_rate": 1.934387481628208e-06, "loss": 0.8947379, "num_input_tokens_seen": 187648200, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.12640381, "step": 8734, "time_per_iteration": 2.5487871170043945 }, { "auxiliary_loss_clip": 0.06440733, "auxiliary_loss_mlp": 0.01266763, "balance_loss_clip": 0.06282456, "balance_loss_mlp": 0.0125477, "epoch": 0.5251766120547121, "flos": 29717632108800.0, "grad_norm": 1.2860964465352027, "language_loss": 0.76733702, "learning_rate": 1.933998230828826e-06, "loss": 0.84441197, "num_input_tokens_seen": 187669205, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.11981201, "step": 8735, "time_per_iteration": 2.6318933963775635 }, { "auxiliary_loss_clip": 0.06441407, "auxiliary_loss_mlp": 0.01267563, "balance_loss_clip": 0.06281129, "balance_loss_mlp": 0.01256113, "epoch": 0.5252367353073801, "flos": 23446964430720.0, "grad_norm": 1.7529693264451216, "language_loss": 0.80747551, "learning_rate": 1.9336089825322376e-06, "loss": 0.88456523, "num_input_tokens_seen": 187690890, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.11444092, "step": 8736, "time_per_iteration": 2.6155436038970947 }, { "auxiliary_loss_clip": 0.06443887, "auxiliary_loss_mlp": 0.01268765, "balance_loss_clip": 0.06284831, "balance_loss_mlp": 0.0125632, "epoch": 0.5252968585600482, "flos": 30818658758400.0, "grad_norm": 2.208154331855085, "language_loss": 0.70460033, "learning_rate": 1.9332197367532033e-06, "loss": 0.7817269, "num_input_tokens_seen": 187713045, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.12451172, "step": 8737, "time_per_iteration": 2.688633918762207 }, { "auxiliary_loss_clip": 0.06444088, "auxiliary_loss_mlp": 0.0126963, "balance_loss_clip": 0.06282718, "balance_loss_mlp": 0.01257429, "epoch": 0.5253569818127161, "flos": 20634035339520.0, "grad_norm": 1.4394052884968973, "language_loss": 0.77486455, "learning_rate": 1.9328304935064833e-06, "loss": 0.85200179, "num_input_tokens_seen": 187733640, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.12207031, "step": 8738, "time_per_iteration": 2.62073016166687 }, { "auxiliary_loss_clip": 0.06332093, "auxiliary_loss_mlp": 0.01253864, "balance_loss_clip": 0.06267253, "balance_loss_mlp": 0.01251745, "epoch": 0.5254171050653841, "flos": 63448155302400.0, "grad_norm": 0.736197733760742, "language_loss": 0.54381406, "learning_rate": 1.932441252806837e-06, "loss": 0.61967373, "num_input_tokens_seen": 187792930, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 0.02120972, "step": 8739, "time_per_iteration": 3.208871841430664 }, { "auxiliary_loss_clip": 0.06443171, "auxiliary_loss_mlp": 0.01269195, "balance_loss_clip": 0.06284089, "balance_loss_mlp": 0.01256684, "epoch": 0.525477228318052, "flos": 34678136545920.0, "grad_norm": 1.7744524982732366, "language_loss": 0.84629154, "learning_rate": 1.9320520146690263e-06, "loss": 0.92341518, "num_input_tokens_seen": 187812495, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.12512207, "step": 8740, "time_per_iteration": 2.7054100036621094 }, { "auxiliary_loss_clip": 0.06446021, "auxiliary_loss_mlp": 0.01264406, "balance_loss_clip": 0.0628616, "balance_loss_mlp": 0.0125289, "epoch": 0.52553735157072, "flos": 17936575574400.0, "grad_norm": 2.9161760452087044, "language_loss": 0.69515622, "learning_rate": 1.9316627791078093e-06, "loss": 0.77226043, "num_input_tokens_seen": 187829685, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.11523438, "step": 8741, "time_per_iteration": 2.5889194011688232 }, { "auxiliary_loss_clip": 0.06446846, "auxiliary_loss_mlp": 0.01268369, "balance_loss_clip": 0.06281564, "balance_loss_mlp": 0.01255238, "epoch": 0.5255974748233879, "flos": 9945326557440.0, "grad_norm": 1.715566798473937, "language_loss": 0.66642916, "learning_rate": 1.931273546137947e-06, "loss": 0.7435813, "num_input_tokens_seen": 187846495, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.13140869, "step": 8742, "time_per_iteration": 4.0793774127960205 }, { "auxiliary_loss_clip": 0.06450911, "auxiliary_loss_mlp": 0.01269196, "balance_loss_clip": 0.06283549, "balance_loss_mlp": 0.01255779, "epoch": 0.5256575980760559, "flos": 16873256062080.0, "grad_norm": 2.634535632364583, "language_loss": 0.63496268, "learning_rate": 1.9308843157741983e-06, "loss": 0.71216375, "num_input_tokens_seen": 187862010, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.13433838, "step": 8743, "time_per_iteration": 2.5424795150756836 }, { "auxiliary_loss_clip": 0.06326626, "auxiliary_loss_mlp": 0.01252697, "balance_loss_clip": 0.06261983, "balance_loss_mlp": 0.01250537, "epoch": 0.5257177213287239, "flos": 62408105297280.0, "grad_norm": 0.7706000713130371, "language_loss": 0.54127616, "learning_rate": 1.930495088031323e-06, "loss": 0.61706936, "num_input_tokens_seen": 187922730, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 0.02163696, "step": 8744, "time_per_iteration": 4.766629219055176 }, { "auxiliary_loss_clip": 0.06452717, "auxiliary_loss_mlp": 0.01269699, "balance_loss_clip": 0.06285327, "balance_loss_mlp": 0.01255805, "epoch": 0.5257778445813919, "flos": 20783144880000.0, "grad_norm": 2.3380335590090713, "language_loss": 0.76109856, "learning_rate": 1.9301058629240814e-06, "loss": 0.83832276, "num_input_tokens_seen": 187940160, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.13903809, "step": 8745, "time_per_iteration": 2.5535130500793457 }, { "auxiliary_loss_clip": 0.06440704, "auxiliary_loss_mlp": 0.01271014, "balance_loss_clip": 0.06280553, "balance_loss_mlp": 0.01258396, "epoch": 0.5258379678340598, "flos": 17024168465280.0, "grad_norm": 1.9060727952066792, "language_loss": 0.8149966, "learning_rate": 1.9297166404672324e-06, "loss": 0.8921138, "num_input_tokens_seen": 187958625, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.12628174, "step": 8746, "time_per_iteration": 2.5478687286376953 }, { "auxiliary_loss_clip": 0.06438941, "auxiliary_loss_mlp": 0.01270786, "balance_loss_clip": 0.06281626, "balance_loss_mlp": 0.01259348, "epoch": 0.5258980910867278, "flos": 21075032977920.0, "grad_norm": 1.8747676375794475, "language_loss": 0.76358193, "learning_rate": 1.9293274206755353e-06, "loss": 0.84067923, "num_input_tokens_seen": 187977575, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.11437988, "step": 8747, "time_per_iteration": 2.5873239040374756 }, { "auxiliary_loss_clip": 0.06437518, "auxiliary_loss_mlp": 0.01271082, "balance_loss_clip": 0.06281938, "balance_loss_mlp": 0.01258726, "epoch": 0.5259582143393957, "flos": 18010312767360.0, "grad_norm": 1.7716154817787022, "language_loss": 0.82830715, "learning_rate": 1.9289382035637505e-06, "loss": 0.90539318, "num_input_tokens_seen": 187996650, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.12347412, "step": 8748, "time_per_iteration": 2.6238090991973877 }, { "auxiliary_loss_clip": 0.06442738, "auxiliary_loss_mlp": 0.01269643, "balance_loss_clip": 0.06280066, "balance_loss_mlp": 0.01257084, "epoch": 0.5260183375920637, "flos": 22790457290880.0, "grad_norm": 1.808380026696835, "language_loss": 0.80677009, "learning_rate": 1.9285489891466345e-06, "loss": 0.88389391, "num_input_tokens_seen": 188013510, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.12567139, "step": 8749, "time_per_iteration": 2.6837592124938965 }, { "auxiliary_loss_clip": 0.06442542, "auxiliary_loss_mlp": 0.01270406, "balance_loss_clip": 0.0628194, "balance_loss_mlp": 0.01258389, "epoch": 0.5260784608447318, "flos": 27059682343680.0, "grad_norm": 1.7634453309656433, "language_loss": 0.72613418, "learning_rate": 1.9281597774389487e-06, "loss": 0.80326366, "num_input_tokens_seen": 188032085, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.12017822, "step": 8750, "time_per_iteration": 2.605361223220825 }, { "auxiliary_loss_clip": 0.06441261, "auxiliary_loss_mlp": 0.01266386, "balance_loss_clip": 0.06280261, "balance_loss_mlp": 0.01254293, "epoch": 0.5261385840973997, "flos": 20668262532480.0, "grad_norm": 1.306123921549179, "language_loss": 0.76864731, "learning_rate": 1.9277705684554517e-06, "loss": 0.84572381, "num_input_tokens_seen": 188050590, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.12103271, "step": 8751, "time_per_iteration": 2.547729253768921 }, { "auxiliary_loss_clip": 0.06439857, "auxiliary_loss_mlp": 0.01267123, "balance_loss_clip": 0.06281415, "balance_loss_mlp": 0.01255399, "epoch": 0.5261987073500677, "flos": 23629336842240.0, "grad_norm": 1.4658612855950501, "language_loss": 0.76032972, "learning_rate": 1.927381362210902e-06, "loss": 0.83739948, "num_input_tokens_seen": 188071620, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.1171875, "step": 8752, "time_per_iteration": 2.616586446762085 }, { "auxiliary_loss_clip": 0.06451501, "auxiliary_loss_mlp": 0.01271027, "balance_loss_clip": 0.06286058, "balance_loss_mlp": 0.01257425, "epoch": 0.5262588306027356, "flos": 27643626247680.0, "grad_norm": 1.50992581156807, "language_loss": 0.68306828, "learning_rate": 1.926992158720058e-06, "loss": 0.7602936, "num_input_tokens_seen": 188091740, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.13604736, "step": 8753, "time_per_iteration": 2.6477208137512207 }, { "auxiliary_loss_clip": 0.06441985, "auxiliary_loss_mlp": 0.0126898, "balance_loss_clip": 0.06283361, "balance_loss_mlp": 0.01256553, "epoch": 0.5263189538554036, "flos": 21765725383680.0, "grad_norm": 1.487775834314126, "language_loss": 0.83844829, "learning_rate": 1.9266029579976785e-06, "loss": 0.91555798, "num_input_tokens_seen": 188111165, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.12426758, "step": 8754, "time_per_iteration": 4.044743299484253 }, { "auxiliary_loss_clip": 0.06444547, "auxiliary_loss_mlp": 0.01268462, "balance_loss_clip": 0.06281051, "balance_loss_mlp": 0.01256583, "epoch": 0.5263790771080715, "flos": 14280490373760.0, "grad_norm": 2.042669080858969, "language_loss": 0.87640035, "learning_rate": 1.926213760058522e-06, "loss": 0.95353043, "num_input_tokens_seen": 188127825, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.11883545, "step": 8755, "time_per_iteration": 2.554685115814209 }, { "auxiliary_loss_clip": 0.06336763, "auxiliary_loss_mlp": 0.01255468, "balance_loss_clip": 0.06271875, "balance_loss_mlp": 0.01253288, "epoch": 0.5264392003607395, "flos": 65827298206080.0, "grad_norm": 0.704519519947122, "language_loss": 0.58684564, "learning_rate": 1.9258245649173477e-06, "loss": 0.66276795, "num_input_tokens_seen": 188194050, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 0.02183533, "step": 8756, "time_per_iteration": 3.255124568939209 }, { "auxiliary_loss_clip": 0.06448824, "auxiliary_loss_mlp": 0.01272134, "balance_loss_clip": 0.06282972, "balance_loss_mlp": 0.01259427, "epoch": 0.5264993236134075, "flos": 21038709432960.0, "grad_norm": 1.6112837510572762, "language_loss": 0.70603156, "learning_rate": 1.925435372588913e-06, "loss": 0.78324109, "num_input_tokens_seen": 188212565, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.1272583, "step": 8757, "time_per_iteration": 2.570219039916992 }, { "auxiliary_loss_clip": 0.06440428, "auxiliary_loss_mlp": 0.01275741, "balance_loss_clip": 0.06278251, "balance_loss_mlp": 0.01263122, "epoch": 0.5265594468660755, "flos": 16623854784000.0, "grad_norm": 1.521478692733436, "language_loss": 0.88093472, "learning_rate": 1.9250461830879768e-06, "loss": 0.9580965, "num_input_tokens_seen": 188229505, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.1262207, "step": 8758, "time_per_iteration": 2.529763698577881 }, { "auxiliary_loss_clip": 0.06448083, "auxiliary_loss_mlp": 0.01274912, "balance_loss_clip": 0.06284171, "balance_loss_mlp": 0.01261608, "epoch": 0.5266195701187434, "flos": 24141010999680.0, "grad_norm": 1.466064635340553, "language_loss": 0.76213956, "learning_rate": 1.9246569964292965e-06, "loss": 0.83936954, "num_input_tokens_seen": 188250395, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.13293457, "step": 8759, "time_per_iteration": 2.590294122695923 }, { "auxiliary_loss_clip": 0.06439286, "auxiliary_loss_mlp": 0.01272776, "balance_loss_clip": 0.06281322, "balance_loss_mlp": 0.01260217, "epoch": 0.5266796933714114, "flos": 15848314519680.0, "grad_norm": 2.1417872024975053, "language_loss": 0.72091532, "learning_rate": 1.9242678126276307e-06, "loss": 0.79803592, "num_input_tokens_seen": 188266785, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.12542725, "step": 8760, "time_per_iteration": 2.5540850162506104 }, { "auxiliary_loss_clip": 0.06451239, "auxiliary_loss_mlp": 0.01267082, "balance_loss_clip": 0.06283832, "balance_loss_mlp": 0.01253736, "epoch": 0.5267398166240793, "flos": 20956377196800.0, "grad_norm": 3.6996198998236123, "language_loss": 0.75858593, "learning_rate": 1.923878631697736e-06, "loss": 0.83576906, "num_input_tokens_seen": 188282525, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.13342285, "step": 8761, "time_per_iteration": 2.579098701477051 }, { "auxiliary_loss_clip": 0.06440622, "auxiliary_loss_mlp": 0.01267369, "balance_loss_clip": 0.06278433, "balance_loss_mlp": 0.01255806, "epoch": 0.5267999398767473, "flos": 21002763231360.0, "grad_norm": 1.7702513745165493, "language_loss": 0.70879459, "learning_rate": 1.923489453654373e-06, "loss": 0.78587455, "num_input_tokens_seen": 188301395, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.11572266, "step": 8762, "time_per_iteration": 2.5702693462371826 }, { "auxiliary_loss_clip": 0.06337613, "auxiliary_loss_mlp": 0.01255348, "balance_loss_clip": 0.06273185, "balance_loss_mlp": 0.01252958, "epoch": 0.5268600631294152, "flos": 66867935189760.0, "grad_norm": 0.9138711353165507, "language_loss": 0.65242898, "learning_rate": 1.9231002785122963e-06, "loss": 0.72835857, "num_input_tokens_seen": 188357665, "router_z_loss_clip": 0.64501953, "router_z_loss_mlp": 0.02386475, "step": 8763, "time_per_iteration": 3.093696117401123 }, { "auxiliary_loss_clip": 0.06447296, "auxiliary_loss_mlp": 0.01267716, "balance_loss_clip": 0.06285317, "balance_loss_mlp": 0.01255342, "epoch": 0.5269201863820833, "flos": 17171307434880.0, "grad_norm": 1.6699428372007041, "language_loss": 0.7134617, "learning_rate": 1.922711106286265e-06, "loss": 0.7906118, "num_input_tokens_seen": 188376935, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.12371826, "step": 8764, "time_per_iteration": 2.5483012199401855 }, { "auxiliary_loss_clip": 0.06444994, "auxiliary_loss_mlp": 0.01270755, "balance_loss_clip": 0.06282013, "balance_loss_mlp": 0.01257702, "epoch": 0.5269803096347513, "flos": 20528963919360.0, "grad_norm": 1.8904147770170587, "language_loss": 0.74607301, "learning_rate": 1.9223219369910368e-06, "loss": 0.8232305, "num_input_tokens_seen": 188394995, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.13049316, "step": 8765, "time_per_iteration": 2.535166025161743 }, { "auxiliary_loss_clip": 0.06446947, "auxiliary_loss_mlp": 0.01267721, "balance_loss_clip": 0.06281768, "balance_loss_mlp": 0.01254501, "epoch": 0.5270404328874192, "flos": 27237652416000.0, "grad_norm": 1.4327137551236724, "language_loss": 0.85697651, "learning_rate": 1.9219327706413677e-06, "loss": 0.93412322, "num_input_tokens_seen": 188415475, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.13214111, "step": 8766, "time_per_iteration": 2.5950002670288086 }, { "auxiliary_loss_clip": 0.06448284, "auxiliary_loss_mlp": 0.01270465, "balance_loss_clip": 0.06284599, "balance_loss_mlp": 0.01257352, "epoch": 0.5271005561400872, "flos": 23116866071040.0, "grad_norm": 1.6693778404071542, "language_loss": 0.79545033, "learning_rate": 1.921543607252017e-06, "loss": 0.87263787, "num_input_tokens_seen": 188435665, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.13128662, "step": 8767, "time_per_iteration": 2.7188010215759277 }, { "auxiliary_loss_clip": 0.06446489, "auxiliary_loss_mlp": 0.01268254, "balance_loss_clip": 0.06283107, "balance_loss_mlp": 0.01255422, "epoch": 0.5271606793927551, "flos": 22571342064000.0, "grad_norm": 2.085502529656839, "language_loss": 0.74345183, "learning_rate": 1.9211544468377394e-06, "loss": 0.8205992, "num_input_tokens_seen": 188455405, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.128479, "step": 8768, "time_per_iteration": 2.6512460708618164 }, { "auxiliary_loss_clip": 0.064461, "auxiliary_loss_mlp": 0.01272616, "balance_loss_clip": 0.06285748, "balance_loss_mlp": 0.0126085, "epoch": 0.5272208026454231, "flos": 18769166069760.0, "grad_norm": 1.7111941877446064, "language_loss": 0.74228811, "learning_rate": 1.9207652894132933e-06, "loss": 0.81947529, "num_input_tokens_seen": 188472940, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.11779785, "step": 8769, "time_per_iteration": 2.602304458618164 }, { "auxiliary_loss_clip": 0.06442688, "auxiliary_loss_mlp": 0.01266222, "balance_loss_clip": 0.06281501, "balance_loss_mlp": 0.01253908, "epoch": 0.5272809258980911, "flos": 20418358129920.0, "grad_norm": 2.0014567323536387, "language_loss": 0.7437287, "learning_rate": 1.920376134993436e-06, "loss": 0.82081783, "num_input_tokens_seen": 188493035, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.12322998, "step": 8770, "time_per_iteration": 2.573741912841797 }, { "auxiliary_loss_clip": 0.06444417, "auxiliary_loss_mlp": 0.01270168, "balance_loss_clip": 0.06284352, "balance_loss_mlp": 0.01258063, "epoch": 0.5273410491507591, "flos": 28264271040000.0, "grad_norm": 1.8411057862762485, "language_loss": 0.68340755, "learning_rate": 1.9199869835929224e-06, "loss": 0.76055342, "num_input_tokens_seen": 188513860, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.12109375, "step": 8771, "time_per_iteration": 2.621375560760498 }, { "auxiliary_loss_clip": 0.06439237, "auxiliary_loss_mlp": 0.01271725, "balance_loss_clip": 0.06280149, "balance_loss_mlp": 0.01259828, "epoch": 0.527401172403427, "flos": 22461658669440.0, "grad_norm": 1.8184012228729631, "language_loss": 0.76553333, "learning_rate": 1.9195978352265115e-06, "loss": 0.8426429, "num_input_tokens_seen": 188533345, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.11889648, "step": 8772, "time_per_iteration": 2.563230276107788 }, { "auxiliary_loss_clip": 0.06447911, "auxiliary_loss_mlp": 0.0127264, "balance_loss_clip": 0.06284837, "balance_loss_mlp": 0.0126023, "epoch": 0.527461295656095, "flos": 21037158132480.0, "grad_norm": 1.7890654756074764, "language_loss": 0.66735959, "learning_rate": 1.9192086899089585e-06, "loss": 0.74456513, "num_input_tokens_seen": 188551550, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.12420654, "step": 8773, "time_per_iteration": 2.5682151317596436 }, { "auxiliary_loss_clip": 0.06445546, "auxiliary_loss_mlp": 0.01271237, "balance_loss_clip": 0.06282452, "balance_loss_mlp": 0.01259352, "epoch": 0.5275214189087629, "flos": 26329060667520.0, "grad_norm": 1.5663467857025504, "language_loss": 0.86518592, "learning_rate": 1.91881954765502e-06, "loss": 0.94235379, "num_input_tokens_seen": 188571615, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.11883545, "step": 8774, "time_per_iteration": 2.5835931301116943 }, { "auxiliary_loss_clip": 0.06442985, "auxiliary_loss_mlp": 0.01272057, "balance_loss_clip": 0.06281659, "balance_loss_mlp": 0.01259939, "epoch": 0.5275815421614309, "flos": 20053110182400.0, "grad_norm": 1.5060792920859654, "language_loss": 0.79806, "learning_rate": 1.9184304084794523e-06, "loss": 0.87521046, "num_input_tokens_seen": 188591965, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.12115479, "step": 8775, "time_per_iteration": 2.53548002243042 }, { "auxiliary_loss_clip": 0.06439974, "auxiliary_loss_mlp": 0.01272028, "balance_loss_clip": 0.06281853, "balance_loss_mlp": 0.0125947, "epoch": 0.5276416654140988, "flos": 21438310354560.0, "grad_norm": 1.6194239012371117, "language_loss": 0.83897787, "learning_rate": 1.918041272397012e-06, "loss": 0.91609788, "num_input_tokens_seen": 188610675, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.12548828, "step": 8776, "time_per_iteration": 2.566213607788086 }, { "auxiliary_loss_clip": 0.06442088, "auxiliary_loss_mlp": 0.01270392, "balance_loss_clip": 0.06279522, "balance_loss_mlp": 0.0125738, "epoch": 0.5277017886667669, "flos": 17170762383360.0, "grad_norm": 1.7040164439658054, "language_loss": 0.67920995, "learning_rate": 1.9176521394224547e-06, "loss": 0.75633478, "num_input_tokens_seen": 188628235, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.13018799, "step": 8777, "time_per_iteration": 2.515533924102783 }, { "auxiliary_loss_clip": 0.06443511, "auxiliary_loss_mlp": 0.01266219, "balance_loss_clip": 0.06283764, "balance_loss_mlp": 0.01254578, "epoch": 0.5277619119194349, "flos": 20454262404480.0, "grad_norm": 1.4257335444251231, "language_loss": 0.81997889, "learning_rate": 1.9172630095705358e-06, "loss": 0.89707613, "num_input_tokens_seen": 188648925, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.11633301, "step": 8778, "time_per_iteration": 2.5392470359802246 }, { "auxiliary_loss_clip": 0.06447135, "auxiliary_loss_mlp": 0.01272167, "balance_loss_clip": 0.06282836, "balance_loss_mlp": 0.01258952, "epoch": 0.5278220351721028, "flos": 24067944639360.0, "grad_norm": 2.1122386621550704, "language_loss": 0.79986471, "learning_rate": 1.916873882856013e-06, "loss": 0.87705779, "num_input_tokens_seen": 188668125, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.13220215, "step": 8779, "time_per_iteration": 2.561648368835449 }, { "auxiliary_loss_clip": 0.06437574, "auxiliary_loss_mlp": 0.01264981, "balance_loss_clip": 0.06278637, "balance_loss_mlp": 0.0125424, "epoch": 0.5278821584247708, "flos": 24649540629120.0, "grad_norm": 2.062054970813329, "language_loss": 0.77565575, "learning_rate": 1.9164847592936406e-06, "loss": 0.85268128, "num_input_tokens_seen": 188684410, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.10736084, "step": 8780, "time_per_iteration": 2.5460870265960693 }, { "auxiliary_loss_clip": 0.06455675, "auxiliary_loss_mlp": 0.01267834, "balance_loss_clip": 0.06290319, "balance_loss_mlp": 0.01255293, "epoch": 0.5279422816774387, "flos": 35417017848960.0, "grad_norm": 1.8329595217580215, "language_loss": 0.69825566, "learning_rate": 1.916095638898174e-06, "loss": 0.77549076, "num_input_tokens_seen": 188706130, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.12542725, "step": 8781, "time_per_iteration": 2.67598819732666 }, { "auxiliary_loss_clip": 0.06435199, "auxiliary_loss_mlp": 0.01271097, "balance_loss_clip": 0.06277641, "balance_loss_mlp": 0.01259826, "epoch": 0.5280024049301068, "flos": 22973794024320.0, "grad_norm": 1.6037224390668592, "language_loss": 0.72687006, "learning_rate": 1.9157065216843696e-06, "loss": 0.80393308, "num_input_tokens_seen": 188725030, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.11273193, "step": 8782, "time_per_iteration": 3.989259719848633 }, { "auxiliary_loss_clip": 0.06447082, "auxiliary_loss_mlp": 0.01271406, "balance_loss_clip": 0.0628646, "balance_loss_mlp": 0.01259467, "epoch": 0.5280625281827747, "flos": 21514143899520.0, "grad_norm": 1.6708914851915024, "language_loss": 0.68514258, "learning_rate": 1.915317407666982e-06, "loss": 0.76232743, "num_input_tokens_seen": 188744325, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.11938477, "step": 8783, "time_per_iteration": 4.019125938415527 }, { "auxiliary_loss_clip": 0.064603, "auxiliary_loss_mlp": 0.01269813, "balance_loss_clip": 0.06289381, "balance_loss_mlp": 0.01255257, "epoch": 0.5281226514354427, "flos": 31215534422400.0, "grad_norm": 1.9490238940549156, "language_loss": 0.69552284, "learning_rate": 1.9149282968607674e-06, "loss": 0.77282399, "num_input_tokens_seen": 188765100, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.14569092, "step": 8784, "time_per_iteration": 2.6268229484558105 }, { "auxiliary_loss_clip": 0.06453589, "auxiliary_loss_mlp": 0.01272372, "balance_loss_clip": 0.06281903, "balance_loss_mlp": 0.01258359, "epoch": 0.5281827746881106, "flos": 25084039576320.0, "grad_norm": 1.9328144778506533, "language_loss": 0.75123751, "learning_rate": 1.91453918928048e-06, "loss": 0.82849717, "num_input_tokens_seen": 188783995, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.14013672, "step": 8785, "time_per_iteration": 2.567030191421509 }, { "auxiliary_loss_clip": 0.06449543, "auxiliary_loss_mlp": 0.01276042, "balance_loss_clip": 0.06287471, "balance_loss_mlp": 0.01263489, "epoch": 0.5282428979407786, "flos": 20637515283840.0, "grad_norm": 1.6060537468407807, "language_loss": 0.83866787, "learning_rate": 1.9141500849408745e-06, "loss": 0.91592371, "num_input_tokens_seen": 188803120, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.12554932, "step": 8786, "time_per_iteration": 2.5485522747039795 }, { "auxiliary_loss_clip": 0.06442517, "auxiliary_loss_mlp": 0.01270736, "balance_loss_clip": 0.06286106, "balance_loss_mlp": 0.01259507, "epoch": 0.5283030211934465, "flos": 22426005957120.0, "grad_norm": 3.322053363648459, "language_loss": 0.83328092, "learning_rate": 1.9137609838567076e-06, "loss": 0.91041344, "num_input_tokens_seen": 188820960, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.11236572, "step": 8787, "time_per_iteration": 2.568965196609497 }, { "auxiliary_loss_clip": 0.06440483, "auxiliary_loss_mlp": 0.01270028, "balance_loss_clip": 0.062821, "balance_loss_mlp": 0.01258918, "epoch": 0.5283631444461145, "flos": 23620951434240.0, "grad_norm": 8.05662569812754, "language_loss": 0.8353681, "learning_rate": 1.9133718860427316e-06, "loss": 0.9124732, "num_input_tokens_seen": 188837165, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.11114502, "step": 8788, "time_per_iteration": 2.5377137660980225 }, { "auxiliary_loss_clip": 0.06444057, "auxiliary_loss_mlp": 0.01271851, "balance_loss_clip": 0.06285509, "balance_loss_mlp": 0.01258904, "epoch": 0.5284232676987825, "flos": 32680341573120.0, "grad_norm": 2.482247624768865, "language_loss": 0.75712323, "learning_rate": 1.9129827915137027e-06, "loss": 0.8342824, "num_input_tokens_seen": 188858555, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.12945557, "step": 8789, "time_per_iteration": 2.6410062313079834 }, { "auxiliary_loss_clip": 0.06451165, "auxiliary_loss_mlp": 0.01267511, "balance_loss_clip": 0.06288145, "balance_loss_mlp": 0.01255644, "epoch": 0.5284833909514505, "flos": 26768213516160.0, "grad_norm": 1.5039476744320375, "language_loss": 0.70233536, "learning_rate": 1.9125937002843754e-06, "loss": 0.77952212, "num_input_tokens_seen": 188879050, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.11877441, "step": 8790, "time_per_iteration": 2.5891318321228027 }, { "auxiliary_loss_clip": 0.06446166, "auxiliary_loss_mlp": 0.01267116, "balance_loss_clip": 0.06287172, "balance_loss_mlp": 0.01255529, "epoch": 0.5285435142041185, "flos": 22097207335680.0, "grad_norm": 1.5171803609082108, "language_loss": 0.79466909, "learning_rate": 1.9122046123695036e-06, "loss": 0.87180191, "num_input_tokens_seen": 188898885, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.11584473, "step": 8791, "time_per_iteration": 2.5546529293060303 }, { "auxiliary_loss_clip": 0.06448078, "auxiliary_loss_mlp": 0.01267611, "balance_loss_clip": 0.06288926, "balance_loss_mlp": 0.01255905, "epoch": 0.5286036374567864, "flos": 20381615314560.0, "grad_norm": 1.9077341312288012, "language_loss": 0.66337407, "learning_rate": 1.9118155277838423e-06, "loss": 0.74053097, "num_input_tokens_seen": 188917225, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.11700439, "step": 8792, "time_per_iteration": 2.525346517562866 }, { "auxiliary_loss_clip": 0.06445627, "auxiliary_loss_mlp": 0.0126728, "balance_loss_clip": 0.06285523, "balance_loss_mlp": 0.01256008, "epoch": 0.5286637607094544, "flos": 24358952269440.0, "grad_norm": 3.258491505299418, "language_loss": 0.80798542, "learning_rate": 1.9114264465421443e-06, "loss": 0.88511449, "num_input_tokens_seen": 188936120, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.11273193, "step": 8793, "time_per_iteration": 2.568157911300659 }, { "auxiliary_loss_clip": 0.06443604, "auxiliary_loss_mlp": 0.01270967, "balance_loss_clip": 0.06283137, "balance_loss_mlp": 0.0125864, "epoch": 0.5287238839621223, "flos": 17276295000960.0, "grad_norm": 1.7910530374384337, "language_loss": 0.84872895, "learning_rate": 1.9110373686591645e-06, "loss": 0.92587471, "num_input_tokens_seen": 188953405, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.12341309, "step": 8794, "time_per_iteration": 4.05438756942749 }, { "auxiliary_loss_clip": 0.06455995, "auxiliary_loss_mlp": 0.01272032, "balance_loss_clip": 0.06286666, "balance_loss_mlp": 0.01258657, "epoch": 0.5287840072147904, "flos": 17572711219200.0, "grad_norm": 2.457907483959376, "language_loss": 0.68234837, "learning_rate": 1.9106482941496564e-06, "loss": 0.75962865, "num_input_tokens_seen": 188971150, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.1338501, "step": 8795, "time_per_iteration": 2.5287892818450928 }, { "auxiliary_loss_clip": 0.06447709, "auxiliary_loss_mlp": 0.0126871, "balance_loss_clip": 0.06284882, "balance_loss_mlp": 0.0125602, "epoch": 0.5288441304674583, "flos": 18558100834560.0, "grad_norm": 1.816007554998016, "language_loss": 0.81390983, "learning_rate": 1.910259223028374e-06, "loss": 0.891074, "num_input_tokens_seen": 188989550, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.12677002, "step": 8796, "time_per_iteration": 2.5360052585601807 }, { "auxiliary_loss_clip": 0.06451046, "auxiliary_loss_mlp": 0.01268342, "balance_loss_clip": 0.06288432, "balance_loss_mlp": 0.012557, "epoch": 0.5289042537201263, "flos": 20820935871360.0, "grad_norm": 1.4753114587842995, "language_loss": 0.69195294, "learning_rate": 1.909870155310071e-06, "loss": 0.7691468, "num_input_tokens_seen": 189008795, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.12640381, "step": 8797, "time_per_iteration": 2.651125192642212 }, { "auxiliary_loss_clip": 0.06441832, "auxiliary_loss_mlp": 0.01268539, "balance_loss_clip": 0.0628636, "balance_loss_mlp": 0.01257304, "epoch": 0.5289643769727942, "flos": 15739553520000.0, "grad_norm": 1.8192079853123315, "language_loss": 0.82519424, "learning_rate": 1.9094810910095005e-06, "loss": 0.90229797, "num_input_tokens_seen": 189025540, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.11230469, "step": 8798, "time_per_iteration": 2.560708522796631 }, { "auxiliary_loss_clip": 0.06450852, "auxiliary_loss_mlp": 0.01268762, "balance_loss_clip": 0.06285755, "balance_loss_mlp": 0.01255095, "epoch": 0.5290245002254622, "flos": 19543490449920.0, "grad_norm": 1.9570524738224042, "language_loss": 0.7119171, "learning_rate": 1.9090920301414166e-06, "loss": 0.78911322, "num_input_tokens_seen": 189044885, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.13677979, "step": 8799, "time_per_iteration": 2.546560525894165 }, { "auxiliary_loss_clip": 0.06438192, "auxiliary_loss_mlp": 0.01267964, "balance_loss_clip": 0.0628357, "balance_loss_mlp": 0.01256145, "epoch": 0.5290846234781301, "flos": 15820586017920.0, "grad_norm": 1.9228370451117989, "language_loss": 0.69307512, "learning_rate": 1.9087029727205716e-06, "loss": 0.77013671, "num_input_tokens_seen": 189061280, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.1182251, "step": 8800, "time_per_iteration": 2.522714853286743 }, { "auxiliary_loss_clip": 0.06340157, "auxiliary_loss_mlp": 0.01253729, "balance_loss_clip": 0.06275759, "balance_loss_mlp": 0.01251512, "epoch": 0.5291447467307981, "flos": 70076272498560.0, "grad_norm": 0.8844085475974558, "language_loss": 0.57009989, "learning_rate": 1.9083139187617193e-06, "loss": 0.64603877, "num_input_tokens_seen": 189114775, "router_z_loss_clip": 0.64697266, "router_z_loss_mlp": 0.0222168, "step": 8801, "time_per_iteration": 3.070338726043701 }, { "auxiliary_loss_clip": 0.06452198, "auxiliary_loss_mlp": 0.01268398, "balance_loss_clip": 0.06289288, "balance_loss_mlp": 0.01256548, "epoch": 0.529204869983466, "flos": 28371396885120.0, "grad_norm": 1.4893972994287932, "language_loss": 0.64343393, "learning_rate": 1.9079248682796123e-06, "loss": 0.72063994, "num_input_tokens_seen": 189134700, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.11834717, "step": 8802, "time_per_iteration": 2.614185333251953 }, { "auxiliary_loss_clip": 0.06445796, "auxiliary_loss_mlp": 0.01268683, "balance_loss_clip": 0.06286921, "balance_loss_mlp": 0.01257162, "epoch": 0.5292649932361341, "flos": 33766064853120.0, "grad_norm": 1.529461994429876, "language_loss": 0.69061649, "learning_rate": 1.907535821289003e-06, "loss": 0.76776129, "num_input_tokens_seen": 189155365, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.11529541, "step": 8803, "time_per_iteration": 2.6603751182556152 }, { "auxiliary_loss_clip": 0.06444743, "auxiliary_loss_mlp": 0.01269771, "balance_loss_clip": 0.0628651, "balance_loss_mlp": 0.01257039, "epoch": 0.5293251164888021, "flos": 20453717352960.0, "grad_norm": 1.5835641713349033, "language_loss": 0.7653079, "learning_rate": 1.9071467778046458e-06, "loss": 0.84245306, "num_input_tokens_seen": 189173885, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.12738037, "step": 8804, "time_per_iteration": 2.5674283504486084 }, { "auxiliary_loss_clip": 0.06339926, "auxiliary_loss_mlp": 0.01253497, "balance_loss_clip": 0.06276458, "balance_loss_mlp": 0.01251209, "epoch": 0.52938523974147, "flos": 66567856590720.0, "grad_norm": 0.753675298397846, "language_loss": 0.52798122, "learning_rate": 1.906757737841291e-06, "loss": 0.60391545, "num_input_tokens_seen": 189236515, "router_z_loss_clip": 0.63427734, "router_z_loss_mlp": 0.02285767, "step": 8805, "time_per_iteration": 3.2725095748901367 }, { "auxiliary_loss_clip": 0.06333268, "auxiliary_loss_mlp": 0.01256621, "balance_loss_clip": 0.06269468, "balance_loss_mlp": 0.01254583, "epoch": 0.529445362994138, "flos": 67172065983360.0, "grad_norm": 0.7264528340678983, "language_loss": 0.63763642, "learning_rate": 1.906368701413693e-06, "loss": 0.71353531, "num_input_tokens_seen": 189300500, "router_z_loss_clip": 0.63964844, "router_z_loss_mlp": 0.02038574, "step": 8806, "time_per_iteration": 3.2544479370117188 }, { "auxiliary_loss_clip": 0.06456356, "auxiliary_loss_mlp": 0.01271529, "balance_loss_clip": 0.0628816, "balance_loss_mlp": 0.01259215, "epoch": 0.5295054862468059, "flos": 17755167484800.0, "grad_norm": 1.6519417179799443, "language_loss": 0.72632146, "learning_rate": 1.9059796685366026e-06, "loss": 0.80360031, "num_input_tokens_seen": 189319745, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.12304688, "step": 8807, "time_per_iteration": 2.5803842544555664 }, { "auxiliary_loss_clip": 0.06446011, "auxiliary_loss_mlp": 0.01269224, "balance_loss_clip": 0.06287929, "balance_loss_mlp": 0.01258238, "epoch": 0.529565609499474, "flos": 11401622519040.0, "grad_norm": 1.9581472717283805, "language_loss": 0.69180417, "learning_rate": 1.9055906392247723e-06, "loss": 0.76895654, "num_input_tokens_seen": 189334550, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.10986328, "step": 8808, "time_per_iteration": 2.524583101272583 }, { "auxiliary_loss_clip": 0.06443635, "auxiliary_loss_mlp": 0.01270815, "balance_loss_clip": 0.06283724, "balance_loss_mlp": 0.01259985, "epoch": 0.5296257327521419, "flos": 17201174215680.0, "grad_norm": 1.7213464196644224, "language_loss": 0.87154263, "learning_rate": 1.9052016134929554e-06, "loss": 0.94868714, "num_input_tokens_seen": 189351735, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.1083374, "step": 8809, "time_per_iteration": 2.5430588722229004 }, { "auxiliary_loss_clip": 0.0645957, "auxiliary_loss_mlp": 0.01268032, "balance_loss_clip": 0.06291531, "balance_loss_mlp": 0.01254156, "epoch": 0.5296858560048099, "flos": 39972806265600.0, "grad_norm": 1.6573268290215275, "language_loss": 0.64476651, "learning_rate": 1.9048125913559016e-06, "loss": 0.72204256, "num_input_tokens_seen": 189373105, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.13879395, "step": 8810, "time_per_iteration": 2.7383689880371094 }, { "auxiliary_loss_clip": 0.06442408, "auxiliary_loss_mlp": 0.01275173, "balance_loss_clip": 0.06285347, "balance_loss_mlp": 0.01263103, "epoch": 0.5297459792574778, "flos": 20968032913920.0, "grad_norm": 1.4747287390593637, "language_loss": 0.68187124, "learning_rate": 1.9044235728283646e-06, "loss": 0.75904703, "num_input_tokens_seen": 189394615, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.1206665, "step": 8811, "time_per_iteration": 2.5977933406829834 }, { "auxiliary_loss_clip": 0.06331554, "auxiliary_loss_mlp": 0.01252876, "balance_loss_clip": 0.06268304, "balance_loss_mlp": 0.01250889, "epoch": 0.5298061025101458, "flos": 66542532658560.0, "grad_norm": 0.6601467285470546, "language_loss": 0.53355533, "learning_rate": 1.9040345579250953e-06, "loss": 0.60939962, "num_input_tokens_seen": 189459750, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.01986694, "step": 8812, "time_per_iteration": 3.314509153366089 }, { "auxiliary_loss_clip": 0.06329799, "auxiliary_loss_mlp": 0.01252348, "balance_loss_clip": 0.06266759, "balance_loss_mlp": 0.0125032, "epoch": 0.5298662257628137, "flos": 67683488578560.0, "grad_norm": 0.7102695881009826, "language_loss": 0.56403571, "learning_rate": 1.9036455466608453e-06, "loss": 0.63985717, "num_input_tokens_seen": 189527540, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.02027893, "step": 8813, "time_per_iteration": 3.25911283493042 }, { "auxiliary_loss_clip": 0.06436104, "auxiliary_loss_mlp": 0.01265277, "balance_loss_clip": 0.06281776, "balance_loss_mlp": 0.01254608, "epoch": 0.5299263490154817, "flos": 19652544938880.0, "grad_norm": 1.5465327166165037, "language_loss": 0.82284224, "learning_rate": 1.9032565390503657e-06, "loss": 0.89985603, "num_input_tokens_seen": 189546900, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.10662842, "step": 8814, "time_per_iteration": 2.550999402999878 }, { "auxiliary_loss_clip": 0.06450574, "auxiliary_loss_mlp": 0.012676, "balance_loss_clip": 0.06284552, "balance_loss_mlp": 0.01255799, "epoch": 0.5299864722681497, "flos": 22061638477440.0, "grad_norm": 1.5352119190585463, "language_loss": 0.85296732, "learning_rate": 1.9028675351084076e-06, "loss": 0.93014908, "num_input_tokens_seen": 189566490, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.11798096, "step": 8815, "time_per_iteration": 2.707808494567871 }, { "auxiliary_loss_clip": 0.06440158, "auxiliary_loss_mlp": 0.0126775, "balance_loss_clip": 0.06283081, "balance_loss_mlp": 0.01255555, "epoch": 0.5300465955208177, "flos": 21770379285120.0, "grad_norm": 1.9601790395121053, "language_loss": 0.66162598, "learning_rate": 1.9024785348497225e-06, "loss": 0.73870504, "num_input_tokens_seen": 189585580, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.12194824, "step": 8816, "time_per_iteration": 2.596273183822632 }, { "auxiliary_loss_clip": 0.06444067, "auxiliary_loss_mlp": 0.01274003, "balance_loss_clip": 0.06285049, "balance_loss_mlp": 0.0126253, "epoch": 0.5301067187734857, "flos": 43006401884160.0, "grad_norm": 1.8205526493322306, "language_loss": 0.72659415, "learning_rate": 1.9020895382890611e-06, "loss": 0.80377483, "num_input_tokens_seen": 189608485, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.11480713, "step": 8817, "time_per_iteration": 2.740757942199707 }, { "auxiliary_loss_clip": 0.06444091, "auxiliary_loss_mlp": 0.01271602, "balance_loss_clip": 0.0628228, "balance_loss_mlp": 0.01259663, "epoch": 0.5301668420261536, "flos": 20559878876160.0, "grad_norm": 1.6082990352180728, "language_loss": 0.65588212, "learning_rate": 1.9017005454411743e-06, "loss": 0.73303902, "num_input_tokens_seen": 189627815, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.11938477, "step": 8818, "time_per_iteration": 2.605483055114746 }, { "auxiliary_loss_clip": 0.06444196, "auxiliary_loss_mlp": 0.0127131, "balance_loss_clip": 0.06283171, "balance_loss_mlp": 0.01258775, "epoch": 0.5302269652788216, "flos": 17491259450880.0, "grad_norm": 1.9319069609462096, "language_loss": 0.75197339, "learning_rate": 1.9013115563208126e-06, "loss": 0.82912838, "num_input_tokens_seen": 189644850, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.12536621, "step": 8819, "time_per_iteration": 2.5955889225006104 }, { "auxiliary_loss_clip": 0.06447373, "auxiliary_loss_mlp": 0.01268707, "balance_loss_clip": 0.06281361, "balance_loss_mlp": 0.01256256, "epoch": 0.5302870885314895, "flos": 14579380287360.0, "grad_norm": 1.7535464033366628, "language_loss": 0.82465351, "learning_rate": 1.9009225709427267e-06, "loss": 0.90181434, "num_input_tokens_seen": 189660945, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.12451172, "step": 8820, "time_per_iteration": 2.5245442390441895 }, { "auxiliary_loss_clip": 0.06440991, "auxiliary_loss_mlp": 0.01272542, "balance_loss_clip": 0.06279318, "balance_loss_mlp": 0.01261611, "epoch": 0.5303472117841576, "flos": 23444323027200.0, "grad_norm": 1.4196443146237416, "language_loss": 0.72679377, "learning_rate": 1.9005335893216667e-06, "loss": 0.80392909, "num_input_tokens_seen": 189680425, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.10931396, "step": 8821, "time_per_iteration": 4.158886194229126 }, { "auxiliary_loss_clip": 0.06437512, "auxiliary_loss_mlp": 0.01270808, "balance_loss_clip": 0.06280375, "balance_loss_mlp": 0.01260097, "epoch": 0.5304073350368255, "flos": 22715294578560.0, "grad_norm": 1.7380661647429714, "language_loss": 0.74423218, "learning_rate": 1.9001446114723824e-06, "loss": 0.82131535, "num_input_tokens_seen": 189700375, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.10699463, "step": 8822, "time_per_iteration": 2.550379991531372 }, { "auxiliary_loss_clip": 0.06442653, "auxiliary_loss_mlp": 0.01270187, "balance_loss_clip": 0.06283072, "balance_loss_mlp": 0.01258093, "epoch": 0.5304674582894935, "flos": 27936059397120.0, "grad_norm": 1.6640358911048292, "language_loss": 0.67395341, "learning_rate": 1.8997556374096257e-06, "loss": 0.75108176, "num_input_tokens_seen": 189721225, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.12091064, "step": 8823, "time_per_iteration": 3.988903284072876 }, { "auxiliary_loss_clip": 0.06447691, "auxiliary_loss_mlp": 0.0126848, "balance_loss_clip": 0.06281792, "balance_loss_mlp": 0.01256195, "epoch": 0.5305275815421614, "flos": 21256860337920.0, "grad_norm": 1.602707605424863, "language_loss": 0.69179749, "learning_rate": 1.8993666671481444e-06, "loss": 0.76895922, "num_input_tokens_seen": 189740170, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.1229248, "step": 8824, "time_per_iteration": 2.5548648834228516 }, { "auxiliary_loss_clip": 0.06435043, "auxiliary_loss_mlp": 0.01268416, "balance_loss_clip": 0.06281009, "balance_loss_mlp": 0.01257604, "epoch": 0.5305877047948294, "flos": 17608867056000.0, "grad_norm": 1.9913354667764513, "language_loss": 0.76296568, "learning_rate": 1.898977700702689e-06, "loss": 0.84000027, "num_input_tokens_seen": 189757890, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.10821533, "step": 8825, "time_per_iteration": 2.516822576522827 }, { "auxiliary_loss_clip": 0.06439141, "auxiliary_loss_mlp": 0.01268018, "balance_loss_clip": 0.06281662, "balance_loss_mlp": 0.01256884, "epoch": 0.5306478280474973, "flos": 15200947474560.0, "grad_norm": 2.572827901064922, "language_loss": 0.85926187, "learning_rate": 1.8985887380880103e-06, "loss": 0.93633342, "num_input_tokens_seen": 189775390, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.11138916, "step": 8826, "time_per_iteration": 2.5277130603790283 }, { "auxiliary_loss_clip": 0.06438809, "auxiliary_loss_mlp": 0.01266773, "balance_loss_clip": 0.06282507, "balance_loss_mlp": 0.01254727, "epoch": 0.5307079513001653, "flos": 15346660924800.0, "grad_norm": 1.9583380077343828, "language_loss": 0.64546561, "learning_rate": 1.8981997793188558e-06, "loss": 0.72252142, "num_input_tokens_seen": 189793975, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.12054443, "step": 8827, "time_per_iteration": 2.5299570560455322 }, { "auxiliary_loss_clip": 0.06441794, "auxiliary_loss_mlp": 0.01272193, "balance_loss_clip": 0.06279107, "balance_loss_mlp": 0.01259968, "epoch": 0.5307680745528333, "flos": 43554567294720.0, "grad_norm": 1.9505033751295713, "language_loss": 0.60326809, "learning_rate": 1.8978108244099762e-06, "loss": 0.68040794, "num_input_tokens_seen": 189817870, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.12237549, "step": 8828, "time_per_iteration": 2.76613187789917 }, { "auxiliary_loss_clip": 0.06444658, "auxiliary_loss_mlp": 0.01267461, "balance_loss_clip": 0.06281991, "balance_loss_mlp": 0.01255963, "epoch": 0.5308281978055013, "flos": 20055332315520.0, "grad_norm": 1.6988125454912057, "language_loss": 0.82118106, "learning_rate": 1.8974218733761208e-06, "loss": 0.89830226, "num_input_tokens_seen": 189837905, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.11480713, "step": 8829, "time_per_iteration": 2.5733225345611572 }, { "auxiliary_loss_clip": 0.06436095, "auxiliary_loss_mlp": 0.01270618, "balance_loss_clip": 0.06280991, "balance_loss_mlp": 0.01259669, "epoch": 0.5308883210581693, "flos": 20710162373760.0, "grad_norm": 1.532145910375898, "language_loss": 0.78336787, "learning_rate": 1.8970329262320375e-06, "loss": 0.86043501, "num_input_tokens_seen": 189856970, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.10955811, "step": 8830, "time_per_iteration": 2.5742979049682617 }, { "auxiliary_loss_clip": 0.06439617, "auxiliary_loss_mlp": 0.01270358, "balance_loss_clip": 0.06280953, "balance_loss_mlp": 0.01258813, "epoch": 0.5309484443108372, "flos": 14360684330880.0, "grad_norm": 2.1950740238560216, "language_loss": 0.81507057, "learning_rate": 1.8966439829924768e-06, "loss": 0.89217031, "num_input_tokens_seen": 189872830, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.11547852, "step": 8831, "time_per_iteration": 2.517874002456665 }, { "auxiliary_loss_clip": 0.06437777, "auxiliary_loss_mlp": 0.01266722, "balance_loss_clip": 0.06280176, "balance_loss_mlp": 0.0125554, "epoch": 0.5310085675635052, "flos": 20016577002240.0, "grad_norm": 4.714216991831957, "language_loss": 0.7341274, "learning_rate": 1.896255043672186e-06, "loss": 0.81117237, "num_input_tokens_seen": 189891635, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.11187744, "step": 8832, "time_per_iteration": 2.5534090995788574 }, { "auxiliary_loss_clip": 0.06442516, "auxiliary_loss_mlp": 0.01271768, "balance_loss_clip": 0.06279318, "balance_loss_mlp": 0.01259197, "epoch": 0.5310686908161731, "flos": 22133824369920.0, "grad_norm": 2.0391258316835756, "language_loss": 0.75637317, "learning_rate": 1.8958661082859143e-06, "loss": 0.833516, "num_input_tokens_seen": 189909050, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.12573242, "step": 8833, "time_per_iteration": 4.060806512832642 }, { "auxiliary_loss_clip": 0.06443371, "auxiliary_loss_mlp": 0.01268123, "balance_loss_clip": 0.06281541, "balance_loss_mlp": 0.01256339, "epoch": 0.5311288140688412, "flos": 24724871049600.0, "grad_norm": 2.441525237997061, "language_loss": 0.74002481, "learning_rate": 1.8954771768484103e-06, "loss": 0.81713974, "num_input_tokens_seen": 189927405, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.11785889, "step": 8834, "time_per_iteration": 2.6050264835357666 }, { "auxiliary_loss_clip": 0.06450397, "auxiliary_loss_mlp": 0.01270646, "balance_loss_clip": 0.06282888, "balance_loss_mlp": 0.01258129, "epoch": 0.5311889373215091, "flos": 24104603600640.0, "grad_norm": 1.8170743326701144, "language_loss": 0.77892971, "learning_rate": 1.8950882493744226e-06, "loss": 0.85614014, "num_input_tokens_seen": 189947740, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.12506104, "step": 8835, "time_per_iteration": 2.5670974254608154 }, { "auxiliary_loss_clip": 0.06441563, "auxiliary_loss_mlp": 0.01271764, "balance_loss_clip": 0.0628034, "balance_loss_mlp": 0.01259468, "epoch": 0.5312490605741771, "flos": 22023386288640.0, "grad_norm": 1.4914389357764621, "language_loss": 0.72722161, "learning_rate": 1.8946993258786985e-06, "loss": 0.80435485, "num_input_tokens_seen": 189966495, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.1229248, "step": 8836, "time_per_iteration": 2.560701847076416 }, { "auxiliary_loss_clip": 0.06441975, "auxiliary_loss_mlp": 0.01274154, "balance_loss_clip": 0.06280509, "balance_loss_mlp": 0.01261351, "epoch": 0.531309183826845, "flos": 19396561115520.0, "grad_norm": 2.0100936303423085, "language_loss": 0.81008101, "learning_rate": 1.894310406375987e-06, "loss": 0.88724232, "num_input_tokens_seen": 189985325, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.12780762, "step": 8837, "time_per_iteration": 2.5323610305786133 }, { "auxiliary_loss_clip": 0.06438459, "auxiliary_loss_mlp": 0.01268186, "balance_loss_clip": 0.06283221, "balance_loss_mlp": 0.01256247, "epoch": 0.531369307079513, "flos": 20195679104640.0, "grad_norm": 1.811865661256792, "language_loss": 0.85596335, "learning_rate": 1.893921490881035e-06, "loss": 0.93302989, "num_input_tokens_seen": 190003290, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.11938477, "step": 8838, "time_per_iteration": 2.5526463985443115 }, { "auxiliary_loss_clip": 0.06435448, "auxiliary_loss_mlp": 0.01267254, "balance_loss_clip": 0.06278903, "balance_loss_mlp": 0.01256257, "epoch": 0.5314294303321809, "flos": 18886144769280.0, "grad_norm": 1.820038779253487, "language_loss": 0.7330482, "learning_rate": 1.8935325794085906e-06, "loss": 0.81007516, "num_input_tokens_seen": 190023260, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.10986328, "step": 8839, "time_per_iteration": 2.547438144683838 }, { "auxiliary_loss_clip": 0.06440011, "auxiliary_loss_mlp": 0.01270117, "balance_loss_clip": 0.0627917, "balance_loss_mlp": 0.01258423, "epoch": 0.531489553584849, "flos": 23046818457600.0, "grad_norm": 1.5472568830728195, "language_loss": 0.76837164, "learning_rate": 1.8931436719734023e-06, "loss": 0.84547293, "num_input_tokens_seen": 190042035, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.11688232, "step": 8840, "time_per_iteration": 2.549039125442505 }, { "auxiliary_loss_clip": 0.06444012, "auxiliary_loss_mlp": 0.0126593, "balance_loss_clip": 0.06283665, "balance_loss_mlp": 0.01254397, "epoch": 0.5315496768375169, "flos": 19796329745280.0, "grad_norm": 1.9410832235089, "language_loss": 0.77723658, "learning_rate": 1.892754768590216e-06, "loss": 0.85433602, "num_input_tokens_seen": 190057545, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.11529541, "step": 8841, "time_per_iteration": 2.507127285003662 }, { "auxiliary_loss_clip": 0.0632963, "auxiliary_loss_mlp": 0.01257123, "balance_loss_clip": 0.06265988, "balance_loss_mlp": 0.01255241, "epoch": 0.5316098000901849, "flos": 71044876569600.0, "grad_norm": 0.6786446494760888, "language_loss": 0.56600749, "learning_rate": 1.8923658692737793e-06, "loss": 0.64187503, "num_input_tokens_seen": 190123800, "router_z_loss_clip": 0.63964844, "router_z_loss_mlp": 0.01878357, "step": 8842, "time_per_iteration": 3.3330798149108887 }, { "auxiliary_loss_clip": 0.06445473, "auxiliary_loss_mlp": 0.0126736, "balance_loss_clip": 0.06282876, "balance_loss_mlp": 0.01254002, "epoch": 0.5316699233428529, "flos": 16441146956160.0, "grad_norm": 1.858115017332081, "language_loss": 0.73676491, "learning_rate": 1.8919769740388407e-06, "loss": 0.8138932, "num_input_tokens_seen": 190141625, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.13360596, "step": 8843, "time_per_iteration": 2.5257043838500977 }, { "auxiliary_loss_clip": 0.06323563, "auxiliary_loss_mlp": 0.01252688, "balance_loss_clip": 0.06259757, "balance_loss_mlp": 0.01250681, "epoch": 0.5317300465955208, "flos": 67443478957440.0, "grad_norm": 0.8759333914707823, "language_loss": 0.61030114, "learning_rate": 1.891588082900145e-06, "loss": 0.68606359, "num_input_tokens_seen": 190198110, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 0.02006531, "step": 8844, "time_per_iteration": 3.2572214603424072 }, { "auxiliary_loss_clip": 0.06328143, "auxiliary_loss_mlp": 0.01252643, "balance_loss_clip": 0.06264536, "balance_loss_mlp": 0.01250587, "epoch": 0.5317901698481888, "flos": 59524095144960.0, "grad_norm": 0.8233165640406613, "language_loss": 0.6208601, "learning_rate": 1.8911991958724411e-06, "loss": 0.69666791, "num_input_tokens_seen": 190259950, "router_z_loss_clip": 0.63720703, "router_z_loss_mlp": 0.02056885, "step": 8845, "time_per_iteration": 3.2120888233184814 }, { "auxiliary_loss_clip": 0.0643831, "auxiliary_loss_mlp": 0.01271921, "balance_loss_clip": 0.06281532, "balance_loss_mlp": 0.01259118, "epoch": 0.5318502931008567, "flos": 19134204382080.0, "grad_norm": 3.0938775510633145, "language_loss": 0.75395787, "learning_rate": 1.890810312970474e-06, "loss": 0.83106029, "num_input_tokens_seen": 190278265, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.12799072, "step": 8846, "time_per_iteration": 2.5858523845672607 }, { "auxiliary_loss_clip": 0.06444, "auxiliary_loss_mlp": 0.01266854, "balance_loss_clip": 0.06284328, "balance_loss_mlp": 0.01255601, "epoch": 0.5319104163535248, "flos": 24687960526080.0, "grad_norm": 1.576172586740539, "language_loss": 0.75658166, "learning_rate": 1.8904214342089903e-06, "loss": 0.83369017, "num_input_tokens_seen": 190298400, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.11248779, "step": 8847, "time_per_iteration": 2.598515272140503 }, { "auxiliary_loss_clip": 0.0643499, "auxiliary_loss_mlp": 0.01265918, "balance_loss_clip": 0.06277719, "balance_loss_mlp": 0.01254575, "epoch": 0.5319705396061927, "flos": 19390691329920.0, "grad_norm": 1.760274975387857, "language_loss": 0.87919974, "learning_rate": 1.8900325596027378e-06, "loss": 0.95620883, "num_input_tokens_seen": 190316235, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.11340332, "step": 8848, "time_per_iteration": 2.553821086883545 }, { "auxiliary_loss_clip": 0.06444439, "auxiliary_loss_mlp": 0.01272188, "balance_loss_clip": 0.06282628, "balance_loss_mlp": 0.01259046, "epoch": 0.5320306628588607, "flos": 18265122633600.0, "grad_norm": 1.9480140372146375, "language_loss": 0.74283862, "learning_rate": 1.8896436891664609e-06, "loss": 0.82000488, "num_input_tokens_seen": 190335060, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.13146973, "step": 8849, "time_per_iteration": 2.5415046215057373 }, { "auxiliary_loss_clip": 0.06447138, "auxiliary_loss_mlp": 0.01265272, "balance_loss_clip": 0.06282371, "balance_loss_mlp": 0.01253888, "epoch": 0.5320907861115286, "flos": 23739062163840.0, "grad_norm": 2.1587532031459364, "language_loss": 0.7994318, "learning_rate": 1.8892548229149066e-06, "loss": 0.87655592, "num_input_tokens_seen": 190353265, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.1138916, "step": 8850, "time_per_iteration": 2.5703868865966797 }, { "auxiliary_loss_clip": 0.06442219, "auxiliary_loss_mlp": 0.01264433, "balance_loss_clip": 0.06282154, "balance_loss_mlp": 0.01253651, "epoch": 0.5321509093641966, "flos": 34503730272000.0, "grad_norm": 1.3361880617802269, "language_loss": 0.55071044, "learning_rate": 1.888865960862821e-06, "loss": 0.62777692, "num_input_tokens_seen": 190376575, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.10778809, "step": 8851, "time_per_iteration": 2.702183961868286 }, { "auxiliary_loss_clip": 0.06442823, "auxiliary_loss_mlp": 0.01268966, "balance_loss_clip": 0.06283152, "balance_loss_mlp": 0.01257778, "epoch": 0.5322110326168645, "flos": 20017080126720.0, "grad_norm": 1.7903559978101975, "language_loss": 0.68991113, "learning_rate": 1.8884771030249484e-06, "loss": 0.76702905, "num_input_tokens_seen": 190395185, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.11187744, "step": 8852, "time_per_iteration": 2.5399067401885986 }, { "auxiliary_loss_clip": 0.06322798, "auxiliary_loss_mlp": 0.01253971, "balance_loss_clip": 0.06259579, "balance_loss_mlp": 0.01251918, "epoch": 0.5322711558695326, "flos": 64650563792640.0, "grad_norm": 36.162404606781045, "language_loss": 0.62530279, "learning_rate": 1.8880882494160357e-06, "loss": 0.70107049, "num_input_tokens_seen": 190452595, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.02053833, "step": 8853, "time_per_iteration": 3.1296825408935547 }, { "auxiliary_loss_clip": 0.06448865, "auxiliary_loss_mlp": 0.01269187, "balance_loss_clip": 0.06283807, "balance_loss_mlp": 0.01257225, "epoch": 0.5323312791222005, "flos": 14944628234880.0, "grad_norm": 3.1841592142192097, "language_loss": 0.79612005, "learning_rate": 1.8876994000508278e-06, "loss": 0.87330055, "num_input_tokens_seen": 190469140, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.11956787, "step": 8854, "time_per_iteration": 2.53942608833313 }, { "auxiliary_loss_clip": 0.06432704, "auxiliary_loss_mlp": 0.012671, "balance_loss_clip": 0.062805, "balance_loss_mlp": 0.0125717, "epoch": 0.5323914023748685, "flos": 23447593336320.0, "grad_norm": 1.8407431018249782, "language_loss": 0.73972785, "learning_rate": 1.8873105549440698e-06, "loss": 0.81672585, "num_input_tokens_seen": 190489015, "router_z_loss_clip": 1.52246094, "router_z_loss_mlp": 0.09936523, "step": 8855, "time_per_iteration": 2.56756591796875 }, { "auxiliary_loss_clip": 0.06435014, "auxiliary_loss_mlp": 0.01263666, "balance_loss_clip": 0.06277652, "balance_loss_mlp": 0.01253348, "epoch": 0.5324515256275365, "flos": 26293324101120.0, "grad_norm": 1.8734500682875548, "language_loss": 0.65195525, "learning_rate": 1.886921714110507e-06, "loss": 0.7289421, "num_input_tokens_seen": 190508065, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.10327148, "step": 8856, "time_per_iteration": 2.623947858810425 }, { "auxiliary_loss_clip": 0.06445241, "auxiliary_loss_mlp": 0.01267589, "balance_loss_clip": 0.06280883, "balance_loss_mlp": 0.01255054, "epoch": 0.5325116488802044, "flos": 26878316181120.0, "grad_norm": 1.8407589623817908, "language_loss": 0.77681792, "learning_rate": 1.8865328775648842e-06, "loss": 0.85394621, "num_input_tokens_seen": 190527045, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.12536621, "step": 8857, "time_per_iteration": 2.580780029296875 }, { "auxiliary_loss_clip": 0.06436329, "auxiliary_loss_mlp": 0.01267231, "balance_loss_clip": 0.06277096, "balance_loss_mlp": 0.01255894, "epoch": 0.5325717721328724, "flos": 25891794535680.0, "grad_norm": 3.2507235529914893, "language_loss": 0.71373534, "learning_rate": 1.8861440453219456e-06, "loss": 0.79077089, "num_input_tokens_seen": 190544075, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.11340332, "step": 8858, "time_per_iteration": 2.5984551906585693 }, { "auxiliary_loss_clip": 0.06437364, "auxiliary_loss_mlp": 0.01270217, "balance_loss_clip": 0.06279386, "balance_loss_mlp": 0.01257664, "epoch": 0.5326318953855403, "flos": 21805864289280.0, "grad_norm": 1.5547680001545932, "language_loss": 0.69527656, "learning_rate": 1.8857552173964367e-06, "loss": 0.77235234, "num_input_tokens_seen": 190566030, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.12561035, "step": 8859, "time_per_iteration": 2.591583013534546 }, { "auxiliary_loss_clip": 0.06430704, "auxiliary_loss_mlp": 0.01266964, "balance_loss_clip": 0.06279108, "balance_loss_mlp": 0.01256718, "epoch": 0.5326920186382084, "flos": 20929193746560.0, "grad_norm": 1.4421134903464707, "language_loss": 0.69469869, "learning_rate": 1.8853663938031013e-06, "loss": 0.77167541, "num_input_tokens_seen": 190585605, "router_z_loss_clip": 1.51464844, "router_z_loss_mlp": 0.10247803, "step": 8860, "time_per_iteration": 2.573094367980957 }, { "auxiliary_loss_clip": 0.06436636, "auxiliary_loss_mlp": 0.01267109, "balance_loss_clip": 0.0627904, "balance_loss_mlp": 0.01255975, "epoch": 0.5327521418908763, "flos": 21439735873920.0, "grad_norm": 1.6934951839847636, "language_loss": 0.78329104, "learning_rate": 1.884977574556683e-06, "loss": 0.86032844, "num_input_tokens_seen": 190604625, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.11132812, "step": 8861, "time_per_iteration": 3.990042209625244 }, { "auxiliary_loss_clip": 0.06441173, "auxiliary_loss_mlp": 0.01269513, "balance_loss_clip": 0.06281633, "balance_loss_mlp": 0.01257956, "epoch": 0.5328122651435443, "flos": 21766354289280.0, "grad_norm": 1.469835305036732, "language_loss": 0.85769355, "learning_rate": 1.8845887596719279e-06, "loss": 0.93480039, "num_input_tokens_seen": 190625060, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.11553955, "step": 8862, "time_per_iteration": 4.126881837844849 }, { "auxiliary_loss_clip": 0.06446217, "auxiliary_loss_mlp": 0.01271052, "balance_loss_clip": 0.06283582, "balance_loss_mlp": 0.01258154, "epoch": 0.5328723883962122, "flos": 18302410500480.0, "grad_norm": 1.7692265454509335, "language_loss": 0.61796176, "learning_rate": 1.8841999491635778e-06, "loss": 0.69513452, "num_input_tokens_seen": 190643150, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.12896729, "step": 8863, "time_per_iteration": 2.5452442169189453 }, { "auxiliary_loss_clip": 0.06434417, "auxiliary_loss_mlp": 0.01267018, "balance_loss_clip": 0.06279703, "balance_loss_mlp": 0.01255896, "epoch": 0.5329325116488802, "flos": 25382049022080.0, "grad_norm": 1.9449723402063355, "language_loss": 0.73832083, "learning_rate": 1.883811143046377e-06, "loss": 0.81533515, "num_input_tokens_seen": 190662725, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.11114502, "step": 8864, "time_per_iteration": 2.6384215354919434 }, { "auxiliary_loss_clip": 0.06437484, "auxiliary_loss_mlp": 0.01271201, "balance_loss_clip": 0.06280167, "balance_loss_mlp": 0.0125937, "epoch": 0.5329926349015481, "flos": 25598984042880.0, "grad_norm": 1.6372040973883695, "language_loss": 0.6443882, "learning_rate": 1.8834223413350702e-06, "loss": 0.72147506, "num_input_tokens_seen": 190683680, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.11828613, "step": 8865, "time_per_iteration": 2.587496280670166 }, { "auxiliary_loss_clip": 0.06436911, "auxiliary_loss_mlp": 0.01270035, "balance_loss_clip": 0.06278002, "balance_loss_mlp": 0.01258717, "epoch": 0.5330527581542162, "flos": 22895612565120.0, "grad_norm": 5.4751637511476385, "language_loss": 0.78358299, "learning_rate": 1.8830335440443989e-06, "loss": 0.86065245, "num_input_tokens_seen": 190703350, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.11322021, "step": 8866, "time_per_iteration": 2.7466888427734375 }, { "auxiliary_loss_clip": 0.06435572, "auxiliary_loss_mlp": 0.01264831, "balance_loss_clip": 0.06277496, "balance_loss_mlp": 0.012539, "epoch": 0.5331128814068841, "flos": 16031022347520.0, "grad_norm": 8.021457084788018, "language_loss": 0.73671782, "learning_rate": 1.882644751189108e-06, "loss": 0.81372178, "num_input_tokens_seen": 190721170, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.10931396, "step": 8867, "time_per_iteration": 2.5756471157073975 }, { "auxiliary_loss_clip": 0.06434998, "auxiliary_loss_mlp": 0.01268562, "balance_loss_clip": 0.0627608, "balance_loss_mlp": 0.01255759, "epoch": 0.5331730046595521, "flos": 39353461211520.0, "grad_norm": 2.3911725903821743, "language_loss": 0.72243232, "learning_rate": 1.88225596278394e-06, "loss": 0.79946792, "num_input_tokens_seen": 190743795, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.12799072, "step": 8868, "time_per_iteration": 2.7161872386932373 }, { "auxiliary_loss_clip": 0.06437974, "auxiliary_loss_mlp": 0.01272657, "balance_loss_clip": 0.06278805, "balance_loss_mlp": 0.01261356, "epoch": 0.5332331279122201, "flos": 24031201824000.0, "grad_norm": 1.676209280688722, "language_loss": 0.78371739, "learning_rate": 1.881867178843637e-06, "loss": 0.86082369, "num_input_tokens_seen": 190761560, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.11309814, "step": 8869, "time_per_iteration": 2.57746958732605 }, { "auxiliary_loss_clip": 0.06447075, "auxiliary_loss_mlp": 0.01267758, "balance_loss_clip": 0.06282365, "balance_loss_mlp": 0.01256153, "epoch": 0.533293251164888, "flos": 17135109671040.0, "grad_norm": 1.693256532683689, "language_loss": 0.76024109, "learning_rate": 1.8814783993829434e-06, "loss": 0.83738947, "num_input_tokens_seen": 190778875, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.11602783, "step": 8870, "time_per_iteration": 2.5347936153411865 }, { "auxiliary_loss_clip": 0.06447016, "auxiliary_loss_mlp": 0.01274193, "balance_loss_clip": 0.06283449, "balance_loss_mlp": 0.01261492, "epoch": 0.533353374417556, "flos": 22132734266880.0, "grad_norm": 3.6827990686208003, "language_loss": 0.75626981, "learning_rate": 1.8810896244165997e-06, "loss": 0.83348191, "num_input_tokens_seen": 190799830, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.12701416, "step": 8871, "time_per_iteration": 2.580124855041504 }, { "auxiliary_loss_clip": 0.06435898, "auxiliary_loss_mlp": 0.01272642, "balance_loss_clip": 0.06277732, "balance_loss_mlp": 0.01261144, "epoch": 0.533413497670224, "flos": 15016185221760.0, "grad_norm": 1.7918726262367919, "language_loss": 0.72239208, "learning_rate": 1.8807008539593498e-06, "loss": 0.79947746, "num_input_tokens_seen": 190817155, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.11505127, "step": 8872, "time_per_iteration": 5.424026250839233 }, { "auxiliary_loss_clip": 0.06435546, "auxiliary_loss_mlp": 0.01270489, "balance_loss_clip": 0.06279106, "balance_loss_mlp": 0.01257287, "epoch": 0.533473620922892, "flos": 19616095612800.0, "grad_norm": 1.993873473750384, "language_loss": 0.64918375, "learning_rate": 1.880312088025936e-06, "loss": 0.72624409, "num_input_tokens_seen": 190835240, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.13214111, "step": 8873, "time_per_iteration": 2.537539482116699 }, { "auxiliary_loss_clip": 0.06439506, "auxiliary_loss_mlp": 0.01272865, "balance_loss_clip": 0.06282122, "balance_loss_mlp": 0.01261904, "epoch": 0.5335337441755599, "flos": 14287827605760.0, "grad_norm": 1.9915539753109792, "language_loss": 0.80089319, "learning_rate": 1.879923326631099e-06, "loss": 0.87801689, "num_input_tokens_seen": 190851620, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.10974121, "step": 8874, "time_per_iteration": 2.541773557662964 }, { "auxiliary_loss_clip": 0.06437178, "auxiliary_loss_mlp": 0.01267297, "balance_loss_clip": 0.06280148, "balance_loss_mlp": 0.01256497, "epoch": 0.5335938674282279, "flos": 20821313214720.0, "grad_norm": 1.7093586450340874, "language_loss": 0.70212954, "learning_rate": 1.879534569789582e-06, "loss": 0.77917433, "num_input_tokens_seen": 190870545, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.10803223, "step": 8875, "time_per_iteration": 2.6445462703704834 }, { "auxiliary_loss_clip": 0.06346963, "auxiliary_loss_mlp": 0.01255455, "balance_loss_clip": 0.06283262, "balance_loss_mlp": 0.01253369, "epoch": 0.5336539906808958, "flos": 71419558101120.0, "grad_norm": 0.7133025350905096, "language_loss": 0.59587258, "learning_rate": 1.879145817516126e-06, "loss": 0.67189676, "num_input_tokens_seen": 190931995, "router_z_loss_clip": 0.63916016, "router_z_loss_mlp": 0.02087402, "step": 8876, "time_per_iteration": 3.3032445907592773 }, { "auxiliary_loss_clip": 0.06439085, "auxiliary_loss_mlp": 0.01267741, "balance_loss_clip": 0.06280887, "balance_loss_mlp": 0.01256815, "epoch": 0.5337141139335638, "flos": 20158517018880.0, "grad_norm": 1.7579641587312573, "language_loss": 0.75101411, "learning_rate": 1.8787570698254727e-06, "loss": 0.82808232, "num_input_tokens_seen": 190949890, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.10931396, "step": 8877, "time_per_iteration": 2.570718765258789 }, { "auxiliary_loss_clip": 0.06349306, "auxiliary_loss_mlp": 0.01254063, "balance_loss_clip": 0.06285693, "balance_loss_mlp": 0.0125188, "epoch": 0.5337742371862317, "flos": 67747624479360.0, "grad_norm": 0.7319924971719559, "language_loss": 0.57130396, "learning_rate": 1.8783683267323629e-06, "loss": 0.64733768, "num_input_tokens_seen": 191008480, "router_z_loss_clip": 0.63671875, "router_z_loss_mlp": 0.02186584, "step": 8878, "time_per_iteration": 3.1081411838531494 }, { "auxiliary_loss_clip": 0.06449519, "auxiliary_loss_mlp": 0.01271094, "balance_loss_clip": 0.06286556, "balance_loss_mlp": 0.01259239, "epoch": 0.5338343604388998, "flos": 25015794825600.0, "grad_norm": 1.6331022226102847, "language_loss": 0.72600061, "learning_rate": 1.8779795882515395e-06, "loss": 0.8032068, "num_input_tokens_seen": 191028995, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.11859131, "step": 8879, "time_per_iteration": 2.5774359703063965 }, { "auxiliary_loss_clip": 0.06445502, "auxiliary_loss_mlp": 0.01269263, "balance_loss_clip": 0.062832, "balance_loss_mlp": 0.01256985, "epoch": 0.5338944836915677, "flos": 17606728776960.0, "grad_norm": 1.988450355559994, "language_loss": 0.84577507, "learning_rate": 1.8775908543977416e-06, "loss": 0.92292273, "num_input_tokens_seen": 191045285, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.12280273, "step": 8880, "time_per_iteration": 2.514335870742798 }, { "auxiliary_loss_clip": 0.0643588, "auxiliary_loss_mlp": 0.01267318, "balance_loss_clip": 0.06282079, "balance_loss_mlp": 0.01256738, "epoch": 0.5339546069442357, "flos": 21730282306560.0, "grad_norm": 1.7348359371623, "language_loss": 0.79603291, "learning_rate": 1.8772021251857107e-06, "loss": 0.87306488, "num_input_tokens_seen": 191066105, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.10577393, "step": 8881, "time_per_iteration": 2.547572612762451 }, { "auxiliary_loss_clip": 0.06355606, "auxiliary_loss_mlp": 0.01253443, "balance_loss_clip": 0.06291407, "balance_loss_mlp": 0.01251305, "epoch": 0.5340147301969036, "flos": 69741226748160.0, "grad_norm": 0.7908091812303709, "language_loss": 0.59241474, "learning_rate": 1.8768134006301882e-06, "loss": 0.66850519, "num_input_tokens_seen": 191126315, "router_z_loss_clip": 0.64111328, "router_z_loss_mlp": 0.02140808, "step": 8882, "time_per_iteration": 3.1513831615448 }, { "auxiliary_loss_clip": 0.06350839, "auxiliary_loss_mlp": 0.01253183, "balance_loss_clip": 0.06286644, "balance_loss_mlp": 0.0125107, "epoch": 0.5340748534495716, "flos": 63896504901120.0, "grad_norm": 0.8577866308534595, "language_loss": 0.63730603, "learning_rate": 1.876424680745913e-06, "loss": 0.71334624, "num_input_tokens_seen": 191174240, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 0.02114868, "step": 8883, "time_per_iteration": 3.0054516792297363 }, { "auxiliary_loss_clip": 0.06444862, "auxiliary_loss_mlp": 0.01269777, "balance_loss_clip": 0.06282257, "balance_loss_mlp": 0.0125782, "epoch": 0.5341349767022396, "flos": 28701872588160.0, "grad_norm": 3.176843303160847, "language_loss": 0.82353294, "learning_rate": 1.8760359655476272e-06, "loss": 0.90067935, "num_input_tokens_seen": 191193335, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.1194458, "step": 8884, "time_per_iteration": 2.6190502643585205 }, { "auxiliary_loss_clip": 0.06438473, "auxiliary_loss_mlp": 0.01267848, "balance_loss_clip": 0.06284381, "balance_loss_mlp": 0.01257077, "epoch": 0.5341950999549075, "flos": 16295265797760.0, "grad_norm": 1.6332765465556975, "language_loss": 0.7235719, "learning_rate": 1.8756472550500695e-06, "loss": 0.8006351, "num_input_tokens_seen": 191210900, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.10766602, "step": 8885, "time_per_iteration": 2.5599732398986816 }, { "auxiliary_loss_clip": 0.06445149, "auxiliary_loss_mlp": 0.01266843, "balance_loss_clip": 0.06280094, "balance_loss_mlp": 0.01254666, "epoch": 0.5342552232075756, "flos": 14360852039040.0, "grad_norm": 2.0070764304878477, "language_loss": 0.79209411, "learning_rate": 1.87525854926798e-06, "loss": 0.86921406, "num_input_tokens_seen": 191226730, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.1217041, "step": 8886, "time_per_iteration": 2.5653703212738037 }, { "auxiliary_loss_clip": 0.06443509, "auxiliary_loss_mlp": 0.01269781, "balance_loss_clip": 0.06282133, "balance_loss_mlp": 0.0125721, "epoch": 0.5343153464602435, "flos": 30305517154560.0, "grad_norm": 11.690913272712447, "language_loss": 0.75044197, "learning_rate": 1.8748698482160996e-06, "loss": 0.82757485, "num_input_tokens_seen": 191250435, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.12567139, "step": 8887, "time_per_iteration": 2.643294095993042 }, { "auxiliary_loss_clip": 0.06440678, "auxiliary_loss_mlp": 0.01267212, "balance_loss_clip": 0.06283145, "balance_loss_mlp": 0.01255756, "epoch": 0.5343754697129115, "flos": 15601722353280.0, "grad_norm": 2.0653862295250653, "language_loss": 0.69174522, "learning_rate": 1.8744811519091663e-06, "loss": 0.7688241, "num_input_tokens_seen": 191268315, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.11456299, "step": 8888, "time_per_iteration": 2.5612266063690186 }, { "auxiliary_loss_clip": 0.06455977, "auxiliary_loss_mlp": 0.01269377, "balance_loss_clip": 0.062875, "balance_loss_mlp": 0.01257391, "epoch": 0.5344355929655794, "flos": 16915239757440.0, "grad_norm": 1.933934104383068, "language_loss": 0.78148997, "learning_rate": 1.8740924603619208e-06, "loss": 0.85874355, "num_input_tokens_seen": 191287000, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.11981201, "step": 8889, "time_per_iteration": 2.529402732849121 }, { "auxiliary_loss_clip": 0.06436256, "auxiliary_loss_mlp": 0.01270914, "balance_loss_clip": 0.06278871, "balance_loss_mlp": 0.01258761, "epoch": 0.5344957162182474, "flos": 16803460010880.0, "grad_norm": 1.8181410625130145, "language_loss": 0.69039798, "learning_rate": 1.873703773589102e-06, "loss": 0.76746964, "num_input_tokens_seen": 191304565, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.12158203, "step": 8890, "time_per_iteration": 2.55824875831604 }, { "auxiliary_loss_clip": 0.06446465, "auxiliary_loss_mlp": 0.01269943, "balance_loss_clip": 0.06283842, "balance_loss_mlp": 0.0125782, "epoch": 0.5345558394709153, "flos": 12709144356480.0, "grad_norm": 2.502421709693394, "language_loss": 0.77596712, "learning_rate": 1.8733150916054483e-06, "loss": 0.85313123, "num_input_tokens_seen": 191318300, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.12127686, "step": 8891, "time_per_iteration": 2.515895366668701 }, { "auxiliary_loss_clip": 0.06443673, "auxiliary_loss_mlp": 0.01270584, "balance_loss_clip": 0.06287862, "balance_loss_mlp": 0.0125945, "epoch": 0.5346159627235834, "flos": 22461532888320.0, "grad_norm": 1.634120486130929, "language_loss": 0.74429172, "learning_rate": 1.872926414425699e-06, "loss": 0.82143432, "num_input_tokens_seen": 191337925, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.11132812, "step": 8892, "time_per_iteration": 2.580888271331787 }, { "auxiliary_loss_clip": 0.06439325, "auxiliary_loss_mlp": 0.01266088, "balance_loss_clip": 0.06280444, "balance_loss_mlp": 0.01255681, "epoch": 0.5346760859762513, "flos": 22421771326080.0, "grad_norm": 1.7659563779080525, "language_loss": 0.87946546, "learning_rate": 1.8725377420645932e-06, "loss": 0.9565196, "num_input_tokens_seen": 191357120, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.10412598, "step": 8893, "time_per_iteration": 2.5566048622131348 }, { "auxiliary_loss_clip": 0.06441233, "auxiliary_loss_mlp": 0.01265316, "balance_loss_clip": 0.06285807, "balance_loss_mlp": 0.01255058, "epoch": 0.5347362092289193, "flos": 22822043080320.0, "grad_norm": 1.7210520705926762, "language_loss": 0.72815132, "learning_rate": 1.872149074536869e-06, "loss": 0.80521685, "num_input_tokens_seen": 191375395, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.10266113, "step": 8894, "time_per_iteration": 2.5697474479675293 }, { "auxiliary_loss_clip": 0.06437402, "auxiliary_loss_mlp": 0.01270675, "balance_loss_clip": 0.06283217, "balance_loss_mlp": 0.01259326, "epoch": 0.5347963324815872, "flos": 23225794778880.0, "grad_norm": 1.5409656363435633, "language_loss": 0.75327575, "learning_rate": 1.8717604118572648e-06, "loss": 0.8303566, "num_input_tokens_seen": 191395595, "router_z_loss_clip": 1.54199219, "router_z_loss_mlp": 0.11358643, "step": 8895, "time_per_iteration": 2.577864408493042 }, { "auxiliary_loss_clip": 0.064479, "auxiliary_loss_mlp": 0.01266037, "balance_loss_clip": 0.06288615, "balance_loss_mlp": 0.01254844, "epoch": 0.5348564557342552, "flos": 22607917171200.0, "grad_norm": 1.6056503907200925, "language_loss": 0.77215183, "learning_rate": 1.8713717540405178e-06, "loss": 0.84929121, "num_input_tokens_seen": 191413730, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.11193848, "step": 8896, "time_per_iteration": 2.5580480098724365 }, { "auxiliary_loss_clip": 0.06438868, "auxiliary_loss_mlp": 0.01268675, "balance_loss_clip": 0.06284188, "balance_loss_mlp": 0.0125747, "epoch": 0.5349165789869232, "flos": 18007880999040.0, "grad_norm": 1.53934275404283, "language_loss": 0.78838003, "learning_rate": 1.8709831011013676e-06, "loss": 0.86545545, "num_input_tokens_seen": 191432400, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.11206055, "step": 8897, "time_per_iteration": 2.5323522090911865 }, { "auxiliary_loss_clip": 0.06444772, "auxiliary_loss_mlp": 0.01265238, "balance_loss_clip": 0.06286956, "balance_loss_mlp": 0.01253919, "epoch": 0.5349767022395912, "flos": 17164557181440.0, "grad_norm": 1.8019825663828646, "language_loss": 0.76376939, "learning_rate": 1.8705944530545509e-06, "loss": 0.84086955, "num_input_tokens_seen": 191448855, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.11322021, "step": 8898, "time_per_iteration": 2.512397527694702 }, { "auxiliary_loss_clip": 0.06359755, "auxiliary_loss_mlp": 0.01252996, "balance_loss_clip": 0.06295468, "balance_loss_mlp": 0.01251, "epoch": 0.5350368254922592, "flos": 71014590518400.0, "grad_norm": 0.8078929814167342, "language_loss": 0.5804199, "learning_rate": 1.8702058099148052e-06, "loss": 0.65654743, "num_input_tokens_seen": 191519690, "router_z_loss_clip": 0.64208984, "router_z_loss_mlp": 0.0199585, "step": 8899, "time_per_iteration": 3.370044231414795 }, { "auxiliary_loss_clip": 0.06441874, "auxiliary_loss_mlp": 0.0126914, "balance_loss_clip": 0.06286804, "balance_loss_mlp": 0.01258309, "epoch": 0.5350969487449271, "flos": 27425265707520.0, "grad_norm": 1.4838418359854932, "language_loss": 0.69919127, "learning_rate": 1.869817171696868e-06, "loss": 0.77630138, "num_input_tokens_seen": 191539380, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.1083374, "step": 8900, "time_per_iteration": 4.145075082778931 }, { "auxiliary_loss_clip": 0.0645034, "auxiliary_loss_mlp": 0.01274989, "balance_loss_clip": 0.06289284, "balance_loss_mlp": 0.01263688, "epoch": 0.5351570719975951, "flos": 19321901527680.0, "grad_norm": 1.5412441373176005, "language_loss": 0.71534765, "learning_rate": 1.8694285384154777e-06, "loss": 0.79260099, "num_input_tokens_seen": 191557400, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.11297607, "step": 8901, "time_per_iteration": 2.5292088985443115 }, { "auxiliary_loss_clip": 0.06446199, "auxiliary_loss_mlp": 0.0127325, "balance_loss_clip": 0.06285083, "balance_loss_mlp": 0.01260876, "epoch": 0.535217195250263, "flos": 19834707715200.0, "grad_norm": 1.9388581482086775, "language_loss": 0.7767719, "learning_rate": 1.8690399100853699e-06, "loss": 0.85396641, "num_input_tokens_seen": 191575860, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.12371826, "step": 8902, "time_per_iteration": 3.9850704669952393 }, { "auxiliary_loss_clip": 0.06435152, "auxiliary_loss_mlp": 0.01264224, "balance_loss_clip": 0.06283857, "balance_loss_mlp": 0.0125337, "epoch": 0.535277318502931, "flos": 22134495202560.0, "grad_norm": 1.7123427418594694, "language_loss": 0.69829142, "learning_rate": 1.868651286721281e-06, "loss": 0.77528518, "num_input_tokens_seen": 191595775, "router_z_loss_clip": 1.51367188, "router_z_loss_mlp": 0.10858154, "step": 8903, "time_per_iteration": 2.5602850914001465 }, { "auxiliary_loss_clip": 0.06447262, "auxiliary_loss_mlp": 0.01267589, "balance_loss_clip": 0.06285849, "balance_loss_mlp": 0.01256169, "epoch": 0.5353374417555989, "flos": 25052873057280.0, "grad_norm": 1.538950622223005, "language_loss": 0.72723019, "learning_rate": 1.86826266833795e-06, "loss": 0.80437863, "num_input_tokens_seen": 191617785, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.11419678, "step": 8904, "time_per_iteration": 2.6287059783935547 }, { "auxiliary_loss_clip": 0.06447709, "auxiliary_loss_mlp": 0.01275587, "balance_loss_clip": 0.06289401, "balance_loss_mlp": 0.0126329, "epoch": 0.535397565008267, "flos": 19394422836480.0, "grad_norm": 1.8206073858256482, "language_loss": 0.73261368, "learning_rate": 1.8678740549501103e-06, "loss": 0.80984664, "num_input_tokens_seen": 191636900, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.12286377, "step": 8905, "time_per_iteration": 2.6002614498138428 }, { "auxiliary_loss_clip": 0.06438874, "auxiliary_loss_mlp": 0.01263019, "balance_loss_clip": 0.06287466, "balance_loss_mlp": 0.01252719, "epoch": 0.5354576882609349, "flos": 21477736500480.0, "grad_norm": 1.4301042888313074, "language_loss": 0.83948004, "learning_rate": 1.8674854465725005e-06, "loss": 0.91649902, "num_input_tokens_seen": 191656720, "router_z_loss_clip": 1.51269531, "router_z_loss_mlp": 0.10302734, "step": 8906, "time_per_iteration": 2.592341184616089 }, { "auxiliary_loss_clip": 0.06443199, "auxiliary_loss_mlp": 0.01269602, "balance_loss_clip": 0.06282657, "balance_loss_mlp": 0.01257449, "epoch": 0.5355178115136029, "flos": 20783857639680.0, "grad_norm": 2.705115195488805, "language_loss": 0.74341905, "learning_rate": 1.8670968432198563e-06, "loss": 0.82054704, "num_input_tokens_seen": 191674445, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.12164307, "step": 8907, "time_per_iteration": 2.5451912879943848 }, { "auxiliary_loss_clip": 0.06442602, "auxiliary_loss_mlp": 0.01265844, "balance_loss_clip": 0.06286216, "balance_loss_mlp": 0.01254966, "epoch": 0.5355779347662708, "flos": 23520827404800.0, "grad_norm": 1.6003581277182906, "language_loss": 0.76544464, "learning_rate": 1.866708244906912e-06, "loss": 0.84252906, "num_input_tokens_seen": 191695000, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.10876465, "step": 8908, "time_per_iteration": 2.604365348815918 }, { "auxiliary_loss_clip": 0.06447273, "auxiliary_loss_mlp": 0.0127233, "balance_loss_clip": 0.06286737, "balance_loss_mlp": 0.01260748, "epoch": 0.5356380580189388, "flos": 20309471349120.0, "grad_norm": 2.3989724836195943, "language_loss": 0.74316525, "learning_rate": 1.8663196516484055e-06, "loss": 0.82036126, "num_input_tokens_seen": 191713295, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.11578369, "step": 8909, "time_per_iteration": 2.631892204284668 }, { "auxiliary_loss_clip": 0.06441305, "auxiliary_loss_mlp": 0.01266405, "balance_loss_clip": 0.06286544, "balance_loss_mlp": 0.01256034, "epoch": 0.5356981812716068, "flos": 21368136960000.0, "grad_norm": 2.2744933904467826, "language_loss": 0.84318733, "learning_rate": 1.8659310634590702e-06, "loss": 0.92026448, "num_input_tokens_seen": 191732725, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.1036377, "step": 8910, "time_per_iteration": 2.5725607872009277 }, { "auxiliary_loss_clip": 0.06445761, "auxiliary_loss_mlp": 0.01266538, "balance_loss_clip": 0.06286585, "balance_loss_mlp": 0.01254713, "epoch": 0.5357583045242748, "flos": 23117746538880.0, "grad_norm": 1.523859720270558, "language_loss": 0.82322776, "learning_rate": 1.8655424803536427e-06, "loss": 0.90035081, "num_input_tokens_seen": 191753765, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.11804199, "step": 8911, "time_per_iteration": 2.5699899196624756 }, { "auxiliary_loss_clip": 0.06442779, "auxiliary_loss_mlp": 0.0127179, "balance_loss_clip": 0.06285265, "balance_loss_mlp": 0.01261175, "epoch": 0.5358184277769428, "flos": 21148057411200.0, "grad_norm": 1.8583008511490708, "language_loss": 0.69480836, "learning_rate": 1.8651539023468585e-06, "loss": 0.77195406, "num_input_tokens_seen": 191773560, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.10614014, "step": 8912, "time_per_iteration": 5.540574073791504 }, { "auxiliary_loss_clip": 0.06439136, "auxiliary_loss_mlp": 0.01265493, "balance_loss_clip": 0.06283712, "balance_loss_mlp": 0.01255104, "epoch": 0.5358785510296107, "flos": 16286754608640.0, "grad_norm": 2.043300614539447, "language_loss": 0.71331942, "learning_rate": 1.8647653294534509e-06, "loss": 0.7903657, "num_input_tokens_seen": 191791255, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.10388184, "step": 8913, "time_per_iteration": 2.596220016479492 }, { "auxiliary_loss_clip": 0.06451147, "auxiliary_loss_mlp": 0.01269028, "balance_loss_clip": 0.06289887, "balance_loss_mlp": 0.01257793, "epoch": 0.5359386742822787, "flos": 16981555864320.0, "grad_norm": 1.7212415578749185, "language_loss": 0.72102356, "learning_rate": 1.864376761688156e-06, "loss": 0.79822528, "num_input_tokens_seen": 191809325, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.11236572, "step": 8914, "time_per_iteration": 2.557396411895752 }, { "auxiliary_loss_clip": 0.06447485, "auxiliary_loss_mlp": 0.01275725, "balance_loss_clip": 0.06283015, "balance_loss_mlp": 0.01263798, "epoch": 0.5359987975349466, "flos": 20819091081600.0, "grad_norm": 2.2403589988775505, "language_loss": 0.70564115, "learning_rate": 1.8639881990657079e-06, "loss": 0.78287327, "num_input_tokens_seen": 191829795, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.11932373, "step": 8915, "time_per_iteration": 2.574856758117676 }, { "auxiliary_loss_clip": 0.06440981, "auxiliary_loss_mlp": 0.01270948, "balance_loss_clip": 0.06285312, "balance_loss_mlp": 0.01258801, "epoch": 0.5360589207876146, "flos": 22206429532800.0, "grad_norm": 1.5638251593988595, "language_loss": 0.75414771, "learning_rate": 1.8635996416008408e-06, "loss": 0.831267, "num_input_tokens_seen": 191850840, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.12145996, "step": 8916, "time_per_iteration": 2.5990147590637207 }, { "auxiliary_loss_clip": 0.06444296, "auxiliary_loss_mlp": 0.01268851, "balance_loss_clip": 0.06282786, "balance_loss_mlp": 0.01257764, "epoch": 0.5361190440402825, "flos": 31402393027200.0, "grad_norm": 1.8096486378894676, "language_loss": 0.72771347, "learning_rate": 1.863211089308289e-06, "loss": 0.80484498, "num_input_tokens_seen": 191869520, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.11077881, "step": 8917, "time_per_iteration": 2.6216681003570557 }, { "auxiliary_loss_clip": 0.06444014, "auxiliary_loss_mlp": 0.01270144, "balance_loss_clip": 0.06286822, "balance_loss_mlp": 0.01259009, "epoch": 0.5361791672929506, "flos": 16075270103040.0, "grad_norm": 1.8849127602078286, "language_loss": 0.71544361, "learning_rate": 1.8628225422027865e-06, "loss": 0.79258519, "num_input_tokens_seen": 191887240, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.11151123, "step": 8918, "time_per_iteration": 2.5660817623138428 }, { "auxiliary_loss_clip": 0.06442502, "auxiliary_loss_mlp": 0.0127265, "balance_loss_clip": 0.06285986, "balance_loss_mlp": 0.01261146, "epoch": 0.5362392905456185, "flos": 20747240605440.0, "grad_norm": 1.9853058377374355, "language_loss": 0.7531749, "learning_rate": 1.862434000299067e-06, "loss": 0.83032638, "num_input_tokens_seen": 191905690, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.11486816, "step": 8919, "time_per_iteration": 2.5424537658691406 }, { "auxiliary_loss_clip": 0.06444381, "auxiliary_loss_mlp": 0.0127234, "balance_loss_clip": 0.06284388, "balance_loss_mlp": 0.01261581, "epoch": 0.5362994137982865, "flos": 17344539751680.0, "grad_norm": 1.962819578864069, "language_loss": 0.71842861, "learning_rate": 1.862045463611864e-06, "loss": 0.79559588, "num_input_tokens_seen": 191920725, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.10760498, "step": 8920, "time_per_iteration": 2.5559139251708984 }, { "auxiliary_loss_clip": 0.06441337, "auxiliary_loss_mlp": 0.01268763, "balance_loss_clip": 0.06283675, "balance_loss_mlp": 0.01257354, "epoch": 0.5363595370509544, "flos": 42823819837440.0, "grad_norm": 1.4154178685565884, "language_loss": 0.68757933, "learning_rate": 1.8616569321559105e-06, "loss": 0.76468039, "num_input_tokens_seen": 191944645, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.11407471, "step": 8921, "time_per_iteration": 2.7568881511688232 }, { "auxiliary_loss_clip": 0.06443486, "auxiliary_loss_mlp": 0.01269522, "balance_loss_clip": 0.06285915, "balance_loss_mlp": 0.01257815, "epoch": 0.5364196603036224, "flos": 19177990940160.0, "grad_norm": 1.8038475045377558, "language_loss": 0.81713748, "learning_rate": 1.86126840594594e-06, "loss": 0.89426756, "num_input_tokens_seen": 191962265, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.11712646, "step": 8922, "time_per_iteration": 2.5502378940582275 }, { "auxiliary_loss_clip": 0.06447046, "auxiliary_loss_mlp": 0.01266462, "balance_loss_clip": 0.06287825, "balance_loss_mlp": 0.0125534, "epoch": 0.5364797835562904, "flos": 17936827136640.0, "grad_norm": 1.9282066145264347, "language_loss": 0.76719415, "learning_rate": 1.860879884996686e-06, "loss": 0.84432924, "num_input_tokens_seen": 191978850, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.11126709, "step": 8923, "time_per_iteration": 2.5344953536987305 }, { "auxiliary_loss_clip": 0.06446269, "auxiliary_loss_mlp": 0.0127036, "balance_loss_clip": 0.06285548, "balance_loss_mlp": 0.01258612, "epoch": 0.5365399068089584, "flos": 30236098446720.0, "grad_norm": 1.4713986233681893, "language_loss": 0.70503283, "learning_rate": 1.8604913693228804e-06, "loss": 0.78219914, "num_input_tokens_seen": 192002000, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.11755371, "step": 8924, "time_per_iteration": 2.655179023742676 }, { "auxiliary_loss_clip": 0.06446402, "auxiliary_loss_mlp": 0.01272678, "balance_loss_clip": 0.06285328, "balance_loss_mlp": 0.01260876, "epoch": 0.5366000300616264, "flos": 24897264825600.0, "grad_norm": 1.8160314581945942, "language_loss": 0.87691414, "learning_rate": 1.8601028589392558e-06, "loss": 0.95410496, "num_input_tokens_seen": 192019100, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.11804199, "step": 8925, "time_per_iteration": 2.5771026611328125 }, { "auxiliary_loss_clip": 0.06444795, "auxiliary_loss_mlp": 0.0126833, "balance_loss_clip": 0.06282791, "balance_loss_mlp": 0.01256862, "epoch": 0.5366601533142943, "flos": 29834610808320.0, "grad_norm": 1.858026011310021, "language_loss": 0.78388447, "learning_rate": 1.8597143538605455e-06, "loss": 0.86101568, "num_input_tokens_seen": 192041660, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.11480713, "step": 8926, "time_per_iteration": 2.6298046112060547 }, { "auxiliary_loss_clip": 0.0643661, "auxiliary_loss_mlp": 0.01266662, "balance_loss_clip": 0.06283159, "balance_loss_mlp": 0.01256297, "epoch": 0.5367202765669623, "flos": 27206821313280.0, "grad_norm": 2.170892446409595, "language_loss": 0.67224431, "learning_rate": 1.85932585410148e-06, "loss": 0.749277, "num_input_tokens_seen": 192063540, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.1036377, "step": 8927, "time_per_iteration": 2.5687239170074463 }, { "auxiliary_loss_clip": 0.06443674, "auxiliary_loss_mlp": 0.01264113, "balance_loss_clip": 0.06283943, "balance_loss_mlp": 0.01253152, "epoch": 0.5367803998196302, "flos": 20236153426560.0, "grad_norm": 1.687697409858717, "language_loss": 0.73728716, "learning_rate": 1.8589373596767929e-06, "loss": 0.81436497, "num_input_tokens_seen": 192081760, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.10943604, "step": 8928, "time_per_iteration": 2.561058759689331 }, { "auxiliary_loss_clip": 0.06448269, "auxiliary_loss_mlp": 0.01264849, "balance_loss_clip": 0.06289326, "balance_loss_mlp": 0.01254412, "epoch": 0.5368405230722982, "flos": 32161791381120.0, "grad_norm": 1.8066624293289417, "language_loss": 0.63414443, "learning_rate": 1.8585488706012154e-06, "loss": 0.71127558, "num_input_tokens_seen": 192101620, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.10437012, "step": 8929, "time_per_iteration": 2.6407651901245117 }, { "auxiliary_loss_clip": 0.06447674, "auxiliary_loss_mlp": 0.01267459, "balance_loss_clip": 0.06288689, "balance_loss_mlp": 0.01256581, "epoch": 0.5369006463249661, "flos": 26254778423040.0, "grad_norm": 1.6415640948937185, "language_loss": 0.66511583, "learning_rate": 1.8581603868894781e-06, "loss": 0.74226713, "num_input_tokens_seen": 192121805, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.10876465, "step": 8930, "time_per_iteration": 2.6166207790374756 }, { "auxiliary_loss_clip": 0.06437344, "auxiliary_loss_mlp": 0.01266171, "balance_loss_clip": 0.06283639, "balance_loss_mlp": 0.01255746, "epoch": 0.5369607695776342, "flos": 26218119461760.0, "grad_norm": 1.4143512418347517, "language_loss": 0.67212468, "learning_rate": 1.8577719085563136e-06, "loss": 0.74915987, "num_input_tokens_seen": 192141765, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.10424805, "step": 8931, "time_per_iteration": 2.6002540588378906 }, { "auxiliary_loss_clip": 0.06447208, "auxiliary_loss_mlp": 0.01270424, "balance_loss_clip": 0.06291068, "balance_loss_mlp": 0.0125861, "epoch": 0.5370208928303021, "flos": 25015920606720.0, "grad_norm": 1.6432090777150963, "language_loss": 0.75863791, "learning_rate": 1.8573834356164525e-06, "loss": 0.83581424, "num_input_tokens_seen": 192161560, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.1182251, "step": 8932, "time_per_iteration": 2.589341640472412 }, { "auxiliary_loss_clip": 0.0644378, "auxiliary_loss_mlp": 0.01270256, "balance_loss_clip": 0.06288781, "balance_loss_mlp": 0.01258115, "epoch": 0.5370810160829701, "flos": 31799646034560.0, "grad_norm": 1.7636694707296006, "language_loss": 0.66442311, "learning_rate": 1.8569949680846261e-06, "loss": 0.7415635, "num_input_tokens_seen": 192180190, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.12139893, "step": 8933, "time_per_iteration": 2.625612258911133 }, { "auxiliary_loss_clip": 0.06442483, "auxiliary_loss_mlp": 0.01268793, "balance_loss_clip": 0.062878, "balance_loss_mlp": 0.01257498, "epoch": 0.537141139335638, "flos": 23849500245120.0, "grad_norm": 3.4097978842457835, "language_loss": 0.82998323, "learning_rate": 1.856606505975565e-06, "loss": 0.90709603, "num_input_tokens_seen": 192198855, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.11297607, "step": 8934, "time_per_iteration": 2.5565459728240967 }, { "auxiliary_loss_clip": 0.06443086, "auxiliary_loss_mlp": 0.01269968, "balance_loss_clip": 0.06290029, "balance_loss_mlp": 0.01258691, "epoch": 0.537201262588306, "flos": 18513685370880.0, "grad_norm": 1.7393608056543592, "language_loss": 0.79744053, "learning_rate": 1.856218049303999e-06, "loss": 0.87457108, "num_input_tokens_seen": 192216555, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.112854, "step": 8935, "time_per_iteration": 2.534050941467285 }, { "auxiliary_loss_clip": 0.06447063, "auxiliary_loss_mlp": 0.01270422, "balance_loss_clip": 0.06289483, "balance_loss_mlp": 0.01258477, "epoch": 0.537261385840974, "flos": 25669492853760.0, "grad_norm": 1.7901442658226727, "language_loss": 0.8384012, "learning_rate": 1.855829598084659e-06, "loss": 0.91557604, "num_input_tokens_seen": 192236910, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.11956787, "step": 8936, "time_per_iteration": 2.5888659954071045 }, { "auxiliary_loss_clip": 0.06446382, "auxiliary_loss_mlp": 0.01269607, "balance_loss_clip": 0.06292398, "balance_loss_mlp": 0.01258509, "epoch": 0.537321509093642, "flos": 40744656950400.0, "grad_norm": 1.3635134329685148, "language_loss": 0.72683835, "learning_rate": 1.8554411523322754e-06, "loss": 0.80399817, "num_input_tokens_seen": 192260790, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.11108398, "step": 8937, "time_per_iteration": 2.7198798656463623 }, { "auxiliary_loss_clip": 0.06453156, "auxiliary_loss_mlp": 0.012706, "balance_loss_clip": 0.06293409, "balance_loss_mlp": 0.01258834, "epoch": 0.53738163234631, "flos": 17244248014080.0, "grad_norm": 2.161186810761723, "language_loss": 0.81764483, "learning_rate": 1.8550527120615778e-06, "loss": 0.89488244, "num_input_tokens_seen": 192277230, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.11773682, "step": 8938, "time_per_iteration": 2.538649559020996 }, { "auxiliary_loss_clip": 0.06462072, "auxiliary_loss_mlp": 0.01269478, "balance_loss_clip": 0.06298178, "balance_loss_mlp": 0.01258099, "epoch": 0.5374417555989779, "flos": 12826710034560.0, "grad_norm": 2.400331131538748, "language_loss": 0.80173683, "learning_rate": 1.8546642772872957e-06, "loss": 0.87905228, "num_input_tokens_seen": 192292840, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.11383057, "step": 8939, "time_per_iteration": 2.5250749588012695 }, { "auxiliary_loss_clip": 0.06373208, "auxiliary_loss_mlp": 0.01266913, "balance_loss_clip": 0.06310277, "balance_loss_mlp": 0.01265308, "epoch": 0.5375018788516459, "flos": 67275502248960.0, "grad_norm": 0.6959899346036598, "language_loss": 0.5246132, "learning_rate": 1.8542758480241589e-06, "loss": 0.60101438, "num_input_tokens_seen": 192358240, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.01605988, "step": 8940, "time_per_iteration": 4.639883518218994 }, { "auxiliary_loss_clip": 0.06444967, "auxiliary_loss_mlp": 0.01264956, "balance_loss_clip": 0.06292017, "balance_loss_mlp": 0.01254448, "epoch": 0.5375620021043138, "flos": 18120080016000.0, "grad_norm": 1.7490763056405603, "language_loss": 0.7166003, "learning_rate": 1.8538874242868965e-06, "loss": 0.7936995, "num_input_tokens_seen": 192377370, "router_z_loss_clip": 1.53027344, "router_z_loss_mlp": 0.1050415, "step": 8941, "time_per_iteration": 3.9442427158355713 }, { "auxiliary_loss_clip": 0.0644803, "auxiliary_loss_mlp": 0.01268162, "balance_loss_clip": 0.0629534, "balance_loss_mlp": 0.01257386, "epoch": 0.5376221253569818, "flos": 23156166435840.0, "grad_norm": 1.6321926658806738, "language_loss": 0.79594213, "learning_rate": 1.853499006090237e-06, "loss": 0.8731041, "num_input_tokens_seen": 192396450, "router_z_loss_clip": 1.52734375, "router_z_loss_mlp": 0.10778809, "step": 8942, "time_per_iteration": 2.5606582164764404 }, { "auxiliary_loss_clip": 0.06460048, "auxiliary_loss_mlp": 0.01269789, "balance_loss_clip": 0.06296887, "balance_loss_mlp": 0.01258106, "epoch": 0.5376822486096497, "flos": 29980240404480.0, "grad_norm": 1.640241989273456, "language_loss": 0.70565534, "learning_rate": 1.853110593448911e-06, "loss": 0.78295374, "num_input_tokens_seen": 192417390, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.11688232, "step": 8943, "time_per_iteration": 2.6351912021636963 }, { "auxiliary_loss_clip": 0.06370254, "auxiliary_loss_mlp": 0.01259509, "balance_loss_clip": 0.06307133, "balance_loss_mlp": 0.01257839, "epoch": 0.5377423718623178, "flos": 54188139761280.0, "grad_norm": 0.7773206817647276, "language_loss": 0.59566283, "learning_rate": 1.852722186377645e-06, "loss": 0.67196047, "num_input_tokens_seen": 192478060, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.01673889, "step": 8944, "time_per_iteration": 3.205939292907715 }, { "auxiliary_loss_clip": 0.06459785, "auxiliary_loss_mlp": 0.0127054, "balance_loss_clip": 0.06293406, "balance_loss_mlp": 0.01258745, "epoch": 0.5378024951149857, "flos": 23263585770240.0, "grad_norm": 2.4923682658459168, "language_loss": 0.77790678, "learning_rate": 1.852333784891169e-06, "loss": 0.85521007, "num_input_tokens_seen": 192495985, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.11791992, "step": 8945, "time_per_iteration": 2.550811290740967 }, { "auxiliary_loss_clip": 0.06448755, "auxiliary_loss_mlp": 0.01267859, "balance_loss_clip": 0.06290472, "balance_loss_mlp": 0.01257238, "epoch": 0.5378626183676537, "flos": 24030866407680.0, "grad_norm": 1.6189996883251454, "language_loss": 0.68307424, "learning_rate": 1.8519453890042112e-06, "loss": 0.76024038, "num_input_tokens_seen": 192515445, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.1060791, "step": 8946, "time_per_iteration": 2.558931827545166 }, { "auxiliary_loss_clip": 0.06447692, "auxiliary_loss_mlp": 0.01269682, "balance_loss_clip": 0.06293368, "balance_loss_mlp": 0.01258745, "epoch": 0.5379227416203216, "flos": 27169072248960.0, "grad_norm": 1.5814842400080118, "language_loss": 0.77262199, "learning_rate": 1.851556998731498e-06, "loss": 0.8497957, "num_input_tokens_seen": 192536530, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.10931396, "step": 8947, "time_per_iteration": 2.5989506244659424 }, { "auxiliary_loss_clip": 0.06450866, "auxiliary_loss_mlp": 0.01275759, "balance_loss_clip": 0.06292994, "balance_loss_mlp": 0.01264762, "epoch": 0.5379828648729896, "flos": 24688631358720.0, "grad_norm": 1.7734530943632123, "language_loss": 0.60685569, "learning_rate": 1.8511686140877592e-06, "loss": 0.68412197, "num_input_tokens_seen": 192556075, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.10986328, "step": 8948, "time_per_iteration": 2.5915274620056152 }, { "auxiliary_loss_clip": 0.06457254, "auxiliary_loss_mlp": 0.01268048, "balance_loss_clip": 0.06299005, "balance_loss_mlp": 0.01257152, "epoch": 0.5380429881256577, "flos": 22528981025280.0, "grad_norm": 1.834760342874043, "language_loss": 0.79480273, "learning_rate": 1.8507802350877205e-06, "loss": 0.87205577, "num_input_tokens_seen": 192575535, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.10894775, "step": 8949, "time_per_iteration": 2.5629470348358154 }, { "auxiliary_loss_clip": 0.06443854, "auxiliary_loss_mlp": 0.01274183, "balance_loss_clip": 0.06291325, "balance_loss_mlp": 0.01262692, "epoch": 0.5381031113783256, "flos": 26986825618560.0, "grad_norm": 1.632666578353803, "language_loss": 0.78530818, "learning_rate": 1.850391861746111e-06, "loss": 0.86248857, "num_input_tokens_seen": 192594490, "router_z_loss_clip": 1.52539062, "router_z_loss_mlp": 0.11499023, "step": 8950, "time_per_iteration": 2.6043789386749268 }, { "auxiliary_loss_clip": 0.06447888, "auxiliary_loss_mlp": 0.01273538, "balance_loss_clip": 0.06296178, "balance_loss_mlp": 0.01262988, "epoch": 0.5381632346309936, "flos": 24761026886400.0, "grad_norm": 1.5707890910206288, "language_loss": 0.72712106, "learning_rate": 1.8500034940776573e-06, "loss": 0.80433536, "num_input_tokens_seen": 192615650, "router_z_loss_clip": 1.51660156, "router_z_loss_mlp": 0.10552979, "step": 8951, "time_per_iteration": 2.620359182357788 }, { "auxiliary_loss_clip": 0.06448865, "auxiliary_loss_mlp": 0.01269363, "balance_loss_clip": 0.06290368, "balance_loss_mlp": 0.01258539, "epoch": 0.5382233578836615, "flos": 15565524589440.0, "grad_norm": 7.359926603845245, "language_loss": 0.76031983, "learning_rate": 1.849615132097085e-06, "loss": 0.83750212, "num_input_tokens_seen": 192633840, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.10821533, "step": 8952, "time_per_iteration": 5.47834587097168 }, { "auxiliary_loss_clip": 0.06446682, "auxiliary_loss_mlp": 0.01272284, "balance_loss_clip": 0.06291667, "balance_loss_mlp": 0.0126053, "epoch": 0.5382834811363295, "flos": 25091838005760.0, "grad_norm": 1.3973830185539744, "language_loss": 0.79657727, "learning_rate": 1.8492267758191228e-06, "loss": 0.87376696, "num_input_tokens_seen": 192655890, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.11749268, "step": 8953, "time_per_iteration": 2.598285436630249 }, { "auxiliary_loss_clip": 0.06445722, "auxiliary_loss_mlp": 0.01267036, "balance_loss_clip": 0.06292744, "balance_loss_mlp": 0.01256843, "epoch": 0.5383436043889974, "flos": 13302983041920.0, "grad_norm": 1.9113636780012748, "language_loss": 0.80499148, "learning_rate": 1.8488384252584964e-06, "loss": 0.88211906, "num_input_tokens_seen": 192673025, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.10192871, "step": 8954, "time_per_iteration": 2.5146872997283936 }, { "auxiliary_loss_clip": 0.06443112, "auxiliary_loss_mlp": 0.01272864, "balance_loss_clip": 0.06286907, "balance_loss_mlp": 0.01260943, "epoch": 0.5384037276416654, "flos": 23046063770880.0, "grad_norm": 2.015261565761627, "language_loss": 0.76492572, "learning_rate": 1.8484500804299318e-06, "loss": 0.84208548, "num_input_tokens_seen": 192692190, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.1192627, "step": 8955, "time_per_iteration": 2.546635150909424 }, { "auxiliary_loss_clip": 0.06444551, "auxiliary_loss_mlp": 0.01272748, "balance_loss_clip": 0.06290642, "balance_loss_mlp": 0.01261059, "epoch": 0.5384638508943334, "flos": 20637389502720.0, "grad_norm": 1.5075929066321911, "language_loss": 0.78580993, "learning_rate": 1.8480617413481557e-06, "loss": 0.86298287, "num_input_tokens_seen": 192710380, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.11700439, "step": 8956, "time_per_iteration": 2.694153308868408 }, { "auxiliary_loss_clip": 0.06344084, "auxiliary_loss_mlp": 0.01257179, "balance_loss_clip": 0.06281316, "balance_loss_mlp": 0.01255397, "epoch": 0.5385239741470014, "flos": 66755820026880.0, "grad_norm": 0.8522924046848204, "language_loss": 0.63340092, "learning_rate": 1.8476734080278932e-06, "loss": 0.70941353, "num_input_tokens_seen": 192768995, "router_z_loss_clip": 0.62597656, "router_z_loss_mlp": 0.01779175, "step": 8957, "time_per_iteration": 3.1355903148651123 }, { "auxiliary_loss_clip": 0.06344517, "auxiliary_loss_mlp": 0.01255603, "balance_loss_clip": 0.06281631, "balance_loss_mlp": 0.01253846, "epoch": 0.5385840973996693, "flos": 64737466076160.0, "grad_norm": 0.7834940646954773, "language_loss": 0.51290369, "learning_rate": 1.8472850804838705e-06, "loss": 0.58890492, "num_input_tokens_seen": 192825585, "router_z_loss_clip": 0.63085938, "router_z_loss_mlp": 0.01759338, "step": 8958, "time_per_iteration": 3.213984489440918 }, { "auxiliary_loss_clip": 0.06451288, "auxiliary_loss_mlp": 0.01269409, "balance_loss_clip": 0.06290981, "balance_loss_mlp": 0.0125778, "epoch": 0.5386442206523373, "flos": 26149161951360.0, "grad_norm": 1.9771045066876998, "language_loss": 0.77460879, "learning_rate": 1.8468967587308128e-06, "loss": 0.8518157, "num_input_tokens_seen": 192847335, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.11633301, "step": 8959, "time_per_iteration": 2.629988193511963 }, { "auxiliary_loss_clip": 0.06443714, "auxiliary_loss_mlp": 0.0126801, "balance_loss_clip": 0.06284896, "balance_loss_mlp": 0.01256465, "epoch": 0.5387043439050052, "flos": 18256401809280.0, "grad_norm": 1.964064275439323, "language_loss": 0.84709245, "learning_rate": 1.8465084427834455e-06, "loss": 0.92420971, "num_input_tokens_seen": 192862205, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.11535645, "step": 8960, "time_per_iteration": 2.6196537017822266 }, { "auxiliary_loss_clip": 0.06444453, "auxiliary_loss_mlp": 0.01265301, "balance_loss_clip": 0.0628816, "balance_loss_mlp": 0.01254709, "epoch": 0.5387644671576732, "flos": 29795939349120.0, "grad_norm": 1.5837659846294334, "language_loss": 0.7877664, "learning_rate": 1.8461201326564933e-06, "loss": 0.86486399, "num_input_tokens_seen": 192883695, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.10595703, "step": 8961, "time_per_iteration": 2.6598196029663086 }, { "auxiliary_loss_clip": 0.06440762, "auxiliary_loss_mlp": 0.01271126, "balance_loss_clip": 0.06285118, "balance_loss_mlp": 0.01259485, "epoch": 0.5388245904103413, "flos": 22379661849600.0, "grad_norm": 1.71730039824477, "language_loss": 0.84461784, "learning_rate": 1.845731828364681e-06, "loss": 0.92173672, "num_input_tokens_seen": 192900190, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.11645508, "step": 8962, "time_per_iteration": 2.544468402862549 }, { "auxiliary_loss_clip": 0.06341717, "auxiliary_loss_mlp": 0.01255552, "balance_loss_clip": 0.06279133, "balance_loss_mlp": 0.01253617, "epoch": 0.5388847136630092, "flos": 69827332417920.0, "grad_norm": 0.9861427143794326, "language_loss": 0.54068446, "learning_rate": 1.8453435299227333e-06, "loss": 0.6166572, "num_input_tokens_seen": 192958675, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.01933289, "step": 8963, "time_per_iteration": 3.136939764022827 }, { "auxiliary_loss_clip": 0.06339799, "auxiliary_loss_mlp": 0.01250653, "balance_loss_clip": 0.06277077, "balance_loss_mlp": 0.01248871, "epoch": 0.5389448369156772, "flos": 69844270942080.0, "grad_norm": 0.7779813495425802, "language_loss": 0.62868512, "learning_rate": 1.8449552373453744e-06, "loss": 0.70458961, "num_input_tokens_seen": 193033135, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.01782227, "step": 8964, "time_per_iteration": 3.313633441925049 }, { "auxiliary_loss_clip": 0.06446566, "auxiliary_loss_mlp": 0.0126884, "balance_loss_clip": 0.06283976, "balance_loss_mlp": 0.01256943, "epoch": 0.5390049601683451, "flos": 31730478888960.0, "grad_norm": 1.5661072336728297, "language_loss": 0.70404696, "learning_rate": 1.8445669506473287e-06, "loss": 0.78120106, "num_input_tokens_seen": 193055570, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.11907959, "step": 8965, "time_per_iteration": 2.6280815601348877 }, { "auxiliary_loss_clip": 0.06452796, "auxiliary_loss_mlp": 0.01269157, "balance_loss_clip": 0.06293142, "balance_loss_mlp": 0.01258226, "epoch": 0.5390650834210131, "flos": 18119283402240.0, "grad_norm": 2.112936973999632, "language_loss": 0.81742299, "learning_rate": 1.8441786698433192e-06, "loss": 0.89464253, "num_input_tokens_seen": 193073120, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.10919189, "step": 8966, "time_per_iteration": 2.535233736038208 }, { "auxiliary_loss_clip": 0.06442654, "auxiliary_loss_mlp": 0.01268286, "balance_loss_clip": 0.06287062, "balance_loss_mlp": 0.01257146, "epoch": 0.539125206673681, "flos": 17421798816000.0, "grad_norm": 1.894761811171788, "language_loss": 0.72411311, "learning_rate": 1.8437903949480706e-06, "loss": 0.80122256, "num_input_tokens_seen": 193090105, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.11132812, "step": 8967, "time_per_iteration": 2.5226452350616455 }, { "auxiliary_loss_clip": 0.0644119, "auxiliary_loss_mlp": 0.01266238, "balance_loss_clip": 0.06284171, "balance_loss_mlp": 0.01255741, "epoch": 0.539185329926349, "flos": 22205255575680.0, "grad_norm": 1.6112642682058405, "language_loss": 0.81832904, "learning_rate": 1.8434021259763065e-06, "loss": 0.89540327, "num_input_tokens_seen": 193109325, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.10498047, "step": 8968, "time_per_iteration": 2.5516140460968018 }, { "auxiliary_loss_clip": 0.06446909, "auxiliary_loss_mlp": 0.01269333, "balance_loss_clip": 0.06289149, "balance_loss_mlp": 0.01257252, "epoch": 0.539245453179017, "flos": 21440867904000.0, "grad_norm": 1.3504580401987794, "language_loss": 0.73865509, "learning_rate": 1.8430138629427484e-06, "loss": 0.81581748, "num_input_tokens_seen": 193130595, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.12091064, "step": 8969, "time_per_iteration": 2.556597948074341 }, { "auxiliary_loss_clip": 0.06448507, "auxiliary_loss_mlp": 0.01268255, "balance_loss_clip": 0.06284295, "balance_loss_mlp": 0.01256483, "epoch": 0.539305576431685, "flos": 20740322643840.0, "grad_norm": 1.6933560431662822, "language_loss": 0.82968241, "learning_rate": 1.8426256058621205e-06, "loss": 0.90684998, "num_input_tokens_seen": 193148930, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.11767578, "step": 8970, "time_per_iteration": 2.5283589363098145 }, { "auxiliary_loss_clip": 0.06439879, "auxiliary_loss_mlp": 0.0126715, "balance_loss_clip": 0.06285316, "balance_loss_mlp": 0.01256641, "epoch": 0.5393656996843529, "flos": 30928467934080.0, "grad_norm": 1.3302281954241115, "language_loss": 0.75631809, "learning_rate": 1.842237354749146e-06, "loss": 0.83338839, "num_input_tokens_seen": 193170140, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.10510254, "step": 8971, "time_per_iteration": 2.6240124702453613 }, { "auxiliary_loss_clip": 0.06339505, "auxiliary_loss_mlp": 0.01257345, "balance_loss_clip": 0.06276988, "balance_loss_mlp": 0.01255379, "epoch": 0.5394258229370209, "flos": 50332953260160.0, "grad_norm": 0.8910836850101418, "language_loss": 0.60427058, "learning_rate": 1.8418491096185465e-06, "loss": 0.68023896, "num_input_tokens_seen": 193227235, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.01963806, "step": 8972, "time_per_iteration": 3.182720184326172 }, { "auxiliary_loss_clip": 0.06443761, "auxiliary_loss_mlp": 0.01273864, "balance_loss_clip": 0.0628653, "balance_loss_mlp": 0.01262533, "epoch": 0.5394859461896888, "flos": 25419169180800.0, "grad_norm": 1.59178159769705, "language_loss": 0.78611195, "learning_rate": 1.841460870485045e-06, "loss": 0.86328822, "num_input_tokens_seen": 193248435, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.11328125, "step": 8973, "time_per_iteration": 2.578068494796753 }, { "auxiliary_loss_clip": 0.06452198, "auxiliary_loss_mlp": 0.01269902, "balance_loss_clip": 0.06288075, "balance_loss_mlp": 0.01257194, "epoch": 0.5395460694423568, "flos": 25484646746880.0, "grad_norm": 1.8448921827246751, "language_loss": 0.74156195, "learning_rate": 1.8410726373633623e-06, "loss": 0.81878299, "num_input_tokens_seen": 193267490, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.12695312, "step": 8974, "time_per_iteration": 2.594120502471924 }, { "auxiliary_loss_clip": 0.06331803, "auxiliary_loss_mlp": 0.01256702, "balance_loss_clip": 0.06269135, "balance_loss_mlp": 0.01254812, "epoch": 0.5396061926950249, "flos": 53267305317120.0, "grad_norm": 0.7127480981895724, "language_loss": 0.51064897, "learning_rate": 1.8406844102682215e-06, "loss": 0.58653396, "num_input_tokens_seen": 193326050, "router_z_loss_clip": 0.62695312, "router_z_loss_mlp": 0.01885986, "step": 8975, "time_per_iteration": 3.18730092048645 }, { "auxiliary_loss_clip": 0.06436414, "auxiliary_loss_mlp": 0.012648, "balance_loss_clip": 0.0628054, "balance_loss_mlp": 0.01253058, "epoch": 0.5396663159476928, "flos": 26732476949760.0, "grad_norm": 2.0247329178269093, "language_loss": 0.72242135, "learning_rate": 1.840296189214344e-06, "loss": 0.79943347, "num_input_tokens_seen": 193348785, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.11737061, "step": 8976, "time_per_iteration": 2.646805763244629 }, { "auxiliary_loss_clip": 0.06439485, "auxiliary_loss_mlp": 0.01268282, "balance_loss_clip": 0.0628369, "balance_loss_mlp": 0.01257416, "epoch": 0.5397264392003608, "flos": 23259267285120.0, "grad_norm": 1.834002735137468, "language_loss": 0.70384324, "learning_rate": 1.8399079742164509e-06, "loss": 0.78092086, "num_input_tokens_seen": 193367080, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.10864258, "step": 8977, "time_per_iteration": 2.569225311279297 }, { "auxiliary_loss_clip": 0.06444272, "auxiliary_loss_mlp": 0.01264626, "balance_loss_clip": 0.06284354, "balance_loss_mlp": 0.01253939, "epoch": 0.5397865624530287, "flos": 18299727169920.0, "grad_norm": 1.6135304631923013, "language_loss": 0.7290383, "learning_rate": 1.8395197652892636e-06, "loss": 0.80612731, "num_input_tokens_seen": 193383715, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.10693359, "step": 8978, "time_per_iteration": 2.5487093925476074 }, { "auxiliary_loss_clip": 0.06443287, "auxiliary_loss_mlp": 0.01272402, "balance_loss_clip": 0.0627846, "balance_loss_mlp": 0.01259617, "epoch": 0.5398466857056967, "flos": 15301742336640.0, "grad_norm": 1.9090301551668227, "language_loss": 0.74697101, "learning_rate": 1.8391315624475028e-06, "loss": 0.82412791, "num_input_tokens_seen": 193400560, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.12792969, "step": 8979, "time_per_iteration": 4.0227210521698 }, { "auxiliary_loss_clip": 0.0644673, "auxiliary_loss_mlp": 0.01270473, "balance_loss_clip": 0.06283452, "balance_loss_mlp": 0.01257938, "epoch": 0.5399068089583646, "flos": 17827521085440.0, "grad_norm": 2.1559557202489428, "language_loss": 0.76879096, "learning_rate": 1.8387433657058892e-06, "loss": 0.845963, "num_input_tokens_seen": 193418680, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.12536621, "step": 8980, "time_per_iteration": 2.5399892330169678 }, { "auxiliary_loss_clip": 0.06442393, "auxiliary_loss_mlp": 0.01269969, "balance_loss_clip": 0.06281817, "balance_loss_mlp": 0.01258846, "epoch": 0.5399669322110326, "flos": 27389109870720.0, "grad_norm": 1.6349611549012832, "language_loss": 0.82030559, "learning_rate": 1.8383551750791431e-06, "loss": 0.89742923, "num_input_tokens_seen": 193439310, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.11126709, "step": 8981, "time_per_iteration": 4.002985954284668 }, { "auxiliary_loss_clip": 0.06441593, "auxiliary_loss_mlp": 0.01269027, "balance_loss_clip": 0.06281696, "balance_loss_mlp": 0.01257911, "epoch": 0.5400270554637006, "flos": 20455394434560.0, "grad_norm": 2.233395006178791, "language_loss": 0.66979623, "learning_rate": 1.8379669905819857e-06, "loss": 0.74690247, "num_input_tokens_seen": 193458115, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.11108398, "step": 8982, "time_per_iteration": 2.5518691539764404 }, { "auxiliary_loss_clip": 0.06441487, "auxiliary_loss_mlp": 0.01271489, "balance_loss_clip": 0.06284042, "balance_loss_mlp": 0.01260432, "epoch": 0.5400871787163686, "flos": 21696055113600.0, "grad_norm": 1.487494540278059, "language_loss": 0.82905227, "learning_rate": 1.8375788122291358e-06, "loss": 0.90618205, "num_input_tokens_seen": 193477365, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.1105957, "step": 8983, "time_per_iteration": 2.551455020904541 }, { "auxiliary_loss_clip": 0.06433797, "auxiliary_loss_mlp": 0.01266644, "balance_loss_clip": 0.06278317, "balance_loss_mlp": 0.01255182, "epoch": 0.5401473019690365, "flos": 19210163708160.0, "grad_norm": 1.9483414243378838, "language_loss": 0.71188915, "learning_rate": 1.8371906400353138e-06, "loss": 0.78889352, "num_input_tokens_seen": 193495595, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.11474609, "step": 8984, "time_per_iteration": 2.5110578536987305 }, { "auxiliary_loss_clip": 0.06447258, "auxiliary_loss_mlp": 0.01268167, "balance_loss_clip": 0.06284839, "balance_loss_mlp": 0.01256086, "epoch": 0.5402074252217045, "flos": 20632987163520.0, "grad_norm": 1.874308612155788, "language_loss": 0.80412018, "learning_rate": 1.8368024740152386e-06, "loss": 0.8812744, "num_input_tokens_seen": 193514035, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.12078857, "step": 8985, "time_per_iteration": 2.5595076084136963 }, { "auxiliary_loss_clip": 0.06428713, "auxiliary_loss_mlp": 0.01267532, "balance_loss_clip": 0.06278068, "balance_loss_mlp": 0.01256583, "epoch": 0.5402675484743724, "flos": 24980519456640.0, "grad_norm": 1.4995228542205767, "language_loss": 0.79182649, "learning_rate": 1.83641431418363e-06, "loss": 0.86878896, "num_input_tokens_seen": 193535445, "router_z_loss_clip": 1.50488281, "router_z_loss_mlp": 0.10949707, "step": 8986, "time_per_iteration": 2.6151082515716553 }, { "auxiliary_loss_clip": 0.06436734, "auxiliary_loss_mlp": 0.01269154, "balance_loss_clip": 0.06279325, "balance_loss_mlp": 0.01258891, "epoch": 0.5403276717270404, "flos": 19464302741760.0, "grad_norm": 1.5122939391359445, "language_loss": 0.77019614, "learning_rate": 1.8360261605552075e-06, "loss": 0.84725511, "num_input_tokens_seen": 193554780, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.10266113, "step": 8987, "time_per_iteration": 2.5463719367980957 }, { "auxiliary_loss_clip": 0.06433187, "auxiliary_loss_mlp": 0.01266303, "balance_loss_clip": 0.06275854, "balance_loss_mlp": 0.01255312, "epoch": 0.5403877949797083, "flos": 18448040096640.0, "grad_norm": 1.8867066895202897, "language_loss": 0.71389985, "learning_rate": 1.8356380131446887e-06, "loss": 0.79089469, "num_input_tokens_seen": 193573580, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.10998535, "step": 8988, "time_per_iteration": 2.5232863426208496 }, { "auxiliary_loss_clip": 0.06441326, "auxiliary_loss_mlp": 0.01266471, "balance_loss_clip": 0.06283108, "balance_loss_mlp": 0.01255159, "epoch": 0.5404479182323764, "flos": 28300343022720.0, "grad_norm": 2.5007844304525797, "language_loss": 0.6797694, "learning_rate": 1.8352498719667934e-06, "loss": 0.75684738, "num_input_tokens_seen": 193590490, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.11315918, "step": 8989, "time_per_iteration": 2.600774049758911 }, { "auxiliary_loss_clip": 0.06434252, "auxiliary_loss_mlp": 0.01268175, "balance_loss_clip": 0.06275617, "balance_loss_mlp": 0.01256159, "epoch": 0.5405080414850444, "flos": 23373981924480.0, "grad_norm": 1.4614206134542838, "language_loss": 0.78386545, "learning_rate": 1.8348617370362399e-06, "loss": 0.86088973, "num_input_tokens_seen": 193609900, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.12017822, "step": 8990, "time_per_iteration": 2.575486421585083 }, { "auxiliary_loss_clip": 0.06429551, "auxiliary_loss_mlp": 0.01262083, "balance_loss_clip": 0.06275317, "balance_loss_mlp": 0.01251849, "epoch": 0.5405681647377123, "flos": 21112907823360.0, "grad_norm": 1.6815061166181209, "language_loss": 0.69413036, "learning_rate": 1.834473608367745e-06, "loss": 0.7710467, "num_input_tokens_seen": 193629775, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.10229492, "step": 8991, "time_per_iteration": 3.9486191272735596 }, { "auxiliary_loss_clip": 0.0643484, "auxiliary_loss_mlp": 0.01267466, "balance_loss_clip": 0.06277721, "balance_loss_mlp": 0.01256058, "epoch": 0.5406282879903803, "flos": 20455478288640.0, "grad_norm": 1.7260699063058293, "language_loss": 0.76234025, "learning_rate": 1.8340854859760277e-06, "loss": 0.83936328, "num_input_tokens_seen": 193648070, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.11419678, "step": 8992, "time_per_iteration": 3.9740283489227295 }, { "auxiliary_loss_clip": 0.06438939, "auxiliary_loss_mlp": 0.012647, "balance_loss_clip": 0.06278069, "balance_loss_mlp": 0.01253715, "epoch": 0.5406884112430482, "flos": 14214635464320.0, "grad_norm": 2.4838345662382957, "language_loss": 0.76521397, "learning_rate": 1.8336973698758056e-06, "loss": 0.84225035, "num_input_tokens_seen": 193665060, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.10992432, "step": 8993, "time_per_iteration": 2.5352210998535156 }, { "auxiliary_loss_clip": 0.06429337, "auxiliary_loss_mlp": 0.01264973, "balance_loss_clip": 0.06275247, "balance_loss_mlp": 0.01254471, "epoch": 0.5407485344957162, "flos": 23881882648320.0, "grad_norm": 1.557300600056381, "language_loss": 0.70354146, "learning_rate": 1.8333092600817959e-06, "loss": 0.78048456, "num_input_tokens_seen": 193683620, "router_z_loss_clip": 1.54199219, "router_z_loss_mlp": 0.1050415, "step": 8994, "time_per_iteration": 2.556788444519043 }, { "auxiliary_loss_clip": 0.06434876, "auxiliary_loss_mlp": 0.01266199, "balance_loss_clip": 0.06274904, "balance_loss_mlp": 0.01254987, "epoch": 0.5408086577483842, "flos": 23155118259840.0, "grad_norm": 1.7963307740500198, "language_loss": 0.75497371, "learning_rate": 1.8329211566087157e-06, "loss": 0.83198446, "num_input_tokens_seen": 193702990, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.11212158, "step": 8995, "time_per_iteration": 2.547489881515503 }, { "auxiliary_loss_clip": 0.06429686, "auxiliary_loss_mlp": 0.01270832, "balance_loss_clip": 0.06277472, "balance_loss_mlp": 0.01260067, "epoch": 0.5408687810010522, "flos": 18777090280320.0, "grad_norm": 1.8498115892162632, "language_loss": 0.73878586, "learning_rate": 1.832533059471282e-06, "loss": 0.81579107, "num_input_tokens_seen": 193721785, "router_z_loss_clip": 1.52246094, "router_z_loss_mlp": 0.10772705, "step": 8996, "time_per_iteration": 2.5384140014648438 }, { "auxiliary_loss_clip": 0.06432146, "auxiliary_loss_mlp": 0.01268149, "balance_loss_clip": 0.06280504, "balance_loss_mlp": 0.01256496, "epoch": 0.5409289042537201, "flos": 13886717310720.0, "grad_norm": 3.256329072985926, "language_loss": 0.73653495, "learning_rate": 1.8321449686842115e-06, "loss": 0.8135379, "num_input_tokens_seen": 193740315, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.11657715, "step": 8997, "time_per_iteration": 2.545383930206299 }, { "auxiliary_loss_clip": 0.06437385, "auxiliary_loss_mlp": 0.01267211, "balance_loss_clip": 0.06279342, "balance_loss_mlp": 0.01256471, "epoch": 0.5409890275063881, "flos": 14470619287680.0, "grad_norm": 2.350900894780079, "language_loss": 0.72287989, "learning_rate": 1.8317568842622207e-06, "loss": 0.79992586, "num_input_tokens_seen": 193757580, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.10736084, "step": 8998, "time_per_iteration": 2.522573947906494 }, { "auxiliary_loss_clip": 0.06432329, "auxiliary_loss_mlp": 0.01269289, "balance_loss_clip": 0.06276792, "balance_loss_mlp": 0.01258733, "epoch": 0.541049150759056, "flos": 48987906721920.0, "grad_norm": 1.882500048815451, "language_loss": 0.70614612, "learning_rate": 1.8313688062200256e-06, "loss": 0.7831623, "num_input_tokens_seen": 193780965, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.10552979, "step": 8999, "time_per_iteration": 2.768460273742676 }, { "auxiliary_loss_clip": 0.06432563, "auxiliary_loss_mlp": 0.01268003, "balance_loss_clip": 0.06279424, "balance_loss_mlp": 0.01256339, "epoch": 0.541109274011724, "flos": 18153007470720.0, "grad_norm": 2.162392145925777, "language_loss": 0.81078291, "learning_rate": 1.8309807345723422e-06, "loss": 0.88778853, "num_input_tokens_seen": 193797855, "router_z_loss_clip": 1.53027344, "router_z_loss_mlp": 0.11651611, "step": 9000, "time_per_iteration": 2.528306007385254 }, { "auxiliary_loss_clip": 0.06432504, "auxiliary_loss_mlp": 0.01275292, "balance_loss_clip": 0.06277888, "balance_loss_mlp": 0.01264367, "epoch": 0.541169397264392, "flos": 20528921992320.0, "grad_norm": 1.5361766657078146, "language_loss": 0.73524594, "learning_rate": 1.8305926693338863e-06, "loss": 0.81232387, "num_input_tokens_seen": 193817375, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.10919189, "step": 9001, "time_per_iteration": 2.562926769256592 }, { "auxiliary_loss_clip": 0.06439554, "auxiliary_loss_mlp": 0.01272715, "balance_loss_clip": 0.06277575, "balance_loss_mlp": 0.01261253, "epoch": 0.54122952051706, "flos": 20049630238080.0, "grad_norm": 2.183950242983341, "language_loss": 0.84775817, "learning_rate": 1.8302046105193734e-06, "loss": 0.92488086, "num_input_tokens_seen": 193832205, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.11468506, "step": 9002, "time_per_iteration": 2.515430212020874 }, { "auxiliary_loss_clip": 0.06432312, "auxiliary_loss_mlp": 0.01269803, "balance_loss_clip": 0.06279884, "balance_loss_mlp": 0.01259837, "epoch": 0.541289643769728, "flos": 19068223691520.0, "grad_norm": 2.3715647387909757, "language_loss": 0.78404868, "learning_rate": 1.8298165581435183e-06, "loss": 0.8610698, "num_input_tokens_seen": 193849830, "router_z_loss_clip": 1.52441406, "router_z_loss_mlp": 0.09973145, "step": 9003, "time_per_iteration": 2.6847808361053467 }, { "auxiliary_loss_clip": 0.06434113, "auxiliary_loss_mlp": 0.0126838, "balance_loss_clip": 0.06279436, "balance_loss_mlp": 0.01256984, "epoch": 0.5413497670223959, "flos": 22388801944320.0, "grad_norm": 1.8479775908897775, "language_loss": 0.69412804, "learning_rate": 1.8294285122210372e-06, "loss": 0.77115297, "num_input_tokens_seen": 193869945, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.1138916, "step": 9004, "time_per_iteration": 2.562178373336792 }, { "auxiliary_loss_clip": 0.06331947, "auxiliary_loss_mlp": 0.0127054, "balance_loss_clip": 0.06267496, "balance_loss_mlp": 0.01268945, "epoch": 0.5414098902750639, "flos": 70052149722240.0, "grad_norm": 0.9045047064752917, "language_loss": 0.58717507, "learning_rate": 1.8290404727666434e-06, "loss": 0.6631999, "num_input_tokens_seen": 193930860, "router_z_loss_clip": 0.64550781, "router_z_loss_mlp": 0.01596069, "step": 9005, "time_per_iteration": 3.2981176376342773 }, { "auxiliary_loss_clip": 0.06441602, "auxiliary_loss_mlp": 0.0126773, "balance_loss_clip": 0.06281279, "balance_loss_mlp": 0.01257401, "epoch": 0.5414700135277318, "flos": 21805445018880.0, "grad_norm": 1.6513863605734402, "language_loss": 0.78587741, "learning_rate": 1.8286524397950517e-06, "loss": 0.86297077, "num_input_tokens_seen": 193949075, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.10321045, "step": 9006, "time_per_iteration": 2.571941375732422 }, { "auxiliary_loss_clip": 0.06434578, "auxiliary_loss_mlp": 0.01270398, "balance_loss_clip": 0.0628141, "balance_loss_mlp": 0.01260521, "epoch": 0.5415301367803999, "flos": 16913269186560.0, "grad_norm": 1.5587866326221167, "language_loss": 0.83428812, "learning_rate": 1.8282644133209777e-06, "loss": 0.91133785, "num_input_tokens_seen": 193967630, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.09875488, "step": 9007, "time_per_iteration": 2.7104811668395996 }, { "auxiliary_loss_clip": 0.06438062, "auxiliary_loss_mlp": 0.01272001, "balance_loss_clip": 0.0628162, "balance_loss_mlp": 0.0126014, "epoch": 0.5415902600330678, "flos": 25711518476160.0, "grad_norm": 2.2075322574835052, "language_loss": 0.6740303, "learning_rate": 1.8278763933591334e-06, "loss": 0.75113094, "num_input_tokens_seen": 193988730, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.11859131, "step": 9008, "time_per_iteration": 2.6924057006835938 }, { "auxiliary_loss_clip": 0.06443763, "auxiliary_loss_mlp": 0.01272383, "balance_loss_clip": 0.06282526, "balance_loss_mlp": 0.01261029, "epoch": 0.5416503832857358, "flos": 19214146776960.0, "grad_norm": 2.547110021454785, "language_loss": 0.74353409, "learning_rate": 1.827488379924234e-06, "loss": 0.82069552, "num_input_tokens_seen": 194005160, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.11364746, "step": 9009, "time_per_iteration": 2.5893547534942627 }, { "auxiliary_loss_clip": 0.06443439, "auxiliary_loss_mlp": 0.01265897, "balance_loss_clip": 0.06283563, "balance_loss_mlp": 0.01254376, "epoch": 0.5417105065384037, "flos": 12718619867520.0, "grad_norm": 2.0081032456204286, "language_loss": 0.87685341, "learning_rate": 1.8271003730309923e-06, "loss": 0.95394677, "num_input_tokens_seen": 194021700, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.11529541, "step": 9010, "time_per_iteration": 2.5568368434906006 }, { "auxiliary_loss_clip": 0.06434663, "auxiliary_loss_mlp": 0.01269443, "balance_loss_clip": 0.06278922, "balance_loss_mlp": 0.01258386, "epoch": 0.5417706297910717, "flos": 30343727416320.0, "grad_norm": 1.9556353063873437, "language_loss": 0.65327126, "learning_rate": 1.826712372694122e-06, "loss": 0.73031229, "num_input_tokens_seen": 194042620, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.11053467, "step": 9011, "time_per_iteration": 2.611306667327881 }, { "auxiliary_loss_clip": 0.06437778, "auxiliary_loss_mlp": 0.01270686, "balance_loss_clip": 0.06282274, "balance_loss_mlp": 0.01260052, "epoch": 0.5418307530437396, "flos": 29028323295360.0, "grad_norm": 2.9209946131953926, "language_loss": 0.79612094, "learning_rate": 1.8263243789283362e-06, "loss": 0.87320554, "num_input_tokens_seen": 194061800, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.10632324, "step": 9012, "time_per_iteration": 2.6301825046539307 }, { "auxiliary_loss_clip": 0.06440938, "auxiliary_loss_mlp": 0.01268638, "balance_loss_clip": 0.06282153, "balance_loss_mlp": 0.01257874, "epoch": 0.5418908762964076, "flos": 16879125847680.0, "grad_norm": 1.9885810499673062, "language_loss": 0.74493128, "learning_rate": 1.8259363917483466e-06, "loss": 0.82202697, "num_input_tokens_seen": 194079890, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.10766602, "step": 9013, "time_per_iteration": 2.5311946868896484 }, { "auxiliary_loss_clip": 0.0644023, "auxiliary_loss_mlp": 0.01267937, "balance_loss_clip": 0.06279536, "balance_loss_mlp": 0.01256773, "epoch": 0.5419509995490756, "flos": 18955144206720.0, "grad_norm": 3.345674932264391, "language_loss": 0.72317284, "learning_rate": 1.8255484111688667e-06, "loss": 0.80025458, "num_input_tokens_seen": 194097625, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.11151123, "step": 9014, "time_per_iteration": 2.576601505279541 }, { "auxiliary_loss_clip": 0.06436034, "auxiliary_loss_mlp": 0.01266194, "balance_loss_clip": 0.06280464, "balance_loss_mlp": 0.01255423, "epoch": 0.5420111228017436, "flos": 18083630689920.0, "grad_norm": 1.6576545352231657, "language_loss": 0.80574596, "learning_rate": 1.8251604372046085e-06, "loss": 0.88276821, "num_input_tokens_seen": 194116055, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.10772705, "step": 9015, "time_per_iteration": 2.518136978149414 }, { "auxiliary_loss_clip": 0.06445958, "auxiliary_loss_mlp": 0.01267835, "balance_loss_clip": 0.06285983, "balance_loss_mlp": 0.01256689, "epoch": 0.5420712460544116, "flos": 19067678640000.0, "grad_norm": 2.3218108384538274, "language_loss": 0.82146084, "learning_rate": 1.8247724698702843e-06, "loss": 0.89859879, "num_input_tokens_seen": 194130365, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.1114502, "step": 9016, "time_per_iteration": 2.5304861068725586 }, { "auxiliary_loss_clip": 0.06439985, "auxiliary_loss_mlp": 0.01275541, "balance_loss_clip": 0.06283026, "balance_loss_mlp": 0.01264813, "epoch": 0.5421313693070795, "flos": 18193020595200.0, "grad_norm": 1.7232416555515395, "language_loss": 0.81626689, "learning_rate": 1.8243845091806053e-06, "loss": 0.89342225, "num_input_tokens_seen": 194148975, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.1072998, "step": 9017, "time_per_iteration": 2.5350897312164307 }, { "auxiliary_loss_clip": 0.06434109, "auxiliary_loss_mlp": 0.01270226, "balance_loss_clip": 0.06281117, "balance_loss_mlp": 0.01259336, "epoch": 0.5421914925597475, "flos": 13010969162880.0, "grad_norm": 2.53718714438784, "language_loss": 0.77999002, "learning_rate": 1.8239965551502837e-06, "loss": 0.85703337, "num_input_tokens_seen": 194167185, "router_z_loss_clip": 1.53027344, "router_z_loss_mlp": 0.10900879, "step": 9018, "time_per_iteration": 2.5300676822662354 }, { "auxiliary_loss_clip": 0.06443432, "auxiliary_loss_mlp": 0.0126567, "balance_loss_clip": 0.0628214, "balance_loss_mlp": 0.01254864, "epoch": 0.5422516158124154, "flos": 46769654856960.0, "grad_norm": 1.4076616492216407, "language_loss": 0.67048025, "learning_rate": 1.8236086077940303e-06, "loss": 0.74757129, "num_input_tokens_seen": 194192840, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.10809326, "step": 9019, "time_per_iteration": 4.34434175491333 }, { "auxiliary_loss_clip": 0.06431007, "auxiliary_loss_mlp": 0.01268135, "balance_loss_clip": 0.0627936, "balance_loss_mlp": 0.0125858, "epoch": 0.5423117390650835, "flos": 31766634725760.0, "grad_norm": 2.1306416685284746, "language_loss": 0.69606006, "learning_rate": 1.8232206671265555e-06, "loss": 0.7730515, "num_input_tokens_seen": 194213150, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.09558105, "step": 9020, "time_per_iteration": 2.681438446044922 }, { "auxiliary_loss_clip": 0.06434081, "auxiliary_loss_mlp": 0.01271164, "balance_loss_clip": 0.06281343, "balance_loss_mlp": 0.01260442, "epoch": 0.5423718623177514, "flos": 27209881987200.0, "grad_norm": 1.8553771486389092, "language_loss": 0.80553436, "learning_rate": 1.8228327331625717e-06, "loss": 0.88258684, "num_input_tokens_seen": 194234665, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.1071167, "step": 9021, "time_per_iteration": 4.049262285232544 }, { "auxiliary_loss_clip": 0.06440829, "auxiliary_loss_mlp": 0.01267861, "balance_loss_clip": 0.06286092, "balance_loss_mlp": 0.01257323, "epoch": 0.5424319855704194, "flos": 23552580902400.0, "grad_norm": 1.756205847567458, "language_loss": 0.79064423, "learning_rate": 1.822444805916788e-06, "loss": 0.86773115, "num_input_tokens_seen": 194253790, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.10534668, "step": 9022, "time_per_iteration": 2.611480951309204 }, { "auxiliary_loss_clip": 0.06435138, "auxiliary_loss_mlp": 0.01268142, "balance_loss_clip": 0.0627915, "balance_loss_mlp": 0.01257222, "epoch": 0.5424921088230873, "flos": 26623003190400.0, "grad_norm": 1.687208420477766, "language_loss": 0.82751286, "learning_rate": 1.822056885403915e-06, "loss": 0.90454572, "num_input_tokens_seen": 194274950, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.10919189, "step": 9023, "time_per_iteration": 2.602620840072632 }, { "auxiliary_loss_clip": 0.06439427, "auxiliary_loss_mlp": 0.01271012, "balance_loss_clip": 0.06284451, "balance_loss_mlp": 0.01260718, "epoch": 0.5425522320757553, "flos": 23593600275840.0, "grad_norm": 1.538052596128266, "language_loss": 0.71157312, "learning_rate": 1.8216689716386627e-06, "loss": 0.78867751, "num_input_tokens_seen": 194296155, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.10290527, "step": 9024, "time_per_iteration": 2.5874078273773193 }, { "auxiliary_loss_clip": 0.06439277, "auxiliary_loss_mlp": 0.0126803, "balance_loss_clip": 0.06280787, "balance_loss_mlp": 0.01257123, "epoch": 0.5426123553284232, "flos": 30600256291200.0, "grad_norm": 1.649478342677362, "language_loss": 0.65397435, "learning_rate": 1.8212810646357405e-06, "loss": 0.73104745, "num_input_tokens_seen": 194318025, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.10913086, "step": 9025, "time_per_iteration": 2.6229817867279053 }, { "auxiliary_loss_clip": 0.06444312, "auxiliary_loss_mlp": 0.01270804, "balance_loss_clip": 0.06284451, "balance_loss_mlp": 0.01259849, "epoch": 0.5426724785810912, "flos": 12500049692160.0, "grad_norm": 1.8496389121553043, "language_loss": 0.73933494, "learning_rate": 1.8208931644098591e-06, "loss": 0.81648612, "num_input_tokens_seen": 194336150, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.10961914, "step": 9026, "time_per_iteration": 2.5421793460845947 }, { "auxiliary_loss_clip": 0.06438501, "auxiliary_loss_mlp": 0.01272047, "balance_loss_clip": 0.06280825, "balance_loss_mlp": 0.01259184, "epoch": 0.5427326018337592, "flos": 26071273981440.0, "grad_norm": 2.9789568046788086, "language_loss": 0.78819811, "learning_rate": 1.8205052709757265e-06, "loss": 0.86530358, "num_input_tokens_seen": 194355980, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.12854004, "step": 9027, "time_per_iteration": 2.5902390480041504 }, { "auxiliary_loss_clip": 0.06328396, "auxiliary_loss_mlp": 0.01252196, "balance_loss_clip": 0.06264791, "balance_loss_mlp": 0.01250644, "epoch": 0.5427927250864272, "flos": 66004974789120.0, "grad_norm": 0.7293949437411906, "language_loss": 0.56501979, "learning_rate": 1.8201173843480515e-06, "loss": 0.64082569, "num_input_tokens_seen": 194422660, "router_z_loss_clip": 0.63867188, "router_z_loss_mlp": 0.01551056, "step": 9028, "time_per_iteration": 3.2474732398986816 }, { "auxiliary_loss_clip": 0.06442509, "auxiliary_loss_mlp": 0.01273848, "balance_loss_clip": 0.06283275, "balance_loss_mlp": 0.01262135, "epoch": 0.5428528483390952, "flos": 19981678976640.0, "grad_norm": 2.3110176799554885, "language_loss": 0.77823961, "learning_rate": 1.8197295045415442e-06, "loss": 0.85540318, "num_input_tokens_seen": 194438545, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.11712646, "step": 9029, "time_per_iteration": 2.5516388416290283 }, { "auxiliary_loss_clip": 0.06436615, "auxiliary_loss_mlp": 0.01271227, "balance_loss_clip": 0.06282259, "balance_loss_mlp": 0.01259438, "epoch": 0.5429129715917631, "flos": 21838288619520.0, "grad_norm": 1.3716575152361437, "language_loss": 0.83454561, "learning_rate": 1.8193416315709112e-06, "loss": 0.91162401, "num_input_tokens_seen": 194458060, "router_z_loss_clip": 1.54101562, "router_z_loss_mlp": 0.11798096, "step": 9030, "time_per_iteration": 2.5994415283203125 }, { "auxiliary_loss_clip": 0.06436292, "auxiliary_loss_mlp": 0.01266882, "balance_loss_clip": 0.06282458, "balance_loss_mlp": 0.01255438, "epoch": 0.5429730948444311, "flos": 27790178238720.0, "grad_norm": 1.5386550646865533, "language_loss": 0.75736189, "learning_rate": 1.8189537654508623e-06, "loss": 0.83439362, "num_input_tokens_seen": 194477405, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.11437988, "step": 9031, "time_per_iteration": 4.090062379837036 }, { "auxiliary_loss_clip": 0.06433242, "auxiliary_loss_mlp": 0.01267518, "balance_loss_clip": 0.06282745, "balance_loss_mlp": 0.01257463, "epoch": 0.543033218097099, "flos": 26767668464640.0, "grad_norm": 1.6784277636841867, "language_loss": 0.85360974, "learning_rate": 1.8185659061961045e-06, "loss": 0.93061733, "num_input_tokens_seen": 194497085, "router_z_loss_clip": 1.50585938, "router_z_loss_mlp": 0.1005249, "step": 9032, "time_per_iteration": 2.578889846801758 }, { "auxiliary_loss_clip": 0.0644685, "auxiliary_loss_mlp": 0.01269567, "balance_loss_clip": 0.06284749, "balance_loss_mlp": 0.01258069, "epoch": 0.5430933413497671, "flos": 22681989780480.0, "grad_norm": 1.9300166776457985, "language_loss": 0.73967314, "learning_rate": 1.8181780538213457e-06, "loss": 0.81683731, "num_input_tokens_seen": 194516785, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.1149292, "step": 9033, "time_per_iteration": 2.567049264907837 }, { "auxiliary_loss_clip": 0.06436534, "auxiliary_loss_mlp": 0.01270341, "balance_loss_clip": 0.06281923, "balance_loss_mlp": 0.01258879, "epoch": 0.543153464602435, "flos": 24614307187200.0, "grad_norm": 1.6964768576012088, "language_loss": 0.76171017, "learning_rate": 1.8177902083412935e-06, "loss": 0.83877885, "num_input_tokens_seen": 194536475, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.11468506, "step": 9034, "time_per_iteration": 2.5796985626220703 }, { "auxiliary_loss_clip": 0.06440239, "auxiliary_loss_mlp": 0.01265332, "balance_loss_clip": 0.06285454, "balance_loss_mlp": 0.01254705, "epoch": 0.543213587855103, "flos": 19031690511360.0, "grad_norm": 2.0966885601330083, "language_loss": 0.84382641, "learning_rate": 1.817402369770655e-06, "loss": 0.92088211, "num_input_tokens_seen": 194554495, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.10632324, "step": 9035, "time_per_iteration": 2.5502829551696777 }, { "auxiliary_loss_clip": 0.0633193, "auxiliary_loss_mlp": 0.01251908, "balance_loss_clip": 0.06268923, "balance_loss_mlp": 0.01250253, "epoch": 0.5432737111077709, "flos": 65705539824000.0, "grad_norm": 0.7559775526853943, "language_loss": 0.55417186, "learning_rate": 1.8170145381241364e-06, "loss": 0.63001025, "num_input_tokens_seen": 194617620, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.0165863, "step": 9036, "time_per_iteration": 3.1464169025421143 }, { "auxiliary_loss_clip": 0.06441972, "auxiliary_loss_mlp": 0.01265356, "balance_loss_clip": 0.06281824, "balance_loss_mlp": 0.01253769, "epoch": 0.5433338343604389, "flos": 22098339365760.0, "grad_norm": 1.5349026256099574, "language_loss": 0.75336069, "learning_rate": 1.8166267134164451e-06, "loss": 0.83043396, "num_input_tokens_seen": 194637690, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.11584473, "step": 9037, "time_per_iteration": 2.5593101978302 }, { "auxiliary_loss_clip": 0.06441407, "auxiliary_loss_mlp": 0.01264527, "balance_loss_clip": 0.06285161, "balance_loss_mlp": 0.01253524, "epoch": 0.5433939576131068, "flos": 34680316752000.0, "grad_norm": 1.5607417855665595, "language_loss": 0.66647851, "learning_rate": 1.8162388956622875e-06, "loss": 0.74353784, "num_input_tokens_seen": 194659520, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.11004639, "step": 9038, "time_per_iteration": 2.678772211074829 }, { "auxiliary_loss_clip": 0.06440367, "auxiliary_loss_mlp": 0.01265599, "balance_loss_clip": 0.06283962, "balance_loss_mlp": 0.01255091, "epoch": 0.5434540808657748, "flos": 20309639057280.0, "grad_norm": 1.886311999993316, "language_loss": 0.78805071, "learning_rate": 1.8158510848763692e-06, "loss": 0.86511028, "num_input_tokens_seen": 194677645, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.10516357, "step": 9039, "time_per_iteration": 2.553823471069336 }, { "auxiliary_loss_clip": 0.06442918, "auxiliary_loss_mlp": 0.01267744, "balance_loss_clip": 0.06286195, "balance_loss_mlp": 0.01256825, "epoch": 0.5435142041184428, "flos": 23119549401600.0, "grad_norm": 1.6952009969147728, "language_loss": 0.76731867, "learning_rate": 1.8154632810733962e-06, "loss": 0.84442532, "num_input_tokens_seen": 194697400, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.10900879, "step": 9040, "time_per_iteration": 2.578855514526367 }, { "auxiliary_loss_clip": 0.06332045, "auxiliary_loss_mlp": 0.01250843, "balance_loss_clip": 0.06268846, "balance_loss_mlp": 0.01249196, "epoch": 0.5435743273711108, "flos": 64032350768640.0, "grad_norm": 0.6439824804396885, "language_loss": 0.52268028, "learning_rate": 1.815075484268074e-06, "loss": 0.59850913, "num_input_tokens_seen": 194761205, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.01649475, "step": 9041, "time_per_iteration": 3.187605619430542 }, { "auxiliary_loss_clip": 0.06438982, "auxiliary_loss_mlp": 0.01267043, "balance_loss_clip": 0.0628133, "balance_loss_mlp": 0.01256016, "epoch": 0.5436344506237788, "flos": 25125897490560.0, "grad_norm": 1.6718687775904253, "language_loss": 0.76459914, "learning_rate": 1.8146876944751078e-06, "loss": 0.84165943, "num_input_tokens_seen": 194782445, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.11035156, "step": 9042, "time_per_iteration": 2.5856821537017822 }, { "auxiliary_loss_clip": 0.06438712, "auxiliary_loss_mlp": 0.01267526, "balance_loss_clip": 0.06285638, "balance_loss_mlp": 0.0125675, "epoch": 0.5436945738764467, "flos": 19579017381120.0, "grad_norm": 1.5652410158160392, "language_loss": 0.67666036, "learning_rate": 1.8142999117092033e-06, "loss": 0.75372279, "num_input_tokens_seen": 194800325, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.10772705, "step": 9043, "time_per_iteration": 2.5602519512176514 }, { "auxiliary_loss_clip": 0.06438655, "auxiliary_loss_mlp": 0.01265411, "balance_loss_clip": 0.06284773, "balance_loss_mlp": 0.01254599, "epoch": 0.5437546971291147, "flos": 21148937879040.0, "grad_norm": 1.5546526521759378, "language_loss": 0.84704125, "learning_rate": 1.8139121359850644e-06, "loss": 0.92408192, "num_input_tokens_seen": 194818675, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.10803223, "step": 9044, "time_per_iteration": 2.5644378662109375 }, { "auxiliary_loss_clip": 0.06443815, "auxiliary_loss_mlp": 0.01268476, "balance_loss_clip": 0.0628242, "balance_loss_mlp": 0.01256871, "epoch": 0.5438148203817826, "flos": 25125645928320.0, "grad_norm": 1.555722083400461, "language_loss": 0.62549835, "learning_rate": 1.8135243673173956e-06, "loss": 0.70262128, "num_input_tokens_seen": 194836595, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.1161499, "step": 9045, "time_per_iteration": 2.570361852645874 }, { "auxiliary_loss_clip": 0.06446384, "auxiliary_loss_mlp": 0.01271154, "balance_loss_clip": 0.0628861, "balance_loss_mlp": 0.01260336, "epoch": 0.5438749436344507, "flos": 23009614444800.0, "grad_norm": 1.458720987170009, "language_loss": 0.70078272, "learning_rate": 1.8131366057209023e-06, "loss": 0.77795815, "num_input_tokens_seen": 194857520, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.1081543, "step": 9046, "time_per_iteration": 2.5993475914001465 }, { "auxiliary_loss_clip": 0.06437206, "auxiliary_loss_mlp": 0.01263477, "balance_loss_clip": 0.06284086, "balance_loss_mlp": 0.01252772, "epoch": 0.5439350668871186, "flos": 15492458229120.0, "grad_norm": 1.6741972913020682, "language_loss": 0.77788609, "learning_rate": 1.8127488512102868e-06, "loss": 0.85489291, "num_input_tokens_seen": 194876020, "router_z_loss_clip": 1.53027344, "router_z_loss_mlp": 0.1071167, "step": 9047, "time_per_iteration": 2.556169033050537 }, { "auxiliary_loss_clip": 0.06441984, "auxiliary_loss_mlp": 0.01269051, "balance_loss_clip": 0.06284972, "balance_loss_mlp": 0.01257661, "epoch": 0.5439951901397866, "flos": 17244164160000.0, "grad_norm": 1.5693518890637224, "language_loss": 0.73187935, "learning_rate": 1.8123611038002547e-06, "loss": 0.80898976, "num_input_tokens_seen": 194894650, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.11395264, "step": 9048, "time_per_iteration": 2.552950859069824 }, { "auxiliary_loss_clip": 0.06435651, "auxiliary_loss_mlp": 0.0127103, "balance_loss_clip": 0.06280644, "balance_loss_mlp": 0.01259538, "epoch": 0.5440553133924545, "flos": 18666945688320.0, "grad_norm": 2.3645176598001116, "language_loss": 0.93547642, "learning_rate": 1.8119733635055076e-06, "loss": 1.0125432, "num_input_tokens_seen": 194911935, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.11486816, "step": 9049, "time_per_iteration": 2.5316367149353027 }, { "auxiliary_loss_clip": 0.06436065, "auxiliary_loss_mlp": 0.01266122, "balance_loss_clip": 0.06281937, "balance_loss_mlp": 0.01255739, "epoch": 0.5441154366451225, "flos": 27129813811200.0, "grad_norm": 1.6985629161603815, "language_loss": 0.74035442, "learning_rate": 1.8115856303407492e-06, "loss": 0.81737632, "num_input_tokens_seen": 194931620, "router_z_loss_clip": 1.54003906, "router_z_loss_mlp": 0.10394287, "step": 9050, "time_per_iteration": 2.6227309703826904 }, { "auxiliary_loss_clip": 0.06441112, "auxiliary_loss_mlp": 0.01268641, "balance_loss_clip": 0.06283584, "balance_loss_mlp": 0.01257662, "epoch": 0.5441755598977904, "flos": 26000890951680.0, "grad_norm": 6.101994810953467, "language_loss": 0.67292404, "learning_rate": 1.8111979043206832e-06, "loss": 0.75002146, "num_input_tokens_seen": 194952560, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.10986328, "step": 9051, "time_per_iteration": 2.7495343685150146 }, { "auxiliary_loss_clip": 0.06436623, "auxiliary_loss_mlp": 0.01266524, "balance_loss_clip": 0.06281403, "balance_loss_mlp": 0.01255306, "epoch": 0.5442356831504584, "flos": 32388327694080.0, "grad_norm": 1.5988295586168157, "language_loss": 0.67272794, "learning_rate": 1.810810185460011e-06, "loss": 0.74975944, "num_input_tokens_seen": 194973915, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.11224365, "step": 9052, "time_per_iteration": 2.714451789855957 }, { "auxiliary_loss_clip": 0.06440598, "auxiliary_loss_mlp": 0.01267713, "balance_loss_clip": 0.06282864, "balance_loss_mlp": 0.01255416, "epoch": 0.5442958064031264, "flos": 24170123093760.0, "grad_norm": 1.735914981963894, "language_loss": 0.93325371, "learning_rate": 1.810422473773436e-06, "loss": 1.01033676, "num_input_tokens_seen": 194990170, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.12286377, "step": 9053, "time_per_iteration": 2.5758564472198486 }, { "auxiliary_loss_clip": 0.06445327, "auxiliary_loss_mlp": 0.01271262, "balance_loss_clip": 0.0628705, "balance_loss_mlp": 0.01259621, "epoch": 0.5443559296557944, "flos": 18769669194240.0, "grad_norm": 4.29611086853814, "language_loss": 0.84095526, "learning_rate": 1.8100347692756595e-06, "loss": 0.9181211, "num_input_tokens_seen": 195006395, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.11627197, "step": 9054, "time_per_iteration": 2.5456900596618652 }, { "auxiliary_loss_clip": 0.06441092, "auxiliary_loss_mlp": 0.01272137, "balance_loss_clip": 0.06284092, "balance_loss_mlp": 0.01259859, "epoch": 0.5444160529084624, "flos": 22638245149440.0, "grad_norm": 2.7427859140848034, "language_loss": 0.68810427, "learning_rate": 1.8096470719813836e-06, "loss": 0.7652365, "num_input_tokens_seen": 195025080, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.1227417, "step": 9055, "time_per_iteration": 2.6831820011138916 }, { "auxiliary_loss_clip": 0.06330849, "auxiliary_loss_mlp": 0.0125874, "balance_loss_clip": 0.06267636, "balance_loss_mlp": 0.01257188, "epoch": 0.5444761761611303, "flos": 69693106976640.0, "grad_norm": 0.7265206736369625, "language_loss": 0.57577968, "learning_rate": 1.80925938190531e-06, "loss": 0.65167558, "num_input_tokens_seen": 195085725, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.01551819, "step": 9056, "time_per_iteration": 3.2309582233428955 }, { "auxiliary_loss_clip": 0.0644545, "auxiliary_loss_mlp": 0.01270879, "balance_loss_clip": 0.06287261, "balance_loss_mlp": 0.01259339, "epoch": 0.5445362994137983, "flos": 14282922142080.0, "grad_norm": 2.405065615659981, "language_loss": 0.70015597, "learning_rate": 1.8088716990621395e-06, "loss": 0.77731925, "num_input_tokens_seen": 195102585, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.11547852, "step": 9057, "time_per_iteration": 2.563433885574341 }, { "auxiliary_loss_clip": 0.06450135, "auxiliary_loss_mlp": 0.01270332, "balance_loss_clip": 0.06297082, "balance_loss_mlp": 0.0125856, "epoch": 0.5445964226664662, "flos": 28993802613120.0, "grad_norm": 1.795218886181079, "language_loss": 0.7525537, "learning_rate": 1.8084840234665738e-06, "loss": 0.82975835, "num_input_tokens_seen": 195120055, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.11773682, "step": 9058, "time_per_iteration": 4.03040623664856 }, { "auxiliary_loss_clip": 0.06342221, "auxiliary_loss_mlp": 0.01259231, "balance_loss_clip": 0.06279333, "balance_loss_mlp": 0.01257447, "epoch": 0.5446565459191343, "flos": 68642323649280.0, "grad_norm": 0.7742877029227635, "language_loss": 0.6269275, "learning_rate": 1.808096355133312e-06, "loss": 0.70294201, "num_input_tokens_seen": 195181045, "router_z_loss_clip": 0.63183594, "router_z_loss_mlp": 0.01782227, "step": 9059, "time_per_iteration": 3.2922539710998535 }, { "auxiliary_loss_clip": 0.06438483, "auxiliary_loss_mlp": 0.01266345, "balance_loss_clip": 0.06285231, "balance_loss_mlp": 0.01254644, "epoch": 0.5447166691718022, "flos": 16221989802240.0, "grad_norm": 6.2921803324715935, "language_loss": 0.79813689, "learning_rate": 1.8077086940770572e-06, "loss": 0.87518519, "num_input_tokens_seen": 195198840, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.11706543, "step": 9060, "time_per_iteration": 2.536672830581665 }, { "auxiliary_loss_clip": 0.06443699, "auxiliary_loss_mlp": 0.01266212, "balance_loss_clip": 0.06287387, "balance_loss_mlp": 0.01254655, "epoch": 0.5447767924244702, "flos": 25856225677440.0, "grad_norm": 1.5898143566159537, "language_loss": 0.80048871, "learning_rate": 1.8073210403125072e-06, "loss": 0.8775878, "num_input_tokens_seen": 195218720, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.11541748, "step": 9061, "time_per_iteration": 4.0632853507995605 }, { "auxiliary_loss_clip": 0.0644507, "auxiliary_loss_mlp": 0.01265438, "balance_loss_clip": 0.06291227, "balance_loss_mlp": 0.01254601, "epoch": 0.5448369156771381, "flos": 19682998698240.0, "grad_norm": 1.6638151106372026, "language_loss": 0.86911982, "learning_rate": 1.8069333938543627e-06, "loss": 0.94622493, "num_input_tokens_seen": 195235770, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.10839844, "step": 9062, "time_per_iteration": 2.5430562496185303 }, { "auxiliary_loss_clip": 0.06442706, "auxiliary_loss_mlp": 0.01267772, "balance_loss_clip": 0.06283073, "balance_loss_mlp": 0.01256203, "epoch": 0.5448970389298061, "flos": 19287925896960.0, "grad_norm": 1.9577151085379014, "language_loss": 0.82812625, "learning_rate": 1.8065457547173233e-06, "loss": 0.90523106, "num_input_tokens_seen": 195254870, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.11566162, "step": 9063, "time_per_iteration": 2.5480940341949463 }, { "auxiliary_loss_clip": 0.06441818, "auxiliary_loss_mlp": 0.01271982, "balance_loss_clip": 0.06283701, "balance_loss_mlp": 0.01259954, "epoch": 0.544957162182474, "flos": 20997270789120.0, "grad_norm": 1.7197386088815305, "language_loss": 0.63197517, "learning_rate": 1.8061581229160878e-06, "loss": 0.70911318, "num_input_tokens_seen": 195273390, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.12023926, "step": 9064, "time_per_iteration": 2.566168785095215 }, { "auxiliary_loss_clip": 0.06448524, "auxiliary_loss_mlp": 0.01265929, "balance_loss_clip": 0.06288764, "balance_loss_mlp": 0.01254026, "epoch": 0.545017285435142, "flos": 25381671678720.0, "grad_norm": 1.6386336257178622, "language_loss": 0.80663776, "learning_rate": 1.8057704984653566e-06, "loss": 0.88378227, "num_input_tokens_seen": 195295635, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.11914062, "step": 9065, "time_per_iteration": 2.6354286670684814 }, { "auxiliary_loss_clip": 0.06444398, "auxiliary_loss_mlp": 0.01264691, "balance_loss_clip": 0.06290831, "balance_loss_mlp": 0.01254642, "epoch": 0.54507740868781, "flos": 19140661146240.0, "grad_norm": 1.8497296842319442, "language_loss": 0.77998453, "learning_rate": 1.805382881379827e-06, "loss": 0.85707545, "num_input_tokens_seen": 195312545, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.10046387, "step": 9066, "time_per_iteration": 2.5720608234405518 }, { "auxiliary_loss_clip": 0.06448585, "auxiliary_loss_mlp": 0.01271664, "balance_loss_clip": 0.06286727, "balance_loss_mlp": 0.01259504, "epoch": 0.545137531940478, "flos": 26256958629120.0, "grad_norm": 2.2599006064963953, "language_loss": 0.76033425, "learning_rate": 1.8049952716741975e-06, "loss": 0.83753675, "num_input_tokens_seen": 195332955, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.12158203, "step": 9067, "time_per_iteration": 2.614379644393921 }, { "auxiliary_loss_clip": 0.06452814, "auxiliary_loss_mlp": 0.01269297, "balance_loss_clip": 0.0628751, "balance_loss_mlp": 0.01256446, "epoch": 0.545197655193146, "flos": 37563880435200.0, "grad_norm": 2.0094298782494446, "language_loss": 0.64079112, "learning_rate": 1.8046076693631682e-06, "loss": 0.71801221, "num_input_tokens_seen": 195355930, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.12854004, "step": 9068, "time_per_iteration": 2.697902202606201 }, { "auxiliary_loss_clip": 0.06443872, "auxiliary_loss_mlp": 0.01268695, "balance_loss_clip": 0.06290068, "balance_loss_mlp": 0.01257591, "epoch": 0.5452577784458139, "flos": 26038430380800.0, "grad_norm": 1.581627761185291, "language_loss": 0.72248864, "learning_rate": 1.8042200744614343e-06, "loss": 0.79961437, "num_input_tokens_seen": 195376445, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.11108398, "step": 9069, "time_per_iteration": 2.6336071491241455 }, { "auxiliary_loss_clip": 0.06444809, "auxiliary_loss_mlp": 0.01264943, "balance_loss_clip": 0.06291209, "balance_loss_mlp": 0.01254798, "epoch": 0.5453179016984819, "flos": 17644729403520.0, "grad_norm": 1.7353646584554818, "language_loss": 0.74064958, "learning_rate": 1.8038324869836957e-06, "loss": 0.81774712, "num_input_tokens_seen": 195393725, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.10137939, "step": 9070, "time_per_iteration": 4.025329828262329 }, { "auxiliary_loss_clip": 0.06441008, "auxiliary_loss_mlp": 0.01265349, "balance_loss_clip": 0.06283654, "balance_loss_mlp": 0.01254751, "epoch": 0.5453780249511498, "flos": 23222524469760.0, "grad_norm": 2.06647401906765, "language_loss": 0.61048234, "learning_rate": 1.8034449069446489e-06, "loss": 0.6875459, "num_input_tokens_seen": 195411380, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.10595703, "step": 9071, "time_per_iteration": 3.9827466011047363 }, { "auxiliary_loss_clip": 0.06325124, "auxiliary_loss_mlp": 0.01253588, "balance_loss_clip": 0.06262739, "balance_loss_mlp": 0.01252043, "epoch": 0.5454381482038179, "flos": 68719163443200.0, "grad_norm": 0.7222447020781629, "language_loss": 0.57042098, "learning_rate": 1.80305733435899e-06, "loss": 0.64620811, "num_input_tokens_seen": 195482015, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.01544189, "step": 9072, "time_per_iteration": 3.3128645420074463 }, { "auxiliary_loss_clip": 0.06437224, "auxiliary_loss_mlp": 0.01270387, "balance_loss_clip": 0.06285778, "balance_loss_mlp": 0.01259652, "epoch": 0.5454982714564858, "flos": 13265569393920.0, "grad_norm": 1.7610244687953895, "language_loss": 0.69752306, "learning_rate": 1.8026697692414174e-06, "loss": 0.77459919, "num_input_tokens_seen": 195500440, "router_z_loss_clip": 1.51269531, "router_z_loss_mlp": 0.10736084, "step": 9073, "time_per_iteration": 2.5936410427093506 }, { "auxiliary_loss_clip": 0.06439301, "auxiliary_loss_mlp": 0.01268965, "balance_loss_clip": 0.06287415, "balance_loss_mlp": 0.01258636, "epoch": 0.5455583947091538, "flos": 21842439396480.0, "grad_norm": 1.904433546616002, "language_loss": 0.7199598, "learning_rate": 1.802282211606627e-06, "loss": 0.79704249, "num_input_tokens_seen": 195520860, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.10333252, "step": 9074, "time_per_iteration": 2.601187229156494 }, { "auxiliary_loss_clip": 0.06441835, "auxiliary_loss_mlp": 0.01268077, "balance_loss_clip": 0.06286528, "balance_loss_mlp": 0.01257104, "epoch": 0.5456185179618217, "flos": 17822489840640.0, "grad_norm": 2.0786182241783995, "language_loss": 0.68698406, "learning_rate": 1.8018946614693148e-06, "loss": 0.76408321, "num_input_tokens_seen": 195538615, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.10968018, "step": 9075, "time_per_iteration": 2.5752432346343994 }, { "auxiliary_loss_clip": 0.06439592, "auxiliary_loss_mlp": 0.01262929, "balance_loss_clip": 0.06286123, "balance_loss_mlp": 0.01252212, "epoch": 0.5456786412144897, "flos": 21075787664640.0, "grad_norm": 1.6334853404615748, "language_loss": 0.80687696, "learning_rate": 1.8015071188441768e-06, "loss": 0.88390219, "num_input_tokens_seen": 195557460, "router_z_loss_clip": 1.53417969, "router_z_loss_mlp": 0.10717773, "step": 9076, "time_per_iteration": 2.5597450733184814 }, { "auxiliary_loss_clip": 0.06446543, "auxiliary_loss_mlp": 0.01274248, "balance_loss_clip": 0.06288075, "balance_loss_mlp": 0.01263108, "epoch": 0.5457387644671576, "flos": 23301712177920.0, "grad_norm": 1.5423207527106253, "language_loss": 0.80466211, "learning_rate": 1.8011195837459089e-06, "loss": 0.88187003, "num_input_tokens_seen": 195577985, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.11138916, "step": 9077, "time_per_iteration": 2.6062726974487305 }, { "auxiliary_loss_clip": 0.06438346, "auxiliary_loss_mlp": 0.01265868, "balance_loss_clip": 0.06283353, "balance_loss_mlp": 0.01254978, "epoch": 0.5457988877198257, "flos": 21623575731840.0, "grad_norm": 1.7747917925359866, "language_loss": 0.67786735, "learning_rate": 1.8007320561892064e-06, "loss": 0.75490946, "num_input_tokens_seen": 195597620, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.10888672, "step": 9078, "time_per_iteration": 2.5941808223724365 }, { "auxiliary_loss_clip": 0.06444022, "auxiliary_loss_mlp": 0.01267945, "balance_loss_clip": 0.06285524, "balance_loss_mlp": 0.01256376, "epoch": 0.5458590109724936, "flos": 23768174257920.0, "grad_norm": 1.7898446080140016, "language_loss": 0.8105197, "learning_rate": 1.800344536188764e-06, "loss": 0.88763928, "num_input_tokens_seen": 195615910, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.11566162, "step": 9079, "time_per_iteration": 2.5922038555145264 }, { "auxiliary_loss_clip": 0.06447074, "auxiliary_loss_mlp": 0.01267795, "balance_loss_clip": 0.06283325, "balance_loss_mlp": 0.01255021, "epoch": 0.5459191342251616, "flos": 24430928526720.0, "grad_norm": 1.9037229532123205, "language_loss": 0.75845551, "learning_rate": 1.799957023759277e-06, "loss": 0.83560419, "num_input_tokens_seen": 195635620, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.12768555, "step": 9080, "time_per_iteration": 2.590559959411621 }, { "auxiliary_loss_clip": 0.06443597, "auxiliary_loss_mlp": 0.01269195, "balance_loss_clip": 0.06285468, "balance_loss_mlp": 0.01257274, "epoch": 0.5459792574778296, "flos": 23629756112640.0, "grad_norm": 2.0413725305525547, "language_loss": 0.83348721, "learning_rate": 1.7995695189154392e-06, "loss": 0.91061515, "num_input_tokens_seen": 195652495, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.1192627, "step": 9081, "time_per_iteration": 2.600069284439087 }, { "auxiliary_loss_clip": 0.06442949, "auxiliary_loss_mlp": 0.01267255, "balance_loss_clip": 0.06282724, "balance_loss_mlp": 0.01255627, "epoch": 0.5460393807304975, "flos": 19141583541120.0, "grad_norm": 2.083987215797737, "language_loss": 0.69728005, "learning_rate": 1.7991820216719461e-06, "loss": 0.77438211, "num_input_tokens_seen": 195671965, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.11621094, "step": 9082, "time_per_iteration": 2.602445602416992 }, { "auxiliary_loss_clip": 0.06435693, "auxiliary_loss_mlp": 0.01266715, "balance_loss_clip": 0.06281728, "balance_loss_mlp": 0.01255652, "epoch": 0.5460995039831655, "flos": 35927308414080.0, "grad_norm": 1.5702205547500492, "language_loss": 0.66611892, "learning_rate": 1.7987945320434906e-06, "loss": 0.74314296, "num_input_tokens_seen": 195694725, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.11065674, "step": 9083, "time_per_iteration": 2.7043416500091553 }, { "auxiliary_loss_clip": 0.06434157, "auxiliary_loss_mlp": 0.01265475, "balance_loss_clip": 0.06282105, "balance_loss_mlp": 0.01254728, "epoch": 0.5461596272358334, "flos": 26766242945280.0, "grad_norm": 2.1997168956042144, "language_loss": 0.79235959, "learning_rate": 1.798407050044766e-06, "loss": 0.86935598, "num_input_tokens_seen": 195714090, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.10754395, "step": 9084, "time_per_iteration": 2.6230568885803223 }, { "auxiliary_loss_clip": 0.06444947, "auxiliary_loss_mlp": 0.01266429, "balance_loss_clip": 0.06287258, "balance_loss_mlp": 0.01255742, "epoch": 0.5462197504885015, "flos": 20892870201600.0, "grad_norm": 1.6901833131358441, "language_loss": 0.75061327, "learning_rate": 1.7980195756904675e-06, "loss": 0.82772708, "num_input_tokens_seen": 195733585, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.10687256, "step": 9085, "time_per_iteration": 2.6046690940856934 }, { "auxiliary_loss_clip": 0.06444641, "auxiliary_loss_mlp": 0.01265604, "balance_loss_clip": 0.06285289, "balance_loss_mlp": 0.0125441, "epoch": 0.5462798737411694, "flos": 25810887818880.0, "grad_norm": 1.839304827979502, "language_loss": 0.7502563, "learning_rate": 1.7976321089952857e-06, "loss": 0.82735878, "num_input_tokens_seen": 195752820, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.11199951, "step": 9086, "time_per_iteration": 2.6427323818206787 }, { "auxiliary_loss_clip": 0.06441399, "auxiliary_loss_mlp": 0.01267214, "balance_loss_clip": 0.06285846, "balance_loss_mlp": 0.01255889, "epoch": 0.5463399969938374, "flos": 25782027287040.0, "grad_norm": 1.5064444663345262, "language_loss": 0.77167058, "learning_rate": 1.7972446499739155e-06, "loss": 0.84875667, "num_input_tokens_seen": 195773740, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.11328125, "step": 9087, "time_per_iteration": 2.6574513912200928 }, { "auxiliary_loss_clip": 0.06444044, "auxiliary_loss_mlp": 0.01270928, "balance_loss_clip": 0.0628604, "balance_loss_mlp": 0.01257798, "epoch": 0.5464001202465053, "flos": 18849234245760.0, "grad_norm": 1.6886944004361784, "language_loss": 0.77891874, "learning_rate": 1.7968571986410484e-06, "loss": 0.85606849, "num_input_tokens_seen": 195792125, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.13122559, "step": 9088, "time_per_iteration": 2.5726230144500732 }, { "auxiliary_loss_clip": 0.06322497, "auxiliary_loss_mlp": 0.01254356, "balance_loss_clip": 0.06260419, "balance_loss_mlp": 0.01252592, "epoch": 0.5464602434991733, "flos": 69070281978240.0, "grad_norm": 0.717753584995075, "language_loss": 0.57655156, "learning_rate": 1.7964697550113758e-06, "loss": 0.65232015, "num_input_tokens_seen": 195854935, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 0.01765442, "step": 9089, "time_per_iteration": 3.230959177017212 }, { "auxiliary_loss_clip": 0.06443307, "auxiliary_loss_mlp": 0.0126667, "balance_loss_clip": 0.06282633, "balance_loss_mlp": 0.01255857, "epoch": 0.5465203667518412, "flos": 27566870307840.0, "grad_norm": 2.0710684464646096, "language_loss": 0.77445811, "learning_rate": 1.7960823190995918e-06, "loss": 0.85155785, "num_input_tokens_seen": 195874715, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.10809326, "step": 9090, "time_per_iteration": 2.622934341430664 }, { "auxiliary_loss_clip": 0.06444064, "auxiliary_loss_mlp": 0.01270049, "balance_loss_clip": 0.06281962, "balance_loss_mlp": 0.01257264, "epoch": 0.5465804900045093, "flos": 21215757110400.0, "grad_norm": 2.152613593574767, "language_loss": 0.74635935, "learning_rate": 1.7956948909203855e-06, "loss": 0.82350045, "num_input_tokens_seen": 195892610, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.12786865, "step": 9091, "time_per_iteration": 2.619431972503662 }, { "auxiliary_loss_clip": 0.06442873, "auxiliary_loss_mlp": 0.01267223, "balance_loss_clip": 0.06285004, "balance_loss_mlp": 0.01255708, "epoch": 0.5466406132571772, "flos": 22495005394560.0, "grad_norm": 1.8317081825375499, "language_loss": 0.78263766, "learning_rate": 1.7953074704884498e-06, "loss": 0.85973859, "num_input_tokens_seen": 195911085, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.1151123, "step": 9092, "time_per_iteration": 2.5696706771850586 }, { "auxiliary_loss_clip": 0.06446081, "auxiliary_loss_mlp": 0.01265389, "balance_loss_clip": 0.06284998, "balance_loss_mlp": 0.01253986, "epoch": 0.5467007365098452, "flos": 17681598000000.0, "grad_norm": 3.464581174894976, "language_loss": 0.75648344, "learning_rate": 1.794920057818476e-06, "loss": 0.83359814, "num_input_tokens_seen": 195929845, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.1138916, "step": 9093, "time_per_iteration": 2.591318368911743 }, { "auxiliary_loss_clip": 0.06441939, "auxiliary_loss_mlp": 0.01271469, "balance_loss_clip": 0.06281614, "balance_loss_mlp": 0.01258427, "epoch": 0.5467608597625132, "flos": 15703146120960.0, "grad_norm": 1.9735487973658052, "language_loss": 0.69189829, "learning_rate": 1.7945326529251533e-06, "loss": 0.76903242, "num_input_tokens_seen": 195946350, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.13037109, "step": 9094, "time_per_iteration": 2.5510292053222656 }, { "auxiliary_loss_clip": 0.06441516, "auxiliary_loss_mlp": 0.01268185, "balance_loss_clip": 0.0628664, "balance_loss_mlp": 0.01257432, "epoch": 0.5468209830151811, "flos": 24319106853120.0, "grad_norm": 3.9264515653612975, "language_loss": 0.68836212, "learning_rate": 1.7941452558231731e-06, "loss": 0.76545912, "num_input_tokens_seen": 195959840, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.10748291, "step": 9095, "time_per_iteration": 2.5870137214660645 }, { "auxiliary_loss_clip": 0.064458, "auxiliary_loss_mlp": 0.01267348, "balance_loss_clip": 0.06287616, "balance_loss_mlp": 0.01256637, "epoch": 0.5468811062678491, "flos": 29173575548160.0, "grad_norm": 2.0493805967210323, "language_loss": 0.66797972, "learning_rate": 1.7937578665272256e-06, "loss": 0.74511123, "num_input_tokens_seen": 195981125, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.10705566, "step": 9096, "time_per_iteration": 2.652696371078491 }, { "auxiliary_loss_clip": 0.06321335, "auxiliary_loss_mlp": 0.01251978, "balance_loss_clip": 0.06259049, "balance_loss_mlp": 0.0125048, "epoch": 0.546941229520517, "flos": 67885078302720.0, "grad_norm": 0.7266172250115553, "language_loss": 0.5745039, "learning_rate": 1.7933704850520007e-06, "loss": 0.65023696, "num_input_tokens_seen": 196038880, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.0149765, "step": 9097, "time_per_iteration": 4.919272184371948 }, { "auxiliary_loss_clip": 0.06320487, "auxiliary_loss_mlp": 0.01254998, "balance_loss_clip": 0.06258087, "balance_loss_mlp": 0.01253332, "epoch": 0.5470013527731851, "flos": 58286578993920.0, "grad_norm": 0.8846323868948976, "language_loss": 0.64640236, "learning_rate": 1.7929831114121868e-06, "loss": 0.72215724, "num_input_tokens_seen": 196099215, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.01669312, "step": 9098, "time_per_iteration": 3.208366632461548 }, { "auxiliary_loss_clip": 0.06442866, "auxiliary_loss_mlp": 0.01268074, "balance_loss_clip": 0.06283024, "balance_loss_mlp": 0.01255932, "epoch": 0.547061476025853, "flos": 22972494286080.0, "grad_norm": 1.6534459982602452, "language_loss": 0.73660094, "learning_rate": 1.7925957456224753e-06, "loss": 0.81371033, "num_input_tokens_seen": 196120370, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.12127686, "step": 9099, "time_per_iteration": 2.6011617183685303 }, { "auxiliary_loss_clip": 0.06440011, "auxiliary_loss_mlp": 0.01266427, "balance_loss_clip": 0.06283551, "balance_loss_mlp": 0.01256044, "epoch": 0.547121599278521, "flos": 29975502648960.0, "grad_norm": 2.310443859188005, "language_loss": 0.73414266, "learning_rate": 1.7922083876975537e-06, "loss": 0.81120706, "num_input_tokens_seen": 196139075, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.10394287, "step": 9100, "time_per_iteration": 2.631603717803955 }, { "auxiliary_loss_clip": 0.06439719, "auxiliary_loss_mlp": 0.01266953, "balance_loss_clip": 0.06284745, "balance_loss_mlp": 0.01254937, "epoch": 0.5471817225311889, "flos": 36543760502400.0, "grad_norm": 1.543281859356182, "language_loss": 0.68497622, "learning_rate": 1.7918210376521102e-06, "loss": 0.76204288, "num_input_tokens_seen": 196159990, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.12017822, "step": 9101, "time_per_iteration": 4.154041051864624 }, { "auxiliary_loss_clip": 0.06435865, "auxiliary_loss_mlp": 0.01268521, "balance_loss_clip": 0.06278351, "balance_loss_mlp": 0.01257893, "epoch": 0.5472418457838569, "flos": 25782278849280.0, "grad_norm": 1.8936766028103305, "language_loss": 0.77898073, "learning_rate": 1.7914336955008343e-06, "loss": 0.85602462, "num_input_tokens_seen": 196180570, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.10620117, "step": 9102, "time_per_iteration": 2.6046602725982666 }, { "auxiliary_loss_clip": 0.0643394, "auxiliary_loss_mlp": 0.01265876, "balance_loss_clip": 0.06282847, "balance_loss_mlp": 0.01255189, "epoch": 0.5473019690365248, "flos": 27894453045120.0, "grad_norm": 1.4141320899434664, "language_loss": 0.72350335, "learning_rate": 1.791046361258413e-06, "loss": 0.80050153, "num_input_tokens_seen": 196200300, "router_z_loss_clip": 1.51074219, "router_z_loss_mlp": 0.10687256, "step": 9103, "time_per_iteration": 2.7115824222564697 }, { "auxiliary_loss_clip": 0.06440884, "auxiliary_loss_mlp": 0.0126796, "balance_loss_clip": 0.06285791, "balance_loss_mlp": 0.01256826, "epoch": 0.5473620922891929, "flos": 57644551411200.0, "grad_norm": 1.2478981985961526, "language_loss": 0.65236163, "learning_rate": 1.7906590349395356e-06, "loss": 0.72945011, "num_input_tokens_seen": 196228525, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.11138916, "step": 9104, "time_per_iteration": 2.9038145542144775 }, { "auxiliary_loss_clip": 0.06442163, "auxiliary_loss_mlp": 0.01269907, "balance_loss_clip": 0.06284378, "balance_loss_mlp": 0.01258176, "epoch": 0.5474222155418608, "flos": 19360069862400.0, "grad_norm": 1.799932766272384, "language_loss": 0.82024121, "learning_rate": 1.790271716558888e-06, "loss": 0.89736199, "num_input_tokens_seen": 196247690, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.11743164, "step": 9105, "time_per_iteration": 2.52736759185791 }, { "auxiliary_loss_clip": 0.0643466, "auxiliary_loss_mlp": 0.01263908, "balance_loss_clip": 0.062809, "balance_loss_mlp": 0.01252929, "epoch": 0.5474823387945288, "flos": 25127700353280.0, "grad_norm": 1.40423244675221, "language_loss": 0.80508769, "learning_rate": 1.7898844061311575e-06, "loss": 0.88207334, "num_input_tokens_seen": 196268555, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.10974121, "step": 9106, "time_per_iteration": 2.6021671295166016 }, { "auxiliary_loss_clip": 0.06437132, "auxiliary_loss_mlp": 0.01265823, "balance_loss_clip": 0.0628226, "balance_loss_mlp": 0.01254504, "epoch": 0.5475424620471967, "flos": 18009977351040.0, "grad_norm": 1.90418467626831, "language_loss": 0.69615835, "learning_rate": 1.7894971036710322e-06, "loss": 0.77318794, "num_input_tokens_seen": 196285585, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.11315918, "step": 9107, "time_per_iteration": 2.5396947860717773 }, { "auxiliary_loss_clip": 0.06441501, "auxiliary_loss_mlp": 0.01264132, "balance_loss_clip": 0.06281617, "balance_loss_mlp": 0.01253123, "epoch": 0.5476025852998647, "flos": 22315819438080.0, "grad_norm": 2.1560807175791057, "language_loss": 0.63404793, "learning_rate": 1.789109809193197e-06, "loss": 0.71110427, "num_input_tokens_seen": 196305085, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.11010742, "step": 9108, "time_per_iteration": 2.570236921310425 }, { "auxiliary_loss_clip": 0.06438345, "auxiliary_loss_mlp": 0.01264869, "balance_loss_clip": 0.06282265, "balance_loss_mlp": 0.01254939, "epoch": 0.5476627085525327, "flos": 20126679667200.0, "grad_norm": 1.8520530374167348, "language_loss": 0.7540552, "learning_rate": 1.7887225227123396e-06, "loss": 0.83108735, "num_input_tokens_seen": 196323945, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.0993042, "step": 9109, "time_per_iteration": 2.5679173469543457 }, { "auxiliary_loss_clip": 0.0643819, "auxiliary_loss_mlp": 0.01272853, "balance_loss_clip": 0.06285363, "balance_loss_mlp": 0.01261659, "epoch": 0.5477228318052006, "flos": 17718382742400.0, "grad_norm": 1.9210340541962163, "language_loss": 0.78114641, "learning_rate": 1.7883352442431457e-06, "loss": 0.85825682, "num_input_tokens_seen": 196342200, "router_z_loss_clip": 1.52539062, "router_z_loss_mlp": 0.11187744, "step": 9110, "time_per_iteration": 5.569730043411255 }, { "auxiliary_loss_clip": 0.06431888, "auxiliary_loss_mlp": 0.01264476, "balance_loss_clip": 0.06280103, "balance_loss_mlp": 0.01254236, "epoch": 0.5477829550578687, "flos": 25856057969280.0, "grad_norm": 1.6209447474740013, "language_loss": 0.71600115, "learning_rate": 1.7879479738002993e-06, "loss": 0.79296482, "num_input_tokens_seen": 196362940, "router_z_loss_clip": 1.51757812, "router_z_loss_mlp": 0.10241699, "step": 9111, "time_per_iteration": 2.608398914337158 }, { "auxiliary_loss_clip": 0.06437571, "auxiliary_loss_mlp": 0.01263259, "balance_loss_clip": 0.06282434, "balance_loss_mlp": 0.0125262, "epoch": 0.5478430783105366, "flos": 23046399187200.0, "grad_norm": 1.6232019683623653, "language_loss": 0.71285558, "learning_rate": 1.7875607113984876e-06, "loss": 0.78986394, "num_input_tokens_seen": 196383070, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.10632324, "step": 9112, "time_per_iteration": 2.5982658863067627 }, { "auxiliary_loss_clip": 0.06434672, "auxiliary_loss_mlp": 0.01266199, "balance_loss_clip": 0.06276882, "balance_loss_mlp": 0.01255899, "epoch": 0.5479032015632046, "flos": 16076821403520.0, "grad_norm": 2.124430729177694, "language_loss": 0.88653249, "learning_rate": 1.7871734570523953e-06, "loss": 0.96354121, "num_input_tokens_seen": 196398485, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.10302734, "step": 9113, "time_per_iteration": 2.589900016784668 }, { "auxiliary_loss_clip": 0.06438129, "auxiliary_loss_mlp": 0.01265464, "balance_loss_clip": 0.06281652, "balance_loss_mlp": 0.01254258, "epoch": 0.5479633248158725, "flos": 24285382784640.0, "grad_norm": 1.5229134246762437, "language_loss": 0.73249239, "learning_rate": 1.7867862107767067e-06, "loss": 0.80952835, "num_input_tokens_seen": 196417725, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.11206055, "step": 9114, "time_per_iteration": 2.6587231159210205 }, { "auxiliary_loss_clip": 0.0643568, "auxiliary_loss_mlp": 0.0126597, "balance_loss_clip": 0.06282067, "balance_loss_mlp": 0.01255766, "epoch": 0.5480234480685405, "flos": 26365216504320.0, "grad_norm": 1.7249948952387013, "language_loss": 0.72190207, "learning_rate": 1.7863989725861066e-06, "loss": 0.7989186, "num_input_tokens_seen": 196437840, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.10211182, "step": 9115, "time_per_iteration": 2.63624906539917 }, { "auxiliary_loss_clip": 0.06442817, "auxiliary_loss_mlp": 0.01265218, "balance_loss_clip": 0.06280893, "balance_loss_mlp": 0.01253804, "epoch": 0.5480835713212084, "flos": 22061722331520.0, "grad_norm": 1.7220314242878203, "language_loss": 0.72061455, "learning_rate": 1.7860117424952781e-06, "loss": 0.79769486, "num_input_tokens_seen": 196457300, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.11419678, "step": 9116, "time_per_iteration": 2.5730957984924316 }, { "auxiliary_loss_clip": 0.06438541, "auxiliary_loss_mlp": 0.01267841, "balance_loss_clip": 0.06282498, "balance_loss_mlp": 0.01256545, "epoch": 0.5481436945738765, "flos": 25308018339840.0, "grad_norm": 1.9050208913691775, "language_loss": 0.76560009, "learning_rate": 1.7856245205189063e-06, "loss": 0.84266388, "num_input_tokens_seen": 196476720, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.11291504, "step": 9117, "time_per_iteration": 2.5948400497436523 }, { "auxiliary_loss_clip": 0.06438049, "auxiliary_loss_mlp": 0.01264865, "balance_loss_clip": 0.06286425, "balance_loss_mlp": 0.01254363, "epoch": 0.5482038178265444, "flos": 33588807540480.0, "grad_norm": 1.5568083138567592, "language_loss": 0.63034004, "learning_rate": 1.785237306671674e-06, "loss": 0.70736921, "num_input_tokens_seen": 196496765, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.1050415, "step": 9118, "time_per_iteration": 2.664747953414917 }, { "auxiliary_loss_clip": 0.06445074, "auxiliary_loss_mlp": 0.0126574, "balance_loss_clip": 0.06284979, "balance_loss_mlp": 0.01253891, "epoch": 0.5482639410792124, "flos": 19032235562880.0, "grad_norm": 2.0777145290010655, "language_loss": 0.78752697, "learning_rate": 1.7848501009682646e-06, "loss": 0.86463511, "num_input_tokens_seen": 196516220, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.11846924, "step": 9119, "time_per_iteration": 2.5468242168426514 }, { "auxiliary_loss_clip": 0.06438275, "auxiliary_loss_mlp": 0.01266982, "balance_loss_clip": 0.0628514, "balance_loss_mlp": 0.0125673, "epoch": 0.5483240643318803, "flos": 25417282464000.0, "grad_norm": 1.7716260471754461, "language_loss": 0.82092005, "learning_rate": 1.7844629034233604e-06, "loss": 0.89797258, "num_input_tokens_seen": 196533860, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.10247803, "step": 9120, "time_per_iteration": 2.571338415145874 }, { "auxiliary_loss_clip": 0.06446965, "auxiliary_loss_mlp": 0.01266782, "balance_loss_clip": 0.06288005, "balance_loss_mlp": 0.01255475, "epoch": 0.5483841875845483, "flos": 21472705255680.0, "grad_norm": 4.934233379582528, "language_loss": 0.80323124, "learning_rate": 1.7840757140516455e-06, "loss": 0.88036871, "num_input_tokens_seen": 196551305, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.11315918, "step": 9121, "time_per_iteration": 2.5574522018432617 }, { "auxiliary_loss_clip": 0.06441589, "auxiliary_loss_mlp": 0.0126889, "balance_loss_clip": 0.06282251, "balance_loss_mlp": 0.01257589, "epoch": 0.5484443108372163, "flos": 24753060748800.0, "grad_norm": 1.7076538568697621, "language_loss": 0.61733389, "learning_rate": 1.7836885328678008e-06, "loss": 0.69443864, "num_input_tokens_seen": 196569420, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.11303711, "step": 9122, "time_per_iteration": 2.5785984992980957 }, { "auxiliary_loss_clip": 0.06441974, "auxiliary_loss_mlp": 0.01269063, "balance_loss_clip": 0.06288151, "balance_loss_mlp": 0.01258245, "epoch": 0.5485044340898843, "flos": 25382594073600.0, "grad_norm": 1.7538813087907341, "language_loss": 0.71221554, "learning_rate": 1.7833013598865084e-06, "loss": 0.78932589, "num_input_tokens_seen": 196590610, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.10821533, "step": 9123, "time_per_iteration": 2.5951149463653564 }, { "auxiliary_loss_clip": 0.06438968, "auxiliary_loss_mlp": 0.01264863, "balance_loss_clip": 0.062833, "balance_loss_mlp": 0.0125439, "epoch": 0.5485645573425523, "flos": 12646140485760.0, "grad_norm": 2.5587038596731437, "language_loss": 0.83983397, "learning_rate": 1.7829141951224505e-06, "loss": 0.91687226, "num_input_tokens_seen": 196606495, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.10473633, "step": 9124, "time_per_iteration": 2.5163607597351074 }, { "auxiliary_loss_clip": 0.06439697, "auxiliary_loss_mlp": 0.01272954, "balance_loss_clip": 0.06285977, "balance_loss_mlp": 0.01261331, "epoch": 0.5486246805952202, "flos": 28336918129920.0, "grad_norm": 1.5626670607722888, "language_loss": 0.80570853, "learning_rate": 1.7825270385903075e-06, "loss": 0.88283503, "num_input_tokens_seen": 196626365, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.11633301, "step": 9125, "time_per_iteration": 2.626054525375366 }, { "auxiliary_loss_clip": 0.06447536, "auxiliary_loss_mlp": 0.01267345, "balance_loss_clip": 0.06288084, "balance_loss_mlp": 0.012559, "epoch": 0.5486848038478882, "flos": 16805598289920.0, "grad_norm": 1.8958722656982872, "language_loss": 0.74322712, "learning_rate": 1.7821398903047617e-06, "loss": 0.82037592, "num_input_tokens_seen": 196644465, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.11450195, "step": 9126, "time_per_iteration": 2.5257861614227295 }, { "auxiliary_loss_clip": 0.06449027, "auxiliary_loss_mlp": 0.01268123, "balance_loss_clip": 0.06288457, "balance_loss_mlp": 0.0125678, "epoch": 0.5487449271005561, "flos": 17241606610560.0, "grad_norm": 3.437857157323893, "language_loss": 0.67466176, "learning_rate": 1.7817527502804928e-06, "loss": 0.7518332, "num_input_tokens_seen": 196659160, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.11358643, "step": 9127, "time_per_iteration": 2.5286004543304443 }, { "auxiliary_loss_clip": 0.06443812, "auxiliary_loss_mlp": 0.01268803, "balance_loss_clip": 0.06286377, "balance_loss_mlp": 0.01257109, "epoch": 0.5488050503532241, "flos": 17345462146560.0, "grad_norm": 1.7925200990608972, "language_loss": 0.83831429, "learning_rate": 1.781365618532181e-06, "loss": 0.91544044, "num_input_tokens_seen": 196677410, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.11694336, "step": 9128, "time_per_iteration": 2.5514614582061768 }, { "auxiliary_loss_clip": 0.06443236, "auxiliary_loss_mlp": 0.01269346, "balance_loss_clip": 0.06287919, "balance_loss_mlp": 0.01257866, "epoch": 0.548865173605892, "flos": 17245044627840.0, "grad_norm": 1.8521117042947695, "language_loss": 0.74133354, "learning_rate": 1.7809784950745078e-06, "loss": 0.81845939, "num_input_tokens_seen": 196696765, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.11480713, "step": 9129, "time_per_iteration": 2.583759069442749 }, { "auxiliary_loss_clip": 0.06454151, "auxiliary_loss_mlp": 0.01271975, "balance_loss_clip": 0.06293429, "balance_loss_mlp": 0.01259959, "epoch": 0.5489252968585601, "flos": 17462398919040.0, "grad_norm": 3.101653233113172, "language_loss": 0.63570237, "learning_rate": 1.7805913799221511e-06, "loss": 0.7129637, "num_input_tokens_seen": 196714895, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.12017822, "step": 9130, "time_per_iteration": 2.563123941421509 }, { "auxiliary_loss_clip": 0.06450262, "auxiliary_loss_mlp": 0.0126884, "balance_loss_clip": 0.06289988, "balance_loss_mlp": 0.01256668, "epoch": 0.548985420111228, "flos": 26330653895040.0, "grad_norm": 1.748481298638787, "language_loss": 0.63172656, "learning_rate": 1.7802042730897915e-06, "loss": 0.7089175, "num_input_tokens_seen": 196735510, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.12164307, "step": 9131, "time_per_iteration": 2.625119686126709 }, { "auxiliary_loss_clip": 0.06449404, "auxiliary_loss_mlp": 0.01268824, "balance_loss_clip": 0.06289558, "balance_loss_mlp": 0.01256003, "epoch": 0.549045543363896, "flos": 18699034602240.0, "grad_norm": 2.112257524504048, "language_loss": 0.75299346, "learning_rate": 1.7798171745921084e-06, "loss": 0.83017576, "num_input_tokens_seen": 196752855, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.12823486, "step": 9132, "time_per_iteration": 2.5494675636291504 }, { "auxiliary_loss_clip": 0.06445719, "auxiliary_loss_mlp": 0.01266273, "balance_loss_clip": 0.06285734, "balance_loss_mlp": 0.01255473, "epoch": 0.5491056666165639, "flos": 24724284071040.0, "grad_norm": 1.4304360861404957, "language_loss": 0.81547344, "learning_rate": 1.7794300844437795e-06, "loss": 0.89259338, "num_input_tokens_seen": 196772230, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.10797119, "step": 9133, "time_per_iteration": 2.620159149169922 }, { "auxiliary_loss_clip": 0.06446015, "auxiliary_loss_mlp": 0.01269595, "balance_loss_clip": 0.06289585, "balance_loss_mlp": 0.01258234, "epoch": 0.5491657898692319, "flos": 21582849847680.0, "grad_norm": 2.881132465895502, "language_loss": 0.70433068, "learning_rate": 1.7790430026594841e-06, "loss": 0.78148681, "num_input_tokens_seen": 196790405, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.11364746, "step": 9134, "time_per_iteration": 2.556969165802002 }, { "auxiliary_loss_clip": 0.06444545, "auxiliary_loss_mlp": 0.0126775, "balance_loss_clip": 0.06284866, "balance_loss_mlp": 0.01256651, "epoch": 0.5492259131219, "flos": 50487653825280.0, "grad_norm": 1.837863359536626, "language_loss": 0.61081231, "learning_rate": 1.7786559292539004e-06, "loss": 0.68793523, "num_input_tokens_seen": 196813785, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.11102295, "step": 9135, "time_per_iteration": 2.813845634460449 }, { "auxiliary_loss_clip": 0.06444553, "auxiliary_loss_mlp": 0.01268166, "balance_loss_clip": 0.06283058, "balance_loss_mlp": 0.01255822, "epoch": 0.5492860363745679, "flos": 25126316760960.0, "grad_norm": 1.7598274595938985, "language_loss": 0.72624946, "learning_rate": 1.7782688642417058e-06, "loss": 0.80337667, "num_input_tokens_seen": 196834390, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.12353516, "step": 9136, "time_per_iteration": 4.039111614227295 }, { "auxiliary_loss_clip": 0.06448035, "auxiliary_loss_mlp": 0.01270178, "balance_loss_clip": 0.06284013, "balance_loss_mlp": 0.01257709, "epoch": 0.5493461596272359, "flos": 22639670668800.0, "grad_norm": 5.548385467054542, "language_loss": 0.68395412, "learning_rate": 1.7778818076375781e-06, "loss": 0.76113629, "num_input_tokens_seen": 196853290, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.12463379, "step": 9137, "time_per_iteration": 2.576498031616211 }, { "auxiliary_loss_clip": 0.06335147, "auxiliary_loss_mlp": 0.01261442, "balance_loss_clip": 0.062713, "balance_loss_mlp": 0.01259011, "epoch": 0.5494062828799038, "flos": 66169486281600.0, "grad_norm": 0.72559019309099, "language_loss": 0.65018296, "learning_rate": 1.7774947594561947e-06, "loss": 0.72614884, "num_input_tokens_seen": 196913120, "router_z_loss_clip": 0.64013672, "router_z_loss_mlp": 0.02427673, "step": 9138, "time_per_iteration": 3.2338881492614746 }, { "auxiliary_loss_clip": 0.06446452, "auxiliary_loss_mlp": 0.01268382, "balance_loss_clip": 0.06287363, "balance_loss_mlp": 0.01255614, "epoch": 0.5494664061325718, "flos": 21112362771840.0, "grad_norm": 1.7149355940125954, "language_loss": 0.75274688, "learning_rate": 1.7771077197122321e-06, "loss": 0.8298952, "num_input_tokens_seen": 196931530, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.12774658, "step": 9139, "time_per_iteration": 2.5514540672302246 }, { "auxiliary_loss_clip": 0.064386, "auxiliary_loss_mlp": 0.01267734, "balance_loss_clip": 0.06283261, "balance_loss_mlp": 0.01256015, "epoch": 0.5495265293852397, "flos": 14397846416640.0, "grad_norm": 2.508576182584472, "language_loss": 0.71450901, "learning_rate": 1.7767206884203672e-06, "loss": 0.79157239, "num_input_tokens_seen": 196949430, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.11724854, "step": 9140, "time_per_iteration": 3.9929471015930176 }, { "auxiliary_loss_clip": 0.06443803, "auxiliary_loss_mlp": 0.01269288, "balance_loss_clip": 0.06286548, "balance_loss_mlp": 0.01258184, "epoch": 0.5495866526379077, "flos": 25554945922560.0, "grad_norm": 1.7439688152377095, "language_loss": 0.7657938, "learning_rate": 1.7763336655952762e-06, "loss": 0.84292471, "num_input_tokens_seen": 196968265, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.11108398, "step": 9141, "time_per_iteration": 2.5767433643341064 }, { "auxiliary_loss_clip": 0.06436379, "auxiliary_loss_mlp": 0.01277347, "balance_loss_clip": 0.06285775, "balance_loss_mlp": 0.01266708, "epoch": 0.5496467758905756, "flos": 21322421758080.0, "grad_norm": 1.8277429163966081, "language_loss": 0.7535584, "learning_rate": 1.7759466512516346e-06, "loss": 0.83069563, "num_input_tokens_seen": 196984930, "router_z_loss_clip": 1.50585938, "router_z_loss_mlp": 0.10644531, "step": 9142, "time_per_iteration": 2.5547666549682617 }, { "auxiliary_loss_clip": 0.06449592, "auxiliary_loss_mlp": 0.01270858, "balance_loss_clip": 0.06289417, "balance_loss_mlp": 0.01258306, "epoch": 0.5497068991432437, "flos": 22239021571200.0, "grad_norm": 2.099510268101278, "language_loss": 0.76756018, "learning_rate": 1.7755596454041192e-06, "loss": 0.84476465, "num_input_tokens_seen": 197002320, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.12567139, "step": 9143, "time_per_iteration": 2.566829204559326 }, { "auxiliary_loss_clip": 0.06443106, "auxiliary_loss_mlp": 0.01267372, "balance_loss_clip": 0.06287068, "balance_loss_mlp": 0.01255975, "epoch": 0.5497670223959116, "flos": 18485076401280.0, "grad_norm": 2.3045144282178924, "language_loss": 0.79597569, "learning_rate": 1.7751726480674044e-06, "loss": 0.87308049, "num_input_tokens_seen": 197020825, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.11395264, "step": 9144, "time_per_iteration": 2.5933544635772705 }, { "auxiliary_loss_clip": 0.06446845, "auxiliary_loss_mlp": 0.0127121, "balance_loss_clip": 0.06288628, "balance_loss_mlp": 0.01259802, "epoch": 0.5498271456485796, "flos": 29212750131840.0, "grad_norm": 1.8233122584175536, "language_loss": 0.71522987, "learning_rate": 1.7747856592561645e-06, "loss": 0.79241049, "num_input_tokens_seen": 197040450, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.11401367, "step": 9145, "time_per_iteration": 2.672762632369995 }, { "auxiliary_loss_clip": 0.06445093, "auxiliary_loss_mlp": 0.0126717, "balance_loss_clip": 0.06287839, "balance_loss_mlp": 0.0125615, "epoch": 0.5498872689012475, "flos": 34833032017920.0, "grad_norm": 1.7878658678016892, "language_loss": 0.70607948, "learning_rate": 1.774398678985076e-06, "loss": 0.78320205, "num_input_tokens_seen": 197063930, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.11035156, "step": 9146, "time_per_iteration": 2.6826672554016113 }, { "auxiliary_loss_clip": 0.06438778, "auxiliary_loss_mlp": 0.01265674, "balance_loss_clip": 0.06286758, "balance_loss_mlp": 0.01255566, "epoch": 0.5499473921539155, "flos": 25929124329600.0, "grad_norm": 1.677071217507927, "language_loss": 0.64657736, "learning_rate": 1.7740117072688113e-06, "loss": 0.7236219, "num_input_tokens_seen": 197082660, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.10095215, "step": 9147, "time_per_iteration": 2.5991032123565674 }, { "auxiliary_loss_clip": 0.0644442, "auxiliary_loss_mlp": 0.01269623, "balance_loss_clip": 0.06289205, "balance_loss_mlp": 0.01258465, "epoch": 0.5500075154065835, "flos": 22280334433920.0, "grad_norm": 2.1285183331422126, "language_loss": 0.80954695, "learning_rate": 1.7736247441220458e-06, "loss": 0.8866874, "num_input_tokens_seen": 197100675, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.11151123, "step": 9148, "time_per_iteration": 2.629835367202759 }, { "auxiliary_loss_clip": 0.06444531, "auxiliary_loss_mlp": 0.01274637, "balance_loss_clip": 0.06288397, "balance_loss_mlp": 0.01262829, "epoch": 0.5500676386592515, "flos": 28044946177920.0, "grad_norm": 1.625650121974221, "language_loss": 0.79143786, "learning_rate": 1.773237789559453e-06, "loss": 0.86862957, "num_input_tokens_seen": 197121320, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.11810303, "step": 9149, "time_per_iteration": 4.072868585586548 }, { "auxiliary_loss_clip": 0.06444712, "auxiliary_loss_mlp": 0.01266667, "balance_loss_clip": 0.06288476, "balance_loss_mlp": 0.01254901, "epoch": 0.5501277619119195, "flos": 23921602283520.0, "grad_norm": 2.387591448918559, "language_loss": 0.72835493, "learning_rate": 1.7728508435957052e-06, "loss": 0.80546874, "num_input_tokens_seen": 197138965, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.11767578, "step": 9150, "time_per_iteration": 3.9378764629364014 }, { "auxiliary_loss_clip": 0.06445947, "auxiliary_loss_mlp": 0.01269789, "balance_loss_clip": 0.062859, "balance_loss_mlp": 0.01257231, "epoch": 0.5501878851645874, "flos": 20930199995520.0, "grad_norm": 2.1766198735917848, "language_loss": 0.75517559, "learning_rate": 1.772463906245477e-06, "loss": 0.83233297, "num_input_tokens_seen": 197156460, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.12554932, "step": 9151, "time_per_iteration": 2.5394601821899414 }, { "auxiliary_loss_clip": 0.06445569, "auxiliary_loss_mlp": 0.01266425, "balance_loss_clip": 0.06290002, "balance_loss_mlp": 0.01255297, "epoch": 0.5502480084172554, "flos": 20671155498240.0, "grad_norm": 1.8609819113169743, "language_loss": 0.76557344, "learning_rate": 1.7720769775234394e-06, "loss": 0.84269333, "num_input_tokens_seen": 197175140, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.11126709, "step": 9152, "time_per_iteration": 2.5438485145568848 }, { "auxiliary_loss_clip": 0.06439576, "auxiliary_loss_mlp": 0.01266645, "balance_loss_clip": 0.06286859, "balance_loss_mlp": 0.0125523, "epoch": 0.5503081316699233, "flos": 26439792238080.0, "grad_norm": 1.7699514080830128, "language_loss": 0.83042949, "learning_rate": 1.7716900574442662e-06, "loss": 0.90749168, "num_input_tokens_seen": 197194345, "router_z_loss_clip": 1.52734375, "router_z_loss_mlp": 0.11413574, "step": 9153, "time_per_iteration": 2.580410957336426 }, { "auxiliary_loss_clip": 0.06445196, "auxiliary_loss_mlp": 0.01266649, "balance_loss_clip": 0.0629269, "balance_loss_mlp": 0.01254865, "epoch": 0.5503682549225913, "flos": 30637208741760.0, "grad_norm": 1.843199567611573, "language_loss": 0.74340725, "learning_rate": 1.7713031460226294e-06, "loss": 0.82052577, "num_input_tokens_seen": 197215535, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.11779785, "step": 9154, "time_per_iteration": 2.626232624053955 }, { "auxiliary_loss_clip": 0.0645187, "auxiliary_loss_mlp": 0.01269513, "balance_loss_clip": 0.06288181, "balance_loss_mlp": 0.01257723, "epoch": 0.5504283781752592, "flos": 22572096750720.0, "grad_norm": 2.4112843523525993, "language_loss": 0.729527, "learning_rate": 1.770916243273199e-06, "loss": 0.80674082, "num_input_tokens_seen": 197234945, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.11798096, "step": 9155, "time_per_iteration": 2.5502943992614746 }, { "auxiliary_loss_clip": 0.06333947, "auxiliary_loss_mlp": 0.01250809, "balance_loss_clip": 0.0627146, "balance_loss_mlp": 0.01249122, "epoch": 0.5504885014279273, "flos": 67918634663040.0, "grad_norm": 0.728166633384736, "language_loss": 0.55407131, "learning_rate": 1.7705293492106483e-06, "loss": 0.62991881, "num_input_tokens_seen": 197302285, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.01690674, "step": 9156, "time_per_iteration": 3.3094799518585205 }, { "auxiliary_loss_clip": 0.06448694, "auxiliary_loss_mlp": 0.01264575, "balance_loss_clip": 0.06292854, "balance_loss_mlp": 0.01254228, "epoch": 0.5505486246805952, "flos": 22455705029760.0, "grad_norm": 1.7099374123687674, "language_loss": 0.82751614, "learning_rate": 1.7701424638496475e-06, "loss": 0.90464884, "num_input_tokens_seen": 197321575, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.10351562, "step": 9157, "time_per_iteration": 2.577915906906128 }, { "auxiliary_loss_clip": 0.06449495, "auxiliary_loss_mlp": 0.01267605, "balance_loss_clip": 0.06286751, "balance_loss_mlp": 0.01255362, "epoch": 0.5506087479332632, "flos": 26914220455680.0, "grad_norm": 2.204067176330079, "language_loss": 0.7594465, "learning_rate": 1.7697555872048677e-06, "loss": 0.83661747, "num_input_tokens_seen": 197340255, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.12255859, "step": 9158, "time_per_iteration": 2.605649948120117 }, { "auxiliary_loss_clip": 0.06449127, "auxiliary_loss_mlp": 0.01266744, "balance_loss_clip": 0.0629819, "balance_loss_mlp": 0.01256296, "epoch": 0.5506688711859311, "flos": 22936967354880.0, "grad_norm": 1.4639155488875801, "language_loss": 0.6969741, "learning_rate": 1.769368719290979e-06, "loss": 0.77413285, "num_input_tokens_seen": 197360360, "router_z_loss_clip": 1.50878906, "router_z_loss_mlp": 0.10455322, "step": 9159, "time_per_iteration": 2.6053714752197266 }, { "auxiliary_loss_clip": 0.06447804, "auxiliary_loss_mlp": 0.01264481, "balance_loss_clip": 0.06292671, "balance_loss_mlp": 0.01253448, "epoch": 0.5507289944385991, "flos": 29614111989120.0, "grad_norm": 1.4553632055133756, "language_loss": 0.68439329, "learning_rate": 1.7689818601226516e-06, "loss": 0.76151615, "num_input_tokens_seen": 197381905, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.11035156, "step": 9160, "time_per_iteration": 2.6231110095977783 }, { "auxiliary_loss_clip": 0.06445752, "auxiliary_loss_mlp": 0.01264658, "balance_loss_clip": 0.06291823, "balance_loss_mlp": 0.01253249, "epoch": 0.5507891176912671, "flos": 15338736714240.0, "grad_norm": 2.1106087640112703, "language_loss": 0.72269189, "learning_rate": 1.7685950097145552e-06, "loss": 0.79979599, "num_input_tokens_seen": 197398555, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.11413574, "step": 9161, "time_per_iteration": 2.5201847553253174 }, { "auxiliary_loss_clip": 0.06453457, "auxiliary_loss_mlp": 0.01267488, "balance_loss_clip": 0.06298387, "balance_loss_mlp": 0.01256509, "epoch": 0.5508492409439351, "flos": 26585547615360.0, "grad_norm": 1.6722634491900068, "language_loss": 0.69772303, "learning_rate": 1.768208168081359e-06, "loss": 0.7749325, "num_input_tokens_seen": 197419630, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.10992432, "step": 9162, "time_per_iteration": 2.601546287536621 }, { "auxiliary_loss_clip": 0.0643838, "auxiliary_loss_mlp": 0.01269615, "balance_loss_clip": 0.06285109, "balance_loss_mlp": 0.01258719, "epoch": 0.5509093641966031, "flos": 25449832575360.0, "grad_norm": 1.59184845372117, "language_loss": 0.85791755, "learning_rate": 1.767821335237733e-06, "loss": 0.93499744, "num_input_tokens_seen": 197438480, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.10888672, "step": 9163, "time_per_iteration": 2.576457977294922 }, { "auxiliary_loss_clip": 0.06445529, "auxiliary_loss_mlp": 0.01267458, "balance_loss_clip": 0.06294066, "balance_loss_mlp": 0.01256789, "epoch": 0.550969487449271, "flos": 18704652825600.0, "grad_norm": 1.4952075532034792, "language_loss": 0.80951375, "learning_rate": 1.7674345111983441e-06, "loss": 0.88664359, "num_input_tokens_seen": 197456755, "router_z_loss_clip": 1.51464844, "router_z_loss_mlp": 0.10681152, "step": 9164, "time_per_iteration": 2.5447027683258057 }, { "auxiliary_loss_clip": 0.06452385, "auxiliary_loss_mlp": 0.01268964, "balance_loss_clip": 0.06293358, "balance_loss_mlp": 0.01257115, "epoch": 0.551029610701939, "flos": 22714959162240.0, "grad_norm": 1.7518031036300545, "language_loss": 0.73889375, "learning_rate": 1.767047695977863e-06, "loss": 0.81610721, "num_input_tokens_seen": 197475530, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.1184082, "step": 9165, "time_per_iteration": 2.545172929763794 }, { "auxiliary_loss_clip": 0.06446873, "auxiliary_loss_mlp": 0.0127152, "balance_loss_clip": 0.06295564, "balance_loss_mlp": 0.01260595, "epoch": 0.5510897339546069, "flos": 12425138542080.0, "grad_norm": 1.9918530666770222, "language_loss": 0.79526132, "learning_rate": 1.7666608895909563e-06, "loss": 0.87244529, "num_input_tokens_seen": 197490835, "router_z_loss_clip": 1.51269531, "router_z_loss_mlp": 0.10931396, "step": 9166, "time_per_iteration": 2.5062990188598633 }, { "auxiliary_loss_clip": 0.06444506, "auxiliary_loss_mlp": 0.0126893, "balance_loss_clip": 0.06287958, "balance_loss_mlp": 0.01256943, "epoch": 0.5511498572072749, "flos": 18776545228800.0, "grad_norm": 2.2090727180183656, "language_loss": 0.76351511, "learning_rate": 1.7662740920522913e-06, "loss": 0.84064949, "num_input_tokens_seen": 197508770, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.11993408, "step": 9167, "time_per_iteration": 2.5331785678863525 }, { "auxiliary_loss_clip": 0.06448384, "auxiliary_loss_mlp": 0.0127283, "balance_loss_clip": 0.06295104, "balance_loss_mlp": 0.01261434, "epoch": 0.5512099804599428, "flos": 19579436651520.0, "grad_norm": 2.4059547062388766, "language_loss": 0.80644476, "learning_rate": 1.7658873033765374e-06, "loss": 0.88365692, "num_input_tokens_seen": 197527340, "router_z_loss_clip": 1.53417969, "router_z_loss_mlp": 0.1138916, "step": 9168, "time_per_iteration": 2.5318286418914795 }, { "auxiliary_loss_clip": 0.06445642, "auxiliary_loss_mlp": 0.01265287, "balance_loss_clip": 0.06287405, "balance_loss_mlp": 0.01254218, "epoch": 0.5512701037126109, "flos": 26252053165440.0, "grad_norm": 4.153772226832358, "language_loss": 0.69560647, "learning_rate": 1.7655005235783591e-06, "loss": 0.77271581, "num_input_tokens_seen": 197547280, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.11065674, "step": 9169, "time_per_iteration": 2.5964138507843018 }, { "auxiliary_loss_clip": 0.06444857, "auxiliary_loss_mlp": 0.01269142, "balance_loss_clip": 0.06292693, "balance_loss_mlp": 0.01258699, "epoch": 0.5513302269652788, "flos": 21951997009920.0, "grad_norm": 1.9281867075374621, "language_loss": 0.85266703, "learning_rate": 1.7651137526724251e-06, "loss": 0.92980701, "num_input_tokens_seen": 197565045, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.10443115, "step": 9170, "time_per_iteration": 2.543389320373535 }, { "auxiliary_loss_clip": 0.06334127, "auxiliary_loss_mlp": 0.0125281, "balance_loss_clip": 0.06271979, "balance_loss_mlp": 0.01251104, "epoch": 0.5513903502179468, "flos": 68254728589440.0, "grad_norm": 0.7637999603345298, "language_loss": 0.5988605, "learning_rate": 1.7647269906734017e-06, "loss": 0.67472988, "num_input_tokens_seen": 197625005, "router_z_loss_clip": 0.62451172, "router_z_loss_mlp": 0.0171051, "step": 9171, "time_per_iteration": 3.1932177543640137 }, { "auxiliary_loss_clip": 0.0643926, "auxiliary_loss_mlp": 0.01268359, "balance_loss_clip": 0.06284814, "balance_loss_mlp": 0.01258077, "epoch": 0.5514504734706147, "flos": 18740221683840.0, "grad_norm": 1.6369792364335278, "language_loss": 0.7058602, "learning_rate": 1.7643402375959533e-06, "loss": 0.78293633, "num_input_tokens_seen": 197645050, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.10284424, "step": 9172, "time_per_iteration": 2.563826322555542 }, { "auxiliary_loss_clip": 0.06440585, "auxiliary_loss_mlp": 0.01272091, "balance_loss_clip": 0.06286897, "balance_loss_mlp": 0.01261625, "epoch": 0.5515105967232827, "flos": 22277147978880.0, "grad_norm": 3.1629039893247852, "language_loss": 0.76452124, "learning_rate": 1.7639534934547474e-06, "loss": 0.84164798, "num_input_tokens_seen": 197663910, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.10467529, "step": 9173, "time_per_iteration": 2.5834856033325195 }, { "auxiliary_loss_clip": 0.06435159, "auxiliary_loss_mlp": 0.01266026, "balance_loss_clip": 0.06284366, "balance_loss_mlp": 0.01254714, "epoch": 0.5515707199759508, "flos": 22563040510080.0, "grad_norm": 1.6419334093746367, "language_loss": 0.75453085, "learning_rate": 1.7635667582644484e-06, "loss": 0.83154273, "num_input_tokens_seen": 197681580, "router_z_loss_clip": 1.50683594, "router_z_loss_mlp": 0.11309814, "step": 9174, "time_per_iteration": 2.591012477874756 }, { "auxiliary_loss_clip": 0.06448057, "auxiliary_loss_mlp": 0.01265003, "balance_loss_clip": 0.06292624, "balance_loss_mlp": 0.01254066, "epoch": 0.5516308432286187, "flos": 28298246670720.0, "grad_norm": 1.7325486501355698, "language_loss": 0.73586148, "learning_rate": 1.7631800320397217e-06, "loss": 0.8129921, "num_input_tokens_seen": 197702095, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.109375, "step": 9175, "time_per_iteration": 2.6363894939422607 }, { "auxiliary_loss_clip": 0.06439711, "auxiliary_loss_mlp": 0.0127118, "balance_loss_clip": 0.06286481, "balance_loss_mlp": 0.01260225, "epoch": 0.5516909664812867, "flos": 18769417632000.0, "grad_norm": 2.049717424686532, "language_loss": 0.69496369, "learning_rate": 1.7627933147952318e-06, "loss": 0.77207261, "num_input_tokens_seen": 197720720, "router_z_loss_clip": 1.53320312, "router_z_loss_mlp": 0.10955811, "step": 9176, "time_per_iteration": 4.010278224945068 }, { "auxiliary_loss_clip": 0.06437175, "auxiliary_loss_mlp": 0.01269247, "balance_loss_clip": 0.06286053, "balance_loss_mlp": 0.01258453, "epoch": 0.5517510897339546, "flos": 27746852878080.0, "grad_norm": 1.5461423924296192, "language_loss": 0.71241176, "learning_rate": 1.7624066065456435e-06, "loss": 0.78947604, "num_input_tokens_seen": 197741820, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.10797119, "step": 9177, "time_per_iteration": 2.6474177837371826 }, { "auxiliary_loss_clip": 0.06442589, "auxiliary_loss_mlp": 0.01265275, "balance_loss_clip": 0.06289467, "balance_loss_mlp": 0.01255405, "epoch": 0.5518112129866226, "flos": 18410165251200.0, "grad_norm": 1.532732810736578, "language_loss": 0.80154061, "learning_rate": 1.7620199073056204e-06, "loss": 0.87861919, "num_input_tokens_seen": 197759160, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.09869385, "step": 9178, "time_per_iteration": 2.535918712615967 }, { "auxiliary_loss_clip": 0.06437548, "auxiliary_loss_mlp": 0.0126593, "balance_loss_clip": 0.06279645, "balance_loss_mlp": 0.0125423, "epoch": 0.5518713362392905, "flos": 25089699726720.0, "grad_norm": 1.4968947288510557, "language_loss": 0.75146282, "learning_rate": 1.761633217089826e-06, "loss": 0.82849759, "num_input_tokens_seen": 197779760, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.11712646, "step": 9179, "time_per_iteration": 4.0396506786346436 }, { "auxiliary_loss_clip": 0.06436157, "auxiliary_loss_mlp": 0.01269832, "balance_loss_clip": 0.06283861, "balance_loss_mlp": 0.01259056, "epoch": 0.5519314594919585, "flos": 36547911279360.0, "grad_norm": 1.78400009477242, "language_loss": 0.69813585, "learning_rate": 1.761246535912924e-06, "loss": 0.77519578, "num_input_tokens_seen": 197801545, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.10772705, "step": 9180, "time_per_iteration": 2.6756999492645264 }, { "auxiliary_loss_clip": 0.06435402, "auxiliary_loss_mlp": 0.01267052, "balance_loss_clip": 0.06281789, "balance_loss_mlp": 0.01255787, "epoch": 0.5519915827446265, "flos": 20454807456000.0, "grad_norm": 1.77712800957052, "language_loss": 0.67610711, "learning_rate": 1.7608598637895776e-06, "loss": 0.75313169, "num_input_tokens_seen": 197820760, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.1126709, "step": 9181, "time_per_iteration": 2.5645077228546143 }, { "auxiliary_loss_clip": 0.06442951, "auxiliary_loss_mlp": 0.01267437, "balance_loss_clip": 0.06282967, "balance_loss_mlp": 0.01255927, "epoch": 0.5520517059972945, "flos": 23774672949120.0, "grad_norm": 2.0043155995548037, "language_loss": 0.79460526, "learning_rate": 1.7604732007344486e-06, "loss": 0.87170917, "num_input_tokens_seen": 197840195, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.11505127, "step": 9182, "time_per_iteration": 2.5777010917663574 }, { "auxiliary_loss_clip": 0.06437692, "auxiliary_loss_mlp": 0.01265926, "balance_loss_clip": 0.06283038, "balance_loss_mlp": 0.01254553, "epoch": 0.5521118292499624, "flos": 22202362609920.0, "grad_norm": 1.976989682476522, "language_loss": 0.83439064, "learning_rate": 1.7600865467622003e-06, "loss": 0.91142678, "num_input_tokens_seen": 197859475, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.11383057, "step": 9183, "time_per_iteration": 2.557408332824707 }, { "auxiliary_loss_clip": 0.06433978, "auxiliary_loss_mlp": 0.01265285, "balance_loss_clip": 0.06281278, "balance_loss_mlp": 0.01254902, "epoch": 0.5521719525026304, "flos": 23589491425920.0, "grad_norm": 1.3414254492396644, "language_loss": 0.67528749, "learning_rate": 1.7596999018874936e-06, "loss": 0.75228012, "num_input_tokens_seen": 197879395, "router_z_loss_clip": 1.52636719, "router_z_loss_mlp": 0.10394287, "step": 9184, "time_per_iteration": 2.569830894470215 }, { "auxiliary_loss_clip": 0.06433623, "auxiliary_loss_mlp": 0.01266112, "balance_loss_clip": 0.0628015, "balance_loss_mlp": 0.01255627, "epoch": 0.5522320757552983, "flos": 26144298414720.0, "grad_norm": 1.4067798610242856, "language_loss": 0.75938427, "learning_rate": 1.7593132661249917e-06, "loss": 0.83638161, "num_input_tokens_seen": 197900815, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.10479736, "step": 9185, "time_per_iteration": 2.6695995330810547 }, { "auxiliary_loss_clip": 0.06436324, "auxiliary_loss_mlp": 0.01267858, "balance_loss_clip": 0.06280588, "balance_loss_mlp": 0.0125695, "epoch": 0.5522921990079663, "flos": 24682258448640.0, "grad_norm": 2.0375439930702663, "language_loss": 0.74631584, "learning_rate": 1.7589266394893536e-06, "loss": 0.8233577, "num_input_tokens_seen": 197918985, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.10906982, "step": 9186, "time_per_iteration": 2.6266028881073 }, { "auxiliary_loss_clip": 0.06437497, "auxiliary_loss_mlp": 0.0126764, "balance_loss_clip": 0.06281186, "balance_loss_mlp": 0.01256351, "epoch": 0.5523523222606344, "flos": 22754888432640.0, "grad_norm": 2.4074695821131753, "language_loss": 0.66314244, "learning_rate": 1.7585400219952421e-06, "loss": 0.74019384, "num_input_tokens_seen": 197937725, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.11291504, "step": 9187, "time_per_iteration": 2.569955587387085 }, { "auxiliary_loss_clip": 0.06434517, "auxiliary_loss_mlp": 0.01268403, "balance_loss_clip": 0.0627861, "balance_loss_mlp": 0.01256947, "epoch": 0.5524124455133023, "flos": 19761976771200.0, "grad_norm": 1.8038202638789342, "language_loss": 0.78224248, "learning_rate": 1.758153413657318e-06, "loss": 0.85927171, "num_input_tokens_seen": 197955635, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.11462402, "step": 9188, "time_per_iteration": 2.5604143142700195 }, { "auxiliary_loss_clip": 0.06434263, "auxiliary_loss_mlp": 0.01271265, "balance_loss_clip": 0.0628164, "balance_loss_mlp": 0.0125972, "epoch": 0.5524725687659703, "flos": 23301544469760.0, "grad_norm": 1.764580345616596, "language_loss": 0.81623727, "learning_rate": 1.7577668144902394e-06, "loss": 0.89329255, "num_input_tokens_seen": 197974490, "router_z_loss_clip": 1.52636719, "router_z_loss_mlp": 0.11553955, "step": 9189, "time_per_iteration": 5.450241327285767 }, { "auxiliary_loss_clip": 0.06431809, "auxiliary_loss_mlp": 0.01264557, "balance_loss_clip": 0.0628134, "balance_loss_mlp": 0.01253369, "epoch": 0.5525326920186382, "flos": 24868907418240.0, "grad_norm": 1.5658757563891514, "language_loss": 0.76484466, "learning_rate": 1.7573802245086684e-06, "loss": 0.84180832, "num_input_tokens_seen": 197995735, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.11187744, "step": 9190, "time_per_iteration": 2.591536283493042 }, { "auxiliary_loss_clip": 0.06439877, "auxiliary_loss_mlp": 0.01269309, "balance_loss_clip": 0.06279108, "balance_loss_mlp": 0.01257013, "epoch": 0.5525928152713062, "flos": 13740710371200.0, "grad_norm": 2.2195840626231393, "language_loss": 0.79592562, "learning_rate": 1.7569936437272627e-06, "loss": 0.87301743, "num_input_tokens_seen": 198009685, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.12298584, "step": 9191, "time_per_iteration": 2.58870005607605 }, { "auxiliary_loss_clip": 0.06431918, "auxiliary_loss_mlp": 0.01263842, "balance_loss_clip": 0.0627806, "balance_loss_mlp": 0.01253321, "epoch": 0.5526529385239741, "flos": 13075398552960.0, "grad_norm": 1.8625807120877327, "language_loss": 0.68555307, "learning_rate": 1.7566070721606829e-06, "loss": 0.76251066, "num_input_tokens_seen": 198026845, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.10528564, "step": 9192, "time_per_iteration": 2.5702245235443115 }, { "auxiliary_loss_clip": 0.06430331, "auxiliary_loss_mlp": 0.01266142, "balance_loss_clip": 0.06280251, "balance_loss_mlp": 0.01256397, "epoch": 0.5527130617766421, "flos": 23154992478720.0, "grad_norm": 1.503301268511679, "language_loss": 0.77646327, "learning_rate": 1.756220509823588e-06, "loss": 0.85342807, "num_input_tokens_seen": 198045275, "router_z_loss_clip": 1.49804688, "router_z_loss_mlp": 0.09747314, "step": 9193, "time_per_iteration": 2.5411739349365234 }, { "auxiliary_loss_clip": 0.06432015, "auxiliary_loss_mlp": 0.01268466, "balance_loss_clip": 0.06278718, "balance_loss_mlp": 0.0125819, "epoch": 0.55277318502931, "flos": 21291506801280.0, "grad_norm": 1.7626337593689474, "language_loss": 0.79249454, "learning_rate": 1.7558339567306344e-06, "loss": 0.86949933, "num_input_tokens_seen": 198065760, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.10284424, "step": 9194, "time_per_iteration": 2.549325704574585 }, { "auxiliary_loss_clip": 0.06439797, "auxiliary_loss_mlp": 0.0126653, "balance_loss_clip": 0.06278483, "balance_loss_mlp": 0.0125502, "epoch": 0.5528333082819781, "flos": 38333383205760.0, "grad_norm": 2.0670660954972844, "language_loss": 0.70333427, "learning_rate": 1.7554474128964825e-06, "loss": 0.78039753, "num_input_tokens_seen": 198087595, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.1151123, "step": 9195, "time_per_iteration": 2.8183650970458984 }, { "auxiliary_loss_clip": 0.06444147, "auxiliary_loss_mlp": 0.0126591, "balance_loss_clip": 0.06281706, "balance_loss_mlp": 0.0125303, "epoch": 0.552893431534646, "flos": 13558799157120.0, "grad_norm": 1.961446525542619, "language_loss": 0.74378729, "learning_rate": 1.7550608783357887e-06, "loss": 0.8208878, "num_input_tokens_seen": 198104620, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.12866211, "step": 9196, "time_per_iteration": 2.6356513500213623 }, { "auxiliary_loss_clip": 0.06438795, "auxiliary_loss_mlp": 0.01266511, "balance_loss_clip": 0.06285072, "balance_loss_mlp": 0.01254953, "epoch": 0.552953554787314, "flos": 21944995194240.0, "grad_norm": 1.7518725209475643, "language_loss": 0.77161676, "learning_rate": 1.7546743530632115e-06, "loss": 0.84866977, "num_input_tokens_seen": 198123565, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.11553955, "step": 9197, "time_per_iteration": 2.610302209854126 }, { "auxiliary_loss_clip": 0.06432538, "auxiliary_loss_mlp": 0.01266472, "balance_loss_clip": 0.06280604, "balance_loss_mlp": 0.0125597, "epoch": 0.5530136780399819, "flos": 43668820736640.0, "grad_norm": 2.256370670277029, "language_loss": 0.76042378, "learning_rate": 1.754287837093407e-06, "loss": 0.83741385, "num_input_tokens_seen": 198148270, "router_z_loss_clip": 1.51953125, "router_z_loss_mlp": 0.10516357, "step": 9198, "time_per_iteration": 2.805697202682495 }, { "auxiliary_loss_clip": 0.06438516, "auxiliary_loss_mlp": 0.01265748, "balance_loss_clip": 0.06284013, "balance_loss_mlp": 0.01255479, "epoch": 0.5530738012926499, "flos": 25052411859840.0, "grad_norm": 1.4181398210604776, "language_loss": 0.79544389, "learning_rate": 1.7539013304410327e-06, "loss": 0.87248659, "num_input_tokens_seen": 198168810, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.1027832, "step": 9199, "time_per_iteration": 2.622087240219116 }, { "auxiliary_loss_clip": 0.06430647, "auxiliary_loss_mlp": 0.01269908, "balance_loss_clip": 0.06278695, "balance_loss_mlp": 0.01259841, "epoch": 0.553133924545318, "flos": 16477680136320.0, "grad_norm": 1.8482479541419674, "language_loss": 0.63439882, "learning_rate": 1.7535148331207443e-06, "loss": 0.71140438, "num_input_tokens_seen": 198186200, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.10064697, "step": 9200, "time_per_iteration": 2.558290958404541 }, { "auxiliary_loss_clip": 0.06437934, "auxiliary_loss_mlp": 0.01266816, "balance_loss_clip": 0.06281224, "balance_loss_mlp": 0.01255462, "epoch": 0.5531940477979859, "flos": 24612797813760.0, "grad_norm": 6.886180231337926, "language_loss": 0.66512573, "learning_rate": 1.7531283451471978e-06, "loss": 0.74217319, "num_input_tokens_seen": 198207050, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.11352539, "step": 9201, "time_per_iteration": 2.6138131618499756 }, { "auxiliary_loss_clip": 0.06439012, "auxiliary_loss_mlp": 0.01269846, "balance_loss_clip": 0.06284098, "balance_loss_mlp": 0.0125817, "epoch": 0.5532541710506539, "flos": 22165410159360.0, "grad_norm": 1.8602299670499813, "language_loss": 0.61145055, "learning_rate": 1.7527418665350502e-06, "loss": 0.68853915, "num_input_tokens_seen": 198224565, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.11676025, "step": 9202, "time_per_iteration": 2.5853543281555176 }, { "auxiliary_loss_clip": 0.06433293, "auxiliary_loss_mlp": 0.01264819, "balance_loss_clip": 0.06283714, "balance_loss_mlp": 0.01253703, "epoch": 0.5533142943033218, "flos": 21403621964160.0, "grad_norm": 1.613275018969098, "language_loss": 0.64364696, "learning_rate": 1.7523553972989548e-06, "loss": 0.72062808, "num_input_tokens_seen": 198244790, "router_z_loss_clip": 1.49511719, "router_z_loss_mlp": 0.11120605, "step": 9203, "time_per_iteration": 2.5705134868621826 }, { "auxiliary_loss_clip": 0.06437398, "auxiliary_loss_mlp": 0.01269279, "balance_loss_clip": 0.06284009, "balance_loss_mlp": 0.0125815, "epoch": 0.5533744175559898, "flos": 23557360584960.0, "grad_norm": 1.761484731781425, "language_loss": 0.63809681, "learning_rate": 1.7519689374535683e-06, "loss": 0.71516359, "num_input_tokens_seen": 198264375, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.11132812, "step": 9204, "time_per_iteration": 2.6158394813537598 }, { "auxiliary_loss_clip": 0.06434163, "auxiliary_loss_mlp": 0.01264385, "balance_loss_clip": 0.06283505, "balance_loss_mlp": 0.01253883, "epoch": 0.5534345408086577, "flos": 24068447763840.0, "grad_norm": 1.8235385983378236, "language_loss": 0.77381766, "learning_rate": 1.7515824870135445e-06, "loss": 0.85080314, "num_input_tokens_seen": 198283895, "router_z_loss_clip": 1.50488281, "router_z_loss_mlp": 0.1050415, "step": 9205, "time_per_iteration": 2.5979549884796143 }, { "auxiliary_loss_clip": 0.06434904, "auxiliary_loss_mlp": 0.01264868, "balance_loss_clip": 0.06286247, "balance_loss_mlp": 0.01254085, "epoch": 0.5534946640613257, "flos": 33781242441600.0, "grad_norm": 1.4078557269128693, "language_loss": 0.73135805, "learning_rate": 1.751196045993537e-06, "loss": 0.80835581, "num_input_tokens_seen": 198310035, "router_z_loss_clip": 1.48632812, "router_z_loss_mlp": 0.10778809, "step": 9206, "time_per_iteration": 2.7266464233398438 }, { "auxiliary_loss_clip": 0.064372, "auxiliary_loss_mlp": 0.01266888, "balance_loss_clip": 0.06283505, "balance_loss_mlp": 0.01256082, "epoch": 0.5535547873139937, "flos": 15164707783680.0, "grad_norm": 2.9093612844008563, "language_loss": 0.75516796, "learning_rate": 1.7508096144082012e-06, "loss": 0.83220875, "num_input_tokens_seen": 198327810, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.10803223, "step": 9207, "time_per_iteration": 2.5675511360168457 }, { "auxiliary_loss_clip": 0.06445733, "auxiliary_loss_mlp": 0.01264819, "balance_loss_clip": 0.06286649, "balance_loss_mlp": 0.01252951, "epoch": 0.5536149105666617, "flos": 16986209765760.0, "grad_norm": 2.3789999838493747, "language_loss": 0.61857289, "learning_rate": 1.750423192272189e-06, "loss": 0.69567841, "num_input_tokens_seen": 198343150, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.11865234, "step": 9208, "time_per_iteration": 2.6460015773773193 }, { "auxiliary_loss_clip": 0.06445017, "auxiliary_loss_mlp": 0.01269476, "balance_loss_clip": 0.06287816, "balance_loss_mlp": 0.01257364, "epoch": 0.5536750338193296, "flos": 18155732728320.0, "grad_norm": 3.8143323142976167, "language_loss": 0.64541495, "learning_rate": 1.7500367796001547e-06, "loss": 0.72255987, "num_input_tokens_seen": 198360925, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.12115479, "step": 9209, "time_per_iteration": 2.5826056003570557 }, { "auxiliary_loss_clip": 0.06438952, "auxiliary_loss_mlp": 0.0127229, "balance_loss_clip": 0.06284893, "balance_loss_mlp": 0.0126115, "epoch": 0.5537351570719976, "flos": 22754469162240.0, "grad_norm": 1.9293174357797342, "language_loss": 0.83303213, "learning_rate": 1.7496503764067513e-06, "loss": 0.91014457, "num_input_tokens_seen": 198379265, "router_z_loss_clip": 1.54101562, "router_z_loss_mlp": 0.1114502, "step": 9210, "time_per_iteration": 2.5602824687957764 }, { "auxiliary_loss_clip": 0.0643411, "auxiliary_loss_mlp": 0.0126814, "balance_loss_clip": 0.06283377, "balance_loss_mlp": 0.0125827, "epoch": 0.5537952803246655, "flos": 26362658954880.0, "grad_norm": 1.692535229461379, "language_loss": 0.73281103, "learning_rate": 1.74926398270663e-06, "loss": 0.80983353, "num_input_tokens_seen": 198399490, "router_z_loss_clip": 1.50683594, "router_z_loss_mlp": 0.09869385, "step": 9211, "time_per_iteration": 2.593045711517334 }, { "auxiliary_loss_clip": 0.06446795, "auxiliary_loss_mlp": 0.01272355, "balance_loss_clip": 0.06288711, "balance_loss_mlp": 0.01260136, "epoch": 0.5538554035773335, "flos": 18042695170560.0, "grad_norm": 2.0763708460000077, "language_loss": 0.66712004, "learning_rate": 1.7488775985144437e-06, "loss": 0.74431157, "num_input_tokens_seen": 198419110, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.12225342, "step": 9212, "time_per_iteration": 2.545570135116577 }, { "auxiliary_loss_clip": 0.06443217, "auxiliary_loss_mlp": 0.01270055, "balance_loss_clip": 0.06285324, "balance_loss_mlp": 0.01256721, "epoch": 0.5539155268300014, "flos": 31694323052160.0, "grad_norm": 3.9789384194480273, "language_loss": 0.52061242, "learning_rate": 1.7484912238448443e-06, "loss": 0.59774512, "num_input_tokens_seen": 198441360, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.13342285, "step": 9213, "time_per_iteration": 2.6543917655944824 }, { "auxiliary_loss_clip": 0.06446379, "auxiliary_loss_mlp": 0.01265668, "balance_loss_clip": 0.06289785, "balance_loss_mlp": 0.01254521, "epoch": 0.5539756500826695, "flos": 15198934976640.0, "grad_norm": 2.9235593068911814, "language_loss": 0.85589021, "learning_rate": 1.7481048587124827e-06, "loss": 0.9330107, "num_input_tokens_seen": 198459835, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.11157227, "step": 9214, "time_per_iteration": 2.5691823959350586 }, { "auxiliary_loss_clip": 0.06435941, "auxiliary_loss_mlp": 0.0126406, "balance_loss_clip": 0.06283119, "balance_loss_mlp": 0.01253313, "epoch": 0.5540357733353375, "flos": 26359262864640.0, "grad_norm": 1.6144273370483544, "language_loss": 0.70047599, "learning_rate": 1.7477185031320108e-06, "loss": 0.77747601, "num_input_tokens_seen": 198478955, "router_z_loss_clip": 1.52539062, "router_z_loss_mlp": 0.10754395, "step": 9215, "time_per_iteration": 4.059562921524048 }, { "auxiliary_loss_clip": 0.06440696, "auxiliary_loss_mlp": 0.0126531, "balance_loss_clip": 0.06283514, "balance_loss_mlp": 0.01253866, "epoch": 0.5540958965880054, "flos": 21329926698240.0, "grad_norm": 1.5799851219289984, "language_loss": 0.73379272, "learning_rate": 1.7473321571180773e-06, "loss": 0.81085277, "num_input_tokens_seen": 198499030, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.11437988, "step": 9216, "time_per_iteration": 2.587646722793579 }, { "auxiliary_loss_clip": 0.06435079, "auxiliary_loss_mlp": 0.01266816, "balance_loss_clip": 0.06284938, "balance_loss_mlp": 0.01255861, "epoch": 0.5541560198406734, "flos": 25674020974080.0, "grad_norm": 2.3173733883056795, "language_loss": 0.71671617, "learning_rate": 1.7469458206853345e-06, "loss": 0.79373509, "num_input_tokens_seen": 198520265, "router_z_loss_clip": 1.50292969, "router_z_loss_mlp": 0.10955811, "step": 9217, "time_per_iteration": 2.6131839752197266 }, { "auxiliary_loss_clip": 0.0643113, "auxiliary_loss_mlp": 0.01262975, "balance_loss_clip": 0.06279559, "balance_loss_mlp": 0.012527, "epoch": 0.5542161430933413, "flos": 21945246756480.0, "grad_norm": 1.5983749094496316, "language_loss": 0.78550029, "learning_rate": 1.7465594938484315e-06, "loss": 0.8624413, "num_input_tokens_seen": 198539645, "router_z_loss_clip": 1.51367188, "router_z_loss_mlp": 0.10272217, "step": 9218, "time_per_iteration": 4.05475926399231 }, { "auxiliary_loss_clip": 0.06442536, "auxiliary_loss_mlp": 0.01266162, "balance_loss_clip": 0.06284512, "balance_loss_mlp": 0.01254641, "epoch": 0.5542762663460093, "flos": 19577256445440.0, "grad_norm": 1.6722834359201677, "language_loss": 0.72802615, "learning_rate": 1.7461731766220176e-06, "loss": 0.80511314, "num_input_tokens_seen": 198558710, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.11529541, "step": 9219, "time_per_iteration": 2.5620617866516113 }, { "auxiliary_loss_clip": 0.06436677, "auxiliary_loss_mlp": 0.01270858, "balance_loss_clip": 0.06281015, "balance_loss_mlp": 0.01259318, "epoch": 0.5543363895986773, "flos": 19504944771840.0, "grad_norm": 1.4738914210940715, "language_loss": 0.71344286, "learning_rate": 1.7457868690207426e-06, "loss": 0.79051822, "num_input_tokens_seen": 198577050, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.11547852, "step": 9220, "time_per_iteration": 2.5707664489746094 }, { "auxiliary_loss_clip": 0.0643037, "auxiliary_loss_mlp": 0.01264703, "balance_loss_clip": 0.06279199, "balance_loss_mlp": 0.01254499, "epoch": 0.5543965128513453, "flos": 22641808947840.0, "grad_norm": 1.960870173646021, "language_loss": 0.79488254, "learning_rate": 1.7454005710592547e-06, "loss": 0.87183326, "num_input_tokens_seen": 198595290, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.10205078, "step": 9221, "time_per_iteration": 2.5789318084716797 }, { "auxiliary_loss_clip": 0.06433146, "auxiliary_loss_mlp": 0.0126849, "balance_loss_clip": 0.06281461, "balance_loss_mlp": 0.01257022, "epoch": 0.5544566361040132, "flos": 25996320904320.0, "grad_norm": 1.7117475257562833, "language_loss": 0.83714354, "learning_rate": 1.7450142827522027e-06, "loss": 0.91415989, "num_input_tokens_seen": 198614110, "router_z_loss_clip": 1.51660156, "router_z_loss_mlp": 0.11480713, "step": 9222, "time_per_iteration": 2.596130609512329 }, { "auxiliary_loss_clip": 0.06437956, "auxiliary_loss_mlp": 0.01268321, "balance_loss_clip": 0.06279936, "balance_loss_mlp": 0.01256513, "epoch": 0.5545167593566812, "flos": 28265235361920.0, "grad_norm": 1.7911581751579666, "language_loss": 0.7576834, "learning_rate": 1.7446280041142344e-06, "loss": 0.83474612, "num_input_tokens_seen": 198633880, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.11810303, "step": 9223, "time_per_iteration": 2.6353037357330322 }, { "auxiliary_loss_clip": 0.06435737, "auxiliary_loss_mlp": 0.01269018, "balance_loss_clip": 0.06281821, "balance_loss_mlp": 0.01257371, "epoch": 0.5545768826093491, "flos": 28484266734720.0, "grad_norm": 1.5786002262481913, "language_loss": 0.81858253, "learning_rate": 1.7442417351599986e-06, "loss": 0.89563006, "num_input_tokens_seen": 198653505, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.11645508, "step": 9224, "time_per_iteration": 2.61958646774292 }, { "auxiliary_loss_clip": 0.06434928, "auxiliary_loss_mlp": 0.01272109, "balance_loss_clip": 0.06279336, "balance_loss_mlp": 0.01260134, "epoch": 0.5546370058620171, "flos": 18483860517120.0, "grad_norm": 1.9783346319421344, "language_loss": 0.57878774, "learning_rate": 1.743855475904141e-06, "loss": 0.65585816, "num_input_tokens_seen": 198671890, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.11981201, "step": 9225, "time_per_iteration": 2.5360679626464844 }, { "auxiliary_loss_clip": 0.06436422, "auxiliary_loss_mlp": 0.01266591, "balance_loss_clip": 0.06281774, "balance_loss_mlp": 0.0125492, "epoch": 0.554697129114685, "flos": 22937260844160.0, "grad_norm": 2.161367496272747, "language_loss": 0.67690945, "learning_rate": 1.7434692263613098e-06, "loss": 0.75393963, "num_input_tokens_seen": 198691995, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.11676025, "step": 9226, "time_per_iteration": 2.603609800338745 }, { "auxiliary_loss_clip": 0.06437944, "auxiliary_loss_mlp": 0.01268422, "balance_loss_clip": 0.06281282, "balance_loss_mlp": 0.01257729, "epoch": 0.5547572523673531, "flos": 21803348666880.0, "grad_norm": 1.4393765923677801, "language_loss": 0.74529481, "learning_rate": 1.7430829865461518e-06, "loss": 0.82235849, "num_input_tokens_seen": 198712440, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.10687256, "step": 9227, "time_per_iteration": 2.569063186645508 }, { "auxiliary_loss_clip": 0.06438611, "auxiliary_loss_mlp": 0.01268465, "balance_loss_clip": 0.0628258, "balance_loss_mlp": 0.01256127, "epoch": 0.5548173756200211, "flos": 22348830746880.0, "grad_norm": 1.5462376310766428, "language_loss": 0.73466557, "learning_rate": 1.7426967564733118e-06, "loss": 0.81173629, "num_input_tokens_seen": 198731515, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.12329102, "step": 9228, "time_per_iteration": 4.029580116271973 }, { "auxiliary_loss_clip": 0.06434034, "auxiliary_loss_mlp": 0.01266918, "balance_loss_clip": 0.06279156, "balance_loss_mlp": 0.01256046, "epoch": 0.554877498872689, "flos": 17864599317120.0, "grad_norm": 1.6323097754633826, "language_loss": 0.76495349, "learning_rate": 1.7423105361574373e-06, "loss": 0.84196299, "num_input_tokens_seen": 198749750, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.10864258, "step": 9229, "time_per_iteration": 3.9925732612609863 }, { "auxiliary_loss_clip": 0.06440505, "auxiliary_loss_mlp": 0.01266882, "balance_loss_clip": 0.06284808, "balance_loss_mlp": 0.01253942, "epoch": 0.554937622125357, "flos": 17244080305920.0, "grad_norm": 1.3638603609107605, "language_loss": 0.69008148, "learning_rate": 1.741924325613172e-06, "loss": 0.76715541, "num_input_tokens_seen": 198768320, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.12945557, "step": 9230, "time_per_iteration": 2.57723069190979 }, { "auxiliary_loss_clip": 0.06440333, "auxiliary_loss_mlp": 0.01265533, "balance_loss_clip": 0.06282711, "balance_loss_mlp": 0.01253487, "epoch": 0.5549977453780249, "flos": 25374082884480.0, "grad_norm": 4.9362614697065075, "language_loss": 0.68649638, "learning_rate": 1.741538124855163e-06, "loss": 0.76355499, "num_input_tokens_seen": 198787230, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.1204834, "step": 9231, "time_per_iteration": 2.6006250381469727 }, { "auxiliary_loss_clip": 0.06441125, "auxiliary_loss_mlp": 0.01271564, "balance_loss_clip": 0.06281154, "balance_loss_mlp": 0.01258618, "epoch": 0.555057868630693, "flos": 25085548949760.0, "grad_norm": 2.7749681833336592, "language_loss": 0.78314948, "learning_rate": 1.7411519338980548e-06, "loss": 0.86027634, "num_input_tokens_seen": 198806720, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.1295166, "step": 9232, "time_per_iteration": 2.602567672729492 }, { "auxiliary_loss_clip": 0.06432404, "auxiliary_loss_mlp": 0.01268314, "balance_loss_clip": 0.06281234, "balance_loss_mlp": 0.01257693, "epoch": 0.5551179918833609, "flos": 26111412887040.0, "grad_norm": 1.9212857792073585, "language_loss": 0.83002204, "learning_rate": 1.7407657527564898e-06, "loss": 0.90702927, "num_input_tokens_seen": 198826235, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.10620117, "step": 9233, "time_per_iteration": 2.591310977935791 }, { "auxiliary_loss_clip": 0.06442029, "auxiliary_loss_mlp": 0.01268339, "balance_loss_clip": 0.06282435, "balance_loss_mlp": 0.01256484, "epoch": 0.5551781151360289, "flos": 19389810862080.0, "grad_norm": 2.0365221972494525, "language_loss": 0.75221908, "learning_rate": 1.7403795814451142e-06, "loss": 0.8293227, "num_input_tokens_seen": 198842655, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.11865234, "step": 9234, "time_per_iteration": 2.5558581352233887 }, { "auxiliary_loss_clip": 0.06433694, "auxiliary_loss_mlp": 0.01264676, "balance_loss_clip": 0.06281202, "balance_loss_mlp": 0.01254137, "epoch": 0.5552382383886968, "flos": 21732420585600.0, "grad_norm": 2.0678628974020823, "language_loss": 0.65285569, "learning_rate": 1.7399934199785706e-06, "loss": 0.72983938, "num_input_tokens_seen": 198861210, "router_z_loss_clip": 1.52441406, "router_z_loss_mlp": 0.10534668, "step": 9235, "time_per_iteration": 2.6082115173339844 }, { "auxiliary_loss_clip": 0.06433504, "auxiliary_loss_mlp": 0.01267799, "balance_loss_clip": 0.06277886, "balance_loss_mlp": 0.01256164, "epoch": 0.5552983616413648, "flos": 14361480944640.0, "grad_norm": 2.103438225516204, "language_loss": 0.68408066, "learning_rate": 1.7396072683715029e-06, "loss": 0.76109374, "num_input_tokens_seen": 198880045, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.11627197, "step": 9236, "time_per_iteration": 2.5396194458007812 }, { "auxiliary_loss_clip": 0.06432586, "auxiliary_loss_mlp": 0.01265551, "balance_loss_clip": 0.06282729, "balance_loss_mlp": 0.01255019, "epoch": 0.5553584848940327, "flos": 25484730600960.0, "grad_norm": 1.9775601489968573, "language_loss": 0.86609149, "learning_rate": 1.7392211266385536e-06, "loss": 0.94307292, "num_input_tokens_seen": 198900210, "router_z_loss_clip": 1.49804688, "router_z_loss_mlp": 0.10522461, "step": 9237, "time_per_iteration": 2.617584228515625 }, { "auxiliary_loss_clip": 0.06432724, "auxiliary_loss_mlp": 0.01269129, "balance_loss_clip": 0.06282562, "balance_loss_mlp": 0.01258025, "epoch": 0.5554186081467007, "flos": 22170399477120.0, "grad_norm": 1.5315887741061143, "language_loss": 0.73361933, "learning_rate": 1.7388349947943652e-06, "loss": 0.81063783, "num_input_tokens_seen": 198919055, "router_z_loss_clip": 1.50292969, "router_z_loss_mlp": 0.11102295, "step": 9238, "time_per_iteration": 2.569843053817749 }, { "auxiliary_loss_clip": 0.06441993, "auxiliary_loss_mlp": 0.01267301, "balance_loss_clip": 0.06283353, "balance_loss_mlp": 0.01256298, "epoch": 0.5554787313993687, "flos": 49757744908800.0, "grad_norm": 1.5667693581060074, "language_loss": 0.78670084, "learning_rate": 1.73844887285358e-06, "loss": 0.86379373, "num_input_tokens_seen": 198943505, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.10992432, "step": 9239, "time_per_iteration": 2.912437915802002 }, { "auxiliary_loss_clip": 0.06439289, "auxiliary_loss_mlp": 0.01266742, "balance_loss_clip": 0.06283437, "balance_loss_mlp": 0.01254803, "epoch": 0.5555388546520367, "flos": 22133908224000.0, "grad_norm": 1.4757358058120373, "language_loss": 0.80104655, "learning_rate": 1.7380627608308393e-06, "loss": 0.87810683, "num_input_tokens_seen": 198963590, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.11938477, "step": 9240, "time_per_iteration": 2.563163995742798 }, { "auxiliary_loss_clip": 0.06434449, "auxiliary_loss_mlp": 0.01268127, "balance_loss_clip": 0.06282632, "balance_loss_mlp": 0.01256665, "epoch": 0.5555989779047047, "flos": 24689218337280.0, "grad_norm": 1.5526797986275522, "language_loss": 0.64877045, "learning_rate": 1.737676658740786e-06, "loss": 0.72579622, "num_input_tokens_seen": 198982680, "router_z_loss_clip": 1.51953125, "router_z_loss_mlp": 0.11462402, "step": 9241, "time_per_iteration": 2.5947630405426025 }, { "auxiliary_loss_clip": 0.06442414, "auxiliary_loss_mlp": 0.01266031, "balance_loss_clip": 0.06287593, "balance_loss_mlp": 0.0125402, "epoch": 0.5556591011573726, "flos": 16111929064320.0, "grad_norm": 2.0552023278782294, "language_loss": 0.72887444, "learning_rate": 1.7372905665980594e-06, "loss": 0.80595893, "num_input_tokens_seen": 199000185, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.12011719, "step": 9242, "time_per_iteration": 2.544358491897583 }, { "auxiliary_loss_clip": 0.06437739, "auxiliary_loss_mlp": 0.01267935, "balance_loss_clip": 0.06281431, "balance_loss_mlp": 0.01255418, "epoch": 0.5557192244100406, "flos": 12938825197440.0, "grad_norm": 2.5878098197228114, "language_loss": 0.6434598, "learning_rate": 1.7369044844173012e-06, "loss": 0.7205165, "num_input_tokens_seen": 199018380, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.12524414, "step": 9243, "time_per_iteration": 2.6505966186523438 }, { "auxiliary_loss_clip": 0.06440741, "auxiliary_loss_mlp": 0.01269166, "balance_loss_clip": 0.06287825, "balance_loss_mlp": 0.01257722, "epoch": 0.5557793476627085, "flos": 23118291590400.0, "grad_norm": 2.2831036412273513, "language_loss": 0.75559092, "learning_rate": 1.7365184122131509e-06, "loss": 0.83269006, "num_input_tokens_seen": 199037115, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.11437988, "step": 9244, "time_per_iteration": 2.576659679412842 }, { "auxiliary_loss_clip": 0.06436728, "auxiliary_loss_mlp": 0.01264891, "balance_loss_clip": 0.0628883, "balance_loss_mlp": 0.01254657, "epoch": 0.5558394709153766, "flos": 21433446817920.0, "grad_norm": 2.048442666895925, "language_loss": 0.74686879, "learning_rate": 1.7361323500002486e-06, "loss": 0.82388496, "num_input_tokens_seen": 199053375, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.10235596, "step": 9245, "time_per_iteration": 2.581134796142578 }, { "auxiliary_loss_clip": 0.06442839, "auxiliary_loss_mlp": 0.01267725, "balance_loss_clip": 0.06284001, "balance_loss_mlp": 0.01255846, "epoch": 0.5558995941680445, "flos": 25084626554880.0, "grad_norm": 2.179758442513514, "language_loss": 0.7959733, "learning_rate": 1.7357462977932348e-06, "loss": 0.87307888, "num_input_tokens_seen": 199070930, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.11883545, "step": 9246, "time_per_iteration": 2.6127994060516357 }, { "auxiliary_loss_clip": 0.06438754, "auxiliary_loss_mlp": 0.01271898, "balance_loss_clip": 0.06283705, "balance_loss_mlp": 0.01260102, "epoch": 0.5559597174207125, "flos": 20017331688960.0, "grad_norm": 1.7539747895934676, "language_loss": 0.73876536, "learning_rate": 1.7353602556067471e-06, "loss": 0.81587195, "num_input_tokens_seen": 199088675, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.11798096, "step": 9247, "time_per_iteration": 2.568263530731201 }, { "auxiliary_loss_clip": 0.0643894, "auxiliary_loss_mlp": 0.01266398, "balance_loss_clip": 0.06284207, "balance_loss_mlp": 0.01255067, "epoch": 0.5560198406733804, "flos": 16841125221120.0, "grad_norm": 3.5282145009683386, "language_loss": 0.76675141, "learning_rate": 1.7349742234554254e-06, "loss": 0.84380472, "num_input_tokens_seen": 199103075, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.11328125, "step": 9248, "time_per_iteration": 2.5407047271728516 }, { "auxiliary_loss_clip": 0.06336354, "auxiliary_loss_mlp": 0.01262388, "balance_loss_clip": 0.06272949, "balance_loss_mlp": 0.01260342, "epoch": 0.5560799639260484, "flos": 70719012840960.0, "grad_norm": 0.9228395413964137, "language_loss": 0.59358525, "learning_rate": 1.7345882013539081e-06, "loss": 0.66957271, "num_input_tokens_seen": 199160325, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.02046204, "step": 9249, "time_per_iteration": 3.3110506534576416 }, { "auxiliary_loss_clip": 0.06437299, "auxiliary_loss_mlp": 0.01265936, "balance_loss_clip": 0.06281757, "balance_loss_mlp": 0.01255476, "epoch": 0.5561400871787163, "flos": 23155244040960.0, "grad_norm": 1.8809275403159562, "language_loss": 0.80154824, "learning_rate": 1.734202189316832e-06, "loss": 0.87858057, "num_input_tokens_seen": 199179760, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.10473633, "step": 9250, "time_per_iteration": 2.5686211585998535 }, { "auxiliary_loss_clip": 0.06442697, "auxiliary_loss_mlp": 0.01271874, "balance_loss_clip": 0.06283941, "balance_loss_mlp": 0.01260012, "epoch": 0.5562002104313843, "flos": 17572166167680.0, "grad_norm": 3.694784218615971, "language_loss": 0.69210297, "learning_rate": 1.733816187358836e-06, "loss": 0.76924872, "num_input_tokens_seen": 199196695, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.11865234, "step": 9251, "time_per_iteration": 2.550208330154419 }, { "auxiliary_loss_clip": 0.06438687, "auxiliary_loss_mlp": 0.01267584, "balance_loss_clip": 0.06283389, "balance_loss_mlp": 0.0125611, "epoch": 0.5562603336840523, "flos": 25052328005760.0, "grad_norm": 1.6819759599611253, "language_loss": 0.75960582, "learning_rate": 1.7334301954945569e-06, "loss": 0.83666861, "num_input_tokens_seen": 199217845, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.11468506, "step": 9252, "time_per_iteration": 2.673276901245117 }, { "auxiliary_loss_clip": 0.06440004, "auxiliary_loss_mlp": 0.01263516, "balance_loss_clip": 0.06281939, "balance_loss_mlp": 0.01252406, "epoch": 0.5563204569367203, "flos": 29066617411200.0, "grad_norm": 1.5721271650736148, "language_loss": 0.73051226, "learning_rate": 1.7330442137386313e-06, "loss": 0.80754745, "num_input_tokens_seen": 199239250, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.11108398, "step": 9253, "time_per_iteration": 2.7001280784606934 }, { "auxiliary_loss_clip": 0.06436147, "auxiliary_loss_mlp": 0.01267717, "balance_loss_clip": 0.06282777, "balance_loss_mlp": 0.01257387, "epoch": 0.5563805801893883, "flos": 22096913846400.0, "grad_norm": 1.7643567732895014, "language_loss": 0.82587326, "learning_rate": 1.7326582421056965e-06, "loss": 0.9029119, "num_input_tokens_seen": 199258320, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.10327148, "step": 9254, "time_per_iteration": 4.170782804489136 }, { "auxiliary_loss_clip": 0.06338149, "auxiliary_loss_mlp": 0.01255036, "balance_loss_clip": 0.06274775, "balance_loss_mlp": 0.01253096, "epoch": 0.5564407034420562, "flos": 58652623555200.0, "grad_norm": 0.8626718686419161, "language_loss": 0.64911151, "learning_rate": 1.732272280610387e-06, "loss": 0.72504336, "num_input_tokens_seen": 199314840, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.01937866, "step": 9255, "time_per_iteration": 3.0181710720062256 }, { "auxiliary_loss_clip": 0.06438573, "auxiliary_loss_mlp": 0.01268378, "balance_loss_clip": 0.06286976, "balance_loss_mlp": 0.01258269, "epoch": 0.5565008266947242, "flos": 23119004350080.0, "grad_norm": 1.8297370592316053, "language_loss": 0.70111638, "learning_rate": 1.7318863292673399e-06, "loss": 0.7781859, "num_input_tokens_seen": 199335405, "router_z_loss_clip": 1.51660156, "router_z_loss_mlp": 0.10095215, "step": 9256, "time_per_iteration": 2.573627233505249 }, { "auxiliary_loss_clip": 0.06437407, "auxiliary_loss_mlp": 0.0126318, "balance_loss_clip": 0.06287009, "balance_loss_mlp": 0.01252946, "epoch": 0.5565609499473921, "flos": 21584568856320.0, "grad_norm": 1.758485485748899, "language_loss": 0.75984788, "learning_rate": 1.73150038809119e-06, "loss": 0.83685374, "num_input_tokens_seen": 199354345, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.10229492, "step": 9257, "time_per_iteration": 4.065265417098999 }, { "auxiliary_loss_clip": 0.06437719, "auxiliary_loss_mlp": 0.01267596, "balance_loss_clip": 0.0628287, "balance_loss_mlp": 0.01256933, "epoch": 0.5566210732000602, "flos": 18375602641920.0, "grad_norm": 2.129757601306982, "language_loss": 0.61831862, "learning_rate": 1.7311144570965724e-06, "loss": 0.69537175, "num_input_tokens_seen": 199372250, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.10675049, "step": 9258, "time_per_iteration": 2.542095422744751 }, { "auxiliary_loss_clip": 0.0643584, "auxiliary_loss_mlp": 0.01265015, "balance_loss_clip": 0.06281713, "balance_loss_mlp": 0.01254066, "epoch": 0.5566811964527281, "flos": 25710554154240.0, "grad_norm": 1.677213075900907, "language_loss": 0.79593682, "learning_rate": 1.7307285362981215e-06, "loss": 0.87294537, "num_input_tokens_seen": 199392815, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.10949707, "step": 9259, "time_per_iteration": 2.5958564281463623 }, { "auxiliary_loss_clip": 0.06438383, "auxiliary_loss_mlp": 0.01266401, "balance_loss_clip": 0.0628394, "balance_loss_mlp": 0.01255684, "epoch": 0.5567413197053961, "flos": 26951424468480.0, "grad_norm": 1.9153412255084379, "language_loss": 0.81739473, "learning_rate": 1.7303426257104712e-06, "loss": 0.89444262, "num_input_tokens_seen": 199412375, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.10705566, "step": 9260, "time_per_iteration": 2.604867458343506 }, { "auxiliary_loss_clip": 0.06441177, "auxiliary_loss_mlp": 0.01268558, "balance_loss_clip": 0.06286378, "balance_loss_mlp": 0.01256649, "epoch": 0.556801442958064, "flos": 20856965927040.0, "grad_norm": 1.463592193047527, "language_loss": 0.6917733, "learning_rate": 1.729956725348256e-06, "loss": 0.76887065, "num_input_tokens_seen": 199431490, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.11920166, "step": 9261, "time_per_iteration": 2.5324220657348633 }, { "auxiliary_loss_clip": 0.06337909, "auxiliary_loss_mlp": 0.01252551, "balance_loss_clip": 0.06274761, "balance_loss_mlp": 0.01250891, "epoch": 0.556861566210732, "flos": 70517395918080.0, "grad_norm": 0.714289195811569, "language_loss": 0.61068135, "learning_rate": 1.729570835226108e-06, "loss": 0.6865859, "num_input_tokens_seen": 199495855, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 0.01663208, "step": 9262, "time_per_iteration": 3.1862432956695557 }, { "auxiliary_loss_clip": 0.06437971, "auxiliary_loss_mlp": 0.01267652, "balance_loss_clip": 0.06282204, "balance_loss_mlp": 0.0125693, "epoch": 0.5569216894633999, "flos": 25344216103680.0, "grad_norm": 1.7465443539357377, "language_loss": 0.64845389, "learning_rate": 1.7291849553586622e-06, "loss": 0.72551012, "num_input_tokens_seen": 199515870, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.10717773, "step": 9263, "time_per_iteration": 2.610915184020996 }, { "auxiliary_loss_clip": 0.06437878, "auxiliary_loss_mlp": 0.01271371, "balance_loss_clip": 0.06283979, "balance_loss_mlp": 0.01260505, "epoch": 0.556981812716068, "flos": 22645456600320.0, "grad_norm": 1.8529001129024338, "language_loss": 0.734254, "learning_rate": 1.7287990857605497e-06, "loss": 0.81134653, "num_input_tokens_seen": 199535745, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.10864258, "step": 9264, "time_per_iteration": 2.606642007827759 }, { "auxiliary_loss_clip": 0.06437233, "auxiliary_loss_mlp": 0.0126645, "balance_loss_clip": 0.062821, "balance_loss_mlp": 0.01254899, "epoch": 0.5570419359687359, "flos": 11040567275520.0, "grad_norm": 2.565391980041934, "language_loss": 0.76641577, "learning_rate": 1.7284132264464022e-06, "loss": 0.84345257, "num_input_tokens_seen": 199554035, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.11553955, "step": 9265, "time_per_iteration": 2.5676932334899902 }, { "auxiliary_loss_clip": 0.0643029, "auxiliary_loss_mlp": 0.01268939, "balance_loss_clip": 0.06283315, "balance_loss_mlp": 0.01258997, "epoch": 0.5571020592214039, "flos": 22830218853120.0, "grad_norm": 1.3296556077216555, "language_loss": 0.71109587, "learning_rate": 1.7280273774308536e-06, "loss": 0.78808814, "num_input_tokens_seen": 199576120, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.09942627, "step": 9266, "time_per_iteration": 2.6229114532470703 }, { "auxiliary_loss_clip": 0.06434128, "auxiliary_loss_mlp": 0.01270832, "balance_loss_clip": 0.06282268, "balance_loss_mlp": 0.01259919, "epoch": 0.5571621824740719, "flos": 22934074389120.0, "grad_norm": 1.9376276920418798, "language_loss": 0.68289483, "learning_rate": 1.727641538728533e-06, "loss": 0.75994444, "num_input_tokens_seen": 199593780, "router_z_loss_clip": 1.51757812, "router_z_loss_mlp": 0.10919189, "step": 9267, "time_per_iteration": 2.553323745727539 }, { "auxiliary_loss_clip": 0.0643281, "auxiliary_loss_mlp": 0.01267154, "balance_loss_clip": 0.06284727, "balance_loss_mlp": 0.01256634, "epoch": 0.5572223057267398, "flos": 22973416680960.0, "grad_norm": 2.60120587969058, "language_loss": 0.75174665, "learning_rate": 1.7272557103540736e-06, "loss": 0.82874632, "num_input_tokens_seen": 199613220, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.10516357, "step": 9268, "time_per_iteration": 5.429242372512817 }, { "auxiliary_loss_clip": 0.06433205, "auxiliary_loss_mlp": 0.01263923, "balance_loss_clip": 0.06281732, "balance_loss_mlp": 0.01253493, "epoch": 0.5572824289794078, "flos": 20966439686400.0, "grad_norm": 1.999475587486408, "language_loss": 0.75189197, "learning_rate": 1.726869892322104e-06, "loss": 0.82886326, "num_input_tokens_seen": 199632085, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.10424805, "step": 9269, "time_per_iteration": 2.5712766647338867 }, { "auxiliary_loss_clip": 0.06436111, "auxiliary_loss_mlp": 0.01265862, "balance_loss_clip": 0.06282496, "balance_loss_mlp": 0.01255389, "epoch": 0.5573425522320757, "flos": 25048806134400.0, "grad_norm": 2.1127268654595746, "language_loss": 0.83161259, "learning_rate": 1.726484084647256e-06, "loss": 0.90863228, "num_input_tokens_seen": 199649295, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.10473633, "step": 9270, "time_per_iteration": 2.5986340045928955 }, { "auxiliary_loss_clip": 0.06437553, "auxiliary_loss_mlp": 0.01265302, "balance_loss_clip": 0.06281359, "balance_loss_mlp": 0.0125412, "epoch": 0.5574026754847438, "flos": 23666415073920.0, "grad_norm": 1.9412359866671094, "language_loss": 0.80355173, "learning_rate": 1.7260982873441591e-06, "loss": 0.88058031, "num_input_tokens_seen": 199668870, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.11193848, "step": 9271, "time_per_iteration": 2.5735230445861816 }, { "auxiliary_loss_clip": 0.06437669, "auxiliary_loss_mlp": 0.012655, "balance_loss_clip": 0.06282886, "balance_loss_mlp": 0.01254413, "epoch": 0.5574627987374117, "flos": 24787791066240.0, "grad_norm": 1.718323480795594, "language_loss": 0.9064011, "learning_rate": 1.725712500427442e-06, "loss": 0.98343283, "num_input_tokens_seen": 199684870, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.11090088, "step": 9272, "time_per_iteration": 2.586616039276123 }, { "auxiliary_loss_clip": 0.06437153, "auxiliary_loss_mlp": 0.01266641, "balance_loss_clip": 0.06287056, "balance_loss_mlp": 0.01256294, "epoch": 0.5575229219900797, "flos": 21841349293440.0, "grad_norm": 2.185037100146588, "language_loss": 0.84654236, "learning_rate": 1.7253267239117347e-06, "loss": 0.92358029, "num_input_tokens_seen": 199701975, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.10351562, "step": 9273, "time_per_iteration": 2.5869758129119873 }, { "auxiliary_loss_clip": 0.0643426, "auxiliary_loss_mlp": 0.01269132, "balance_loss_clip": 0.06281073, "balance_loss_mlp": 0.01256526, "epoch": 0.5575830452427476, "flos": 27821973663360.0, "grad_norm": 2.030374480127487, "language_loss": 0.74168736, "learning_rate": 1.7249409578116655e-06, "loss": 0.81872129, "num_input_tokens_seen": 199721865, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.1260376, "step": 9274, "time_per_iteration": 2.6042823791503906 }, { "auxiliary_loss_clip": 0.0644646, "auxiliary_loss_mlp": 0.01273013, "balance_loss_clip": 0.06285661, "balance_loss_mlp": 0.01261193, "epoch": 0.5576431684954156, "flos": 17817081252480.0, "grad_norm": 2.574087752423517, "language_loss": 0.78573871, "learning_rate": 1.7245552021418629e-06, "loss": 0.86293346, "num_input_tokens_seen": 199736455, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.11828613, "step": 9275, "time_per_iteration": 2.576761245727539 }, { "auxiliary_loss_clip": 0.06435484, "auxiliary_loss_mlp": 0.01265795, "balance_loss_clip": 0.06282168, "balance_loss_mlp": 0.01254809, "epoch": 0.5577032917480835, "flos": 15492290520960.0, "grad_norm": 1.5531050403587492, "language_loss": 0.7517671, "learning_rate": 1.7241694569169546e-06, "loss": 0.82877988, "num_input_tokens_seen": 199753125, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.10986328, "step": 9276, "time_per_iteration": 2.592256546020508 }, { "auxiliary_loss_clip": 0.06434746, "auxiliary_loss_mlp": 0.01265135, "balance_loss_clip": 0.0628148, "balance_loss_mlp": 0.01254483, "epoch": 0.5577634150007516, "flos": 21586162083840.0, "grad_norm": 1.5701507004060289, "language_loss": 0.75620735, "learning_rate": 1.7237837221515678e-06, "loss": 0.83320618, "num_input_tokens_seen": 199771365, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.10644531, "step": 9277, "time_per_iteration": 2.545672655105591 }, { "auxiliary_loss_clip": 0.06434423, "auxiliary_loss_mlp": 0.01266111, "balance_loss_clip": 0.0628402, "balance_loss_mlp": 0.01255847, "epoch": 0.5578235382534195, "flos": 21145709496960.0, "grad_norm": 1.4564847284654285, "language_loss": 0.7230444, "learning_rate": 1.7233979978603304e-06, "loss": 0.80004972, "num_input_tokens_seen": 199790035, "router_z_loss_clip": 1.50683594, "router_z_loss_mlp": 0.10266113, "step": 9278, "time_per_iteration": 2.5891072750091553 }, { "auxiliary_loss_clip": 0.06439757, "auxiliary_loss_mlp": 0.01266352, "balance_loss_clip": 0.06282727, "balance_loss_mlp": 0.0125452, "epoch": 0.5578836615060875, "flos": 26512397400960.0, "grad_norm": 1.5098484650864372, "language_loss": 0.75630748, "learning_rate": 1.723012284057868e-06, "loss": 0.83336854, "num_input_tokens_seen": 199811125, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.1184082, "step": 9279, "time_per_iteration": 2.618732452392578 }, { "auxiliary_loss_clip": 0.06437541, "auxiliary_loss_mlp": 0.01267404, "balance_loss_clip": 0.06282216, "balance_loss_mlp": 0.01256157, "epoch": 0.5579437847587555, "flos": 20159439413760.0, "grad_norm": 1.6221536197354633, "language_loss": 0.6754238, "learning_rate": 1.7226265807588082e-06, "loss": 0.75247324, "num_input_tokens_seen": 199829915, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.11260986, "step": 9280, "time_per_iteration": 2.5911097526550293 }, { "auxiliary_loss_clip": 0.06440422, "auxiliary_loss_mlp": 0.01269928, "balance_loss_clip": 0.06282414, "balance_loss_mlp": 0.01258978, "epoch": 0.5580039080114234, "flos": 26109148826880.0, "grad_norm": 1.626790939211283, "language_loss": 0.73516625, "learning_rate": 1.7222408879777763e-06, "loss": 0.81226975, "num_input_tokens_seen": 199850670, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.10943604, "step": 9281, "time_per_iteration": 2.5722556114196777 }, { "auxiliary_loss_clip": 0.06437863, "auxiliary_loss_mlp": 0.01267062, "balance_loss_clip": 0.06286262, "balance_loss_mlp": 0.01256113, "epoch": 0.5580640312640914, "flos": 13776740426880.0, "grad_norm": 2.6255487793541454, "language_loss": 0.75532186, "learning_rate": 1.7218552057293974e-06, "loss": 0.83237112, "num_input_tokens_seen": 199867645, "router_z_loss_clip": 1.51660156, "router_z_loss_mlp": 0.10949707, "step": 9282, "time_per_iteration": 2.537940740585327 }, { "auxiliary_loss_clip": 0.06437849, "auxiliary_loss_mlp": 0.01266805, "balance_loss_clip": 0.06285854, "balance_loss_mlp": 0.01255945, "epoch": 0.5581241545167593, "flos": 17681765708160.0, "grad_norm": 2.1050294690808435, "language_loss": 0.6648103, "learning_rate": 1.721469534028297e-06, "loss": 0.74185681, "num_input_tokens_seen": 199886320, "router_z_loss_clip": 1.51757812, "router_z_loss_mlp": 0.10858154, "step": 9283, "time_per_iteration": 2.563089370727539 }, { "auxiliary_loss_clip": 0.06439057, "auxiliary_loss_mlp": 0.01268745, "balance_loss_clip": 0.06285249, "balance_loss_mlp": 0.01258046, "epoch": 0.5581842777694274, "flos": 19574573114880.0, "grad_norm": 1.6860902445117623, "language_loss": 0.83416653, "learning_rate": 1.7210838728890994e-06, "loss": 0.91124457, "num_input_tokens_seen": 199904895, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.10699463, "step": 9284, "time_per_iteration": 2.5351946353912354 }, { "auxiliary_loss_clip": 0.06441161, "auxiliary_loss_mlp": 0.01267338, "balance_loss_clip": 0.06286389, "balance_loss_mlp": 0.01257175, "epoch": 0.5582444010220953, "flos": 20601485228160.0, "grad_norm": 3.083647996356528, "language_loss": 0.85287607, "learning_rate": 1.7206982223264304e-06, "loss": 0.92996103, "num_input_tokens_seen": 199921090, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.10162354, "step": 9285, "time_per_iteration": 2.5576279163360596 }, { "auxiliary_loss_clip": 0.06433822, "auxiliary_loss_mlp": 0.01268093, "balance_loss_clip": 0.06279674, "balance_loss_mlp": 0.01257364, "epoch": 0.5583045242747633, "flos": 19141541614080.0, "grad_norm": 2.4770624012522546, "language_loss": 0.74491948, "learning_rate": 1.720312582354912e-06, "loss": 0.82193863, "num_input_tokens_seen": 199939925, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.10723877, "step": 9286, "time_per_iteration": 2.551201581954956 }, { "auxiliary_loss_clip": 0.06439368, "auxiliary_loss_mlp": 0.01266962, "balance_loss_clip": 0.06286424, "balance_loss_mlp": 0.01255297, "epoch": 0.5583646475274312, "flos": 27462050449920.0, "grad_norm": 1.722782198730347, "language_loss": 0.74844271, "learning_rate": 1.7199269529891684e-06, "loss": 0.82550597, "num_input_tokens_seen": 199960015, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.11663818, "step": 9287, "time_per_iteration": 2.724108934402466 }, { "auxiliary_loss_clip": 0.06446981, "auxiliary_loss_mlp": 0.01266405, "balance_loss_clip": 0.06289706, "balance_loss_mlp": 0.01254616, "epoch": 0.5584247707800992, "flos": 23659580966400.0, "grad_norm": 1.773189865570377, "language_loss": 0.7503202, "learning_rate": 1.7195413342438233e-06, "loss": 0.82745409, "num_input_tokens_seen": 199980505, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.11791992, "step": 9288, "time_per_iteration": 2.6403777599334717 }, { "auxiliary_loss_clip": 0.06438738, "auxiliary_loss_mlp": 0.01265818, "balance_loss_clip": 0.062852, "balance_loss_mlp": 0.0125376, "epoch": 0.5584848940327671, "flos": 13703967555840.0, "grad_norm": 1.8961536894977007, "language_loss": 0.78466487, "learning_rate": 1.7191557261334984e-06, "loss": 0.86171037, "num_input_tokens_seen": 199999020, "router_z_loss_clip": 1.53320312, "router_z_loss_mlp": 0.12060547, "step": 9289, "time_per_iteration": 2.5482943058013916 }, { "auxiliary_loss_clip": 0.06449485, "auxiliary_loss_mlp": 0.012667, "balance_loss_clip": 0.06288973, "balance_loss_mlp": 0.01255304, "epoch": 0.5585450172854352, "flos": 27023526506880.0, "grad_norm": 1.6662386045364428, "language_loss": 0.61228526, "learning_rate": 1.718770128672817e-06, "loss": 0.6894471, "num_input_tokens_seen": 200019020, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.1138916, "step": 9290, "time_per_iteration": 2.628417730331421 }, { "auxiliary_loss_clip": 0.0644182, "auxiliary_loss_mlp": 0.01266157, "balance_loss_clip": 0.06284368, "balance_loss_mlp": 0.01254802, "epoch": 0.5586051405381031, "flos": 23192406126720.0, "grad_norm": 2.1934784593371983, "language_loss": 0.68315339, "learning_rate": 1.7183845418764e-06, "loss": 0.76023316, "num_input_tokens_seen": 200038110, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.11352539, "step": 9291, "time_per_iteration": 2.757009983062744 }, { "auxiliary_loss_clip": 0.06441027, "auxiliary_loss_mlp": 0.01267253, "balance_loss_clip": 0.06285929, "balance_loss_mlp": 0.01255356, "epoch": 0.5586652637907711, "flos": 20781551652480.0, "grad_norm": 1.7472045890163572, "language_loss": 0.8422606, "learning_rate": 1.7179989657588698e-06, "loss": 0.91934335, "num_input_tokens_seen": 200056210, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.11895752, "step": 9292, "time_per_iteration": 2.601675033569336 }, { "auxiliary_loss_clip": 0.0643765, "auxiliary_loss_mlp": 0.01267871, "balance_loss_clip": 0.06285931, "balance_loss_mlp": 0.01257076, "epoch": 0.5587253870434391, "flos": 28227360516480.0, "grad_norm": 2.3492252741820354, "language_loss": 0.7418896, "learning_rate": 1.7176134003348476e-06, "loss": 0.81894481, "num_input_tokens_seen": 200075620, "router_z_loss_clip": 1.51757812, "router_z_loss_mlp": 0.10797119, "step": 9293, "time_per_iteration": 2.617812156677246 }, { "auxiliary_loss_clip": 0.06432745, "auxiliary_loss_mlp": 0.01265395, "balance_loss_clip": 0.06282204, "balance_loss_mlp": 0.01254726, "epoch": 0.558785510296107, "flos": 26623128971520.0, "grad_norm": 1.9157429188798223, "language_loss": 0.72825813, "learning_rate": 1.7172278456189523e-06, "loss": 0.80523956, "num_input_tokens_seen": 200095945, "router_z_loss_clip": 1.50683594, "router_z_loss_mlp": 0.10675049, "step": 9294, "time_per_iteration": 4.040955066680908 }, { "auxiliary_loss_clip": 0.06441128, "auxiliary_loss_mlp": 0.01265775, "balance_loss_clip": 0.06286626, "balance_loss_mlp": 0.01254778, "epoch": 0.558845633548775, "flos": 20162919358080.0, "grad_norm": 2.197070729519894, "language_loss": 0.68852323, "learning_rate": 1.716842301625806e-06, "loss": 0.76559228, "num_input_tokens_seen": 200114185, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.10998535, "step": 9295, "time_per_iteration": 2.561711549758911 }, { "auxiliary_loss_clip": 0.06440011, "auxiliary_loss_mlp": 0.01265304, "balance_loss_clip": 0.0628708, "balance_loss_mlp": 0.01253914, "epoch": 0.5589057568014429, "flos": 24357317114880.0, "grad_norm": 1.386938157920727, "language_loss": 0.80741072, "learning_rate": 1.7164567683700281e-06, "loss": 0.88446391, "num_input_tokens_seen": 200135030, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.11395264, "step": 9296, "time_per_iteration": 2.59205961227417 }, { "auxiliary_loss_clip": 0.06433693, "auxiliary_loss_mlp": 0.01267977, "balance_loss_clip": 0.06281063, "balance_loss_mlp": 0.01256324, "epoch": 0.558965880054111, "flos": 21111440376960.0, "grad_norm": 1.5474438510557669, "language_loss": 0.65988445, "learning_rate": 1.7160712458662379e-06, "loss": 0.73690116, "num_input_tokens_seen": 200154290, "router_z_loss_clip": 1.52636719, "router_z_loss_mlp": 0.11651611, "step": 9297, "time_per_iteration": 4.057859659194946 }, { "auxiliary_loss_clip": 0.06442239, "auxiliary_loss_mlp": 0.01270627, "balance_loss_clip": 0.06285624, "balance_loss_mlp": 0.01258485, "epoch": 0.5590260033067789, "flos": 18440954426880.0, "grad_norm": 1.5650254566694493, "language_loss": 0.75188559, "learning_rate": 1.7156857341290544e-06, "loss": 0.82901424, "num_input_tokens_seen": 200171555, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.12133789, "step": 9298, "time_per_iteration": 2.5844857692718506 }, { "auxiliary_loss_clip": 0.06337366, "auxiliary_loss_mlp": 0.01251718, "balance_loss_clip": 0.0627501, "balance_loss_mlp": 0.01250087, "epoch": 0.5590861265594469, "flos": 70597673729280.0, "grad_norm": 0.6734157770769776, "language_loss": 0.52237415, "learning_rate": 1.7153002331730967e-06, "loss": 0.59826493, "num_input_tokens_seen": 200237010, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.01634216, "step": 9299, "time_per_iteration": 3.2093701362609863 }, { "auxiliary_loss_clip": 0.06432234, "auxiliary_loss_mlp": 0.01267224, "balance_loss_clip": 0.06281392, "balance_loss_mlp": 0.01256751, "epoch": 0.5591462498121148, "flos": 30672274475520.0, "grad_norm": 1.9908003725389702, "language_loss": 0.68812001, "learning_rate": 1.7149147430129824e-06, "loss": 0.76511461, "num_input_tokens_seen": 200260820, "router_z_loss_clip": 1.50683594, "router_z_loss_mlp": 0.10461426, "step": 9300, "time_per_iteration": 2.657125234603882 }, { "auxiliary_loss_clip": 0.06444568, "auxiliary_loss_mlp": 0.01268874, "balance_loss_clip": 0.06286393, "balance_loss_mlp": 0.01257, "epoch": 0.5592063730647828, "flos": 18156319706880.0, "grad_norm": 1.838471765987996, "language_loss": 0.82460338, "learning_rate": 1.7145292636633293e-06, "loss": 0.90173781, "num_input_tokens_seen": 200278035, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.11865234, "step": 9301, "time_per_iteration": 2.5253899097442627 }, { "auxiliary_loss_clip": 0.0643993, "auxiliary_loss_mlp": 0.0126638, "balance_loss_clip": 0.06283589, "balance_loss_mlp": 0.01255097, "epoch": 0.5592664963174507, "flos": 24067148025600.0, "grad_norm": 1.9182404538369686, "language_loss": 0.67606151, "learning_rate": 1.714143795138756e-06, "loss": 0.75312459, "num_input_tokens_seen": 200297255, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.11279297, "step": 9302, "time_per_iteration": 2.6039719581604004 }, { "auxiliary_loss_clip": 0.06443185, "auxiliary_loss_mlp": 0.01266716, "balance_loss_clip": 0.06285101, "balance_loss_mlp": 0.01255111, "epoch": 0.5593266195701188, "flos": 19833911101440.0, "grad_norm": 2.1519375439900377, "language_loss": 0.70972639, "learning_rate": 1.713758337453878e-06, "loss": 0.78682542, "num_input_tokens_seen": 200317505, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.1159668, "step": 9303, "time_per_iteration": 2.5558621883392334 }, { "auxiliary_loss_clip": 0.06433339, "auxiliary_loss_mlp": 0.01263615, "balance_loss_clip": 0.06285348, "balance_loss_mlp": 0.01253131, "epoch": 0.5593867428227867, "flos": 25307682923520.0, "grad_norm": 1.5101689465397292, "language_loss": 0.73098344, "learning_rate": 1.7133728906233124e-06, "loss": 0.807953, "num_input_tokens_seen": 200338350, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.10491943, "step": 9304, "time_per_iteration": 2.615487575531006 }, { "auxiliary_loss_clip": 0.06436706, "auxiliary_loss_mlp": 0.01267993, "balance_loss_clip": 0.06283154, "balance_loss_mlp": 0.01257074, "epoch": 0.5594468660754547, "flos": 12938028583680.0, "grad_norm": 2.0093940076157173, "language_loss": 0.78108943, "learning_rate": 1.7129874546616763e-06, "loss": 0.85813642, "num_input_tokens_seen": 200353965, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.10913086, "step": 9305, "time_per_iteration": 2.5350117683410645 }, { "auxiliary_loss_clip": 0.06430963, "auxiliary_loss_mlp": 0.01263908, "balance_loss_clip": 0.06281721, "balance_loss_mlp": 0.01253215, "epoch": 0.5595069893281227, "flos": 19068768743040.0, "grad_norm": 1.5802489852495898, "language_loss": 0.69324654, "learning_rate": 1.7126020295835836e-06, "loss": 0.77019525, "num_input_tokens_seen": 200373595, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.10699463, "step": 9306, "time_per_iteration": 2.5626792907714844 }, { "auxiliary_loss_clip": 0.06331939, "auxiliary_loss_mlp": 0.01251135, "balance_loss_clip": 0.06269987, "balance_loss_mlp": 0.01249695, "epoch": 0.5595671125807906, "flos": 70291530437760.0, "grad_norm": 0.8952137244324587, "language_loss": 0.60309494, "learning_rate": 1.7122166154036518e-06, "loss": 0.67892563, "num_input_tokens_seen": 200429155, "router_z_loss_clip": 0.62011719, "router_z_loss_mlp": 0.01439667, "step": 9307, "time_per_iteration": 4.706178426742554 }, { "auxiliary_loss_clip": 0.06434333, "auxiliary_loss_mlp": 0.01267396, "balance_loss_clip": 0.06284036, "balance_loss_mlp": 0.01257371, "epoch": 0.5596272358334586, "flos": 20671407060480.0, "grad_norm": 1.5724287035740991, "language_loss": 0.74022782, "learning_rate": 1.7118312121364943e-06, "loss": 0.81724513, "num_input_tokens_seen": 200448290, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.10021973, "step": 9308, "time_per_iteration": 4.046956300735474 }, { "auxiliary_loss_clip": 0.06432714, "auxiliary_loss_mlp": 0.01268563, "balance_loss_clip": 0.06277638, "balance_loss_mlp": 0.01256368, "epoch": 0.5596873590861265, "flos": 25047170979840.0, "grad_norm": 1.6378156560577763, "language_loss": 0.70377481, "learning_rate": 1.7114458197967257e-06, "loss": 0.78078759, "num_input_tokens_seen": 200466555, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.12213135, "step": 9309, "time_per_iteration": 2.665996789932251 }, { "auxiliary_loss_clip": 0.06443116, "auxiliary_loss_mlp": 0.01267391, "balance_loss_clip": 0.06286541, "balance_loss_mlp": 0.01254624, "epoch": 0.5597474823387946, "flos": 25965573655680.0, "grad_norm": 2.134001203225691, "language_loss": 0.75517607, "learning_rate": 1.7110604383989613e-06, "loss": 0.83228117, "num_input_tokens_seen": 200485980, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.12756348, "step": 9310, "time_per_iteration": 2.6182563304901123 }, { "auxiliary_loss_clip": 0.06441074, "auxiliary_loss_mlp": 0.01268529, "balance_loss_clip": 0.06283745, "balance_loss_mlp": 0.01256536, "epoch": 0.5598076055914625, "flos": 26184688882560.0, "grad_norm": 2.4132381085612518, "language_loss": 0.69801354, "learning_rate": 1.7106750679578133e-06, "loss": 0.77510953, "num_input_tokens_seen": 200504555, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.11987305, "step": 9311, "time_per_iteration": 2.590041399002075 }, { "auxiliary_loss_clip": 0.06433314, "auxiliary_loss_mlp": 0.01268658, "balance_loss_clip": 0.06280284, "balance_loss_mlp": 0.01257989, "epoch": 0.5598677288441305, "flos": 11660541235200.0, "grad_norm": 2.1318978620810607, "language_loss": 0.72560012, "learning_rate": 1.7102897084878962e-06, "loss": 0.80261987, "num_input_tokens_seen": 200522700, "router_z_loss_clip": 1.52929688, "router_z_loss_mlp": 0.10656738, "step": 9312, "time_per_iteration": 2.532470226287842 }, { "auxiliary_loss_clip": 0.06432417, "auxiliary_loss_mlp": 0.01264051, "balance_loss_clip": 0.06279922, "balance_loss_mlp": 0.01253269, "epoch": 0.5599278520967984, "flos": 22973290899840.0, "grad_norm": 1.9582719961815915, "language_loss": 0.89322478, "learning_rate": 1.709904360003822e-06, "loss": 0.97018945, "num_input_tokens_seen": 200541910, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.10784912, "step": 9313, "time_per_iteration": 2.566042184829712 }, { "auxiliary_loss_clip": 0.06438245, "auxiliary_loss_mlp": 0.01271521, "balance_loss_clip": 0.06284168, "balance_loss_mlp": 0.01260375, "epoch": 0.5599879753494664, "flos": 21222004239360.0, "grad_norm": 1.3976504318193732, "language_loss": 0.78018391, "learning_rate": 1.709519022520204e-06, "loss": 0.85728157, "num_input_tokens_seen": 200562600, "router_z_loss_clip": 1.54003906, "router_z_loss_mlp": 0.11151123, "step": 9314, "time_per_iteration": 2.626213312149048 }, { "auxiliary_loss_clip": 0.06432153, "auxiliary_loss_mlp": 0.01267369, "balance_loss_clip": 0.06279282, "balance_loss_mlp": 0.01256718, "epoch": 0.5600480986021343, "flos": 31911006510720.0, "grad_norm": 2.0454681482053485, "language_loss": 0.70574808, "learning_rate": 1.7091336960516537e-06, "loss": 0.78274333, "num_input_tokens_seen": 200584795, "router_z_loss_clip": 1.53027344, "router_z_loss_mlp": 0.10650635, "step": 9315, "time_per_iteration": 2.6354455947875977 }, { "auxiliary_loss_clip": 0.06442048, "auxiliary_loss_mlp": 0.01269439, "balance_loss_clip": 0.06283185, "balance_loss_mlp": 0.01258275, "epoch": 0.5601082218548024, "flos": 28483679756160.0, "grad_norm": 1.6562828205270426, "language_loss": 0.66938239, "learning_rate": 1.7087483806127824e-06, "loss": 0.74649727, "num_input_tokens_seen": 200606945, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.11175537, "step": 9316, "time_per_iteration": 2.616025924682617 }, { "auxiliary_loss_clip": 0.06433865, "auxiliary_loss_mlp": 0.01265135, "balance_loss_clip": 0.0628162, "balance_loss_mlp": 0.01253893, "epoch": 0.5601683451074703, "flos": 24103974695040.0, "grad_norm": 1.908618281206048, "language_loss": 0.87169832, "learning_rate": 1.7083630762182022e-06, "loss": 0.94868839, "num_input_tokens_seen": 200626340, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.11254883, "step": 9317, "time_per_iteration": 2.5661394596099854 }, { "auxiliary_loss_clip": 0.0643786, "auxiliary_loss_mlp": 0.01269905, "balance_loss_clip": 0.06279458, "balance_loss_mlp": 0.0125697, "epoch": 0.5602284683601383, "flos": 26362868590080.0, "grad_norm": 2.706995399727104, "language_loss": 0.77292144, "learning_rate": 1.7079777828825233e-06, "loss": 0.84999907, "num_input_tokens_seen": 200644520, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.12939453, "step": 9318, "time_per_iteration": 2.617840528488159 }, { "auxiliary_loss_clip": 0.06432842, "auxiliary_loss_mlp": 0.0126975, "balance_loss_clip": 0.06278957, "balance_loss_mlp": 0.01259504, "epoch": 0.5602885916128063, "flos": 24502904784000.0, "grad_norm": 1.5697769260690502, "language_loss": 0.76456434, "learning_rate": 1.7075925006203558e-06, "loss": 0.84159029, "num_input_tokens_seen": 200664845, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.10247803, "step": 9319, "time_per_iteration": 2.5913918018341064 }, { "auxiliary_loss_clip": 0.06432641, "auxiliary_loss_mlp": 0.01268733, "balance_loss_clip": 0.06278886, "balance_loss_mlp": 0.0125748, "epoch": 0.5603487148654742, "flos": 27352450909440.0, "grad_norm": 1.3456642314664127, "language_loss": 0.85649848, "learning_rate": 1.7072072294463101e-06, "loss": 0.93351221, "num_input_tokens_seen": 200686535, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.11242676, "step": 9320, "time_per_iteration": 2.6225714683532715 }, { "auxiliary_loss_clip": 0.06334225, "auxiliary_loss_mlp": 0.01255201, "balance_loss_clip": 0.06272084, "balance_loss_mlp": 0.01253413, "epoch": 0.5604088381181422, "flos": 54105555962880.0, "grad_norm": 0.8186698200737614, "language_loss": 0.52602565, "learning_rate": 1.706821969374996e-06, "loss": 0.60191989, "num_input_tokens_seen": 200736965, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.01786804, "step": 9321, "time_per_iteration": 2.979553699493408 }, { "auxiliary_loss_clip": 0.06433841, "auxiliary_loss_mlp": 0.01264655, "balance_loss_clip": 0.06283476, "balance_loss_mlp": 0.0125426, "epoch": 0.5604689613708101, "flos": 22242878858880.0, "grad_norm": 1.5174831726895, "language_loss": 0.74676371, "learning_rate": 1.7064367204210216e-06, "loss": 0.82374871, "num_input_tokens_seen": 200757420, "router_z_loss_clip": 1.50292969, "router_z_loss_mlp": 0.10388184, "step": 9322, "time_per_iteration": 2.5715548992156982 }, { "auxiliary_loss_clip": 0.06434613, "auxiliary_loss_mlp": 0.01268438, "balance_loss_clip": 0.06280025, "balance_loss_mlp": 0.01256869, "epoch": 0.5605290846234782, "flos": 35306370132480.0, "grad_norm": 1.6548867607407962, "language_loss": 0.74068326, "learning_rate": 1.7060514825989963e-06, "loss": 0.8177138, "num_input_tokens_seen": 200779520, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.11566162, "step": 9323, "time_per_iteration": 2.6697099208831787 }, { "auxiliary_loss_clip": 0.06439342, "auxiliary_loss_mlp": 0.01266271, "balance_loss_clip": 0.0628039, "balance_loss_mlp": 0.01254338, "epoch": 0.5605892078761461, "flos": 20268997027200.0, "grad_norm": 1.4252483516407122, "language_loss": 0.61249602, "learning_rate": 1.7056662559235286e-06, "loss": 0.68955213, "num_input_tokens_seen": 200799485, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.11938477, "step": 9324, "time_per_iteration": 2.562361240386963 }, { "auxiliary_loss_clip": 0.06433724, "auxiliary_loss_mlp": 0.01266671, "balance_loss_clip": 0.06278805, "balance_loss_mlp": 0.01253999, "epoch": 0.5606493311288141, "flos": 17313582867840.0, "grad_norm": 2.020217244651439, "language_loss": 0.88151574, "learning_rate": 1.705281040409226e-06, "loss": 0.9585197, "num_input_tokens_seen": 200817540, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.12670898, "step": 9325, "time_per_iteration": 2.553842544555664 }, { "auxiliary_loss_clip": 0.06436659, "auxiliary_loss_mlp": 0.01271424, "balance_loss_clip": 0.06280921, "balance_loss_mlp": 0.01259539, "epoch": 0.560709454381482, "flos": 21659438079360.0, "grad_norm": 8.287848203853745, "language_loss": 0.74223971, "learning_rate": 1.7048958360706952e-06, "loss": 0.81932056, "num_input_tokens_seen": 200838380, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.11883545, "step": 9326, "time_per_iteration": 2.5501224994659424 }, { "auxiliary_loss_clip": 0.06441926, "auxiliary_loss_mlp": 0.01273376, "balance_loss_clip": 0.06281201, "balance_loss_mlp": 0.01261282, "epoch": 0.56076957763415, "flos": 20309639057280.0, "grad_norm": 2.6017228875197764, "language_loss": 0.78354204, "learning_rate": 1.7045106429225447e-06, "loss": 0.86069512, "num_input_tokens_seen": 200855640, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.12091064, "step": 9327, "time_per_iteration": 2.53755784034729 }, { "auxiliary_loss_clip": 0.06435445, "auxiliary_loss_mlp": 0.01268427, "balance_loss_clip": 0.06280255, "balance_loss_mlp": 0.01256965, "epoch": 0.5608297008868179, "flos": 25052873057280.0, "grad_norm": 5.12546535991168, "language_loss": 0.785465, "learning_rate": 1.7041254609793795e-06, "loss": 0.86250377, "num_input_tokens_seen": 200876585, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.11462402, "step": 9328, "time_per_iteration": 2.5855071544647217 }, { "auxiliary_loss_clip": 0.06427474, "auxiliary_loss_mlp": 0.01263766, "balance_loss_clip": 0.06275272, "balance_loss_mlp": 0.01252602, "epoch": 0.560889824139486, "flos": 19873253393280.0, "grad_norm": 1.4151309498895273, "language_loss": 0.7372033, "learning_rate": 1.7037402902558066e-06, "loss": 0.81411564, "num_input_tokens_seen": 200898175, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.1116333, "step": 9329, "time_per_iteration": 2.5947606563568115 }, { "auxiliary_loss_clip": 0.06441919, "auxiliary_loss_mlp": 0.01266633, "balance_loss_clip": 0.06280508, "balance_loss_mlp": 0.01254748, "epoch": 0.5609499473921539, "flos": 22935961105920.0, "grad_norm": 2.1548241294118617, "language_loss": 0.83993357, "learning_rate": 1.7033551307664324e-06, "loss": 0.91701901, "num_input_tokens_seen": 200917515, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.11883545, "step": 9330, "time_per_iteration": 2.539669990539551 }, { "auxiliary_loss_clip": 0.06328365, "auxiliary_loss_mlp": 0.01251081, "balance_loss_clip": 0.06266584, "balance_loss_mlp": 0.01249456, "epoch": 0.5610100706448219, "flos": 53054479146240.0, "grad_norm": 0.7026974432253822, "language_loss": 0.57703853, "learning_rate": 1.7029699825258603e-06, "loss": 0.65283298, "num_input_tokens_seen": 200978615, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.01626587, "step": 9331, "time_per_iteration": 3.1932532787323 }, { "auxiliary_loss_clip": 0.06435761, "auxiliary_loss_mlp": 0.01266747, "balance_loss_clip": 0.0627882, "balance_loss_mlp": 0.0125513, "epoch": 0.5610701938974898, "flos": 21841349293440.0, "grad_norm": 1.8687760374079516, "language_loss": 0.82093781, "learning_rate": 1.7025848455486971e-06, "loss": 0.89796293, "num_input_tokens_seen": 200997745, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.11608887, "step": 9332, "time_per_iteration": 2.5298385620117188 }, { "auxiliary_loss_clip": 0.06439395, "auxiliary_loss_mlp": 0.01270551, "balance_loss_clip": 0.06279023, "balance_loss_mlp": 0.0125723, "epoch": 0.5611303171501578, "flos": 17462943970560.0, "grad_norm": 1.774545754259258, "language_loss": 0.82315433, "learning_rate": 1.7021997198495454e-06, "loss": 0.90025383, "num_input_tokens_seen": 201016370, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.13330078, "step": 9333, "time_per_iteration": 3.9803144931793213 }, { "auxiliary_loss_clip": 0.06432962, "auxiliary_loss_mlp": 0.01265252, "balance_loss_clip": 0.06276971, "balance_loss_mlp": 0.0125438, "epoch": 0.5611904404028258, "flos": 22644366497280.0, "grad_norm": 1.903108972012193, "language_loss": 0.73138517, "learning_rate": 1.7018146054430108e-06, "loss": 0.80836725, "num_input_tokens_seen": 201034310, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.10870361, "step": 9334, "time_per_iteration": 2.6202313899993896 }, { "auxiliary_loss_clip": 0.06431013, "auxiliary_loss_mlp": 0.01268239, "balance_loss_clip": 0.06277764, "balance_loss_mlp": 0.01257629, "epoch": 0.5612505636554938, "flos": 14321048549760.0, "grad_norm": 1.768768840907996, "language_loss": 0.7134645, "learning_rate": 1.7014295023436961e-06, "loss": 0.79045701, "num_input_tokens_seen": 201052030, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.10614014, "step": 9335, "time_per_iteration": 2.5060157775878906 }, { "auxiliary_loss_clip": 0.06436123, "auxiliary_loss_mlp": 0.01264275, "balance_loss_clip": 0.06280305, "balance_loss_mlp": 0.01253147, "epoch": 0.5613106869081618, "flos": 16513835973120.0, "grad_norm": 1.6569858154285284, "language_loss": 0.76849091, "learning_rate": 1.701044410566205e-06, "loss": 0.84549487, "num_input_tokens_seen": 201068445, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.11126709, "step": 9336, "time_per_iteration": 2.5256459712982178 }, { "auxiliary_loss_clip": 0.0643238, "auxiliary_loss_mlp": 0.01265735, "balance_loss_clip": 0.06279722, "balance_loss_mlp": 0.01255358, "epoch": 0.5613708101608297, "flos": 24065009746560.0, "grad_norm": 2.7439097952724003, "language_loss": 0.65027916, "learning_rate": 1.7006593301251393e-06, "loss": 0.72726029, "num_input_tokens_seen": 201082140, "router_z_loss_clip": 1.52539062, "router_z_loss_mlp": 0.10369873, "step": 9337, "time_per_iteration": 4.185711860656738 }, { "auxiliary_loss_clip": 0.06330959, "auxiliary_loss_mlp": 0.0125368, "balance_loss_clip": 0.06269568, "balance_loss_mlp": 0.01252165, "epoch": 0.5614309334134977, "flos": 64922284984320.0, "grad_norm": 0.882804526948854, "language_loss": 0.62557769, "learning_rate": 1.700274261035102e-06, "loss": 0.70142412, "num_input_tokens_seen": 201137245, "router_z_loss_clip": 0.61523438, "router_z_loss_mlp": 0.01513672, "step": 9338, "time_per_iteration": 3.15687894821167 }, { "auxiliary_loss_clip": 0.06432765, "auxiliary_loss_mlp": 0.01266359, "balance_loss_clip": 0.06277493, "balance_loss_mlp": 0.0125507, "epoch": 0.5614910566661656, "flos": 32926975666560.0, "grad_norm": 1.7348374918238454, "language_loss": 0.65942287, "learning_rate": 1.6998892033106946e-06, "loss": 0.73641419, "num_input_tokens_seen": 201157270, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.11291504, "step": 9339, "time_per_iteration": 2.647404909133911 }, { "auxiliary_loss_clip": 0.06428259, "auxiliary_loss_mlp": 0.01268684, "balance_loss_clip": 0.06276926, "balance_loss_mlp": 0.01256745, "epoch": 0.5615511799188336, "flos": 18594927504000.0, "grad_norm": 1.7375053580159028, "language_loss": 0.70189267, "learning_rate": 1.6995041569665184e-06, "loss": 0.77886212, "num_input_tokens_seen": 201174530, "router_z_loss_clip": 1.51367188, "router_z_loss_mlp": 0.1194458, "step": 9340, "time_per_iteration": 2.551257371902466 }, { "auxiliary_loss_clip": 0.06428477, "auxiliary_loss_mlp": 0.01267059, "balance_loss_clip": 0.06281067, "balance_loss_mlp": 0.01256139, "epoch": 0.5616113031715015, "flos": 22826571200640.0, "grad_norm": 1.570926034229566, "language_loss": 0.77811015, "learning_rate": 1.6991191220171756e-06, "loss": 0.85506558, "num_input_tokens_seen": 201194905, "router_z_loss_clip": 1.47363281, "router_z_loss_mlp": 0.10913086, "step": 9341, "time_per_iteration": 2.59570574760437 }, { "auxiliary_loss_clip": 0.06435905, "auxiliary_loss_mlp": 0.01267704, "balance_loss_clip": 0.06279997, "balance_loss_mlp": 0.01255997, "epoch": 0.5616714264241696, "flos": 22352184910080.0, "grad_norm": 1.6433103192252272, "language_loss": 0.80006874, "learning_rate": 1.6987340984772653e-06, "loss": 0.87710488, "num_input_tokens_seen": 201213715, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.11724854, "step": 9342, "time_per_iteration": 2.5726253986358643 }, { "auxiliary_loss_clip": 0.0643216, "auxiliary_loss_mlp": 0.01269258, "balance_loss_clip": 0.06276272, "balance_loss_mlp": 0.01257629, "epoch": 0.5617315496768375, "flos": 18813875022720.0, "grad_norm": 2.230457317236058, "language_loss": 0.76452088, "learning_rate": 1.6983490863613882e-06, "loss": 0.84153509, "num_input_tokens_seen": 201231415, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.11633301, "step": 9343, "time_per_iteration": 2.5702738761901855 }, { "auxiliary_loss_clip": 0.06432857, "auxiliary_loss_mlp": 0.01264728, "balance_loss_clip": 0.06280824, "balance_loss_mlp": 0.01253451, "epoch": 0.5617916729295055, "flos": 18375225298560.0, "grad_norm": 1.95135599693808, "language_loss": 0.69040692, "learning_rate": 1.6979640856841442e-06, "loss": 0.76738286, "num_input_tokens_seen": 201249625, "router_z_loss_clip": 1.51953125, "router_z_loss_mlp": 0.112854, "step": 9344, "time_per_iteration": 2.555601119995117 }, { "auxiliary_loss_clip": 0.06433848, "auxiliary_loss_mlp": 0.012661, "balance_loss_clip": 0.06279013, "balance_loss_mlp": 0.01254382, "epoch": 0.5618517961821734, "flos": 28186844267520.0, "grad_norm": 1.7083324137716314, "language_loss": 0.66674554, "learning_rate": 1.6975790964601318e-06, "loss": 0.74374497, "num_input_tokens_seen": 201271205, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.11730957, "step": 9345, "time_per_iteration": 2.5842113494873047 }, { "auxiliary_loss_clip": 0.06439456, "auxiliary_loss_mlp": 0.01268443, "balance_loss_clip": 0.0628286, "balance_loss_mlp": 0.01257744, "epoch": 0.5619119194348414, "flos": 15492290520960.0, "grad_norm": 1.7604481177584104, "language_loss": 0.87501073, "learning_rate": 1.6971941187039512e-06, "loss": 0.95208973, "num_input_tokens_seen": 201287700, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.10693359, "step": 9346, "time_per_iteration": 2.4960644245147705 }, { "auxiliary_loss_clip": 0.0643222, "auxiliary_loss_mlp": 0.01273848, "balance_loss_clip": 0.06279923, "balance_loss_mlp": 0.01261856, "epoch": 0.5619720426875094, "flos": 29135700702720.0, "grad_norm": 2.0847711555327546, "language_loss": 0.5988636, "learning_rate": 1.6968091524301993e-06, "loss": 0.67592436, "num_input_tokens_seen": 201307530, "router_z_loss_clip": 1.52246094, "router_z_loss_mlp": 0.11993408, "step": 9347, "time_per_iteration": 5.504266738891602 }, { "auxiliary_loss_clip": 0.06437394, "auxiliary_loss_mlp": 0.01271233, "balance_loss_clip": 0.06282049, "balance_loss_mlp": 0.01258949, "epoch": 0.5620321659401774, "flos": 18009474226560.0, "grad_norm": 2.30940056307194, "language_loss": 0.69701719, "learning_rate": 1.6964241976534745e-06, "loss": 0.7741034, "num_input_tokens_seen": 201326210, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.12286377, "step": 9348, "time_per_iteration": 2.527240753173828 }, { "auxiliary_loss_clip": 0.06444652, "auxiliary_loss_mlp": 0.01268601, "balance_loss_clip": 0.06282634, "balance_loss_mlp": 0.01256436, "epoch": 0.5620922891928454, "flos": 20600730541440.0, "grad_norm": 1.9983518401644857, "language_loss": 0.79533088, "learning_rate": 1.6960392543883754e-06, "loss": 0.87246341, "num_input_tokens_seen": 201346120, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.12164307, "step": 9349, "time_per_iteration": 2.544252872467041 }, { "auxiliary_loss_clip": 0.06437679, "auxiliary_loss_mlp": 0.01268253, "balance_loss_clip": 0.06283499, "balance_loss_mlp": 0.01256153, "epoch": 0.5621524124455133, "flos": 26294288423040.0, "grad_norm": 1.8993384188074431, "language_loss": 0.67040217, "learning_rate": 1.6956543226494975e-06, "loss": 0.74746144, "num_input_tokens_seen": 201365700, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.12103271, "step": 9350, "time_per_iteration": 2.6184117794036865 }, { "auxiliary_loss_clip": 0.06442779, "auxiliary_loss_mlp": 0.01267552, "balance_loss_clip": 0.06285527, "balance_loss_mlp": 0.01256108, "epoch": 0.5622125356981813, "flos": 12755236901760.0, "grad_norm": 1.786967265475801, "language_loss": 0.78675902, "learning_rate": 1.6952694024514381e-06, "loss": 0.86386228, "num_input_tokens_seen": 201382795, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.11450195, "step": 9351, "time_per_iteration": 2.537308692932129 }, { "auxiliary_loss_clip": 0.06443599, "auxiliary_loss_mlp": 0.01267565, "balance_loss_clip": 0.06284384, "balance_loss_mlp": 0.0125568, "epoch": 0.5622726589508492, "flos": 23812086597120.0, "grad_norm": 1.4917439600979805, "language_loss": 0.59562778, "learning_rate": 1.6948844938087945e-06, "loss": 0.67273939, "num_input_tokens_seen": 201402780, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.11883545, "step": 9352, "time_per_iteration": 2.600260019302368 }, { "auxiliary_loss_clip": 0.06429496, "auxiliary_loss_mlp": 0.01266975, "balance_loss_clip": 0.06281647, "balance_loss_mlp": 0.01256705, "epoch": 0.5623327822035172, "flos": 24725248392960.0, "grad_norm": 1.3606709990939676, "language_loss": 0.7225408, "learning_rate": 1.6944995967361604e-06, "loss": 0.79950553, "num_input_tokens_seen": 201424140, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.10272217, "step": 9353, "time_per_iteration": 2.599029064178467 }, { "auxiliary_loss_clip": 0.06440921, "auxiliary_loss_mlp": 0.01265501, "balance_loss_clip": 0.06284565, "balance_loss_mlp": 0.01253908, "epoch": 0.5623929054561851, "flos": 14023081031040.0, "grad_norm": 3.0075169091806653, "language_loss": 0.77349818, "learning_rate": 1.6941147112481327e-06, "loss": 0.85056239, "num_input_tokens_seen": 201439645, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.11590576, "step": 9354, "time_per_iteration": 2.523277759552002 }, { "auxiliary_loss_clip": 0.06446435, "auxiliary_loss_mlp": 0.0126914, "balance_loss_clip": 0.06286118, "balance_loss_mlp": 0.01257523, "epoch": 0.5624530287088532, "flos": 20710707425280.0, "grad_norm": 1.9310730338875857, "language_loss": 0.72956336, "learning_rate": 1.6937298373593056e-06, "loss": 0.80671918, "num_input_tokens_seen": 201459970, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.11621094, "step": 9355, "time_per_iteration": 2.5759313106536865 }, { "auxiliary_loss_clip": 0.06438468, "auxiliary_loss_mlp": 0.01267041, "balance_loss_clip": 0.06283818, "balance_loss_mlp": 0.0125599, "epoch": 0.5625131519615211, "flos": 21477401084160.0, "grad_norm": 1.4220185671862016, "language_loss": 0.73747551, "learning_rate": 1.693344975084274e-06, "loss": 0.81453061, "num_input_tokens_seen": 201480055, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.11053467, "step": 9356, "time_per_iteration": 2.561784029006958 }, { "auxiliary_loss_clip": 0.06430056, "auxiliary_loss_mlp": 0.01267324, "balance_loss_clip": 0.0627979, "balance_loss_mlp": 0.01256249, "epoch": 0.5625732752141891, "flos": 18704023920000.0, "grad_norm": 3.5212267800482366, "language_loss": 0.83526653, "learning_rate": 1.6929601244376318e-06, "loss": 0.91224033, "num_input_tokens_seen": 201497645, "router_z_loss_clip": 1.50097656, "router_z_loss_mlp": 0.11083984, "step": 9357, "time_per_iteration": 2.523653745651245 }, { "auxiliary_loss_clip": 0.06438093, "auxiliary_loss_mlp": 0.01267245, "balance_loss_clip": 0.06284128, "balance_loss_mlp": 0.01256767, "epoch": 0.562633398466857, "flos": 16222492926720.0, "grad_norm": 2.1855498876864377, "language_loss": 0.72578847, "learning_rate": 1.6925752854339722e-06, "loss": 0.8028419, "num_input_tokens_seen": 201515455, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.10479736, "step": 9358, "time_per_iteration": 2.534876823425293 }, { "auxiliary_loss_clip": 0.06429526, "auxiliary_loss_mlp": 0.01265558, "balance_loss_clip": 0.06278522, "balance_loss_mlp": 0.01254805, "epoch": 0.562693521719525, "flos": 22498485338880.0, "grad_norm": 1.592124913797402, "language_loss": 0.78010452, "learning_rate": 1.6921904580878885e-06, "loss": 0.85705537, "num_input_tokens_seen": 201534500, "router_z_loss_clip": 1.50976562, "router_z_loss_mlp": 0.10742188, "step": 9359, "time_per_iteration": 2.5515453815460205 }, { "auxiliary_loss_clip": 0.06435451, "auxiliary_loss_mlp": 0.01266289, "balance_loss_clip": 0.06280944, "balance_loss_mlp": 0.01255244, "epoch": 0.562753644972193, "flos": 25337088506880.0, "grad_norm": 2.173116657151037, "language_loss": 0.70418787, "learning_rate": 1.6918056424139736e-06, "loss": 0.7812053, "num_input_tokens_seen": 201553280, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.11047363, "step": 9360, "time_per_iteration": 2.577998399734497 }, { "auxiliary_loss_clip": 0.06334771, "auxiliary_loss_mlp": 0.01253737, "balance_loss_clip": 0.06273287, "balance_loss_mlp": 0.01252121, "epoch": 0.562813768224861, "flos": 67410566231040.0, "grad_norm": 0.7647057267098212, "language_loss": 0.55543149, "learning_rate": 1.6914208384268197e-06, "loss": 0.6313166, "num_input_tokens_seen": 201610030, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.01618958, "step": 9361, "time_per_iteration": 3.0923655033111572 }, { "auxiliary_loss_clip": 0.06434792, "auxiliary_loss_mlp": 0.01275741, "balance_loss_clip": 0.06283775, "balance_loss_mlp": 0.01265149, "epoch": 0.562873891477529, "flos": 23337868014720.0, "grad_norm": 1.4071068116055998, "language_loss": 0.82313681, "learning_rate": 1.691036046141018e-06, "loss": 0.90024209, "num_input_tokens_seen": 201628370, "router_z_loss_clip": 1.51074219, "router_z_loss_mlp": 0.105896, "step": 9362, "time_per_iteration": 2.567006826400757 }, { "auxiliary_loss_clip": 0.06439155, "auxiliary_loss_mlp": 0.01267943, "balance_loss_clip": 0.0628707, "balance_loss_mlp": 0.01257357, "epoch": 0.5629340147301969, "flos": 38482073475840.0, "grad_norm": 1.711184594237501, "language_loss": 0.74746919, "learning_rate": 1.6906512655711614e-06, "loss": 0.82454014, "num_input_tokens_seen": 201649790, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.10595703, "step": 9363, "time_per_iteration": 2.6956193447113037 }, { "auxiliary_loss_clip": 0.06439018, "auxiliary_loss_mlp": 0.01270908, "balance_loss_clip": 0.06283449, "balance_loss_mlp": 0.01259715, "epoch": 0.5629941379828649, "flos": 29249744509440.0, "grad_norm": 2.624844178940054, "language_loss": 0.82861978, "learning_rate": 1.690266496731839e-06, "loss": 0.90571904, "num_input_tokens_seen": 201669175, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.11206055, "step": 9364, "time_per_iteration": 2.6068038940429688 }, { "auxiliary_loss_clip": 0.06430238, "auxiliary_loss_mlp": 0.01265483, "balance_loss_clip": 0.0628158, "balance_loss_mlp": 0.01255082, "epoch": 0.5630542612355328, "flos": 19425882844800.0, "grad_norm": 2.1192524715727585, "language_loss": 0.6521678, "learning_rate": 1.689881739637642e-06, "loss": 0.72912502, "num_input_tokens_seen": 201687000, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.10400391, "step": 9365, "time_per_iteration": 2.53475284576416 }, { "auxiliary_loss_clip": 0.06449026, "auxiliary_loss_mlp": 0.01270539, "balance_loss_clip": 0.06286889, "balance_loss_mlp": 0.01258952, "epoch": 0.5631143844882008, "flos": 22271697463680.0, "grad_norm": 2.8557498325439394, "language_loss": 0.81728756, "learning_rate": 1.6894969943031611e-06, "loss": 0.89448321, "num_input_tokens_seen": 201703335, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.11584473, "step": 9366, "time_per_iteration": 2.5289974212646484 }, { "auxiliary_loss_clip": 0.06432858, "auxiliary_loss_mlp": 0.01263877, "balance_loss_clip": 0.06282316, "balance_loss_mlp": 0.01253875, "epoch": 0.5631745077408687, "flos": 22971781526400.0, "grad_norm": 1.3882362306493936, "language_loss": 0.73831367, "learning_rate": 1.6891122607429845e-06, "loss": 0.81528103, "num_input_tokens_seen": 201723495, "router_z_loss_clip": 1.50488281, "router_z_loss_mlp": 0.09997559, "step": 9367, "time_per_iteration": 2.5651121139526367 }, { "auxiliary_loss_clip": 0.06330062, "auxiliary_loss_mlp": 0.01252772, "balance_loss_clip": 0.06268511, "balance_loss_mlp": 0.01251112, "epoch": 0.5632346309935368, "flos": 65101917409920.0, "grad_norm": 0.5965624954376084, "language_loss": 0.53386748, "learning_rate": 1.6887275389717028e-06, "loss": 0.60969579, "num_input_tokens_seen": 201792615, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.01663208, "step": 9368, "time_per_iteration": 3.2848594188690186 }, { "auxiliary_loss_clip": 0.06436646, "auxiliary_loss_mlp": 0.01273436, "balance_loss_clip": 0.06284899, "balance_loss_mlp": 0.01262064, "epoch": 0.5632947542462047, "flos": 23009572517760.0, "grad_norm": 2.6805746734616624, "language_loss": 0.6913498, "learning_rate": 1.6883428290039046e-06, "loss": 0.76845062, "num_input_tokens_seen": 201812520, "router_z_loss_clip": 1.51464844, "router_z_loss_mlp": 0.11383057, "step": 9369, "time_per_iteration": 2.5732569694519043 }, { "auxiliary_loss_clip": 0.06435283, "auxiliary_loss_mlp": 0.01270474, "balance_loss_clip": 0.0628256, "balance_loss_mlp": 0.01259454, "epoch": 0.5633548774988727, "flos": 30490530969600.0, "grad_norm": 1.8243826995752266, "language_loss": 0.76388931, "learning_rate": 1.6879581308541763e-06, "loss": 0.84094691, "num_input_tokens_seen": 201834185, "router_z_loss_clip": 1.52539062, "router_z_loss_mlp": 0.11016846, "step": 9370, "time_per_iteration": 2.6517648696899414 }, { "auxiliary_loss_clip": 0.06439876, "auxiliary_loss_mlp": 0.01265171, "balance_loss_clip": 0.06281407, "balance_loss_mlp": 0.01252654, "epoch": 0.5634150007515406, "flos": 18520938748800.0, "grad_norm": 2.2674452073265687, "language_loss": 0.75955713, "learning_rate": 1.687573444537108e-06, "loss": 0.83660758, "num_input_tokens_seen": 201851305, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.12512207, "step": 9371, "time_per_iteration": 2.540593147277832 }, { "auxiliary_loss_clip": 0.06435452, "auxiliary_loss_mlp": 0.01267591, "balance_loss_clip": 0.06282903, "balance_loss_mlp": 0.01257017, "epoch": 0.5634751240042086, "flos": 19250679957120.0, "grad_norm": 1.777404514547929, "language_loss": 0.76145327, "learning_rate": 1.687188770067285e-06, "loss": 0.83848375, "num_input_tokens_seen": 201870350, "router_z_loss_clip": 1.52539062, "router_z_loss_mlp": 0.10559082, "step": 9372, "time_per_iteration": 4.001741170883179 }, { "auxiliary_loss_clip": 0.06437592, "auxiliary_loss_mlp": 0.01267763, "balance_loss_clip": 0.06286108, "balance_loss_mlp": 0.01256527, "epoch": 0.5635352472568766, "flos": 12025453766400.0, "grad_norm": 1.90072450883886, "language_loss": 0.7154237, "learning_rate": 1.6868041074592956e-06, "loss": 0.79247725, "num_input_tokens_seen": 201886800, "router_z_loss_clip": 1.51269531, "router_z_loss_mlp": 0.11242676, "step": 9373, "time_per_iteration": 2.52319073677063 }, { "auxiliary_loss_clip": 0.06441015, "auxiliary_loss_mlp": 0.01271444, "balance_loss_clip": 0.06287867, "balance_loss_mlp": 0.01259052, "epoch": 0.5635953705095446, "flos": 21878092108800.0, "grad_norm": 1.958087850480274, "language_loss": 0.83685863, "learning_rate": 1.6864194567277264e-06, "loss": 0.91398323, "num_input_tokens_seen": 201904730, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.1239624, "step": 9374, "time_per_iteration": 2.536977767944336 }, { "auxiliary_loss_clip": 0.06435173, "auxiliary_loss_mlp": 0.01267974, "balance_loss_clip": 0.06284674, "balance_loss_mlp": 0.01257609, "epoch": 0.5636554937622126, "flos": 27133587244800.0, "grad_norm": 1.5882967468248492, "language_loss": 0.66543865, "learning_rate": 1.6860348178871618e-06, "loss": 0.74247015, "num_input_tokens_seen": 201924850, "router_z_loss_clip": 1.50195312, "router_z_loss_mlp": 0.1036377, "step": 9375, "time_per_iteration": 2.582951545715332 }, { "auxiliary_loss_clip": 0.06442292, "auxiliary_loss_mlp": 0.0126578, "balance_loss_clip": 0.06287497, "balance_loss_mlp": 0.01254241, "epoch": 0.5637156170148805, "flos": 12930314008320.0, "grad_norm": 2.137792603257981, "language_loss": 0.81204641, "learning_rate": 1.6856501909521889e-06, "loss": 0.88912714, "num_input_tokens_seen": 201939500, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.11553955, "step": 9376, "time_per_iteration": 3.9101951122283936 }, { "auxiliary_loss_clip": 0.06444083, "auxiliary_loss_mlp": 0.01265886, "balance_loss_clip": 0.06285786, "balance_loss_mlp": 0.01254418, "epoch": 0.5637757402675485, "flos": 45561460435200.0, "grad_norm": 1.3007596847921499, "language_loss": 0.69693816, "learning_rate": 1.6852655759373925e-06, "loss": 0.77403784, "num_input_tokens_seen": 201963000, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.11468506, "step": 9377, "time_per_iteration": 2.756732940673828 }, { "auxiliary_loss_clip": 0.06434444, "auxiliary_loss_mlp": 0.01269029, "balance_loss_clip": 0.06286614, "balance_loss_mlp": 0.01258282, "epoch": 0.5638358635202164, "flos": 20892241296000.0, "grad_norm": 1.869007108957883, "language_loss": 0.74669898, "learning_rate": 1.6848809728573565e-06, "loss": 0.82373375, "num_input_tokens_seen": 201983145, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.10742188, "step": 9378, "time_per_iteration": 2.5475707054138184 }, { "auxiliary_loss_clip": 0.06445402, "auxiliary_loss_mlp": 0.01267689, "balance_loss_clip": 0.06282619, "balance_loss_mlp": 0.01255148, "epoch": 0.5638959867728844, "flos": 18812449503360.0, "grad_norm": 2.2086787136099315, "language_loss": 0.82113117, "learning_rate": 1.6844963817266656e-06, "loss": 0.89826208, "num_input_tokens_seen": 202000335, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.12530518, "step": 9379, "time_per_iteration": 2.512822389602661 }, { "auxiliary_loss_clip": 0.06439286, "auxiliary_loss_mlp": 0.01266033, "balance_loss_clip": 0.06284759, "balance_loss_mlp": 0.0125441, "epoch": 0.5639561100255523, "flos": 27497703162240.0, "grad_norm": 2.2319351720964176, "language_loss": 0.71675158, "learning_rate": 1.6841118025599042e-06, "loss": 0.79380482, "num_input_tokens_seen": 202018275, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.11627197, "step": 9380, "time_per_iteration": 2.5733726024627686 }, { "auxiliary_loss_clip": 0.06444339, "auxiliary_loss_mlp": 0.01266752, "balance_loss_clip": 0.06288173, "balance_loss_mlp": 0.01254688, "epoch": 0.5640162332782204, "flos": 18082289024640.0, "grad_norm": 2.1679307438255546, "language_loss": 0.74999237, "learning_rate": 1.6837272353716542e-06, "loss": 0.82710326, "num_input_tokens_seen": 202034330, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.12054443, "step": 9381, "time_per_iteration": 2.629653215408325 }, { "auxiliary_loss_clip": 0.06442863, "auxiliary_loss_mlp": 0.01269252, "balance_loss_clip": 0.06287272, "balance_loss_mlp": 0.01257969, "epoch": 0.5640763565308883, "flos": 20890857703680.0, "grad_norm": 1.8843968021992559, "language_loss": 0.7254442, "learning_rate": 1.683342680176499e-06, "loss": 0.80256534, "num_input_tokens_seen": 202053100, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.112854, "step": 9382, "time_per_iteration": 2.5736920833587646 }, { "auxiliary_loss_clip": 0.0632734, "auxiliary_loss_mlp": 0.01252014, "balance_loss_clip": 0.06265934, "balance_loss_mlp": 0.01250503, "epoch": 0.5641364797835563, "flos": 64467143205120.0, "grad_norm": 0.7401023918386402, "language_loss": 0.54438674, "learning_rate": 1.682958136989022e-06, "loss": 0.62018037, "num_input_tokens_seen": 202120125, "router_z_loss_clip": 0.61328125, "router_z_loss_mlp": 0.0151062, "step": 9383, "time_per_iteration": 3.294226884841919 }, { "auxiliary_loss_clip": 0.06444423, "auxiliary_loss_mlp": 0.01268862, "balance_loss_clip": 0.06283323, "balance_loss_mlp": 0.01257156, "epoch": 0.5641966030362242, "flos": 18666861834240.0, "grad_norm": 1.7737289000361125, "language_loss": 0.71238273, "learning_rate": 1.6825736058238033e-06, "loss": 0.78951561, "num_input_tokens_seen": 202138030, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.11706543, "step": 9384, "time_per_iteration": 2.5389184951782227 }, { "auxiliary_loss_clip": 0.06439476, "auxiliary_loss_mlp": 0.01267352, "balance_loss_clip": 0.06283273, "balance_loss_mlp": 0.01255562, "epoch": 0.5642567262888922, "flos": 22498946536320.0, "grad_norm": 1.7608373790336675, "language_loss": 0.76245856, "learning_rate": 1.6821890866954263e-06, "loss": 0.83952683, "num_input_tokens_seen": 202155580, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.11785889, "step": 9385, "time_per_iteration": 2.77807879447937 }, { "auxiliary_loss_clip": 0.0643392, "auxiliary_loss_mlp": 0.01266893, "balance_loss_clip": 0.06282597, "balance_loss_mlp": 0.0125598, "epoch": 0.5643168495415603, "flos": 13008663175680.0, "grad_norm": 1.7601787755821952, "language_loss": 0.82511306, "learning_rate": 1.6818045796184703e-06, "loss": 0.90212113, "num_input_tokens_seen": 202170365, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.10900879, "step": 9386, "time_per_iteration": 3.9991071224212646 }, { "auxiliary_loss_clip": 0.06445052, "auxiliary_loss_mlp": 0.01266887, "balance_loss_clip": 0.06284851, "balance_loss_mlp": 0.01255574, "epoch": 0.5643769727942282, "flos": 18594256671360.0, "grad_norm": 10.679346404809559, "language_loss": 0.70614213, "learning_rate": 1.681420084607516e-06, "loss": 0.78326154, "num_input_tokens_seen": 202189095, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.11315918, "step": 9387, "time_per_iteration": 3.989227533340454 }, { "auxiliary_loss_clip": 0.06437526, "auxiliary_loss_mlp": 0.01265598, "balance_loss_clip": 0.06279898, "balance_loss_mlp": 0.01254494, "epoch": 0.5644370960468962, "flos": 33815343853440.0, "grad_norm": 1.4807734252924751, "language_loss": 0.75115371, "learning_rate": 1.6810356016771452e-06, "loss": 0.82818496, "num_input_tokens_seen": 202213500, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.11102295, "step": 9388, "time_per_iteration": 2.690692901611328 }, { "auxiliary_loss_clip": 0.06433254, "auxiliary_loss_mlp": 0.01265561, "balance_loss_clip": 0.06283376, "balance_loss_mlp": 0.0125497, "epoch": 0.5644972192995641, "flos": 21221249552640.0, "grad_norm": 1.4732432111006633, "language_loss": 0.82000256, "learning_rate": 1.6806511308419353e-06, "loss": 0.89699066, "num_input_tokens_seen": 202231920, "router_z_loss_clip": 1.49902344, "router_z_loss_mlp": 0.10595703, "step": 9389, "time_per_iteration": 2.5535528659820557 }, { "auxiliary_loss_clip": 0.06442005, "auxiliary_loss_mlp": 0.01267289, "balance_loss_clip": 0.06285421, "balance_loss_mlp": 0.01254761, "epoch": 0.5645573425522321, "flos": 18593585838720.0, "grad_norm": 2.169624998701895, "language_loss": 0.64338899, "learning_rate": 1.680266672116467e-06, "loss": 0.72048187, "num_input_tokens_seen": 202247600, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.12518311, "step": 9390, "time_per_iteration": 2.5155491828918457 }, { "auxiliary_loss_clip": 0.06433365, "auxiliary_loss_mlp": 0.01266736, "balance_loss_clip": 0.06282567, "balance_loss_mlp": 0.0125593, "epoch": 0.5646174658049, "flos": 18119660745600.0, "grad_norm": 1.813666891535031, "language_loss": 0.92683518, "learning_rate": 1.6798822255153192e-06, "loss": 1.00383615, "num_input_tokens_seen": 202265350, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.10803223, "step": 9391, "time_per_iteration": 2.5105488300323486 }, { "auxiliary_loss_clip": 0.0644701, "auxiliary_loss_mlp": 0.01267143, "balance_loss_clip": 0.06284553, "balance_loss_mlp": 0.01254697, "epoch": 0.564677589057568, "flos": 28337547035520.0, "grad_norm": 1.957586163806332, "language_loss": 0.60221934, "learning_rate": 1.6794977910530684e-06, "loss": 0.67936087, "num_input_tokens_seen": 202284285, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.12432861, "step": 9392, "time_per_iteration": 2.572854995727539 }, { "auxiliary_loss_clip": 0.0643248, "auxiliary_loss_mlp": 0.01266089, "balance_loss_clip": 0.06279741, "balance_loss_mlp": 0.01254996, "epoch": 0.564737712310236, "flos": 22170273696000.0, "grad_norm": 2.004347568184139, "language_loss": 0.81939995, "learning_rate": 1.6791133687442937e-06, "loss": 0.89638561, "num_input_tokens_seen": 202303450, "router_z_loss_clip": 1.52636719, "router_z_loss_mlp": 0.11083984, "step": 9393, "time_per_iteration": 2.543156385421753 }, { "auxiliary_loss_clip": 0.0643737, "auxiliary_loss_mlp": 0.01266254, "balance_loss_clip": 0.06283242, "balance_loss_mlp": 0.01255066, "epoch": 0.564797835562904, "flos": 20965223802240.0, "grad_norm": 1.6286723605371922, "language_loss": 0.87204123, "learning_rate": 1.6787289586035725e-06, "loss": 0.94907749, "num_input_tokens_seen": 202322315, "router_z_loss_clip": 1.54199219, "router_z_loss_mlp": 0.11193848, "step": 9394, "time_per_iteration": 2.5323798656463623 }, { "auxiliary_loss_clip": 0.06437927, "auxiliary_loss_mlp": 0.01268034, "balance_loss_clip": 0.06286386, "balance_loss_mlp": 0.01257698, "epoch": 0.5648579588155719, "flos": 17425991520000.0, "grad_norm": 1.7094118914334193, "language_loss": 0.85019374, "learning_rate": 1.6783445606454814e-06, "loss": 0.92725337, "num_input_tokens_seen": 202339905, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.10321045, "step": 9395, "time_per_iteration": 2.5309972763061523 }, { "auxiliary_loss_clip": 0.06329216, "auxiliary_loss_mlp": 0.01252476, "balance_loss_clip": 0.062681, "balance_loss_mlp": 0.01251049, "epoch": 0.5649180820682399, "flos": 69951187152000.0, "grad_norm": 0.7712595402081096, "language_loss": 0.57954657, "learning_rate": 1.677960174884597e-06, "loss": 0.65536356, "num_input_tokens_seen": 202397320, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.01425171, "step": 9396, "time_per_iteration": 3.1606929302215576 }, { "auxiliary_loss_clip": 0.06446126, "auxiliary_loss_mlp": 0.012642, "balance_loss_clip": 0.0628906, "balance_loss_mlp": 0.01253537, "epoch": 0.5649782053209078, "flos": 24980058259200.0, "grad_norm": 1.8758312198004368, "language_loss": 0.70619506, "learning_rate": 1.6775758013354943e-06, "loss": 0.78329825, "num_input_tokens_seen": 202416865, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.10662842, "step": 9397, "time_per_iteration": 2.5748040676116943 }, { "auxiliary_loss_clip": 0.06438302, "auxiliary_loss_mlp": 0.01266692, "balance_loss_clip": 0.06281336, "balance_loss_mlp": 0.01255618, "epoch": 0.5650383285735758, "flos": 21733175272320.0, "grad_norm": 1.749615477847939, "language_loss": 0.67083716, "learning_rate": 1.67719144001275e-06, "loss": 0.74788713, "num_input_tokens_seen": 202436210, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.11065674, "step": 9398, "time_per_iteration": 2.55898118019104 }, { "auxiliary_loss_clip": 0.06330349, "auxiliary_loss_mlp": 0.0125239, "balance_loss_clip": 0.06269032, "balance_loss_mlp": 0.01250868, "epoch": 0.5650984518262439, "flos": 65923481093760.0, "grad_norm": 0.7613780168131394, "language_loss": 0.58136052, "learning_rate": 1.6768070909309386e-06, "loss": 0.65718794, "num_input_tokens_seen": 202492925, "router_z_loss_clip": 0.61328125, "router_z_loss_mlp": 0.01521301, "step": 9399, "time_per_iteration": 3.130831718444824 }, { "auxiliary_loss_clip": 0.0644559, "auxiliary_loss_mlp": 0.01267838, "balance_loss_clip": 0.06285879, "balance_loss_mlp": 0.01256125, "epoch": 0.5651585750789118, "flos": 21038919068160.0, "grad_norm": 1.7013923476391977, "language_loss": 0.73637986, "learning_rate": 1.6764227541046347e-06, "loss": 0.81351417, "num_input_tokens_seen": 202511905, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.11700439, "step": 9400, "time_per_iteration": 2.567084789276123 }, { "auxiliary_loss_clip": 0.06447654, "auxiliary_loss_mlp": 0.01267496, "balance_loss_clip": 0.06288157, "balance_loss_mlp": 0.01255247, "epoch": 0.5652186983315798, "flos": 18557891199360.0, "grad_norm": 2.0377708867860047, "language_loss": 0.60842574, "learning_rate": 1.676038429548412e-06, "loss": 0.68557727, "num_input_tokens_seen": 202529815, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.12243652, "step": 9401, "time_per_iteration": 2.549679756164551 }, { "auxiliary_loss_clip": 0.06438999, "auxiliary_loss_mlp": 0.01266469, "balance_loss_clip": 0.06285465, "balance_loss_mlp": 0.01256008, "epoch": 0.5652788215842477, "flos": 18484573276800.0, "grad_norm": 1.8274861163743346, "language_loss": 0.81338394, "learning_rate": 1.6756541172768453e-06, "loss": 0.89043868, "num_input_tokens_seen": 202547710, "router_z_loss_clip": 1.53417969, "router_z_loss_mlp": 0.10461426, "step": 9402, "time_per_iteration": 2.5267326831817627 }, { "auxiliary_loss_clip": 0.06436382, "auxiliary_loss_mlp": 0.01265653, "balance_loss_clip": 0.06285065, "balance_loss_mlp": 0.01254858, "epoch": 0.5653389448369157, "flos": 30051797391360.0, "grad_norm": 1.454424029016483, "language_loss": 0.77959883, "learning_rate": 1.6752698173045068e-06, "loss": 0.85661912, "num_input_tokens_seen": 202568835, "router_z_loss_clip": 1.50976562, "router_z_loss_mlp": 0.10791016, "step": 9403, "time_per_iteration": 2.624103307723999 }, { "auxiliary_loss_clip": 0.06440195, "auxiliary_loss_mlp": 0.01265578, "balance_loss_clip": 0.06285505, "balance_loss_mlp": 0.01254683, "epoch": 0.5653990680895836, "flos": 16733202762240.0, "grad_norm": 1.6081078484576063, "language_loss": 0.69489801, "learning_rate": 1.6748855296459685e-06, "loss": 0.77195579, "num_input_tokens_seen": 202587385, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.10894775, "step": 9404, "time_per_iteration": 2.546022415161133 }, { "auxiliary_loss_clip": 0.06428798, "auxiliary_loss_mlp": 0.01265123, "balance_loss_clip": 0.06279134, "balance_loss_mlp": 0.01254632, "epoch": 0.5654591913422516, "flos": 14543517939840.0, "grad_norm": 1.7903723814140922, "language_loss": 0.67744124, "learning_rate": 1.6745012543158045e-06, "loss": 0.7543804, "num_input_tokens_seen": 202604815, "router_z_loss_clip": 1.49902344, "router_z_loss_mlp": 0.1048584, "step": 9405, "time_per_iteration": 2.5070228576660156 }, { "auxiliary_loss_clip": 0.06436171, "auxiliary_loss_mlp": 0.01265149, "balance_loss_clip": 0.06286828, "balance_loss_mlp": 0.0125445, "epoch": 0.5655193145949196, "flos": 26216484307200.0, "grad_norm": 1.665616183661308, "language_loss": 0.74487352, "learning_rate": 1.6741169913285852e-06, "loss": 0.82188678, "num_input_tokens_seen": 202623775, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.10699463, "step": 9406, "time_per_iteration": 2.5857503414154053 }, { "auxiliary_loss_clip": 0.06440187, "auxiliary_loss_mlp": 0.01267381, "balance_loss_clip": 0.06282046, "balance_loss_mlp": 0.01255705, "epoch": 0.5655794378475876, "flos": 25053669671040.0, "grad_norm": 1.7303730903942296, "language_loss": 0.79998606, "learning_rate": 1.673732740698882e-06, "loss": 0.87706172, "num_input_tokens_seen": 202643375, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.11669922, "step": 9407, "time_per_iteration": 2.576742649078369 }, { "auxiliary_loss_clip": 0.06432053, "auxiliary_loss_mlp": 0.01271435, "balance_loss_clip": 0.06284348, "balance_loss_mlp": 0.01260211, "epoch": 0.5656395611002555, "flos": 31041379710720.0, "grad_norm": 1.5778674208059686, "language_loss": 0.70985603, "learning_rate": 1.6733485024412666e-06, "loss": 0.78689098, "num_input_tokens_seen": 202668400, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.11236572, "step": 9408, "time_per_iteration": 2.659465789794922 }, { "auxiliary_loss_clip": 0.06435559, "auxiliary_loss_mlp": 0.01265115, "balance_loss_clip": 0.06285582, "balance_loss_mlp": 0.01254839, "epoch": 0.5656996843529235, "flos": 20235650302080.0, "grad_norm": 1.8807392514257195, "language_loss": 0.81770885, "learning_rate": 1.672964276570308e-06, "loss": 0.89471555, "num_input_tokens_seen": 202685125, "router_z_loss_clip": 1.49902344, "router_z_loss_mlp": 0.10272217, "step": 9409, "time_per_iteration": 2.53195858001709 }, { "auxiliary_loss_clip": 0.0643834, "auxiliary_loss_mlp": 0.01269015, "balance_loss_clip": 0.06283961, "balance_loss_mlp": 0.01257774, "epoch": 0.5657598076055914, "flos": 21002595523200.0, "grad_norm": 1.678113297939816, "language_loss": 0.78580141, "learning_rate": 1.6725800631005776e-06, "loss": 0.86287498, "num_input_tokens_seen": 202703830, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.11224365, "step": 9410, "time_per_iteration": 2.5438232421875 }, { "auxiliary_loss_clip": 0.06437075, "auxiliary_loss_mlp": 0.01267461, "balance_loss_clip": 0.06283659, "balance_loss_mlp": 0.01256869, "epoch": 0.5658199308582594, "flos": 11550690132480.0, "grad_norm": 2.0226316501828463, "language_loss": 0.83756888, "learning_rate": 1.6721958620466432e-06, "loss": 0.91461426, "num_input_tokens_seen": 202719835, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.105896, "step": 9411, "time_per_iteration": 2.535630464553833 }, { "auxiliary_loss_clip": 0.0644962, "auxiliary_loss_mlp": 0.01266072, "balance_loss_clip": 0.06289248, "balance_loss_mlp": 0.01254145, "epoch": 0.5658800541109275, "flos": 14177137962240.0, "grad_norm": 2.4604211958826108, "language_loss": 0.67533851, "learning_rate": 1.6718116734230749e-06, "loss": 0.75249547, "num_input_tokens_seen": 202736795, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.11932373, "step": 9412, "time_per_iteration": 4.011788606643677 }, { "auxiliary_loss_clip": 0.0643104, "auxiliary_loss_mlp": 0.01264522, "balance_loss_clip": 0.06282245, "balance_loss_mlp": 0.0125452, "epoch": 0.5659401773635954, "flos": 27311934660480.0, "grad_norm": 1.5346960254077693, "language_loss": 0.5833599, "learning_rate": 1.6714274972444413e-06, "loss": 0.66031551, "num_input_tokens_seen": 202756900, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.10009766, "step": 9413, "time_per_iteration": 2.59812331199646 }, { "auxiliary_loss_clip": 0.06433022, "auxiliary_loss_mlp": 0.01263946, "balance_loss_clip": 0.06281887, "balance_loss_mlp": 0.01253735, "epoch": 0.5660003006162634, "flos": 16733957448960.0, "grad_norm": 1.6985719310731617, "language_loss": 0.69704258, "learning_rate": 1.6710433335253092e-06, "loss": 0.77401221, "num_input_tokens_seen": 202775145, "router_z_loss_clip": 1.51269531, "router_z_loss_mlp": 0.10211182, "step": 9414, "time_per_iteration": 2.528693675994873 }, { "auxiliary_loss_clip": 0.06433904, "auxiliary_loss_mlp": 0.01268899, "balance_loss_clip": 0.06283021, "balance_loss_mlp": 0.01258516, "epoch": 0.5660604238689313, "flos": 21659983130880.0, "grad_norm": 1.4393574443200894, "language_loss": 0.7805301, "learning_rate": 1.670659182280247e-06, "loss": 0.85755813, "num_input_tokens_seen": 202794505, "router_z_loss_clip": 1.50976562, "router_z_loss_mlp": 0.10375977, "step": 9415, "time_per_iteration": 2.5673890113830566 }, { "auxiliary_loss_clip": 0.06336886, "auxiliary_loss_mlp": 0.0125553, "balance_loss_clip": 0.06275579, "balance_loss_mlp": 0.01253965, "epoch": 0.5661205471215993, "flos": 68843619884160.0, "grad_norm": 0.6741317150086893, "language_loss": 0.49007523, "learning_rate": 1.670275043523822e-06, "loss": 0.56599939, "num_input_tokens_seen": 202858580, "router_z_loss_clip": 0.61474609, "router_z_loss_mlp": 0.01565552, "step": 9416, "time_per_iteration": 4.749706029891968 }, { "auxiliary_loss_clip": 0.06441723, "auxiliary_loss_mlp": 0.012659, "balance_loss_clip": 0.06286348, "balance_loss_mlp": 0.0125392, "epoch": 0.5661806703742672, "flos": 28629393206400.0, "grad_norm": 2.009406548184003, "language_loss": 0.6305216, "learning_rate": 1.6698909172706e-06, "loss": 0.70759785, "num_input_tokens_seen": 202878565, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.11975098, "step": 9417, "time_per_iteration": 2.6704249382019043 }, { "auxiliary_loss_clip": 0.06446148, "auxiliary_loss_mlp": 0.01271517, "balance_loss_clip": 0.06289964, "balance_loss_mlp": 0.01259579, "epoch": 0.5662407936269352, "flos": 21404418577920.0, "grad_norm": 1.7237624498774888, "language_loss": 0.69345659, "learning_rate": 1.6695068035351479e-06, "loss": 0.77063322, "num_input_tokens_seen": 202897350, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.11950684, "step": 9418, "time_per_iteration": 2.5601093769073486 }, { "auxiliary_loss_clip": 0.06443192, "auxiliary_loss_mlp": 0.01268357, "balance_loss_clip": 0.06288558, "balance_loss_mlp": 0.01256639, "epoch": 0.5663009168796032, "flos": 25666054836480.0, "grad_norm": 1.8534492350551288, "language_loss": 0.65054536, "learning_rate": 1.6691227023320304e-06, "loss": 0.72766083, "num_input_tokens_seen": 202916745, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.11724854, "step": 9419, "time_per_iteration": 2.5687403678894043 }, { "auxiliary_loss_clip": 0.0633375, "auxiliary_loss_mlp": 0.01251559, "balance_loss_clip": 0.06272362, "balance_loss_mlp": 0.01250079, "epoch": 0.5663610401322712, "flos": 67953014835840.0, "grad_norm": 0.7100383870036653, "language_loss": 0.59629816, "learning_rate": 1.6687386136758135e-06, "loss": 0.67215127, "num_input_tokens_seen": 202982375, "router_z_loss_clip": 0.61474609, "router_z_loss_mlp": 0.01478577, "step": 9420, "time_per_iteration": 3.2242910861968994 }, { "auxiliary_loss_clip": 0.0643666, "auxiliary_loss_mlp": 0.01264611, "balance_loss_clip": 0.06285349, "balance_loss_mlp": 0.01254543, "epoch": 0.5664211633849391, "flos": 24616487393280.0, "grad_norm": 2.3060843640374045, "language_loss": 0.7443338, "learning_rate": 1.6683545375810618e-06, "loss": 0.82134652, "num_input_tokens_seen": 203002430, "router_z_loss_clip": 1.51269531, "router_z_loss_mlp": 0.10070801, "step": 9421, "time_per_iteration": 2.5846071243286133 }, { "auxiliary_loss_clip": 0.06449819, "auxiliary_loss_mlp": 0.01269787, "balance_loss_clip": 0.06292893, "balance_loss_mlp": 0.01258588, "epoch": 0.5664812866376071, "flos": 11652407389440.0, "grad_norm": 1.8760483470980953, "language_loss": 0.73524624, "learning_rate": 1.6679704740623389e-06, "loss": 0.8124423, "num_input_tokens_seen": 203019425, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.11212158, "step": 9422, "time_per_iteration": 2.527712106704712 }, { "auxiliary_loss_clip": 0.0643917, "auxiliary_loss_mlp": 0.01266153, "balance_loss_clip": 0.06288604, "balance_loss_mlp": 0.01255811, "epoch": 0.566541409890275, "flos": 24650798440320.0, "grad_norm": 1.54434275174319, "language_loss": 0.81816661, "learning_rate": 1.6675864231342085e-06, "loss": 0.89521992, "num_input_tokens_seen": 203039035, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.10339355, "step": 9423, "time_per_iteration": 2.575644016265869 }, { "auxiliary_loss_clip": 0.06443658, "auxiliary_loss_mlp": 0.01271828, "balance_loss_clip": 0.06289848, "balance_loss_mlp": 0.01260968, "epoch": 0.566601533142943, "flos": 22276686781440.0, "grad_norm": 1.5664985255600399, "language_loss": 0.81239522, "learning_rate": 1.6672023848112353e-06, "loss": 0.88955009, "num_input_tokens_seen": 203059320, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.10864258, "step": 9424, "time_per_iteration": 2.566621780395508 }, { "auxiliary_loss_clip": 0.0644942, "auxiliary_loss_mlp": 0.01268235, "balance_loss_clip": 0.06290251, "balance_loss_mlp": 0.01255438, "epoch": 0.5666616563956111, "flos": 29979485717760.0, "grad_norm": 1.904746754893991, "language_loss": 0.78897011, "learning_rate": 1.6668183591079805e-06, "loss": 0.86614668, "num_input_tokens_seen": 203078490, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.12805176, "step": 9425, "time_per_iteration": 2.5976972579956055 }, { "auxiliary_loss_clip": 0.06443843, "auxiliary_loss_mlp": 0.01269162, "balance_loss_clip": 0.06291076, "balance_loss_mlp": 0.01258159, "epoch": 0.566721779648279, "flos": 17786585566080.0, "grad_norm": 1.8505955262448774, "language_loss": 0.591299, "learning_rate": 1.6664343460390064e-06, "loss": 0.66842902, "num_input_tokens_seen": 203096065, "router_z_loss_clip": 1.52734375, "router_z_loss_mlp": 0.10998535, "step": 9426, "time_per_iteration": 5.342297077178955 }, { "auxiliary_loss_clip": 0.06447916, "auxiliary_loss_mlp": 0.01270595, "balance_loss_clip": 0.06290057, "balance_loss_mlp": 0.01259694, "epoch": 0.566781902900947, "flos": 21039967244160.0, "grad_norm": 1.5875649752932992, "language_loss": 0.82322371, "learning_rate": 1.6660503456188764e-06, "loss": 0.90040886, "num_input_tokens_seen": 203115270, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.10900879, "step": 9427, "time_per_iteration": 2.61612868309021 }, { "auxiliary_loss_clip": 0.06440717, "auxiliary_loss_mlp": 0.0126984, "balance_loss_clip": 0.06290762, "balance_loss_mlp": 0.01259427, "epoch": 0.5668420261536149, "flos": 23155244040960.0, "grad_norm": 1.739970373976468, "language_loss": 0.8666476, "learning_rate": 1.6656663578621498e-06, "loss": 0.94375312, "num_input_tokens_seen": 203134290, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.10418701, "step": 9428, "time_per_iteration": 2.6830923557281494 }, { "auxiliary_loss_clip": 0.06447273, "auxiliary_loss_mlp": 0.0126754, "balance_loss_clip": 0.06289123, "balance_loss_mlp": 0.01255798, "epoch": 0.5669021494062829, "flos": 22608210660480.0, "grad_norm": 2.3460300547836734, "language_loss": 0.73438692, "learning_rate": 1.6652823827833886e-06, "loss": 0.81153512, "num_input_tokens_seen": 203152935, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.11749268, "step": 9429, "time_per_iteration": 2.593082904815674 }, { "auxiliary_loss_clip": 0.0644377, "auxiliary_loss_mlp": 0.01266332, "balance_loss_clip": 0.06286515, "balance_loss_mlp": 0.01254447, "epoch": 0.5669622726589508, "flos": 17386481520000.0, "grad_norm": 1.7774215752242453, "language_loss": 0.75764596, "learning_rate": 1.6648984203971538e-06, "loss": 0.83474702, "num_input_tokens_seen": 203170110, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.11883545, "step": 9430, "time_per_iteration": 2.518944263458252 }, { "auxiliary_loss_clip": 0.0643882, "auxiliary_loss_mlp": 0.01267467, "balance_loss_clip": 0.06284242, "balance_loss_mlp": 0.01256356, "epoch": 0.5670223959116188, "flos": 18767992112640.0, "grad_norm": 2.1216034299803583, "language_loss": 0.73424196, "learning_rate": 1.6645144707180032e-06, "loss": 0.81130481, "num_input_tokens_seen": 203188825, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.11102295, "step": 9431, "time_per_iteration": 2.6363985538482666 }, { "auxiliary_loss_clip": 0.06433689, "auxiliary_loss_mlp": 0.01271071, "balance_loss_clip": 0.06291181, "balance_loss_mlp": 0.01260575, "epoch": 0.5670825191642868, "flos": 13558463740800.0, "grad_norm": 1.5797068938260272, "language_loss": 0.7320534, "learning_rate": 1.6641305337604984e-06, "loss": 0.80910093, "num_input_tokens_seen": 203206860, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10491943, "step": 9432, "time_per_iteration": 2.597318410873413 }, { "auxiliary_loss_clip": 0.06440692, "auxiliary_loss_mlp": 0.0126787, "balance_loss_clip": 0.06286384, "balance_loss_mlp": 0.01257332, "epoch": 0.5671426424169548, "flos": 22060506447360.0, "grad_norm": 1.5530711811977473, "language_loss": 0.78465474, "learning_rate": 1.663746609539197e-06, "loss": 0.86174035, "num_input_tokens_seen": 203225625, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.10546875, "step": 9433, "time_per_iteration": 2.5942542552948 }, { "auxiliary_loss_clip": 0.06449988, "auxiliary_loss_mlp": 0.01272246, "balance_loss_clip": 0.06290227, "balance_loss_mlp": 0.01258943, "epoch": 0.5672027656696227, "flos": 21330262114560.0, "grad_norm": 1.8707452027494669, "language_loss": 0.6418047, "learning_rate": 1.6633626980686582e-06, "loss": 0.71902704, "num_input_tokens_seen": 203242920, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.13305664, "step": 9434, "time_per_iteration": 2.5592427253723145 }, { "auxiliary_loss_clip": 0.06435518, "auxiliary_loss_mlp": 0.0127016, "balance_loss_clip": 0.06285599, "balance_loss_mlp": 0.01259854, "epoch": 0.5672628889222907, "flos": 23520869331840.0, "grad_norm": 1.724239226197463, "language_loss": 0.67346168, "learning_rate": 1.6629787993634399e-06, "loss": 0.75051844, "num_input_tokens_seen": 203261995, "router_z_loss_clip": 1.49902344, "router_z_loss_mlp": 0.10302734, "step": 9435, "time_per_iteration": 2.605858564376831 }, { "auxiliary_loss_clip": 0.06437965, "auxiliary_loss_mlp": 0.0127052, "balance_loss_clip": 0.06285785, "balance_loss_mlp": 0.01259303, "epoch": 0.5673230121749586, "flos": 27128639854080.0, "grad_norm": 1.35028579859598, "language_loss": 0.71864694, "learning_rate": 1.6625949134380984e-06, "loss": 0.79573178, "num_input_tokens_seen": 203280670, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.11224365, "step": 9436, "time_per_iteration": 2.5970261096954346 }, { "auxiliary_loss_clip": 0.06447184, "auxiliary_loss_mlp": 0.01265696, "balance_loss_clip": 0.06288706, "balance_loss_mlp": 0.01254865, "epoch": 0.5673831354276266, "flos": 31150476126720.0, "grad_norm": 1.5744842827245982, "language_loss": 0.74430746, "learning_rate": 1.6622110403071921e-06, "loss": 0.82143623, "num_input_tokens_seen": 203304800, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.10827637, "step": 9437, "time_per_iteration": 2.66141414642334 }, { "auxiliary_loss_clip": 0.06446292, "auxiliary_loss_mlp": 0.01271555, "balance_loss_clip": 0.06289926, "balance_loss_mlp": 0.01259551, "epoch": 0.5674432586802945, "flos": 27680662552320.0, "grad_norm": 1.7205252043790777, "language_loss": 0.61569077, "learning_rate": 1.661827179985277e-06, "loss": 0.69286931, "num_input_tokens_seen": 203324060, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.11999512, "step": 9438, "time_per_iteration": 2.5882132053375244 }, { "auxiliary_loss_clip": 0.06441693, "auxiliary_loss_mlp": 0.01265881, "balance_loss_clip": 0.06285058, "balance_loss_mlp": 0.0125458, "epoch": 0.5675033819329626, "flos": 26622458138880.0, "grad_norm": 1.4110839218483247, "language_loss": 0.75556093, "learning_rate": 1.661443332486909e-06, "loss": 0.83263671, "num_input_tokens_seen": 203344360, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.11291504, "step": 9439, "time_per_iteration": 2.6010091304779053 }, { "auxiliary_loss_clip": 0.06440223, "auxiliary_loss_mlp": 0.01267906, "balance_loss_clip": 0.06289481, "balance_loss_mlp": 0.0125586, "epoch": 0.5675635051856306, "flos": 19104295674240.0, "grad_norm": 1.8032476638238495, "language_loss": 0.84611714, "learning_rate": 1.6610594978266438e-06, "loss": 0.92319834, "num_input_tokens_seen": 203362115, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.12054443, "step": 9440, "time_per_iteration": 2.548123359680176 }, { "auxiliary_loss_clip": 0.06447265, "auxiliary_loss_mlp": 0.01268411, "balance_loss_clip": 0.06286956, "balance_loss_mlp": 0.01256448, "epoch": 0.5676236284382985, "flos": 17572040386560.0, "grad_norm": 1.8446458808896604, "language_loss": 0.75612485, "learning_rate": 1.6606756760190365e-06, "loss": 0.83328164, "num_input_tokens_seen": 203380550, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.11956787, "step": 9441, "time_per_iteration": 2.531691551208496 }, { "auxiliary_loss_clip": 0.06439139, "auxiliary_loss_mlp": 0.0126743, "balance_loss_clip": 0.0628522, "balance_loss_mlp": 0.01256344, "epoch": 0.5676837516909665, "flos": 15958375257600.0, "grad_norm": 1.8543307789358252, "language_loss": 0.83624697, "learning_rate": 1.6602918670786413e-06, "loss": 0.91331267, "num_input_tokens_seen": 203396590, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.11071777, "step": 9442, "time_per_iteration": 2.530283212661743 }, { "auxiliary_loss_clip": 0.06432305, "auxiliary_loss_mlp": 0.0126683, "balance_loss_clip": 0.06287351, "balance_loss_mlp": 0.0125631, "epoch": 0.5677438749436344, "flos": 18301739667840.0, "grad_norm": 2.053683692247587, "language_loss": 0.74655038, "learning_rate": 1.6599080710200126e-06, "loss": 0.82354176, "num_input_tokens_seen": 203414280, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10522461, "step": 9443, "time_per_iteration": 2.5267741680145264 }, { "auxiliary_loss_clip": 0.06438881, "auxiliary_loss_mlp": 0.01269502, "balance_loss_clip": 0.06284784, "balance_loss_mlp": 0.01257789, "epoch": 0.5678039981963025, "flos": 17937120625920.0, "grad_norm": 2.2093060492145598, "language_loss": 0.77854943, "learning_rate": 1.6595242878577046e-06, "loss": 0.85563326, "num_input_tokens_seen": 203433280, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.11706543, "step": 9444, "time_per_iteration": 2.546342134475708 }, { "auxiliary_loss_clip": 0.0644498, "auxiliary_loss_mlp": 0.0126822, "balance_loss_clip": 0.06287505, "balance_loss_mlp": 0.01256424, "epoch": 0.5678641214489704, "flos": 19322153089920.0, "grad_norm": 3.337750091404869, "language_loss": 0.81320262, "learning_rate": 1.6591405176062687e-06, "loss": 0.89033461, "num_input_tokens_seen": 203449935, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.11798096, "step": 9445, "time_per_iteration": 2.5314152240753174 }, { "auxiliary_loss_clip": 0.06436509, "auxiliary_loss_mlp": 0.01265606, "balance_loss_clip": 0.06282298, "balance_loss_mlp": 0.01254395, "epoch": 0.5679242447016384, "flos": 27759389063040.0, "grad_norm": 1.3684993431718677, "language_loss": 0.71169209, "learning_rate": 1.658756760280259e-06, "loss": 0.78871328, "num_input_tokens_seen": 203473025, "router_z_loss_clip": 1.54199219, "router_z_loss_mlp": 0.11199951, "step": 9446, "time_per_iteration": 2.6390514373779297 }, { "auxiliary_loss_clip": 0.06440514, "auxiliary_loss_mlp": 0.01266144, "balance_loss_clip": 0.06281213, "balance_loss_mlp": 0.01254605, "epoch": 0.5679843679543063, "flos": 23775888833280.0, "grad_norm": 1.7791161996373095, "language_loss": 0.73751938, "learning_rate": 1.6583730158942276e-06, "loss": 0.81458604, "num_input_tokens_seen": 203492895, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.11535645, "step": 9447, "time_per_iteration": 2.590371608734131 }, { "auxiliary_loss_clip": 0.06440962, "auxiliary_loss_mlp": 0.01269077, "balance_loss_clip": 0.06283337, "balance_loss_mlp": 0.0125752, "epoch": 0.5680444912069743, "flos": 25598732480640.0, "grad_norm": 1.962442400327548, "language_loss": 0.75824171, "learning_rate": 1.657989284462725e-06, "loss": 0.83534211, "num_input_tokens_seen": 203513710, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.11560059, "step": 9448, "time_per_iteration": 2.6376688480377197 }, { "auxiliary_loss_clip": 0.06440314, "auxiliary_loss_mlp": 0.01271216, "balance_loss_clip": 0.06281231, "balance_loss_mlp": 0.01259176, "epoch": 0.5681046144596422, "flos": 23702528983680.0, "grad_norm": 2.2206118664805055, "language_loss": 0.76494718, "learning_rate": 1.6576055660003038e-06, "loss": 0.84206247, "num_input_tokens_seen": 203531630, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.12042236, "step": 9449, "time_per_iteration": 2.5838024616241455 }, { "auxiliary_loss_clip": 0.06438351, "auxiliary_loss_mlp": 0.012726, "balance_loss_clip": 0.06282846, "balance_loss_mlp": 0.01261144, "epoch": 0.5681647377123102, "flos": 28008161435520.0, "grad_norm": 1.5021031350007206, "language_loss": 0.75036377, "learning_rate": 1.6572218605215128e-06, "loss": 0.82747328, "num_input_tokens_seen": 203551885, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.11462402, "step": 9450, "time_per_iteration": 2.6388721466064453 }, { "auxiliary_loss_clip": 0.06436666, "auxiliary_loss_mlp": 0.0126523, "balance_loss_clip": 0.06281875, "balance_loss_mlp": 0.01254317, "epoch": 0.5682248609649782, "flos": 22754427235200.0, "grad_norm": 2.2056353954461603, "language_loss": 0.66917646, "learning_rate": 1.6568381680409038e-06, "loss": 0.74619544, "num_input_tokens_seen": 203572250, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.10913086, "step": 9451, "time_per_iteration": 4.043081998825073 }, { "auxiliary_loss_clip": 0.06452425, "auxiliary_loss_mlp": 0.01268602, "balance_loss_clip": 0.06286845, "balance_loss_mlp": 0.01255656, "epoch": 0.5682849842176462, "flos": 21295070599680.0, "grad_norm": 2.363570677439875, "language_loss": 0.72432554, "learning_rate": 1.656454488573026e-06, "loss": 0.80153584, "num_input_tokens_seen": 203590605, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.12939453, "step": 9452, "time_per_iteration": 2.5610742568969727 }, { "auxiliary_loss_clip": 0.06433453, "auxiliary_loss_mlp": 0.01266199, "balance_loss_clip": 0.06282258, "balance_loss_mlp": 0.01254714, "epoch": 0.5683451074703142, "flos": 21147973557120.0, "grad_norm": 1.5083068627241645, "language_loss": 0.70474327, "learning_rate": 1.656070822132428e-06, "loss": 0.78173977, "num_input_tokens_seen": 203610080, "router_z_loss_clip": 1.51074219, "router_z_loss_mlp": 0.11480713, "step": 9453, "time_per_iteration": 2.556417942047119 }, { "auxiliary_loss_clip": 0.06431328, "auxiliary_loss_mlp": 0.01265331, "balance_loss_clip": 0.06278262, "balance_loss_mlp": 0.01254978, "epoch": 0.5684052307229821, "flos": 22350759390720.0, "grad_norm": 1.6689333487309534, "language_loss": 0.70326602, "learning_rate": 1.6556871687336592e-06, "loss": 0.78023255, "num_input_tokens_seen": 203630060, "router_z_loss_clip": 1.52929688, "router_z_loss_mlp": 0.10357666, "step": 9454, "time_per_iteration": 2.552400827407837 }, { "auxiliary_loss_clip": 0.06430331, "auxiliary_loss_mlp": 0.01267266, "balance_loss_clip": 0.06279729, "balance_loss_mlp": 0.01257228, "epoch": 0.5684653539756501, "flos": 21805067675520.0, "grad_norm": 2.2690260343989372, "language_loss": 0.60600847, "learning_rate": 1.6553035283912671e-06, "loss": 0.68298447, "num_input_tokens_seen": 203649065, "router_z_loss_clip": 1.50683594, "router_z_loss_mlp": 0.10040283, "step": 9455, "time_per_iteration": 4.040551662445068 }, { "auxiliary_loss_clip": 0.06446079, "auxiliary_loss_mlp": 0.01270457, "balance_loss_clip": 0.06286203, "balance_loss_mlp": 0.01258548, "epoch": 0.568525477228318, "flos": 23005757157120.0, "grad_norm": 2.60082653876788, "language_loss": 0.73671019, "learning_rate": 1.6549199011198e-06, "loss": 0.81387556, "num_input_tokens_seen": 203667545, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.11907959, "step": 9456, "time_per_iteration": 2.5635321140289307 }, { "auxiliary_loss_clip": 0.06430774, "auxiliary_loss_mlp": 0.01266958, "balance_loss_clip": 0.06277713, "balance_loss_mlp": 0.01256813, "epoch": 0.568585600480986, "flos": 21398045667840.0, "grad_norm": 1.7633043373913968, "language_loss": 0.77354896, "learning_rate": 1.6545362869338048e-06, "loss": 0.85052627, "num_input_tokens_seen": 203686025, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.10150146, "step": 9457, "time_per_iteration": 2.5548579692840576 }, { "auxiliary_loss_clip": 0.06437057, "auxiliary_loss_mlp": 0.01267188, "balance_loss_clip": 0.06281053, "balance_loss_mlp": 0.0125572, "epoch": 0.568645723733654, "flos": 30015054576000.0, "grad_norm": 2.234358251882741, "language_loss": 0.67340219, "learning_rate": 1.6541526858478285e-06, "loss": 0.75044465, "num_input_tokens_seen": 203705540, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.11468506, "step": 9458, "time_per_iteration": 2.6163089275360107 }, { "auxiliary_loss_clip": 0.06437225, "auxiliary_loss_mlp": 0.0126295, "balance_loss_clip": 0.06279275, "balance_loss_mlp": 0.01251602, "epoch": 0.568705846986322, "flos": 20418945108480.0, "grad_norm": 2.2547619161721304, "language_loss": 0.68797362, "learning_rate": 1.6537690978764167e-06, "loss": 0.76497531, "num_input_tokens_seen": 203723670, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.11352539, "step": 9459, "time_per_iteration": 2.5583715438842773 }, { "auxiliary_loss_clip": 0.06439021, "auxiliary_loss_mlp": 0.01264023, "balance_loss_clip": 0.0627961, "balance_loss_mlp": 0.01252699, "epoch": 0.5687659702389899, "flos": 17462440846080.0, "grad_norm": 2.2100959266919156, "language_loss": 0.77303624, "learning_rate": 1.6533855230341155e-06, "loss": 0.85006678, "num_input_tokens_seen": 203739705, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.11322021, "step": 9460, "time_per_iteration": 2.5196850299835205 }, { "auxiliary_loss_clip": 0.06440554, "auxiliary_loss_mlp": 0.01269249, "balance_loss_clip": 0.06281462, "balance_loss_mlp": 0.01257656, "epoch": 0.5688260934916579, "flos": 25412335073280.0, "grad_norm": 2.0137445501849025, "language_loss": 0.72527897, "learning_rate": 1.65300196133547e-06, "loss": 0.80237705, "num_input_tokens_seen": 203759000, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.11590576, "step": 9461, "time_per_iteration": 2.6735265254974365 }, { "auxiliary_loss_clip": 0.06432892, "auxiliary_loss_mlp": 0.01264104, "balance_loss_clip": 0.06279665, "balance_loss_mlp": 0.01252874, "epoch": 0.5688862167443258, "flos": 21613052044800.0, "grad_norm": 1.9256503797647961, "language_loss": 0.7334832, "learning_rate": 1.6526184127950249e-06, "loss": 0.81045312, "num_input_tokens_seen": 203774295, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.11242676, "step": 9462, "time_per_iteration": 2.5397331714630127 }, { "auxiliary_loss_clip": 0.06431431, "auxiliary_loss_mlp": 0.01266558, "balance_loss_clip": 0.06283018, "balance_loss_mlp": 0.01256645, "epoch": 0.5689463399969938, "flos": 22425544759680.0, "grad_norm": 1.894581093631917, "language_loss": 0.72709918, "learning_rate": 1.6522348774273246e-06, "loss": 0.80407906, "num_input_tokens_seen": 203792710, "router_z_loss_clip": 1.48339844, "router_z_loss_mlp": 0.09912109, "step": 9463, "time_per_iteration": 2.595482349395752 }, { "auxiliary_loss_clip": 0.06431764, "auxiliary_loss_mlp": 0.01265877, "balance_loss_clip": 0.06276967, "balance_loss_mlp": 0.01255202, "epoch": 0.5690064632496618, "flos": 18302787843840.0, "grad_norm": 1.7106560003821574, "language_loss": 0.74543786, "learning_rate": 1.6518513552469123e-06, "loss": 0.82241428, "num_input_tokens_seen": 203811645, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.10681152, "step": 9464, "time_per_iteration": 2.545743465423584 }, { "auxiliary_loss_clip": 0.06434079, "auxiliary_loss_mlp": 0.01264675, "balance_loss_clip": 0.06279564, "balance_loss_mlp": 0.01253225, "epoch": 0.5690665865023298, "flos": 21585575105280.0, "grad_norm": 4.261042641284849, "language_loss": 0.84427017, "learning_rate": 1.6514678462683312e-06, "loss": 0.92125773, "num_input_tokens_seen": 203830040, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.11462402, "step": 9465, "time_per_iteration": 3.9682416915893555 }, { "auxiliary_loss_clip": 0.06431741, "auxiliary_loss_mlp": 0.01265222, "balance_loss_clip": 0.06280529, "balance_loss_mlp": 0.01254809, "epoch": 0.5691267097549978, "flos": 24427616290560.0, "grad_norm": 1.6340572657288779, "language_loss": 0.72429407, "learning_rate": 1.651084350506125e-06, "loss": 0.80126369, "num_input_tokens_seen": 203851245, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.10412598, "step": 9466, "time_per_iteration": 4.006332159042358 }, { "auxiliary_loss_clip": 0.06305808, "auxiliary_loss_mlp": 0.01261152, "balance_loss_clip": 0.06243867, "balance_loss_mlp": 0.01259562, "epoch": 0.5691868330076657, "flos": 61679915389440.0, "grad_norm": 0.7001521460766909, "language_loss": 0.55233479, "learning_rate": 1.6507008679748343e-06, "loss": 0.62800443, "num_input_tokens_seen": 203916400, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 0.01589966, "step": 9467, "time_per_iteration": 3.2255876064300537 }, { "auxiliary_loss_clip": 0.06437746, "auxiliary_loss_mlp": 0.01264184, "balance_loss_clip": 0.06281151, "balance_loss_mlp": 0.01253015, "epoch": 0.5692469562603337, "flos": 21331687633920.0, "grad_norm": 1.9365842199416474, "language_loss": 0.64088118, "learning_rate": 1.6503173986890023e-06, "loss": 0.71790051, "num_input_tokens_seen": 203935870, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.11175537, "step": 9468, "time_per_iteration": 2.5557374954223633 }, { "auxiliary_loss_clip": 0.06432898, "auxiliary_loss_mlp": 0.01268968, "balance_loss_clip": 0.06280375, "balance_loss_mlp": 0.01258179, "epoch": 0.5693070795130016, "flos": 23374652757120.0, "grad_norm": 2.3337876021346213, "language_loss": 0.80125558, "learning_rate": 1.64993394266317e-06, "loss": 0.8782742, "num_input_tokens_seen": 203954950, "router_z_loss_clip": 1.52441406, "router_z_loss_mlp": 0.10797119, "step": 9469, "time_per_iteration": 2.56709623336792 }, { "auxiliary_loss_clip": 0.06440905, "auxiliary_loss_mlp": 0.01267218, "balance_loss_clip": 0.06281716, "balance_loss_mlp": 0.01256066, "epoch": 0.5693672027656697, "flos": 18703143452160.0, "grad_norm": 1.9595070103755243, "language_loss": 0.70129263, "learning_rate": 1.6495504999118769e-06, "loss": 0.77837384, "num_input_tokens_seen": 203972715, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.11157227, "step": 9470, "time_per_iteration": 2.5244388580322266 }, { "auxiliary_loss_clip": 0.0643106, "auxiliary_loss_mlp": 0.01268357, "balance_loss_clip": 0.0627724, "balance_loss_mlp": 0.0125615, "epoch": 0.5694273260183376, "flos": 20455478288640.0, "grad_norm": 1.6309554890005102, "language_loss": 0.75105679, "learning_rate": 1.6491670704496644e-06, "loss": 0.82805097, "num_input_tokens_seen": 203990775, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.12207031, "step": 9471, "time_per_iteration": 2.5358083248138428 }, { "auxiliary_loss_clip": 0.06431514, "auxiliary_loss_mlp": 0.01268456, "balance_loss_clip": 0.06281786, "balance_loss_mlp": 0.01258168, "epoch": 0.5694874492710056, "flos": 17608992837120.0, "grad_norm": 1.6324644053100703, "language_loss": 0.57880616, "learning_rate": 1.6487836542910716e-06, "loss": 0.65580583, "num_input_tokens_seen": 204008845, "router_z_loss_clip": 1.49707031, "router_z_loss_mlp": 0.10302734, "step": 9472, "time_per_iteration": 2.5348801612854004 }, { "auxiliary_loss_clip": 0.06427315, "auxiliary_loss_mlp": 0.01269269, "balance_loss_clip": 0.06280119, "balance_loss_mlp": 0.01258087, "epoch": 0.5695475725236735, "flos": 13375923621120.0, "grad_norm": 2.1200046948287437, "language_loss": 0.74262995, "learning_rate": 1.648400251450638e-06, "loss": 0.81959575, "num_input_tokens_seen": 204023755, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.11181641, "step": 9473, "time_per_iteration": 2.5493452548980713 }, { "auxiliary_loss_clip": 0.06309093, "auxiliary_loss_mlp": 0.01255334, "balance_loss_clip": 0.06247174, "balance_loss_mlp": 0.01253784, "epoch": 0.5696076957763415, "flos": 68195078881920.0, "grad_norm": 0.6441901314353052, "language_loss": 0.57407248, "learning_rate": 1.6480168619429023e-06, "loss": 0.64971673, "num_input_tokens_seen": 204091255, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.01548004, "step": 9474, "time_per_iteration": 3.2123191356658936 }, { "auxiliary_loss_clip": 0.06433134, "auxiliary_loss_mlp": 0.01267708, "balance_loss_clip": 0.06282732, "balance_loss_mlp": 0.01256914, "epoch": 0.5696678190290094, "flos": 33846636153600.0, "grad_norm": 1.7324230604816448, "language_loss": 0.54768229, "learning_rate": 1.6476334857824017e-06, "loss": 0.62469071, "num_input_tokens_seen": 204113285, "router_z_loss_clip": 1.50195312, "router_z_loss_mlp": 0.10803223, "step": 9475, "time_per_iteration": 2.799916982650757 }, { "auxiliary_loss_clip": 0.06432892, "auxiliary_loss_mlp": 0.01265811, "balance_loss_clip": 0.06279711, "balance_loss_mlp": 0.01254904, "epoch": 0.5697279422816774, "flos": 26363329787520.0, "grad_norm": 1.611821834553002, "language_loss": 0.80042899, "learning_rate": 1.647250122983675e-06, "loss": 0.87741601, "num_input_tokens_seen": 204133045, "router_z_loss_clip": 1.53027344, "router_z_loss_mlp": 0.10900879, "step": 9476, "time_per_iteration": 2.683405876159668 }, { "auxiliary_loss_clip": 0.06440447, "auxiliary_loss_mlp": 0.01268345, "balance_loss_clip": 0.06283966, "balance_loss_mlp": 0.01257217, "epoch": 0.5697880655343454, "flos": 22937260844160.0, "grad_norm": 2.0255332147093994, "language_loss": 0.6688534, "learning_rate": 1.6468667735612592e-06, "loss": 0.74594128, "num_input_tokens_seen": 204152590, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.11126709, "step": 9477, "time_per_iteration": 2.5726091861724854 }, { "auxiliary_loss_clip": 0.06434301, "auxiliary_loss_mlp": 0.01266584, "balance_loss_clip": 0.06279832, "balance_loss_mlp": 0.01255575, "epoch": 0.5698481887870134, "flos": 26768674713600.0, "grad_norm": 1.6231722242476534, "language_loss": 0.70720512, "learning_rate": 1.6464834375296906e-06, "loss": 0.78421396, "num_input_tokens_seen": 204171815, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.11022949, "step": 9478, "time_per_iteration": 2.618043899536133 }, { "auxiliary_loss_clip": 0.06429584, "auxiliary_loss_mlp": 0.01266521, "balance_loss_clip": 0.06282634, "balance_loss_mlp": 0.01256013, "epoch": 0.5699083120396814, "flos": 15747729292800.0, "grad_norm": 1.57137182195635, "language_loss": 0.69774014, "learning_rate": 1.6461001149035055e-06, "loss": 0.77470118, "num_input_tokens_seen": 204188535, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.10516357, "step": 9479, "time_per_iteration": 2.6340458393096924 }, { "auxiliary_loss_clip": 0.06426726, "auxiliary_loss_mlp": 0.01268081, "balance_loss_clip": 0.06279223, "balance_loss_mlp": 0.01257811, "epoch": 0.5699684352923493, "flos": 19543448522880.0, "grad_norm": 1.8385394362691656, "language_loss": 0.71741253, "learning_rate": 1.6457168056972392e-06, "loss": 0.79436058, "num_input_tokens_seen": 204208365, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.10266113, "step": 9480, "time_per_iteration": 2.648810863494873 }, { "auxiliary_loss_clip": 0.06434305, "auxiliary_loss_mlp": 0.01268336, "balance_loss_clip": 0.06282517, "balance_loss_mlp": 0.01256833, "epoch": 0.5700285585450173, "flos": 16258942252800.0, "grad_norm": 2.1936384623990537, "language_loss": 0.73329008, "learning_rate": 1.6453335099254276e-06, "loss": 0.81031644, "num_input_tokens_seen": 204226560, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.11505127, "step": 9481, "time_per_iteration": 2.5258374214172363 }, { "auxiliary_loss_clip": 0.06436588, "auxiliary_loss_mlp": 0.01270536, "balance_loss_clip": 0.06285009, "balance_loss_mlp": 0.0125886, "epoch": 0.5700886817976852, "flos": 19871115114240.0, "grad_norm": 2.0904017726841393, "language_loss": 0.78790951, "learning_rate": 1.6449502276026041e-06, "loss": 0.8649807, "num_input_tokens_seen": 204245410, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.11676025, "step": 9482, "time_per_iteration": 2.5414881706237793 }, { "auxiliary_loss_clip": 0.06432008, "auxiliary_loss_mlp": 0.0126586, "balance_loss_clip": 0.06281109, "balance_loss_mlp": 0.01254976, "epoch": 0.5701488050503533, "flos": 23848452069120.0, "grad_norm": 1.495741775580948, "language_loss": 0.77856934, "learning_rate": 1.6445669587433043e-06, "loss": 0.85554808, "num_input_tokens_seen": 204264840, "router_z_loss_clip": 1.50976562, "router_z_loss_mlp": 0.10882568, "step": 9483, "time_per_iteration": 2.5648701190948486 }, { "auxiliary_loss_clip": 0.06436516, "auxiliary_loss_mlp": 0.01265582, "balance_loss_clip": 0.06284063, "balance_loss_mlp": 0.01255347, "epoch": 0.5702089283030212, "flos": 23666457000960.0, "grad_norm": 1.5494631349962336, "language_loss": 0.81845105, "learning_rate": 1.6441837033620612e-06, "loss": 0.89547205, "num_input_tokens_seen": 204284335, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.10229492, "step": 9484, "time_per_iteration": 2.578622817993164 }, { "auxiliary_loss_clip": 0.06433728, "auxiliary_loss_mlp": 0.01268206, "balance_loss_clip": 0.06279829, "balance_loss_mlp": 0.01256893, "epoch": 0.5702690515556892, "flos": 27898519968000.0, "grad_norm": 2.3168547514034508, "language_loss": 0.61459851, "learning_rate": 1.6438004614734073e-06, "loss": 0.69161785, "num_input_tokens_seen": 204302590, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.11315918, "step": 9485, "time_per_iteration": 2.6370859146118164 }, { "auxiliary_loss_clip": 0.06434011, "auxiliary_loss_mlp": 0.01269617, "balance_loss_clip": 0.06281599, "balance_loss_mlp": 0.01258113, "epoch": 0.5703291748083571, "flos": 24030698699520.0, "grad_norm": 1.859337710449477, "language_loss": 0.65168411, "learning_rate": 1.6434172330918757e-06, "loss": 0.72872037, "num_input_tokens_seen": 204323055, "router_z_loss_clip": 1.52246094, "router_z_loss_mlp": 0.11499023, "step": 9486, "time_per_iteration": 2.577763080596924 }, { "auxiliary_loss_clip": 0.06309196, "auxiliary_loss_mlp": 0.01252711, "balance_loss_clip": 0.06247825, "balance_loss_mlp": 0.01251002, "epoch": 0.5703892980610251, "flos": 57044478067200.0, "grad_norm": 1.011768243896565, "language_loss": 0.47973636, "learning_rate": 1.6430340182319978e-06, "loss": 0.55535543, "num_input_tokens_seen": 204386160, "router_z_loss_clip": 0.61572266, "router_z_loss_mlp": 0.0171051, "step": 9487, "time_per_iteration": 3.2374868392944336 }, { "auxiliary_loss_clip": 0.06434724, "auxiliary_loss_mlp": 0.0126418, "balance_loss_clip": 0.06281999, "balance_loss_mlp": 0.01253213, "epoch": 0.570449421313693, "flos": 24357610604160.0, "grad_norm": 1.4904148913325195, "language_loss": 0.86387318, "learning_rate": 1.6426508169083067e-06, "loss": 0.94086218, "num_input_tokens_seen": 204406315, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.10968018, "step": 9488, "time_per_iteration": 2.5862414836883545 }, { "auxiliary_loss_clip": 0.06437934, "auxiliary_loss_mlp": 0.0126974, "balance_loss_clip": 0.06281947, "balance_loss_mlp": 0.01259035, "epoch": 0.570509544566361, "flos": 24835770328320.0, "grad_norm": 1.4538796728993268, "language_loss": 0.78891408, "learning_rate": 1.6422676291353314e-06, "loss": 0.86599088, "num_input_tokens_seen": 204427645, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.10699463, "step": 9489, "time_per_iteration": 2.622696876525879 }, { "auxiliary_loss_clip": 0.06430673, "auxiliary_loss_mlp": 0.01263647, "balance_loss_clip": 0.06280009, "balance_loss_mlp": 0.01253448, "epoch": 0.570569667819029, "flos": 21403663891200.0, "grad_norm": 1.7199858810048425, "language_loss": 0.6988008, "learning_rate": 1.641884454927604e-06, "loss": 0.77574402, "num_input_tokens_seen": 204445910, "router_z_loss_clip": 1.50585938, "router_z_loss_mlp": 0.10186768, "step": 9490, "time_per_iteration": 2.5849788188934326 }, { "auxiliary_loss_clip": 0.06435694, "auxiliary_loss_mlp": 0.01271556, "balance_loss_clip": 0.06284414, "balance_loss_mlp": 0.01260952, "epoch": 0.570629791071697, "flos": 23222608323840.0, "grad_norm": 1.4895812115645406, "language_loss": 0.76632857, "learning_rate": 1.6415012942996548e-06, "loss": 0.84340107, "num_input_tokens_seen": 204464680, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.10614014, "step": 9491, "time_per_iteration": 4.0747270584106445 }, { "auxiliary_loss_clip": 0.06306519, "auxiliary_loss_mlp": 0.01251911, "balance_loss_clip": 0.0624481, "balance_loss_mlp": 0.01250329, "epoch": 0.570689914324365, "flos": 65303632915200.0, "grad_norm": 0.7821151373636501, "language_loss": 0.57356369, "learning_rate": 1.641118147266011e-06, "loss": 0.64914799, "num_input_tokens_seen": 204525580, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.01583862, "step": 9492, "time_per_iteration": 3.170997381210327 }, { "auxiliary_loss_clip": 0.06431067, "auxiliary_loss_mlp": 0.01266746, "balance_loss_clip": 0.06280057, "balance_loss_mlp": 0.01255654, "epoch": 0.5707500375770329, "flos": 21148225119360.0, "grad_norm": 1.6688103244791233, "language_loss": 0.72049391, "learning_rate": 1.6407350138412035e-06, "loss": 0.797472, "num_input_tokens_seen": 204541320, "router_z_loss_clip": 1.50976562, "router_z_loss_mlp": 0.11090088, "step": 9493, "time_per_iteration": 2.544597625732422 }, { "auxiliary_loss_clip": 0.06437695, "auxiliary_loss_mlp": 0.01269882, "balance_loss_clip": 0.06281671, "balance_loss_mlp": 0.01259212, "epoch": 0.5708101608297009, "flos": 20818881446400.0, "grad_norm": 1.5541498569198697, "language_loss": 0.77893776, "learning_rate": 1.6403518940397606e-06, "loss": 0.85601354, "num_input_tokens_seen": 204560275, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.10675049, "step": 9494, "time_per_iteration": 2.5574071407318115 }, { "auxiliary_loss_clip": 0.06440463, "auxiliary_loss_mlp": 0.01265498, "balance_loss_clip": 0.06280214, "balance_loss_mlp": 0.01253774, "epoch": 0.5708702840823688, "flos": 25819482862080.0, "grad_norm": 2.2119925899581094, "language_loss": 0.80440283, "learning_rate": 1.6399687878762096e-06, "loss": 0.88146245, "num_input_tokens_seen": 204579430, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.1171875, "step": 9495, "time_per_iteration": 4.047702312469482 }, { "auxiliary_loss_clip": 0.06443778, "auxiliary_loss_mlp": 0.01272852, "balance_loss_clip": 0.06282224, "balance_loss_mlp": 0.01259357, "epoch": 0.5709304073350369, "flos": 23657400760320.0, "grad_norm": 2.0949984521064304, "language_loss": 0.66749823, "learning_rate": 1.6395856953650784e-06, "loss": 0.74466455, "num_input_tokens_seen": 204597710, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.13500977, "step": 9496, "time_per_iteration": 2.5960195064544678 }, { "auxiliary_loss_clip": 0.06435551, "auxiliary_loss_mlp": 0.01268434, "balance_loss_clip": 0.06276812, "balance_loss_mlp": 0.01256561, "epoch": 0.5709905305877048, "flos": 16113144948480.0, "grad_norm": 2.3770278902251967, "language_loss": 0.70031321, "learning_rate": 1.6392026165208938e-06, "loss": 0.77735305, "num_input_tokens_seen": 204616140, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.11877441, "step": 9497, "time_per_iteration": 2.5216617584228516 }, { "auxiliary_loss_clip": 0.06434866, "auxiliary_loss_mlp": 0.0126854, "balance_loss_clip": 0.06279147, "balance_loss_mlp": 0.01256786, "epoch": 0.5710506538403728, "flos": 24757211525760.0, "grad_norm": 1.901454447312284, "language_loss": 0.81498754, "learning_rate": 1.638819551358182e-06, "loss": 0.89202166, "num_input_tokens_seen": 204636470, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.11767578, "step": 9498, "time_per_iteration": 2.590731382369995 }, { "auxiliary_loss_clip": 0.0643186, "auxiliary_loss_mlp": 0.01270109, "balance_loss_clip": 0.06275994, "balance_loss_mlp": 0.01258081, "epoch": 0.5711107770930407, "flos": 21988907533440.0, "grad_norm": 1.6930422291868468, "language_loss": 0.66581994, "learning_rate": 1.638436499891469e-06, "loss": 0.74283963, "num_input_tokens_seen": 204656640, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.12036133, "step": 9499, "time_per_iteration": 2.55090594291687 }, { "auxiliary_loss_clip": 0.06430693, "auxiliary_loss_mlp": 0.01268529, "balance_loss_clip": 0.06276456, "balance_loss_mlp": 0.01256912, "epoch": 0.5711709003457087, "flos": 19580233265280.0, "grad_norm": 1.5497896404200986, "language_loss": 0.71562397, "learning_rate": 1.6380534621352805e-06, "loss": 0.79261625, "num_input_tokens_seen": 204675475, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.11627197, "step": 9500, "time_per_iteration": 2.556591272354126 }, { "auxiliary_loss_clip": 0.06438828, "auxiliary_loss_mlp": 0.01268864, "balance_loss_clip": 0.06279883, "balance_loss_mlp": 0.01256299, "epoch": 0.5712310235983766, "flos": 24249436583040.0, "grad_norm": 1.9522674816390257, "language_loss": 0.76795816, "learning_rate": 1.6376704381041407e-06, "loss": 0.84503508, "num_input_tokens_seen": 204695385, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.12573242, "step": 9501, "time_per_iteration": 2.600473165512085 }, { "auxiliary_loss_clip": 0.06432505, "auxiliary_loss_mlp": 0.0126611, "balance_loss_clip": 0.06276831, "balance_loss_mlp": 0.01255166, "epoch": 0.5712911468510447, "flos": 21002469742080.0, "grad_norm": 2.652031052711224, "language_loss": 0.74758029, "learning_rate": 1.6372874278125742e-06, "loss": 0.82456642, "num_input_tokens_seen": 204714730, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.109375, "step": 9502, "time_per_iteration": 2.553053855895996 }, { "auxiliary_loss_clip": 0.06431884, "auxiliary_loss_mlp": 0.01266954, "balance_loss_clip": 0.06278962, "balance_loss_mlp": 0.01256177, "epoch": 0.5713512701037126, "flos": 18923055292800.0, "grad_norm": 1.9751679948333816, "language_loss": 0.82346702, "learning_rate": 1.636904431275105e-06, "loss": 0.90045547, "num_input_tokens_seen": 204735025, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.10778809, "step": 9503, "time_per_iteration": 2.5853872299194336 }, { "auxiliary_loss_clip": 0.06431893, "auxiliary_loss_mlp": 0.01270263, "balance_loss_clip": 0.06279602, "balance_loss_mlp": 0.01259689, "epoch": 0.5714113933563806, "flos": 17417983455360.0, "grad_norm": 2.5585895353982053, "language_loss": 0.86024678, "learning_rate": 1.6365214485062553e-06, "loss": 0.93726832, "num_input_tokens_seen": 204751365, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.10577393, "step": 9504, "time_per_iteration": 2.531419038772583 }, { "auxiliary_loss_clip": 0.0643028, "auxiliary_loss_mlp": 0.01267517, "balance_loss_clip": 0.06279019, "balance_loss_mlp": 0.01256543, "epoch": 0.5714715166090486, "flos": 20199536392320.0, "grad_norm": 2.0097295692029182, "language_loss": 0.75072837, "learning_rate": 1.6361384795205496e-06, "loss": 0.82770634, "num_input_tokens_seen": 204768980, "router_z_loss_clip": 1.51367188, "router_z_loss_mlp": 0.10968018, "step": 9505, "time_per_iteration": 3.9893665313720703 }, { "auxiliary_loss_clip": 0.06428372, "auxiliary_loss_mlp": 0.01265669, "balance_loss_clip": 0.06277297, "balance_loss_mlp": 0.01254714, "epoch": 0.5715316398617165, "flos": 18557597710080.0, "grad_norm": 2.0515797259804334, "language_loss": 0.82450116, "learning_rate": 1.635755524332509e-06, "loss": 0.90144157, "num_input_tokens_seen": 204788110, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.10961914, "step": 9506, "time_per_iteration": 4.019757509231567 }, { "auxiliary_loss_clip": 0.06428036, "auxiliary_loss_mlp": 0.01264054, "balance_loss_clip": 0.06275804, "balance_loss_mlp": 0.01253653, "epoch": 0.5715917631143845, "flos": 18484028225280.0, "grad_norm": 1.6675583811361079, "language_loss": 0.7739116, "learning_rate": 1.6353725829566552e-06, "loss": 0.85083252, "num_input_tokens_seen": 204807240, "router_z_loss_clip": 1.52246094, "router_z_loss_mlp": 0.10400391, "step": 9507, "time_per_iteration": 2.569183111190796 }, { "auxiliary_loss_clip": 0.06432915, "auxiliary_loss_mlp": 0.01270303, "balance_loss_clip": 0.06276067, "balance_loss_mlp": 0.01257792, "epoch": 0.5716518863670524, "flos": 24026128652160.0, "grad_norm": 1.8041443378850925, "language_loss": 0.68732429, "learning_rate": 1.63498965540751e-06, "loss": 0.76435637, "num_input_tokens_seen": 204826415, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.12506104, "step": 9508, "time_per_iteration": 2.561312437057495 }, { "auxiliary_loss_clip": 0.06434235, "auxiliary_loss_mlp": 0.01265985, "balance_loss_clip": 0.06277822, "balance_loss_mlp": 0.01254028, "epoch": 0.5717120096197205, "flos": 17824879681920.0, "grad_norm": 1.8638981768864213, "language_loss": 0.80053663, "learning_rate": 1.634606741699593e-06, "loss": 0.8775388, "num_input_tokens_seen": 204844305, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.11968994, "step": 9509, "time_per_iteration": 2.5621793270111084 }, { "auxiliary_loss_clip": 0.06424435, "auxiliary_loss_mlp": 0.0126517, "balance_loss_clip": 0.06274652, "balance_loss_mlp": 0.0125459, "epoch": 0.5717721328723884, "flos": 21871551490560.0, "grad_norm": 2.228384645624059, "language_loss": 0.72584379, "learning_rate": 1.6342238418474255e-06, "loss": 0.80273986, "num_input_tokens_seen": 204861765, "router_z_loss_clip": 1.49804688, "router_z_loss_mlp": 0.10577393, "step": 9510, "time_per_iteration": 2.530844211578369 }, { "auxiliary_loss_clip": 0.064293, "auxiliary_loss_mlp": 0.01268379, "balance_loss_clip": 0.06276434, "balance_loss_mlp": 0.01257513, "epoch": 0.5718322561250564, "flos": 28444924442880.0, "grad_norm": 1.2978610225360563, "language_loss": 0.69559276, "learning_rate": 1.6338409558655264e-06, "loss": 0.77256954, "num_input_tokens_seen": 204882505, "router_z_loss_clip": 1.52929688, "router_z_loss_mlp": 0.10864258, "step": 9511, "time_per_iteration": 2.6143646240234375 }, { "auxiliary_loss_clip": 0.06435187, "auxiliary_loss_mlp": 0.01270234, "balance_loss_clip": 0.06281818, "balance_loss_mlp": 0.01258927, "epoch": 0.5718923793777243, "flos": 13556702805120.0, "grad_norm": 2.2493140334252875, "language_loss": 0.62141579, "learning_rate": 1.6334580837684152e-06, "loss": 0.69847, "num_input_tokens_seen": 204899830, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.11309814, "step": 9512, "time_per_iteration": 2.513113498687744 }, { "auxiliary_loss_clip": 0.06431154, "auxiliary_loss_mlp": 0.01266149, "balance_loss_clip": 0.06279448, "balance_loss_mlp": 0.01255635, "epoch": 0.5719525026303923, "flos": 17827856501760.0, "grad_norm": 2.348051486838345, "language_loss": 0.76236361, "learning_rate": 1.6330752255706104e-06, "loss": 0.83933663, "num_input_tokens_seen": 204918100, "router_z_loss_clip": 1.51757812, "router_z_loss_mlp": 0.10510254, "step": 9513, "time_per_iteration": 2.545443058013916 }, { "auxiliary_loss_clip": 0.06317647, "auxiliary_loss_mlp": 0.01255163, "balance_loss_clip": 0.06256096, "balance_loss_mlp": 0.01253524, "epoch": 0.5720126258830602, "flos": 61314724097280.0, "grad_norm": 0.8779679620781775, "language_loss": 0.66898394, "learning_rate": 1.6326923812866288e-06, "loss": 0.74471205, "num_input_tokens_seen": 204972925, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.01641846, "step": 9514, "time_per_iteration": 3.165541410446167 }, { "auxiliary_loss_clip": 0.06439404, "auxiliary_loss_mlp": 0.0127073, "balance_loss_clip": 0.06283195, "balance_loss_mlp": 0.01259601, "epoch": 0.5720727491357283, "flos": 23994878279040.0, "grad_norm": 2.3173142310926877, "language_loss": 0.82204986, "learning_rate": 1.63230955093099e-06, "loss": 0.89915121, "num_input_tokens_seen": 204990910, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.11126709, "step": 9515, "time_per_iteration": 2.6131751537323 }, { "auxiliary_loss_clip": 0.06425323, "auxiliary_loss_mlp": 0.01265094, "balance_loss_clip": 0.06276704, "balance_loss_mlp": 0.01254991, "epoch": 0.5721328723883962, "flos": 23412359894400.0, "grad_norm": 1.7398982379490606, "language_loss": 0.86046183, "learning_rate": 1.6319267345182092e-06, "loss": 0.93736601, "num_input_tokens_seen": 205010500, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.10101318, "step": 9516, "time_per_iteration": 2.589735984802246 }, { "auxiliary_loss_clip": 0.06428313, "auxiliary_loss_mlp": 0.01267223, "balance_loss_clip": 0.06276487, "balance_loss_mlp": 0.01256065, "epoch": 0.5721929956410642, "flos": 18810520859520.0, "grad_norm": 1.6365673484487753, "language_loss": 0.8807528, "learning_rate": 1.6315439320628038e-06, "loss": 0.95770818, "num_input_tokens_seen": 205028560, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.11175537, "step": 9517, "time_per_iteration": 2.5871875286102295 }, { "auxiliary_loss_clip": 0.06427264, "auxiliary_loss_mlp": 0.01265839, "balance_loss_clip": 0.06275682, "balance_loss_mlp": 0.01254383, "epoch": 0.5722531188937322, "flos": 27203676785280.0, "grad_norm": 2.7911681984208223, "language_loss": 0.85374081, "learning_rate": 1.6311611435792893e-06, "loss": 0.93067193, "num_input_tokens_seen": 205048650, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.11456299, "step": 9518, "time_per_iteration": 2.6477816104888916 }, { "auxiliary_loss_clip": 0.06430547, "auxiliary_loss_mlp": 0.01265927, "balance_loss_clip": 0.06281988, "balance_loss_mlp": 0.01255067, "epoch": 0.5723132421464001, "flos": 15201157109760.0, "grad_norm": 1.7646463363109137, "language_loss": 0.79078472, "learning_rate": 1.6307783690821812e-06, "loss": 0.86774939, "num_input_tokens_seen": 205066480, "router_z_loss_clip": 1.48632812, "router_z_loss_mlp": 0.10858154, "step": 9519, "time_per_iteration": 2.578333616256714 }, { "auxiliary_loss_clip": 0.0642866, "auxiliary_loss_mlp": 0.01267283, "balance_loss_clip": 0.06277583, "balance_loss_mlp": 0.01256995, "epoch": 0.5723733653990681, "flos": 27606757651200.0, "grad_norm": 2.158351827047358, "language_loss": 0.8307966, "learning_rate": 1.6303956085859944e-06, "loss": 0.90775609, "num_input_tokens_seen": 205087475, "router_z_loss_clip": 1.51074219, "router_z_loss_mlp": 0.10290527, "step": 9520, "time_per_iteration": 2.6944804191589355 }, { "auxiliary_loss_clip": 0.06438503, "auxiliary_loss_mlp": 0.01265158, "balance_loss_clip": 0.06283236, "balance_loss_mlp": 0.01254054, "epoch": 0.572433488651736, "flos": 18228673307520.0, "grad_norm": 2.097492364140699, "language_loss": 0.73068249, "learning_rate": 1.630012862105243e-06, "loss": 0.80771911, "num_input_tokens_seen": 205106495, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.11102295, "step": 9521, "time_per_iteration": 2.5501115322113037 }, { "auxiliary_loss_clip": 0.0643283, "auxiliary_loss_mlp": 0.01269307, "balance_loss_clip": 0.06280965, "balance_loss_mlp": 0.01258381, "epoch": 0.5724936119044041, "flos": 31257224628480.0, "grad_norm": 1.6101112906168435, "language_loss": 0.78366953, "learning_rate": 1.6296301296544415e-06, "loss": 0.86069095, "num_input_tokens_seen": 205128285, "router_z_loss_clip": 1.51660156, "router_z_loss_mlp": 0.10925293, "step": 9522, "time_per_iteration": 2.658860683441162 }, { "auxiliary_loss_clip": 0.06429064, "auxiliary_loss_mlp": 0.01266821, "balance_loss_clip": 0.06279925, "balance_loss_mlp": 0.01256682, "epoch": 0.572553735157072, "flos": 19207186888320.0, "grad_norm": 1.4738005252993995, "language_loss": 0.71780735, "learning_rate": 1.629247411248102e-06, "loss": 0.79476619, "num_input_tokens_seen": 205146595, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.10137939, "step": 9523, "time_per_iteration": 2.692147731781006 }, { "auxiliary_loss_clip": 0.06426417, "auxiliary_loss_mlp": 0.01265119, "balance_loss_clip": 0.06276982, "balance_loss_mlp": 0.01254677, "epoch": 0.57261385840974, "flos": 21221249552640.0, "grad_norm": 2.04034051090782, "language_loss": 0.70386708, "learning_rate": 1.628864706900738e-06, "loss": 0.78078246, "num_input_tokens_seen": 205164295, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.10449219, "step": 9524, "time_per_iteration": 2.5815069675445557 }, { "auxiliary_loss_clip": 0.06429904, "auxiliary_loss_mlp": 0.01272433, "balance_loss_clip": 0.06277941, "balance_loss_mlp": 0.01261197, "epoch": 0.5726739816624079, "flos": 33992936582400.0, "grad_norm": 1.5366297095130785, "language_loss": 0.65428072, "learning_rate": 1.6284820166268615e-06, "loss": 0.73130411, "num_input_tokens_seen": 205185380, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.11230469, "step": 9525, "time_per_iteration": 2.661463499069214 }, { "auxiliary_loss_clip": 0.06421977, "auxiliary_loss_mlp": 0.01263995, "balance_loss_clip": 0.06272308, "balance_loss_mlp": 0.01253785, "epoch": 0.5727341049150759, "flos": 24282196329600.0, "grad_norm": 1.7241963151865887, "language_loss": 0.72924888, "learning_rate": 1.628099340440984e-06, "loss": 0.80610859, "num_input_tokens_seen": 205204895, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.10211182, "step": 9526, "time_per_iteration": 2.5658555030822754 }, { "auxiliary_loss_clip": 0.0642582, "auxiliary_loss_mlp": 0.01264933, "balance_loss_clip": 0.06278431, "balance_loss_mlp": 0.0125446, "epoch": 0.5727942281677438, "flos": 28407762357120.0, "grad_norm": 1.6161476478685775, "language_loss": 0.80310762, "learning_rate": 1.6277166783576176e-06, "loss": 0.88001519, "num_input_tokens_seen": 205223440, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.10473633, "step": 9527, "time_per_iteration": 2.6162173748016357 }, { "auxiliary_loss_clip": 0.06423974, "auxiliary_loss_mlp": 0.01268267, "balance_loss_clip": 0.0627582, "balance_loss_mlp": 0.01257693, "epoch": 0.5728543514204119, "flos": 19542861544320.0, "grad_norm": 1.662314536456357, "language_loss": 0.73270607, "learning_rate": 1.6273340303912713e-06, "loss": 0.80962855, "num_input_tokens_seen": 205242800, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.10565186, "step": 9528, "time_per_iteration": 2.5557165145874023 }, { "auxiliary_loss_clip": 0.06431782, "auxiliary_loss_mlp": 0.01268048, "balance_loss_clip": 0.06279504, "balance_loss_mlp": 0.01256741, "epoch": 0.5729144746730798, "flos": 21513137650560.0, "grad_norm": 1.804864543760487, "language_loss": 0.85749382, "learning_rate": 1.6269513965564557e-06, "loss": 0.93449205, "num_input_tokens_seen": 205259465, "router_z_loss_clip": 1.52246094, "router_z_loss_mlp": 0.11291504, "step": 9529, "time_per_iteration": 2.5876216888427734 }, { "auxiliary_loss_clip": 0.06322469, "auxiliary_loss_mlp": 0.01258723, "balance_loss_clip": 0.06261009, "balance_loss_mlp": 0.01256863, "epoch": 0.5729745979257478, "flos": 58699638495360.0, "grad_norm": 0.7450681614338764, "language_loss": 0.55943519, "learning_rate": 1.6265687768676813e-06, "loss": 0.63524711, "num_input_tokens_seen": 205314100, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.01855469, "step": 9530, "time_per_iteration": 4.492400169372559 }, { "auxiliary_loss_clip": 0.06436565, "auxiliary_loss_mlp": 0.01267955, "balance_loss_clip": 0.06281728, "balance_loss_mlp": 0.0125669, "epoch": 0.5730347211784158, "flos": 18558100834560.0, "grad_norm": 1.582034214618809, "language_loss": 0.66826898, "learning_rate": 1.6261861713394553e-06, "loss": 0.74531424, "num_input_tokens_seen": 205333420, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.11273193, "step": 9531, "time_per_iteration": 2.550940752029419 }, { "auxiliary_loss_clip": 0.06429868, "auxiliary_loss_mlp": 0.01270564, "balance_loss_clip": 0.06276548, "balance_loss_mlp": 0.01259072, "epoch": 0.5730948444310837, "flos": 38040069588480.0, "grad_norm": 1.9421365381249724, "language_loss": 0.76104701, "learning_rate": 1.6258035799862876e-06, "loss": 0.83805132, "num_input_tokens_seen": 205350995, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.11486816, "step": 9532, "time_per_iteration": 2.6776652336120605 }, { "auxiliary_loss_clip": 0.06432152, "auxiliary_loss_mlp": 0.01268366, "balance_loss_clip": 0.06280234, "balance_loss_mlp": 0.01256922, "epoch": 0.5731549676837517, "flos": 25233861876480.0, "grad_norm": 1.2360594151607474, "language_loss": 0.78984666, "learning_rate": 1.625421002822686e-06, "loss": 0.86685181, "num_input_tokens_seen": 205372675, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.11450195, "step": 9533, "time_per_iteration": 2.6516952514648438 }, { "auxiliary_loss_clip": 0.06427442, "auxiliary_loss_mlp": 0.01269935, "balance_loss_clip": 0.06279542, "balance_loss_mlp": 0.01258741, "epoch": 0.5732150909364196, "flos": 23375030100480.0, "grad_norm": 1.7083132206814735, "language_loss": 0.8588748, "learning_rate": 1.6250384398631574e-06, "loss": 0.93584847, "num_input_tokens_seen": 205392590, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.11193848, "step": 9534, "time_per_iteration": 4.084874868392944 }, { "auxiliary_loss_clip": 0.06434315, "auxiliary_loss_mlp": 0.01270179, "balance_loss_clip": 0.06282597, "balance_loss_mlp": 0.01258723, "epoch": 0.5732752141890877, "flos": 23086621946880.0, "grad_norm": 1.7129701635157768, "language_loss": 0.75613916, "learning_rate": 1.6246558911222085e-06, "loss": 0.83318418, "num_input_tokens_seen": 205414885, "router_z_loss_clip": 1.51757812, "router_z_loss_mlp": 0.11456299, "step": 9535, "time_per_iteration": 2.619354724884033 }, { "auxiliary_loss_clip": 0.0644153, "auxiliary_loss_mlp": 0.01267879, "balance_loss_clip": 0.06284322, "balance_loss_mlp": 0.01256137, "epoch": 0.5733353374417556, "flos": 24359078050560.0, "grad_norm": 1.5694962751399144, "language_loss": 0.7118845, "learning_rate": 1.624273356614346e-06, "loss": 0.78897858, "num_input_tokens_seen": 205434440, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.11743164, "step": 9536, "time_per_iteration": 2.5876541137695312 }, { "auxiliary_loss_clip": 0.06429283, "auxiliary_loss_mlp": 0.01272086, "balance_loss_clip": 0.06278867, "balance_loss_mlp": 0.01261327, "epoch": 0.5733954606944236, "flos": 27206234334720.0, "grad_norm": 1.770575310232024, "language_loss": 0.70155942, "learning_rate": 1.6238908363540755e-06, "loss": 0.77857304, "num_input_tokens_seen": 205454225, "router_z_loss_clip": 1.50292969, "router_z_loss_mlp": 0.10760498, "step": 9537, "time_per_iteration": 2.6157357692718506 }, { "auxiliary_loss_clip": 0.06433006, "auxiliary_loss_mlp": 0.01270131, "balance_loss_clip": 0.06280001, "balance_loss_mlp": 0.01258806, "epoch": 0.5734555839470915, "flos": 28772339472000.0, "grad_norm": 4.295556724255883, "language_loss": 0.63040137, "learning_rate": 1.623508330355902e-06, "loss": 0.70743275, "num_input_tokens_seen": 205474750, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.11322021, "step": 9538, "time_per_iteration": 2.625312089920044 }, { "auxiliary_loss_clip": 0.06435797, "auxiliary_loss_mlp": 0.01274512, "balance_loss_clip": 0.06284104, "balance_loss_mlp": 0.01262496, "epoch": 0.5735157071997595, "flos": 22973542462080.0, "grad_norm": 2.381119084867635, "language_loss": 0.83141601, "learning_rate": 1.6231258386343306e-06, "loss": 0.90851909, "num_input_tokens_seen": 205495495, "router_z_loss_clip": 1.51464844, "router_z_loss_mlp": 0.11999512, "step": 9539, "time_per_iteration": 2.60353684425354 }, { "auxiliary_loss_clip": 0.06437133, "auxiliary_loss_mlp": 0.01267825, "balance_loss_clip": 0.06281005, "balance_loss_mlp": 0.01255159, "epoch": 0.5735758304524274, "flos": 18995450820480.0, "grad_norm": 2.1276826130235422, "language_loss": 0.73098767, "learning_rate": 1.6227433612038647e-06, "loss": 0.80803722, "num_input_tokens_seen": 205510070, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.12658691, "step": 9540, "time_per_iteration": 2.5835657119750977 }, { "auxiliary_loss_clip": 0.06432207, "auxiliary_loss_mlp": 0.01268986, "balance_loss_clip": 0.06279996, "balance_loss_mlp": 0.01258436, "epoch": 0.5736359537050955, "flos": 28404701683200.0, "grad_norm": 2.0968556940702836, "language_loss": 0.80792296, "learning_rate": 1.6223608980790089e-06, "loss": 0.8849349, "num_input_tokens_seen": 205530190, "router_z_loss_clip": 1.52148438, "router_z_loss_mlp": 0.10552979, "step": 9541, "time_per_iteration": 2.61803936958313 }, { "auxiliary_loss_clip": 0.06439106, "auxiliary_loss_mlp": 0.01266737, "balance_loss_clip": 0.06282518, "balance_loss_mlp": 0.01254631, "epoch": 0.5736960769577634, "flos": 15631714915200.0, "grad_norm": 2.9616802841719334, "language_loss": 0.65658557, "learning_rate": 1.6219784492742654e-06, "loss": 0.73364401, "num_input_tokens_seen": 205547380, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.12097168, "step": 9542, "time_per_iteration": 2.5202319622039795 }, { "auxiliary_loss_clip": 0.06432098, "auxiliary_loss_mlp": 0.01266319, "balance_loss_clip": 0.06279382, "balance_loss_mlp": 0.01255876, "epoch": 0.5737562002104314, "flos": 18009767715840.0, "grad_norm": 2.053997777327673, "language_loss": 0.8324669, "learning_rate": 1.6215960148041365e-06, "loss": 0.90945107, "num_input_tokens_seen": 205566540, "router_z_loss_clip": 1.52734375, "router_z_loss_mlp": 0.10430908, "step": 9543, "time_per_iteration": 2.5399630069732666 }, { "auxiliary_loss_clip": 0.06443291, "auxiliary_loss_mlp": 0.01266239, "balance_loss_clip": 0.06285435, "balance_loss_mlp": 0.0125377, "epoch": 0.5738163234630994, "flos": 20703454047360.0, "grad_norm": 1.888731392644075, "language_loss": 0.73857981, "learning_rate": 1.6212135946831257e-06, "loss": 0.81567514, "num_input_tokens_seen": 205584200, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.12469482, "step": 9544, "time_per_iteration": 4.024409770965576 }, { "auxiliary_loss_clip": 0.06440287, "auxiliary_loss_mlp": 0.01269002, "balance_loss_clip": 0.0628391, "balance_loss_mlp": 0.01257451, "epoch": 0.5738764467157673, "flos": 23156082581760.0, "grad_norm": 1.6923719475140615, "language_loss": 0.76243067, "learning_rate": 1.620831188925733e-06, "loss": 0.83952361, "num_input_tokens_seen": 205604675, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.11547852, "step": 9545, "time_per_iteration": 4.072517156600952 }, { "auxiliary_loss_clip": 0.06439203, "auxiliary_loss_mlp": 0.01266444, "balance_loss_clip": 0.06285702, "balance_loss_mlp": 0.01254857, "epoch": 0.5739365699684353, "flos": 29499942401280.0, "grad_norm": 2.5143641247744433, "language_loss": 0.56719339, "learning_rate": 1.620448797546459e-06, "loss": 0.6442498, "num_input_tokens_seen": 205624680, "router_z_loss_clip": 1.53417969, "router_z_loss_mlp": 0.11578369, "step": 9546, "time_per_iteration": 2.594257116317749 }, { "auxiliary_loss_clip": 0.06438465, "auxiliary_loss_mlp": 0.01267429, "balance_loss_clip": 0.06282012, "balance_loss_mlp": 0.01255389, "epoch": 0.5739966932211032, "flos": 14032388833920.0, "grad_norm": 2.3090670226031165, "language_loss": 0.77207363, "learning_rate": 1.6200664205598055e-06, "loss": 0.84913254, "num_input_tokens_seen": 205641950, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.12036133, "step": 9547, "time_per_iteration": 2.534053325653076 }, { "auxiliary_loss_clip": 0.0643751, "auxiliary_loss_mlp": 0.0126814, "balance_loss_clip": 0.06282838, "balance_loss_mlp": 0.01256553, "epoch": 0.5740568164737713, "flos": 19067972129280.0, "grad_norm": 1.9373565187595978, "language_loss": 0.7468189, "learning_rate": 1.6196840579802704e-06, "loss": 0.82387543, "num_input_tokens_seen": 205660130, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.11584473, "step": 9548, "time_per_iteration": 2.600274085998535 }, { "auxiliary_loss_clip": 0.06438658, "auxiliary_loss_mlp": 0.01268149, "balance_loss_clip": 0.06285146, "balance_loss_mlp": 0.0125633, "epoch": 0.5741169397264392, "flos": 22134453275520.0, "grad_norm": 2.348902093044922, "language_loss": 0.69824785, "learning_rate": 1.619301709822355e-06, "loss": 0.77531588, "num_input_tokens_seen": 205678895, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.11834717, "step": 9549, "time_per_iteration": 2.5466201305389404 }, { "auxiliary_loss_clip": 0.06438175, "auxiliary_loss_mlp": 0.01266517, "balance_loss_clip": 0.06286934, "balance_loss_mlp": 0.01255329, "epoch": 0.5741770629791072, "flos": 24943860495360.0, "grad_norm": 4.098573503669879, "language_loss": 0.79784966, "learning_rate": 1.6189193761005564e-06, "loss": 0.87489665, "num_input_tokens_seen": 205698450, "router_z_loss_clip": 1.50976562, "router_z_loss_mlp": 0.11193848, "step": 9550, "time_per_iteration": 2.608700752258301 }, { "auxiliary_loss_clip": 0.06440853, "auxiliary_loss_mlp": 0.01265457, "balance_loss_clip": 0.06287633, "balance_loss_mlp": 0.01253565, "epoch": 0.5742371862317751, "flos": 18806495863680.0, "grad_norm": 2.495363108666906, "language_loss": 0.68166023, "learning_rate": 1.6185370568293727e-06, "loss": 0.75872332, "num_input_tokens_seen": 205714870, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.11871338, "step": 9551, "time_per_iteration": 2.5269386768341064 }, { "auxiliary_loss_clip": 0.06439494, "auxiliary_loss_mlp": 0.01266659, "balance_loss_clip": 0.06280716, "balance_loss_mlp": 0.01254661, "epoch": 0.5742973094844431, "flos": 24467293998720.0, "grad_norm": 4.430568473224562, "language_loss": 0.72162676, "learning_rate": 1.6181547520233031e-06, "loss": 0.79868823, "num_input_tokens_seen": 205736045, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.11987305, "step": 9552, "time_per_iteration": 2.6000168323516846 }, { "auxiliary_loss_clip": 0.06433257, "auxiliary_loss_mlp": 0.01266282, "balance_loss_clip": 0.06281354, "balance_loss_mlp": 0.01255291, "epoch": 0.574357432737111, "flos": 21659186517120.0, "grad_norm": 1.9523131110643483, "language_loss": 0.79787374, "learning_rate": 1.617772461696843e-06, "loss": 0.87486911, "num_input_tokens_seen": 205754445, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.10998535, "step": 9553, "time_per_iteration": 2.5863678455352783 }, { "auxiliary_loss_clip": 0.06440848, "auxiliary_loss_mlp": 0.01264863, "balance_loss_clip": 0.06281094, "balance_loss_mlp": 0.01253848, "epoch": 0.5744175559897791, "flos": 16550285299200.0, "grad_norm": 2.4935571344755583, "language_loss": 0.83459955, "learning_rate": 1.6173901858644895e-06, "loss": 0.91165668, "num_input_tokens_seen": 205770595, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.11016846, "step": 9554, "time_per_iteration": 2.5354549884796143 }, { "auxiliary_loss_clip": 0.06444962, "auxiliary_loss_mlp": 0.01266664, "balance_loss_clip": 0.0628693, "balance_loss_mlp": 0.01254981, "epoch": 0.574477679242447, "flos": 24214580484480.0, "grad_norm": 1.3616998846566375, "language_loss": 0.70760989, "learning_rate": 1.6170079245407385e-06, "loss": 0.7847262, "num_input_tokens_seen": 205791935, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.11682129, "step": 9555, "time_per_iteration": 2.5969431400299072 }, { "auxiliary_loss_clip": 0.06433679, "auxiliary_loss_mlp": 0.01270057, "balance_loss_clip": 0.06281558, "balance_loss_mlp": 0.01258428, "epoch": 0.574537802495115, "flos": 14908304689920.0, "grad_norm": 2.9545741427525534, "language_loss": 0.73502851, "learning_rate": 1.6166256777400853e-06, "loss": 0.8120659, "num_input_tokens_seen": 205807260, "router_z_loss_clip": 1.51953125, "router_z_loss_mlp": 0.11633301, "step": 9556, "time_per_iteration": 2.5254125595092773 }, { "auxiliary_loss_clip": 0.0643323, "auxiliary_loss_mlp": 0.01266144, "balance_loss_clip": 0.0628143, "balance_loss_mlp": 0.01254324, "epoch": 0.5745979257477829, "flos": 24941680289280.0, "grad_norm": 2.0030178847242994, "language_loss": 0.7455979, "learning_rate": 1.6162434454770248e-06, "loss": 0.8225916, "num_input_tokens_seen": 205826885, "router_z_loss_clip": 1.51757812, "router_z_loss_mlp": 0.11804199, "step": 9557, "time_per_iteration": 2.5679142475128174 }, { "auxiliary_loss_clip": 0.06435105, "auxiliary_loss_mlp": 0.01264034, "balance_loss_clip": 0.06282829, "balance_loss_mlp": 0.01252596, "epoch": 0.5746580490004509, "flos": 17241061559040.0, "grad_norm": 1.6846452923454511, "language_loss": 0.68390179, "learning_rate": 1.6158612277660514e-06, "loss": 0.76089311, "num_input_tokens_seen": 205844630, "router_z_loss_clip": 1.52148438, "router_z_loss_mlp": 0.11450195, "step": 9558, "time_per_iteration": 2.5237245559692383 }, { "auxiliary_loss_clip": 0.06446853, "auxiliary_loss_mlp": 0.01271573, "balance_loss_clip": 0.06285113, "balance_loss_mlp": 0.01257637, "epoch": 0.5747181722531189, "flos": 13192838449920.0, "grad_norm": 2.16419654219503, "language_loss": 0.70811081, "learning_rate": 1.615479024621659e-06, "loss": 0.78529507, "num_input_tokens_seen": 205860960, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.13934326, "step": 9559, "time_per_iteration": 2.4907774925231934 }, { "auxiliary_loss_clip": 0.06435239, "auxiliary_loss_mlp": 0.012665, "balance_loss_clip": 0.06283974, "balance_loss_mlp": 0.01255973, "epoch": 0.5747782955057869, "flos": 22969098195840.0, "grad_norm": 1.567384085371149, "language_loss": 0.79207724, "learning_rate": 1.6150968360583398e-06, "loss": 0.86909461, "num_input_tokens_seen": 205880675, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.10528564, "step": 9560, "time_per_iteration": 2.585415840148926 }, { "auxiliary_loss_clip": 0.06439216, "auxiliary_loss_mlp": 0.01267271, "balance_loss_clip": 0.06283214, "balance_loss_mlp": 0.0125591, "epoch": 0.5748384187584549, "flos": 23409802344960.0, "grad_norm": 2.19621945680679, "language_loss": 0.64373672, "learning_rate": 1.614714662090588e-06, "loss": 0.72080159, "num_input_tokens_seen": 205900050, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.11352539, "step": 9561, "time_per_iteration": 2.530395746231079 }, { "auxiliary_loss_clip": 0.06447917, "auxiliary_loss_mlp": 0.01268452, "balance_loss_clip": 0.06287518, "balance_loss_mlp": 0.01256507, "epoch": 0.5748985420111228, "flos": 17791323321600.0, "grad_norm": 1.6099378752161595, "language_loss": 0.71858633, "learning_rate": 1.6143325027328945e-06, "loss": 0.79575002, "num_input_tokens_seen": 205918855, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.1194458, "step": 9562, "time_per_iteration": 2.5521938800811768 }, { "auxiliary_loss_clip": 0.06439976, "auxiliary_loss_mlp": 0.01269695, "balance_loss_clip": 0.06285346, "balance_loss_mlp": 0.01258859, "epoch": 0.5749586652637908, "flos": 19872582560640.0, "grad_norm": 1.6150014016230791, "language_loss": 0.84203708, "learning_rate": 1.613950357999751e-06, "loss": 0.91913378, "num_input_tokens_seen": 205936970, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.10827637, "step": 9563, "time_per_iteration": 2.54866623878479 }, { "auxiliary_loss_clip": 0.06438974, "auxiliary_loss_mlp": 0.01268408, "balance_loss_clip": 0.06280211, "balance_loss_mlp": 0.01256243, "epoch": 0.5750187885164587, "flos": 21293477372160.0, "grad_norm": 1.9812360326496048, "language_loss": 0.57128823, "learning_rate": 1.6135682279056488e-06, "loss": 0.64836204, "num_input_tokens_seen": 205954630, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.12176514, "step": 9564, "time_per_iteration": 2.577003002166748 }, { "auxiliary_loss_clip": 0.06431223, "auxiliary_loss_mlp": 0.01264094, "balance_loss_clip": 0.06283756, "balance_loss_mlp": 0.01253437, "epoch": 0.5750789117691267, "flos": 18810227370240.0, "grad_norm": 1.7074668783949043, "language_loss": 0.76385224, "learning_rate": 1.613186112465078e-06, "loss": 0.84080541, "num_input_tokens_seen": 205971510, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.10644531, "step": 9565, "time_per_iteration": 2.514478921890259 }, { "auxiliary_loss_clip": 0.06316438, "auxiliary_loss_mlp": 0.01250649, "balance_loss_clip": 0.06254748, "balance_loss_mlp": 0.01249119, "epoch": 0.5751390350217946, "flos": 70685624188800.0, "grad_norm": 0.7243022659258055, "language_loss": 0.60640299, "learning_rate": 1.6128040116925287e-06, "loss": 0.68207383, "num_input_tokens_seen": 206035125, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.01529694, "step": 9566, "time_per_iteration": 3.2607717514038086 }, { "auxiliary_loss_clip": 0.06436367, "auxiliary_loss_mlp": 0.01269806, "balance_loss_clip": 0.06282727, "balance_loss_mlp": 0.01258886, "epoch": 0.5751991582744627, "flos": 14251545987840.0, "grad_norm": 2.542574502043307, "language_loss": 0.75636363, "learning_rate": 1.6124219256024901e-06, "loss": 0.83342534, "num_input_tokens_seen": 206052075, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.10925293, "step": 9567, "time_per_iteration": 2.5257985591888428 }, { "auxiliary_loss_clip": 0.06435617, "auxiliary_loss_mlp": 0.01266697, "balance_loss_clip": 0.0628204, "balance_loss_mlp": 0.0125505, "epoch": 0.5752592815271306, "flos": 18333283530240.0, "grad_norm": 1.4486962280626117, "language_loss": 0.75114334, "learning_rate": 1.6120398542094504e-06, "loss": 0.82816648, "num_input_tokens_seen": 206069970, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.11651611, "step": 9568, "time_per_iteration": 2.6320157051086426 }, { "auxiliary_loss_clip": 0.06437702, "auxiliary_loss_mlp": 0.01265864, "balance_loss_clip": 0.06283712, "balance_loss_mlp": 0.01254324, "epoch": 0.5753194047797986, "flos": 20928984111360.0, "grad_norm": 1.6291402437351437, "language_loss": 0.71562326, "learning_rate": 1.6116577975278994e-06, "loss": 0.79265893, "num_input_tokens_seen": 206088950, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.11529541, "step": 9569, "time_per_iteration": 4.021485328674316 }, { "auxiliary_loss_clip": 0.06433585, "auxiliary_loss_mlp": 0.01269365, "balance_loss_clip": 0.0627979, "balance_loss_mlp": 0.01257634, "epoch": 0.5753795280324665, "flos": 19287925896960.0, "grad_norm": 2.057111924723765, "language_loss": 0.55962467, "learning_rate": 1.6112757555723223e-06, "loss": 0.63665414, "num_input_tokens_seen": 206107780, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.1171875, "step": 9570, "time_per_iteration": 2.5689778327941895 }, { "auxiliary_loss_clip": 0.06432136, "auxiliary_loss_mlp": 0.012646, "balance_loss_clip": 0.06281887, "balance_loss_mlp": 0.01253913, "epoch": 0.5754396512851345, "flos": 21659312298240.0, "grad_norm": 1.4505293551019829, "language_loss": 0.64680082, "learning_rate": 1.6108937283572082e-06, "loss": 0.72376823, "num_input_tokens_seen": 206127445, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.10693359, "step": 9571, "time_per_iteration": 2.648625612258911 }, { "auxiliary_loss_clip": 0.06431744, "auxiliary_loss_mlp": 0.0126937, "balance_loss_clip": 0.06280008, "balance_loss_mlp": 0.01257377, "epoch": 0.5754997745378025, "flos": 51032674707840.0, "grad_norm": 1.4361946835601085, "language_loss": 0.671381, "learning_rate": 1.6105117158970434e-06, "loss": 0.74839211, "num_input_tokens_seen": 206152005, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.11999512, "step": 9572, "time_per_iteration": 2.8015925884246826 }, { "auxiliary_loss_clip": 0.06435628, "auxiliary_loss_mlp": 0.01265532, "balance_loss_clip": 0.06285755, "balance_loss_mlp": 0.0125376, "epoch": 0.5755598977904705, "flos": 22863523651200.0, "grad_norm": 1.849179797850525, "language_loss": 0.72355688, "learning_rate": 1.6101297182063123e-06, "loss": 0.80056846, "num_input_tokens_seen": 206169875, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.11761475, "step": 9573, "time_per_iteration": 2.5569565296173096 }, { "auxiliary_loss_clip": 0.06428625, "auxiliary_loss_mlp": 0.01269319, "balance_loss_clip": 0.06282571, "balance_loss_mlp": 0.01259067, "epoch": 0.5756200210431385, "flos": 38482073475840.0, "grad_norm": 1.7462809724391486, "language_loss": 0.76180959, "learning_rate": 1.6097477352995022e-06, "loss": 0.83878911, "num_input_tokens_seen": 206192635, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10247803, "step": 9574, "time_per_iteration": 4.114708185195923 }, { "auxiliary_loss_clip": 0.06445349, "auxiliary_loss_mlp": 0.01267217, "balance_loss_clip": 0.06286137, "balance_loss_mlp": 0.01255087, "epoch": 0.5756801442958064, "flos": 23915984060160.0, "grad_norm": 5.857835704974745, "language_loss": 0.66670167, "learning_rate": 1.6093657671910968e-06, "loss": 0.74382734, "num_input_tokens_seen": 206211485, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.12121582, "step": 9575, "time_per_iteration": 2.5570921897888184 }, { "auxiliary_loss_clip": 0.06427887, "auxiliary_loss_mlp": 0.0126593, "balance_loss_clip": 0.06280826, "balance_loss_mlp": 0.01255511, "epoch": 0.5757402675484744, "flos": 21111566158080.0, "grad_norm": 1.5490082105772427, "language_loss": 0.80602777, "learning_rate": 1.6089838138955804e-06, "loss": 0.88296592, "num_input_tokens_seen": 206231740, "router_z_loss_clip": 1.46972656, "router_z_loss_mlp": 0.10412598, "step": 9576, "time_per_iteration": 2.559440851211548 }, { "auxiliary_loss_clip": 0.06430835, "auxiliary_loss_mlp": 0.01271052, "balance_loss_clip": 0.06282493, "balance_loss_mlp": 0.01260484, "epoch": 0.5758003908011423, "flos": 20565497099520.0, "grad_norm": 1.756442209542146, "language_loss": 0.69830805, "learning_rate": 1.6086018754274372e-06, "loss": 0.77532685, "num_input_tokens_seen": 206250975, "router_z_loss_clip": 1.48242188, "router_z_loss_mlp": 0.10565186, "step": 9577, "time_per_iteration": 2.526765823364258 }, { "auxiliary_loss_clip": 0.06438638, "auxiliary_loss_mlp": 0.01266368, "balance_loss_clip": 0.06282996, "balance_loss_mlp": 0.01255436, "epoch": 0.5758605140538103, "flos": 16478770239360.0, "grad_norm": 2.215361393604388, "language_loss": 0.66869324, "learning_rate": 1.6082199518011504e-06, "loss": 0.74574327, "num_input_tokens_seen": 206268800, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.10931396, "step": 9578, "time_per_iteration": 2.5346949100494385 }, { "auxiliary_loss_clip": 0.06434771, "auxiliary_loss_mlp": 0.01266827, "balance_loss_clip": 0.06285005, "balance_loss_mlp": 0.0125642, "epoch": 0.5759206373064782, "flos": 21293854715520.0, "grad_norm": 1.676301240652992, "language_loss": 0.72978878, "learning_rate": 1.6078380430312016e-06, "loss": 0.80680472, "num_input_tokens_seen": 206287190, "router_z_loss_clip": 1.49707031, "router_z_loss_mlp": 0.10406494, "step": 9579, "time_per_iteration": 2.5503740310668945 }, { "auxiliary_loss_clip": 0.06442322, "auxiliary_loss_mlp": 0.01269816, "balance_loss_clip": 0.06284492, "balance_loss_mlp": 0.01257382, "epoch": 0.5759807605591463, "flos": 26075089342080.0, "grad_norm": 2.460593483252851, "language_loss": 0.64686841, "learning_rate": 1.6074561491320742e-06, "loss": 0.72398984, "num_input_tokens_seen": 206307020, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.12445068, "step": 9580, "time_per_iteration": 2.5974366664886475 }, { "auxiliary_loss_clip": 0.06437329, "auxiliary_loss_mlp": 0.01268256, "balance_loss_clip": 0.06281952, "balance_loss_mlp": 0.01256573, "epoch": 0.5760408838118142, "flos": 18877885142400.0, "grad_norm": 1.8051618274578238, "language_loss": 0.8562746, "learning_rate": 1.6070742701182486e-06, "loss": 0.93333048, "num_input_tokens_seen": 206324095, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.11688232, "step": 9581, "time_per_iteration": 2.660841703414917 }, { "auxiliary_loss_clip": 0.06446683, "auxiliary_loss_mlp": 0.0127528, "balance_loss_clip": 0.06288569, "balance_loss_mlp": 0.01263902, "epoch": 0.5761010070644822, "flos": 15383655302400.0, "grad_norm": 2.1329561315776817, "language_loss": 0.68189812, "learning_rate": 1.6066924060042057e-06, "loss": 0.75911772, "num_input_tokens_seen": 206343210, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.11376953, "step": 9582, "time_per_iteration": 2.5628581047058105 }, { "auxiliary_loss_clip": 0.06310168, "auxiliary_loss_mlp": 0.01252441, "balance_loss_clip": 0.06248136, "balance_loss_mlp": 0.01250793, "epoch": 0.5761611303171501, "flos": 71495475500160.0, "grad_norm": 0.6290162372446415, "language_loss": 0.57089663, "learning_rate": 1.6063105568044271e-06, "loss": 0.6465227, "num_input_tokens_seen": 206415935, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 0.01651001, "step": 9583, "time_per_iteration": 3.3353872299194336 }, { "auxiliary_loss_clip": 0.06430025, "auxiliary_loss_mlp": 0.01264293, "balance_loss_clip": 0.06278925, "balance_loss_mlp": 0.0125329, "epoch": 0.5762212535698181, "flos": 16250556844800.0, "grad_norm": 1.9530501869578614, "language_loss": 0.82592905, "learning_rate": 1.6059287225333912e-06, "loss": 0.9028722, "num_input_tokens_seen": 206431900, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.11016846, "step": 9584, "time_per_iteration": 2.5189433097839355 }, { "auxiliary_loss_clip": 0.06316973, "auxiliary_loss_mlp": 0.01254239, "balance_loss_clip": 0.06255226, "balance_loss_mlp": 0.01252828, "epoch": 0.5762813768224861, "flos": 70207254829440.0, "grad_norm": 0.6347279866977455, "language_loss": 0.49578071, "learning_rate": 1.6055469032055773e-06, "loss": 0.57149279, "num_input_tokens_seen": 206501200, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.01410675, "step": 9585, "time_per_iteration": 6.027436971664429 }, { "auxiliary_loss_clip": 0.06434038, "auxiliary_loss_mlp": 0.01266405, "balance_loss_clip": 0.06283936, "balance_loss_mlp": 0.012557, "epoch": 0.5763415000751541, "flos": 20523639185280.0, "grad_norm": 1.4179775879305494, "language_loss": 0.85023183, "learning_rate": 1.605165098835465e-06, "loss": 0.92723632, "num_input_tokens_seen": 206520575, "router_z_loss_clip": 1.50097656, "router_z_loss_mlp": 0.10705566, "step": 9586, "time_per_iteration": 2.5471715927124023 }, { "auxiliary_loss_clip": 0.06429598, "auxiliary_loss_mlp": 0.01269825, "balance_loss_clip": 0.06277414, "balance_loss_mlp": 0.01258393, "epoch": 0.5764016233278221, "flos": 15821047215360.0, "grad_norm": 2.002198458437858, "language_loss": 0.80342603, "learning_rate": 1.6047833094375308e-06, "loss": 0.88042027, "num_input_tokens_seen": 206538060, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.11437988, "step": 9587, "time_per_iteration": 2.5477373600006104 }, { "auxiliary_loss_clip": 0.06436777, "auxiliary_loss_mlp": 0.01264456, "balance_loss_clip": 0.06285216, "balance_loss_mlp": 0.01253531, "epoch": 0.57646174658049, "flos": 20777778218880.0, "grad_norm": 1.7430130172554055, "language_loss": 0.65969706, "learning_rate": 1.6044015350262542e-06, "loss": 0.73670942, "num_input_tokens_seen": 206557320, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.10931396, "step": 9588, "time_per_iteration": 2.5812766551971436 }, { "auxiliary_loss_clip": 0.06437242, "auxiliary_loss_mlp": 0.01270804, "balance_loss_clip": 0.06283783, "balance_loss_mlp": 0.01258507, "epoch": 0.576521869833158, "flos": 23556647825280.0, "grad_norm": 1.740797816297208, "language_loss": 0.784958, "learning_rate": 1.6040197756161104e-06, "loss": 0.86203849, "num_input_tokens_seen": 206575780, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.12310791, "step": 9589, "time_per_iteration": 2.5764431953430176 }, { "auxiliary_loss_clip": 0.0642935, "auxiliary_loss_mlp": 0.01268427, "balance_loss_clip": 0.06281497, "balance_loss_mlp": 0.01257924, "epoch": 0.5765819930858259, "flos": 20272812387840.0, "grad_norm": 1.8659729444115385, "language_loss": 0.80287242, "learning_rate": 1.6036380312215762e-06, "loss": 0.87985015, "num_input_tokens_seen": 206594100, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.10510254, "step": 9590, "time_per_iteration": 2.5858278274536133 }, { "auxiliary_loss_clip": 0.06443466, "auxiliary_loss_mlp": 0.01265804, "balance_loss_clip": 0.06290106, "balance_loss_mlp": 0.01255421, "epoch": 0.5766421163384939, "flos": 23155453676160.0, "grad_norm": 2.5870339154158297, "language_loss": 0.63053399, "learning_rate": 1.6032563018571283e-06, "loss": 0.7076267, "num_input_tokens_seen": 206613325, "router_z_loss_clip": 1.53320312, "router_z_loss_mlp": 0.10388184, "step": 9591, "time_per_iteration": 2.5567638874053955 }, { "auxiliary_loss_clip": 0.06434329, "auxiliary_loss_mlp": 0.01271267, "balance_loss_clip": 0.06281188, "balance_loss_mlp": 0.01259966, "epoch": 0.5767022395911618, "flos": 25856057969280.0, "grad_norm": 2.756669958394976, "language_loss": 0.78526241, "learning_rate": 1.6028745875372406e-06, "loss": 0.86231834, "num_input_tokens_seen": 206634265, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.11309814, "step": 9592, "time_per_iteration": 2.6791672706604004 }, { "auxiliary_loss_clip": 0.06322409, "auxiliary_loss_mlp": 0.01251622, "balance_loss_clip": 0.06260367, "balance_loss_mlp": 0.01250179, "epoch": 0.5767623628438299, "flos": 68315579452800.0, "grad_norm": 0.7293110568875756, "language_loss": 0.59627301, "learning_rate": 1.6024928882763885e-06, "loss": 0.67201334, "num_input_tokens_seen": 206696990, "router_z_loss_clip": 0.62060547, "router_z_loss_mlp": 0.0144043, "step": 9593, "time_per_iteration": 3.304795026779175 }, { "auxiliary_loss_clip": 0.06438869, "auxiliary_loss_mlp": 0.01267971, "balance_loss_clip": 0.06282133, "balance_loss_mlp": 0.01256611, "epoch": 0.5768224860964978, "flos": 30195959541120.0, "grad_norm": 1.561321307124324, "language_loss": 0.71301627, "learning_rate": 1.6021112040890463e-06, "loss": 0.79008466, "num_input_tokens_seen": 206717815, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.11364746, "step": 9594, "time_per_iteration": 2.662891387939453 }, { "auxiliary_loss_clip": 0.06438562, "auxiliary_loss_mlp": 0.01274612, "balance_loss_clip": 0.06286079, "balance_loss_mlp": 0.01264223, "epoch": 0.5768826093491658, "flos": 17900880935040.0, "grad_norm": 1.6790342194969072, "language_loss": 0.71072727, "learning_rate": 1.6017295349896863e-06, "loss": 0.78785902, "num_input_tokens_seen": 206735985, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.10388184, "step": 9595, "time_per_iteration": 2.5651772022247314 }, { "auxiliary_loss_clip": 0.06439755, "auxiliary_loss_mlp": 0.01273244, "balance_loss_clip": 0.0628663, "balance_loss_mlp": 0.01262003, "epoch": 0.5769427326018337, "flos": 17462943970560.0, "grad_norm": 2.084171368372478, "language_loss": 0.69861275, "learning_rate": 1.6013478809927828e-06, "loss": 0.77574277, "num_input_tokens_seen": 206753370, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.11242676, "step": 9596, "time_per_iteration": 2.7699227333068848 }, { "auxiliary_loss_clip": 0.06447885, "auxiliary_loss_mlp": 0.01269655, "balance_loss_clip": 0.06288503, "balance_loss_mlp": 0.01257037, "epoch": 0.5770028558545017, "flos": 39431181473280.0, "grad_norm": 1.8361463878229602, "language_loss": 0.67269731, "learning_rate": 1.6009662421128074e-06, "loss": 0.74987268, "num_input_tokens_seen": 206777645, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.12609863, "step": 9597, "time_per_iteration": 2.728930711746216 }, { "auxiliary_loss_clip": 0.06436592, "auxiliary_loss_mlp": 0.01268453, "balance_loss_clip": 0.06284668, "balance_loss_mlp": 0.0125751, "epoch": 0.5770629791071697, "flos": 21541620839040.0, "grad_norm": 1.706144099001897, "language_loss": 0.81944638, "learning_rate": 1.6005846183642323e-06, "loss": 0.89649677, "num_input_tokens_seen": 206794865, "router_z_loss_clip": 1.51660156, "router_z_loss_mlp": 0.109375, "step": 9598, "time_per_iteration": 2.5414648056030273 }, { "auxiliary_loss_clip": 0.064403, "auxiliary_loss_mlp": 0.0126653, "balance_loss_clip": 0.06286751, "balance_loss_mlp": 0.01255413, "epoch": 0.5771231023598377, "flos": 20893121763840.0, "grad_norm": 1.670192696705251, "language_loss": 0.7323069, "learning_rate": 1.6002030097615277e-06, "loss": 0.80937517, "num_input_tokens_seen": 206814095, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.11126709, "step": 9599, "time_per_iteration": 2.5338046550750732 }, { "auxiliary_loss_clip": 0.06436065, "auxiliary_loss_mlp": 0.01270628, "balance_loss_clip": 0.06287411, "balance_loss_mlp": 0.01260799, "epoch": 0.5771832256125057, "flos": 18083043711360.0, "grad_norm": 1.787077333153072, "language_loss": 0.78306997, "learning_rate": 1.5998214163191663e-06, "loss": 0.86013687, "num_input_tokens_seen": 206832245, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.09832764, "step": 9600, "time_per_iteration": 2.527791738510132 }, { "auxiliary_loss_clip": 0.06448532, "auxiliary_loss_mlp": 0.0126726, "balance_loss_clip": 0.06293071, "balance_loss_mlp": 0.01256239, "epoch": 0.5772433488651736, "flos": 26366222753280.0, "grad_norm": 1.4956902821382274, "language_loss": 0.72582877, "learning_rate": 1.5994398380516163e-06, "loss": 0.80298668, "num_input_tokens_seen": 206851535, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.11022949, "step": 9601, "time_per_iteration": 2.5744333267211914 }, { "auxiliary_loss_clip": 0.0643769, "auxiliary_loss_mlp": 0.01265483, "balance_loss_clip": 0.06288637, "balance_loss_mlp": 0.0125448, "epoch": 0.5773034721178416, "flos": 19686814058880.0, "grad_norm": 1.6952693091144104, "language_loss": 0.69117063, "learning_rate": 1.599058274973348e-06, "loss": 0.76820242, "num_input_tokens_seen": 206870595, "router_z_loss_clip": 1.49121094, "router_z_loss_mlp": 0.11004639, "step": 9602, "time_per_iteration": 2.5624489784240723 }, { "auxiliary_loss_clip": 0.06425032, "auxiliary_loss_mlp": 0.01267533, "balance_loss_clip": 0.06281406, "balance_loss_mlp": 0.01257305, "epoch": 0.5773635953705095, "flos": 25089951288960.0, "grad_norm": 1.433341813862131, "language_loss": 0.73208594, "learning_rate": 1.5986767270988297e-06, "loss": 0.80901158, "num_input_tokens_seen": 206892320, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10223389, "step": 9603, "time_per_iteration": 2.584066867828369 }, { "auxiliary_loss_clip": 0.06441949, "auxiliary_loss_mlp": 0.01265691, "balance_loss_clip": 0.06292319, "balance_loss_mlp": 0.01255332, "epoch": 0.5774237186231775, "flos": 21039380265600.0, "grad_norm": 1.6362965206792084, "language_loss": 0.76608729, "learning_rate": 1.5982951944425298e-06, "loss": 0.84316373, "num_input_tokens_seen": 206912485, "router_z_loss_clip": 1.49511719, "router_z_loss_mlp": 0.1036377, "step": 9604, "time_per_iteration": 2.5453884601593018 }, { "auxiliary_loss_clip": 0.06442114, "auxiliary_loss_mlp": 0.01271285, "balance_loss_clip": 0.06290057, "balance_loss_mlp": 0.01259353, "epoch": 0.5774838418758454, "flos": 15237145238400.0, "grad_norm": 1.652285175622615, "language_loss": 0.83639801, "learning_rate": 1.5979136770189174e-06, "loss": 0.91353202, "num_input_tokens_seen": 206929100, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.11932373, "step": 9605, "time_per_iteration": 2.514547109603882 }, { "auxiliary_loss_clip": 0.06455049, "auxiliary_loss_mlp": 0.01268565, "balance_loss_clip": 0.06293048, "balance_loss_mlp": 0.01256626, "epoch": 0.5775439651285135, "flos": 23588694812160.0, "grad_norm": 1.6514716343829308, "language_loss": 0.78416765, "learning_rate": 1.5975321748424581e-06, "loss": 0.86140382, "num_input_tokens_seen": 206947020, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.1194458, "step": 9606, "time_per_iteration": 2.5557758808135986 }, { "auxiliary_loss_clip": 0.06440419, "auxiliary_loss_mlp": 0.01265951, "balance_loss_clip": 0.06289227, "balance_loss_mlp": 0.01255055, "epoch": 0.5776040883811814, "flos": 18046300896000.0, "grad_norm": 1.658650784823682, "language_loss": 0.73664755, "learning_rate": 1.597150687927619e-06, "loss": 0.81371129, "num_input_tokens_seen": 206964065, "router_z_loss_clip": 1.51074219, "router_z_loss_mlp": 0.10888672, "step": 9607, "time_per_iteration": 2.5589680671691895 }, { "auxiliary_loss_clip": 0.06442538, "auxiliary_loss_mlp": 0.01267827, "balance_loss_clip": 0.06288739, "balance_loss_mlp": 0.01256812, "epoch": 0.5776642116338494, "flos": 18630580216320.0, "grad_norm": 1.6288510830504372, "language_loss": 0.69838661, "learning_rate": 1.5967692162888664e-06, "loss": 0.77549022, "num_input_tokens_seen": 206981940, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.11004639, "step": 9608, "time_per_iteration": 2.544156312942505 }, { "auxiliary_loss_clip": 0.06442793, "auxiliary_loss_mlp": 0.01269669, "balance_loss_clip": 0.06290354, "balance_loss_mlp": 0.01257992, "epoch": 0.5777243348865173, "flos": 28410068344320.0, "grad_norm": 1.7231331780922443, "language_loss": 0.76713336, "learning_rate": 1.596387759940665e-06, "loss": 0.84425795, "num_input_tokens_seen": 207002365, "router_z_loss_clip": 1.52539062, "router_z_loss_mlp": 0.11663818, "step": 9609, "time_per_iteration": 4.018166780471802 }, { "auxiliary_loss_clip": 0.06441808, "auxiliary_loss_mlp": 0.01267816, "balance_loss_clip": 0.06286637, "balance_loss_mlp": 0.01256103, "epoch": 0.5777844581391853, "flos": 24031579167360.0, "grad_norm": 1.6879057146638259, "language_loss": 0.7743293, "learning_rate": 1.5960063188974808e-06, "loss": 0.85142553, "num_input_tokens_seen": 207021195, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.11700439, "step": 9610, "time_per_iteration": 2.567915678024292 }, { "auxiliary_loss_clip": 0.06442625, "auxiliary_loss_mlp": 0.01270073, "balance_loss_clip": 0.06289095, "balance_loss_mlp": 0.01258218, "epoch": 0.5778445813918534, "flos": 17781805883520.0, "grad_norm": 1.9942930999782655, "language_loss": 0.68704355, "learning_rate": 1.5956248931737777e-06, "loss": 0.76417059, "num_input_tokens_seen": 207037465, "router_z_loss_clip": 1.53417969, "router_z_loss_mlp": 0.11853027, "step": 9611, "time_per_iteration": 2.5126967430114746 }, { "auxiliary_loss_clip": 0.06435816, "auxiliary_loss_mlp": 0.01267277, "balance_loss_clip": 0.06285343, "balance_loss_mlp": 0.01256608, "epoch": 0.5779047046445213, "flos": 22239147352320.0, "grad_norm": 1.7805590058648069, "language_loss": 0.83912563, "learning_rate": 1.5952434827840185e-06, "loss": 0.91615665, "num_input_tokens_seen": 207054230, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.10668945, "step": 9612, "time_per_iteration": 2.555673837661743 }, { "auxiliary_loss_clip": 0.06436196, "auxiliary_loss_mlp": 0.01266459, "balance_loss_clip": 0.06286921, "balance_loss_mlp": 0.01255957, "epoch": 0.5779648278971893, "flos": 21440825976960.0, "grad_norm": 1.6953049549821064, "language_loss": 0.79829478, "learning_rate": 1.594862087742667e-06, "loss": 0.87532139, "num_input_tokens_seen": 207073150, "router_z_loss_clip": 1.49121094, "router_z_loss_mlp": 0.10510254, "step": 9613, "time_per_iteration": 3.9687907695770264 }, { "auxiliary_loss_clip": 0.06439951, "auxiliary_loss_mlp": 0.01265329, "balance_loss_clip": 0.06289795, "balance_loss_mlp": 0.01254815, "epoch": 0.5780249511498572, "flos": 19032151708800.0, "grad_norm": 1.8555979846361448, "language_loss": 0.77837741, "learning_rate": 1.5944807080641863e-06, "loss": 0.85543025, "num_input_tokens_seen": 207090375, "router_z_loss_clip": 1.50097656, "router_z_loss_mlp": 0.10522461, "step": 9614, "time_per_iteration": 2.5359091758728027 }, { "auxiliary_loss_clip": 0.06442475, "auxiliary_loss_mlp": 0.01264512, "balance_loss_clip": 0.0628823, "balance_loss_mlp": 0.01253532, "epoch": 0.5780850744025252, "flos": 12128596542720.0, "grad_norm": 2.6481215277582146, "language_loss": 0.81404299, "learning_rate": 1.5940993437630375e-06, "loss": 0.8911128, "num_input_tokens_seen": 207106030, "router_z_loss_clip": 1.54101562, "router_z_loss_mlp": 0.10980225, "step": 9615, "time_per_iteration": 2.492051124572754 }, { "auxiliary_loss_clip": 0.06442045, "auxiliary_loss_mlp": 0.01266973, "balance_loss_clip": 0.06287052, "balance_loss_mlp": 0.01255744, "epoch": 0.5781451976551931, "flos": 25051154048640.0, "grad_norm": 1.5489540600145273, "language_loss": 0.67430753, "learning_rate": 1.5937179948536825e-06, "loss": 0.75139773, "num_input_tokens_seen": 207125435, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.11236572, "step": 9616, "time_per_iteration": 2.5728936195373535 }, { "auxiliary_loss_clip": 0.06440008, "auxiliary_loss_mlp": 0.01267962, "balance_loss_clip": 0.06291194, "balance_loss_mlp": 0.01257436, "epoch": 0.5782053209078611, "flos": 19251770060160.0, "grad_norm": 1.627004508284012, "language_loss": 0.78506446, "learning_rate": 1.5933366613505812e-06, "loss": 0.86214411, "num_input_tokens_seen": 207145095, "router_z_loss_clip": 1.48730469, "router_z_loss_mlp": 0.10522461, "step": 9617, "time_per_iteration": 2.5286478996276855 }, { "auxiliary_loss_clip": 0.06440736, "auxiliary_loss_mlp": 0.01268135, "balance_loss_clip": 0.06291018, "balance_loss_mlp": 0.01256059, "epoch": 0.578265444160529, "flos": 26000849024640.0, "grad_norm": 1.4275671120724784, "language_loss": 0.75326765, "learning_rate": 1.5929553432681947e-06, "loss": 0.83035636, "num_input_tokens_seen": 207166045, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.12060547, "step": 9618, "time_per_iteration": 2.6948366165161133 }, { "auxiliary_loss_clip": 0.06436171, "auxiliary_loss_mlp": 0.0126898, "balance_loss_clip": 0.06286269, "balance_loss_mlp": 0.01257602, "epoch": 0.5783255674131971, "flos": 21805025748480.0, "grad_norm": 1.5154063678896297, "language_loss": 0.81295764, "learning_rate": 1.5925740406209826e-06, "loss": 0.89000916, "num_input_tokens_seen": 207185290, "router_z_loss_clip": 1.49707031, "router_z_loss_mlp": 0.11376953, "step": 9619, "time_per_iteration": 2.5244929790496826 }, { "auxiliary_loss_clip": 0.06441745, "auxiliary_loss_mlp": 0.01265915, "balance_loss_clip": 0.06289594, "balance_loss_mlp": 0.01254448, "epoch": 0.578385690665865, "flos": 24796553817600.0, "grad_norm": 1.5758353481302085, "language_loss": 0.72548681, "learning_rate": 1.5921927534234039e-06, "loss": 0.80256343, "num_input_tokens_seen": 207205505, "router_z_loss_clip": 1.51953125, "router_z_loss_mlp": 0.11450195, "step": 9620, "time_per_iteration": 2.5664024353027344 }, { "auxiliary_loss_clip": 0.06446823, "auxiliary_loss_mlp": 0.01266748, "balance_loss_clip": 0.06293793, "balance_loss_mlp": 0.01255805, "epoch": 0.578445813918533, "flos": 21218859711360.0, "grad_norm": 1.7503979756483354, "language_loss": 0.77569991, "learning_rate": 1.591811481689916e-06, "loss": 0.85283566, "num_input_tokens_seen": 207225315, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.109375, "step": 9621, "time_per_iteration": 2.5623726844787598 }, { "auxiliary_loss_clip": 0.06444146, "auxiliary_loss_mlp": 0.01263911, "balance_loss_clip": 0.06289693, "balance_loss_mlp": 0.01252843, "epoch": 0.5785059371712009, "flos": 25053921233280.0, "grad_norm": 1.524059802142462, "language_loss": 0.70579362, "learning_rate": 1.5914302254349787e-06, "loss": 0.78287423, "num_input_tokens_seen": 207247690, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.11065674, "step": 9622, "time_per_iteration": 2.5979886054992676 }, { "auxiliary_loss_clip": 0.06328039, "auxiliary_loss_mlp": 0.01255834, "balance_loss_clip": 0.06266037, "balance_loss_mlp": 0.01254361, "epoch": 0.5785660604238689, "flos": 70865187488640.0, "grad_norm": 0.7511078290347879, "language_loss": 0.55701005, "learning_rate": 1.5910489846730476e-06, "loss": 0.63284886, "num_input_tokens_seen": 207301735, "router_z_loss_clip": 0.61865234, "router_z_loss_mlp": 0.01472473, "step": 9623, "time_per_iteration": 3.1739959716796875 }, { "auxiliary_loss_clip": 0.06438867, "auxiliary_loss_mlp": 0.01267624, "balance_loss_clip": 0.06283563, "balance_loss_mlp": 0.01255899, "epoch": 0.578626183676537, "flos": 31658083361280.0, "grad_norm": 2.123701479584091, "language_loss": 0.71936423, "learning_rate": 1.5906677594185799e-06, "loss": 0.79642916, "num_input_tokens_seen": 207321240, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.11712646, "step": 9624, "time_per_iteration": 4.13359808921814 }, { "auxiliary_loss_clip": 0.06439694, "auxiliary_loss_mlp": 0.01268738, "balance_loss_clip": 0.06288618, "balance_loss_mlp": 0.01256805, "epoch": 0.5786863069292049, "flos": 21870545241600.0, "grad_norm": 2.1136758735174563, "language_loss": 0.82713723, "learning_rate": 1.5902865496860322e-06, "loss": 0.90422148, "num_input_tokens_seen": 207339540, "router_z_loss_clip": 1.50878906, "router_z_loss_mlp": 0.1192627, "step": 9625, "time_per_iteration": 4.001445770263672 }, { "auxiliary_loss_clip": 0.06435932, "auxiliary_loss_mlp": 0.01267447, "balance_loss_clip": 0.06287577, "balance_loss_mlp": 0.01255633, "epoch": 0.5787464301818729, "flos": 23371214739840.0, "grad_norm": 1.5431021993718739, "language_loss": 0.70673382, "learning_rate": 1.5899053554898591e-06, "loss": 0.78376758, "num_input_tokens_seen": 207360470, "router_z_loss_clip": 1.48242188, "router_z_loss_mlp": 0.11810303, "step": 9626, "time_per_iteration": 2.557628631591797 }, { "auxiliary_loss_clip": 0.06438489, "auxiliary_loss_mlp": 0.01266413, "balance_loss_clip": 0.06289043, "balance_loss_mlp": 0.01255953, "epoch": 0.5788065534345408, "flos": 30011155361280.0, "grad_norm": 1.7480567472079265, "language_loss": 0.71933717, "learning_rate": 1.5895241768445166e-06, "loss": 0.79638624, "num_input_tokens_seen": 207383080, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.10461426, "step": 9627, "time_per_iteration": 2.608041524887085 }, { "auxiliary_loss_clip": 0.06435354, "auxiliary_loss_mlp": 0.01267515, "balance_loss_clip": 0.06285812, "balance_loss_mlp": 0.01256775, "epoch": 0.5788666766872088, "flos": 24533526251520.0, "grad_norm": 1.5705955270719119, "language_loss": 0.8415277, "learning_rate": 1.589143013764458e-06, "loss": 0.91855639, "num_input_tokens_seen": 207401000, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.10742188, "step": 9628, "time_per_iteration": 2.5810585021972656 }, { "auxiliary_loss_clip": 0.06436326, "auxiliary_loss_mlp": 0.01268541, "balance_loss_clip": 0.0628418, "balance_loss_mlp": 0.01256632, "epoch": 0.5789267999398767, "flos": 23739649142400.0, "grad_norm": 1.7110234588450115, "language_loss": 0.72579181, "learning_rate": 1.5887618662641376e-06, "loss": 0.80284047, "num_input_tokens_seen": 207419230, "router_z_loss_clip": 1.52246094, "router_z_loss_mlp": 0.11895752, "step": 9629, "time_per_iteration": 2.569761276245117 }, { "auxiliary_loss_clip": 0.06438814, "auxiliary_loss_mlp": 0.01265454, "balance_loss_clip": 0.0628762, "balance_loss_mlp": 0.01254308, "epoch": 0.5789869231925447, "flos": 21140217054720.0, "grad_norm": 2.2747868803659532, "language_loss": 0.74475288, "learning_rate": 1.5883807343580087e-06, "loss": 0.82179558, "num_input_tokens_seen": 207437615, "router_z_loss_clip": 1.51074219, "router_z_loss_mlp": 0.1114502, "step": 9630, "time_per_iteration": 2.5429086685180664 }, { "auxiliary_loss_clip": 0.06431371, "auxiliary_loss_mlp": 0.01265509, "balance_loss_clip": 0.06285626, "balance_loss_mlp": 0.01254721, "epoch": 0.5790470464452127, "flos": 21215086277760.0, "grad_norm": 1.6291117506138686, "language_loss": 0.78926516, "learning_rate": 1.587999618060523e-06, "loss": 0.86623394, "num_input_tokens_seen": 207457270, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.10778809, "step": 9631, "time_per_iteration": 2.5313615798950195 }, { "auxiliary_loss_clip": 0.06436933, "auxiliary_loss_mlp": 0.01266791, "balance_loss_clip": 0.06285162, "balance_loss_mlp": 0.01255907, "epoch": 0.5791071696978807, "flos": 23411144010240.0, "grad_norm": 1.5732173443033388, "language_loss": 0.75918043, "learning_rate": 1.5876185173861333e-06, "loss": 0.83621764, "num_input_tokens_seen": 207477890, "router_z_loss_clip": 1.51953125, "router_z_loss_mlp": 0.10876465, "step": 9632, "time_per_iteration": 2.5720038414001465 }, { "auxiliary_loss_clip": 0.0643706, "auxiliary_loss_mlp": 0.01265843, "balance_loss_clip": 0.06285791, "balance_loss_mlp": 0.01254447, "epoch": 0.5791672929505486, "flos": 24213322673280.0, "grad_norm": 1.9901098030945665, "language_loss": 0.7957049, "learning_rate": 1.5872374323492915e-06, "loss": 0.87273395, "num_input_tokens_seen": 207497670, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.11401367, "step": 9633, "time_per_iteration": 2.5652756690979004 }, { "auxiliary_loss_clip": 0.06449504, "auxiliary_loss_mlp": 0.01268404, "balance_loss_clip": 0.06288743, "balance_loss_mlp": 0.01256203, "epoch": 0.5792274162032166, "flos": 24355094981760.0, "grad_norm": 6.318028045394578, "language_loss": 0.77811295, "learning_rate": 1.5868563629644464e-06, "loss": 0.85529202, "num_input_tokens_seen": 207516105, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.12213135, "step": 9634, "time_per_iteration": 2.6371827125549316 }, { "auxiliary_loss_clip": 0.06442587, "auxiliary_loss_mlp": 0.01265144, "balance_loss_clip": 0.06286585, "balance_loss_mlp": 0.01253426, "epoch": 0.5792875394558845, "flos": 20455729850880.0, "grad_norm": 2.18983655379103, "language_loss": 0.63672996, "learning_rate": 1.5864753092460502e-06, "loss": 0.71380728, "num_input_tokens_seen": 207533685, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.11730957, "step": 9635, "time_per_iteration": 2.552283525466919 }, { "auxiliary_loss_clip": 0.06436419, "auxiliary_loss_mlp": 0.01264933, "balance_loss_clip": 0.06286387, "balance_loss_mlp": 0.0125399, "epoch": 0.5793476627085525, "flos": 24067064171520.0, "grad_norm": 1.4211303487932407, "language_loss": 0.77121365, "learning_rate": 1.5860942712085516e-06, "loss": 0.84822714, "num_input_tokens_seen": 207552840, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.109375, "step": 9636, "time_per_iteration": 2.5816030502319336 }, { "auxiliary_loss_clip": 0.06428969, "auxiliary_loss_mlp": 0.01270988, "balance_loss_clip": 0.06283262, "balance_loss_mlp": 0.01261213, "epoch": 0.5794077859612206, "flos": 22060799936640.0, "grad_norm": 3.0478158242564346, "language_loss": 0.68576497, "learning_rate": 1.5857132488663998e-06, "loss": 0.76276451, "num_input_tokens_seen": 207572095, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.09771729, "step": 9637, "time_per_iteration": 2.555544376373291 }, { "auxiliary_loss_clip": 0.06440658, "auxiliary_loss_mlp": 0.01266214, "balance_loss_clip": 0.06285645, "balance_loss_mlp": 0.01254323, "epoch": 0.5794679092138885, "flos": 11439245802240.0, "grad_norm": 3.523454115872823, "language_loss": 0.72216845, "learning_rate": 1.585332242234043e-06, "loss": 0.79923725, "num_input_tokens_seen": 207587495, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.11901855, "step": 9638, "time_per_iteration": 2.50681209564209 }, { "auxiliary_loss_clip": 0.06436412, "auxiliary_loss_mlp": 0.01265044, "balance_loss_clip": 0.06286646, "balance_loss_mlp": 0.01253844, "epoch": 0.5795280324665565, "flos": 18886228623360.0, "grad_norm": 8.868730270549857, "language_loss": 0.72268927, "learning_rate": 1.5849512513259291e-06, "loss": 0.79970384, "num_input_tokens_seen": 207606795, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.11199951, "step": 9639, "time_per_iteration": 2.555241584777832 }, { "auxiliary_loss_clip": 0.06435086, "auxiliary_loss_mlp": 0.01265398, "balance_loss_clip": 0.06283164, "balance_loss_mlp": 0.01254407, "epoch": 0.5795881557192244, "flos": 13010969162880.0, "grad_norm": 1.8674818423831148, "language_loss": 0.69902194, "learning_rate": 1.5845702761565054e-06, "loss": 0.77602679, "num_input_tokens_seen": 207623620, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.11004639, "step": 9640, "time_per_iteration": 2.5242698192596436 }, { "auxiliary_loss_clip": 0.06451263, "auxiliary_loss_mlp": 0.01270858, "balance_loss_clip": 0.06290749, "balance_loss_mlp": 0.01258603, "epoch": 0.5796482789718924, "flos": 19937598929280.0, "grad_norm": 2.4697678464705084, "language_loss": 0.7827667, "learning_rate": 1.5841893167402183e-06, "loss": 0.85998797, "num_input_tokens_seen": 207639380, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.12255859, "step": 9641, "time_per_iteration": 2.5499327182769775 }, { "auxiliary_loss_clip": 0.06432957, "auxiliary_loss_mlp": 0.01266826, "balance_loss_clip": 0.06282844, "balance_loss_mlp": 0.01255709, "epoch": 0.5797084022245603, "flos": 21656880529920.0, "grad_norm": 1.8087042697908968, "language_loss": 0.74141014, "learning_rate": 1.5838083730915143e-06, "loss": 0.81840789, "num_input_tokens_seen": 207657915, "router_z_loss_clip": 1.50195312, "router_z_loss_mlp": 0.11120605, "step": 9642, "time_per_iteration": 2.5347821712493896 }, { "auxiliary_loss_clip": 0.06441621, "auxiliary_loss_mlp": 0.0126243, "balance_loss_clip": 0.06291109, "balance_loss_mlp": 0.01251546, "epoch": 0.5797685254772283, "flos": 26038807724160.0, "grad_norm": 2.894097049548597, "language_loss": 0.73933697, "learning_rate": 1.5834274452248378e-06, "loss": 0.81637746, "num_input_tokens_seen": 207678620, "router_z_loss_clip": 1.50488281, "router_z_loss_mlp": 0.10888672, "step": 9643, "time_per_iteration": 2.58400559425354 }, { "auxiliary_loss_clip": 0.06438187, "auxiliary_loss_mlp": 0.01265302, "balance_loss_clip": 0.06284425, "balance_loss_mlp": 0.0125483, "epoch": 0.5798286487298963, "flos": 22710808385280.0, "grad_norm": 2.329125940060156, "language_loss": 0.67446804, "learning_rate": 1.5830465331546352e-06, "loss": 0.75150293, "num_input_tokens_seen": 207696980, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.10479736, "step": 9644, "time_per_iteration": 2.609731435775757 }, { "auxiliary_loss_clip": 0.06452939, "auxiliary_loss_mlp": 0.01268649, "balance_loss_clip": 0.06295142, "balance_loss_mlp": 0.01256334, "epoch": 0.5798887719825643, "flos": 23155705238400.0, "grad_norm": 2.3822938650192618, "language_loss": 0.85905612, "learning_rate": 1.5826656368953496e-06, "loss": 0.93627203, "num_input_tokens_seen": 207714065, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.12298584, "step": 9645, "time_per_iteration": 2.583489179611206 }, { "auxiliary_loss_clip": 0.06442246, "auxiliary_loss_mlp": 0.01267121, "balance_loss_clip": 0.0629179, "balance_loss_mlp": 0.01256768, "epoch": 0.5799488952352322, "flos": 24432982951680.0, "grad_norm": 2.6350863367229773, "language_loss": 0.75494593, "learning_rate": 1.5822847564614244e-06, "loss": 0.83203959, "num_input_tokens_seen": 207734720, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.10351562, "step": 9646, "time_per_iteration": 2.5899691581726074 }, { "auxiliary_loss_clip": 0.0644999, "auxiliary_loss_mlp": 0.01270525, "balance_loss_clip": 0.06293046, "balance_loss_mlp": 0.01258759, "epoch": 0.5800090184879002, "flos": 38404478995200.0, "grad_norm": 1.6984650525610159, "language_loss": 0.59509659, "learning_rate": 1.5819038918673038e-06, "loss": 0.67230171, "num_input_tokens_seen": 207755435, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.11767578, "step": 9647, "time_per_iteration": 2.69403076171875 }, { "auxiliary_loss_clip": 0.06447347, "auxiliary_loss_mlp": 0.0127008, "balance_loss_clip": 0.06290217, "balance_loss_mlp": 0.01258099, "epoch": 0.5800691417405681, "flos": 19789747200000.0, "grad_norm": 1.5920287071295969, "language_loss": 0.8461588, "learning_rate": 1.5815230431274288e-06, "loss": 0.92333311, "num_input_tokens_seen": 207773570, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.11981201, "step": 9648, "time_per_iteration": 3.996403932571411 }, { "auxiliary_loss_clip": 0.06325038, "auxiliary_loss_mlp": 0.01255463, "balance_loss_clip": 0.06263645, "balance_loss_mlp": 0.01254053, "epoch": 0.5801292649932361, "flos": 70333514133120.0, "grad_norm": 0.8466614666524864, "language_loss": 0.63022494, "learning_rate": 1.581142210256242e-06, "loss": 0.70603001, "num_input_tokens_seen": 207830095, "router_z_loss_clip": 0.61474609, "router_z_loss_mlp": 0.01409912, "step": 9649, "time_per_iteration": 3.192789316177368 }, { "auxiliary_loss_clip": 0.06433561, "auxiliary_loss_mlp": 0.0126406, "balance_loss_clip": 0.06285431, "balance_loss_mlp": 0.01253868, "epoch": 0.5801893882459042, "flos": 18740892516480.0, "grad_norm": 2.332359614406932, "language_loss": 0.8227905, "learning_rate": 1.5807613932681857e-06, "loss": 0.89976674, "num_input_tokens_seen": 207848555, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.10192871, "step": 9650, "time_per_iteration": 2.5288197994232178 }, { "auxiliary_loss_clip": 0.06448032, "auxiliary_loss_mlp": 0.01269616, "balance_loss_clip": 0.06290828, "balance_loss_mlp": 0.0125807, "epoch": 0.5802495114985721, "flos": 15601973915520.0, "grad_norm": 2.287769497296566, "language_loss": 0.77543867, "learning_rate": 1.580380592177698e-06, "loss": 0.85261512, "num_input_tokens_seen": 207867060, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.11547852, "step": 9651, "time_per_iteration": 2.5377848148345947 }, { "auxiliary_loss_clip": 0.0644449, "auxiliary_loss_mlp": 0.01267417, "balance_loss_clip": 0.06288505, "balance_loss_mlp": 0.01255496, "epoch": 0.5803096347512401, "flos": 18260552586240.0, "grad_norm": 1.956839168391594, "language_loss": 0.7478801, "learning_rate": 1.5799998069992213e-06, "loss": 0.82499915, "num_input_tokens_seen": 207884520, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.1192627, "step": 9652, "time_per_iteration": 3.939760446548462 }, { "auxiliary_loss_clip": 0.0644744, "auxiliary_loss_mlp": 0.01269812, "balance_loss_clip": 0.06290521, "balance_loss_mlp": 0.01257826, "epoch": 0.580369758003908, "flos": 22899763342080.0, "grad_norm": 1.8715619783203241, "language_loss": 0.76948553, "learning_rate": 1.579619037747193e-06, "loss": 0.84665805, "num_input_tokens_seen": 207905370, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.11987305, "step": 9653, "time_per_iteration": 2.5844860076904297 }, { "auxiliary_loss_clip": 0.06447446, "auxiliary_loss_mlp": 0.01265663, "balance_loss_clip": 0.06292126, "balance_loss_mlp": 0.01253122, "epoch": 0.580429881256576, "flos": 18703646576640.0, "grad_norm": 2.218302982822751, "language_loss": 0.74959844, "learning_rate": 1.5792382844360534e-06, "loss": 0.82672954, "num_input_tokens_seen": 207923790, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.12536621, "step": 9654, "time_per_iteration": 2.5354228019714355 }, { "auxiliary_loss_clip": 0.06433471, "auxiliary_loss_mlp": 0.0126826, "balance_loss_clip": 0.06286779, "balance_loss_mlp": 0.01257877, "epoch": 0.5804900045092439, "flos": 24689050629120.0, "grad_norm": 1.6857560674713528, "language_loss": 0.70742971, "learning_rate": 1.5788575470802408e-06, "loss": 0.78444707, "num_input_tokens_seen": 207942335, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.10375977, "step": 9655, "time_per_iteration": 2.5872581005096436 }, { "auxiliary_loss_clip": 0.06444281, "auxiliary_loss_mlp": 0.01267539, "balance_loss_clip": 0.06286117, "balance_loss_mlp": 0.01255803, "epoch": 0.580550127761912, "flos": 23119549401600.0, "grad_norm": 2.8903706585775635, "language_loss": 0.70528936, "learning_rate": 1.5784768256941915e-06, "loss": 0.78240758, "num_input_tokens_seen": 207961975, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.11743164, "step": 9656, "time_per_iteration": 2.5752336978912354 }, { "auxiliary_loss_clip": 0.06431907, "auxiliary_loss_mlp": 0.01268092, "balance_loss_clip": 0.06284946, "balance_loss_mlp": 0.01257745, "epoch": 0.5806102510145799, "flos": 18481093332480.0, "grad_norm": 1.5537175582324518, "language_loss": 0.72247767, "learning_rate": 1.5780961202923433e-06, "loss": 0.79947764, "num_input_tokens_seen": 207979520, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.10345459, "step": 9657, "time_per_iteration": 2.512521505355835 }, { "auxiliary_loss_clip": 0.06444913, "auxiliary_loss_mlp": 0.01265217, "balance_loss_clip": 0.06285456, "balance_loss_mlp": 0.01252801, "epoch": 0.5806703742672479, "flos": 23922566605440.0, "grad_norm": 2.6985832545897215, "language_loss": 0.70955551, "learning_rate": 1.5777154308891328e-06, "loss": 0.7866568, "num_input_tokens_seen": 207998375, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.12408447, "step": 9658, "time_per_iteration": 2.5717461109161377 }, { "auxiliary_loss_clip": 0.06320016, "auxiliary_loss_mlp": 0.0125512, "balance_loss_clip": 0.0625845, "balance_loss_mlp": 0.01253899, "epoch": 0.5807304975199158, "flos": 66332096328960.0, "grad_norm": 0.6430296672659707, "language_loss": 0.53553176, "learning_rate": 1.5773347574989953e-06, "loss": 0.61128318, "num_input_tokens_seen": 208060605, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.01219177, "step": 9659, "time_per_iteration": 3.1824018955230713 }, { "auxiliary_loss_clip": 0.06443177, "auxiliary_loss_mlp": 0.01269837, "balance_loss_clip": 0.06286142, "balance_loss_mlp": 0.01258632, "epoch": 0.5807906207725838, "flos": 31730478888960.0, "grad_norm": 1.8998001396632205, "language_loss": 0.62407988, "learning_rate": 1.576954100136366e-06, "loss": 0.70121002, "num_input_tokens_seen": 208080320, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.11206055, "step": 9660, "time_per_iteration": 2.6437690258026123 }, { "auxiliary_loss_clip": 0.0644322, "auxiliary_loss_mlp": 0.01268504, "balance_loss_clip": 0.06286677, "balance_loss_mlp": 0.01256744, "epoch": 0.5808507440252517, "flos": 23807223060480.0, "grad_norm": 1.5043495707192103, "language_loss": 0.65402687, "learning_rate": 1.5765734588156797e-06, "loss": 0.73114407, "num_input_tokens_seen": 208099305, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.11761475, "step": 9661, "time_per_iteration": 2.552492141723633 }, { "auxiliary_loss_clip": 0.06430738, "auxiliary_loss_mlp": 0.01267051, "balance_loss_clip": 0.06283633, "balance_loss_mlp": 0.01257222, "epoch": 0.5809108672779197, "flos": 13703464431360.0, "grad_norm": 1.4281452268781214, "language_loss": 0.74645084, "learning_rate": 1.5761928335513704e-06, "loss": 0.82342875, "num_input_tokens_seen": 208116960, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.0982666, "step": 9662, "time_per_iteration": 2.5112202167510986 }, { "auxiliary_loss_clip": 0.06319956, "auxiliary_loss_mlp": 0.01252057, "balance_loss_clip": 0.06257729, "balance_loss_mlp": 0.01250813, "epoch": 0.5809709905305876, "flos": 69157687386240.0, "grad_norm": 0.8286690951155724, "language_loss": 0.58223927, "learning_rate": 1.5758122243578709e-06, "loss": 0.65795946, "num_input_tokens_seen": 208182190, "router_z_loss_clip": 0.62353516, "router_z_loss_mlp": 0.01243591, "step": 9663, "time_per_iteration": 3.2368855476379395 }, { "auxiliary_loss_clip": 0.06435964, "auxiliary_loss_mlp": 0.01268583, "balance_loss_clip": 0.06285388, "balance_loss_mlp": 0.01257478, "epoch": 0.5810311137832557, "flos": 19833491831040.0, "grad_norm": 2.986358215906931, "language_loss": 0.82082784, "learning_rate": 1.5754316312496152e-06, "loss": 0.89787328, "num_input_tokens_seen": 208197015, "router_z_loss_clip": 1.50488281, "router_z_loss_mlp": 0.11096191, "step": 9664, "time_per_iteration": 5.50184440612793 }, { "auxiliary_loss_clip": 0.06438266, "auxiliary_loss_mlp": 0.01264727, "balance_loss_clip": 0.0628254, "balance_loss_mlp": 0.01254422, "epoch": 0.5810912370359237, "flos": 29245635659520.0, "grad_norm": 2.044785342758676, "language_loss": 0.81873029, "learning_rate": 1.5750510542410337e-06, "loss": 0.89576018, "num_input_tokens_seen": 208215795, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.10308838, "step": 9665, "time_per_iteration": 2.639702081680298 }, { "auxiliary_loss_clip": 0.06446314, "auxiliary_loss_mlp": 0.01267445, "balance_loss_clip": 0.06288885, "balance_loss_mlp": 0.01255363, "epoch": 0.5811513602885916, "flos": 22792469788800.0, "grad_norm": 1.6815067169803453, "language_loss": 0.81154847, "learning_rate": 1.5746704933465599e-06, "loss": 0.88868612, "num_input_tokens_seen": 208234655, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.12078857, "step": 9666, "time_per_iteration": 2.5583996772766113 }, { "auxiliary_loss_clip": 0.06434079, "auxiliary_loss_mlp": 0.01267439, "balance_loss_clip": 0.06283782, "balance_loss_mlp": 0.01256579, "epoch": 0.5812114835412596, "flos": 18740347464960.0, "grad_norm": 1.7551235808003285, "language_loss": 0.80666798, "learning_rate": 1.5742899485806227e-06, "loss": 0.8836832, "num_input_tokens_seen": 208251300, "router_z_loss_clip": 1.50195312, "router_z_loss_mlp": 0.10864258, "step": 9667, "time_per_iteration": 2.5231382846832275 }, { "auxiliary_loss_clip": 0.06449315, "auxiliary_loss_mlp": 0.01266016, "balance_loss_clip": 0.06287083, "balance_loss_mlp": 0.0125326, "epoch": 0.5812716067939275, "flos": 26438324791680.0, "grad_norm": 1.6385789685412069, "language_loss": 0.79056209, "learning_rate": 1.573909419957653e-06, "loss": 0.86771548, "num_input_tokens_seen": 208272685, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.12762451, "step": 9668, "time_per_iteration": 2.5855581760406494 }, { "auxiliary_loss_clip": 0.06439285, "auxiliary_loss_mlp": 0.01270737, "balance_loss_clip": 0.06286031, "balance_loss_mlp": 0.01260163, "epoch": 0.5813317300465956, "flos": 43407847595520.0, "grad_norm": 3.0009944974423983, "language_loss": 0.65277815, "learning_rate": 1.5735289074920819e-06, "loss": 0.72987843, "num_input_tokens_seen": 208294315, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.10577393, "step": 9669, "time_per_iteration": 2.732346296310425 }, { "auxiliary_loss_clip": 0.06434946, "auxiliary_loss_mlp": 0.01266571, "balance_loss_clip": 0.06282548, "balance_loss_mlp": 0.01255234, "epoch": 0.5813918532992635, "flos": 24791564499840.0, "grad_norm": 1.8038104665533004, "language_loss": 0.738868, "learning_rate": 1.5731484111983363e-06, "loss": 0.81588316, "num_input_tokens_seen": 208315610, "router_z_loss_clip": 1.52441406, "router_z_loss_mlp": 0.11340332, "step": 9670, "time_per_iteration": 2.5933964252471924 }, { "auxiliary_loss_clip": 0.06436312, "auxiliary_loss_mlp": 0.01272197, "balance_loss_clip": 0.06281112, "balance_loss_mlp": 0.01261021, "epoch": 0.5814519765519315, "flos": 22864068702720.0, "grad_norm": 1.8431898723603162, "language_loss": 0.79194856, "learning_rate": 1.5727679310908464e-06, "loss": 0.86903369, "num_input_tokens_seen": 208334725, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.11175537, "step": 9671, "time_per_iteration": 2.566816806793213 }, { "auxiliary_loss_clip": 0.06447309, "auxiliary_loss_mlp": 0.01268915, "balance_loss_clip": 0.06286798, "balance_loss_mlp": 0.01256613, "epoch": 0.5815120998045994, "flos": 24067651150080.0, "grad_norm": 2.288102303497175, "language_loss": 0.61447763, "learning_rate": 1.5723874671840399e-06, "loss": 0.6916399, "num_input_tokens_seen": 208353825, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.12304688, "step": 9672, "time_per_iteration": 2.583017349243164 }, { "auxiliary_loss_clip": 0.06430883, "auxiliary_loss_mlp": 0.0126755, "balance_loss_clip": 0.06280208, "balance_loss_mlp": 0.01256815, "epoch": 0.5815722230572674, "flos": 24286305179520.0, "grad_norm": 1.6989133239489487, "language_loss": 0.81158745, "learning_rate": 1.572007019492342e-06, "loss": 0.88857186, "num_input_tokens_seen": 208374160, "router_z_loss_clip": 1.50585938, "router_z_loss_mlp": 0.10723877, "step": 9673, "time_per_iteration": 2.5959861278533936 }, { "auxiliary_loss_clip": 0.06440801, "auxiliary_loss_mlp": 0.01270369, "balance_loss_clip": 0.0628387, "balance_loss_mlp": 0.01257834, "epoch": 0.5816323463099353, "flos": 22206932657280.0, "grad_norm": 2.417371874698666, "language_loss": 0.88730776, "learning_rate": 1.5716265880301817e-06, "loss": 0.96441948, "num_input_tokens_seen": 208392105, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.12548828, "step": 9674, "time_per_iteration": 2.5258610248565674 }, { "auxiliary_loss_clip": 0.06437309, "auxiliary_loss_mlp": 0.01266806, "balance_loss_clip": 0.06284535, "balance_loss_mlp": 0.01255671, "epoch": 0.5816924695626033, "flos": 24141388343040.0, "grad_norm": 1.459144198135146, "language_loss": 0.78776276, "learning_rate": 1.571246172811984e-06, "loss": 0.86480397, "num_input_tokens_seen": 208411755, "router_z_loss_clip": 1.52734375, "router_z_loss_mlp": 0.11132812, "step": 9675, "time_per_iteration": 2.5587737560272217 }, { "auxiliary_loss_clip": 0.06433151, "auxiliary_loss_mlp": 0.01266881, "balance_loss_clip": 0.06281007, "balance_loss_mlp": 0.01255616, "epoch": 0.5817525928152713, "flos": 21330555603840.0, "grad_norm": 2.2725217499731065, "language_loss": 0.69980574, "learning_rate": 1.5708657738521748e-06, "loss": 0.77680606, "num_input_tokens_seen": 208429995, "router_z_loss_clip": 1.52246094, "router_z_loss_mlp": 0.1126709, "step": 9676, "time_per_iteration": 2.5217983722686768 }, { "auxiliary_loss_clip": 0.06435638, "auxiliary_loss_mlp": 0.01267227, "balance_loss_clip": 0.06281392, "balance_loss_mlp": 0.01256391, "epoch": 0.5818127160679393, "flos": 26940355729920.0, "grad_norm": 2.554313509489751, "language_loss": 0.6427356, "learning_rate": 1.5704853911651779e-06, "loss": 0.71976429, "num_input_tokens_seen": 208443655, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.10839844, "step": 9677, "time_per_iteration": 2.5402557849884033 }, { "auxiliary_loss_clip": 0.06320782, "auxiliary_loss_mlp": 0.01257831, "balance_loss_clip": 0.06258518, "balance_loss_mlp": 0.01256373, "epoch": 0.5818728393206073, "flos": 63940779855360.0, "grad_norm": 0.7874394445478378, "language_loss": 0.54093695, "learning_rate": 1.5701050247654182e-06, "loss": 0.61672306, "num_input_tokens_seen": 208498405, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.01456451, "step": 9678, "time_per_iteration": 3.254054069519043 }, { "auxiliary_loss_clip": 0.06320344, "auxiliary_loss_mlp": 0.01256536, "balance_loss_clip": 0.06258473, "balance_loss_mlp": 0.01255092, "epoch": 0.5819329625732752, "flos": 64972654087680.0, "grad_norm": 0.7693435856752936, "language_loss": 0.56179917, "learning_rate": 1.569724674667319e-06, "loss": 0.63756794, "num_input_tokens_seen": 208559075, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.01441956, "step": 9679, "time_per_iteration": 3.101181983947754 }, { "auxiliary_loss_clip": 0.06432912, "auxiliary_loss_mlp": 0.01266121, "balance_loss_clip": 0.06280147, "balance_loss_mlp": 0.0125538, "epoch": 0.5819930858259432, "flos": 21221668823040.0, "grad_norm": 1.5423613675308965, "language_loss": 0.65692353, "learning_rate": 1.5693443408853032e-06, "loss": 0.73391378, "num_input_tokens_seen": 208577770, "router_z_loss_clip": 1.52636719, "router_z_loss_mlp": 0.10742188, "step": 9680, "time_per_iteration": 2.5277280807495117 }, { "auxiliary_loss_clip": 0.06435445, "auxiliary_loss_mlp": 0.01268168, "balance_loss_clip": 0.06282429, "balance_loss_mlp": 0.01257141, "epoch": 0.5820532090786111, "flos": 19463715763200.0, "grad_norm": 2.281573120166919, "language_loss": 0.83250844, "learning_rate": 1.5689640234337933e-06, "loss": 0.90954459, "num_input_tokens_seen": 208595110, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.11022949, "step": 9681, "time_per_iteration": 2.512394905090332 }, { "auxiliary_loss_clip": 0.06431791, "auxiliary_loss_mlp": 0.01267233, "balance_loss_clip": 0.06280967, "balance_loss_mlp": 0.0125589, "epoch": 0.5821133323312792, "flos": 17718424669440.0, "grad_norm": 2.045229211997997, "language_loss": 0.75928664, "learning_rate": 1.5685837223272109e-06, "loss": 0.83627689, "num_input_tokens_seen": 208612080, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.11334229, "step": 9682, "time_per_iteration": 2.510377883911133 }, { "auxiliary_loss_clip": 0.06446127, "auxiliary_loss_mlp": 0.01268727, "balance_loss_clip": 0.06287364, "balance_loss_mlp": 0.01256818, "epoch": 0.5821734555839471, "flos": 24578738328960.0, "grad_norm": 2.1136387492929427, "language_loss": 0.75590158, "learning_rate": 1.568203437579977e-06, "loss": 0.83305013, "num_input_tokens_seen": 208630235, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.11901855, "step": 9683, "time_per_iteration": 2.56815505027771 }, { "auxiliary_loss_clip": 0.06443568, "auxiliary_loss_mlp": 0.0126998, "balance_loss_clip": 0.06286177, "balance_loss_mlp": 0.01257964, "epoch": 0.5822335788366151, "flos": 22388760017280.0, "grad_norm": 1.6631174221465885, "language_loss": 0.74527103, "learning_rate": 1.5678231692065116e-06, "loss": 0.82240653, "num_input_tokens_seen": 208647925, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.12017822, "step": 9684, "time_per_iteration": 2.551288604736328 }, { "auxiliary_loss_clip": 0.06440621, "auxiliary_loss_mlp": 0.01272658, "balance_loss_clip": 0.06285271, "balance_loss_mlp": 0.01261322, "epoch": 0.582293702089283, "flos": 26729458202880.0, "grad_norm": 2.3218254684882282, "language_loss": 0.78739464, "learning_rate": 1.5674429172212348e-06, "loss": 0.8645274, "num_input_tokens_seen": 208666180, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.11334229, "step": 9685, "time_per_iteration": 2.597754955291748 }, { "auxiliary_loss_clip": 0.06436382, "auxiliary_loss_mlp": 0.01268192, "balance_loss_clip": 0.06282379, "balance_loss_mlp": 0.01257398, "epoch": 0.582353825341951, "flos": 17354560314240.0, "grad_norm": 1.7894132387533994, "language_loss": 0.75577819, "learning_rate": 1.5670626816385667e-06, "loss": 0.83282393, "num_input_tokens_seen": 208684240, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.10791016, "step": 9686, "time_per_iteration": 2.538686513900757 }, { "auxiliary_loss_clip": 0.06312433, "auxiliary_loss_mlp": 0.01252955, "balance_loss_clip": 0.06250562, "balance_loss_mlp": 0.01251497, "epoch": 0.5824139485946189, "flos": 55491133478400.0, "grad_norm": 0.8033383088318987, "language_loss": 0.57189572, "learning_rate": 1.5666824624729244e-06, "loss": 0.64754957, "num_input_tokens_seen": 208736090, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.01455688, "step": 9687, "time_per_iteration": 2.9943721294403076 }, { "auxiliary_loss_clip": 0.06439421, "auxiliary_loss_mlp": 0.012678, "balance_loss_clip": 0.06285904, "balance_loss_mlp": 0.0125626, "epoch": 0.582474071847287, "flos": 20309261713920.0, "grad_norm": 2.9032508383263638, "language_loss": 0.70038128, "learning_rate": 1.566302259738727e-06, "loss": 0.77745342, "num_input_tokens_seen": 208754600, "router_z_loss_clip": 1.53417969, "router_z_loss_mlp": 0.11547852, "step": 9688, "time_per_iteration": 3.994504928588867 }, { "auxiliary_loss_clip": 0.06432645, "auxiliary_loss_mlp": 0.01264325, "balance_loss_clip": 0.06279798, "balance_loss_mlp": 0.01253298, "epoch": 0.5825341950999549, "flos": 23884733687040.0, "grad_norm": 2.5725150906827357, "language_loss": 0.65691775, "learning_rate": 1.5659220734503918e-06, "loss": 0.73388743, "num_input_tokens_seen": 208773140, "router_z_loss_clip": 1.52929688, "router_z_loss_mlp": 0.11010742, "step": 9689, "time_per_iteration": 2.553030490875244 }, { "auxiliary_loss_clip": 0.06432286, "auxiliary_loss_mlp": 0.01269113, "balance_loss_clip": 0.06281242, "balance_loss_mlp": 0.01257622, "epoch": 0.5825943183526229, "flos": 23119842890880.0, "grad_norm": 1.8154541567694433, "language_loss": 0.7352792, "learning_rate": 1.5655419036223341e-06, "loss": 0.81229317, "num_input_tokens_seen": 208793410, "router_z_loss_clip": 1.50976562, "router_z_loss_mlp": 0.11499023, "step": 9690, "time_per_iteration": 2.5617895126342773 }, { "auxiliary_loss_clip": 0.0644184, "auxiliary_loss_mlp": 0.01266013, "balance_loss_clip": 0.06285195, "balance_loss_mlp": 0.01253639, "epoch": 0.5826544416052909, "flos": 22864152556800.0, "grad_norm": 1.9634881869434595, "language_loss": 0.75738913, "learning_rate": 1.5651617502689717e-06, "loss": 0.83446765, "num_input_tokens_seen": 208811920, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.1237793, "step": 9691, "time_per_iteration": 2.6928656101226807 }, { "auxiliary_loss_clip": 0.0644248, "auxiliary_loss_mlp": 0.01265783, "balance_loss_clip": 0.06287505, "balance_loss_mlp": 0.01255007, "epoch": 0.5827145648579588, "flos": 31509560799360.0, "grad_norm": 1.6179606689091017, "language_loss": 0.80883795, "learning_rate": 1.5647816134047184e-06, "loss": 0.88592058, "num_input_tokens_seen": 208834720, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.10784912, "step": 9692, "time_per_iteration": 4.105777740478516 }, { "auxiliary_loss_clip": 0.06323658, "auxiliary_loss_mlp": 0.01252381, "balance_loss_clip": 0.06261705, "balance_loss_mlp": 0.01250945, "epoch": 0.5827746881106268, "flos": 69832028246400.0, "grad_norm": 0.80505838932226, "language_loss": 0.56863177, "learning_rate": 1.5644014930439907e-06, "loss": 0.64439219, "num_input_tokens_seen": 208898415, "router_z_loss_clip": 0.61914062, "router_z_loss_mlp": 0.01434326, "step": 9693, "time_per_iteration": 3.13493013381958 }, { "auxiliary_loss_clip": 0.06437798, "auxiliary_loss_mlp": 0.01264912, "balance_loss_clip": 0.06285456, "balance_loss_mlp": 0.01254552, "epoch": 0.5828348113632947, "flos": 23119088204160.0, "grad_norm": 1.76889428274482, "language_loss": 0.79125202, "learning_rate": 1.5640213892012025e-06, "loss": 0.8682791, "num_input_tokens_seen": 208919045, "router_z_loss_clip": 1.52246094, "router_z_loss_mlp": 0.1036377, "step": 9694, "time_per_iteration": 2.571864366531372 }, { "auxiliary_loss_clip": 0.06429508, "auxiliary_loss_mlp": 0.01263847, "balance_loss_clip": 0.06282221, "balance_loss_mlp": 0.01253947, "epoch": 0.5828949346159628, "flos": 21879769190400.0, "grad_norm": 1.3196015660395188, "language_loss": 0.76385379, "learning_rate": 1.5636413018907656e-06, "loss": 0.84078729, "num_input_tokens_seen": 208939375, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.09899902, "step": 9695, "time_per_iteration": 2.568734645843506 }, { "auxiliary_loss_clip": 0.06325743, "auxiliary_loss_mlp": 0.01250242, "balance_loss_clip": 0.06263336, "balance_loss_mlp": 0.01248913, "epoch": 0.5829550578686307, "flos": 65985170497920.0, "grad_norm": 0.759027629363656, "language_loss": 0.54963374, "learning_rate": 1.563261231127095e-06, "loss": 0.62539363, "num_input_tokens_seen": 209004760, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.01330566, "step": 9696, "time_per_iteration": 3.2565877437591553 }, { "auxiliary_loss_clip": 0.06439662, "auxiliary_loss_mlp": 0.01264212, "balance_loss_clip": 0.06286009, "balance_loss_mlp": 0.01252768, "epoch": 0.5830151811212987, "flos": 16295391578880.0, "grad_norm": 2.1993481629695633, "language_loss": 0.77184618, "learning_rate": 1.5628811769246021e-06, "loss": 0.84888494, "num_input_tokens_seen": 209022930, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.11437988, "step": 9697, "time_per_iteration": 2.5372154712677 }, { "auxiliary_loss_clip": 0.06441757, "auxiliary_loss_mlp": 0.0126823, "balance_loss_clip": 0.0628559, "balance_loss_mlp": 0.01256303, "epoch": 0.5830753043739666, "flos": 24175447827840.0, "grad_norm": 1.594988853787176, "language_loss": 0.77776062, "learning_rate": 1.5625011392976991e-06, "loss": 0.85486042, "num_input_tokens_seen": 209043740, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.1192627, "step": 9698, "time_per_iteration": 2.5709307193756104 }, { "auxiliary_loss_clip": 0.0643862, "auxiliary_loss_mlp": 0.01273941, "balance_loss_clip": 0.06286444, "balance_loss_mlp": 0.0126205, "epoch": 0.5831354276266346, "flos": 27067438846080.0, "grad_norm": 1.6365217829078689, "language_loss": 0.83877325, "learning_rate": 1.5621211182607966e-06, "loss": 0.91589892, "num_input_tokens_seen": 209068885, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.11895752, "step": 9699, "time_per_iteration": 2.701881170272827 }, { "auxiliary_loss_clip": 0.06441524, "auxiliary_loss_mlp": 0.01265432, "balance_loss_clip": 0.06287004, "balance_loss_mlp": 0.01253887, "epoch": 0.5831955508793025, "flos": 23630301164160.0, "grad_norm": 2.150216237559117, "language_loss": 0.66243392, "learning_rate": 1.561741113828305e-06, "loss": 0.7395035, "num_input_tokens_seen": 209087340, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.11547852, "step": 9700, "time_per_iteration": 2.6211020946502686 }, { "auxiliary_loss_clip": 0.06440863, "auxiliary_loss_mlp": 0.01269528, "balance_loss_clip": 0.06286672, "balance_loss_mlp": 0.01257541, "epoch": 0.5832556741319705, "flos": 24980267894400.0, "grad_norm": 1.5395025775123496, "language_loss": 0.72036827, "learning_rate": 1.5613611260146344e-06, "loss": 0.79747224, "num_input_tokens_seen": 209108840, "router_z_loss_clip": 1.54199219, "router_z_loss_mlp": 0.11987305, "step": 9701, "time_per_iteration": 2.620849609375 }, { "auxiliary_loss_clip": 0.06436099, "auxiliary_loss_mlp": 0.01263844, "balance_loss_clip": 0.06283182, "balance_loss_mlp": 0.01253175, "epoch": 0.5833157973846385, "flos": 23228226547200.0, "grad_norm": 1.6517863905780958, "language_loss": 0.85162562, "learning_rate": 1.5609811548341936e-06, "loss": 0.92862505, "num_input_tokens_seen": 209127985, "router_z_loss_clip": 1.52929688, "router_z_loss_mlp": 0.10662842, "step": 9702, "time_per_iteration": 2.5538060665130615 }, { "auxiliary_loss_clip": 0.06440337, "auxiliary_loss_mlp": 0.01267356, "balance_loss_clip": 0.0629137, "balance_loss_mlp": 0.01256883, "epoch": 0.5833759206373065, "flos": 21983876288640.0, "grad_norm": 1.3521293991784769, "language_loss": 0.78084254, "learning_rate": 1.560601200301392e-06, "loss": 0.85791945, "num_input_tokens_seen": 209146885, "router_z_loss_clip": 1.49023438, "router_z_loss_mlp": 0.10467529, "step": 9703, "time_per_iteration": 3.9308032989501953 }, { "auxiliary_loss_clip": 0.06440973, "auxiliary_loss_mlp": 0.01265666, "balance_loss_clip": 0.06286336, "balance_loss_mlp": 0.01254097, "epoch": 0.5834360438899745, "flos": 21768869911680.0, "grad_norm": 1.7709904110657764, "language_loss": 0.7134822, "learning_rate": 1.5602212624306366e-06, "loss": 0.79054856, "num_input_tokens_seen": 209166130, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.11553955, "step": 9704, "time_per_iteration": 3.9949090480804443 }, { "auxiliary_loss_clip": 0.06445669, "auxiliary_loss_mlp": 0.01265447, "balance_loss_clip": 0.06292126, "balance_loss_mlp": 0.01255142, "epoch": 0.5834961671426424, "flos": 15997214424960.0, "grad_norm": 1.5938575988798436, "language_loss": 0.81806922, "learning_rate": 1.559841341236335e-06, "loss": 0.8951804, "num_input_tokens_seen": 209183350, "router_z_loss_clip": 1.53417969, "router_z_loss_mlp": 0.10290527, "step": 9705, "time_per_iteration": 2.5622808933258057 }, { "auxiliary_loss_clip": 0.06440917, "auxiliary_loss_mlp": 0.01268543, "balance_loss_clip": 0.0628798, "balance_loss_mlp": 0.01257427, "epoch": 0.5835562903953104, "flos": 22824600629760.0, "grad_norm": 1.6424093956252315, "language_loss": 0.80566859, "learning_rate": 1.5594614367328937e-06, "loss": 0.88276315, "num_input_tokens_seen": 209203945, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.11132812, "step": 9706, "time_per_iteration": 2.5514848232269287 }, { "auxiliary_loss_clip": 0.06438842, "auxiliary_loss_mlp": 0.01272664, "balance_loss_clip": 0.06288221, "balance_loss_mlp": 0.01261012, "epoch": 0.5836164136479783, "flos": 48478664332800.0, "grad_norm": 2.832354768651121, "language_loss": 0.74770361, "learning_rate": 1.5590815489347187e-06, "loss": 0.82481873, "num_input_tokens_seen": 209227080, "router_z_loss_clip": 1.50683594, "router_z_loss_mlp": 0.11645508, "step": 9707, "time_per_iteration": 2.779128074645996 }, { "auxiliary_loss_clip": 0.0643684, "auxiliary_loss_mlp": 0.01267779, "balance_loss_clip": 0.06288805, "balance_loss_mlp": 0.01256824, "epoch": 0.5836765369006464, "flos": 26913172279680.0, "grad_norm": 1.5954573132748844, "language_loss": 0.81637502, "learning_rate": 1.5587016778562163e-06, "loss": 0.89342117, "num_input_tokens_seen": 209248170, "router_z_loss_clip": 1.47949219, "router_z_loss_mlp": 0.10955811, "step": 9708, "time_per_iteration": 2.578664541244507 }, { "auxiliary_loss_clip": 0.06437075, "auxiliary_loss_mlp": 0.01270701, "balance_loss_clip": 0.06289314, "balance_loss_mlp": 0.01259841, "epoch": 0.5837366601533143, "flos": 20090230341120.0, "grad_norm": 1.3961287133347289, "language_loss": 0.78170669, "learning_rate": 1.5583218235117896e-06, "loss": 0.85878444, "num_input_tokens_seen": 209267730, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.10870361, "step": 9709, "time_per_iteration": 2.5577921867370605 }, { "auxiliary_loss_clip": 0.06327334, "auxiliary_loss_mlp": 0.01252868, "balance_loss_clip": 0.06264753, "balance_loss_mlp": 0.01251547, "epoch": 0.5837967834059823, "flos": 65383910726400.0, "grad_norm": 0.7586415394082224, "language_loss": 0.56623, "learning_rate": 1.557941985915844e-06, "loss": 0.64203197, "num_input_tokens_seen": 209332510, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.01321411, "step": 9710, "time_per_iteration": 3.1643433570861816 }, { "auxiliary_loss_clip": 0.06435572, "auxiliary_loss_mlp": 0.01264074, "balance_loss_clip": 0.06286818, "balance_loss_mlp": 0.01253912, "epoch": 0.5838569066586502, "flos": 25345809331200.0, "grad_norm": 1.475532414287823, "language_loss": 0.65527141, "learning_rate": 1.5575621650827833e-06, "loss": 0.73226786, "num_input_tokens_seen": 209353355, "router_z_loss_clip": 1.48632812, "router_z_loss_mlp": 0.10168457, "step": 9711, "time_per_iteration": 2.7412703037261963 }, { "auxiliary_loss_clip": 0.06445733, "auxiliary_loss_mlp": 0.01268237, "balance_loss_clip": 0.06286569, "balance_loss_mlp": 0.01254969, "epoch": 0.5839170299113182, "flos": 22234535377920.0, "grad_norm": 1.6106448990216924, "language_loss": 0.78791726, "learning_rate": 1.5571823610270085e-06, "loss": 0.86505693, "num_input_tokens_seen": 209370960, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.13250732, "step": 9712, "time_per_iteration": 2.698258876800537 }, { "auxiliary_loss_clip": 0.06439993, "auxiliary_loss_mlp": 0.01267858, "balance_loss_clip": 0.06288271, "balance_loss_mlp": 0.01256414, "epoch": 0.5839771531639861, "flos": 22206513386880.0, "grad_norm": 1.5582199808819146, "language_loss": 0.73642361, "learning_rate": 1.5568025737629234e-06, "loss": 0.81350207, "num_input_tokens_seen": 209390955, "router_z_loss_clip": 1.51660156, "router_z_loss_mlp": 0.11456299, "step": 9713, "time_per_iteration": 2.614351749420166 }, { "auxiliary_loss_clip": 0.06445467, "auxiliary_loss_mlp": 0.01267394, "balance_loss_clip": 0.0628669, "balance_loss_mlp": 0.01253554, "epoch": 0.5840372764166541, "flos": 22425964030080.0, "grad_norm": 2.0585938604506873, "language_loss": 0.70211053, "learning_rate": 1.5564228033049292e-06, "loss": 0.77923906, "num_input_tokens_seen": 209410260, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.13842773, "step": 9714, "time_per_iteration": 2.5574159622192383 }, { "auxiliary_loss_clip": 0.06442168, "auxiliary_loss_mlp": 0.01266022, "balance_loss_clip": 0.06288601, "balance_loss_mlp": 0.01253863, "epoch": 0.5840973996693221, "flos": 19834330371840.0, "grad_norm": 1.6639677942714302, "language_loss": 0.80651283, "learning_rate": 1.5560430496674268e-06, "loss": 0.88359475, "num_input_tokens_seen": 209429920, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.121521, "step": 9715, "time_per_iteration": 2.5809121131896973 }, { "auxiliary_loss_clip": 0.06435794, "auxiliary_loss_mlp": 0.01267024, "balance_loss_clip": 0.06284967, "balance_loss_mlp": 0.01255842, "epoch": 0.5841575229219901, "flos": 21149482930560.0, "grad_norm": 2.5756784356313718, "language_loss": 0.73623347, "learning_rate": 1.5556633128648167e-06, "loss": 0.81326163, "num_input_tokens_seen": 209449470, "router_z_loss_clip": 1.50683594, "router_z_loss_mlp": 0.11187744, "step": 9716, "time_per_iteration": 2.5399892330169678 }, { "auxiliary_loss_clip": 0.06428473, "auxiliary_loss_mlp": 0.01272476, "balance_loss_clip": 0.06281184, "balance_loss_mlp": 0.0126158, "epoch": 0.5842176461746581, "flos": 24646521882240.0, "grad_norm": 1.5634586906457806, "language_loss": 0.74969077, "learning_rate": 1.5552835929114976e-06, "loss": 0.82670033, "num_input_tokens_seen": 209467695, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.10894775, "step": 9717, "time_per_iteration": 2.5853700637817383 }, { "auxiliary_loss_clip": 0.06439646, "auxiliary_loss_mlp": 0.0126728, "balance_loss_clip": 0.06288333, "balance_loss_mlp": 0.01256092, "epoch": 0.584277769427326, "flos": 19136468442240.0, "grad_norm": 2.0447206674380416, "language_loss": 0.79889047, "learning_rate": 1.5549038898218697e-06, "loss": 0.87595975, "num_input_tokens_seen": 209484250, "router_z_loss_clip": 1.51367188, "router_z_loss_mlp": 0.11193848, "step": 9718, "time_per_iteration": 2.5179598331451416 }, { "auxiliary_loss_clip": 0.06441532, "auxiliary_loss_mlp": 0.01270189, "balance_loss_clip": 0.06289414, "balance_loss_mlp": 0.0125797, "epoch": 0.584337892679994, "flos": 22681822072320.0, "grad_norm": 1.4377637443811502, "language_loss": 0.67756909, "learning_rate": 1.5545242036103306e-06, "loss": 0.75468624, "num_input_tokens_seen": 209502830, "router_z_loss_clip": 1.52148438, "router_z_loss_mlp": 0.12225342, "step": 9719, "time_per_iteration": 2.6008009910583496 }, { "auxiliary_loss_clip": 0.0643964, "auxiliary_loss_mlp": 0.01264067, "balance_loss_clip": 0.06285052, "balance_loss_mlp": 0.01252164, "epoch": 0.5843980159326619, "flos": 31291954945920.0, "grad_norm": 1.9356459330237759, "language_loss": 0.75566316, "learning_rate": 1.5541445342912786e-06, "loss": 0.83270019, "num_input_tokens_seen": 209525995, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.11914062, "step": 9720, "time_per_iteration": 2.645113229751587 }, { "auxiliary_loss_clip": 0.06442024, "auxiliary_loss_mlp": 0.01270389, "balance_loss_clip": 0.06288335, "balance_loss_mlp": 0.01258581, "epoch": 0.58445813918533, "flos": 22754846505600.0, "grad_norm": 1.6410654447171684, "language_loss": 0.83329105, "learning_rate": 1.5537648818791105e-06, "loss": 0.91041517, "num_input_tokens_seen": 209545895, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.11804199, "step": 9721, "time_per_iteration": 2.573105573654175 }, { "auxiliary_loss_clip": 0.06323157, "auxiliary_loss_mlp": 0.01253456, "balance_loss_clip": 0.06261043, "balance_loss_mlp": 0.01252126, "epoch": 0.5845182624379979, "flos": 60704602992000.0, "grad_norm": 0.9254904031470312, "language_loss": 0.70911902, "learning_rate": 1.5533852463882226e-06, "loss": 0.78488517, "num_input_tokens_seen": 209602315, "router_z_loss_clip": 0.62402344, "router_z_loss_mlp": 0.01331329, "step": 9722, "time_per_iteration": 3.194753408432007 }, { "auxiliary_loss_clip": 0.06436201, "auxiliary_loss_mlp": 0.01268287, "balance_loss_clip": 0.0628496, "balance_loss_mlp": 0.0125695, "epoch": 0.5845783856906659, "flos": 16367996741760.0, "grad_norm": 2.170346271621648, "language_loss": 0.89842296, "learning_rate": 1.5530056278330113e-06, "loss": 0.97546792, "num_input_tokens_seen": 209617615, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.11334229, "step": 9723, "time_per_iteration": 2.5413835048675537 }, { "auxiliary_loss_clip": 0.06439407, "auxiliary_loss_mlp": 0.01272284, "balance_loss_clip": 0.06288175, "balance_loss_mlp": 0.01261037, "epoch": 0.5846385089433338, "flos": 20089475654400.0, "grad_norm": 1.3959441324092217, "language_loss": 0.68731773, "learning_rate": 1.5526260262278709e-06, "loss": 0.76443464, "num_input_tokens_seen": 209637005, "router_z_loss_clip": 1.51367188, "router_z_loss_mlp": 0.11236572, "step": 9724, "time_per_iteration": 2.564138650894165 }, { "auxiliary_loss_clip": 0.06439799, "auxiliary_loss_mlp": 0.01267099, "balance_loss_clip": 0.06284684, "balance_loss_mlp": 0.01254583, "epoch": 0.5846986321960018, "flos": 17316769322880.0, "grad_norm": 1.6878421492225861, "language_loss": 0.86440796, "learning_rate": 1.552246441587197e-06, "loss": 0.94147694, "num_input_tokens_seen": 209653170, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.12512207, "step": 9725, "time_per_iteration": 2.5122358798980713 }, { "auxiliary_loss_clip": 0.06441227, "auxiliary_loss_mlp": 0.01268799, "balance_loss_clip": 0.06285433, "balance_loss_mlp": 0.01257057, "epoch": 0.5847587554486697, "flos": 17202977078400.0, "grad_norm": 2.3633392095214454, "language_loss": 0.83219403, "learning_rate": 1.5518668739253821e-06, "loss": 0.90929425, "num_input_tokens_seen": 209671275, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.11743164, "step": 9726, "time_per_iteration": 2.5325379371643066 }, { "auxiliary_loss_clip": 0.06437957, "auxiliary_loss_mlp": 0.01268718, "balance_loss_clip": 0.06281967, "balance_loss_mlp": 0.01257775, "epoch": 0.5848188787013378, "flos": 24534993697920.0, "grad_norm": 2.2273709604524, "language_loss": 0.67358482, "learning_rate": 1.5514873232568206e-06, "loss": 0.75065148, "num_input_tokens_seen": 209690380, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.10943604, "step": 9727, "time_per_iteration": 3.987743616104126 }, { "auxiliary_loss_clip": 0.06441557, "auxiliary_loss_mlp": 0.01272495, "balance_loss_clip": 0.06287169, "balance_loss_mlp": 0.01260724, "epoch": 0.5848790019540057, "flos": 20634161120640.0, "grad_norm": 1.7948958984436563, "language_loss": 0.82287914, "learning_rate": 1.5511077895959055e-06, "loss": 0.90001965, "num_input_tokens_seen": 209708845, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.11779785, "step": 9728, "time_per_iteration": 2.561302900314331 }, { "auxiliary_loss_clip": 0.064337, "auxiliary_loss_mlp": 0.01272644, "balance_loss_clip": 0.06285064, "balance_loss_mlp": 0.01261975, "epoch": 0.5849391252066737, "flos": 22425377051520.0, "grad_norm": 1.8709684037999255, "language_loss": 0.78036094, "learning_rate": 1.550728272957027e-06, "loss": 0.85742432, "num_input_tokens_seen": 209729000, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.10668945, "step": 9729, "time_per_iteration": 2.561847448348999 }, { "auxiliary_loss_clip": 0.06441334, "auxiliary_loss_mlp": 0.01267962, "balance_loss_clip": 0.06286782, "balance_loss_mlp": 0.01255559, "epoch": 0.5849992484593417, "flos": 25417995223680.0, "grad_norm": 2.05920167655164, "language_loss": 0.70488977, "learning_rate": 1.5503487733545782e-06, "loss": 0.78198266, "num_input_tokens_seen": 209747435, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.1239624, "step": 9730, "time_per_iteration": 2.5855114459991455 }, { "auxiliary_loss_clip": 0.06444271, "auxiliary_loss_mlp": 0.01268801, "balance_loss_clip": 0.0628829, "balance_loss_mlp": 0.01255902, "epoch": 0.5850593717120096, "flos": 21070840273920.0, "grad_norm": 1.6362715385638453, "language_loss": 0.78917378, "learning_rate": 1.5499692908029482e-06, "loss": 0.86630452, "num_input_tokens_seen": 209764910, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.12890625, "step": 9731, "time_per_iteration": 4.00058126449585 }, { "auxiliary_loss_clip": 0.06436092, "auxiliary_loss_mlp": 0.01267814, "balance_loss_clip": 0.06284171, "balance_loss_mlp": 0.01256126, "epoch": 0.5851194949646776, "flos": 25308605318400.0, "grad_norm": 1.9682215807850432, "language_loss": 0.70706332, "learning_rate": 1.549589825316528e-06, "loss": 0.78410244, "num_input_tokens_seen": 209786115, "router_z_loss_clip": 1.51953125, "router_z_loss_mlp": 0.11694336, "step": 9732, "time_per_iteration": 2.5896852016448975 }, { "auxiliary_loss_clip": 0.06448703, "auxiliary_loss_mlp": 0.01270149, "balance_loss_clip": 0.06290284, "balance_loss_mlp": 0.0125666, "epoch": 0.5851796182173455, "flos": 23594103400320.0, "grad_norm": 3.641270609794164, "language_loss": 0.53388214, "learning_rate": 1.5492103769097075e-06, "loss": 0.61107063, "num_input_tokens_seen": 209806095, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.1348877, "step": 9733, "time_per_iteration": 2.5702083110809326 }, { "auxiliary_loss_clip": 0.06437729, "auxiliary_loss_mlp": 0.01267491, "balance_loss_clip": 0.06285691, "balance_loss_mlp": 0.01255576, "epoch": 0.5852397414700136, "flos": 24828936220800.0, "grad_norm": 2.106947386624899, "language_loss": 0.87743932, "learning_rate": 1.5488309455968739e-06, "loss": 0.95449156, "num_input_tokens_seen": 209823650, "router_z_loss_clip": 1.51953125, "router_z_loss_mlp": 0.11907959, "step": 9734, "time_per_iteration": 2.562469720840454 }, { "auxiliary_loss_clip": 0.06435834, "auxiliary_loss_mlp": 0.01269671, "balance_loss_clip": 0.06289186, "balance_loss_mlp": 0.0125837, "epoch": 0.5852998647226815, "flos": 19943887985280.0, "grad_norm": 1.4232561452253176, "language_loss": 0.72738338, "learning_rate": 1.5484515313924163e-06, "loss": 0.80443841, "num_input_tokens_seen": 209843220, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.11303711, "step": 9735, "time_per_iteration": 2.5485236644744873 }, { "auxiliary_loss_clip": 0.06438299, "auxiliary_loss_mlp": 0.01268859, "balance_loss_clip": 0.06282723, "balance_loss_mlp": 0.01256628, "epoch": 0.5853599879753495, "flos": 16724817354240.0, "grad_norm": 2.440240138636898, "language_loss": 0.73855287, "learning_rate": 1.5480721343107217e-06, "loss": 0.81562448, "num_input_tokens_seen": 209854880, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.12237549, "step": 9736, "time_per_iteration": 2.518160104751587 }, { "auxiliary_loss_clip": 0.06443353, "auxiliary_loss_mlp": 0.01264107, "balance_loss_clip": 0.06290445, "balance_loss_mlp": 0.01252818, "epoch": 0.5854201112280174, "flos": 44466848622720.0, "grad_norm": 1.883349286439458, "language_loss": 0.70977515, "learning_rate": 1.5476927543661772e-06, "loss": 0.78684974, "num_input_tokens_seen": 209877870, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.11297607, "step": 9737, "time_per_iteration": 2.757511854171753 }, { "auxiliary_loss_clip": 0.0643855, "auxiliary_loss_mlp": 0.01271232, "balance_loss_clip": 0.06287815, "balance_loss_mlp": 0.01259794, "epoch": 0.5854802344806854, "flos": 20345375623680.0, "grad_norm": 3.2781174741319226, "language_loss": 0.82712156, "learning_rate": 1.547313391573169e-06, "loss": 0.90421939, "num_input_tokens_seen": 209896690, "router_z_loss_clip": 1.50878906, "router_z_loss_mlp": 0.11437988, "step": 9738, "time_per_iteration": 2.5530271530151367 }, { "auxiliary_loss_clip": 0.06444389, "auxiliary_loss_mlp": 0.01269942, "balance_loss_clip": 0.06287333, "balance_loss_mlp": 0.01258337, "epoch": 0.5855403577333533, "flos": 20927013540480.0, "grad_norm": 1.5796654761412707, "language_loss": 0.68509883, "learning_rate": 1.546934045946082e-06, "loss": 0.7622422, "num_input_tokens_seen": 209914640, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.1159668, "step": 9739, "time_per_iteration": 2.5438337326049805 }, { "auxiliary_loss_clip": 0.06437515, "auxiliary_loss_mlp": 0.01267125, "balance_loss_clip": 0.0628337, "balance_loss_mlp": 0.01255538, "epoch": 0.5856004809860214, "flos": 20454849383040.0, "grad_norm": 2.1845972027301572, "language_loss": 0.58650213, "learning_rate": 1.5465547174993017e-06, "loss": 0.66354859, "num_input_tokens_seen": 209933375, "router_z_loss_clip": 1.54199219, "router_z_loss_mlp": 0.11584473, "step": 9740, "time_per_iteration": 2.5288357734680176 }, { "auxiliary_loss_clip": 0.06436126, "auxiliary_loss_mlp": 0.0126514, "balance_loss_clip": 0.06282683, "balance_loss_mlp": 0.01253714, "epoch": 0.5856606042386893, "flos": 19645962393600.0, "grad_norm": 2.0452016722124764, "language_loss": 0.75452292, "learning_rate": 1.5461754062472113e-06, "loss": 0.83153558, "num_input_tokens_seen": 209952055, "router_z_loss_clip": 1.53320312, "router_z_loss_mlp": 0.11437988, "step": 9741, "time_per_iteration": 2.5473735332489014 }, { "auxiliary_loss_clip": 0.06441082, "auxiliary_loss_mlp": 0.01266701, "balance_loss_clip": 0.06285463, "balance_loss_mlp": 0.01254756, "epoch": 0.5857207274913573, "flos": 21692072044800.0, "grad_norm": 3.1169437608434714, "language_loss": 0.75894046, "learning_rate": 1.5457961122041959e-06, "loss": 0.8360182, "num_input_tokens_seen": 209971190, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.11950684, "step": 9742, "time_per_iteration": 3.912179470062256 }, { "auxiliary_loss_clip": 0.06433364, "auxiliary_loss_mlp": 0.01267552, "balance_loss_clip": 0.0628244, "balance_loss_mlp": 0.01256591, "epoch": 0.5857808507440253, "flos": 23188968109440.0, "grad_norm": 1.6524552225844158, "language_loss": 0.75371951, "learning_rate": 1.5454168353846369e-06, "loss": 0.83072865, "num_input_tokens_seen": 209990695, "router_z_loss_clip": 1.50976562, "router_z_loss_mlp": 0.10968018, "step": 9743, "time_per_iteration": 2.570437431335449 }, { "auxiliary_loss_clip": 0.06436029, "auxiliary_loss_mlp": 0.01267459, "balance_loss_clip": 0.06287532, "balance_loss_mlp": 0.01257022, "epoch": 0.5858409739966932, "flos": 27242683660800.0, "grad_norm": 1.8623049201527386, "language_loss": 0.81360024, "learning_rate": 1.5450375758029172e-06, "loss": 0.89063519, "num_input_tokens_seen": 210010210, "router_z_loss_clip": 1.48339844, "router_z_loss_mlp": 0.10437012, "step": 9744, "time_per_iteration": 4.134809255599976 }, { "auxiliary_loss_clip": 0.06451194, "auxiliary_loss_mlp": 0.01268384, "balance_loss_clip": 0.06291932, "balance_loss_mlp": 0.0125635, "epoch": 0.5859010972493612, "flos": 27862993036800.0, "grad_norm": 1.7193432726310751, "language_loss": 0.71661103, "learning_rate": 1.5446583334734183e-06, "loss": 0.79380679, "num_input_tokens_seen": 210030030, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.12030029, "step": 9745, "time_per_iteration": 2.626021385192871 }, { "auxiliary_loss_clip": 0.06330451, "auxiliary_loss_mlp": 0.01258077, "balance_loss_clip": 0.06268442, "balance_loss_mlp": 0.01256571, "epoch": 0.5859612205020291, "flos": 70029452465280.0, "grad_norm": 0.7099303530857963, "language_loss": 0.53249955, "learning_rate": 1.5442791084105204e-06, "loss": 0.60838485, "num_input_tokens_seen": 210094840, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 0.0150528, "step": 9746, "time_per_iteration": 3.245661497116089 }, { "auxiliary_loss_clip": 0.06442031, "auxiliary_loss_mlp": 0.01265846, "balance_loss_clip": 0.06287606, "balance_loss_mlp": 0.0125349, "epoch": 0.5860213437546972, "flos": 24062032926720.0, "grad_norm": 1.9433547643942037, "language_loss": 0.73073006, "learning_rate": 1.5438999006286054e-06, "loss": 0.80780888, "num_input_tokens_seen": 210114660, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.12359619, "step": 9747, "time_per_iteration": 2.5986924171447754 }, { "auxiliary_loss_clip": 0.06440262, "auxiliary_loss_mlp": 0.01268689, "balance_loss_clip": 0.06285486, "balance_loss_mlp": 0.01257335, "epoch": 0.5860814670073651, "flos": 18952670511360.0, "grad_norm": 1.8061056749681763, "language_loss": 0.81097412, "learning_rate": 1.543520710142051e-06, "loss": 0.88806367, "num_input_tokens_seen": 210132770, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.11364746, "step": 9748, "time_per_iteration": 2.5591213703155518 }, { "auxiliary_loss_clip": 0.06442851, "auxiliary_loss_mlp": 0.01268165, "balance_loss_clip": 0.06287883, "balance_loss_mlp": 0.01256018, "epoch": 0.5861415902600331, "flos": 22567904046720.0, "grad_norm": 1.82791216529395, "language_loss": 0.72742319, "learning_rate": 1.5431415369652375e-06, "loss": 0.80453336, "num_input_tokens_seen": 210151895, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.121521, "step": 9749, "time_per_iteration": 2.5662553310394287 }, { "auxiliary_loss_clip": 0.06433846, "auxiliary_loss_mlp": 0.01266972, "balance_loss_clip": 0.0628438, "balance_loss_mlp": 0.0125576, "epoch": 0.586201713512701, "flos": 14397217511040.0, "grad_norm": 1.9455981093128625, "language_loss": 0.75377381, "learning_rate": 1.5427623811125428e-06, "loss": 0.83078194, "num_input_tokens_seen": 210168040, "router_z_loss_clip": 1.49414062, "router_z_loss_mlp": 0.11224365, "step": 9750, "time_per_iteration": 2.5604193210601807 }, { "auxiliary_loss_clip": 0.06442708, "auxiliary_loss_mlp": 0.01270144, "balance_loss_clip": 0.0629058, "balance_loss_mlp": 0.01258396, "epoch": 0.586261836765369, "flos": 19504357793280.0, "grad_norm": 2.372267119727505, "language_loss": 0.71336436, "learning_rate": 1.542383242598344e-06, "loss": 0.79049289, "num_input_tokens_seen": 210187720, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.11743164, "step": 9751, "time_per_iteration": 2.560314178466797 }, { "auxiliary_loss_clip": 0.06452259, "auxiliary_loss_mlp": 0.012684, "balance_loss_clip": 0.06292893, "balance_loss_mlp": 0.0125481, "epoch": 0.5863219600180369, "flos": 20707688678400.0, "grad_norm": 1.6836379211684396, "language_loss": 0.74898958, "learning_rate": 1.5420041214370184e-06, "loss": 0.82619613, "num_input_tokens_seen": 210206080, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.13598633, "step": 9752, "time_per_iteration": 2.527040719985962 }, { "auxiliary_loss_clip": 0.06440932, "auxiliary_loss_mlp": 0.01268101, "balance_loss_clip": 0.06287034, "balance_loss_mlp": 0.01255924, "epoch": 0.586382083270705, "flos": 19798258389120.0, "grad_norm": 2.447947487090988, "language_loss": 0.77668989, "learning_rate": 1.541625017642943e-06, "loss": 0.85378027, "num_input_tokens_seen": 210225660, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.12182617, "step": 9753, "time_per_iteration": 2.5154032707214355 }, { "auxiliary_loss_clip": 0.06443205, "auxiliary_loss_mlp": 0.0126922, "balance_loss_clip": 0.06294971, "balance_loss_mlp": 0.01258736, "epoch": 0.5864422065233729, "flos": 16504821659520.0, "grad_norm": 2.1035219872447746, "language_loss": 0.71027511, "learning_rate": 1.5412459312304927e-06, "loss": 0.78739941, "num_input_tokens_seen": 210242725, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.1048584, "step": 9754, "time_per_iteration": 2.5177886486053467 }, { "auxiliary_loss_clip": 0.06443174, "auxiliary_loss_mlp": 0.01268567, "balance_loss_clip": 0.0629003, "balance_loss_mlp": 0.01256569, "epoch": 0.5865023297760409, "flos": 20419657868160.0, "grad_norm": 1.5716076835799941, "language_loss": 0.7251761, "learning_rate": 1.540866862214043e-06, "loss": 0.80229354, "num_input_tokens_seen": 210263225, "router_z_loss_clip": 1.53320312, "router_z_loss_mlp": 0.12011719, "step": 9755, "time_per_iteration": 2.5467333793640137 }, { "auxiliary_loss_clip": 0.06328703, "auxiliary_loss_mlp": 0.01252584, "balance_loss_clip": 0.06266369, "balance_loss_mlp": 0.01251156, "epoch": 0.5865624530287089, "flos": 63369386864640.0, "grad_norm": 0.7244869604581504, "language_loss": 0.56973398, "learning_rate": 1.540487810607967e-06, "loss": 0.64554679, "num_input_tokens_seen": 210322310, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.0142746, "step": 9756, "time_per_iteration": 3.143796920776367 }, { "auxiliary_loss_clip": 0.06439408, "auxiliary_loss_mlp": 0.0126733, "balance_loss_clip": 0.06286228, "balance_loss_mlp": 0.01256435, "epoch": 0.5866225762813768, "flos": 27023610360960.0, "grad_norm": 1.6648825849537905, "language_loss": 0.76533055, "learning_rate": 1.5401087764266396e-06, "loss": 0.84239793, "num_input_tokens_seen": 210340845, "router_z_loss_clip": 1.53027344, "router_z_loss_mlp": 0.10900879, "step": 9757, "time_per_iteration": 2.6005451679229736 }, { "auxiliary_loss_clip": 0.06327423, "auxiliary_loss_mlp": 0.01253198, "balance_loss_clip": 0.06265299, "balance_loss_mlp": 0.01251732, "epoch": 0.5866826995340448, "flos": 73007941224960.0, "grad_norm": 0.8441017464838012, "language_loss": 0.6031723, "learning_rate": 1.5397297596844337e-06, "loss": 0.6789785, "num_input_tokens_seen": 210397815, "router_z_loss_clip": 0.62158203, "router_z_loss_mlp": 0.01464844, "step": 9758, "time_per_iteration": 3.183847427368164 }, { "auxiliary_loss_clip": 0.06447764, "auxiliary_loss_mlp": 0.01269989, "balance_loss_clip": 0.06288825, "balance_loss_mlp": 0.01257239, "epoch": 0.5867428227867127, "flos": 21291716436480.0, "grad_norm": 1.9366543395855935, "language_loss": 0.72067463, "learning_rate": 1.5393507603957212e-06, "loss": 0.79785216, "num_input_tokens_seen": 210413900, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.12744141, "step": 9759, "time_per_iteration": 2.6421303749084473 }, { "auxiliary_loss_clip": 0.06439733, "auxiliary_loss_mlp": 0.01268011, "balance_loss_clip": 0.06288613, "balance_loss_mlp": 0.01256377, "epoch": 0.5868029460393808, "flos": 33476356961280.0, "grad_norm": 1.5787249971940063, "language_loss": 0.73324275, "learning_rate": 1.5389717785748742e-06, "loss": 0.8103202, "num_input_tokens_seen": 210434110, "router_z_loss_clip": 1.51074219, "router_z_loss_mlp": 0.11633301, "step": 9760, "time_per_iteration": 2.758173704147339 }, { "auxiliary_loss_clip": 0.06438369, "auxiliary_loss_mlp": 0.01267844, "balance_loss_clip": 0.06285173, "balance_loss_mlp": 0.01255583, "epoch": 0.5868630692920487, "flos": 17894382243840.0, "grad_norm": 1.897120878538931, "language_loss": 0.72536731, "learning_rate": 1.5385928142362637e-06, "loss": 0.80242944, "num_input_tokens_seen": 210451685, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.1227417, "step": 9761, "time_per_iteration": 2.5423831939697266 }, { "auxiliary_loss_clip": 0.0644494, "auxiliary_loss_mlp": 0.01267331, "balance_loss_clip": 0.06286614, "balance_loss_mlp": 0.0125417, "epoch": 0.5869231925447167, "flos": 21041770106880.0, "grad_norm": 1.7099982711519532, "language_loss": 0.75361651, "learning_rate": 1.5382138673942597e-06, "loss": 0.83073926, "num_input_tokens_seen": 210470825, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.13165283, "step": 9762, "time_per_iteration": 2.5924160480499268 }, { "auxiliary_loss_clip": 0.06437378, "auxiliary_loss_mlp": 0.01265812, "balance_loss_clip": 0.06288099, "balance_loss_mlp": 0.01254654, "epoch": 0.5869833157973846, "flos": 74753288974080.0, "grad_norm": 1.281293674031364, "language_loss": 0.72557592, "learning_rate": 1.5378349380632317e-06, "loss": 0.80260777, "num_input_tokens_seen": 210500075, "router_z_loss_clip": 1.49023438, "router_z_loss_mlp": 0.11151123, "step": 9763, "time_per_iteration": 2.9817965030670166 }, { "auxiliary_loss_clip": 0.06438675, "auxiliary_loss_mlp": 0.012679, "balance_loss_clip": 0.06286082, "balance_loss_mlp": 0.01256491, "epoch": 0.5870434390500526, "flos": 17644687476480.0, "grad_norm": 1.5171531880633822, "language_loss": 0.80256128, "learning_rate": 1.53745602625755e-06, "loss": 0.87962699, "num_input_tokens_seen": 210518150, "router_z_loss_clip": 1.52441406, "router_z_loss_mlp": 0.11407471, "step": 9764, "time_per_iteration": 2.550847053527832 }, { "auxiliary_loss_clip": 0.06445014, "auxiliary_loss_mlp": 0.01268713, "balance_loss_clip": 0.06290855, "balance_loss_mlp": 0.01257108, "epoch": 0.5871035623027205, "flos": 21512424890880.0, "grad_norm": 1.8123126653368402, "language_loss": 0.79183018, "learning_rate": 1.5370771319915819e-06, "loss": 0.86896747, "num_input_tokens_seen": 210537760, "router_z_loss_clip": 1.54199219, "router_z_loss_mlp": 0.11602783, "step": 9765, "time_per_iteration": 2.5507140159606934 }, { "auxiliary_loss_clip": 0.06436868, "auxiliary_loss_mlp": 0.01269883, "balance_loss_clip": 0.06285898, "balance_loss_mlp": 0.01258392, "epoch": 0.5871636855553886, "flos": 13556744732160.0, "grad_norm": 1.5861318229968393, "language_loss": 0.83651173, "learning_rate": 1.5366982552796947e-06, "loss": 0.91357929, "num_input_tokens_seen": 210555515, "router_z_loss_clip": 1.50878906, "router_z_loss_mlp": 0.11486816, "step": 9766, "time_per_iteration": 2.514071226119995 }, { "auxiliary_loss_clip": 0.06449139, "auxiliary_loss_mlp": 0.01271329, "balance_loss_clip": 0.06289919, "balance_loss_mlp": 0.01259748, "epoch": 0.5872238088080565, "flos": 26220006178560.0, "grad_norm": 2.9004198694484566, "language_loss": 0.69802201, "learning_rate": 1.536319396136257e-06, "loss": 0.77522671, "num_input_tokens_seen": 210575000, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.11578369, "step": 9767, "time_per_iteration": 4.018352270126343 }, { "auxiliary_loss_clip": 0.06438521, "auxiliary_loss_mlp": 0.01267807, "balance_loss_clip": 0.06283018, "balance_loss_mlp": 0.01255642, "epoch": 0.5872839320607245, "flos": 30673196870400.0, "grad_norm": 1.8473878541071713, "language_loss": 0.6384753, "learning_rate": 1.5359405545756336e-06, "loss": 0.7155385, "num_input_tokens_seen": 210595185, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.12158203, "step": 9768, "time_per_iteration": 2.6088006496429443 }, { "auxiliary_loss_clip": 0.0632575, "auxiliary_loss_mlp": 0.01254223, "balance_loss_clip": 0.06263621, "balance_loss_mlp": 0.01252913, "epoch": 0.5873440553133924, "flos": 60324623925120.0, "grad_norm": 0.6929100633381847, "language_loss": 0.53715628, "learning_rate": 1.5355617306121914e-06, "loss": 0.61295605, "num_input_tokens_seen": 210653210, "router_z_loss_clip": 0.62402344, "router_z_loss_mlp": 0.0131073, "step": 9769, "time_per_iteration": 3.1767563819885254 }, { "auxiliary_loss_clip": 0.06435497, "auxiliary_loss_mlp": 0.01268101, "balance_loss_clip": 0.06284788, "balance_loss_mlp": 0.01256371, "epoch": 0.5874041785660604, "flos": 21545016929280.0, "grad_norm": 1.3255684389923494, "language_loss": 0.70817518, "learning_rate": 1.5351829242602945e-06, "loss": 0.78521121, "num_input_tokens_seen": 210673750, "router_z_loss_clip": 1.50585938, "router_z_loss_mlp": 0.11737061, "step": 9770, "time_per_iteration": 2.5508241653442383 }, { "auxiliary_loss_clip": 0.06435504, "auxiliary_loss_mlp": 0.01266643, "balance_loss_clip": 0.06284553, "balance_loss_mlp": 0.01255175, "epoch": 0.5874643018187284, "flos": 24395778938880.0, "grad_norm": 2.0671238136529984, "language_loss": 0.67557442, "learning_rate": 1.5348041355343077e-06, "loss": 0.7525959, "num_input_tokens_seen": 210692960, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.11456299, "step": 9771, "time_per_iteration": 4.0864174365997314 }, { "auxiliary_loss_clip": 0.06446043, "auxiliary_loss_mlp": 0.01270329, "balance_loss_clip": 0.06288768, "balance_loss_mlp": 0.01257144, "epoch": 0.5875244250713964, "flos": 28155300405120.0, "grad_norm": 1.50330238115963, "language_loss": 0.66782618, "learning_rate": 1.5344253644485954e-06, "loss": 0.74498987, "num_input_tokens_seen": 210714040, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.13183594, "step": 9772, "time_per_iteration": 2.5987396240234375 }, { "auxiliary_loss_clip": 0.06442951, "auxiliary_loss_mlp": 0.01277247, "balance_loss_clip": 0.06285914, "balance_loss_mlp": 0.01264301, "epoch": 0.5875845483240644, "flos": 25819566716160.0, "grad_norm": 2.069590740433966, "language_loss": 0.74892986, "learning_rate": 1.534046611017519e-06, "loss": 0.82613182, "num_input_tokens_seen": 210733710, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.12957764, "step": 9773, "time_per_iteration": 2.5866425037384033 }, { "auxiliary_loss_clip": 0.06439698, "auxiliary_loss_mlp": 0.01268544, "balance_loss_clip": 0.06285241, "balance_loss_mlp": 0.01256909, "epoch": 0.5876446715767323, "flos": 26913843112320.0, "grad_norm": 2.116524053921254, "language_loss": 0.53536928, "learning_rate": 1.5336678752554421e-06, "loss": 0.61245173, "num_input_tokens_seen": 210753580, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.11627197, "step": 9774, "time_per_iteration": 2.5871853828430176 }, { "auxiliary_loss_clip": 0.06437507, "auxiliary_loss_mlp": 0.01265385, "balance_loss_clip": 0.0628425, "balance_loss_mlp": 0.01253255, "epoch": 0.5877047948294003, "flos": 36693750510720.0, "grad_norm": 2.018857045955335, "language_loss": 0.65590423, "learning_rate": 1.5332891571767264e-06, "loss": 0.73293316, "num_input_tokens_seen": 210773495, "router_z_loss_clip": 1.52929688, "router_z_loss_mlp": 0.12127686, "step": 9775, "time_per_iteration": 2.683271884918213 }, { "auxiliary_loss_clip": 0.06439143, "auxiliary_loss_mlp": 0.01267431, "balance_loss_clip": 0.06284795, "balance_loss_mlp": 0.01255701, "epoch": 0.5877649180820682, "flos": 26732057679360.0, "grad_norm": 1.6347344809060531, "language_loss": 0.74425018, "learning_rate": 1.5329104567957326e-06, "loss": 0.821316, "num_input_tokens_seen": 210793645, "router_z_loss_clip": 1.54101562, "router_z_loss_mlp": 0.11730957, "step": 9776, "time_per_iteration": 2.597825527191162 }, { "auxiliary_loss_clip": 0.06434537, "auxiliary_loss_mlp": 0.0126968, "balance_loss_clip": 0.06280975, "balance_loss_mlp": 0.01258194, "epoch": 0.5878250413347362, "flos": 21038457870720.0, "grad_norm": 1.7165833174014358, "language_loss": 0.7511735, "learning_rate": 1.532531774126821e-06, "loss": 0.8282156, "num_input_tokens_seen": 210813415, "router_z_loss_clip": 1.53320312, "router_z_loss_mlp": 0.11480713, "step": 9777, "time_per_iteration": 2.5388636589050293 }, { "auxiliary_loss_clip": 0.06430063, "auxiliary_loss_mlp": 0.01267194, "balance_loss_clip": 0.06282236, "balance_loss_mlp": 0.01256531, "epoch": 0.5878851645874041, "flos": 25491397000320.0, "grad_norm": 1.4607094876960685, "language_loss": 0.74866378, "learning_rate": 1.5321531091843512e-06, "loss": 0.82563633, "num_input_tokens_seen": 210833850, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.10656738, "step": 9778, "time_per_iteration": 2.5895841121673584 }, { "auxiliary_loss_clip": 0.0643494, "auxiliary_loss_mlp": 0.01268679, "balance_loss_clip": 0.06283246, "balance_loss_mlp": 0.0125699, "epoch": 0.5879452878400722, "flos": 23775930760320.0, "grad_norm": 1.8986630556058472, "language_loss": 0.70514035, "learning_rate": 1.5317744619826824e-06, "loss": 0.78217655, "num_input_tokens_seen": 210853115, "router_z_loss_clip": 1.51464844, "router_z_loss_mlp": 0.11688232, "step": 9779, "time_per_iteration": 2.561284065246582 }, { "auxiliary_loss_clip": 0.06435707, "auxiliary_loss_mlp": 0.01265067, "balance_loss_clip": 0.06280959, "balance_loss_mlp": 0.012537, "epoch": 0.5880054110927401, "flos": 17830749467520.0, "grad_norm": 2.1380750068164747, "language_loss": 0.67095977, "learning_rate": 1.5313958325361727e-06, "loss": 0.74796748, "num_input_tokens_seen": 210872090, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.1137085, "step": 9780, "time_per_iteration": 2.5715413093566895 }, { "auxiliary_loss_clip": 0.06441781, "auxiliary_loss_mlp": 0.01270972, "balance_loss_clip": 0.06286077, "balance_loss_mlp": 0.01259415, "epoch": 0.5880655343454081, "flos": 19469417840640.0, "grad_norm": 1.778559001796177, "language_loss": 0.72356379, "learning_rate": 1.5310172208591807e-06, "loss": 0.80069137, "num_input_tokens_seen": 210888490, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.11553955, "step": 9781, "time_per_iteration": 2.512749195098877 }, { "auxiliary_loss_clip": 0.06434388, "auxiliary_loss_mlp": 0.01266942, "balance_loss_clip": 0.06281832, "balance_loss_mlp": 0.01255885, "epoch": 0.588125657598076, "flos": 21403999307520.0, "grad_norm": 1.462138288254418, "language_loss": 0.7029345, "learning_rate": 1.5306386269660622e-06, "loss": 0.77994788, "num_input_tokens_seen": 210908220, "router_z_loss_clip": 1.52539062, "router_z_loss_mlp": 0.1105957, "step": 9782, "time_per_iteration": 3.869157552719116 }, { "auxiliary_loss_clip": 0.06436545, "auxiliary_loss_mlp": 0.01269143, "balance_loss_clip": 0.06279546, "balance_loss_mlp": 0.01257234, "epoch": 0.588185780850744, "flos": 16040246296320.0, "grad_norm": 2.4246477294555975, "language_loss": 0.70626259, "learning_rate": 1.5302600508711741e-06, "loss": 0.78331947, "num_input_tokens_seen": 210923945, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.11907959, "step": 9783, "time_per_iteration": 3.9515111446380615 }, { "auxiliary_loss_clip": 0.0643907, "auxiliary_loss_mlp": 0.01269251, "balance_loss_clip": 0.06282632, "balance_loss_mlp": 0.01256382, "epoch": 0.588245904103412, "flos": 23734282481280.0, "grad_norm": 2.017948459026042, "language_loss": 0.69228524, "learning_rate": 1.5298814925888719e-06, "loss": 0.76936841, "num_input_tokens_seen": 210941955, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.12872314, "step": 9784, "time_per_iteration": 2.592973470687866 }, { "auxiliary_loss_clip": 0.06440543, "auxiliary_loss_mlp": 0.01267841, "balance_loss_clip": 0.06283141, "balance_loss_mlp": 0.01256069, "epoch": 0.58830602735608, "flos": 33810983441280.0, "grad_norm": 1.718189645990313, "language_loss": 0.69513935, "learning_rate": 1.5295029521335102e-06, "loss": 0.77222323, "num_input_tokens_seen": 210963105, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.11767578, "step": 9785, "time_per_iteration": 2.6833627223968506 }, { "auxiliary_loss_clip": 0.06426127, "auxiliary_loss_mlp": 0.01265041, "balance_loss_clip": 0.06276253, "balance_loss_mlp": 0.01253823, "epoch": 0.588366150608748, "flos": 17096144722560.0, "grad_norm": 1.988846812578569, "language_loss": 0.77936792, "learning_rate": 1.5291244295194448e-06, "loss": 0.85627961, "num_input_tokens_seen": 210978720, "router_z_loss_clip": 1.49804688, "router_z_loss_mlp": 0.11212158, "step": 9786, "time_per_iteration": 2.531031370162964 }, { "auxiliary_loss_clip": 0.06438725, "auxiliary_loss_mlp": 0.01268605, "balance_loss_clip": 0.06283806, "balance_loss_mlp": 0.01256618, "epoch": 0.5884262738614159, "flos": 22133698588800.0, "grad_norm": 1.430702935836684, "language_loss": 0.79330707, "learning_rate": 1.5287459247610276e-06, "loss": 0.87038034, "num_input_tokens_seen": 210998750, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.11987305, "step": 9787, "time_per_iteration": 2.5704972743988037 }, { "auxiliary_loss_clip": 0.06433649, "auxiliary_loss_mlp": 0.01266661, "balance_loss_clip": 0.06280968, "balance_loss_mlp": 0.01255449, "epoch": 0.5884863971140839, "flos": 21038038600320.0, "grad_norm": 1.5179475474748843, "language_loss": 0.66448486, "learning_rate": 1.5283674378726116e-06, "loss": 0.74148798, "num_input_tokens_seen": 211017550, "router_z_loss_clip": 1.52734375, "router_z_loss_mlp": 0.11212158, "step": 9788, "time_per_iteration": 2.573810577392578 }, { "auxiliary_loss_clip": 0.06428749, "auxiliary_loss_mlp": 0.01267372, "balance_loss_clip": 0.06279252, "balance_loss_mlp": 0.01256185, "epoch": 0.5885465203667518, "flos": 23811835034880.0, "grad_norm": 2.0971514695716946, "language_loss": 0.80657256, "learning_rate": 1.5279889688685506e-06, "loss": 0.88353378, "num_input_tokens_seen": 211034135, "router_z_loss_clip": 1.49414062, "router_z_loss_mlp": 0.11199951, "step": 9789, "time_per_iteration": 2.649132490158081 }, { "auxiliary_loss_clip": 0.06428101, "auxiliary_loss_mlp": 0.01267486, "balance_loss_clip": 0.06279853, "balance_loss_mlp": 0.01256537, "epoch": 0.5886066436194198, "flos": 18886647893760.0, "grad_norm": 1.9168181465448613, "language_loss": 0.70494395, "learning_rate": 1.5276105177631944e-06, "loss": 0.78189981, "num_input_tokens_seen": 211053850, "router_z_loss_clip": 1.48242188, "router_z_loss_mlp": 0.10943604, "step": 9790, "time_per_iteration": 2.578766107559204 }, { "auxiliary_loss_clip": 0.06429969, "auxiliary_loss_mlp": 0.01267719, "balance_loss_clip": 0.06279568, "balance_loss_mlp": 0.0125634, "epoch": 0.5886667668720877, "flos": 24797015015040.0, "grad_norm": 1.6429387448520387, "language_loss": 0.83795673, "learning_rate": 1.527232084570895e-06, "loss": 0.91493362, "num_input_tokens_seen": 211072165, "router_z_loss_clip": 1.50195312, "router_z_loss_mlp": 0.11364746, "step": 9791, "time_per_iteration": 2.583083391189575 }, { "auxiliary_loss_clip": 0.06434163, "auxiliary_loss_mlp": 0.01273086, "balance_loss_clip": 0.0628079, "balance_loss_mlp": 0.01261713, "epoch": 0.5887268901247558, "flos": 21620473130880.0, "grad_norm": 1.7222301524276702, "language_loss": 0.76384592, "learning_rate": 1.5268536693060026e-06, "loss": 0.84091848, "num_input_tokens_seen": 211089630, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.11383057, "step": 9792, "time_per_iteration": 2.559797763824463 }, { "auxiliary_loss_clip": 0.06438933, "auxiliary_loss_mlp": 0.01267938, "balance_loss_clip": 0.06282199, "balance_loss_mlp": 0.01256, "epoch": 0.5887870133774237, "flos": 20487357567360.0, "grad_norm": 1.956608120225435, "language_loss": 0.69578815, "learning_rate": 1.5264752719828662e-06, "loss": 0.77285689, "num_input_tokens_seen": 211106120, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.1194458, "step": 9793, "time_per_iteration": 2.5297398567199707 }, { "auxiliary_loss_clip": 0.06428653, "auxiliary_loss_mlp": 0.0126855, "balance_loss_clip": 0.06278899, "balance_loss_mlp": 0.01257523, "epoch": 0.5888471366300917, "flos": 19211966570880.0, "grad_norm": 1.7632418716399645, "language_loss": 0.60348392, "learning_rate": 1.5260968926158353e-06, "loss": 0.68045592, "num_input_tokens_seen": 211122450, "router_z_loss_clip": 1.49511719, "router_z_loss_mlp": 0.11035156, "step": 9794, "time_per_iteration": 2.5605039596557617 }, { "auxiliary_loss_clip": 0.06438771, "auxiliary_loss_mlp": 0.0127422, "balance_loss_clip": 0.0628233, "balance_loss_mlp": 0.01262568, "epoch": 0.5889072598827596, "flos": 19978786010880.0, "grad_norm": 1.4087978317149321, "language_loss": 0.65313715, "learning_rate": 1.525718531219257e-06, "loss": 0.73026705, "num_input_tokens_seen": 211141765, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.11645508, "step": 9795, "time_per_iteration": 2.57472825050354 }, { "auxiliary_loss_clip": 0.06426486, "auxiliary_loss_mlp": 0.01266952, "balance_loss_clip": 0.06277507, "balance_loss_mlp": 0.01256337, "epoch": 0.5889673831354276, "flos": 20747617948800.0, "grad_norm": 1.5389430161085835, "language_loss": 0.74493253, "learning_rate": 1.5253401878074801e-06, "loss": 0.82186699, "num_input_tokens_seen": 211160475, "router_z_loss_clip": 1.49023438, "router_z_loss_mlp": 0.10620117, "step": 9796, "time_per_iteration": 2.5709753036499023 }, { "auxiliary_loss_clip": 0.06435466, "auxiliary_loss_mlp": 0.01268096, "balance_loss_clip": 0.06283779, "balance_loss_mlp": 0.01256789, "epoch": 0.5890275063880956, "flos": 25307892558720.0, "grad_norm": 1.7749322398796852, "language_loss": 0.8311103, "learning_rate": 1.5249618623948507e-06, "loss": 0.9081459, "num_input_tokens_seen": 211180480, "router_z_loss_clip": 1.51660156, "router_z_loss_mlp": 0.11303711, "step": 9797, "time_per_iteration": 2.5768351554870605 }, { "auxiliary_loss_clip": 0.06432475, "auxiliary_loss_mlp": 0.01264778, "balance_loss_clip": 0.06282443, "balance_loss_mlp": 0.0125334, "epoch": 0.5890876296407636, "flos": 11770182702720.0, "grad_norm": 1.739730924549936, "language_loss": 0.79650223, "learning_rate": 1.5245835549957152e-06, "loss": 0.87347472, "num_input_tokens_seen": 211198000, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.11437988, "step": 9798, "time_per_iteration": 2.535057783126831 }, { "auxiliary_loss_clip": 0.06429428, "auxiliary_loss_mlp": 0.01265895, "balance_loss_clip": 0.06280401, "balance_loss_mlp": 0.01255274, "epoch": 0.5891477528934316, "flos": 13594535723520.0, "grad_norm": 2.2787580654932174, "language_loss": 0.74742579, "learning_rate": 1.5242052656244186e-06, "loss": 0.82437909, "num_input_tokens_seen": 211214765, "router_z_loss_clip": 1.49121094, "router_z_loss_mlp": 0.10620117, "step": 9799, "time_per_iteration": 2.515730857849121 }, { "auxiliary_loss_clip": 0.06436978, "auxiliary_loss_mlp": 0.01270597, "balance_loss_clip": 0.06281833, "balance_loss_mlp": 0.01258169, "epoch": 0.5892078761460995, "flos": 15054563191680.0, "grad_norm": 2.295978527863033, "language_loss": 0.77225691, "learning_rate": 1.5238269942953064e-06, "loss": 0.84933263, "num_input_tokens_seen": 211232335, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.12432861, "step": 9800, "time_per_iteration": 2.5166876316070557 }, { "auxiliary_loss_clip": 0.06439231, "auxiliary_loss_mlp": 0.01267025, "balance_loss_clip": 0.06284299, "balance_loss_mlp": 0.01255133, "epoch": 0.5892679993987675, "flos": 15783591640320.0, "grad_norm": 1.7673871650443849, "language_loss": 0.79077709, "learning_rate": 1.523448741022722e-06, "loss": 0.86783957, "num_input_tokens_seen": 211249985, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.11883545, "step": 9801, "time_per_iteration": 2.5155630111694336 }, { "auxiliary_loss_clip": 0.0643867, "auxiliary_loss_mlp": 0.01266458, "balance_loss_clip": 0.06282001, "balance_loss_mlp": 0.01255086, "epoch": 0.5893281226514354, "flos": 25272281773440.0, "grad_norm": 1.8362723717415668, "language_loss": 0.66286349, "learning_rate": 1.5230705058210088e-06, "loss": 0.73991472, "num_input_tokens_seen": 211268425, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.1137085, "step": 9802, "time_per_iteration": 2.591155529022217 }, { "auxiliary_loss_clip": 0.06431854, "auxiliary_loss_mlp": 0.01269539, "balance_loss_clip": 0.06282787, "balance_loss_mlp": 0.01258316, "epoch": 0.5893882459041034, "flos": 19463380346880.0, "grad_norm": 1.6882318227905517, "language_loss": 0.78411657, "learning_rate": 1.5226922887045108e-06, "loss": 0.86113048, "num_input_tokens_seen": 211286680, "router_z_loss_clip": 1.49023438, "router_z_loss_mlp": 0.11224365, "step": 9803, "time_per_iteration": 2.5454840660095215 }, { "auxiliary_loss_clip": 0.06436983, "auxiliary_loss_mlp": 0.01266289, "balance_loss_clip": 0.06282736, "balance_loss_mlp": 0.01255304, "epoch": 0.5894483691567713, "flos": 20640785592960.0, "grad_norm": 1.479089884256337, "language_loss": 0.7332747, "learning_rate": 1.5223140896875686e-06, "loss": 0.81030738, "num_input_tokens_seen": 211307700, "router_z_loss_clip": 1.54199219, "router_z_loss_mlp": 0.10986328, "step": 9804, "time_per_iteration": 2.586369037628174 }, { "auxiliary_loss_clip": 0.06433149, "auxiliary_loss_mlp": 0.01267609, "balance_loss_clip": 0.06283392, "balance_loss_mlp": 0.01256881, "epoch": 0.5895084924094394, "flos": 17782812132480.0, "grad_norm": 1.65666705895697, "language_loss": 0.74728632, "learning_rate": 1.5219359087845234e-06, "loss": 0.82429397, "num_input_tokens_seen": 211324835, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.10736084, "step": 9805, "time_per_iteration": 2.5232326984405518 }, { "auxiliary_loss_clip": 0.06446596, "auxiliary_loss_mlp": 0.0126962, "balance_loss_clip": 0.06284489, "balance_loss_mlp": 0.01257359, "epoch": 0.5895686156621073, "flos": 20127350499840.0, "grad_norm": 17.084571244275192, "language_loss": 0.7845304, "learning_rate": 1.5215577460097174e-06, "loss": 0.86169255, "num_input_tokens_seen": 211344130, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.12268066, "step": 9806, "time_per_iteration": 4.01687216758728 }, { "auxiliary_loss_clip": 0.06437506, "auxiliary_loss_mlp": 0.01270778, "balance_loss_clip": 0.062856, "balance_loss_mlp": 0.01258875, "epoch": 0.5896287389147753, "flos": 20856337021440.0, "grad_norm": 1.7374468665209781, "language_loss": 0.77193558, "learning_rate": 1.5211796013774887e-06, "loss": 0.84901845, "num_input_tokens_seen": 211362915, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.11901855, "step": 9807, "time_per_iteration": 2.6844370365142822 }, { "auxiliary_loss_clip": 0.06440706, "auxiliary_loss_mlp": 0.01270556, "balance_loss_clip": 0.06283893, "balance_loss_mlp": 0.01259052, "epoch": 0.5896888621674432, "flos": 14543098669440.0, "grad_norm": 1.8985928987656093, "language_loss": 0.74953133, "learning_rate": 1.5208014749021786e-06, "loss": 0.82664394, "num_input_tokens_seen": 211380700, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.11499023, "step": 9808, "time_per_iteration": 2.723026752471924 }, { "auxiliary_loss_clip": 0.06433095, "auxiliary_loss_mlp": 0.01265333, "balance_loss_clip": 0.0627981, "balance_loss_mlp": 0.01252971, "epoch": 0.5897489854201112, "flos": 20893079836800.0, "grad_norm": 1.8751933981513773, "language_loss": 0.7209996, "learning_rate": 1.5204233665981236e-06, "loss": 0.79798388, "num_input_tokens_seen": 211400095, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.12365723, "step": 9809, "time_per_iteration": 2.5977818965911865 }, { "auxiliary_loss_clip": 0.06441005, "auxiliary_loss_mlp": 0.01268231, "balance_loss_clip": 0.06283481, "balance_loss_mlp": 0.01256322, "epoch": 0.5898091086727792, "flos": 20017331688960.0, "grad_norm": 1.8975507699300476, "language_loss": 0.82906032, "learning_rate": 1.5200452764796627e-06, "loss": 0.90615273, "num_input_tokens_seen": 211417810, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.11907959, "step": 9810, "time_per_iteration": 2.546125650405884 }, { "auxiliary_loss_clip": 0.0643237, "auxiliary_loss_mlp": 0.01267864, "balance_loss_clip": 0.0628425, "balance_loss_mlp": 0.01256527, "epoch": 0.5898692319254472, "flos": 16258816471680.0, "grad_norm": 1.5244563579346126, "language_loss": 0.81802309, "learning_rate": 1.5196672045611336e-06, "loss": 0.89502543, "num_input_tokens_seen": 211436020, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.11334229, "step": 9811, "time_per_iteration": 3.9894216060638428 }, { "auxiliary_loss_clip": 0.064395, "auxiliary_loss_mlp": 0.01267249, "balance_loss_clip": 0.0628332, "balance_loss_mlp": 0.01254911, "epoch": 0.5899293551781152, "flos": 20454723601920.0, "grad_norm": 1.9583813446842517, "language_loss": 0.76646632, "learning_rate": 1.5192891508568715e-06, "loss": 0.84353387, "num_input_tokens_seen": 211454335, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.12341309, "step": 9812, "time_per_iteration": 2.6131088733673096 }, { "auxiliary_loss_clip": 0.06431956, "auxiliary_loss_mlp": 0.01266285, "balance_loss_clip": 0.06280665, "balance_loss_mlp": 0.0125629, "epoch": 0.5899894784307831, "flos": 13886885018880.0, "grad_norm": 1.7387970299162898, "language_loss": 0.71291089, "learning_rate": 1.5189111153812133e-06, "loss": 0.78989333, "num_input_tokens_seen": 211472775, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.09991455, "step": 9813, "time_per_iteration": 2.5296366214752197 }, { "auxiliary_loss_clip": 0.06434891, "auxiliary_loss_mlp": 0.01272182, "balance_loss_clip": 0.06280577, "balance_loss_mlp": 0.0126072, "epoch": 0.5900496016834511, "flos": 20089936851840.0, "grad_norm": 1.5110254936112661, "language_loss": 0.72858584, "learning_rate": 1.518533098148494e-06, "loss": 0.80565655, "num_input_tokens_seen": 211492195, "router_z_loss_clip": 1.54199219, "router_z_loss_mlp": 0.11468506, "step": 9814, "time_per_iteration": 2.543771505355835 }, { "auxiliary_loss_clip": 0.06435396, "auxiliary_loss_mlp": 0.01268537, "balance_loss_clip": 0.06283058, "balance_loss_mlp": 0.01256926, "epoch": 0.590109724936119, "flos": 20264133490560.0, "grad_norm": 1.9305722737955855, "language_loss": 0.78629887, "learning_rate": 1.5181550991730476e-06, "loss": 0.86333817, "num_input_tokens_seen": 211510220, "router_z_loss_clip": 1.52441406, "router_z_loss_mlp": 0.1161499, "step": 9815, "time_per_iteration": 2.584695339202881 }, { "auxiliary_loss_clip": 0.06442399, "auxiliary_loss_mlp": 0.0126627, "balance_loss_clip": 0.06281479, "balance_loss_mlp": 0.01253729, "epoch": 0.590169848188787, "flos": 24240548050560.0, "grad_norm": 2.038799169710357, "language_loss": 0.76569295, "learning_rate": 1.5177771184692083e-06, "loss": 0.84277964, "num_input_tokens_seen": 211526260, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.12554932, "step": 9816, "time_per_iteration": 2.608908176422119 }, { "auxiliary_loss_clip": 0.06434651, "auxiliary_loss_mlp": 0.01266341, "balance_loss_clip": 0.06284251, "balance_loss_mlp": 0.01255225, "epoch": 0.590229971441455, "flos": 17790400926720.0, "grad_norm": 1.806449341639089, "language_loss": 0.81359631, "learning_rate": 1.517399156051309e-06, "loss": 0.89060622, "num_input_tokens_seen": 211542890, "router_z_loss_clip": 1.50292969, "router_z_loss_mlp": 0.11114502, "step": 9817, "time_per_iteration": 2.53886079788208 }, { "auxiliary_loss_clip": 0.06436067, "auxiliary_loss_mlp": 0.01267096, "balance_loss_clip": 0.06282274, "balance_loss_mlp": 0.01256576, "epoch": 0.590290094694123, "flos": 22243465837440.0, "grad_norm": 1.6347939762320973, "language_loss": 0.77008438, "learning_rate": 1.517021211933682e-06, "loss": 0.84711599, "num_input_tokens_seen": 211562685, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.10516357, "step": 9818, "time_per_iteration": 2.5604588985443115 }, { "auxiliary_loss_clip": 0.06430773, "auxiliary_loss_mlp": 0.01264277, "balance_loss_clip": 0.06281346, "balance_loss_mlp": 0.01253989, "epoch": 0.5903502179467909, "flos": 19104589163520.0, "grad_norm": 1.914250559735641, "language_loss": 0.66958427, "learning_rate": 1.5166432861306592e-06, "loss": 0.74653482, "num_input_tokens_seen": 211579960, "router_z_loss_clip": 1.49121094, "router_z_loss_mlp": 0.10284424, "step": 9819, "time_per_iteration": 2.550370693206787 }, { "auxiliary_loss_clip": 0.06436377, "auxiliary_loss_mlp": 0.01268717, "balance_loss_clip": 0.06283275, "balance_loss_mlp": 0.01257523, "epoch": 0.5904103411994589, "flos": 24241051175040.0, "grad_norm": 1.7462642954810825, "language_loss": 0.78153378, "learning_rate": 1.5162653786565714e-06, "loss": 0.8585847, "num_input_tokens_seen": 211599310, "router_z_loss_clip": 1.53027344, "router_z_loss_mlp": 0.11199951, "step": 9820, "time_per_iteration": 2.6679062843322754 }, { "auxiliary_loss_clip": 0.06329969, "auxiliary_loss_mlp": 0.01252672, "balance_loss_clip": 0.06267247, "balance_loss_mlp": 0.01251046, "epoch": 0.5904704644521268, "flos": 64894388774400.0, "grad_norm": 0.9142214023733913, "language_loss": 0.65157962, "learning_rate": 1.5158874895257487e-06, "loss": 0.72740597, "num_input_tokens_seen": 211658790, "router_z_loss_clip": 0.62548828, "router_z_loss_mlp": 0.01628113, "step": 9821, "time_per_iteration": 4.588109254837036 }, { "auxiliary_loss_clip": 0.06433837, "auxiliary_loss_mlp": 0.01265168, "balance_loss_clip": 0.06283422, "balance_loss_mlp": 0.0125438, "epoch": 0.5905305877047948, "flos": 19616137539840.0, "grad_norm": 2.1187109732721536, "language_loss": 0.61725986, "learning_rate": 1.515509618752521e-06, "loss": 0.69424993, "num_input_tokens_seen": 211677240, "router_z_loss_clip": 1.50292969, "router_z_loss_mlp": 0.10791016, "step": 9822, "time_per_iteration": 2.5616707801818848 }, { "auxiliary_loss_clip": 0.06436379, "auxiliary_loss_mlp": 0.01269353, "balance_loss_clip": 0.06283018, "balance_loss_mlp": 0.01257498, "epoch": 0.5905907109574628, "flos": 18995660455680.0, "grad_norm": 1.8831188009390465, "language_loss": 0.83002615, "learning_rate": 1.5151317663512173e-06, "loss": 0.90708339, "num_input_tokens_seen": 211695485, "router_z_loss_clip": 1.53320312, "router_z_loss_mlp": 0.11859131, "step": 9823, "time_per_iteration": 3.9946348667144775 }, { "auxiliary_loss_clip": 0.06433153, "auxiliary_loss_mlp": 0.01267232, "balance_loss_clip": 0.06283253, "balance_loss_mlp": 0.0125633, "epoch": 0.5906508342101308, "flos": 22206974584320.0, "grad_norm": 1.9225947129964143, "language_loss": 0.73918182, "learning_rate": 1.514753932336165e-06, "loss": 0.81618565, "num_input_tokens_seen": 211713090, "router_z_loss_clip": 1.49804688, "router_z_loss_mlp": 0.10894775, "step": 9824, "time_per_iteration": 2.565173625946045 }, { "auxiliary_loss_clip": 0.06447834, "auxiliary_loss_mlp": 0.01267369, "balance_loss_clip": 0.06285532, "balance_loss_mlp": 0.012549, "epoch": 0.5907109574627988, "flos": 20892995982720.0, "grad_norm": 2.1579729651207162, "language_loss": 0.8295483, "learning_rate": 1.514376116721693e-06, "loss": 0.90670037, "num_input_tokens_seen": 211732510, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.12469482, "step": 9825, "time_per_iteration": 2.572214365005493 }, { "auxiliary_loss_clip": 0.06431839, "auxiliary_loss_mlp": 0.01262229, "balance_loss_clip": 0.06285214, "balance_loss_mlp": 0.01252508, "epoch": 0.5907710807154667, "flos": 21513011869440.0, "grad_norm": 1.672932527923014, "language_loss": 0.76835835, "learning_rate": 1.5139983195221272e-06, "loss": 0.84529901, "num_input_tokens_seen": 211748695, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.09716797, "step": 9826, "time_per_iteration": 2.5543980598449707 }, { "auxiliary_loss_clip": 0.06432492, "auxiliary_loss_mlp": 0.01263779, "balance_loss_clip": 0.06283043, "balance_loss_mlp": 0.01253158, "epoch": 0.5908312039681347, "flos": 22024979516160.0, "grad_norm": 1.7599692812525964, "language_loss": 0.72648352, "learning_rate": 1.513620540751793e-06, "loss": 0.80344623, "num_input_tokens_seen": 211768545, "router_z_loss_clip": 1.49414062, "router_z_loss_mlp": 0.10620117, "step": 9827, "time_per_iteration": 2.545254707336426 }, { "auxiliary_loss_clip": 0.06436982, "auxiliary_loss_mlp": 0.01267444, "balance_loss_clip": 0.06283478, "balance_loss_mlp": 0.01256542, "epoch": 0.5908913272208026, "flos": 18485579525760.0, "grad_norm": 1.8290561701384551, "language_loss": 0.8010264, "learning_rate": 1.5132427804250178e-06, "loss": 0.87807071, "num_input_tokens_seen": 211786665, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.10894775, "step": 9828, "time_per_iteration": 2.526198387145996 }, { "auxiliary_loss_clip": 0.06440198, "auxiliary_loss_mlp": 0.01271064, "balance_loss_clip": 0.06285995, "balance_loss_mlp": 0.0125869, "epoch": 0.5909514504734706, "flos": 12317006448000.0, "grad_norm": 1.9040023266979373, "language_loss": 0.88027847, "learning_rate": 1.5128650385561241e-06, "loss": 0.95739114, "num_input_tokens_seen": 211801215, "router_z_loss_clip": 1.54003906, "router_z_loss_mlp": 0.12384033, "step": 9829, "time_per_iteration": 2.554924249649048 }, { "auxiliary_loss_clip": 0.06332355, "auxiliary_loss_mlp": 0.01255648, "balance_loss_clip": 0.06269507, "balance_loss_mlp": 0.0125417, "epoch": 0.5910115737261386, "flos": 70233557811840.0, "grad_norm": 0.749430751800545, "language_loss": 0.57872152, "learning_rate": 1.5124873151594376e-06, "loss": 0.65460157, "num_input_tokens_seen": 211857005, "router_z_loss_clip": 0.62841797, "router_z_loss_mlp": 0.01477051, "step": 9830, "time_per_iteration": 3.086843729019165 }, { "auxiliary_loss_clip": 0.06451081, "auxiliary_loss_mlp": 0.01267079, "balance_loss_clip": 0.06293295, "balance_loss_mlp": 0.01254491, "epoch": 0.5910716969788066, "flos": 22024266756480.0, "grad_norm": 2.157651978522792, "language_loss": 0.76344538, "learning_rate": 1.5121096102492812e-06, "loss": 0.84062696, "num_input_tokens_seen": 211876675, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.12579346, "step": 9831, "time_per_iteration": 2.557091236114502 }, { "auxiliary_loss_clip": 0.06426524, "auxiliary_loss_mlp": 0.01262128, "balance_loss_clip": 0.06282151, "balance_loss_mlp": 0.01251983, "epoch": 0.5911318202314745, "flos": 21258034295040.0, "grad_norm": 1.7142219776998984, "language_loss": 0.7768361, "learning_rate": 1.5117319238399767e-06, "loss": 0.85372263, "num_input_tokens_seen": 211895725, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.1015625, "step": 9832, "time_per_iteration": 2.5519001483917236 }, { "auxiliary_loss_clip": 0.06434076, "auxiliary_loss_mlp": 0.01264505, "balance_loss_clip": 0.06284998, "balance_loss_mlp": 0.0125352, "epoch": 0.5911919434841425, "flos": 17827353377280.0, "grad_norm": 1.7940780455223637, "language_loss": 0.83402604, "learning_rate": 1.511354255945847e-06, "loss": 0.91101182, "num_input_tokens_seen": 211913860, "router_z_loss_clip": 1.48925781, "router_z_loss_mlp": 0.10980225, "step": 9833, "time_per_iteration": 2.5191593170166016 }, { "auxiliary_loss_clip": 0.06439824, "auxiliary_loss_mlp": 0.01270379, "balance_loss_clip": 0.062865, "balance_loss_mlp": 0.01259924, "epoch": 0.5912520667368104, "flos": 20380818700800.0, "grad_norm": 1.6320861595391265, "language_loss": 0.74896604, "learning_rate": 1.5109766065812123e-06, "loss": 0.82606804, "num_input_tokens_seen": 211932880, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.10449219, "step": 9834, "time_per_iteration": 2.559857130050659 }, { "auxiliary_loss_clip": 0.06439359, "auxiliary_loss_mlp": 0.01269563, "balance_loss_clip": 0.062876, "balance_loss_mlp": 0.01258763, "epoch": 0.5913121899894784, "flos": 17936240158080.0, "grad_norm": 2.48716272111455, "language_loss": 0.78000486, "learning_rate": 1.5105989757603942e-06, "loss": 0.85709405, "num_input_tokens_seen": 211948625, "router_z_loss_clip": 1.51757812, "router_z_loss_mlp": 0.10809326, "step": 9835, "time_per_iteration": 2.612311840057373 }, { "auxiliary_loss_clip": 0.06437633, "auxiliary_loss_mlp": 0.01267445, "balance_loss_clip": 0.06282891, "balance_loss_mlp": 0.01256365, "epoch": 0.5913723132421465, "flos": 22133405099520.0, "grad_norm": 2.1332130260676445, "language_loss": 0.74220228, "learning_rate": 1.5102213634977117e-06, "loss": 0.81925297, "num_input_tokens_seen": 211965355, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.11083984, "step": 9836, "time_per_iteration": 2.5578489303588867 }, { "auxiliary_loss_clip": 0.06442137, "auxiliary_loss_mlp": 0.01266364, "balance_loss_clip": 0.06287423, "balance_loss_mlp": 0.01255087, "epoch": 0.5914324364948144, "flos": 15702056017920.0, "grad_norm": 2.0394010607515365, "language_loss": 0.82489049, "learning_rate": 1.5098437698074841e-06, "loss": 0.90197545, "num_input_tokens_seen": 211982245, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.11273193, "step": 9837, "time_per_iteration": 2.505514621734619 }, { "auxiliary_loss_clip": 0.06440172, "auxiliary_loss_mlp": 0.012672, "balance_loss_clip": 0.06285985, "balance_loss_mlp": 0.01255166, "epoch": 0.5914925597474824, "flos": 22753924110720.0, "grad_norm": 1.6690703446643895, "language_loss": 0.79741848, "learning_rate": 1.5094661947040304e-06, "loss": 0.87449223, "num_input_tokens_seen": 212000250, "router_z_loss_clip": 1.54199219, "router_z_loss_mlp": 0.12030029, "step": 9838, "time_per_iteration": 2.5503780841827393 }, { "auxiliary_loss_clip": 0.06444836, "auxiliary_loss_mlp": 0.01268409, "balance_loss_clip": 0.06291915, "balance_loss_mlp": 0.01257352, "epoch": 0.5915526830001503, "flos": 18298092015360.0, "grad_norm": 2.1632369135642526, "language_loss": 0.69682759, "learning_rate": 1.5090886382016673e-06, "loss": 0.77395999, "num_input_tokens_seen": 212017505, "router_z_loss_clip": 1.53027344, "router_z_loss_mlp": 0.11053467, "step": 9839, "time_per_iteration": 2.513383626937866 }, { "auxiliary_loss_clip": 0.06441946, "auxiliary_loss_mlp": 0.01265782, "balance_loss_clip": 0.06287286, "balance_loss_mlp": 0.01254451, "epoch": 0.5916128062528183, "flos": 17024713516800.0, "grad_norm": 2.160785854767025, "language_loss": 0.65734404, "learning_rate": 1.5087111003147124e-06, "loss": 0.73442131, "num_input_tokens_seen": 212034595, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.11334229, "step": 9840, "time_per_iteration": 2.5746943950653076 }, { "auxiliary_loss_clip": 0.06440079, "auxiliary_loss_mlp": 0.01269319, "balance_loss_clip": 0.06284812, "balance_loss_mlp": 0.01256832, "epoch": 0.5916729295054862, "flos": 24761194594560.0, "grad_norm": 1.7055341153410017, "language_loss": 0.82036841, "learning_rate": 1.5083335810574813e-06, "loss": 0.89746237, "num_input_tokens_seen": 212055775, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.12487793, "step": 9841, "time_per_iteration": 2.581669807434082 }, { "auxiliary_loss_clip": 0.06437062, "auxiliary_loss_mlp": 0.01266843, "balance_loss_clip": 0.06286447, "balance_loss_mlp": 0.0125671, "epoch": 0.5917330527581542, "flos": 15963196867200.0, "grad_norm": 1.5932667313425632, "language_loss": 0.69337249, "learning_rate": 1.507956080444291e-06, "loss": 0.77041155, "num_input_tokens_seen": 212074000, "router_z_loss_clip": 1.50683594, "router_z_loss_mlp": 0.10137939, "step": 9842, "time_per_iteration": 2.5219664573669434 }, { "auxiliary_loss_clip": 0.06449021, "auxiliary_loss_mlp": 0.01267231, "balance_loss_clip": 0.06294715, "balance_loss_mlp": 0.01255709, "epoch": 0.5917931760108222, "flos": 23806719936000.0, "grad_norm": 1.7707742339806318, "language_loss": 0.82805133, "learning_rate": 1.5075785984894549e-06, "loss": 0.90521383, "num_input_tokens_seen": 212091415, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.11523438, "step": 9843, "time_per_iteration": 2.5871758460998535 }, { "auxiliary_loss_clip": 0.06445034, "auxiliary_loss_mlp": 0.01264142, "balance_loss_clip": 0.06291312, "balance_loss_mlp": 0.01252137, "epoch": 0.5918532992634902, "flos": 23254864945920.0, "grad_norm": 2.6927362871605123, "language_loss": 0.82024014, "learning_rate": 1.5072011352072875e-06, "loss": 0.89733183, "num_input_tokens_seen": 212105255, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.12005615, "step": 9844, "time_per_iteration": 2.5092813968658447 }, { "auxiliary_loss_clip": 0.0644476, "auxiliary_loss_mlp": 0.01264573, "balance_loss_clip": 0.06290759, "balance_loss_mlp": 0.0125326, "epoch": 0.5919134225161581, "flos": 19505867166720.0, "grad_norm": 1.974012285414727, "language_loss": 0.75055099, "learning_rate": 1.5068236906121032e-06, "loss": 0.82764435, "num_input_tokens_seen": 212122765, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.11322021, "step": 9845, "time_per_iteration": 2.5138399600982666 }, { "auxiliary_loss_clip": 0.06441551, "auxiliary_loss_mlp": 0.01270003, "balance_loss_clip": 0.06286258, "balance_loss_mlp": 0.0125773, "epoch": 0.5919735457688261, "flos": 38810201264640.0, "grad_norm": 1.7560774326593043, "language_loss": 0.64189619, "learning_rate": 1.506446264718213e-06, "loss": 0.71901172, "num_input_tokens_seen": 212143960, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.12268066, "step": 9846, "time_per_iteration": 4.093703269958496 }, { "auxiliary_loss_clip": 0.06436861, "auxiliary_loss_mlp": 0.01269138, "balance_loss_clip": 0.06292445, "balance_loss_mlp": 0.01259357, "epoch": 0.592033669021494, "flos": 22170567185280.0, "grad_norm": 1.8815916982314886, "language_loss": 0.75928205, "learning_rate": 1.506068857539931e-06, "loss": 0.8363421, "num_input_tokens_seen": 212162005, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.09771729, "step": 9847, "time_per_iteration": 2.5521299839019775 }, { "auxiliary_loss_clip": 0.06444171, "auxiliary_loss_mlp": 0.012677, "balance_loss_clip": 0.06290285, "balance_loss_mlp": 0.01255636, "epoch": 0.592093792274162, "flos": 22717600565760.0, "grad_norm": 2.4826718095707734, "language_loss": 0.62289429, "learning_rate": 1.5056914690915667e-06, "loss": 0.70001304, "num_input_tokens_seen": 212181635, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.1206665, "step": 9848, "time_per_iteration": 2.555513381958008 }, { "auxiliary_loss_clip": 0.0644556, "auxiliary_loss_mlp": 0.01263537, "balance_loss_clip": 0.0629057, "balance_loss_mlp": 0.01252063, "epoch": 0.59215391552683, "flos": 22535605497600.0, "grad_norm": 1.7232833384375272, "language_loss": 0.76968038, "learning_rate": 1.5053140993874312e-06, "loss": 0.84677136, "num_input_tokens_seen": 212201615, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.11480713, "step": 9849, "time_per_iteration": 2.556056499481201 }, { "auxiliary_loss_clip": 0.06447078, "auxiliary_loss_mlp": 0.01270254, "balance_loss_clip": 0.06292448, "balance_loss_mlp": 0.01258506, "epoch": 0.592214038779498, "flos": 24505965457920.0, "grad_norm": 1.6798711267751556, "language_loss": 0.75629497, "learning_rate": 1.5049367484418353e-06, "loss": 0.83346832, "num_input_tokens_seen": 212219355, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.11749268, "step": 9850, "time_per_iteration": 4.000406742095947 }, { "auxiliary_loss_clip": 0.06441056, "auxiliary_loss_mlp": 0.01267894, "balance_loss_clip": 0.06289975, "balance_loss_mlp": 0.01256677, "epoch": 0.592274162032166, "flos": 21837156589440.0, "grad_norm": 1.675098204798543, "language_loss": 0.76082003, "learning_rate": 1.5045594162690868e-06, "loss": 0.83790946, "num_input_tokens_seen": 212236710, "router_z_loss_clip": 1.50878906, "router_z_loss_mlp": 0.11218262, "step": 9851, "time_per_iteration": 2.5677475929260254 }, { "auxiliary_loss_clip": 0.06442157, "auxiliary_loss_mlp": 0.01266636, "balance_loss_clip": 0.06287628, "balance_loss_mlp": 0.01255407, "epoch": 0.5923342852848339, "flos": 24615061873920.0, "grad_norm": 1.9490123019898682, "language_loss": 0.70822132, "learning_rate": 1.5041821028834954e-06, "loss": 0.78530926, "num_input_tokens_seen": 212256195, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.11230469, "step": 9852, "time_per_iteration": 2.5972275733947754 }, { "auxiliary_loss_clip": 0.06449488, "auxiliary_loss_mlp": 0.01276619, "balance_loss_clip": 0.06292251, "balance_loss_mlp": 0.01264638, "epoch": 0.5923944085375019, "flos": 19944307255680.0, "grad_norm": 1.9200039183094213, "language_loss": 0.80577773, "learning_rate": 1.5038048082993685e-06, "loss": 0.88303882, "num_input_tokens_seen": 212274085, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.11975098, "step": 9853, "time_per_iteration": 2.5524609088897705 }, { "auxiliary_loss_clip": 0.0643593, "auxiliary_loss_mlp": 0.01263422, "balance_loss_clip": 0.06286479, "balance_loss_mlp": 0.01252676, "epoch": 0.5924545317901698, "flos": 28666177948800.0, "grad_norm": 1.5307313255911679, "language_loss": 0.67915654, "learning_rate": 1.5034275325310124e-06, "loss": 0.75615013, "num_input_tokens_seen": 212295530, "router_z_loss_clip": 1.49511719, "router_z_loss_mlp": 0.10748291, "step": 9854, "time_per_iteration": 2.605604887008667 }, { "auxiliary_loss_clip": 0.06439514, "auxiliary_loss_mlp": 0.01270545, "balance_loss_clip": 0.06287628, "balance_loss_mlp": 0.01259727, "epoch": 0.5925146550428378, "flos": 19870989333120.0, "grad_norm": 1.712710277061724, "language_loss": 0.89102775, "learning_rate": 1.5030502755927344e-06, "loss": 0.96812832, "num_input_tokens_seen": 212313770, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.10821533, "step": 9855, "time_per_iteration": 2.75352144241333 }, { "auxiliary_loss_clip": 0.06438696, "auxiliary_loss_mlp": 0.01268243, "balance_loss_clip": 0.06291106, "balance_loss_mlp": 0.01257991, "epoch": 0.5925747782955058, "flos": 15128510019840.0, "grad_norm": 1.7401603510919783, "language_loss": 0.87143099, "learning_rate": 1.5026730374988397e-06, "loss": 0.94850045, "num_input_tokens_seen": 212331525, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.1026001, "step": 9856, "time_per_iteration": 2.5579049587249756 }, { "auxiliary_loss_clip": 0.06446802, "auxiliary_loss_mlp": 0.01266708, "balance_loss_clip": 0.06291226, "balance_loss_mlp": 0.0125552, "epoch": 0.5926349015481738, "flos": 18411297281280.0, "grad_norm": 2.281729874249639, "language_loss": 0.77687991, "learning_rate": 1.5022958182636332e-06, "loss": 0.85401493, "num_input_tokens_seen": 212347295, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.11187744, "step": 9857, "time_per_iteration": 2.536569833755493 }, { "auxiliary_loss_clip": 0.06443928, "auxiliary_loss_mlp": 0.01265865, "balance_loss_clip": 0.06292979, "balance_loss_mlp": 0.01254653, "epoch": 0.5926950248008417, "flos": 23117620757760.0, "grad_norm": 2.849744353922783, "language_loss": 0.64318621, "learning_rate": 1.501918617901419e-06, "loss": 0.7202841, "num_input_tokens_seen": 212365750, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.11206055, "step": 9858, "time_per_iteration": 2.5355277061462402 }, { "auxiliary_loss_clip": 0.06436333, "auxiliary_loss_mlp": 0.01268421, "balance_loss_clip": 0.06286819, "balance_loss_mlp": 0.01257204, "epoch": 0.5927551480535097, "flos": 28040753473920.0, "grad_norm": 1.954897217529263, "language_loss": 0.76951861, "learning_rate": 1.501541436426501e-06, "loss": 0.8465662, "num_input_tokens_seen": 212385300, "router_z_loss_clip": 1.49511719, "router_z_loss_mlp": 0.11218262, "step": 9859, "time_per_iteration": 2.6443030834198 }, { "auxiliary_loss_clip": 0.06446369, "auxiliary_loss_mlp": 0.01273362, "balance_loss_clip": 0.06290221, "balance_loss_mlp": 0.01261245, "epoch": 0.5928152713061776, "flos": 21805109602560.0, "grad_norm": 2.0119626964280566, "language_loss": 0.75407457, "learning_rate": 1.5011642738531818e-06, "loss": 0.83127189, "num_input_tokens_seen": 212402140, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.12121582, "step": 9860, "time_per_iteration": 2.5259757041931152 }, { "auxiliary_loss_clip": 0.06436028, "auxiliary_loss_mlp": 0.01268943, "balance_loss_clip": 0.06286358, "balance_loss_mlp": 0.01258286, "epoch": 0.5928753945588456, "flos": 24323802681600.0, "grad_norm": 1.8956650844341278, "language_loss": 0.76717985, "learning_rate": 1.500787130195763e-06, "loss": 0.84422952, "num_input_tokens_seen": 212421790, "router_z_loss_clip": 1.49511719, "router_z_loss_mlp": 0.10662842, "step": 9861, "time_per_iteration": 4.056377649307251 }, { "auxiliary_loss_clip": 0.06438386, "auxiliary_loss_mlp": 0.01266213, "balance_loss_clip": 0.06289098, "balance_loss_mlp": 0.01255758, "epoch": 0.5929355178115137, "flos": 26471126465280.0, "grad_norm": 1.5341693615993095, "language_loss": 0.70496619, "learning_rate": 1.5004100054685465e-06, "loss": 0.7820121, "num_input_tokens_seen": 212442115, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.10455322, "step": 9862, "time_per_iteration": 2.617286443710327 }, { "auxiliary_loss_clip": 0.06443182, "auxiliary_loss_mlp": 0.01264153, "balance_loss_clip": 0.06290621, "balance_loss_mlp": 0.01253341, "epoch": 0.5929956410641816, "flos": 24971798632320.0, "grad_norm": 1.9315593155291275, "language_loss": 0.78229713, "learning_rate": 1.500032899685832e-06, "loss": 0.85937047, "num_input_tokens_seen": 212459535, "router_z_loss_clip": 1.52441406, "router_z_loss_mlp": 0.10809326, "step": 9863, "time_per_iteration": 4.043556213378906 }, { "auxiliary_loss_clip": 0.06443898, "auxiliary_loss_mlp": 0.01267579, "balance_loss_clip": 0.06292501, "balance_loss_mlp": 0.01256606, "epoch": 0.5930557643168496, "flos": 26214639517440.0, "grad_norm": 1.936861340283598, "language_loss": 0.7116695, "learning_rate": 1.499655812861921e-06, "loss": 0.78878427, "num_input_tokens_seen": 212479385, "router_z_loss_clip": 1.51269531, "router_z_loss_mlp": 0.10974121, "step": 9864, "time_per_iteration": 2.5967047214508057 }, { "auxiliary_loss_clip": 0.0644177, "auxiliary_loss_mlp": 0.01268564, "balance_loss_clip": 0.06289089, "balance_loss_mlp": 0.01257049, "epoch": 0.5931158875695175, "flos": 27862322204160.0, "grad_norm": 1.3939058363407184, "language_loss": 0.67084795, "learning_rate": 1.4992787450111112e-06, "loss": 0.74795133, "num_input_tokens_seen": 212500060, "router_z_loss_clip": 1.52734375, "router_z_loss_mlp": 0.11517334, "step": 9865, "time_per_iteration": 2.6056275367736816 }, { "auxiliary_loss_clip": 0.06438322, "auxiliary_loss_mlp": 0.01266208, "balance_loss_clip": 0.06285041, "balance_loss_mlp": 0.01254699, "epoch": 0.5931760108221855, "flos": 15419014525440.0, "grad_norm": 2.057613783705938, "language_loss": 0.78342891, "learning_rate": 1.4989016961477015e-06, "loss": 0.86047417, "num_input_tokens_seen": 212518590, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.11523438, "step": 9866, "time_per_iteration": 2.548413038253784 }, { "auxiliary_loss_clip": 0.0643713, "auxiliary_loss_mlp": 0.01269755, "balance_loss_clip": 0.06291109, "balance_loss_mlp": 0.01259026, "epoch": 0.5932361340748534, "flos": 30196043395200.0, "grad_norm": 3.6949484262222785, "language_loss": 0.72212261, "learning_rate": 1.4985246662859903e-06, "loss": 0.79919142, "num_input_tokens_seen": 212538190, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.1072998, "step": 9867, "time_per_iteration": 2.6091790199279785 }, { "auxiliary_loss_clip": 0.06439168, "auxiliary_loss_mlp": 0.01267715, "balance_loss_clip": 0.06289476, "balance_loss_mlp": 0.01255473, "epoch": 0.5932962573275214, "flos": 20163841752960.0, "grad_norm": 1.5802372734660417, "language_loss": 0.66682214, "learning_rate": 1.4981476554402732e-06, "loss": 0.743891, "num_input_tokens_seen": 212557820, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.12249756, "step": 9868, "time_per_iteration": 2.5761613845825195 }, { "auxiliary_loss_clip": 0.06437694, "auxiliary_loss_mlp": 0.01268322, "balance_loss_clip": 0.06284422, "balance_loss_mlp": 0.01256914, "epoch": 0.5933563805801894, "flos": 25452725541120.0, "grad_norm": 1.52233707854227, "language_loss": 0.75587124, "learning_rate": 1.4977706636248478e-06, "loss": 0.8329314, "num_input_tokens_seen": 212577645, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.11401367, "step": 9869, "time_per_iteration": 2.6136910915374756 }, { "auxiliary_loss_clip": 0.06442925, "auxiliary_loss_mlp": 0.01266774, "balance_loss_clip": 0.06291354, "balance_loss_mlp": 0.01255521, "epoch": 0.5934165038328574, "flos": 60007971674880.0, "grad_norm": 1.614842414486249, "language_loss": 0.74657774, "learning_rate": 1.4973936908540091e-06, "loss": 0.8236748, "num_input_tokens_seen": 212603430, "router_z_loss_clip": 1.51757812, "router_z_loss_mlp": 0.11254883, "step": 9870, "time_per_iteration": 2.934847831726074 }, { "auxiliary_loss_clip": 0.06439194, "auxiliary_loss_mlp": 0.01266439, "balance_loss_clip": 0.06285052, "balance_loss_mlp": 0.01254882, "epoch": 0.5934766270855253, "flos": 24426568114560.0, "grad_norm": 2.1080365824249547, "language_loss": 0.72168314, "learning_rate": 1.4970167371420517e-06, "loss": 0.79873949, "num_input_tokens_seen": 212620730, "router_z_loss_clip": 1.54101562, "router_z_loss_mlp": 0.11572266, "step": 9871, "time_per_iteration": 2.5680766105651855 }, { "auxiliary_loss_clip": 0.06442732, "auxiliary_loss_mlp": 0.01267346, "balance_loss_clip": 0.06288768, "balance_loss_mlp": 0.01255854, "epoch": 0.5935367503381933, "flos": 23519821155840.0, "grad_norm": 2.0337453926042297, "language_loss": 0.74176836, "learning_rate": 1.496639802503271e-06, "loss": 0.81886911, "num_input_tokens_seen": 212639745, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.11480713, "step": 9872, "time_per_iteration": 2.5914483070373535 }, { "auxiliary_loss_clip": 0.0645247, "auxiliary_loss_mlp": 0.01270398, "balance_loss_clip": 0.06296727, "balance_loss_mlp": 0.01257852, "epoch": 0.5935968735908612, "flos": 18953550979200.0, "grad_norm": 2.1263884009147693, "language_loss": 0.79483187, "learning_rate": 1.4962628869519583e-06, "loss": 0.87206054, "num_input_tokens_seen": 212655915, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.12530518, "step": 9873, "time_per_iteration": 2.5370631217956543 }, { "auxiliary_loss_clip": 0.0643782, "auxiliary_loss_mlp": 0.0126576, "balance_loss_clip": 0.06286874, "balance_loss_mlp": 0.01253988, "epoch": 0.5936569968435292, "flos": 25490432678400.0, "grad_norm": 1.4960540826031714, "language_loss": 0.85016632, "learning_rate": 1.4958859905024078e-06, "loss": 0.92720217, "num_input_tokens_seen": 212676115, "router_z_loss_clip": 1.50878906, "router_z_loss_mlp": 0.11785889, "step": 9874, "time_per_iteration": 2.578214168548584 }, { "auxiliary_loss_clip": 0.0634182, "auxiliary_loss_mlp": 0.01255357, "balance_loss_clip": 0.0627912, "balance_loss_mlp": 0.01253821, "epoch": 0.5937171200961973, "flos": 66397364259840.0, "grad_norm": 0.9251474809927903, "language_loss": 0.59781277, "learning_rate": 1.4955091131689115e-06, "loss": 0.67378449, "num_input_tokens_seen": 212737560, "router_z_loss_clip": 0.62841797, "router_z_loss_mlp": 0.0153656, "step": 9875, "time_per_iteration": 3.2483649253845215 }, { "auxiliary_loss_clip": 0.06450567, "auxiliary_loss_mlp": 0.01267766, "balance_loss_clip": 0.062916, "balance_loss_mlp": 0.01255773, "epoch": 0.5937772433488652, "flos": 14908849741440.0, "grad_norm": 2.4972323010416932, "language_loss": 0.77576804, "learning_rate": 1.4951322549657594e-06, "loss": 0.85295141, "num_input_tokens_seen": 212755365, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.11999512, "step": 9876, "time_per_iteration": 2.549147367477417 }, { "auxiliary_loss_clip": 0.06431144, "auxiliary_loss_mlp": 0.01263163, "balance_loss_clip": 0.06284755, "balance_loss_mlp": 0.0125303, "epoch": 0.5938373666015332, "flos": 22567484776320.0, "grad_norm": 1.5089868703885905, "language_loss": 0.75848871, "learning_rate": 1.494755415907243e-06, "loss": 0.83543175, "num_input_tokens_seen": 212773875, "router_z_loss_clip": 1.46289062, "router_z_loss_mlp": 0.10131836, "step": 9877, "time_per_iteration": 2.558586597442627 }, { "auxiliary_loss_clip": 0.06438687, "auxiliary_loss_mlp": 0.01267276, "balance_loss_clip": 0.06286572, "balance_loss_mlp": 0.01255611, "epoch": 0.5938974898542011, "flos": 18446572650240.0, "grad_norm": 2.3171377648846074, "language_loss": 0.81326735, "learning_rate": 1.4943785960076522e-06, "loss": 0.89032698, "num_input_tokens_seen": 212790590, "router_z_loss_clip": 1.52148438, "router_z_loss_mlp": 0.11676025, "step": 9878, "time_per_iteration": 2.8154244422912598 }, { "auxiliary_loss_clip": 0.06441824, "auxiliary_loss_mlp": 0.01268047, "balance_loss_clip": 0.0628809, "balance_loss_mlp": 0.01257306, "epoch": 0.5939576131068691, "flos": 45597029293440.0, "grad_norm": 1.7399485615722459, "language_loss": 0.71199363, "learning_rate": 1.4940017952812754e-06, "loss": 0.7890923, "num_input_tokens_seen": 212812265, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.10742188, "step": 9879, "time_per_iteration": 2.777245044708252 }, { "auxiliary_loss_clip": 0.06435834, "auxiliary_loss_mlp": 0.01266327, "balance_loss_clip": 0.06286387, "balance_loss_mlp": 0.01254924, "epoch": 0.594017736359537, "flos": 23594648451840.0, "grad_norm": 1.3860702483758316, "language_loss": 0.57831603, "learning_rate": 1.493625013742401e-06, "loss": 0.65533769, "num_input_tokens_seen": 212831915, "router_z_loss_clip": 1.49414062, "router_z_loss_mlp": 0.11407471, "step": 9880, "time_per_iteration": 2.604832887649536 }, { "auxiliary_loss_clip": 0.06437716, "auxiliary_loss_mlp": 0.01270802, "balance_loss_clip": 0.06285872, "balance_loss_mlp": 0.01258929, "epoch": 0.594077859612205, "flos": 29464373543040.0, "grad_norm": 1.8412831521775412, "language_loss": 0.77671278, "learning_rate": 1.4932482514053177e-06, "loss": 0.85379791, "num_input_tokens_seen": 212851350, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.11877441, "step": 9881, "time_per_iteration": 2.6611452102661133 }, { "auxiliary_loss_clip": 0.06442087, "auxiliary_loss_mlp": 0.01266625, "balance_loss_clip": 0.06290654, "balance_loss_mlp": 0.01255878, "epoch": 0.594137982864873, "flos": 16805682144000.0, "grad_norm": 1.9381282537608555, "language_loss": 0.82928711, "learning_rate": 1.4928715082843112e-06, "loss": 0.90637422, "num_input_tokens_seen": 212867995, "router_z_loss_clip": 1.51269531, "router_z_loss_mlp": 0.10754395, "step": 9882, "time_per_iteration": 2.561491012573242 }, { "auxiliary_loss_clip": 0.06444681, "auxiliary_loss_mlp": 0.01269941, "balance_loss_clip": 0.06294034, "balance_loss_mlp": 0.01258783, "epoch": 0.594198106117541, "flos": 12755194974720.0, "grad_norm": 2.221365466621029, "language_loss": 0.79612279, "learning_rate": 1.492494784393667e-06, "loss": 0.87326908, "num_input_tokens_seen": 212885220, "router_z_loss_clip": 1.50683594, "router_z_loss_mlp": 0.11169434, "step": 9883, "time_per_iteration": 2.519254207611084 }, { "auxiliary_loss_clip": 0.06448975, "auxiliary_loss_mlp": 0.01270809, "balance_loss_clip": 0.06291012, "balance_loss_mlp": 0.01259097, "epoch": 0.5942582293702089, "flos": 21002930939520.0, "grad_norm": 2.0148925737975003, "language_loss": 0.74837971, "learning_rate": 1.4921180797476725e-06, "loss": 0.82557762, "num_input_tokens_seen": 212903195, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.11706543, "step": 9884, "time_per_iteration": 2.6205296516418457 }, { "auxiliary_loss_clip": 0.0644161, "auxiliary_loss_mlp": 0.01266829, "balance_loss_clip": 0.06290598, "balance_loss_mlp": 0.01255159, "epoch": 0.5943183526228769, "flos": 28298665941120.0, "grad_norm": 2.3782282293774335, "language_loss": 0.66377878, "learning_rate": 1.4917413943606106e-06, "loss": 0.74086314, "num_input_tokens_seen": 212923340, "router_z_loss_clip": 1.50878906, "router_z_loss_mlp": 0.11688232, "step": 9885, "time_per_iteration": 4.08729887008667 }, { "auxiliary_loss_clip": 0.06438965, "auxiliary_loss_mlp": 0.0126714, "balance_loss_clip": 0.06289059, "balance_loss_mlp": 0.01255701, "epoch": 0.5943784758755448, "flos": 26621829233280.0, "grad_norm": 2.27996599475806, "language_loss": 0.77714837, "learning_rate": 1.4913647282467667e-06, "loss": 0.85420948, "num_input_tokens_seen": 212942755, "router_z_loss_clip": 1.49804688, "router_z_loss_mlp": 0.11444092, "step": 9886, "time_per_iteration": 2.6059017181396484 }, { "auxiliary_loss_clip": 0.06338759, "auxiliary_loss_mlp": 0.0125156, "balance_loss_clip": 0.06275952, "balance_loss_mlp": 0.01249926, "epoch": 0.5944385991282128, "flos": 64209859643520.0, "grad_norm": 0.8281570134039521, "language_loss": 0.64578116, "learning_rate": 1.490988081420423e-06, "loss": 0.7216844, "num_input_tokens_seen": 212999355, "router_z_loss_clip": 0.62890625, "router_z_loss_mlp": 0.01637268, "step": 9887, "time_per_iteration": 3.0596847534179688 }, { "auxiliary_loss_clip": 0.06439678, "auxiliary_loss_mlp": 0.01266482, "balance_loss_clip": 0.06288097, "balance_loss_mlp": 0.01255169, "epoch": 0.5944987223808808, "flos": 19577885351040.0, "grad_norm": 2.095459278641935, "language_loss": 0.69700992, "learning_rate": 1.4906114538958615e-06, "loss": 0.77407151, "num_input_tokens_seen": 213018570, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.11297607, "step": 9888, "time_per_iteration": 2.5380759239196777 }, { "auxiliary_loss_clip": 0.06440885, "auxiliary_loss_mlp": 0.01269456, "balance_loss_clip": 0.06290942, "balance_loss_mlp": 0.01257815, "epoch": 0.5945588456335488, "flos": 26184856590720.0, "grad_norm": 1.6038180132426363, "language_loss": 0.7978363, "learning_rate": 1.490234845687366e-06, "loss": 0.8749398, "num_input_tokens_seen": 213037735, "router_z_loss_clip": 1.50097656, "router_z_loss_mlp": 0.11627197, "step": 9889, "time_per_iteration": 2.579699993133545 }, { "auxiliary_loss_clip": 0.0643838, "auxiliary_loss_mlp": 0.01267533, "balance_loss_clip": 0.06288197, "balance_loss_mlp": 0.01256751, "epoch": 0.5946189688862168, "flos": 20452333760640.0, "grad_norm": 1.5283251819588903, "language_loss": 0.71055663, "learning_rate": 1.4898582568092154e-06, "loss": 0.78761578, "num_input_tokens_seen": 213057160, "router_z_loss_clip": 1.50195312, "router_z_loss_mlp": 0.10778809, "step": 9890, "time_per_iteration": 3.9974777698516846 }, { "auxiliary_loss_clip": 0.06440842, "auxiliary_loss_mlp": 0.01265485, "balance_loss_clip": 0.06286792, "balance_loss_mlp": 0.012541, "epoch": 0.5946790921388847, "flos": 13441568895360.0, "grad_norm": 1.9649467000470253, "language_loss": 0.69878316, "learning_rate": 1.489481687275691e-06, "loss": 0.77584642, "num_input_tokens_seen": 213073630, "router_z_loss_clip": 1.54003906, "router_z_loss_mlp": 0.11383057, "step": 9891, "time_per_iteration": 2.531909227371216 }, { "auxiliary_loss_clip": 0.06443132, "auxiliary_loss_mlp": 0.01267137, "balance_loss_clip": 0.06293841, "balance_loss_mlp": 0.01256783, "epoch": 0.5947392153915527, "flos": 20418483911040.0, "grad_norm": 1.9801198454455478, "language_loss": 0.53703421, "learning_rate": 1.4891051371010726e-06, "loss": 0.61413687, "num_input_tokens_seen": 213092450, "router_z_loss_clip": 1.49121094, "router_z_loss_mlp": 0.10345459, "step": 9892, "time_per_iteration": 2.5645339488983154 }, { "auxiliary_loss_clip": 0.06339462, "auxiliary_loss_mlp": 0.01251274, "balance_loss_clip": 0.06276945, "balance_loss_mlp": 0.01249766, "epoch": 0.5947993386442206, "flos": 65639181790080.0, "grad_norm": 0.6394636972049758, "language_loss": 0.54475725, "learning_rate": 1.4887286062996375e-06, "loss": 0.6206646, "num_input_tokens_seen": 213155465, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.01506042, "step": 9893, "time_per_iteration": 3.235956907272339 }, { "auxiliary_loss_clip": 0.06432299, "auxiliary_loss_mlp": 0.0126652, "balance_loss_clip": 0.06284226, "balance_loss_mlp": 0.01256054, "epoch": 0.5948594618968887, "flos": 23189429306880.0, "grad_norm": 1.692253918612004, "language_loss": 0.74693364, "learning_rate": 1.4883520948856658e-06, "loss": 0.8239218, "num_input_tokens_seen": 213174875, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.10467529, "step": 9894, "time_per_iteration": 2.610884189605713 }, { "auxiliary_loss_clip": 0.06434498, "auxiliary_loss_mlp": 0.01266129, "balance_loss_clip": 0.0628444, "balance_loss_mlp": 0.01255126, "epoch": 0.5949195851495566, "flos": 13631991298560.0, "grad_norm": 1.6400037907296714, "language_loss": 0.78143609, "learning_rate": 1.487975602873434e-06, "loss": 0.85844237, "num_input_tokens_seen": 213192695, "router_z_loss_clip": 1.49902344, "router_z_loss_mlp": 0.11004639, "step": 9895, "time_per_iteration": 2.574875831604004 }, { "auxiliary_loss_clip": 0.06439838, "auxiliary_loss_mlp": 0.0126511, "balance_loss_clip": 0.06285252, "balance_loss_mlp": 0.01253362, "epoch": 0.5949797084022246, "flos": 19756358547840.0, "grad_norm": 1.9004004887698063, "language_loss": 0.79584318, "learning_rate": 1.4875991302772182e-06, "loss": 0.87289262, "num_input_tokens_seen": 213211195, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.11743164, "step": 9896, "time_per_iteration": 2.652510166168213 }, { "auxiliary_loss_clip": 0.06439887, "auxiliary_loss_mlp": 0.01265964, "balance_loss_clip": 0.06287996, "balance_loss_mlp": 0.01254734, "epoch": 0.5950398316548925, "flos": 25780685621760.0, "grad_norm": 1.5861706784489158, "language_loss": 0.83530611, "learning_rate": 1.4872226771112954e-06, "loss": 0.9123646, "num_input_tokens_seen": 213231975, "router_z_loss_clip": 1.51953125, "router_z_loss_mlp": 0.11230469, "step": 9897, "time_per_iteration": 2.615165948867798 }, { "auxiliary_loss_clip": 0.0643881, "auxiliary_loss_mlp": 0.01264528, "balance_loss_clip": 0.0628674, "balance_loss_mlp": 0.01253912, "epoch": 0.5950999549075605, "flos": 23045644500480.0, "grad_norm": 2.4777986990097998, "language_loss": 0.71531582, "learning_rate": 1.486846243389939e-06, "loss": 0.79234922, "num_input_tokens_seen": 213249760, "router_z_loss_clip": 1.51953125, "router_z_loss_mlp": 0.10614014, "step": 9898, "time_per_iteration": 2.593593120574951 }, { "auxiliary_loss_clip": 0.06446375, "auxiliary_loss_mlp": 0.01269047, "balance_loss_clip": 0.06288766, "balance_loss_mlp": 0.01255875, "epoch": 0.5951600781602284, "flos": 32453553697920.0, "grad_norm": 3.019102928697337, "language_loss": 0.6414572, "learning_rate": 1.4864698291274251e-06, "loss": 0.71861148, "num_input_tokens_seen": 213269890, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.13165283, "step": 9899, "time_per_iteration": 2.658428192138672 }, { "auxiliary_loss_clip": 0.0643811, "auxiliary_loss_mlp": 0.0126911, "balance_loss_clip": 0.06288616, "balance_loss_mlp": 0.01259133, "epoch": 0.5952202014128964, "flos": 23806887644160.0, "grad_norm": 1.8741215194378225, "language_loss": 0.72578788, "learning_rate": 1.4860934343380267e-06, "loss": 0.80286008, "num_input_tokens_seen": 213289400, "router_z_loss_clip": 1.49414062, "router_z_loss_mlp": 0.09979248, "step": 9900, "time_per_iteration": 4.043331146240234 }, { "auxiliary_loss_clip": 0.06439039, "auxiliary_loss_mlp": 0.01268468, "balance_loss_clip": 0.06290668, "balance_loss_mlp": 0.01256714, "epoch": 0.5952803246655644, "flos": 22498778828160.0, "grad_norm": 1.7159294536172305, "language_loss": 0.84927094, "learning_rate": 1.4857170590360169e-06, "loss": 0.926346, "num_input_tokens_seen": 213308040, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.11743164, "step": 9901, "time_per_iteration": 2.663050651550293 }, { "auxiliary_loss_clip": 0.06332308, "auxiliary_loss_mlp": 0.01250935, "balance_loss_clip": 0.06270108, "balance_loss_mlp": 0.01249477, "epoch": 0.5953404479182324, "flos": 51250810884480.0, "grad_norm": 0.7706893050579119, "language_loss": 0.58089113, "learning_rate": 1.4853407032356674e-06, "loss": 0.6567235, "num_input_tokens_seen": 213358585, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.01457214, "step": 9902, "time_per_iteration": 4.51834774017334 }, { "auxiliary_loss_clip": 0.06443675, "auxiliary_loss_mlp": 0.01267353, "balance_loss_clip": 0.06289843, "balance_loss_mlp": 0.01255617, "epoch": 0.5954005711709004, "flos": 23119423620480.0, "grad_norm": 1.7194096448889031, "language_loss": 0.77902257, "learning_rate": 1.4849643669512503e-06, "loss": 0.85613292, "num_input_tokens_seen": 213379585, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.11749268, "step": 9903, "time_per_iteration": 2.5826873779296875 }, { "auxiliary_loss_clip": 0.06441015, "auxiliary_loss_mlp": 0.01264946, "balance_loss_clip": 0.06289771, "balance_loss_mlp": 0.01253699, "epoch": 0.5954606944235683, "flos": 35963464250880.0, "grad_norm": 3.1093262994905815, "language_loss": 0.77470243, "learning_rate": 1.4845880501970362e-06, "loss": 0.85176206, "num_input_tokens_seen": 213401465, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.11248779, "step": 9904, "time_per_iteration": 2.6874642372131348 }, { "auxiliary_loss_clip": 0.06445011, "auxiliary_loss_mlp": 0.01264674, "balance_loss_clip": 0.06287464, "balance_loss_mlp": 0.01253439, "epoch": 0.5955208176762363, "flos": 30451188677760.0, "grad_norm": 1.3981988668664185, "language_loss": 0.73065645, "learning_rate": 1.4842117529872942e-06, "loss": 0.80775332, "num_input_tokens_seen": 213422720, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.11242676, "step": 9905, "time_per_iteration": 2.615327835083008 }, { "auxiliary_loss_clip": 0.06439137, "auxiliary_loss_mlp": 0.01269018, "balance_loss_clip": 0.06287505, "balance_loss_mlp": 0.01257878, "epoch": 0.5955809409289042, "flos": 17645987214720.0, "grad_norm": 1.7199718534196193, "language_loss": 0.7031821, "learning_rate": 1.483835475336295e-06, "loss": 0.78026366, "num_input_tokens_seen": 213439480, "router_z_loss_clip": 1.51464844, "router_z_loss_mlp": 0.11132812, "step": 9906, "time_per_iteration": 2.5224666595458984 }, { "auxiliary_loss_clip": 0.06446072, "auxiliary_loss_mlp": 0.01264948, "balance_loss_clip": 0.06292388, "balance_loss_mlp": 0.01252943, "epoch": 0.5956410641815723, "flos": 24286766376960.0, "grad_norm": 1.6918755125928637, "language_loss": 0.75427771, "learning_rate": 1.4834592172583057e-06, "loss": 0.83138794, "num_input_tokens_seen": 213458895, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.11999512, "step": 9907, "time_per_iteration": 2.585094928741455 }, { "auxiliary_loss_clip": 0.06440431, "auxiliary_loss_mlp": 0.012675, "balance_loss_clip": 0.0628854, "balance_loss_mlp": 0.01256306, "epoch": 0.5957011874342402, "flos": 35742713869440.0, "grad_norm": 1.5275745642982248, "language_loss": 0.66877079, "learning_rate": 1.483082978767595e-06, "loss": 0.74585009, "num_input_tokens_seen": 213481730, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.11193848, "step": 9908, "time_per_iteration": 2.719705104827881 }, { "auxiliary_loss_clip": 0.06438296, "auxiliary_loss_mlp": 0.01266321, "balance_loss_clip": 0.06288178, "balance_loss_mlp": 0.01255169, "epoch": 0.5957613106869082, "flos": 21250277792640.0, "grad_norm": 1.9422468907784658, "language_loss": 0.76465732, "learning_rate": 1.4827067598784298e-06, "loss": 0.84170353, "num_input_tokens_seen": 213497225, "router_z_loss_clip": 1.50195312, "router_z_loss_mlp": 0.11157227, "step": 9909, "time_per_iteration": 2.5229389667510986 }, { "auxiliary_loss_clip": 0.06329542, "auxiliary_loss_mlp": 0.01250299, "balance_loss_clip": 0.06267273, "balance_loss_mlp": 0.01248793, "epoch": 0.5958214339395761, "flos": 65959972346880.0, "grad_norm": 0.8894345699993886, "language_loss": 0.73347008, "learning_rate": 1.4823305606050753e-06, "loss": 0.80926847, "num_input_tokens_seen": 213556890, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.01502991, "step": 9910, "time_per_iteration": 3.252899646759033 }, { "auxiliary_loss_clip": 0.06440727, "auxiliary_loss_mlp": 0.01267071, "balance_loss_clip": 0.0628752, "balance_loss_mlp": 0.01255609, "epoch": 0.5958815571922441, "flos": 23224872384000.0, "grad_norm": 3.4708713667785003, "language_loss": 0.69698286, "learning_rate": 1.481954380961799e-06, "loss": 0.77406079, "num_input_tokens_seen": 213575800, "router_z_loss_clip": 1.53320312, "router_z_loss_mlp": 0.11462402, "step": 9911, "time_per_iteration": 2.5564346313476562 }, { "auxiliary_loss_clip": 0.06451362, "auxiliary_loss_mlp": 0.01269864, "balance_loss_clip": 0.06291421, "balance_loss_mlp": 0.01257567, "epoch": 0.595941680444912, "flos": 16543157702400.0, "grad_norm": 2.0337265467798793, "language_loss": 0.65893155, "learning_rate": 1.4815782209628631e-06, "loss": 0.73614377, "num_input_tokens_seen": 213592740, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.12298584, "step": 9912, "time_per_iteration": 2.5333304405212402 }, { "auxiliary_loss_clip": 0.0643943, "auxiliary_loss_mlp": 0.01269158, "balance_loss_clip": 0.06287286, "balance_loss_mlp": 0.012576, "epoch": 0.59600180369758, "flos": 27826334075520.0, "grad_norm": 2.284420511823763, "language_loss": 0.73276377, "learning_rate": 1.4812020806225337e-06, "loss": 0.80984968, "num_input_tokens_seen": 213611970, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.11547852, "step": 9913, "time_per_iteration": 2.603142499923706 }, { "auxiliary_loss_clip": 0.06442022, "auxiliary_loss_mlp": 0.01268309, "balance_loss_clip": 0.06284213, "balance_loss_mlp": 0.01257151, "epoch": 0.596061926950248, "flos": 29498349173760.0, "grad_norm": 2.253570283737025, "language_loss": 0.80035704, "learning_rate": 1.4808259599550738e-06, "loss": 0.87746036, "num_input_tokens_seen": 213632230, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.11151123, "step": 9914, "time_per_iteration": 2.646692991256714 }, { "auxiliary_loss_clip": 0.06440338, "auxiliary_loss_mlp": 0.01266164, "balance_loss_clip": 0.06288716, "balance_loss_mlp": 0.01254893, "epoch": 0.596122050202916, "flos": 16842424959360.0, "grad_norm": 1.8233427926551327, "language_loss": 0.67738718, "learning_rate": 1.4804498589747448e-06, "loss": 0.75445211, "num_input_tokens_seen": 213649645, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.11260986, "step": 9915, "time_per_iteration": 2.5522592067718506 }, { "auxiliary_loss_clip": 0.06440867, "auxiliary_loss_mlp": 0.01264337, "balance_loss_clip": 0.06286126, "balance_loss_mlp": 0.0125359, "epoch": 0.596182173455584, "flos": 21003056720640.0, "grad_norm": 1.5370729989550522, "language_loss": 0.78818679, "learning_rate": 1.4800737776958095e-06, "loss": 0.86523885, "num_input_tokens_seen": 213668850, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.10742188, "step": 9916, "time_per_iteration": 2.587923765182495 }, { "auxiliary_loss_clip": 0.06444065, "auxiliary_loss_mlp": 0.0127027, "balance_loss_clip": 0.06287466, "balance_loss_mlp": 0.01258587, "epoch": 0.5962422967082519, "flos": 16070364639360.0, "grad_norm": 1.788806570046333, "language_loss": 0.83667779, "learning_rate": 1.4796977161325286e-06, "loss": 0.9138211, "num_input_tokens_seen": 213685695, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.11688232, "step": 9917, "time_per_iteration": 2.5713300704956055 }, { "auxiliary_loss_clip": 0.06439321, "auxiliary_loss_mlp": 0.01267965, "balance_loss_clip": 0.06287625, "balance_loss_mlp": 0.01257313, "epoch": 0.5963024199609199, "flos": 12171879976320.0, "grad_norm": 1.7772674243373447, "language_loss": 0.77555883, "learning_rate": 1.4793216742991625e-06, "loss": 0.85263169, "num_input_tokens_seen": 213703515, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.10644531, "step": 9918, "time_per_iteration": 2.574629068374634 }, { "auxiliary_loss_clip": 0.06439708, "auxiliary_loss_mlp": 0.01267367, "balance_loss_clip": 0.06287641, "balance_loss_mlp": 0.01257044, "epoch": 0.5963625432135878, "flos": 28081772847360.0, "grad_norm": 1.79700300603775, "language_loss": 0.79099631, "learning_rate": 1.4789456522099707e-06, "loss": 0.86806703, "num_input_tokens_seen": 213724170, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.10321045, "step": 9919, "time_per_iteration": 2.5990591049194336 }, { "auxiliary_loss_clip": 0.06444981, "auxiliary_loss_mlp": 0.01268472, "balance_loss_clip": 0.06291223, "balance_loss_mlp": 0.01256968, "epoch": 0.5964226664662559, "flos": 19865664599040.0, "grad_norm": 2.3417290235621646, "language_loss": 0.77838784, "learning_rate": 1.4785696498792122e-06, "loss": 0.85552239, "num_input_tokens_seen": 213740620, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.11505127, "step": 9920, "time_per_iteration": 2.553129196166992 }, { "auxiliary_loss_clip": 0.06450848, "auxiliary_loss_mlp": 0.01268393, "balance_loss_clip": 0.06295048, "balance_loss_mlp": 0.01256883, "epoch": 0.5964827897189238, "flos": 12937567386240.0, "grad_norm": 2.2938821832024248, "language_loss": 0.826864, "learning_rate": 1.4781936673211446e-06, "loss": 0.90405643, "num_input_tokens_seen": 213755390, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.11505127, "step": 9921, "time_per_iteration": 2.538863182067871 }, { "auxiliary_loss_clip": 0.06439655, "auxiliary_loss_mlp": 0.01269682, "balance_loss_clip": 0.06287935, "balance_loss_mlp": 0.01258732, "epoch": 0.5965429129715918, "flos": 18156738977280.0, "grad_norm": 2.44050178651913, "language_loss": 0.80905437, "learning_rate": 1.4778177045500252e-06, "loss": 0.88614774, "num_input_tokens_seen": 213773225, "router_z_loss_clip": 1.51757812, "router_z_loss_mlp": 0.10955811, "step": 9922, "time_per_iteration": 2.5534420013427734 }, { "auxiliary_loss_clip": 0.06438699, "auxiliary_loss_mlp": 0.01268021, "balance_loss_clip": 0.06285769, "balance_loss_mlp": 0.01257591, "epoch": 0.5966030362242597, "flos": 21769834233600.0, "grad_norm": 1.8642666181390724, "language_loss": 0.77225059, "learning_rate": 1.477441761580111e-06, "loss": 0.84931779, "num_input_tokens_seen": 213791860, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.10437012, "step": 9923, "time_per_iteration": 2.580061197280884 }, { "auxiliary_loss_clip": 0.06442567, "auxiliary_loss_mlp": 0.01267495, "balance_loss_clip": 0.06284816, "balance_loss_mlp": 0.01253506, "epoch": 0.5966631594769277, "flos": 18813204190080.0, "grad_norm": 1.7815474244750442, "language_loss": 0.75484413, "learning_rate": 1.4770658384256573e-06, "loss": 0.8319447, "num_input_tokens_seen": 213809455, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.13983154, "step": 9924, "time_per_iteration": 2.5332744121551514 }, { "auxiliary_loss_clip": 0.0643271, "auxiliary_loss_mlp": 0.01273459, "balance_loss_clip": 0.06284316, "balance_loss_mlp": 0.01261431, "epoch": 0.5967232827295956, "flos": 14069383211520.0, "grad_norm": 1.7671621564544946, "language_loss": 0.66554415, "learning_rate": 1.4766899351009204e-06, "loss": 0.74260581, "num_input_tokens_seen": 213826615, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.12023926, "step": 9925, "time_per_iteration": 4.152137756347656 }, { "auxiliary_loss_clip": 0.06444705, "auxiliary_loss_mlp": 0.01269608, "balance_loss_clip": 0.06295691, "balance_loss_mlp": 0.01258361, "epoch": 0.5967834059822636, "flos": 17243954524800.0, "grad_norm": 2.227338891443774, "language_loss": 0.7228564, "learning_rate": 1.4763140516201528e-06, "loss": 0.79999959, "num_input_tokens_seen": 213844495, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.11254883, "step": 9926, "time_per_iteration": 2.5868778228759766 }, { "auxiliary_loss_clip": 0.06442086, "auxiliary_loss_mlp": 0.01270612, "balance_loss_clip": 0.06287698, "balance_loss_mlp": 0.0125856, "epoch": 0.5968435292349316, "flos": 42529751533440.0, "grad_norm": 1.8408744987683263, "language_loss": 0.709566, "learning_rate": 1.4759381879976088e-06, "loss": 0.78669298, "num_input_tokens_seen": 213869125, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.12060547, "step": 9927, "time_per_iteration": 2.7476396560668945 }, { "auxiliary_loss_clip": 0.06443876, "auxiliary_loss_mlp": 0.01268022, "balance_loss_clip": 0.06285395, "balance_loss_mlp": 0.01256083, "epoch": 0.5969036524875996, "flos": 37639546272000.0, "grad_norm": 1.6833812796394316, "language_loss": 0.63884562, "learning_rate": 1.4755623442475415e-06, "loss": 0.71596462, "num_input_tokens_seen": 213891115, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.11938477, "step": 9928, "time_per_iteration": 2.7347822189331055 }, { "auxiliary_loss_clip": 0.06434694, "auxiliary_loss_mlp": 0.01264143, "balance_loss_clip": 0.06284647, "balance_loss_mlp": 0.01253498, "epoch": 0.5969637757402676, "flos": 23154992478720.0, "grad_norm": 1.4330407857312992, "language_loss": 0.69868922, "learning_rate": 1.4751865203842022e-06, "loss": 0.77567762, "num_input_tokens_seen": 213911925, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.10644531, "step": 9929, "time_per_iteration": 4.013296127319336 }, { "auxiliary_loss_clip": 0.06432288, "auxiliary_loss_mlp": 0.01268205, "balance_loss_clip": 0.0628565, "balance_loss_mlp": 0.01257446, "epoch": 0.5970238989929355, "flos": 24027176828160.0, "grad_norm": 1.8284354870120416, "language_loss": 0.76249564, "learning_rate": 1.4748107164218431e-06, "loss": 0.83950067, "num_input_tokens_seen": 213930715, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.10766602, "step": 9930, "time_per_iteration": 2.5702626705169678 }, { "auxiliary_loss_clip": 0.06446327, "auxiliary_loss_mlp": 0.0127026, "balance_loss_clip": 0.06289202, "balance_loss_mlp": 0.01257732, "epoch": 0.5970840222456035, "flos": 19432884660480.0, "grad_norm": 1.5904783318349445, "language_loss": 0.69235277, "learning_rate": 1.4744349323747146e-06, "loss": 0.76951861, "num_input_tokens_seen": 213950015, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.12548828, "step": 9931, "time_per_iteration": 2.559169292449951 }, { "auxiliary_loss_clip": 0.06323021, "auxiliary_loss_mlp": 0.01265101, "balance_loss_clip": 0.06260689, "balance_loss_mlp": 0.01263402, "epoch": 0.5971441454982714, "flos": 62993615230080.0, "grad_norm": 0.8795729918966843, "language_loss": 0.64308429, "learning_rate": 1.474059168257065e-06, "loss": 0.71896553, "num_input_tokens_seen": 214003330, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.01702881, "step": 9932, "time_per_iteration": 3.076087236404419 }, { "auxiliary_loss_clip": 0.06434546, "auxiliary_loss_mlp": 0.01268035, "balance_loss_clip": 0.06282835, "balance_loss_mlp": 0.01256949, "epoch": 0.5972042687509395, "flos": 20272393117440.0, "grad_norm": 1.807879461355712, "language_loss": 0.74115092, "learning_rate": 1.4736834240831454e-06, "loss": 0.81817675, "num_input_tokens_seen": 214021680, "router_z_loss_clip": 1.51660156, "router_z_loss_mlp": 0.11090088, "step": 9933, "time_per_iteration": 2.558199167251587 }, { "auxiliary_loss_clip": 0.06320486, "auxiliary_loss_mlp": 0.01257892, "balance_loss_clip": 0.0625836, "balance_loss_mlp": 0.01256147, "epoch": 0.5972643920036074, "flos": 71675625778560.0, "grad_norm": 0.6545786810450458, "language_loss": 0.51936144, "learning_rate": 1.473307699867203e-06, "loss": 0.59514523, "num_input_tokens_seen": 214090265, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 0.01748657, "step": 9934, "time_per_iteration": 3.251230478286743 }, { "auxiliary_loss_clip": 0.06320972, "auxiliary_loss_mlp": 0.01256156, "balance_loss_clip": 0.06259014, "balance_loss_mlp": 0.0125451, "epoch": 0.5973245152562754, "flos": 56910225427200.0, "grad_norm": 0.8158298082881335, "language_loss": 0.541188, "learning_rate": 1.4729319956234849e-06, "loss": 0.61695933, "num_input_tokens_seen": 214146375, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 0.01649475, "step": 9935, "time_per_iteration": 3.1218178272247314 }, { "auxiliary_loss_clip": 0.06437707, "auxiliary_loss_mlp": 0.01264822, "balance_loss_clip": 0.06283361, "balance_loss_mlp": 0.01252741, "epoch": 0.5973846385089433, "flos": 24170206947840.0, "grad_norm": 1.6987895476863821, "language_loss": 0.65823543, "learning_rate": 1.4725563113662394e-06, "loss": 0.73526073, "num_input_tokens_seen": 214165340, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.12084961, "step": 9936, "time_per_iteration": 2.570283889770508 }, { "auxiliary_loss_clip": 0.06442374, "auxiliary_loss_mlp": 0.01265607, "balance_loss_clip": 0.06287526, "balance_loss_mlp": 0.01255057, "epoch": 0.5974447617616113, "flos": 17675476652160.0, "grad_norm": 2.6174559222370912, "language_loss": 0.67438728, "learning_rate": 1.4721806471097103e-06, "loss": 0.75146711, "num_input_tokens_seen": 214181360, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.10552979, "step": 9937, "time_per_iteration": 2.562674045562744 }, { "auxiliary_loss_clip": 0.06439959, "auxiliary_loss_mlp": 0.01270934, "balance_loss_clip": 0.06283898, "balance_loss_mlp": 0.01258703, "epoch": 0.5975048850142792, "flos": 22899008655360.0, "grad_norm": 2.1500433970075563, "language_loss": 0.77060187, "learning_rate": 1.4718050028681442e-06, "loss": 0.84771085, "num_input_tokens_seen": 214198525, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.12243652, "step": 9938, "time_per_iteration": 2.5833051204681396 }, { "auxiliary_loss_clip": 0.06442925, "auxiliary_loss_mlp": 0.01264187, "balance_loss_clip": 0.06288677, "balance_loss_mlp": 0.0125344, "epoch": 0.5975650082669473, "flos": 24360042372480.0, "grad_norm": 1.5274161173819714, "language_loss": 0.76038384, "learning_rate": 1.4714293786557855e-06, "loss": 0.83745492, "num_input_tokens_seen": 214218710, "router_z_loss_clip": 1.54199219, "router_z_loss_mlp": 0.10748291, "step": 9939, "time_per_iteration": 2.6362900733947754 }, { "auxiliary_loss_clip": 0.06445391, "auxiliary_loss_mlp": 0.0126956, "balance_loss_clip": 0.06286148, "balance_loss_mlp": 0.01256787, "epoch": 0.5976251315196152, "flos": 20929696871040.0, "grad_norm": 2.4158604166454434, "language_loss": 0.68680382, "learning_rate": 1.471053774486878e-06, "loss": 0.76395327, "num_input_tokens_seen": 214237800, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.12774658, "step": 9940, "time_per_iteration": 3.8997974395751953 }, { "auxiliary_loss_clip": 0.06434728, "auxiliary_loss_mlp": 0.0127272, "balance_loss_clip": 0.06285605, "balance_loss_mlp": 0.01261806, "epoch": 0.5976852547722832, "flos": 35853193877760.0, "grad_norm": 1.9969450596255256, "language_loss": 0.70166481, "learning_rate": 1.470678190375664e-06, "loss": 0.77873927, "num_input_tokens_seen": 214260355, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.10913086, "step": 9941, "time_per_iteration": 2.6945741176605225 }, { "auxiliary_loss_clip": 0.06432959, "auxiliary_loss_mlp": 0.01266219, "balance_loss_clip": 0.062818, "balance_loss_mlp": 0.01254978, "epoch": 0.5977453780249512, "flos": 12860266394880.0, "grad_norm": 1.9572437938164366, "language_loss": 0.77661902, "learning_rate": 1.470302626336386e-06, "loss": 0.85361081, "num_input_tokens_seen": 214277120, "router_z_loss_clip": 1.51074219, "router_z_loss_mlp": 0.11248779, "step": 9942, "time_per_iteration": 3.973283529281616 }, { "auxiliary_loss_clip": 0.06439482, "auxiliary_loss_mlp": 0.01264493, "balance_loss_clip": 0.0628272, "balance_loss_mlp": 0.01252775, "epoch": 0.5978055012776191, "flos": 20965391510400.0, "grad_norm": 1.8794968768356217, "language_loss": 0.75653845, "learning_rate": 1.4699270823832857e-06, "loss": 0.83357817, "num_input_tokens_seen": 214295300, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.11730957, "step": 9943, "time_per_iteration": 2.6568431854248047 }, { "auxiliary_loss_clip": 0.06437971, "auxiliary_loss_mlp": 0.01265662, "balance_loss_clip": 0.06286654, "balance_loss_mlp": 0.01255273, "epoch": 0.5978656245302871, "flos": 34066506067200.0, "grad_norm": 2.8353431966900318, "language_loss": 0.62532485, "learning_rate": 1.4695515585306032e-06, "loss": 0.70236123, "num_input_tokens_seen": 214317050, "router_z_loss_clip": 1.51464844, "router_z_loss_mlp": 0.10388184, "step": 9944, "time_per_iteration": 2.670172929763794 }, { "auxiliary_loss_clip": 0.06436291, "auxiliary_loss_mlp": 0.01268442, "balance_loss_clip": 0.06284527, "balance_loss_mlp": 0.01257213, "epoch": 0.597925747782955, "flos": 37381508023680.0, "grad_norm": 1.8314660869488815, "language_loss": 0.72450089, "learning_rate": 1.4691760547925795e-06, "loss": 0.80154824, "num_input_tokens_seen": 214337470, "router_z_loss_clip": 1.51757812, "router_z_loss_mlp": 0.11230469, "step": 9945, "time_per_iteration": 2.716296434402466 }, { "auxiliary_loss_clip": 0.06437169, "auxiliary_loss_mlp": 0.01271942, "balance_loss_clip": 0.0628612, "balance_loss_mlp": 0.01260355, "epoch": 0.5979858710356231, "flos": 25381923240960.0, "grad_norm": 2.1273888733570008, "language_loss": 0.67377394, "learning_rate": 1.4688005711834522e-06, "loss": 0.7508651, "num_input_tokens_seen": 214357975, "router_z_loss_clip": 1.50878906, "router_z_loss_mlp": 0.11584473, "step": 9946, "time_per_iteration": 2.600175142288208 }, { "auxiliary_loss_clip": 0.06442371, "auxiliary_loss_mlp": 0.01268882, "balance_loss_clip": 0.0628636, "balance_loss_mlp": 0.01257253, "epoch": 0.598045994288291, "flos": 13703422504320.0, "grad_norm": 2.327856037829835, "language_loss": 0.8940419, "learning_rate": 1.468425107717461e-06, "loss": 0.97115445, "num_input_tokens_seen": 214374125, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.11639404, "step": 9947, "time_per_iteration": 2.5667712688446045 }, { "auxiliary_loss_clip": 0.06430458, "auxiliary_loss_mlp": 0.01264912, "balance_loss_clip": 0.06284398, "balance_loss_mlp": 0.01254117, "epoch": 0.598106117540959, "flos": 21987859357440.0, "grad_norm": 1.7299232318065494, "language_loss": 0.71975368, "learning_rate": 1.4680496644088432e-06, "loss": 0.79670739, "num_input_tokens_seen": 214393395, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.10797119, "step": 9948, "time_per_iteration": 2.653104066848755 }, { "auxiliary_loss_clip": 0.0644261, "auxiliary_loss_mlp": 0.01266519, "balance_loss_clip": 0.06287789, "balance_loss_mlp": 0.01254002, "epoch": 0.5981662407936269, "flos": 20565790588800.0, "grad_norm": 1.839795053287762, "language_loss": 0.89572918, "learning_rate": 1.4676742412718347e-06, "loss": 0.97282046, "num_input_tokens_seen": 214411550, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.12524414, "step": 9949, "time_per_iteration": 2.602832317352295 }, { "auxiliary_loss_clip": 0.06434821, "auxiliary_loss_mlp": 0.01266664, "balance_loss_clip": 0.06283884, "balance_loss_mlp": 0.0125556, "epoch": 0.5982263640462949, "flos": 14069005868160.0, "grad_norm": 2.1057465360883083, "language_loss": 0.70449257, "learning_rate": 1.467298838320673e-06, "loss": 0.78150749, "num_input_tokens_seen": 214429780, "router_z_loss_clip": 1.50878906, "router_z_loss_mlp": 0.11108398, "step": 9950, "time_per_iteration": 2.5231516361236572 }, { "auxiliary_loss_clip": 0.06442633, "auxiliary_loss_mlp": 0.01268725, "balance_loss_clip": 0.06288795, "balance_loss_mlp": 0.01257948, "epoch": 0.5982864872989628, "flos": 17712135613440.0, "grad_norm": 1.8905217649051507, "language_loss": 0.78667802, "learning_rate": 1.4669234555695921e-06, "loss": 0.86379158, "num_input_tokens_seen": 214447775, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.10772705, "step": 9951, "time_per_iteration": 2.628847122192383 }, { "auxiliary_loss_clip": 0.06437296, "auxiliary_loss_mlp": 0.01269127, "balance_loss_clip": 0.06284193, "balance_loss_mlp": 0.01256724, "epoch": 0.5983466105516309, "flos": 16770574483200.0, "grad_norm": 1.379178872403189, "language_loss": 0.73857296, "learning_rate": 1.4665480930328275e-06, "loss": 0.81563723, "num_input_tokens_seen": 214467245, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.12408447, "step": 9952, "time_per_iteration": 2.5234906673431396 }, { "auxiliary_loss_clip": 0.06440817, "auxiliary_loss_mlp": 0.0126591, "balance_loss_clip": 0.06285608, "balance_loss_mlp": 0.01253506, "epoch": 0.5984067338042988, "flos": 20048078937600.0, "grad_norm": 1.9809876810020037, "language_loss": 0.79249722, "learning_rate": 1.466172750724613e-06, "loss": 0.86956447, "num_input_tokens_seen": 214484385, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.12408447, "step": 9953, "time_per_iteration": 2.5328822135925293 }, { "auxiliary_loss_clip": 0.06435241, "auxiliary_loss_mlp": 0.01273422, "balance_loss_clip": 0.06284733, "balance_loss_mlp": 0.01262592, "epoch": 0.5984668570569668, "flos": 26326586972160.0, "grad_norm": 1.6908593071366855, "language_loss": 0.69816554, "learning_rate": 1.4657974286591807e-06, "loss": 0.77525222, "num_input_tokens_seen": 214503465, "router_z_loss_clip": 1.50585938, "router_z_loss_mlp": 0.1083374, "step": 9954, "time_per_iteration": 2.6014435291290283 }, { "auxiliary_loss_clip": 0.06439137, "auxiliary_loss_mlp": 0.01265498, "balance_loss_clip": 0.06284376, "balance_loss_mlp": 0.01254668, "epoch": 0.5985269803096348, "flos": 20599808146560.0, "grad_norm": 1.990342572904451, "language_loss": 0.73534936, "learning_rate": 1.4654221268507637e-06, "loss": 0.81239569, "num_input_tokens_seen": 214520725, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.10827637, "step": 9955, "time_per_iteration": 2.5517401695251465 }, { "auxiliary_loss_clip": 0.0643834, "auxiliary_loss_mlp": 0.01265568, "balance_loss_clip": 0.06286976, "balance_loss_mlp": 0.01254058, "epoch": 0.5985871035623027, "flos": 26871859416960.0, "grad_norm": 1.7363625436721633, "language_loss": 0.68924862, "learning_rate": 1.4650468453135934e-06, "loss": 0.76628768, "num_input_tokens_seen": 214540675, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.1151123, "step": 9956, "time_per_iteration": 2.584857940673828 }, { "auxiliary_loss_clip": 0.06443135, "auxiliary_loss_mlp": 0.01265833, "balance_loss_clip": 0.06289443, "balance_loss_mlp": 0.01255003, "epoch": 0.5986472268149707, "flos": 19615802123520.0, "grad_norm": 4.452106253954527, "language_loss": 0.73579156, "learning_rate": 1.4646715840618999e-06, "loss": 0.81288123, "num_input_tokens_seen": 214559910, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.1083374, "step": 9957, "time_per_iteration": 2.55755615234375 }, { "auxiliary_loss_clip": 0.06433503, "auxiliary_loss_mlp": 0.01270586, "balance_loss_clip": 0.06286979, "balance_loss_mlp": 0.01259803, "epoch": 0.5987073500676386, "flos": 21800371847040.0, "grad_norm": 2.8070134406996377, "language_loss": 0.8483721, "learning_rate": 1.4642963431099138e-06, "loss": 0.92541307, "num_input_tokens_seen": 214575960, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.10797119, "step": 9958, "time_per_iteration": 2.5492160320281982 }, { "auxiliary_loss_clip": 0.06440829, "auxiliary_loss_mlp": 0.01266626, "balance_loss_clip": 0.06286107, "balance_loss_mlp": 0.0125545, "epoch": 0.5987674733203067, "flos": 24320909715840.0, "grad_norm": 1.7441246883434176, "language_loss": 0.66534042, "learning_rate": 1.463921122471864e-06, "loss": 0.74241495, "num_input_tokens_seen": 214594230, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.1116333, "step": 9959, "time_per_iteration": 2.6383609771728516 }, { "auxiliary_loss_clip": 0.06439713, "auxiliary_loss_mlp": 0.01265576, "balance_loss_clip": 0.062884, "balance_loss_mlp": 0.01255205, "epoch": 0.5988275965729746, "flos": 21325859775360.0, "grad_norm": 1.6090059723935828, "language_loss": 0.83561432, "learning_rate": 1.4635459221619796e-06, "loss": 0.91266721, "num_input_tokens_seen": 214613130, "router_z_loss_clip": 1.51074219, "router_z_loss_mlp": 0.1038208, "step": 9960, "time_per_iteration": 2.5818328857421875 }, { "auxiliary_loss_clip": 0.06437454, "auxiliary_loss_mlp": 0.01269034, "balance_loss_clip": 0.06286642, "balance_loss_mlp": 0.01258043, "epoch": 0.5988877198256426, "flos": 25124891241600.0, "grad_norm": 1.8437648199698737, "language_loss": 0.79561949, "learning_rate": 1.4631707421944868e-06, "loss": 0.87268436, "num_input_tokens_seen": 214634470, "router_z_loss_clip": 1.50878906, "router_z_loss_mlp": 0.10986328, "step": 9961, "time_per_iteration": 2.6226680278778076 }, { "auxiliary_loss_clip": 0.06437224, "auxiliary_loss_mlp": 0.0126377, "balance_loss_clip": 0.0628569, "balance_loss_mlp": 0.01252517, "epoch": 0.5989478430783105, "flos": 26435767242240.0, "grad_norm": 1.9550880241432904, "language_loss": 0.6700626, "learning_rate": 1.4627955825836136e-06, "loss": 0.74707258, "num_input_tokens_seen": 214654030, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.11242676, "step": 9962, "time_per_iteration": 2.5916225910186768 }, { "auxiliary_loss_clip": 0.06437796, "auxiliary_loss_mlp": 0.01266873, "balance_loss_clip": 0.06285689, "balance_loss_mlp": 0.01255769, "epoch": 0.5990079663309785, "flos": 25786010355840.0, "grad_norm": 1.4911208514421805, "language_loss": 0.7464748, "learning_rate": 1.4624204433435857e-06, "loss": 0.82352149, "num_input_tokens_seen": 214676985, "router_z_loss_clip": 1.52148438, "router_z_loss_mlp": 0.11096191, "step": 9963, "time_per_iteration": 2.611821174621582 }, { "auxiliary_loss_clip": 0.06435319, "auxiliary_loss_mlp": 0.01266464, "balance_loss_clip": 0.06287281, "balance_loss_mlp": 0.01255765, "epoch": 0.5990680895836464, "flos": 36840889480320.0, "grad_norm": 1.6156220372019396, "language_loss": 0.68281472, "learning_rate": 1.4620453244886281e-06, "loss": 0.7598325, "num_input_tokens_seen": 214700105, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.10699463, "step": 9964, "time_per_iteration": 4.136505126953125 }, { "auxiliary_loss_clip": 0.0643446, "auxiliary_loss_mlp": 0.01268546, "balance_loss_clip": 0.06288286, "balance_loss_mlp": 0.01257621, "epoch": 0.5991282128363145, "flos": 24140340167040.0, "grad_norm": 1.9125324137259865, "language_loss": 0.77202952, "learning_rate": 1.4616702260329662e-06, "loss": 0.84905958, "num_input_tokens_seen": 214717885, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10931396, "step": 9965, "time_per_iteration": 2.5626211166381836 }, { "auxiliary_loss_clip": 0.06439456, "auxiliary_loss_mlp": 0.01266836, "balance_loss_clip": 0.06287184, "balance_loss_mlp": 0.01255999, "epoch": 0.5991883360889824, "flos": 10308310444800.0, "grad_norm": 1.7219968832707089, "language_loss": 0.77667522, "learning_rate": 1.4612951479908229e-06, "loss": 0.85373819, "num_input_tokens_seen": 214733680, "router_z_loss_clip": 1.52246094, "router_z_loss_mlp": 0.10839844, "step": 9966, "time_per_iteration": 2.534194231033325 }, { "auxiliary_loss_clip": 0.06438699, "auxiliary_loss_mlp": 0.01265657, "balance_loss_clip": 0.06289127, "balance_loss_mlp": 0.0125525, "epoch": 0.5992484593416504, "flos": 23957967755520.0, "grad_norm": 1.4586577062010788, "language_loss": 0.73583198, "learning_rate": 1.460920090376422e-06, "loss": 0.81287551, "num_input_tokens_seen": 214753285, "router_z_loss_clip": 1.49511719, "router_z_loss_mlp": 0.10412598, "step": 9967, "time_per_iteration": 2.5623838901519775 }, { "auxiliary_loss_clip": 0.06447944, "auxiliary_loss_mlp": 0.0127136, "balance_loss_clip": 0.06290627, "balance_loss_mlp": 0.01259314, "epoch": 0.5993085825943184, "flos": 11948320483200.0, "grad_norm": 2.019417606891075, "language_loss": 0.6855123, "learning_rate": 1.4605450532039847e-06, "loss": 0.76270539, "num_input_tokens_seen": 214767810, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.12042236, "step": 9968, "time_per_iteration": 2.5370211601257324 }, { "auxiliary_loss_clip": 0.06439394, "auxiliary_loss_mlp": 0.01267256, "balance_loss_clip": 0.06285373, "balance_loss_mlp": 0.0125592, "epoch": 0.5993687058469863, "flos": 19032990249600.0, "grad_norm": 1.4345673596676274, "language_loss": 0.79470623, "learning_rate": 1.4601700364877334e-06, "loss": 0.87177277, "num_input_tokens_seen": 214786040, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.11346436, "step": 9969, "time_per_iteration": 4.134939670562744 }, { "auxiliary_loss_clip": 0.0643456, "auxiliary_loss_mlp": 0.0126474, "balance_loss_clip": 0.06284048, "balance_loss_mlp": 0.01253146, "epoch": 0.5994288290996543, "flos": 14288204949120.0, "grad_norm": 2.380528790048058, "language_loss": 0.8114177, "learning_rate": 1.4597950402418889e-06, "loss": 0.88841069, "num_input_tokens_seen": 214803110, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.11590576, "step": 9970, "time_per_iteration": 2.595913887023926 }, { "auxiliary_loss_clip": 0.06442865, "auxiliary_loss_mlp": 0.01266818, "balance_loss_clip": 0.06289834, "balance_loss_mlp": 0.01254324, "epoch": 0.5994889523523222, "flos": 19212385841280.0, "grad_norm": 2.744743305541235, "language_loss": 0.62654024, "learning_rate": 1.4594200644806697e-06, "loss": 0.70363706, "num_input_tokens_seen": 214819945, "router_z_loss_clip": 1.53027344, "router_z_loss_mlp": 0.12481689, "step": 9971, "time_per_iteration": 2.6456072330474854 }, { "auxiliary_loss_clip": 0.06437914, "auxiliary_loss_mlp": 0.01265403, "balance_loss_clip": 0.06291203, "balance_loss_mlp": 0.01254853, "epoch": 0.5995490756049903, "flos": 28044401126400.0, "grad_norm": 1.5130033303441064, "language_loss": 0.79142427, "learning_rate": 1.4590451092182962e-06, "loss": 0.8684575, "num_input_tokens_seen": 214838810, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.10546875, "step": 9972, "time_per_iteration": 2.7187211513519287 }, { "auxiliary_loss_clip": 0.0644791, "auxiliary_loss_mlp": 0.0127072, "balance_loss_clip": 0.06290007, "balance_loss_mlp": 0.01258919, "epoch": 0.5996091988576582, "flos": 29059531741440.0, "grad_norm": 2.8448834157052243, "language_loss": 0.77034026, "learning_rate": 1.4586701744689864e-06, "loss": 0.84752655, "num_input_tokens_seen": 214857040, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.11804199, "step": 9973, "time_per_iteration": 2.663729667663574 }, { "auxiliary_loss_clip": 0.06437065, "auxiliary_loss_mlp": 0.0126914, "balance_loss_clip": 0.06285521, "balance_loss_mlp": 0.0125806, "epoch": 0.5996693221103262, "flos": 20820306965760.0, "grad_norm": 2.0353465946506724, "language_loss": 0.65568852, "learning_rate": 1.4582952602469578e-06, "loss": 0.73275059, "num_input_tokens_seen": 214873375, "router_z_loss_clip": 1.51269531, "router_z_loss_mlp": 0.11090088, "step": 9974, "time_per_iteration": 2.5460283756256104 }, { "auxiliary_loss_clip": 0.06437472, "auxiliary_loss_mlp": 0.01268471, "balance_loss_clip": 0.06285727, "balance_loss_mlp": 0.01257397, "epoch": 0.5997294453629941, "flos": 23775679198080.0, "grad_norm": 1.3344875909106142, "language_loss": 0.74822783, "learning_rate": 1.457920366566428e-06, "loss": 0.82528728, "num_input_tokens_seen": 214893900, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.11083984, "step": 9975, "time_per_iteration": 2.5916295051574707 }, { "auxiliary_loss_clip": 0.06438611, "auxiliary_loss_mlp": 0.01266477, "balance_loss_clip": 0.06288122, "balance_loss_mlp": 0.01255867, "epoch": 0.5997895686156621, "flos": 20966397759360.0, "grad_norm": 1.8901327353149784, "language_loss": 0.77661645, "learning_rate": 1.457545493441611e-06, "loss": 0.85366738, "num_input_tokens_seen": 214912110, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.10601807, "step": 9976, "time_per_iteration": 2.550029993057251 }, { "auxiliary_loss_clip": 0.06438388, "auxiliary_loss_mlp": 0.01264944, "balance_loss_clip": 0.0628874, "balance_loss_mlp": 0.01254263, "epoch": 0.59984969186833, "flos": 28372864331520.0, "grad_norm": 2.1919435479869023, "language_loss": 0.75499868, "learning_rate": 1.4571706408867237e-06, "loss": 0.83203197, "num_input_tokens_seen": 214930140, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.10681152, "step": 9977, "time_per_iteration": 2.605588912963867 }, { "auxiliary_loss_clip": 0.06437966, "auxiliary_loss_mlp": 0.01267674, "balance_loss_clip": 0.06285611, "balance_loss_mlp": 0.01256457, "epoch": 0.5999098151209981, "flos": 22572641802240.0, "grad_norm": 1.7331754372971002, "language_loss": 0.69369107, "learning_rate": 1.4567958089159802e-06, "loss": 0.77074742, "num_input_tokens_seen": 214949200, "router_z_loss_clip": 1.52441406, "router_z_loss_mlp": 0.11224365, "step": 9978, "time_per_iteration": 2.5466206073760986 }, { "auxiliary_loss_clip": 0.06437834, "auxiliary_loss_mlp": 0.01265679, "balance_loss_clip": 0.06283917, "balance_loss_mlp": 0.01254092, "epoch": 0.599969938373666, "flos": 18774365022720.0, "grad_norm": 2.606428748647148, "language_loss": 0.82051468, "learning_rate": 1.456420997543594e-06, "loss": 0.89754981, "num_input_tokens_seen": 214965775, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.11584473, "step": 9979, "time_per_iteration": 3.9188392162323 }, { "auxiliary_loss_clip": 0.06429946, "auxiliary_loss_mlp": 0.0126995, "balance_loss_clip": 0.06285629, "balance_loss_mlp": 0.0125937, "epoch": 0.600030061626334, "flos": 11331910321920.0, "grad_norm": 1.809832312340705, "language_loss": 0.70326352, "learning_rate": 1.4560462067837782e-06, "loss": 0.78026247, "num_input_tokens_seen": 214982480, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10583496, "step": 9980, "time_per_iteration": 2.5393593311309814 }, { "auxiliary_loss_clip": 0.0643913, "auxiliary_loss_mlp": 0.01269065, "balance_loss_clip": 0.06284571, "balance_loss_mlp": 0.01257377, "epoch": 0.600090184879002, "flos": 16583799732480.0, "grad_norm": 2.6714863397087774, "language_loss": 0.68901658, "learning_rate": 1.4556714366507445e-06, "loss": 0.7660985, "num_input_tokens_seen": 214998110, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.11688232, "step": 9981, "time_per_iteration": 2.506291627883911 }, { "auxiliary_loss_clip": 0.0643563, "auxiliary_loss_mlp": 0.01264001, "balance_loss_clip": 0.06286711, "balance_loss_mlp": 0.01253672, "epoch": 0.6001503081316699, "flos": 23624641013760.0, "grad_norm": 2.054169181307125, "language_loss": 0.78678942, "learning_rate": 1.4552966871587048e-06, "loss": 0.86378574, "num_input_tokens_seen": 215017995, "router_z_loss_clip": 1.48925781, "router_z_loss_mlp": 0.10327148, "step": 9982, "time_per_iteration": 4.030878305435181 }, { "auxiliary_loss_clip": 0.06439575, "auxiliary_loss_mlp": 0.01270884, "balance_loss_clip": 0.06291735, "balance_loss_mlp": 0.01259779, "epoch": 0.6002104313843379, "flos": 20673922682880.0, "grad_norm": 1.5252848489034636, "language_loss": 0.73581129, "learning_rate": 1.4549219583218686e-06, "loss": 0.81291592, "num_input_tokens_seen": 215038285, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.11114502, "step": 9983, "time_per_iteration": 2.5661489963531494 }, { "auxiliary_loss_clip": 0.06437469, "auxiliary_loss_mlp": 0.01263908, "balance_loss_clip": 0.06286012, "balance_loss_mlp": 0.01252505, "epoch": 0.6002705546370058, "flos": 22461742523520.0, "grad_norm": 1.9361293761786218, "language_loss": 0.78096783, "learning_rate": 1.454547250154447e-06, "loss": 0.85798156, "num_input_tokens_seen": 215057825, "router_z_loss_clip": 1.51367188, "router_z_loss_mlp": 0.11395264, "step": 9984, "time_per_iteration": 2.5716440677642822 }, { "auxiliary_loss_clip": 0.06431696, "auxiliary_loss_mlp": 0.01266134, "balance_loss_clip": 0.0628187, "balance_loss_mlp": 0.01255435, "epoch": 0.6003306778896739, "flos": 25199005777920.0, "grad_norm": 2.0767199388903865, "language_loss": 0.83277303, "learning_rate": 1.4541725626706485e-06, "loss": 0.90975142, "num_input_tokens_seen": 215077790, "router_z_loss_clip": 1.49804688, "router_z_loss_mlp": 0.1071167, "step": 9985, "time_per_iteration": 2.6258718967437744 }, { "auxiliary_loss_clip": 0.06436167, "auxiliary_loss_mlp": 0.01272012, "balance_loss_clip": 0.06286831, "balance_loss_mlp": 0.01261778, "epoch": 0.6003908011423418, "flos": 26694979447680.0, "grad_norm": 2.379603516740877, "language_loss": 0.71563578, "learning_rate": 1.4537978958846809e-06, "loss": 0.79271758, "num_input_tokens_seen": 215097650, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.10235596, "step": 9986, "time_per_iteration": 2.6173899173736572 }, { "auxiliary_loss_clip": 0.06441383, "auxiliary_loss_mlp": 0.01266935, "balance_loss_clip": 0.06291384, "balance_loss_mlp": 0.01256266, "epoch": 0.6004509243950098, "flos": 22571677480320.0, "grad_norm": 1.3580620133246732, "language_loss": 0.71859127, "learning_rate": 1.4534232498107514e-06, "loss": 0.79567444, "num_input_tokens_seen": 215118235, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.10675049, "step": 9987, "time_per_iteration": 2.60791277885437 }, { "auxiliary_loss_clip": 0.06429262, "auxiliary_loss_mlp": 0.01269842, "balance_loss_clip": 0.06282544, "balance_loss_mlp": 0.01259924, "epoch": 0.6005110476476777, "flos": 19725443591040.0, "grad_norm": 1.668215284396634, "language_loss": 0.85228515, "learning_rate": 1.4530486244630673e-06, "loss": 0.92927617, "num_input_tokens_seen": 215136755, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.09912109, "step": 9988, "time_per_iteration": 2.5720086097717285 }, { "auxiliary_loss_clip": 0.06429155, "auxiliary_loss_mlp": 0.01267041, "balance_loss_clip": 0.06280249, "balance_loss_mlp": 0.01256068, "epoch": 0.6005711709003457, "flos": 17718340815360.0, "grad_norm": 1.7042324199279966, "language_loss": 0.66097808, "learning_rate": 1.4526740198558346e-06, "loss": 0.73794007, "num_input_tokens_seen": 215155225, "router_z_loss_clip": 1.48925781, "router_z_loss_mlp": 0.10974121, "step": 9989, "time_per_iteration": 2.6048378944396973 }, { "auxiliary_loss_clip": 0.06432988, "auxiliary_loss_mlp": 0.01264845, "balance_loss_clip": 0.06283918, "balance_loss_mlp": 0.01254182, "epoch": 0.6006312941530136, "flos": 18520267916160.0, "grad_norm": 1.5889861977186395, "language_loss": 0.80862081, "learning_rate": 1.452299436003257e-06, "loss": 0.88559914, "num_input_tokens_seen": 215174815, "router_z_loss_clip": 1.48925781, "router_z_loss_mlp": 0.10656738, "step": 9990, "time_per_iteration": 2.5627281665802 }, { "auxiliary_loss_clip": 0.06433364, "auxiliary_loss_mlp": 0.01271498, "balance_loss_clip": 0.06280403, "balance_loss_mlp": 0.01260137, "epoch": 0.6006914174056817, "flos": 21396117024000.0, "grad_norm": 2.5481405927715315, "language_loss": 0.82824773, "learning_rate": 1.4519248729195403e-06, "loss": 0.90529633, "num_input_tokens_seen": 215192045, "router_z_loss_clip": 1.52929688, "router_z_loss_mlp": 0.1137085, "step": 9991, "time_per_iteration": 2.564030170440674 }, { "auxiliary_loss_clip": 0.06427835, "auxiliary_loss_mlp": 0.0126703, "balance_loss_clip": 0.06281301, "balance_loss_mlp": 0.01256021, "epoch": 0.6007515406583496, "flos": 12755488464000.0, "grad_norm": 1.6071023010844545, "language_loss": 0.82909667, "learning_rate": 1.4515503306188878e-06, "loss": 0.90604538, "num_input_tokens_seen": 215209885, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.11016846, "step": 9992, "time_per_iteration": 2.5608267784118652 }, { "auxiliary_loss_clip": 0.06432424, "auxiliary_loss_mlp": 0.01266676, "balance_loss_clip": 0.06284173, "balance_loss_mlp": 0.01255268, "epoch": 0.6008116639110176, "flos": 19212679330560.0, "grad_norm": 1.899433881202069, "language_loss": 0.66862041, "learning_rate": 1.4511758091155008e-06, "loss": 0.74561143, "num_input_tokens_seen": 215228150, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.11419678, "step": 9993, "time_per_iteration": 2.5748934745788574 }, { "auxiliary_loss_clip": 0.0642897, "auxiliary_loss_mlp": 0.01267421, "balance_loss_clip": 0.06280354, "balance_loss_mlp": 0.01256168, "epoch": 0.6008717871636855, "flos": 17060953207680.0, "grad_norm": 2.388996316726103, "language_loss": 0.81156278, "learning_rate": 1.4508013084235826e-06, "loss": 0.88852668, "num_input_tokens_seen": 215243755, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.11248779, "step": 9994, "time_per_iteration": 2.5299949645996094 }, { "auxiliary_loss_clip": 0.06424157, "auxiliary_loss_mlp": 0.01266718, "balance_loss_clip": 0.06281061, "balance_loss_mlp": 0.01256627, "epoch": 0.6009319104163535, "flos": 20304188542080.0, "grad_norm": 1.8105183819674735, "language_loss": 0.72806215, "learning_rate": 1.4504268285573337e-06, "loss": 0.80497092, "num_input_tokens_seen": 215262130, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.10095215, "step": 9995, "time_per_iteration": 2.5709919929504395 }, { "auxiliary_loss_clip": 0.06431873, "auxiliary_loss_mlp": 0.01265406, "balance_loss_clip": 0.06280067, "balance_loss_mlp": 0.0125482, "epoch": 0.6009920336690215, "flos": 21843487572480.0, "grad_norm": 1.7670547170047812, "language_loss": 0.81052327, "learning_rate": 1.4500523695309546e-06, "loss": 0.88749611, "num_input_tokens_seen": 215281785, "router_z_loss_clip": 1.51757812, "router_z_loss_mlp": 0.10583496, "step": 9996, "time_per_iteration": 2.60028338432312 }, { "auxiliary_loss_clip": 0.06432854, "auxiliary_loss_mlp": 0.01269797, "balance_loss_clip": 0.06284315, "balance_loss_mlp": 0.01259021, "epoch": 0.6010521569216895, "flos": 22601795823360.0, "grad_norm": 2.4507288322503418, "language_loss": 0.7902261, "learning_rate": 1.4496779313586447e-06, "loss": 0.86725259, "num_input_tokens_seen": 215297550, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.10772705, "step": 9997, "time_per_iteration": 2.780071496963501 }, { "auxiliary_loss_clip": 0.06435271, "auxiliary_loss_mlp": 0.01267361, "balance_loss_clip": 0.06281787, "balance_loss_mlp": 0.01255851, "epoch": 0.6011122801743575, "flos": 19177697450880.0, "grad_norm": 9.66326352149246, "language_loss": 0.73408478, "learning_rate": 1.4493035140546028e-06, "loss": 0.81111109, "num_input_tokens_seen": 215316360, "router_z_loss_clip": 1.53417969, "router_z_loss_mlp": 0.11505127, "step": 9998, "time_per_iteration": 2.643296957015991 }, { "auxiliary_loss_clip": 0.06428022, "auxiliary_loss_mlp": 0.01264861, "balance_loss_clip": 0.06280246, "balance_loss_mlp": 0.01254502, "epoch": 0.6011724034270254, "flos": 25017094563840.0, "grad_norm": 1.4610134211025947, "language_loss": 0.72782242, "learning_rate": 1.448929117633027e-06, "loss": 0.80475128, "num_input_tokens_seen": 215336405, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.1036377, "step": 9999, "time_per_iteration": 2.6301937103271484 }, { "auxiliary_loss_clip": 0.06433453, "auxiliary_loss_mlp": 0.01264756, "balance_loss_clip": 0.06280662, "balance_loss_mlp": 0.01253777, "epoch": 0.6012325266796934, "flos": 21803935645440.0, "grad_norm": 1.4370720902463783, "language_loss": 0.78383356, "learning_rate": 1.4485547421081142e-06, "loss": 0.8608157, "num_input_tokens_seen": 215356590, "router_z_loss_clip": 1.52929688, "router_z_loss_mlp": 0.10974121, "step": 10000, "time_per_iteration": 2.5490055084228516 }, { "auxiliary_loss_clip": 0.06437166, "auxiliary_loss_mlp": 0.01266719, "balance_loss_clip": 0.06281131, "balance_loss_mlp": 0.01254285, "epoch": 0.6012926499323613, "flos": 19579059308160.0, "grad_norm": 2.0229561886043697, "language_loss": 0.77863526, "learning_rate": 1.4481803874940608e-06, "loss": 0.85567415, "num_input_tokens_seen": 215374295, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.12420654, "step": 10001, "time_per_iteration": 2.6265623569488525 }, { "auxiliary_loss_clip": 0.06437063, "auxiliary_loss_mlp": 0.01266761, "balance_loss_clip": 0.06282976, "balance_loss_mlp": 0.01255842, "epoch": 0.6013527731850293, "flos": 34869439416960.0, "grad_norm": 1.7048571405327144, "language_loss": 0.58235586, "learning_rate": 1.4478060538050624e-06, "loss": 0.65939409, "num_input_tokens_seen": 215394535, "router_z_loss_clip": 1.54003906, "router_z_loss_mlp": 0.10919189, "step": 10002, "time_per_iteration": 2.6600284576416016 }, { "auxiliary_loss_clip": 0.06440033, "auxiliary_loss_mlp": 0.01267105, "balance_loss_clip": 0.06287123, "balance_loss_mlp": 0.01255989, "epoch": 0.6014128964376972, "flos": 23298190306560.0, "grad_norm": 1.5655227859772405, "language_loss": 0.78324485, "learning_rate": 1.447431741055314e-06, "loss": 0.86031622, "num_input_tokens_seen": 215414355, "router_z_loss_clip": 1.52929688, "router_z_loss_mlp": 0.11114502, "step": 10003, "time_per_iteration": 2.5615897178649902 }, { "auxiliary_loss_clip": 0.06433677, "auxiliary_loss_mlp": 0.01264734, "balance_loss_clip": 0.06281327, "balance_loss_mlp": 0.01253421, "epoch": 0.6014730196903653, "flos": 24826839868800.0, "grad_norm": 2.537359412024504, "language_loss": 0.7818436, "learning_rate": 1.4470574492590091e-06, "loss": 0.85882771, "num_input_tokens_seen": 215428280, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.11309814, "step": 10004, "time_per_iteration": 3.98757004737854 }, { "auxiliary_loss_clip": 0.06430006, "auxiliary_loss_mlp": 0.01266227, "balance_loss_clip": 0.06279394, "balance_loss_mlp": 0.01254891, "epoch": 0.6015331429430332, "flos": 23119046277120.0, "grad_norm": 1.6065030684538522, "language_loss": 0.72513163, "learning_rate": 1.4466831784303408e-06, "loss": 0.80209398, "num_input_tokens_seen": 215448970, "router_z_loss_clip": 1.50683594, "router_z_loss_mlp": 0.11352539, "step": 10005, "time_per_iteration": 2.5509190559387207 }, { "auxiliary_loss_clip": 0.06427269, "auxiliary_loss_mlp": 0.0126817, "balance_loss_clip": 0.06281362, "balance_loss_mlp": 0.01257721, "epoch": 0.6015932661957012, "flos": 19206222566400.0, "grad_norm": 2.250047257281236, "language_loss": 0.75134301, "learning_rate": 1.4463089285835026e-06, "loss": 0.82829744, "num_input_tokens_seen": 215465260, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10443115, "step": 10006, "time_per_iteration": 2.525743007659912 }, { "auxiliary_loss_clip": 0.06429288, "auxiliary_loss_mlp": 0.0126953, "balance_loss_clip": 0.0627913, "balance_loss_mlp": 0.01257889, "epoch": 0.6016533894483691, "flos": 18119451110400.0, "grad_norm": 1.8197433517441925, "language_loss": 0.74501801, "learning_rate": 1.445934699732685e-06, "loss": 0.82200623, "num_input_tokens_seen": 215482725, "router_z_loss_clip": 1.50292969, "router_z_loss_mlp": 0.11651611, "step": 10007, "time_per_iteration": 2.511765241622925 }, { "auxiliary_loss_clip": 0.06430244, "auxiliary_loss_mlp": 0.01265435, "balance_loss_clip": 0.06281981, "balance_loss_mlp": 0.01254807, "epoch": 0.6017135127010371, "flos": 16222492926720.0, "grad_norm": 1.6943273655663251, "language_loss": 0.70048791, "learning_rate": 1.4455604918920785e-06, "loss": 0.77744472, "num_input_tokens_seen": 215500420, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.10632324, "step": 10008, "time_per_iteration": 3.970412254333496 }, { "auxiliary_loss_clip": 0.06433378, "auxiliary_loss_mlp": 0.01264191, "balance_loss_clip": 0.06284754, "balance_loss_mlp": 0.01253903, "epoch": 0.6017736359537051, "flos": 23451576405120.0, "grad_norm": 2.1515486786123366, "language_loss": 0.76491565, "learning_rate": 1.4451863050758748e-06, "loss": 0.84189129, "num_input_tokens_seen": 215522260, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.10290527, "step": 10009, "time_per_iteration": 2.599696159362793 }, { "auxiliary_loss_clip": 0.06429181, "auxiliary_loss_mlp": 0.0126998, "balance_loss_clip": 0.06280304, "balance_loss_mlp": 0.01258786, "epoch": 0.601833759206373, "flos": 23520869331840.0, "grad_norm": 2.0455592516749403, "language_loss": 0.74635595, "learning_rate": 1.4448121392982608e-06, "loss": 0.82334757, "num_input_tokens_seen": 215541715, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.11193848, "step": 10010, "time_per_iteration": 2.618704319000244 }, { "auxiliary_loss_clip": 0.06323949, "auxiliary_loss_mlp": 0.01261963, "balance_loss_clip": 0.0626236, "balance_loss_mlp": 0.01260501, "epoch": 0.6018938824590411, "flos": 64013846215680.0, "grad_norm": 0.7834066121028744, "language_loss": 0.55057096, "learning_rate": 1.4444379945734268e-06, "loss": 0.62643003, "num_input_tokens_seen": 215603020, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.01461792, "step": 10011, "time_per_iteration": 3.2298645973205566 }, { "auxiliary_loss_clip": 0.06427127, "auxiliary_loss_mlp": 0.01265491, "balance_loss_clip": 0.06277686, "balance_loss_mlp": 0.01254524, "epoch": 0.601954005711709, "flos": 34648311692160.0, "grad_norm": 1.3489799450699431, "language_loss": 0.62051773, "learning_rate": 1.44406387091556e-06, "loss": 0.69744396, "num_input_tokens_seen": 215625115, "router_z_loss_clip": 1.49511719, "router_z_loss_mlp": 0.10974121, "step": 10012, "time_per_iteration": 2.746562957763672 }, { "auxiliary_loss_clip": 0.06423029, "auxiliary_loss_mlp": 0.01261882, "balance_loss_clip": 0.06275696, "balance_loss_mlp": 0.01251756, "epoch": 0.602014128964377, "flos": 19433094295680.0, "grad_norm": 1.9343256627372563, "language_loss": 0.75428689, "learning_rate": 1.4436897683388462e-06, "loss": 0.83113599, "num_input_tokens_seen": 215643730, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.10119629, "step": 10013, "time_per_iteration": 2.5946784019470215 }, { "auxiliary_loss_clip": 0.06419803, "auxiliary_loss_mlp": 0.01263038, "balance_loss_clip": 0.0627716, "balance_loss_mlp": 0.01253537, "epoch": 0.6020742522170449, "flos": 28336876202880.0, "grad_norm": 1.6146306828002375, "language_loss": 0.81912792, "learning_rate": 1.4433156868574732e-06, "loss": 0.89595634, "num_input_tokens_seen": 215664425, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.0949707, "step": 10014, "time_per_iteration": 2.6395106315612793 }, { "auxiliary_loss_clip": 0.06419436, "auxiliary_loss_mlp": 0.01263545, "balance_loss_clip": 0.06276507, "balance_loss_mlp": 0.01253794, "epoch": 0.6021343754697129, "flos": 22753588694400.0, "grad_norm": 1.737387615242534, "language_loss": 0.72550142, "learning_rate": 1.442941626485624e-06, "loss": 0.80233121, "num_input_tokens_seen": 215684280, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.09765625, "step": 10015, "time_per_iteration": 2.620523691177368 }, { "auxiliary_loss_clip": 0.0632287, "auxiliary_loss_mlp": 0.01254604, "balance_loss_clip": 0.06261452, "balance_loss_mlp": 0.01253053, "epoch": 0.6021944987223808, "flos": 65769885360000.0, "grad_norm": 0.816036333912924, "language_loss": 0.54898828, "learning_rate": 1.4425675872374848e-06, "loss": 0.62476301, "num_input_tokens_seen": 215739780, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.01548767, "step": 10016, "time_per_iteration": 3.117598295211792 }, { "auxiliary_loss_clip": 0.0642652, "auxiliary_loss_mlp": 0.01265968, "balance_loss_clip": 0.06280553, "balance_loss_mlp": 0.01255406, "epoch": 0.6022546219750489, "flos": 16110377763840.0, "grad_norm": 1.5576815802840265, "language_loss": 0.82865071, "learning_rate": 1.4421935691272381e-06, "loss": 0.90557563, "num_input_tokens_seen": 215757885, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10565186, "step": 10017, "time_per_iteration": 2.5872864723205566 }, { "auxiliary_loss_clip": 0.06423153, "auxiliary_loss_mlp": 0.0126645, "balance_loss_clip": 0.0627956, "balance_loss_mlp": 0.01256377, "epoch": 0.6023147452277168, "flos": 25518328888320.0, "grad_norm": 1.9035087441909861, "language_loss": 0.83674711, "learning_rate": 1.4418195721690677e-06, "loss": 0.91364312, "num_input_tokens_seen": 215776415, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.10076904, "step": 10018, "time_per_iteration": 2.5887722969055176 }, { "auxiliary_loss_clip": 0.06435992, "auxiliary_loss_mlp": 0.01267332, "balance_loss_clip": 0.06281605, "balance_loss_mlp": 0.01256395, "epoch": 0.6023748684803848, "flos": 22642353999360.0, "grad_norm": 1.659089331984476, "language_loss": 0.78178, "learning_rate": 1.4414455963771549e-06, "loss": 0.85881329, "num_input_tokens_seen": 215794865, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.109375, "step": 10019, "time_per_iteration": 4.046645879745483 }, { "auxiliary_loss_clip": 0.06424458, "auxiliary_loss_mlp": 0.01264299, "balance_loss_clip": 0.0627695, "balance_loss_mlp": 0.01253654, "epoch": 0.6024349917330527, "flos": 26217113212800.0, "grad_norm": 1.4661814540274445, "language_loss": 0.74309766, "learning_rate": 1.441071641765681e-06, "loss": 0.81998521, "num_input_tokens_seen": 215816840, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.10650635, "step": 10020, "time_per_iteration": 2.639920234680176 }, { "auxiliary_loss_clip": 0.06426857, "auxiliary_loss_mlp": 0.01271489, "balance_loss_clip": 0.06277777, "balance_loss_mlp": 0.0126054, "epoch": 0.6024951149857207, "flos": 21258160076160.0, "grad_norm": 1.4925917762139562, "language_loss": 0.64177263, "learning_rate": 1.4406977083488264e-06, "loss": 0.71875608, "num_input_tokens_seen": 215836100, "router_z_loss_clip": 1.49121094, "router_z_loss_mlp": 0.10943604, "step": 10021, "time_per_iteration": 4.266006708145142 }, { "auxiliary_loss_clip": 0.0642571, "auxiliary_loss_mlp": 0.01270108, "balance_loss_clip": 0.06277503, "balance_loss_mlp": 0.01259409, "epoch": 0.6025552382383887, "flos": 26950795562880.0, "grad_norm": 1.4573138525666738, "language_loss": 0.8091678, "learning_rate": 1.4403237961407704e-06, "loss": 0.88612592, "num_input_tokens_seen": 215858480, "router_z_loss_clip": 1.47949219, "router_z_loss_mlp": 0.10699463, "step": 10022, "time_per_iteration": 2.700888156890869 }, { "auxiliary_loss_clip": 0.06432369, "auxiliary_loss_mlp": 0.01270983, "balance_loss_clip": 0.06279067, "balance_loss_mlp": 0.01260076, "epoch": 0.6026153614910567, "flos": 31692142846080.0, "grad_norm": 1.959015694928921, "language_loss": 0.6682384, "learning_rate": 1.439949905155693e-06, "loss": 0.74527198, "num_input_tokens_seen": 215879950, "router_z_loss_clip": 1.53417969, "router_z_loss_mlp": 0.10900879, "step": 10023, "time_per_iteration": 2.6897101402282715 }, { "auxiliary_loss_clip": 0.06430008, "auxiliary_loss_mlp": 0.01268211, "balance_loss_clip": 0.06280442, "balance_loss_mlp": 0.01257303, "epoch": 0.6026754847437247, "flos": 29320085612160.0, "grad_norm": 1.9142048036343637, "language_loss": 0.74850279, "learning_rate": 1.4395760354077707e-06, "loss": 0.82548499, "num_input_tokens_seen": 215899830, "router_z_loss_clip": 1.49414062, "router_z_loss_mlp": 0.10913086, "step": 10024, "time_per_iteration": 2.6179018020629883 }, { "auxiliary_loss_clip": 0.06425808, "auxiliary_loss_mlp": 0.0126511, "balance_loss_clip": 0.06277759, "balance_loss_mlp": 0.01254172, "epoch": 0.6027356079963926, "flos": 23593558348800.0, "grad_norm": 1.8286087881779702, "language_loss": 0.72973961, "learning_rate": 1.4392021869111815e-06, "loss": 0.80664885, "num_input_tokens_seen": 215920440, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.109375, "step": 10025, "time_per_iteration": 2.646559000015259 }, { "auxiliary_loss_clip": 0.06433383, "auxiliary_loss_mlp": 0.01266119, "balance_loss_clip": 0.06278336, "balance_loss_mlp": 0.01254294, "epoch": 0.6027957312490606, "flos": 20820055403520.0, "grad_norm": 2.4746606643731446, "language_loss": 0.67804372, "learning_rate": 1.4388283596801016e-06, "loss": 0.75503868, "num_input_tokens_seen": 215940535, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.1182251, "step": 10026, "time_per_iteration": 2.5998494625091553 }, { "auxiliary_loss_clip": 0.0641761, "auxiliary_loss_mlp": 0.01266143, "balance_loss_clip": 0.06274538, "balance_loss_mlp": 0.01256636, "epoch": 0.6028558545017285, "flos": 19941540071040.0, "grad_norm": 1.7608858042929125, "language_loss": 0.80331421, "learning_rate": 1.4384545537287061e-06, "loss": 0.88015169, "num_input_tokens_seen": 215958045, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.09515381, "step": 10027, "time_per_iteration": 2.5834410190582275 }, { "auxiliary_loss_clip": 0.06434642, "auxiliary_loss_mlp": 0.0127007, "balance_loss_clip": 0.0628172, "balance_loss_mlp": 0.01258596, "epoch": 0.6029159777543965, "flos": 22827535522560.0, "grad_norm": 1.9566469578276156, "language_loss": 0.71516442, "learning_rate": 1.438080769071171e-06, "loss": 0.79221153, "num_input_tokens_seen": 215977330, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.11480713, "step": 10028, "time_per_iteration": 2.558833599090576 }, { "auxiliary_loss_clip": 0.0642685, "auxiliary_loss_mlp": 0.01270825, "balance_loss_clip": 0.06276837, "balance_loss_mlp": 0.01259303, "epoch": 0.6029761010070644, "flos": 23594103400320.0, "grad_norm": 1.9300970013478305, "language_loss": 0.8418715, "learning_rate": 1.437707005721669e-06, "loss": 0.91884828, "num_input_tokens_seen": 215997865, "router_z_loss_clip": 1.49804688, "router_z_loss_mlp": 0.11523438, "step": 10029, "time_per_iteration": 2.6037497520446777 }, { "auxiliary_loss_clip": 0.06422865, "auxiliary_loss_mlp": 0.01268775, "balance_loss_clip": 0.06277162, "balance_loss_mlp": 0.01258266, "epoch": 0.6030362242597325, "flos": 13667518229760.0, "grad_norm": 1.7582772389336978, "language_loss": 0.80324399, "learning_rate": 1.437333263694373e-06, "loss": 0.88016045, "num_input_tokens_seen": 216016230, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10510254, "step": 10030, "time_per_iteration": 2.53212571144104 }, { "auxiliary_loss_clip": 0.06429582, "auxiliary_loss_mlp": 0.0126927, "balance_loss_clip": 0.06279987, "balance_loss_mlp": 0.01258463, "epoch": 0.6030963475124004, "flos": 24429293372160.0, "grad_norm": 2.6730174051811497, "language_loss": 0.71189523, "learning_rate": 1.4369595430034572e-06, "loss": 0.78888369, "num_input_tokens_seen": 216035785, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.10809326, "step": 10031, "time_per_iteration": 2.6074612140655518 }, { "auxiliary_loss_clip": 0.06432739, "auxiliary_loss_mlp": 0.01264802, "balance_loss_clip": 0.062789, "balance_loss_mlp": 0.01252947, "epoch": 0.6031564707650684, "flos": 29651944907520.0, "grad_norm": 1.5884345681372427, "language_loss": 0.73593384, "learning_rate": 1.4365858436630912e-06, "loss": 0.81290925, "num_input_tokens_seen": 216059555, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.11859131, "step": 10032, "time_per_iteration": 2.629582166671753 }, { "auxiliary_loss_clip": 0.06432889, "auxiliary_loss_mlp": 0.01270709, "balance_loss_clip": 0.06281276, "balance_loss_mlp": 0.01259318, "epoch": 0.6032165940177363, "flos": 16624525616640.0, "grad_norm": 1.8291869030783348, "language_loss": 0.68725652, "learning_rate": 1.4362121656874465e-06, "loss": 0.76429248, "num_input_tokens_seen": 216077235, "router_z_loss_clip": 1.51464844, "router_z_loss_mlp": 0.1138916, "step": 10033, "time_per_iteration": 2.5633585453033447 }, { "auxiliary_loss_clip": 0.06426217, "auxiliary_loss_mlp": 0.01270106, "balance_loss_clip": 0.06279348, "balance_loss_mlp": 0.01259061, "epoch": 0.6032767172704043, "flos": 17493020386560.0, "grad_norm": 1.7498900782353861, "language_loss": 0.75733274, "learning_rate": 1.4358385090906934e-06, "loss": 0.83429599, "num_input_tokens_seen": 216094985, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.1105957, "step": 10034, "time_per_iteration": 2.5325546264648438 }, { "auxiliary_loss_clip": 0.06430152, "auxiliary_loss_mlp": 0.01269064, "balance_loss_clip": 0.06278072, "balance_loss_mlp": 0.01258252, "epoch": 0.6033368405230723, "flos": 26840105919360.0, "grad_norm": 1.7248237301510554, "language_loss": 0.74643064, "learning_rate": 1.4354648738870004e-06, "loss": 0.82342279, "num_input_tokens_seen": 216115905, "router_z_loss_clip": 1.51953125, "router_z_loss_mlp": 0.10809326, "step": 10035, "time_per_iteration": 2.6184091567993164 }, { "auxiliary_loss_clip": 0.06425012, "auxiliary_loss_mlp": 0.01263834, "balance_loss_clip": 0.06280379, "balance_loss_mlp": 0.01253468, "epoch": 0.6033969637757403, "flos": 16915575173760.0, "grad_norm": 1.6341928317169334, "language_loss": 0.8661319, "learning_rate": 1.435091260090536e-06, "loss": 0.94302034, "num_input_tokens_seen": 216132420, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.1036377, "step": 10036, "time_per_iteration": 2.5058233737945557 }, { "auxiliary_loss_clip": 0.06430837, "auxiliary_loss_mlp": 0.01266816, "balance_loss_clip": 0.06279819, "balance_loss_mlp": 0.01255223, "epoch": 0.6034570870284083, "flos": 22936757719680.0, "grad_norm": 1.6771530943213964, "language_loss": 0.70572388, "learning_rate": 1.4347176677154676e-06, "loss": 0.78270042, "num_input_tokens_seen": 216149800, "router_z_loss_clip": 1.51074219, "router_z_loss_mlp": 0.1159668, "step": 10037, "time_per_iteration": 2.5667548179626465 }, { "auxiliary_loss_clip": 0.06429473, "auxiliary_loss_mlp": 0.0126725, "balance_loss_clip": 0.06281576, "balance_loss_mlp": 0.01256646, "epoch": 0.6035172102810762, "flos": 23372807967360.0, "grad_norm": 2.0340019133989427, "language_loss": 0.8537792, "learning_rate": 1.4343440967759616e-06, "loss": 0.93074638, "num_input_tokens_seen": 216168200, "router_z_loss_clip": 1.47949219, "router_z_loss_mlp": 0.105896, "step": 10038, "time_per_iteration": 2.607579469680786 }, { "auxiliary_loss_clip": 0.06428632, "auxiliary_loss_mlp": 0.01269341, "balance_loss_clip": 0.06278569, "balance_loss_mlp": 0.01258511, "epoch": 0.6035773335337442, "flos": 20893457180160.0, "grad_norm": 2.28435976546839, "language_loss": 0.77693963, "learning_rate": 1.4339705472861846e-06, "loss": 0.85391939, "num_input_tokens_seen": 216187105, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.10827637, "step": 10039, "time_per_iteration": 2.556645631790161 }, { "auxiliary_loss_clip": 0.0642384, "auxiliary_loss_mlp": 0.01265277, "balance_loss_clip": 0.06276536, "balance_loss_mlp": 0.01254929, "epoch": 0.6036374567864121, "flos": 24943231589760.0, "grad_norm": 1.7072437561343878, "language_loss": 0.71670341, "learning_rate": 1.433597019260301e-06, "loss": 0.7935946, "num_input_tokens_seen": 216205440, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.10345459, "step": 10040, "time_per_iteration": 2.5665571689605713 }, { "auxiliary_loss_clip": 0.06429924, "auxiliary_loss_mlp": 0.01268672, "balance_loss_clip": 0.06276694, "balance_loss_mlp": 0.01256346, "epoch": 0.6036975800390801, "flos": 23154866697600.0, "grad_norm": 1.8334339484569895, "language_loss": 0.78378963, "learning_rate": 1.433223512712475e-06, "loss": 0.86077559, "num_input_tokens_seen": 216223130, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.12316895, "step": 10041, "time_per_iteration": 2.5705785751342773 }, { "auxiliary_loss_clip": 0.06426333, "auxiliary_loss_mlp": 0.01261883, "balance_loss_clip": 0.06278807, "balance_loss_mlp": 0.01251696, "epoch": 0.603757703291748, "flos": 18666610272000.0, "grad_norm": 1.818003080770303, "language_loss": 0.75388646, "learning_rate": 1.4328500276568704e-06, "loss": 0.83076859, "num_input_tokens_seen": 216240260, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.10180664, "step": 10042, "time_per_iteration": 2.54439640045166 }, { "auxiliary_loss_clip": 0.06423871, "auxiliary_loss_mlp": 0.01266577, "balance_loss_clip": 0.06276421, "balance_loss_mlp": 0.01255758, "epoch": 0.6038178265444161, "flos": 19688700775680.0, "grad_norm": 1.6731914917043158, "language_loss": 0.84760916, "learning_rate": 1.4324765641076498e-06, "loss": 0.92451358, "num_input_tokens_seen": 216258510, "router_z_loss_clip": 1.47558594, "router_z_loss_mlp": 0.1081543, "step": 10043, "time_per_iteration": 2.551680564880371 }, { "auxiliary_loss_clip": 0.06432665, "auxiliary_loss_mlp": 0.01267727, "balance_loss_clip": 0.06278928, "balance_loss_mlp": 0.01256181, "epoch": 0.603877949797084, "flos": 22644869621760.0, "grad_norm": 1.8257935180922837, "language_loss": 0.69456828, "learning_rate": 1.432103122078974e-06, "loss": 0.77157223, "num_input_tokens_seen": 216277550, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.11541748, "step": 10044, "time_per_iteration": 4.123095750808716 }, { "auxiliary_loss_clip": 0.0643253, "auxiliary_loss_mlp": 0.01267013, "balance_loss_clip": 0.06280923, "balance_loss_mlp": 0.01255211, "epoch": 0.603938073049752, "flos": 25455031528320.0, "grad_norm": 1.9967448339694536, "language_loss": 0.78105497, "learning_rate": 1.4317297015850057e-06, "loss": 0.85805047, "num_input_tokens_seen": 216296690, "router_z_loss_clip": 1.51464844, "router_z_loss_mlp": 0.11798096, "step": 10045, "time_per_iteration": 2.595402956008911 }, { "auxiliary_loss_clip": 0.06425723, "auxiliary_loss_mlp": 0.01265336, "balance_loss_clip": 0.06277906, "balance_loss_mlp": 0.01254411, "epoch": 0.6039981963024199, "flos": 22345686218880.0, "grad_norm": 1.7673847794042548, "language_loss": 0.77770281, "learning_rate": 1.4313563026399036e-06, "loss": 0.85461342, "num_input_tokens_seen": 216316110, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.109375, "step": 10046, "time_per_iteration": 2.5525805950164795 }, { "auxiliary_loss_clip": 0.06428719, "auxiliary_loss_mlp": 0.01267968, "balance_loss_clip": 0.06278488, "balance_loss_mlp": 0.0125796, "epoch": 0.6040583195550879, "flos": 20709239978880.0, "grad_norm": 1.5537363390010792, "language_loss": 0.87394071, "learning_rate": 1.430982925257827e-06, "loss": 0.95090759, "num_input_tokens_seen": 216333855, "router_z_loss_clip": 1.50195312, "router_z_loss_mlp": 0.10009766, "step": 10047, "time_per_iteration": 2.561022996902466 }, { "auxiliary_loss_clip": 0.06431323, "auxiliary_loss_mlp": 0.01266501, "balance_loss_clip": 0.06286214, "balance_loss_mlp": 0.01256118, "epoch": 0.604118442807756, "flos": 27170623549440.0, "grad_norm": 1.4763943828080677, "language_loss": 0.75654888, "learning_rate": 1.4306095694529358e-06, "loss": 0.83352709, "num_input_tokens_seen": 216354890, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.10388184, "step": 10048, "time_per_iteration": 4.027806997299194 }, { "auxiliary_loss_clip": 0.06439938, "auxiliary_loss_mlp": 0.01268516, "balance_loss_clip": 0.06282488, "balance_loss_mlp": 0.01256035, "epoch": 0.6041785660604239, "flos": 30889125642240.0, "grad_norm": 1.9046969995185623, "language_loss": 0.66178536, "learning_rate": 1.430236235239386e-06, "loss": 0.73886991, "num_input_tokens_seen": 216376055, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.12481689, "step": 10049, "time_per_iteration": 2.648360252380371 }, { "auxiliary_loss_clip": 0.06426489, "auxiliary_loss_mlp": 0.01265655, "balance_loss_clip": 0.06278025, "balance_loss_mlp": 0.01255331, "epoch": 0.6042386893130919, "flos": 19944391109760.0, "grad_norm": 1.4953944783204924, "language_loss": 0.66800106, "learning_rate": 1.429862922631336e-06, "loss": 0.74492246, "num_input_tokens_seen": 216396295, "router_z_loss_clip": 1.48339844, "router_z_loss_mlp": 0.10314941, "step": 10050, "time_per_iteration": 2.5411300659179688 }, { "auxiliary_loss_clip": 0.0643373, "auxiliary_loss_mlp": 0.01268288, "balance_loss_clip": 0.06282049, "balance_loss_mlp": 0.01256832, "epoch": 0.6042988125657598, "flos": 32424106187520.0, "grad_norm": 1.7471232302402702, "language_loss": 0.70013762, "learning_rate": 1.4294896316429408e-06, "loss": 0.77715778, "num_input_tokens_seen": 216416605, "router_z_loss_clip": 1.51464844, "router_z_loss_mlp": 0.11456299, "step": 10051, "time_per_iteration": 2.6481332778930664 }, { "auxiliary_loss_clip": 0.06428057, "auxiliary_loss_mlp": 0.01268324, "balance_loss_clip": 0.06279441, "balance_loss_mlp": 0.01256701, "epoch": 0.6043589358184278, "flos": 17426578498560.0, "grad_norm": 1.7686489897467785, "language_loss": 0.6534754, "learning_rate": 1.4291163622883553e-06, "loss": 0.73043919, "num_input_tokens_seen": 216435130, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.11633301, "step": 10052, "time_per_iteration": 2.5175886154174805 }, { "auxiliary_loss_clip": 0.06435989, "auxiliary_loss_mlp": 0.01269292, "balance_loss_clip": 0.06285468, "balance_loss_mlp": 0.01257645, "epoch": 0.6044190590710957, "flos": 27680243281920.0, "grad_norm": 1.8098287908982247, "language_loss": 0.69584846, "learning_rate": 1.4287431145817358e-06, "loss": 0.77290124, "num_input_tokens_seen": 216455640, "router_z_loss_clip": 1.50292969, "router_z_loss_mlp": 0.11645508, "step": 10053, "time_per_iteration": 2.6035170555114746 }, { "auxiliary_loss_clip": 0.06319912, "auxiliary_loss_mlp": 0.01250815, "balance_loss_clip": 0.06258663, "balance_loss_mlp": 0.01249392, "epoch": 0.6044791823237637, "flos": 65334422090880.0, "grad_norm": 0.721971750543477, "language_loss": 0.60311413, "learning_rate": 1.4283698885372336e-06, "loss": 0.67882144, "num_input_tokens_seen": 216518130, "router_z_loss_clip": 0.61328125, "router_z_loss_mlp": 0.01422119, "step": 10054, "time_per_iteration": 3.2566165924072266 }, { "auxiliary_loss_clip": 0.06431326, "auxiliary_loss_mlp": 0.0126722, "balance_loss_clip": 0.06285174, "balance_loss_mlp": 0.01255931, "epoch": 0.6045393055764317, "flos": 24498208955520.0, "grad_norm": 1.7360113579584457, "language_loss": 0.85853601, "learning_rate": 1.4279966841690027e-06, "loss": 0.93552154, "num_input_tokens_seen": 216536845, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.112854, "step": 10055, "time_per_iteration": 2.5800669193267822 }, { "auxiliary_loss_clip": 0.06434352, "auxiliary_loss_mlp": 0.01272361, "balance_loss_clip": 0.06283164, "balance_loss_mlp": 0.01260279, "epoch": 0.6045994288290997, "flos": 19058999742720.0, "grad_norm": 2.36067637063733, "language_loss": 0.74560893, "learning_rate": 1.4276235014911952e-06, "loss": 0.82267612, "num_input_tokens_seen": 216551860, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.12091064, "step": 10056, "time_per_iteration": 2.5324578285217285 }, { "auxiliary_loss_clip": 0.06428468, "auxiliary_loss_mlp": 0.01262012, "balance_loss_clip": 0.06282276, "balance_loss_mlp": 0.01252231, "epoch": 0.6046595520817676, "flos": 26583660898560.0, "grad_norm": 1.706356281536992, "language_loss": 0.8041867, "learning_rate": 1.4272503405179616e-06, "loss": 0.88109148, "num_input_tokens_seen": 216574775, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.09790039, "step": 10057, "time_per_iteration": 2.6245319843292236 }, { "auxiliary_loss_clip": 0.06425908, "auxiliary_loss_mlp": 0.01270285, "balance_loss_clip": 0.06279933, "balance_loss_mlp": 0.01259676, "epoch": 0.6047196753344356, "flos": 13586150315520.0, "grad_norm": 3.3308249383759576, "language_loss": 0.75715423, "learning_rate": 1.4268772012634527e-06, "loss": 0.8341161, "num_input_tokens_seen": 216590100, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.1060791, "step": 10058, "time_per_iteration": 3.971231698989868 }, { "auxiliary_loss_clip": 0.06423753, "auxiliary_loss_mlp": 0.01266006, "balance_loss_clip": 0.06278073, "balance_loss_mlp": 0.01254776, "epoch": 0.6047797985871035, "flos": 25527552837120.0, "grad_norm": 1.8695806592234903, "language_loss": 0.71759677, "learning_rate": 1.4265040837418176e-06, "loss": 0.79449439, "num_input_tokens_seen": 216610145, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.11236572, "step": 10059, "time_per_iteration": 2.6200673580169678 }, { "auxiliary_loss_clip": 0.06434371, "auxiliary_loss_mlp": 0.01264861, "balance_loss_clip": 0.06284121, "balance_loss_mlp": 0.01253727, "epoch": 0.6048399218397715, "flos": 20526112880640.0, "grad_norm": 1.4598601150142518, "language_loss": 0.76486015, "learning_rate": 1.4261309879672054e-06, "loss": 0.84185243, "num_input_tokens_seen": 216630625, "router_z_loss_clip": 1.50195312, "router_z_loss_mlp": 0.11138916, "step": 10060, "time_per_iteration": 2.586993455886841 }, { "auxiliary_loss_clip": 0.06427363, "auxiliary_loss_mlp": 0.01267394, "balance_loss_clip": 0.06278922, "balance_loss_mlp": 0.01256576, "epoch": 0.6049000450924396, "flos": 20414416988160.0, "grad_norm": 2.1465576809811435, "language_loss": 0.73885453, "learning_rate": 1.4257579139537628e-06, "loss": 0.8158021, "num_input_tokens_seen": 216649255, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.10809326, "step": 10061, "time_per_iteration": 4.021202087402344 }, { "auxiliary_loss_clip": 0.06434725, "auxiliary_loss_mlp": 0.01264205, "balance_loss_clip": 0.0628416, "balance_loss_mlp": 0.0125384, "epoch": 0.6049601683451075, "flos": 20747743729920.0, "grad_norm": 1.6813726865660177, "language_loss": 0.67909324, "learning_rate": 1.425384861715639e-06, "loss": 0.75608253, "num_input_tokens_seen": 216668100, "router_z_loss_clip": 1.50585938, "router_z_loss_mlp": 0.10357666, "step": 10062, "time_per_iteration": 2.5713064670562744 }, { "auxiliary_loss_clip": 0.06426065, "auxiliary_loss_mlp": 0.01267587, "balance_loss_clip": 0.06278428, "balance_loss_mlp": 0.01256447, "epoch": 0.6050202915977755, "flos": 20089140238080.0, "grad_norm": 2.000232527192362, "language_loss": 0.71513188, "learning_rate": 1.425011831266978e-06, "loss": 0.79206836, "num_input_tokens_seen": 216686125, "router_z_loss_clip": 1.47558594, "router_z_loss_mlp": 0.11132812, "step": 10063, "time_per_iteration": 2.5617709159851074 }, { "auxiliary_loss_clip": 0.06425472, "auxiliary_loss_mlp": 0.01264287, "balance_loss_clip": 0.06278566, "balance_loss_mlp": 0.01253516, "epoch": 0.6050804148504434, "flos": 15966257541120.0, "grad_norm": 1.8480465834774804, "language_loss": 0.84547591, "learning_rate": 1.424638822621926e-06, "loss": 0.92237347, "num_input_tokens_seen": 216704265, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.10766602, "step": 10064, "time_per_iteration": 2.5415730476379395 }, { "auxiliary_loss_clip": 0.06429449, "auxiliary_loss_mlp": 0.01264827, "balance_loss_clip": 0.06281234, "balance_loss_mlp": 0.012538, "epoch": 0.6051405381031114, "flos": 17462315064960.0, "grad_norm": 2.59237468178539, "language_loss": 0.7991966, "learning_rate": 1.4242658357946278e-06, "loss": 0.87613928, "num_input_tokens_seen": 216721765, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.11016846, "step": 10065, "time_per_iteration": 2.512890100479126 }, { "auxiliary_loss_clip": 0.064392, "auxiliary_loss_mlp": 0.01265577, "balance_loss_clip": 0.06285703, "balance_loss_mlp": 0.01253608, "epoch": 0.6052006613557793, "flos": 11404808974080.0, "grad_norm": 2.086822588644443, "language_loss": 0.78911328, "learning_rate": 1.423892870799226e-06, "loss": 0.86616111, "num_input_tokens_seen": 216738295, "router_z_loss_clip": 1.53417969, "router_z_loss_mlp": 0.11981201, "step": 10066, "time_per_iteration": 2.5241100788116455 }, { "auxiliary_loss_clip": 0.06431232, "auxiliary_loss_mlp": 0.01267798, "balance_loss_clip": 0.06281187, "balance_loss_mlp": 0.01256747, "epoch": 0.6052607846084473, "flos": 24757421160960.0, "grad_norm": 5.386256878983955, "language_loss": 0.73227799, "learning_rate": 1.4235199276498655e-06, "loss": 0.80926824, "num_input_tokens_seen": 216759875, "router_z_loss_clip": 1.49804688, "router_z_loss_mlp": 0.1105957, "step": 10067, "time_per_iteration": 2.590097665786743 }, { "auxiliary_loss_clip": 0.06429937, "auxiliary_loss_mlp": 0.01268149, "balance_loss_clip": 0.0628017, "balance_loss_mlp": 0.01256806, "epoch": 0.6053209078611153, "flos": 20747492167680.0, "grad_norm": 1.9874094106634013, "language_loss": 0.69397759, "learning_rate": 1.4231470063606863e-06, "loss": 0.77095842, "num_input_tokens_seen": 216780705, "router_z_loss_clip": 1.49707031, "router_z_loss_mlp": 0.11340332, "step": 10068, "time_per_iteration": 2.5790698528289795 }, { "auxiliary_loss_clip": 0.06432365, "auxiliary_loss_mlp": 0.01265065, "balance_loss_clip": 0.06280042, "balance_loss_mlp": 0.01254032, "epoch": 0.6053810311137833, "flos": 18959169202560.0, "grad_norm": 2.243112053965681, "language_loss": 0.87022299, "learning_rate": 1.4227741069458303e-06, "loss": 0.94719732, "num_input_tokens_seen": 216797625, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.11035156, "step": 10069, "time_per_iteration": 2.5381979942321777 }, { "auxiliary_loss_clip": 0.06428304, "auxiliary_loss_mlp": 0.01267783, "balance_loss_clip": 0.0628016, "balance_loss_mlp": 0.01256965, "epoch": 0.6054411543664512, "flos": 23957883901440.0, "grad_norm": 1.4393660085045215, "language_loss": 0.83082509, "learning_rate": 1.4224012294194387e-06, "loss": 0.90778601, "num_input_tokens_seen": 216817610, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.10821533, "step": 10070, "time_per_iteration": 2.571248769760132 }, { "auxiliary_loss_clip": 0.06430742, "auxiliary_loss_mlp": 0.01269975, "balance_loss_clip": 0.0627835, "balance_loss_mlp": 0.01258662, "epoch": 0.6055012776191192, "flos": 20600101635840.0, "grad_norm": 1.5348367877767768, "language_loss": 0.86322105, "learning_rate": 1.4220283737956496e-06, "loss": 0.94022822, "num_input_tokens_seen": 216836835, "router_z_loss_clip": 1.52441406, "router_z_loss_mlp": 0.11315918, "step": 10071, "time_per_iteration": 2.5475220680236816 }, { "auxiliary_loss_clip": 0.06436978, "auxiliary_loss_mlp": 0.01272195, "balance_loss_clip": 0.06282554, "balance_loss_mlp": 0.01260131, "epoch": 0.6055614008717871, "flos": 30305768716800.0, "grad_norm": 1.5944212465219194, "language_loss": 0.7743417, "learning_rate": 1.421655540088603e-06, "loss": 0.8514334, "num_input_tokens_seen": 216856760, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.12054443, "step": 10072, "time_per_iteration": 2.6160757541656494 }, { "auxiliary_loss_clip": 0.06433039, "auxiliary_loss_mlp": 0.01265624, "balance_loss_clip": 0.06281078, "balance_loss_mlp": 0.01253011, "epoch": 0.6056215241244551, "flos": 27132245579520.0, "grad_norm": 1.50997162108215, "language_loss": 0.7486254, "learning_rate": 1.4212827283124367e-06, "loss": 0.82561201, "num_input_tokens_seen": 216878795, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.12615967, "step": 10073, "time_per_iteration": 2.604703426361084 }, { "auxiliary_loss_clip": 0.06323235, "auxiliary_loss_mlp": 0.01254515, "balance_loss_clip": 0.06261906, "balance_loss_mlp": 0.01253115, "epoch": 0.6056816473771232, "flos": 56023073124480.0, "grad_norm": 0.7457220179915864, "language_loss": 0.55263698, "learning_rate": 1.4209099384812863e-06, "loss": 0.62841451, "num_input_tokens_seen": 216937800, "router_z_loss_clip": 0.61669922, "router_z_loss_mlp": 0.01400757, "step": 10074, "time_per_iteration": 3.218590259552002 }, { "auxiliary_loss_clip": 0.06430964, "auxiliary_loss_mlp": 0.01267335, "balance_loss_clip": 0.06282319, "balance_loss_mlp": 0.01256547, "epoch": 0.6057417706297911, "flos": 23556144700800.0, "grad_norm": 1.9397483431951081, "language_loss": 0.82215309, "learning_rate": 1.4205371706092894e-06, "loss": 0.89913607, "num_input_tokens_seen": 216955280, "router_z_loss_clip": 1.48730469, "router_z_loss_mlp": 0.10784912, "step": 10075, "time_per_iteration": 2.5731639862060547 }, { "auxiliary_loss_clip": 0.06429596, "auxiliary_loss_mlp": 0.01267073, "balance_loss_clip": 0.0627806, "balance_loss_mlp": 0.01255754, "epoch": 0.6058018938824591, "flos": 27751464852480.0, "grad_norm": 2.1044902404138326, "language_loss": 0.78535068, "learning_rate": 1.4201644247105813e-06, "loss": 0.86231732, "num_input_tokens_seen": 216976950, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.11328125, "step": 10076, "time_per_iteration": 2.645106315612793 }, { "auxiliary_loss_clip": 0.06431428, "auxiliary_loss_mlp": 0.01263642, "balance_loss_clip": 0.06279069, "balance_loss_mlp": 0.01252281, "epoch": 0.605862017135127, "flos": 22789912239360.0, "grad_norm": 1.5867911552971594, "language_loss": 0.72037053, "learning_rate": 1.4197917007992964e-06, "loss": 0.7973212, "num_input_tokens_seen": 216996945, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.11364746, "step": 10077, "time_per_iteration": 2.59171462059021 }, { "auxiliary_loss_clip": 0.06433567, "auxiliary_loss_mlp": 0.01269561, "balance_loss_clip": 0.06283286, "balance_loss_mlp": 0.01258183, "epoch": 0.605922140387795, "flos": 21221375333760.0, "grad_norm": 1.5422661003531637, "language_loss": 0.55795044, "learning_rate": 1.4194189988895682e-06, "loss": 0.63498175, "num_input_tokens_seen": 217016580, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.11376953, "step": 10078, "time_per_iteration": 2.579740285873413 }, { "auxiliary_loss_clip": 0.06437258, "auxiliary_loss_mlp": 0.01270184, "balance_loss_clip": 0.06283604, "balance_loss_mlp": 0.01259163, "epoch": 0.6059822636404629, "flos": 27275191845120.0, "grad_norm": 1.520447847939048, "language_loss": 0.70660257, "learning_rate": 1.4190463189955297e-06, "loss": 0.78367698, "num_input_tokens_seen": 217037300, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.11016846, "step": 10079, "time_per_iteration": 2.5877041816711426 }, { "auxiliary_loss_clip": 0.06429575, "auxiliary_loss_mlp": 0.01270433, "balance_loss_clip": 0.06280625, "balance_loss_mlp": 0.01259179, "epoch": 0.606042386893131, "flos": 20637599137920.0, "grad_norm": 5.004886240706061, "language_loss": 0.62592053, "learning_rate": 1.4186736611313131e-06, "loss": 0.70292056, "num_input_tokens_seen": 217055805, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.11248779, "step": 10080, "time_per_iteration": 2.545707941055298 }, { "auxiliary_loss_clip": 0.06431232, "auxiliary_loss_mlp": 0.01265048, "balance_loss_clip": 0.06280783, "balance_loss_mlp": 0.01254176, "epoch": 0.6061025101457989, "flos": 23008859758080.0, "grad_norm": 1.7950785177911461, "language_loss": 0.71221846, "learning_rate": 1.4183010253110492e-06, "loss": 0.78918129, "num_input_tokens_seen": 217074175, "router_z_loss_clip": 1.50292969, "router_z_loss_mlp": 0.10876465, "step": 10081, "time_per_iteration": 2.528141736984253 }, { "auxiliary_loss_clip": 0.06435395, "auxiliary_loss_mlp": 0.0126713, "balance_loss_clip": 0.06283595, "balance_loss_mlp": 0.01256067, "epoch": 0.6061626333984669, "flos": 29906796700800.0, "grad_norm": 1.7875473257183596, "language_loss": 0.6930573, "learning_rate": 1.4179284115488691e-06, "loss": 0.77008259, "num_input_tokens_seen": 217095695, "router_z_loss_clip": 1.51757812, "router_z_loss_mlp": 0.1105957, "step": 10082, "time_per_iteration": 2.6134889125823975 }, { "auxiliary_loss_clip": 0.06430511, "auxiliary_loss_mlp": 0.01266711, "balance_loss_clip": 0.06280865, "balance_loss_mlp": 0.01256101, "epoch": 0.6062227566511348, "flos": 25016130241920.0, "grad_norm": 1.4649673090496118, "language_loss": 0.65985513, "learning_rate": 1.4175558198589015e-06, "loss": 0.73682737, "num_input_tokens_seen": 217116260, "router_z_loss_clip": 1.49707031, "router_z_loss_mlp": 0.10614014, "step": 10083, "time_per_iteration": 4.040276527404785 }, { "auxiliary_loss_clip": 0.06433219, "auxiliary_loss_mlp": 0.01265861, "balance_loss_clip": 0.06281221, "balance_loss_mlp": 0.01255532, "epoch": 0.6062828799038028, "flos": 19470046746240.0, "grad_norm": 1.8950941568200024, "language_loss": 0.7453438, "learning_rate": 1.4171832502552764e-06, "loss": 0.82233465, "num_input_tokens_seen": 217134465, "router_z_loss_clip": 1.51953125, "router_z_loss_mlp": 0.10327148, "step": 10084, "time_per_iteration": 2.5393142700195312 }, { "auxiliary_loss_clip": 0.06429201, "auxiliary_loss_mlp": 0.01267836, "balance_loss_clip": 0.06279702, "balance_loss_mlp": 0.01257203, "epoch": 0.6063430031564707, "flos": 13594661504640.0, "grad_norm": 2.485791598481245, "language_loss": 0.73277193, "learning_rate": 1.4168107027521204e-06, "loss": 0.80974233, "num_input_tokens_seen": 217149920, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.10638428, "step": 10085, "time_per_iteration": 2.5122897624969482 }, { "auxiliary_loss_clip": 0.06429621, "auxiliary_loss_mlp": 0.01266794, "balance_loss_clip": 0.06281938, "balance_loss_mlp": 0.01255808, "epoch": 0.6064031264091387, "flos": 23261740980480.0, "grad_norm": 2.0293870349184027, "language_loss": 0.7627151, "learning_rate": 1.4164381773635605e-06, "loss": 0.83967924, "num_input_tokens_seen": 217168165, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.10992432, "step": 10086, "time_per_iteration": 2.5534591674804688 }, { "auxiliary_loss_clip": 0.06424256, "auxiliary_loss_mlp": 0.01265771, "balance_loss_clip": 0.06277855, "balance_loss_mlp": 0.01255286, "epoch": 0.6064632496618068, "flos": 22465515957120.0, "grad_norm": 1.6058568376050097, "language_loss": 0.7283684, "learning_rate": 1.4160656741037246e-06, "loss": 0.80526865, "num_input_tokens_seen": 217190070, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.1048584, "step": 10087, "time_per_iteration": 4.0198163986206055 }, { "auxiliary_loss_clip": 0.06424809, "auxiliary_loss_mlp": 0.0126555, "balance_loss_clip": 0.06279489, "balance_loss_mlp": 0.01255334, "epoch": 0.6065233729144747, "flos": 25125604001280.0, "grad_norm": 1.6623186697994063, "language_loss": 0.84397429, "learning_rate": 1.4156931929867355e-06, "loss": 0.92087793, "num_input_tokens_seen": 217209370, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.10217285, "step": 10088, "time_per_iteration": 2.5700364112854004 }, { "auxiliary_loss_clip": 0.06425135, "auxiliary_loss_mlp": 0.01267417, "balance_loss_clip": 0.06278034, "balance_loss_mlp": 0.01256987, "epoch": 0.6065834961671427, "flos": 23484126516480.0, "grad_norm": 2.2222202334893537, "language_loss": 0.71728659, "learning_rate": 1.4153207340267201e-06, "loss": 0.7942121, "num_input_tokens_seen": 217226990, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.10430908, "step": 10089, "time_per_iteration": 2.5993335247039795 }, { "auxiliary_loss_clip": 0.06426768, "auxiliary_loss_mlp": 0.01265034, "balance_loss_clip": 0.06277829, "balance_loss_mlp": 0.01254574, "epoch": 0.6066436194198106, "flos": 17025090860160.0, "grad_norm": 1.9609429824231348, "language_loss": 0.83528805, "learning_rate": 1.4149482972378009e-06, "loss": 0.91220605, "num_input_tokens_seen": 217244585, "router_z_loss_clip": 1.49023438, "router_z_loss_mlp": 0.10467529, "step": 10090, "time_per_iteration": 2.514073371887207 }, { "auxiliary_loss_clip": 0.06439721, "auxiliary_loss_mlp": 0.01268594, "balance_loss_clip": 0.06280841, "balance_loss_mlp": 0.01256721, "epoch": 0.6067037426724786, "flos": 18520603332480.0, "grad_norm": 2.113135300702792, "language_loss": 0.76482648, "learning_rate": 1.4145758826341e-06, "loss": 0.84190965, "num_input_tokens_seen": 217263435, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.11877441, "step": 10091, "time_per_iteration": 2.671992063522339 }, { "auxiliary_loss_clip": 0.06427781, "auxiliary_loss_mlp": 0.01269203, "balance_loss_clip": 0.06279803, "balance_loss_mlp": 0.01258456, "epoch": 0.6067638659251465, "flos": 22352520326400.0, "grad_norm": 1.4580829669680637, "language_loss": 0.7986573, "learning_rate": 1.4142034902297415e-06, "loss": 0.8756271, "num_input_tokens_seen": 217283725, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.10748291, "step": 10092, "time_per_iteration": 2.6996171474456787 }, { "auxiliary_loss_clip": 0.06425328, "auxiliary_loss_mlp": 0.01266588, "balance_loss_clip": 0.06274548, "balance_loss_mlp": 0.01255549, "epoch": 0.6068239891778145, "flos": 12454669906560.0, "grad_norm": 2.1498760580772, "language_loss": 0.76556766, "learning_rate": 1.4138311200388444e-06, "loss": 0.84248686, "num_input_tokens_seen": 217301120, "router_z_loss_clip": 1.50878906, "router_z_loss_mlp": 0.11047363, "step": 10093, "time_per_iteration": 2.5287232398986816 }, { "auxiliary_loss_clip": 0.06428663, "auxiliary_loss_mlp": 0.01265806, "balance_loss_clip": 0.06282674, "balance_loss_mlp": 0.01255662, "epoch": 0.6068841124304825, "flos": 23192657688960.0, "grad_norm": 1.8918402288872216, "language_loss": 0.87775159, "learning_rate": 1.4134587720755304e-06, "loss": 0.95469618, "num_input_tokens_seen": 217319585, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10150146, "step": 10094, "time_per_iteration": 2.580928087234497 }, { "auxiliary_loss_clip": 0.06426664, "auxiliary_loss_mlp": 0.01268781, "balance_loss_clip": 0.06277506, "balance_loss_mlp": 0.01258046, "epoch": 0.6069442356831505, "flos": 18593795473920.0, "grad_norm": 1.6851435974979792, "language_loss": 0.72385895, "learning_rate": 1.413086446353919e-06, "loss": 0.80081344, "num_input_tokens_seen": 217338880, "router_z_loss_clip": 1.48925781, "router_z_loss_mlp": 0.10742188, "step": 10095, "time_per_iteration": 2.540243625640869 }, { "auxiliary_loss_clip": 0.06430432, "auxiliary_loss_mlp": 0.01267733, "balance_loss_clip": 0.06280913, "balance_loss_mlp": 0.01257672, "epoch": 0.6070043589358184, "flos": 20966775102720.0, "grad_norm": 1.6339050877515622, "language_loss": 0.76739311, "learning_rate": 1.4127141428881273e-06, "loss": 0.84437478, "num_input_tokens_seen": 217357480, "router_z_loss_clip": 1.49511719, "router_z_loss_mlp": 0.1005249, "step": 10096, "time_per_iteration": 2.563478708267212 }, { "auxiliary_loss_clip": 0.06432585, "auxiliary_loss_mlp": 0.01269016, "balance_loss_clip": 0.06281812, "balance_loss_mlp": 0.01257965, "epoch": 0.6070644821884864, "flos": 11697242123520.0, "grad_norm": 2.6692329024466104, "language_loss": 0.79692477, "learning_rate": 1.4123418616922749e-06, "loss": 0.87394071, "num_input_tokens_seen": 217374575, "router_z_loss_clip": 1.50878906, "router_z_loss_mlp": 0.1104126, "step": 10097, "time_per_iteration": 2.5189359188079834 }, { "auxiliary_loss_clip": 0.06421062, "auxiliary_loss_mlp": 0.01267107, "balance_loss_clip": 0.06275515, "balance_loss_mlp": 0.01256968, "epoch": 0.6071246054411543, "flos": 19315402836480.0, "grad_norm": 1.3647826059184842, "language_loss": 0.67398655, "learning_rate": 1.411969602780478e-06, "loss": 0.7508682, "num_input_tokens_seen": 217392950, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10144043, "step": 10098, "time_per_iteration": 3.9285335540771484 }, { "auxiliary_loss_clip": 0.06421813, "auxiliary_loss_mlp": 0.01266634, "balance_loss_clip": 0.0627535, "balance_loss_mlp": 0.012564, "epoch": 0.6071847286938223, "flos": 17754832068480.0, "grad_norm": 1.8028513177283594, "language_loss": 0.80419725, "learning_rate": 1.4115973661668523e-06, "loss": 0.8810817, "num_input_tokens_seen": 217412145, "router_z_loss_clip": 1.46289062, "router_z_loss_mlp": 0.10241699, "step": 10099, "time_per_iteration": 2.528315782546997 }, { "auxiliary_loss_clip": 0.06433947, "auxiliary_loss_mlp": 0.01269055, "balance_loss_clip": 0.06279504, "balance_loss_mlp": 0.01257689, "epoch": 0.6072448519464904, "flos": 22644031080960.0, "grad_norm": 1.7277918509230938, "language_loss": 0.70777577, "learning_rate": 1.4112251518655133e-06, "loss": 0.78480577, "num_input_tokens_seen": 217432080, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.1137085, "step": 10100, "time_per_iteration": 4.037968635559082 }, { "auxiliary_loss_clip": 0.06430884, "auxiliary_loss_mlp": 0.01266715, "balance_loss_clip": 0.06281243, "balance_loss_mlp": 0.01254037, "epoch": 0.6073049751991583, "flos": 19543490449920.0, "grad_norm": 1.8107056947626963, "language_loss": 0.71107441, "learning_rate": 1.4108529598905764e-06, "loss": 0.78805047, "num_input_tokens_seen": 217450945, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.12670898, "step": 10101, "time_per_iteration": 2.5611257553100586 }, { "auxiliary_loss_clip": 0.0642633, "auxiliary_loss_mlp": 0.01264286, "balance_loss_clip": 0.0627965, "balance_loss_mlp": 0.01254308, "epoch": 0.6073650984518263, "flos": 28301936250240.0, "grad_norm": 2.063009399360401, "language_loss": 0.69612443, "learning_rate": 1.410480790256154e-06, "loss": 0.77303058, "num_input_tokens_seen": 217473105, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.09979248, "step": 10102, "time_per_iteration": 2.639251708984375 }, { "auxiliary_loss_clip": 0.06429857, "auxiliary_loss_mlp": 0.01265078, "balance_loss_clip": 0.06279691, "balance_loss_mlp": 0.01254546, "epoch": 0.6074252217044942, "flos": 25671211862400.0, "grad_norm": 1.9697144713016732, "language_loss": 0.74156594, "learning_rate": 1.4101086429763589e-06, "loss": 0.8185153, "num_input_tokens_seen": 217491780, "router_z_loss_clip": 1.50195312, "router_z_loss_mlp": 0.10534668, "step": 10103, "time_per_iteration": 2.621812105178833 }, { "auxiliary_loss_clip": 0.0644142, "auxiliary_loss_mlp": 0.01270208, "balance_loss_clip": 0.06285266, "balance_loss_mlp": 0.01258084, "epoch": 0.6074853449571622, "flos": 22863775213440.0, "grad_norm": 1.538009455521662, "language_loss": 0.77044952, "learning_rate": 1.4097365180653032e-06, "loss": 0.84756577, "num_input_tokens_seen": 217510605, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.12133789, "step": 10104, "time_per_iteration": 2.5912084579467773 }, { "auxiliary_loss_clip": 0.06325262, "auxiliary_loss_mlp": 0.01251926, "balance_loss_clip": 0.06263895, "balance_loss_mlp": 0.01250217, "epoch": 0.6075454682098301, "flos": 67131088536960.0, "grad_norm": 0.6954762167378996, "language_loss": 0.55542564, "learning_rate": 1.4093644155370977e-06, "loss": 0.63119745, "num_input_tokens_seen": 217574815, "router_z_loss_clip": 0.61474609, "router_z_loss_mlp": 0.01712036, "step": 10105, "time_per_iteration": 3.1830201148986816 }, { "auxiliary_loss_clip": 0.06329271, "auxiliary_loss_mlp": 0.01252628, "balance_loss_clip": 0.06267922, "balance_loss_mlp": 0.01251001, "epoch": 0.6076055914624982, "flos": 70730389797120.0, "grad_norm": 0.7386633685347095, "language_loss": 0.56794071, "learning_rate": 1.4089923354058533e-06, "loss": 0.64375973, "num_input_tokens_seen": 217632375, "router_z_loss_clip": 0.61376953, "router_z_loss_mlp": 0.01629639, "step": 10106, "time_per_iteration": 3.138795852661133 }, { "auxiliary_loss_clip": 0.06423196, "auxiliary_loss_mlp": 0.01263128, "balance_loss_clip": 0.06277868, "balance_loss_mlp": 0.01253037, "epoch": 0.6076657147151661, "flos": 28371816155520.0, "grad_norm": 1.7230983830689235, "language_loss": 0.68933111, "learning_rate": 1.4086202776856784e-06, "loss": 0.76619434, "num_input_tokens_seen": 217653055, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.10095215, "step": 10107, "time_per_iteration": 2.6130733489990234 }, { "auxiliary_loss_clip": 0.06430772, "auxiliary_loss_mlp": 0.01265853, "balance_loss_clip": 0.06278297, "balance_loss_mlp": 0.01255691, "epoch": 0.6077258379678341, "flos": 15055234024320.0, "grad_norm": 1.6380043271135254, "language_loss": 0.81003153, "learning_rate": 1.4082482423906815e-06, "loss": 0.88699782, "num_input_tokens_seen": 217671520, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.10162354, "step": 10108, "time_per_iteration": 2.5313374996185303 }, { "auxiliary_loss_clip": 0.06434429, "auxiliary_loss_mlp": 0.01268193, "balance_loss_clip": 0.06281599, "balance_loss_mlp": 0.01256576, "epoch": 0.607785961220502, "flos": 36174948756480.0, "grad_norm": 1.9603738466187737, "language_loss": 0.71888953, "learning_rate": 1.4078762295349714e-06, "loss": 0.79591572, "num_input_tokens_seen": 217691880, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.1161499, "step": 10109, "time_per_iteration": 2.6606671810150146 }, { "auxiliary_loss_clip": 0.06418914, "auxiliary_loss_mlp": 0.01268915, "balance_loss_clip": 0.06276627, "balance_loss_mlp": 0.01259569, "epoch": 0.60784608447317, "flos": 22530113055360.0, "grad_norm": 1.537864884024869, "language_loss": 0.80304319, "learning_rate": 1.407504239132653e-06, "loss": 0.87992144, "num_input_tokens_seen": 217710530, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09338379, "step": 10110, "time_per_iteration": 2.539569616317749 }, { "auxiliary_loss_clip": 0.06426257, "auxiliary_loss_mlp": 0.01269674, "balance_loss_clip": 0.06277256, "balance_loss_mlp": 0.01259285, "epoch": 0.6079062077258379, "flos": 23847823163520.0, "grad_norm": 2.597217971300898, "language_loss": 0.70823985, "learning_rate": 1.4071322711978338e-06, "loss": 0.78519917, "num_input_tokens_seen": 217728650, "router_z_loss_clip": 1.48925781, "router_z_loss_mlp": 0.1038208, "step": 10111, "time_per_iteration": 2.558689594268799 }, { "auxiliary_loss_clip": 0.06429993, "auxiliary_loss_mlp": 0.01265782, "balance_loss_clip": 0.06278455, "balance_loss_mlp": 0.01253938, "epoch": 0.6079663309785059, "flos": 23373646508160.0, "grad_norm": 1.6669796791767593, "language_loss": 0.64906329, "learning_rate": 1.4067603257446186e-06, "loss": 0.72602105, "num_input_tokens_seen": 217747135, "router_z_loss_clip": 1.51367188, "router_z_loss_mlp": 0.11846924, "step": 10112, "time_per_iteration": 2.5557661056518555 }, { "auxiliary_loss_clip": 0.06319621, "auxiliary_loss_mlp": 0.0125255, "balance_loss_clip": 0.06257966, "balance_loss_mlp": 0.01250954, "epoch": 0.6080264542311739, "flos": 71403709680000.0, "grad_norm": 0.6176069426782015, "language_loss": 0.49452549, "learning_rate": 1.4063884027871105e-06, "loss": 0.57024717, "num_input_tokens_seen": 217811860, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.01597595, "step": 10113, "time_per_iteration": 3.1877739429473877 }, { "auxiliary_loss_clip": 0.06313576, "auxiliary_loss_mlp": 0.01257563, "balance_loss_clip": 0.06252471, "balance_loss_mlp": 0.0125599, "epoch": 0.6080865774838419, "flos": 66549786036480.0, "grad_norm": 0.8209219312937082, "language_loss": 0.56939048, "learning_rate": 1.4060165023394147e-06, "loss": 0.64510185, "num_input_tokens_seen": 217866510, "router_z_loss_clip": 0.61083984, "router_z_loss_mlp": 0.01572418, "step": 10114, "time_per_iteration": 3.0930356979370117 }, { "auxiliary_loss_clip": 0.06426775, "auxiliary_loss_mlp": 0.01268477, "balance_loss_clip": 0.06275986, "balance_loss_mlp": 0.01256616, "epoch": 0.6081467007365099, "flos": 19213895214720.0, "grad_norm": 3.1445849141360482, "language_loss": 0.70399427, "learning_rate": 1.4056446244156317e-06, "loss": 0.78094679, "num_input_tokens_seen": 217885650, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.11865234, "step": 10115, "time_per_iteration": 2.5506057739257812 }, { "auxiliary_loss_clip": 0.06423043, "auxiliary_loss_mlp": 0.0126329, "balance_loss_clip": 0.0627535, "balance_loss_mlp": 0.01252806, "epoch": 0.6082068239891778, "flos": 24174148089600.0, "grad_norm": 1.591626950462305, "language_loss": 0.72523248, "learning_rate": 1.4052727690298642e-06, "loss": 0.80209589, "num_input_tokens_seen": 217905300, "router_z_loss_clip": 1.47558594, "router_z_loss_mlp": 0.10491943, "step": 10116, "time_per_iteration": 2.593864917755127 }, { "auxiliary_loss_clip": 0.0642967, "auxiliary_loss_mlp": 0.01268644, "balance_loss_clip": 0.06278192, "balance_loss_mlp": 0.01257367, "epoch": 0.6082669472418458, "flos": 37422150053760.0, "grad_norm": 1.5349341396517564, "language_loss": 0.54118782, "learning_rate": 1.4049009361962138e-06, "loss": 0.61817098, "num_input_tokens_seen": 217927845, "router_z_loss_clip": 1.51464844, "router_z_loss_mlp": 0.11273193, "step": 10117, "time_per_iteration": 2.72414231300354 }, { "auxiliary_loss_clip": 0.06427747, "auxiliary_loss_mlp": 0.01264056, "balance_loss_clip": 0.06278279, "balance_loss_mlp": 0.01253667, "epoch": 0.6083270704945137, "flos": 15090886736640.0, "grad_norm": 1.9188612158869187, "language_loss": 0.7079156, "learning_rate": 1.4045291259287786e-06, "loss": 0.78483361, "num_input_tokens_seen": 217946145, "router_z_loss_clip": 1.49414062, "router_z_loss_mlp": 0.10388184, "step": 10118, "time_per_iteration": 2.600128412246704 }, { "auxiliary_loss_clip": 0.06423791, "auxiliary_loss_mlp": 0.01267583, "balance_loss_clip": 0.06275998, "balance_loss_mlp": 0.01257498, "epoch": 0.6083871937471818, "flos": 20674845077760.0, "grad_norm": 1.5118862968052384, "language_loss": 0.74789262, "learning_rate": 1.4041573382416588e-06, "loss": 0.82480633, "num_input_tokens_seen": 217965190, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.10089111, "step": 10119, "time_per_iteration": 2.5992066860198975 }, { "auxiliary_loss_clip": 0.06427879, "auxiliary_loss_mlp": 0.01264976, "balance_loss_clip": 0.06279656, "balance_loss_mlp": 0.0125467, "epoch": 0.6084473169998497, "flos": 21513305358720.0, "grad_norm": 2.0915583975249303, "language_loss": 0.67674637, "learning_rate": 1.4037855731489525e-06, "loss": 0.75367486, "num_input_tokens_seen": 217983625, "router_z_loss_clip": 1.48242188, "router_z_loss_mlp": 0.10302734, "step": 10120, "time_per_iteration": 2.595606803894043 }, { "auxiliary_loss_clip": 0.06433087, "auxiliary_loss_mlp": 0.01268222, "balance_loss_clip": 0.06281316, "balance_loss_mlp": 0.01256963, "epoch": 0.6085074402525177, "flos": 26877309932160.0, "grad_norm": 1.8172652425024662, "language_loss": 0.74680078, "learning_rate": 1.4034138306647571e-06, "loss": 0.82381392, "num_input_tokens_seen": 218006005, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.11254883, "step": 10121, "time_per_iteration": 2.6030001640319824 }, { "auxiliary_loss_clip": 0.0642143, "auxiliary_loss_mlp": 0.01263824, "balance_loss_clip": 0.06275409, "balance_loss_mlp": 0.01253917, "epoch": 0.6085675635051856, "flos": 10894518408960.0, "grad_norm": 1.7438994449039529, "language_loss": 0.81252813, "learning_rate": 1.4030421108031685e-06, "loss": 0.88938069, "num_input_tokens_seen": 218024195, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.09906006, "step": 10122, "time_per_iteration": 2.636725425720215 }, { "auxiliary_loss_clip": 0.06424258, "auxiliary_loss_mlp": 0.0126517, "balance_loss_clip": 0.06277451, "balance_loss_mlp": 0.01254793, "epoch": 0.6086276867578536, "flos": 34871074571520.0, "grad_norm": 1.503223726995044, "language_loss": 0.55674434, "learning_rate": 1.402670413578284e-06, "loss": 0.63363862, "num_input_tokens_seen": 218047190, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.10369873, "step": 10123, "time_per_iteration": 4.091774225234985 }, { "auxiliary_loss_clip": 0.06421757, "auxiliary_loss_mlp": 0.01263963, "balance_loss_clip": 0.06276918, "balance_loss_mlp": 0.01253419, "epoch": 0.6086878100105215, "flos": 20053906796160.0, "grad_norm": 1.750711595726257, "language_loss": 0.74570036, "learning_rate": 1.4022987390041965e-06, "loss": 0.82255757, "num_input_tokens_seen": 218065945, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10540771, "step": 10124, "time_per_iteration": 2.5509889125823975 }, { "auxiliary_loss_clip": 0.06426369, "auxiliary_loss_mlp": 0.01267015, "balance_loss_clip": 0.06276596, "balance_loss_mlp": 0.01256524, "epoch": 0.6087479332631895, "flos": 18338314775040.0, "grad_norm": 5.288848291281424, "language_loss": 0.65957141, "learning_rate": 1.4019270870950006e-06, "loss": 0.73650521, "num_input_tokens_seen": 218085285, "router_z_loss_clip": 1.49707031, "router_z_loss_mlp": 0.10491943, "step": 10125, "time_per_iteration": 2.5871737003326416 }, { "auxiliary_loss_clip": 0.06426945, "auxiliary_loss_mlp": 0.01264985, "balance_loss_clip": 0.06281114, "balance_loss_mlp": 0.01254733, "epoch": 0.6088080565158575, "flos": 24499424839680.0, "grad_norm": 1.6910604629755734, "language_loss": 0.76577294, "learning_rate": 1.40155545786479e-06, "loss": 0.84269226, "num_input_tokens_seen": 218104735, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10247803, "step": 10126, "time_per_iteration": 2.580040454864502 }, { "auxiliary_loss_clip": 0.06430808, "auxiliary_loss_mlp": 0.01263604, "balance_loss_clip": 0.06277896, "balance_loss_mlp": 0.01253448, "epoch": 0.6088681797685255, "flos": 10273496273280.0, "grad_norm": 3.8895141101622936, "language_loss": 0.72319949, "learning_rate": 1.4011838513276558e-06, "loss": 0.8001436, "num_input_tokens_seen": 218121855, "router_z_loss_clip": 1.52636719, "router_z_loss_mlp": 0.10144043, "step": 10127, "time_per_iteration": 4.016226530075073 }, { "auxiliary_loss_clip": 0.06437069, "auxiliary_loss_mlp": 0.01269424, "balance_loss_clip": 0.06284284, "balance_loss_mlp": 0.01257366, "epoch": 0.6089283030211935, "flos": 21978928897920.0, "grad_norm": 2.7751604403003234, "language_loss": 0.73090714, "learning_rate": 1.400812267497691e-06, "loss": 0.80797207, "num_input_tokens_seen": 218137325, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.1206665, "step": 10128, "time_per_iteration": 2.538790225982666 }, { "auxiliary_loss_clip": 0.06421657, "auxiliary_loss_mlp": 0.01267738, "balance_loss_clip": 0.062753, "balance_loss_mlp": 0.01258297, "epoch": 0.6089884262738614, "flos": 17790945978240.0, "grad_norm": 2.024086785385336, "language_loss": 0.73121428, "learning_rate": 1.4004407063889842e-06, "loss": 0.80810821, "num_input_tokens_seen": 218155530, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.09436035, "step": 10129, "time_per_iteration": 2.5251245498657227 }, { "auxiliary_loss_clip": 0.06429581, "auxiliary_loss_mlp": 0.01271865, "balance_loss_clip": 0.06281678, "balance_loss_mlp": 0.01260981, "epoch": 0.6090485495265294, "flos": 36920496458880.0, "grad_norm": 1.551265785833993, "language_loss": 0.65967858, "learning_rate": 1.400069168015626e-06, "loss": 0.73669302, "num_input_tokens_seen": 218182535, "router_z_loss_clip": 1.47949219, "router_z_loss_mlp": 0.10882568, "step": 10130, "time_per_iteration": 2.7682721614837646 }, { "auxiliary_loss_clip": 0.0641957, "auxiliary_loss_mlp": 0.01264433, "balance_loss_clip": 0.06275339, "balance_loss_mlp": 0.01254956, "epoch": 0.6091086727791973, "flos": 19904755328640.0, "grad_norm": 1.679656454150147, "language_loss": 0.76824689, "learning_rate": 1.3996976523917054e-06, "loss": 0.84508693, "num_input_tokens_seen": 218201740, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.09484863, "step": 10131, "time_per_iteration": 2.534625291824341 }, { "auxiliary_loss_clip": 0.06425564, "auxiliary_loss_mlp": 0.01264539, "balance_loss_clip": 0.06279092, "balance_loss_mlp": 0.01254865, "epoch": 0.6091687960318654, "flos": 22170147914880.0, "grad_norm": 1.6457425061361974, "language_loss": 0.77069551, "learning_rate": 1.3993261595313093e-06, "loss": 0.84759653, "num_input_tokens_seen": 218219800, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.09674072, "step": 10132, "time_per_iteration": 2.549959897994995 }, { "auxiliary_loss_clip": 0.06421015, "auxiliary_loss_mlp": 0.0126954, "balance_loss_clip": 0.06276818, "balance_loss_mlp": 0.01259735, "epoch": 0.6092289192845333, "flos": 21470818538880.0, "grad_norm": 1.8615386879466198, "language_loss": 0.75934309, "learning_rate": 1.3989546894485261e-06, "loss": 0.83624864, "num_input_tokens_seen": 218237585, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.0980835, "step": 10133, "time_per_iteration": 2.5522706508636475 }, { "auxiliary_loss_clip": 0.06424386, "auxiliary_loss_mlp": 0.01267912, "balance_loss_clip": 0.06277132, "balance_loss_mlp": 0.0125676, "epoch": 0.6092890425372013, "flos": 28702585347840.0, "grad_norm": 1.8968607282281043, "language_loss": 0.6366663, "learning_rate": 1.3985832421574414e-06, "loss": 0.71358931, "num_input_tokens_seen": 218258700, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.11157227, "step": 10134, "time_per_iteration": 2.6189517974853516 }, { "auxiliary_loss_clip": 0.06427915, "auxiliary_loss_mlp": 0.01268424, "balance_loss_clip": 0.06282221, "balance_loss_mlp": 0.01258321, "epoch": 0.6093491657898692, "flos": 20819384570880.0, "grad_norm": 1.9188273650426144, "language_loss": 0.7891022, "learning_rate": 1.3982118176721397e-06, "loss": 0.8660655, "num_input_tokens_seen": 218275655, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.10107422, "step": 10135, "time_per_iteration": 2.5754318237304688 }, { "auxiliary_loss_clip": 0.06426741, "auxiliary_loss_mlp": 0.01267182, "balance_loss_clip": 0.06279074, "balance_loss_mlp": 0.01257502, "epoch": 0.6094092890425372, "flos": 25453983352320.0, "grad_norm": 1.668297691069256, "language_loss": 0.72044939, "learning_rate": 1.3978404160067069e-06, "loss": 0.79738861, "num_input_tokens_seen": 218295720, "router_z_loss_clip": 1.47558594, "router_z_loss_mlp": 0.09680176, "step": 10136, "time_per_iteration": 2.5901453495025635 }, { "auxiliary_loss_clip": 0.06426746, "auxiliary_loss_mlp": 0.01266464, "balance_loss_clip": 0.0627863, "balance_loss_mlp": 0.01255836, "epoch": 0.6094694122952051, "flos": 35629089333120.0, "grad_norm": 1.9105518072743186, "language_loss": 0.7457155, "learning_rate": 1.3974690371752253e-06, "loss": 0.82264757, "num_input_tokens_seen": 218316745, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.10626221, "step": 10137, "time_per_iteration": 2.6607673168182373 }, { "auxiliary_loss_clip": 0.06428467, "auxiliary_loss_mlp": 0.01270346, "balance_loss_clip": 0.06278133, "balance_loss_mlp": 0.01259372, "epoch": 0.6095295355478731, "flos": 24462975513600.0, "grad_norm": 1.8127330916262254, "language_loss": 0.79914868, "learning_rate": 1.3970976811917785e-06, "loss": 0.87613684, "num_input_tokens_seen": 218335385, "router_z_loss_clip": 1.50292969, "router_z_loss_mlp": 0.10968018, "step": 10138, "time_per_iteration": 4.040156364440918 }, { "auxiliary_loss_clip": 0.06418411, "auxiliary_loss_mlp": 0.01264718, "balance_loss_clip": 0.06276873, "balance_loss_mlp": 0.01254568, "epoch": 0.6095896588005411, "flos": 15638716730880.0, "grad_norm": 1.5667056330755886, "language_loss": 0.81185794, "learning_rate": 1.3967263480704481e-06, "loss": 0.88868928, "num_input_tokens_seen": 218353320, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.10150146, "step": 10139, "time_per_iteration": 2.6530721187591553 }, { "auxiliary_loss_clip": 0.06432249, "auxiliary_loss_mlp": 0.01269067, "balance_loss_clip": 0.06280251, "balance_loss_mlp": 0.01257904, "epoch": 0.6096497820532091, "flos": 15554455850880.0, "grad_norm": 2.2562554477714216, "language_loss": 0.84035265, "learning_rate": 1.396355037825315e-06, "loss": 0.91736579, "num_input_tokens_seen": 218365620, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.1116333, "step": 10140, "time_per_iteration": 3.9528684616088867 }, { "auxiliary_loss_clip": 0.06431577, "auxiliary_loss_mlp": 0.0126837, "balance_loss_clip": 0.06282939, "balance_loss_mlp": 0.01257344, "epoch": 0.6097099053058771, "flos": 24210932832000.0, "grad_norm": 1.7435104074548586, "language_loss": 0.7601856, "learning_rate": 1.3959837504704592e-06, "loss": 0.83718503, "num_input_tokens_seen": 218383785, "router_z_loss_clip": 1.48730469, "router_z_loss_mlp": 0.11029053, "step": 10141, "time_per_iteration": 2.6032066345214844 }, { "auxiliary_loss_clip": 0.06424651, "auxiliary_loss_mlp": 0.01266557, "balance_loss_clip": 0.06278796, "balance_loss_mlp": 0.01256859, "epoch": 0.609770028558545, "flos": 19575830926080.0, "grad_norm": 1.91964575333842, "language_loss": 0.76742107, "learning_rate": 1.3956124860199603e-06, "loss": 0.84433317, "num_input_tokens_seen": 218399055, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.09698486, "step": 10142, "time_per_iteration": 2.5632593631744385 }, { "auxiliary_loss_clip": 0.06429869, "auxiliary_loss_mlp": 0.01270632, "balance_loss_clip": 0.06281156, "balance_loss_mlp": 0.01259731, "epoch": 0.609830151811213, "flos": 23955619841280.0, "grad_norm": 1.6668242192213982, "language_loss": 0.7713064, "learning_rate": 1.3952412444878964e-06, "loss": 0.84831142, "num_input_tokens_seen": 218419120, "router_z_loss_clip": 1.48632812, "router_z_loss_mlp": 0.10894775, "step": 10143, "time_per_iteration": 2.5949089527130127 }, { "auxiliary_loss_clip": 0.06427944, "auxiliary_loss_mlp": 0.01265378, "balance_loss_clip": 0.06280626, "balance_loss_mlp": 0.01254655, "epoch": 0.6098902750638809, "flos": 16185205059840.0, "grad_norm": 1.6758486038013063, "language_loss": 0.75576121, "learning_rate": 1.3948700258883448e-06, "loss": 0.83269441, "num_input_tokens_seen": 218435290, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.1072998, "step": 10144, "time_per_iteration": 2.5314204692840576 }, { "auxiliary_loss_clip": 0.06429845, "auxiliary_loss_mlp": 0.01268575, "balance_loss_clip": 0.06279509, "balance_loss_mlp": 0.01257315, "epoch": 0.609950398316549, "flos": 44536141549440.0, "grad_norm": 3.3994626891767274, "language_loss": 0.73109442, "learning_rate": 1.394498830235383e-06, "loss": 0.80807865, "num_input_tokens_seen": 218457880, "router_z_loss_clip": 1.50292969, "router_z_loss_mlp": 0.11260986, "step": 10145, "time_per_iteration": 2.777613878250122 }, { "auxiliary_loss_clip": 0.06427119, "auxiliary_loss_mlp": 0.01263311, "balance_loss_clip": 0.06280275, "balance_loss_mlp": 0.01252224, "epoch": 0.6100105215692169, "flos": 23228436182400.0, "grad_norm": 2.4379079919970095, "language_loss": 0.69339234, "learning_rate": 1.3941276575430862e-06, "loss": 0.77029663, "num_input_tokens_seen": 218475930, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.11077881, "step": 10146, "time_per_iteration": 2.636735439300537 }, { "auxiliary_loss_clip": 0.06426184, "auxiliary_loss_mlp": 0.01270391, "balance_loss_clip": 0.06282771, "balance_loss_mlp": 0.0126083, "epoch": 0.6100706448218849, "flos": 15017904230400.0, "grad_norm": 1.831164630500029, "language_loss": 0.77464664, "learning_rate": 1.3937565078255289e-06, "loss": 0.85161239, "num_input_tokens_seen": 218493675, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09552002, "step": 10147, "time_per_iteration": 2.5565338134765625 }, { "auxiliary_loss_clip": 0.06425396, "auxiliary_loss_mlp": 0.01263082, "balance_loss_clip": 0.0627896, "balance_loss_mlp": 0.01252771, "epoch": 0.6101307680745528, "flos": 19645039998720.0, "grad_norm": 1.7182816229619542, "language_loss": 0.78572398, "learning_rate": 1.393385381096786e-06, "loss": 0.86260879, "num_input_tokens_seen": 218511780, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.10296631, "step": 10148, "time_per_iteration": 2.5850789546966553 }, { "auxiliary_loss_clip": 0.0643521, "auxiliary_loss_mlp": 0.0126789, "balance_loss_clip": 0.06282206, "balance_loss_mlp": 0.01255808, "epoch": 0.6101908913272208, "flos": 29943455662080.0, "grad_norm": 2.1314214296053495, "language_loss": 0.54017502, "learning_rate": 1.39301427737093e-06, "loss": 0.61720598, "num_input_tokens_seen": 218531850, "router_z_loss_clip": 1.52929688, "router_z_loss_mlp": 0.12084961, "step": 10149, "time_per_iteration": 2.6662347316741943 }, { "auxiliary_loss_clip": 0.06420043, "auxiliary_loss_mlp": 0.01271795, "balance_loss_clip": 0.06280392, "balance_loss_mlp": 0.01261137, "epoch": 0.6102510145798887, "flos": 21805067675520.0, "grad_norm": 2.4333146958171707, "language_loss": 0.80462217, "learning_rate": 1.3926431966620333e-06, "loss": 0.88154054, "num_input_tokens_seen": 218551245, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.10644531, "step": 10150, "time_per_iteration": 2.570169448852539 }, { "auxiliary_loss_clip": 0.06429464, "auxiliary_loss_mlp": 0.01267242, "balance_loss_clip": 0.06280978, "balance_loss_mlp": 0.01255739, "epoch": 0.6103111378325567, "flos": 20712719923200.0, "grad_norm": 1.5122964283931932, "language_loss": 0.69432473, "learning_rate": 1.3922721389841684e-06, "loss": 0.77129173, "num_input_tokens_seen": 218571365, "router_z_loss_clip": 1.48339844, "router_z_loss_mlp": 0.1149292, "step": 10151, "time_per_iteration": 2.5565855503082275 }, { "auxiliary_loss_clip": 0.06425634, "auxiliary_loss_mlp": 0.01264762, "balance_loss_clip": 0.06280439, "balance_loss_mlp": 0.01254832, "epoch": 0.6103712610852247, "flos": 29388330362880.0, "grad_norm": 1.8026149632345045, "language_loss": 0.71177036, "learning_rate": 1.3919011043514036e-06, "loss": 0.78867435, "num_input_tokens_seen": 218588315, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.09936523, "step": 10152, "time_per_iteration": 2.623246908187866 }, { "auxiliary_loss_clip": 0.06433485, "auxiliary_loss_mlp": 0.0127024, "balance_loss_clip": 0.06284296, "balance_loss_mlp": 0.01259601, "epoch": 0.6104313843378927, "flos": 20819216862720.0, "grad_norm": 1.7277790549971024, "language_loss": 0.78557891, "learning_rate": 1.391530092777811e-06, "loss": 0.86261618, "num_input_tokens_seen": 218605940, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.10650635, "step": 10153, "time_per_iteration": 2.569378614425659 }, { "auxiliary_loss_clip": 0.06425554, "auxiliary_loss_mlp": 0.01270985, "balance_loss_clip": 0.06278184, "balance_loss_mlp": 0.01260643, "epoch": 0.6104915075905607, "flos": 26585715323520.0, "grad_norm": 1.678690374976884, "language_loss": 0.80023813, "learning_rate": 1.3911591042774573e-06, "loss": 0.87720346, "num_input_tokens_seen": 218626100, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.10339355, "step": 10154, "time_per_iteration": 2.5952112674713135 }, { "auxiliary_loss_clip": 0.06427793, "auxiliary_loss_mlp": 0.01265893, "balance_loss_clip": 0.06283067, "balance_loss_mlp": 0.01256118, "epoch": 0.6105516308432286, "flos": 23922734313600.0, "grad_norm": 1.6019074436691143, "language_loss": 0.70618594, "learning_rate": 1.3907881388644116e-06, "loss": 0.78312278, "num_input_tokens_seen": 218645060, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.09777832, "step": 10155, "time_per_iteration": 2.695284366607666 }, { "auxiliary_loss_clip": 0.06425577, "auxiliary_loss_mlp": 0.01266746, "balance_loss_clip": 0.06280276, "balance_loss_mlp": 0.01255481, "epoch": 0.6106117540958966, "flos": 31585520125440.0, "grad_norm": 1.5113163250112638, "language_loss": 0.71834701, "learning_rate": 1.3904171965527413e-06, "loss": 0.7952702, "num_input_tokens_seen": 218667690, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.11273193, "step": 10156, "time_per_iteration": 2.636605739593506 }, { "auxiliary_loss_clip": 0.06422188, "auxiliary_loss_mlp": 0.01270361, "balance_loss_clip": 0.06279762, "balance_loss_mlp": 0.01259781, "epoch": 0.6106718773485645, "flos": 19613999260800.0, "grad_norm": 1.5908517672210891, "language_loss": 0.67385966, "learning_rate": 1.3900462773565114e-06, "loss": 0.75078511, "num_input_tokens_seen": 218687505, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.10577393, "step": 10157, "time_per_iteration": 2.5407943725585938 }, { "auxiliary_loss_clip": 0.06424145, "auxiliary_loss_mlp": 0.01264242, "balance_loss_clip": 0.06275658, "balance_loss_mlp": 0.01253799, "epoch": 0.6107320006012326, "flos": 17128778688000.0, "grad_norm": 1.9267228908582892, "language_loss": 0.72523373, "learning_rate": 1.3896753812897877e-06, "loss": 0.80211765, "num_input_tokens_seen": 218705315, "router_z_loss_clip": 1.48339844, "router_z_loss_mlp": 0.10443115, "step": 10158, "time_per_iteration": 2.547529935836792 }, { "auxiliary_loss_clip": 0.06428292, "auxiliary_loss_mlp": 0.01268701, "balance_loss_clip": 0.06280094, "balance_loss_mlp": 0.01258986, "epoch": 0.6107921238539005, "flos": 30155107875840.0, "grad_norm": 1.7568069185509576, "language_loss": 0.69785917, "learning_rate": 1.389304508366635e-06, "loss": 0.77482915, "num_input_tokens_seen": 218725735, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.09729004, "step": 10159, "time_per_iteration": 2.6149702072143555 }, { "auxiliary_loss_clip": 0.06429473, "auxiliary_loss_mlp": 0.01269554, "balance_loss_clip": 0.06281, "balance_loss_mlp": 0.01258354, "epoch": 0.6108522471065685, "flos": 18445859890560.0, "grad_norm": 2.254421218417316, "language_loss": 0.79078645, "learning_rate": 1.3889336586011167e-06, "loss": 0.86777675, "num_input_tokens_seen": 218743215, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.11212158, "step": 10160, "time_per_iteration": 2.536233901977539 }, { "auxiliary_loss_clip": 0.06326699, "auxiliary_loss_mlp": 0.01255214, "balance_loss_clip": 0.06264493, "balance_loss_mlp": 0.01253411, "epoch": 0.6109123703592364, "flos": 64157295605760.0, "grad_norm": 0.8010009835885857, "language_loss": 0.61482465, "learning_rate": 1.388562832007295e-06, "loss": 0.69064379, "num_input_tokens_seen": 218806440, "router_z_loss_clip": 0.62255859, "router_z_loss_mlp": 0.01803589, "step": 10161, "time_per_iteration": 3.3042898178100586 }, { "auxiliary_loss_clip": 0.06431144, "auxiliary_loss_mlp": 0.01272532, "balance_loss_clip": 0.06281175, "balance_loss_mlp": 0.0126045, "epoch": 0.6109724936119044, "flos": 20674132318080.0, "grad_norm": 1.9683883861330884, "language_loss": 0.76029754, "learning_rate": 1.3881920285992324e-06, "loss": 0.83733433, "num_input_tokens_seen": 218825720, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.12084961, "step": 10162, "time_per_iteration": 2.5704355239868164 }, { "auxiliary_loss_clip": 0.06425139, "auxiliary_loss_mlp": 0.01267268, "balance_loss_clip": 0.06278903, "balance_loss_mlp": 0.01255645, "epoch": 0.6110326168645723, "flos": 31358899958400.0, "grad_norm": 1.9067484319832786, "language_loss": 0.7170682, "learning_rate": 1.3878212483909888e-06, "loss": 0.79399228, "num_input_tokens_seen": 218847735, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.11633301, "step": 10163, "time_per_iteration": 4.082499742507935 }, { "auxiliary_loss_clip": 0.06415726, "auxiliary_loss_mlp": 0.01267485, "balance_loss_clip": 0.06272498, "balance_loss_mlp": 0.01258378, "epoch": 0.6110927401172404, "flos": 25009338061440.0, "grad_norm": 1.810488685241328, "language_loss": 0.59796751, "learning_rate": 1.387450491396625e-06, "loss": 0.67479962, "num_input_tokens_seen": 218866585, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09112549, "step": 10164, "time_per_iteration": 2.592935562133789 }, { "auxiliary_loss_clip": 0.06421356, "auxiliary_loss_mlp": 0.01267079, "balance_loss_clip": 0.06276758, "balance_loss_mlp": 0.01256941, "epoch": 0.6111528633699083, "flos": 26254946131200.0, "grad_norm": 1.627422092501053, "language_loss": 0.75922686, "learning_rate": 1.3870797576302003e-06, "loss": 0.83611119, "num_input_tokens_seen": 218885560, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.10131836, "step": 10165, "time_per_iteration": 2.624297618865967 }, { "auxiliary_loss_clip": 0.06424034, "auxiliary_loss_mlp": 0.01268839, "balance_loss_clip": 0.06282225, "balance_loss_mlp": 0.01258223, "epoch": 0.6112129866225763, "flos": 22389011579520.0, "grad_norm": 1.5415482185189269, "language_loss": 0.79897094, "learning_rate": 1.3867090471057719e-06, "loss": 0.87589961, "num_input_tokens_seen": 218905055, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.10614014, "step": 10166, "time_per_iteration": 4.0676209926605225 }, { "auxiliary_loss_clip": 0.06425862, "auxiliary_loss_mlp": 0.0126867, "balance_loss_clip": 0.06277677, "balance_loss_mlp": 0.0125809, "epoch": 0.6112731098752443, "flos": 25234826198400.0, "grad_norm": 1.6919777637834519, "language_loss": 0.67945325, "learning_rate": 1.3863383598373987e-06, "loss": 0.75639856, "num_input_tokens_seen": 218924030, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.10577393, "step": 10167, "time_per_iteration": 2.6278655529022217 }, { "auxiliary_loss_clip": 0.06419063, "auxiliary_loss_mlp": 0.01268924, "balance_loss_clip": 0.06275308, "balance_loss_mlp": 0.01259959, "epoch": 0.6113332331279122, "flos": 22899763342080.0, "grad_norm": 2.005751858477993, "language_loss": 0.79304242, "learning_rate": 1.3859676958391364e-06, "loss": 0.86992228, "num_input_tokens_seen": 218943750, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.08959961, "step": 10168, "time_per_iteration": 2.612583637237549 }, { "auxiliary_loss_clip": 0.06436682, "auxiliary_loss_mlp": 0.01268715, "balance_loss_clip": 0.06281739, "balance_loss_mlp": 0.01256228, "epoch": 0.6113933563805802, "flos": 18625548971520.0, "grad_norm": 2.3296409054780023, "language_loss": 0.86451983, "learning_rate": 1.3855970551250398e-06, "loss": 0.94157374, "num_input_tokens_seen": 218957585, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.12487793, "step": 10169, "time_per_iteration": 2.53975510597229 }, { "auxiliary_loss_clip": 0.06420328, "auxiliary_loss_mlp": 0.0126329, "balance_loss_clip": 0.06275225, "balance_loss_mlp": 0.01253688, "epoch": 0.6114534796332481, "flos": 41876137359360.0, "grad_norm": 2.3022787164510943, "language_loss": 0.78826469, "learning_rate": 1.3852264377091652e-06, "loss": 0.86510086, "num_input_tokens_seen": 218980025, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.09594727, "step": 10170, "time_per_iteration": 2.7436742782592773 }, { "auxiliary_loss_clip": 0.06432036, "auxiliary_loss_mlp": 0.01267442, "balance_loss_clip": 0.06279212, "balance_loss_mlp": 0.01255289, "epoch": 0.6115136028859162, "flos": 21914960705280.0, "grad_norm": 3.782639668876928, "language_loss": 0.68801159, "learning_rate": 1.3848558436055651e-06, "loss": 0.76500636, "num_input_tokens_seen": 218998200, "router_z_loss_clip": 1.52929688, "router_z_loss_mlp": 0.12164307, "step": 10171, "time_per_iteration": 2.548164129257202 }, { "auxiliary_loss_clip": 0.06428488, "auxiliary_loss_mlp": 0.01266846, "balance_loss_clip": 0.06279123, "balance_loss_mlp": 0.01255027, "epoch": 0.6115737261385841, "flos": 28812604158720.0, "grad_norm": 1.6290083827447188, "language_loss": 0.79320085, "learning_rate": 1.3844852728282934e-06, "loss": 0.8701542, "num_input_tokens_seen": 219017910, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.11810303, "step": 10172, "time_per_iteration": 2.6338448524475098 }, { "auxiliary_loss_clip": 0.0643306, "auxiliary_loss_mlp": 0.01266528, "balance_loss_clip": 0.06279603, "balance_loss_mlp": 0.01255418, "epoch": 0.6116338493912521, "flos": 21257824659840.0, "grad_norm": 1.9380168013667411, "language_loss": 0.66965109, "learning_rate": 1.3841147253914022e-06, "loss": 0.74664694, "num_input_tokens_seen": 219037730, "router_z_loss_clip": 1.53320312, "router_z_loss_mlp": 0.11120605, "step": 10173, "time_per_iteration": 2.5595905780792236 }, { "auxiliary_loss_clip": 0.06428846, "auxiliary_loss_mlp": 0.01270727, "balance_loss_clip": 0.06279884, "balance_loss_mlp": 0.01259569, "epoch": 0.61169397264392, "flos": 17535968403840.0, "grad_norm": 1.6878582739089545, "language_loss": 0.56225777, "learning_rate": 1.3837442013089416e-06, "loss": 0.6392535, "num_input_tokens_seen": 219056755, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.1116333, "step": 10174, "time_per_iteration": 2.5570833683013916 }, { "auxiliary_loss_clip": 0.06426068, "auxiliary_loss_mlp": 0.01265865, "balance_loss_clip": 0.06277061, "balance_loss_mlp": 0.01254152, "epoch": 0.611754095896588, "flos": 23958387025920.0, "grad_norm": 2.3086294362238142, "language_loss": 0.66338706, "learning_rate": 1.3833737005949628e-06, "loss": 0.74030638, "num_input_tokens_seen": 219076985, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.1171875, "step": 10175, "time_per_iteration": 2.5807745456695557 }, { "auxiliary_loss_clip": 0.06423304, "auxiliary_loss_mlp": 0.01263167, "balance_loss_clip": 0.06276878, "balance_loss_mlp": 0.0125338, "epoch": 0.6118142191492559, "flos": 26002064908800.0, "grad_norm": 1.936252157056442, "language_loss": 0.83088052, "learning_rate": 1.3830032232635154e-06, "loss": 0.90774524, "num_input_tokens_seen": 219096050, "router_z_loss_clip": 1.46289062, "router_z_loss_mlp": 0.09790039, "step": 10176, "time_per_iteration": 2.5883655548095703 }, { "auxiliary_loss_clip": 0.06429659, "auxiliary_loss_mlp": 0.01270321, "balance_loss_clip": 0.06281494, "balance_loss_mlp": 0.01258465, "epoch": 0.611874342401924, "flos": 24609275942400.0, "grad_norm": 1.7789680277620594, "language_loss": 0.77514553, "learning_rate": 1.3826327693286474e-06, "loss": 0.85214531, "num_input_tokens_seen": 219112665, "router_z_loss_clip": 1.47949219, "router_z_loss_mlp": 0.11859131, "step": 10177, "time_per_iteration": 3.8747687339782715 }, { "auxiliary_loss_clip": 0.06429771, "auxiliary_loss_mlp": 0.01268605, "balance_loss_clip": 0.06280565, "balance_loss_mlp": 0.01257149, "epoch": 0.6119344656545919, "flos": 15892436494080.0, "grad_norm": 1.9246731393683776, "language_loss": 0.75565457, "learning_rate": 1.3822623388044065e-06, "loss": 0.83263838, "num_input_tokens_seen": 219129120, "router_z_loss_clip": 1.49023438, "router_z_loss_mlp": 0.11456299, "step": 10178, "time_per_iteration": 2.518739700317383 }, { "auxiliary_loss_clip": 0.06429684, "auxiliary_loss_mlp": 0.01269051, "balance_loss_clip": 0.06279323, "balance_loss_mlp": 0.01256886, "epoch": 0.6119945889072599, "flos": 21659312298240.0, "grad_norm": 1.6843122912360313, "language_loss": 0.6753521, "learning_rate": 1.3818919317048402e-06, "loss": 0.75233948, "num_input_tokens_seen": 219148950, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.12164307, "step": 10179, "time_per_iteration": 4.119441986083984 }, { "auxiliary_loss_clip": 0.06434248, "auxiliary_loss_mlp": 0.01266445, "balance_loss_clip": 0.06283864, "balance_loss_mlp": 0.01255507, "epoch": 0.6120547121599279, "flos": 13777746675840.0, "grad_norm": 1.790687413823427, "language_loss": 0.83723748, "learning_rate": 1.3815215480439933e-06, "loss": 0.91424441, "num_input_tokens_seen": 219165585, "router_z_loss_clip": 1.50195312, "router_z_loss_mlp": 0.109375, "step": 10180, "time_per_iteration": 2.5372021198272705 }, { "auxiliary_loss_clip": 0.06428992, "auxiliary_loss_mlp": 0.01267523, "balance_loss_clip": 0.06282064, "balance_loss_mlp": 0.0125562, "epoch": 0.6121148354125958, "flos": 20084528263680.0, "grad_norm": 1.531643188748337, "language_loss": 0.77953804, "learning_rate": 1.3811511878359113e-06, "loss": 0.85650319, "num_input_tokens_seen": 219183280, "router_z_loss_clip": 1.46972656, "router_z_loss_mlp": 0.11907959, "step": 10181, "time_per_iteration": 2.5689754486083984 }, { "auxiliary_loss_clip": 0.06433076, "auxiliary_loss_mlp": 0.01266912, "balance_loss_clip": 0.06280954, "balance_loss_mlp": 0.01255664, "epoch": 0.6121749586652638, "flos": 13474915620480.0, "grad_norm": 1.9718344648379078, "language_loss": 0.81084841, "learning_rate": 1.3807808510946384e-06, "loss": 0.88784826, "num_input_tokens_seen": 219197200, "router_z_loss_clip": 1.52148438, "router_z_loss_mlp": 0.11242676, "step": 10182, "time_per_iteration": 2.540851593017578 }, { "auxiliary_loss_clip": 0.06422263, "auxiliary_loss_mlp": 0.01268276, "balance_loss_clip": 0.06279267, "balance_loss_mlp": 0.01259371, "epoch": 0.6122350819179317, "flos": 20126721594240.0, "grad_norm": 1.6991592293088527, "language_loss": 0.82753015, "learning_rate": 1.3804105378342177e-06, "loss": 0.90443563, "num_input_tokens_seen": 219216825, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.08905029, "step": 10183, "time_per_iteration": 2.5376508235931396 }, { "auxiliary_loss_clip": 0.06327166, "auxiliary_loss_mlp": 0.01251223, "balance_loss_clip": 0.06266284, "balance_loss_mlp": 0.01249463, "epoch": 0.6122952051705998, "flos": 65448004700160.0, "grad_norm": 0.7043271142608777, "language_loss": 0.62832475, "learning_rate": 1.3800402480686914e-06, "loss": 0.7041086, "num_input_tokens_seen": 219283795, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.0176239, "step": 10184, "time_per_iteration": 3.2905287742614746 }, { "auxiliary_loss_clip": 0.06428713, "auxiliary_loss_mlp": 0.01264386, "balance_loss_clip": 0.06281728, "balance_loss_mlp": 0.01254248, "epoch": 0.6123553284232677, "flos": 20382537709440.0, "grad_norm": 1.8008646084933058, "language_loss": 0.82694745, "learning_rate": 1.379669981812101e-06, "loss": 0.90387845, "num_input_tokens_seen": 219302385, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.10137939, "step": 10185, "time_per_iteration": 2.7227001190185547 }, { "auxiliary_loss_clip": 0.06436114, "auxiliary_loss_mlp": 0.01264925, "balance_loss_clip": 0.06283066, "balance_loss_mlp": 0.0125369, "epoch": 0.6124154516759357, "flos": 23994417081600.0, "grad_norm": 1.7065720499319668, "language_loss": 0.74724764, "learning_rate": 1.3792997390784868e-06, "loss": 0.82425803, "num_input_tokens_seen": 219319765, "router_z_loss_clip": 1.53027344, "router_z_loss_mlp": 0.11230469, "step": 10186, "time_per_iteration": 2.864053726196289 }, { "auxiliary_loss_clip": 0.06427798, "auxiliary_loss_mlp": 0.01265114, "balance_loss_clip": 0.06280687, "balance_loss_mlp": 0.01254737, "epoch": 0.6124755749286036, "flos": 21474927388800.0, "grad_norm": 1.7673444702381713, "language_loss": 0.78877139, "learning_rate": 1.3789295198818895e-06, "loss": 0.86570048, "num_input_tokens_seen": 219337440, "router_z_loss_clip": 1.46972656, "router_z_loss_mlp": 0.10375977, "step": 10187, "time_per_iteration": 2.585972785949707 }, { "auxiliary_loss_clip": 0.06428073, "auxiliary_loss_mlp": 0.01265498, "balance_loss_clip": 0.06279833, "balance_loss_mlp": 0.01254382, "epoch": 0.6125356981812716, "flos": 23886117279360.0, "grad_norm": 2.060879541092238, "language_loss": 0.83243847, "learning_rate": 1.3785593242363462e-06, "loss": 0.90937424, "num_input_tokens_seen": 219357525, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.11120605, "step": 10188, "time_per_iteration": 2.6076693534851074 }, { "auxiliary_loss_clip": 0.06429036, "auxiliary_loss_mlp": 0.01264874, "balance_loss_clip": 0.0628065, "balance_loss_mlp": 0.01254079, "epoch": 0.6125958214339395, "flos": 14430312673920.0, "grad_norm": 1.641598522466655, "language_loss": 0.7568202, "learning_rate": 1.378189152155896e-06, "loss": 0.83375937, "num_input_tokens_seen": 219374855, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.10791016, "step": 10189, "time_per_iteration": 2.5503032207489014 }, { "auxiliary_loss_clip": 0.0643241, "auxiliary_loss_mlp": 0.0126567, "balance_loss_clip": 0.06282544, "balance_loss_mlp": 0.01254518, "epoch": 0.6126559446866076, "flos": 23265933684480.0, "grad_norm": 1.4922405413773328, "language_loss": 0.74179268, "learning_rate": 1.3778190036545758e-06, "loss": 0.81877345, "num_input_tokens_seen": 219394740, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.1116333, "step": 10190, "time_per_iteration": 2.5909693241119385 }, { "auxiliary_loss_clip": 0.06434465, "auxiliary_loss_mlp": 0.01265205, "balance_loss_clip": 0.06284407, "balance_loss_mlp": 0.01253523, "epoch": 0.6127160679392755, "flos": 26871188584320.0, "grad_norm": 1.7523116357600252, "language_loss": 0.68627483, "learning_rate": 1.3774488787464207e-06, "loss": 0.76327157, "num_input_tokens_seen": 219413755, "router_z_loss_clip": 1.49902344, "router_z_loss_mlp": 0.11688232, "step": 10191, "time_per_iteration": 2.594943046569824 }, { "auxiliary_loss_clip": 0.06434624, "auxiliary_loss_mlp": 0.01270572, "balance_loss_clip": 0.06283514, "balance_loss_mlp": 0.01258538, "epoch": 0.6127761911919435, "flos": 26403720255360.0, "grad_norm": 1.8803971256101801, "language_loss": 0.73908013, "learning_rate": 1.377078777445467e-06, "loss": 0.81613213, "num_input_tokens_seen": 219433560, "router_z_loss_clip": 1.50976562, "router_z_loss_mlp": 0.12036133, "step": 10192, "time_per_iteration": 2.621394157409668 }, { "auxiliary_loss_clip": 0.06427886, "auxiliary_loss_mlp": 0.01263919, "balance_loss_clip": 0.06281058, "balance_loss_mlp": 0.01252868, "epoch": 0.6128363144446115, "flos": 22640802698880.0, "grad_norm": 2.0139702216673956, "language_loss": 0.84335393, "learning_rate": 1.3767086997657478e-06, "loss": 0.92027199, "num_input_tokens_seen": 219452640, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.11047363, "step": 10193, "time_per_iteration": 2.5766444206237793 }, { "auxiliary_loss_clip": 0.06430126, "auxiliary_loss_mlp": 0.01267372, "balance_loss_clip": 0.06281007, "balance_loss_mlp": 0.01255976, "epoch": 0.6128964376972794, "flos": 26766033310080.0, "grad_norm": 2.1467877028794375, "language_loss": 0.7073499, "learning_rate": 1.3763386457212979e-06, "loss": 0.78432488, "num_input_tokens_seen": 219468585, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.1138916, "step": 10194, "time_per_iteration": 2.6023523807525635 }, { "auxiliary_loss_clip": 0.06331509, "auxiliary_loss_mlp": 0.0125323, "balance_loss_clip": 0.06270662, "balance_loss_mlp": 0.0125169, "epoch": 0.6129565609499474, "flos": 65585500450560.0, "grad_norm": 0.8092358829745879, "language_loss": 0.58496034, "learning_rate": 1.375968615326149e-06, "loss": 0.66080773, "num_input_tokens_seen": 219523015, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.01538849, "step": 10195, "time_per_iteration": 3.008861541748047 }, { "auxiliary_loss_clip": 0.06432572, "auxiliary_loss_mlp": 0.01269251, "balance_loss_clip": 0.0628375, "balance_loss_mlp": 0.01256699, "epoch": 0.6130166842026153, "flos": 16367577471360.0, "grad_norm": 2.0887189895107054, "language_loss": 0.6999616, "learning_rate": 1.3755986085943324e-06, "loss": 0.7769798, "num_input_tokens_seen": 219539980, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.12548828, "step": 10196, "time_per_iteration": 2.5662119388580322 }, { "auxiliary_loss_clip": 0.06427702, "auxiliary_loss_mlp": 0.01266283, "balance_loss_clip": 0.06280333, "balance_loss_mlp": 0.01255727, "epoch": 0.6130768074552834, "flos": 23658029665920.0, "grad_norm": 1.7663797746399938, "language_loss": 0.7133525, "learning_rate": 1.3752286255398788e-06, "loss": 0.79029232, "num_input_tokens_seen": 219556980, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.10559082, "step": 10197, "time_per_iteration": 2.574796676635742 }, { "auxiliary_loss_clip": 0.06428618, "auxiliary_loss_mlp": 0.01271494, "balance_loss_clip": 0.06277887, "balance_loss_mlp": 0.01260772, "epoch": 0.6131369307079513, "flos": 20053613306880.0, "grad_norm": 1.7988784890064222, "language_loss": 0.79257077, "learning_rate": 1.3748586661768191e-06, "loss": 0.86957192, "num_input_tokens_seen": 219576410, "router_z_loss_clip": 1.50585938, "router_z_loss_mlp": 0.10736084, "step": 10198, "time_per_iteration": 2.5446243286132812 }, { "auxiliary_loss_clip": 0.06436342, "auxiliary_loss_mlp": 0.01266815, "balance_loss_clip": 0.06282865, "balance_loss_mlp": 0.01255306, "epoch": 0.6131970539606193, "flos": 22678384055040.0, "grad_norm": 1.4789719512936423, "language_loss": 0.74567211, "learning_rate": 1.374488730519181e-06, "loss": 0.8227036, "num_input_tokens_seen": 219597180, "router_z_loss_clip": 1.53320312, "router_z_loss_mlp": 0.1151123, "step": 10199, "time_per_iteration": 2.572322368621826 }, { "auxiliary_loss_clip": 0.06433104, "auxiliary_loss_mlp": 0.01268176, "balance_loss_clip": 0.06279843, "balance_loss_mlp": 0.01256559, "epoch": 0.6132571772132872, "flos": 26878316181120.0, "grad_norm": 11.394950643838765, "language_loss": 0.61517775, "learning_rate": 1.374118818580993e-06, "loss": 0.69219053, "num_input_tokens_seen": 219617630, "router_z_loss_clip": 1.53027344, "router_z_loss_mlp": 0.11627197, "step": 10200, "time_per_iteration": 2.585385799407959 }, { "auxiliary_loss_clip": 0.06429726, "auxiliary_loss_mlp": 0.01268606, "balance_loss_clip": 0.0628058, "balance_loss_mlp": 0.01258163, "epoch": 0.6133173004659552, "flos": 22899176363520.0, "grad_norm": 1.8176315826848553, "language_loss": 0.68743157, "learning_rate": 1.3737489303762822e-06, "loss": 0.76441491, "num_input_tokens_seen": 219637025, "router_z_loss_clip": 1.49121094, "router_z_loss_mlp": 0.10443115, "step": 10201, "time_per_iteration": 2.555638074874878 }, { "auxiliary_loss_clip": 0.06424366, "auxiliary_loss_mlp": 0.01267716, "balance_loss_clip": 0.06276823, "balance_loss_mlp": 0.01256111, "epoch": 0.6133774237186231, "flos": 20491298709120.0, "grad_norm": 2.011440576923226, "language_loss": 0.83956009, "learning_rate": 1.3733790659190746e-06, "loss": 0.9164809, "num_input_tokens_seen": 219656625, "router_z_loss_clip": 1.47558594, "router_z_loss_mlp": 0.11602783, "step": 10202, "time_per_iteration": 3.9926936626434326 }, { "auxiliary_loss_clip": 0.06316158, "auxiliary_loss_mlp": 0.01253206, "balance_loss_clip": 0.06255636, "balance_loss_mlp": 0.0125155, "epoch": 0.6134375469712912, "flos": 69433643208960.0, "grad_norm": 0.8637664271003433, "language_loss": 0.66990066, "learning_rate": 1.3730092252233953e-06, "loss": 0.74559426, "num_input_tokens_seen": 219718090, "router_z_loss_clip": 0.60595703, "router_z_loss_mlp": 0.0165863, "step": 10203, "time_per_iteration": 3.1947808265686035 }, { "auxiliary_loss_clip": 0.06433468, "auxiliary_loss_mlp": 0.01265745, "balance_loss_clip": 0.06282666, "balance_loss_mlp": 0.01255207, "epoch": 0.6134976702239591, "flos": 41291145279360.0, "grad_norm": 1.4131929210396579, "language_loss": 0.60858929, "learning_rate": 1.37263940830327e-06, "loss": 0.68558145, "num_input_tokens_seen": 219740100, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.10540771, "step": 10204, "time_per_iteration": 2.73852801322937 }, { "auxiliary_loss_clip": 0.06425742, "auxiliary_loss_mlp": 0.01266262, "balance_loss_clip": 0.06278409, "balance_loss_mlp": 0.01255527, "epoch": 0.6135577934766271, "flos": 22353233086080.0, "grad_norm": 2.1053166469539715, "language_loss": 0.73296547, "learning_rate": 1.3722696151727204e-06, "loss": 0.80988556, "num_input_tokens_seen": 219761225, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.10736084, "step": 10205, "time_per_iteration": 2.5640721321105957 }, { "auxiliary_loss_clip": 0.06424726, "auxiliary_loss_mlp": 0.01267129, "balance_loss_clip": 0.06277625, "balance_loss_mlp": 0.01254761, "epoch": 0.6136179167292951, "flos": 23734198627200.0, "grad_norm": 1.7781247615956532, "language_loss": 0.76407766, "learning_rate": 1.3718998458457701e-06, "loss": 0.84099627, "num_input_tokens_seen": 219780085, "router_z_loss_clip": 1.46972656, "router_z_loss_mlp": 0.12371826, "step": 10206, "time_per_iteration": 3.998572587966919 }, { "auxiliary_loss_clip": 0.06427163, "auxiliary_loss_mlp": 0.0126668, "balance_loss_clip": 0.06277405, "balance_loss_mlp": 0.012547, "epoch": 0.613678039981963, "flos": 26030757732480.0, "grad_norm": 1.908816061735312, "language_loss": 0.75662059, "learning_rate": 1.3715301003364407e-06, "loss": 0.83355904, "num_input_tokens_seen": 219797895, "router_z_loss_clip": 1.49707031, "router_z_loss_mlp": 0.11975098, "step": 10207, "time_per_iteration": 2.6188859939575195 }, { "auxiliary_loss_clip": 0.06427407, "auxiliary_loss_mlp": 0.01264869, "balance_loss_clip": 0.06279369, "balance_loss_mlp": 0.01253955, "epoch": 0.613738163234631, "flos": 9863078175360.0, "grad_norm": 2.184532021844666, "language_loss": 0.82447135, "learning_rate": 1.3711603786587525e-06, "loss": 0.90139413, "num_input_tokens_seen": 219811295, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.10919189, "step": 10208, "time_per_iteration": 2.5197393894195557 }, { "auxiliary_loss_clip": 0.06435846, "auxiliary_loss_mlp": 0.01268384, "balance_loss_clip": 0.06282216, "balance_loss_mlp": 0.0125604, "epoch": 0.613798286487299, "flos": 33190380576000.0, "grad_norm": 2.4839155488961913, "language_loss": 0.72712564, "learning_rate": 1.3707906808267265e-06, "loss": 0.80416793, "num_input_tokens_seen": 219832735, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.12341309, "step": 10209, "time_per_iteration": 2.68172025680542 }, { "auxiliary_loss_clip": 0.06427167, "auxiliary_loss_mlp": 0.01268186, "balance_loss_clip": 0.06278138, "balance_loss_mlp": 0.01257183, "epoch": 0.613858409739967, "flos": 25634678682240.0, "grad_norm": 1.5993412635644404, "language_loss": 0.74506938, "learning_rate": 1.37042100685438e-06, "loss": 0.82202291, "num_input_tokens_seen": 219852755, "router_z_loss_clip": 1.48925781, "router_z_loss_mlp": 0.11016846, "step": 10210, "time_per_iteration": 2.59389066696167 }, { "auxiliary_loss_clip": 0.06322223, "auxiliary_loss_mlp": 0.01254019, "balance_loss_clip": 0.06262025, "balance_loss_mlp": 0.01252428, "epoch": 0.6139185329926349, "flos": 67213336919040.0, "grad_norm": 0.9562330957087056, "language_loss": 0.65092605, "learning_rate": 1.3700513567557325e-06, "loss": 0.7266885, "num_input_tokens_seen": 219922785, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.01593018, "step": 10211, "time_per_iteration": 3.345089912414551 }, { "auxiliary_loss_clip": 0.06426579, "auxiliary_loss_mlp": 0.01270052, "balance_loss_clip": 0.06278519, "balance_loss_mlp": 0.01257958, "epoch": 0.6139786562453029, "flos": 21550090101120.0, "grad_norm": 1.5768265835824855, "language_loss": 0.7612887, "learning_rate": 1.369681730544801e-06, "loss": 0.83825505, "num_input_tokens_seen": 219942215, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.12091064, "step": 10212, "time_per_iteration": 2.5879759788513184 }, { "auxiliary_loss_clip": 0.06430039, "auxiliary_loss_mlp": 0.0127351, "balance_loss_clip": 0.06281683, "balance_loss_mlp": 0.01262037, "epoch": 0.6140387794979708, "flos": 26075802101760.0, "grad_norm": 1.4178888761136526, "language_loss": 0.74055064, "learning_rate": 1.3693121282356009e-06, "loss": 0.81758618, "num_input_tokens_seen": 219963830, "router_z_loss_clip": 1.48242188, "router_z_loss_mlp": 0.11468506, "step": 10213, "time_per_iteration": 2.6034164428710938 }, { "auxiliary_loss_clip": 0.0643712, "auxiliary_loss_mlp": 0.01268166, "balance_loss_clip": 0.06282005, "balance_loss_mlp": 0.01256102, "epoch": 0.6140989027506388, "flos": 23701145391360.0, "grad_norm": 1.6217952387426, "language_loss": 0.73185599, "learning_rate": 1.3689425498421483e-06, "loss": 0.80890882, "num_input_tokens_seen": 219983815, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.12060547, "step": 10214, "time_per_iteration": 2.5877723693847656 }, { "auxiliary_loss_clip": 0.06429742, "auxiliary_loss_mlp": 0.01266428, "balance_loss_clip": 0.06277683, "balance_loss_mlp": 0.01254835, "epoch": 0.6141590260033067, "flos": 22237428343680.0, "grad_norm": 1.6455461367932982, "language_loss": 0.74712193, "learning_rate": 1.3685729953784572e-06, "loss": 0.82408369, "num_input_tokens_seen": 220003165, "router_z_loss_clip": 1.52148438, "router_z_loss_mlp": 0.11590576, "step": 10215, "time_per_iteration": 2.5823984146118164 }, { "auxiliary_loss_clip": 0.06424867, "auxiliary_loss_mlp": 0.01272763, "balance_loss_clip": 0.06276114, "balance_loss_mlp": 0.01261039, "epoch": 0.6142191492559748, "flos": 23877312600960.0, "grad_norm": 1.8731701589241188, "language_loss": 0.78559303, "learning_rate": 1.368203464858542e-06, "loss": 0.86256933, "num_input_tokens_seen": 220021015, "router_z_loss_clip": 1.48632812, "router_z_loss_mlp": 0.11724854, "step": 10216, "time_per_iteration": 4.002488613128662 }, { "auxiliary_loss_clip": 0.06428587, "auxiliary_loss_mlp": 0.01266836, "balance_loss_clip": 0.06279923, "balance_loss_mlp": 0.01255398, "epoch": 0.6142792725086427, "flos": 15046764762240.0, "grad_norm": 2.213286595374803, "language_loss": 0.80049908, "learning_rate": 1.3678339582964147e-06, "loss": 0.87745333, "num_input_tokens_seen": 220035780, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.11437988, "step": 10217, "time_per_iteration": 2.4868669509887695 }, { "auxiliary_loss_clip": 0.0642956, "auxiliary_loss_mlp": 0.01268137, "balance_loss_clip": 0.06279315, "balance_loss_mlp": 0.0125652, "epoch": 0.6143393957613107, "flos": 23337616452480.0, "grad_norm": 2.5240664686036127, "language_loss": 0.78387243, "learning_rate": 1.3674644757060865e-06, "loss": 0.86084938, "num_input_tokens_seen": 220054280, "router_z_loss_clip": 1.50195312, "router_z_loss_mlp": 0.1161499, "step": 10218, "time_per_iteration": 2.567108631134033 }, { "auxiliary_loss_clip": 0.06425567, "auxiliary_loss_mlp": 0.01269657, "balance_loss_clip": 0.06278919, "balance_loss_mlp": 0.01258296, "epoch": 0.6143995190139786, "flos": 20122696598400.0, "grad_norm": 1.8228042173278014, "language_loss": 0.82136619, "learning_rate": 1.367095017101569e-06, "loss": 0.89831841, "num_input_tokens_seen": 220074120, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.11352539, "step": 10219, "time_per_iteration": 3.996039867401123 }, { "auxiliary_loss_clip": 0.06426693, "auxiliary_loss_mlp": 0.01269565, "balance_loss_clip": 0.06276546, "balance_loss_mlp": 0.01257817, "epoch": 0.6144596422666466, "flos": 42313403491200.0, "grad_norm": 2.1583739844862873, "language_loss": 0.66887611, "learning_rate": 1.3667255824968717e-06, "loss": 0.74583864, "num_input_tokens_seen": 220096320, "router_z_loss_clip": 1.50097656, "router_z_loss_mlp": 0.11749268, "step": 10220, "time_per_iteration": 2.7187066078186035 }, { "auxiliary_loss_clip": 0.06427552, "auxiliary_loss_mlp": 0.01268592, "balance_loss_clip": 0.06279899, "balance_loss_mlp": 0.01257911, "epoch": 0.6145197655193146, "flos": 21578992560000.0, "grad_norm": 2.1011331876253947, "language_loss": 0.71596563, "learning_rate": 1.3663561719060041e-06, "loss": 0.79292709, "num_input_tokens_seen": 220114850, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.10681152, "step": 10221, "time_per_iteration": 2.5689501762390137 }, { "auxiliary_loss_clip": 0.06422487, "auxiliary_loss_mlp": 0.01269321, "balance_loss_clip": 0.06275931, "balance_loss_mlp": 0.01258628, "epoch": 0.6145798887719826, "flos": 21477610719360.0, "grad_norm": 1.5937392328241047, "language_loss": 0.80111206, "learning_rate": 1.3659867853429735e-06, "loss": 0.87803018, "num_input_tokens_seen": 220133395, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.10687256, "step": 10222, "time_per_iteration": 2.5716052055358887 }, { "auxiliary_loss_clip": 0.0643101, "auxiliary_loss_mlp": 0.0126785, "balance_loss_clip": 0.06278042, "balance_loss_mlp": 0.01256853, "epoch": 0.6146400120246506, "flos": 20783270661120.0, "grad_norm": 2.0585457888265717, "language_loss": 0.76936316, "learning_rate": 1.365617422821788e-06, "loss": 0.84635174, "num_input_tokens_seen": 220152790, "router_z_loss_clip": 1.52929688, "router_z_loss_mlp": 0.10998535, "step": 10223, "time_per_iteration": 2.553321599960327 }, { "auxiliary_loss_clip": 0.06424437, "auxiliary_loss_mlp": 0.01267898, "balance_loss_clip": 0.06279977, "balance_loss_mlp": 0.01255857, "epoch": 0.6147001352773185, "flos": 13886423821440.0, "grad_norm": 1.962212239823082, "language_loss": 0.78882986, "learning_rate": 1.3652480843564535e-06, "loss": 0.86575317, "num_input_tokens_seen": 220169535, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.12042236, "step": 10224, "time_per_iteration": 2.5174965858459473 }, { "auxiliary_loss_clip": 0.06421345, "auxiliary_loss_mlp": 0.01266488, "balance_loss_clip": 0.06277087, "balance_loss_mlp": 0.01256111, "epoch": 0.6147602585299865, "flos": 56653920915840.0, "grad_norm": 1.1708558549453096, "language_loss": 0.66512442, "learning_rate": 1.3648787699609746e-06, "loss": 0.74200273, "num_input_tokens_seen": 220195305, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.1038208, "step": 10225, "time_per_iteration": 2.8813533782958984 }, { "auxiliary_loss_clip": 0.06431668, "auxiliary_loss_mlp": 0.01269459, "balance_loss_clip": 0.06281003, "balance_loss_mlp": 0.01257836, "epoch": 0.6148203817826544, "flos": 32825468044800.0, "grad_norm": 2.0270841838252527, "language_loss": 0.6310488, "learning_rate": 1.364509479649357e-06, "loss": 0.70806003, "num_input_tokens_seen": 220215040, "router_z_loss_clip": 1.50585938, "router_z_loss_mlp": 0.11627197, "step": 10226, "time_per_iteration": 2.650597095489502 }, { "auxiliary_loss_clip": 0.06426849, "auxiliary_loss_mlp": 0.01267033, "balance_loss_clip": 0.06277308, "balance_loss_mlp": 0.01254891, "epoch": 0.6148805050353224, "flos": 18337811650560.0, "grad_norm": 2.3385428054269486, "language_loss": 0.76055586, "learning_rate": 1.3641402134356037e-06, "loss": 0.83749461, "num_input_tokens_seen": 220234205, "router_z_loss_clip": 1.49414062, "router_z_loss_mlp": 0.12133789, "step": 10227, "time_per_iteration": 2.5761055946350098 }, { "auxiliary_loss_clip": 0.06431353, "auxiliary_loss_mlp": 0.0126843, "balance_loss_clip": 0.06278254, "balance_loss_mlp": 0.01254721, "epoch": 0.6149406282879903, "flos": 14069173576320.0, "grad_norm": 3.121069152201094, "language_loss": 0.6231277, "learning_rate": 1.3637709713337164e-06, "loss": 0.70012558, "num_input_tokens_seen": 220252730, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.13708496, "step": 10228, "time_per_iteration": 2.547377109527588 }, { "auxiliary_loss_clip": 0.06426693, "auxiliary_loss_mlp": 0.01266911, "balance_loss_clip": 0.06279812, "balance_loss_mlp": 0.01255688, "epoch": 0.6150007515406584, "flos": 25196909425920.0, "grad_norm": 1.492935985657435, "language_loss": 0.74546897, "learning_rate": 1.3634017533576985e-06, "loss": 0.82240498, "num_input_tokens_seen": 220273345, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.11230469, "step": 10229, "time_per_iteration": 2.5873324871063232 }, { "auxiliary_loss_clip": 0.06427035, "auxiliary_loss_mlp": 0.01269458, "balance_loss_clip": 0.06278986, "balance_loss_mlp": 0.01257871, "epoch": 0.6150608747933263, "flos": 21951829301760.0, "grad_norm": 2.062069229396945, "language_loss": 0.78289819, "learning_rate": 1.3630325595215493e-06, "loss": 0.85986316, "num_input_tokens_seen": 220293845, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.11590576, "step": 10230, "time_per_iteration": 2.564470052719116 }, { "auxiliary_loss_clip": 0.06432371, "auxiliary_loss_mlp": 0.01268347, "balance_loss_clip": 0.06280635, "balance_loss_mlp": 0.01257153, "epoch": 0.6151209980459943, "flos": 30125283022080.0, "grad_norm": 1.835353007119492, "language_loss": 0.73324132, "learning_rate": 1.36266338983927e-06, "loss": 0.81024849, "num_input_tokens_seen": 220316070, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.11187744, "step": 10231, "time_per_iteration": 2.6191227436065674 }, { "auxiliary_loss_clip": 0.06427483, "auxiliary_loss_mlp": 0.01268073, "balance_loss_clip": 0.06277797, "balance_loss_mlp": 0.01257207, "epoch": 0.6151811212986622, "flos": 30016228533120.0, "grad_norm": 1.60886652714547, "language_loss": 0.70500702, "learning_rate": 1.362294244324858e-06, "loss": 0.78196251, "num_input_tokens_seen": 220335695, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.10858154, "step": 10232, "time_per_iteration": 2.6602742671966553 }, { "auxiliary_loss_clip": 0.06420503, "auxiliary_loss_mlp": 0.01267777, "balance_loss_clip": 0.06277851, "balance_loss_mlp": 0.01257227, "epoch": 0.6152412445513302, "flos": 18877675507200.0, "grad_norm": 1.7967509679265217, "language_loss": 0.91641301, "learning_rate": 1.3619251229923126e-06, "loss": 0.99329579, "num_input_tokens_seen": 220353720, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10546875, "step": 10233, "time_per_iteration": 2.783226490020752 }, { "auxiliary_loss_clip": 0.06425969, "auxiliary_loss_mlp": 0.01268126, "balance_loss_clip": 0.06280132, "balance_loss_mlp": 0.01257564, "epoch": 0.6153013678039982, "flos": 25710847643520.0, "grad_norm": 1.859756787981481, "language_loss": 0.71783781, "learning_rate": 1.3615560258556306e-06, "loss": 0.79477876, "num_input_tokens_seen": 220372515, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.10559082, "step": 10234, "time_per_iteration": 2.667630195617676 }, { "auxiliary_loss_clip": 0.0642902, "auxiliary_loss_mlp": 0.01267214, "balance_loss_clip": 0.06278066, "balance_loss_mlp": 0.01255574, "epoch": 0.6153614910566662, "flos": 28517529605760.0, "grad_norm": 1.9487570937992629, "language_loss": 0.67409813, "learning_rate": 1.3611869529288077e-06, "loss": 0.75106049, "num_input_tokens_seen": 220393490, "router_z_loss_clip": 1.50878906, "router_z_loss_mlp": 0.11639404, "step": 10235, "time_per_iteration": 2.639864683151245 }, { "auxiliary_loss_clip": 0.06436025, "auxiliary_loss_mlp": 0.01269829, "balance_loss_clip": 0.0628306, "balance_loss_mlp": 0.01257914, "epoch": 0.6154216143093342, "flos": 23556480117120.0, "grad_norm": 1.6392468817964965, "language_loss": 0.81346905, "learning_rate": 1.3608179042258398e-06, "loss": 0.89052761, "num_input_tokens_seen": 220412855, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.1192627, "step": 10236, "time_per_iteration": 2.5717477798461914 }, { "auxiliary_loss_clip": 0.06433378, "auxiliary_loss_mlp": 0.01266215, "balance_loss_clip": 0.06281802, "balance_loss_mlp": 0.01254902, "epoch": 0.6154817375620021, "flos": 22754804578560.0, "grad_norm": 1.3740185696204719, "language_loss": 0.80594689, "learning_rate": 1.360448879760721e-06, "loss": 0.8829428, "num_input_tokens_seen": 220433440, "router_z_loss_clip": 1.51464844, "router_z_loss_mlp": 0.11322021, "step": 10237, "time_per_iteration": 2.5870206356048584 }, { "auxiliary_loss_clip": 0.06428559, "auxiliary_loss_mlp": 0.01272864, "balance_loss_clip": 0.062813, "balance_loss_mlp": 0.01261319, "epoch": 0.6155418608146701, "flos": 27170455841280.0, "grad_norm": 1.88444347865769, "language_loss": 0.76550138, "learning_rate": 1.3600798795474449e-06, "loss": 0.84251559, "num_input_tokens_seen": 220453445, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.11547852, "step": 10238, "time_per_iteration": 2.596179246902466 }, { "auxiliary_loss_clip": 0.0633666, "auxiliary_loss_mlp": 0.01255468, "balance_loss_clip": 0.06275076, "balance_loss_mlp": 0.01253784, "epoch": 0.615601984067338, "flos": 68828610003840.0, "grad_norm": 0.7482166261119827, "language_loss": 0.57520282, "learning_rate": 1.3597109036000036e-06, "loss": 0.65112406, "num_input_tokens_seen": 220509730, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 0.01687622, "step": 10239, "time_per_iteration": 3.191229820251465 }, { "auxiliary_loss_clip": 0.06435587, "auxiliary_loss_mlp": 0.0126538, "balance_loss_clip": 0.06284508, "balance_loss_mlp": 0.01254126, "epoch": 0.615662107320006, "flos": 15521528396160.0, "grad_norm": 2.045676447328278, "language_loss": 0.78108633, "learning_rate": 1.3593419519323892e-06, "loss": 0.858096, "num_input_tokens_seen": 220527295, "router_z_loss_clip": 1.51074219, "router_z_loss_mlp": 0.11254883, "step": 10240, "time_per_iteration": 2.537846565246582 }, { "auxiliary_loss_clip": 0.06438424, "auxiliary_loss_mlp": 0.01269843, "balance_loss_clip": 0.06287274, "balance_loss_mlp": 0.01258107, "epoch": 0.615722230572674, "flos": 21069121265280.0, "grad_norm": 3.3905813334341293, "language_loss": 0.72824055, "learning_rate": 1.3589730245585922e-06, "loss": 0.80532318, "num_input_tokens_seen": 220542730, "router_z_loss_clip": 1.51074219, "router_z_loss_mlp": 0.11737061, "step": 10241, "time_per_iteration": 4.005579948425293 }, { "auxiliary_loss_clip": 0.06431817, "auxiliary_loss_mlp": 0.0126862, "balance_loss_clip": 0.06286592, "balance_loss_mlp": 0.01258243, "epoch": 0.615782353825342, "flos": 23263250353920.0, "grad_norm": 1.532486786968347, "language_loss": 0.72320056, "learning_rate": 1.3586041214926018e-06, "loss": 0.80020499, "num_input_tokens_seen": 220562995, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.10388184, "step": 10242, "time_per_iteration": 2.5674374103546143 }, { "auxiliary_loss_clip": 0.06433026, "auxiliary_loss_mlp": 0.01267015, "balance_loss_clip": 0.06285159, "balance_loss_mlp": 0.01256179, "epoch": 0.6158424770780099, "flos": 21109972930560.0, "grad_norm": 1.8425830461289727, "language_loss": 0.7270292, "learning_rate": 1.3582352427484086e-06, "loss": 0.80402958, "num_input_tokens_seen": 220581775, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.1083374, "step": 10243, "time_per_iteration": 2.5556318759918213 }, { "auxiliary_loss_clip": 0.06327778, "auxiliary_loss_mlp": 0.01253071, "balance_loss_clip": 0.06266509, "balance_loss_mlp": 0.0125129, "epoch": 0.6159026003306779, "flos": 70355358120960.0, "grad_norm": 0.7510748128735739, "language_loss": 0.56794077, "learning_rate": 1.3578663883399984e-06, "loss": 0.6437493, "num_input_tokens_seen": 220646395, "router_z_loss_clip": 0.61328125, "router_z_loss_mlp": 0.01779175, "step": 10244, "time_per_iteration": 3.21341872215271 }, { "auxiliary_loss_clip": 0.06426784, "auxiliary_loss_mlp": 0.01266114, "balance_loss_clip": 0.06279761, "balance_loss_mlp": 0.01255171, "epoch": 0.6159627235833458, "flos": 33882624282240.0, "grad_norm": 1.5492743926312558, "language_loss": 0.64023978, "learning_rate": 1.3574975582813593e-06, "loss": 0.71716881, "num_input_tokens_seen": 220668335, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.109375, "step": 10245, "time_per_iteration": 2.663026809692383 }, { "auxiliary_loss_clip": 0.06427023, "auxiliary_loss_mlp": 0.01266801, "balance_loss_clip": 0.0627927, "balance_loss_mlp": 0.01256644, "epoch": 0.6160228468360138, "flos": 26582193452160.0, "grad_norm": 1.839341442930465, "language_loss": 0.78897113, "learning_rate": 1.3571287525864771e-06, "loss": 0.8659094, "num_input_tokens_seen": 220688915, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.10162354, "step": 10246, "time_per_iteration": 3.99763822555542 }, { "auxiliary_loss_clip": 0.06435713, "auxiliary_loss_mlp": 0.01269906, "balance_loss_clip": 0.06281711, "balance_loss_mlp": 0.01257395, "epoch": 0.6160829700886818, "flos": 17197568490240.0, "grad_norm": 2.756898578002792, "language_loss": 0.87866628, "learning_rate": 1.3567599712693368e-06, "loss": 0.95572245, "num_input_tokens_seen": 220703465, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.12512207, "step": 10247, "time_per_iteration": 2.5018844604492188 }, { "auxiliary_loss_clip": 0.06432792, "auxiliary_loss_mlp": 0.01267381, "balance_loss_clip": 0.06282657, "balance_loss_mlp": 0.01256348, "epoch": 0.6161430933413498, "flos": 23630385018240.0, "grad_norm": 1.8546376604211572, "language_loss": 0.80721337, "learning_rate": 1.3563912143439235e-06, "loss": 0.88421506, "num_input_tokens_seen": 220722090, "router_z_loss_clip": 1.49804688, "router_z_loss_mlp": 0.1104126, "step": 10248, "time_per_iteration": 2.5777604579925537 }, { "auxiliary_loss_clip": 0.0642344, "auxiliary_loss_mlp": 0.01268764, "balance_loss_clip": 0.06277236, "balance_loss_mlp": 0.01258131, "epoch": 0.6162032165940178, "flos": 23009027466240.0, "grad_norm": 1.7832399921906095, "language_loss": 0.86988282, "learning_rate": 1.3560224818242191e-06, "loss": 0.94680482, "num_input_tokens_seen": 220741075, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.10638428, "step": 10249, "time_per_iteration": 2.5445075035095215 }, { "auxiliary_loss_clip": 0.0643018, "auxiliary_loss_mlp": 0.01267494, "balance_loss_clip": 0.06281208, "balance_loss_mlp": 0.01256676, "epoch": 0.6162633398466857, "flos": 39431474962560.0, "grad_norm": 2.1504667990944872, "language_loss": 0.69193369, "learning_rate": 1.3556537737242072e-06, "loss": 0.76891047, "num_input_tokens_seen": 220763395, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.10821533, "step": 10250, "time_per_iteration": 2.700409173965454 }, { "auxiliary_loss_clip": 0.06420776, "auxiliary_loss_mlp": 0.01264145, "balance_loss_clip": 0.06278862, "balance_loss_mlp": 0.0125474, "epoch": 0.6163234630993537, "flos": 19250679957120.0, "grad_norm": 1.670866218987976, "language_loss": 0.74289697, "learning_rate": 1.3552850900578692e-06, "loss": 0.81974614, "num_input_tokens_seen": 220780640, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09405518, "step": 10251, "time_per_iteration": 2.5225465297698975 }, { "auxiliary_loss_clip": 0.06427695, "auxiliary_loss_mlp": 0.01266858, "balance_loss_clip": 0.06278242, "balance_loss_mlp": 0.01256397, "epoch": 0.6163835863520216, "flos": 15967389571200.0, "grad_norm": 2.144942677262894, "language_loss": 0.68930078, "learning_rate": 1.3549164308391844e-06, "loss": 0.76624632, "num_input_tokens_seen": 220797960, "router_z_loss_clip": 1.49511719, "router_z_loss_mlp": 0.10461426, "step": 10252, "time_per_iteration": 2.527880907058716 }, { "auxiliary_loss_clip": 0.06319806, "auxiliary_loss_mlp": 0.01251779, "balance_loss_clip": 0.0625865, "balance_loss_mlp": 0.01249991, "epoch": 0.6164437096046896, "flos": 68124905487360.0, "grad_norm": 0.8833106071993059, "language_loss": 0.57609081, "learning_rate": 1.3545477960821333e-06, "loss": 0.65180659, "num_input_tokens_seen": 220856930, "router_z_loss_clip": 0.61279297, "router_z_loss_mlp": 0.01786804, "step": 10253, "time_per_iteration": 3.2117345333099365 }, { "auxiliary_loss_clip": 0.06430743, "auxiliary_loss_mlp": 0.01268624, "balance_loss_clip": 0.0627972, "balance_loss_mlp": 0.01257305, "epoch": 0.6165038328573575, "flos": 21367633835520.0, "grad_norm": 1.3870179651047883, "language_loss": 0.79751813, "learning_rate": 1.3541791858006946e-06, "loss": 0.87451178, "num_input_tokens_seen": 220877595, "router_z_loss_clip": 1.50878906, "router_z_loss_mlp": 0.11309814, "step": 10254, "time_per_iteration": 2.5875244140625 }, { "auxiliary_loss_clip": 0.06431474, "auxiliary_loss_mlp": 0.01267756, "balance_loss_clip": 0.06279695, "balance_loss_mlp": 0.0125611, "epoch": 0.6165639561100256, "flos": 21107708870400.0, "grad_norm": 1.6514775881930015, "language_loss": 0.80909169, "learning_rate": 1.353810600008846e-06, "loss": 0.88608396, "num_input_tokens_seen": 220896880, "router_z_loss_clip": 1.51660156, "router_z_loss_mlp": 0.11633301, "step": 10255, "time_per_iteration": 2.5921919345855713 }, { "auxiliary_loss_clip": 0.06430507, "auxiliary_loss_mlp": 0.0126998, "balance_loss_clip": 0.06280641, "balance_loss_mlp": 0.01258178, "epoch": 0.6166240793626935, "flos": 25345683550080.0, "grad_norm": 2.1128162378921447, "language_loss": 0.65999687, "learning_rate": 1.3534420387205646e-06, "loss": 0.73700178, "num_input_tokens_seen": 220916425, "router_z_loss_clip": 1.49707031, "router_z_loss_mlp": 0.11791992, "step": 10256, "time_per_iteration": 4.031207323074341 }, { "auxiliary_loss_clip": 0.06427522, "auxiliary_loss_mlp": 0.01267023, "balance_loss_clip": 0.06281821, "balance_loss_mlp": 0.01256628, "epoch": 0.6166842026153615, "flos": 19688742702720.0, "grad_norm": 1.5811902851985298, "language_loss": 0.72208309, "learning_rate": 1.353073501949825e-06, "loss": 0.79902858, "num_input_tokens_seen": 220935050, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.10400391, "step": 10257, "time_per_iteration": 2.575047731399536 }, { "auxiliary_loss_clip": 0.06426969, "auxiliary_loss_mlp": 0.01267675, "balance_loss_clip": 0.06277118, "balance_loss_mlp": 0.01257054, "epoch": 0.6167443258680294, "flos": 19324501004160.0, "grad_norm": 1.699916734689713, "language_loss": 0.72318292, "learning_rate": 1.3527049897106034e-06, "loss": 0.80012935, "num_input_tokens_seen": 220953085, "router_z_loss_clip": 1.49902344, "router_z_loss_mlp": 0.10638428, "step": 10258, "time_per_iteration": 3.963083028793335 }, { "auxiliary_loss_clip": 0.06430874, "auxiliary_loss_mlp": 0.01264071, "balance_loss_clip": 0.06281159, "balance_loss_mlp": 0.01253158, "epoch": 0.6168044491206974, "flos": 25272323700480.0, "grad_norm": 3.4649301252119806, "language_loss": 0.64418519, "learning_rate": 1.3523365020168735e-06, "loss": 0.72113466, "num_input_tokens_seen": 220969050, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.10913086, "step": 10259, "time_per_iteration": 2.5871810913085938 }, { "auxiliary_loss_clip": 0.06420369, "auxiliary_loss_mlp": 0.01268473, "balance_loss_clip": 0.06275666, "balance_loss_mlp": 0.01256909, "epoch": 0.6168645723733654, "flos": 13224130750080.0, "grad_norm": 1.7903348444392373, "language_loss": 0.71649432, "learning_rate": 1.3519680388826084e-06, "loss": 0.79338276, "num_input_tokens_seen": 220985825, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.11572266, "step": 10260, "time_per_iteration": 2.562183380126953 }, { "auxiliary_loss_clip": 0.06435559, "auxiliary_loss_mlp": 0.01266935, "balance_loss_clip": 0.06281401, "balance_loss_mlp": 0.01255062, "epoch": 0.6169246956260334, "flos": 26659410589440.0, "grad_norm": 1.6240529257902092, "language_loss": 0.68057317, "learning_rate": 1.3515996003217803e-06, "loss": 0.7575981, "num_input_tokens_seen": 221004465, "router_z_loss_clip": 1.54003906, "router_z_loss_mlp": 0.11871338, "step": 10261, "time_per_iteration": 2.61279559135437 }, { "auxiliary_loss_clip": 0.06424076, "auxiliary_loss_mlp": 0.01264074, "balance_loss_clip": 0.06276929, "balance_loss_mlp": 0.01254221, "epoch": 0.6169848188787014, "flos": 23155034405760.0, "grad_norm": 1.789572881380018, "language_loss": 0.71614105, "learning_rate": 1.3512311863483602e-06, "loss": 0.79302257, "num_input_tokens_seen": 221023260, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.09857178, "step": 10262, "time_per_iteration": 2.56559157371521 }, { "auxiliary_loss_clip": 0.06426141, "auxiliary_loss_mlp": 0.01268473, "balance_loss_clip": 0.06278247, "balance_loss_mlp": 0.01257453, "epoch": 0.6170449421313693, "flos": 23338748482560.0, "grad_norm": 2.0842374084204502, "language_loss": 0.69875503, "learning_rate": 1.3508627969763188e-06, "loss": 0.77570122, "num_input_tokens_seen": 221043090, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.11010742, "step": 10263, "time_per_iteration": 2.5620715618133545 }, { "auxiliary_loss_clip": 0.06429615, "auxiliary_loss_mlp": 0.01268936, "balance_loss_clip": 0.06278072, "balance_loss_mlp": 0.01257963, "epoch": 0.6171050653840373, "flos": 15857077271040.0, "grad_norm": 3.3735580793465116, "language_loss": 0.7645399, "learning_rate": 1.3504944322196244e-06, "loss": 0.84152544, "num_input_tokens_seen": 221061435, "router_z_loss_clip": 1.51660156, "router_z_loss_mlp": 0.10980225, "step": 10264, "time_per_iteration": 2.5158705711364746 }, { "auxiliary_loss_clip": 0.06424494, "auxiliary_loss_mlp": 0.01267434, "balance_loss_clip": 0.06276889, "balance_loss_mlp": 0.01256437, "epoch": 0.6171651886367052, "flos": 20051349246720.0, "grad_norm": 2.117058278313592, "language_loss": 0.85025156, "learning_rate": 1.350126092092247e-06, "loss": 0.92717087, "num_input_tokens_seen": 221078705, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.11004639, "step": 10265, "time_per_iteration": 2.535569667816162 }, { "auxiliary_loss_clip": 0.06418226, "auxiliary_loss_mlp": 0.01268036, "balance_loss_clip": 0.06273025, "balance_loss_mlp": 0.01257355, "epoch": 0.6172253118893732, "flos": 26439959946240.0, "grad_norm": 1.7101672945293704, "language_loss": 0.65079135, "learning_rate": 1.349757776608153e-06, "loss": 0.72765398, "num_input_tokens_seen": 221099245, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10681152, "step": 10266, "time_per_iteration": 2.589138984680176 }, { "auxiliary_loss_clip": 0.06421642, "auxiliary_loss_mlp": 0.01266924, "balance_loss_clip": 0.0627317, "balance_loss_mlp": 0.01256713, "epoch": 0.6172854351420412, "flos": 22638622492800.0, "grad_norm": 1.5650038160643764, "language_loss": 0.75839216, "learning_rate": 1.3493894857813094e-06, "loss": 0.8352778, "num_input_tokens_seen": 221116930, "router_z_loss_clip": 1.48339844, "router_z_loss_mlp": 0.10211182, "step": 10267, "time_per_iteration": 2.5628509521484375 }, { "auxiliary_loss_clip": 0.06430236, "auxiliary_loss_mlp": 0.01266191, "balance_loss_clip": 0.06278292, "balance_loss_mlp": 0.01255403, "epoch": 0.6173455583947092, "flos": 21218943565440.0, "grad_norm": 1.7213644634908132, "language_loss": 0.75061178, "learning_rate": 1.3490212196256818e-06, "loss": 0.82757604, "num_input_tokens_seen": 221137660, "router_z_loss_clip": 1.51953125, "router_z_loss_mlp": 0.10791016, "step": 10268, "time_per_iteration": 2.575627088546753 }, { "auxiliary_loss_clip": 0.06428079, "auxiliary_loss_mlp": 0.01265658, "balance_loss_clip": 0.06275114, "balance_loss_mlp": 0.01255031, "epoch": 0.6174056816473771, "flos": 19506370291200.0, "grad_norm": 2.3097258435901837, "language_loss": 0.75708127, "learning_rate": 1.3486529781552342e-06, "loss": 0.83401871, "num_input_tokens_seen": 221156225, "router_z_loss_clip": 1.52929688, "router_z_loss_mlp": 0.10620117, "step": 10269, "time_per_iteration": 2.550065755844116 }, { "auxiliary_loss_clip": 0.06420849, "auxiliary_loss_mlp": 0.01268586, "balance_loss_clip": 0.06273213, "balance_loss_mlp": 0.01257624, "epoch": 0.6174658049000451, "flos": 16002790721280.0, "grad_norm": 2.307117227533713, "language_loss": 0.76769173, "learning_rate": 1.3482847613839318e-06, "loss": 0.84458601, "num_input_tokens_seen": 221173820, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.10961914, "step": 10270, "time_per_iteration": 2.509183645248413 }, { "auxiliary_loss_clip": 0.06425265, "auxiliary_loss_mlp": 0.01269428, "balance_loss_clip": 0.06276439, "balance_loss_mlp": 0.01259146, "epoch": 0.617525928152713, "flos": 21909635971200.0, "grad_norm": 1.811596657941962, "language_loss": 0.82567143, "learning_rate": 1.347916569325736e-06, "loss": 0.90261835, "num_input_tokens_seen": 221191815, "router_z_loss_clip": 1.48632812, "router_z_loss_mlp": 0.10284424, "step": 10271, "time_per_iteration": 2.558469295501709 }, { "auxiliary_loss_clip": 0.06426813, "auxiliary_loss_mlp": 0.01264481, "balance_loss_clip": 0.0627627, "balance_loss_mlp": 0.01254366, "epoch": 0.617586051405381, "flos": 21112362771840.0, "grad_norm": 1.5055215751835405, "language_loss": 0.77717304, "learning_rate": 1.3475484019946093e-06, "loss": 0.85408598, "num_input_tokens_seen": 221211205, "router_z_loss_clip": 1.50488281, "router_z_loss_mlp": 0.10113525, "step": 10272, "time_per_iteration": 2.5939321517944336 }, { "auxiliary_loss_clip": 0.06324896, "auxiliary_loss_mlp": 0.01255888, "balance_loss_clip": 0.06264016, "balance_loss_mlp": 0.01254412, "epoch": 0.617646174658049, "flos": 58629129684480.0, "grad_norm": 0.8255442831228543, "language_loss": 0.59019917, "learning_rate": 1.347180259404513e-06, "loss": 0.66600704, "num_input_tokens_seen": 221268430, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.01475525, "step": 10273, "time_per_iteration": 3.0606186389923096 }, { "auxiliary_loss_clip": 0.06418316, "auxiliary_loss_mlp": 0.01267349, "balance_loss_clip": 0.0627267, "balance_loss_mlp": 0.01256334, "epoch": 0.617706297910717, "flos": 13883363147520.0, "grad_norm": 4.4018339867837515, "language_loss": 0.73605055, "learning_rate": 1.3468121415694059e-06, "loss": 0.81290722, "num_input_tokens_seen": 221281930, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.11010742, "step": 10274, "time_per_iteration": 2.5282509326934814 }, { "auxiliary_loss_clip": 0.06426182, "auxiliary_loss_mlp": 0.01267492, "balance_loss_clip": 0.06277779, "balance_loss_mlp": 0.01257014, "epoch": 0.617766421163385, "flos": 19214482193280.0, "grad_norm": 1.9566074618525626, "language_loss": 0.77614206, "learning_rate": 1.3464440485032484e-06, "loss": 0.85307878, "num_input_tokens_seen": 221301605, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.10473633, "step": 10275, "time_per_iteration": 2.549691677093506 }, { "auxiliary_loss_clip": 0.06423342, "auxiliary_loss_mlp": 0.01270142, "balance_loss_clip": 0.06277415, "balance_loss_mlp": 0.01259216, "epoch": 0.6178265444160529, "flos": 22572725656320.0, "grad_norm": 1.9088669946267454, "language_loss": 0.79294997, "learning_rate": 1.346075980219998e-06, "loss": 0.86988485, "num_input_tokens_seen": 221320105, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10931396, "step": 10276, "time_per_iteration": 2.567469358444214 }, { "auxiliary_loss_clip": 0.06426095, "auxiliary_loss_mlp": 0.01267838, "balance_loss_clip": 0.06276391, "balance_loss_mlp": 0.01256358, "epoch": 0.6178866676687209, "flos": 11989130221440.0, "grad_norm": 1.9019538006129892, "language_loss": 0.81415069, "learning_rate": 1.345707936733612e-06, "loss": 0.8910901, "num_input_tokens_seen": 221335915, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.1149292, "step": 10277, "time_per_iteration": 2.5438520908355713 }, { "auxiliary_loss_clip": 0.06427712, "auxiliary_loss_mlp": 0.012659, "balance_loss_clip": 0.0627455, "balance_loss_mlp": 0.01254837, "epoch": 0.6179467909213888, "flos": 20997061153920.0, "grad_norm": 1.5864925255676217, "language_loss": 0.81881785, "learning_rate": 1.3453399180580466e-06, "loss": 0.89575398, "num_input_tokens_seen": 221353965, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.1105957, "step": 10278, "time_per_iteration": 2.5526394844055176 }, { "auxiliary_loss_clip": 0.06420673, "auxiliary_loss_mlp": 0.01266871, "balance_loss_clip": 0.06274091, "balance_loss_mlp": 0.01256327, "epoch": 0.6180069141740568, "flos": 25345180425600.0, "grad_norm": 1.7428207910174587, "language_loss": 0.74332398, "learning_rate": 1.3449719242072567e-06, "loss": 0.82019937, "num_input_tokens_seen": 221374080, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.10552979, "step": 10279, "time_per_iteration": 2.5919432640075684 }, { "auxiliary_loss_clip": 0.06418566, "auxiliary_loss_mlp": 0.01268743, "balance_loss_clip": 0.06271525, "balance_loss_mlp": 0.01258365, "epoch": 0.6180670374267248, "flos": 19651748325120.0, "grad_norm": 1.4175851911043393, "language_loss": 0.70880389, "learning_rate": 1.3446039551951975e-06, "loss": 0.78567696, "num_input_tokens_seen": 221392910, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.1038208, "step": 10280, "time_per_iteration": 2.651336669921875 }, { "auxiliary_loss_clip": 0.06427182, "auxiliary_loss_mlp": 0.01266671, "balance_loss_clip": 0.06277089, "balance_loss_mlp": 0.01255602, "epoch": 0.6181271606793928, "flos": 19471136849280.0, "grad_norm": 1.466561334919248, "language_loss": 0.73401392, "learning_rate": 1.3442360110358215e-06, "loss": 0.81095248, "num_input_tokens_seen": 221410990, "router_z_loss_clip": 1.50097656, "router_z_loss_mlp": 0.11071777, "step": 10281, "time_per_iteration": 4.093267917633057 }, { "auxiliary_loss_clip": 0.0642067, "auxiliary_loss_mlp": 0.0127119, "balance_loss_clip": 0.06277318, "balance_loss_mlp": 0.01261325, "epoch": 0.6181872839320607, "flos": 25601541592320.0, "grad_norm": 1.4826806373429975, "language_loss": 0.77078497, "learning_rate": 1.3438680917430827e-06, "loss": 0.84770358, "num_input_tokens_seen": 221431020, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.09863281, "step": 10282, "time_per_iteration": 2.5961861610412598 }, { "auxiliary_loss_clip": 0.06430779, "auxiliary_loss_mlp": 0.01272513, "balance_loss_clip": 0.06279245, "balance_loss_mlp": 0.01258804, "epoch": 0.6182474071847287, "flos": 25558048523520.0, "grad_norm": 1.8687287005368869, "language_loss": 0.69150686, "learning_rate": 1.343500197330931e-06, "loss": 0.76853979, "num_input_tokens_seen": 221453235, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.137146, "step": 10283, "time_per_iteration": 2.6050469875335693 }, { "auxiliary_loss_clip": 0.06437553, "auxiliary_loss_mlp": 0.01270688, "balance_loss_clip": 0.06279397, "balance_loss_mlp": 0.01258648, "epoch": 0.6183075304373966, "flos": 22129673592960.0, "grad_norm": 1.6372862158681047, "language_loss": 0.74968767, "learning_rate": 1.3431323278133176e-06, "loss": 0.82677013, "num_input_tokens_seen": 221472560, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.12054443, "step": 10284, "time_per_iteration": 2.567756414413452 }, { "auxiliary_loss_clip": 0.06419174, "auxiliary_loss_mlp": 0.01271302, "balance_loss_clip": 0.06278226, "balance_loss_mlp": 0.01261169, "epoch": 0.6183676536900646, "flos": 22462161793920.0, "grad_norm": 1.4861985406762381, "language_loss": 0.76029897, "learning_rate": 1.3427644832041922e-06, "loss": 0.83720368, "num_input_tokens_seen": 221492835, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.10137939, "step": 10285, "time_per_iteration": 4.013452768325806 }, { "auxiliary_loss_clip": 0.06424071, "auxiliary_loss_mlp": 0.01273599, "balance_loss_clip": 0.06275378, "balance_loss_mlp": 0.01262143, "epoch": 0.6184277769427327, "flos": 23370250417920.0, "grad_norm": 1.7336848436599905, "language_loss": 0.73188782, "learning_rate": 1.342396663517503e-06, "loss": 0.80886447, "num_input_tokens_seen": 221511870, "router_z_loss_clip": 1.48632812, "router_z_loss_mlp": 0.11456299, "step": 10286, "time_per_iteration": 2.5623838901519775 }, { "auxiliary_loss_clip": 0.06425069, "auxiliary_loss_mlp": 0.01274019, "balance_loss_clip": 0.06278227, "balance_loss_mlp": 0.01263314, "epoch": 0.6184879001954006, "flos": 22717684419840.0, "grad_norm": 1.5738556535196737, "language_loss": 0.76465595, "learning_rate": 1.342028868767199e-06, "loss": 0.84164679, "num_input_tokens_seen": 221529915, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.10705566, "step": 10287, "time_per_iteration": 2.571239709854126 }, { "auxiliary_loss_clip": 0.06423908, "auxiliary_loss_mlp": 0.01273805, "balance_loss_clip": 0.06276117, "balance_loss_mlp": 0.01262635, "epoch": 0.6185480234480686, "flos": 23848703631360.0, "grad_norm": 1.7092981806247711, "language_loss": 0.73229289, "learning_rate": 1.3416610989672262e-06, "loss": 0.80927002, "num_input_tokens_seen": 221549745, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.1116333, "step": 10288, "time_per_iteration": 2.570962905883789 }, { "auxiliary_loss_clip": 0.06417634, "auxiliary_loss_mlp": 0.0126756, "balance_loss_clip": 0.06273945, "balance_loss_mlp": 0.01256683, "epoch": 0.6186081467007365, "flos": 45487932877440.0, "grad_norm": 1.4870226930526467, "language_loss": 0.72984242, "learning_rate": 1.3412933541315296e-06, "loss": 0.80669439, "num_input_tokens_seen": 221572455, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10882568, "step": 10289, "time_per_iteration": 2.7728545665740967 }, { "auxiliary_loss_clip": 0.064246, "auxiliary_loss_mlp": 0.01277289, "balance_loss_clip": 0.06273733, "balance_loss_mlp": 0.01265583, "epoch": 0.6186682699534045, "flos": 23557737928320.0, "grad_norm": 1.5421643891908543, "language_loss": 0.79469854, "learning_rate": 1.340925634274056e-06, "loss": 0.87171745, "num_input_tokens_seen": 221591325, "router_z_loss_clip": 1.50878906, "router_z_loss_mlp": 0.11706543, "step": 10290, "time_per_iteration": 2.5695242881774902 }, { "auxiliary_loss_clip": 0.06431041, "auxiliary_loss_mlp": 0.01275041, "balance_loss_clip": 0.06279043, "balance_loss_mlp": 0.01263651, "epoch": 0.6187283932060724, "flos": 25781062965120.0, "grad_norm": 1.6828801550313262, "language_loss": 0.81940949, "learning_rate": 1.3405579394087475e-06, "loss": 0.89647031, "num_input_tokens_seen": 221611640, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.11395264, "step": 10291, "time_per_iteration": 2.5849416255950928 }, { "auxiliary_loss_clip": 0.06428105, "auxiliary_loss_mlp": 0.01270278, "balance_loss_clip": 0.06279817, "balance_loss_mlp": 0.01259949, "epoch": 0.6187885164587404, "flos": 25272281773440.0, "grad_norm": 1.6308052649407838, "language_loss": 0.77812523, "learning_rate": 1.3401902695495487e-06, "loss": 0.85510898, "num_input_tokens_seen": 221631225, "router_z_loss_clip": 1.48242188, "router_z_loss_mlp": 0.10327148, "step": 10292, "time_per_iteration": 2.5790112018585205 }, { "auxiliary_loss_clip": 0.06435982, "auxiliary_loss_mlp": 0.01274814, "balance_loss_clip": 0.06279878, "balance_loss_mlp": 0.01262232, "epoch": 0.6188486397114084, "flos": 26258090659200.0, "grad_norm": 1.8073673320583001, "language_loss": 0.73726088, "learning_rate": 1.339822624710401e-06, "loss": 0.81436884, "num_input_tokens_seen": 221651035, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.12585449, "step": 10293, "time_per_iteration": 2.582491397857666 }, { "auxiliary_loss_clip": 0.06429804, "auxiliary_loss_mlp": 0.01279424, "balance_loss_clip": 0.0628061, "balance_loss_mlp": 0.01268326, "epoch": 0.6189087629640764, "flos": 20929738798080.0, "grad_norm": 1.5121098696796185, "language_loss": 0.83512557, "learning_rate": 1.3394550049052454e-06, "loss": 0.91221786, "num_input_tokens_seen": 221671300, "router_z_loss_clip": 1.49121094, "router_z_loss_mlp": 0.11102295, "step": 10294, "time_per_iteration": 2.56189227104187 }, { "auxiliary_loss_clip": 0.06426332, "auxiliary_loss_mlp": 0.01273001, "balance_loss_clip": 0.06277496, "balance_loss_mlp": 0.01261765, "epoch": 0.6189688862167443, "flos": 14835070621440.0, "grad_norm": 2.0988330343147656, "language_loss": 0.70199955, "learning_rate": 1.3390874101480225e-06, "loss": 0.77899289, "num_input_tokens_seen": 221687320, "router_z_loss_clip": 1.48730469, "router_z_loss_mlp": 0.11248779, "step": 10295, "time_per_iteration": 4.046795845031738 }, { "auxiliary_loss_clip": 0.0642667, "auxiliary_loss_mlp": 0.01278563, "balance_loss_clip": 0.06279963, "balance_loss_mlp": 0.01267065, "epoch": 0.6190290094694123, "flos": 24292803870720.0, "grad_norm": 1.4045692788094462, "language_loss": 0.70123887, "learning_rate": 1.3387198404526705e-06, "loss": 0.77829117, "num_input_tokens_seen": 221710175, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.1151123, "step": 10296, "time_per_iteration": 2.6467647552490234 }, { "auxiliary_loss_clip": 0.06426457, "auxiliary_loss_mlp": 0.01275085, "balance_loss_clip": 0.06276742, "balance_loss_mlp": 0.01262688, "epoch": 0.6190891327220802, "flos": 22536192476160.0, "grad_norm": 1.7739673785723085, "language_loss": 0.71740919, "learning_rate": 1.3383522958331287e-06, "loss": 0.79442465, "num_input_tokens_seen": 221728145, "router_z_loss_clip": 1.49511719, "router_z_loss_mlp": 0.12390137, "step": 10297, "time_per_iteration": 2.592561721801758 }, { "auxiliary_loss_clip": 0.06318136, "auxiliary_loss_mlp": 0.01255452, "balance_loss_clip": 0.06257176, "balance_loss_mlp": 0.01253668, "epoch": 0.6191492559747482, "flos": 67748756509440.0, "grad_norm": 0.8731781701239526, "language_loss": 0.64207101, "learning_rate": 1.3379847763033345e-06, "loss": 0.71780688, "num_input_tokens_seen": 221786100, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.01782227, "step": 10298, "time_per_iteration": 4.568770408630371 }, { "auxiliary_loss_clip": 0.06432243, "auxiliary_loss_mlp": 0.01276101, "balance_loss_clip": 0.06280037, "balance_loss_mlp": 0.01264532, "epoch": 0.6192093792274163, "flos": 22353316940160.0, "grad_norm": 1.9621677119753964, "language_loss": 0.7436043, "learning_rate": 1.3376172818772236e-06, "loss": 0.82068765, "num_input_tokens_seen": 221806450, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.11578369, "step": 10299, "time_per_iteration": 2.6275930404663086 }, { "auxiliary_loss_clip": 0.06436323, "auxiliary_loss_mlp": 0.01274181, "balance_loss_clip": 0.06280939, "balance_loss_mlp": 0.0126254, "epoch": 0.6192695024800842, "flos": 13559176500480.0, "grad_norm": 1.6511549703380681, "language_loss": 0.68513966, "learning_rate": 1.337249812568732e-06, "loss": 0.7622447, "num_input_tokens_seen": 221823330, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.11645508, "step": 10300, "time_per_iteration": 2.5755701065063477 }, { "auxiliary_loss_clip": 0.06425966, "auxiliary_loss_mlp": 0.01270707, "balance_loss_clip": 0.06276512, "balance_loss_mlp": 0.01258769, "epoch": 0.6193296257327522, "flos": 17420163661440.0, "grad_norm": 1.9172279990063845, "language_loss": 0.67155051, "learning_rate": 1.3368823683917939e-06, "loss": 0.74851727, "num_input_tokens_seen": 221839360, "router_z_loss_clip": 1.49414062, "router_z_loss_mlp": 0.11950684, "step": 10301, "time_per_iteration": 2.5781946182250977 }, { "auxiliary_loss_clip": 0.06428584, "auxiliary_loss_mlp": 0.01268745, "balance_loss_clip": 0.06279033, "balance_loss_mlp": 0.01257652, "epoch": 0.6193897489854201, "flos": 31108869774720.0, "grad_norm": 1.5888714264080686, "language_loss": 0.73143113, "learning_rate": 1.3365149493603424e-06, "loss": 0.80840445, "num_input_tokens_seen": 221859465, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.11090088, "step": 10302, "time_per_iteration": 2.6718788146972656 }, { "auxiliary_loss_clip": 0.06425974, "auxiliary_loss_mlp": 0.01271049, "balance_loss_clip": 0.0627745, "balance_loss_mlp": 0.01259349, "epoch": 0.6194498722380881, "flos": 19139822605440.0, "grad_norm": 1.6787732359680063, "language_loss": 0.80847681, "learning_rate": 1.3361475554883107e-06, "loss": 0.88544703, "num_input_tokens_seen": 221878555, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.11700439, "step": 10303, "time_per_iteration": 2.6047072410583496 }, { "auxiliary_loss_clip": 0.06433265, "auxiliary_loss_mlp": 0.01272072, "balance_loss_clip": 0.06279036, "balance_loss_mlp": 0.01259025, "epoch": 0.619509995490756, "flos": 21841517001600.0, "grad_norm": 1.553544876618618, "language_loss": 0.7675885, "learning_rate": 1.3357801867896307e-06, "loss": 0.84464192, "num_input_tokens_seen": 221898790, "router_z_loss_clip": 1.54199219, "router_z_loss_mlp": 0.13049316, "step": 10304, "time_per_iteration": 2.566822052001953 }, { "auxiliary_loss_clip": 0.06436514, "auxiliary_loss_mlp": 0.01272391, "balance_loss_clip": 0.0628163, "balance_loss_mlp": 0.01260386, "epoch": 0.619570118743424, "flos": 23813512116480.0, "grad_norm": 2.176845507614803, "language_loss": 0.77454734, "learning_rate": 1.3354128432782324e-06, "loss": 0.85163629, "num_input_tokens_seen": 221918875, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.12011719, "step": 10305, "time_per_iteration": 2.618469476699829 }, { "auxiliary_loss_clip": 0.06433403, "auxiliary_loss_mlp": 0.01272892, "balance_loss_clip": 0.06279447, "balance_loss_mlp": 0.01260792, "epoch": 0.619630241996092, "flos": 21107289600000.0, "grad_norm": 1.7735740720266793, "language_loss": 0.78974575, "learning_rate": 1.335045524968045e-06, "loss": 0.86680871, "num_input_tokens_seen": 221937895, "router_z_loss_clip": 1.54101562, "router_z_loss_mlp": 0.12078857, "step": 10306, "time_per_iteration": 2.5399868488311768 }, { "auxiliary_loss_clip": 0.06422253, "auxiliary_loss_mlp": 0.01267121, "balance_loss_clip": 0.0627657, "balance_loss_mlp": 0.01257197, "epoch": 0.61969036524876, "flos": 27315666167040.0, "grad_norm": 1.5764557465558688, "language_loss": 0.80636317, "learning_rate": 1.3346782318729988e-06, "loss": 0.88325691, "num_input_tokens_seen": 221955920, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.09924316, "step": 10307, "time_per_iteration": 2.6019845008850098 }, { "auxiliary_loss_clip": 0.06309664, "auxiliary_loss_mlp": 0.01255649, "balance_loss_clip": 0.06249137, "balance_loss_mlp": 0.01253979, "epoch": 0.6197504885014279, "flos": 51667308403200.0, "grad_norm": 0.8001028263898546, "language_loss": 0.59493661, "learning_rate": 1.3343109640070203e-06, "loss": 0.67058969, "num_input_tokens_seen": 222011405, "router_z_loss_clip": 0.60546875, "router_z_loss_mlp": 0.01673889, "step": 10308, "time_per_iteration": 3.18144154548645 }, { "auxiliary_loss_clip": 0.06421916, "auxiliary_loss_mlp": 0.01264823, "balance_loss_clip": 0.06277032, "balance_loss_mlp": 0.01254553, "epoch": 0.6198106117540959, "flos": 30565316338560.0, "grad_norm": 1.7084495119816725, "language_loss": 0.68020833, "learning_rate": 1.333943721384037e-06, "loss": 0.75707573, "num_input_tokens_seen": 222034545, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10266113, "step": 10309, "time_per_iteration": 2.6233601570129395 }, { "auxiliary_loss_clip": 0.06424158, "auxiliary_loss_mlp": 0.01268333, "balance_loss_clip": 0.06277913, "balance_loss_mlp": 0.01256853, "epoch": 0.6198707350067638, "flos": 18914586030720.0, "grad_norm": 1.7728522430006282, "language_loss": 0.7258482, "learning_rate": 1.3335765040179746e-06, "loss": 0.80277312, "num_input_tokens_seen": 222052690, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.11480713, "step": 10310, "time_per_iteration": 2.554790496826172 }, { "auxiliary_loss_clip": 0.06434688, "auxiliary_loss_mlp": 0.0127239, "balance_loss_clip": 0.06282419, "balance_loss_mlp": 0.01259807, "epoch": 0.6199308582594318, "flos": 21440238998400.0, "grad_norm": 1.9230473462416287, "language_loss": 0.79009855, "learning_rate": 1.3332093119227573e-06, "loss": 0.86716926, "num_input_tokens_seen": 222069095, "router_z_loss_clip": 1.52441406, "router_z_loss_mlp": 0.12579346, "step": 10311, "time_per_iteration": 2.6012868881225586 }, { "auxiliary_loss_clip": 0.06428373, "auxiliary_loss_mlp": 0.01264313, "balance_loss_clip": 0.06277436, "balance_loss_mlp": 0.01253012, "epoch": 0.6199909815120999, "flos": 18413561341440.0, "grad_norm": 1.8209807608013304, "language_loss": 0.72793889, "learning_rate": 1.3328421451123105e-06, "loss": 0.80486572, "num_input_tokens_seen": 222087360, "router_z_loss_clip": 1.50878906, "router_z_loss_mlp": 0.11291504, "step": 10312, "time_per_iteration": 2.5755038261413574 }, { "auxiliary_loss_clip": 0.06432545, "auxiliary_loss_mlp": 0.01269762, "balance_loss_clip": 0.06277955, "balance_loss_mlp": 0.01257912, "epoch": 0.6200511047647678, "flos": 21472663328640.0, "grad_norm": 1.9275280543920008, "language_loss": 0.72240198, "learning_rate": 1.3324750036005557e-06, "loss": 0.79942513, "num_input_tokens_seen": 222106130, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.1184082, "step": 10313, "time_per_iteration": 2.5796585083007812 }, { "auxiliary_loss_clip": 0.06434409, "auxiliary_loss_mlp": 0.01267352, "balance_loss_clip": 0.06279571, "balance_loss_mlp": 0.01254555, "epoch": 0.6201112280174358, "flos": 18220539461760.0, "grad_norm": 1.950138670525735, "language_loss": 0.78430903, "learning_rate": 1.332107887401416e-06, "loss": 0.86132663, "num_input_tokens_seen": 222123125, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.12774658, "step": 10314, "time_per_iteration": 2.5400197505950928 }, { "auxiliary_loss_clip": 0.06427741, "auxiliary_loss_mlp": 0.01263769, "balance_loss_clip": 0.06276997, "balance_loss_mlp": 0.01252789, "epoch": 0.6201713512701037, "flos": 20017373616000.0, "grad_norm": 1.7474689859860555, "language_loss": 0.78285384, "learning_rate": 1.331740796528812e-06, "loss": 0.85976899, "num_input_tokens_seen": 222140655, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.10968018, "step": 10315, "time_per_iteration": 2.546416759490967 }, { "auxiliary_loss_clip": 0.06429936, "auxiliary_loss_mlp": 0.01271786, "balance_loss_clip": 0.06275275, "balance_loss_mlp": 0.01261331, "epoch": 0.6202314745227717, "flos": 22493537948160.0, "grad_norm": 1.73464968481875, "language_loss": 0.75848663, "learning_rate": 1.3313737309966641e-06, "loss": 0.83550382, "num_input_tokens_seen": 222160450, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.10455322, "step": 10316, "time_per_iteration": 2.575460195541382 }, { "auxiliary_loss_clip": 0.06431773, "auxiliary_loss_mlp": 0.01263207, "balance_loss_clip": 0.06278597, "balance_loss_mlp": 0.01252329, "epoch": 0.6202915977754396, "flos": 26835116601600.0, "grad_norm": 1.924778588995663, "language_loss": 0.7753318, "learning_rate": 1.3310066908188915e-06, "loss": 0.85228157, "num_input_tokens_seen": 222179170, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.10894775, "step": 10317, "time_per_iteration": 2.5960733890533447 }, { "auxiliary_loss_clip": 0.06319604, "auxiliary_loss_mlp": 0.01256317, "balance_loss_clip": 0.06258911, "balance_loss_mlp": 0.01254819, "epoch": 0.6203517210281076, "flos": 62763248828160.0, "grad_norm": 0.7080615026492226, "language_loss": 0.58815551, "learning_rate": 1.3306396760094122e-06, "loss": 0.66391468, "num_input_tokens_seen": 222242660, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.01496887, "step": 10318, "time_per_iteration": 3.232144594192505 }, { "auxiliary_loss_clip": 0.0642668, "auxiliary_loss_mlp": 0.01266338, "balance_loss_clip": 0.06276116, "balance_loss_mlp": 0.01254703, "epoch": 0.6204118442807756, "flos": 23411018229120.0, "grad_norm": 1.521449862392737, "language_loss": 0.78279471, "learning_rate": 1.330272686582143e-06, "loss": 0.85972488, "num_input_tokens_seen": 222262170, "router_z_loss_clip": 1.50585938, "router_z_loss_mlp": 0.11633301, "step": 10319, "time_per_iteration": 2.5795488357543945 }, { "auxiliary_loss_clip": 0.06422018, "auxiliary_loss_mlp": 0.01268407, "balance_loss_clip": 0.06275757, "balance_loss_mlp": 0.01258226, "epoch": 0.6204719675334436, "flos": 20199871808640.0, "grad_norm": 2.3782819003717814, "language_loss": 0.66360676, "learning_rate": 1.3299057225510013e-06, "loss": 0.74051106, "num_input_tokens_seen": 222280375, "router_z_loss_clip": 1.46289062, "router_z_loss_mlp": 0.10186768, "step": 10320, "time_per_iteration": 4.146697759628296 }, { "auxiliary_loss_clip": 0.06420983, "auxiliary_loss_mlp": 0.01264899, "balance_loss_clip": 0.06275675, "balance_loss_mlp": 0.01254624, "epoch": 0.6205320907861115, "flos": 13193048085120.0, "grad_norm": 1.6511740206519412, "language_loss": 0.76513022, "learning_rate": 1.3295387839299013e-06, "loss": 0.84198904, "num_input_tokens_seen": 222297325, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.10284424, "step": 10321, "time_per_iteration": 2.5715713500976562 }, { "auxiliary_loss_clip": 0.06421369, "auxiliary_loss_mlp": 0.01267039, "balance_loss_clip": 0.06275375, "balance_loss_mlp": 0.01256376, "epoch": 0.6205922140387795, "flos": 20674761223680.0, "grad_norm": 1.6608542114931235, "language_loss": 0.73771054, "learning_rate": 1.329171870732758e-06, "loss": 0.81459463, "num_input_tokens_seen": 222317095, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10668945, "step": 10322, "time_per_iteration": 2.562761068344116 }, { "auxiliary_loss_clip": 0.06425433, "auxiliary_loss_mlp": 0.01265978, "balance_loss_clip": 0.06278628, "balance_loss_mlp": 0.01256519, "epoch": 0.6206523372914474, "flos": 23884524051840.0, "grad_norm": 2.9757474321681534, "language_loss": 0.73103958, "learning_rate": 1.3288049829734845e-06, "loss": 0.80795372, "num_input_tokens_seen": 222337055, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.09460449, "step": 10323, "time_per_iteration": 2.626020669937134 }, { "auxiliary_loss_clip": 0.06440413, "auxiliary_loss_mlp": 0.01271293, "balance_loss_clip": 0.06282962, "balance_loss_mlp": 0.01259688, "epoch": 0.6207124605441154, "flos": 13411576333440.0, "grad_norm": 2.224094327391912, "language_loss": 0.59449381, "learning_rate": 1.3284381206659933e-06, "loss": 0.67161083, "num_input_tokens_seen": 222354515, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.1161499, "step": 10324, "time_per_iteration": 4.009197473526001 }, { "auxiliary_loss_clip": 0.06432576, "auxiliary_loss_mlp": 0.01268153, "balance_loss_clip": 0.06281315, "balance_loss_mlp": 0.01255785, "epoch": 0.6207725837967835, "flos": 18922300606080.0, "grad_norm": 2.486636269392478, "language_loss": 0.77226126, "learning_rate": 1.3280712838241956e-06, "loss": 0.84926844, "num_input_tokens_seen": 222372755, "router_z_loss_clip": 1.51074219, "router_z_loss_mlp": 0.12365723, "step": 10325, "time_per_iteration": 2.575908660888672 }, { "auxiliary_loss_clip": 0.06436896, "auxiliary_loss_mlp": 0.01266355, "balance_loss_clip": 0.06283273, "balance_loss_mlp": 0.01255114, "epoch": 0.6208327070494514, "flos": 23985738184320.0, "grad_norm": 1.8183426053617113, "language_loss": 0.72655219, "learning_rate": 1.327704472462003e-06, "loss": 0.80358469, "num_input_tokens_seen": 222391380, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.11248779, "step": 10326, "time_per_iteration": 2.710003614425659 }, { "auxiliary_loss_clip": 0.06428754, "auxiliary_loss_mlp": 0.01268937, "balance_loss_clip": 0.06276298, "balance_loss_mlp": 0.0125738, "epoch": 0.6208928303021194, "flos": 22827032398080.0, "grad_norm": 2.5044286701323277, "language_loss": 0.74368137, "learning_rate": 1.3273376865933234e-06, "loss": 0.82065827, "num_input_tokens_seen": 222411165, "router_z_loss_clip": 1.52636719, "router_z_loss_mlp": 0.11572266, "step": 10327, "time_per_iteration": 2.854325532913208 }, { "auxiliary_loss_clip": 0.06432424, "auxiliary_loss_mlp": 0.01267735, "balance_loss_clip": 0.06279463, "balance_loss_mlp": 0.01255939, "epoch": 0.6209529535547873, "flos": 17569944034560.0, "grad_norm": 2.1986504115368932, "language_loss": 0.80403507, "learning_rate": 1.326970926232066e-06, "loss": 0.88103664, "num_input_tokens_seen": 222428110, "router_z_loss_clip": 1.52929688, "router_z_loss_mlp": 0.11791992, "step": 10328, "time_per_iteration": 2.6734015941619873 }, { "auxiliary_loss_clip": 0.06425636, "auxiliary_loss_mlp": 0.01266332, "balance_loss_clip": 0.06275439, "balance_loss_mlp": 0.0125549, "epoch": 0.6210130768074553, "flos": 22017432648960.0, "grad_norm": 1.6620033671849253, "language_loss": 0.78077149, "learning_rate": 1.3266041913921396e-06, "loss": 0.85769117, "num_input_tokens_seen": 222446385, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.10852051, "step": 10329, "time_per_iteration": 2.6083803176879883 }, { "auxiliary_loss_clip": 0.06315957, "auxiliary_loss_mlp": 0.01253528, "balance_loss_clip": 0.06255212, "balance_loss_mlp": 0.01251959, "epoch": 0.6210732000601232, "flos": 63695166739200.0, "grad_norm": 0.82898032277051, "language_loss": 0.62153006, "learning_rate": 1.3262374820874484e-06, "loss": 0.69722497, "num_input_tokens_seen": 222502150, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.01569366, "step": 10330, "time_per_iteration": 3.12148118019104 }, { "auxiliary_loss_clip": 0.06434388, "auxiliary_loss_mlp": 0.01267801, "balance_loss_clip": 0.06280692, "balance_loss_mlp": 0.01256005, "epoch": 0.6211333233127913, "flos": 24250233196800.0, "grad_norm": 3.033079928901951, "language_loss": 0.77944469, "learning_rate": 1.3258707983319002e-06, "loss": 0.85646665, "num_input_tokens_seen": 222519880, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.11804199, "step": 10331, "time_per_iteration": 2.5760657787323 }, { "auxiliary_loss_clip": 0.06432922, "auxiliary_loss_mlp": 0.01268653, "balance_loss_clip": 0.06279816, "balance_loss_mlp": 0.01256803, "epoch": 0.6211934465654592, "flos": 16949047680000.0, "grad_norm": 2.0729772687285037, "language_loss": 0.67519563, "learning_rate": 1.3255041401393992e-06, "loss": 0.75221139, "num_input_tokens_seen": 222538545, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.11859131, "step": 10332, "time_per_iteration": 2.5689759254455566 }, { "auxiliary_loss_clip": 0.06431344, "auxiliary_loss_mlp": 0.01267874, "balance_loss_clip": 0.06280789, "balance_loss_mlp": 0.01257354, "epoch": 0.6212535698181272, "flos": 15272672169600.0, "grad_norm": 2.1783194123015814, "language_loss": 0.76537192, "learning_rate": 1.3251375075238476e-06, "loss": 0.84236407, "num_input_tokens_seen": 222556935, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.10516357, "step": 10333, "time_per_iteration": 2.5454111099243164 }, { "auxiliary_loss_clip": 0.06423955, "auxiliary_loss_mlp": 0.01268369, "balance_loss_clip": 0.0627794, "balance_loss_mlp": 0.01257474, "epoch": 0.6213136930707951, "flos": 13449073835520.0, "grad_norm": 2.9644596898354405, "language_loss": 0.69801962, "learning_rate": 1.3247709004991507e-06, "loss": 0.77494287, "num_input_tokens_seen": 222574035, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.10894775, "step": 10334, "time_per_iteration": 2.5358011722564697 }, { "auxiliary_loss_clip": 0.06424189, "auxiliary_loss_mlp": 0.0126707, "balance_loss_clip": 0.06276893, "balance_loss_mlp": 0.01256943, "epoch": 0.6213738163234631, "flos": 18116641998720.0, "grad_norm": 3.3981137198218545, "language_loss": 0.70536435, "learning_rate": 1.3244043190792078e-06, "loss": 0.78227693, "num_input_tokens_seen": 222592290, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.10125732, "step": 10335, "time_per_iteration": 3.89906644821167 }, { "auxiliary_loss_clip": 0.06425565, "auxiliary_loss_mlp": 0.01268433, "balance_loss_clip": 0.06280209, "balance_loss_mlp": 0.0125858, "epoch": 0.621433939576131, "flos": 25344299957760.0, "grad_norm": 1.4898166260532137, "language_loss": 0.80313218, "learning_rate": 1.3240377632779213e-06, "loss": 0.88007218, "num_input_tokens_seen": 222612805, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.09857178, "step": 10336, "time_per_iteration": 2.584383726119995 }, { "auxiliary_loss_clip": 0.06422976, "auxiliary_loss_mlp": 0.01270244, "balance_loss_clip": 0.06278617, "balance_loss_mlp": 0.0125982, "epoch": 0.621494062828799, "flos": 22572306385920.0, "grad_norm": 1.6620570449648593, "language_loss": 0.73714179, "learning_rate": 1.3236712331091907e-06, "loss": 0.81407398, "num_input_tokens_seen": 222632260, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10430908, "step": 10337, "time_per_iteration": 4.008527517318726 }, { "auxiliary_loss_clip": 0.06431151, "auxiliary_loss_mlp": 0.01269659, "balance_loss_clip": 0.06279718, "balance_loss_mlp": 0.01257219, "epoch": 0.621554186081467, "flos": 27425433415680.0, "grad_norm": 2.180908091393894, "language_loss": 0.63363856, "learning_rate": 1.3233047285869145e-06, "loss": 0.71064663, "num_input_tokens_seen": 222653570, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.12432861, "step": 10338, "time_per_iteration": 2.6393368244171143 }, { "auxiliary_loss_clip": 0.06424714, "auxiliary_loss_mlp": 0.0126877, "balance_loss_clip": 0.06276434, "balance_loss_mlp": 0.01258071, "epoch": 0.621614309334135, "flos": 22353484648320.0, "grad_norm": 1.4907938909479905, "language_loss": 0.71907997, "learning_rate": 1.322938249724991e-06, "loss": 0.79601479, "num_input_tokens_seen": 222672480, "router_z_loss_clip": 1.48339844, "router_z_loss_mlp": 0.1071167, "step": 10339, "time_per_iteration": 2.5781588554382324 }, { "auxiliary_loss_clip": 0.06419618, "auxiliary_loss_mlp": 0.01264894, "balance_loss_clip": 0.06275678, "balance_loss_mlp": 0.01254237, "epoch": 0.621674432586803, "flos": 19287255064320.0, "grad_norm": 1.691842112686629, "language_loss": 0.69577885, "learning_rate": 1.3225717965373166e-06, "loss": 0.77262396, "num_input_tokens_seen": 222691200, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10656738, "step": 10340, "time_per_iteration": 2.5663554668426514 }, { "auxiliary_loss_clip": 0.06420743, "auxiliary_loss_mlp": 0.01266684, "balance_loss_clip": 0.06274864, "balance_loss_mlp": 0.01255848, "epoch": 0.6217345558394709, "flos": 21614812980480.0, "grad_norm": 1.729563571177792, "language_loss": 0.69164217, "learning_rate": 1.322205369037788e-06, "loss": 0.76851642, "num_input_tokens_seen": 222709975, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10839844, "step": 10341, "time_per_iteration": 2.5820186138153076 }, { "auxiliary_loss_clip": 0.06431662, "auxiliary_loss_mlp": 0.01267803, "balance_loss_clip": 0.06281385, "balance_loss_mlp": 0.01255989, "epoch": 0.6217946790921389, "flos": 18009893496960.0, "grad_norm": 1.7287909068595395, "language_loss": 0.80936825, "learning_rate": 1.321838967240299e-06, "loss": 0.88636291, "num_input_tokens_seen": 222729005, "router_z_loss_clip": 1.50097656, "router_z_loss_mlp": 0.1182251, "step": 10342, "time_per_iteration": 2.5751633644104004 }, { "auxiliary_loss_clip": 0.06316382, "auxiliary_loss_mlp": 0.01251489, "balance_loss_clip": 0.06255936, "balance_loss_mlp": 0.012501, "epoch": 0.6218548023448068, "flos": 61993578349440.0, "grad_norm": 0.7576959293390715, "language_loss": 0.57260072, "learning_rate": 1.3214725911587452e-06, "loss": 0.64827943, "num_input_tokens_seen": 222786090, "router_z_loss_clip": 0.60546875, "router_z_loss_mlp": 0.0138855, "step": 10343, "time_per_iteration": 3.107043504714966 }, { "auxiliary_loss_clip": 0.06418669, "auxiliary_loss_mlp": 0.01265927, "balance_loss_clip": 0.06274929, "balance_loss_mlp": 0.01255591, "epoch": 0.6219149255974749, "flos": 25746248793600.0, "grad_norm": 2.0806854374814083, "language_loss": 0.73464161, "learning_rate": 1.3211062408070184e-06, "loss": 0.81148762, "num_input_tokens_seen": 222806100, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10333252, "step": 10344, "time_per_iteration": 2.606806993484497 }, { "auxiliary_loss_clip": 0.06421216, "auxiliary_loss_mlp": 0.01264299, "balance_loss_clip": 0.06273507, "balance_loss_mlp": 0.01253588, "epoch": 0.6219750488501428, "flos": 25418162931840.0, "grad_norm": 1.816541791088556, "language_loss": 0.6000911, "learning_rate": 1.3207399161990105e-06, "loss": 0.67694628, "num_input_tokens_seen": 222826575, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.10699463, "step": 10345, "time_per_iteration": 2.5968568325042725 }, { "auxiliary_loss_clip": 0.06427675, "auxiliary_loss_mlp": 0.01265511, "balance_loss_clip": 0.06278767, "balance_loss_mlp": 0.01254317, "epoch": 0.6220351721028108, "flos": 20053529452800.0, "grad_norm": 1.802482227561058, "language_loss": 0.78189111, "learning_rate": 1.320373617348614e-06, "loss": 0.85882294, "num_input_tokens_seen": 222845285, "router_z_loss_clip": 1.48925781, "router_z_loss_mlp": 0.11187744, "step": 10346, "time_per_iteration": 2.5759944915771484 }, { "auxiliary_loss_clip": 0.06427416, "auxiliary_loss_mlp": 0.01266263, "balance_loss_clip": 0.06276438, "balance_loss_mlp": 0.01254413, "epoch": 0.6220952953554787, "flos": 27495439102080.0, "grad_norm": 1.6990180106432196, "language_loss": 0.71690583, "learning_rate": 1.3200073442697171e-06, "loss": 0.79384255, "num_input_tokens_seen": 222864575, "router_z_loss_clip": 1.50878906, "router_z_loss_mlp": 0.11859131, "step": 10347, "time_per_iteration": 2.625120162963867 }, { "auxiliary_loss_clip": 0.06419525, "auxiliary_loss_mlp": 0.01263602, "balance_loss_clip": 0.06272773, "balance_loss_mlp": 0.01253373, "epoch": 0.6221554186081467, "flos": 19213517871360.0, "grad_norm": 1.8798492094767363, "language_loss": 0.72181427, "learning_rate": 1.3196410969762108e-06, "loss": 0.79864556, "num_input_tokens_seen": 222884420, "router_z_loss_clip": 1.46972656, "router_z_loss_mlp": 0.10229492, "step": 10348, "time_per_iteration": 2.5945777893066406 }, { "auxiliary_loss_clip": 0.06316372, "auxiliary_loss_mlp": 0.01251183, "balance_loss_clip": 0.06256253, "balance_loss_mlp": 0.01249502, "epoch": 0.6222155418608146, "flos": 62969744016000.0, "grad_norm": 0.7899471556075299, "language_loss": 0.54007685, "learning_rate": 1.3192748754819815e-06, "loss": 0.61575246, "num_input_tokens_seen": 222944690, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.0168457, "step": 10349, "time_per_iteration": 3.1992452144622803 }, { "auxiliary_loss_clip": 0.06419484, "auxiliary_loss_mlp": 0.01264264, "balance_loss_clip": 0.06270556, "balance_loss_mlp": 0.01253887, "epoch": 0.6222756651134826, "flos": 22607623681920.0, "grad_norm": 1.9034999980055751, "language_loss": 0.70116037, "learning_rate": 1.3189086798009173e-06, "loss": 0.77799785, "num_input_tokens_seen": 222962990, "router_z_loss_clip": 1.49023438, "router_z_loss_mlp": 0.1038208, "step": 10350, "time_per_iteration": 2.558150053024292 }, { "auxiliary_loss_clip": 0.0642613, "auxiliary_loss_mlp": 0.01267438, "balance_loss_clip": 0.0627629, "balance_loss_mlp": 0.01256209, "epoch": 0.6223357883661506, "flos": 21148602462720.0, "grad_norm": 2.0818825266486463, "language_loss": 0.57261181, "learning_rate": 1.3185425099469046e-06, "loss": 0.64954752, "num_input_tokens_seen": 222980715, "router_z_loss_clip": 1.49902344, "router_z_loss_mlp": 0.11230469, "step": 10351, "time_per_iteration": 2.56367564201355 }, { "auxiliary_loss_clip": 0.06316178, "auxiliary_loss_mlp": 0.01253191, "balance_loss_clip": 0.062558, "balance_loss_mlp": 0.01251594, "epoch": 0.6223959116188186, "flos": 63785926310400.0, "grad_norm": 0.7871850785935974, "language_loss": 0.61013734, "learning_rate": 1.3181763659338276e-06, "loss": 0.68583107, "num_input_tokens_seen": 223040685, "router_z_loss_clip": 0.60498047, "router_z_loss_mlp": 0.01599121, "step": 10352, "time_per_iteration": 3.114537239074707 }, { "auxiliary_loss_clip": 0.0641887, "auxiliary_loss_mlp": 0.01262549, "balance_loss_clip": 0.06274833, "balance_loss_mlp": 0.01252583, "epoch": 0.6224560348714866, "flos": 22572432167040.0, "grad_norm": 1.9593450447102343, "language_loss": 0.82281172, "learning_rate": 1.3178102477755714e-06, "loss": 0.89962596, "num_input_tokens_seen": 223059000, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.09967041, "step": 10353, "time_per_iteration": 2.5770957469940186 }, { "auxiliary_loss_clip": 0.06414248, "auxiliary_loss_mlp": 0.01272437, "balance_loss_clip": 0.06271133, "balance_loss_mlp": 0.01262256, "epoch": 0.6225161581241545, "flos": 24104645527680.0, "grad_norm": 1.6787653493717984, "language_loss": 0.75824893, "learning_rate": 1.3174441554860195e-06, "loss": 0.83511579, "num_input_tokens_seen": 223079345, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.10180664, "step": 10354, "time_per_iteration": 2.575216770172119 }, { "auxiliary_loss_clip": 0.06417468, "auxiliary_loss_mlp": 0.01265963, "balance_loss_clip": 0.06271257, "balance_loss_mlp": 0.01255604, "epoch": 0.6225762813768225, "flos": 20448853816320.0, "grad_norm": 1.6362047697398039, "language_loss": 0.78449386, "learning_rate": 1.3170780890790528e-06, "loss": 0.86132824, "num_input_tokens_seen": 223097880, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.10357666, "step": 10355, "time_per_iteration": 2.572204351425171 }, { "auxiliary_loss_clip": 0.06421316, "auxiliary_loss_mlp": 0.01267808, "balance_loss_clip": 0.06271541, "balance_loss_mlp": 0.01258045, "epoch": 0.6226364046294904, "flos": 27205395793920.0, "grad_norm": 1.5512916479219592, "language_loss": 0.78012246, "learning_rate": 1.3167120485685538e-06, "loss": 0.8570137, "num_input_tokens_seen": 223118185, "router_z_loss_clip": 1.49414062, "router_z_loss_mlp": 0.09771729, "step": 10356, "time_per_iteration": 2.59159779548645 }, { "auxiliary_loss_clip": 0.06431956, "auxiliary_loss_mlp": 0.01270362, "balance_loss_clip": 0.06276374, "balance_loss_mlp": 0.01257416, "epoch": 0.6226965278821585, "flos": 20451495219840.0, "grad_norm": 9.866257217913157, "language_loss": 0.67843103, "learning_rate": 1.3163460339684024e-06, "loss": 0.75545418, "num_input_tokens_seen": 223137600, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.12945557, "step": 10357, "time_per_iteration": 2.5656580924987793 }, { "auxiliary_loss_clip": 0.06429017, "auxiliary_loss_mlp": 0.01268342, "balance_loss_clip": 0.0627401, "balance_loss_mlp": 0.0125654, "epoch": 0.6227566511348264, "flos": 22169099738880.0, "grad_norm": 2.948850884526411, "language_loss": 0.76370472, "learning_rate": 1.3159800452924778e-06, "loss": 0.84067827, "num_input_tokens_seen": 223154360, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.11810303, "step": 10358, "time_per_iteration": 2.526913642883301 }, { "auxiliary_loss_clip": 0.06426609, "auxiliary_loss_mlp": 0.01267636, "balance_loss_clip": 0.06274957, "balance_loss_mlp": 0.01257003, "epoch": 0.6228167743874944, "flos": 18046720166400.0, "grad_norm": 2.0632320139505693, "language_loss": 0.82718837, "learning_rate": 1.3156140825546588e-06, "loss": 0.90413082, "num_input_tokens_seen": 223172255, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.10638428, "step": 10359, "time_per_iteration": 2.569904327392578 }, { "auxiliary_loss_clip": 0.06419165, "auxiliary_loss_mlp": 0.01265247, "balance_loss_clip": 0.06274553, "balance_loss_mlp": 0.01254506, "epoch": 0.6228768976401623, "flos": 17747620617600.0, "grad_norm": 2.068681262684502, "language_loss": 0.74217534, "learning_rate": 1.315248145768822e-06, "loss": 0.81901944, "num_input_tokens_seen": 223186965, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.10736084, "step": 10360, "time_per_iteration": 3.949439764022827 }, { "auxiliary_loss_clip": 0.06425153, "auxiliary_loss_mlp": 0.01265846, "balance_loss_clip": 0.06276841, "balance_loss_mlp": 0.01255004, "epoch": 0.6229370208928303, "flos": 17900755153920.0, "grad_norm": 1.9994118616899743, "language_loss": 0.7819804, "learning_rate": 1.3148822349488442e-06, "loss": 0.85889041, "num_input_tokens_seen": 223206045, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.10839844, "step": 10361, "time_per_iteration": 2.567389726638794 }, { "auxiliary_loss_clip": 0.06422912, "auxiliary_loss_mlp": 0.01267898, "balance_loss_clip": 0.06275468, "balance_loss_mlp": 0.01257247, "epoch": 0.6229971441454982, "flos": 17353512138240.0, "grad_norm": 1.696024511815875, "language_loss": 0.68057114, "learning_rate": 1.3145163501086005e-06, "loss": 0.75747925, "num_input_tokens_seen": 223224820, "router_z_loss_clip": 1.47363281, "router_z_loss_mlp": 0.10656738, "step": 10362, "time_per_iteration": 2.5501515865325928 }, { "auxiliary_loss_clip": 0.06425124, "auxiliary_loss_mlp": 0.01264717, "balance_loss_clip": 0.06276447, "balance_loss_mlp": 0.01253148, "epoch": 0.6230572673981662, "flos": 29248989822720.0, "grad_norm": 2.3115826096261447, "language_loss": 0.68203282, "learning_rate": 1.3141504912619658e-06, "loss": 0.75893128, "num_input_tokens_seen": 223243205, "router_z_loss_clip": 1.48632812, "router_z_loss_mlp": 0.11578369, "step": 10363, "time_per_iteration": 2.628967046737671 }, { "auxiliary_loss_clip": 0.06429075, "auxiliary_loss_mlp": 0.01264661, "balance_loss_clip": 0.06276342, "balance_loss_mlp": 0.01253145, "epoch": 0.6231173906508342, "flos": 16331505488640.0, "grad_norm": 1.6726922809301292, "language_loss": 0.86714238, "learning_rate": 1.3137846584228127e-06, "loss": 0.9440797, "num_input_tokens_seen": 223261370, "router_z_loss_clip": 1.52734375, "router_z_loss_mlp": 0.11523438, "step": 10364, "time_per_iteration": 3.9853262901306152 }, { "auxiliary_loss_clip": 0.06314589, "auxiliary_loss_mlp": 0.01251673, "balance_loss_clip": 0.06254193, "balance_loss_mlp": 0.01250219, "epoch": 0.6231775139035022, "flos": 68719513587840.0, "grad_norm": 0.8719971600152815, "language_loss": 0.60692441, "learning_rate": 1.313418851605015e-06, "loss": 0.68258715, "num_input_tokens_seen": 223315050, "router_z_loss_clip": 0.60546875, "router_z_loss_mlp": 0.01451874, "step": 10365, "time_per_iteration": 3.1851460933685303 }, { "auxiliary_loss_clip": 0.06433772, "auxiliary_loss_mlp": 0.0127147, "balance_loss_clip": 0.06277342, "balance_loss_mlp": 0.01258828, "epoch": 0.6232376371561702, "flos": 19825903036800.0, "grad_norm": 1.6691285482434894, "language_loss": 0.75437266, "learning_rate": 1.3130530708224427e-06, "loss": 0.83142507, "num_input_tokens_seen": 223332130, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.12646484, "step": 10366, "time_per_iteration": 2.5563511848449707 }, { "auxiliary_loss_clip": 0.06428754, "auxiliary_loss_mlp": 0.01267406, "balance_loss_clip": 0.06276741, "balance_loss_mlp": 0.01256105, "epoch": 0.6232977604088381, "flos": 23264969362560.0, "grad_norm": 1.8178601799993632, "language_loss": 0.76891226, "learning_rate": 1.3126873160889665e-06, "loss": 0.84587389, "num_input_tokens_seen": 223351605, "router_z_loss_clip": 1.51953125, "router_z_loss_mlp": 0.11297607, "step": 10367, "time_per_iteration": 2.579770565032959 }, { "auxiliary_loss_clip": 0.06420676, "auxiliary_loss_mlp": 0.01269335, "balance_loss_clip": 0.0627546, "balance_loss_mlp": 0.01259244, "epoch": 0.6233578836615061, "flos": 21112907823360.0, "grad_norm": 1.4123334994268928, "language_loss": 0.78534645, "learning_rate": 1.312321587418457e-06, "loss": 0.86224663, "num_input_tokens_seen": 223372090, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.10107422, "step": 10368, "time_per_iteration": 2.5583078861236572 }, { "auxiliary_loss_clip": 0.06428042, "auxiliary_loss_mlp": 0.01264896, "balance_loss_clip": 0.06277484, "balance_loss_mlp": 0.01254089, "epoch": 0.623418006914174, "flos": 23776266176640.0, "grad_norm": 2.6647814537895647, "language_loss": 0.68785363, "learning_rate": 1.3119558848247811e-06, "loss": 0.76478302, "num_input_tokens_seen": 223390110, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.10803223, "step": 10369, "time_per_iteration": 2.573763132095337 }, { "auxiliary_loss_clip": 0.06426665, "auxiliary_loss_mlp": 0.01268034, "balance_loss_clip": 0.06276441, "balance_loss_mlp": 0.01256602, "epoch": 0.6234781301668421, "flos": 17895556200960.0, "grad_norm": 1.853083717216118, "language_loss": 0.87723351, "learning_rate": 1.3115902083218072e-06, "loss": 0.95418048, "num_input_tokens_seen": 223404205, "router_z_loss_clip": 1.50195312, "router_z_loss_mlp": 0.11419678, "step": 10370, "time_per_iteration": 2.5458903312683105 }, { "auxiliary_loss_clip": 0.06421947, "auxiliary_loss_mlp": 0.01268792, "balance_loss_clip": 0.06275505, "balance_loss_mlp": 0.01257861, "epoch": 0.62353825341951, "flos": 26182424822400.0, "grad_norm": 1.4767541140153793, "language_loss": 0.66290402, "learning_rate": 1.311224557923402e-06, "loss": 0.73981142, "num_input_tokens_seen": 223424855, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.109375, "step": 10371, "time_per_iteration": 2.588996171951294 }, { "auxiliary_loss_clip": 0.06414587, "auxiliary_loss_mlp": 0.01262644, "balance_loss_clip": 0.06274128, "balance_loss_mlp": 0.01253537, "epoch": 0.623598376672178, "flos": 31148044358400.0, "grad_norm": 1.2953165379658058, "language_loss": 0.77704865, "learning_rate": 1.3108589336434298e-06, "loss": 0.85382092, "num_input_tokens_seen": 223447225, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09106445, "step": 10372, "time_per_iteration": 2.6407723426818848 }, { "auxiliary_loss_clip": 0.06428368, "auxiliary_loss_mlp": 0.01265725, "balance_loss_clip": 0.06278542, "balance_loss_mlp": 0.01253757, "epoch": 0.6236584999248459, "flos": 23736588468480.0, "grad_norm": 1.7705735270848522, "language_loss": 0.78203499, "learning_rate": 1.3104933354957568e-06, "loss": 0.85897589, "num_input_tokens_seen": 223467520, "router_z_loss_clip": 1.49707031, "router_z_loss_mlp": 0.11981201, "step": 10373, "time_per_iteration": 2.5741524696350098 }, { "auxiliary_loss_clip": 0.06417777, "auxiliary_loss_mlp": 0.0126353, "balance_loss_clip": 0.0627434, "balance_loss_mlp": 0.01253868, "epoch": 0.6237186231775139, "flos": 21769289182080.0, "grad_norm": 1.5670799374769808, "language_loss": 0.69518387, "learning_rate": 1.3101277634942448e-06, "loss": 0.77199692, "num_input_tokens_seen": 223488130, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09667969, "step": 10374, "time_per_iteration": 2.6908657550811768 }, { "auxiliary_loss_clip": 0.06425934, "auxiliary_loss_mlp": 0.01267053, "balance_loss_clip": 0.06275941, "balance_loss_mlp": 0.01256575, "epoch": 0.6237787464301818, "flos": 14944795943040.0, "grad_norm": 1.6794691365189252, "language_loss": 0.76939076, "learning_rate": 1.3097622176527577e-06, "loss": 0.84632063, "num_input_tokens_seen": 223505105, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.1048584, "step": 10375, "time_per_iteration": 3.997107982635498 }, { "auxiliary_loss_clip": 0.0642317, "auxiliary_loss_mlp": 0.01263651, "balance_loss_clip": 0.06277761, "balance_loss_mlp": 0.01253995, "epoch": 0.6238388696828499, "flos": 35599054844160.0, "grad_norm": 1.3629463670207154, "language_loss": 0.70063376, "learning_rate": 1.3093966979851566e-06, "loss": 0.77750206, "num_input_tokens_seen": 223528065, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.09667969, "step": 10376, "time_per_iteration": 4.186543941497803 }, { "auxiliary_loss_clip": 0.064283, "auxiliary_loss_mlp": 0.01267149, "balance_loss_clip": 0.06275938, "balance_loss_mlp": 0.01255568, "epoch": 0.6238989929355178, "flos": 23630343091200.0, "grad_norm": 1.7002563782079636, "language_loss": 0.7706368, "learning_rate": 1.309031204505301e-06, "loss": 0.84759128, "num_input_tokens_seen": 223547305, "router_z_loss_clip": 1.52148438, "router_z_loss_mlp": 0.11578369, "step": 10377, "time_per_iteration": 2.5972845554351807 }, { "auxiliary_loss_clip": 0.0642692, "auxiliary_loss_mlp": 0.01266922, "balance_loss_clip": 0.0627934, "balance_loss_mlp": 0.01256849, "epoch": 0.6239591161881858, "flos": 22093433902080.0, "grad_norm": 2.0237851573976005, "language_loss": 0.68903041, "learning_rate": 1.308665737227052e-06, "loss": 0.76596892, "num_input_tokens_seen": 223567205, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.10070801, "step": 10378, "time_per_iteration": 2.567626476287842 }, { "auxiliary_loss_clip": 0.06427509, "auxiliary_loss_mlp": 0.01266527, "balance_loss_clip": 0.06278602, "balance_loss_mlp": 0.01255608, "epoch": 0.6240192394408538, "flos": 24542959835520.0, "grad_norm": 4.0543180754579105, "language_loss": 0.7681092, "learning_rate": 1.3083002961642675e-06, "loss": 0.84504962, "num_input_tokens_seen": 223586560, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.10925293, "step": 10379, "time_per_iteration": 2.582502841949463 }, { "auxiliary_loss_clip": 0.06423064, "auxiliary_loss_mlp": 0.012669, "balance_loss_clip": 0.06273608, "balance_loss_mlp": 0.01255831, "epoch": 0.6240793626935217, "flos": 27940000538880.0, "grad_norm": 1.4859801362297198, "language_loss": 0.79486442, "learning_rate": 1.3079348813308051e-06, "loss": 0.87176412, "num_input_tokens_seen": 223610595, "router_z_loss_clip": 1.49414062, "router_z_loss_mlp": 0.11077881, "step": 10380, "time_per_iteration": 2.606353759765625 }, { "auxiliary_loss_clip": 0.06421687, "auxiliary_loss_mlp": 0.01264353, "balance_loss_clip": 0.06277686, "balance_loss_mlp": 0.01253994, "epoch": 0.6241394859461897, "flos": 22899008655360.0, "grad_norm": 1.5536912441318456, "language_loss": 0.79972327, "learning_rate": 1.3075694927405207e-06, "loss": 0.8765837, "num_input_tokens_seen": 223630230, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10357666, "step": 10381, "time_per_iteration": 2.5601093769073486 }, { "auxiliary_loss_clip": 0.06429083, "auxiliary_loss_mlp": 0.01264506, "balance_loss_clip": 0.06278782, "balance_loss_mlp": 0.0125367, "epoch": 0.6241996091988576, "flos": 12755781953280.0, "grad_norm": 2.1719727913606546, "language_loss": 0.74562836, "learning_rate": 1.3072041304072718e-06, "loss": 0.82256424, "num_input_tokens_seen": 223648360, "router_z_loss_clip": 1.50292969, "router_z_loss_mlp": 0.10839844, "step": 10382, "time_per_iteration": 2.5464353561401367 }, { "auxiliary_loss_clip": 0.06425563, "auxiliary_loss_mlp": 0.0126852, "balance_loss_clip": 0.06280132, "balance_loss_mlp": 0.01258852, "epoch": 0.6242597324515257, "flos": 25858867080960.0, "grad_norm": 1.4892438823815943, "language_loss": 0.78880799, "learning_rate": 1.306838794344911e-06, "loss": 0.86574882, "num_input_tokens_seen": 223671255, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.09674072, "step": 10383, "time_per_iteration": 2.601299285888672 }, { "auxiliary_loss_clip": 0.06427158, "auxiliary_loss_mlp": 0.01264231, "balance_loss_clip": 0.062784, "balance_loss_mlp": 0.01253955, "epoch": 0.6243198557041936, "flos": 19943804131200.0, "grad_norm": 1.7111052971050995, "language_loss": 0.75763357, "learning_rate": 1.3064734845672925e-06, "loss": 0.83454752, "num_input_tokens_seen": 223689860, "router_z_loss_clip": 1.48632812, "router_z_loss_mlp": 0.10272217, "step": 10384, "time_per_iteration": 2.5519769191741943 }, { "auxiliary_loss_clip": 0.06428579, "auxiliary_loss_mlp": 0.01265394, "balance_loss_clip": 0.06278617, "balance_loss_mlp": 0.0125426, "epoch": 0.6243799789568616, "flos": 18412177749120.0, "grad_norm": 1.9520204388523172, "language_loss": 0.66775131, "learning_rate": 1.3061082010882694e-06, "loss": 0.74469101, "num_input_tokens_seen": 223707835, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.11120605, "step": 10385, "time_per_iteration": 2.553877353668213 }, { "auxiliary_loss_clip": 0.06324748, "auxiliary_loss_mlp": 0.01252958, "balance_loss_clip": 0.06264512, "balance_loss_mlp": 0.01251468, "epoch": 0.6244401022095295, "flos": 66048887128320.0, "grad_norm": 0.7484742294038443, "language_loss": 0.6182037, "learning_rate": 1.305742943921692e-06, "loss": 0.69398081, "num_input_tokens_seen": 223771875, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.01489258, "step": 10386, "time_per_iteration": 3.2036333084106445 }, { "auxiliary_loss_clip": 0.06425181, "auxiliary_loss_mlp": 0.01267096, "balance_loss_clip": 0.0627597, "balance_loss_mlp": 0.01256159, "epoch": 0.6245002254621975, "flos": 24578109423360.0, "grad_norm": 2.5279507130261085, "language_loss": 0.71905643, "learning_rate": 1.3053777130814128e-06, "loss": 0.7959792, "num_input_tokens_seen": 223788895, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.10943604, "step": 10387, "time_per_iteration": 2.6213252544403076 }, { "auxiliary_loss_clip": 0.06436393, "auxiliary_loss_mlp": 0.01267658, "balance_loss_clip": 0.06279534, "balance_loss_mlp": 0.01255046, "epoch": 0.6245603487148654, "flos": 29176510440960.0, "grad_norm": 2.57835288408977, "language_loss": 0.66020495, "learning_rate": 1.3050125085812798e-06, "loss": 0.73724544, "num_input_tokens_seen": 223810385, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.12609863, "step": 10388, "time_per_iteration": 2.6384003162384033 }, { "auxiliary_loss_clip": 0.06427279, "auxiliary_loss_mlp": 0.01266413, "balance_loss_clip": 0.06279305, "balance_loss_mlp": 0.01256209, "epoch": 0.6246204719675335, "flos": 14794805934720.0, "grad_norm": 1.6221011037101303, "language_loss": 0.79802918, "learning_rate": 1.3046473304351417e-06, "loss": 0.87496608, "num_input_tokens_seen": 223826040, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.10192871, "step": 10389, "time_per_iteration": 2.5489346981048584 }, { "auxiliary_loss_clip": 0.06425634, "auxiliary_loss_mlp": 0.01266282, "balance_loss_clip": 0.06277882, "balance_loss_mlp": 0.01256232, "epoch": 0.6246805952202014, "flos": 12498204902400.0, "grad_norm": 1.831859068313078, "language_loss": 0.61004329, "learning_rate": 1.3042821786568475e-06, "loss": 0.68696243, "num_input_tokens_seen": 223842300, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.10040283, "step": 10390, "time_per_iteration": 2.559725522994995 }, { "auxiliary_loss_clip": 0.06436899, "auxiliary_loss_mlp": 0.01268748, "balance_loss_clip": 0.06283215, "balance_loss_mlp": 0.01257429, "epoch": 0.6247407184728694, "flos": 12791602373760.0, "grad_norm": 1.8263196726155608, "language_loss": 0.77002478, "learning_rate": 1.3039170532602416e-06, "loss": 0.84708118, "num_input_tokens_seen": 223858320, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.11328125, "step": 10391, "time_per_iteration": 2.578563690185547 }, { "auxiliary_loss_clip": 0.06433205, "auxiliary_loss_mlp": 0.0126655, "balance_loss_clip": 0.06282798, "balance_loss_mlp": 0.01255201, "epoch": 0.6248008417255374, "flos": 40639417822080.0, "grad_norm": 1.4126915685356398, "language_loss": 0.64361906, "learning_rate": 1.3035519542591718e-06, "loss": 0.72061658, "num_input_tokens_seen": 223883545, "router_z_loss_clip": 1.50292969, "router_z_loss_mlp": 0.11346436, "step": 10392, "time_per_iteration": 2.753157138824463 }, { "auxiliary_loss_clip": 0.06431325, "auxiliary_loss_mlp": 0.01268132, "balance_loss_clip": 0.06280611, "balance_loss_mlp": 0.01257027, "epoch": 0.6248609649782053, "flos": 19908235272960.0, "grad_norm": 1.7235557464504383, "language_loss": 0.77109224, "learning_rate": 1.3031868816674819e-06, "loss": 0.84808677, "num_input_tokens_seen": 223901445, "router_z_loss_clip": 1.50683594, "router_z_loss_mlp": 0.11102295, "step": 10393, "time_per_iteration": 2.561422824859619 }, { "auxiliary_loss_clip": 0.0643255, "auxiliary_loss_mlp": 0.0126787, "balance_loss_clip": 0.06279199, "balance_loss_mlp": 0.01254864, "epoch": 0.6249210882308733, "flos": 19688868483840.0, "grad_norm": 1.7130151841009753, "language_loss": 0.8270576, "learning_rate": 1.3028218354990142e-06, "loss": 0.90406179, "num_input_tokens_seen": 223920170, "router_z_loss_clip": 1.53320312, "router_z_loss_mlp": 0.13006592, "step": 10394, "time_per_iteration": 2.551327705383301 }, { "auxiliary_loss_clip": 0.06433808, "auxiliary_loss_mlp": 0.01266066, "balance_loss_clip": 0.06280985, "balance_loss_mlp": 0.01254753, "epoch": 0.6249812114835412, "flos": 13995855653760.0, "grad_norm": 1.6674376807691937, "language_loss": 0.7534132, "learning_rate": 1.3024568157676128e-06, "loss": 0.83041191, "num_input_tokens_seen": 223936495, "router_z_loss_clip": 1.52929688, "router_z_loss_mlp": 0.11303711, "step": 10395, "time_per_iteration": 2.5290720462799072 }, { "auxiliary_loss_clip": 0.06435226, "auxiliary_loss_mlp": 0.01270782, "balance_loss_clip": 0.06281127, "balance_loss_mlp": 0.01259588, "epoch": 0.6250413347362093, "flos": 14533916647680.0, "grad_norm": 2.125759456354683, "language_loss": 0.73144156, "learning_rate": 1.302091822487119e-06, "loss": 0.8085016, "num_input_tokens_seen": 223950070, "router_z_loss_clip": 1.54003906, "router_z_loss_mlp": 0.11187744, "step": 10396, "time_per_iteration": 2.512152671813965 }, { "auxiliary_loss_clip": 0.06428079, "auxiliary_loss_mlp": 0.01267942, "balance_loss_clip": 0.06278889, "balance_loss_mlp": 0.01257762, "epoch": 0.6251014579888772, "flos": 22969098195840.0, "grad_norm": 1.8201058004919786, "language_loss": 0.76008457, "learning_rate": 1.3017268556713732e-06, "loss": 0.83704484, "num_input_tokens_seen": 223970065, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.10186768, "step": 10397, "time_per_iteration": 2.562412977218628 }, { "auxiliary_loss_clip": 0.06433786, "auxiliary_loss_mlp": 0.01269118, "balance_loss_clip": 0.06283275, "balance_loss_mlp": 0.01258389, "epoch": 0.6251615812415452, "flos": 28118809152000.0, "grad_norm": 1.7623409690674756, "language_loss": 0.75458258, "learning_rate": 1.3013619153342154e-06, "loss": 0.83161163, "num_input_tokens_seen": 223990315, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.10723877, "step": 10398, "time_per_iteration": 2.60099196434021 }, { "auxiliary_loss_clip": 0.06435571, "auxiliary_loss_mlp": 0.01270518, "balance_loss_clip": 0.06282653, "balance_loss_mlp": 0.01258931, "epoch": 0.6252217044942131, "flos": 26731764190080.0, "grad_norm": 1.6112557882419494, "language_loss": 0.74147564, "learning_rate": 1.300997001489483e-06, "loss": 0.81853652, "num_input_tokens_seen": 224009960, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.1159668, "step": 10399, "time_per_iteration": 4.042568683624268 }, { "auxiliary_loss_clip": 0.064265, "auxiliary_loss_mlp": 0.01267305, "balance_loss_clip": 0.06276542, "balance_loss_mlp": 0.01256511, "epoch": 0.6252818277468811, "flos": 20012216590080.0, "grad_norm": 1.7810144176138345, "language_loss": 0.74714518, "learning_rate": 1.3006321141510147e-06, "loss": 0.82408321, "num_input_tokens_seen": 224028870, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.10791016, "step": 10400, "time_per_iteration": 2.5490148067474365 }, { "auxiliary_loss_clip": 0.06319868, "auxiliary_loss_mlp": 0.01254042, "balance_loss_clip": 0.06260061, "balance_loss_mlp": 0.01252477, "epoch": 0.625341950999549, "flos": 59298550352640.0, "grad_norm": 0.8485181771823026, "language_loss": 0.56426752, "learning_rate": 1.3002672533326465e-06, "loss": 0.64000654, "num_input_tokens_seen": 224094140, "router_z_loss_clip": 0.59960938, "router_z_loss_mlp": 0.01564026, "step": 10401, "time_per_iteration": 3.2319412231445312 }, { "auxiliary_loss_clip": 0.0642664, "auxiliary_loss_mlp": 0.01263572, "balance_loss_clip": 0.06275257, "balance_loss_mlp": 0.01252343, "epoch": 0.625402074252217, "flos": 20163296701440.0, "grad_norm": 2.526249956732798, "language_loss": 0.83355784, "learning_rate": 1.2999024190482146e-06, "loss": 0.91045994, "num_input_tokens_seen": 224113235, "router_z_loss_clip": 1.51367188, "router_z_loss_mlp": 0.11224365, "step": 10402, "time_per_iteration": 2.5463051795959473 }, { "auxiliary_loss_clip": 0.06421858, "auxiliary_loss_mlp": 0.01265552, "balance_loss_clip": 0.06274545, "balance_loss_mlp": 0.01254948, "epoch": 0.625462197504885, "flos": 29140228823040.0, "grad_norm": 1.8956413891479285, "language_loss": 0.68972111, "learning_rate": 1.2995376113115527e-06, "loss": 0.76659524, "num_input_tokens_seen": 224134530, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.1060791, "step": 10403, "time_per_iteration": 4.067505359649658 }, { "auxiliary_loss_clip": 0.06431008, "auxiliary_loss_mlp": 0.01266824, "balance_loss_clip": 0.06281056, "balance_loss_mlp": 0.0125476, "epoch": 0.625522320757553, "flos": 26111664449280.0, "grad_norm": 1.683159165417119, "language_loss": 0.72387308, "learning_rate": 1.2991728301364954e-06, "loss": 0.8008514, "num_input_tokens_seen": 224154170, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.1206665, "step": 10404, "time_per_iteration": 2.637807846069336 }, { "auxiliary_loss_clip": 0.06429939, "auxiliary_loss_mlp": 0.01266662, "balance_loss_clip": 0.06280325, "balance_loss_mlp": 0.01255153, "epoch": 0.625582444010221, "flos": 20637179867520.0, "grad_norm": 2.324657583833533, "language_loss": 0.70128584, "learning_rate": 1.2988080755368742e-06, "loss": 0.77825189, "num_input_tokens_seen": 224172730, "router_z_loss_clip": 1.49414062, "router_z_loss_mlp": 0.1151123, "step": 10405, "time_per_iteration": 2.5906426906585693 }, { "auxiliary_loss_clip": 0.06424232, "auxiliary_loss_mlp": 0.01266127, "balance_loss_clip": 0.06277164, "balance_loss_mlp": 0.01255631, "epoch": 0.6256425672628889, "flos": 20527706108160.0, "grad_norm": 1.4744679741792601, "language_loss": 0.79056698, "learning_rate": 1.2984433475265207e-06, "loss": 0.8674705, "num_input_tokens_seen": 224192620, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.1048584, "step": 10406, "time_per_iteration": 2.585817337036133 }, { "auxiliary_loss_clip": 0.06430017, "auxiliary_loss_mlp": 0.01265364, "balance_loss_clip": 0.06280338, "balance_loss_mlp": 0.01254838, "epoch": 0.6257026905155569, "flos": 29536182092160.0, "grad_norm": 1.7494588552737989, "language_loss": 0.68955541, "learning_rate": 1.2980786461192666e-06, "loss": 0.76650918, "num_input_tokens_seen": 224214660, "router_z_loss_clip": 1.49804688, "router_z_loss_mlp": 0.10522461, "step": 10407, "time_per_iteration": 2.625542163848877 }, { "auxiliary_loss_clip": 0.0641212, "auxiliary_loss_mlp": 0.01266965, "balance_loss_clip": 0.06270653, "balance_loss_mlp": 0.01257082, "epoch": 0.6257628137682248, "flos": 24031788802560.0, "grad_norm": 1.6001804247501805, "language_loss": 0.8516705, "learning_rate": 1.2977139713289398e-06, "loss": 0.92846131, "num_input_tokens_seen": 224234170, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09875488, "step": 10408, "time_per_iteration": 2.5768744945526123 }, { "auxiliary_loss_clip": 0.06425656, "auxiliary_loss_mlp": 0.01266164, "balance_loss_clip": 0.06279466, "balance_loss_mlp": 0.01256263, "epoch": 0.6258229370208929, "flos": 20857385197440.0, "grad_norm": 1.6068808135972152, "language_loss": 0.79583746, "learning_rate": 1.2973493231693699e-06, "loss": 0.87275565, "num_input_tokens_seen": 224253115, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.09899902, "step": 10409, "time_per_iteration": 2.558680772781372 }, { "auxiliary_loss_clip": 0.06418818, "auxiliary_loss_mlp": 0.01264977, "balance_loss_clip": 0.06271332, "balance_loss_mlp": 0.01254355, "epoch": 0.6258830602735608, "flos": 22237218708480.0, "grad_norm": 2.0607152364772636, "language_loss": 0.69690967, "learning_rate": 1.2969847016543845e-06, "loss": 0.77374762, "num_input_tokens_seen": 224271375, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.10620117, "step": 10410, "time_per_iteration": 2.5620481967926025 }, { "auxiliary_loss_clip": 0.06419452, "auxiliary_loss_mlp": 0.01263506, "balance_loss_clip": 0.06275609, "balance_loss_mlp": 0.01254255, "epoch": 0.6259431835262288, "flos": 25082949473280.0, "grad_norm": 1.7789058773617021, "language_loss": 0.68253559, "learning_rate": 1.2966201067978086e-06, "loss": 0.75936514, "num_input_tokens_seen": 224290315, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.09246826, "step": 10411, "time_per_iteration": 2.5750861167907715 }, { "auxiliary_loss_clip": 0.06425527, "auxiliary_loss_mlp": 0.01270541, "balance_loss_clip": 0.06276685, "balance_loss_mlp": 0.01259473, "epoch": 0.6260033067788967, "flos": 28259072087040.0, "grad_norm": 3.810778453903971, "language_loss": 0.69823366, "learning_rate": 1.2962555386134702e-06, "loss": 0.77519441, "num_input_tokens_seen": 224310545, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.11071777, "step": 10412, "time_per_iteration": 2.6617624759674072 }, { "auxiliary_loss_clip": 0.06420852, "auxiliary_loss_mlp": 0.01266186, "balance_loss_clip": 0.06274286, "balance_loss_mlp": 0.01256261, "epoch": 0.6260634300315647, "flos": 23374107705600.0, "grad_norm": 1.3948832655396965, "language_loss": 0.69604647, "learning_rate": 1.2958909971151908e-06, "loss": 0.77291685, "num_input_tokens_seen": 224331115, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.0993042, "step": 10413, "time_per_iteration": 2.575948476791382 }, { "auxiliary_loss_clip": 0.06427267, "auxiliary_loss_mlp": 0.01266416, "balance_loss_clip": 0.06272262, "balance_loss_mlp": 0.01253988, "epoch": 0.6261235532842326, "flos": 18040221475200.0, "grad_norm": 2.891729327402108, "language_loss": 0.8101455, "learning_rate": 1.295526482316796e-06, "loss": 0.88708234, "num_input_tokens_seen": 224347525, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.12420654, "step": 10414, "time_per_iteration": 3.9537882804870605 }, { "auxiliary_loss_clip": 0.06420345, "auxiliary_loss_mlp": 0.01269537, "balance_loss_clip": 0.06274006, "balance_loss_mlp": 0.01258462, "epoch": 0.6261836765369007, "flos": 22016677962240.0, "grad_norm": 1.6087897329739291, "language_loss": 0.74606586, "learning_rate": 1.2951619942321083e-06, "loss": 0.82296461, "num_input_tokens_seen": 224367045, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.11083984, "step": 10415, "time_per_iteration": 2.5820086002349854 }, { "auxiliary_loss_clip": 0.06419306, "auxiliary_loss_mlp": 0.01267237, "balance_loss_clip": 0.06274651, "balance_loss_mlp": 0.01256478, "epoch": 0.6262437997895686, "flos": 24942896173440.0, "grad_norm": 1.5214488240048676, "language_loss": 0.74996692, "learning_rate": 1.2947975328749472e-06, "loss": 0.82683229, "num_input_tokens_seen": 224388860, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.10754395, "step": 10416, "time_per_iteration": 4.060314416885376 }, { "auxiliary_loss_clip": 0.0641463, "auxiliary_loss_mlp": 0.01264796, "balance_loss_clip": 0.06272075, "balance_loss_mlp": 0.01255104, "epoch": 0.6263039230422366, "flos": 31615680395520.0, "grad_norm": 1.8402063621224811, "language_loss": 0.84402013, "learning_rate": 1.2944330982591352e-06, "loss": 0.92081434, "num_input_tokens_seen": 224409645, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.09692383, "step": 10417, "time_per_iteration": 2.6472952365875244 }, { "auxiliary_loss_clip": 0.0642226, "auxiliary_loss_mlp": 0.01265653, "balance_loss_clip": 0.06273956, "balance_loss_mlp": 0.01254853, "epoch": 0.6263640462949046, "flos": 17645232528000.0, "grad_norm": 2.82830093892021, "language_loss": 0.57420176, "learning_rate": 1.2940686903984904e-06, "loss": 0.65108091, "num_input_tokens_seen": 224428530, "router_z_loss_clip": 1.48339844, "router_z_loss_mlp": 0.10797119, "step": 10418, "time_per_iteration": 2.526890993118286 }, { "auxiliary_loss_clip": 0.06433708, "auxiliary_loss_mlp": 0.01269495, "balance_loss_clip": 0.06280184, "balance_loss_mlp": 0.01257831, "epoch": 0.6264241695475725, "flos": 19981175852160.0, "grad_norm": 1.9614699351152334, "language_loss": 0.85084772, "learning_rate": 1.2937043093068316e-06, "loss": 0.92787969, "num_input_tokens_seen": 224447175, "router_z_loss_clip": 1.53417969, "router_z_loss_mlp": 0.11669922, "step": 10419, "time_per_iteration": 2.559544801712036 }, { "auxiliary_loss_clip": 0.06425085, "auxiliary_loss_mlp": 0.01271291, "balance_loss_clip": 0.06275731, "balance_loss_mlp": 0.01260735, "epoch": 0.6264842928002405, "flos": 27351654295680.0, "grad_norm": 1.4089068134491736, "language_loss": 0.64757681, "learning_rate": 1.2933399549979762e-06, "loss": 0.72454059, "num_input_tokens_seen": 224469445, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.10546875, "step": 10420, "time_per_iteration": 2.608151435852051 }, { "auxiliary_loss_clip": 0.06421872, "auxiliary_loss_mlp": 0.01267479, "balance_loss_clip": 0.06270361, "balance_loss_mlp": 0.0125585, "epoch": 0.6265444160529084, "flos": 23002989972480.0, "grad_norm": 1.8700302260700576, "language_loss": 0.86515981, "learning_rate": 1.292975627485741e-06, "loss": 0.94205332, "num_input_tokens_seen": 224486590, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.11621094, "step": 10421, "time_per_iteration": 2.716244697570801 }, { "auxiliary_loss_clip": 0.06421015, "auxiliary_loss_mlp": 0.01263459, "balance_loss_clip": 0.06274317, "balance_loss_mlp": 0.0125313, "epoch": 0.6266045393055765, "flos": 19944516890880.0, "grad_norm": 2.273196265110466, "language_loss": 0.79903567, "learning_rate": 1.2926113267839403e-06, "loss": 0.87588042, "num_input_tokens_seen": 224502795, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.10327148, "step": 10422, "time_per_iteration": 2.670469284057617 }, { "auxiliary_loss_clip": 0.06421222, "auxiliary_loss_mlp": 0.01263916, "balance_loss_clip": 0.06273638, "balance_loss_mlp": 0.01253348, "epoch": 0.6266646625582444, "flos": 24395946647040.0, "grad_norm": 1.653403164042018, "language_loss": 0.74584103, "learning_rate": 1.292247052906389e-06, "loss": 0.82269239, "num_input_tokens_seen": 224522300, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.10571289, "step": 10423, "time_per_iteration": 2.6505424976348877 }, { "auxiliary_loss_clip": 0.0642114, "auxiliary_loss_mlp": 0.01266003, "balance_loss_clip": 0.06274629, "balance_loss_mlp": 0.01256276, "epoch": 0.6267247858109124, "flos": 14689021754880.0, "grad_norm": 2.3123451926652066, "language_loss": 0.78147328, "learning_rate": 1.2918828058669004e-06, "loss": 0.85834467, "num_input_tokens_seen": 224538260, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.09735107, "step": 10424, "time_per_iteration": 2.539147138595581 }, { "auxiliary_loss_clip": 0.06425206, "auxiliary_loss_mlp": 0.01266876, "balance_loss_clip": 0.06278085, "balance_loss_mlp": 0.01255975, "epoch": 0.6267849090635803, "flos": 24935852430720.0, "grad_norm": 1.7367852169875897, "language_loss": 0.69493169, "learning_rate": 1.2915185856792868e-06, "loss": 0.77185255, "num_input_tokens_seen": 224559155, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.10906982, "step": 10425, "time_per_iteration": 2.605907678604126 }, { "auxiliary_loss_clip": 0.06414869, "auxiliary_loss_mlp": 0.01265009, "balance_loss_clip": 0.06273816, "balance_loss_mlp": 0.01255466, "epoch": 0.6268450323162483, "flos": 25344886936320.0, "grad_norm": 1.590618968190223, "language_loss": 0.75010192, "learning_rate": 1.2911543923573598e-06, "loss": 0.82690072, "num_input_tokens_seen": 224578660, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09552002, "step": 10426, "time_per_iteration": 2.5936384201049805 }, { "auxiliary_loss_clip": 0.06421129, "auxiliary_loss_mlp": 0.01266946, "balance_loss_clip": 0.06272877, "balance_loss_mlp": 0.01256992, "epoch": 0.6269051555689162, "flos": 26184521174400.0, "grad_norm": 1.342278432027636, "language_loss": 0.80641967, "learning_rate": 1.290790225914929e-06, "loss": 0.88330042, "num_input_tokens_seen": 224599080, "router_z_loss_clip": 1.48339844, "router_z_loss_mlp": 0.0994873, "step": 10427, "time_per_iteration": 2.5979394912719727 }, { "auxiliary_loss_clip": 0.06426127, "auxiliary_loss_mlp": 0.01269078, "balance_loss_clip": 0.06276797, "balance_loss_mlp": 0.01258981, "epoch": 0.6269652788215843, "flos": 18262271594880.0, "grad_norm": 1.740248474503082, "language_loss": 0.68515456, "learning_rate": 1.2904260863658034e-06, "loss": 0.76210666, "num_input_tokens_seen": 224614225, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.10101318, "step": 10428, "time_per_iteration": 2.527590274810791 }, { "auxiliary_loss_clip": 0.0642414, "auxiliary_loss_mlp": 0.01266366, "balance_loss_clip": 0.06277055, "balance_loss_mlp": 0.01256573, "epoch": 0.6270254020742522, "flos": 11770224629760.0, "grad_norm": 1.8430988282045913, "language_loss": 0.72305667, "learning_rate": 1.2900619737237928e-06, "loss": 0.79996175, "num_input_tokens_seen": 224632365, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.09796143, "step": 10429, "time_per_iteration": 2.548534393310547 }, { "auxiliary_loss_clip": 0.06424597, "auxiliary_loss_mlp": 0.01266355, "balance_loss_clip": 0.06274135, "balance_loss_mlp": 0.01254869, "epoch": 0.6270855253269202, "flos": 23482114018560.0, "grad_norm": 1.6931134015868194, "language_loss": 0.7988503, "learning_rate": 1.2896978880027023e-06, "loss": 0.87575984, "num_input_tokens_seen": 224651125, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.11499023, "step": 10430, "time_per_iteration": 2.623540163040161 }, { "auxiliary_loss_clip": 0.0632329, "auxiliary_loss_mlp": 0.01250207, "balance_loss_clip": 0.06262261, "balance_loss_mlp": 0.01248662, "epoch": 0.6271456485795882, "flos": 70084322490240.0, "grad_norm": 0.7612740815901994, "language_loss": 0.59197652, "learning_rate": 1.2893338292163393e-06, "loss": 0.6677115, "num_input_tokens_seen": 224716115, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.01542664, "step": 10431, "time_per_iteration": 3.2947962284088135 }, { "auxiliary_loss_clip": 0.06318089, "auxiliary_loss_mlp": 0.01250636, "balance_loss_clip": 0.06257233, "balance_loss_mlp": 0.01249169, "epoch": 0.6272057718322561, "flos": 65178673349760.0, "grad_norm": 0.8490850740322519, "language_loss": 0.63717049, "learning_rate": 1.2889697973785095e-06, "loss": 0.71285772, "num_input_tokens_seen": 224782930, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 0.01464844, "step": 10432, "time_per_iteration": 3.27559232711792 }, { "auxiliary_loss_clip": 0.06420936, "auxiliary_loss_mlp": 0.01266376, "balance_loss_clip": 0.06275415, "balance_loss_mlp": 0.01256935, "epoch": 0.6272658950849241, "flos": 24396240136320.0, "grad_norm": 1.867001135596907, "language_loss": 0.64864147, "learning_rate": 1.2886057925030153e-06, "loss": 0.72551465, "num_input_tokens_seen": 224802010, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.09442139, "step": 10433, "time_per_iteration": 2.581042528152466 }, { "auxiliary_loss_clip": 0.06427221, "auxiliary_loss_mlp": 0.01265885, "balance_loss_clip": 0.06275406, "balance_loss_mlp": 0.012549, "epoch": 0.627326018337592, "flos": 17971515527040.0, "grad_norm": 2.5474378327641714, "language_loss": 0.62259912, "learning_rate": 1.2882418146036612e-06, "loss": 0.69953024, "num_input_tokens_seen": 224818875, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.10980225, "step": 10434, "time_per_iteration": 2.5694541931152344 }, { "auxiliary_loss_clip": 0.064248, "auxiliary_loss_mlp": 0.01265054, "balance_loss_clip": 0.06276876, "balance_loss_mlp": 0.01254587, "epoch": 0.6273861415902601, "flos": 20236321134720.0, "grad_norm": 1.5070252607613581, "language_loss": 0.84676433, "learning_rate": 1.2878778636942484e-06, "loss": 0.92366284, "num_input_tokens_seen": 224837790, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.10467529, "step": 10435, "time_per_iteration": 2.5473217964172363 }, { "auxiliary_loss_clip": 0.06315236, "auxiliary_loss_mlp": 0.01252187, "balance_loss_clip": 0.06254841, "balance_loss_mlp": 0.01250624, "epoch": 0.627446264842928, "flos": 64971605911680.0, "grad_norm": 0.7154412192849301, "language_loss": 0.61297297, "learning_rate": 1.2875139397885786e-06, "loss": 0.68864721, "num_input_tokens_seen": 224899685, "router_z_loss_clip": 0.60449219, "router_z_loss_mlp": 0.01561737, "step": 10436, "time_per_iteration": 3.273263454437256 }, { "auxiliary_loss_clip": 0.06425551, "auxiliary_loss_mlp": 0.01270169, "balance_loss_clip": 0.06276806, "balance_loss_mlp": 0.01259583, "epoch": 0.627506388095596, "flos": 23590623456000.0, "grad_norm": 1.51389373196119, "language_loss": 0.77818334, "learning_rate": 1.2871500429004523e-06, "loss": 0.85514057, "num_input_tokens_seen": 224918650, "router_z_loss_clip": 1.48632812, "router_z_loss_mlp": 0.10583496, "step": 10437, "time_per_iteration": 2.5663838386535645 }, { "auxiliary_loss_clip": 0.06315102, "auxiliary_loss_mlp": 0.01251095, "balance_loss_clip": 0.06254756, "balance_loss_mlp": 0.01249621, "epoch": 0.6275665113482639, "flos": 67603043059200.0, "grad_norm": 0.7124297039944122, "language_loss": 0.54166961, "learning_rate": 1.2867861730436667e-06, "loss": 0.61733162, "num_input_tokens_seen": 224981575, "router_z_loss_clip": 0.60205078, "router_z_loss_mlp": 0.01472473, "step": 10438, "time_per_iteration": 3.133115768432617 }, { "auxiliary_loss_clip": 0.06421391, "auxiliary_loss_mlp": 0.01266655, "balance_loss_clip": 0.06275195, "balance_loss_mlp": 0.01255897, "epoch": 0.6276266346009319, "flos": 27644422861440.0, "grad_norm": 2.078821711899309, "language_loss": 0.84134787, "learning_rate": 1.2864223302320214e-06, "loss": 0.91822839, "num_input_tokens_seen": 225000820, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.10760498, "step": 10439, "time_per_iteration": 4.016111373901367 }, { "auxiliary_loss_clip": 0.06424974, "auxiliary_loss_mlp": 0.01266053, "balance_loss_clip": 0.06275164, "balance_loss_mlp": 0.0125542, "epoch": 0.6276867578535998, "flos": 22752540518400.0, "grad_norm": 2.4142042941220456, "language_loss": 0.79896176, "learning_rate": 1.2860585144793128e-06, "loss": 0.87587202, "num_input_tokens_seen": 225017585, "router_z_loss_clip": 1.49902344, "router_z_loss_mlp": 0.10638428, "step": 10440, "time_per_iteration": 2.539813756942749 }, { "auxiliary_loss_clip": 0.06416254, "auxiliary_loss_mlp": 0.01264939, "balance_loss_clip": 0.06276703, "balance_loss_mlp": 0.0125579, "epoch": 0.6277468811062679, "flos": 24651050002560.0, "grad_norm": 1.3709876093833844, "language_loss": 0.74355578, "learning_rate": 1.285694725799337e-06, "loss": 0.82036769, "num_input_tokens_seen": 225039085, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09155273, "step": 10441, "time_per_iteration": 2.59472393989563 }, { "auxiliary_loss_clip": 0.06420164, "auxiliary_loss_mlp": 0.01268269, "balance_loss_clip": 0.06275319, "balance_loss_mlp": 0.01257773, "epoch": 0.6278070043589358, "flos": 19684466144640.0, "grad_norm": 1.8033644009480445, "language_loss": 0.72179943, "learning_rate": 1.2853309642058884e-06, "loss": 0.79868376, "num_input_tokens_seen": 225058105, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10498047, "step": 10442, "time_per_iteration": 2.5324928760528564 }, { "auxiliary_loss_clip": 0.06424785, "auxiliary_loss_mlp": 0.01265288, "balance_loss_clip": 0.0627683, "balance_loss_mlp": 0.01255405, "epoch": 0.6278671276116038, "flos": 22127451459840.0, "grad_norm": 1.3632935381775808, "language_loss": 0.71410751, "learning_rate": 1.284967229712762e-06, "loss": 0.79100823, "num_input_tokens_seen": 225077605, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.09881592, "step": 10443, "time_per_iteration": 4.09528923034668 }, { "auxiliary_loss_clip": 0.06424204, "auxiliary_loss_mlp": 0.01265808, "balance_loss_clip": 0.06277864, "balance_loss_mlp": 0.01255681, "epoch": 0.6279272508642717, "flos": 23045099448960.0, "grad_norm": 1.8079550206778772, "language_loss": 0.73437095, "learning_rate": 1.2846035223337492e-06, "loss": 0.81127107, "num_input_tokens_seen": 225097775, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.10131836, "step": 10444, "time_per_iteration": 2.57705020904541 }, { "auxiliary_loss_clip": 0.06422625, "auxiliary_loss_mlp": 0.01270027, "balance_loss_clip": 0.06277591, "balance_loss_mlp": 0.01259947, "epoch": 0.6279873741169397, "flos": 19829466835200.0, "grad_norm": 2.1785162975353916, "language_loss": 0.72550595, "learning_rate": 1.2842398420826423e-06, "loss": 0.80243254, "num_input_tokens_seen": 225115585, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.10083008, "step": 10445, "time_per_iteration": 2.5254409313201904 }, { "auxiliary_loss_clip": 0.0642146, "auxiliary_loss_mlp": 0.0126581, "balance_loss_clip": 0.06274036, "balance_loss_mlp": 0.01255201, "epoch": 0.6280474973696077, "flos": 23922273116160.0, "grad_norm": 1.8079139231103214, "language_loss": 0.69642675, "learning_rate": 1.2838761889732331e-06, "loss": 0.77329946, "num_input_tokens_seen": 225135575, "router_z_loss_clip": 1.47363281, "router_z_loss_mlp": 0.10601807, "step": 10446, "time_per_iteration": 2.582103967666626 }, { "auxiliary_loss_clip": 0.06431121, "auxiliary_loss_mlp": 0.01265443, "balance_loss_clip": 0.06278129, "balance_loss_mlp": 0.01254809, "epoch": 0.6281076206222757, "flos": 17973821514240.0, "grad_norm": 1.8521125463728458, "language_loss": 0.73815525, "learning_rate": 1.2835125630193102e-06, "loss": 0.81512094, "num_input_tokens_seen": 225154230, "router_z_loss_clip": 1.53027344, "router_z_loss_mlp": 0.10632324, "step": 10447, "time_per_iteration": 2.545459508895874 }, { "auxiliary_loss_clip": 0.06309956, "auxiliary_loss_mlp": 0.01251347, "balance_loss_clip": 0.06249883, "balance_loss_mlp": 0.01249787, "epoch": 0.6281677438749437, "flos": 66797216743680.0, "grad_norm": 0.6674680370519731, "language_loss": 0.52292907, "learning_rate": 1.2831489642346626e-06, "loss": 0.59854209, "num_input_tokens_seen": 225213650, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.01559448, "step": 10448, "time_per_iteration": 3.0598456859588623 }, { "auxiliary_loss_clip": 0.06425316, "auxiliary_loss_mlp": 0.01266699, "balance_loss_clip": 0.06277198, "balance_loss_mlp": 0.01256161, "epoch": 0.6282278671276116, "flos": 11661002432640.0, "grad_norm": 2.0579348110280584, "language_loss": 0.91886693, "learning_rate": 1.282785392633079e-06, "loss": 0.99578708, "num_input_tokens_seen": 225230135, "router_z_loss_clip": 1.47949219, "router_z_loss_mlp": 0.10546875, "step": 10449, "time_per_iteration": 2.5356497764587402 }, { "auxiliary_loss_clip": 0.06427851, "auxiliary_loss_mlp": 0.01266574, "balance_loss_clip": 0.06280935, "balance_loss_mlp": 0.01257079, "epoch": 0.6282879903802796, "flos": 42751550090880.0, "grad_norm": 1.520155663767538, "language_loss": 0.60353613, "learning_rate": 1.2824218482283438e-06, "loss": 0.68048036, "num_input_tokens_seen": 225253520, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.0949707, "step": 10450, "time_per_iteration": 2.748448133468628 }, { "auxiliary_loss_clip": 0.0641847, "auxiliary_loss_mlp": 0.01267391, "balance_loss_clip": 0.06276024, "balance_loss_mlp": 0.01256627, "epoch": 0.6283481136329475, "flos": 20015067628800.0, "grad_norm": 1.5292400080689246, "language_loss": 0.77144337, "learning_rate": 1.2820583310342452e-06, "loss": 0.84830201, "num_input_tokens_seen": 225272460, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.10760498, "step": 10451, "time_per_iteration": 2.5401339530944824 }, { "auxiliary_loss_clip": 0.0642679, "auxiliary_loss_mlp": 0.01265833, "balance_loss_clip": 0.06276982, "balance_loss_mlp": 0.012552, "epoch": 0.6284082368856155, "flos": 21910264876800.0, "grad_norm": 1.5033667536810766, "language_loss": 0.77777362, "learning_rate": 1.281694841064566e-06, "loss": 0.85469991, "num_input_tokens_seen": 225291700, "router_z_loss_clip": 1.49804688, "router_z_loss_mlp": 0.10632324, "step": 10452, "time_per_iteration": 2.5856285095214844 }, { "auxiliary_loss_clip": 0.06425653, "auxiliary_loss_mlp": 0.01269724, "balance_loss_clip": 0.06278313, "balance_loss_mlp": 0.01259008, "epoch": 0.6284683601382834, "flos": 25491313146240.0, "grad_norm": 1.670430006052943, "language_loss": 0.73147351, "learning_rate": 1.2813313783330904e-06, "loss": 0.80842733, "num_input_tokens_seen": 225311470, "router_z_loss_clip": 1.47363281, "router_z_loss_mlp": 0.10723877, "step": 10453, "time_per_iteration": 2.609966278076172 }, { "auxiliary_loss_clip": 0.06426316, "auxiliary_loss_mlp": 0.01265723, "balance_loss_clip": 0.0627666, "balance_loss_mlp": 0.01254845, "epoch": 0.6285284833909515, "flos": 16543241556480.0, "grad_norm": 1.6404411440386113, "language_loss": 0.80820638, "learning_rate": 1.2809679428536013e-06, "loss": 0.88512683, "num_input_tokens_seen": 225328385, "router_z_loss_clip": 1.49511719, "router_z_loss_mlp": 0.10876465, "step": 10454, "time_per_iteration": 4.021422624588013 }, { "auxiliary_loss_clip": 0.06422748, "auxiliary_loss_mlp": 0.01271672, "balance_loss_clip": 0.0627937, "balance_loss_mlp": 0.01261784, "epoch": 0.6285886066436194, "flos": 22827367814400.0, "grad_norm": 1.7468706022445588, "language_loss": 0.82197261, "learning_rate": 1.2806045346398792e-06, "loss": 0.89891684, "num_input_tokens_seen": 225348415, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.09893799, "step": 10455, "time_per_iteration": 4.0647993087768555 }, { "auxiliary_loss_clip": 0.06428213, "auxiliary_loss_mlp": 0.01266694, "balance_loss_clip": 0.06282134, "balance_loss_mlp": 0.01256054, "epoch": 0.6286487298962874, "flos": 24722355427200.0, "grad_norm": 1.6246552503795821, "language_loss": 0.8207165, "learning_rate": 1.280241153705706e-06, "loss": 0.89766556, "num_input_tokens_seen": 225367740, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10638428, "step": 10456, "time_per_iteration": 2.5858707427978516 }, { "auxiliary_loss_clip": 0.06430922, "auxiliary_loss_mlp": 0.01266399, "balance_loss_clip": 0.06279506, "balance_loss_mlp": 0.01255628, "epoch": 0.6287088531489553, "flos": 20747114824320.0, "grad_norm": 1.5424948920892405, "language_loss": 0.72248089, "learning_rate": 1.27987780006486e-06, "loss": 0.79945409, "num_input_tokens_seen": 225388405, "router_z_loss_clip": 1.51464844, "router_z_loss_mlp": 0.10766602, "step": 10457, "time_per_iteration": 2.6272826194763184 }, { "auxiliary_loss_clip": 0.0643269, "auxiliary_loss_mlp": 0.01265409, "balance_loss_clip": 0.06279933, "balance_loss_mlp": 0.01254519, "epoch": 0.6287689764016233, "flos": 23076433676160.0, "grad_norm": 2.408877794276963, "language_loss": 0.80439758, "learning_rate": 1.2795144737311202e-06, "loss": 0.88137853, "num_input_tokens_seen": 225408360, "router_z_loss_clip": 1.52539062, "router_z_loss_mlp": 0.10894775, "step": 10458, "time_per_iteration": 2.5643506050109863 }, { "auxiliary_loss_clip": 0.06436276, "auxiliary_loss_mlp": 0.01274377, "balance_loss_clip": 0.06284413, "balance_loss_mlp": 0.01263589, "epoch": 0.6288290996542913, "flos": 32241859557120.0, "grad_norm": 1.652231480887734, "language_loss": 0.61382246, "learning_rate": 1.2791511747182635e-06, "loss": 0.69092894, "num_input_tokens_seen": 225431310, "router_z_loss_clip": 1.51660156, "router_z_loss_mlp": 0.10791016, "step": 10459, "time_per_iteration": 2.640444278717041 }, { "auxiliary_loss_clip": 0.06429504, "auxiliary_loss_mlp": 0.01267959, "balance_loss_clip": 0.06281006, "balance_loss_mlp": 0.0125785, "epoch": 0.6288892229069593, "flos": 24647695839360.0, "grad_norm": 1.6568362037240973, "language_loss": 0.79477108, "learning_rate": 1.2787879030400666e-06, "loss": 0.87174571, "num_input_tokens_seen": 225450385, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.10113525, "step": 10460, "time_per_iteration": 2.581303119659424 }, { "auxiliary_loss_clip": 0.06422593, "auxiliary_loss_mlp": 0.0126798, "balance_loss_clip": 0.06277716, "balance_loss_mlp": 0.0125796, "epoch": 0.6289493461596273, "flos": 17864138119680.0, "grad_norm": 1.6474574082311895, "language_loss": 0.73936093, "learning_rate": 1.2784246587103047e-06, "loss": 0.81626672, "num_input_tokens_seen": 225467325, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.10021973, "step": 10461, "time_per_iteration": 2.5330960750579834 }, { "auxiliary_loss_clip": 0.06423032, "auxiliary_loss_mlp": 0.01268361, "balance_loss_clip": 0.06279536, "balance_loss_mlp": 0.01258717, "epoch": 0.6290094694122952, "flos": 22351807566720.0, "grad_norm": 1.774019555709216, "language_loss": 0.70600694, "learning_rate": 1.2780614417427523e-06, "loss": 0.7829209, "num_input_tokens_seen": 225487370, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.09649658, "step": 10462, "time_per_iteration": 2.5948758125305176 }, { "auxiliary_loss_clip": 0.06423246, "auxiliary_loss_mlp": 0.0126682, "balance_loss_clip": 0.06282938, "balance_loss_mlp": 0.01257128, "epoch": 0.6290695926649632, "flos": 28409942563200.0, "grad_norm": 2.401074499383976, "language_loss": 0.72054404, "learning_rate": 1.2776982521511821e-06, "loss": 0.7974447, "num_input_tokens_seen": 225506915, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09686279, "step": 10463, "time_per_iteration": 2.5993587970733643 }, { "auxiliary_loss_clip": 0.06423895, "auxiliary_loss_mlp": 0.01269294, "balance_loss_clip": 0.06282499, "balance_loss_mlp": 0.01259341, "epoch": 0.6291297159176311, "flos": 21511628277120.0, "grad_norm": 1.626174341510869, "language_loss": 0.72474051, "learning_rate": 1.2773350899493665e-06, "loss": 0.80167246, "num_input_tokens_seen": 225525670, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09960938, "step": 10464, "time_per_iteration": 2.5468945503234863 }, { "auxiliary_loss_clip": 0.06422588, "auxiliary_loss_mlp": 0.01265095, "balance_loss_clip": 0.06277175, "balance_loss_mlp": 0.01255808, "epoch": 0.6291898391702991, "flos": 12208203521280.0, "grad_norm": 1.6512238305960902, "language_loss": 0.69317585, "learning_rate": 1.2769719551510768e-06, "loss": 0.77005261, "num_input_tokens_seen": 225542235, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.09289551, "step": 10465, "time_per_iteration": 2.5159645080566406 }, { "auxiliary_loss_clip": 0.06315342, "auxiliary_loss_mlp": 0.01251001, "balance_loss_clip": 0.0625513, "balance_loss_mlp": 0.01249473, "epoch": 0.629249962422967, "flos": 69319347840000.0, "grad_norm": 2.9614041451122746, "language_loss": 0.59692705, "learning_rate": 1.2766088477700832e-06, "loss": 0.67259043, "num_input_tokens_seen": 225607185, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.01528931, "step": 10466, "time_per_iteration": 3.2796647548675537 }, { "auxiliary_loss_clip": 0.06421547, "auxiliary_loss_mlp": 0.01265836, "balance_loss_clip": 0.06276147, "balance_loss_mlp": 0.01256859, "epoch": 0.6293100856756351, "flos": 40087353196800.0, "grad_norm": 2.203009551177066, "language_loss": 0.6515525, "learning_rate": 1.276245767820154e-06, "loss": 0.72842634, "num_input_tokens_seen": 225628785, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.08978271, "step": 10467, "time_per_iteration": 2.7112491130828857 }, { "auxiliary_loss_clip": 0.06326649, "auxiliary_loss_mlp": 0.01251418, "balance_loss_clip": 0.06266418, "balance_loss_mlp": 0.01249883, "epoch": 0.629370208928303, "flos": 67518907960320.0, "grad_norm": 0.7816188488627374, "language_loss": 0.56850576, "learning_rate": 1.2758827153150586e-06, "loss": 0.64428645, "num_input_tokens_seen": 225678980, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.01535797, "step": 10468, "time_per_iteration": 3.129768133163452 }, { "auxiliary_loss_clip": 0.0632254, "auxiliary_loss_mlp": 0.01252831, "balance_loss_clip": 0.06262394, "balance_loss_mlp": 0.01251246, "epoch": 0.629430332180971, "flos": 60680228653440.0, "grad_norm": 0.7460743438706353, "language_loss": 0.57507926, "learning_rate": 1.2755196902685626e-06, "loss": 0.65083289, "num_input_tokens_seen": 225740295, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.01585388, "step": 10469, "time_per_iteration": 3.2071120738983154 }, { "auxiliary_loss_clip": 0.06313437, "auxiliary_loss_mlp": 0.0125127, "balance_loss_clip": 0.06253637, "balance_loss_mlp": 0.01249781, "epoch": 0.6294904554336389, "flos": 66891707821440.0, "grad_norm": 0.7193960719311786, "language_loss": 0.5203954, "learning_rate": 1.2751566926944329e-06, "loss": 0.59604239, "num_input_tokens_seen": 225805615, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.01487732, "step": 10470, "time_per_iteration": 3.2623064517974854 }, { "auxiliary_loss_clip": 0.06420043, "auxiliary_loss_mlp": 0.01271087, "balance_loss_clip": 0.06276803, "balance_loss_mlp": 0.01260948, "epoch": 0.6295505786863069, "flos": 42532728353280.0, "grad_norm": 1.669882473569771, "language_loss": 0.74667764, "learning_rate": 1.2747937226064342e-06, "loss": 0.82358897, "num_input_tokens_seen": 225826585, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10137939, "step": 10471, "time_per_iteration": 2.7503976821899414 }, { "auxiliary_loss_clip": 0.06430665, "auxiliary_loss_mlp": 0.01264287, "balance_loss_clip": 0.06280958, "balance_loss_mlp": 0.01254077, "epoch": 0.629610701938975, "flos": 17389877610240.0, "grad_norm": 2.404409966669024, "language_loss": 0.63272941, "learning_rate": 1.2744307800183297e-06, "loss": 0.70967901, "num_input_tokens_seen": 225844095, "router_z_loss_clip": 1.49804688, "router_z_loss_mlp": 0.10205078, "step": 10472, "time_per_iteration": 2.5369837284088135 }, { "auxiliary_loss_clip": 0.0642909, "auxiliary_loss_mlp": 0.01269838, "balance_loss_clip": 0.06279567, "balance_loss_mlp": 0.01259253, "epoch": 0.6296708251916429, "flos": 24249730072320.0, "grad_norm": 1.6209774123108496, "language_loss": 0.69482255, "learning_rate": 1.2740678649438828e-06, "loss": 0.77181184, "num_input_tokens_seen": 225864310, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.105896, "step": 10473, "time_per_iteration": 2.5751447677612305 }, { "auxiliary_loss_clip": 0.06424303, "auxiliary_loss_mlp": 0.01264135, "balance_loss_clip": 0.06279292, "balance_loss_mlp": 0.01254342, "epoch": 0.6297309484443109, "flos": 19284110536320.0, "grad_norm": 1.709165349512539, "language_loss": 0.7419644, "learning_rate": 1.2737049773968554e-06, "loss": 0.81884879, "num_input_tokens_seen": 225883830, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.09802246, "step": 10474, "time_per_iteration": 2.562548875808716 }, { "auxiliary_loss_clip": 0.06426521, "auxiliary_loss_mlp": 0.01265081, "balance_loss_clip": 0.06279065, "balance_loss_mlp": 0.01254805, "epoch": 0.6297910716969788, "flos": 30670261977600.0, "grad_norm": 1.372783511028569, "language_loss": 0.66332662, "learning_rate": 1.2733421173910081e-06, "loss": 0.7402426, "num_input_tokens_seen": 225905755, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.1027832, "step": 10475, "time_per_iteration": 2.6574859619140625 }, { "auxiliary_loss_clip": 0.06417626, "auxiliary_loss_mlp": 0.0126362, "balance_loss_clip": 0.06276049, "balance_loss_mlp": 0.01253988, "epoch": 0.6298511949496468, "flos": 14427293927040.0, "grad_norm": 1.8751379550220564, "language_loss": 0.90441459, "learning_rate": 1.272979284940101e-06, "loss": 0.98122704, "num_input_tokens_seen": 225922155, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09625244, "step": 10476, "time_per_iteration": 2.5397329330444336 }, { "auxiliary_loss_clip": 0.06426971, "auxiliary_loss_mlp": 0.01271769, "balance_loss_clip": 0.06280629, "balance_loss_mlp": 0.01261571, "epoch": 0.6299113182023147, "flos": 23520995112960.0, "grad_norm": 1.6994578185325928, "language_loss": 0.75282705, "learning_rate": 1.2726164800578913e-06, "loss": 0.82981443, "num_input_tokens_seen": 225941060, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.10198975, "step": 10477, "time_per_iteration": 2.5861878395080566 }, { "auxiliary_loss_clip": 0.06425268, "auxiliary_loss_mlp": 0.01265377, "balance_loss_clip": 0.06277839, "balance_loss_mlp": 0.01254702, "epoch": 0.6299714414549827, "flos": 22681109312640.0, "grad_norm": 1.794551847030356, "language_loss": 0.7036376, "learning_rate": 1.272253702758138e-06, "loss": 0.78054404, "num_input_tokens_seen": 225960870, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.10681152, "step": 10478, "time_per_iteration": 2.5900485515594482 }, { "auxiliary_loss_clip": 0.06430366, "auxiliary_loss_mlp": 0.01265807, "balance_loss_clip": 0.06278998, "balance_loss_mlp": 0.01255161, "epoch": 0.6300315647076506, "flos": 14506984759680.0, "grad_norm": 2.3588376792893113, "language_loss": 0.6827023, "learning_rate": 1.2718909530545974e-06, "loss": 0.75966406, "num_input_tokens_seen": 225977895, "router_z_loss_clip": 1.51464844, "router_z_loss_mlp": 0.10644531, "step": 10479, "time_per_iteration": 4.0175042152404785 }, { "auxiliary_loss_clip": 0.06423245, "auxiliary_loss_mlp": 0.01265161, "balance_loss_clip": 0.06279446, "balance_loss_mlp": 0.01255022, "epoch": 0.6300916879603187, "flos": 21878134035840.0, "grad_norm": 1.6682521148334843, "language_loss": 0.73853236, "learning_rate": 1.2715282309610245e-06, "loss": 0.81541646, "num_input_tokens_seen": 225997835, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.10125732, "step": 10480, "time_per_iteration": 2.609501361846924 }, { "auxiliary_loss_clip": 0.06429169, "auxiliary_loss_mlp": 0.01267377, "balance_loss_clip": 0.06279719, "balance_loss_mlp": 0.01256303, "epoch": 0.6301518112129866, "flos": 21840301117440.0, "grad_norm": 1.8651573716796637, "language_loss": 0.7904042, "learning_rate": 1.2711655364911744e-06, "loss": 0.86736971, "num_input_tokens_seen": 226017620, "router_z_loss_clip": 1.49414062, "router_z_loss_mlp": 0.11077881, "step": 10481, "time_per_iteration": 2.5614166259765625 }, { "auxiliary_loss_clip": 0.06318062, "auxiliary_loss_mlp": 0.01254416, "balance_loss_clip": 0.06257936, "balance_loss_mlp": 0.01252962, "epoch": 0.6302119344656546, "flos": 44348429675520.0, "grad_norm": 0.8761131767421658, "language_loss": 0.61835873, "learning_rate": 1.2708028696588e-06, "loss": 0.69408351, "num_input_tokens_seen": 226068755, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.01452637, "step": 10482, "time_per_iteration": 2.9221999645233154 }, { "auxiliary_loss_clip": 0.06440921, "auxiliary_loss_mlp": 0.01268488, "balance_loss_clip": 0.06284709, "balance_loss_mlp": 0.012568, "epoch": 0.6302720577183225, "flos": 11222604270720.0, "grad_norm": 1.9072441766612847, "language_loss": 0.8313958, "learning_rate": 1.2704402304776541e-06, "loss": 0.90848988, "num_input_tokens_seen": 226084395, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.11688232, "step": 10483, "time_per_iteration": 3.9679291248321533 }, { "auxiliary_loss_clip": 0.06422839, "auxiliary_loss_mlp": 0.01262636, "balance_loss_clip": 0.06283447, "balance_loss_mlp": 0.01252975, "epoch": 0.6303321809709905, "flos": 27972424869120.0, "grad_norm": 1.669386706686313, "language_loss": 0.73362499, "learning_rate": 1.270077618961487e-06, "loss": 0.81047976, "num_input_tokens_seen": 226105890, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09655762, "step": 10484, "time_per_iteration": 2.5986101627349854 }, { "auxiliary_loss_clip": 0.06431342, "auxiliary_loss_mlp": 0.01265799, "balance_loss_clip": 0.06282409, "balance_loss_mlp": 0.01255613, "epoch": 0.6303923042236586, "flos": 28228366765440.0, "grad_norm": 1.646547741019837, "language_loss": 0.74979091, "learning_rate": 1.2697150351240506e-06, "loss": 0.82676232, "num_input_tokens_seen": 226126760, "router_z_loss_clip": 1.48730469, "router_z_loss_mlp": 0.10192871, "step": 10485, "time_per_iteration": 2.6285653114318848 }, { "auxiliary_loss_clip": 0.06435806, "auxiliary_loss_mlp": 0.01265741, "balance_loss_clip": 0.06283355, "balance_loss_mlp": 0.01254714, "epoch": 0.6304524274763265, "flos": 27637546826880.0, "grad_norm": 1.923128445899758, "language_loss": 0.81301653, "learning_rate": 1.269352478979093e-06, "loss": 0.89003199, "num_input_tokens_seen": 226147315, "router_z_loss_clip": 1.52441406, "router_z_loss_mlp": 0.11016846, "step": 10486, "time_per_iteration": 2.6361005306243896 }, { "auxiliary_loss_clip": 0.06427889, "auxiliary_loss_mlp": 0.01265408, "balance_loss_clip": 0.06281957, "balance_loss_mlp": 0.01254948, "epoch": 0.6305125507289945, "flos": 17317062812160.0, "grad_norm": 1.7501474688604102, "language_loss": 0.64005697, "learning_rate": 1.2689899505403628e-06, "loss": 0.71698999, "num_input_tokens_seen": 226165935, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.10455322, "step": 10487, "time_per_iteration": 2.55552077293396 }, { "auxiliary_loss_clip": 0.06427665, "auxiliary_loss_mlp": 0.01269181, "balance_loss_clip": 0.06282853, "balance_loss_mlp": 0.01259138, "epoch": 0.6305726739816624, "flos": 25814745106560.0, "grad_norm": 1.4284719558049725, "language_loss": 0.67396772, "learning_rate": 1.2686274498216065e-06, "loss": 0.75093615, "num_input_tokens_seen": 226186890, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10040283, "step": 10488, "time_per_iteration": 2.5870487689971924 }, { "auxiliary_loss_clip": 0.06430209, "auxiliary_loss_mlp": 0.01268958, "balance_loss_clip": 0.06282017, "balance_loss_mlp": 0.01257901, "epoch": 0.6306327972343304, "flos": 21803684083200.0, "grad_norm": 1.9466984688867235, "language_loss": 0.67676365, "learning_rate": 1.2682649768365706e-06, "loss": 0.75375533, "num_input_tokens_seen": 226206710, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.1105957, "step": 10489, "time_per_iteration": 2.5652432441711426 }, { "auxiliary_loss_clip": 0.06438054, "auxiliary_loss_mlp": 0.01266463, "balance_loss_clip": 0.06280243, "balance_loss_mlp": 0.01254619, "epoch": 0.6306929204869983, "flos": 20783689931520.0, "grad_norm": 2.181741789029565, "language_loss": 0.69961083, "learning_rate": 1.2679025315990007e-06, "loss": 0.77665591, "num_input_tokens_seen": 226225565, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.11846924, "step": 10490, "time_per_iteration": 2.5564587116241455 }, { "auxiliary_loss_clip": 0.06427345, "auxiliary_loss_mlp": 0.01268, "balance_loss_clip": 0.06278341, "balance_loss_mlp": 0.01256919, "epoch": 0.6307530437396663, "flos": 23660084090880.0, "grad_norm": 1.8598521170485052, "language_loss": 0.78378034, "learning_rate": 1.2675401141226393e-06, "loss": 0.86073375, "num_input_tokens_seen": 226243680, "router_z_loss_clip": 1.48730469, "router_z_loss_mlp": 0.11083984, "step": 10491, "time_per_iteration": 2.592087745666504 }, { "auxiliary_loss_clip": 0.06428565, "auxiliary_loss_mlp": 0.01265867, "balance_loss_clip": 0.06282587, "balance_loss_mlp": 0.01255281, "epoch": 0.6308131669923343, "flos": 24726170787840.0, "grad_norm": 1.7715164311859237, "language_loss": 0.55773652, "learning_rate": 1.2671777244212308e-06, "loss": 0.63468087, "num_input_tokens_seen": 226264345, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.105896, "step": 10492, "time_per_iteration": 2.5976037979125977 }, { "auxiliary_loss_clip": 0.06434236, "auxiliary_loss_mlp": 0.0126351, "balance_loss_clip": 0.06284751, "balance_loss_mlp": 0.01252644, "epoch": 0.6308732902450023, "flos": 22572054823680.0, "grad_norm": 2.0627421672525137, "language_loss": 0.64596891, "learning_rate": 1.2668153625085168e-06, "loss": 0.72294641, "num_input_tokens_seen": 226283165, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.10870361, "step": 10493, "time_per_iteration": 4.06670618057251 }, { "auxiliary_loss_clip": 0.06429437, "auxiliary_loss_mlp": 0.01264889, "balance_loss_clip": 0.06281359, "balance_loss_mlp": 0.01254088, "epoch": 0.6309334134976702, "flos": 24651050002560.0, "grad_norm": 1.3396975599722434, "language_loss": 0.82465702, "learning_rate": 1.2664530283982367e-06, "loss": 0.9016003, "num_input_tokens_seen": 226304080, "router_z_loss_clip": 1.48242188, "router_z_loss_mlp": 0.10809326, "step": 10494, "time_per_iteration": 4.052953243255615 }, { "auxiliary_loss_clip": 0.06427219, "auxiliary_loss_mlp": 0.01271988, "balance_loss_clip": 0.06280607, "balance_loss_mlp": 0.01261885, "epoch": 0.6309935367503382, "flos": 41437655343360.0, "grad_norm": 1.8388914760235353, "language_loss": 0.79661906, "learning_rate": 1.2660907221041317e-06, "loss": 0.87361109, "num_input_tokens_seen": 226325925, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.10107422, "step": 10495, "time_per_iteration": 2.7265965938568115 }, { "auxiliary_loss_clip": 0.06431086, "auxiliary_loss_mlp": 0.01266907, "balance_loss_clip": 0.06281557, "balance_loss_mlp": 0.01255767, "epoch": 0.6310536600030061, "flos": 15123772264320.0, "grad_norm": 1.912680332181652, "language_loss": 0.70882761, "learning_rate": 1.2657284436399403e-06, "loss": 0.78580749, "num_input_tokens_seen": 226344190, "router_z_loss_clip": 1.49511719, "router_z_loss_mlp": 0.1114502, "step": 10496, "time_per_iteration": 2.565403461456299 }, { "auxiliary_loss_clip": 0.06428675, "auxiliary_loss_mlp": 0.0127175, "balance_loss_clip": 0.06280383, "balance_loss_mlp": 0.0126083, "epoch": 0.6311137832556741, "flos": 15237019457280.0, "grad_norm": 2.4215251810599163, "language_loss": 0.80561942, "learning_rate": 1.2653661930193997e-06, "loss": 0.88262367, "num_input_tokens_seen": 226361520, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.10925293, "step": 10497, "time_per_iteration": 2.6176140308380127 }, { "auxiliary_loss_clip": 0.06424353, "auxiliary_loss_mlp": 0.01263618, "balance_loss_clip": 0.06278063, "balance_loss_mlp": 0.012546, "epoch": 0.6311739065083422, "flos": 22025314932480.0, "grad_norm": 2.010572889926376, "language_loss": 0.74542534, "learning_rate": 1.265003970256247e-06, "loss": 0.82230502, "num_input_tokens_seen": 226381920, "router_z_loss_clip": 1.46289062, "router_z_loss_mlp": 0.09020996, "step": 10498, "time_per_iteration": 2.5686073303222656 }, { "auxiliary_loss_clip": 0.06428058, "auxiliary_loss_mlp": 0.01267069, "balance_loss_clip": 0.06281089, "balance_loss_mlp": 0.01256549, "epoch": 0.6312340297610101, "flos": 22717349003520.0, "grad_norm": 2.075619124009513, "language_loss": 0.70194215, "learning_rate": 1.264641775364217e-06, "loss": 0.77889341, "num_input_tokens_seen": 226400035, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.10522461, "step": 10499, "time_per_iteration": 2.5576682090759277 }, { "auxiliary_loss_clip": 0.06424849, "auxiliary_loss_mlp": 0.01269337, "balance_loss_clip": 0.06283092, "balance_loss_mlp": 0.01259431, "epoch": 0.6312941530136781, "flos": 24287017939200.0, "grad_norm": 2.056109480053462, "language_loss": 0.70541984, "learning_rate": 1.2642796083570448e-06, "loss": 0.78236169, "num_input_tokens_seen": 226418280, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.09906006, "step": 10500, "time_per_iteration": 2.572056531906128 }, { "auxiliary_loss_clip": 0.06426433, "auxiliary_loss_mlp": 0.01265021, "balance_loss_clip": 0.06280346, "balance_loss_mlp": 0.01254739, "epoch": 0.631354276266346, "flos": 21732420585600.0, "grad_norm": 1.7123698237854785, "language_loss": 0.74288166, "learning_rate": 1.2639174692484634e-06, "loss": 0.8197962, "num_input_tokens_seen": 226436650, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.1027832, "step": 10501, "time_per_iteration": 2.5475986003875732 }, { "auxiliary_loss_clip": 0.06424856, "auxiliary_loss_mlp": 0.01267398, "balance_loss_clip": 0.06281084, "balance_loss_mlp": 0.0125584, "epoch": 0.631414399519014, "flos": 24032040364800.0, "grad_norm": 2.4695191117982334, "language_loss": 0.75792587, "learning_rate": 1.2635553580522053e-06, "loss": 0.8348484, "num_input_tokens_seen": 226456275, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.11553955, "step": 10502, "time_per_iteration": 2.556361198425293 }, { "auxiliary_loss_clip": 0.06431489, "auxiliary_loss_mlp": 0.01267521, "balance_loss_clip": 0.0627939, "balance_loss_mlp": 0.01256053, "epoch": 0.6314745227716819, "flos": 24322586797440.0, "grad_norm": 1.850886623443599, "language_loss": 0.85891062, "learning_rate": 1.2631932747820022e-06, "loss": 0.93590075, "num_input_tokens_seen": 226473610, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.11474609, "step": 10503, "time_per_iteration": 2.5699820518493652 }, { "auxiliary_loss_clip": 0.06427383, "auxiliary_loss_mlp": 0.01264157, "balance_loss_clip": 0.06278194, "balance_loss_mlp": 0.01253715, "epoch": 0.6315346460243499, "flos": 23372891821440.0, "grad_norm": 1.6640157101972803, "language_loss": 0.86488414, "learning_rate": 1.2628312194515838e-06, "loss": 0.94179952, "num_input_tokens_seen": 226493665, "router_z_loss_clip": 1.49023438, "router_z_loss_mlp": 0.10430908, "step": 10504, "time_per_iteration": 2.5610623359680176 }, { "auxiliary_loss_clip": 0.06437979, "auxiliary_loss_mlp": 0.0127075, "balance_loss_clip": 0.0628398, "balance_loss_mlp": 0.01259234, "epoch": 0.6315947692770179, "flos": 20265517082880.0, "grad_norm": 1.7106475209280927, "language_loss": 0.76904571, "learning_rate": 1.2624691920746793e-06, "loss": 0.84613299, "num_input_tokens_seen": 226511625, "router_z_loss_clip": 1.54199219, "router_z_loss_mlp": 0.11505127, "step": 10505, "time_per_iteration": 2.549668073654175 }, { "auxiliary_loss_clip": 0.0642674, "auxiliary_loss_mlp": 0.01264987, "balance_loss_clip": 0.06278971, "balance_loss_mlp": 0.01254383, "epoch": 0.6316548925296859, "flos": 25273036460160.0, "grad_norm": 2.0422482665476913, "language_loss": 0.81915575, "learning_rate": 1.2621071926650166e-06, "loss": 0.89607298, "num_input_tokens_seen": 226530085, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.1060791, "step": 10506, "time_per_iteration": 2.5889484882354736 }, { "auxiliary_loss_clip": 0.06424218, "auxiliary_loss_mlp": 0.01268841, "balance_loss_clip": 0.06275752, "balance_loss_mlp": 0.01257355, "epoch": 0.6317150157823538, "flos": 22937344698240.0, "grad_norm": 1.7327739514824396, "language_loss": 0.74375069, "learning_rate": 1.2617452212363238e-06, "loss": 0.82068133, "num_input_tokens_seen": 226548115, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.11486816, "step": 10507, "time_per_iteration": 2.5699007511138916 }, { "auxiliary_loss_clip": 0.06432237, "auxiliary_loss_mlp": 0.01269624, "balance_loss_clip": 0.06281726, "balance_loss_mlp": 0.01257781, "epoch": 0.6317751390350218, "flos": 22533383364480.0, "grad_norm": 1.7310033307500268, "language_loss": 0.68177807, "learning_rate": 1.2613832778023258e-06, "loss": 0.75879669, "num_input_tokens_seen": 226567955, "router_z_loss_clip": 1.50292969, "router_z_loss_mlp": 0.1184082, "step": 10508, "time_per_iteration": 2.5722241401672363 }, { "auxiliary_loss_clip": 0.06423659, "auxiliary_loss_mlp": 0.01269046, "balance_loss_clip": 0.06276965, "balance_loss_mlp": 0.01258026, "epoch": 0.6318352622876897, "flos": 23301460615680.0, "grad_norm": 1.66020240716203, "language_loss": 0.71086371, "learning_rate": 1.2610213623767478e-06, "loss": 0.78779078, "num_input_tokens_seen": 226588205, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.11010742, "step": 10509, "time_per_iteration": 2.5835745334625244 }, { "auxiliary_loss_clip": 0.06421809, "auxiliary_loss_mlp": 0.01269284, "balance_loss_clip": 0.06276537, "balance_loss_mlp": 0.01259086, "epoch": 0.6318953855403577, "flos": 20710330081920.0, "grad_norm": 1.5130014613434326, "language_loss": 0.79650462, "learning_rate": 1.2606594749733143e-06, "loss": 0.87341553, "num_input_tokens_seen": 226606965, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.10205078, "step": 10510, "time_per_iteration": 2.624824047088623 }, { "auxiliary_loss_clip": 0.06426293, "auxiliary_loss_mlp": 0.01266495, "balance_loss_clip": 0.06276657, "balance_loss_mlp": 0.01254825, "epoch": 0.6319555087930258, "flos": 22826613127680.0, "grad_norm": 1.6147277409642768, "language_loss": 0.71007538, "learning_rate": 1.2602976156057469e-06, "loss": 0.78700322, "num_input_tokens_seen": 226627845, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.11676025, "step": 10511, "time_per_iteration": 2.5936577320098877 }, { "auxiliary_loss_clip": 0.06418838, "auxiliary_loss_mlp": 0.01267849, "balance_loss_clip": 0.06275344, "balance_loss_mlp": 0.0125749, "epoch": 0.6320156320456937, "flos": 19976480023680.0, "grad_norm": 2.066460817547239, "language_loss": 0.80373204, "learning_rate": 1.2599357842877684e-06, "loss": 0.8805989, "num_input_tokens_seen": 226645855, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.10357666, "step": 10512, "time_per_iteration": 2.546722412109375 }, { "auxiliary_loss_clip": 0.0642909, "auxiliary_loss_mlp": 0.01268082, "balance_loss_clip": 0.06281316, "balance_loss_mlp": 0.01257306, "epoch": 0.6320757552983617, "flos": 27020256197760.0, "grad_norm": 1.7064013392651287, "language_loss": 0.71127307, "learning_rate": 1.2595739810330994e-06, "loss": 0.78824478, "num_input_tokens_seen": 226665375, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.10778809, "step": 10513, "time_per_iteration": 2.6058645248413086 }, { "auxiliary_loss_clip": 0.06429298, "auxiliary_loss_mlp": 0.01269942, "balance_loss_clip": 0.06278002, "balance_loss_mlp": 0.01258897, "epoch": 0.6321358785510296, "flos": 23702696691840.0, "grad_norm": 1.8523575681499, "language_loss": 0.6719051, "learning_rate": 1.259212205855459e-06, "loss": 0.74889743, "num_input_tokens_seen": 226685270, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.1104126, "step": 10514, "time_per_iteration": 2.5808863639831543 }, { "auxiliary_loss_clip": 0.06419598, "auxiliary_loss_mlp": 0.01267117, "balance_loss_clip": 0.0627498, "balance_loss_mlp": 0.01257199, "epoch": 0.6321960018036976, "flos": 26002484179200.0, "grad_norm": 2.0720987201710397, "language_loss": 0.74715137, "learning_rate": 1.2588504587685663e-06, "loss": 0.8240186, "num_input_tokens_seen": 226705325, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.09924316, "step": 10515, "time_per_iteration": 2.7066447734832764 }, { "auxiliary_loss_clip": 0.06419591, "auxiliary_loss_mlp": 0.0126708, "balance_loss_clip": 0.06277345, "balance_loss_mlp": 0.01257383, "epoch": 0.6322561250563655, "flos": 22827745157760.0, "grad_norm": 1.7172804680963347, "language_loss": 0.90096194, "learning_rate": 1.2584887397861379e-06, "loss": 0.97782862, "num_input_tokens_seen": 226723815, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09698486, "step": 10516, "time_per_iteration": 2.694478988647461 }, { "auxiliary_loss_clip": 0.06437701, "auxiliary_loss_mlp": 0.01265271, "balance_loss_clip": 0.06281804, "balance_loss_mlp": 0.01252998, "epoch": 0.6323162483090335, "flos": 18994234936320.0, "grad_norm": 1.7378169014697946, "language_loss": 0.82200599, "learning_rate": 1.2581270489218911e-06, "loss": 0.89903569, "num_input_tokens_seen": 226741550, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.12280273, "step": 10517, "time_per_iteration": 2.5975451469421387 }, { "auxiliary_loss_clip": 0.06422888, "auxiliary_loss_mlp": 0.01266323, "balance_loss_clip": 0.06277045, "balance_loss_mlp": 0.01255934, "epoch": 0.6323763715617015, "flos": 19871324749440.0, "grad_norm": 2.2217876735247604, "language_loss": 0.77532089, "learning_rate": 1.257765386189541e-06, "loss": 0.85221297, "num_input_tokens_seen": 226761115, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10388184, "step": 10518, "time_per_iteration": 4.085821628570557 }, { "auxiliary_loss_clip": 0.06420937, "auxiliary_loss_mlp": 0.01267912, "balance_loss_clip": 0.0627749, "balance_loss_mlp": 0.01257833, "epoch": 0.6324364948143695, "flos": 22789115625600.0, "grad_norm": 1.4604494329481748, "language_loss": 0.85295141, "learning_rate": 1.2574037516028018e-06, "loss": 0.92983997, "num_input_tokens_seen": 226782225, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10089111, "step": 10519, "time_per_iteration": 2.6048057079315186 }, { "auxiliary_loss_clip": 0.06421442, "auxiliary_loss_mlp": 0.01266636, "balance_loss_clip": 0.06278568, "balance_loss_mlp": 0.01256855, "epoch": 0.6324966180670374, "flos": 22242333807360.0, "grad_norm": 1.4520255252391754, "language_loss": 0.72022128, "learning_rate": 1.2570421451753867e-06, "loss": 0.79710209, "num_input_tokens_seen": 226802375, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.09777832, "step": 10520, "time_per_iteration": 2.596458911895752 }, { "auxiliary_loss_clip": 0.06422722, "auxiliary_loss_mlp": 0.01265065, "balance_loss_clip": 0.06275832, "balance_loss_mlp": 0.01254634, "epoch": 0.6325567413197054, "flos": 21695593916160.0, "grad_norm": 1.5904452376210074, "language_loss": 0.72253168, "learning_rate": 1.2566805669210081e-06, "loss": 0.79940963, "num_input_tokens_seen": 226822165, "router_z_loss_clip": 1.46972656, "router_z_loss_mlp": 0.10424805, "step": 10521, "time_per_iteration": 2.5538647174835205 }, { "auxiliary_loss_clip": 0.06431945, "auxiliary_loss_mlp": 0.01269579, "balance_loss_clip": 0.06282332, "balance_loss_mlp": 0.01258129, "epoch": 0.6326168645723733, "flos": 19943133298560.0, "grad_norm": 1.6023758209525072, "language_loss": 0.72255182, "learning_rate": 1.256319016853377e-06, "loss": 0.79956704, "num_input_tokens_seen": 226841645, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.11462402, "step": 10522, "time_per_iteration": 2.6155409812927246 }, { "auxiliary_loss_clip": 0.06422608, "auxiliary_loss_mlp": 0.01267569, "balance_loss_clip": 0.06277054, "balance_loss_mlp": 0.01256799, "epoch": 0.6326769878250413, "flos": 20236614624000.0, "grad_norm": 2.128133478219974, "language_loss": 0.81805855, "learning_rate": 1.2559574949862023e-06, "loss": 0.89496028, "num_input_tokens_seen": 226860355, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10772705, "step": 10523, "time_per_iteration": 4.019504070281982 }, { "auxiliary_loss_clip": 0.0642654, "auxiliary_loss_mlp": 0.01265869, "balance_loss_clip": 0.06280756, "balance_loss_mlp": 0.01255403, "epoch": 0.6327371110777094, "flos": 20781803214720.0, "grad_norm": 2.082685206174848, "language_loss": 0.73832548, "learning_rate": 1.255596001333195e-06, "loss": 0.81524956, "num_input_tokens_seen": 226878390, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.10461426, "step": 10524, "time_per_iteration": 2.563493013381958 }, { "auxiliary_loss_clip": 0.06437319, "auxiliary_loss_mlp": 0.01270564, "balance_loss_clip": 0.06282866, "balance_loss_mlp": 0.01259, "epoch": 0.6327972343303773, "flos": 30344440176000.0, "grad_norm": 5.980292147114679, "language_loss": 0.84535456, "learning_rate": 1.2552345359080615e-06, "loss": 0.92243338, "num_input_tokens_seen": 226898420, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.11566162, "step": 10525, "time_per_iteration": 2.6181414127349854 }, { "auxiliary_loss_clip": 0.0642461, "auxiliary_loss_mlp": 0.01266372, "balance_loss_clip": 0.06278834, "balance_loss_mlp": 0.01256626, "epoch": 0.6328573575830453, "flos": 17097947585280.0, "grad_norm": 1.6509383648614042, "language_loss": 0.66654438, "learning_rate": 1.2548730987245093e-06, "loss": 0.74345422, "num_input_tokens_seen": 226916305, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.09747314, "step": 10526, "time_per_iteration": 2.523015022277832 }, { "auxiliary_loss_clip": 0.06433522, "auxiliary_loss_mlp": 0.01266818, "balance_loss_clip": 0.06281938, "balance_loss_mlp": 0.0125476, "epoch": 0.6329174808357132, "flos": 25054340503680.0, "grad_norm": 1.5001051897239346, "language_loss": 0.73371327, "learning_rate": 1.254511689796244e-06, "loss": 0.81071669, "num_input_tokens_seen": 226937705, "router_z_loss_clip": 1.51464844, "router_z_loss_mlp": 0.1206665, "step": 10527, "time_per_iteration": 2.6061971187591553 }, { "auxiliary_loss_clip": 0.0642536, "auxiliary_loss_mlp": 0.01264663, "balance_loss_clip": 0.06281021, "balance_loss_mlp": 0.01255007, "epoch": 0.6329776040883812, "flos": 16842466886400.0, "grad_norm": 2.087316115869226, "language_loss": 0.71617877, "learning_rate": 1.2541503091369693e-06, "loss": 0.79307896, "num_input_tokens_seen": 226954880, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.09655762, "step": 10528, "time_per_iteration": 2.5550076961517334 }, { "auxiliary_loss_clip": 0.06429487, "auxiliary_loss_mlp": 0.01267417, "balance_loss_clip": 0.06281133, "balance_loss_mlp": 0.01256479, "epoch": 0.6330377273410491, "flos": 13521804779520.0, "grad_norm": 2.6728434403460866, "language_loss": 0.67477179, "learning_rate": 1.2537889567603905e-06, "loss": 0.75174081, "num_input_tokens_seen": 226972595, "router_z_loss_clip": 1.48339844, "router_z_loss_mlp": 0.109375, "step": 10529, "time_per_iteration": 2.5205116271972656 }, { "auxiliary_loss_clip": 0.06436855, "auxiliary_loss_mlp": 0.01268036, "balance_loss_clip": 0.06283251, "balance_loss_mlp": 0.01255108, "epoch": 0.6330978505937171, "flos": 21544471877760.0, "grad_norm": 2.0646726920771576, "language_loss": 0.75537205, "learning_rate": 1.2534276326802092e-06, "loss": 0.83242095, "num_input_tokens_seen": 226991910, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.1293335, "step": 10530, "time_per_iteration": 2.536533832550049 }, { "auxiliary_loss_clip": 0.0643737, "auxiliary_loss_mlp": 0.01266025, "balance_loss_clip": 0.06286286, "balance_loss_mlp": 0.01255093, "epoch": 0.6331579738463851, "flos": 25016465658240.0, "grad_norm": 1.6673718602356902, "language_loss": 0.74083173, "learning_rate": 1.2530663369101259e-06, "loss": 0.81786567, "num_input_tokens_seen": 227010175, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.10925293, "step": 10531, "time_per_iteration": 2.608156681060791 }, { "auxiliary_loss_clip": 0.06427368, "auxiliary_loss_mlp": 0.01269715, "balance_loss_clip": 0.06282929, "balance_loss_mlp": 0.0125929, "epoch": 0.6332180970990531, "flos": 14981329123200.0, "grad_norm": 2.5188280132628416, "language_loss": 0.79566753, "learning_rate": 1.2527050694638432e-06, "loss": 0.87263834, "num_input_tokens_seen": 227025540, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10418701, "step": 10532, "time_per_iteration": 2.5116491317749023 }, { "auxiliary_loss_clip": 0.06429954, "auxiliary_loss_mlp": 0.01268189, "balance_loss_clip": 0.06284516, "balance_loss_mlp": 0.01258301, "epoch": 0.633278220351721, "flos": 22712904737280.0, "grad_norm": 1.4744027508608442, "language_loss": 0.75177991, "learning_rate": 1.2523438303550582e-06, "loss": 0.82876134, "num_input_tokens_seen": 227045520, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.09893799, "step": 10533, "time_per_iteration": 3.8815648555755615 }, { "auxiliary_loss_clip": 0.06440915, "auxiliary_loss_mlp": 0.01268877, "balance_loss_clip": 0.06286702, "balance_loss_mlp": 0.01256426, "epoch": 0.633338343604389, "flos": 12607594807680.0, "grad_norm": 2.28333486814795, "language_loss": 0.77609062, "learning_rate": 1.2519826195974706e-06, "loss": 0.85318851, "num_input_tokens_seen": 227059420, "router_z_loss_clip": 1.54199219, "router_z_loss_mlp": 0.12457275, "step": 10534, "time_per_iteration": 3.9777915477752686 }, { "auxiliary_loss_clip": 0.06431236, "auxiliary_loss_mlp": 0.01267042, "balance_loss_clip": 0.06284744, "balance_loss_mlp": 0.01255866, "epoch": 0.6333984668570569, "flos": 25967586153600.0, "grad_norm": 1.732965752136887, "language_loss": 0.85972208, "learning_rate": 1.251621437204777e-06, "loss": 0.93670487, "num_input_tokens_seen": 227081310, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.11169434, "step": 10535, "time_per_iteration": 2.587131977081299 }, { "auxiliary_loss_clip": 0.06434138, "auxiliary_loss_mlp": 0.0126572, "balance_loss_clip": 0.06286082, "balance_loss_mlp": 0.012552, "epoch": 0.6334585901097249, "flos": 23665953876480.0, "grad_norm": 1.8059305796793883, "language_loss": 0.76477122, "learning_rate": 1.2512602831906733e-06, "loss": 0.84176981, "num_input_tokens_seen": 227100365, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.10522461, "step": 10536, "time_per_iteration": 2.596073627471924 }, { "auxiliary_loss_clip": 0.06427504, "auxiliary_loss_mlp": 0.01263272, "balance_loss_clip": 0.06282166, "balance_loss_mlp": 0.01252883, "epoch": 0.633518713362393, "flos": 28766930883840.0, "grad_norm": 1.9582427931972257, "language_loss": 0.6013267, "learning_rate": 1.250899157568855e-06, "loss": 0.67823452, "num_input_tokens_seen": 227119680, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10394287, "step": 10537, "time_per_iteration": 2.6219165325164795 }, { "auxiliary_loss_clip": 0.06331477, "auxiliary_loss_mlp": 0.01250928, "balance_loss_clip": 0.06271192, "balance_loss_mlp": 0.01249264, "epoch": 0.6335788366150609, "flos": 70438669407360.0, "grad_norm": 0.763620556249517, "language_loss": 0.52190173, "learning_rate": 1.2505380603530155e-06, "loss": 0.59772587, "num_input_tokens_seen": 227184465, "router_z_loss_clip": 0.60205078, "router_z_loss_mlp": 0.01667786, "step": 10538, "time_per_iteration": 3.2701950073242188 }, { "auxiliary_loss_clip": 0.06436724, "auxiliary_loss_mlp": 0.01272349, "balance_loss_clip": 0.06283286, "balance_loss_mlp": 0.0126044, "epoch": 0.6336389598677289, "flos": 23738768674560.0, "grad_norm": 1.7391118552733034, "language_loss": 0.83536327, "learning_rate": 1.250176991556848e-06, "loss": 0.91245401, "num_input_tokens_seen": 227202185, "router_z_loss_clip": 1.53417969, "router_z_loss_mlp": 0.11907959, "step": 10539, "time_per_iteration": 2.5726816654205322 }, { "auxiliary_loss_clip": 0.06439353, "auxiliary_loss_mlp": 0.01264495, "balance_loss_clip": 0.06288747, "balance_loss_mlp": 0.01252872, "epoch": 0.6336990831203968, "flos": 29284097483520.0, "grad_norm": 1.5529207407312677, "language_loss": 0.86754835, "learning_rate": 1.2498159511940438e-06, "loss": 0.94458687, "num_input_tokens_seen": 227222020, "router_z_loss_clip": 1.50585938, "router_z_loss_mlp": 0.1161499, "step": 10540, "time_per_iteration": 2.638676166534424 }, { "auxiliary_loss_clip": 0.06429589, "auxiliary_loss_mlp": 0.01266233, "balance_loss_clip": 0.06284997, "balance_loss_mlp": 0.01256213, "epoch": 0.6337592063730648, "flos": 29104659964800.0, "grad_norm": 1.6504253603860282, "language_loss": 0.72883499, "learning_rate": 1.2494549392782943e-06, "loss": 0.80579317, "num_input_tokens_seen": 227240885, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.10015869, "step": 10541, "time_per_iteration": 2.6312367916107178 }, { "auxiliary_loss_clip": 0.06444054, "auxiliary_loss_mlp": 0.01270544, "balance_loss_clip": 0.06288938, "balance_loss_mlp": 0.01257926, "epoch": 0.6338193296257327, "flos": 34713705404160.0, "grad_norm": 2.35218320799286, "language_loss": 0.85451043, "learning_rate": 1.2490939558232887e-06, "loss": 0.93165642, "num_input_tokens_seen": 227257880, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.12609863, "step": 10542, "time_per_iteration": 2.6733503341674805 }, { "auxiliary_loss_clip": 0.06434831, "auxiliary_loss_mlp": 0.01266206, "balance_loss_clip": 0.06286812, "balance_loss_mlp": 0.01253939, "epoch": 0.6338794528784008, "flos": 16692644586240.0, "grad_norm": 1.6499661458216375, "language_loss": 0.77936482, "learning_rate": 1.2487330008427153e-06, "loss": 0.85637522, "num_input_tokens_seen": 227274840, "router_z_loss_clip": 1.47949219, "router_z_loss_mlp": 0.12261963, "step": 10543, "time_per_iteration": 2.5587005615234375 }, { "auxiliary_loss_clip": 0.06428333, "auxiliary_loss_mlp": 0.01267516, "balance_loss_clip": 0.06285308, "balance_loss_mlp": 0.01256847, "epoch": 0.6339395761310687, "flos": 22353233086080.0, "grad_norm": 1.6076060240598296, "language_loss": 0.73100173, "learning_rate": 1.2483720743502618e-06, "loss": 0.80796015, "num_input_tokens_seen": 227294835, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10675049, "step": 10544, "time_per_iteration": 2.582472324371338 }, { "auxiliary_loss_clip": 0.06438942, "auxiliary_loss_mlp": 0.01268421, "balance_loss_clip": 0.06283678, "balance_loss_mlp": 0.01256089, "epoch": 0.6339996993837367, "flos": 18557765418240.0, "grad_norm": 1.8041365645245264, "language_loss": 0.68671989, "learning_rate": 1.2480111763596144e-06, "loss": 0.76379359, "num_input_tokens_seen": 227314935, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.12329102, "step": 10545, "time_per_iteration": 2.565674304962158 }, { "auxiliary_loss_clip": 0.06431007, "auxiliary_loss_mlp": 0.01264776, "balance_loss_clip": 0.06286013, "balance_loss_mlp": 0.0125428, "epoch": 0.6340598226364046, "flos": 12974519836800.0, "grad_norm": 2.17812659459492, "language_loss": 0.71484017, "learning_rate": 1.2476503068844592e-06, "loss": 0.791798, "num_input_tokens_seen": 227332905, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.10491943, "step": 10546, "time_per_iteration": 2.5291624069213867 }, { "auxiliary_loss_clip": 0.06426585, "auxiliary_loss_mlp": 0.01265278, "balance_loss_clip": 0.06284437, "balance_loss_mlp": 0.01255777, "epoch": 0.6341199458890726, "flos": 26695272936960.0, "grad_norm": 1.2587886605788878, "language_loss": 0.78412557, "learning_rate": 1.2472894659384792e-06, "loss": 0.86104423, "num_input_tokens_seen": 227354915, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.0949707, "step": 10547, "time_per_iteration": 2.6079177856445312 }, { "auxiliary_loss_clip": 0.06435327, "auxiliary_loss_mlp": 0.01266033, "balance_loss_clip": 0.06283604, "balance_loss_mlp": 0.01254601, "epoch": 0.6341800691417405, "flos": 18740263610880.0, "grad_norm": 1.7030792421485412, "language_loss": 0.63949513, "learning_rate": 1.2469286535353578e-06, "loss": 0.71650869, "num_input_tokens_seen": 227372990, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.11425781, "step": 10548, "time_per_iteration": 2.5392067432403564 }, { "auxiliary_loss_clip": 0.06431861, "auxiliary_loss_mlp": 0.01267461, "balance_loss_clip": 0.06284948, "balance_loss_mlp": 0.01257078, "epoch": 0.6342401923944085, "flos": 26256539358720.0, "grad_norm": 1.669001116345569, "language_loss": 0.61476779, "learning_rate": 1.2465678696887785e-06, "loss": 0.69176108, "num_input_tokens_seen": 227393270, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.10375977, "step": 10549, "time_per_iteration": 2.5986688137054443 }, { "auxiliary_loss_clip": 0.06433789, "auxiliary_loss_mlp": 0.01268733, "balance_loss_clip": 0.06285626, "balance_loss_mlp": 0.01257963, "epoch": 0.6343003156470765, "flos": 24687834744960.0, "grad_norm": 1.5143607993403447, "language_loss": 0.74083138, "learning_rate": 1.2462071144124197e-06, "loss": 0.81785655, "num_input_tokens_seen": 227413630, "router_z_loss_clip": 1.47949219, "router_z_loss_mlp": 0.10766602, "step": 10550, "time_per_iteration": 2.5964722633361816 }, { "auxiliary_loss_clip": 0.06323183, "auxiliary_loss_mlp": 0.01251039, "balance_loss_clip": 0.06263492, "balance_loss_mlp": 0.01249509, "epoch": 0.6343604388997445, "flos": 69824481379200.0, "grad_norm": 0.6846377778400461, "language_loss": 0.57713652, "learning_rate": 1.2458463877199638e-06, "loss": 0.65287882, "num_input_tokens_seen": 227476630, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.01528931, "step": 10551, "time_per_iteration": 3.2296173572540283 }, { "auxiliary_loss_clip": 0.06423751, "auxiliary_loss_mlp": 0.01262853, "balance_loss_clip": 0.06278022, "balance_loss_mlp": 0.01253275, "epoch": 0.6344205621524125, "flos": 21989117168640.0, "grad_norm": 1.6005871137982022, "language_loss": 0.67137057, "learning_rate": 1.2454856896250881e-06, "loss": 0.7482366, "num_input_tokens_seen": 227496060, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.0958252, "step": 10552, "time_per_iteration": 2.5650453567504883 }, { "auxiliary_loss_clip": 0.064378, "auxiliary_loss_mlp": 0.01263604, "balance_loss_clip": 0.06284866, "balance_loss_mlp": 0.01253006, "epoch": 0.6344806854050804, "flos": 20455100945280.0, "grad_norm": 1.7446890674359496, "language_loss": 0.82558358, "learning_rate": 1.24512502014147e-06, "loss": 0.90259761, "num_input_tokens_seen": 227513440, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.10601807, "step": 10553, "time_per_iteration": 2.6145052909851074 }, { "auxiliary_loss_clip": 0.06430651, "auxiliary_loss_mlp": 0.01264606, "balance_loss_clip": 0.06281079, "balance_loss_mlp": 0.01253651, "epoch": 0.6345408086577484, "flos": 40519294594560.0, "grad_norm": 4.63446515818535, "language_loss": 0.54990232, "learning_rate": 1.2447643792827879e-06, "loss": 0.6268549, "num_input_tokens_seen": 227535395, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.10961914, "step": 10554, "time_per_iteration": 2.716472864151001 }, { "auxiliary_loss_clip": 0.06432151, "auxiliary_loss_mlp": 0.0126455, "balance_loss_clip": 0.06282912, "balance_loss_mlp": 0.01253898, "epoch": 0.6346009319104163, "flos": 21367759616640.0, "grad_norm": 1.6591052397322117, "language_loss": 0.7067734, "learning_rate": 1.2444037670627153e-06, "loss": 0.7837404, "num_input_tokens_seen": 227554545, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.10668945, "step": 10555, "time_per_iteration": 2.5915141105651855 }, { "auxiliary_loss_clip": 0.06326912, "auxiliary_loss_mlp": 0.01251135, "balance_loss_clip": 0.06267071, "balance_loss_mlp": 0.01249658, "epoch": 0.6346610551630844, "flos": 71383333138560.0, "grad_norm": 0.7695510780247933, "language_loss": 0.55336934, "learning_rate": 1.2440431834949276e-06, "loss": 0.62914985, "num_input_tokens_seen": 227608575, "router_z_loss_clip": 0.60107422, "router_z_loss_mlp": 0.01475525, "step": 10556, "time_per_iteration": 3.101614236831665 }, { "auxiliary_loss_clip": 0.06432397, "auxiliary_loss_mlp": 0.01270658, "balance_loss_clip": 0.06280345, "balance_loss_mlp": 0.01258761, "epoch": 0.6347211784157523, "flos": 25418666056320.0, "grad_norm": 1.9829997841699765, "language_loss": 0.68346834, "learning_rate": 1.2436826285930985e-06, "loss": 0.76049888, "num_input_tokens_seen": 227628175, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.11907959, "step": 10557, "time_per_iteration": 2.6267189979553223 }, { "auxiliary_loss_clip": 0.06429541, "auxiliary_loss_mlp": 0.01263856, "balance_loss_clip": 0.06281702, "balance_loss_mlp": 0.01252967, "epoch": 0.6347813016684203, "flos": 15748274344320.0, "grad_norm": 1.6646022008954608, "language_loss": 0.7047931, "learning_rate": 1.2433221023709002e-06, "loss": 0.78172708, "num_input_tokens_seen": 227645330, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.10894775, "step": 10558, "time_per_iteration": 4.0147058963775635 }, { "auxiliary_loss_clip": 0.06430271, "auxiliary_loss_mlp": 0.0126801, "balance_loss_clip": 0.06282564, "balance_loss_mlp": 0.01257525, "epoch": 0.6348414249210882, "flos": 21470231560320.0, "grad_norm": 1.636994452072828, "language_loss": 0.78532088, "learning_rate": 1.2429616048420031e-06, "loss": 0.86230367, "num_input_tokens_seen": 227665250, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.1048584, "step": 10559, "time_per_iteration": 2.583392858505249 }, { "auxiliary_loss_clip": 0.06430723, "auxiliary_loss_mlp": 0.01269459, "balance_loss_clip": 0.06280473, "balance_loss_mlp": 0.01257943, "epoch": 0.6349015481737562, "flos": 21659521933440.0, "grad_norm": 1.7285192293944982, "language_loss": 0.67582011, "learning_rate": 1.242601136020078e-06, "loss": 0.75282192, "num_input_tokens_seen": 227685070, "router_z_loss_clip": 1.50097656, "router_z_loss_mlp": 0.1151123, "step": 10560, "time_per_iteration": 2.605722665786743 }, { "auxiliary_loss_clip": 0.06428063, "auxiliary_loss_mlp": 0.01267418, "balance_loss_clip": 0.06280006, "balance_loss_mlp": 0.01257136, "epoch": 0.6349616714264241, "flos": 22200643601280.0, "grad_norm": 1.8439502095334632, "language_loss": 0.77473599, "learning_rate": 1.2422406959187939e-06, "loss": 0.85169077, "num_input_tokens_seen": 227704430, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.1027832, "step": 10561, "time_per_iteration": 2.6052846908569336 }, { "auxiliary_loss_clip": 0.06432466, "auxiliary_loss_mlp": 0.01268373, "balance_loss_clip": 0.06282514, "balance_loss_mlp": 0.01256518, "epoch": 0.6350217946790921, "flos": 25417324391040.0, "grad_norm": 1.7793114941041324, "language_loss": 0.72151506, "learning_rate": 1.2418802845518178e-06, "loss": 0.79852343, "num_input_tokens_seen": 227724920, "router_z_loss_clip": 1.49804688, "router_z_loss_mlp": 0.11859131, "step": 10562, "time_per_iteration": 4.220686435699463 }, { "auxiliary_loss_clip": 0.06437322, "auxiliary_loss_mlp": 0.01265348, "balance_loss_clip": 0.06286575, "balance_loss_mlp": 0.01253952, "epoch": 0.63508191793176, "flos": 19725024320640.0, "grad_norm": 4.819076345361625, "language_loss": 0.80626619, "learning_rate": 1.2415199019328185e-06, "loss": 0.88329291, "num_input_tokens_seen": 227743400, "router_z_loss_clip": 1.50683594, "router_z_loss_mlp": 0.11395264, "step": 10563, "time_per_iteration": 2.6181840896606445 }, { "auxiliary_loss_clip": 0.0644206, "auxiliary_loss_mlp": 0.01268971, "balance_loss_clip": 0.06290022, "balance_loss_mlp": 0.01257312, "epoch": 0.6351420411844281, "flos": 18192810960000.0, "grad_norm": 2.0388552620510194, "language_loss": 0.81302845, "learning_rate": 1.2411595480754597e-06, "loss": 0.89013875, "num_input_tokens_seen": 227759990, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.11651611, "step": 10564, "time_per_iteration": 2.5711445808410645 }, { "auxiliary_loss_clip": 0.06436064, "auxiliary_loss_mlp": 0.01266694, "balance_loss_clip": 0.06286411, "balance_loss_mlp": 0.01255482, "epoch": 0.6352021644370961, "flos": 33734437136640.0, "grad_norm": 1.6786043510195807, "language_loss": 0.73045337, "learning_rate": 1.240799222993407e-06, "loss": 0.80748093, "num_input_tokens_seen": 227780835, "router_z_loss_clip": 1.49707031, "router_z_loss_mlp": 0.11212158, "step": 10565, "time_per_iteration": 2.6738626956939697 }, { "auxiliary_loss_clip": 0.06435077, "auxiliary_loss_mlp": 0.01266672, "balance_loss_clip": 0.06282763, "balance_loss_mlp": 0.01254704, "epoch": 0.635262287689764, "flos": 20380818700800.0, "grad_norm": 2.029609202795122, "language_loss": 0.69159698, "learning_rate": 1.240438926700324e-06, "loss": 0.76861441, "num_input_tokens_seen": 227798580, "router_z_loss_clip": 1.52636719, "router_z_loss_mlp": 0.11975098, "step": 10566, "time_per_iteration": 2.547823667526245 }, { "auxiliary_loss_clip": 0.06436057, "auxiliary_loss_mlp": 0.01267334, "balance_loss_clip": 0.06293102, "balance_loss_mlp": 0.01257309, "epoch": 0.635322410942432, "flos": 27532559260800.0, "grad_norm": 1.490988718831978, "language_loss": 0.69759196, "learning_rate": 1.2400786592098725e-06, "loss": 0.7746259, "num_input_tokens_seen": 227819210, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.1003418, "step": 10567, "time_per_iteration": 2.665364980697632 }, { "auxiliary_loss_clip": 0.06428431, "auxiliary_loss_mlp": 0.01264725, "balance_loss_clip": 0.0628384, "balance_loss_mlp": 0.01254163, "epoch": 0.6353825341950999, "flos": 21550048174080.0, "grad_norm": 2.2545647671531035, "language_loss": 0.84823942, "learning_rate": 1.2397184205357154e-06, "loss": 0.9251709, "num_input_tokens_seen": 227838340, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.10565186, "step": 10568, "time_per_iteration": 2.5833616256713867 }, { "auxiliary_loss_clip": 0.06435762, "auxiliary_loss_mlp": 0.01271396, "balance_loss_clip": 0.06285238, "balance_loss_mlp": 0.0126034, "epoch": 0.635442657447768, "flos": 31767934464000.0, "grad_norm": 1.7804662029777716, "language_loss": 0.84050637, "learning_rate": 1.2393582106915113e-06, "loss": 0.91757798, "num_input_tokens_seen": 227859170, "router_z_loss_clip": 1.50488281, "router_z_loss_mlp": 0.1105957, "step": 10569, "time_per_iteration": 2.6699471473693848 }, { "auxiliary_loss_clip": 0.0643184, "auxiliary_loss_mlp": 0.0126799, "balance_loss_clip": 0.06286143, "balance_loss_mlp": 0.0125685, "epoch": 0.6355027807004359, "flos": 19835001204480.0, "grad_norm": 2.0282261007361786, "language_loss": 0.69685745, "learning_rate": 1.2389980296909198e-06, "loss": 0.77385569, "num_input_tokens_seen": 227878545, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.11126709, "step": 10570, "time_per_iteration": 2.544595718383789 }, { "auxiliary_loss_clip": 0.06433263, "auxiliary_loss_mlp": 0.01268907, "balance_loss_clip": 0.06281788, "balance_loss_mlp": 0.0125757, "epoch": 0.6355629039531039, "flos": 30380176742400.0, "grad_norm": 1.664183108671306, "language_loss": 0.65761822, "learning_rate": 1.2386378775476e-06, "loss": 0.73463994, "num_input_tokens_seen": 227898875, "router_z_loss_clip": 1.51464844, "router_z_loss_mlp": 0.11340332, "step": 10571, "time_per_iteration": 2.641807794570923 }, { "auxiliary_loss_clip": 0.06439524, "auxiliary_loss_mlp": 0.01267623, "balance_loss_clip": 0.06288968, "balance_loss_mlp": 0.01256453, "epoch": 0.6356230272057718, "flos": 17938001093760.0, "grad_norm": 1.7094089521058298, "language_loss": 0.71930999, "learning_rate": 1.2382777542752074e-06, "loss": 0.79638147, "num_input_tokens_seen": 227917130, "router_z_loss_clip": 1.50585938, "router_z_loss_mlp": 0.11169434, "step": 10572, "time_per_iteration": 3.9217166900634766 }, { "auxiliary_loss_clip": 0.06436129, "auxiliary_loss_mlp": 0.01267646, "balance_loss_clip": 0.06288487, "balance_loss_mlp": 0.01256786, "epoch": 0.6356831504584398, "flos": 25383139125120.0, "grad_norm": 1.4083993859042236, "language_loss": 0.812518, "learning_rate": 1.2379176598873992e-06, "loss": 0.88955575, "num_input_tokens_seen": 227939550, "router_z_loss_clip": 1.47558594, "router_z_loss_mlp": 0.10852051, "step": 10573, "time_per_iteration": 4.160903453826904 }, { "auxiliary_loss_clip": 0.06440187, "auxiliary_loss_mlp": 0.01270972, "balance_loss_clip": 0.06288123, "balance_loss_mlp": 0.01259736, "epoch": 0.6357432737111077, "flos": 46511029630080.0, "grad_norm": 1.4852905638330482, "language_loss": 0.68776304, "learning_rate": 1.2375575943978303e-06, "loss": 0.76487458, "num_input_tokens_seen": 227962200, "router_z_loss_clip": 1.51953125, "router_z_loss_mlp": 0.11236572, "step": 10574, "time_per_iteration": 2.781465530395508 }, { "auxiliary_loss_clip": 0.06432258, "auxiliary_loss_mlp": 0.01267893, "balance_loss_clip": 0.06284498, "balance_loss_mlp": 0.01256705, "epoch": 0.6358033969637757, "flos": 17280026507520.0, "grad_norm": 2.2601015756767637, "language_loss": 0.87547833, "learning_rate": 1.2371975578201525e-06, "loss": 0.95247984, "num_input_tokens_seen": 227979270, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.11187744, "step": 10575, "time_per_iteration": 2.5305263996124268 }, { "auxiliary_loss_clip": 0.06435584, "auxiliary_loss_mlp": 0.01267135, "balance_loss_clip": 0.06287801, "balance_loss_mlp": 0.01256704, "epoch": 0.6358635202164437, "flos": 27132832558080.0, "grad_norm": 1.4797229124513596, "language_loss": 0.72304845, "learning_rate": 1.2368375501680204e-06, "loss": 0.80007565, "num_input_tokens_seen": 228000550, "router_z_loss_clip": 1.47558594, "router_z_loss_mlp": 0.10430908, "step": 10576, "time_per_iteration": 2.592775344848633 }, { "auxiliary_loss_clip": 0.06438075, "auxiliary_loss_mlp": 0.01271621, "balance_loss_clip": 0.0628652, "balance_loss_mlp": 0.01260731, "epoch": 0.6359236434691117, "flos": 27532307698560.0, "grad_norm": 1.4114539599772942, "language_loss": 0.69393975, "learning_rate": 1.236477571455085e-06, "loss": 0.77103668, "num_input_tokens_seen": 228022005, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.10888672, "step": 10577, "time_per_iteration": 2.625932455062866 }, { "auxiliary_loss_clip": 0.06434599, "auxiliary_loss_mlp": 0.01271331, "balance_loss_clip": 0.06286813, "balance_loss_mlp": 0.01261014, "epoch": 0.6359837667217797, "flos": 39357653915520.0, "grad_norm": 1.6753143411868274, "language_loss": 0.72352111, "learning_rate": 1.2361176216949964e-06, "loss": 0.80058044, "num_input_tokens_seen": 228043770, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.10321045, "step": 10578, "time_per_iteration": 2.700460195541382 }, { "auxiliary_loss_clip": 0.06327382, "auxiliary_loss_mlp": 0.01255746, "balance_loss_clip": 0.06268153, "balance_loss_mlp": 0.01254037, "epoch": 0.6360438899744476, "flos": 56430472475520.0, "grad_norm": 0.6949609983503907, "language_loss": 0.54482973, "learning_rate": 1.2357577009014044e-06, "loss": 0.62066096, "num_input_tokens_seen": 228104985, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.01713562, "step": 10579, "time_per_iteration": 3.2457077503204346 }, { "auxiliary_loss_clip": 0.06431988, "auxiliary_loss_mlp": 0.01268688, "balance_loss_clip": 0.06283704, "balance_loss_mlp": 0.01257924, "epoch": 0.6361040132271156, "flos": 24980100186240.0, "grad_norm": 1.783335545753512, "language_loss": 0.77763468, "learning_rate": 1.2353978090879568e-06, "loss": 0.85464144, "num_input_tokens_seen": 228125620, "router_z_loss_clip": 1.48242188, "router_z_loss_mlp": 0.10772705, "step": 10580, "time_per_iteration": 2.597796678543091 }, { "auxiliary_loss_clip": 0.06432958, "auxiliary_loss_mlp": 0.01272055, "balance_loss_clip": 0.06284346, "balance_loss_mlp": 0.01261475, "epoch": 0.6361641364797835, "flos": 23266059465600.0, "grad_norm": 2.205695198770919, "language_loss": 0.66854429, "learning_rate": 1.235037946268301e-06, "loss": 0.74559438, "num_input_tokens_seen": 228143495, "router_z_loss_clip": 1.48730469, "router_z_loss_mlp": 0.10571289, "step": 10581, "time_per_iteration": 2.589930772781372 }, { "auxiliary_loss_clip": 0.06433124, "auxiliary_loss_mlp": 0.01266867, "balance_loss_clip": 0.06284938, "balance_loss_mlp": 0.01256669, "epoch": 0.6362242597324516, "flos": 26001645638400.0, "grad_norm": 1.4018814469598764, "language_loss": 0.68610394, "learning_rate": 1.2346781124560828e-06, "loss": 0.76310384, "num_input_tokens_seen": 228166500, "router_z_loss_clip": 1.48339844, "router_z_loss_mlp": 0.10205078, "step": 10582, "time_per_iteration": 2.6675405502319336 }, { "auxiliary_loss_clip": 0.06438423, "auxiliary_loss_mlp": 0.01264628, "balance_loss_clip": 0.06287351, "balance_loss_mlp": 0.01252898, "epoch": 0.6362843829851195, "flos": 25710428373120.0, "grad_norm": 1.7748553268190383, "language_loss": 0.845505, "learning_rate": 1.2343183076649473e-06, "loss": 0.92253548, "num_input_tokens_seen": 228185325, "router_z_loss_clip": 1.51074219, "router_z_loss_mlp": 0.1171875, "step": 10583, "time_per_iteration": 2.6300570964813232 }, { "auxiliary_loss_clip": 0.06433256, "auxiliary_loss_mlp": 0.01270246, "balance_loss_clip": 0.0628875, "balance_loss_mlp": 0.0125957, "epoch": 0.6363445062377875, "flos": 20529341262720.0, "grad_norm": 1.5262699995775064, "language_loss": 0.7584399, "learning_rate": 1.233958531908538e-06, "loss": 0.83547491, "num_input_tokens_seen": 228204050, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10675049, "step": 10584, "time_per_iteration": 2.613753080368042 }, { "auxiliary_loss_clip": 0.0643467, "auxiliary_loss_mlp": 0.01269444, "balance_loss_clip": 0.06282494, "balance_loss_mlp": 0.0125704, "epoch": 0.6364046294904554, "flos": 19469879038080.0, "grad_norm": 1.7428301673171673, "language_loss": 0.72943574, "learning_rate": 1.2335987852004985e-06, "loss": 0.80647683, "num_input_tokens_seen": 228222430, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.12414551, "step": 10585, "time_per_iteration": 2.573188304901123 }, { "auxiliary_loss_clip": 0.06433889, "auxiliary_loss_mlp": 0.01267958, "balance_loss_clip": 0.06285369, "balance_loss_mlp": 0.01257718, "epoch": 0.6364647527431234, "flos": 21002176252800.0, "grad_norm": 1.8073292639056318, "language_loss": 0.83059418, "learning_rate": 1.2332390675544697e-06, "loss": 0.90761268, "num_input_tokens_seen": 228241925, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.10241699, "step": 10586, "time_per_iteration": 2.5667991638183594 }, { "auxiliary_loss_clip": 0.06432583, "auxiliary_loss_mlp": 0.01265992, "balance_loss_clip": 0.06286088, "balance_loss_mlp": 0.0125599, "epoch": 0.6365248759957913, "flos": 25777079896320.0, "grad_norm": 1.8647729381348577, "language_loss": 0.72696906, "learning_rate": 1.2328793789840918e-06, "loss": 0.80395472, "num_input_tokens_seen": 228262535, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.10003662, "step": 10587, "time_per_iteration": 2.591810941696167 }, { "auxiliary_loss_clip": 0.06430256, "auxiliary_loss_mlp": 0.01265942, "balance_loss_clip": 0.06279688, "balance_loss_mlp": 0.0125498, "epoch": 0.6365849992484593, "flos": 22462161793920.0, "grad_norm": 1.8123091589198963, "language_loss": 0.77256536, "learning_rate": 1.2325197195030058e-06, "loss": 0.8495273, "num_input_tokens_seen": 228281340, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.10955811, "step": 10588, "time_per_iteration": 2.6009340286254883 }, { "auxiliary_loss_clip": 0.06423995, "auxiliary_loss_mlp": 0.01268096, "balance_loss_clip": 0.06280763, "balance_loss_mlp": 0.01256974, "epoch": 0.6366451225011273, "flos": 19031648584320.0, "grad_norm": 1.7658325799921033, "language_loss": 0.79909784, "learning_rate": 1.2321600891248478e-06, "loss": 0.87601876, "num_input_tokens_seen": 228300865, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.11126709, "step": 10589, "time_per_iteration": 2.5710654258728027 }, { "auxiliary_loss_clip": 0.06430405, "auxiliary_loss_mlp": 0.0126964, "balance_loss_clip": 0.06284733, "balance_loss_mlp": 0.01257958, "epoch": 0.6367052457537953, "flos": 25235413176960.0, "grad_norm": 1.9010565781186748, "language_loss": 0.67081523, "learning_rate": 1.231800487863257e-06, "loss": 0.74781567, "num_input_tokens_seen": 228320815, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.11700439, "step": 10590, "time_per_iteration": 2.5919549465179443 }, { "auxiliary_loss_clip": 0.06444085, "auxiliary_loss_mlp": 0.01269265, "balance_loss_clip": 0.06286692, "balance_loss_mlp": 0.01256909, "epoch": 0.6367653690064633, "flos": 19214482193280.0, "grad_norm": 1.723464686060364, "language_loss": 0.79274452, "learning_rate": 1.2314409157318685e-06, "loss": 0.86987805, "num_input_tokens_seen": 228339065, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.12353516, "step": 10591, "time_per_iteration": 2.535919189453125 }, { "auxiliary_loss_clip": 0.06425375, "auxiliary_loss_mlp": 0.01269452, "balance_loss_clip": 0.06280697, "balance_loss_mlp": 0.01259665, "epoch": 0.6368254922591312, "flos": 23553000172800.0, "grad_norm": 1.4011392403528589, "language_loss": 0.88993263, "learning_rate": 1.231081372744317e-06, "loss": 0.96688092, "num_input_tokens_seen": 228359210, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.09790039, "step": 10592, "time_per_iteration": 2.5936944484710693 }, { "auxiliary_loss_clip": 0.06424279, "auxiliary_loss_mlp": 0.01265942, "balance_loss_clip": 0.06280503, "balance_loss_mlp": 0.01256083, "epoch": 0.6368856155117992, "flos": 26474270993280.0, "grad_norm": 1.3674358632013068, "language_loss": 0.68855566, "learning_rate": 1.2307218589142376e-06, "loss": 0.76545787, "num_input_tokens_seen": 228379630, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.09857178, "step": 10593, "time_per_iteration": 2.604390859603882 }, { "auxiliary_loss_clip": 0.06426817, "auxiliary_loss_mlp": 0.01266222, "balance_loss_clip": 0.06280832, "balance_loss_mlp": 0.01255702, "epoch": 0.6369457387644671, "flos": 33700754995200.0, "grad_norm": 1.9104074046569237, "language_loss": 0.64217675, "learning_rate": 1.2303623742552618e-06, "loss": 0.71910715, "num_input_tokens_seen": 228401410, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10516357, "step": 10594, "time_per_iteration": 2.7608463764190674 }, { "auxiliary_loss_clip": 0.06323835, "auxiliary_loss_mlp": 0.01253715, "balance_loss_clip": 0.06264408, "balance_loss_mlp": 0.0125215, "epoch": 0.6370058620171352, "flos": 70929365316480.0, "grad_norm": 0.7582785567567671, "language_loss": 0.54593337, "learning_rate": 1.230002918781022e-06, "loss": 0.62170887, "num_input_tokens_seen": 228470335, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.01564789, "step": 10595, "time_per_iteration": 3.288522958755493 }, { "auxiliary_loss_clip": 0.06436656, "auxiliary_loss_mlp": 0.01267674, "balance_loss_clip": 0.06283928, "balance_loss_mlp": 0.01256015, "epoch": 0.6370659852698031, "flos": 21148267046400.0, "grad_norm": 2.171639303197717, "language_loss": 0.66982889, "learning_rate": 1.2296434925051493e-06, "loss": 0.74687219, "num_input_tokens_seen": 228490765, "router_z_loss_clip": 1.52734375, "router_z_loss_mlp": 0.11657715, "step": 10596, "time_per_iteration": 2.6075756549835205 }, { "auxiliary_loss_clip": 0.06426016, "auxiliary_loss_mlp": 0.01265576, "balance_loss_clip": 0.06278473, "balance_loss_mlp": 0.01254728, "epoch": 0.6371261085224711, "flos": 20199452538240.0, "grad_norm": 2.2467853583702206, "language_loss": 0.8009752, "learning_rate": 1.2292840954412718e-06, "loss": 0.87789112, "num_input_tokens_seen": 228509700, "router_z_loss_clip": 1.47558594, "router_z_loss_mlp": 0.10852051, "step": 10597, "time_per_iteration": 4.028800964355469 }, { "auxiliary_loss_clip": 0.06434211, "auxiliary_loss_mlp": 0.01266836, "balance_loss_clip": 0.06284574, "balance_loss_mlp": 0.01256369, "epoch": 0.637186231775139, "flos": 19689790878720.0, "grad_norm": 1.5958014459196512, "language_loss": 0.74960524, "learning_rate": 1.2289247276030189e-06, "loss": 0.82661569, "num_input_tokens_seen": 228529050, "router_z_loss_clip": 1.49707031, "router_z_loss_mlp": 0.10461426, "step": 10598, "time_per_iteration": 2.551914691925049 }, { "auxiliary_loss_clip": 0.06430621, "auxiliary_loss_mlp": 0.01264694, "balance_loss_clip": 0.06280857, "balance_loss_mlp": 0.01254162, "epoch": 0.637246355027807, "flos": 13074937355520.0, "grad_norm": 1.811188618711185, "language_loss": 0.68535841, "learning_rate": 1.2285653890040176e-06, "loss": 0.76231158, "num_input_tokens_seen": 228544665, "router_z_loss_clip": 1.49414062, "router_z_loss_mlp": 0.10534668, "step": 10599, "time_per_iteration": 2.634369373321533 }, { "auxiliary_loss_clip": 0.06437624, "auxiliary_loss_mlp": 0.01268703, "balance_loss_clip": 0.06284133, "balance_loss_mlp": 0.01256519, "epoch": 0.6373064782804749, "flos": 18228421745280.0, "grad_norm": 2.429233063924628, "language_loss": 0.80905229, "learning_rate": 1.2282060796578942e-06, "loss": 0.88611561, "num_input_tokens_seen": 228562060, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.12194824, "step": 10600, "time_per_iteration": 2.5414083003997803 }, { "auxiliary_loss_clip": 0.06425001, "auxiliary_loss_mlp": 0.01268229, "balance_loss_clip": 0.06278816, "balance_loss_mlp": 0.0125725, "epoch": 0.637366601533143, "flos": 24505336552320.0, "grad_norm": 1.384023245228792, "language_loss": 0.80012822, "learning_rate": 1.2278467995782732e-06, "loss": 0.87706053, "num_input_tokens_seen": 228582550, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.10986328, "step": 10601, "time_per_iteration": 2.573442220687866 }, { "auxiliary_loss_clip": 0.06435066, "auxiliary_loss_mlp": 0.01263058, "balance_loss_clip": 0.06283627, "balance_loss_mlp": 0.01251847, "epoch": 0.6374267247858109, "flos": 26366180826240.0, "grad_norm": 2.0010277242767596, "language_loss": 0.68079615, "learning_rate": 1.2274875487787797e-06, "loss": 0.75777745, "num_input_tokens_seen": 228604960, "router_z_loss_clip": 1.51464844, "router_z_loss_mlp": 0.11212158, "step": 10602, "time_per_iteration": 4.13379430770874 }, { "auxiliary_loss_clip": 0.06435864, "auxiliary_loss_mlp": 0.01267181, "balance_loss_clip": 0.06286264, "balance_loss_mlp": 0.0125585, "epoch": 0.6374868480384789, "flos": 20377254902400.0, "grad_norm": 1.5960096607865197, "language_loss": 0.79638785, "learning_rate": 1.2271283272730354e-06, "loss": 0.87341833, "num_input_tokens_seen": 228622195, "router_z_loss_clip": 1.49414062, "router_z_loss_mlp": 0.11340332, "step": 10603, "time_per_iteration": 2.5464861392974854 }, { "auxiliary_loss_clip": 0.06428112, "auxiliary_loss_mlp": 0.01266011, "balance_loss_clip": 0.06278642, "balance_loss_mlp": 0.0125483, "epoch": 0.6375469712911469, "flos": 21002595523200.0, "grad_norm": 1.9682028278144377, "language_loss": 0.77446032, "learning_rate": 1.2267691350746621e-06, "loss": 0.85140157, "num_input_tokens_seen": 228639735, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.11181641, "step": 10604, "time_per_iteration": 2.5375070571899414 }, { "auxiliary_loss_clip": 0.06436922, "auxiliary_loss_mlp": 0.01264068, "balance_loss_clip": 0.06283119, "balance_loss_mlp": 0.01252409, "epoch": 0.6376070945438148, "flos": 19721292814080.0, "grad_norm": 3.6509812565997293, "language_loss": 0.77051151, "learning_rate": 1.226409972197281e-06, "loss": 0.84752136, "num_input_tokens_seen": 228658195, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.11657715, "step": 10605, "time_per_iteration": 2.5273213386535645 }, { "auxiliary_loss_clip": 0.06434674, "auxiliary_loss_mlp": 0.01265832, "balance_loss_clip": 0.06283738, "balance_loss_mlp": 0.01253435, "epoch": 0.6376672177964828, "flos": 21513137650560.0, "grad_norm": 2.0616204232587694, "language_loss": 0.65876663, "learning_rate": 1.2260508386545106e-06, "loss": 0.73577166, "num_input_tokens_seen": 228677415, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.12390137, "step": 10606, "time_per_iteration": 2.557673215866089 }, { "auxiliary_loss_clip": 0.06427695, "auxiliary_loss_mlp": 0.01268903, "balance_loss_clip": 0.06284297, "balance_loss_mlp": 0.01258836, "epoch": 0.6377273410491507, "flos": 18849905078400.0, "grad_norm": 1.6121170930842494, "language_loss": 0.75459033, "learning_rate": 1.225691734459971e-06, "loss": 0.83155632, "num_input_tokens_seen": 228696450, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.10064697, "step": 10607, "time_per_iteration": 2.5365381240844727 }, { "auxiliary_loss_clip": 0.06434423, "auxiliary_loss_mlp": 0.01272105, "balance_loss_clip": 0.06284361, "balance_loss_mlp": 0.01261299, "epoch": 0.6377874643018188, "flos": 53073962749440.0, "grad_norm": 2.123824860352273, "language_loss": 0.65742803, "learning_rate": 1.225332659627278e-06, "loss": 0.73449326, "num_input_tokens_seen": 228721600, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.10809326, "step": 10608, "time_per_iteration": 2.8595693111419678 }, { "auxiliary_loss_clip": 0.06319228, "auxiliary_loss_mlp": 0.01252209, "balance_loss_clip": 0.06259675, "balance_loss_mlp": 0.01250588, "epoch": 0.6378475875544867, "flos": 65153349417600.0, "grad_norm": 0.7101789162825242, "language_loss": 0.51922309, "learning_rate": 1.2249736141700475e-06, "loss": 0.5949375, "num_input_tokens_seen": 228784535, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.01623535, "step": 10609, "time_per_iteration": 3.204436779022217 }, { "auxiliary_loss_clip": 0.06425613, "auxiliary_loss_mlp": 0.01266087, "balance_loss_clip": 0.06280744, "balance_loss_mlp": 0.0125633, "epoch": 0.6379077108071547, "flos": 23009404809600.0, "grad_norm": 1.6055521890520523, "language_loss": 0.75166368, "learning_rate": 1.2246145981018965e-06, "loss": 0.82858062, "num_input_tokens_seen": 228804110, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.09747314, "step": 10610, "time_per_iteration": 2.7491228580474854 }, { "auxiliary_loss_clip": 0.06316002, "auxiliary_loss_mlp": 0.01252894, "balance_loss_clip": 0.06256461, "balance_loss_mlp": 0.01251256, "epoch": 0.6379678340598226, "flos": 67624425849600.0, "grad_norm": 0.8305056498856627, "language_loss": 0.62666929, "learning_rate": 1.2242556114364364e-06, "loss": 0.70235825, "num_input_tokens_seen": 228867705, "router_z_loss_clip": 0.59521484, "router_z_loss_mlp": 0.0164032, "step": 10611, "time_per_iteration": 3.2472264766693115 }, { "auxiliary_loss_clip": 0.06432526, "auxiliary_loss_mlp": 0.01265437, "balance_loss_clip": 0.06281735, "balance_loss_mlp": 0.01254726, "epoch": 0.6380279573124906, "flos": 29687891109120.0, "grad_norm": 2.918111652763971, "language_loss": 0.72988117, "learning_rate": 1.223896654187282e-06, "loss": 0.8068608, "num_input_tokens_seen": 228889215, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.10705566, "step": 10612, "time_per_iteration": 4.117860555648804 }, { "auxiliary_loss_clip": 0.06315716, "auxiliary_loss_mlp": 0.01251756, "balance_loss_clip": 0.06256057, "balance_loss_mlp": 0.01250229, "epoch": 0.6380880805651585, "flos": 66502435680000.0, "grad_norm": 0.6985687082035584, "language_loss": 0.57680678, "learning_rate": 1.2235377263680446e-06, "loss": 0.65248144, "num_input_tokens_seen": 228948465, "router_z_loss_clip": 0.59667969, "router_z_loss_mlp": 0.01525879, "step": 10613, "time_per_iteration": 3.106326103210449 }, { "auxiliary_loss_clip": 0.06439404, "auxiliary_loss_mlp": 0.01266006, "balance_loss_clip": 0.06288019, "balance_loss_mlp": 0.0125455, "epoch": 0.6381482038178266, "flos": 23921811918720.0, "grad_norm": 3.695231918112729, "language_loss": 0.75621444, "learning_rate": 1.2231788279923334e-06, "loss": 0.83326852, "num_input_tokens_seen": 228967955, "router_z_loss_clip": 1.51367188, "router_z_loss_mlp": 0.11444092, "step": 10614, "time_per_iteration": 2.5800437927246094 }, { "auxiliary_loss_clip": 0.06428321, "auxiliary_loss_mlp": 0.01264896, "balance_loss_clip": 0.06280212, "balance_loss_mlp": 0.01253738, "epoch": 0.6382083270704945, "flos": 24249855853440.0, "grad_norm": 2.2836955863946224, "language_loss": 0.79769075, "learning_rate": 1.2228199590737599e-06, "loss": 0.87462294, "num_input_tokens_seen": 228985495, "router_z_loss_clip": 1.47949219, "router_z_loss_mlp": 0.1116333, "step": 10615, "time_per_iteration": 2.5883305072784424 }, { "auxiliary_loss_clip": 0.06317801, "auxiliary_loss_mlp": 0.0125005, "balance_loss_clip": 0.06258434, "balance_loss_mlp": 0.01248643, "epoch": 0.6382684503231625, "flos": 70798452111360.0, "grad_norm": 0.6393165724216033, "language_loss": 0.55116963, "learning_rate": 1.2224611196259305e-06, "loss": 0.6268481, "num_input_tokens_seen": 229052995, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.01406097, "step": 10616, "time_per_iteration": 3.258044481277466 }, { "auxiliary_loss_clip": 0.06428052, "auxiliary_loss_mlp": 0.01263819, "balance_loss_clip": 0.06278738, "balance_loss_mlp": 0.0125275, "epoch": 0.6383285735758305, "flos": 16550411080320.0, "grad_norm": 1.7353939595439747, "language_loss": 0.84892035, "learning_rate": 1.2221023096624538e-06, "loss": 0.92583913, "num_input_tokens_seen": 229071030, "router_z_loss_clip": 1.49414062, "router_z_loss_mlp": 0.11083984, "step": 10617, "time_per_iteration": 2.526029348373413 }, { "auxiliary_loss_clip": 0.06428477, "auxiliary_loss_mlp": 0.01264378, "balance_loss_clip": 0.06279159, "balance_loss_mlp": 0.01253363, "epoch": 0.6383886968284984, "flos": 14432702515200.0, "grad_norm": 1.9692831499754615, "language_loss": 0.87184572, "learning_rate": 1.221743529196936e-06, "loss": 0.94877422, "num_input_tokens_seen": 229088275, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.11010742, "step": 10618, "time_per_iteration": 2.5866904258728027 }, { "auxiliary_loss_clip": 0.0643021, "auxiliary_loss_mlp": 0.01264786, "balance_loss_clip": 0.062793, "balance_loss_mlp": 0.01254021, "epoch": 0.6384488200811664, "flos": 17935191982080.0, "grad_norm": 1.5958007904855627, "language_loss": 0.73284769, "learning_rate": 1.2213847782429806e-06, "loss": 0.80979764, "num_input_tokens_seen": 229105190, "router_z_loss_clip": 1.50878906, "router_z_loss_mlp": 0.10766602, "step": 10619, "time_per_iteration": 2.535822629928589 }, { "auxiliary_loss_clip": 0.06438285, "auxiliary_loss_mlp": 0.0127255, "balance_loss_clip": 0.06282698, "balance_loss_mlp": 0.01260444, "epoch": 0.6385089433338343, "flos": 18521567654400.0, "grad_norm": 1.9812093714667307, "language_loss": 0.76711094, "learning_rate": 1.221026056814193e-06, "loss": 0.84421933, "num_input_tokens_seen": 229122290, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.12091064, "step": 10620, "time_per_iteration": 2.547421455383301 }, { "auxiliary_loss_clip": 0.06432515, "auxiliary_loss_mlp": 0.01267535, "balance_loss_clip": 0.0628427, "balance_loss_mlp": 0.01256777, "epoch": 0.6385690665865024, "flos": 24760481834880.0, "grad_norm": 2.4423032843452557, "language_loss": 0.70988667, "learning_rate": 1.2206673649241752e-06, "loss": 0.78688717, "num_input_tokens_seen": 229141620, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.10760498, "step": 10621, "time_per_iteration": 2.5875084400177 }, { "auxiliary_loss_clip": 0.06424414, "auxiliary_loss_mlp": 0.01266147, "balance_loss_clip": 0.06281885, "balance_loss_mlp": 0.01256533, "epoch": 0.6386291898391703, "flos": 20126763521280.0, "grad_norm": 1.520983323852965, "language_loss": 0.77975452, "learning_rate": 1.220308702586529e-06, "loss": 0.85666013, "num_input_tokens_seen": 229161570, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.09606934, "step": 10622, "time_per_iteration": 2.568692207336426 }, { "auxiliary_loss_clip": 0.06423751, "auxiliary_loss_mlp": 0.01263797, "balance_loss_clip": 0.06280123, "balance_loss_mlp": 0.01253539, "epoch": 0.6386893130918383, "flos": 16871914396800.0, "grad_norm": 2.0305078839297797, "language_loss": 0.74827975, "learning_rate": 1.2199500698148546e-06, "loss": 0.82515526, "num_input_tokens_seen": 229178465, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.1026001, "step": 10623, "time_per_iteration": 2.5186522006988525 }, { "auxiliary_loss_clip": 0.06425283, "auxiliary_loss_mlp": 0.01266277, "balance_loss_clip": 0.06280176, "balance_loss_mlp": 0.01256591, "epoch": 0.6387494363445062, "flos": 22972913556480.0, "grad_norm": 1.3569731921792876, "language_loss": 0.76969153, "learning_rate": 1.2195914666227527e-06, "loss": 0.84660709, "num_input_tokens_seen": 229198975, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.09692383, "step": 10624, "time_per_iteration": 2.5917375087738037 }, { "auxiliary_loss_clip": 0.06432186, "auxiliary_loss_mlp": 0.01264092, "balance_loss_clip": 0.06283917, "balance_loss_mlp": 0.01253507, "epoch": 0.6388095595971742, "flos": 22864487973120.0, "grad_norm": 1.576169536928284, "language_loss": 0.80754292, "learning_rate": 1.21923289302382e-06, "loss": 0.88450569, "num_input_tokens_seen": 229218825, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.10571289, "step": 10625, "time_per_iteration": 2.5889992713928223 }, { "auxiliary_loss_clip": 0.0642942, "auxiliary_loss_mlp": 0.01267003, "balance_loss_clip": 0.06280909, "balance_loss_mlp": 0.01255541, "epoch": 0.6388696828498421, "flos": 17317314374400.0, "grad_norm": 1.8267298849050166, "language_loss": 0.73299134, "learning_rate": 1.218874349031654e-06, "loss": 0.80995554, "num_input_tokens_seen": 229236060, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.11456299, "step": 10626, "time_per_iteration": 2.564086675643921 }, { "auxiliary_loss_clip": 0.06424983, "auxiliary_loss_mlp": 0.01268129, "balance_loss_clip": 0.06276096, "balance_loss_mlp": 0.01257185, "epoch": 0.6389298061025102, "flos": 17134313057280.0, "grad_norm": 1.5365079376739146, "language_loss": 0.73521382, "learning_rate": 1.2185158346598517e-06, "loss": 0.81214494, "num_input_tokens_seen": 229255160, "router_z_loss_clip": 1.48730469, "router_z_loss_mlp": 0.109375, "step": 10627, "time_per_iteration": 2.5431528091430664 }, { "auxiliary_loss_clip": 0.06436984, "auxiliary_loss_mlp": 0.01267772, "balance_loss_clip": 0.06281514, "balance_loss_mlp": 0.01255702, "epoch": 0.6389899293551781, "flos": 27718663178880.0, "grad_norm": 1.7389021429866482, "language_loss": 0.67352498, "learning_rate": 1.2181573499220064e-06, "loss": 0.7505725, "num_input_tokens_seen": 229278705, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.12078857, "step": 10628, "time_per_iteration": 2.623213768005371 }, { "auxiliary_loss_clip": 0.06425631, "auxiliary_loss_mlp": 0.01263796, "balance_loss_clip": 0.06282821, "balance_loss_mlp": 0.01254044, "epoch": 0.6390500526078461, "flos": 21222171947520.0, "grad_norm": 1.7915223598587278, "language_loss": 0.68402994, "learning_rate": 1.2177988948317135e-06, "loss": 0.76092416, "num_input_tokens_seen": 229299990, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.09753418, "step": 10629, "time_per_iteration": 2.5962440967559814 }, { "auxiliary_loss_clip": 0.06441581, "auxiliary_loss_mlp": 0.01272811, "balance_loss_clip": 0.06285954, "balance_loss_mlp": 0.01258929, "epoch": 0.6391101758605141, "flos": 21587671457280.0, "grad_norm": 1.4620131056067538, "language_loss": 0.75558722, "learning_rate": 1.2174404694025646e-06, "loss": 0.83273119, "num_input_tokens_seen": 229319230, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.13873291, "step": 10630, "time_per_iteration": 2.632915735244751 }, { "auxiliary_loss_clip": 0.06424163, "auxiliary_loss_mlp": 0.01267091, "balance_loss_clip": 0.06277961, "balance_loss_mlp": 0.01256767, "epoch": 0.639170299113182, "flos": 19906432410240.0, "grad_norm": 1.8609870616900943, "language_loss": 0.70699871, "learning_rate": 1.2170820736481511e-06, "loss": 0.78391123, "num_input_tokens_seen": 229338600, "router_z_loss_clip": 1.46289062, "router_z_loss_mlp": 0.10327148, "step": 10631, "time_per_iteration": 2.571587324142456 }, { "auxiliary_loss_clip": 0.06327029, "auxiliary_loss_mlp": 0.01252676, "balance_loss_clip": 0.0626712, "balance_loss_mlp": 0.01251339, "epoch": 0.63923042236585, "flos": 69896625344640.0, "grad_norm": 0.7552992619397757, "language_loss": 0.62711316, "learning_rate": 1.2167237075820646e-06, "loss": 0.70291024, "num_input_tokens_seen": 229402420, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.01338196, "step": 10632, "time_per_iteration": 3.233680248260498 }, { "auxiliary_loss_clip": 0.06424945, "auxiliary_loss_mlp": 0.01266812, "balance_loss_clip": 0.06280006, "balance_loss_mlp": 0.01256155, "epoch": 0.639290545618518, "flos": 22681486656000.0, "grad_norm": 2.0287059802114427, "language_loss": 0.66994655, "learning_rate": 1.216365371217893e-06, "loss": 0.74686414, "num_input_tokens_seen": 229419185, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.10662842, "step": 10633, "time_per_iteration": 2.609703302383423 }, { "auxiliary_loss_clip": 0.06428959, "auxiliary_loss_mlp": 0.01265531, "balance_loss_clip": 0.06281558, "balance_loss_mlp": 0.01254701, "epoch": 0.639350668871186, "flos": 19835420474880.0, "grad_norm": 2.202008515220858, "language_loss": 0.82113492, "learning_rate": 1.216007064569225e-06, "loss": 0.89807981, "num_input_tokens_seen": 229436735, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.1083374, "step": 10634, "time_per_iteration": 2.5789859294891357 }, { "auxiliary_loss_clip": 0.06425737, "auxiliary_loss_mlp": 0.01268427, "balance_loss_clip": 0.06278261, "balance_loss_mlp": 0.01257418, "epoch": 0.6394107921238539, "flos": 20558746846080.0, "grad_norm": 1.5079405084363562, "language_loss": 0.74927825, "learning_rate": 1.2156487876496483e-06, "loss": 0.82621992, "num_input_tokens_seen": 229455595, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.11016846, "step": 10635, "time_per_iteration": 2.5434975624084473 }, { "auxiliary_loss_clip": 0.06426277, "auxiliary_loss_mlp": 0.01263959, "balance_loss_clip": 0.06278841, "balance_loss_mlp": 0.01253928, "epoch": 0.6394709153765219, "flos": 25781985360000.0, "grad_norm": 2.5244341429660864, "language_loss": 0.71415794, "learning_rate": 1.2152905404727475e-06, "loss": 0.79106033, "num_input_tokens_seen": 229476230, "router_z_loss_clip": 1.47558594, "router_z_loss_mlp": 0.1003418, "step": 10636, "time_per_iteration": 2.5825276374816895 }, { "auxiliary_loss_clip": 0.0643447, "auxiliary_loss_mlp": 0.01266682, "balance_loss_clip": 0.06282707, "balance_loss_mlp": 0.01255357, "epoch": 0.6395310386291898, "flos": 17535926476800.0, "grad_norm": 2.092612304037431, "language_loss": 0.74789321, "learning_rate": 1.2149323230521085e-06, "loss": 0.82490474, "num_input_tokens_seen": 229494300, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.11322021, "step": 10637, "time_per_iteration": 3.98069429397583 }, { "auxiliary_loss_clip": 0.06435251, "auxiliary_loss_mlp": 0.01265443, "balance_loss_clip": 0.06284334, "balance_loss_mlp": 0.01253838, "epoch": 0.6395911618818578, "flos": 18594172817280.0, "grad_norm": 1.7664830409568413, "language_loss": 0.78285497, "learning_rate": 1.2145741354013143e-06, "loss": 0.85986191, "num_input_tokens_seen": 229512985, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.11602783, "step": 10638, "time_per_iteration": 2.5382800102233887 }, { "auxiliary_loss_clip": 0.0642694, "auxiliary_loss_mlp": 0.0126545, "balance_loss_clip": 0.06279215, "balance_loss_mlp": 0.01254524, "epoch": 0.6396512851345257, "flos": 28374164069760.0, "grad_norm": 2.1067808420091847, "language_loss": 0.82025778, "learning_rate": 1.2142159775339478e-06, "loss": 0.89718175, "num_input_tokens_seen": 229534270, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.10925293, "step": 10639, "time_per_iteration": 2.6090362071990967 }, { "auxiliary_loss_clip": 0.06319734, "auxiliary_loss_mlp": 0.012505, "balance_loss_clip": 0.06260362, "balance_loss_mlp": 0.01248889, "epoch": 0.6397114083871938, "flos": 70744728844800.0, "grad_norm": 0.8097763471776674, "language_loss": 0.5889926, "learning_rate": 1.21385784946359e-06, "loss": 0.66469491, "num_input_tokens_seen": 229596455, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.01612854, "step": 10640, "time_per_iteration": 3.117521047592163 }, { "auxiliary_loss_clip": 0.06424303, "auxiliary_loss_mlp": 0.01264878, "balance_loss_clip": 0.06280248, "balance_loss_mlp": 0.01255049, "epoch": 0.6397715316398617, "flos": 18147095758080.0, "grad_norm": 1.9096332350604788, "language_loss": 0.78307807, "learning_rate": 1.2134997512038215e-06, "loss": 0.85996985, "num_input_tokens_seen": 229612860, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.09838867, "step": 10641, "time_per_iteration": 4.000844478607178 }, { "auxiliary_loss_clip": 0.06439188, "auxiliary_loss_mlp": 0.01268557, "balance_loss_clip": 0.06282839, "balance_loss_mlp": 0.01257178, "epoch": 0.6398316548925297, "flos": 25746668064000.0, "grad_norm": 1.5144430038197816, "language_loss": 0.63665211, "learning_rate": 1.2131416827682209e-06, "loss": 0.71372962, "num_input_tokens_seen": 229633960, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.1137085, "step": 10642, "time_per_iteration": 2.6638925075531006 }, { "auxiliary_loss_clip": 0.063254, "auxiliary_loss_mlp": 0.01250814, "balance_loss_clip": 0.06266002, "balance_loss_mlp": 0.0124918, "epoch": 0.6398917781451977, "flos": 71231246778240.0, "grad_norm": 0.8861453704194892, "language_loss": 0.55894053, "learning_rate": 1.2127836441703667e-06, "loss": 0.63470268, "num_input_tokens_seen": 229686730, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.01635742, "step": 10643, "time_per_iteration": 3.110440254211426 }, { "auxiliary_loss_clip": 0.06431881, "auxiliary_loss_mlp": 0.01266627, "balance_loss_clip": 0.06279489, "balance_loss_mlp": 0.01255171, "epoch": 0.6399519013978656, "flos": 20528083451520.0, "grad_norm": 2.7638616668158518, "language_loss": 0.77282822, "learning_rate": 1.2124256354238358e-06, "loss": 0.84981328, "num_input_tokens_seen": 229704800, "router_z_loss_clip": 1.52441406, "router_z_loss_mlp": 0.11462402, "step": 10644, "time_per_iteration": 2.556943655014038 }, { "auxiliary_loss_clip": 0.06427975, "auxiliary_loss_mlp": 0.01267421, "balance_loss_clip": 0.06282103, "balance_loss_mlp": 0.01256787, "epoch": 0.6400120246505336, "flos": 24467503633920.0, "grad_norm": 5.857328134743548, "language_loss": 0.82493627, "learning_rate": 1.212067656542203e-06, "loss": 0.90189028, "num_input_tokens_seen": 229725265, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.10644531, "step": 10645, "time_per_iteration": 2.6019883155822754 }, { "auxiliary_loss_clip": 0.06439423, "auxiliary_loss_mlp": 0.01266556, "balance_loss_clip": 0.06285091, "balance_loss_mlp": 0.01254457, "epoch": 0.6400721479032015, "flos": 28373619018240.0, "grad_norm": 1.8070985446838062, "language_loss": 0.73658848, "learning_rate": 1.2117097075390447e-06, "loss": 0.81364822, "num_input_tokens_seen": 229744840, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.12091064, "step": 10646, "time_per_iteration": 2.6150670051574707 }, { "auxiliary_loss_clip": 0.0643633, "auxiliary_loss_mlp": 0.01269037, "balance_loss_clip": 0.06288058, "balance_loss_mlp": 0.01257551, "epoch": 0.6401322711558696, "flos": 17821441664640.0, "grad_norm": 2.4233737925470904, "language_loss": 0.80475634, "learning_rate": 1.2113517884279327e-06, "loss": 0.88181001, "num_input_tokens_seen": 229759095, "router_z_loss_clip": 1.48339844, "router_z_loss_mlp": 0.1149292, "step": 10647, "time_per_iteration": 2.5369088649749756 }, { "auxiliary_loss_clip": 0.06433135, "auxiliary_loss_mlp": 0.0126454, "balance_loss_clip": 0.06288819, "balance_loss_mlp": 0.01254115, "epoch": 0.6401923944085375, "flos": 26037969183360.0, "grad_norm": 1.4602999990070895, "language_loss": 0.76156437, "learning_rate": 1.2109938992224399e-06, "loss": 0.83854115, "num_input_tokens_seen": 229777750, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10424805, "step": 10648, "time_per_iteration": 2.60959792137146 }, { "auxiliary_loss_clip": 0.06428028, "auxiliary_loss_mlp": 0.01263177, "balance_loss_clip": 0.06278893, "balance_loss_mlp": 0.01252436, "epoch": 0.6402525176612055, "flos": 23593181005440.0, "grad_norm": 1.8983399508674168, "language_loss": 0.78713036, "learning_rate": 1.210636039936138e-06, "loss": 0.8640424, "num_input_tokens_seen": 229796785, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.1072998, "step": 10649, "time_per_iteration": 2.5787549018859863 }, { "auxiliary_loss_clip": 0.06432074, "auxiliary_loss_mlp": 0.01267804, "balance_loss_clip": 0.06283078, "balance_loss_mlp": 0.0125698, "epoch": 0.6403126409138734, "flos": 18047349072000.0, "grad_norm": 1.6794742477847096, "language_loss": 0.76380163, "learning_rate": 1.2102782105825956e-06, "loss": 0.8408004, "num_input_tokens_seen": 229815425, "router_z_loss_clip": 1.49121094, "router_z_loss_mlp": 0.10821533, "step": 10650, "time_per_iteration": 2.5491936206817627 }, { "auxiliary_loss_clip": 0.06425148, "auxiliary_loss_mlp": 0.01267958, "balance_loss_clip": 0.06279269, "balance_loss_mlp": 0.01256758, "epoch": 0.6403727641665414, "flos": 21985679151360.0, "grad_norm": 1.5048391394591776, "language_loss": 0.71070218, "learning_rate": 1.2099204111753833e-06, "loss": 0.78763324, "num_input_tokens_seen": 229834545, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.11193848, "step": 10651, "time_per_iteration": 4.0498127937316895 }, { "auxiliary_loss_clip": 0.06436402, "auxiliary_loss_mlp": 0.01270989, "balance_loss_clip": 0.06287646, "balance_loss_mlp": 0.01259205, "epoch": 0.6404328874192093, "flos": 24901751018880.0, "grad_norm": 2.277314188658736, "language_loss": 0.64584684, "learning_rate": 1.2095626417280684e-06, "loss": 0.72292072, "num_input_tokens_seen": 229849175, "router_z_loss_clip": 1.48632812, "router_z_loss_mlp": 0.11785889, "step": 10652, "time_per_iteration": 3.9938206672668457 }, { "auxiliary_loss_clip": 0.06432699, "auxiliary_loss_mlp": 0.01265816, "balance_loss_clip": 0.06285962, "balance_loss_mlp": 0.01255642, "epoch": 0.6404930106718774, "flos": 17601991021440.0, "grad_norm": 2.0402155840070884, "language_loss": 0.79714596, "learning_rate": 1.2092049022542168e-06, "loss": 0.87413108, "num_input_tokens_seen": 229865400, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.10174561, "step": 10653, "time_per_iteration": 2.5333850383758545 }, { "auxiliary_loss_clip": 0.06447975, "auxiliary_loss_mlp": 0.012709, "balance_loss_clip": 0.0628842, "balance_loss_mlp": 0.01258097, "epoch": 0.6405531339245453, "flos": 20164219096320.0, "grad_norm": 2.2187418300060577, "language_loss": 0.7107206, "learning_rate": 1.2088471927673952e-06, "loss": 0.78790939, "num_input_tokens_seen": 229882945, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.12786865, "step": 10654, "time_per_iteration": 2.607825994491577 }, { "auxiliary_loss_clip": 0.06438583, "auxiliary_loss_mlp": 0.01267052, "balance_loss_clip": 0.06283849, "balance_loss_mlp": 0.01253808, "epoch": 0.6406132571772133, "flos": 21948349357440.0, "grad_norm": 3.3270173840385504, "language_loss": 0.72804117, "learning_rate": 1.2084895132811666e-06, "loss": 0.80509752, "num_input_tokens_seen": 229901590, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.13238525, "step": 10655, "time_per_iteration": 2.6389408111572266 }, { "auxiliary_loss_clip": 0.0643054, "auxiliary_loss_mlp": 0.01269207, "balance_loss_clip": 0.06280135, "balance_loss_mlp": 0.01257423, "epoch": 0.6406733804298813, "flos": 28775693635200.0, "grad_norm": 1.6914771688593853, "language_loss": 0.82920623, "learning_rate": 1.2081318638090952e-06, "loss": 0.90620369, "num_input_tokens_seen": 229922535, "router_z_loss_clip": 1.50488281, "router_z_loss_mlp": 0.11791992, "step": 10656, "time_per_iteration": 2.8547608852386475 }, { "auxiliary_loss_clip": 0.06429501, "auxiliary_loss_mlp": 0.0127034, "balance_loss_clip": 0.0628078, "balance_loss_mlp": 0.01259659, "epoch": 0.6407335036825492, "flos": 17462943970560.0, "grad_norm": 2.2165248910042537, "language_loss": 0.72391629, "learning_rate": 1.2077742443647433e-06, "loss": 0.80091465, "num_input_tokens_seen": 229939575, "router_z_loss_clip": 1.48632812, "router_z_loss_mlp": 0.10675049, "step": 10657, "time_per_iteration": 2.5714237689971924 }, { "auxiliary_loss_clip": 0.06432055, "auxiliary_loss_mlp": 0.01272658, "balance_loss_clip": 0.06283229, "balance_loss_mlp": 0.01262072, "epoch": 0.6407936269352172, "flos": 22131476455680.0, "grad_norm": 1.626552738343333, "language_loss": 0.77626562, "learning_rate": 1.2074166549616707e-06, "loss": 0.85331279, "num_input_tokens_seen": 229958840, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.105896, "step": 10658, "time_per_iteration": 2.586944103240967 }, { "auxiliary_loss_clip": 0.06434505, "auxiliary_loss_mlp": 0.01267216, "balance_loss_clip": 0.06282955, "balance_loss_mlp": 0.01255808, "epoch": 0.6408537501878852, "flos": 23117033779200.0, "grad_norm": 1.7899772631654596, "language_loss": 0.76363993, "learning_rate": 1.2070590956134386e-06, "loss": 0.84065711, "num_input_tokens_seen": 229979680, "router_z_loss_clip": 1.51367188, "router_z_loss_mlp": 0.11413574, "step": 10659, "time_per_iteration": 2.575042247772217 }, { "auxiliary_loss_clip": 0.06430637, "auxiliary_loss_mlp": 0.01265245, "balance_loss_clip": 0.06280008, "balance_loss_mlp": 0.01254468, "epoch": 0.6409138734405532, "flos": 16478099406720.0, "grad_norm": 1.7902585673532425, "language_loss": 0.77965045, "learning_rate": 1.2067015663336046e-06, "loss": 0.85660923, "num_input_tokens_seen": 229996830, "router_z_loss_clip": 1.50683594, "router_z_loss_mlp": 0.10784912, "step": 10660, "time_per_iteration": 2.5471928119659424 }, { "auxiliary_loss_clip": 0.06437838, "auxiliary_loss_mlp": 0.01267474, "balance_loss_clip": 0.06283043, "balance_loss_mlp": 0.01254695, "epoch": 0.6409739966932211, "flos": 22783539329280.0, "grad_norm": 1.6440781330961598, "language_loss": 0.68645674, "learning_rate": 1.206344067135727e-06, "loss": 0.76350987, "num_input_tokens_seen": 230015115, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.12799072, "step": 10661, "time_per_iteration": 2.548882007598877 }, { "auxiliary_loss_clip": 0.06429113, "auxiliary_loss_mlp": 0.01271393, "balance_loss_clip": 0.06283474, "balance_loss_mlp": 0.01261039, "epoch": 0.6410341199458891, "flos": 25158489528960.0, "grad_norm": 1.455142715096195, "language_loss": 0.75787103, "learning_rate": 1.205986598033362e-06, "loss": 0.83487606, "num_input_tokens_seen": 230035515, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.1036377, "step": 10662, "time_per_iteration": 2.605786085128784 }, { "auxiliary_loss_clip": 0.06429172, "auxiliary_loss_mlp": 0.01265152, "balance_loss_clip": 0.06280027, "balance_loss_mlp": 0.01255067, "epoch": 0.641094243198557, "flos": 27052428965760.0, "grad_norm": 1.898577259091684, "language_loss": 0.69757193, "learning_rate": 1.2056291590400644e-06, "loss": 0.77451515, "num_input_tokens_seen": 230054355, "router_z_loss_clip": 1.48925781, "router_z_loss_mlp": 0.10089111, "step": 10663, "time_per_iteration": 2.5938589572906494 }, { "auxiliary_loss_clip": 0.06431584, "auxiliary_loss_mlp": 0.01270187, "balance_loss_clip": 0.06282148, "balance_loss_mlp": 0.01257545, "epoch": 0.641154366451225, "flos": 25381629751680.0, "grad_norm": 1.8932375768769294, "language_loss": 0.68332028, "learning_rate": 1.205271750169389e-06, "loss": 0.76033807, "num_input_tokens_seen": 230074605, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.12652588, "step": 10664, "time_per_iteration": 2.5920255184173584 }, { "auxiliary_loss_clip": 0.06426646, "auxiliary_loss_mlp": 0.01267564, "balance_loss_clip": 0.06280567, "balance_loss_mlp": 0.01257186, "epoch": 0.6412144897038929, "flos": 25159998902400.0, "grad_norm": 1.7050563306445952, "language_loss": 0.66549134, "learning_rate": 1.2049143714348881e-06, "loss": 0.74243343, "num_input_tokens_seen": 230093820, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10375977, "step": 10665, "time_per_iteration": 2.572622060775757 }, { "auxiliary_loss_clip": 0.06422973, "auxiliary_loss_mlp": 0.0126405, "balance_loss_clip": 0.06277274, "balance_loss_mlp": 0.01254019, "epoch": 0.641274612956561, "flos": 23447509482240.0, "grad_norm": 1.7549800596220566, "language_loss": 0.64474744, "learning_rate": 1.2045570228501145e-06, "loss": 0.7216177, "num_input_tokens_seen": 230114285, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.10021973, "step": 10666, "time_per_iteration": 2.577061891555786 }, { "auxiliary_loss_clip": 0.06428948, "auxiliary_loss_mlp": 0.0127013, "balance_loss_clip": 0.06279182, "balance_loss_mlp": 0.01258883, "epoch": 0.6413347362092289, "flos": 19433597420160.0, "grad_norm": 2.2162957280783524, "language_loss": 0.71091747, "learning_rate": 1.2041997044286176e-06, "loss": 0.78790826, "num_input_tokens_seen": 230132760, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.11242676, "step": 10667, "time_per_iteration": 2.525813102722168 }, { "auxiliary_loss_clip": 0.06443086, "auxiliary_loss_mlp": 0.01270566, "balance_loss_clip": 0.06281675, "balance_loss_mlp": 0.01257143, "epoch": 0.6413948594618969, "flos": 17201425777920.0, "grad_norm": 2.019020410872063, "language_loss": 0.7801857, "learning_rate": 1.2038424161839484e-06, "loss": 0.85732222, "num_input_tokens_seen": 230149690, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.13433838, "step": 10668, "time_per_iteration": 2.524702548980713 }, { "auxiliary_loss_clip": 0.06425353, "auxiliary_loss_mlp": 0.01270436, "balance_loss_clip": 0.06278259, "balance_loss_mlp": 0.0125966, "epoch": 0.6414549827145648, "flos": 22275764386560.0, "grad_norm": 2.086904256772596, "language_loss": 0.67763913, "learning_rate": 1.2034851581296544e-06, "loss": 0.75459701, "num_input_tokens_seen": 230166950, "router_z_loss_clip": 1.46972656, "router_z_loss_mlp": 0.10784912, "step": 10669, "time_per_iteration": 2.5888831615448 }, { "auxiliary_loss_clip": 0.06436807, "auxiliary_loss_mlp": 0.01267278, "balance_loss_clip": 0.06280968, "balance_loss_mlp": 0.01255637, "epoch": 0.6415151059672328, "flos": 19645291560960.0, "grad_norm": 2.0155800811848263, "language_loss": 0.78930038, "learning_rate": 1.2031279302792825e-06, "loss": 0.86634123, "num_input_tokens_seen": 230184785, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.11651611, "step": 10670, "time_per_iteration": 2.5519464015960693 }, { "auxiliary_loss_clip": 0.0643262, "auxiliary_loss_mlp": 0.01264713, "balance_loss_clip": 0.06280431, "balance_loss_mlp": 0.01253793, "epoch": 0.6415752292199008, "flos": 14871016823040.0, "grad_norm": 2.7629685833717144, "language_loss": 0.88865417, "learning_rate": 1.20277073264638e-06, "loss": 0.96562749, "num_input_tokens_seen": 230201385, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.10925293, "step": 10671, "time_per_iteration": 2.5306177139282227 }, { "auxiliary_loss_clip": 0.06424683, "auxiliary_loss_mlp": 0.01263659, "balance_loss_clip": 0.06280936, "balance_loss_mlp": 0.01253741, "epoch": 0.6416353524725688, "flos": 13740710371200.0, "grad_norm": 1.4552459039797117, "language_loss": 0.69802463, "learning_rate": 1.2024135652444907e-06, "loss": 0.77490807, "num_input_tokens_seen": 230220380, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.09924316, "step": 10672, "time_per_iteration": 2.5585927963256836 }, { "auxiliary_loss_clip": 0.06431687, "auxiliary_loss_mlp": 0.01266509, "balance_loss_clip": 0.06278154, "balance_loss_mlp": 0.01253503, "epoch": 0.6416954757252368, "flos": 24541785878400.0, "grad_norm": 2.0499347010499616, "language_loss": 0.7495448, "learning_rate": 1.2020564280871593e-06, "loss": 0.82652676, "num_input_tokens_seen": 230239845, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.13012695, "step": 10673, "time_per_iteration": 2.593965768814087 }, { "auxiliary_loss_clip": 0.06425727, "auxiliary_loss_mlp": 0.01270857, "balance_loss_clip": 0.06277114, "balance_loss_mlp": 0.01259538, "epoch": 0.6417555989779047, "flos": 27717531148800.0, "grad_norm": 1.6450969979513619, "language_loss": 0.6958859, "learning_rate": 1.2016993211879283e-06, "loss": 0.77285182, "num_input_tokens_seen": 230262420, "router_z_loss_clip": 1.48339844, "router_z_loss_mlp": 0.11315918, "step": 10674, "time_per_iteration": 2.6088643074035645 }, { "auxiliary_loss_clip": 0.06434922, "auxiliary_loss_mlp": 0.01263745, "balance_loss_clip": 0.06279952, "balance_loss_mlp": 0.01252659, "epoch": 0.6418157222305727, "flos": 20562604133760.0, "grad_norm": 1.9005099388694222, "language_loss": 0.66968518, "learning_rate": 1.201342244560338e-06, "loss": 0.74667186, "num_input_tokens_seen": 230279950, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.11090088, "step": 10675, "time_per_iteration": 2.5933022499084473 }, { "auxiliary_loss_clip": 0.06429194, "auxiliary_loss_mlp": 0.01268648, "balance_loss_clip": 0.06281987, "balance_loss_mlp": 0.0125836, "epoch": 0.6418758454832406, "flos": 22608126806400.0, "grad_norm": 1.7473236097478255, "language_loss": 0.66851437, "learning_rate": 1.2009851982179307e-06, "loss": 0.74549282, "num_input_tokens_seen": 230299705, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.10296631, "step": 10676, "time_per_iteration": 4.0962815284729 }, { "auxiliary_loss_clip": 0.06430988, "auxiliary_loss_mlp": 0.01266025, "balance_loss_clip": 0.06281067, "balance_loss_mlp": 0.01254134, "epoch": 0.6419359687359086, "flos": 27381479149440.0, "grad_norm": 5.630518764062397, "language_loss": 0.75482482, "learning_rate": 1.2006281821742446e-06, "loss": 0.83179486, "num_input_tokens_seen": 230320030, "router_z_loss_clip": 1.49902344, "router_z_loss_mlp": 0.11895752, "step": 10677, "time_per_iteration": 2.6227774620056152 }, { "auxiliary_loss_clip": 0.06323425, "auxiliary_loss_mlp": 0.01254133, "balance_loss_clip": 0.06263901, "balance_loss_mlp": 0.01252895, "epoch": 0.6419960919885765, "flos": 67270722566400.0, "grad_norm": 0.7401172047316277, "language_loss": 0.60645533, "learning_rate": 1.200271196442818e-06, "loss": 0.68223095, "num_input_tokens_seen": 230381495, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.01236725, "step": 10678, "time_per_iteration": 3.255261182785034 }, { "auxiliary_loss_clip": 0.06425272, "auxiliary_loss_mlp": 0.01266651, "balance_loss_clip": 0.06280675, "balance_loss_mlp": 0.01256709, "epoch": 0.6420562152412446, "flos": 19908067564800.0, "grad_norm": 4.975064591533623, "language_loss": 0.67561126, "learning_rate": 1.1999142410371875e-06, "loss": 0.75253052, "num_input_tokens_seen": 230401385, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.0994873, "step": 10679, "time_per_iteration": 2.5551652908325195 }, { "auxiliary_loss_clip": 0.06430416, "auxiliary_loss_mlp": 0.01263477, "balance_loss_clip": 0.06279515, "balance_loss_mlp": 0.01252659, "epoch": 0.6421163384939125, "flos": 24797056942080.0, "grad_norm": 1.7258083174934455, "language_loss": 0.7409519, "learning_rate": 1.1995573159708897e-06, "loss": 0.81789082, "num_input_tokens_seen": 230421340, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.10827637, "step": 10680, "time_per_iteration": 2.689000129699707 }, { "auxiliary_loss_clip": 0.06431224, "auxiliary_loss_mlp": 0.01264879, "balance_loss_clip": 0.06283467, "balance_loss_mlp": 0.01255044, "epoch": 0.6421764617465805, "flos": 25599822583680.0, "grad_norm": 1.7941985337920718, "language_loss": 0.68571067, "learning_rate": 1.1992004212574582e-06, "loss": 0.76267171, "num_input_tokens_seen": 230441270, "router_z_loss_clip": 1.47558594, "router_z_loss_mlp": 0.09838867, "step": 10681, "time_per_iteration": 4.017273902893066 }, { "auxiliary_loss_clip": 0.06429654, "auxiliary_loss_mlp": 0.01265233, "balance_loss_clip": 0.06282148, "balance_loss_mlp": 0.01254212, "epoch": 0.6422365849992484, "flos": 14139556606080.0, "grad_norm": 1.664027566766372, "language_loss": 0.75055981, "learning_rate": 1.198843556910427e-06, "loss": 0.82750863, "num_input_tokens_seen": 230457455, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.11016846, "step": 10682, "time_per_iteration": 2.519432306289673 }, { "auxiliary_loss_clip": 0.06418924, "auxiliary_loss_mlp": 0.0126891, "balance_loss_clip": 0.06278874, "balance_loss_mlp": 0.01258908, "epoch": 0.6422967082519164, "flos": 22390688661120.0, "grad_norm": 1.4228789337894467, "language_loss": 0.79429227, "learning_rate": 1.1984867229433287e-06, "loss": 0.87117064, "num_input_tokens_seen": 230478955, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.10003662, "step": 10683, "time_per_iteration": 2.56772780418396 }, { "auxiliary_loss_clip": 0.0643359, "auxiliary_loss_mlp": 0.01265971, "balance_loss_clip": 0.06283654, "balance_loss_mlp": 0.01254842, "epoch": 0.6423568315045844, "flos": 14653243261440.0, "grad_norm": 1.6261404623986366, "language_loss": 0.67645168, "learning_rate": 1.1981299193696941e-06, "loss": 0.75344729, "num_input_tokens_seen": 230496425, "router_z_loss_clip": 1.49804688, "router_z_loss_mlp": 0.11132812, "step": 10684, "time_per_iteration": 2.5131795406341553 }, { "auxiliary_loss_clip": 0.06430814, "auxiliary_loss_mlp": 0.01267858, "balance_loss_clip": 0.06281319, "balance_loss_mlp": 0.01256509, "epoch": 0.6424169547572524, "flos": 26841237949440.0, "grad_norm": 7.211563259984019, "language_loss": 0.71914464, "learning_rate": 1.1977731462030533e-06, "loss": 0.79613137, "num_input_tokens_seen": 230516245, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.11346436, "step": 10685, "time_per_iteration": 2.600555658340454 }, { "auxiliary_loss_clip": 0.0642812, "auxiliary_loss_mlp": 0.01268968, "balance_loss_clip": 0.06282562, "balance_loss_mlp": 0.01258555, "epoch": 0.6424770780099204, "flos": 22713449788800.0, "grad_norm": 1.5099819508426566, "language_loss": 0.75145316, "learning_rate": 1.197416403456935e-06, "loss": 0.82842404, "num_input_tokens_seen": 230534745, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10412598, "step": 10686, "time_per_iteration": 2.5981597900390625 }, { "auxiliary_loss_clip": 0.06437251, "auxiliary_loss_mlp": 0.01269482, "balance_loss_clip": 0.0628586, "balance_loss_mlp": 0.0125737, "epoch": 0.6425372012625883, "flos": 28476049034880.0, "grad_norm": 2.3209416439520942, "language_loss": 0.68799412, "learning_rate": 1.197059691144867e-06, "loss": 0.7650615, "num_input_tokens_seen": 230555895, "router_z_loss_clip": 1.51660156, "router_z_loss_mlp": 0.12115479, "step": 10687, "time_per_iteration": 2.6324429512023926 }, { "auxiliary_loss_clip": 0.06432336, "auxiliary_loss_mlp": 0.01267146, "balance_loss_clip": 0.06282058, "balance_loss_mlp": 0.01256226, "epoch": 0.6425973245152563, "flos": 29359469831040.0, "grad_norm": 2.0372575320806474, "language_loss": 0.66888809, "learning_rate": 1.1967030092803767e-06, "loss": 0.74588287, "num_input_tokens_seen": 230577460, "router_z_loss_clip": 1.50195312, "router_z_loss_mlp": 0.10919189, "step": 10688, "time_per_iteration": 2.614717721939087 }, { "auxiliary_loss_clip": 0.0642729, "auxiliary_loss_mlp": 0.01264871, "balance_loss_clip": 0.06279644, "balance_loss_mlp": 0.0125447, "epoch": 0.6426574477679242, "flos": 16435109462400.0, "grad_norm": 1.6357224598957418, "language_loss": 0.73617959, "learning_rate": 1.1963463578769876e-06, "loss": 0.81310117, "num_input_tokens_seen": 230595030, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.10400391, "step": 10689, "time_per_iteration": 2.5429675579071045 }, { "auxiliary_loss_clip": 0.06424087, "auxiliary_loss_mlp": 0.01262953, "balance_loss_clip": 0.06279595, "balance_loss_mlp": 0.01253244, "epoch": 0.6427175710205922, "flos": 21842481323520.0, "grad_norm": 1.8650066131449732, "language_loss": 0.71982324, "learning_rate": 1.195989736948226e-06, "loss": 0.79669368, "num_input_tokens_seen": 230615135, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.09716797, "step": 10690, "time_per_iteration": 2.5737338066101074 }, { "auxiliary_loss_clip": 0.06426376, "auxiliary_loss_mlp": 0.01267727, "balance_loss_clip": 0.06281756, "balance_loss_mlp": 0.01257373, "epoch": 0.6427776942732601, "flos": 17792623059840.0, "grad_norm": 1.8223688315877762, "language_loss": 0.77707946, "learning_rate": 1.1956331465076143e-06, "loss": 0.85402048, "num_input_tokens_seen": 230631965, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.1036377, "step": 10691, "time_per_iteration": 5.48931884765625 }, { "auxiliary_loss_clip": 0.06434479, "auxiliary_loss_mlp": 0.01267316, "balance_loss_clip": 0.06284267, "balance_loss_mlp": 0.01256313, "epoch": 0.6428378175259282, "flos": 15091306007040.0, "grad_norm": 2.0610347642935425, "language_loss": 0.75075227, "learning_rate": 1.1952765865686738e-06, "loss": 0.82777023, "num_input_tokens_seen": 230649565, "router_z_loss_clip": 1.50195312, "router_z_loss_mlp": 0.10998535, "step": 10692, "time_per_iteration": 2.5713846683502197 }, { "auxiliary_loss_clip": 0.06434151, "auxiliary_loss_mlp": 0.01264421, "balance_loss_clip": 0.06286081, "balance_loss_mlp": 0.01253943, "epoch": 0.6428979407785961, "flos": 23848535923200.0, "grad_norm": 2.1559007550357214, "language_loss": 0.6131835, "learning_rate": 1.1949200571449263e-06, "loss": 0.69016922, "num_input_tokens_seen": 230669265, "router_z_loss_clip": 1.48242188, "router_z_loss_mlp": 0.10479736, "step": 10693, "time_per_iteration": 2.572293519973755 }, { "auxiliary_loss_clip": 0.0643314, "auxiliary_loss_mlp": 0.01265916, "balance_loss_clip": 0.06281357, "balance_loss_mlp": 0.01254484, "epoch": 0.6429580640312641, "flos": 32935151439360.0, "grad_norm": 1.566322750354902, "language_loss": 0.60061032, "learning_rate": 1.1945635582498903e-06, "loss": 0.67760086, "num_input_tokens_seen": 230690575, "router_z_loss_clip": 1.51953125, "router_z_loss_mlp": 0.11425781, "step": 10694, "time_per_iteration": 2.679682731628418 }, { "auxiliary_loss_clip": 0.06431741, "auxiliary_loss_mlp": 0.01268621, "balance_loss_clip": 0.06282555, "balance_loss_mlp": 0.01258065, "epoch": 0.643018187283932, "flos": 21074571780480.0, "grad_norm": 1.3751371812293767, "language_loss": 0.80282122, "learning_rate": 1.1942070898970853e-06, "loss": 0.87982488, "num_input_tokens_seen": 230709420, "router_z_loss_clip": 1.49121094, "router_z_loss_mlp": 0.10552979, "step": 10695, "time_per_iteration": 2.5553152561187744 }, { "auxiliary_loss_clip": 0.0643046, "auxiliary_loss_mlp": 0.01267172, "balance_loss_clip": 0.06279562, "balance_loss_mlp": 0.01255848, "epoch": 0.6430783105366, "flos": 26731973825280.0, "grad_norm": 1.8345776591803058, "language_loss": 0.73825127, "learning_rate": 1.1938506521000285e-06, "loss": 0.81522763, "num_input_tokens_seen": 230729350, "router_z_loss_clip": 1.50976562, "router_z_loss_mlp": 0.11322021, "step": 10696, "time_per_iteration": 2.6502978801727295 }, { "auxiliary_loss_clip": 0.06425282, "auxiliary_loss_mlp": 0.01265885, "balance_loss_clip": 0.06280817, "balance_loss_mlp": 0.01255717, "epoch": 0.643138433789268, "flos": 23703744867840.0, "grad_norm": 1.6225179028039862, "language_loss": 0.75711012, "learning_rate": 1.1934942448722347e-06, "loss": 0.83402181, "num_input_tokens_seen": 230749220, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.10168457, "step": 10697, "time_per_iteration": 2.5866801738739014 }, { "auxiliary_loss_clip": 0.06424631, "auxiliary_loss_mlp": 0.01265444, "balance_loss_clip": 0.06280875, "balance_loss_mlp": 0.01255401, "epoch": 0.643198557041936, "flos": 34210416654720.0, "grad_norm": 1.4998393459515433, "language_loss": 0.66631472, "learning_rate": 1.1931378682272208e-06, "loss": 0.74321544, "num_input_tokens_seen": 230770245, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10040283, "step": 10698, "time_per_iteration": 2.68756365776062 }, { "auxiliary_loss_clip": 0.06317793, "auxiliary_loss_mlp": 0.012515, "balance_loss_clip": 0.06258381, "balance_loss_mlp": 0.01249988, "epoch": 0.643258680294604, "flos": 67646955398400.0, "grad_norm": 0.7856836669428388, "language_loss": 0.63463485, "learning_rate": 1.1927815221784996e-06, "loss": 0.71032774, "num_input_tokens_seen": 230837030, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.01509857, "step": 10699, "time_per_iteration": 3.1694235801696777 }, { "auxiliary_loss_clip": 0.0642104, "auxiliary_loss_mlp": 0.01262267, "balance_loss_clip": 0.06278959, "balance_loss_mlp": 0.01253153, "epoch": 0.6433188035472719, "flos": 25192003962240.0, "grad_norm": 1.5180671147788487, "language_loss": 0.69449866, "learning_rate": 1.1924252067395838e-06, "loss": 0.77133167, "num_input_tokens_seen": 230856845, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09112549, "step": 10700, "time_per_iteration": 2.5845754146575928 }, { "auxiliary_loss_clip": 0.06428414, "auxiliary_loss_mlp": 0.0126539, "balance_loss_clip": 0.06279606, "balance_loss_mlp": 0.01254941, "epoch": 0.6433789267999399, "flos": 24980645237760.0, "grad_norm": 1.7639409410970555, "language_loss": 0.73584783, "learning_rate": 1.1920689219239855e-06, "loss": 0.81278592, "num_input_tokens_seen": 230878785, "router_z_loss_clip": 1.48730469, "router_z_loss_mlp": 0.10443115, "step": 10701, "time_per_iteration": 2.6094894409179688 }, { "auxiliary_loss_clip": 0.06435485, "auxiliary_loss_mlp": 0.0126393, "balance_loss_clip": 0.06282611, "balance_loss_mlp": 0.01252874, "epoch": 0.6434390500526078, "flos": 17571704970240.0, "grad_norm": 1.8634307466396438, "language_loss": 0.81969827, "learning_rate": 1.1917126677452144e-06, "loss": 0.89669245, "num_input_tokens_seen": 230895445, "router_z_loss_clip": 1.52636719, "router_z_loss_mlp": 0.1105957, "step": 10702, "time_per_iteration": 2.5435187816619873 }, { "auxiliary_loss_clip": 0.06429198, "auxiliary_loss_mlp": 0.01271737, "balance_loss_clip": 0.06283308, "balance_loss_mlp": 0.01261658, "epoch": 0.6434991733052758, "flos": 20848790154240.0, "grad_norm": 1.8199428132489455, "language_loss": 0.74601257, "learning_rate": 1.1913564442167798e-06, "loss": 0.82302189, "num_input_tokens_seen": 230911375, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10083008, "step": 10703, "time_per_iteration": 2.6905808448791504 }, { "auxiliary_loss_clip": 0.06324857, "auxiliary_loss_mlp": 0.01251955, "balance_loss_clip": 0.06265384, "balance_loss_mlp": 0.01250195, "epoch": 0.6435592965579437, "flos": 66114909745920.0, "grad_norm": 0.6467446672917562, "language_loss": 0.54605836, "learning_rate": 1.1910002513521898e-06, "loss": 0.62182647, "num_input_tokens_seen": 230975990, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.01760864, "step": 10704, "time_per_iteration": 3.292938470840454 }, { "auxiliary_loss_clip": 0.06428672, "auxiliary_loss_mlp": 0.01267941, "balance_loss_clip": 0.06280653, "balance_loss_mlp": 0.01258136, "epoch": 0.6436194198106118, "flos": 23775595344000.0, "grad_norm": 1.8731769443565864, "language_loss": 0.77200937, "learning_rate": 1.1906440891649519e-06, "loss": 0.84897548, "num_input_tokens_seen": 230997110, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.0980835, "step": 10705, "time_per_iteration": 2.64729380607605 }, { "auxiliary_loss_clip": 0.06429298, "auxiliary_loss_mlp": 0.01267021, "balance_loss_clip": 0.06281295, "balance_loss_mlp": 0.01256864, "epoch": 0.6436795430632797, "flos": 20236572696960.0, "grad_norm": 2.0489375767691618, "language_loss": 0.79366481, "learning_rate": 1.1902879576685708e-06, "loss": 0.870628, "num_input_tokens_seen": 231015590, "router_z_loss_clip": 1.47949219, "router_z_loss_mlp": 0.1015625, "step": 10706, "time_per_iteration": 2.5932130813598633 }, { "auxiliary_loss_clip": 0.0642329, "auxiliary_loss_mlp": 0.01264274, "balance_loss_clip": 0.06276537, "balance_loss_mlp": 0.01254749, "epoch": 0.6437396663159477, "flos": 20307878121600.0, "grad_norm": 1.8992744308473553, "language_loss": 0.80343127, "learning_rate": 1.1899318568765518e-06, "loss": 0.88030696, "num_input_tokens_seen": 231033800, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.09521484, "step": 10707, "time_per_iteration": 2.565599203109741 }, { "auxiliary_loss_clip": 0.06427253, "auxiliary_loss_mlp": 0.01264144, "balance_loss_clip": 0.06280267, "balance_loss_mlp": 0.01254566, "epoch": 0.6437997895686156, "flos": 23885404519680.0, "grad_norm": 1.6307406945011942, "language_loss": 0.85835934, "learning_rate": 1.1895757868023978e-06, "loss": 0.93527329, "num_input_tokens_seen": 231053160, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.0958252, "step": 10708, "time_per_iteration": 2.6149699687957764 }, { "auxiliary_loss_clip": 0.06443457, "auxiliary_loss_mlp": 0.01266564, "balance_loss_clip": 0.06287226, "balance_loss_mlp": 0.01255334, "epoch": 0.6438599128212836, "flos": 18995241185280.0, "grad_norm": 2.5193678684190233, "language_loss": 0.65936768, "learning_rate": 1.1892197474596106e-06, "loss": 0.73646796, "num_input_tokens_seen": 231069470, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.11224365, "step": 10709, "time_per_iteration": 2.578483819961548 }, { "auxiliary_loss_clip": 0.06425481, "auxiliary_loss_mlp": 0.01265591, "balance_loss_clip": 0.06280448, "balance_loss_mlp": 0.01255667, "epoch": 0.6439200360739517, "flos": 24103010373120.0, "grad_norm": 1.8110996788491813, "language_loss": 0.8066926, "learning_rate": 1.1888637388616929e-06, "loss": 0.88360333, "num_input_tokens_seen": 231088205, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.09918213, "step": 10710, "time_per_iteration": 2.597217321395874 }, { "auxiliary_loss_clip": 0.06427275, "auxiliary_loss_mlp": 0.01264741, "balance_loss_clip": 0.06282489, "balance_loss_mlp": 0.01254835, "epoch": 0.6439801593266196, "flos": 31909748699520.0, "grad_norm": 2.463274161595882, "language_loss": 0.66284347, "learning_rate": 1.1885077610221425e-06, "loss": 0.73976362, "num_input_tokens_seen": 231107850, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.09906006, "step": 10711, "time_per_iteration": 2.6598334312438965 }, { "auxiliary_loss_clip": 0.06431112, "auxiliary_loss_mlp": 0.01266757, "balance_loss_clip": 0.06282709, "balance_loss_mlp": 0.01256403, "epoch": 0.6440402825792876, "flos": 27133251828480.0, "grad_norm": 1.542888891773353, "language_loss": 0.78769451, "learning_rate": 1.1881518139544597e-06, "loss": 0.8646732, "num_input_tokens_seen": 231127200, "router_z_loss_clip": 1.48339844, "router_z_loss_mlp": 0.10357666, "step": 10712, "time_per_iteration": 2.6309094429016113 }, { "auxiliary_loss_clip": 0.06429379, "auxiliary_loss_mlp": 0.01266612, "balance_loss_clip": 0.06279075, "balance_loss_mlp": 0.01255895, "epoch": 0.6441004058319555, "flos": 20673964609920.0, "grad_norm": 1.712398210682463, "language_loss": 0.82886589, "learning_rate": 1.1877958976721417e-06, "loss": 0.90582573, "num_input_tokens_seen": 231146360, "router_z_loss_clip": 1.50292969, "router_z_loss_mlp": 0.10723877, "step": 10713, "time_per_iteration": 2.5695295333862305 }, { "auxiliary_loss_clip": 0.06419352, "auxiliary_loss_mlp": 0.01264844, "balance_loss_clip": 0.06279851, "balance_loss_mlp": 0.01254181, "epoch": 0.6441605290846235, "flos": 26032309032960.0, "grad_norm": 1.3617179290020485, "language_loss": 0.7846117, "learning_rate": 1.187440012188684e-06, "loss": 0.86145371, "num_input_tokens_seen": 231168350, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.10656738, "step": 10714, "time_per_iteration": 2.6342172622680664 }, { "auxiliary_loss_clip": 0.06424095, "auxiliary_loss_mlp": 0.01263493, "balance_loss_clip": 0.0627902, "balance_loss_mlp": 0.01254195, "epoch": 0.6442206523372914, "flos": 24906362993280.0, "grad_norm": 1.4055412655553077, "language_loss": 0.81936914, "learning_rate": 1.187084157517583e-06, "loss": 0.896245, "num_input_tokens_seen": 231188385, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.09301758, "step": 10715, "time_per_iteration": 2.6249959468841553 }, { "auxiliary_loss_clip": 0.06431518, "auxiliary_loss_mlp": 0.01265713, "balance_loss_clip": 0.06282786, "balance_loss_mlp": 0.01255413, "epoch": 0.6442807755899594, "flos": 25163478846720.0, "grad_norm": 1.7149454096307988, "language_loss": 0.81651688, "learning_rate": 1.186728333672332e-06, "loss": 0.89348924, "num_input_tokens_seen": 231209880, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.10302734, "step": 10716, "time_per_iteration": 4.067634344100952 }, { "auxiliary_loss_clip": 0.06431039, "auxiliary_loss_mlp": 0.01267518, "balance_loss_clip": 0.06279884, "balance_loss_mlp": 0.01256784, "epoch": 0.6443408988426274, "flos": 27351863930880.0, "grad_norm": 2.9440632692644693, "language_loss": 0.7874468, "learning_rate": 1.186372540666424e-06, "loss": 0.86443233, "num_input_tokens_seen": 231230765, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.10742188, "step": 10717, "time_per_iteration": 2.6141841411590576 }, { "auxiliary_loss_clip": 0.06420878, "auxiliary_loss_mlp": 0.01267918, "balance_loss_clip": 0.06280594, "balance_loss_mlp": 0.01258036, "epoch": 0.6444010220952954, "flos": 27935807834880.0, "grad_norm": 1.8587804270707824, "language_loss": 0.68320405, "learning_rate": 1.1860167785133513e-06, "loss": 0.76009196, "num_input_tokens_seen": 231252350, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09887695, "step": 10718, "time_per_iteration": 2.7125601768493652 }, { "auxiliary_loss_clip": 0.06317931, "auxiliary_loss_mlp": 0.01250589, "balance_loss_clip": 0.06258564, "balance_loss_mlp": 0.01248982, "epoch": 0.6444611453479633, "flos": 71232169173120.0, "grad_norm": 0.7547639402079768, "language_loss": 0.49623132, "learning_rate": 1.185661047226603e-06, "loss": 0.57191652, "num_input_tokens_seen": 231313865, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.01608276, "step": 10719, "time_per_iteration": 3.330983877182007 }, { "auxiliary_loss_clip": 0.06430092, "auxiliary_loss_mlp": 0.0126642, "balance_loss_clip": 0.06280934, "balance_loss_mlp": 0.01255041, "epoch": 0.6445212686006313, "flos": 22710766458240.0, "grad_norm": 1.7453918182597585, "language_loss": 0.7864635, "learning_rate": 1.18530534681967e-06, "loss": 0.86342859, "num_input_tokens_seen": 231331710, "router_z_loss_clip": 1.48925781, "router_z_loss_mlp": 0.1138916, "step": 10720, "time_per_iteration": 2.6103107929229736 }, { "auxiliary_loss_clip": 0.06421123, "auxiliary_loss_mlp": 0.01264925, "balance_loss_clip": 0.0627739, "balance_loss_mlp": 0.0125487, "epoch": 0.6445813918532992, "flos": 21185219496960.0, "grad_norm": 1.6543490553521378, "language_loss": 0.77094364, "learning_rate": 1.18494967730604e-06, "loss": 0.84780413, "num_input_tokens_seen": 231350705, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10046387, "step": 10721, "time_per_iteration": 4.10527229309082 }, { "auxiliary_loss_clip": 0.06424718, "auxiliary_loss_mlp": 0.01265181, "balance_loss_clip": 0.06277104, "balance_loss_mlp": 0.01255042, "epoch": 0.6446415151059672, "flos": 25198921923840.0, "grad_norm": 2.2123799017088244, "language_loss": 0.72985864, "learning_rate": 1.1845940386991995e-06, "loss": 0.80675757, "num_input_tokens_seen": 231369550, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.10131836, "step": 10722, "time_per_iteration": 2.619354009628296 }, { "auxiliary_loss_clip": 0.06425419, "auxiliary_loss_mlp": 0.01267453, "balance_loss_clip": 0.06279998, "balance_loss_mlp": 0.01258, "epoch": 0.6447016383586353, "flos": 25309401932160.0, "grad_norm": 2.336178500250051, "language_loss": 0.78125715, "learning_rate": 1.184238431012635e-06, "loss": 0.85818589, "num_input_tokens_seen": 231389285, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.09460449, "step": 10723, "time_per_iteration": 2.612337112426758 }, { "auxiliary_loss_clip": 0.06425558, "auxiliary_loss_mlp": 0.01266192, "balance_loss_clip": 0.06275744, "balance_loss_mlp": 0.01254849, "epoch": 0.6447617616113032, "flos": 27709523084160.0, "grad_norm": 1.5759950801422187, "language_loss": 0.581949, "learning_rate": 1.1838828542598312e-06, "loss": 0.65886647, "num_input_tokens_seen": 231408820, "router_z_loss_clip": 1.49804688, "router_z_loss_mlp": 0.11352539, "step": 10724, "time_per_iteration": 2.6199557781219482 }, { "auxiliary_loss_clip": 0.0641817, "auxiliary_loss_mlp": 0.01263359, "balance_loss_clip": 0.06277846, "balance_loss_mlp": 0.01254096, "epoch": 0.6448218848639712, "flos": 23045728354560.0, "grad_norm": 1.611815539649115, "language_loss": 0.84069026, "learning_rate": 1.183527308454271e-06, "loss": 0.91750562, "num_input_tokens_seen": 231428100, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09259033, "step": 10725, "time_per_iteration": 2.5627458095550537 }, { "auxiliary_loss_clip": 0.06423028, "auxiliary_loss_mlp": 0.01265746, "balance_loss_clip": 0.06277919, "balance_loss_mlp": 0.01255542, "epoch": 0.6448820081166391, "flos": 24502569367680.0, "grad_norm": 1.7725946018823915, "language_loss": 0.82355034, "learning_rate": 1.1831717936094368e-06, "loss": 0.90043807, "num_input_tokens_seen": 231445810, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.10205078, "step": 10726, "time_per_iteration": 2.6098599433898926 }, { "auxiliary_loss_clip": 0.06426945, "auxiliary_loss_mlp": 0.01264841, "balance_loss_clip": 0.06276312, "balance_loss_mlp": 0.01254285, "epoch": 0.6449421313693071, "flos": 22425880176000.0, "grad_norm": 1.6951849500743346, "language_loss": 0.81331539, "learning_rate": 1.1828163097388108e-06, "loss": 0.89023322, "num_input_tokens_seen": 231463570, "router_z_loss_clip": 1.50585938, "router_z_loss_mlp": 0.10552979, "step": 10727, "time_per_iteration": 2.5500924587249756 }, { "auxiliary_loss_clip": 0.06437366, "auxiliary_loss_mlp": 0.0126916, "balance_loss_clip": 0.0628186, "balance_loss_mlp": 0.01256977, "epoch": 0.645002254621975, "flos": 20231206035840.0, "grad_norm": 2.5405354151372186, "language_loss": 0.79419446, "learning_rate": 1.1824608568558717e-06, "loss": 0.87125981, "num_input_tokens_seen": 231482155, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.12200928, "step": 10728, "time_per_iteration": 2.597212791442871 }, { "auxiliary_loss_clip": 0.06427164, "auxiliary_loss_mlp": 0.01266753, "balance_loss_clip": 0.0628063, "balance_loss_mlp": 0.01255297, "epoch": 0.645062377874643, "flos": 27862909182720.0, "grad_norm": 1.7366447066842015, "language_loss": 0.748357, "learning_rate": 1.1821054349740988e-06, "loss": 0.82529616, "num_input_tokens_seen": 231502465, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.11456299, "step": 10729, "time_per_iteration": 2.626185655593872 }, { "auxiliary_loss_clip": 0.06432237, "auxiliary_loss_mlp": 0.01266734, "balance_loss_clip": 0.06283034, "balance_loss_mlp": 0.01255671, "epoch": 0.645122501127311, "flos": 25308563391360.0, "grad_norm": 1.601275767495443, "language_loss": 0.66832304, "learning_rate": 1.1817500441069706e-06, "loss": 0.74531269, "num_input_tokens_seen": 231522740, "router_z_loss_clip": 1.49121094, "router_z_loss_mlp": 0.11065674, "step": 10730, "time_per_iteration": 4.069629907608032 }, { "auxiliary_loss_clip": 0.06430241, "auxiliary_loss_mlp": 0.01268435, "balance_loss_clip": 0.06282276, "balance_loss_mlp": 0.01256788, "epoch": 0.645182624379979, "flos": 18813371898240.0, "grad_norm": 1.5564197021242658, "language_loss": 0.64136684, "learning_rate": 1.1813946842679614e-06, "loss": 0.71835363, "num_input_tokens_seen": 231542050, "router_z_loss_clip": 1.47949219, "router_z_loss_mlp": 0.11645508, "step": 10731, "time_per_iteration": 3.8374242782592773 }, { "auxiliary_loss_clip": 0.06421661, "auxiliary_loss_mlp": 0.01265208, "balance_loss_clip": 0.06278218, "balance_loss_mlp": 0.01254759, "epoch": 0.6452427476326469, "flos": 18337979358720.0, "grad_norm": 1.566688161025488, "language_loss": 0.68222654, "learning_rate": 1.1810393554705492e-06, "loss": 0.75909519, "num_input_tokens_seen": 231560380, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.10455322, "step": 10732, "time_per_iteration": 2.561415672302246 }, { "auxiliary_loss_clip": 0.06422851, "auxiliary_loss_mlp": 0.01265664, "balance_loss_clip": 0.06281069, "balance_loss_mlp": 0.01255174, "epoch": 0.6453028708853149, "flos": 22791505466880.0, "grad_norm": 1.8992121727183227, "language_loss": 0.76152551, "learning_rate": 1.1806840577282055e-06, "loss": 0.83841062, "num_input_tokens_seen": 231580810, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.10498047, "step": 10733, "time_per_iteration": 2.558501720428467 }, { "auxiliary_loss_clip": 0.06434605, "auxiliary_loss_mlp": 0.01268836, "balance_loss_clip": 0.06282201, "balance_loss_mlp": 0.01257183, "epoch": 0.6453629941379828, "flos": 23951888334720.0, "grad_norm": 2.318143562689576, "language_loss": 0.67488074, "learning_rate": 1.1803287910544048e-06, "loss": 0.7519151, "num_input_tokens_seen": 231600585, "router_z_loss_clip": 1.52441406, "router_z_loss_mlp": 0.11657715, "step": 10734, "time_per_iteration": 2.586010694503784 }, { "auxiliary_loss_clip": 0.06421097, "auxiliary_loss_mlp": 0.01264886, "balance_loss_clip": 0.06282289, "balance_loss_mlp": 0.01254467, "epoch": 0.6454231173906508, "flos": 17682226905600.0, "grad_norm": 1.967748746919916, "language_loss": 0.74009442, "learning_rate": 1.1799735554626191e-06, "loss": 0.81695426, "num_input_tokens_seen": 231618765, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.10418701, "step": 10735, "time_per_iteration": 2.566845655441284 }, { "auxiliary_loss_clip": 0.06429844, "auxiliary_loss_mlp": 0.01267064, "balance_loss_clip": 0.06283152, "balance_loss_mlp": 0.01257396, "epoch": 0.6454832406433189, "flos": 23299154628480.0, "grad_norm": 1.9765734963943034, "language_loss": 0.75061738, "learning_rate": 1.1796183509663176e-06, "loss": 0.82758641, "num_input_tokens_seen": 231638525, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.09674072, "step": 10736, "time_per_iteration": 2.6181352138519287 }, { "auxiliary_loss_clip": 0.06435104, "auxiliary_loss_mlp": 0.01266181, "balance_loss_clip": 0.06282423, "balance_loss_mlp": 0.01254421, "epoch": 0.6455433638959868, "flos": 20163422482560.0, "grad_norm": 1.8974073126459683, "language_loss": 0.71005023, "learning_rate": 1.1792631775789708e-06, "loss": 0.78706312, "num_input_tokens_seen": 231656785, "router_z_loss_clip": 1.52929688, "router_z_loss_mlp": 0.11767578, "step": 10737, "time_per_iteration": 2.5878138542175293 }, { "auxiliary_loss_clip": 0.06314611, "auxiliary_loss_mlp": 0.01252585, "balance_loss_clip": 0.06254856, "balance_loss_mlp": 0.01251108, "epoch": 0.6456034871486548, "flos": 66553391761920.0, "grad_norm": 0.7727358571986036, "language_loss": 0.58446896, "learning_rate": 1.1789080353140464e-06, "loss": 0.66014099, "num_input_tokens_seen": 231719075, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 0.01476288, "step": 10738, "time_per_iteration": 3.2455391883850098 }, { "auxiliary_loss_clip": 0.06425145, "auxiliary_loss_mlp": 0.01264349, "balance_loss_clip": 0.06281211, "balance_loss_mlp": 0.01253811, "epoch": 0.6456636104013227, "flos": 24212819548800.0, "grad_norm": 1.8549834289656617, "language_loss": 0.75137353, "learning_rate": 1.1785529241850118e-06, "loss": 0.82826847, "num_input_tokens_seen": 231737810, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.10540771, "step": 10739, "time_per_iteration": 2.5978682041168213 }, { "auxiliary_loss_clip": 0.06431218, "auxiliary_loss_mlp": 0.01266446, "balance_loss_clip": 0.06280844, "balance_loss_mlp": 0.01255365, "epoch": 0.6457237336539907, "flos": 23631013923840.0, "grad_norm": 1.776658806394006, "language_loss": 0.71112055, "learning_rate": 1.1781978442053324e-06, "loss": 0.78809714, "num_input_tokens_seen": 231756140, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.11090088, "step": 10740, "time_per_iteration": 2.5932915210723877 }, { "auxiliary_loss_clip": 0.06312141, "auxiliary_loss_mlp": 0.01254053, "balance_loss_clip": 0.0625256, "balance_loss_mlp": 0.01252494, "epoch": 0.6457838569066586, "flos": 65867437111680.0, "grad_norm": 0.6538927571105221, "language_loss": 0.55300665, "learning_rate": 1.1778427953884733e-06, "loss": 0.62866861, "num_input_tokens_seen": 231823665, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.01560211, "step": 10741, "time_per_iteration": 3.2073984146118164 }, { "auxiliary_loss_clip": 0.06429081, "auxiliary_loss_mlp": 0.01264893, "balance_loss_clip": 0.06284973, "balance_loss_mlp": 0.01254981, "epoch": 0.6458439801593266, "flos": 22388424600960.0, "grad_norm": 1.4594720187255503, "language_loss": 0.80612326, "learning_rate": 1.1774877777478977e-06, "loss": 0.88306302, "num_input_tokens_seen": 231844500, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.09912109, "step": 10742, "time_per_iteration": 2.6230833530426025 }, { "auxiliary_loss_clip": 0.06421787, "auxiliary_loss_mlp": 0.01266441, "balance_loss_clip": 0.06281904, "balance_loss_mlp": 0.01256046, "epoch": 0.6459041034119946, "flos": 24795966839040.0, "grad_norm": 1.6916917732069578, "language_loss": 0.82103837, "learning_rate": 1.1771327912970678e-06, "loss": 0.89792067, "num_input_tokens_seen": 231864510, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.10388184, "step": 10743, "time_per_iteration": 2.6051993370056152 }, { "auxiliary_loss_clip": 0.06423135, "auxiliary_loss_mlp": 0.01265614, "balance_loss_clip": 0.06279276, "balance_loss_mlp": 0.01255636, "epoch": 0.6459642266646626, "flos": 18330013221120.0, "grad_norm": 2.760190583512646, "language_loss": 0.72217119, "learning_rate": 1.1767778360494453e-06, "loss": 0.79905868, "num_input_tokens_seen": 231881555, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.09979248, "step": 10744, "time_per_iteration": 2.56713604927063 }, { "auxiliary_loss_clip": 0.06425443, "auxiliary_loss_mlp": 0.01265027, "balance_loss_clip": 0.06279945, "balance_loss_mlp": 0.01254888, "epoch": 0.6460243499173305, "flos": 43591561672320.0, "grad_norm": 1.683137049081729, "language_loss": 0.66301632, "learning_rate": 1.1764229120184896e-06, "loss": 0.73992109, "num_input_tokens_seen": 231905945, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10137939, "step": 10745, "time_per_iteration": 2.7632970809936523 }, { "auxiliary_loss_clip": 0.06426924, "auxiliary_loss_mlp": 0.01268639, "balance_loss_clip": 0.06281597, "balance_loss_mlp": 0.01257481, "epoch": 0.6460844731699985, "flos": 19249925270400.0, "grad_norm": 2.303851256276742, "language_loss": 0.73860657, "learning_rate": 1.1760680192176597e-06, "loss": 0.81556225, "num_input_tokens_seen": 231922535, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.11157227, "step": 10746, "time_per_iteration": 2.5788276195526123 }, { "auxiliary_loss_clip": 0.06429325, "auxiliary_loss_mlp": 0.01265618, "balance_loss_clip": 0.06280571, "balance_loss_mlp": 0.01255772, "epoch": 0.6461445964226664, "flos": 27460624930560.0, "grad_norm": 1.398677780151025, "language_loss": 0.66936636, "learning_rate": 1.175713157660413e-06, "loss": 0.74631584, "num_input_tokens_seen": 231944800, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.09844971, "step": 10747, "time_per_iteration": 2.59889554977417 }, { "auxiliary_loss_clip": 0.06431638, "auxiliary_loss_mlp": 0.01264983, "balance_loss_clip": 0.06283759, "balance_loss_mlp": 0.01254427, "epoch": 0.6462047196753344, "flos": 20300457035520.0, "grad_norm": 1.680963135820525, "language_loss": 0.67599124, "learning_rate": 1.1753583273602056e-06, "loss": 0.7529574, "num_input_tokens_seen": 231962970, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.10565186, "step": 10748, "time_per_iteration": 2.602919816970825 }, { "auxiliary_loss_clip": 0.06431295, "auxiliary_loss_mlp": 0.01266629, "balance_loss_clip": 0.06280752, "balance_loss_mlp": 0.01255054, "epoch": 0.6462648429280025, "flos": 22024937589120.0, "grad_norm": 1.5103237432021301, "language_loss": 0.76235628, "learning_rate": 1.1750035283304937e-06, "loss": 0.83933544, "num_input_tokens_seen": 231981195, "router_z_loss_clip": 1.50488281, "router_z_loss_mlp": 0.11578369, "step": 10749, "time_per_iteration": 2.5772175788879395 }, { "auxiliary_loss_clip": 0.0643011, "auxiliary_loss_mlp": 0.01264543, "balance_loss_clip": 0.06280148, "balance_loss_mlp": 0.0125422, "epoch": 0.6463249661806704, "flos": 27788375376000.0, "grad_norm": 1.491822425691843, "language_loss": 0.77331054, "learning_rate": 1.17464876058473e-06, "loss": 0.85025716, "num_input_tokens_seen": 232001735, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.10321045, "step": 10750, "time_per_iteration": 2.804975748062134 }, { "auxiliary_loss_clip": 0.06438007, "auxiliary_loss_mlp": 0.01269782, "balance_loss_clip": 0.06285763, "balance_loss_mlp": 0.01258183, "epoch": 0.6463850894333384, "flos": 22056481451520.0, "grad_norm": 2.1987761878085244, "language_loss": 0.68617588, "learning_rate": 1.1742940241363683e-06, "loss": 0.76325381, "num_input_tokens_seen": 232019830, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.11590576, "step": 10751, "time_per_iteration": 2.805711269378662 }, { "auxiliary_loss_clip": 0.06432493, "auxiliary_loss_mlp": 0.01263596, "balance_loss_clip": 0.06280944, "balance_loss_mlp": 0.01252909, "epoch": 0.6464452126860063, "flos": 21112698188160.0, "grad_norm": 1.6657492894681984, "language_loss": 0.71782398, "learning_rate": 1.1739393189988604e-06, "loss": 0.7947849, "num_input_tokens_seen": 232039625, "router_z_loss_clip": 1.51757812, "router_z_loss_mlp": 0.10681152, "step": 10752, "time_per_iteration": 2.62394642829895 }, { "auxiliary_loss_clip": 0.06432964, "auxiliary_loss_mlp": 0.0126668, "balance_loss_clip": 0.06280781, "balance_loss_mlp": 0.0125343, "epoch": 0.6465053359386743, "flos": 16032531720960.0, "grad_norm": 1.9283833386193363, "language_loss": 0.78201467, "learning_rate": 1.1735846451856554e-06, "loss": 0.85901111, "num_input_tokens_seen": 232055855, "router_z_loss_clip": 1.52148438, "router_z_loss_mlp": 0.13250732, "step": 10753, "time_per_iteration": 2.5118958950042725 }, { "auxiliary_loss_clip": 0.06428611, "auxiliary_loss_mlp": 0.01267934, "balance_loss_clip": 0.06283014, "balance_loss_mlp": 0.01256693, "epoch": 0.6465654591913422, "flos": 23404477610880.0, "grad_norm": 1.8752260326439572, "language_loss": 0.85733342, "learning_rate": 1.1732300027102041e-06, "loss": 0.93429887, "num_input_tokens_seen": 232073475, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.11248779, "step": 10754, "time_per_iteration": 2.5936765670776367 }, { "auxiliary_loss_clip": 0.06431429, "auxiliary_loss_mlp": 0.01265999, "balance_loss_clip": 0.06284147, "balance_loss_mlp": 0.01255056, "epoch": 0.6466255824440102, "flos": 15382649053440.0, "grad_norm": 2.060093538856309, "language_loss": 0.59901571, "learning_rate": 1.1728753915859541e-06, "loss": 0.67598999, "num_input_tokens_seen": 232091090, "router_z_loss_clip": 1.47363281, "router_z_loss_mlp": 0.10943604, "step": 10755, "time_per_iteration": 3.934147834777832 }, { "auxiliary_loss_clip": 0.0643253, "auxiliary_loss_mlp": 0.01269287, "balance_loss_clip": 0.06285975, "balance_loss_mlp": 0.0125776, "epoch": 0.6466857056966782, "flos": 16258355274240.0, "grad_norm": 2.197320853511235, "language_loss": 0.68987292, "learning_rate": 1.1725208118263518e-06, "loss": 0.76689112, "num_input_tokens_seen": 232107320, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.11523438, "step": 10756, "time_per_iteration": 2.6140902042388916 }, { "auxiliary_loss_clip": 0.06437302, "auxiliary_loss_mlp": 0.01267199, "balance_loss_clip": 0.06283425, "balance_loss_mlp": 0.0125597, "epoch": 0.6467458289493462, "flos": 21184548664320.0, "grad_norm": 2.603270730168277, "language_loss": 0.74274695, "learning_rate": 1.172166263444844e-06, "loss": 0.81979191, "num_input_tokens_seen": 232123930, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.11236572, "step": 10757, "time_per_iteration": 2.563453435897827 }, { "auxiliary_loss_clip": 0.06421047, "auxiliary_loss_mlp": 0.0126862, "balance_loss_clip": 0.06278679, "balance_loss_mlp": 0.01257319, "epoch": 0.6468059522020141, "flos": 17974198857600.0, "grad_norm": 1.3956102381145654, "language_loss": 0.74721873, "learning_rate": 1.1718117464548734e-06, "loss": 0.82411528, "num_input_tokens_seen": 232142905, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.11303711, "step": 10758, "time_per_iteration": 2.567371368408203 }, { "auxiliary_loss_clip": 0.06430954, "auxiliary_loss_mlp": 0.01272068, "balance_loss_clip": 0.06281939, "balance_loss_mlp": 0.01260135, "epoch": 0.6468660754546821, "flos": 17895178857600.0, "grad_norm": 3.228751203821645, "language_loss": 0.6819737, "learning_rate": 1.1714572608698845e-06, "loss": 0.75900394, "num_input_tokens_seen": 232162230, "router_z_loss_clip": 1.49023438, "router_z_loss_mlp": 0.1194458, "step": 10759, "time_per_iteration": 2.566406488418579 }, { "auxiliary_loss_clip": 0.06434418, "auxiliary_loss_mlp": 0.0126833, "balance_loss_clip": 0.06281313, "balance_loss_mlp": 0.01257076, "epoch": 0.64692619870735, "flos": 22607497900800.0, "grad_norm": 1.5095863668500509, "language_loss": 0.75590432, "learning_rate": 1.1711028067033197e-06, "loss": 0.83293176, "num_input_tokens_seen": 232182700, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.11260986, "step": 10760, "time_per_iteration": 2.5999550819396973 }, { "auxiliary_loss_clip": 0.06424978, "auxiliary_loss_mlp": 0.01269063, "balance_loss_clip": 0.06279577, "balance_loss_mlp": 0.01258524, "epoch": 0.646986321960018, "flos": 49611863750400.0, "grad_norm": 1.6174222486867909, "language_loss": 0.65750432, "learning_rate": 1.1707483839686194e-06, "loss": 0.73444474, "num_input_tokens_seen": 232208235, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.10540771, "step": 10761, "time_per_iteration": 4.250915765762329 }, { "auxiliary_loss_clip": 0.0642731, "auxiliary_loss_mlp": 0.01267582, "balance_loss_clip": 0.06279217, "balance_loss_mlp": 0.01256198, "epoch": 0.6470464452126861, "flos": 21914960705280.0, "grad_norm": 5.816050313780348, "language_loss": 0.70023251, "learning_rate": 1.1703939926792235e-06, "loss": 0.77718139, "num_input_tokens_seen": 232228720, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.11383057, "step": 10762, "time_per_iteration": 2.608656406402588 }, { "auxiliary_loss_clip": 0.06430732, "auxiliary_loss_mlp": 0.01271589, "balance_loss_clip": 0.06279022, "balance_loss_mlp": 0.0125965, "epoch": 0.647106568465354, "flos": 18110688359040.0, "grad_norm": 2.027355171505448, "language_loss": 0.83205277, "learning_rate": 1.1700396328485705e-06, "loss": 0.90907598, "num_input_tokens_seen": 232244655, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.11932373, "step": 10763, "time_per_iteration": 2.5193912982940674 }, { "auxiliary_loss_clip": 0.06316422, "auxiliary_loss_mlp": 0.01256538, "balance_loss_clip": 0.06256934, "balance_loss_mlp": 0.01255053, "epoch": 0.647166691718022, "flos": 69499623899520.0, "grad_norm": 0.69802774201229, "language_loss": 0.57613903, "learning_rate": 1.1696853044900978e-06, "loss": 0.65186858, "num_input_tokens_seen": 232308685, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.0148468, "step": 10764, "time_per_iteration": 3.3813648223876953 }, { "auxiliary_loss_clip": 0.06429318, "auxiliary_loss_mlp": 0.01264984, "balance_loss_clip": 0.0628143, "balance_loss_mlp": 0.01254887, "epoch": 0.6472268149706899, "flos": 34103793934080.0, "grad_norm": 1.8584441497128228, "language_loss": 0.60855258, "learning_rate": 1.1693310076172413e-06, "loss": 0.68549562, "num_input_tokens_seen": 232327520, "router_z_loss_clip": 1.47949219, "router_z_loss_mlp": 0.10095215, "step": 10765, "time_per_iteration": 2.6700265407562256 }, { "auxiliary_loss_clip": 0.06421715, "auxiliary_loss_mlp": 0.01271581, "balance_loss_clip": 0.06277621, "balance_loss_mlp": 0.01261562, "epoch": 0.6472869382233579, "flos": 28118809152000.0, "grad_norm": 1.7307269598907544, "language_loss": 0.63001603, "learning_rate": 1.168976742243437e-06, "loss": 0.706949, "num_input_tokens_seen": 232349025, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10021973, "step": 10766, "time_per_iteration": 2.6792304515838623 }, { "auxiliary_loss_clip": 0.06426869, "auxiliary_loss_mlp": 0.01272949, "balance_loss_clip": 0.06280081, "balance_loss_mlp": 0.01262369, "epoch": 0.6473470614760258, "flos": 22498736901120.0, "grad_norm": 1.6938703182363826, "language_loss": 0.7591145, "learning_rate": 1.1686225083821174e-06, "loss": 0.83611274, "num_input_tokens_seen": 232367835, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.10577393, "step": 10767, "time_per_iteration": 2.5639266967773438 }, { "auxiliary_loss_clip": 0.06424876, "auxiliary_loss_mlp": 0.01267028, "balance_loss_clip": 0.0627767, "balance_loss_mlp": 0.01255888, "epoch": 0.6474071847286939, "flos": 14544314553600.0, "grad_norm": 2.6427085187908204, "language_loss": 0.78169245, "learning_rate": 1.1682683060467153e-06, "loss": 0.85861146, "num_input_tokens_seen": 232385840, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.11132812, "step": 10768, "time_per_iteration": 2.5468082427978516 }, { "auxiliary_loss_clip": 0.06424527, "auxiliary_loss_mlp": 0.01267597, "balance_loss_clip": 0.06277974, "balance_loss_mlp": 0.01257137, "epoch": 0.6474673079813618, "flos": 24105190579200.0, "grad_norm": 1.7274247454381826, "language_loss": 0.71952546, "learning_rate": 1.167914135250663e-06, "loss": 0.79644668, "num_input_tokens_seen": 232406205, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.10461426, "step": 10769, "time_per_iteration": 2.5716285705566406 }, { "auxiliary_loss_clip": 0.06418642, "auxiliary_loss_mlp": 0.01270207, "balance_loss_clip": 0.06275791, "balance_loss_mlp": 0.01259735, "epoch": 0.6475274312340298, "flos": 14981538758400.0, "grad_norm": 2.045349354173361, "language_loss": 0.72093999, "learning_rate": 1.1675599960073895e-06, "loss": 0.79782856, "num_input_tokens_seen": 232424995, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.1048584, "step": 10770, "time_per_iteration": 3.979252815246582 }, { "auxiliary_loss_clip": 0.06428113, "auxiliary_loss_mlp": 0.01266679, "balance_loss_clip": 0.06273825, "balance_loss_mlp": 0.01254085, "epoch": 0.6475875544866977, "flos": 25052202224640.0, "grad_norm": 1.5380067798832289, "language_loss": 0.73397005, "learning_rate": 1.167205888330325e-06, "loss": 0.81091797, "num_input_tokens_seen": 232445870, "router_z_loss_clip": 1.54101562, "router_z_loss_mlp": 0.12585449, "step": 10771, "time_per_iteration": 3.988144874572754 }, { "auxiliary_loss_clip": 0.06423088, "auxiliary_loss_mlp": 0.01265214, "balance_loss_clip": 0.06277769, "balance_loss_mlp": 0.01254318, "epoch": 0.6476476777393657, "flos": 16477763990400.0, "grad_norm": 1.9976013335837726, "language_loss": 0.74221575, "learning_rate": 1.1668518122328958e-06, "loss": 0.81909877, "num_input_tokens_seen": 232464285, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10906982, "step": 10772, "time_per_iteration": 2.557119607925415 }, { "auxiliary_loss_clip": 0.06421386, "auxiliary_loss_mlp": 0.01265506, "balance_loss_clip": 0.06278811, "balance_loss_mlp": 0.0125656, "epoch": 0.6477078009920336, "flos": 25819399008000.0, "grad_norm": 1.4402261670685095, "language_loss": 0.8304621, "learning_rate": 1.1664977677285305e-06, "loss": 0.90733099, "num_input_tokens_seen": 232485815, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.08947754, "step": 10773, "time_per_iteration": 2.5988922119140625 }, { "auxiliary_loss_clip": 0.06421086, "auxiliary_loss_mlp": 0.01266863, "balance_loss_clip": 0.06278913, "balance_loss_mlp": 0.01256825, "epoch": 0.6477679242447016, "flos": 17681933416320.0, "grad_norm": 1.5547645447341705, "language_loss": 0.78967184, "learning_rate": 1.1661437548306524e-06, "loss": 0.8665514, "num_input_tokens_seen": 232504875, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10040283, "step": 10774, "time_per_iteration": 2.575936794281006 }, { "auxiliary_loss_clip": 0.06426962, "auxiliary_loss_mlp": 0.01277248, "balance_loss_clip": 0.06276964, "balance_loss_mlp": 0.01265494, "epoch": 0.6478280474973696, "flos": 21038583651840.0, "grad_norm": 2.242082417298189, "language_loss": 0.6957804, "learning_rate": 1.1657897735526867e-06, "loss": 0.7728225, "num_input_tokens_seen": 232521945, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.11755371, "step": 10775, "time_per_iteration": 2.5593199729919434 }, { "auxiliary_loss_clip": 0.06434724, "auxiliary_loss_mlp": 0.01268717, "balance_loss_clip": 0.06281009, "balance_loss_mlp": 0.01258238, "epoch": 0.6478881707500376, "flos": 21623449950720.0, "grad_norm": 1.840046204674277, "language_loss": 0.65964568, "learning_rate": 1.1654358239080574e-06, "loss": 0.73668009, "num_input_tokens_seen": 232541500, "router_z_loss_clip": 1.53417969, "router_z_loss_mlp": 0.1048584, "step": 10776, "time_per_iteration": 2.619900941848755 }, { "auxiliary_loss_clip": 0.06427879, "auxiliary_loss_mlp": 0.01269371, "balance_loss_clip": 0.0627847, "balance_loss_mlp": 0.01258148, "epoch": 0.6479482940027056, "flos": 18448543221120.0, "grad_norm": 2.41309316439435, "language_loss": 0.7940172, "learning_rate": 1.1650819059101839e-06, "loss": 0.87098962, "num_input_tokens_seen": 232559720, "router_z_loss_clip": 1.49414062, "router_z_loss_mlp": 0.11218262, "step": 10777, "time_per_iteration": 2.5411880016326904 }, { "auxiliary_loss_clip": 0.0642223, "auxiliary_loss_mlp": 0.01265549, "balance_loss_clip": 0.06275966, "balance_loss_mlp": 0.01254844, "epoch": 0.6480084172553735, "flos": 22170651039360.0, "grad_norm": 1.9365092006526605, "language_loss": 0.73516405, "learning_rate": 1.1647280195724896e-06, "loss": 0.81204176, "num_input_tokens_seen": 232579370, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.1071167, "step": 10778, "time_per_iteration": 2.6036570072174072 }, { "auxiliary_loss_clip": 0.0642175, "auxiliary_loss_mlp": 0.01264723, "balance_loss_clip": 0.06277545, "balance_loss_mlp": 0.01254287, "epoch": 0.6480685405080415, "flos": 24323089921920.0, "grad_norm": 1.4440894346825566, "language_loss": 0.78225046, "learning_rate": 1.1643741649083923e-06, "loss": 0.85911512, "num_input_tokens_seen": 232600495, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10437012, "step": 10779, "time_per_iteration": 2.590057373046875 }, { "auxiliary_loss_clip": 0.06317677, "auxiliary_loss_mlp": 0.01251409, "balance_loss_clip": 0.0625762, "balance_loss_mlp": 0.01249882, "epoch": 0.6481286637607094, "flos": 59910348539520.0, "grad_norm": 0.7324830026769489, "language_loss": 0.59435749, "learning_rate": 1.1640203419313095e-06, "loss": 0.6700483, "num_input_tokens_seen": 232663165, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.01525879, "step": 10780, "time_per_iteration": 3.201634645462036 }, { "auxiliary_loss_clip": 0.06424163, "auxiliary_loss_mlp": 0.01265701, "balance_loss_clip": 0.06277993, "balance_loss_mlp": 0.01255484, "epoch": 0.6481887870133775, "flos": 25491313146240.0, "grad_norm": 3.4871085750384583, "language_loss": 0.79293036, "learning_rate": 1.1636665506546599e-06, "loss": 0.86982894, "num_input_tokens_seen": 232683385, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.10217285, "step": 10781, "time_per_iteration": 2.594266891479492 }, { "auxiliary_loss_clip": 0.06432831, "auxiliary_loss_mlp": 0.01269953, "balance_loss_clip": 0.0628235, "balance_loss_mlp": 0.01257978, "epoch": 0.6482489102660454, "flos": 19935041379840.0, "grad_norm": 2.2626909499489547, "language_loss": 0.79005551, "learning_rate": 1.1633127910918578e-06, "loss": 0.86708331, "num_input_tokens_seen": 232699095, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.11975098, "step": 10782, "time_per_iteration": 2.553459882736206 }, { "auxiliary_loss_clip": 0.06433707, "auxiliary_loss_mlp": 0.01268492, "balance_loss_clip": 0.06283578, "balance_loss_mlp": 0.01257245, "epoch": 0.6483090335187134, "flos": 26986741764480.0, "grad_norm": 2.2095461415196813, "language_loss": 0.64616388, "learning_rate": 1.1629590632563187e-06, "loss": 0.7231859, "num_input_tokens_seen": 232717920, "router_z_loss_clip": 1.49902344, "router_z_loss_mlp": 0.11260986, "step": 10783, "time_per_iteration": 2.5934009552001953 }, { "auxiliary_loss_clip": 0.06434631, "auxiliary_loss_mlp": 0.0126634, "balance_loss_clip": 0.06282917, "balance_loss_mlp": 0.0125455, "epoch": 0.6483691567713813, "flos": 25084207284480.0, "grad_norm": 2.0462809219878353, "language_loss": 0.88806504, "learning_rate": 1.1626053671614561e-06, "loss": 0.96507472, "num_input_tokens_seen": 232737605, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.11791992, "step": 10784, "time_per_iteration": 2.609621286392212 }, { "auxiliary_loss_clip": 0.0642932, "auxiliary_loss_mlp": 0.01267761, "balance_loss_clip": 0.06284563, "balance_loss_mlp": 0.01256353, "epoch": 0.6484292800240493, "flos": 16111300158720.0, "grad_norm": 3.9802591488485, "language_loss": 0.73477513, "learning_rate": 1.1622517028206815e-06, "loss": 0.81174594, "num_input_tokens_seen": 232755110, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.11413574, "step": 10785, "time_per_iteration": 2.5207817554473877 }, { "auxiliary_loss_clip": 0.06422572, "auxiliary_loss_mlp": 0.01265527, "balance_loss_clip": 0.06279138, "balance_loss_mlp": 0.01255806, "epoch": 0.6484894032767172, "flos": 28848005308800.0, "grad_norm": 1.3982221232252015, "language_loss": 0.6907208, "learning_rate": 1.1618980702474071e-06, "loss": 0.76760185, "num_input_tokens_seen": 232779040, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.09716797, "step": 10786, "time_per_iteration": 2.658548355102539 }, { "auxiliary_loss_clip": 0.06427827, "auxiliary_loss_mlp": 0.01270149, "balance_loss_clip": 0.0628164, "balance_loss_mlp": 0.01259361, "epoch": 0.6485495265293852, "flos": 30234924489600.0, "grad_norm": 1.8356423467916916, "language_loss": 0.71365142, "learning_rate": 1.161544469455041e-06, "loss": 0.79063118, "num_input_tokens_seen": 232800515, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.10784912, "step": 10787, "time_per_iteration": 2.6417815685272217 }, { "auxiliary_loss_clip": 0.06433493, "auxiliary_loss_mlp": 0.01268045, "balance_loss_clip": 0.06282617, "balance_loss_mlp": 0.01256351, "epoch": 0.6486096497820532, "flos": 20088050135040.0, "grad_norm": 2.2953763907472777, "language_loss": 0.84562165, "learning_rate": 1.1611909004569934e-06, "loss": 0.92263699, "num_input_tokens_seen": 232818450, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.11706543, "step": 10788, "time_per_iteration": 2.5325064659118652 }, { "auxiliary_loss_clip": 0.06430788, "auxiliary_loss_mlp": 0.01268084, "balance_loss_clip": 0.06282964, "balance_loss_mlp": 0.0125698, "epoch": 0.6486697730347212, "flos": 17134816181760.0, "grad_norm": 1.8537066430729094, "language_loss": 0.77653611, "learning_rate": 1.1608373632666708e-06, "loss": 0.85352486, "num_input_tokens_seen": 232834785, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.11096191, "step": 10789, "time_per_iteration": 2.5223474502563477 }, { "auxiliary_loss_clip": 0.06426923, "auxiliary_loss_mlp": 0.01265799, "balance_loss_clip": 0.06281211, "balance_loss_mlp": 0.01255428, "epoch": 0.6487298962873892, "flos": 38921477886720.0, "grad_norm": 1.8314909344029233, "language_loss": 0.76215994, "learning_rate": 1.160483857897479e-06, "loss": 0.83908713, "num_input_tokens_seen": 232856050, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10375977, "step": 10790, "time_per_iteration": 2.7211713790893555 }, { "auxiliary_loss_clip": 0.06429089, "auxiliary_loss_mlp": 0.01266713, "balance_loss_clip": 0.06283201, "balance_loss_mlp": 0.01256265, "epoch": 0.6487900195400571, "flos": 11952680895360.0, "grad_norm": 2.6968078710767345, "language_loss": 0.609716, "learning_rate": 1.160130384362823e-06, "loss": 0.686674, "num_input_tokens_seen": 232873945, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10449219, "step": 10791, "time_per_iteration": 2.5449538230895996 }, { "auxiliary_loss_clip": 0.06430973, "auxiliary_loss_mlp": 0.01271449, "balance_loss_clip": 0.06282847, "balance_loss_mlp": 0.01260488, "epoch": 0.6488501427927251, "flos": 22350717463680.0, "grad_norm": 2.0018698862255087, "language_loss": 0.86531079, "learning_rate": 1.1597769426761082e-06, "loss": 0.94233501, "num_input_tokens_seen": 232892160, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.10968018, "step": 10792, "time_per_iteration": 2.578101873397827 }, { "auxiliary_loss_clip": 0.06440321, "auxiliary_loss_mlp": 0.01267773, "balance_loss_clip": 0.06287535, "balance_loss_mlp": 0.0125621, "epoch": 0.648910266045393, "flos": 22242753077760.0, "grad_norm": 2.355261662316527, "language_loss": 0.78444105, "learning_rate": 1.159423532850735e-06, "loss": 0.86152196, "num_input_tokens_seen": 232911725, "router_z_loss_clip": 1.52539062, "router_z_loss_mlp": 0.11566162, "step": 10793, "time_per_iteration": 2.5995752811431885 }, { "auxiliary_loss_clip": 0.06435438, "auxiliary_loss_mlp": 0.01268251, "balance_loss_clip": 0.06284395, "balance_loss_mlp": 0.01257045, "epoch": 0.6489703892980611, "flos": 25308269902080.0, "grad_norm": 2.295562630610226, "language_loss": 0.74640763, "learning_rate": 1.1590701549001055e-06, "loss": 0.82344449, "num_input_tokens_seen": 232929085, "router_z_loss_clip": 1.51074219, "router_z_loss_mlp": 0.11193848, "step": 10794, "time_per_iteration": 2.651843547821045 }, { "auxiliary_loss_clip": 0.06431082, "auxiliary_loss_mlp": 0.01265615, "balance_loss_clip": 0.06280959, "balance_loss_mlp": 0.01254612, "epoch": 0.649030512550729, "flos": 24578864110080.0, "grad_norm": 1.5183492796426143, "language_loss": 0.70263588, "learning_rate": 1.158716808837621e-06, "loss": 0.77960283, "num_input_tokens_seen": 232949455, "router_z_loss_clip": 1.50097656, "router_z_loss_mlp": 0.11004639, "step": 10795, "time_per_iteration": 3.9954211711883545 }, { "auxiliary_loss_clip": 0.06441817, "auxiliary_loss_mlp": 0.0127142, "balance_loss_clip": 0.06288757, "balance_loss_mlp": 0.01258891, "epoch": 0.649090635803397, "flos": 26251004989440.0, "grad_norm": 2.251135621416653, "language_loss": 0.54120767, "learning_rate": 1.158363494676679e-06, "loss": 0.61834008, "num_input_tokens_seen": 232969445, "router_z_loss_clip": 1.52929688, "router_z_loss_mlp": 0.12524414, "step": 10796, "time_per_iteration": 2.6220483779907227 }, { "auxiliary_loss_clip": 0.06431632, "auxiliary_loss_mlp": 0.01265081, "balance_loss_clip": 0.06282267, "balance_loss_mlp": 0.01254563, "epoch": 0.6491507590560649, "flos": 24944489400960.0, "grad_norm": 1.622740721725262, "language_loss": 0.78360647, "learning_rate": 1.1580102124306775e-06, "loss": 0.86057365, "num_input_tokens_seen": 232988900, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.10519409, "step": 10797, "time_per_iteration": 2.6860287189483643 }, { "auxiliary_loss_clip": 0.06421857, "auxiliary_loss_mlp": 0.01265625, "balance_loss_clip": 0.06280395, "balance_loss_mlp": 0.01256113, "epoch": 0.6492108823087329, "flos": 19505783312640.0, "grad_norm": 1.8721019526919764, "language_loss": 0.70698285, "learning_rate": 1.1576569621130134e-06, "loss": 0.78385764, "num_input_tokens_seen": 233005060, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09509277, "step": 10798, "time_per_iteration": 2.78220796585083 }, { "auxiliary_loss_clip": 0.06426049, "auxiliary_loss_mlp": 0.01266223, "balance_loss_clip": 0.06278895, "balance_loss_mlp": 0.01255768, "epoch": 0.6492710055614008, "flos": 19725443591040.0, "grad_norm": 1.7682647905495903, "language_loss": 0.77267003, "learning_rate": 1.1573037437370811e-06, "loss": 0.84959275, "num_input_tokens_seen": 233023375, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.10449219, "step": 10799, "time_per_iteration": 2.584174871444702 }, { "auxiliary_loss_clip": 0.0643656, "auxiliary_loss_mlp": 0.01271581, "balance_loss_clip": 0.06282992, "balance_loss_mlp": 0.01260161, "epoch": 0.6493311288140688, "flos": 24324012316800.0, "grad_norm": 1.7429971571569236, "language_loss": 0.7216363, "learning_rate": 1.1569505573162755e-06, "loss": 0.79871768, "num_input_tokens_seen": 233043130, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.11425781, "step": 10800, "time_per_iteration": 4.055465221405029 }, { "auxiliary_loss_clip": 0.06323808, "auxiliary_loss_mlp": 0.01255707, "balance_loss_clip": 0.0626339, "balance_loss_mlp": 0.01253958, "epoch": 0.6493912520667368, "flos": 70953655800960.0, "grad_norm": 0.7606462164831223, "language_loss": 0.60180891, "learning_rate": 1.1565974028639897e-06, "loss": 0.67760408, "num_input_tokens_seen": 233110560, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.01751709, "step": 10801, "time_per_iteration": 3.2887492179870605 }, { "auxiliary_loss_clip": 0.06430058, "auxiliary_loss_mlp": 0.01269437, "balance_loss_clip": 0.06280597, "balance_loss_mlp": 0.01257677, "epoch": 0.6494513753194048, "flos": 25344803082240.0, "grad_norm": 1.9224272924375556, "language_loss": 0.78875846, "learning_rate": 1.156244280393614e-06, "loss": 0.86575341, "num_input_tokens_seen": 233130080, "router_z_loss_clip": 1.49511719, "router_z_loss_mlp": 0.11755371, "step": 10802, "time_per_iteration": 2.609233856201172 }, { "auxiliary_loss_clip": 0.06428888, "auxiliary_loss_mlp": 0.01266096, "balance_loss_clip": 0.06279081, "balance_loss_mlp": 0.01255004, "epoch": 0.6495114985720728, "flos": 24689050629120.0, "grad_norm": 1.6778471804066577, "language_loss": 0.74839544, "learning_rate": 1.155891189918541e-06, "loss": 0.82534534, "num_input_tokens_seen": 233150235, "router_z_loss_clip": 1.49804688, "router_z_loss_mlp": 0.11102295, "step": 10803, "time_per_iteration": 2.5875723361968994 }, { "auxiliary_loss_clip": 0.06429606, "auxiliary_loss_mlp": 0.01265323, "balance_loss_clip": 0.06282261, "balance_loss_mlp": 0.01254368, "epoch": 0.6495716218247407, "flos": 23656520292480.0, "grad_norm": 5.838123051901687, "language_loss": 0.70216739, "learning_rate": 1.1555381314521578e-06, "loss": 0.77911675, "num_input_tokens_seen": 233166710, "router_z_loss_clip": 1.47558594, "router_z_loss_mlp": 0.10961914, "step": 10804, "time_per_iteration": 2.5701839923858643 }, { "auxiliary_loss_clip": 0.06423898, "auxiliary_loss_mlp": 0.01264499, "balance_loss_clip": 0.06278124, "balance_loss_mlp": 0.0125321, "epoch": 0.6496317450774087, "flos": 22352729961600.0, "grad_norm": 8.756754725782166, "language_loss": 0.72990078, "learning_rate": 1.1551851050078537e-06, "loss": 0.80678469, "num_input_tokens_seen": 233185445, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.112854, "step": 10805, "time_per_iteration": 2.5690596103668213 }, { "auxiliary_loss_clip": 0.06423506, "auxiliary_loss_mlp": 0.01264822, "balance_loss_clip": 0.06274368, "balance_loss_mlp": 0.01255053, "epoch": 0.6496918683300766, "flos": 30526519098240.0, "grad_norm": 2.2598042558291427, "language_loss": 0.66082442, "learning_rate": 1.1548321105990155e-06, "loss": 0.73770773, "num_input_tokens_seen": 233205805, "router_z_loss_clip": 1.49023438, "router_z_loss_mlp": 0.09765625, "step": 10806, "time_per_iteration": 2.658971071243286 }, { "auxiliary_loss_clip": 0.06433515, "auxiliary_loss_mlp": 0.01266615, "balance_loss_clip": 0.06280023, "balance_loss_mlp": 0.0125529, "epoch": 0.6497519915827447, "flos": 12463977709440.0, "grad_norm": 2.1819294964800093, "language_loss": 0.79100269, "learning_rate": 1.1544791482390275e-06, "loss": 0.86800402, "num_input_tokens_seen": 233224215, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.11322021, "step": 10807, "time_per_iteration": 2.5689263343811035 }, { "auxiliary_loss_clip": 0.06323233, "auxiliary_loss_mlp": 0.0125345, "balance_loss_clip": 0.06263345, "balance_loss_mlp": 0.01251804, "epoch": 0.6498121148354126, "flos": 69115787544960.0, "grad_norm": 0.8145243300402478, "language_loss": 0.58897209, "learning_rate": 1.1541262179412745e-06, "loss": 0.66473901, "num_input_tokens_seen": 233294440, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.01649475, "step": 10808, "time_per_iteration": 3.3705806732177734 }, { "auxiliary_loss_clip": 0.06420573, "auxiliary_loss_mlp": 0.01263384, "balance_loss_clip": 0.06278172, "balance_loss_mlp": 0.01252643, "epoch": 0.6498722380880806, "flos": 36904983454080.0, "grad_norm": 1.9627935134728491, "language_loss": 0.63212264, "learning_rate": 1.1537733197191415e-06, "loss": 0.7089622, "num_input_tokens_seen": 233316125, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10742188, "step": 10809, "time_per_iteration": 4.14583158493042 }, { "auxiliary_loss_clip": 0.0642134, "auxiliary_loss_mlp": 0.01265375, "balance_loss_clip": 0.06276542, "balance_loss_mlp": 0.01255266, "epoch": 0.6499323613407485, "flos": 29024549861760.0, "grad_norm": 1.4327330911307476, "language_loss": 0.81716466, "learning_rate": 1.153420453586008e-06, "loss": 0.89403182, "num_input_tokens_seen": 233336140, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.10107422, "step": 10810, "time_per_iteration": 2.6470882892608643 }, { "auxiliary_loss_clip": 0.06416364, "auxiliary_loss_mlp": 0.01268301, "balance_loss_clip": 0.06274149, "balance_loss_mlp": 0.01257989, "epoch": 0.6499924845934165, "flos": 20125212220800.0, "grad_norm": 1.5027268091710584, "language_loss": 0.71748745, "learning_rate": 1.1530676195552561e-06, "loss": 0.79433411, "num_input_tokens_seen": 233356095, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.10314941, "step": 10811, "time_per_iteration": 4.092877626419067 }, { "auxiliary_loss_clip": 0.06416261, "auxiliary_loss_mlp": 0.01267391, "balance_loss_clip": 0.0627588, "balance_loss_mlp": 0.01257121, "epoch": 0.6500526078460844, "flos": 24427490509440.0, "grad_norm": 1.6995450588540075, "language_loss": 0.77822006, "learning_rate": 1.1527148176402649e-06, "loss": 0.85505652, "num_input_tokens_seen": 233376830, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.10266113, "step": 10812, "time_per_iteration": 2.6224191188812256 }, { "auxiliary_loss_clip": 0.06427547, "auxiliary_loss_mlp": 0.01266756, "balance_loss_clip": 0.06279321, "balance_loss_mlp": 0.01255127, "epoch": 0.6501127310987524, "flos": 23337700306560.0, "grad_norm": 1.8936084632357828, "language_loss": 0.85373974, "learning_rate": 1.152362047854413e-06, "loss": 0.93068278, "num_input_tokens_seen": 233395275, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.11621094, "step": 10813, "time_per_iteration": 2.563267946243286 }, { "auxiliary_loss_clip": 0.06421679, "auxiliary_loss_mlp": 0.01267437, "balance_loss_clip": 0.06275857, "balance_loss_mlp": 0.01256982, "epoch": 0.6501728543514204, "flos": 18703814284800.0, "grad_norm": 1.9312661616603275, "language_loss": 0.80438614, "learning_rate": 1.1520093102110764e-06, "loss": 0.88127732, "num_input_tokens_seen": 233413345, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.10455322, "step": 10814, "time_per_iteration": 2.565883159637451 }, { "auxiliary_loss_clip": 0.06431594, "auxiliary_loss_mlp": 0.01264005, "balance_loss_clip": 0.06280304, "balance_loss_mlp": 0.01253079, "epoch": 0.6502329776040884, "flos": 44209858550400.0, "grad_norm": 1.5685238567149486, "language_loss": 0.65238655, "learning_rate": 1.1516566047236328e-06, "loss": 0.72934252, "num_input_tokens_seen": 233436105, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.10925293, "step": 10815, "time_per_iteration": 2.7590487003326416 }, { "auxiliary_loss_clip": 0.06432962, "auxiliary_loss_mlp": 0.01269225, "balance_loss_clip": 0.06279278, "balance_loss_mlp": 0.01256398, "epoch": 0.6502931008567564, "flos": 14580009192960.0, "grad_norm": 2.1617456094328484, "language_loss": 0.75544465, "learning_rate": 1.1513039314054546e-06, "loss": 0.83246648, "num_input_tokens_seen": 233452320, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.12835693, "step": 10816, "time_per_iteration": 2.5464820861816406 }, { "auxiliary_loss_clip": 0.06421245, "auxiliary_loss_mlp": 0.01268006, "balance_loss_clip": 0.06277291, "balance_loss_mlp": 0.01256616, "epoch": 0.6503532241094243, "flos": 21400980560640.0, "grad_norm": 1.6432846426630898, "language_loss": 0.73297524, "learning_rate": 1.1509512902699174e-06, "loss": 0.80986774, "num_input_tokens_seen": 233469920, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.1138916, "step": 10817, "time_per_iteration": 2.572309732437134 }, { "auxiliary_loss_clip": 0.0642249, "auxiliary_loss_mlp": 0.01267228, "balance_loss_clip": 0.06274747, "balance_loss_mlp": 0.01256219, "epoch": 0.6504133473620923, "flos": 74756349648000.0, "grad_norm": 1.7047822017247107, "language_loss": 0.72013581, "learning_rate": 1.1505986813303916e-06, "loss": 0.79703295, "num_input_tokens_seen": 233499780, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.10998535, "step": 10818, "time_per_iteration": 2.9919886589050293 }, { "auxiliary_loss_clip": 0.06425487, "auxiliary_loss_mlp": 0.0126821, "balance_loss_clip": 0.06275267, "balance_loss_mlp": 0.0125735, "epoch": 0.6504734706147602, "flos": 19718399848320.0, "grad_norm": 2.14450543828958, "language_loss": 0.65170044, "learning_rate": 1.150246104600249e-06, "loss": 0.72863746, "num_input_tokens_seen": 233518235, "router_z_loss_clip": 1.50292969, "router_z_loss_mlp": 0.10864258, "step": 10819, "time_per_iteration": 2.5713918209075928 }, { "auxiliary_loss_clip": 0.0642722, "auxiliary_loss_mlp": 0.01266802, "balance_loss_clip": 0.06277318, "balance_loss_mlp": 0.01256126, "epoch": 0.6505335938674283, "flos": 25563960236160.0, "grad_norm": 7.619378132832352, "language_loss": 0.84246898, "learning_rate": 1.14989356009286e-06, "loss": 0.91940928, "num_input_tokens_seen": 233535215, "router_z_loss_clip": 1.50097656, "router_z_loss_mlp": 0.10675049, "step": 10820, "time_per_iteration": 2.6235809326171875 }, { "auxiliary_loss_clip": 0.06429462, "auxiliary_loss_mlp": 0.01267933, "balance_loss_clip": 0.0627799, "balance_loss_mlp": 0.01256209, "epoch": 0.6505937171200962, "flos": 17827143742080.0, "grad_norm": 1.9600413004575907, "language_loss": 0.78396028, "learning_rate": 1.1495410478215914e-06, "loss": 0.86093414, "num_input_tokens_seen": 233552775, "router_z_loss_clip": 1.51367188, "router_z_loss_mlp": 0.1171875, "step": 10821, "time_per_iteration": 2.5811305046081543 }, { "auxiliary_loss_clip": 0.06416196, "auxiliary_loss_mlp": 0.0126779, "balance_loss_clip": 0.06274003, "balance_loss_mlp": 0.01258348, "epoch": 0.6506538403727642, "flos": 20674467734400.0, "grad_norm": 1.433629132892809, "language_loss": 0.80024993, "learning_rate": 1.1491885677998126e-06, "loss": 0.87708974, "num_input_tokens_seen": 233572080, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09454346, "step": 10822, "time_per_iteration": 2.6141951084136963 }, { "auxiliary_loss_clip": 0.06417608, "auxiliary_loss_mlp": 0.01263412, "balance_loss_clip": 0.06272799, "balance_loss_mlp": 0.01252689, "epoch": 0.6507139636254321, "flos": 11724970625280.0, "grad_norm": 1.8384816640771693, "language_loss": 0.87548327, "learning_rate": 1.1488361200408883e-06, "loss": 0.95229352, "num_input_tokens_seen": 233589155, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.1071167, "step": 10823, "time_per_iteration": 2.545933485031128 }, { "auxiliary_loss_clip": 0.06418842, "auxiliary_loss_mlp": 0.01264861, "balance_loss_clip": 0.06270908, "balance_loss_mlp": 0.01253835, "epoch": 0.6507740868781001, "flos": 26769177838080.0, "grad_norm": 1.5839891585313661, "language_loss": 0.66700131, "learning_rate": 1.148483704558183e-06, "loss": 0.74383831, "num_input_tokens_seen": 233608180, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.11029053, "step": 10824, "time_per_iteration": 2.6091184616088867 }, { "auxiliary_loss_clip": 0.06428225, "auxiliary_loss_mlp": 0.01268442, "balance_loss_clip": 0.06277436, "balance_loss_mlp": 0.01256962, "epoch": 0.650834210130768, "flos": 16477260865920.0, "grad_norm": 2.4271084650426276, "language_loss": 0.87867618, "learning_rate": 1.1481313213650607e-06, "loss": 0.95564294, "num_input_tokens_seen": 233625750, "router_z_loss_clip": 1.50683594, "router_z_loss_mlp": 0.11499023, "step": 10825, "time_per_iteration": 2.5903327465057373 }, { "auxiliary_loss_clip": 0.06426232, "auxiliary_loss_mlp": 0.01267057, "balance_loss_clip": 0.06275241, "balance_loss_mlp": 0.01254391, "epoch": 0.650894333383436, "flos": 17134354984320.0, "grad_norm": 2.137769536709055, "language_loss": 0.73424757, "learning_rate": 1.147778970474885e-06, "loss": 0.81118053, "num_input_tokens_seen": 233644235, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.12670898, "step": 10826, "time_per_iteration": 2.574411392211914 }, { "auxiliary_loss_clip": 0.06425447, "auxiliary_loss_mlp": 0.01264777, "balance_loss_clip": 0.0627867, "balance_loss_mlp": 0.01254477, "epoch": 0.650954456636104, "flos": 18740221683840.0, "grad_norm": 1.990022452117842, "language_loss": 0.70007688, "learning_rate": 1.1474266519010157e-06, "loss": 0.77697915, "num_input_tokens_seen": 233662845, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.10302734, "step": 10827, "time_per_iteration": 2.574099063873291 }, { "auxiliary_loss_clip": 0.06426944, "auxiliary_loss_mlp": 0.01267441, "balance_loss_clip": 0.06277657, "balance_loss_mlp": 0.01256677, "epoch": 0.651014579888772, "flos": 24533987448960.0, "grad_norm": 1.940749350339102, "language_loss": 0.7742033, "learning_rate": 1.1470743656568136e-06, "loss": 0.85114706, "num_input_tokens_seen": 233681990, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.10772705, "step": 10828, "time_per_iteration": 2.633953809738159 }, { "auxiliary_loss_clip": 0.06418975, "auxiliary_loss_mlp": 0.01263059, "balance_loss_clip": 0.06273232, "balance_loss_mlp": 0.01252771, "epoch": 0.65107470314144, "flos": 24067944639360.0, "grad_norm": 1.6370583726004992, "language_loss": 0.89622647, "learning_rate": 1.1467221117556362e-06, "loss": 0.97304684, "num_input_tokens_seen": 233698930, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10290527, "step": 10829, "time_per_iteration": 2.592299461364746 }, { "auxiliary_loss_clip": 0.06313971, "auxiliary_loss_mlp": 0.01250917, "balance_loss_clip": 0.06254121, "balance_loss_mlp": 0.01249258, "epoch": 0.6511348263941079, "flos": 72502304561280.0, "grad_norm": 0.6367875193572462, "language_loss": 0.55347574, "learning_rate": 1.1463698902108428e-06, "loss": 0.62912464, "num_input_tokens_seen": 233769825, "router_z_loss_clip": 0.59912109, "router_z_loss_mlp": 0.01661682, "step": 10830, "time_per_iteration": 3.327580690383911 }, { "auxiliary_loss_clip": 0.06431271, "auxiliary_loss_mlp": 0.01265966, "balance_loss_clip": 0.06279118, "balance_loss_mlp": 0.01254385, "epoch": 0.6511949496467759, "flos": 23374401194880.0, "grad_norm": 2.010802323259489, "language_loss": 0.75156462, "learning_rate": 1.1460177010357878e-06, "loss": 0.82853699, "num_input_tokens_seen": 233787095, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.11572266, "step": 10831, "time_per_iteration": 2.5709125995635986 }, { "auxiliary_loss_clip": 0.06314316, "auxiliary_loss_mlp": 0.01251084, "balance_loss_clip": 0.06254714, "balance_loss_mlp": 0.01249572, "epoch": 0.6512550728994438, "flos": 67353390218880.0, "grad_norm": 0.6382107342350376, "language_loss": 0.51035738, "learning_rate": 1.145665544243828e-06, "loss": 0.58601141, "num_input_tokens_seen": 233853050, "router_z_loss_clip": 0.59521484, "router_z_loss_mlp": 0.01512146, "step": 10832, "time_per_iteration": 3.277261734008789 }, { "auxiliary_loss_clip": 0.06430055, "auxiliary_loss_mlp": 0.01266355, "balance_loss_clip": 0.06277806, "balance_loss_mlp": 0.01255615, "epoch": 0.6513151961521119, "flos": 21147973557120.0, "grad_norm": 2.001319114080363, "language_loss": 0.83745682, "learning_rate": 1.145313419848316e-06, "loss": 0.91442096, "num_input_tokens_seen": 233871385, "router_z_loss_clip": 1.52148438, "router_z_loss_mlp": 0.10748291, "step": 10833, "time_per_iteration": 2.5504355430603027 }, { "auxiliary_loss_clip": 0.06420706, "auxiliary_loss_mlp": 0.01267402, "balance_loss_clip": 0.06274849, "balance_loss_mlp": 0.01256053, "epoch": 0.6513753194047798, "flos": 15164246586240.0, "grad_norm": 2.0692129182018566, "language_loss": 0.83680487, "learning_rate": 1.1449613278626049e-06, "loss": 0.91368598, "num_input_tokens_seen": 233888175, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.11340332, "step": 10834, "time_per_iteration": 3.933246374130249 }, { "auxiliary_loss_clip": 0.06423908, "auxiliary_loss_mlp": 0.01271245, "balance_loss_clip": 0.06275965, "balance_loss_mlp": 0.01260701, "epoch": 0.6514354426574478, "flos": 30234421365120.0, "grad_norm": 1.4630373145096518, "language_loss": 0.7746526, "learning_rate": 1.1446092683000455e-06, "loss": 0.8516041, "num_input_tokens_seen": 233911470, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.10546875, "step": 10835, "time_per_iteration": 2.6338226795196533 }, { "auxiliary_loss_clip": 0.06425472, "auxiliary_loss_mlp": 0.01265715, "balance_loss_clip": 0.06277678, "balance_loss_mlp": 0.01255022, "epoch": 0.6514955659101157, "flos": 24212232570240.0, "grad_norm": 1.9399273758351705, "language_loss": 0.77772999, "learning_rate": 1.1442572411739882e-06, "loss": 0.8546418, "num_input_tokens_seen": 233932135, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.10705566, "step": 10836, "time_per_iteration": 2.584737539291382 }, { "auxiliary_loss_clip": 0.06423871, "auxiliary_loss_mlp": 0.0126921, "balance_loss_clip": 0.06276374, "balance_loss_mlp": 0.01258701, "epoch": 0.6515556891627837, "flos": 12381351984000.0, "grad_norm": 4.559882878198619, "language_loss": 0.82486826, "learning_rate": 1.143905246497783e-06, "loss": 0.90179908, "num_input_tokens_seen": 233947880, "router_z_loss_clip": 1.47363281, "router_z_loss_mlp": 0.10516357, "step": 10837, "time_per_iteration": 2.5306482315063477 }, { "auxiliary_loss_clip": 0.06422657, "auxiliary_loss_mlp": 0.01269662, "balance_loss_clip": 0.06278887, "balance_loss_mlp": 0.01258469, "epoch": 0.6516158124154516, "flos": 49612366874880.0, "grad_norm": 2.831632528319076, "language_loss": 0.59468079, "learning_rate": 1.1435532842847758e-06, "loss": 0.67160398, "num_input_tokens_seen": 233971475, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.11199951, "step": 10838, "time_per_iteration": 2.810809373855591 }, { "auxiliary_loss_clip": 0.06311613, "auxiliary_loss_mlp": 0.01251526, "balance_loss_clip": 0.0625163, "balance_loss_mlp": 0.01249939, "epoch": 0.6516759356681197, "flos": 59720848531200.0, "grad_norm": 0.7182869777371041, "language_loss": 0.60528463, "learning_rate": 1.1432013545483147e-06, "loss": 0.68091607, "num_input_tokens_seen": 234030690, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.01587677, "step": 10839, "time_per_iteration": 3.2285943031311035 }, { "auxiliary_loss_clip": 0.06423293, "auxiliary_loss_mlp": 0.01263408, "balance_loss_clip": 0.06278743, "balance_loss_mlp": 0.01253526, "epoch": 0.6517360589207876, "flos": 37459815264000.0, "grad_norm": 1.6382102923370798, "language_loss": 0.68040615, "learning_rate": 1.1428494573017439e-06, "loss": 0.75727314, "num_input_tokens_seen": 234052470, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.09887695, "step": 10840, "time_per_iteration": 4.1851255893707275 }, { "auxiliary_loss_clip": 0.06427935, "auxiliary_loss_mlp": 0.01265323, "balance_loss_clip": 0.06280454, "balance_loss_mlp": 0.01254446, "epoch": 0.6517961821734556, "flos": 25382049022080.0, "grad_norm": 3.2270468977915585, "language_loss": 0.74310386, "learning_rate": 1.1424975925584071e-06, "loss": 0.82003647, "num_input_tokens_seen": 234071495, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.10876465, "step": 10841, "time_per_iteration": 2.6046814918518066 }, { "auxiliary_loss_clip": 0.06427409, "auxiliary_loss_mlp": 0.01264429, "balance_loss_clip": 0.06278342, "balance_loss_mlp": 0.01253241, "epoch": 0.6518563054261236, "flos": 28774519678080.0, "grad_norm": 1.410511989277224, "language_loss": 0.62775987, "learning_rate": 1.142145760331648e-06, "loss": 0.70467824, "num_input_tokens_seen": 234092325, "router_z_loss_clip": 1.49121094, "router_z_loss_mlp": 0.11193848, "step": 10842, "time_per_iteration": 2.664263963699341 }, { "auxiliary_loss_clip": 0.06317576, "auxiliary_loss_mlp": 0.01253397, "balance_loss_clip": 0.06257499, "balance_loss_mlp": 0.01251567, "epoch": 0.6519164286787915, "flos": 68942905372800.0, "grad_norm": 0.813426403076287, "language_loss": 0.56083357, "learning_rate": 1.141793960634807e-06, "loss": 0.63654333, "num_input_tokens_seen": 234148005, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.01824951, "step": 10843, "time_per_iteration": 3.1341819763183594 }, { "auxiliary_loss_clip": 0.0642991, "auxiliary_loss_mlp": 0.01268993, "balance_loss_clip": 0.06275473, "balance_loss_mlp": 0.01256392, "epoch": 0.6519765519314595, "flos": 20447009026560.0, "grad_norm": 1.663965937714691, "language_loss": 0.82733804, "learning_rate": 1.1414421934812253e-06, "loss": 0.90432703, "num_input_tokens_seen": 234164280, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.12597656, "step": 10844, "time_per_iteration": 2.9390695095062256 }, { "auxiliary_loss_clip": 0.06422296, "auxiliary_loss_mlp": 0.01266107, "balance_loss_clip": 0.06274282, "balance_loss_mlp": 0.01254627, "epoch": 0.6520366751841274, "flos": 28410571468800.0, "grad_norm": 4.479385337330749, "language_loss": 0.60002321, "learning_rate": 1.1410904588842421e-06, "loss": 0.67690724, "num_input_tokens_seen": 234185090, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.11480713, "step": 10845, "time_per_iteration": 2.684988498687744 }, { "auxiliary_loss_clip": 0.06433572, "auxiliary_loss_mlp": 0.0126638, "balance_loss_clip": 0.06285265, "balance_loss_mlp": 0.01255055, "epoch": 0.6520967984367955, "flos": 22279999017600.0, "grad_norm": 1.6403617560745705, "language_loss": 0.79127675, "learning_rate": 1.140738756857194e-06, "loss": 0.8682763, "num_input_tokens_seen": 234204050, "router_z_loss_clip": 1.48339844, "router_z_loss_mlp": 0.11328125, "step": 10846, "time_per_iteration": 2.5887351036071777 }, { "auxiliary_loss_clip": 0.06320001, "auxiliary_loss_mlp": 0.01254217, "balance_loss_clip": 0.06260101, "balance_loss_mlp": 0.01252575, "epoch": 0.6521569216894634, "flos": 68940123459840.0, "grad_norm": 0.6924840884657183, "language_loss": 0.60058546, "learning_rate": 1.1403870874134192e-06, "loss": 0.67632771, "num_input_tokens_seen": 234269790, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 0.01644897, "step": 10847, "time_per_iteration": 3.286944627761841 }, { "auxiliary_loss_clip": 0.06431255, "auxiliary_loss_mlp": 0.01268019, "balance_loss_clip": 0.06279841, "balance_loss_mlp": 0.01256557, "epoch": 0.6522170449421314, "flos": 29137880908800.0, "grad_norm": 1.5051999175497397, "language_loss": 0.81308007, "learning_rate": 1.1400354505662514e-06, "loss": 0.89007276, "num_input_tokens_seen": 234290135, "router_z_loss_clip": 1.51269531, "router_z_loss_mlp": 0.11462402, "step": 10848, "time_per_iteration": 2.615283489227295 }, { "auxiliary_loss_clip": 0.06424175, "auxiliary_loss_mlp": 0.01269811, "balance_loss_clip": 0.06277901, "balance_loss_mlp": 0.01259309, "epoch": 0.6522771681947993, "flos": 26659284808320.0, "grad_norm": 2.594671193350269, "language_loss": 0.74660444, "learning_rate": 1.1396838463290263e-06, "loss": 0.82354432, "num_input_tokens_seen": 234309535, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.10494995, "step": 10849, "time_per_iteration": 4.058034658432007 }, { "auxiliary_loss_clip": 0.06423228, "auxiliary_loss_mlp": 0.0127102, "balance_loss_clip": 0.06279037, "balance_loss_mlp": 0.0126038, "epoch": 0.6523372914474673, "flos": 25746961553280.0, "grad_norm": 2.563520037363661, "language_loss": 0.6830169, "learning_rate": 1.1393322747150752e-06, "loss": 0.75995934, "num_input_tokens_seen": 234328755, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.10644531, "step": 10850, "time_per_iteration": 4.0284223556518555 }, { "auxiliary_loss_clip": 0.06418628, "auxiliary_loss_mlp": 0.0126382, "balance_loss_clip": 0.06274763, "balance_loss_mlp": 0.01253198, "epoch": 0.6523974147001352, "flos": 24834344808960.0, "grad_norm": 1.568284799037983, "language_loss": 0.66777694, "learning_rate": 1.1389807357377313e-06, "loss": 0.74460143, "num_input_tokens_seen": 234348655, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.10632324, "step": 10851, "time_per_iteration": 2.61570405960083 }, { "auxiliary_loss_clip": 0.0643198, "auxiliary_loss_mlp": 0.01265901, "balance_loss_clip": 0.06279144, "balance_loss_mlp": 0.01254594, "epoch": 0.6524575379528033, "flos": 26323945568640.0, "grad_norm": 2.3601471458692576, "language_loss": 0.74422228, "learning_rate": 1.1386292294103235e-06, "loss": 0.82120109, "num_input_tokens_seen": 234367445, "router_z_loss_clip": 1.52929688, "router_z_loss_mlp": 0.11297607, "step": 10852, "time_per_iteration": 2.6445815563201904 }, { "auxiliary_loss_clip": 0.06429505, "auxiliary_loss_mlp": 0.01267912, "balance_loss_clip": 0.06276996, "balance_loss_mlp": 0.01255568, "epoch": 0.6525176612054712, "flos": 19499200767360.0, "grad_norm": 1.9387684824693758, "language_loss": 0.67310464, "learning_rate": 1.1382777557461812e-06, "loss": 0.7500788, "num_input_tokens_seen": 234384825, "router_z_loss_clip": 1.52636719, "router_z_loss_mlp": 0.12347412, "step": 10853, "time_per_iteration": 2.547032594680786 }, { "auxiliary_loss_clip": 0.06315424, "auxiliary_loss_mlp": 0.01257436, "balance_loss_clip": 0.06255896, "balance_loss_mlp": 0.0125599, "epoch": 0.6525777844581392, "flos": 71727057786240.0, "grad_norm": 0.7117044318758848, "language_loss": 0.63056439, "learning_rate": 1.137926314758634e-06, "loss": 0.70629299, "num_input_tokens_seen": 234450630, "router_z_loss_clip": 0.59423828, "router_z_loss_mlp": 0.01444244, "step": 10854, "time_per_iteration": 3.3182530403137207 }, { "auxiliary_loss_clip": 0.06427309, "auxiliary_loss_mlp": 0.01272228, "balance_loss_clip": 0.062783, "balance_loss_mlp": 0.01260247, "epoch": 0.6526379077108072, "flos": 26660668400640.0, "grad_norm": 1.5901202857541892, "language_loss": 0.77614272, "learning_rate": 1.1375749064610072e-06, "loss": 0.85313803, "num_input_tokens_seen": 234473505, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.11987305, "step": 10855, "time_per_iteration": 2.623685359954834 }, { "auxiliary_loss_clip": 0.06417395, "auxiliary_loss_mlp": 0.01267665, "balance_loss_clip": 0.06275354, "balance_loss_mlp": 0.01256972, "epoch": 0.6526980309634751, "flos": 22826990471040.0, "grad_norm": 1.7478693543362323, "language_loss": 0.795196, "learning_rate": 1.1372235308666256e-06, "loss": 0.87204665, "num_input_tokens_seen": 234492485, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10681152, "step": 10856, "time_per_iteration": 2.584606647491455 }, { "auxiliary_loss_clip": 0.06420726, "auxiliary_loss_mlp": 0.01267303, "balance_loss_clip": 0.06274148, "balance_loss_mlp": 0.01256193, "epoch": 0.6527581542161431, "flos": 28372403134080.0, "grad_norm": 1.9119343001116083, "language_loss": 0.73857939, "learning_rate": 1.136872187988815e-06, "loss": 0.81545973, "num_input_tokens_seen": 234512645, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.11120605, "step": 10857, "time_per_iteration": 2.6332223415374756 }, { "auxiliary_loss_clip": 0.06421642, "auxiliary_loss_mlp": 0.01266856, "balance_loss_clip": 0.06274438, "balance_loss_mlp": 0.01256419, "epoch": 0.652818277468811, "flos": 18375099517440.0, "grad_norm": 2.240162259910854, "language_loss": 0.63210177, "learning_rate": 1.1365208778408965e-06, "loss": 0.70898676, "num_input_tokens_seen": 234529310, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.10424805, "step": 10858, "time_per_iteration": 2.532819986343384 }, { "auxiliary_loss_clip": 0.06419365, "auxiliary_loss_mlp": 0.01266577, "balance_loss_clip": 0.06275088, "balance_loss_mlp": 0.01255943, "epoch": 0.6528784007214791, "flos": 18041227724160.0, "grad_norm": 1.7193335919622579, "language_loss": 0.78565955, "learning_rate": 1.1361696004361939e-06, "loss": 0.86251891, "num_input_tokens_seen": 234546685, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10638428, "step": 10859, "time_per_iteration": 2.515890121459961 }, { "auxiliary_loss_clip": 0.06429464, "auxiliary_loss_mlp": 0.01263774, "balance_loss_clip": 0.06278848, "balance_loss_mlp": 0.01252628, "epoch": 0.652938523974147, "flos": 22388466528000.0, "grad_norm": 1.5997430862333666, "language_loss": 0.67946881, "learning_rate": 1.1358183557880256e-06, "loss": 0.75640118, "num_input_tokens_seen": 234566255, "router_z_loss_clip": 1.50488281, "router_z_loss_mlp": 0.1114502, "step": 10860, "time_per_iteration": 2.5858097076416016 }, { "auxiliary_loss_clip": 0.06432045, "auxiliary_loss_mlp": 0.01270417, "balance_loss_clip": 0.0628057, "balance_loss_mlp": 0.01259766, "epoch": 0.652998647226815, "flos": 16769694015360.0, "grad_norm": 1.8348251993780147, "language_loss": 0.67239404, "learning_rate": 1.135467143909712e-06, "loss": 0.74941862, "num_input_tokens_seen": 234585405, "router_z_loss_clip": 1.51464844, "router_z_loss_mlp": 0.10650635, "step": 10861, "time_per_iteration": 2.537731170654297 }, { "auxiliary_loss_clip": 0.06428595, "auxiliary_loss_mlp": 0.01269094, "balance_loss_clip": 0.06279351, "balance_loss_mlp": 0.01256971, "epoch": 0.6530587704794829, "flos": 35781259547520.0, "grad_norm": 1.7027858818135244, "language_loss": 0.65372777, "learning_rate": 1.135115964814572e-06, "loss": 0.73070467, "num_input_tokens_seen": 234608095, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.12115479, "step": 10862, "time_per_iteration": 2.6637771129608154 }, { "auxiliary_loss_clip": 0.06425473, "auxiliary_loss_mlp": 0.01267434, "balance_loss_clip": 0.0627905, "balance_loss_mlp": 0.01257038, "epoch": 0.6531188937321509, "flos": 19321901527680.0, "grad_norm": 1.572762514445488, "language_loss": 0.77357566, "learning_rate": 1.13476481851592e-06, "loss": 0.85050476, "num_input_tokens_seen": 234627335, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.10400391, "step": 10863, "time_per_iteration": 2.553391456604004 }, { "auxiliary_loss_clip": 0.06425278, "auxiliary_loss_mlp": 0.01264293, "balance_loss_clip": 0.06277424, "balance_loss_mlp": 0.0125416, "epoch": 0.6531790169848188, "flos": 22900476101760.0, "grad_norm": 1.9366947982684302, "language_loss": 0.75128251, "learning_rate": 1.1344137050270739e-06, "loss": 0.82817817, "num_input_tokens_seen": 234646540, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.10137939, "step": 10864, "time_per_iteration": 2.573662042617798 }, { "auxiliary_loss_clip": 0.0642378, "auxiliary_loss_mlp": 0.01266405, "balance_loss_clip": 0.06279145, "balance_loss_mlp": 0.01256051, "epoch": 0.6532391402374869, "flos": 29570157722880.0, "grad_norm": 1.8269285749834019, "language_loss": 0.86178845, "learning_rate": 1.1340626243613458e-06, "loss": 0.9386903, "num_input_tokens_seen": 234665470, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.10351562, "step": 10865, "time_per_iteration": 2.631916046142578 }, { "auxiliary_loss_clip": 0.06428468, "auxiliary_loss_mlp": 0.01270689, "balance_loss_clip": 0.06278131, "balance_loss_mlp": 0.01259996, "epoch": 0.6532992634901548, "flos": 23110996285440.0, "grad_norm": 1.9106326880450106, "language_loss": 0.81534517, "learning_rate": 1.133711576532051e-06, "loss": 0.89233679, "num_input_tokens_seen": 234683955, "router_z_loss_clip": 1.50195312, "router_z_loss_mlp": 0.10687256, "step": 10866, "time_per_iteration": 2.564910888671875 }, { "auxiliary_loss_clip": 0.06423237, "auxiliary_loss_mlp": 0.01264839, "balance_loss_clip": 0.06278147, "balance_loss_mlp": 0.01254003, "epoch": 0.6533593867428228, "flos": 26074460436480.0, "grad_norm": 1.4024268030999276, "language_loss": 0.82466573, "learning_rate": 1.1333605615524995e-06, "loss": 0.90154648, "num_input_tokens_seen": 234704595, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.10827637, "step": 10867, "time_per_iteration": 2.5823192596435547 }, { "auxiliary_loss_clip": 0.06421703, "auxiliary_loss_mlp": 0.01264383, "balance_loss_clip": 0.0627408, "balance_loss_mlp": 0.01253875, "epoch": 0.6534195099954908, "flos": 21218398513920.0, "grad_norm": 1.7931233487955551, "language_loss": 0.81594306, "learning_rate": 1.1330095794360016e-06, "loss": 0.89280397, "num_input_tokens_seen": 234724090, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.10510254, "step": 10868, "time_per_iteration": 2.543330192565918 }, { "auxiliary_loss_clip": 0.06430463, "auxiliary_loss_mlp": 0.01266298, "balance_loss_clip": 0.06279323, "balance_loss_mlp": 0.01255235, "epoch": 0.6534796332481587, "flos": 19652754574080.0, "grad_norm": 1.86911287928558, "language_loss": 0.80253828, "learning_rate": 1.1326586301958675e-06, "loss": 0.87950587, "num_input_tokens_seen": 234742560, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.11065674, "step": 10869, "time_per_iteration": 2.5295357704162598 }, { "auxiliary_loss_clip": 0.06424727, "auxiliary_loss_mlp": 0.01266555, "balance_loss_clip": 0.06277857, "balance_loss_mlp": 0.01255689, "epoch": 0.6535397565008267, "flos": 24028979690880.0, "grad_norm": 1.7967835002481871, "language_loss": 0.72677803, "learning_rate": 1.1323077138454063e-06, "loss": 0.80369085, "num_input_tokens_seen": 234762315, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.10858154, "step": 10870, "time_per_iteration": 2.5693182945251465 }, { "auxiliary_loss_clip": 0.0642707, "auxiliary_loss_mlp": 0.01266077, "balance_loss_clip": 0.0628026, "balance_loss_mlp": 0.01256117, "epoch": 0.6535998797534947, "flos": 24608772817920.0, "grad_norm": 2.0462578442907766, "language_loss": 0.74786597, "learning_rate": 1.1319568303979221e-06, "loss": 0.82479739, "num_input_tokens_seen": 234781300, "router_z_loss_clip": 1.46972656, "router_z_loss_mlp": 0.09967041, "step": 10871, "time_per_iteration": 2.6172640323638916 }, { "auxiliary_loss_clip": 0.0642062, "auxiliary_loss_mlp": 0.01265382, "balance_loss_clip": 0.06278052, "balance_loss_mlp": 0.01255267, "epoch": 0.6536600030061627, "flos": 23370292344960.0, "grad_norm": 1.3701946287915159, "language_loss": 0.5622046, "learning_rate": 1.1316059798667227e-06, "loss": 0.63906461, "num_input_tokens_seen": 234801040, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10119629, "step": 10872, "time_per_iteration": 2.6353812217712402 }, { "auxiliary_loss_clip": 0.06428488, "auxiliary_loss_mlp": 0.01267323, "balance_loss_clip": 0.06284524, "balance_loss_mlp": 0.01256874, "epoch": 0.6537201262588306, "flos": 23885278738560.0, "grad_norm": 1.5340245793330771, "language_loss": 0.7509234, "learning_rate": 1.1312551622651112e-06, "loss": 0.82788146, "num_input_tokens_seen": 234821415, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.10449219, "step": 10873, "time_per_iteration": 4.063173532485962 }, { "auxiliary_loss_clip": 0.0642399, "auxiliary_loss_mlp": 0.0126672, "balance_loss_clip": 0.06277967, "balance_loss_mlp": 0.01256045, "epoch": 0.6537802495114986, "flos": 24361971016320.0, "grad_norm": 1.6416370760508705, "language_loss": 0.75854099, "learning_rate": 1.1309043776063917e-06, "loss": 0.83544815, "num_input_tokens_seen": 234843795, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.10681152, "step": 10874, "time_per_iteration": 2.632688522338867 }, { "auxiliary_loss_clip": 0.06430389, "auxiliary_loss_mlp": 0.01266416, "balance_loss_clip": 0.06284638, "balance_loss_mlp": 0.01255354, "epoch": 0.6538403727641665, "flos": 28003633315200.0, "grad_norm": 1.540961364122859, "language_loss": 0.81942439, "learning_rate": 1.1305536259038642e-06, "loss": 0.89639246, "num_input_tokens_seen": 234862350, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.11071777, "step": 10875, "time_per_iteration": 2.613856554031372 }, { "auxiliary_loss_clip": 0.06426783, "auxiliary_loss_mlp": 0.01266722, "balance_loss_clip": 0.06279119, "balance_loss_mlp": 0.01256267, "epoch": 0.6539004960168345, "flos": 27571021084800.0, "grad_norm": 1.5159680329143337, "language_loss": 0.70089173, "learning_rate": 1.1302029071708314e-06, "loss": 0.77782679, "num_input_tokens_seen": 234881790, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.10449219, "step": 10876, "time_per_iteration": 2.606250286102295 }, { "auxiliary_loss_clip": 0.0642674, "auxiliary_loss_mlp": 0.01266876, "balance_loss_clip": 0.06281002, "balance_loss_mlp": 0.01255855, "epoch": 0.6539606192695024, "flos": 14533958574720.0, "grad_norm": 2.1689361969713086, "language_loss": 0.79726303, "learning_rate": 1.1298522214205908e-06, "loss": 0.87419915, "num_input_tokens_seen": 234897775, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.11022949, "step": 10877, "time_per_iteration": 2.5040624141693115 }, { "auxiliary_loss_clip": 0.06421461, "auxiliary_loss_mlp": 0.0127372, "balance_loss_clip": 0.06276899, "balance_loss_mlp": 0.01263569, "epoch": 0.6540207425221705, "flos": 21622779118080.0, "grad_norm": 3.5832344621041274, "language_loss": 0.79701698, "learning_rate": 1.1295015686664408e-06, "loss": 0.87396872, "num_input_tokens_seen": 234918395, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.1015625, "step": 10878, "time_per_iteration": 2.583817481994629 }, { "auxiliary_loss_clip": 0.06427908, "auxiliary_loss_mlp": 0.01267708, "balance_loss_clip": 0.06281306, "balance_loss_mlp": 0.01256418, "epoch": 0.6540808657748384, "flos": 17673589935360.0, "grad_norm": 1.7709445385683054, "language_loss": 0.84751207, "learning_rate": 1.1291509489216797e-06, "loss": 0.92446828, "num_input_tokens_seen": 234936260, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.11291504, "step": 10879, "time_per_iteration": 2.535313129425049 }, { "auxiliary_loss_clip": 0.06431761, "auxiliary_loss_mlp": 0.01267153, "balance_loss_clip": 0.06280981, "balance_loss_mlp": 0.01255899, "epoch": 0.6541409890275064, "flos": 14543559866880.0, "grad_norm": 2.6215277386516873, "language_loss": 0.72451645, "learning_rate": 1.128800362199601e-06, "loss": 0.80150563, "num_input_tokens_seen": 234952110, "router_z_loss_clip": 1.50683594, "router_z_loss_mlp": 0.11260986, "step": 10880, "time_per_iteration": 3.958238363265991 }, { "auxiliary_loss_clip": 0.06424606, "auxiliary_loss_mlp": 0.01266507, "balance_loss_clip": 0.06280717, "balance_loss_mlp": 0.01256416, "epoch": 0.6542011122801744, "flos": 17171013945600.0, "grad_norm": 1.8229846670707994, "language_loss": 0.84688175, "learning_rate": 1.1284498085135005e-06, "loss": 0.92379284, "num_input_tokens_seen": 234970810, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10101318, "step": 10881, "time_per_iteration": 2.5252256393432617 }, { "auxiliary_loss_clip": 0.06434765, "auxiliary_loss_mlp": 0.01266151, "balance_loss_clip": 0.06283613, "balance_loss_mlp": 0.01254313, "epoch": 0.6542612355328423, "flos": 18192433616640.0, "grad_norm": 1.9881764844421803, "language_loss": 0.77564692, "learning_rate": 1.1280992878766699e-06, "loss": 0.85265607, "num_input_tokens_seen": 234989565, "router_z_loss_clip": 1.50878906, "router_z_loss_mlp": 0.11846924, "step": 10882, "time_per_iteration": 2.5348970890045166 }, { "auxiliary_loss_clip": 0.06430283, "auxiliary_loss_mlp": 0.01269233, "balance_loss_clip": 0.06282286, "balance_loss_mlp": 0.01257175, "epoch": 0.6543213587855103, "flos": 19798635732480.0, "grad_norm": 1.6179824769380993, "language_loss": 0.81921583, "learning_rate": 1.1277488003024024e-06, "loss": 0.89621091, "num_input_tokens_seen": 235007955, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.12054443, "step": 10883, "time_per_iteration": 2.549778461456299 }, { "auxiliary_loss_clip": 0.06432385, "auxiliary_loss_mlp": 0.01267754, "balance_loss_clip": 0.06283703, "balance_loss_mlp": 0.0125671, "epoch": 0.6543814820381783, "flos": 21111356522880.0, "grad_norm": 2.8844868107001482, "language_loss": 0.85639775, "learning_rate": 1.127398345803988e-06, "loss": 0.93339908, "num_input_tokens_seen": 235024860, "router_z_loss_clip": 1.48632812, "router_z_loss_mlp": 0.11053467, "step": 10884, "time_per_iteration": 2.5491411685943604 }, { "auxiliary_loss_clip": 0.06430135, "auxiliary_loss_mlp": 0.01265631, "balance_loss_clip": 0.06283115, "balance_loss_mlp": 0.01254801, "epoch": 0.6544416052908463, "flos": 20200333006080.0, "grad_norm": 2.530553381101282, "language_loss": 0.79943323, "learning_rate": 1.127047924394715e-06, "loss": 0.87639093, "num_input_tokens_seen": 235043815, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.1083374, "step": 10885, "time_per_iteration": 2.588892936706543 }, { "auxiliary_loss_clip": 0.06421162, "auxiliary_loss_mlp": 0.01272593, "balance_loss_clip": 0.06277876, "balance_loss_mlp": 0.01262216, "epoch": 0.6545017285435142, "flos": 23375072027520.0, "grad_norm": 1.9513688150199962, "language_loss": 0.72116375, "learning_rate": 1.1266975360878722e-06, "loss": 0.79810125, "num_input_tokens_seen": 235062985, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.1038208, "step": 10886, "time_per_iteration": 2.5711188316345215 }, { "auxiliary_loss_clip": 0.06425706, "auxiliary_loss_mlp": 0.0126308, "balance_loss_clip": 0.06278652, "balance_loss_mlp": 0.01252965, "epoch": 0.6545618517961822, "flos": 19140619219200.0, "grad_norm": 1.6980989284821884, "language_loss": 0.78532851, "learning_rate": 1.1263471808967468e-06, "loss": 0.86221635, "num_input_tokens_seen": 235081670, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.10113525, "step": 10887, "time_per_iteration": 2.5525453090667725 }, { "auxiliary_loss_clip": 0.06424472, "auxiliary_loss_mlp": 0.01266604, "balance_loss_clip": 0.0627923, "balance_loss_mlp": 0.01255946, "epoch": 0.6546219750488501, "flos": 14943789694080.0, "grad_norm": 4.0414376178131235, "language_loss": 0.79284787, "learning_rate": 1.1259968588346234e-06, "loss": 0.86975873, "num_input_tokens_seen": 235098510, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.10656738, "step": 10888, "time_per_iteration": 2.5254571437835693 }, { "auxiliary_loss_clip": 0.06422883, "auxiliary_loss_mlp": 0.01268486, "balance_loss_clip": 0.06279623, "balance_loss_mlp": 0.01258639, "epoch": 0.6546820983015181, "flos": 36329466885120.0, "grad_norm": 1.8514619883062677, "language_loss": 0.66950023, "learning_rate": 1.1256465699147874e-06, "loss": 0.74641395, "num_input_tokens_seen": 235119990, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09844971, "step": 10889, "time_per_iteration": 4.141958475112915 }, { "auxiliary_loss_clip": 0.06427291, "auxiliary_loss_mlp": 0.01269356, "balance_loss_clip": 0.06278776, "balance_loss_mlp": 0.01257292, "epoch": 0.654742221554186, "flos": 20417519589120.0, "grad_norm": 1.5482659291870429, "language_loss": 0.80023563, "learning_rate": 1.1252963141505203e-06, "loss": 0.87720209, "num_input_tokens_seen": 235139255, "router_z_loss_clip": 1.48632812, "router_z_loss_mlp": 0.1206665, "step": 10890, "time_per_iteration": 4.037220478057861 }, { "auxiliary_loss_clip": 0.06429215, "auxiliary_loss_mlp": 0.01266803, "balance_loss_clip": 0.06277764, "balance_loss_mlp": 0.01255741, "epoch": 0.6548023448068541, "flos": 24870626426880.0, "grad_norm": 14.169806969008645, "language_loss": 0.65855241, "learning_rate": 1.1249460915551052e-06, "loss": 0.73551261, "num_input_tokens_seen": 235158455, "router_z_loss_clip": 1.51367188, "router_z_loss_mlp": 0.11065674, "step": 10891, "time_per_iteration": 2.6548240184783936 }, { "auxiliary_loss_clip": 0.06424324, "auxiliary_loss_mlp": 0.01271414, "balance_loss_clip": 0.06279287, "balance_loss_mlp": 0.01261121, "epoch": 0.654862468059522, "flos": 21432901766400.0, "grad_norm": 1.7448686905793325, "language_loss": 0.79852176, "learning_rate": 1.1245959021418214e-06, "loss": 0.87547916, "num_input_tokens_seen": 235177350, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.10302734, "step": 10892, "time_per_iteration": 2.553717613220215 }, { "auxiliary_loss_clip": 0.06429481, "auxiliary_loss_mlp": 0.01269755, "balance_loss_clip": 0.06278673, "balance_loss_mlp": 0.01259282, "epoch": 0.65492259131219, "flos": 26585002563840.0, "grad_norm": 1.8196303826168798, "language_loss": 0.78109562, "learning_rate": 1.1242457459239497e-06, "loss": 0.85808802, "num_input_tokens_seen": 235196435, "router_z_loss_clip": 1.50878906, "router_z_loss_mlp": 0.10467529, "step": 10893, "time_per_iteration": 2.5794951915740967 }, { "auxiliary_loss_clip": 0.06426376, "auxiliary_loss_mlp": 0.01266186, "balance_loss_clip": 0.06277777, "balance_loss_mlp": 0.01254534, "epoch": 0.6549827145648579, "flos": 21506806667520.0, "grad_norm": 1.4421463130856362, "language_loss": 0.70660192, "learning_rate": 1.123895622914766e-06, "loss": 0.78352755, "num_input_tokens_seen": 235215430, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.11639404, "step": 10894, "time_per_iteration": 2.555551528930664 }, { "auxiliary_loss_clip": 0.06425695, "auxiliary_loss_mlp": 0.01266728, "balance_loss_clip": 0.06275699, "balance_loss_mlp": 0.01254843, "epoch": 0.6550428378175259, "flos": 22599657544320.0, "grad_norm": 2.8052503686251424, "language_loss": 0.62468076, "learning_rate": 1.123545533127549e-06, "loss": 0.70160496, "num_input_tokens_seen": 235232015, "router_z_loss_clip": 1.50097656, "router_z_loss_mlp": 0.11889648, "step": 10895, "time_per_iteration": 2.53674054145813 }, { "auxiliary_loss_clip": 0.06418341, "auxiliary_loss_mlp": 0.01264245, "balance_loss_clip": 0.06274098, "balance_loss_mlp": 0.01254166, "epoch": 0.655102961070194, "flos": 12828848313600.0, "grad_norm": 1.939322848542337, "language_loss": 0.79340285, "learning_rate": 1.1231954765755722e-06, "loss": 0.87022871, "num_input_tokens_seen": 235248115, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.10083008, "step": 10896, "time_per_iteration": 2.5570614337921143 }, { "auxiliary_loss_clip": 0.06418557, "auxiliary_loss_mlp": 0.01268368, "balance_loss_clip": 0.06275994, "balance_loss_mlp": 0.01258569, "epoch": 0.6551630843228619, "flos": 24798105118080.0, "grad_norm": 1.366520448466338, "language_loss": 0.70831591, "learning_rate": 1.1228454532721111e-06, "loss": 0.78518516, "num_input_tokens_seen": 235270785, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.09796143, "step": 10897, "time_per_iteration": 2.6285197734832764 }, { "auxiliary_loss_clip": 0.06427985, "auxiliary_loss_mlp": 0.01270989, "balance_loss_clip": 0.06278501, "balance_loss_mlp": 0.01260665, "epoch": 0.6552232075755299, "flos": 16729597036800.0, "grad_norm": 1.5402923676188511, "language_loss": 0.75555927, "learning_rate": 1.1224954632304391e-06, "loss": 0.83254898, "num_input_tokens_seen": 235287905, "router_z_loss_clip": 1.49511719, "router_z_loss_mlp": 0.10327148, "step": 10898, "time_per_iteration": 2.600539207458496 }, { "auxiliary_loss_clip": 0.06422581, "auxiliary_loss_mlp": 0.01268523, "balance_loss_clip": 0.06276965, "balance_loss_mlp": 0.01258271, "epoch": 0.6552833308281978, "flos": 22022757383040.0, "grad_norm": 2.2007149246426674, "language_loss": 0.74078274, "learning_rate": 1.122145506463827e-06, "loss": 0.81769383, "num_input_tokens_seen": 235305525, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.1026001, "step": 10899, "time_per_iteration": 2.533904790878296 }, { "auxiliary_loss_clip": 0.06423283, "auxiliary_loss_mlp": 0.01268349, "balance_loss_clip": 0.06275959, "balance_loss_mlp": 0.01257668, "epoch": 0.6553434540808658, "flos": 24870332937600.0, "grad_norm": 2.0410530670453477, "language_loss": 0.56798041, "learning_rate": 1.1217955829855443e-06, "loss": 0.64489669, "num_input_tokens_seen": 235324415, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.10681152, "step": 10900, "time_per_iteration": 2.5940730571746826 }, { "auxiliary_loss_clip": 0.06427008, "auxiliary_loss_mlp": 0.01266209, "balance_loss_clip": 0.06279104, "balance_loss_mlp": 0.01255093, "epoch": 0.6554035773335337, "flos": 23227639568640.0, "grad_norm": 3.0216254764002395, "language_loss": 0.77170455, "learning_rate": 1.1214456928088622e-06, "loss": 0.84863669, "num_input_tokens_seen": 235341595, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.11126709, "step": 10901, "time_per_iteration": 2.5488059520721436 }, { "auxiliary_loss_clip": 0.06429236, "auxiliary_loss_mlp": 0.01265769, "balance_loss_clip": 0.06284177, "balance_loss_mlp": 0.01255553, "epoch": 0.6554637005862017, "flos": 22790163801600.0, "grad_norm": 1.7194182109119422, "language_loss": 0.73169768, "learning_rate": 1.1210958359470463e-06, "loss": 0.80864775, "num_input_tokens_seen": 235361700, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.10217285, "step": 10902, "time_per_iteration": 2.5775322914123535 }, { "auxiliary_loss_clip": 0.06416886, "auxiliary_loss_mlp": 0.01265765, "balance_loss_clip": 0.06272663, "balance_loss_mlp": 0.012554, "epoch": 0.6555238238388696, "flos": 21513682702080.0, "grad_norm": 1.7938769795909586, "language_loss": 0.68398958, "learning_rate": 1.1207460124133645e-06, "loss": 0.7608161, "num_input_tokens_seen": 235382065, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.10351562, "step": 10903, "time_per_iteration": 2.555570125579834 }, { "auxiliary_loss_clip": 0.06427059, "auxiliary_loss_mlp": 0.01268715, "balance_loss_clip": 0.06275943, "balance_loss_mlp": 0.01257765, "epoch": 0.6555839470915377, "flos": 30527483420160.0, "grad_norm": 2.044881482466512, "language_loss": 0.66685748, "learning_rate": 1.1203962222210832e-06, "loss": 0.74381518, "num_input_tokens_seen": 235402130, "router_z_loss_clip": 1.50976562, "router_z_loss_mlp": 0.10943604, "step": 10904, "time_per_iteration": 2.6087372303009033 }, { "auxiliary_loss_clip": 0.06427702, "auxiliary_loss_mlp": 0.01266681, "balance_loss_clip": 0.06277348, "balance_loss_mlp": 0.01254533, "epoch": 0.6556440703442056, "flos": 24649582556160.0, "grad_norm": 1.803613929103336, "language_loss": 0.90609378, "learning_rate": 1.120046465383464e-06, "loss": 0.98303765, "num_input_tokens_seen": 235420435, "router_z_loss_clip": 1.50292969, "router_z_loss_mlp": 0.121521, "step": 10905, "time_per_iteration": 2.557338237762451 }, { "auxiliary_loss_clip": 0.06416586, "auxiliary_loss_mlp": 0.01263678, "balance_loss_clip": 0.06275237, "balance_loss_mlp": 0.01253325, "epoch": 0.6557041935968736, "flos": 23739229872000.0, "grad_norm": 2.071033643063306, "language_loss": 0.75596917, "learning_rate": 1.1196967419137721e-06, "loss": 0.83277184, "num_input_tokens_seen": 235439960, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.10345459, "step": 10906, "time_per_iteration": 2.544933557510376 }, { "auxiliary_loss_clip": 0.0642603, "auxiliary_loss_mlp": 0.01266923, "balance_loss_clip": 0.06276227, "balance_loss_mlp": 0.01256027, "epoch": 0.6557643168495415, "flos": 11106464112000.0, "grad_norm": 3.0720029909517836, "language_loss": 0.74714088, "learning_rate": 1.119347051825267e-06, "loss": 0.82407039, "num_input_tokens_seen": 235457495, "router_z_loss_clip": 1.50097656, "router_z_loss_mlp": 0.10894775, "step": 10907, "time_per_iteration": 2.5276215076446533 }, { "auxiliary_loss_clip": 0.06421096, "auxiliary_loss_mlp": 0.01265944, "balance_loss_clip": 0.06274764, "balance_loss_mlp": 0.01254566, "epoch": 0.6558244401022095, "flos": 30198978288000.0, "grad_norm": 1.4508710236710873, "language_loss": 0.72531778, "learning_rate": 1.118997395131211e-06, "loss": 0.80218816, "num_input_tokens_seen": 235479525, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.1137085, "step": 10908, "time_per_iteration": 2.6128597259521484 }, { "auxiliary_loss_clip": 0.06428923, "auxiliary_loss_mlp": 0.01263584, "balance_loss_clip": 0.06281763, "balance_loss_mlp": 0.01252968, "epoch": 0.6558845633548775, "flos": 17936827136640.0, "grad_norm": 2.2528761546616165, "language_loss": 0.82237792, "learning_rate": 1.118647771844861e-06, "loss": 0.89930296, "num_input_tokens_seen": 235496305, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.10626221, "step": 10909, "time_per_iteration": 2.511716842651367 }, { "auxiliary_loss_clip": 0.06426852, "auxiliary_loss_mlp": 0.01268689, "balance_loss_clip": 0.06277839, "balance_loss_mlp": 0.01257692, "epoch": 0.6559446866075455, "flos": 21909929460480.0, "grad_norm": 2.012074216362446, "language_loss": 0.64639068, "learning_rate": 1.1182981819794767e-06, "loss": 0.72334611, "num_input_tokens_seen": 235512545, "router_z_loss_clip": 1.49023438, "router_z_loss_mlp": 0.10998535, "step": 10910, "time_per_iteration": 2.5569920539855957 }, { "auxiliary_loss_clip": 0.06437699, "auxiliary_loss_mlp": 0.01269593, "balance_loss_clip": 0.06282165, "balance_loss_mlp": 0.01257106, "epoch": 0.6560048098602135, "flos": 14131674322560.0, "grad_norm": 3.1232844742692727, "language_loss": 0.76070505, "learning_rate": 1.117948625548313e-06, "loss": 0.83777797, "num_input_tokens_seen": 235526045, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.12487793, "step": 10911, "time_per_iteration": 2.5193417072296143 }, { "auxiliary_loss_clip": 0.06418261, "auxiliary_loss_mlp": 0.01269012, "balance_loss_clip": 0.06276219, "balance_loss_mlp": 0.01258879, "epoch": 0.6560649331128814, "flos": 18813623460480.0, "grad_norm": 1.509809025369934, "language_loss": 0.75768411, "learning_rate": 1.1175991025646265e-06, "loss": 0.83455676, "num_input_tokens_seen": 235545285, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.10131836, "step": 10912, "time_per_iteration": 2.576259136199951 }, { "auxiliary_loss_clip": 0.06437736, "auxiliary_loss_mlp": 0.01272589, "balance_loss_clip": 0.0628109, "balance_loss_mlp": 0.01260263, "epoch": 0.6561250563655494, "flos": 17058940709760.0, "grad_norm": 1.6709468819457036, "language_loss": 0.77734828, "learning_rate": 1.1172496130416697e-06, "loss": 0.85445154, "num_input_tokens_seen": 235563150, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.12322998, "step": 10913, "time_per_iteration": 3.9488632678985596 }, { "auxiliary_loss_clip": 0.06419182, "auxiliary_loss_mlp": 0.0126583, "balance_loss_clip": 0.06277294, "balance_loss_mlp": 0.01256186, "epoch": 0.6561851796182173, "flos": 22644198789120.0, "grad_norm": 1.8511563445327088, "language_loss": 0.71285927, "learning_rate": 1.1169001569926961e-06, "loss": 0.78970945, "num_input_tokens_seen": 235582535, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.09649658, "step": 10914, "time_per_iteration": 2.5530877113342285 }, { "auxiliary_loss_clip": 0.06424543, "auxiliary_loss_mlp": 0.01269925, "balance_loss_clip": 0.06279805, "balance_loss_mlp": 0.01258809, "epoch": 0.6562453028708853, "flos": 19244307047040.0, "grad_norm": 1.6715812306957363, "language_loss": 0.74154961, "learning_rate": 1.116550734430958e-06, "loss": 0.81849426, "num_input_tokens_seen": 235601490, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.11120605, "step": 10915, "time_per_iteration": 2.5682778358459473 }, { "auxiliary_loss_clip": 0.06420626, "auxiliary_loss_mlp": 0.01268192, "balance_loss_clip": 0.06276135, "balance_loss_mlp": 0.01256742, "epoch": 0.6563054261235532, "flos": 23807390768640.0, "grad_norm": 1.4799431561665233, "language_loss": 0.79533613, "learning_rate": 1.1162013453697042e-06, "loss": 0.87222433, "num_input_tokens_seen": 235619165, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.11456299, "step": 10916, "time_per_iteration": 2.5678951740264893 }, { "auxiliary_loss_clip": 0.06424221, "auxiliary_loss_mlp": 0.01269868, "balance_loss_clip": 0.06276472, "balance_loss_mlp": 0.01259175, "epoch": 0.6563655493762213, "flos": 19245271368960.0, "grad_norm": 2.1522975937217383, "language_loss": 0.763273, "learning_rate": 1.1158519898221831e-06, "loss": 0.84021389, "num_input_tokens_seen": 235637115, "router_z_loss_clip": 1.47558594, "router_z_loss_mlp": 0.10699463, "step": 10917, "time_per_iteration": 2.5878684520721436 }, { "auxiliary_loss_clip": 0.06419883, "auxiliary_loss_mlp": 0.01268689, "balance_loss_clip": 0.06276512, "balance_loss_mlp": 0.01257233, "epoch": 0.6564256726288892, "flos": 25563457111680.0, "grad_norm": 2.0004884718018245, "language_loss": 0.70479476, "learning_rate": 1.1155026678016445e-06, "loss": 0.78168046, "num_input_tokens_seen": 235656330, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.11462402, "step": 10918, "time_per_iteration": 2.5848264694213867 }, { "auxiliary_loss_clip": 0.06415305, "auxiliary_loss_mlp": 0.01267911, "balance_loss_clip": 0.06274374, "balance_loss_mlp": 0.01257504, "epoch": 0.6564857958815572, "flos": 22207226146560.0, "grad_norm": 1.5404009060146535, "language_loss": 0.76483887, "learning_rate": 1.115153379321332e-06, "loss": 0.84167099, "num_input_tokens_seen": 235674510, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.10406494, "step": 10919, "time_per_iteration": 4.03309440612793 }, { "auxiliary_loss_clip": 0.06321745, "auxiliary_loss_mlp": 0.01259453, "balance_loss_clip": 0.06262597, "balance_loss_mlp": 0.01258034, "epoch": 0.6565459191342251, "flos": 58139188462080.0, "grad_norm": 0.7465570491174152, "language_loss": 0.5301863, "learning_rate": 1.1148041243944931e-06, "loss": 0.60599828, "num_input_tokens_seen": 235735050, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.0141983, "step": 10920, "time_per_iteration": 3.1838746070861816 }, { "auxiliary_loss_clip": 0.06419969, "auxiliary_loss_mlp": 0.01263866, "balance_loss_clip": 0.06277868, "balance_loss_mlp": 0.01253948, "epoch": 0.6566060423868931, "flos": 30817400947200.0, "grad_norm": 1.5582104675891413, "language_loss": 0.65655988, "learning_rate": 1.1144549030343697e-06, "loss": 0.7333982, "num_input_tokens_seen": 235757545, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09924316, "step": 10921, "time_per_iteration": 2.6916511058807373 }, { "auxiliary_loss_clip": 0.06420492, "auxiliary_loss_mlp": 0.01267677, "balance_loss_clip": 0.06275365, "balance_loss_mlp": 0.0125522, "epoch": 0.6566661656395612, "flos": 23374107705600.0, "grad_norm": 1.6825120059113696, "language_loss": 0.81194299, "learning_rate": 1.114105715254205e-06, "loss": 0.8888247, "num_input_tokens_seen": 235777265, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.12457275, "step": 10922, "time_per_iteration": 2.6237802505493164 }, { "auxiliary_loss_clip": 0.06423132, "auxiliary_loss_mlp": 0.01266903, "balance_loss_clip": 0.06275652, "balance_loss_mlp": 0.01256091, "epoch": 0.6567262888922291, "flos": 25742098016640.0, "grad_norm": 1.9050218429952408, "language_loss": 0.72018391, "learning_rate": 1.1137565610672414e-06, "loss": 0.79708421, "num_input_tokens_seen": 235796565, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.10809326, "step": 10923, "time_per_iteration": 2.569066286087036 }, { "auxiliary_loss_clip": 0.06423729, "auxiliary_loss_mlp": 0.01265751, "balance_loss_clip": 0.06277415, "balance_loss_mlp": 0.01255291, "epoch": 0.6567864121448971, "flos": 17128569052800.0, "grad_norm": 1.9495127312082272, "language_loss": 0.80605674, "learning_rate": 1.1134074404867169e-06, "loss": 0.8829515, "num_input_tokens_seen": 235814805, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.10455322, "step": 10924, "time_per_iteration": 2.5483453273773193 }, { "auxiliary_loss_clip": 0.06422514, "auxiliary_loss_mlp": 0.01262624, "balance_loss_clip": 0.06277139, "balance_loss_mlp": 0.01253052, "epoch": 0.656846535397565, "flos": 22425922103040.0, "grad_norm": 1.5461846077893726, "language_loss": 0.72902781, "learning_rate": 1.1130583535258717e-06, "loss": 0.80587924, "num_input_tokens_seen": 235833405, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.09570312, "step": 10925, "time_per_iteration": 2.5603058338165283 }, { "auxiliary_loss_clip": 0.06423355, "auxiliary_loss_mlp": 0.01265899, "balance_loss_clip": 0.062769, "balance_loss_mlp": 0.01255557, "epoch": 0.656906658650233, "flos": 17708991085440.0, "grad_norm": 2.366672221585082, "language_loss": 0.72602594, "learning_rate": 1.112709300197942e-06, "loss": 0.80291849, "num_input_tokens_seen": 235848530, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.10345459, "step": 10926, "time_per_iteration": 2.541595220565796 }, { "auxiliary_loss_clip": 0.06427991, "auxiliary_loss_mlp": 0.01266215, "balance_loss_clip": 0.0627854, "balance_loss_mlp": 0.01254425, "epoch": 0.6569667819029009, "flos": 21180942938880.0, "grad_norm": 3.3572011598932145, "language_loss": 0.72978926, "learning_rate": 1.1123602805161656e-06, "loss": 0.80673134, "num_input_tokens_seen": 235867225, "router_z_loss_clip": 1.49414062, "router_z_loss_mlp": 0.11791992, "step": 10927, "time_per_iteration": 2.5536365509033203 }, { "auxiliary_loss_clip": 0.06309623, "auxiliary_loss_mlp": 0.01256026, "balance_loss_clip": 0.06250623, "balance_loss_mlp": 0.01254721, "epoch": 0.6570269051555689, "flos": 68783299344000.0, "grad_norm": 0.7169238357148866, "language_loss": 0.64418972, "learning_rate": 1.112011294493775e-06, "loss": 0.71984619, "num_input_tokens_seen": 235932925, "router_z_loss_clip": 0.59130859, "router_z_loss_mlp": 0.01305389, "step": 10928, "time_per_iteration": 4.614474534988403 }, { "auxiliary_loss_clip": 0.06422355, "auxiliary_loss_mlp": 0.01271151, "balance_loss_clip": 0.06277493, "balance_loss_mlp": 0.0125985, "epoch": 0.6570870284082369, "flos": 26325874212480.0, "grad_norm": 1.6763340464517331, "language_loss": 0.7806958, "learning_rate": 1.1116623421440063e-06, "loss": 0.85763085, "num_input_tokens_seen": 235952680, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.11303711, "step": 10929, "time_per_iteration": 2.599722146987915 }, { "auxiliary_loss_clip": 0.06415602, "auxiliary_loss_mlp": 0.01264054, "balance_loss_clip": 0.06271674, "balance_loss_mlp": 0.0125286, "epoch": 0.6571471516609049, "flos": 26181544354560.0, "grad_norm": 1.6753702724546482, "language_loss": 0.65556884, "learning_rate": 1.1113134234800895e-06, "loss": 0.73236549, "num_input_tokens_seen": 235972075, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.11187744, "step": 10930, "time_per_iteration": 3.996661424636841 }, { "auxiliary_loss_clip": 0.06423472, "auxiliary_loss_mlp": 0.01266956, "balance_loss_clip": 0.06277257, "balance_loss_mlp": 0.01256257, "epoch": 0.6572072749135728, "flos": 20382537709440.0, "grad_norm": 1.591942707864639, "language_loss": 0.71237695, "learning_rate": 1.110964538515258e-06, "loss": 0.78928125, "num_input_tokens_seen": 235990340, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10705566, "step": 10931, "time_per_iteration": 2.544487953186035 }, { "auxiliary_loss_clip": 0.06424496, "auxiliary_loss_mlp": 0.01268659, "balance_loss_clip": 0.06274998, "balance_loss_mlp": 0.01258306, "epoch": 0.6572673981662408, "flos": 17134438838400.0, "grad_norm": 2.083073217055359, "language_loss": 0.68973702, "learning_rate": 1.1106156872627393e-06, "loss": 0.76666856, "num_input_tokens_seen": 236007470, "router_z_loss_clip": 1.49414062, "router_z_loss_mlp": 0.10357666, "step": 10932, "time_per_iteration": 2.5116641521453857 }, { "auxiliary_loss_clip": 0.06424965, "auxiliary_loss_mlp": 0.01265564, "balance_loss_clip": 0.06278196, "balance_loss_mlp": 0.0125505, "epoch": 0.6573275214189087, "flos": 41283640339200.0, "grad_norm": 1.779366475903778, "language_loss": 0.79915285, "learning_rate": 1.1102668697357626e-06, "loss": 0.8760581, "num_input_tokens_seen": 236029030, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.10516357, "step": 10933, "time_per_iteration": 2.7195417881011963 }, { "auxiliary_loss_clip": 0.0642909, "auxiliary_loss_mlp": 0.01264602, "balance_loss_clip": 0.06279901, "balance_loss_mlp": 0.01254064, "epoch": 0.6573876446715767, "flos": 22896241470720.0, "grad_norm": 1.783648610041282, "language_loss": 0.74167871, "learning_rate": 1.1099180859475571e-06, "loss": 0.81861567, "num_input_tokens_seen": 236047160, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.10534668, "step": 10934, "time_per_iteration": 2.5462536811828613 }, { "auxiliary_loss_clip": 0.06419428, "auxiliary_loss_mlp": 0.01266713, "balance_loss_clip": 0.06275968, "balance_loss_mlp": 0.01256401, "epoch": 0.6574477679242448, "flos": 44028240825600.0, "grad_norm": 1.6460918883428735, "language_loss": 0.76105213, "learning_rate": 1.1095693359113454e-06, "loss": 0.83791357, "num_input_tokens_seen": 236069215, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.10314941, "step": 10935, "time_per_iteration": 2.7388181686401367 }, { "auxiliary_loss_clip": 0.0642135, "auxiliary_loss_mlp": 0.01267433, "balance_loss_clip": 0.06273936, "balance_loss_mlp": 0.01256079, "epoch": 0.6575078911769127, "flos": 24578402912640.0, "grad_norm": 1.9076517107789284, "language_loss": 0.78767693, "learning_rate": 1.1092206196403538e-06, "loss": 0.86456466, "num_input_tokens_seen": 236088335, "router_z_loss_clip": 1.47363281, "router_z_loss_mlp": 0.11364746, "step": 10936, "time_per_iteration": 2.5888826847076416 }, { "auxiliary_loss_clip": 0.06419763, "auxiliary_loss_mlp": 0.01271347, "balance_loss_clip": 0.0627667, "balance_loss_mlp": 0.01261107, "epoch": 0.6575680144295807, "flos": 20930493484800.0, "grad_norm": 1.761967986238006, "language_loss": 0.69013357, "learning_rate": 1.1088719371478056e-06, "loss": 0.76704466, "num_input_tokens_seen": 236108540, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10235596, "step": 10937, "time_per_iteration": 2.5973246097564697 }, { "auxiliary_loss_clip": 0.06419697, "auxiliary_loss_mlp": 0.0126547, "balance_loss_clip": 0.06274303, "balance_loss_mlp": 0.0125399, "epoch": 0.6576281376822486, "flos": 10930213048320.0, "grad_norm": 2.4290452851226836, "language_loss": 0.68518859, "learning_rate": 1.1085232884469236e-06, "loss": 0.76204026, "num_input_tokens_seen": 236124495, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.11468506, "step": 10938, "time_per_iteration": 2.712109327316284 }, { "auxiliary_loss_clip": 0.06423234, "auxiliary_loss_mlp": 0.01265044, "balance_loss_clip": 0.06277085, "balance_loss_mlp": 0.01254434, "epoch": 0.6576882609349166, "flos": 19287632407680.0, "grad_norm": 1.9665990126498265, "language_loss": 0.71066457, "learning_rate": 1.108174673550927e-06, "loss": 0.78754735, "num_input_tokens_seen": 236142550, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.10620117, "step": 10939, "time_per_iteration": 2.7431933879852295 }, { "auxiliary_loss_clip": 0.06424186, "auxiliary_loss_mlp": 0.01268068, "balance_loss_clip": 0.06276078, "balance_loss_mlp": 0.01256308, "epoch": 0.6577483841875845, "flos": 20225168542080.0, "grad_norm": 2.402472132052399, "language_loss": 0.77815866, "learning_rate": 1.107826092473037e-06, "loss": 0.85508114, "num_input_tokens_seen": 236156620, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.11749268, "step": 10940, "time_per_iteration": 2.6213889122009277 }, { "auxiliary_loss_clip": 0.06426467, "auxiliary_loss_mlp": 0.0126916, "balance_loss_clip": 0.06274944, "balance_loss_mlp": 0.01257805, "epoch": 0.6578085074402525, "flos": 34759672168320.0, "grad_norm": 2.17919101184848, "language_loss": 0.68594038, "learning_rate": 1.107477545226471e-06, "loss": 0.76289666, "num_input_tokens_seen": 236177095, "router_z_loss_clip": 1.51269531, "router_z_loss_mlp": 0.1137085, "step": 10941, "time_per_iteration": 2.766287326812744 }, { "auxiliary_loss_clip": 0.06419057, "auxiliary_loss_mlp": 0.01262919, "balance_loss_clip": 0.06273252, "balance_loss_mlp": 0.01252834, "epoch": 0.6578686306929205, "flos": 23476705430400.0, "grad_norm": 1.8817683588146539, "language_loss": 0.68710959, "learning_rate": 1.1071290318244448e-06, "loss": 0.76392925, "num_input_tokens_seen": 236194695, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.10089111, "step": 10942, "time_per_iteration": 2.582824945449829 }, { "auxiliary_loss_clip": 0.06428665, "auxiliary_loss_mlp": 0.01269327, "balance_loss_clip": 0.06274902, "balance_loss_mlp": 0.01257692, "epoch": 0.6579287539455885, "flos": 18082876003200.0, "grad_norm": 1.8225761556411542, "language_loss": 0.71759576, "learning_rate": 1.1067805522801753e-06, "loss": 0.79457563, "num_input_tokens_seen": 236213885, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.11645508, "step": 10943, "time_per_iteration": 2.5810210704803467 }, { "auxiliary_loss_clip": 0.06418926, "auxiliary_loss_mlp": 0.01267715, "balance_loss_clip": 0.06274699, "balance_loss_mlp": 0.01257112, "epoch": 0.6579888771982564, "flos": 28669532112000.0, "grad_norm": 1.8554512921965753, "language_loss": 0.59761918, "learning_rate": 1.1064321066068778e-06, "loss": 0.67448556, "num_input_tokens_seen": 236237315, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.1060791, "step": 10944, "time_per_iteration": 2.6379435062408447 }, { "auxiliary_loss_clip": 0.06432991, "auxiliary_loss_mlp": 0.01269948, "balance_loss_clip": 0.0627933, "balance_loss_mlp": 0.01257878, "epoch": 0.6580490004509244, "flos": 25053627744000.0, "grad_norm": 1.537632026022629, "language_loss": 0.7264449, "learning_rate": 1.1060836948177646e-06, "loss": 0.80347431, "num_input_tokens_seen": 236256345, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.1206665, "step": 10945, "time_per_iteration": 2.6020145416259766 }, { "auxiliary_loss_clip": 0.06420527, "auxiliary_loss_mlp": 0.01265834, "balance_loss_clip": 0.06276713, "balance_loss_mlp": 0.01255731, "epoch": 0.6581091237035923, "flos": 43519040363520.0, "grad_norm": 1.5606045595564952, "language_loss": 0.70251054, "learning_rate": 1.105735316926046e-06, "loss": 0.77937424, "num_input_tokens_seen": 236281890, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10107422, "step": 10946, "time_per_iteration": 2.7467408180236816 }, { "auxiliary_loss_clip": 0.06419076, "auxiliary_loss_mlp": 0.0126565, "balance_loss_clip": 0.06273444, "balance_loss_mlp": 0.01255189, "epoch": 0.6581692469562603, "flos": 22421352055680.0, "grad_norm": 1.9926750034571186, "language_loss": 0.82731611, "learning_rate": 1.105386972944934e-06, "loss": 0.90416336, "num_input_tokens_seen": 236298370, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10461426, "step": 10947, "time_per_iteration": 2.5513579845428467 }, { "auxiliary_loss_clip": 0.06424567, "auxiliary_loss_mlp": 0.01264426, "balance_loss_clip": 0.06276371, "balance_loss_mlp": 0.01253835, "epoch": 0.6582293702089284, "flos": 24866098306560.0, "grad_norm": 1.5424245264194407, "language_loss": 0.77623099, "learning_rate": 1.1050386628876385e-06, "loss": 0.85312092, "num_input_tokens_seen": 236317380, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.10595703, "step": 10948, "time_per_iteration": 2.57070255279541 }, { "auxiliary_loss_clip": 0.06419015, "auxiliary_loss_mlp": 0.01265796, "balance_loss_clip": 0.06275007, "balance_loss_mlp": 0.01255723, "epoch": 0.6582894934615963, "flos": 23046399187200.0, "grad_norm": 1.6279307214481507, "language_loss": 0.79001617, "learning_rate": 1.1046903867673655e-06, "loss": 0.86686432, "num_input_tokens_seen": 236336210, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10076904, "step": 10949, "time_per_iteration": 2.5622165203094482 }, { "auxiliary_loss_clip": 0.06302825, "auxiliary_loss_mlp": 0.01250507, "balance_loss_clip": 0.0624364, "balance_loss_mlp": 0.01249327, "epoch": 0.6583496167142643, "flos": 72573274569600.0, "grad_norm": 0.7254513942848005, "language_loss": 0.61746496, "learning_rate": 1.104342144597323e-06, "loss": 0.69299829, "num_input_tokens_seen": 236403090, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.01178741, "step": 10950, "time_per_iteration": 3.251453161239624 }, { "auxiliary_loss_clip": 0.06417651, "auxiliary_loss_mlp": 0.01267842, "balance_loss_clip": 0.06276557, "balance_loss_mlp": 0.01258508, "epoch": 0.6584097399669322, "flos": 13083867815040.0, "grad_norm": 1.8683510189889692, "language_loss": 0.67099941, "learning_rate": 1.1039939363907178e-06, "loss": 0.74785435, "num_input_tokens_seen": 236420475, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09332275, "step": 10951, "time_per_iteration": 2.540761947631836 }, { "auxiliary_loss_clip": 0.06417702, "auxiliary_loss_mlp": 0.01269934, "balance_loss_clip": 0.06274983, "balance_loss_mlp": 0.01259611, "epoch": 0.6584698632196002, "flos": 28700530922880.0, "grad_norm": 1.448006491887197, "language_loss": 0.76639593, "learning_rate": 1.1036457621607504e-06, "loss": 0.84327227, "num_input_tokens_seen": 236441915, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.10314941, "step": 10952, "time_per_iteration": 3.9869182109832764 }, { "auxiliary_loss_clip": 0.06419847, "auxiliary_loss_mlp": 0.01267089, "balance_loss_clip": 0.06276993, "balance_loss_mlp": 0.01256468, "epoch": 0.6585299864722681, "flos": 14324486567040.0, "grad_norm": 2.531723824930416, "language_loss": 0.73649061, "learning_rate": 1.1032976219206257e-06, "loss": 0.81335998, "num_input_tokens_seen": 236460340, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.1060791, "step": 10953, "time_per_iteration": 2.52294921875 }, { "auxiliary_loss_clip": 0.06417652, "auxiliary_loss_mlp": 0.01266865, "balance_loss_clip": 0.0627349, "balance_loss_mlp": 0.01256279, "epoch": 0.6585901097249361, "flos": 26805291747840.0, "grad_norm": 1.802949882961572, "language_loss": 0.78976846, "learning_rate": 1.102949515683546e-06, "loss": 0.86661369, "num_input_tokens_seen": 236478280, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10595703, "step": 10954, "time_per_iteration": 2.60294771194458 }, { "auxiliary_loss_clip": 0.06421138, "auxiliary_loss_mlp": 0.01270171, "balance_loss_clip": 0.06276086, "balance_loss_mlp": 0.01259883, "epoch": 0.658650232977604, "flos": 18738921945600.0, "grad_norm": 1.8625174304008734, "language_loss": 0.69968814, "learning_rate": 1.1026014434627096e-06, "loss": 0.7766012, "num_input_tokens_seen": 236493225, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10302734, "step": 10955, "time_per_iteration": 2.5368800163269043 }, { "auxiliary_loss_clip": 0.06415093, "auxiliary_loss_mlp": 0.0126567, "balance_loss_clip": 0.06275167, "balance_loss_mlp": 0.0125571, "epoch": 0.6587103562302721, "flos": 24760272199680.0, "grad_norm": 3.5913535996342305, "language_loss": 0.80572736, "learning_rate": 1.1022534052713172e-06, "loss": 0.88253504, "num_input_tokens_seen": 236514420, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09960938, "step": 10956, "time_per_iteration": 2.601478099822998 }, { "auxiliary_loss_clip": 0.06423149, "auxiliary_loss_mlp": 0.01270039, "balance_loss_clip": 0.06278148, "balance_loss_mlp": 0.01258029, "epoch": 0.65877047948294, "flos": 22352688034560.0, "grad_norm": 3.3390658749539237, "language_loss": 0.81435066, "learning_rate": 1.1019054011225648e-06, "loss": 0.89128256, "num_input_tokens_seen": 236532785, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.12011719, "step": 10957, "time_per_iteration": 2.549116373062134 }, { "auxiliary_loss_clip": 0.06416118, "auxiliary_loss_mlp": 0.01265859, "balance_loss_clip": 0.06273901, "balance_loss_mlp": 0.01256751, "epoch": 0.658830602735608, "flos": 45189965358720.0, "grad_norm": 1.6289030283100332, "language_loss": 0.76330578, "learning_rate": 1.1015574310296506e-06, "loss": 0.84012556, "num_input_tokens_seen": 236553330, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09106445, "step": 10958, "time_per_iteration": 2.762193202972412 }, { "auxiliary_loss_clip": 0.06417935, "auxiliary_loss_mlp": 0.01268999, "balance_loss_clip": 0.06273466, "balance_loss_mlp": 0.01258347, "epoch": 0.6588907259882759, "flos": 19907774075520.0, "grad_norm": 1.5474891845395786, "language_loss": 0.74922395, "learning_rate": 1.1012094950057678e-06, "loss": 0.82609326, "num_input_tokens_seen": 236572960, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10656738, "step": 10959, "time_per_iteration": 4.065315246582031 }, { "auxiliary_loss_clip": 0.06419329, "auxiliary_loss_mlp": 0.01265367, "balance_loss_clip": 0.06274558, "balance_loss_mlp": 0.01255204, "epoch": 0.6589508492409439, "flos": 24140591729280.0, "grad_norm": 1.5646772753896696, "language_loss": 0.65071297, "learning_rate": 1.1008615930641107e-06, "loss": 0.72755992, "num_input_tokens_seen": 236594090, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.1015625, "step": 10960, "time_per_iteration": 2.5997509956359863 }, { "auxiliary_loss_clip": 0.06427548, "auxiliary_loss_mlp": 0.012667, "balance_loss_clip": 0.06275368, "balance_loss_mlp": 0.01255881, "epoch": 0.659010972493612, "flos": 18228715234560.0, "grad_norm": 2.706238057125816, "language_loss": 0.82188082, "learning_rate": 1.1005137252178734e-06, "loss": 0.89882326, "num_input_tokens_seen": 236610190, "router_z_loss_clip": 1.52148438, "router_z_loss_mlp": 0.1083374, "step": 10961, "time_per_iteration": 2.553311824798584 }, { "auxiliary_loss_clip": 0.06419809, "auxiliary_loss_mlp": 0.0126907, "balance_loss_clip": 0.0627431, "balance_loss_mlp": 0.01259074, "epoch": 0.6590710957462799, "flos": 27607428483840.0, "grad_norm": 4.678569044478485, "language_loss": 0.73602015, "learning_rate": 1.1001658914802453e-06, "loss": 0.81290895, "num_input_tokens_seen": 236631575, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.09997559, "step": 10962, "time_per_iteration": 2.642085075378418 }, { "auxiliary_loss_clip": 0.06424683, "auxiliary_loss_mlp": 0.01268959, "balance_loss_clip": 0.0627585, "balance_loss_mlp": 0.0125798, "epoch": 0.6591312189989479, "flos": 20309177859840.0, "grad_norm": 1.7486740550393267, "language_loss": 0.80511874, "learning_rate": 1.0998180918644165e-06, "loss": 0.88205516, "num_input_tokens_seen": 236649815, "router_z_loss_clip": 1.48925781, "router_z_loss_mlp": 0.10986328, "step": 10963, "time_per_iteration": 2.5807576179504395 }, { "auxiliary_loss_clip": 0.06416916, "auxiliary_loss_mlp": 0.01267439, "balance_loss_clip": 0.06274591, "balance_loss_mlp": 0.01256984, "epoch": 0.6591913422516158, "flos": 12317886915840.0, "grad_norm": 1.5057529562969352, "language_loss": 0.78555667, "learning_rate": 1.0994703263835754e-06, "loss": 0.86240017, "num_input_tokens_seen": 236668335, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.10461426, "step": 10964, "time_per_iteration": 2.5241758823394775 }, { "auxiliary_loss_clip": 0.0642368, "auxiliary_loss_mlp": 0.01268599, "balance_loss_clip": 0.06274606, "balance_loss_mlp": 0.01258663, "epoch": 0.6592514655042838, "flos": 25891626827520.0, "grad_norm": 1.6280084924689033, "language_loss": 0.74161458, "learning_rate": 1.0991225950509106e-06, "loss": 0.81853735, "num_input_tokens_seen": 236688945, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.09936523, "step": 10965, "time_per_iteration": 2.639439821243286 }, { "auxiliary_loss_clip": 0.06425415, "auxiliary_loss_mlp": 0.01268724, "balance_loss_clip": 0.06274225, "balance_loss_mlp": 0.01257268, "epoch": 0.6593115887569517, "flos": 14068754305920.0, "grad_norm": 2.6526483309753224, "language_loss": 0.74320567, "learning_rate": 1.0987748978796067e-06, "loss": 0.82014704, "num_input_tokens_seen": 236707055, "router_z_loss_clip": 1.51269531, "router_z_loss_mlp": 0.11450195, "step": 10966, "time_per_iteration": 2.5181212425231934 }, { "auxiliary_loss_clip": 0.06418498, "auxiliary_loss_mlp": 0.01265761, "balance_loss_clip": 0.06273706, "balance_loss_mlp": 0.01254758, "epoch": 0.6593717120096197, "flos": 24724912976640.0, "grad_norm": 1.6436709680954216, "language_loss": 0.77329099, "learning_rate": 1.0984272348828487e-06, "loss": 0.85013354, "num_input_tokens_seen": 236725900, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.10992432, "step": 10967, "time_per_iteration": 4.031988620758057 }, { "auxiliary_loss_clip": 0.06309004, "auxiliary_loss_mlp": 0.01252286, "balance_loss_clip": 0.06249984, "balance_loss_mlp": 0.01251081, "epoch": 0.6594318352622877, "flos": 55577951907840.0, "grad_norm": 1.050399511007231, "language_loss": 0.48268485, "learning_rate": 1.0980796060738221e-06, "loss": 0.55829775, "num_input_tokens_seen": 236788415, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.01203156, "step": 10968, "time_per_iteration": 3.148306369781494 }, { "auxiliary_loss_clip": 0.06420632, "auxiliary_loss_mlp": 0.01264198, "balance_loss_clip": 0.06273581, "balance_loss_mlp": 0.01252718, "epoch": 0.6594919585149557, "flos": 17462650481280.0, "grad_norm": 1.7437361552785053, "language_loss": 0.79460984, "learning_rate": 1.0977320114657058e-06, "loss": 0.87145817, "num_input_tokens_seen": 236805155, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.11474609, "step": 10969, "time_per_iteration": 2.5081801414489746 }, { "auxiliary_loss_clip": 0.06422011, "auxiliary_loss_mlp": 0.01266907, "balance_loss_clip": 0.06276046, "balance_loss_mlp": 0.01256053, "epoch": 0.6595520817676236, "flos": 18229092577920.0, "grad_norm": 2.220822842473494, "language_loss": 0.66216946, "learning_rate": 1.0973844510716817e-06, "loss": 0.73905861, "num_input_tokens_seen": 236824360, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10852051, "step": 10970, "time_per_iteration": 3.904585838317871 }, { "auxiliary_loss_clip": 0.06420808, "auxiliary_loss_mlp": 0.01263926, "balance_loss_clip": 0.06274321, "balance_loss_mlp": 0.0125346, "epoch": 0.6596122050202916, "flos": 22206219897600.0, "grad_norm": 1.589829007629208, "language_loss": 0.76953036, "learning_rate": 1.0970369249049308e-06, "loss": 0.84637773, "num_input_tokens_seen": 236844640, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.10461426, "step": 10971, "time_per_iteration": 2.557877540588379 }, { "auxiliary_loss_clip": 0.06420329, "auxiliary_loss_mlp": 0.01265399, "balance_loss_clip": 0.06273247, "balance_loss_mlp": 0.01254265, "epoch": 0.6596723282729595, "flos": 14179108533120.0, "grad_norm": 3.273344932602923, "language_loss": 0.7107693, "learning_rate": 1.096689432978629e-06, "loss": 0.7876265, "num_input_tokens_seen": 236861160, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.11138916, "step": 10972, "time_per_iteration": 2.518573522567749 }, { "auxiliary_loss_clip": 0.06417981, "auxiliary_loss_mlp": 0.01263688, "balance_loss_clip": 0.06272601, "balance_loss_mlp": 0.01253328, "epoch": 0.6597324515256275, "flos": 30560746291200.0, "grad_norm": 1.6476117941637747, "language_loss": 0.55694151, "learning_rate": 1.0963419753059556e-06, "loss": 0.63375819, "num_input_tokens_seen": 236880465, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.1036377, "step": 10973, "time_per_iteration": 2.6109185218811035 }, { "auxiliary_loss_clip": 0.06433673, "auxiliary_loss_mlp": 0.01268909, "balance_loss_clip": 0.06279597, "balance_loss_mlp": 0.01258049, "epoch": 0.6597925747782956, "flos": 17645693725440.0, "grad_norm": 2.0946546457543285, "language_loss": 0.78863871, "learning_rate": 1.0959945519000839e-06, "loss": 0.86566454, "num_input_tokens_seen": 236897730, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.10864258, "step": 10974, "time_per_iteration": 2.5276927947998047 }, { "auxiliary_loss_clip": 0.06421621, "auxiliary_loss_mlp": 0.01265999, "balance_loss_clip": 0.06274493, "balance_loss_mlp": 0.01254745, "epoch": 0.6598526980309635, "flos": 22825523024640.0, "grad_norm": 2.533363120480172, "language_loss": 0.69190156, "learning_rate": 1.0956471627741906e-06, "loss": 0.76877779, "num_input_tokens_seen": 236917300, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.11242676, "step": 10975, "time_per_iteration": 2.5675337314605713 }, { "auxiliary_loss_clip": 0.06421381, "auxiliary_loss_mlp": 0.01263706, "balance_loss_clip": 0.06275336, "balance_loss_mlp": 0.01253585, "epoch": 0.6599128212836315, "flos": 21074194437120.0, "grad_norm": 1.8159965933382167, "language_loss": 0.70671284, "learning_rate": 1.0952998079414464e-06, "loss": 0.78356367, "num_input_tokens_seen": 236935590, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10125732, "step": 10976, "time_per_iteration": 2.5557196140289307 }, { "auxiliary_loss_clip": 0.06416454, "auxiliary_loss_mlp": 0.0126738, "balance_loss_clip": 0.06273814, "balance_loss_mlp": 0.01256652, "epoch": 0.6599729445362994, "flos": 22170022133760.0, "grad_norm": 1.5820835348611124, "language_loss": 0.67736781, "learning_rate": 1.0949524874150243e-06, "loss": 0.75420618, "num_input_tokens_seen": 236952830, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.1072998, "step": 10977, "time_per_iteration": 2.5555503368377686 }, { "auxiliary_loss_clip": 0.06429622, "auxiliary_loss_mlp": 0.0126904, "balance_loss_clip": 0.06277485, "balance_loss_mlp": 0.01257525, "epoch": 0.6600330677889674, "flos": 18155900436480.0, "grad_norm": 2.2063488102319466, "language_loss": 0.81535661, "learning_rate": 1.0946052012080952e-06, "loss": 0.89234322, "num_input_tokens_seen": 236971930, "router_z_loss_clip": 1.52148438, "router_z_loss_mlp": 0.1151123, "step": 10978, "time_per_iteration": 2.522690534591675 }, { "auxiliary_loss_clip": 0.06423867, "auxiliary_loss_mlp": 0.01266477, "balance_loss_clip": 0.06275192, "balance_loss_mlp": 0.01255074, "epoch": 0.6600931910416353, "flos": 18155942363520.0, "grad_norm": 2.3739197949874797, "language_loss": 0.67729145, "learning_rate": 1.0942579493338278e-06, "loss": 0.75419492, "num_input_tokens_seen": 236989920, "router_z_loss_clip": 1.48730469, "router_z_loss_mlp": 0.11401367, "step": 10979, "time_per_iteration": 2.517425298690796 }, { "auxiliary_loss_clip": 0.06421258, "auxiliary_loss_mlp": 0.01266717, "balance_loss_clip": 0.0627315, "balance_loss_mlp": 0.01255851, "epoch": 0.6601533142943034, "flos": 17426494644480.0, "grad_norm": 2.1579286493256005, "language_loss": 0.72890961, "learning_rate": 1.0939107318053889e-06, "loss": 0.80578935, "num_input_tokens_seen": 237006570, "router_z_loss_clip": 1.47949219, "router_z_loss_mlp": 0.10870361, "step": 10980, "time_per_iteration": 2.509563684463501 }, { "auxiliary_loss_clip": 0.06419209, "auxiliary_loss_mlp": 0.01268734, "balance_loss_clip": 0.06278364, "balance_loss_mlp": 0.01258625, "epoch": 0.6602134375469713, "flos": 28226983173120.0, "grad_norm": 1.5272841547459537, "language_loss": 0.73218834, "learning_rate": 1.0935635486359459e-06, "loss": 0.80906785, "num_input_tokens_seen": 237028415, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.10107422, "step": 10981, "time_per_iteration": 2.5998353958129883 }, { "auxiliary_loss_clip": 0.06422383, "auxiliary_loss_mlp": 0.0126551, "balance_loss_clip": 0.06274287, "balance_loss_mlp": 0.01255061, "epoch": 0.6602735607996393, "flos": 29424737761920.0, "grad_norm": 2.2140431429534084, "language_loss": 0.69263649, "learning_rate": 1.0932163998386647e-06, "loss": 0.76951545, "num_input_tokens_seen": 237046595, "router_z_loss_clip": 1.47949219, "router_z_loss_mlp": 0.10449219, "step": 10982, "time_per_iteration": 2.5950441360473633 }, { "auxiliary_loss_clip": 0.06422464, "auxiliary_loss_mlp": 0.01265212, "balance_loss_clip": 0.06278281, "balance_loss_mlp": 0.01253941, "epoch": 0.6603336840523072, "flos": 18593963182080.0, "grad_norm": 1.4846797405683123, "language_loss": 0.69661105, "learning_rate": 1.0928692854267075e-06, "loss": 0.77348781, "num_input_tokens_seen": 237066150, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.11273193, "step": 10983, "time_per_iteration": 2.5915274620056152 }, { "auxiliary_loss_clip": 0.06422359, "auxiliary_loss_mlp": 0.01263365, "balance_loss_clip": 0.06274489, "balance_loss_mlp": 0.0125244, "epoch": 0.6603938073049752, "flos": 33263153447040.0, "grad_norm": 2.001097932544907, "language_loss": 0.70795107, "learning_rate": 1.092522205413239e-06, "loss": 0.78480828, "num_input_tokens_seen": 237087060, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.10931396, "step": 10984, "time_per_iteration": 2.6538593769073486 }, { "auxiliary_loss_clip": 0.06416419, "auxiliary_loss_mlp": 0.01268134, "balance_loss_clip": 0.06274393, "balance_loss_mlp": 0.01257345, "epoch": 0.6604539305576431, "flos": 17390045318400.0, "grad_norm": 1.6260120834141298, "language_loss": 0.8421393, "learning_rate": 1.0921751598114193e-06, "loss": 0.91898489, "num_input_tokens_seen": 237103825, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10784912, "step": 10985, "time_per_iteration": 2.545025110244751 }, { "auxiliary_loss_clip": 0.06425016, "auxiliary_loss_mlp": 0.01266408, "balance_loss_clip": 0.06276712, "balance_loss_mlp": 0.01255477, "epoch": 0.6605140538103111, "flos": 21257447316480.0, "grad_norm": 2.3866438745058796, "language_loss": 0.74153918, "learning_rate": 1.0918281486344077e-06, "loss": 0.81845343, "num_input_tokens_seen": 237121740, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.10931396, "step": 10986, "time_per_iteration": 2.7440929412841797 }, { "auxiliary_loss_clip": 0.06424551, "auxiliary_loss_mlp": 0.01264964, "balance_loss_clip": 0.06280172, "balance_loss_mlp": 0.01255004, "epoch": 0.6605741770629792, "flos": 13886885018880.0, "grad_norm": 1.7640356356158973, "language_loss": 0.79647332, "learning_rate": 1.0914811718953636e-06, "loss": 0.8733685, "num_input_tokens_seen": 237139565, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.09960938, "step": 10987, "time_per_iteration": 2.801466703414917 }, { "auxiliary_loss_clip": 0.0631635, "auxiliary_loss_mlp": 0.01252036, "balance_loss_clip": 0.06257249, "balance_loss_mlp": 0.01250581, "epoch": 0.6606343003156471, "flos": 69338885840640.0, "grad_norm": 0.7991172043638309, "language_loss": 0.54166269, "learning_rate": 1.0911342296074454e-06, "loss": 0.61734653, "num_input_tokens_seen": 237201055, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.014534, "step": 10988, "time_per_iteration": 3.256861925125122 }, { "auxiliary_loss_clip": 0.06423879, "auxiliary_loss_mlp": 0.01267402, "balance_loss_clip": 0.06279454, "balance_loss_mlp": 0.01257019, "epoch": 0.6606944235683151, "flos": 27279887673600.0, "grad_norm": 1.5323975825992797, "language_loss": 0.77456206, "learning_rate": 1.0907873217838077e-06, "loss": 0.85147488, "num_input_tokens_seen": 237221805, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10375977, "step": 10989, "time_per_iteration": 2.643810510635376 }, { "auxiliary_loss_clip": 0.06422158, "auxiliary_loss_mlp": 0.01268092, "balance_loss_clip": 0.06278016, "balance_loss_mlp": 0.01257482, "epoch": 0.660754546820983, "flos": 13778082092160.0, "grad_norm": 1.915956443777793, "language_loss": 0.77193069, "learning_rate": 1.0904404484376064e-06, "loss": 0.84883326, "num_input_tokens_seen": 237238270, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.1060791, "step": 10990, "time_per_iteration": 2.524015188217163 }, { "auxiliary_loss_clip": 0.06430188, "auxiliary_loss_mlp": 0.01264248, "balance_loss_clip": 0.06282926, "balance_loss_mlp": 0.01253877, "epoch": 0.660814670073651, "flos": 15710567207040.0, "grad_norm": 2.0201747703876776, "language_loss": 0.61075056, "learning_rate": 1.0900936095819937e-06, "loss": 0.68769497, "num_input_tokens_seen": 237255400, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.10369873, "step": 10991, "time_per_iteration": 3.963545083999634 }, { "auxiliary_loss_clip": 0.06426881, "auxiliary_loss_mlp": 0.01267278, "balance_loss_clip": 0.0627762, "balance_loss_mlp": 0.01255649, "epoch": 0.6608747933263189, "flos": 20856295094400.0, "grad_norm": 2.452450170459608, "language_loss": 0.68903548, "learning_rate": 1.0897468052301234e-06, "loss": 0.76597703, "num_input_tokens_seen": 237273105, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.11621094, "step": 10992, "time_per_iteration": 2.5772292613983154 }, { "auxiliary_loss_clip": 0.06425706, "auxiliary_loss_mlp": 0.01268055, "balance_loss_clip": 0.06277017, "balance_loss_mlp": 0.01256874, "epoch": 0.660934916578987, "flos": 20638521532800.0, "grad_norm": 1.864990282849175, "language_loss": 0.87809908, "learning_rate": 1.0894000353951444e-06, "loss": 0.95503664, "num_input_tokens_seen": 237292650, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.11175537, "step": 10993, "time_per_iteration": 2.5636045932769775 }, { "auxiliary_loss_clip": 0.0643254, "auxiliary_loss_mlp": 0.01265859, "balance_loss_clip": 0.06281163, "balance_loss_mlp": 0.01253807, "epoch": 0.6609950398316549, "flos": 25119692288640.0, "grad_norm": 1.6014591421055757, "language_loss": 0.67032552, "learning_rate": 1.0890533000902078e-06, "loss": 0.74730951, "num_input_tokens_seen": 237312865, "router_z_loss_clip": 1.51269531, "router_z_loss_mlp": 0.12036133, "step": 10994, "time_per_iteration": 2.599975347518921 }, { "auxiliary_loss_clip": 0.0642339, "auxiliary_loss_mlp": 0.01266811, "balance_loss_clip": 0.06277415, "balance_loss_mlp": 0.01255557, "epoch": 0.6610551630843229, "flos": 18667155323520.0, "grad_norm": 1.6983870958754916, "language_loss": 0.77332932, "learning_rate": 1.0887065993284626e-06, "loss": 0.85023135, "num_input_tokens_seen": 237331210, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.11248779, "step": 10995, "time_per_iteration": 2.5252084732055664 }, { "auxiliary_loss_clip": 0.06425188, "auxiliary_loss_mlp": 0.01267062, "balance_loss_clip": 0.06279078, "balance_loss_mlp": 0.01257275, "epoch": 0.6611152863369908, "flos": 23264885508480.0, "grad_norm": 2.159530025466187, "language_loss": 0.74793684, "learning_rate": 1.088359933123053e-06, "loss": 0.82485938, "num_input_tokens_seen": 237349455, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.09790039, "step": 10996, "time_per_iteration": 2.5967986583709717 }, { "auxiliary_loss_clip": 0.0642238, "auxiliary_loss_mlp": 0.01266365, "balance_loss_clip": 0.06276904, "balance_loss_mlp": 0.01255767, "epoch": 0.6611754095896588, "flos": 22165577867520.0, "grad_norm": 1.6348578788558072, "language_loss": 0.69122529, "learning_rate": 1.088013301487126e-06, "loss": 0.76811278, "num_input_tokens_seen": 237367100, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10601807, "step": 10997, "time_per_iteration": 2.5411407947540283 }, { "auxiliary_loss_clip": 0.06427753, "auxiliary_loss_mlp": 0.01266112, "balance_loss_clip": 0.062775, "balance_loss_mlp": 0.01255907, "epoch": 0.6612355328423267, "flos": 13996442632320.0, "grad_norm": 1.9268615433435803, "language_loss": 0.69241726, "learning_rate": 1.0876667044338269e-06, "loss": 0.76935589, "num_input_tokens_seen": 237384840, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.10205078, "step": 10998, "time_per_iteration": 3.9389233589172363 }, { "auxiliary_loss_clip": 0.06320363, "auxiliary_loss_mlp": 0.01252634, "balance_loss_clip": 0.06260913, "balance_loss_mlp": 0.01251275, "epoch": 0.6612956560949947, "flos": 61472051337600.0, "grad_norm": 0.6399600493432315, "language_loss": 0.50998718, "learning_rate": 1.087320141976297e-06, "loss": 0.5857172, "num_input_tokens_seen": 237443355, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.01360321, "step": 10999, "time_per_iteration": 3.1452488899230957 }, { "auxiliary_loss_clip": 0.06432506, "auxiliary_loss_mlp": 0.01267686, "balance_loss_clip": 0.06280394, "balance_loss_mlp": 0.01256421, "epoch": 0.6613557793476627, "flos": 21623114534400.0, "grad_norm": 2.2192619154300206, "language_loss": 0.70737958, "learning_rate": 1.086973614127679e-06, "loss": 0.78438151, "num_input_tokens_seen": 237459205, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.1126709, "step": 11000, "time_per_iteration": 2.5426523685455322 }, { "auxiliary_loss_clip": 0.06419262, "auxiliary_loss_mlp": 0.01263885, "balance_loss_clip": 0.06276035, "balance_loss_mlp": 0.01254307, "epoch": 0.6614159026003307, "flos": 34028379659520.0, "grad_norm": 1.555463547893936, "language_loss": 0.65436238, "learning_rate": 1.0866271209011133e-06, "loss": 0.73119384, "num_input_tokens_seen": 237483580, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.0958252, "step": 11001, "time_per_iteration": 2.667513608932495 }, { "auxiliary_loss_clip": 0.064245, "auxiliary_loss_mlp": 0.01267049, "balance_loss_clip": 0.0628075, "balance_loss_mlp": 0.01256606, "epoch": 0.6614760258529987, "flos": 24104100476160.0, "grad_norm": 1.7547517859906243, "language_loss": 0.73441869, "learning_rate": 1.086280662309739e-06, "loss": 0.81133419, "num_input_tokens_seen": 237502860, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.10449219, "step": 11002, "time_per_iteration": 2.6086995601654053 }, { "auxiliary_loss_clip": 0.06423074, "auxiliary_loss_mlp": 0.01266457, "balance_loss_clip": 0.06279057, "balance_loss_mlp": 0.01256008, "epoch": 0.6615361491056666, "flos": 14909227084800.0, "grad_norm": 2.045064299712965, "language_loss": 0.7896778, "learning_rate": 1.0859342383666928e-06, "loss": 0.8665731, "num_input_tokens_seen": 237521030, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10443115, "step": 11003, "time_per_iteration": 2.5570530891418457 }, { "auxiliary_loss_clip": 0.06426117, "auxiliary_loss_mlp": 0.01269324, "balance_loss_clip": 0.06278546, "balance_loss_mlp": 0.01257779, "epoch": 0.6615962723583346, "flos": 15310337379840.0, "grad_norm": 2.0349762903764983, "language_loss": 0.69015062, "learning_rate": 1.0855878490851119e-06, "loss": 0.76710504, "num_input_tokens_seen": 237539585, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.11541748, "step": 11004, "time_per_iteration": 2.552945852279663 }, { "auxiliary_loss_clip": 0.06431262, "auxiliary_loss_mlp": 0.0126928, "balance_loss_clip": 0.06280078, "balance_loss_mlp": 0.01257115, "epoch": 0.6616563956110025, "flos": 18738293040000.0, "grad_norm": 2.024503331014921, "language_loss": 0.7016719, "learning_rate": 1.085241494478132e-06, "loss": 0.77867734, "num_input_tokens_seen": 237557655, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.12164307, "step": 11005, "time_per_iteration": 2.553767204284668 }, { "auxiliary_loss_clip": 0.0642045, "auxiliary_loss_mlp": 0.01262986, "balance_loss_clip": 0.06276644, "balance_loss_mlp": 0.0125252, "epoch": 0.6617165188636706, "flos": 24501353483520.0, "grad_norm": 1.8831896731257225, "language_loss": 0.78452218, "learning_rate": 1.0848951745588855e-06, "loss": 0.8613565, "num_input_tokens_seen": 237577000, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.10473633, "step": 11006, "time_per_iteration": 2.5935311317443848 }, { "auxiliary_loss_clip": 0.06419446, "auxiliary_loss_mlp": 0.01268048, "balance_loss_clip": 0.06275037, "balance_loss_mlp": 0.01256902, "epoch": 0.6617766421163385, "flos": 22385741270400.0, "grad_norm": 1.6422393238690456, "language_loss": 0.76821232, "learning_rate": 1.0845488893405068e-06, "loss": 0.84508729, "num_input_tokens_seen": 237597960, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.11138916, "step": 11007, "time_per_iteration": 4.064641714096069 }, { "auxiliary_loss_clip": 0.06423808, "auxiliary_loss_mlp": 0.01265946, "balance_loss_clip": 0.06278501, "balance_loss_mlp": 0.01255396, "epoch": 0.6618367653690065, "flos": 20856756291840.0, "grad_norm": 1.5287073051199076, "language_loss": 0.78360677, "learning_rate": 1.0842026388361248e-06, "loss": 0.86050439, "num_input_tokens_seen": 237616385, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.10546875, "step": 11008, "time_per_iteration": 2.555579662322998 }, { "auxiliary_loss_clip": 0.06427066, "auxiliary_loss_mlp": 0.01266202, "balance_loss_clip": 0.06275089, "balance_loss_mlp": 0.01254585, "epoch": 0.6618968886216744, "flos": 17718089253120.0, "grad_norm": 1.6913333517046918, "language_loss": 0.82019508, "learning_rate": 1.0838564230588715e-06, "loss": 0.89712775, "num_input_tokens_seen": 237634930, "router_z_loss_clip": 1.51953125, "router_z_loss_mlp": 0.11621094, "step": 11009, "time_per_iteration": 4.006803274154663 }, { "auxiliary_loss_clip": 0.0630593, "auxiliary_loss_mlp": 0.01251559, "balance_loss_clip": 0.0624626, "balance_loss_mlp": 0.01249985, "epoch": 0.6619570118743424, "flos": 67054500305280.0, "grad_norm": 0.9599742588395718, "language_loss": 0.67449284, "learning_rate": 1.0835102420218735e-06, "loss": 0.75006771, "num_input_tokens_seen": 237693175, "router_z_loss_clip": 0.59619141, "router_z_loss_mlp": 0.0157547, "step": 11010, "time_per_iteration": 3.1098616123199463 }, { "auxiliary_loss_clip": 0.06423979, "auxiliary_loss_mlp": 0.01266273, "balance_loss_clip": 0.06277134, "balance_loss_mlp": 0.01254489, "epoch": 0.6620171351270103, "flos": 18666819907200.0, "grad_norm": 1.5979491990474841, "language_loss": 0.72111881, "learning_rate": 1.0831640957382593e-06, "loss": 0.79802132, "num_input_tokens_seen": 237713160, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.11785889, "step": 11011, "time_per_iteration": 2.5718555450439453 }, { "auxiliary_loss_clip": 0.06426581, "auxiliary_loss_mlp": 0.01267784, "balance_loss_clip": 0.06281169, "balance_loss_mlp": 0.01257163, "epoch": 0.6620772583796783, "flos": 24177376471680.0, "grad_norm": 1.4862237131016625, "language_loss": 0.72711706, "learning_rate": 1.0828179842211557e-06, "loss": 0.8040607, "num_input_tokens_seen": 237733600, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10620117, "step": 11012, "time_per_iteration": 2.577378988265991 }, { "auxiliary_loss_clip": 0.06415574, "auxiliary_loss_mlp": 0.01269814, "balance_loss_clip": 0.0627885, "balance_loss_mlp": 0.01260516, "epoch": 0.6621373816323463, "flos": 23630385018240.0, "grad_norm": 1.4970719991961112, "language_loss": 0.79646766, "learning_rate": 1.0824719074836845e-06, "loss": 0.87332159, "num_input_tokens_seen": 237752135, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.09301758, "step": 11013, "time_per_iteration": 2.6107473373413086 }, { "auxiliary_loss_clip": 0.06419393, "auxiliary_loss_mlp": 0.01268626, "balance_loss_clip": 0.06275761, "balance_loss_mlp": 0.01257987, "epoch": 0.6621975048850143, "flos": 18448123950720.0, "grad_norm": 1.9091591461714292, "language_loss": 0.70744932, "learning_rate": 1.082125865538971e-06, "loss": 0.78432947, "num_input_tokens_seen": 237770735, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10638428, "step": 11014, "time_per_iteration": 2.5873847007751465 }, { "auxiliary_loss_clip": 0.06419879, "auxiliary_loss_mlp": 0.01267759, "balance_loss_clip": 0.06278893, "balance_loss_mlp": 0.01258436, "epoch": 0.6622576281376823, "flos": 14069047795200.0, "grad_norm": 1.7022786827263847, "language_loss": 0.77232099, "learning_rate": 1.081779858400137e-06, "loss": 0.84919739, "num_input_tokens_seen": 237789005, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09326172, "step": 11015, "time_per_iteration": 2.6073110103607178 }, { "auxiliary_loss_clip": 0.06421559, "auxiliary_loss_mlp": 0.01266338, "balance_loss_clip": 0.0627853, "balance_loss_mlp": 0.01255418, "epoch": 0.6623177513903502, "flos": 17024587735680.0, "grad_norm": 3.276273057480128, "language_loss": 0.82347769, "learning_rate": 1.0814338860803021e-06, "loss": 0.90035665, "num_input_tokens_seen": 237807740, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10906982, "step": 11016, "time_per_iteration": 2.561833381652832 }, { "auxiliary_loss_clip": 0.06424446, "auxiliary_loss_mlp": 0.01263304, "balance_loss_clip": 0.06275438, "balance_loss_mlp": 0.01252409, "epoch": 0.6623778746430182, "flos": 17276127292800.0, "grad_norm": 2.421648515726512, "language_loss": 0.70093358, "learning_rate": 1.0810879485925864e-06, "loss": 0.77781105, "num_input_tokens_seen": 237826340, "router_z_loss_clip": 1.48925781, "router_z_loss_mlp": 0.10894775, "step": 11017, "time_per_iteration": 2.5542633533477783 }, { "auxiliary_loss_clip": 0.06422046, "auxiliary_loss_mlp": 0.01267178, "balance_loss_clip": 0.06277405, "balance_loss_mlp": 0.01256515, "epoch": 0.6624379978956861, "flos": 48802725198720.0, "grad_norm": 1.9217403020103347, "language_loss": 0.771698, "learning_rate": 1.0807420459501084e-06, "loss": 0.84859025, "num_input_tokens_seen": 237848305, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.10668945, "step": 11018, "time_per_iteration": 2.7871110439300537 }, { "auxiliary_loss_clip": 0.06420317, "auxiliary_loss_mlp": 0.01262727, "balance_loss_clip": 0.0627757, "balance_loss_mlp": 0.01252409, "epoch": 0.6624981211483542, "flos": 18958330661760.0, "grad_norm": 1.9218150304439259, "language_loss": 0.83672941, "learning_rate": 1.0803961781659841e-06, "loss": 0.91355985, "num_input_tokens_seen": 237867020, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10321045, "step": 11019, "time_per_iteration": 2.5360636711120605 }, { "auxiliary_loss_clip": 0.0641678, "auxiliary_loss_mlp": 0.01262105, "balance_loss_clip": 0.06276305, "balance_loss_mlp": 0.01251918, "epoch": 0.6625582444010221, "flos": 23262998791680.0, "grad_norm": 1.5552325904588868, "language_loss": 0.71597826, "learning_rate": 1.080050345253328e-06, "loss": 0.79276717, "num_input_tokens_seen": 237886710, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.10192871, "step": 11020, "time_per_iteration": 2.602001905441284 }, { "auxiliary_loss_clip": 0.06431869, "auxiliary_loss_mlp": 0.01269327, "balance_loss_clip": 0.06280665, "balance_loss_mlp": 0.01257543, "epoch": 0.6626183676536901, "flos": 21400770925440.0, "grad_norm": 2.1719490225859643, "language_loss": 0.7250948, "learning_rate": 1.0797045472252554e-06, "loss": 0.8021068, "num_input_tokens_seen": 237904795, "router_z_loss_clip": 1.51074219, "router_z_loss_mlp": 0.11779785, "step": 11021, "time_per_iteration": 2.643054723739624 }, { "auxiliary_loss_clip": 0.06428127, "auxiliary_loss_mlp": 0.01269082, "balance_loss_clip": 0.06281543, "balance_loss_mlp": 0.01257697, "epoch": 0.662678490906358, "flos": 14575984197120.0, "grad_norm": 1.9839028551947397, "language_loss": 0.83883679, "learning_rate": 1.0793587840948793e-06, "loss": 0.91580892, "num_input_tokens_seen": 237921320, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.11383057, "step": 11022, "time_per_iteration": 2.5487985610961914 }, { "auxiliary_loss_clip": 0.06434686, "auxiliary_loss_mlp": 0.01269546, "balance_loss_clip": 0.06278384, "balance_loss_mlp": 0.01256731, "epoch": 0.662738614159026, "flos": 15996962862720.0, "grad_norm": 2.7213600170306163, "language_loss": 0.72775292, "learning_rate": 1.0790130558753099e-06, "loss": 0.80479521, "num_input_tokens_seen": 237933525, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.12823486, "step": 11023, "time_per_iteration": 2.5062670707702637 }, { "auxiliary_loss_clip": 0.06422111, "auxiliary_loss_mlp": 0.01270779, "balance_loss_clip": 0.0627811, "balance_loss_mlp": 0.01260604, "epoch": 0.6627987374116939, "flos": 19542358419840.0, "grad_norm": 1.6219186821232168, "language_loss": 0.7526437, "learning_rate": 1.0786673625796574e-06, "loss": 0.82957268, "num_input_tokens_seen": 237953395, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.10180664, "step": 11024, "time_per_iteration": 2.580878734588623 }, { "auxiliary_loss_clip": 0.06423107, "auxiliary_loss_mlp": 0.01270827, "balance_loss_clip": 0.0627745, "balance_loss_mlp": 0.01259711, "epoch": 0.662858860664362, "flos": 15707800022400.0, "grad_norm": 2.254103433111981, "language_loss": 0.69773144, "learning_rate": 1.0783217042210306e-06, "loss": 0.77467072, "num_input_tokens_seen": 237971445, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.11108398, "step": 11025, "time_per_iteration": 2.5492608547210693 }, { "auxiliary_loss_clip": 0.06426114, "auxiliary_loss_mlp": 0.01268975, "balance_loss_clip": 0.06281275, "balance_loss_mlp": 0.01258133, "epoch": 0.6629189839170299, "flos": 20160026392320.0, "grad_norm": 1.457632250322765, "language_loss": 0.79053795, "learning_rate": 1.0779760808125379e-06, "loss": 0.86748886, "num_input_tokens_seen": 237989965, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.10845947, "step": 11026, "time_per_iteration": 2.5775973796844482 }, { "auxiliary_loss_clip": 0.06420615, "auxiliary_loss_mlp": 0.01269094, "balance_loss_clip": 0.0627729, "balance_loss_mlp": 0.01258788, "epoch": 0.6629791071696979, "flos": 20920430995200.0, "grad_norm": 1.7039483067664618, "language_loss": 0.76649654, "learning_rate": 1.0776304923672842e-06, "loss": 0.84339356, "num_input_tokens_seen": 238006820, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.10308838, "step": 11027, "time_per_iteration": 2.581354856491089 }, { "auxiliary_loss_clip": 0.0642436, "auxiliary_loss_mlp": 0.01269333, "balance_loss_clip": 0.06277657, "balance_loss_mlp": 0.01258389, "epoch": 0.6630392304223659, "flos": 20852647441920.0, "grad_norm": 2.2852232079236483, "language_loss": 0.70796835, "learning_rate": 1.0772849388983742e-06, "loss": 0.78490531, "num_input_tokens_seen": 238022560, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.10955811, "step": 11028, "time_per_iteration": 2.5653462409973145 }, { "auxiliary_loss_clip": 0.06421335, "auxiliary_loss_mlp": 0.01264523, "balance_loss_clip": 0.06277458, "balance_loss_mlp": 0.01254706, "epoch": 0.6630993536750338, "flos": 21002092398720.0, "grad_norm": 1.907215790825944, "language_loss": 0.79493612, "learning_rate": 1.0769394204189138e-06, "loss": 0.8717947, "num_input_tokens_seen": 238041895, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.09820557, "step": 11029, "time_per_iteration": 2.5800561904907227 }, { "auxiliary_loss_clip": 0.0642246, "auxiliary_loss_mlp": 0.01268738, "balance_loss_clip": 0.06275421, "balance_loss_mlp": 0.01257342, "epoch": 0.6631594769277018, "flos": 18264787217280.0, "grad_norm": 2.24297195229564, "language_loss": 0.76153207, "learning_rate": 1.0765939369420012e-06, "loss": 0.83844411, "num_input_tokens_seen": 238060445, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.11401367, "step": 11030, "time_per_iteration": 2.5836265087127686 }, { "auxiliary_loss_clip": 0.06431536, "auxiliary_loss_mlp": 0.01270172, "balance_loss_clip": 0.06279676, "balance_loss_mlp": 0.01258746, "epoch": 0.6632196001803697, "flos": 17826053639040.0, "grad_norm": 2.382798253291732, "language_loss": 0.75263733, "learning_rate": 1.0762484884807391e-06, "loss": 0.82965446, "num_input_tokens_seen": 238077080, "router_z_loss_clip": 1.51757812, "router_z_loss_mlp": 0.11425781, "step": 11031, "time_per_iteration": 4.0114357471466064 }, { "auxiliary_loss_clip": 0.06424168, "auxiliary_loss_mlp": 0.01265102, "balance_loss_clip": 0.06275865, "balance_loss_mlp": 0.01253819, "epoch": 0.6632797234330378, "flos": 12673910914560.0, "grad_norm": 3.0492669127787435, "language_loss": 0.74971622, "learning_rate": 1.075903075048228e-06, "loss": 0.8266089, "num_input_tokens_seen": 238091045, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.1126709, "step": 11032, "time_per_iteration": 2.4936559200286865 }, { "auxiliary_loss_clip": 0.06419475, "auxiliary_loss_mlp": 0.01267541, "balance_loss_clip": 0.06277063, "balance_loss_mlp": 0.01257819, "epoch": 0.6633398466857057, "flos": 23591168507520.0, "grad_norm": 1.672287732161128, "language_loss": 0.80722374, "learning_rate": 1.0755576966575635e-06, "loss": 0.88409388, "num_input_tokens_seen": 238110220, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.097229, "step": 11033, "time_per_iteration": 2.6679773330688477 }, { "auxiliary_loss_clip": 0.06425783, "auxiliary_loss_mlp": 0.01272404, "balance_loss_clip": 0.06278726, "balance_loss_mlp": 0.0126028, "epoch": 0.6633999699383737, "flos": 20638018408320.0, "grad_norm": 1.6793141381866492, "language_loss": 0.80614352, "learning_rate": 1.0752123533218451e-06, "loss": 0.88312536, "num_input_tokens_seen": 238130400, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.12115479, "step": 11034, "time_per_iteration": 2.700068712234497 }, { "auxiliary_loss_clip": 0.06418885, "auxiliary_loss_mlp": 0.01267176, "balance_loss_clip": 0.06275033, "balance_loss_mlp": 0.01257311, "epoch": 0.6634600931910416, "flos": 21803264812800.0, "grad_norm": 1.6560406089693016, "language_loss": 0.75714517, "learning_rate": 1.074867045054166e-06, "loss": 0.83400571, "num_input_tokens_seen": 238148165, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.09875488, "step": 11035, "time_per_iteration": 2.716390609741211 }, { "auxiliary_loss_clip": 0.06427966, "auxiliary_loss_mlp": 0.01264456, "balance_loss_clip": 0.06277975, "balance_loss_mlp": 0.01254073, "epoch": 0.6635202164437096, "flos": 18738628456320.0, "grad_norm": 1.7146531929103919, "language_loss": 0.83243072, "learning_rate": 1.074521771867622e-06, "loss": 0.90935493, "num_input_tokens_seen": 238166360, "router_z_loss_clip": 1.49902344, "router_z_loss_mlp": 0.10388184, "step": 11036, "time_per_iteration": 2.5654118061065674 }, { "auxiliary_loss_clip": 0.0631617, "auxiliary_loss_mlp": 0.0125288, "balance_loss_clip": 0.06257403, "balance_loss_mlp": 0.01251531, "epoch": 0.6635803396963775, "flos": 60242501324160.0, "grad_norm": 0.7648939203843489, "language_loss": 0.5240106, "learning_rate": 1.0741765337753044e-06, "loss": 0.59970105, "num_input_tokens_seen": 238227630, "router_z_loss_clip": 0.58740234, "router_z_loss_mlp": 0.01351166, "step": 11037, "time_per_iteration": 3.152636766433716 }, { "auxiliary_loss_clip": 0.06424254, "auxiliary_loss_mlp": 0.01268048, "balance_loss_clip": 0.06277701, "balance_loss_mlp": 0.01256753, "epoch": 0.6636404629490456, "flos": 29174414088960.0, "grad_norm": 1.5128085607827524, "language_loss": 0.7912823, "learning_rate": 1.0738313307903052e-06, "loss": 0.86820531, "num_input_tokens_seen": 238248435, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.11303711, "step": 11038, "time_per_iteration": 4.076410531997681 }, { "auxiliary_loss_clip": 0.06424124, "auxiliary_loss_mlp": 0.01265753, "balance_loss_clip": 0.06277661, "balance_loss_mlp": 0.01254672, "epoch": 0.6637005862017135, "flos": 38916530496000.0, "grad_norm": 1.8035167069414568, "language_loss": 0.64531732, "learning_rate": 1.073486162925716e-06, "loss": 0.72221607, "num_input_tokens_seen": 238268755, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.11071777, "step": 11039, "time_per_iteration": 2.7059943675994873 }, { "auxiliary_loss_clip": 0.06427407, "auxiliary_loss_mlp": 0.0126679, "balance_loss_clip": 0.06277572, "balance_loss_mlp": 0.01255781, "epoch": 0.6637607094543815, "flos": 22789870312320.0, "grad_norm": 1.4149695346448465, "language_loss": 0.64138258, "learning_rate": 1.0731410301946237e-06, "loss": 0.71832454, "num_input_tokens_seen": 238290120, "router_z_loss_clip": 1.49707031, "router_z_loss_mlp": 0.11016846, "step": 11040, "time_per_iteration": 2.581845283508301 }, { "auxiliary_loss_clip": 0.06423618, "auxiliary_loss_mlp": 0.01266982, "balance_loss_clip": 0.06280731, "balance_loss_mlp": 0.01257231, "epoch": 0.6638208327070495, "flos": 18119996161920.0, "grad_norm": 2.498773930115529, "language_loss": 0.72438002, "learning_rate": 1.0727959326101161e-06, "loss": 0.80128598, "num_input_tokens_seen": 238309290, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.09747314, "step": 11041, "time_per_iteration": 2.5754590034484863 }, { "auxiliary_loss_clip": 0.06424842, "auxiliary_loss_mlp": 0.0126757, "balance_loss_clip": 0.06280556, "balance_loss_mlp": 0.01256811, "epoch": 0.6638809559597174, "flos": 29432703899520.0, "grad_norm": 2.8876806233977867, "language_loss": 0.62144136, "learning_rate": 1.0724508701852806e-06, "loss": 0.69836545, "num_input_tokens_seen": 238327280, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10760498, "step": 11042, "time_per_iteration": 2.6302218437194824 }, { "auxiliary_loss_clip": 0.06428055, "auxiliary_loss_mlp": 0.01268519, "balance_loss_clip": 0.06277177, "balance_loss_mlp": 0.0125723, "epoch": 0.6639410792123854, "flos": 28079928057600.0, "grad_norm": 1.8111829442138736, "language_loss": 0.68592572, "learning_rate": 1.0721058429331998e-06, "loss": 0.76289153, "num_input_tokens_seen": 238346330, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.112854, "step": 11043, "time_per_iteration": 2.6058645248413086 }, { "auxiliary_loss_clip": 0.064171, "auxiliary_loss_mlp": 0.01266444, "balance_loss_clip": 0.06277163, "balance_loss_mlp": 0.01257581, "epoch": 0.6640012024650533, "flos": 25563373257600.0, "grad_norm": 1.6185896137879718, "language_loss": 0.84044266, "learning_rate": 1.0717608508669587e-06, "loss": 0.91727805, "num_input_tokens_seen": 238364650, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.08874512, "step": 11044, "time_per_iteration": 2.569658041000366 }, { "auxiliary_loss_clip": 0.06419998, "auxiliary_loss_mlp": 0.01266658, "balance_loss_clip": 0.06276071, "balance_loss_mlp": 0.01255983, "epoch": 0.6640613257177214, "flos": 14872316561280.0, "grad_norm": 2.0452830010881042, "language_loss": 0.69851458, "learning_rate": 1.0714158939996392e-06, "loss": 0.77538121, "num_input_tokens_seen": 238381630, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10662842, "step": 11045, "time_per_iteration": 2.547548770904541 }, { "auxiliary_loss_clip": 0.06424938, "auxiliary_loss_mlp": 0.01268397, "balance_loss_clip": 0.0627846, "balance_loss_mlp": 0.01257418, "epoch": 0.6641214489703893, "flos": 23227681495680.0, "grad_norm": 1.404505676580243, "language_loss": 0.64296842, "learning_rate": 1.0710709723443235e-06, "loss": 0.71990174, "num_input_tokens_seen": 238402595, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.10980225, "step": 11046, "time_per_iteration": 2.5927138328552246 }, { "auxiliary_loss_clip": 0.06425029, "auxiliary_loss_mlp": 0.01265927, "balance_loss_clip": 0.06281494, "balance_loss_mlp": 0.01255484, "epoch": 0.6641815722230573, "flos": 37751661434880.0, "grad_norm": 1.4189536876744688, "language_loss": 0.71678984, "learning_rate": 1.070726085914088e-06, "loss": 0.79369938, "num_input_tokens_seen": 238426860, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.10437012, "step": 11047, "time_per_iteration": 4.228404521942139 }, { "auxiliary_loss_clip": 0.06424119, "auxiliary_loss_mlp": 0.01266367, "balance_loss_clip": 0.06278564, "balance_loss_mlp": 0.01255102, "epoch": 0.6642416954757252, "flos": 17936910990720.0, "grad_norm": 1.8423420778099109, "language_loss": 0.77322054, "learning_rate": 1.0703812347220126e-06, "loss": 0.85012531, "num_input_tokens_seen": 238443990, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.1126709, "step": 11048, "time_per_iteration": 2.562704563140869 }, { "auxiliary_loss_clip": 0.0631883, "auxiliary_loss_mlp": 0.01250932, "balance_loss_clip": 0.06259708, "balance_loss_mlp": 0.01249575, "epoch": 0.6643018187283932, "flos": 52010712362880.0, "grad_norm": 0.7369534070022873, "language_loss": 0.55030406, "learning_rate": 1.0700364187811745e-06, "loss": 0.62600172, "num_input_tokens_seen": 238503045, "router_z_loss_clip": 0.59277344, "router_z_loss_mlp": 0.01359558, "step": 11049, "time_per_iteration": 4.690397500991821 }, { "auxiliary_loss_clip": 0.06424459, "auxiliary_loss_mlp": 0.01269682, "balance_loss_clip": 0.06280748, "balance_loss_mlp": 0.01259722, "epoch": 0.6643619419810611, "flos": 30234463292160.0, "grad_norm": 1.5806475279224845, "language_loss": 0.64384657, "learning_rate": 1.069691638104648e-06, "loss": 0.720788, "num_input_tokens_seen": 238527320, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.09960938, "step": 11050, "time_per_iteration": 2.652384042739868 }, { "auxiliary_loss_clip": 0.06422748, "auxiliary_loss_mlp": 0.01265193, "balance_loss_clip": 0.06279944, "balance_loss_mlp": 0.01254619, "epoch": 0.6644220652337292, "flos": 22972745848320.0, "grad_norm": 2.005497150787898, "language_loss": 0.79075223, "learning_rate": 1.0693468927055085e-06, "loss": 0.86763167, "num_input_tokens_seen": 238546030, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.10577393, "step": 11051, "time_per_iteration": 2.6160500049591064 }, { "auxiliary_loss_clip": 0.06424124, "auxiliary_loss_mlp": 0.01266576, "balance_loss_clip": 0.06278265, "balance_loss_mlp": 0.01256676, "epoch": 0.6644821884863971, "flos": 21148602462720.0, "grad_norm": 1.5832639899229664, "language_loss": 0.85768282, "learning_rate": 1.0690021825968276e-06, "loss": 0.93458986, "num_input_tokens_seen": 238564175, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.09912109, "step": 11052, "time_per_iteration": 2.5856237411499023 }, { "auxiliary_loss_clip": 0.06433129, "auxiliary_loss_mlp": 0.01265413, "balance_loss_clip": 0.06282828, "balance_loss_mlp": 0.01254327, "epoch": 0.6645423117390651, "flos": 20198907486720.0, "grad_norm": 2.431003097621049, "language_loss": 0.75069141, "learning_rate": 1.0686575077916776e-06, "loss": 0.82767677, "num_input_tokens_seen": 238581010, "router_z_loss_clip": 1.50097656, "router_z_loss_mlp": 0.11090088, "step": 11053, "time_per_iteration": 2.559692621231079 }, { "auxiliary_loss_clip": 0.06418476, "auxiliary_loss_mlp": 0.01266228, "balance_loss_clip": 0.0627545, "balance_loss_mlp": 0.01256065, "epoch": 0.6646024349917331, "flos": 24358700707200.0, "grad_norm": 1.4839564194573818, "language_loss": 0.79494619, "learning_rate": 1.0683128683031278e-06, "loss": 0.87179327, "num_input_tokens_seen": 238601365, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10168457, "step": 11054, "time_per_iteration": 2.600065231323242 }, { "auxiliary_loss_clip": 0.06422567, "auxiliary_loss_mlp": 0.01271281, "balance_loss_clip": 0.06280121, "balance_loss_mlp": 0.01261714, "epoch": 0.664662558244401, "flos": 18812617211520.0, "grad_norm": 1.5561087349761125, "language_loss": 0.74173188, "learning_rate": 1.0679682641442472e-06, "loss": 0.81867033, "num_input_tokens_seen": 238619850, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.09570312, "step": 11055, "time_per_iteration": 2.5454609394073486 }, { "auxiliary_loss_clip": 0.06426291, "auxiliary_loss_mlp": 0.01266606, "balance_loss_clip": 0.06278832, "balance_loss_mlp": 0.0125509, "epoch": 0.664722681497069, "flos": 18958749932160.0, "grad_norm": 1.946078631246332, "language_loss": 0.72791886, "learning_rate": 1.0676236953281042e-06, "loss": 0.80484778, "num_input_tokens_seen": 238637635, "router_z_loss_clip": 1.47363281, "router_z_loss_mlp": 0.1151123, "step": 11056, "time_per_iteration": 2.5274908542633057 }, { "auxiliary_loss_clip": 0.06424308, "auxiliary_loss_mlp": 0.01266913, "balance_loss_clip": 0.0628093, "balance_loss_mlp": 0.01256333, "epoch": 0.6647828047497369, "flos": 19577046810240.0, "grad_norm": 1.8723645852958313, "language_loss": 0.70487869, "learning_rate": 1.0672791618677641e-06, "loss": 0.78179091, "num_input_tokens_seen": 238656200, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.10583496, "step": 11057, "time_per_iteration": 2.568877696990967 }, { "auxiliary_loss_clip": 0.06423943, "auxiliary_loss_mlp": 0.01264821, "balance_loss_clip": 0.06277692, "balance_loss_mlp": 0.01254581, "epoch": 0.664842928002405, "flos": 23156250289920.0, "grad_norm": 1.7946743305772637, "language_loss": 0.8058219, "learning_rate": 1.066934663776291e-06, "loss": 0.8827095, "num_input_tokens_seen": 238675005, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.10247803, "step": 11058, "time_per_iteration": 2.5595405101776123 }, { "auxiliary_loss_clip": 0.06320253, "auxiliary_loss_mlp": 0.01252128, "balance_loss_clip": 0.06261358, "balance_loss_mlp": 0.01250886, "epoch": 0.6649030512550729, "flos": 65263326301440.0, "grad_norm": 0.7707710335363135, "language_loss": 0.62559098, "learning_rate": 1.0665902010667496e-06, "loss": 0.70131475, "num_input_tokens_seen": 238731425, "router_z_loss_clip": 0.58935547, "router_z_loss_mlp": 0.0124054, "step": 11059, "time_per_iteration": 3.0757949352264404 }, { "auxiliary_loss_clip": 0.06423324, "auxiliary_loss_mlp": 0.0126575, "balance_loss_clip": 0.06279548, "balance_loss_mlp": 0.01255951, "epoch": 0.6649631745077409, "flos": 20201213473920.0, "grad_norm": 1.4363497082217864, "language_loss": 0.78792417, "learning_rate": 1.0662457737522008e-06, "loss": 0.86481494, "num_input_tokens_seen": 238752020, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.09796143, "step": 11060, "time_per_iteration": 2.5860607624053955 }, { "auxiliary_loss_clip": 0.06423348, "auxiliary_loss_mlp": 0.0126897, "balance_loss_clip": 0.06279393, "balance_loss_mlp": 0.01258718, "epoch": 0.6650232977604088, "flos": 17244331868160.0, "grad_norm": 1.5758067008132106, "language_loss": 0.78918207, "learning_rate": 1.0659013818457055e-06, "loss": 0.86610532, "num_input_tokens_seen": 238769665, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10247803, "step": 11061, "time_per_iteration": 2.522301197052002 }, { "auxiliary_loss_clip": 0.06423616, "auxiliary_loss_mlp": 0.01266158, "balance_loss_clip": 0.06280529, "balance_loss_mlp": 0.0125624, "epoch": 0.6650834210130768, "flos": 10010175217920.0, "grad_norm": 2.3696952540648506, "language_loss": 0.57026762, "learning_rate": 1.0655570253603243e-06, "loss": 0.64716542, "num_input_tokens_seen": 238782180, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.09918213, "step": 11062, "time_per_iteration": 2.519041061401367 }, { "auxiliary_loss_clip": 0.06429697, "auxiliary_loss_mlp": 0.01266947, "balance_loss_clip": 0.0627971, "balance_loss_mlp": 0.01255265, "epoch": 0.6651435442657447, "flos": 10456707225600.0, "grad_norm": 1.6406345195333243, "language_loss": 0.76253116, "learning_rate": 1.0652127043091144e-06, "loss": 0.83949757, "num_input_tokens_seen": 238800315, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.11676025, "step": 11063, "time_per_iteration": 2.5405306816101074 }, { "auxiliary_loss_clip": 0.06426761, "auxiliary_loss_mlp": 0.01268106, "balance_loss_clip": 0.06281048, "balance_loss_mlp": 0.01258563, "epoch": 0.6652036675184128, "flos": 22350465901440.0, "grad_norm": 1.3078288500094146, "language_loss": 0.70634121, "learning_rate": 1.0648684187051316e-06, "loss": 0.78328985, "num_input_tokens_seen": 238822250, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.09539795, "step": 11064, "time_per_iteration": 2.61504864692688 }, { "auxiliary_loss_clip": 0.06315333, "auxiliary_loss_mlp": 0.01251031, "balance_loss_clip": 0.06256635, "balance_loss_mlp": 0.01249684, "epoch": 0.6652637907710807, "flos": 52925467386240.0, "grad_norm": 0.8176641965454148, "language_loss": 0.62944698, "learning_rate": 1.0645241685614322e-06, "loss": 0.70511067, "num_input_tokens_seen": 238877190, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.01348877, "step": 11065, "time_per_iteration": 3.113645553588867 }, { "auxiliary_loss_clip": 0.06427565, "auxiliary_loss_mlp": 0.01265465, "balance_loss_clip": 0.06282061, "balance_loss_mlp": 0.01254992, "epoch": 0.6653239140237487, "flos": 23110031963520.0, "grad_norm": 1.5673348734626997, "language_loss": 0.62397659, "learning_rate": 1.0641799538910708e-06, "loss": 0.70090693, "num_input_tokens_seen": 238896010, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10473633, "step": 11066, "time_per_iteration": 2.6051084995269775 }, { "auxiliary_loss_clip": 0.06427512, "auxiliary_loss_mlp": 0.01264804, "balance_loss_clip": 0.06280153, "balance_loss_mlp": 0.01254266, "epoch": 0.6653840372764167, "flos": 25966747612800.0, "grad_norm": 1.436440577421291, "language_loss": 0.69935554, "learning_rate": 1.0638357747070985e-06, "loss": 0.77627867, "num_input_tokens_seen": 238918990, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.10546875, "step": 11067, "time_per_iteration": 2.6415603160858154 }, { "auxiliary_loss_clip": 0.06316639, "auxiliary_loss_mlp": 0.01251957, "balance_loss_clip": 0.06257521, "balance_loss_mlp": 0.01250746, "epoch": 0.6654441605290846, "flos": 66059593251840.0, "grad_norm": 0.8747771128966247, "language_loss": 0.71961427, "learning_rate": 1.0634916310225684e-06, "loss": 0.79530025, "num_input_tokens_seen": 238975735, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 0.01210022, "step": 11068, "time_per_iteration": 3.1465888023376465 }, { "auxiliary_loss_clip": 0.06315255, "auxiliary_loss_mlp": 0.01254761, "balance_loss_clip": 0.06256287, "balance_loss_mlp": 0.01253305, "epoch": 0.6655042837817526, "flos": 65218560693120.0, "grad_norm": 0.6819659439967898, "language_loss": 0.57699072, "learning_rate": 1.0631475228505285e-06, "loss": 0.65269089, "num_input_tokens_seen": 239042360, "router_z_loss_clip": 0.59033203, "router_z_loss_mlp": 0.01454926, "step": 11069, "time_per_iteration": 3.2836673259735107 }, { "auxiliary_loss_clip": 0.06316219, "auxiliary_loss_mlp": 0.012531, "balance_loss_clip": 0.06257399, "balance_loss_mlp": 0.01251887, "epoch": 0.6655644070344205, "flos": 69028759480320.0, "grad_norm": 0.740435201480988, "language_loss": 0.63487363, "learning_rate": 1.062803450204029e-06, "loss": 0.71056676, "num_input_tokens_seen": 239109410, "router_z_loss_clip": 0.58886719, "router_z_loss_mlp": 0.01211548, "step": 11070, "time_per_iteration": 4.713442325592041 }, { "auxiliary_loss_clip": 0.06421024, "auxiliary_loss_mlp": 0.01263756, "balance_loss_clip": 0.06275544, "balance_loss_mlp": 0.01253987, "epoch": 0.6656245302870886, "flos": 36323680953600.0, "grad_norm": 1.6545154434675224, "language_loss": 0.58687592, "learning_rate": 1.062459413096116e-06, "loss": 0.66372371, "num_input_tokens_seen": 239135345, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.09765625, "step": 11071, "time_per_iteration": 2.7059202194213867 }, { "auxiliary_loss_clip": 0.06423078, "auxiliary_loss_mlp": 0.01270839, "balance_loss_clip": 0.06279759, "balance_loss_mlp": 0.01261112, "epoch": 0.6656846535397565, "flos": 21800623409280.0, "grad_norm": 2.4947542229635737, "language_loss": 0.73349667, "learning_rate": 1.0621154115398364e-06, "loss": 0.81043589, "num_input_tokens_seen": 239154340, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.09729004, "step": 11072, "time_per_iteration": 2.5829598903656006 }, { "auxiliary_loss_clip": 0.06417295, "auxiliary_loss_mlp": 0.01266662, "balance_loss_clip": 0.06275234, "balance_loss_mlp": 0.01256363, "epoch": 0.6657447767924245, "flos": 37496683860480.0, "grad_norm": 1.8178542710452514, "language_loss": 0.70779741, "learning_rate": 1.0617714455482353e-06, "loss": 0.78463697, "num_input_tokens_seen": 239177815, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.10296631, "step": 11073, "time_per_iteration": 2.705275058746338 }, { "auxiliary_loss_clip": 0.06429535, "auxiliary_loss_mlp": 0.01263203, "balance_loss_clip": 0.06279889, "balance_loss_mlp": 0.01252797, "epoch": 0.6658049000450924, "flos": 16843473135360.0, "grad_norm": 1.9707286520377372, "language_loss": 0.56578571, "learning_rate": 1.061427515134354e-06, "loss": 0.64271307, "num_input_tokens_seen": 239195735, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.10412598, "step": 11074, "time_per_iteration": 2.5795280933380127 }, { "auxiliary_loss_clip": 0.06421011, "auxiliary_loss_mlp": 0.01266837, "balance_loss_clip": 0.06278351, "balance_loss_mlp": 0.01257098, "epoch": 0.6658650232977604, "flos": 33519430759680.0, "grad_norm": 1.604111644545677, "language_loss": 0.72646254, "learning_rate": 1.061083620311235e-06, "loss": 0.80334097, "num_input_tokens_seen": 239217535, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.09735107, "step": 11075, "time_per_iteration": 2.657156467437744 }, { "auxiliary_loss_clip": 0.06415693, "auxiliary_loss_mlp": 0.01264721, "balance_loss_clip": 0.06274929, "balance_loss_mlp": 0.01255459, "epoch": 0.6659251465504283, "flos": 37715379816960.0, "grad_norm": 1.4257072191328308, "language_loss": 0.66677475, "learning_rate": 1.0607397610919202e-06, "loss": 0.74357885, "num_input_tokens_seen": 239241975, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09259033, "step": 11076, "time_per_iteration": 2.692946672439575 }, { "auxiliary_loss_clip": 0.06417337, "auxiliary_loss_mlp": 0.01267341, "balance_loss_clip": 0.06274949, "balance_loss_mlp": 0.01256952, "epoch": 0.6659852698030964, "flos": 24899277323520.0, "grad_norm": 1.6192340791278301, "language_loss": 0.75568962, "learning_rate": 1.0603959374894468e-06, "loss": 0.83253634, "num_input_tokens_seen": 239262025, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.10394287, "step": 11077, "time_per_iteration": 3.989168405532837 }, { "auxiliary_loss_clip": 0.06420616, "auxiliary_loss_mlp": 0.01264592, "balance_loss_clip": 0.06276082, "balance_loss_mlp": 0.01254471, "epoch": 0.6660453930557643, "flos": 24359706956160.0, "grad_norm": 1.5136189662905721, "language_loss": 0.66732872, "learning_rate": 1.0600521495168538e-06, "loss": 0.7441808, "num_input_tokens_seen": 239282775, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10113525, "step": 11078, "time_per_iteration": 2.575098991394043 }, { "auxiliary_loss_clip": 0.06429733, "auxiliary_loss_mlp": 0.01271509, "balance_loss_clip": 0.0628178, "balance_loss_mlp": 0.01260929, "epoch": 0.6661055163084323, "flos": 10602420675840.0, "grad_norm": 1.792606750136591, "language_loss": 0.69835252, "learning_rate": 1.0597083971871783e-06, "loss": 0.775365, "num_input_tokens_seen": 239299775, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.10583496, "step": 11079, "time_per_iteration": 2.757232189178467 }, { "auxiliary_loss_clip": 0.06419145, "auxiliary_loss_mlp": 0.0126764, "balance_loss_clip": 0.06274916, "balance_loss_mlp": 0.0125768, "epoch": 0.6661656395611003, "flos": 24063751935360.0, "grad_norm": 1.3906747078060535, "language_loss": 0.80514872, "learning_rate": 1.0593646805134544e-06, "loss": 0.8820166, "num_input_tokens_seen": 239319660, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.09954834, "step": 11080, "time_per_iteration": 2.787625551223755 }, { "auxiliary_loss_clip": 0.06412461, "auxiliary_loss_mlp": 0.01262472, "balance_loss_clip": 0.06273934, "balance_loss_mlp": 0.01252369, "epoch": 0.6662257628137682, "flos": 23042332264320.0, "grad_norm": 1.7597011977601558, "language_loss": 0.78329986, "learning_rate": 1.0590209995087157e-06, "loss": 0.86004925, "num_input_tokens_seen": 239339215, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.10101318, "step": 11081, "time_per_iteration": 2.6122329235076904 }, { "auxiliary_loss_clip": 0.06424475, "auxiliary_loss_mlp": 0.01267291, "balance_loss_clip": 0.06277339, "balance_loss_mlp": 0.0125627, "epoch": 0.6662858860664362, "flos": 24761446156800.0, "grad_norm": 1.7416665736906964, "language_loss": 0.79810238, "learning_rate": 1.0586773541859946e-06, "loss": 0.87502003, "num_input_tokens_seen": 239358545, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.11016846, "step": 11082, "time_per_iteration": 2.55841326713562 }, { "auxiliary_loss_clip": 0.06421977, "auxiliary_loss_mlp": 0.01269475, "balance_loss_clip": 0.0627825, "balance_loss_mlp": 0.01259754, "epoch": 0.6663460093191041, "flos": 20014899920640.0, "grad_norm": 1.4590733179603732, "language_loss": 0.84049964, "learning_rate": 1.0583337445583234e-06, "loss": 0.91741419, "num_input_tokens_seen": 239376665, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.09729004, "step": 11083, "time_per_iteration": 2.539436101913452 }, { "auxiliary_loss_clip": 0.06429996, "auxiliary_loss_mlp": 0.01265352, "balance_loss_clip": 0.06280264, "balance_loss_mlp": 0.01254796, "epoch": 0.6664061325717722, "flos": 17827101815040.0, "grad_norm": 3.3192226152049287, "language_loss": 0.85455894, "learning_rate": 1.057990170638731e-06, "loss": 0.93151242, "num_input_tokens_seen": 239394345, "router_z_loss_clip": 1.49511719, "router_z_loss_mlp": 0.10552979, "step": 11084, "time_per_iteration": 2.540639877319336 }, { "auxiliary_loss_clip": 0.06428604, "auxiliary_loss_mlp": 0.01264644, "balance_loss_clip": 0.06279531, "balance_loss_mlp": 0.01254165, "epoch": 0.6664662558244401, "flos": 18082666368000.0, "grad_norm": 2.4990441111371586, "language_loss": 0.72801483, "learning_rate": 1.0576466324402452e-06, "loss": 0.80494732, "num_input_tokens_seen": 239410605, "router_z_loss_clip": 1.49023438, "router_z_loss_mlp": 0.1048584, "step": 11085, "time_per_iteration": 2.525700330734253 }, { "auxiliary_loss_clip": 0.06420916, "auxiliary_loss_mlp": 0.01265937, "balance_loss_clip": 0.06277463, "balance_loss_mlp": 0.01255399, "epoch": 0.6665263790771081, "flos": 21579663392640.0, "grad_norm": 1.7665896969031403, "language_loss": 0.80169308, "learning_rate": 1.057303129975894e-06, "loss": 0.87856162, "num_input_tokens_seen": 239427155, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.10534668, "step": 11086, "time_per_iteration": 3.9852135181427 }, { "auxiliary_loss_clip": 0.06417984, "auxiliary_loss_mlp": 0.01268231, "balance_loss_clip": 0.06275154, "balance_loss_mlp": 0.0125783, "epoch": 0.666586502329776, "flos": 24213448454400.0, "grad_norm": 2.1346796665687857, "language_loss": 0.74808818, "learning_rate": 1.056959663258702e-06, "loss": 0.82495034, "num_input_tokens_seen": 239445510, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10400391, "step": 11087, "time_per_iteration": 2.5452895164489746 }, { "auxiliary_loss_clip": 0.06420603, "auxiliary_loss_mlp": 0.0126343, "balance_loss_clip": 0.06278051, "balance_loss_mlp": 0.01253077, "epoch": 0.666646625582444, "flos": 22207100365440.0, "grad_norm": 1.8138533096888598, "language_loss": 0.65127003, "learning_rate": 1.0566162323016939e-06, "loss": 0.72811043, "num_input_tokens_seen": 239464805, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10351562, "step": 11088, "time_per_iteration": 2.5410618782043457 }, { "auxiliary_loss_clip": 0.06423485, "auxiliary_loss_mlp": 0.01267848, "balance_loss_clip": 0.06278232, "balance_loss_mlp": 0.01256475, "epoch": 0.6667067488351119, "flos": 18265835393280.0, "grad_norm": 1.8059796500415164, "language_loss": 0.63905787, "learning_rate": 1.0562728371178928e-06, "loss": 0.71597123, "num_input_tokens_seen": 239483890, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.11376953, "step": 11089, "time_per_iteration": 3.906068801879883 }, { "auxiliary_loss_clip": 0.06420365, "auxiliary_loss_mlp": 0.01264282, "balance_loss_clip": 0.06278682, "balance_loss_mlp": 0.01254984, "epoch": 0.66676687208778, "flos": 17241983953920.0, "grad_norm": 2.3642634054294795, "language_loss": 0.80982584, "learning_rate": 1.0559294777203221e-06, "loss": 0.88667226, "num_input_tokens_seen": 239500080, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09301758, "step": 11090, "time_per_iteration": 2.5166077613830566 }, { "auxiliary_loss_clip": 0.06424736, "auxiliary_loss_mlp": 0.01265451, "balance_loss_clip": 0.06276207, "balance_loss_mlp": 0.012553, "epoch": 0.6668269953404479, "flos": 19757742140160.0, "grad_norm": 2.193936431041838, "language_loss": 0.77354962, "learning_rate": 1.0555861541219984e-06, "loss": 0.85045147, "num_input_tokens_seen": 239517335, "router_z_loss_clip": 1.48632812, "router_z_loss_mlp": 0.10150146, "step": 11091, "time_per_iteration": 2.5497517585754395 }, { "auxiliary_loss_clip": 0.06423396, "auxiliary_loss_mlp": 0.01264476, "balance_loss_clip": 0.06278829, "balance_loss_mlp": 0.01254564, "epoch": 0.6668871185931159, "flos": 20564700485760.0, "grad_norm": 1.8251258961212398, "language_loss": 0.79451829, "learning_rate": 1.0552428663359425e-06, "loss": 0.87139702, "num_input_tokens_seen": 239536240, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.09918213, "step": 11092, "time_per_iteration": 2.5990936756134033 }, { "auxiliary_loss_clip": 0.06318416, "auxiliary_loss_mlp": 0.01254294, "balance_loss_clip": 0.0625961, "balance_loss_mlp": 0.01252969, "epoch": 0.6669472418457839, "flos": 58104458144640.0, "grad_norm": 0.7510560249973143, "language_loss": 0.57552445, "learning_rate": 1.0548996143751724e-06, "loss": 0.65125149, "num_input_tokens_seen": 239598000, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.01325989, "step": 11093, "time_per_iteration": 3.1684646606445312 }, { "auxiliary_loss_clip": 0.06420238, "auxiliary_loss_mlp": 0.01266651, "balance_loss_clip": 0.06277148, "balance_loss_mlp": 0.0125659, "epoch": 0.6670073650984518, "flos": 26071860960000.0, "grad_norm": 1.5554609627102256, "language_loss": 0.76501948, "learning_rate": 1.054556398252703e-06, "loss": 0.84188831, "num_input_tokens_seen": 239617650, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.10058594, "step": 11094, "time_per_iteration": 2.587846517562866 }, { "auxiliary_loss_clip": 0.06419582, "auxiliary_loss_mlp": 0.01266915, "balance_loss_clip": 0.06276069, "balance_loss_mlp": 0.0125652, "epoch": 0.6670674883511198, "flos": 32425196290560.0, "grad_norm": 1.7538083763276886, "language_loss": 0.7339108, "learning_rate": 1.05421321798155e-06, "loss": 0.81077576, "num_input_tokens_seen": 239639825, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.10394287, "step": 11095, "time_per_iteration": 2.70451021194458 }, { "auxiliary_loss_clip": 0.06423847, "auxiliary_loss_mlp": 0.01270943, "balance_loss_clip": 0.06280413, "balance_loss_mlp": 0.01260858, "epoch": 0.6671276116037878, "flos": 18043114440960.0, "grad_norm": 2.482325518019424, "language_loss": 0.73709714, "learning_rate": 1.053870073574727e-06, "loss": 0.81404507, "num_input_tokens_seen": 239656300, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.10083008, "step": 11096, "time_per_iteration": 2.5604207515716553 }, { "auxiliary_loss_clip": 0.06417532, "auxiliary_loss_mlp": 0.01267074, "balance_loss_clip": 0.06277204, "balance_loss_mlp": 0.0125721, "epoch": 0.6671877348564558, "flos": 23773498992000.0, "grad_norm": 1.8204358137095364, "language_loss": 0.65154535, "learning_rate": 1.0535269650452456e-06, "loss": 0.72839141, "num_input_tokens_seen": 239676655, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09863281, "step": 11097, "time_per_iteration": 2.559119462966919 }, { "auxiliary_loss_clip": 0.06424668, "auxiliary_loss_mlp": 0.01268262, "balance_loss_clip": 0.0627639, "balance_loss_mlp": 0.01258004, "epoch": 0.6672478581091237, "flos": 20923869012480.0, "grad_norm": 1.8661810254987539, "language_loss": 0.76246423, "learning_rate": 1.0531838924061158e-06, "loss": 0.8393935, "num_input_tokens_seen": 239695430, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.10266113, "step": 11098, "time_per_iteration": 2.534773826599121 }, { "auxiliary_loss_clip": 0.06423596, "auxiliary_loss_mlp": 0.01269304, "balance_loss_clip": 0.06277122, "balance_loss_mlp": 0.01259344, "epoch": 0.6673079813617917, "flos": 27863328453120.0, "grad_norm": 1.6990786181790956, "language_loss": 0.74877977, "learning_rate": 1.0528408556703476e-06, "loss": 0.82570875, "num_input_tokens_seen": 239717070, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.09960938, "step": 11099, "time_per_iteration": 2.6020419597625732 }, { "auxiliary_loss_clip": 0.06421246, "auxiliary_loss_mlp": 0.01264816, "balance_loss_clip": 0.06280428, "balance_loss_mlp": 0.01254886, "epoch": 0.6673681046144596, "flos": 21623366096640.0, "grad_norm": 1.7823059152095708, "language_loss": 0.78721172, "learning_rate": 1.0524978548509502e-06, "loss": 0.86407232, "num_input_tokens_seen": 239737105, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.0993042, "step": 11100, "time_per_iteration": 2.5506417751312256 }, { "auxiliary_loss_clip": 0.06423502, "auxiliary_loss_mlp": 0.01264605, "balance_loss_clip": 0.06280927, "balance_loss_mlp": 0.01254961, "epoch": 0.6674282278671276, "flos": 20896727489280.0, "grad_norm": 2.008679861047657, "language_loss": 0.60268092, "learning_rate": 1.0521548899609288e-06, "loss": 0.67956197, "num_input_tokens_seen": 239757835, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.09649658, "step": 11101, "time_per_iteration": 2.5556769371032715 }, { "auxiliary_loss_clip": 0.06432903, "auxiliary_loss_mlp": 0.01264752, "balance_loss_clip": 0.06280435, "balance_loss_mlp": 0.01253308, "epoch": 0.6674883511197955, "flos": 23631139704960.0, "grad_norm": 1.803390932009768, "language_loss": 0.71340477, "learning_rate": 1.0518119610132884e-06, "loss": 0.79038131, "num_input_tokens_seen": 239775425, "router_z_loss_clip": 1.52636719, "router_z_loss_mlp": 0.11431885, "step": 11102, "time_per_iteration": 2.560389280319214 }, { "auxiliary_loss_clip": 0.0642143, "auxiliary_loss_mlp": 0.01264649, "balance_loss_clip": 0.06275699, "balance_loss_mlp": 0.01254945, "epoch": 0.6675484743724636, "flos": 19615760196480.0, "grad_norm": 1.4610206736234639, "language_loss": 0.84599984, "learning_rate": 1.051469068021034e-06, "loss": 0.92286062, "num_input_tokens_seen": 239794605, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.0970459, "step": 11103, "time_per_iteration": 2.5301105976104736 }, { "auxiliary_loss_clip": 0.06420086, "auxiliary_loss_mlp": 0.01263962, "balance_loss_clip": 0.06275305, "balance_loss_mlp": 0.0125486, "epoch": 0.6676085976251315, "flos": 14324696202240.0, "grad_norm": 1.9551297229706117, "language_loss": 0.78635991, "learning_rate": 1.0511262109971668e-06, "loss": 0.86320043, "num_input_tokens_seen": 239812135, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.09106445, "step": 11104, "time_per_iteration": 2.5351643562316895 }, { "auxiliary_loss_clip": 0.0643189, "auxiliary_loss_mlp": 0.01265244, "balance_loss_clip": 0.06282182, "balance_loss_mlp": 0.01255118, "epoch": 0.6676687208777995, "flos": 38113219802880.0, "grad_norm": 1.709983061603157, "language_loss": 0.58627814, "learning_rate": 1.0507833899546889e-06, "loss": 0.66324949, "num_input_tokens_seen": 239835845, "router_z_loss_clip": 1.49707031, "router_z_loss_mlp": 0.10125732, "step": 11105, "time_per_iteration": 2.6997063159942627 }, { "auxiliary_loss_clip": 0.06430735, "auxiliary_loss_mlp": 0.01266952, "balance_loss_clip": 0.06278421, "balance_loss_mlp": 0.01255496, "epoch": 0.6677288441304675, "flos": 23987331411840.0, "grad_norm": 1.6273933057876566, "language_loss": 0.73288816, "learning_rate": 1.0504406049066e-06, "loss": 0.809865, "num_input_tokens_seen": 239853820, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.11450195, "step": 11106, "time_per_iteration": 2.57472825050354 }, { "auxiliary_loss_clip": 0.06422803, "auxiliary_loss_mlp": 0.01267598, "balance_loss_clip": 0.06277767, "balance_loss_mlp": 0.01257441, "epoch": 0.6677889673831354, "flos": 24177586106880.0, "grad_norm": 1.9830838724887485, "language_loss": 0.76547575, "learning_rate": 1.0500978558659e-06, "loss": 0.84237975, "num_input_tokens_seen": 239873365, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.10150146, "step": 11107, "time_per_iteration": 2.5575389862060547 }, { "auxiliary_loss_clip": 0.06418133, "auxiliary_loss_mlp": 0.01271032, "balance_loss_clip": 0.06278995, "balance_loss_mlp": 0.01261149, "epoch": 0.6678490906358034, "flos": 22316196781440.0, "grad_norm": 2.128434807619882, "language_loss": 0.89665723, "learning_rate": 1.049755142845583e-06, "loss": 0.97354889, "num_input_tokens_seen": 239891215, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09881592, "step": 11108, "time_per_iteration": 2.542508840560913 }, { "auxiliary_loss_clip": 0.06420156, "auxiliary_loss_mlp": 0.01263982, "balance_loss_clip": 0.06277567, "balance_loss_mlp": 0.01255256, "epoch": 0.6679092138884714, "flos": 36906870170880.0, "grad_norm": 1.4464562510877397, "language_loss": 0.82942379, "learning_rate": 1.049412465858646e-06, "loss": 0.90626526, "num_input_tokens_seen": 239913490, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.08728027, "step": 11109, "time_per_iteration": 2.6662423610687256 }, { "auxiliary_loss_clip": 0.06425565, "auxiliary_loss_mlp": 0.0126712, "balance_loss_clip": 0.06280264, "balance_loss_mlp": 0.0125629, "epoch": 0.6679693371411394, "flos": 18156151998720.0, "grad_norm": 1.8040605878808198, "language_loss": 0.6995818, "learning_rate": 1.0490698249180847e-06, "loss": 0.77650869, "num_input_tokens_seen": 239931565, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.1083374, "step": 11110, "time_per_iteration": 3.953439950942993 }, { "auxiliary_loss_clip": 0.06434301, "auxiliary_loss_mlp": 0.01268701, "balance_loss_clip": 0.06285743, "balance_loss_mlp": 0.01256845, "epoch": 0.6680294603938073, "flos": 27205437720960.0, "grad_norm": 1.5020046653177106, "language_loss": 0.74052614, "learning_rate": 1.04872722003689e-06, "loss": 0.81755614, "num_input_tokens_seen": 239952395, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.11853027, "step": 11111, "time_per_iteration": 2.596170663833618 }, { "auxiliary_loss_clip": 0.06425332, "auxiliary_loss_mlp": 0.0126649, "balance_loss_clip": 0.0628175, "balance_loss_mlp": 0.01256751, "epoch": 0.6680895836464753, "flos": 21731665898880.0, "grad_norm": 1.8736655011681833, "language_loss": 0.65212691, "learning_rate": 1.0483846512280553e-06, "loss": 0.72904515, "num_input_tokens_seen": 239968910, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.09741211, "step": 11112, "time_per_iteration": 2.5941309928894043 }, { "auxiliary_loss_clip": 0.06423119, "auxiliary_loss_mlp": 0.01262743, "balance_loss_clip": 0.06278273, "balance_loss_mlp": 0.01252843, "epoch": 0.6681497068991432, "flos": 19652628792960.0, "grad_norm": 2.4419129093422476, "language_loss": 0.63264197, "learning_rate": 1.048042118504569e-06, "loss": 0.70950055, "num_input_tokens_seen": 239987680, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.09906006, "step": 11113, "time_per_iteration": 2.5386478900909424 }, { "auxiliary_loss_clip": 0.06420882, "auxiliary_loss_mlp": 0.01266282, "balance_loss_clip": 0.06281536, "balance_loss_mlp": 0.01256888, "epoch": 0.6682098301518112, "flos": 17424649854720.0, "grad_norm": 1.7591450402483717, "language_loss": 0.6619851, "learning_rate": 1.047699621879422e-06, "loss": 0.73885679, "num_input_tokens_seen": 240005790, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09387207, "step": 11114, "time_per_iteration": 2.513784170150757 }, { "auxiliary_loss_clip": 0.06418891, "auxiliary_loss_mlp": 0.01264869, "balance_loss_clip": 0.06275597, "balance_loss_mlp": 0.0125476, "epoch": 0.6682699534044791, "flos": 22605191913600.0, "grad_norm": 1.5992556061420602, "language_loss": 0.7903831, "learning_rate": 1.0473571613655998e-06, "loss": 0.8672207, "num_input_tokens_seen": 240025895, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.10107422, "step": 11115, "time_per_iteration": 2.546787977218628 }, { "auxiliary_loss_clip": 0.06422342, "auxiliary_loss_mlp": 0.01268228, "balance_loss_clip": 0.06276067, "balance_loss_mlp": 0.01258119, "epoch": 0.6683300766571472, "flos": 24870668353920.0, "grad_norm": 1.7236363884599946, "language_loss": 0.80050969, "learning_rate": 1.0470147369760896e-06, "loss": 0.87741542, "num_input_tokens_seen": 240044880, "router_z_loss_clip": 1.46289062, "router_z_loss_mlp": 0.10113525, "step": 11116, "time_per_iteration": 2.578786849975586 }, { "auxiliary_loss_clip": 0.06426404, "auxiliary_loss_mlp": 0.01272952, "balance_loss_clip": 0.0627966, "balance_loss_mlp": 0.01262498, "epoch": 0.6683901999098151, "flos": 27134132296320.0, "grad_norm": 1.9898481123612015, "language_loss": 0.79406154, "learning_rate": 1.0466723487238768e-06, "loss": 0.87105513, "num_input_tokens_seen": 240065785, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.10461426, "step": 11117, "time_per_iteration": 4.061743497848511 }, { "auxiliary_loss_clip": 0.06426053, "auxiliary_loss_mlp": 0.01268662, "balance_loss_clip": 0.06279858, "balance_loss_mlp": 0.01257409, "epoch": 0.6684503231624831, "flos": 20745018472320.0, "grad_norm": 1.8734419332852617, "language_loss": 0.66136539, "learning_rate": 1.0463299966219441e-06, "loss": 0.7383126, "num_input_tokens_seen": 240085130, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.11248779, "step": 11118, "time_per_iteration": 2.5459957122802734 }, { "auxiliary_loss_clip": 0.06420343, "auxiliary_loss_mlp": 0.01264464, "balance_loss_clip": 0.06276928, "balance_loss_mlp": 0.01254844, "epoch": 0.668510446415151, "flos": 21768618349440.0, "grad_norm": 2.1270010350900495, "language_loss": 0.69107974, "learning_rate": 1.0459876806832727e-06, "loss": 0.76792783, "num_input_tokens_seen": 240105495, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.09619141, "step": 11119, "time_per_iteration": 2.5656797885894775 }, { "auxiliary_loss_clip": 0.06424236, "auxiliary_loss_mlp": 0.0126506, "balance_loss_clip": 0.06277775, "balance_loss_mlp": 0.012551, "epoch": 0.668570569667819, "flos": 30199229850240.0, "grad_norm": 1.5814817761035835, "language_loss": 0.67926389, "learning_rate": 1.0456454009208448e-06, "loss": 0.75615686, "num_input_tokens_seen": 240125455, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.0994873, "step": 11120, "time_per_iteration": 2.611865997314453 }, { "auxiliary_loss_clip": 0.06424055, "auxiliary_loss_mlp": 0.01266502, "balance_loss_clip": 0.06278019, "balance_loss_mlp": 0.01255594, "epoch": 0.668630692920487, "flos": 24177544179840.0, "grad_norm": 1.8527674442043705, "language_loss": 0.72455227, "learning_rate": 1.045303157347638e-06, "loss": 0.80145788, "num_input_tokens_seen": 240143870, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10900879, "step": 11121, "time_per_iteration": 2.5685484409332275 }, { "auxiliary_loss_clip": 0.06424008, "auxiliary_loss_mlp": 0.0126803, "balance_loss_clip": 0.06276524, "balance_loss_mlp": 0.01258153, "epoch": 0.668690816173155, "flos": 17462902043520.0, "grad_norm": 2.662018086466236, "language_loss": 0.70134389, "learning_rate": 1.0449609499766316e-06, "loss": 0.77826428, "num_input_tokens_seen": 240161020, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.09875488, "step": 11122, "time_per_iteration": 2.5004539489746094 }, { "auxiliary_loss_clip": 0.06424162, "auxiliary_loss_mlp": 0.01264471, "balance_loss_clip": 0.0627825, "balance_loss_mlp": 0.01254362, "epoch": 0.668750939425823, "flos": 25011350559360.0, "grad_norm": 1.6260082075536246, "language_loss": 0.71678948, "learning_rate": 1.0446187788208015e-06, "loss": 0.79367584, "num_input_tokens_seen": 240179820, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10113525, "step": 11123, "time_per_iteration": 2.602024793624878 }, { "auxiliary_loss_clip": 0.06429295, "auxiliary_loss_mlp": 0.01269778, "balance_loss_clip": 0.06280133, "balance_loss_mlp": 0.0125933, "epoch": 0.6688110626784909, "flos": 24103513497600.0, "grad_norm": 1.4264197779714074, "language_loss": 0.79606098, "learning_rate": 1.0442766438931244e-06, "loss": 0.8730517, "num_input_tokens_seen": 240200130, "router_z_loss_clip": 1.49121094, "router_z_loss_mlp": 0.10449219, "step": 11124, "time_per_iteration": 2.5603530406951904 }, { "auxiliary_loss_clip": 0.06422846, "auxiliary_loss_mlp": 0.01266478, "balance_loss_clip": 0.06278455, "balance_loss_mlp": 0.01256882, "epoch": 0.6688711859311589, "flos": 21765515748480.0, "grad_norm": 1.6402815442962875, "language_loss": 0.743554, "learning_rate": 1.0439345452065716e-06, "loss": 0.82044721, "num_input_tokens_seen": 240217945, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.09594727, "step": 11125, "time_per_iteration": 2.5460243225097656 }, { "auxiliary_loss_clip": 0.0642599, "auxiliary_loss_mlp": 0.0126662, "balance_loss_clip": 0.06279393, "balance_loss_mlp": 0.01256141, "epoch": 0.6689313091838268, "flos": 22936254595200.0, "grad_norm": 2.0624563057823275, "language_loss": 0.66981775, "learning_rate": 1.043592482774116e-06, "loss": 0.74674386, "num_input_tokens_seen": 240237220, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.10479736, "step": 11126, "time_per_iteration": 4.13178014755249 }, { "auxiliary_loss_clip": 0.06425689, "auxiliary_loss_mlp": 0.01267135, "balance_loss_clip": 0.0627943, "balance_loss_mlp": 0.01257109, "epoch": 0.6689914324364948, "flos": 20892367077120.0, "grad_norm": 1.5920364926803308, "language_loss": 0.71539426, "learning_rate": 1.0432504566087305e-06, "loss": 0.79232246, "num_input_tokens_seen": 240256000, "router_z_loss_clip": 1.46289062, "router_z_loss_mlp": 0.10015869, "step": 11127, "time_per_iteration": 2.687669038772583 }, { "auxiliary_loss_clip": 0.06428264, "auxiliary_loss_mlp": 0.012709, "balance_loss_clip": 0.06277175, "balance_loss_mlp": 0.01259224, "epoch": 0.6690515556891627, "flos": 22754972286720.0, "grad_norm": 1.932428572100985, "language_loss": 0.80856562, "learning_rate": 1.0429084667233827e-06, "loss": 0.88555723, "num_input_tokens_seen": 240275845, "router_z_loss_clip": 1.51074219, "router_z_loss_mlp": 0.11688232, "step": 11128, "time_per_iteration": 2.688192844390869 }, { "auxiliary_loss_clip": 0.06423753, "auxiliary_loss_mlp": 0.01267522, "balance_loss_clip": 0.06275205, "balance_loss_mlp": 0.01257145, "epoch": 0.6691116789418308, "flos": 23338203431040.0, "grad_norm": 1.5985403218644996, "language_loss": 0.81226647, "learning_rate": 1.0425665131310427e-06, "loss": 0.88917923, "num_input_tokens_seen": 240294095, "router_z_loss_clip": 1.48339844, "router_z_loss_mlp": 0.10375977, "step": 11129, "time_per_iteration": 4.003838062286377 }, { "auxiliary_loss_clip": 0.06416684, "auxiliary_loss_mlp": 0.01266701, "balance_loss_clip": 0.06276041, "balance_loss_mlp": 0.01256729, "epoch": 0.6691718021944987, "flos": 32454308384640.0, "grad_norm": 2.277321255454589, "language_loss": 0.7044546, "learning_rate": 1.0422245958446762e-06, "loss": 0.78128844, "num_input_tokens_seen": 240313460, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09973145, "step": 11130, "time_per_iteration": 2.624223232269287 }, { "auxiliary_loss_clip": 0.06419064, "auxiliary_loss_mlp": 0.01266733, "balance_loss_clip": 0.06277336, "balance_loss_mlp": 0.01257107, "epoch": 0.6692319254471667, "flos": 23738223623040.0, "grad_norm": 1.5999072206830112, "language_loss": 0.70591378, "learning_rate": 1.0418827148772486e-06, "loss": 0.78277177, "num_input_tokens_seen": 240333540, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09631348, "step": 11131, "time_per_iteration": 2.583549737930298 }, { "auxiliary_loss_clip": 0.06424788, "auxiliary_loss_mlp": 0.01267787, "balance_loss_clip": 0.06276406, "balance_loss_mlp": 0.0125648, "epoch": 0.6692920486998346, "flos": 14432996004480.0, "grad_norm": 2.4161956152622777, "language_loss": 0.66154569, "learning_rate": 1.0415408702417243e-06, "loss": 0.73847139, "num_input_tokens_seen": 240350085, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.11309814, "step": 11132, "time_per_iteration": 2.523367404937744 }, { "auxiliary_loss_clip": 0.06428893, "auxiliary_loss_mlp": 0.01267057, "balance_loss_clip": 0.06280234, "balance_loss_mlp": 0.01255875, "epoch": 0.6693521719525026, "flos": 21513976191360.0, "grad_norm": 1.7300886821683854, "language_loss": 0.74768466, "learning_rate": 1.0411990619510661e-06, "loss": 0.82464409, "num_input_tokens_seen": 240370015, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.11181641, "step": 11133, "time_per_iteration": 2.5631930828094482 }, { "auxiliary_loss_clip": 0.06433589, "auxiliary_loss_mlp": 0.01271541, "balance_loss_clip": 0.06282978, "balance_loss_mlp": 0.01259448, "epoch": 0.6694122952051706, "flos": 25413341322240.0, "grad_norm": 1.8080474026443392, "language_loss": 0.66543001, "learning_rate": 1.0408572900182363e-06, "loss": 0.74248135, "num_input_tokens_seen": 240390770, "router_z_loss_clip": 1.50683594, "router_z_loss_mlp": 0.12103271, "step": 11134, "time_per_iteration": 2.5931525230407715 }, { "auxiliary_loss_clip": 0.06434737, "auxiliary_loss_mlp": 0.01267339, "balance_loss_clip": 0.062831, "balance_loss_mlp": 0.01256389, "epoch": 0.6694724184578386, "flos": 25668067334400.0, "grad_norm": 1.8586135818257028, "language_loss": 0.7736398, "learning_rate": 1.0405155544561943e-06, "loss": 0.85066056, "num_input_tokens_seen": 240409590, "router_z_loss_clip": 1.51757812, "router_z_loss_mlp": 0.10955811, "step": 11135, "time_per_iteration": 2.5880606174468994 }, { "auxiliary_loss_clip": 0.06418115, "auxiliary_loss_mlp": 0.01265349, "balance_loss_clip": 0.06275301, "balance_loss_mlp": 0.01255044, "epoch": 0.6695325417105066, "flos": 17714567381760.0, "grad_norm": 1.9675137324258134, "language_loss": 0.73993289, "learning_rate": 1.040173855277898e-06, "loss": 0.81676751, "num_input_tokens_seen": 240428180, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10314941, "step": 11136, "time_per_iteration": 2.551330089569092 }, { "auxiliary_loss_clip": 0.06435462, "auxiliary_loss_mlp": 0.01266243, "balance_loss_clip": 0.06282547, "balance_loss_mlp": 0.01254966, "epoch": 0.6695926649631745, "flos": 24466581239040.0, "grad_norm": 1.5947838883555125, "language_loss": 0.62117159, "learning_rate": 1.0398321924963061e-06, "loss": 0.69818866, "num_input_tokens_seen": 240447815, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.11273193, "step": 11137, "time_per_iteration": 2.6083192825317383 }, { "auxiliary_loss_clip": 0.06428062, "auxiliary_loss_mlp": 0.01268589, "balance_loss_clip": 0.06282493, "balance_loss_mlp": 0.0125795, "epoch": 0.6696527882158425, "flos": 24287059866240.0, "grad_norm": 2.0954044619691103, "language_loss": 0.66557884, "learning_rate": 1.0394905661243724e-06, "loss": 0.74254537, "num_input_tokens_seen": 240468635, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10638428, "step": 11138, "time_per_iteration": 2.5908279418945312 }, { "auxiliary_loss_clip": 0.0642013, "auxiliary_loss_mlp": 0.01265802, "balance_loss_clip": 0.06278473, "balance_loss_mlp": 0.01256111, "epoch": 0.6697129114685104, "flos": 23009404809600.0, "grad_norm": 1.5739903800630592, "language_loss": 0.73138916, "learning_rate": 1.039148976175053e-06, "loss": 0.80824852, "num_input_tokens_seen": 240488550, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09692383, "step": 11139, "time_per_iteration": 2.5660781860351562 }, { "auxiliary_loss_clip": 0.06415105, "auxiliary_loss_mlp": 0.01266488, "balance_loss_clip": 0.06275581, "balance_loss_mlp": 0.01256832, "epoch": 0.6697730347211784, "flos": 22644743840640.0, "grad_norm": 2.472915060630476, "language_loss": 0.70780188, "learning_rate": 1.0388074226613016e-06, "loss": 0.78461778, "num_input_tokens_seen": 240508330, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09655762, "step": 11140, "time_per_iteration": 2.624868869781494 }, { "auxiliary_loss_clip": 0.0642857, "auxiliary_loss_mlp": 0.01265524, "balance_loss_clip": 0.06279358, "balance_loss_mlp": 0.01254801, "epoch": 0.6698331579738463, "flos": 28884915832320.0, "grad_norm": 1.7912401635541029, "language_loss": 0.76179224, "learning_rate": 1.0384659055960691e-06, "loss": 0.8387332, "num_input_tokens_seen": 240528470, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.1072998, "step": 11141, "time_per_iteration": 2.6172869205474854 }, { "auxiliary_loss_clip": 0.06420414, "auxiliary_loss_mlp": 0.01271541, "balance_loss_clip": 0.06274803, "balance_loss_mlp": 0.01261205, "epoch": 0.6698932812265144, "flos": 24213993505920.0, "grad_norm": 1.7452202135672656, "language_loss": 0.82560188, "learning_rate": 1.0381244249923052e-06, "loss": 0.90252143, "num_input_tokens_seen": 240547815, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10327148, "step": 11142, "time_per_iteration": 2.6064202785491943 }, { "auxiliary_loss_clip": 0.06419601, "auxiliary_loss_mlp": 0.01267605, "balance_loss_clip": 0.06277014, "balance_loss_mlp": 0.01257669, "epoch": 0.6699534044791823, "flos": 22096704211200.0, "grad_norm": 1.4442629956099284, "language_loss": 0.70057034, "learning_rate": 1.037782980862959e-06, "loss": 0.77744234, "num_input_tokens_seen": 240567765, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09936523, "step": 11143, "time_per_iteration": 2.546783208847046 }, { "auxiliary_loss_clip": 0.06417769, "auxiliary_loss_mlp": 0.01264351, "balance_loss_clip": 0.06276437, "balance_loss_mlp": 0.01254879, "epoch": 0.6700135277318503, "flos": 25199466975360.0, "grad_norm": 1.5075521819557445, "language_loss": 0.70430732, "learning_rate": 1.0374415732209796e-06, "loss": 0.78112853, "num_input_tokens_seen": 240590750, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09472656, "step": 11144, "time_per_iteration": 2.6356425285339355 }, { "auxiliary_loss_clip": 0.06418286, "auxiliary_loss_mlp": 0.01266693, "balance_loss_clip": 0.06275593, "balance_loss_mlp": 0.0125569, "epoch": 0.6700736509845182, "flos": 23446838649600.0, "grad_norm": 1.6624487584917778, "language_loss": 0.74916136, "learning_rate": 1.0371002020793114e-06, "loss": 0.82601118, "num_input_tokens_seen": 240608875, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.11004639, "step": 11145, "time_per_iteration": 2.554497003555298 }, { "auxiliary_loss_clip": 0.06429955, "auxiliary_loss_mlp": 0.01267057, "balance_loss_clip": 0.06281221, "balance_loss_mlp": 0.01256293, "epoch": 0.6701337742371862, "flos": 24396952896000.0, "grad_norm": 1.5864041535877953, "language_loss": 0.71139365, "learning_rate": 1.0367588674509008e-06, "loss": 0.78836381, "num_input_tokens_seen": 240628565, "router_z_loss_clip": 1.48925781, "router_z_loss_mlp": 0.10766602, "step": 11146, "time_per_iteration": 2.6191887855529785 }, { "auxiliary_loss_clip": 0.06416068, "auxiliary_loss_mlp": 0.0126473, "balance_loss_clip": 0.06276585, "balance_loss_mlp": 0.01255038, "epoch": 0.6701938974898543, "flos": 14798956711680.0, "grad_norm": 1.8509365112000098, "language_loss": 0.78356421, "learning_rate": 1.0364175693486905e-06, "loss": 0.86037225, "num_input_tokens_seen": 240646325, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09680176, "step": 11147, "time_per_iteration": 2.524921417236328 }, { "auxiliary_loss_clip": 0.06420259, "auxiliary_loss_mlp": 0.0127008, "balance_loss_clip": 0.06276315, "balance_loss_mlp": 0.01259322, "epoch": 0.6702540207425222, "flos": 20159690976000.0, "grad_norm": 2.047473051865521, "language_loss": 0.71007657, "learning_rate": 1.0360763077856218e-06, "loss": 0.78697991, "num_input_tokens_seen": 240666145, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10754395, "step": 11148, "time_per_iteration": 2.545710325241089 }, { "auxiliary_loss_clip": 0.06422399, "auxiliary_loss_mlp": 0.01266583, "balance_loss_clip": 0.06277847, "balance_loss_mlp": 0.01256641, "epoch": 0.6703141439951902, "flos": 21220369084800.0, "grad_norm": 1.780213872043051, "language_loss": 0.70906281, "learning_rate": 1.035735082774636e-06, "loss": 0.78595269, "num_input_tokens_seen": 240685570, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.09942627, "step": 11149, "time_per_iteration": 4.092790126800537 }, { "auxiliary_loss_clip": 0.06423673, "auxiliary_loss_mlp": 0.01264338, "balance_loss_clip": 0.06275902, "balance_loss_mlp": 0.0125414, "epoch": 0.6703742672478581, "flos": 23119255912320.0, "grad_norm": 2.0827565482439274, "language_loss": 0.73921406, "learning_rate": 1.0353938943286727e-06, "loss": 0.81609416, "num_input_tokens_seen": 240706945, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.10198975, "step": 11150, "time_per_iteration": 2.611637592315674 }, { "auxiliary_loss_clip": 0.06427106, "auxiliary_loss_mlp": 0.01265875, "balance_loss_clip": 0.06281081, "balance_loss_mlp": 0.01255754, "epoch": 0.6704343905005261, "flos": 22535563570560.0, "grad_norm": 1.7014300737584869, "language_loss": 0.78498787, "learning_rate": 1.035052742460671e-06, "loss": 0.86191767, "num_input_tokens_seen": 240727990, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10119629, "step": 11151, "time_per_iteration": 2.5829129219055176 }, { "auxiliary_loss_clip": 0.06314196, "auxiliary_loss_mlp": 0.01254539, "balance_loss_clip": 0.06255381, "balance_loss_mlp": 0.01253363, "epoch": 0.670494513753194, "flos": 64815270192000.0, "grad_norm": 0.7627322192315277, "language_loss": 0.55431664, "learning_rate": 1.0347116271835643e-06, "loss": 0.63000399, "num_input_tokens_seen": 240790380, "router_z_loss_clip": 0.58691406, "router_z_loss_mlp": 0.01174927, "step": 11152, "time_per_iteration": 3.263821601867676 }, { "auxiliary_loss_clip": 0.06424854, "auxiliary_loss_mlp": 0.01266661, "balance_loss_clip": 0.06278103, "balance_loss_mlp": 0.01256332, "epoch": 0.670554637005862, "flos": 23517892512000.0, "grad_norm": 1.6386165032140667, "language_loss": 0.81350386, "learning_rate": 1.0343705485102896e-06, "loss": 0.89041901, "num_input_tokens_seen": 240811545, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.10327148, "step": 11153, "time_per_iteration": 2.602632761001587 }, { "auxiliary_loss_clip": 0.06427559, "auxiliary_loss_mlp": 0.01266314, "balance_loss_clip": 0.06280419, "balance_loss_mlp": 0.01255913, "epoch": 0.67061476025853, "flos": 19469417840640.0, "grad_norm": 3.7519027000895075, "language_loss": 0.76344931, "learning_rate": 1.0340295064537814e-06, "loss": 0.84038806, "num_input_tokens_seen": 240831380, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.10406494, "step": 11154, "time_per_iteration": 2.5838875770568848 }, { "auxiliary_loss_clip": 0.06426816, "auxiliary_loss_mlp": 0.01272818, "balance_loss_clip": 0.06277677, "balance_loss_mlp": 0.01261791, "epoch": 0.670674883511198, "flos": 20525903245440.0, "grad_norm": 1.667906021442215, "language_loss": 0.7667886, "learning_rate": 1.0336885010269702e-06, "loss": 0.84378493, "num_input_tokens_seen": 240851855, "router_z_loss_clip": 1.49121094, "router_z_loss_mlp": 0.11029053, "step": 11155, "time_per_iteration": 2.5628561973571777 }, { "auxiliary_loss_clip": 0.06423572, "auxiliary_loss_mlp": 0.01265006, "balance_loss_clip": 0.06277047, "balance_loss_mlp": 0.01255177, "epoch": 0.6707350067638659, "flos": 25491061584000.0, "grad_norm": 1.9084094588713107, "language_loss": 0.8183257, "learning_rate": 1.0333475322427878e-06, "loss": 0.89521152, "num_input_tokens_seen": 240869980, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.09832764, "step": 11156, "time_per_iteration": 4.060584783554077 }, { "auxiliary_loss_clip": 0.06422149, "auxiliary_loss_mlp": 0.01266086, "balance_loss_clip": 0.06278487, "balance_loss_mlp": 0.01256687, "epoch": 0.6707951300165339, "flos": 22280040944640.0, "grad_norm": 2.277556442747367, "language_loss": 0.75221658, "learning_rate": 1.033006600114165e-06, "loss": 0.82909894, "num_input_tokens_seen": 240888680, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.09411621, "step": 11157, "time_per_iteration": 2.576228141784668 }, { "auxiliary_loss_clip": 0.06427679, "auxiliary_loss_mlp": 0.01269152, "balance_loss_clip": 0.0627982, "balance_loss_mlp": 0.01258823, "epoch": 0.6708552532692018, "flos": 23990853283200.0, "grad_norm": 1.4767839039011155, "language_loss": 0.74298179, "learning_rate": 1.0326657046540282e-06, "loss": 0.81995016, "num_input_tokens_seen": 240909050, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.10327148, "step": 11158, "time_per_iteration": 2.6124801635742188 }, { "auxiliary_loss_clip": 0.06428164, "auxiliary_loss_mlp": 0.01263903, "balance_loss_clip": 0.06279306, "balance_loss_mlp": 0.01254062, "epoch": 0.6709153765218698, "flos": 24944657109120.0, "grad_norm": 1.4275790252884748, "language_loss": 0.81860721, "learning_rate": 1.0323248458753044e-06, "loss": 0.89552796, "num_input_tokens_seen": 240930035, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.09844971, "step": 11159, "time_per_iteration": 2.577615261077881 }, { "auxiliary_loss_clip": 0.06423567, "auxiliary_loss_mlp": 0.01266721, "balance_loss_clip": 0.06277673, "balance_loss_mlp": 0.01256749, "epoch": 0.6709754997745379, "flos": 17536010330880.0, "grad_norm": 2.1817966068349373, "language_loss": 0.77679026, "learning_rate": 1.0319840237909193e-06, "loss": 0.85369313, "num_input_tokens_seen": 240948895, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.09973145, "step": 11160, "time_per_iteration": 2.5613949298858643 }, { "auxiliary_loss_clip": 0.06421037, "auxiliary_loss_mlp": 0.01263748, "balance_loss_clip": 0.0627765, "balance_loss_mlp": 0.01254503, "epoch": 0.6710356230272058, "flos": 22097416970880.0, "grad_norm": 2.862126706105882, "language_loss": 0.73974955, "learning_rate": 1.0316432384137978e-06, "loss": 0.8165974, "num_input_tokens_seen": 240967770, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09259033, "step": 11161, "time_per_iteration": 2.534951686859131 }, { "auxiliary_loss_clip": 0.06427429, "auxiliary_loss_mlp": 0.01268635, "balance_loss_clip": 0.06278168, "balance_loss_mlp": 0.0125793, "epoch": 0.6710957462798738, "flos": 24213238819200.0, "grad_norm": 1.9348151986847395, "language_loss": 0.68731147, "learning_rate": 1.0313024897568618e-06, "loss": 0.76427209, "num_input_tokens_seen": 240988985, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.10705566, "step": 11162, "time_per_iteration": 2.61863374710083 }, { "auxiliary_loss_clip": 0.06421029, "auxiliary_loss_mlp": 0.01265815, "balance_loss_clip": 0.0627737, "balance_loss_mlp": 0.01255873, "epoch": 0.6711558695325417, "flos": 19099138648320.0, "grad_norm": 3.800506808353587, "language_loss": 0.70145655, "learning_rate": 1.030961777833032e-06, "loss": 0.77832496, "num_input_tokens_seen": 241005455, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.09936523, "step": 11163, "time_per_iteration": 2.5448591709136963 }, { "auxiliary_loss_clip": 0.06417251, "auxiliary_loss_mlp": 0.0126273, "balance_loss_clip": 0.06275351, "balance_loss_mlp": 0.01253324, "epoch": 0.6712159927852097, "flos": 25565134193280.0, "grad_norm": 1.561352056826397, "language_loss": 0.75984639, "learning_rate": 1.0306211026552291e-06, "loss": 0.8366462, "num_input_tokens_seen": 241026175, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09411621, "step": 11164, "time_per_iteration": 2.603703260421753 }, { "auxiliary_loss_clip": 0.06426434, "auxiliary_loss_mlp": 0.01265416, "balance_loss_clip": 0.06280315, "balance_loss_mlp": 0.01254997, "epoch": 0.6712761160378776, "flos": 22234032253440.0, "grad_norm": 1.9767408329355385, "language_loss": 0.65606546, "learning_rate": 1.0302804642363704e-06, "loss": 0.73298395, "num_input_tokens_seen": 241044040, "router_z_loss_clip": 1.46289062, "router_z_loss_mlp": 0.10418701, "step": 11165, "time_per_iteration": 3.9671268463134766 }, { "auxiliary_loss_clip": 0.06421505, "auxiliary_loss_mlp": 0.01268872, "balance_loss_clip": 0.06277885, "balance_loss_mlp": 0.01258525, "epoch": 0.6713362392905456, "flos": 22462077939840.0, "grad_norm": 2.3122734883254576, "language_loss": 0.72228193, "learning_rate": 1.0299398625893738e-06, "loss": 0.79918563, "num_input_tokens_seen": 241063615, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10345459, "step": 11166, "time_per_iteration": 2.573796272277832 }, { "auxiliary_loss_clip": 0.0642066, "auxiliary_loss_mlp": 0.01265571, "balance_loss_clip": 0.06279417, "balance_loss_mlp": 0.01255856, "epoch": 0.6713963625432136, "flos": 25637362012800.0, "grad_norm": 1.7743299214981805, "language_loss": 0.77366805, "learning_rate": 1.0295992977271546e-06, "loss": 0.85053033, "num_input_tokens_seen": 241082520, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09716797, "step": 11167, "time_per_iteration": 2.57243275642395 }, { "auxiliary_loss_clip": 0.06423707, "auxiliary_loss_mlp": 0.01270551, "balance_loss_clip": 0.06277132, "balance_loss_mlp": 0.0125987, "epoch": 0.6714564857958816, "flos": 35015110940160.0, "grad_norm": 1.8511886692098123, "language_loss": 0.68792689, "learning_rate": 1.029258769662629e-06, "loss": 0.76486945, "num_input_tokens_seen": 241103505, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.10681152, "step": 11168, "time_per_iteration": 4.1677329540252686 }, { "auxiliary_loss_clip": 0.06430553, "auxiliary_loss_mlp": 0.01269032, "balance_loss_clip": 0.06282794, "balance_loss_mlp": 0.01258124, "epoch": 0.6715166090485495, "flos": 26286028796160.0, "grad_norm": 2.000565312964548, "language_loss": 0.73057854, "learning_rate": 1.0289182784087068e-06, "loss": 0.80757439, "num_input_tokens_seen": 241122885, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.10900879, "step": 11169, "time_per_iteration": 2.569222927093506 }, { "auxiliary_loss_clip": 0.06423327, "auxiliary_loss_mlp": 0.01264657, "balance_loss_clip": 0.06275278, "balance_loss_mlp": 0.01253594, "epoch": 0.6715767323012175, "flos": 15929556652800.0, "grad_norm": 2.0921213574774375, "language_loss": 0.7630229, "learning_rate": 1.0285778239783005e-06, "loss": 0.83990276, "num_input_tokens_seen": 241140865, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.1105957, "step": 11170, "time_per_iteration": 2.5538318157196045 }, { "auxiliary_loss_clip": 0.06429041, "auxiliary_loss_mlp": 0.01267095, "balance_loss_clip": 0.06280564, "balance_loss_mlp": 0.01256729, "epoch": 0.6716368555538854, "flos": 17496835747200.0, "grad_norm": 2.2335126045717986, "language_loss": 0.75243461, "learning_rate": 1.0282374063843212e-06, "loss": 0.82939595, "num_input_tokens_seen": 241158225, "router_z_loss_clip": 1.48632812, "router_z_loss_mlp": 0.10369873, "step": 11171, "time_per_iteration": 2.5282154083251953 }, { "auxiliary_loss_clip": 0.06430259, "auxiliary_loss_mlp": 0.01266555, "balance_loss_clip": 0.06281883, "balance_loss_mlp": 0.01256106, "epoch": 0.6716969788065534, "flos": 16766759122560.0, "grad_norm": 1.4937015996290033, "language_loss": 0.8659575, "learning_rate": 1.0278970256396762e-06, "loss": 0.94292569, "num_input_tokens_seen": 241175215, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.10443115, "step": 11172, "time_per_iteration": 2.5441558361053467 }, { "auxiliary_loss_clip": 0.0642326, "auxiliary_loss_mlp": 0.01269026, "balance_loss_clip": 0.06277521, "balance_loss_mlp": 0.01258762, "epoch": 0.6717571020592215, "flos": 22716216973440.0, "grad_norm": 1.841184788761429, "language_loss": 0.63843155, "learning_rate": 1.0275566817572733e-06, "loss": 0.71535444, "num_input_tokens_seen": 241195250, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.10253906, "step": 11173, "time_per_iteration": 2.5464413166046143 }, { "auxiliary_loss_clip": 0.06440844, "auxiliary_loss_mlp": 0.01270972, "balance_loss_clip": 0.0628383, "balance_loss_mlp": 0.01259295, "epoch": 0.6718172253118894, "flos": 18740053975680.0, "grad_norm": 2.2048956758756924, "language_loss": 0.71867299, "learning_rate": 1.02721637475002e-06, "loss": 0.79579115, "num_input_tokens_seen": 241210720, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.11676025, "step": 11174, "time_per_iteration": 2.6479437351226807 }, { "auxiliary_loss_clip": 0.06417559, "auxiliary_loss_mlp": 0.01264398, "balance_loss_clip": 0.0627614, "balance_loss_mlp": 0.01254516, "epoch": 0.6718773485645574, "flos": 15637920117120.0, "grad_norm": 2.0531729035152715, "language_loss": 0.69323158, "learning_rate": 1.0268761046308178e-06, "loss": 0.77005124, "num_input_tokens_seen": 241227395, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09881592, "step": 11175, "time_per_iteration": 2.6852078437805176 }, { "auxiliary_loss_clip": 0.06423077, "auxiliary_loss_mlp": 0.01267778, "balance_loss_clip": 0.06281185, "balance_loss_mlp": 0.01257699, "epoch": 0.6719374718172253, "flos": 19360908403200.0, "grad_norm": 1.9747381003126938, "language_loss": 0.74347544, "learning_rate": 1.0265358714125714e-06, "loss": 0.82038403, "num_input_tokens_seen": 241246355, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10076904, "step": 11176, "time_per_iteration": 2.688730239868164 }, { "auxiliary_loss_clip": 0.06427078, "auxiliary_loss_mlp": 0.01266866, "balance_loss_clip": 0.0627815, "balance_loss_mlp": 0.01256382, "epoch": 0.6719975950698933, "flos": 21987817430400.0, "grad_norm": 1.6619792900299335, "language_loss": 0.73049992, "learning_rate": 1.026195675108182e-06, "loss": 0.80743933, "num_input_tokens_seen": 241264180, "router_z_loss_clip": 1.48730469, "router_z_loss_mlp": 0.1048584, "step": 11177, "time_per_iteration": 2.550316333770752 }, { "auxiliary_loss_clip": 0.06425984, "auxiliary_loss_mlp": 0.01269831, "balance_loss_clip": 0.06278695, "balance_loss_mlp": 0.01258209, "epoch": 0.6720577183225612, "flos": 25235035833600.0, "grad_norm": 1.9996353686456159, "language_loss": 0.77074242, "learning_rate": 1.025855515730551e-06, "loss": 0.84770048, "num_input_tokens_seen": 241282245, "router_z_loss_clip": 1.47363281, "router_z_loss_mlp": 0.11627197, "step": 11178, "time_per_iteration": 2.628924608230591 }, { "auxiliary_loss_clip": 0.06423505, "auxiliary_loss_mlp": 0.01263928, "balance_loss_clip": 0.06276719, "balance_loss_mlp": 0.01254451, "epoch": 0.6721178415752292, "flos": 16951479448320.0, "grad_norm": 1.5718763634834112, "language_loss": 0.70070028, "learning_rate": 1.0255153932925766e-06, "loss": 0.7775746, "num_input_tokens_seen": 241300745, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.09466553, "step": 11179, "time_per_iteration": 2.5201609134674072 }, { "auxiliary_loss_clip": 0.06418845, "auxiliary_loss_mlp": 0.01264543, "balance_loss_clip": 0.06275295, "balance_loss_mlp": 0.01255167, "epoch": 0.6721779648278972, "flos": 21547448697600.0, "grad_norm": 1.6757773887715133, "language_loss": 0.74312651, "learning_rate": 1.0251753078071557e-06, "loss": 0.81996042, "num_input_tokens_seen": 241319320, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.09368896, "step": 11180, "time_per_iteration": 2.571959972381592 }, { "auxiliary_loss_clip": 0.06420634, "auxiliary_loss_mlp": 0.01263924, "balance_loss_clip": 0.06276734, "balance_loss_mlp": 0.01253535, "epoch": 0.6722380880805652, "flos": 22612696853760.0, "grad_norm": 1.3942602535904733, "language_loss": 0.75075197, "learning_rate": 1.0248352592871848e-06, "loss": 0.82759756, "num_input_tokens_seen": 241342225, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10394287, "step": 11181, "time_per_iteration": 2.6021203994750977 }, { "auxiliary_loss_clip": 0.06426266, "auxiliary_loss_mlp": 0.01264169, "balance_loss_clip": 0.06277475, "balance_loss_mlp": 0.01253732, "epoch": 0.6722982113332331, "flos": 15930856391040.0, "grad_norm": 2.4163677971761293, "language_loss": 0.75018966, "learning_rate": 1.0244952477455585e-06, "loss": 0.82709402, "num_input_tokens_seen": 241358240, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.10424805, "step": 11182, "time_per_iteration": 2.611297130584717 }, { "auxiliary_loss_clip": 0.06421102, "auxiliary_loss_mlp": 0.01267628, "balance_loss_clip": 0.06278498, "balance_loss_mlp": 0.01257949, "epoch": 0.6723583345859011, "flos": 20602659185280.0, "grad_norm": 1.67242449480447, "language_loss": 0.69894844, "learning_rate": 1.0241552731951699e-06, "loss": 0.77583575, "num_input_tokens_seen": 241378420, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09674072, "step": 11183, "time_per_iteration": 2.575201988220215 }, { "auxiliary_loss_clip": 0.06423189, "auxiliary_loss_mlp": 0.0126976, "balance_loss_clip": 0.06276633, "balance_loss_mlp": 0.01259764, "epoch": 0.672418457838569, "flos": 21732294804480.0, "grad_norm": 1.550044953227719, "language_loss": 0.78221744, "learning_rate": 1.0238153356489112e-06, "loss": 0.85914695, "num_input_tokens_seen": 241397185, "router_z_loss_clip": 1.46289062, "router_z_loss_mlp": 0.09985352, "step": 11184, "time_per_iteration": 2.568465232849121 }, { "auxiliary_loss_clip": 0.06432189, "auxiliary_loss_mlp": 0.01269293, "balance_loss_clip": 0.06278846, "balance_loss_mlp": 0.01258164, "epoch": 0.672478581091237, "flos": 21476772178560.0, "grad_norm": 2.0414181328522796, "language_loss": 0.66332364, "learning_rate": 1.0234754351196743e-06, "loss": 0.74033844, "num_input_tokens_seen": 241415785, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.11126709, "step": 11185, "time_per_iteration": 2.604370355606079 }, { "auxiliary_loss_clip": 0.06421512, "auxiliary_loss_mlp": 0.0126555, "balance_loss_clip": 0.06277075, "balance_loss_mlp": 0.01255102, "epoch": 0.6725387043439051, "flos": 30854646887040.0, "grad_norm": 2.4349131967724698, "language_loss": 0.80401146, "learning_rate": 1.023135571620345e-06, "loss": 0.88088208, "num_input_tokens_seen": 241437390, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10443115, "step": 11186, "time_per_iteration": 2.6855928897857666 }, { "auxiliary_loss_clip": 0.0641564, "auxiliary_loss_mlp": 0.01267153, "balance_loss_clip": 0.06274974, "balance_loss_mlp": 0.01257997, "epoch": 0.672598827596573, "flos": 24061949072640.0, "grad_norm": 1.3776112493200088, "language_loss": 0.80505037, "learning_rate": 1.022795745163813e-06, "loss": 0.88187832, "num_input_tokens_seen": 241458085, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09155273, "step": 11187, "time_per_iteration": 2.6233766078948975 }, { "auxiliary_loss_clip": 0.06431161, "auxiliary_loss_mlp": 0.01269021, "balance_loss_clip": 0.06279195, "balance_loss_mlp": 0.01258358, "epoch": 0.672658950849241, "flos": 21878343671040.0, "grad_norm": 1.8141309203426292, "language_loss": 0.71032155, "learning_rate": 1.022455955762965e-06, "loss": 0.7873233, "num_input_tokens_seen": 241476880, "router_z_loss_clip": 1.52148438, "router_z_loss_mlp": 0.10668945, "step": 11188, "time_per_iteration": 4.008691310882568 }, { "auxiliary_loss_clip": 0.06414833, "auxiliary_loss_mlp": 0.01268159, "balance_loss_clip": 0.0627534, "balance_loss_mlp": 0.01258563, "epoch": 0.6727190741019089, "flos": 23228855452800.0, "grad_norm": 2.026367272592167, "language_loss": 0.759942, "learning_rate": 1.0221162034306842e-06, "loss": 0.83677191, "num_input_tokens_seen": 241496535, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.0960083, "step": 11189, "time_per_iteration": 2.5651676654815674 }, { "auxiliary_loss_clip": 0.06426505, "auxiliary_loss_mlp": 0.01264562, "balance_loss_clip": 0.06276597, "balance_loss_mlp": 0.01253553, "epoch": 0.6727791973545769, "flos": 15784052837760.0, "grad_norm": 2.2580827112311814, "language_loss": 0.75454438, "learning_rate": 1.0217764881798562e-06, "loss": 0.83145499, "num_input_tokens_seen": 241513465, "router_z_loss_clip": 1.49804688, "router_z_loss_mlp": 0.11016846, "step": 11190, "time_per_iteration": 2.52755069732666 }, { "auxiliary_loss_clip": 0.06421074, "auxiliary_loss_mlp": 0.01267504, "balance_loss_clip": 0.06277797, "balance_loss_mlp": 0.01257002, "epoch": 0.6728393206072448, "flos": 21255937943040.0, "grad_norm": 1.4378449178240478, "language_loss": 0.77236319, "learning_rate": 1.0214368100233612e-06, "loss": 0.84924901, "num_input_tokens_seen": 241534125, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.1050415, "step": 11191, "time_per_iteration": 2.577845335006714 }, { "auxiliary_loss_clip": 0.06418651, "auxiliary_loss_mlp": 0.01264799, "balance_loss_clip": 0.06276943, "balance_loss_mlp": 0.01255167, "epoch": 0.6728994438599128, "flos": 32131295694720.0, "grad_norm": 1.733946979726839, "language_loss": 0.863967, "learning_rate": 1.0210971689740802e-06, "loss": 0.9408015, "num_input_tokens_seen": 241556340, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09619141, "step": 11192, "time_per_iteration": 2.6869616508483887 }, { "auxiliary_loss_clip": 0.06428705, "auxiliary_loss_mlp": 0.01268247, "balance_loss_clip": 0.0628199, "balance_loss_mlp": 0.01257512, "epoch": 0.6729595671125808, "flos": 23119046277120.0, "grad_norm": 1.7646126452875537, "language_loss": 0.76039708, "learning_rate": 1.0207575650448923e-06, "loss": 0.83736658, "num_input_tokens_seen": 241575185, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.10736084, "step": 11193, "time_per_iteration": 2.5482559204101562 }, { "auxiliary_loss_clip": 0.06422238, "auxiliary_loss_mlp": 0.01266652, "balance_loss_clip": 0.06277079, "balance_loss_mlp": 0.0125528, "epoch": 0.6730196903652488, "flos": 14616710081280.0, "grad_norm": 1.9748361250244046, "language_loss": 0.79107356, "learning_rate": 1.0204179982486758e-06, "loss": 0.86796242, "num_input_tokens_seen": 241592970, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.11376953, "step": 11194, "time_per_iteration": 2.589038610458374 }, { "auxiliary_loss_clip": 0.06424298, "auxiliary_loss_mlp": 0.01262862, "balance_loss_clip": 0.0627719, "balance_loss_mlp": 0.01252598, "epoch": 0.6730798136179167, "flos": 21112320844800.0, "grad_norm": 1.7826170780227901, "language_loss": 0.90294838, "learning_rate": 1.0200784685983075e-06, "loss": 0.97982007, "num_input_tokens_seen": 241610245, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.1026001, "step": 11195, "time_per_iteration": 2.558621883392334 }, { "auxiliary_loss_clip": 0.06416427, "auxiliary_loss_mlp": 0.01266291, "balance_loss_clip": 0.06274599, "balance_loss_mlp": 0.01256092, "epoch": 0.6731399368705847, "flos": 28993886467200.0, "grad_norm": 1.6095132923478848, "language_loss": 0.72606635, "learning_rate": 1.019738976106662e-06, "loss": 0.80289352, "num_input_tokens_seen": 241630350, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.10211182, "step": 11196, "time_per_iteration": 4.077466249465942 }, { "auxiliary_loss_clip": 0.06306496, "auxiliary_loss_mlp": 0.01254144, "balance_loss_clip": 0.06248008, "balance_loss_mlp": 0.0125297, "epoch": 0.6732000601232526, "flos": 64763643277440.0, "grad_norm": 0.7563370256744331, "language_loss": 0.56518471, "learning_rate": 1.0193995207866123e-06, "loss": 0.64079106, "num_input_tokens_seen": 241692380, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.01171875, "step": 11197, "time_per_iteration": 3.1100738048553467 }, { "auxiliary_loss_clip": 0.06410421, "auxiliary_loss_mlp": 0.01266168, "balance_loss_clip": 0.06271353, "balance_loss_mlp": 0.01256649, "epoch": 0.6732601833759206, "flos": 17207337490560.0, "grad_norm": 2.025208327185953, "language_loss": 0.76009202, "learning_rate": 1.0190601026510312e-06, "loss": 0.83685791, "num_input_tokens_seen": 241710430, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09521484, "step": 11198, "time_per_iteration": 2.595405101776123 }, { "auxiliary_loss_clip": 0.06423949, "auxiliary_loss_mlp": 0.01262442, "balance_loss_clip": 0.06276341, "balance_loss_mlp": 0.01252029, "epoch": 0.6733203066285887, "flos": 18664430065920.0, "grad_norm": 1.9090395736804178, "language_loss": 0.81558251, "learning_rate": 1.0187207217127892e-06, "loss": 0.8924464, "num_input_tokens_seen": 241724775, "router_z_loss_clip": 1.47558594, "router_z_loss_mlp": 0.10412598, "step": 11199, "time_per_iteration": 2.5499138832092285 }, { "auxiliary_loss_clip": 0.06423866, "auxiliary_loss_mlp": 0.01266012, "balance_loss_clip": 0.06275639, "balance_loss_mlp": 0.01255509, "epoch": 0.6733804298812566, "flos": 35818128144000.0, "grad_norm": 1.8555633375363214, "language_loss": 0.72003227, "learning_rate": 1.0183813779847552e-06, "loss": 0.79693103, "num_input_tokens_seen": 241744440, "router_z_loss_clip": 1.48242188, "router_z_loss_mlp": 0.1050415, "step": 11200, "time_per_iteration": 2.716526985168457 }, { "auxiliary_loss_clip": 0.0642392, "auxiliary_loss_mlp": 0.01271597, "balance_loss_clip": 0.06277706, "balance_loss_mlp": 0.0126091, "epoch": 0.6734405531339246, "flos": 61651545511680.0, "grad_norm": 1.6244903794408874, "language_loss": 0.64418554, "learning_rate": 1.0180420714797987e-06, "loss": 0.72114074, "num_input_tokens_seen": 241771705, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10693359, "step": 11201, "time_per_iteration": 2.9283134937286377 }, { "auxiliary_loss_clip": 0.06427124, "auxiliary_loss_mlp": 0.01266962, "balance_loss_clip": 0.06277465, "balance_loss_mlp": 0.01256257, "epoch": 0.6735006763865925, "flos": 20528670430080.0, "grad_norm": 1.6161798685283206, "language_loss": 0.63635767, "learning_rate": 1.0177028022107856e-06, "loss": 0.7132985, "num_input_tokens_seen": 241790830, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.10699463, "step": 11202, "time_per_iteration": 2.6282126903533936 }, { "auxiliary_loss_clip": 0.06423054, "auxiliary_loss_mlp": 0.01267209, "balance_loss_clip": 0.06277101, "balance_loss_mlp": 0.01257065, "epoch": 0.6735607996392605, "flos": 13924172885760.0, "grad_norm": 1.6942710930281195, "language_loss": 0.74814779, "learning_rate": 1.0173635701905796e-06, "loss": 0.82505035, "num_input_tokens_seen": 241808165, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10150146, "step": 11203, "time_per_iteration": 2.5763418674468994 }, { "auxiliary_loss_clip": 0.06432385, "auxiliary_loss_mlp": 0.01266487, "balance_loss_clip": 0.06281035, "balance_loss_mlp": 0.01254691, "epoch": 0.6736209228919284, "flos": 18813246117120.0, "grad_norm": 1.754165109077395, "language_loss": 0.67451525, "learning_rate": 1.0170243754320456e-06, "loss": 0.751504, "num_input_tokens_seen": 241826925, "router_z_loss_clip": 1.51367188, "router_z_loss_mlp": 0.11791992, "step": 11204, "time_per_iteration": 2.601478099822998 }, { "auxiliary_loss_clip": 0.0643072, "auxiliary_loss_mlp": 0.01267967, "balance_loss_clip": 0.06279806, "balance_loss_mlp": 0.01256583, "epoch": 0.6736810461445965, "flos": 20378890056960.0, "grad_norm": 1.6798979312712183, "language_loss": 0.74228024, "learning_rate": 1.0166852179480465e-06, "loss": 0.81926715, "num_input_tokens_seen": 241845525, "router_z_loss_clip": 1.51074219, "router_z_loss_mlp": 0.11383057, "step": 11205, "time_per_iteration": 3.9808151721954346 }, { "auxiliary_loss_clip": 0.06418169, "auxiliary_loss_mlp": 0.01265918, "balance_loss_clip": 0.06276508, "balance_loss_mlp": 0.01255827, "epoch": 0.6737411693972644, "flos": 30015264211200.0, "grad_norm": 1.5429860872445267, "language_loss": 0.7166816, "learning_rate": 1.0163460977514416e-06, "loss": 0.79352254, "num_input_tokens_seen": 241866815, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.10083008, "step": 11206, "time_per_iteration": 2.6581003665924072 }, { "auxiliary_loss_clip": 0.06429678, "auxiliary_loss_mlp": 0.01270134, "balance_loss_clip": 0.06277149, "balance_loss_mlp": 0.01258791, "epoch": 0.6738012926499324, "flos": 25454402622720.0, "grad_norm": 5.743461506305742, "language_loss": 0.67680794, "learning_rate": 1.016007014855092e-06, "loss": 0.75380605, "num_input_tokens_seen": 241887050, "router_z_loss_clip": 1.52441406, "router_z_loss_mlp": 0.11328125, "step": 11207, "time_per_iteration": 2.5827205181121826 }, { "auxiliary_loss_clip": 0.06416008, "auxiliary_loss_mlp": 0.01269018, "balance_loss_clip": 0.06275057, "balance_loss_mlp": 0.01259135, "epoch": 0.6738614159026003, "flos": 20783102952960.0, "grad_norm": 2.3024970809948937, "language_loss": 0.74355221, "learning_rate": 1.0156679692718553e-06, "loss": 0.8204025, "num_input_tokens_seen": 241904280, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09881592, "step": 11208, "time_per_iteration": 4.009423732757568 }, { "auxiliary_loss_clip": 0.06421541, "auxiliary_loss_mlp": 0.01268958, "balance_loss_clip": 0.06275005, "balance_loss_mlp": 0.01257418, "epoch": 0.6739215391552683, "flos": 19571931711360.0, "grad_norm": 1.9255650071997428, "language_loss": 0.75965601, "learning_rate": 1.0153289610145867e-06, "loss": 0.83656096, "num_input_tokens_seen": 241919190, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.11535645, "step": 11209, "time_per_iteration": 2.576659917831421 }, { "auxiliary_loss_clip": 0.06413437, "auxiliary_loss_mlp": 0.01262824, "balance_loss_clip": 0.06274034, "balance_loss_mlp": 0.01253347, "epoch": 0.6739816624079362, "flos": 24394898471040.0, "grad_norm": 1.6322394532462732, "language_loss": 0.66658354, "learning_rate": 1.0149899900961428e-06, "loss": 0.7433461, "num_input_tokens_seen": 241940525, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09466553, "step": 11210, "time_per_iteration": 2.628679037094116 }, { "auxiliary_loss_clip": 0.06414283, "auxiliary_loss_mlp": 0.01264531, "balance_loss_clip": 0.06274116, "balance_loss_mlp": 0.01254899, "epoch": 0.6740417856606042, "flos": 22534683102720.0, "grad_norm": 2.105998661935769, "language_loss": 0.80456561, "learning_rate": 1.014651056529377e-06, "loss": 0.88135374, "num_input_tokens_seen": 241959290, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09625244, "step": 11211, "time_per_iteration": 2.591308355331421 }, { "auxiliary_loss_clip": 0.06416985, "auxiliary_loss_mlp": 0.01264651, "balance_loss_clip": 0.06276344, "balance_loss_mlp": 0.01254632, "epoch": 0.6741019089132723, "flos": 25782530411520.0, "grad_norm": 1.341243786174982, "language_loss": 0.7665751, "learning_rate": 1.014312160327143e-06, "loss": 0.84339142, "num_input_tokens_seen": 241980715, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.10015869, "step": 11212, "time_per_iteration": 2.6381947994232178 }, { "auxiliary_loss_clip": 0.06421951, "auxiliary_loss_mlp": 0.0126738, "balance_loss_clip": 0.06275059, "balance_loss_mlp": 0.01256085, "epoch": 0.6741620321659402, "flos": 21112027355520.0, "grad_norm": 2.6026340927772385, "language_loss": 0.78283131, "learning_rate": 1.0139733015022905e-06, "loss": 0.85972464, "num_input_tokens_seen": 241999985, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.11303711, "step": 11213, "time_per_iteration": 2.5865254402160645 }, { "auxiliary_loss_clip": 0.06430317, "auxiliary_loss_mlp": 0.01269431, "balance_loss_clip": 0.06281918, "balance_loss_mlp": 0.01258458, "epoch": 0.6742221554186082, "flos": 20746653626880.0, "grad_norm": 2.2947645686763805, "language_loss": 0.67835855, "learning_rate": 1.0136344800676685e-06, "loss": 0.75535607, "num_input_tokens_seen": 242018990, "router_z_loss_clip": 1.48339844, "router_z_loss_mlp": 0.10974121, "step": 11214, "time_per_iteration": 2.5894622802734375 }, { "auxiliary_loss_clip": 0.0642617, "auxiliary_loss_mlp": 0.01267426, "balance_loss_clip": 0.06278369, "balance_loss_mlp": 0.01257454, "epoch": 0.6742822786712761, "flos": 37782366756480.0, "grad_norm": 1.9855869743981172, "language_loss": 0.72882342, "learning_rate": 1.0132956960361263e-06, "loss": 0.80575931, "num_input_tokens_seen": 242039340, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.09973145, "step": 11215, "time_per_iteration": 2.713580846786499 }, { "auxiliary_loss_clip": 0.06425963, "auxiliary_loss_mlp": 0.01265191, "balance_loss_clip": 0.06279127, "balance_loss_mlp": 0.01255165, "epoch": 0.6743424019239441, "flos": 37272118118400.0, "grad_norm": 1.5967665164527138, "language_loss": 0.6694833, "learning_rate": 1.0129569494205096e-06, "loss": 0.74639487, "num_input_tokens_seen": 242062215, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.10028076, "step": 11216, "time_per_iteration": 2.7125000953674316 }, { "auxiliary_loss_clip": 0.06302455, "auxiliary_loss_mlp": 0.01254877, "balance_loss_clip": 0.06243913, "balance_loss_mlp": 0.01253497, "epoch": 0.674402525176612, "flos": 66020152377600.0, "grad_norm": 0.6766350142094111, "language_loss": 0.56293869, "learning_rate": 1.0126182402336646e-06, "loss": 0.63851202, "num_input_tokens_seen": 242131130, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.01381683, "step": 11217, "time_per_iteration": 3.289529800415039 }, { "auxiliary_loss_clip": 0.06417277, "auxiliary_loss_mlp": 0.01266402, "balance_loss_clip": 0.06274451, "balance_loss_mlp": 0.01255923, "epoch": 0.67446264842928, "flos": 26467143396480.0, "grad_norm": 1.7125048107629357, "language_loss": 0.74565899, "learning_rate": 1.0122795684884363e-06, "loss": 0.82249582, "num_input_tokens_seen": 242149720, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.1048584, "step": 11218, "time_per_iteration": 2.6103556156158447 }, { "auxiliary_loss_clip": 0.06425142, "auxiliary_loss_mlp": 0.01269333, "balance_loss_clip": 0.06279464, "balance_loss_mlp": 0.01257847, "epoch": 0.674522771681948, "flos": 23739146017920.0, "grad_norm": 1.5531707566370083, "language_loss": 0.66557848, "learning_rate": 1.0119409341976639e-06, "loss": 0.74252319, "num_input_tokens_seen": 242168875, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.11480713, "step": 11219, "time_per_iteration": 2.596240758895874 }, { "auxiliary_loss_clip": 0.06425424, "auxiliary_loss_mlp": 0.01268819, "balance_loss_clip": 0.06277929, "balance_loss_mlp": 0.01257941, "epoch": 0.674582894934616, "flos": 24761320375680.0, "grad_norm": 2.3470482015776053, "language_loss": 0.75122094, "learning_rate": 1.0116023373741904e-06, "loss": 0.82816327, "num_input_tokens_seen": 242188465, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.10876465, "step": 11220, "time_per_iteration": 2.619966983795166 }, { "auxiliary_loss_clip": 0.06420797, "auxiliary_loss_mlp": 0.01268027, "balance_loss_clip": 0.06275628, "balance_loss_mlp": 0.01257178, "epoch": 0.6746430181872839, "flos": 24833506268160.0, "grad_norm": 1.8920288304527983, "language_loss": 0.70471734, "learning_rate": 1.0112637780308554e-06, "loss": 0.7816056, "num_input_tokens_seen": 242208675, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.10845947, "step": 11221, "time_per_iteration": 2.6359574794769287 }, { "auxiliary_loss_clip": 0.06422272, "auxiliary_loss_mlp": 0.01264715, "balance_loss_clip": 0.06277837, "balance_loss_mlp": 0.01254594, "epoch": 0.6747031414399519, "flos": 16879167774720.0, "grad_norm": 1.914330888387703, "language_loss": 0.58385491, "learning_rate": 1.010925256180498e-06, "loss": 0.66072476, "num_input_tokens_seen": 242227440, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.10113525, "step": 11222, "time_per_iteration": 2.734957218170166 }, { "auxiliary_loss_clip": 0.06423262, "auxiliary_loss_mlp": 0.01267288, "balance_loss_clip": 0.06277464, "balance_loss_mlp": 0.01256303, "epoch": 0.6747632646926198, "flos": 22791715102080.0, "grad_norm": 2.6368648329758204, "language_loss": 0.76550663, "learning_rate": 1.0105867718359528e-06, "loss": 0.84241211, "num_input_tokens_seen": 242245240, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10974121, "step": 11223, "time_per_iteration": 2.7358288764953613 }, { "auxiliary_loss_clip": 0.0642187, "auxiliary_loss_mlp": 0.01269584, "balance_loss_clip": 0.06276409, "balance_loss_mlp": 0.01259165, "epoch": 0.6748233879452878, "flos": 20052020079360.0, "grad_norm": 1.9489108403901991, "language_loss": 0.75609195, "learning_rate": 1.0102483250100574e-06, "loss": 0.83300644, "num_input_tokens_seen": 242263435, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10406494, "step": 11224, "time_per_iteration": 2.5953845977783203 }, { "auxiliary_loss_clip": 0.0641548, "auxiliary_loss_mlp": 0.01265466, "balance_loss_clip": 0.06273788, "balance_loss_mlp": 0.01255823, "epoch": 0.6748835111979558, "flos": 23009488663680.0, "grad_norm": 1.5069922100120015, "language_loss": 0.63108367, "learning_rate": 1.0099099157156445e-06, "loss": 0.70789313, "num_input_tokens_seen": 242282765, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09637451, "step": 11225, "time_per_iteration": 2.6082513332366943 }, { "auxiliary_loss_clip": 0.06414524, "auxiliary_loss_mlp": 0.01265621, "balance_loss_clip": 0.06276244, "balance_loss_mlp": 0.01256281, "epoch": 0.6749436344506238, "flos": 12201201705600.0, "grad_norm": 1.7461910912039842, "language_loss": 0.64126456, "learning_rate": 1.0095715439655462e-06, "loss": 0.71806604, "num_input_tokens_seen": 242298980, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09332275, "step": 11226, "time_per_iteration": 2.53938627243042 }, { "auxiliary_loss_clip": 0.06428652, "auxiliary_loss_mlp": 0.01266136, "balance_loss_clip": 0.06281884, "balance_loss_mlp": 0.0125567, "epoch": 0.6750037577032918, "flos": 11878356723840.0, "grad_norm": 2.060647739666874, "language_loss": 0.72078896, "learning_rate": 1.0092332097725945e-06, "loss": 0.79773688, "num_input_tokens_seen": 242315420, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.10473633, "step": 11227, "time_per_iteration": 2.5544140338897705 }, { "auxiliary_loss_clip": 0.06419913, "auxiliary_loss_mlp": 0.01264582, "balance_loss_clip": 0.06276128, "balance_loss_mlp": 0.01254771, "epoch": 0.6750638809559597, "flos": 17025342422400.0, "grad_norm": 2.5455868841249005, "language_loss": 0.72104436, "learning_rate": 1.0088949131496183e-06, "loss": 0.79788935, "num_input_tokens_seen": 242332805, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.09814453, "step": 11228, "time_per_iteration": 4.011121511459351 }, { "auxiliary_loss_clip": 0.0631072, "auxiliary_loss_mlp": 0.01252806, "balance_loss_clip": 0.06252187, "balance_loss_mlp": 0.01251538, "epoch": 0.6751240042086277, "flos": 70972774531200.0, "grad_norm": 0.7490561509826699, "language_loss": 0.529706, "learning_rate": 1.0085566541094482e-06, "loss": 0.6053412, "num_input_tokens_seen": 242396160, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.01267242, "step": 11229, "time_per_iteration": 3.220426559448242 }, { "auxiliary_loss_clip": 0.06417088, "auxiliary_loss_mlp": 0.01264803, "balance_loss_clip": 0.06276034, "balance_loss_mlp": 0.01255463, "epoch": 0.6751841274612956, "flos": 22681863999360.0, "grad_norm": 1.7334622539746412, "language_loss": 0.80613625, "learning_rate": 1.0082184326649072e-06, "loss": 0.88295519, "num_input_tokens_seen": 242414660, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09326172, "step": 11230, "time_per_iteration": 2.5855154991149902 }, { "auxiliary_loss_clip": 0.06414339, "auxiliary_loss_mlp": 0.01264155, "balance_loss_clip": 0.06273194, "balance_loss_mlp": 0.0125491, "epoch": 0.6752442507139637, "flos": 21295112526720.0, "grad_norm": 1.4106849734911295, "language_loss": 0.65801835, "learning_rate": 1.0078802488288228e-06, "loss": 0.73480332, "num_input_tokens_seen": 242434225, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09246826, "step": 11231, "time_per_iteration": 2.553557872772217 }, { "auxiliary_loss_clip": 0.06434739, "auxiliary_loss_mlp": 0.0126815, "balance_loss_clip": 0.06285635, "balance_loss_mlp": 0.01256694, "epoch": 0.6753043739666316, "flos": 28264480675200.0, "grad_norm": 1.8691483102441968, "language_loss": 0.66804153, "learning_rate": 1.0075421026140198e-06, "loss": 0.74507046, "num_input_tokens_seen": 242454355, "router_z_loss_clip": 1.49023438, "router_z_loss_mlp": 0.11462402, "step": 11232, "time_per_iteration": 2.614579677581787 }, { "auxiliary_loss_clip": 0.06417646, "auxiliary_loss_mlp": 0.01267825, "balance_loss_clip": 0.06275819, "balance_loss_mlp": 0.01258086, "epoch": 0.6753644972192996, "flos": 21366627586560.0, "grad_norm": 1.6145905952564599, "language_loss": 0.72636354, "learning_rate": 1.0072039940333188e-06, "loss": 0.80321831, "num_input_tokens_seen": 242474935, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09735107, "step": 11233, "time_per_iteration": 2.5378966331481934 }, { "auxiliary_loss_clip": 0.0642083, "auxiliary_loss_mlp": 0.01264323, "balance_loss_clip": 0.06275982, "balance_loss_mlp": 0.01254709, "epoch": 0.6754246204719675, "flos": 26549224070400.0, "grad_norm": 1.6378284027116328, "language_loss": 0.76890564, "learning_rate": 1.0068659230995418e-06, "loss": 0.84575713, "num_input_tokens_seen": 242495530, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.09619141, "step": 11234, "time_per_iteration": 2.6005280017852783 }, { "auxiliary_loss_clip": 0.06419106, "auxiliary_loss_mlp": 0.01265025, "balance_loss_clip": 0.06276065, "balance_loss_mlp": 0.01254147, "epoch": 0.6754847437246355, "flos": 25563750600960.0, "grad_norm": 1.6807074274052358, "language_loss": 0.7556662, "learning_rate": 1.0065278898255101e-06, "loss": 0.83250749, "num_input_tokens_seen": 242514550, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10876465, "step": 11235, "time_per_iteration": 3.9976298809051514 }, { "auxiliary_loss_clip": 0.06311747, "auxiliary_loss_mlp": 0.01253793, "balance_loss_clip": 0.06253196, "balance_loss_mlp": 0.01252405, "epoch": 0.6755448669773034, "flos": 59530216492800.0, "grad_norm": 0.7594954288053573, "language_loss": 0.5123083, "learning_rate": 1.0061898942240387e-06, "loss": 0.5879637, "num_input_tokens_seen": 242569200, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.01387787, "step": 11236, "time_per_iteration": 3.1233975887298584 }, { "auxiliary_loss_clip": 0.06420755, "auxiliary_loss_mlp": 0.01266026, "balance_loss_clip": 0.06276721, "balance_loss_mlp": 0.01254517, "epoch": 0.6756049902299714, "flos": 23301209053440.0, "grad_norm": 2.4233645366771843, "language_loss": 0.75792563, "learning_rate": 1.0058519363079464e-06, "loss": 0.83479345, "num_input_tokens_seen": 242586950, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.1151123, "step": 11237, "time_per_iteration": 2.552584409713745 }, { "auxiliary_loss_clip": 0.06421299, "auxiliary_loss_mlp": 0.0126578, "balance_loss_clip": 0.06277763, "balance_loss_mlp": 0.01255933, "epoch": 0.6756651134826394, "flos": 31583256065280.0, "grad_norm": 2.310319460598172, "language_loss": 0.77471262, "learning_rate": 1.0055140160900482e-06, "loss": 0.85158336, "num_input_tokens_seen": 242607380, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.09844971, "step": 11238, "time_per_iteration": 2.6231181621551514 }, { "auxiliary_loss_clip": 0.06428389, "auxiliary_loss_mlp": 0.01268707, "balance_loss_clip": 0.06278041, "balance_loss_mlp": 0.01257996, "epoch": 0.6757252367353074, "flos": 27279761892480.0, "grad_norm": 1.608402701982888, "language_loss": 0.66875964, "learning_rate": 1.0051761335831587e-06, "loss": 0.74573064, "num_input_tokens_seen": 242628025, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.10705566, "step": 11239, "time_per_iteration": 2.576679229736328 }, { "auxiliary_loss_clip": 0.06415576, "auxiliary_loss_mlp": 0.01263441, "balance_loss_clip": 0.06275122, "balance_loss_mlp": 0.01253982, "epoch": 0.6757853599879754, "flos": 16835548924800.0, "grad_norm": 1.7513353344517972, "language_loss": 0.83186495, "learning_rate": 1.0048382888000898e-06, "loss": 0.90865505, "num_input_tokens_seen": 242643825, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09460449, "step": 11240, "time_per_iteration": 2.551830291748047 }, { "auxiliary_loss_clip": 0.06434312, "auxiliary_loss_mlp": 0.01269051, "balance_loss_clip": 0.06283604, "balance_loss_mlp": 0.01257231, "epoch": 0.6758454832406433, "flos": 23226465611520.0, "grad_norm": 2.276104841728552, "language_loss": 0.74588823, "learning_rate": 1.0045004817536525e-06, "loss": 0.82292187, "num_input_tokens_seen": 242661820, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.11828613, "step": 11241, "time_per_iteration": 2.542872667312622 }, { "auxiliary_loss_clip": 0.06420414, "auxiliary_loss_mlp": 0.01265758, "balance_loss_clip": 0.06275874, "balance_loss_mlp": 0.01255303, "epoch": 0.6759056064933113, "flos": 16295098089600.0, "grad_norm": 2.1454064488090006, "language_loss": 0.80831975, "learning_rate": 1.0041627124566572e-06, "loss": 0.88518149, "num_input_tokens_seen": 242679890, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10449219, "step": 11242, "time_per_iteration": 2.5435216426849365 }, { "auxiliary_loss_clip": 0.06420369, "auxiliary_loss_mlp": 0.01261325, "balance_loss_clip": 0.06276135, "balance_loss_mlp": 0.01252003, "epoch": 0.6759657297459792, "flos": 25929543600000.0, "grad_norm": 2.1777684872133523, "language_loss": 0.72835815, "learning_rate": 1.0038249809219109e-06, "loss": 0.80517507, "num_input_tokens_seen": 242699495, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.09326172, "step": 11243, "time_per_iteration": 2.576230764389038 }, { "auxiliary_loss_clip": 0.06421647, "auxiliary_loss_mlp": 0.01266881, "balance_loss_clip": 0.06278942, "balance_loss_mlp": 0.01256933, "epoch": 0.6760258529986473, "flos": 23007140749440.0, "grad_norm": 1.5381647805420027, "language_loss": 0.72610885, "learning_rate": 1.003487287162221e-06, "loss": 0.80299413, "num_input_tokens_seen": 242719500, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.09936523, "step": 11244, "time_per_iteration": 4.022468328475952 }, { "auxiliary_loss_clip": 0.0642319, "auxiliary_loss_mlp": 0.01266073, "balance_loss_clip": 0.0627736, "balance_loss_mlp": 0.01255886, "epoch": 0.6760859762513152, "flos": 20965601145600.0, "grad_norm": 1.8842751346056898, "language_loss": 0.85593832, "learning_rate": 1.003149631190393e-06, "loss": 0.93283093, "num_input_tokens_seen": 242738325, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10186768, "step": 11245, "time_per_iteration": 2.5518252849578857 }, { "auxiliary_loss_clip": 0.06428003, "auxiliary_loss_mlp": 0.01265831, "balance_loss_clip": 0.06277771, "balance_loss_mlp": 0.01255078, "epoch": 0.6761460995039832, "flos": 23629672258560.0, "grad_norm": 1.7963282390505435, "language_loss": 0.73914206, "learning_rate": 1.0028120130192327e-06, "loss": 0.81608039, "num_input_tokens_seen": 242756620, "router_z_loss_clip": 1.50097656, "router_z_loss_mlp": 0.10748291, "step": 11246, "time_per_iteration": 2.593332529067993 }, { "auxiliary_loss_clip": 0.06419424, "auxiliary_loss_mlp": 0.01261995, "balance_loss_clip": 0.06274842, "balance_loss_mlp": 0.01251976, "epoch": 0.6762062227566511, "flos": 20776101137280.0, "grad_norm": 1.812340308965183, "language_loss": 0.88347119, "learning_rate": 1.002474432661539e-06, "loss": 0.96028537, "num_input_tokens_seen": 242774505, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.10009766, "step": 11247, "time_per_iteration": 2.5404884815216064 }, { "auxiliary_loss_clip": 0.06311217, "auxiliary_loss_mlp": 0.01249304, "balance_loss_clip": 0.06252566, "balance_loss_mlp": 0.01247976, "epoch": 0.6762663460093191, "flos": 52836915219840.0, "grad_norm": 0.8060722346708497, "language_loss": 0.54050195, "learning_rate": 1.002136890130115e-06, "loss": 0.61610723, "num_input_tokens_seen": 242828645, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.01329803, "step": 11248, "time_per_iteration": 4.535492658615112 }, { "auxiliary_loss_clip": 0.06416163, "auxiliary_loss_mlp": 0.01267005, "balance_loss_clip": 0.06278274, "balance_loss_mlp": 0.01257415, "epoch": 0.676326469261987, "flos": 23703115962240.0, "grad_norm": 3.8766408861894646, "language_loss": 0.73511761, "learning_rate": 1.001799385437761e-06, "loss": 0.81194931, "num_input_tokens_seen": 242850100, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09588623, "step": 11249, "time_per_iteration": 2.5913186073303223 }, { "auxiliary_loss_clip": 0.06426397, "auxiliary_loss_mlp": 0.01266044, "balance_loss_clip": 0.0627777, "balance_loss_mlp": 0.01254755, "epoch": 0.676386592514655, "flos": 14068880087040.0, "grad_norm": 1.9604587859361342, "language_loss": 0.73923039, "learning_rate": 1.0014619185972732e-06, "loss": 0.81615478, "num_input_tokens_seen": 242867775, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.11279297, "step": 11250, "time_per_iteration": 2.5341131687164307 }, { "auxiliary_loss_clip": 0.06420618, "auxiliary_loss_mlp": 0.01268773, "balance_loss_clip": 0.06275619, "balance_loss_mlp": 0.01258551, "epoch": 0.676446715767323, "flos": 20418441984000.0, "grad_norm": 2.981480236228485, "language_loss": 0.75236648, "learning_rate": 1.0011244896214497e-06, "loss": 0.82926041, "num_input_tokens_seen": 242886865, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10217285, "step": 11251, "time_per_iteration": 2.559418201446533 }, { "auxiliary_loss_clip": 0.06425263, "auxiliary_loss_mlp": 0.01267211, "balance_loss_clip": 0.0628238, "balance_loss_mlp": 0.01257174, "epoch": 0.676506839019991, "flos": 21294651329280.0, "grad_norm": 2.4522445851351455, "language_loss": 0.70239198, "learning_rate": 1.0007870985230873e-06, "loss": 0.77931678, "num_input_tokens_seen": 242906705, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10040283, "step": 11252, "time_per_iteration": 2.596235513687134 }, { "auxiliary_loss_clip": 0.06419539, "auxiliary_loss_mlp": 0.01264725, "balance_loss_clip": 0.06276952, "balance_loss_mlp": 0.01255391, "epoch": 0.676566962272659, "flos": 29939849936640.0, "grad_norm": 1.8011885076628331, "language_loss": 0.67253196, "learning_rate": 1.0004497453149765e-06, "loss": 0.74937457, "num_input_tokens_seen": 242925215, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09338379, "step": 11253, "time_per_iteration": 2.620850086212158 }, { "auxiliary_loss_clip": 0.0642731, "auxiliary_loss_mlp": 0.01267392, "balance_loss_clip": 0.06279655, "balance_loss_mlp": 0.0125652, "epoch": 0.6766270855253269, "flos": 17936994844800.0, "grad_norm": 1.6864166719823226, "language_loss": 0.77190661, "learning_rate": 1.0001124300099115e-06, "loss": 0.84885365, "num_input_tokens_seen": 242944750, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.10870361, "step": 11254, "time_per_iteration": 2.593217611312866 }, { "auxiliary_loss_clip": 0.06424108, "auxiliary_loss_mlp": 0.01266988, "balance_loss_clip": 0.06277786, "balance_loss_mlp": 0.01256516, "epoch": 0.6766872087779949, "flos": 23110283525760.0, "grad_norm": 1.9068221158358576, "language_loss": 0.72113061, "learning_rate": 9.997751526206835e-07, "loss": 0.79804158, "num_input_tokens_seen": 242963860, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.10473633, "step": 11255, "time_per_iteration": 2.5503528118133545 }, { "auxiliary_loss_clip": 0.06425613, "auxiliary_loss_mlp": 0.01268549, "balance_loss_clip": 0.06278697, "balance_loss_mlp": 0.01257767, "epoch": 0.6767473320306628, "flos": 26220257740800.0, "grad_norm": 2.302991198107141, "language_loss": 0.75785136, "learning_rate": 9.994379131600828e-07, "loss": 0.83479297, "num_input_tokens_seen": 242983050, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.10797119, "step": 11256, "time_per_iteration": 2.6040053367614746 }, { "auxiliary_loss_clip": 0.06425278, "auxiliary_loss_mlp": 0.01267327, "balance_loss_clip": 0.06281812, "balance_loss_mlp": 0.01256706, "epoch": 0.6768074552833309, "flos": 18374554465920.0, "grad_norm": 3.1431523777636365, "language_loss": 0.65957004, "learning_rate": 9.991007116408965e-07, "loss": 0.73649609, "num_input_tokens_seen": 243001125, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10620117, "step": 11257, "time_per_iteration": 2.5160374641418457 }, { "auxiliary_loss_clip": 0.06422254, "auxiliary_loss_mlp": 0.01266208, "balance_loss_clip": 0.06280676, "balance_loss_mlp": 0.01256129, "epoch": 0.6768675785359988, "flos": 23046692676480.0, "grad_norm": 1.3985643959536356, "language_loss": 0.75739247, "learning_rate": 9.987635480759109e-07, "loss": 0.83427709, "num_input_tokens_seen": 243021865, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.10076904, "step": 11258, "time_per_iteration": 2.580928087234497 }, { "auxiliary_loss_clip": 0.06411061, "auxiliary_loss_mlp": 0.01264498, "balance_loss_clip": 0.06272063, "balance_loss_mlp": 0.01255176, "epoch": 0.6769277017886668, "flos": 33044876760960.0, "grad_norm": 1.52394802285095, "language_loss": 0.66626477, "learning_rate": 9.984264224779127e-07, "loss": 0.74302042, "num_input_tokens_seen": 243042970, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09320068, "step": 11259, "time_per_iteration": 2.6420493125915527 }, { "auxiliary_loss_clip": 0.06422111, "auxiliary_loss_mlp": 0.01266639, "balance_loss_clip": 0.06277548, "balance_loss_mlp": 0.01256417, "epoch": 0.6769878250413347, "flos": 20854408377600.0, "grad_norm": 3.03349927417686, "language_loss": 0.86075479, "learning_rate": 9.980893348596839e-07, "loss": 0.93764222, "num_input_tokens_seen": 243058470, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10223389, "step": 11260, "time_per_iteration": 2.5410873889923096 }, { "auxiliary_loss_clip": 0.06430306, "auxiliary_loss_mlp": 0.01265714, "balance_loss_clip": 0.06280795, "balance_loss_mlp": 0.01254717, "epoch": 0.6770479482940027, "flos": 15601345009920.0, "grad_norm": 6.73784704589798, "language_loss": 0.78395545, "learning_rate": 9.977522852340081e-07, "loss": 0.86091566, "num_input_tokens_seen": 243076630, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.11004639, "step": 11261, "time_per_iteration": 2.5566346645355225 }, { "auxiliary_loss_clip": 0.06421275, "auxiliary_loss_mlp": 0.01266816, "balance_loss_clip": 0.06275882, "balance_loss_mlp": 0.01256051, "epoch": 0.6771080715466706, "flos": 18626345585280.0, "grad_norm": 1.6061240555016847, "language_loss": 0.88297266, "learning_rate": 9.97415273613666e-07, "loss": 0.95985359, "num_input_tokens_seen": 243092260, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.10760498, "step": 11262, "time_per_iteration": 2.5668654441833496 }, { "auxiliary_loss_clip": 0.06430148, "auxiliary_loss_mlp": 0.01263718, "balance_loss_clip": 0.06281941, "balance_loss_mlp": 0.01253615, "epoch": 0.6771681947993387, "flos": 12500427035520.0, "grad_norm": 1.9119903362101236, "language_loss": 0.74677861, "learning_rate": 9.97078300011439e-07, "loss": 0.82371724, "num_input_tokens_seen": 243109405, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.10101318, "step": 11263, "time_per_iteration": 2.5841617584228516 }, { "auxiliary_loss_clip": 0.06427495, "auxiliary_loss_mlp": 0.01265924, "balance_loss_clip": 0.06278685, "balance_loss_mlp": 0.01254188, "epoch": 0.6772283180520066, "flos": 22243549691520.0, "grad_norm": 2.1089412160885224, "language_loss": 0.68539917, "learning_rate": 9.967413644401016e-07, "loss": 0.76233333, "num_input_tokens_seen": 243128135, "router_z_loss_clip": 1.48925781, "router_z_loss_mlp": 0.11755371, "step": 11264, "time_per_iteration": 2.58347487449646 }, { "auxiliary_loss_clip": 0.06421262, "auxiliary_loss_mlp": 0.01265858, "balance_loss_clip": 0.06278253, "balance_loss_mlp": 0.01254819, "epoch": 0.6772884413046746, "flos": 16148588025600.0, "grad_norm": 1.7221310575798605, "language_loss": 0.73329389, "learning_rate": 9.964044669124324e-07, "loss": 0.81016517, "num_input_tokens_seen": 243146785, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.11035156, "step": 11265, "time_per_iteration": 2.531623363494873 }, { "auxiliary_loss_clip": 0.06418258, "auxiliary_loss_mlp": 0.0126974, "balance_loss_clip": 0.06277736, "balance_loss_mlp": 0.01259678, "epoch": 0.6773485645573426, "flos": 19141835103360.0, "grad_norm": 2.8493394680593607, "language_loss": 0.6167227, "learning_rate": 9.96067607441207e-07, "loss": 0.69360274, "num_input_tokens_seen": 243165275, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.10064697, "step": 11266, "time_per_iteration": 2.5194826126098633 }, { "auxiliary_loss_clip": 0.06423508, "auxiliary_loss_mlp": 0.01268445, "balance_loss_clip": 0.06278416, "balance_loss_mlp": 0.01257955, "epoch": 0.6774086878100105, "flos": 14142114155520.0, "grad_norm": 1.7959963551886, "language_loss": 0.7051872, "learning_rate": 9.957307860391976e-07, "loss": 0.78210676, "num_input_tokens_seen": 243182845, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10498047, "step": 11267, "time_per_iteration": 3.9431910514831543 }, { "auxiliary_loss_clip": 0.06422833, "auxiliary_loss_mlp": 0.01265324, "balance_loss_clip": 0.0627922, "balance_loss_mlp": 0.01255483, "epoch": 0.6774688110626785, "flos": 22203075369600.0, "grad_norm": 1.7027386198611285, "language_loss": 0.71227497, "learning_rate": 9.953940027191785e-07, "loss": 0.7891565, "num_input_tokens_seen": 243201475, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.09851074, "step": 11268, "time_per_iteration": 2.55281925201416 }, { "auxiliary_loss_clip": 0.0642447, "auxiliary_loss_mlp": 0.0126928, "balance_loss_clip": 0.06279196, "balance_loss_mlp": 0.01258062, "epoch": 0.6775289343153464, "flos": 23046734603520.0, "grad_norm": 1.5290313392325319, "language_loss": 0.77333885, "learning_rate": 9.950572574939194e-07, "loss": 0.85027635, "num_input_tokens_seen": 243221850, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.11212158, "step": 11269, "time_per_iteration": 2.7471635341644287 }, { "auxiliary_loss_clip": 0.06425767, "auxiliary_loss_mlp": 0.01268062, "balance_loss_clip": 0.06279264, "balance_loss_mlp": 0.01256684, "epoch": 0.6775890575680145, "flos": 18298930556160.0, "grad_norm": 1.940949474389059, "language_loss": 0.74687129, "learning_rate": 9.94720550376189e-07, "loss": 0.82380962, "num_input_tokens_seen": 243239855, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.1137085, "step": 11270, "time_per_iteration": 2.7292771339416504 }, { "auxiliary_loss_clip": 0.06421672, "auxiliary_loss_mlp": 0.01265334, "balance_loss_clip": 0.06279138, "balance_loss_mlp": 0.01254802, "epoch": 0.6776491808206824, "flos": 25343251781760.0, "grad_norm": 15.16168476167783, "language_loss": 0.7276721, "learning_rate": 9.94383881378756e-07, "loss": 0.80454218, "num_input_tokens_seen": 243260085, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.10540771, "step": 11271, "time_per_iteration": 2.637336015701294 }, { "auxiliary_loss_clip": 0.06422984, "auxiliary_loss_mlp": 0.0126548, "balance_loss_clip": 0.06278615, "balance_loss_mlp": 0.01255234, "epoch": 0.6777093040733504, "flos": 26034908509440.0, "grad_norm": 1.4966617920764185, "language_loss": 0.68140608, "learning_rate": 9.94047250514387e-07, "loss": 0.75829077, "num_input_tokens_seen": 243280065, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.10253906, "step": 11272, "time_per_iteration": 2.6297435760498047 }, { "auxiliary_loss_clip": 0.06428123, "auxiliary_loss_mlp": 0.01267931, "balance_loss_clip": 0.06279558, "balance_loss_mlp": 0.01256308, "epoch": 0.6777694273260183, "flos": 18009306518400.0, "grad_norm": 2.883587325176517, "language_loss": 0.74165285, "learning_rate": 9.937106577958481e-07, "loss": 0.81861341, "num_input_tokens_seen": 243297775, "router_z_loss_clip": 1.48339844, "router_z_loss_mlp": 0.11621094, "step": 11273, "time_per_iteration": 2.5282928943634033 }, { "auxiliary_loss_clip": 0.06422, "auxiliary_loss_mlp": 0.01268437, "balance_loss_clip": 0.06279735, "balance_loss_mlp": 0.01258078, "epoch": 0.6778295505786863, "flos": 23447886825600.0, "grad_norm": 2.4862296236982755, "language_loss": 0.70570803, "learning_rate": 9.933741032359015e-07, "loss": 0.78261244, "num_input_tokens_seen": 243315760, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10357666, "step": 11274, "time_per_iteration": 2.570199728012085 }, { "auxiliary_loss_clip": 0.06422603, "auxiliary_loss_mlp": 0.01269143, "balance_loss_clip": 0.06275532, "balance_loss_mlp": 0.01258808, "epoch": 0.6778896738313542, "flos": 19104337601280.0, "grad_norm": 1.6582037197285255, "language_loss": 0.6607765, "learning_rate": 9.930375868473093e-07, "loss": 0.73769403, "num_input_tokens_seen": 243335715, "router_z_loss_clip": 1.46972656, "router_z_loss_mlp": 0.10333252, "step": 11275, "time_per_iteration": 4.032511949539185 }, { "auxiliary_loss_clip": 0.06421372, "auxiliary_loss_mlp": 0.01269089, "balance_loss_clip": 0.06277305, "balance_loss_mlp": 0.01259493, "epoch": 0.6779497970840223, "flos": 26111077470720.0, "grad_norm": 1.492966324260643, "language_loss": 0.73494422, "learning_rate": 9.927011086428335e-07, "loss": 0.81184882, "num_input_tokens_seen": 243356935, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.09594727, "step": 11276, "time_per_iteration": 2.6182830333709717 }, { "auxiliary_loss_clip": 0.06420057, "auxiliary_loss_mlp": 0.01265206, "balance_loss_clip": 0.06277915, "balance_loss_mlp": 0.01255174, "epoch": 0.6780099203366902, "flos": 19725359736960.0, "grad_norm": 1.6498972395815132, "language_loss": 0.77099121, "learning_rate": 9.923646686352317e-07, "loss": 0.84784377, "num_input_tokens_seen": 243375625, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10040283, "step": 11277, "time_per_iteration": 2.5792009830474854 }, { "auxiliary_loss_clip": 0.06428409, "auxiliary_loss_mlp": 0.01265799, "balance_loss_clip": 0.0628015, "balance_loss_mlp": 0.01254987, "epoch": 0.6780700435893582, "flos": 18218946234240.0, "grad_norm": 2.330716636476749, "language_loss": 0.8361004, "learning_rate": 9.920282668372627e-07, "loss": 0.91304249, "num_input_tokens_seen": 243390195, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.10827637, "step": 11278, "time_per_iteration": 2.5047566890716553 }, { "auxiliary_loss_clip": 0.06419527, "auxiliary_loss_mlp": 0.01271618, "balance_loss_clip": 0.06279636, "balance_loss_mlp": 0.01262045, "epoch": 0.6781301668420262, "flos": 25383600322560.0, "grad_norm": 1.6445171242215324, "language_loss": 0.70462167, "learning_rate": 9.916919032616844e-07, "loss": 0.78153312, "num_input_tokens_seen": 243411690, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09576416, "step": 11279, "time_per_iteration": 2.5927202701568604 }, { "auxiliary_loss_clip": 0.06424163, "auxiliary_loss_mlp": 0.01266377, "balance_loss_clip": 0.0627926, "balance_loss_mlp": 0.01254731, "epoch": 0.6781902900946941, "flos": 24026589849600.0, "grad_norm": 1.8278339361170004, "language_loss": 0.74398226, "learning_rate": 9.913555779212485e-07, "loss": 0.82088768, "num_input_tokens_seen": 243430280, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.11645508, "step": 11280, "time_per_iteration": 2.5525102615356445 }, { "auxiliary_loss_clip": 0.06426154, "auxiliary_loss_mlp": 0.01265643, "balance_loss_clip": 0.06277349, "balance_loss_mlp": 0.01254496, "epoch": 0.6782504133473621, "flos": 19652964209280.0, "grad_norm": 1.8046414538954785, "language_loss": 0.70150876, "learning_rate": 9.910192908287104e-07, "loss": 0.77842677, "num_input_tokens_seen": 243448690, "router_z_loss_clip": 1.48632812, "router_z_loss_mlp": 0.11138916, "step": 11281, "time_per_iteration": 2.5407843589782715 }, { "auxiliary_loss_clip": 0.06421166, "auxiliary_loss_mlp": 0.01266129, "balance_loss_clip": 0.06280988, "balance_loss_mlp": 0.01256145, "epoch": 0.67831053660003, "flos": 24939080812800.0, "grad_norm": 1.534318543050993, "language_loss": 0.64228177, "learning_rate": 9.906830419968217e-07, "loss": 0.71915472, "num_input_tokens_seen": 243470695, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09985352, "step": 11282, "time_per_iteration": 2.6240811347961426 }, { "auxiliary_loss_clip": 0.06431544, "auxiliary_loss_mlp": 0.01267179, "balance_loss_clip": 0.06281169, "balance_loss_mlp": 0.01255621, "epoch": 0.6783706598526981, "flos": 31215785984640.0, "grad_norm": 1.5173130674172666, "language_loss": 0.74515367, "learning_rate": 9.90346831438334e-07, "loss": 0.82214093, "num_input_tokens_seen": 243493345, "router_z_loss_clip": 1.50292969, "router_z_loss_mlp": 0.11547852, "step": 11283, "time_per_iteration": 2.6475651264190674 }, { "auxiliary_loss_clip": 0.0642238, "auxiliary_loss_mlp": 0.01264521, "balance_loss_clip": 0.06277937, "balance_loss_mlp": 0.01254847, "epoch": 0.678430783105366, "flos": 35449526033280.0, "grad_norm": 1.571487934096614, "language_loss": 0.56950909, "learning_rate": 9.900106591659948e-07, "loss": 0.64637804, "num_input_tokens_seen": 243515670, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.09661865, "step": 11284, "time_per_iteration": 4.124989032745361 }, { "auxiliary_loss_clip": 0.06421373, "auxiliary_loss_mlp": 0.01263213, "balance_loss_clip": 0.0627622, "balance_loss_mlp": 0.01253045, "epoch": 0.678490906358034, "flos": 14434044180480.0, "grad_norm": 2.1832981670985463, "language_loss": 0.75968093, "learning_rate": 9.896745251925535e-07, "loss": 0.83652687, "num_input_tokens_seen": 243533625, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.10168457, "step": 11285, "time_per_iteration": 2.5246407985687256 }, { "auxiliary_loss_clip": 0.06421161, "auxiliary_loss_mlp": 0.01265547, "balance_loss_clip": 0.06280343, "balance_loss_mlp": 0.01255325, "epoch": 0.6785510296107019, "flos": 24317262063360.0, "grad_norm": 1.567698716634572, "language_loss": 0.66477883, "learning_rate": 9.893384295307557e-07, "loss": 0.74164593, "num_input_tokens_seen": 243553040, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.10223389, "step": 11286, "time_per_iteration": 2.600499391555786 }, { "auxiliary_loss_clip": 0.06425221, "auxiliary_loss_mlp": 0.01265236, "balance_loss_clip": 0.06278199, "balance_loss_mlp": 0.01254543, "epoch": 0.6786111528633699, "flos": 26984142288000.0, "grad_norm": 2.1650901026856957, "language_loss": 0.52560681, "learning_rate": 9.890023721933447e-07, "loss": 0.60251135, "num_input_tokens_seen": 243572590, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.10693359, "step": 11287, "time_per_iteration": 2.5799362659454346 }, { "auxiliary_loss_clip": 0.06424175, "auxiliary_loss_mlp": 0.01265249, "balance_loss_clip": 0.06280938, "balance_loss_mlp": 0.01255611, "epoch": 0.6786712761160378, "flos": 24324641222400.0, "grad_norm": 1.5329929822119224, "language_loss": 0.77250755, "learning_rate": 9.886663531930655e-07, "loss": 0.84940183, "num_input_tokens_seen": 243594140, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09637451, "step": 11288, "time_per_iteration": 4.018954277038574 }, { "auxiliary_loss_clip": 0.06430344, "auxiliary_loss_mlp": 0.01270059, "balance_loss_clip": 0.06284367, "balance_loss_mlp": 0.01259336, "epoch": 0.6787313993687059, "flos": 22937176990080.0, "grad_norm": 1.9946851397389351, "language_loss": 0.73011422, "learning_rate": 9.883303725426593e-07, "loss": 0.80711824, "num_input_tokens_seen": 243615170, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10717773, "step": 11289, "time_per_iteration": 2.6097207069396973 }, { "auxiliary_loss_clip": 0.06422958, "auxiliary_loss_mlp": 0.01266604, "balance_loss_clip": 0.06276001, "balance_loss_mlp": 0.01255798, "epoch": 0.6787915226213738, "flos": 26875423215360.0, "grad_norm": 1.3253112167789296, "language_loss": 0.80138576, "learning_rate": 9.879944302548682e-07, "loss": 0.87828141, "num_input_tokens_seen": 243635675, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.10809326, "step": 11290, "time_per_iteration": 2.6181254386901855 }, { "auxiliary_loss_clip": 0.06423056, "auxiliary_loss_mlp": 0.01266554, "balance_loss_clip": 0.06282824, "balance_loss_mlp": 0.01256862, "epoch": 0.6788516458740418, "flos": 20014648358400.0, "grad_norm": 1.4438641707639004, "language_loss": 0.74801505, "learning_rate": 9.87658526342428e-07, "loss": 0.82491118, "num_input_tokens_seen": 243654950, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09692383, "step": 11291, "time_per_iteration": 2.5815210342407227 }, { "auxiliary_loss_clip": 0.06424768, "auxiliary_loss_mlp": 0.0126523, "balance_loss_clip": 0.06278558, "balance_loss_mlp": 0.01254811, "epoch": 0.6789117691267098, "flos": 28734045356160.0, "grad_norm": 1.7050725587550564, "language_loss": 0.75527525, "learning_rate": 9.873226608180785e-07, "loss": 0.8321752, "num_input_tokens_seen": 243674970, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.10412598, "step": 11292, "time_per_iteration": 2.5861716270446777 }, { "auxiliary_loss_clip": 0.06422143, "auxiliary_loss_mlp": 0.0126805, "balance_loss_clip": 0.06278711, "balance_loss_mlp": 0.01258138, "epoch": 0.6789718923793777, "flos": 23410053907200.0, "grad_norm": 1.8022248106032952, "language_loss": 0.84209907, "learning_rate": 9.869868336945556e-07, "loss": 0.91900098, "num_input_tokens_seen": 243693440, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.09912109, "step": 11293, "time_per_iteration": 2.559985876083374 }, { "auxiliary_loss_clip": 0.06436982, "auxiliary_loss_mlp": 0.01267243, "balance_loss_clip": 0.06284706, "balance_loss_mlp": 0.01256246, "epoch": 0.6790320156320457, "flos": 20455100945280.0, "grad_norm": 2.1332839243152963, "language_loss": 0.79560459, "learning_rate": 9.866510449845929e-07, "loss": 0.87264681, "num_input_tokens_seen": 243710055, "router_z_loss_clip": 1.52246094, "router_z_loss_mlp": 0.10998535, "step": 11294, "time_per_iteration": 2.5571563243865967 }, { "auxiliary_loss_clip": 0.06426418, "auxiliary_loss_mlp": 0.01264269, "balance_loss_clip": 0.0628261, "balance_loss_mlp": 0.01254071, "epoch": 0.6790921388847136, "flos": 24173519184000.0, "grad_norm": 1.7504516530824659, "language_loss": 0.79421544, "learning_rate": 9.86315294700924e-07, "loss": 0.87112236, "num_input_tokens_seen": 243728635, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.10192871, "step": 11295, "time_per_iteration": 2.585599660873413 }, { "auxiliary_loss_clip": 0.06416969, "auxiliary_loss_mlp": 0.01270527, "balance_loss_clip": 0.06278211, "balance_loss_mlp": 0.01261432, "epoch": 0.6791522621373817, "flos": 21914541434880.0, "grad_norm": 3.369059450327785, "language_loss": 0.71340704, "learning_rate": 9.859795828562823e-07, "loss": 0.79028201, "num_input_tokens_seen": 243748330, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09088135, "step": 11296, "time_per_iteration": 2.529588222503662 }, { "auxiliary_loss_clip": 0.06422704, "auxiliary_loss_mlp": 0.01266029, "balance_loss_clip": 0.06278107, "balance_loss_mlp": 0.01255896, "epoch": 0.6792123853900496, "flos": 24833380487040.0, "grad_norm": 2.120081215889868, "language_loss": 0.7074759, "learning_rate": 9.856439094633949e-07, "loss": 0.78436327, "num_input_tokens_seen": 243769380, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.10137939, "step": 11297, "time_per_iteration": 2.6244120597839355 }, { "auxiliary_loss_clip": 0.06432364, "auxiliary_loss_mlp": 0.01266996, "balance_loss_clip": 0.06282742, "balance_loss_mlp": 0.01255718, "epoch": 0.6792725086427176, "flos": 17571998459520.0, "grad_norm": 2.111434979709011, "language_loss": 0.66354382, "learning_rate": 9.853082745349918e-07, "loss": 0.74053746, "num_input_tokens_seen": 243785510, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.112854, "step": 11298, "time_per_iteration": 2.509795665740967 }, { "auxiliary_loss_clip": 0.06425035, "auxiliary_loss_mlp": 0.01262565, "balance_loss_clip": 0.06279218, "balance_loss_mlp": 0.01253129, "epoch": 0.6793326318953855, "flos": 26948908846080.0, "grad_norm": 1.7643583891823502, "language_loss": 0.72369301, "learning_rate": 9.84972678083801e-07, "loss": 0.80056894, "num_input_tokens_seen": 243805545, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.09436035, "step": 11299, "time_per_iteration": 2.619398355484009 }, { "auxiliary_loss_clip": 0.06425604, "auxiliary_loss_mlp": 0.01267407, "balance_loss_clip": 0.06280194, "balance_loss_mlp": 0.01256928, "epoch": 0.6793927551480535, "flos": 24325479763200.0, "grad_norm": 1.2342816404517054, "language_loss": 0.77595317, "learning_rate": 9.846371201225488e-07, "loss": 0.85288328, "num_input_tokens_seen": 243825185, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.10479736, "step": 11300, "time_per_iteration": 2.568458080291748 }, { "auxiliary_loss_clip": 0.06425186, "auxiliary_loss_mlp": 0.01265271, "balance_loss_clip": 0.0628076, "balance_loss_mlp": 0.01254876, "epoch": 0.6794528784007214, "flos": 11441300227200.0, "grad_norm": 1.8557706332759267, "language_loss": 0.63485551, "learning_rate": 9.843016006639577e-07, "loss": 0.7117601, "num_input_tokens_seen": 243841600, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10394287, "step": 11301, "time_per_iteration": 2.56070876121521 }, { "auxiliary_loss_clip": 0.06422919, "auxiliary_loss_mlp": 0.01264787, "balance_loss_clip": 0.06278545, "balance_loss_mlp": 0.01254785, "epoch": 0.6795130016533895, "flos": 25236922550400.0, "grad_norm": 1.607020840003371, "language_loss": 0.82984775, "learning_rate": 9.839661197207525e-07, "loss": 0.90672481, "num_input_tokens_seen": 243862250, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10003662, "step": 11302, "time_per_iteration": 2.5667998790740967 }, { "auxiliary_loss_clip": 0.0642969, "auxiliary_loss_mlp": 0.01265056, "balance_loss_clip": 0.06282836, "balance_loss_mlp": 0.01254643, "epoch": 0.6795731249060574, "flos": 18302326646400.0, "grad_norm": 2.070145667987942, "language_loss": 0.69891787, "learning_rate": 9.83630677305654e-07, "loss": 0.77586532, "num_input_tokens_seen": 243880560, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.10406494, "step": 11303, "time_per_iteration": 2.5553090572357178 }, { "auxiliary_loss_clip": 0.06432556, "auxiliary_loss_mlp": 0.0126634, "balance_loss_clip": 0.06284136, "balance_loss_mlp": 0.01255385, "epoch": 0.6796332481587254, "flos": 20306159112960.0, "grad_norm": 1.7333914658327083, "language_loss": 0.70310509, "learning_rate": 9.832952734313813e-07, "loss": 0.78009403, "num_input_tokens_seen": 243900635, "router_z_loss_clip": 1.48339844, "router_z_loss_mlp": 0.10961914, "step": 11304, "time_per_iteration": 2.5511388778686523 }, { "auxiliary_loss_clip": 0.06434602, "auxiliary_loss_mlp": 0.012685, "balance_loss_clip": 0.06287868, "balance_loss_mlp": 0.01257133, "epoch": 0.6796933714113934, "flos": 23593642202880.0, "grad_norm": 1.9281700200240166, "language_loss": 0.72222692, "learning_rate": 9.829599081106536e-07, "loss": 0.79925799, "num_input_tokens_seen": 243920160, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.11376953, "step": 11305, "time_per_iteration": 2.6098368167877197 }, { "auxiliary_loss_clip": 0.06423797, "auxiliary_loss_mlp": 0.01265576, "balance_loss_clip": 0.06278497, "balance_loss_mlp": 0.01254329, "epoch": 0.6797534946640613, "flos": 27126291939840.0, "grad_norm": 2.308071417495805, "language_loss": 0.66209179, "learning_rate": 9.826245813561882e-07, "loss": 0.73898554, "num_input_tokens_seen": 243939015, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.11236572, "step": 11306, "time_per_iteration": 4.040339469909668 }, { "auxiliary_loss_clip": 0.06422307, "auxiliary_loss_mlp": 0.01265096, "balance_loss_clip": 0.06279374, "balance_loss_mlp": 0.01255214, "epoch": 0.6798136179167293, "flos": 22133992078080.0, "grad_norm": 1.6217759868244288, "language_loss": 0.80180883, "learning_rate": 9.822892931807021e-07, "loss": 0.87868285, "num_input_tokens_seen": 243958470, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.09875488, "step": 11307, "time_per_iteration": 2.5762805938720703 }, { "auxiliary_loss_clip": 0.0642194, "auxiliary_loss_mlp": 0.01264967, "balance_loss_clip": 0.06279807, "balance_loss_mlp": 0.01254375, "epoch": 0.6798737411693972, "flos": 17493565438080.0, "grad_norm": 1.9337682522534152, "language_loss": 0.89263475, "learning_rate": 9.819540435969066e-07, "loss": 0.96950382, "num_input_tokens_seen": 243975450, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10601807, "step": 11308, "time_per_iteration": 2.5154776573181152 }, { "auxiliary_loss_clip": 0.06425273, "auxiliary_loss_mlp": 0.01266376, "balance_loss_clip": 0.06279454, "balance_loss_mlp": 0.01255689, "epoch": 0.6799338644220653, "flos": 22898715166080.0, "grad_norm": 2.0071507386041487, "language_loss": 0.71260715, "learning_rate": 9.816188326175154e-07, "loss": 0.7895236, "num_input_tokens_seen": 243994355, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10681152, "step": 11309, "time_per_iteration": 2.547968864440918 }, { "auxiliary_loss_clip": 0.0642309, "auxiliary_loss_mlp": 0.01268959, "balance_loss_clip": 0.06278334, "balance_loss_mlp": 0.01258349, "epoch": 0.6799939876747332, "flos": 23186284778880.0, "grad_norm": 1.867190423380403, "language_loss": 0.84641266, "learning_rate": 9.812836602552411e-07, "loss": 0.92333317, "num_input_tokens_seen": 244011620, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.10614014, "step": 11310, "time_per_iteration": 2.544090509414673 }, { "auxiliary_loss_clip": 0.06418408, "auxiliary_loss_mlp": 0.01265468, "balance_loss_clip": 0.06279045, "balance_loss_mlp": 0.01256324, "epoch": 0.6800541109274012, "flos": 19505951020800.0, "grad_norm": 2.488624045233959, "language_loss": 0.83355117, "learning_rate": 9.80948526522792e-07, "loss": 0.91038996, "num_input_tokens_seen": 244029925, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09143066, "step": 11311, "time_per_iteration": 2.531965732574463 }, { "auxiliary_loss_clip": 0.06426041, "auxiliary_loss_mlp": 0.01267509, "balance_loss_clip": 0.06275704, "balance_loss_mlp": 0.01255785, "epoch": 0.6801142341800691, "flos": 22284946408320.0, "grad_norm": 1.5882480412920201, "language_loss": 0.76122332, "learning_rate": 9.806134314328767e-07, "loss": 0.83815885, "num_input_tokens_seen": 244051225, "router_z_loss_clip": 1.50292969, "router_z_loss_mlp": 0.11724854, "step": 11312, "time_per_iteration": 2.5749502182006836 }, { "auxiliary_loss_clip": 0.06315351, "auxiliary_loss_mlp": 0.01251467, "balance_loss_clip": 0.06256602, "balance_loss_mlp": 0.01250085, "epoch": 0.6801743574327371, "flos": 68734439614080.0, "grad_norm": 0.6471097725330718, "language_loss": 0.57277834, "learning_rate": 9.802783749982038e-07, "loss": 0.64844656, "num_input_tokens_seen": 244115930, "router_z_loss_clip": 0.58642578, "router_z_loss_mlp": 0.01383972, "step": 11313, "time_per_iteration": 3.276381731033325 }, { "auxiliary_loss_clip": 0.06423579, "auxiliary_loss_mlp": 0.01264926, "balance_loss_clip": 0.06277587, "balance_loss_mlp": 0.01254578, "epoch": 0.680234480685405, "flos": 29468146976640.0, "grad_norm": 1.709028301143708, "language_loss": 0.6866641, "learning_rate": 9.799433572314754e-07, "loss": 0.76354909, "num_input_tokens_seen": 244137320, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10345459, "step": 11314, "time_per_iteration": 2.596625804901123 }, { "auxiliary_loss_clip": 0.06419465, "auxiliary_loss_mlp": 0.01266971, "balance_loss_clip": 0.06276956, "balance_loss_mlp": 0.01257625, "epoch": 0.6802946039380731, "flos": 15921045463680.0, "grad_norm": 1.8117722901445183, "language_loss": 0.81649554, "learning_rate": 9.796083781453972e-07, "loss": 0.8933599, "num_input_tokens_seen": 244152755, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09338379, "step": 11315, "time_per_iteration": 3.9139108657836914 }, { "auxiliary_loss_clip": 0.06422438, "auxiliary_loss_mlp": 0.01266324, "balance_loss_clip": 0.06278544, "balance_loss_mlp": 0.01256096, "epoch": 0.680354727190741, "flos": 22025314932480.0, "grad_norm": 1.646414438943566, "language_loss": 0.70459068, "learning_rate": 9.792734377526718e-07, "loss": 0.78147829, "num_input_tokens_seen": 244171480, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.10223389, "step": 11316, "time_per_iteration": 2.594132900238037 }, { "auxiliary_loss_clip": 0.06423151, "auxiliary_loss_mlp": 0.01273679, "balance_loss_clip": 0.06280485, "balance_loss_mlp": 0.01263421, "epoch": 0.680414850443409, "flos": 18447285409920.0, "grad_norm": 1.9712821806978875, "language_loss": 0.66818047, "learning_rate": 9.789385360660003e-07, "loss": 0.74514878, "num_input_tokens_seen": 244187920, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.10253906, "step": 11317, "time_per_iteration": 2.8961002826690674 }, { "auxiliary_loss_clip": 0.06430305, "auxiliary_loss_mlp": 0.01266889, "balance_loss_clip": 0.06282927, "balance_loss_mlp": 0.01256369, "epoch": 0.680474973696077, "flos": 26365677701760.0, "grad_norm": 1.5445100872087898, "language_loss": 0.75577855, "learning_rate": 9.78603673098082e-07, "loss": 0.83275056, "num_input_tokens_seen": 244209565, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.10522461, "step": 11318, "time_per_iteration": 2.713665246963501 }, { "auxiliary_loss_clip": 0.06418057, "auxiliary_loss_mlp": 0.01262617, "balance_loss_clip": 0.06276633, "balance_loss_mlp": 0.01253456, "epoch": 0.6805350969487449, "flos": 18339069461760.0, "grad_norm": 1.61058160271029, "language_loss": 0.6804167, "learning_rate": 9.782688488616143e-07, "loss": 0.75722343, "num_input_tokens_seen": 244228015, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09161377, "step": 11319, "time_per_iteration": 2.6584436893463135 }, { "auxiliary_loss_clip": 0.0641779, "auxiliary_loss_mlp": 0.012694, "balance_loss_clip": 0.06276311, "balance_loss_mlp": 0.01259017, "epoch": 0.6805952202014129, "flos": 19943552568960.0, "grad_norm": 1.7174245662532468, "language_loss": 0.76867831, "learning_rate": 9.779340633692945e-07, "loss": 0.84555024, "num_input_tokens_seen": 244245615, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.1038208, "step": 11320, "time_per_iteration": 2.5423643589019775 }, { "auxiliary_loss_clip": 0.06422642, "auxiliary_loss_mlp": 0.01266662, "balance_loss_clip": 0.06278693, "balance_loss_mlp": 0.0125641, "epoch": 0.6806553434540809, "flos": 25230633494400.0, "grad_norm": 1.7226506636289958, "language_loss": 0.74701858, "learning_rate": 9.77599316633817e-07, "loss": 0.82391167, "num_input_tokens_seen": 244263625, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.10253906, "step": 11321, "time_per_iteration": 2.6033711433410645 }, { "auxiliary_loss_clip": 0.06427537, "auxiliary_loss_mlp": 0.01266927, "balance_loss_clip": 0.06280932, "balance_loss_mlp": 0.01256538, "epoch": 0.6807154667067489, "flos": 17791407175680.0, "grad_norm": 1.7397333815350755, "language_loss": 0.72905052, "learning_rate": 9.772646086678758e-07, "loss": 0.80599523, "num_input_tokens_seen": 244282745, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.10388184, "step": 11322, "time_per_iteration": 2.5605580806732178 }, { "auxiliary_loss_clip": 0.06427731, "auxiliary_loss_mlp": 0.01264163, "balance_loss_clip": 0.06280698, "balance_loss_mlp": 0.01252862, "epoch": 0.6807755899594168, "flos": 22206387605760.0, "grad_norm": 3.8460234548953203, "language_loss": 0.78750736, "learning_rate": 9.769299394841638e-07, "loss": 0.86442631, "num_input_tokens_seen": 244303770, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.11303711, "step": 11323, "time_per_iteration": 4.063258647918701 }, { "auxiliary_loss_clip": 0.06321362, "auxiliary_loss_mlp": 0.01253047, "balance_loss_clip": 0.06262814, "balance_loss_mlp": 0.01251681, "epoch": 0.6808357132120848, "flos": 68648878995840.0, "grad_norm": 0.7663817643559198, "language_loss": 0.57032412, "learning_rate": 9.765953090953714e-07, "loss": 0.64606822, "num_input_tokens_seen": 244355910, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.0136795, "step": 11324, "time_per_iteration": 2.9422857761383057 }, { "auxiliary_loss_clip": 0.06425042, "auxiliary_loss_mlp": 0.01268487, "balance_loss_clip": 0.0627977, "balance_loss_mlp": 0.01257318, "epoch": 0.6808958364647527, "flos": 23850380712960.0, "grad_norm": 1.9182341866873631, "language_loss": 0.68480623, "learning_rate": 9.76260717514186e-07, "loss": 0.76174158, "num_input_tokens_seen": 244376610, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.11175537, "step": 11325, "time_per_iteration": 2.5902621746063232 }, { "auxiliary_loss_clip": 0.06427047, "auxiliary_loss_mlp": 0.01270017, "balance_loss_clip": 0.06278397, "balance_loss_mlp": 0.01259139, "epoch": 0.6809559597174207, "flos": 17717376493440.0, "grad_norm": 2.1802082733726804, "language_loss": 0.7020123, "learning_rate": 9.759261647532974e-07, "loss": 0.77898294, "num_input_tokens_seen": 244393000, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.10876465, "step": 11326, "time_per_iteration": 2.510563373565674 }, { "auxiliary_loss_clip": 0.06421675, "auxiliary_loss_mlp": 0.01263753, "balance_loss_clip": 0.06278054, "balance_loss_mlp": 0.01253471, "epoch": 0.6810160829700886, "flos": 22498443411840.0, "grad_norm": 1.651836812120826, "language_loss": 0.73029834, "learning_rate": 9.75591650825392e-07, "loss": 0.80715263, "num_input_tokens_seen": 244409515, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10290527, "step": 11327, "time_per_iteration": 3.9540607929229736 }, { "auxiliary_loss_clip": 0.06422982, "auxiliary_loss_mlp": 0.01265261, "balance_loss_clip": 0.06280629, "balance_loss_mlp": 0.01254902, "epoch": 0.6810762062227567, "flos": 16837854912000.0, "grad_norm": 2.1795212740921186, "language_loss": 0.77428836, "learning_rate": 9.752571757431526e-07, "loss": 0.85117078, "num_input_tokens_seen": 244427165, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.10357666, "step": 11328, "time_per_iteration": 2.495478391647339 }, { "auxiliary_loss_clip": 0.0642812, "auxiliary_loss_mlp": 0.01266682, "balance_loss_clip": 0.0628235, "balance_loss_mlp": 0.01256239, "epoch": 0.6811363294754246, "flos": 12719751897600.0, "grad_norm": 1.7705510541662972, "language_loss": 0.64609647, "learning_rate": 9.74922739519265e-07, "loss": 0.7230444, "num_input_tokens_seen": 244445705, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10449219, "step": 11329, "time_per_iteration": 2.560887575149536 }, { "auxiliary_loss_clip": 0.06425549, "auxiliary_loss_mlp": 0.01265806, "balance_loss_clip": 0.06280051, "balance_loss_mlp": 0.01255232, "epoch": 0.6811964527280926, "flos": 17717669982720.0, "grad_norm": 1.863423601678303, "language_loss": 0.79974824, "learning_rate": 9.745883421664096e-07, "loss": 0.87666178, "num_input_tokens_seen": 244460415, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10583496, "step": 11330, "time_per_iteration": 2.5160462856292725 }, { "auxiliary_loss_clip": 0.06422428, "auxiliary_loss_mlp": 0.01266871, "balance_loss_clip": 0.06278904, "balance_loss_mlp": 0.01256309, "epoch": 0.6812565759807605, "flos": 24870416791680.0, "grad_norm": 1.8376402797108904, "language_loss": 0.64064968, "learning_rate": 9.742539836972665e-07, "loss": 0.71754265, "num_input_tokens_seen": 244480555, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10565186, "step": 11331, "time_per_iteration": 2.5822248458862305 }, { "auxiliary_loss_clip": 0.06424029, "auxiliary_loss_mlp": 0.0126666, "balance_loss_clip": 0.06280749, "balance_loss_mlp": 0.01256181, "epoch": 0.6813166992334285, "flos": 17171852486400.0, "grad_norm": 1.5204006762677287, "language_loss": 0.72618806, "learning_rate": 9.739196641245148e-07, "loss": 0.80309498, "num_input_tokens_seen": 244498540, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.10479736, "step": 11332, "time_per_iteration": 2.5130228996276855 }, { "auxiliary_loss_clip": 0.06430509, "auxiliary_loss_mlp": 0.012718, "balance_loss_clip": 0.06283965, "balance_loss_mlp": 0.01261286, "epoch": 0.6813768224860965, "flos": 18849527735040.0, "grad_norm": 1.9893153962467043, "language_loss": 0.74808288, "learning_rate": 9.735853834608326e-07, "loss": 0.82510591, "num_input_tokens_seen": 244517015, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.10522461, "step": 11333, "time_per_iteration": 2.5330264568328857 }, { "auxiliary_loss_clip": 0.06426853, "auxiliary_loss_mlp": 0.01270678, "balance_loss_clip": 0.06279363, "balance_loss_mlp": 0.01260236, "epoch": 0.6814369457387645, "flos": 24539228328960.0, "grad_norm": 1.4837373757071253, "language_loss": 0.72359645, "learning_rate": 9.732511417188963e-07, "loss": 0.8005718, "num_input_tokens_seen": 244537450, "router_z_loss_clip": 1.47363281, "router_z_loss_mlp": 0.10437012, "step": 11334, "time_per_iteration": 2.5679867267608643 }, { "auxiliary_loss_clip": 0.06419481, "auxiliary_loss_mlp": 0.01270087, "balance_loss_clip": 0.06277999, "balance_loss_mlp": 0.01259757, "epoch": 0.6814970689914325, "flos": 18228799088640.0, "grad_norm": 1.7001162692058487, "language_loss": 0.8625375, "learning_rate": 9.729169389113791e-07, "loss": 0.9394331, "num_input_tokens_seen": 244555640, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.10327148, "step": 11335, "time_per_iteration": 2.553528070449829 }, { "auxiliary_loss_clip": 0.06420201, "auxiliary_loss_mlp": 0.01266974, "balance_loss_clip": 0.0628192, "balance_loss_mlp": 0.01257157, "epoch": 0.6815571922441004, "flos": 25235874374400.0, "grad_norm": 1.5944705640482904, "language_loss": 0.8201223, "learning_rate": 9.725827750509542e-07, "loss": 0.89699399, "num_input_tokens_seen": 244574005, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.0982666, "step": 11336, "time_per_iteration": 2.577033042907715 }, { "auxiliary_loss_clip": 0.06419699, "auxiliary_loss_mlp": 0.01273354, "balance_loss_clip": 0.06280358, "balance_loss_mlp": 0.01262994, "epoch": 0.6816173154967684, "flos": 19460864724480.0, "grad_norm": 1.8111947699011095, "language_loss": 0.82007754, "learning_rate": 9.72248650150294e-07, "loss": 0.89700812, "num_input_tokens_seen": 244591395, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.10369873, "step": 11337, "time_per_iteration": 2.547482967376709 }, { "auxiliary_loss_clip": 0.06419939, "auxiliary_loss_mlp": 0.01270153, "balance_loss_clip": 0.06279571, "balance_loss_mlp": 0.01260283, "epoch": 0.6816774387494363, "flos": 17937288334080.0, "grad_norm": 2.0871582570866294, "language_loss": 0.72710353, "learning_rate": 9.719145642220673e-07, "loss": 0.80400449, "num_input_tokens_seen": 244610400, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09863281, "step": 11338, "time_per_iteration": 2.525367259979248 }, { "auxiliary_loss_clip": 0.06428872, "auxiliary_loss_mlp": 0.01266582, "balance_loss_clip": 0.06284368, "balance_loss_mlp": 0.01256169, "epoch": 0.6817375620021043, "flos": 22238937717120.0, "grad_norm": 1.457874700161821, "language_loss": 0.7780326, "learning_rate": 9.715805172789435e-07, "loss": 0.85498714, "num_input_tokens_seen": 244630400, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10406494, "step": 11339, "time_per_iteration": 2.5469255447387695 }, { "auxiliary_loss_clip": 0.06422403, "auxiliary_loss_mlp": 0.01266073, "balance_loss_clip": 0.06276911, "balance_loss_mlp": 0.01255017, "epoch": 0.6817976852547722, "flos": 25381462043520.0, "grad_norm": 1.698074218430524, "language_loss": 0.70777118, "learning_rate": 9.712465093335901e-07, "loss": 0.78465593, "num_input_tokens_seen": 244649155, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.11047363, "step": 11340, "time_per_iteration": 2.5522353649139404 }, { "auxiliary_loss_clip": 0.06426942, "auxiliary_loss_mlp": 0.01268221, "balance_loss_clip": 0.06280233, "balance_loss_mlp": 0.01257838, "epoch": 0.6818578085074403, "flos": 22271068558080.0, "grad_norm": 2.124720582287081, "language_loss": 0.83911777, "learning_rate": 9.709125403986722e-07, "loss": 0.91606945, "num_input_tokens_seen": 244665470, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.1038208, "step": 11341, "time_per_iteration": 2.565140962600708 }, { "auxiliary_loss_clip": 0.06426911, "auxiliary_loss_mlp": 0.01266162, "balance_loss_clip": 0.06280896, "balance_loss_mlp": 0.01255463, "epoch": 0.6819179317601082, "flos": 19324249441920.0, "grad_norm": 1.5990163505757746, "language_loss": 0.68767953, "learning_rate": 9.705786104868531e-07, "loss": 0.76461023, "num_input_tokens_seen": 244684390, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10699463, "step": 11342, "time_per_iteration": 2.535991668701172 }, { "auxiliary_loss_clip": 0.06424315, "auxiliary_loss_mlp": 0.01260723, "balance_loss_clip": 0.06281158, "balance_loss_mlp": 0.01251073, "epoch": 0.6819780550127762, "flos": 21110224492800.0, "grad_norm": 1.554585671055618, "language_loss": 0.75377822, "learning_rate": 9.702447196107963e-07, "loss": 0.83062863, "num_input_tokens_seen": 244703370, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09655762, "step": 11343, "time_per_iteration": 2.5843288898468018 }, { "auxiliary_loss_clip": 0.0642862, "auxiliary_loss_mlp": 0.01266504, "balance_loss_clip": 0.06283342, "balance_loss_mlp": 0.0125568, "epoch": 0.6820381782654441, "flos": 29724214654080.0, "grad_norm": 1.5213766940833433, "language_loss": 0.79970658, "learning_rate": 9.699108677831639e-07, "loss": 0.87665772, "num_input_tokens_seen": 244723325, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.10821533, "step": 11344, "time_per_iteration": 2.612380027770996 }, { "auxiliary_loss_clip": 0.06430008, "auxiliary_loss_mlp": 0.01266561, "balance_loss_clip": 0.06283634, "balance_loss_mlp": 0.01255648, "epoch": 0.6820983015181121, "flos": 29249870290560.0, "grad_norm": 2.6409003624734786, "language_loss": 0.66287994, "learning_rate": 9.695770550166136e-07, "loss": 0.73984563, "num_input_tokens_seen": 244745650, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.10919189, "step": 11345, "time_per_iteration": 2.6783995628356934 }, { "auxiliary_loss_clip": 0.06428918, "auxiliary_loss_mlp": 0.0126473, "balance_loss_clip": 0.06281656, "balance_loss_mlp": 0.01254669, "epoch": 0.6821584247707801, "flos": 18876375768960.0, "grad_norm": 3.0532498381907844, "language_loss": 0.65609801, "learning_rate": 9.692432813238054e-07, "loss": 0.73303449, "num_input_tokens_seen": 244760270, "router_z_loss_clip": 1.47363281, "router_z_loss_mlp": 0.10058594, "step": 11346, "time_per_iteration": 3.8990931510925293 }, { "auxiliary_loss_clip": 0.06433468, "auxiliary_loss_mlp": 0.01264182, "balance_loss_clip": 0.06285331, "balance_loss_mlp": 0.01253936, "epoch": 0.6822185480234481, "flos": 21330974874240.0, "grad_norm": 1.4663169622151522, "language_loss": 0.78884971, "learning_rate": 9.689095467173952e-07, "loss": 0.86582613, "num_input_tokens_seen": 244779565, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.10253906, "step": 11347, "time_per_iteration": 2.5371780395507812 }, { "auxiliary_loss_clip": 0.06325684, "auxiliary_loss_mlp": 0.01256629, "balance_loss_clip": 0.06266934, "balance_loss_mlp": 0.01255294, "epoch": 0.6822786712761161, "flos": 63505540949760.0, "grad_norm": 0.7055976138055056, "language_loss": 0.52428305, "learning_rate": 9.685758512100378e-07, "loss": 0.60010612, "num_input_tokens_seen": 244838480, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.0133667, "step": 11348, "time_per_iteration": 3.1690142154693604 }, { "auxiliary_loss_clip": 0.0642431, "auxiliary_loss_mlp": 0.01268449, "balance_loss_clip": 0.06282693, "balance_loss_mlp": 0.01258906, "epoch": 0.682338794528784, "flos": 21075242613120.0, "grad_norm": 2.002166722611818, "language_loss": 0.80028653, "learning_rate": 9.682421948143873e-07, "loss": 0.87721407, "num_input_tokens_seen": 244855265, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09539795, "step": 11349, "time_per_iteration": 2.529775619506836 }, { "auxiliary_loss_clip": 0.06439628, "auxiliary_loss_mlp": 0.01267672, "balance_loss_clip": 0.06286051, "balance_loss_mlp": 0.0125528, "epoch": 0.682398917781452, "flos": 36292053237120.0, "grad_norm": 2.4091867424096436, "language_loss": 0.73568028, "learning_rate": 9.67908577543096e-07, "loss": 0.81275332, "num_input_tokens_seen": 244875555, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.12384033, "step": 11350, "time_per_iteration": 2.6842355728149414 }, { "auxiliary_loss_clip": 0.06426398, "auxiliary_loss_mlp": 0.0126601, "balance_loss_clip": 0.06282823, "balance_loss_mlp": 0.01256157, "epoch": 0.6824590410341199, "flos": 24865427473920.0, "grad_norm": 1.423123796649477, "language_loss": 0.79399669, "learning_rate": 9.675749994088161e-07, "loss": 0.87092078, "num_input_tokens_seen": 244895270, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.09851074, "step": 11351, "time_per_iteration": 2.5782434940338135 }, { "auxiliary_loss_clip": 0.0642493, "auxiliary_loss_mlp": 0.01264899, "balance_loss_clip": 0.06282441, "balance_loss_mlp": 0.01255321, "epoch": 0.6825191642867879, "flos": 22458430287360.0, "grad_norm": 1.6697035977272188, "language_loss": 0.73357797, "learning_rate": 9.672414604241954e-07, "loss": 0.81047624, "num_input_tokens_seen": 244914535, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09576416, "step": 11352, "time_per_iteration": 2.5586485862731934 }, { "auxiliary_loss_clip": 0.06431428, "auxiliary_loss_mlp": 0.01266222, "balance_loss_clip": 0.062841, "balance_loss_mlp": 0.01254957, "epoch": 0.6825792875394558, "flos": 29432116920960.0, "grad_norm": 1.441380064378111, "language_loss": 0.80667877, "learning_rate": 9.669079606018814e-07, "loss": 0.88365525, "num_input_tokens_seen": 244936095, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.11260986, "step": 11353, "time_per_iteration": 2.6139519214630127 }, { "auxiliary_loss_clip": 0.0643298, "auxiliary_loss_mlp": 0.01265981, "balance_loss_clip": 0.06288809, "balance_loss_mlp": 0.01255383, "epoch": 0.6826394107921239, "flos": 18777006426240.0, "grad_norm": 1.7078297277203114, "language_loss": 0.78780639, "learning_rate": 9.665744999545218e-07, "loss": 0.86479604, "num_input_tokens_seen": 244955290, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10595703, "step": 11354, "time_per_iteration": 4.015610218048096 }, { "auxiliary_loss_clip": 0.06425845, "auxiliary_loss_mlp": 0.01265277, "balance_loss_clip": 0.0628245, "balance_loss_mlp": 0.01255288, "epoch": 0.6826995340447918, "flos": 16623142024320.0, "grad_norm": 1.9914266202248472, "language_loss": 0.6234411, "learning_rate": 9.662410784947599e-07, "loss": 0.70035225, "num_input_tokens_seen": 244972935, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.09985352, "step": 11355, "time_per_iteration": 2.5746214389801025 }, { "auxiliary_loss_clip": 0.06427134, "auxiliary_loss_mlp": 0.01265825, "balance_loss_clip": 0.06282572, "balance_loss_mlp": 0.01255877, "epoch": 0.6827596572974598, "flos": 20854282596480.0, "grad_norm": 2.0766922477713865, "language_loss": 0.82391393, "learning_rate": 9.659076962352398e-07, "loss": 0.9008435, "num_input_tokens_seen": 244989440, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.0994873, "step": 11356, "time_per_iteration": 2.5411324501037598 }, { "auxiliary_loss_clip": 0.06434665, "auxiliary_loss_mlp": 0.01264516, "balance_loss_clip": 0.06288005, "balance_loss_mlp": 0.01253853, "epoch": 0.6828197805501277, "flos": 22754804578560.0, "grad_norm": 1.7129624122799283, "language_loss": 0.78447068, "learning_rate": 9.655743531886052e-07, "loss": 0.86146247, "num_input_tokens_seen": 245007830, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.10662842, "step": 11357, "time_per_iteration": 2.5711965560913086 }, { "auxiliary_loss_clip": 0.06322292, "auxiliary_loss_mlp": 0.0126314, "balance_loss_clip": 0.06263493, "balance_loss_mlp": 0.01261751, "epoch": 0.6828799038027957, "flos": 71668833598080.0, "grad_norm": 0.8452840377810406, "language_loss": 0.59612769, "learning_rate": 9.65241049367493e-07, "loss": 0.67198205, "num_input_tokens_seen": 245070720, "router_z_loss_clip": 0.58642578, "router_z_loss_mlp": 0.01389313, "step": 11358, "time_per_iteration": 3.200552225112915 }, { "auxiliary_loss_clip": 0.0643767, "auxiliary_loss_mlp": 0.01269987, "balance_loss_clip": 0.06287979, "balance_loss_mlp": 0.01258043, "epoch": 0.6829400270554637, "flos": 19835378547840.0, "grad_norm": 1.668215553751214, "language_loss": 0.78801107, "learning_rate": 9.64907784784544e-07, "loss": 0.86508763, "num_input_tokens_seen": 245089070, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.11938477, "step": 11359, "time_per_iteration": 2.5592730045318604 }, { "auxiliary_loss_clip": 0.06431273, "auxiliary_loss_mlp": 0.01267429, "balance_loss_clip": 0.06284663, "balance_loss_mlp": 0.01256849, "epoch": 0.6830001503081317, "flos": 21987020816640.0, "grad_norm": 1.78459903861204, "language_loss": 0.81717992, "learning_rate": 9.645745594523958e-07, "loss": 0.89416689, "num_input_tokens_seen": 245106500, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.10577393, "step": 11360, "time_per_iteration": 2.5711774826049805 }, { "auxiliary_loss_clip": 0.06430452, "auxiliary_loss_mlp": 0.01264729, "balance_loss_clip": 0.06284575, "balance_loss_mlp": 0.01254501, "epoch": 0.6830602735607997, "flos": 24323718827520.0, "grad_norm": 1.9710542705852583, "language_loss": 0.75474727, "learning_rate": 9.642413733836844e-07, "loss": 0.83169907, "num_input_tokens_seen": 245125260, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.10229492, "step": 11361, "time_per_iteration": 2.5990753173828125 }, { "auxiliary_loss_clip": 0.06318838, "auxiliary_loss_mlp": 0.01257659, "balance_loss_clip": 0.06260104, "balance_loss_mlp": 0.01256364, "epoch": 0.6831203968134676, "flos": 57706827793920.0, "grad_norm": 0.9088474705766596, "language_loss": 0.5941627, "learning_rate": 9.639082265910437e-07, "loss": 0.66992772, "num_input_tokens_seen": 245188730, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.01295471, "step": 11362, "time_per_iteration": 3.2904653549194336 }, { "auxiliary_loss_clip": 0.06429559, "auxiliary_loss_mlp": 0.01270582, "balance_loss_clip": 0.06280716, "balance_loss_mlp": 0.01258917, "epoch": 0.6831805200661356, "flos": 14393024807040.0, "grad_norm": 2.7372286964691708, "language_loss": 0.75053686, "learning_rate": 9.635751190871074e-07, "loss": 0.82753825, "num_input_tokens_seen": 245205065, "router_z_loss_clip": 1.48632812, "router_z_loss_mlp": 0.11663818, "step": 11363, "time_per_iteration": 3.9758529663085938 }, { "auxiliary_loss_clip": 0.06423937, "auxiliary_loss_mlp": 0.01268986, "balance_loss_clip": 0.06279924, "balance_loss_mlp": 0.01258108, "epoch": 0.6832406433188035, "flos": 22826906616960.0, "grad_norm": 2.3517904062999957, "language_loss": 0.89523089, "learning_rate": 9.632420508845063e-07, "loss": 0.9721601, "num_input_tokens_seen": 245224265, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.10882568, "step": 11364, "time_per_iteration": 2.7869811058044434 }, { "auxiliary_loss_clip": 0.06424391, "auxiliary_loss_mlp": 0.01264378, "balance_loss_clip": 0.06282094, "balance_loss_mlp": 0.01254281, "epoch": 0.6833007665714715, "flos": 17566673725440.0, "grad_norm": 1.9738615790163125, "language_loss": 0.88616514, "learning_rate": 9.629090219958697e-07, "loss": 0.96305281, "num_input_tokens_seen": 245243360, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10095215, "step": 11365, "time_per_iteration": 2.752255916595459 }, { "auxiliary_loss_clip": 0.06436832, "auxiliary_loss_mlp": 0.01268762, "balance_loss_clip": 0.06287903, "balance_loss_mlp": 0.01257741, "epoch": 0.6833608898241395, "flos": 22450883420160.0, "grad_norm": 2.0458173119355147, "language_loss": 0.81021738, "learning_rate": 9.625760324338272e-07, "loss": 0.88727337, "num_input_tokens_seen": 245256350, "router_z_loss_clip": 1.48925781, "router_z_loss_mlp": 0.11004639, "step": 11366, "time_per_iteration": 2.6296348571777344 }, { "auxiliary_loss_clip": 0.06428255, "auxiliary_loss_mlp": 0.01266221, "balance_loss_clip": 0.06281196, "balance_loss_mlp": 0.01255838, "epoch": 0.6834210130768075, "flos": 24541450462080.0, "grad_norm": 1.4601278750237587, "language_loss": 0.76859432, "learning_rate": 9.622430822110062e-07, "loss": 0.84553909, "num_input_tokens_seen": 245277575, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.10388184, "step": 11367, "time_per_iteration": 3.9999308586120605 }, { "auxiliary_loss_clip": 0.06429346, "auxiliary_loss_mlp": 0.01266325, "balance_loss_clip": 0.06282641, "balance_loss_mlp": 0.01255262, "epoch": 0.6834811363294754, "flos": 20053235963520.0, "grad_norm": 1.4849335407907127, "language_loss": 0.69626546, "learning_rate": 9.619101713400312e-07, "loss": 0.77322221, "num_input_tokens_seen": 245296615, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.11053467, "step": 11368, "time_per_iteration": 2.5661051273345947 }, { "auxiliary_loss_clip": 0.06423543, "auxiliary_loss_mlp": 0.01266337, "balance_loss_clip": 0.06279134, "balance_loss_mlp": 0.01255548, "epoch": 0.6835412595821434, "flos": 24797727774720.0, "grad_norm": 1.6129418428747553, "language_loss": 0.73513669, "learning_rate": 9.615772998335261e-07, "loss": 0.8120355, "num_input_tokens_seen": 245316275, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10791016, "step": 11369, "time_per_iteration": 2.6073195934295654 }, { "auxiliary_loss_clip": 0.06426281, "auxiliary_loss_mlp": 0.01266036, "balance_loss_clip": 0.06280843, "balance_loss_mlp": 0.01255349, "epoch": 0.6836013828348113, "flos": 19506454145280.0, "grad_norm": 2.7386966767055925, "language_loss": 0.78952336, "learning_rate": 9.612444677041138e-07, "loss": 0.8664465, "num_input_tokens_seen": 245334595, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10675049, "step": 11370, "time_per_iteration": 2.515427827835083 }, { "auxiliary_loss_clip": 0.06309967, "auxiliary_loss_mlp": 0.01253583, "balance_loss_clip": 0.06251265, "balance_loss_mlp": 0.01252132, "epoch": 0.6836615060874793, "flos": 58383753402240.0, "grad_norm": 0.746079710693371, "language_loss": 0.59731793, "learning_rate": 9.609116749644162e-07, "loss": 0.67295343, "num_input_tokens_seen": 245389750, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.01449585, "step": 11371, "time_per_iteration": 3.0397331714630127 }, { "auxiliary_loss_clip": 0.06415574, "auxiliary_loss_mlp": 0.01264994, "balance_loss_clip": 0.06275563, "balance_loss_mlp": 0.0125529, "epoch": 0.6837216293401474, "flos": 12171796122240.0, "grad_norm": 1.4519776416197339, "language_loss": 0.63762081, "learning_rate": 9.605789216270511e-07, "loss": 0.71442652, "num_input_tokens_seen": 245407530, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09698486, "step": 11372, "time_per_iteration": 2.5610365867614746 }, { "auxiliary_loss_clip": 0.0642269, "auxiliary_loss_mlp": 0.01266514, "balance_loss_clip": 0.06279023, "balance_loss_mlp": 0.01255868, "epoch": 0.6837817525928153, "flos": 22134159786240.0, "grad_norm": 1.552355248922406, "language_loss": 0.71841073, "learning_rate": 9.602462077046375e-07, "loss": 0.79530275, "num_input_tokens_seen": 245427000, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10644531, "step": 11373, "time_per_iteration": 2.620241403579712 }, { "auxiliary_loss_clip": 0.06316843, "auxiliary_loss_mlp": 0.01255203, "balance_loss_clip": 0.0625811, "balance_loss_mlp": 0.01254005, "epoch": 0.6838418758454833, "flos": 65027048186880.0, "grad_norm": 1.1911891318371446, "language_loss": 0.56706399, "learning_rate": 9.599135332097935e-07, "loss": 0.64278448, "num_input_tokens_seen": 245491620, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.01196289, "step": 11374, "time_per_iteration": 3.324296712875366 }, { "auxiliary_loss_clip": 0.06426376, "auxiliary_loss_mlp": 0.01268526, "balance_loss_clip": 0.06279974, "balance_loss_mlp": 0.01257774, "epoch": 0.6839019990981512, "flos": 21036864643200.0, "grad_norm": 1.4081497817618776, "language_loss": 0.73947543, "learning_rate": 9.595808981551312e-07, "loss": 0.81642449, "num_input_tokens_seen": 245511285, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.10748291, "step": 11375, "time_per_iteration": 2.538379669189453 }, { "auxiliary_loss_clip": 0.06418544, "auxiliary_loss_mlp": 0.01267243, "balance_loss_clip": 0.06275911, "balance_loss_mlp": 0.01257265, "epoch": 0.6839621223508192, "flos": 24942351121920.0, "grad_norm": 1.6590040197627511, "language_loss": 0.70794874, "learning_rate": 9.592483025532651e-07, "loss": 0.78480661, "num_input_tokens_seen": 245532910, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.09973145, "step": 11376, "time_per_iteration": 2.6220803260803223 }, { "auxiliary_loss_clip": 0.06427564, "auxiliary_loss_mlp": 0.01267282, "balance_loss_clip": 0.06279163, "balance_loss_mlp": 0.01255605, "epoch": 0.6840222456034871, "flos": 26365929264000.0, "grad_norm": 1.837801786232145, "language_loss": 0.74187052, "learning_rate": 9.58915746416808e-07, "loss": 0.81881893, "num_input_tokens_seen": 245550540, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.11682129, "step": 11377, "time_per_iteration": 2.585214138031006 }, { "auxiliary_loss_clip": 0.06316746, "auxiliary_loss_mlp": 0.01254072, "balance_loss_clip": 0.06258167, "balance_loss_mlp": 0.01252801, "epoch": 0.6840823688561551, "flos": 66009167493120.0, "grad_norm": 0.7062776173774339, "language_loss": 0.56752074, "learning_rate": 9.585832297583707e-07, "loss": 0.64322889, "num_input_tokens_seen": 245619570, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.01271057, "step": 11378, "time_per_iteration": 3.306586503982544 }, { "auxiliary_loss_clip": 0.06423454, "auxiliary_loss_mlp": 0.01265893, "balance_loss_clip": 0.06276727, "balance_loss_mlp": 0.0125427, "epoch": 0.684142492108823, "flos": 21403999307520.0, "grad_norm": 1.629213745471089, "language_loss": 0.78553867, "learning_rate": 9.58250752590561e-07, "loss": 0.86243212, "num_input_tokens_seen": 245637980, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.11621094, "step": 11379, "time_per_iteration": 2.5517358779907227 }, { "auxiliary_loss_clip": 0.06411434, "auxiliary_loss_mlp": 0.01264846, "balance_loss_clip": 0.06274986, "balance_loss_mlp": 0.01255834, "epoch": 0.6842026153614911, "flos": 18806453936640.0, "grad_norm": 1.8895221655203907, "language_loss": 0.69395053, "learning_rate": 9.57918314925988e-07, "loss": 0.77071333, "num_input_tokens_seen": 245655690, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09014893, "step": 11380, "time_per_iteration": 2.6294443607330322 }, { "auxiliary_loss_clip": 0.06421278, "auxiliary_loss_mlp": 0.01267178, "balance_loss_clip": 0.06278694, "balance_loss_mlp": 0.01256205, "epoch": 0.684262738614159, "flos": 19652544938880.0, "grad_norm": 2.017917864508865, "language_loss": 0.78791332, "learning_rate": 9.575859167772568e-07, "loss": 0.86479783, "num_input_tokens_seen": 245671525, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10974121, "step": 11381, "time_per_iteration": 2.5646555423736572 }, { "auxiliary_loss_clip": 0.0631439, "auxiliary_loss_mlp": 0.01251116, "balance_loss_clip": 0.06256068, "balance_loss_mlp": 0.01249865, "epoch": 0.684322861866827, "flos": 62371041793920.0, "grad_norm": 0.9804121670445944, "language_loss": 0.67265022, "learning_rate": 9.572535581569713e-07, "loss": 0.74830526, "num_input_tokens_seen": 245724115, "router_z_loss_clip": 0.58447266, "router_z_loss_mlp": 0.01250458, "step": 11382, "time_per_iteration": 3.089613914489746 }, { "auxiliary_loss_clip": 0.0631744, "auxiliary_loss_mlp": 0.01252123, "balance_loss_clip": 0.06258967, "balance_loss_mlp": 0.01250699, "epoch": 0.6843829851194949, "flos": 65825704978560.0, "grad_norm": 0.8394628842214883, "language_loss": 0.5802151, "learning_rate": 9.569212390777356e-07, "loss": 0.65591073, "num_input_tokens_seen": 245789245, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.01422882, "step": 11383, "time_per_iteration": 3.229177474975586 }, { "auxiliary_loss_clip": 0.06418479, "auxiliary_loss_mlp": 0.01265023, "balance_loss_clip": 0.06276202, "balance_loss_mlp": 0.01255004, "epoch": 0.6844431083721629, "flos": 27862573766400.0, "grad_norm": 1.5884813997953195, "language_loss": 0.80039978, "learning_rate": 9.565889595521517e-07, "loss": 0.87723482, "num_input_tokens_seen": 245812420, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.10021973, "step": 11384, "time_per_iteration": 2.6052534580230713 }, { "auxiliary_loss_clip": 0.06426516, "auxiliary_loss_mlp": 0.01266156, "balance_loss_clip": 0.06277813, "balance_loss_mlp": 0.0125529, "epoch": 0.684503231624831, "flos": 18260091388800.0, "grad_norm": 2.2023037807178683, "language_loss": 0.77508318, "learning_rate": 9.562567195928187e-07, "loss": 0.85200989, "num_input_tokens_seen": 245829135, "router_z_loss_clip": 1.48632812, "router_z_loss_mlp": 0.10858154, "step": 11385, "time_per_iteration": 3.99289870262146 }, { "auxiliary_loss_clip": 0.06437726, "auxiliary_loss_mlp": 0.01266659, "balance_loss_clip": 0.0628451, "balance_loss_mlp": 0.01254315, "epoch": 0.6845633548774989, "flos": 17645484090240.0, "grad_norm": 2.0264831709386293, "language_loss": 0.84445357, "learning_rate": 9.55924519212335e-07, "loss": 0.92149746, "num_input_tokens_seen": 245847140, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.12347412, "step": 11386, "time_per_iteration": 2.5252697467803955 }, { "auxiliary_loss_clip": 0.06424306, "auxiliary_loss_mlp": 0.01262718, "balance_loss_clip": 0.06279811, "balance_loss_mlp": 0.01253134, "epoch": 0.6846234781301669, "flos": 20812843952640.0, "grad_norm": 1.92343380462113, "language_loss": 0.83427012, "learning_rate": 9.555923584232984e-07, "loss": 0.91114032, "num_input_tokens_seen": 245862855, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.0958252, "step": 11387, "time_per_iteration": 2.5672037601470947 }, { "auxiliary_loss_clip": 0.06419624, "auxiliary_loss_mlp": 0.01262895, "balance_loss_clip": 0.06276726, "balance_loss_mlp": 0.01252839, "epoch": 0.6846836013828348, "flos": 36110016241920.0, "grad_norm": 1.8922710253485389, "language_loss": 0.72315276, "learning_rate": 9.552602372383047e-07, "loss": 0.79997802, "num_input_tokens_seen": 245885415, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10058594, "step": 11388, "time_per_iteration": 2.7017855644226074 }, { "auxiliary_loss_clip": 0.06426942, "auxiliary_loss_mlp": 0.01269137, "balance_loss_clip": 0.06283639, "balance_loss_mlp": 0.01258784, "epoch": 0.6847437246355028, "flos": 43152408823680.0, "grad_norm": 1.8706190883985052, "language_loss": 0.62907678, "learning_rate": 9.549281556699469e-07, "loss": 0.70603752, "num_input_tokens_seen": 245906285, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10357666, "step": 11389, "time_per_iteration": 2.754807472229004 }, { "auxiliary_loss_clip": 0.06319028, "auxiliary_loss_mlp": 0.01251689, "balance_loss_clip": 0.06260753, "balance_loss_mlp": 0.01250403, "epoch": 0.6848038478881707, "flos": 71682768103680.0, "grad_norm": 0.7093203430807313, "language_loss": 0.55875194, "learning_rate": 9.54596113730818e-07, "loss": 0.63445914, "num_input_tokens_seen": 245967620, "router_z_loss_clip": 0.58496094, "router_z_loss_mlp": 0.01286316, "step": 11390, "time_per_iteration": 3.3480539321899414 }, { "auxiliary_loss_clip": 0.0642506, "auxiliary_loss_mlp": 0.01267709, "balance_loss_clip": 0.06279635, "balance_loss_mlp": 0.01257088, "epoch": 0.6848639711408387, "flos": 19943929912320.0, "grad_norm": 1.7191187489434205, "language_loss": 0.87812853, "learning_rate": 9.542641114335109e-07, "loss": 0.95505625, "num_input_tokens_seen": 245985075, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10614014, "step": 11391, "time_per_iteration": 2.5514464378356934 }, { "auxiliary_loss_clip": 0.06428726, "auxiliary_loss_mlp": 0.01265206, "balance_loss_clip": 0.06281405, "balance_loss_mlp": 0.01254609, "epoch": 0.6849240943935067, "flos": 26874333112320.0, "grad_norm": 1.5669246493644613, "language_loss": 0.79316556, "learning_rate": 9.539321487906117e-07, "loss": 0.87010485, "num_input_tokens_seen": 246003560, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.10595703, "step": 11392, "time_per_iteration": 2.601412534713745 }, { "auxiliary_loss_clip": 0.06418808, "auxiliary_loss_mlp": 0.01266912, "balance_loss_clip": 0.0627818, "balance_loss_mlp": 0.01256661, "epoch": 0.6849842176461747, "flos": 13740458808960.0, "grad_norm": 11.943743007753097, "language_loss": 0.70972121, "learning_rate": 9.536002258147104e-07, "loss": 0.78657842, "num_input_tokens_seen": 246019600, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.10247803, "step": 11393, "time_per_iteration": 2.54406476020813 }, { "auxiliary_loss_clip": 0.06435758, "auxiliary_loss_mlp": 0.01267363, "balance_loss_clip": 0.06285927, "balance_loss_mlp": 0.01256133, "epoch": 0.6850443408988426, "flos": 24980058259200.0, "grad_norm": 1.5935649352732315, "language_loss": 0.64520586, "learning_rate": 9.532683425183936e-07, "loss": 0.72223711, "num_input_tokens_seen": 246038920, "router_z_loss_clip": 1.49804688, "router_z_loss_mlp": 0.11236572, "step": 11394, "time_per_iteration": 4.017607927322388 }, { "auxiliary_loss_clip": 0.06427206, "auxiliary_loss_mlp": 0.0126634, "balance_loss_clip": 0.06279288, "balance_loss_mlp": 0.01255784, "epoch": 0.6851044641515106, "flos": 27751380998400.0, "grad_norm": 1.7623274807464093, "language_loss": 0.80980599, "learning_rate": 9.529364989142468e-07, "loss": 0.88674146, "num_input_tokens_seen": 246060490, "router_z_loss_clip": 1.47949219, "router_z_loss_mlp": 0.10559082, "step": 11395, "time_per_iteration": 2.6248672008514404 }, { "auxiliary_loss_clip": 0.06426139, "auxiliary_loss_mlp": 0.0126932, "balance_loss_clip": 0.06282444, "balance_loss_mlp": 0.01258127, "epoch": 0.6851645874041785, "flos": 24357652531200.0, "grad_norm": 1.5139356725604045, "language_loss": 0.73119831, "learning_rate": 9.526046950148527e-07, "loss": 0.80815285, "num_input_tokens_seen": 246081465, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.11193848, "step": 11396, "time_per_iteration": 2.607790470123291 }, { "auxiliary_loss_clip": 0.06426498, "auxiliary_loss_mlp": 0.012688, "balance_loss_clip": 0.06278284, "balance_loss_mlp": 0.01257785, "epoch": 0.6852247106568465, "flos": 15081914350080.0, "grad_norm": 2.4347782934477764, "language_loss": 0.79534942, "learning_rate": 9.522729308327931e-07, "loss": 0.87230241, "num_input_tokens_seen": 246096110, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.11022949, "step": 11397, "time_per_iteration": 2.611260175704956 }, { "auxiliary_loss_clip": 0.06424209, "auxiliary_loss_mlp": 0.01266016, "balance_loss_clip": 0.06278116, "balance_loss_mlp": 0.01255692, "epoch": 0.6852848339095146, "flos": 18775874396160.0, "grad_norm": 1.7857794492850712, "language_loss": 0.72319067, "learning_rate": 9.519412063806493e-07, "loss": 0.80009294, "num_input_tokens_seen": 246114785, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10333252, "step": 11398, "time_per_iteration": 2.5470869541168213 }, { "auxiliary_loss_clip": 0.06419457, "auxiliary_loss_mlp": 0.01266461, "balance_loss_clip": 0.06278294, "balance_loss_mlp": 0.01256579, "epoch": 0.6853449571621825, "flos": 27861651371520.0, "grad_norm": 1.66896290298992, "language_loss": 0.71216798, "learning_rate": 9.516095216709996e-07, "loss": 0.78902709, "num_input_tokens_seen": 246136375, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09887695, "step": 11399, "time_per_iteration": 2.6079156398773193 }, { "auxiliary_loss_clip": 0.06423558, "auxiliary_loss_mlp": 0.01269464, "balance_loss_clip": 0.06277303, "balance_loss_mlp": 0.01259457, "epoch": 0.6854050804148505, "flos": 18156403560960.0, "grad_norm": 1.4811215694647735, "language_loss": 0.70580924, "learning_rate": 9.512778767164217e-07, "loss": 0.78273952, "num_input_tokens_seen": 246155090, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.10003662, "step": 11400, "time_per_iteration": 2.520984172821045 }, { "auxiliary_loss_clip": 0.06447619, "auxiliary_loss_mlp": 0.01272045, "balance_loss_clip": 0.0629027, "balance_loss_mlp": 0.01259665, "epoch": 0.6854652036675184, "flos": 16331798977920.0, "grad_norm": 2.4710892276832, "language_loss": 0.78595841, "learning_rate": 9.509462715294927e-07, "loss": 0.86315507, "num_input_tokens_seen": 246172645, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.12390137, "step": 11401, "time_per_iteration": 2.533313035964966 }, { "auxiliary_loss_clip": 0.06418757, "auxiliary_loss_mlp": 0.01266398, "balance_loss_clip": 0.06276685, "balance_loss_mlp": 0.01256289, "epoch": 0.6855253269201864, "flos": 14946347243520.0, "grad_norm": 1.7499959099465339, "language_loss": 0.75946075, "learning_rate": 9.50614706122786e-07, "loss": 0.83631229, "num_input_tokens_seen": 246189055, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10107422, "step": 11402, "time_per_iteration": 4.03454065322876 }, { "auxiliary_loss_clip": 0.06430192, "auxiliary_loss_mlp": 0.01267801, "balance_loss_clip": 0.06280336, "balance_loss_mlp": 0.01256077, "epoch": 0.6855854501728543, "flos": 23044135127040.0, "grad_norm": 2.325884535123521, "language_loss": 0.7306844, "learning_rate": 9.502831805088742e-07, "loss": 0.80766439, "num_input_tokens_seen": 246207990, "router_z_loss_clip": 1.49804688, "router_z_loss_mlp": 0.11730957, "step": 11403, "time_per_iteration": 2.5767483711242676 }, { "auxiliary_loss_clip": 0.06424735, "auxiliary_loss_mlp": 0.01265811, "balance_loss_clip": 0.06280865, "balance_loss_mlp": 0.01255482, "epoch": 0.6856455734255223, "flos": 13257393621120.0, "grad_norm": 2.825303552312831, "language_loss": 0.81703484, "learning_rate": 9.499516947003294e-07, "loss": 0.89394033, "num_input_tokens_seen": 246221595, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.10327148, "step": 11404, "time_per_iteration": 2.5288925170898438 }, { "auxiliary_loss_clip": 0.06419869, "auxiliary_loss_mlp": 0.01269559, "balance_loss_clip": 0.06278801, "balance_loss_mlp": 0.01259248, "epoch": 0.6857056966781903, "flos": 23340551345280.0, "grad_norm": 1.4001745408547577, "language_loss": 0.77793521, "learning_rate": 9.496202487097222e-07, "loss": 0.85482943, "num_input_tokens_seen": 246242970, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.10308838, "step": 11405, "time_per_iteration": 2.596397876739502 }, { "auxiliary_loss_clip": 0.06316575, "auxiliary_loss_mlp": 0.01250699, "balance_loss_clip": 0.06258234, "balance_loss_mlp": 0.01249442, "epoch": 0.6857658199308583, "flos": 61870646010240.0, "grad_norm": 0.7758602682702385, "language_loss": 0.60794604, "learning_rate": 9.492888425496199e-07, "loss": 0.68361878, "num_input_tokens_seen": 246300405, "router_z_loss_clip": 0.58447266, "router_z_loss_mlp": 0.01256561, "step": 11406, "time_per_iteration": 4.695172548294067 }, { "auxiliary_loss_clip": 0.06426713, "auxiliary_loss_mlp": 0.0126456, "balance_loss_clip": 0.06280649, "balance_loss_mlp": 0.01254022, "epoch": 0.6858259431835262, "flos": 16660178328960.0, "grad_norm": 1.8274458376893572, "language_loss": 0.77553737, "learning_rate": 9.489574762325907e-07, "loss": 0.85245007, "num_input_tokens_seen": 246318780, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.10528564, "step": 11407, "time_per_iteration": 2.5690500736236572 }, { "auxiliary_loss_clip": 0.06421658, "auxiliary_loss_mlp": 0.01266638, "balance_loss_clip": 0.06275475, "balance_loss_mlp": 0.01254861, "epoch": 0.6858860664361942, "flos": 21879643409280.0, "grad_norm": 2.099331921415499, "language_loss": 0.70958483, "learning_rate": 9.486261497711991e-07, "loss": 0.78646779, "num_input_tokens_seen": 246339405, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.11773682, "step": 11408, "time_per_iteration": 2.610203981399536 }, { "auxiliary_loss_clip": 0.06429639, "auxiliary_loss_mlp": 0.01264014, "balance_loss_clip": 0.06280774, "balance_loss_mlp": 0.01253231, "epoch": 0.6859461896888621, "flos": 15272965658880.0, "grad_norm": 2.1506170017138206, "language_loss": 0.70317578, "learning_rate": 9.482948631780087e-07, "loss": 0.78011233, "num_input_tokens_seen": 246357055, "router_z_loss_clip": 1.49023438, "router_z_loss_mlp": 0.10778809, "step": 11409, "time_per_iteration": 2.5507824420928955 }, { "auxiliary_loss_clip": 0.06413957, "auxiliary_loss_mlp": 0.01265322, "balance_loss_clip": 0.06275557, "balance_loss_mlp": 0.01255655, "epoch": 0.6860063129415301, "flos": 18625507044480.0, "grad_norm": 1.6152391426665567, "language_loss": 0.7828604, "learning_rate": 9.479636164655825e-07, "loss": 0.85965312, "num_input_tokens_seen": 246374050, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09674072, "step": 11410, "time_per_iteration": 2.687880039215088 }, { "auxiliary_loss_clip": 0.06431018, "auxiliary_loss_mlp": 0.01264815, "balance_loss_clip": 0.06279835, "balance_loss_mlp": 0.01253443, "epoch": 0.6860664361941982, "flos": 23958177390720.0, "grad_norm": 1.5908417192349902, "language_loss": 0.72009134, "learning_rate": 9.476324096464821e-07, "loss": 0.79704964, "num_input_tokens_seen": 246392910, "router_z_loss_clip": 1.51074219, "router_z_loss_mlp": 0.1137085, "step": 11411, "time_per_iteration": 2.6780588626861572 }, { "auxiliary_loss_clip": 0.06425621, "auxiliary_loss_mlp": 0.01267233, "balance_loss_clip": 0.06279766, "balance_loss_mlp": 0.01256439, "epoch": 0.6861265594468661, "flos": 20413243031040.0, "grad_norm": 1.9327300035263097, "language_loss": 0.7009871, "learning_rate": 9.473012427332654e-07, "loss": 0.7779156, "num_input_tokens_seen": 246411540, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10797119, "step": 11412, "time_per_iteration": 2.531752824783325 }, { "auxiliary_loss_clip": 0.06426521, "auxiliary_loss_mlp": 0.01268164, "balance_loss_clip": 0.06281026, "balance_loss_mlp": 0.01257918, "epoch": 0.6861866826995341, "flos": 11431908570240.0, "grad_norm": 2.4756459246830445, "language_loss": 0.71744907, "learning_rate": 9.469701157384919e-07, "loss": 0.79439592, "num_input_tokens_seen": 246423295, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10247803, "step": 11413, "time_per_iteration": 2.493894338607788 }, { "auxiliary_loss_clip": 0.06426825, "auxiliary_loss_mlp": 0.01270056, "balance_loss_clip": 0.06280698, "balance_loss_mlp": 0.01259762, "epoch": 0.686246805952202, "flos": 16003084210560.0, "grad_norm": 1.6498082616365075, "language_loss": 0.73676229, "learning_rate": 9.466390286747164e-07, "loss": 0.81373107, "num_input_tokens_seen": 246441045, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10290527, "step": 11414, "time_per_iteration": 2.5136098861694336 }, { "auxiliary_loss_clip": 0.06427944, "auxiliary_loss_mlp": 0.01266727, "balance_loss_clip": 0.06281114, "balance_loss_mlp": 0.01255849, "epoch": 0.68630692920487, "flos": 19832527509120.0, "grad_norm": 2.1770552678368853, "language_loss": 0.8702023, "learning_rate": 9.46307981554495e-07, "loss": 0.94714904, "num_input_tokens_seen": 246456905, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.10876465, "step": 11415, "time_per_iteration": 2.516920804977417 }, { "auxiliary_loss_clip": 0.06429458, "auxiliary_loss_mlp": 0.01265446, "balance_loss_clip": 0.06282994, "balance_loss_mlp": 0.01253769, "epoch": 0.6863670524575379, "flos": 26293366028160.0, "grad_norm": 2.066862713835249, "language_loss": 0.67371327, "learning_rate": 9.459769743903801e-07, "loss": 0.75066233, "num_input_tokens_seen": 246477545, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.11682129, "step": 11416, "time_per_iteration": 2.582207679748535 }, { "auxiliary_loss_clip": 0.06418528, "auxiliary_loss_mlp": 0.01264126, "balance_loss_clip": 0.06276019, "balance_loss_mlp": 0.01254041, "epoch": 0.686427175710206, "flos": 19179374532480.0, "grad_norm": 1.3412040108050864, "language_loss": 0.76316512, "learning_rate": 9.456460071949237e-07, "loss": 0.83999163, "num_input_tokens_seen": 246496705, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.10083008, "step": 11417, "time_per_iteration": 2.5285677909851074 }, { "auxiliary_loss_clip": 0.06423533, "auxiliary_loss_mlp": 0.0126921, "balance_loss_clip": 0.06278154, "balance_loss_mlp": 0.01259476, "epoch": 0.6864872989628739, "flos": 18922636022400.0, "grad_norm": 1.744564616650946, "language_loss": 0.77377069, "learning_rate": 9.45315079980678e-07, "loss": 0.85069817, "num_input_tokens_seen": 246514860, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.09735107, "step": 11418, "time_per_iteration": 2.521070718765259 }, { "auxiliary_loss_clip": 0.06428093, "auxiliary_loss_mlp": 0.01264661, "balance_loss_clip": 0.06281733, "balance_loss_mlp": 0.01253872, "epoch": 0.6865474222155419, "flos": 25963016106240.0, "grad_norm": 1.5764431636440706, "language_loss": 0.76454139, "learning_rate": 9.449841927601887e-07, "loss": 0.84146893, "num_input_tokens_seen": 246536145, "router_z_loss_clip": 1.46289062, "router_z_loss_mlp": 0.10797119, "step": 11419, "time_per_iteration": 2.614035129547119 }, { "auxiliary_loss_clip": 0.06422688, "auxiliary_loss_mlp": 0.01269635, "balance_loss_clip": 0.06277652, "balance_loss_mlp": 0.01259878, "epoch": 0.6866075454682098, "flos": 18483902444160.0, "grad_norm": 1.7407307085370993, "language_loss": 0.71981394, "learning_rate": 9.446533455460044e-07, "loss": 0.79673707, "num_input_tokens_seen": 246553265, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.09759521, "step": 11420, "time_per_iteration": 2.548686981201172 }, { "auxiliary_loss_clip": 0.06418408, "auxiliary_loss_mlp": 0.01264717, "balance_loss_clip": 0.06275666, "balance_loss_mlp": 0.01254703, "epoch": 0.6866676687208778, "flos": 34248459208320.0, "grad_norm": 1.4676589170397054, "language_loss": 0.75100631, "learning_rate": 9.443225383506712e-07, "loss": 0.82783759, "num_input_tokens_seen": 246575130, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10009766, "step": 11421, "time_per_iteration": 2.6696066856384277 }, { "auxiliary_loss_clip": 0.06421594, "auxiliary_loss_mlp": 0.01266737, "balance_loss_clip": 0.06279516, "balance_loss_mlp": 0.01256836, "epoch": 0.6867277919735457, "flos": 21727515121920.0, "grad_norm": 1.6704103603102953, "language_loss": 0.77180636, "learning_rate": 9.439917711867338e-07, "loss": 0.84868968, "num_input_tokens_seen": 246593095, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09906006, "step": 11422, "time_per_iteration": 2.626791000366211 }, { "auxiliary_loss_clip": 0.06426139, "auxiliary_loss_mlp": 0.01271327, "balance_loss_clip": 0.06281865, "balance_loss_mlp": 0.01260551, "epoch": 0.6867879152262137, "flos": 24104939016960.0, "grad_norm": 1.7712172360436864, "language_loss": 0.77512556, "learning_rate": 9.436610440667334e-07, "loss": 0.85210013, "num_input_tokens_seen": 246612165, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10791016, "step": 11423, "time_per_iteration": 2.61781907081604 }, { "auxiliary_loss_clip": 0.06429978, "auxiliary_loss_mlp": 0.01265004, "balance_loss_clip": 0.06281804, "balance_loss_mlp": 0.01253905, "epoch": 0.6868480384788818, "flos": 21622150212480.0, "grad_norm": 1.3847875452066036, "language_loss": 0.73049414, "learning_rate": 9.433303570032129e-07, "loss": 0.80744392, "num_input_tokens_seen": 246632065, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.11096191, "step": 11424, "time_per_iteration": 2.570631980895996 }, { "auxiliary_loss_clip": 0.0642498, "auxiliary_loss_mlp": 0.01267195, "balance_loss_clip": 0.0627863, "balance_loss_mlp": 0.01257081, "epoch": 0.6869081617315497, "flos": 26293282174080.0, "grad_norm": 1.791750828167853, "language_loss": 0.65052974, "learning_rate": 9.429997100087112e-07, "loss": 0.72745144, "num_input_tokens_seen": 246651245, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.10119629, "step": 11425, "time_per_iteration": 4.048047780990601 }, { "auxiliary_loss_clip": 0.06421287, "auxiliary_loss_mlp": 0.01267723, "balance_loss_clip": 0.06280036, "balance_loss_mlp": 0.01257632, "epoch": 0.6869682849842177, "flos": 21111356522880.0, "grad_norm": 1.3745925903259193, "language_loss": 0.7186811, "learning_rate": 9.426691030957657e-07, "loss": 0.79557121, "num_input_tokens_seen": 246672225, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.10101318, "step": 11426, "time_per_iteration": 2.6056764125823975 }, { "auxiliary_loss_clip": 0.06426136, "auxiliary_loss_mlp": 0.01267224, "balance_loss_clip": 0.06280671, "balance_loss_mlp": 0.01257228, "epoch": 0.6870284082368856, "flos": 17098408782720.0, "grad_norm": 2.0241474388414447, "language_loss": 0.85280365, "learning_rate": 9.423385362769136e-07, "loss": 0.92973721, "num_input_tokens_seen": 246688385, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.09997559, "step": 11427, "time_per_iteration": 2.5134220123291016 }, { "auxiliary_loss_clip": 0.06421247, "auxiliary_loss_mlp": 0.01262784, "balance_loss_clip": 0.06279038, "balance_loss_mlp": 0.0125233, "epoch": 0.6870885314895536, "flos": 27315456531840.0, "grad_norm": 4.047756296214936, "language_loss": 0.76370311, "learning_rate": 9.420080095646909e-07, "loss": 0.84054345, "num_input_tokens_seen": 246710730, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.10461426, "step": 11428, "time_per_iteration": 2.6131951808929443 }, { "auxiliary_loss_clip": 0.06429712, "auxiliary_loss_mlp": 0.01267677, "balance_loss_clip": 0.06280502, "balance_loss_mlp": 0.01256758, "epoch": 0.6871486547422215, "flos": 20820977798400.0, "grad_norm": 1.7555046207677771, "language_loss": 0.73508495, "learning_rate": 9.4167752297163e-07, "loss": 0.81205881, "num_input_tokens_seen": 246730350, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.10925293, "step": 11429, "time_per_iteration": 2.5768771171569824 }, { "auxiliary_loss_clip": 0.06429303, "auxiliary_loss_mlp": 0.01263427, "balance_loss_clip": 0.06283415, "balance_loss_mlp": 0.01253604, "epoch": 0.6872087779948896, "flos": 30161983910400.0, "grad_norm": 1.6536331527435457, "language_loss": 0.8322289, "learning_rate": 9.413470765102643e-07, "loss": 0.90915614, "num_input_tokens_seen": 246751700, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.0982666, "step": 11430, "time_per_iteration": 2.634579658508301 }, { "auxiliary_loss_clip": 0.06425507, "auxiliary_loss_mlp": 0.01264834, "balance_loss_clip": 0.06280217, "balance_loss_mlp": 0.01254088, "epoch": 0.6872689012475575, "flos": 20710917060480.0, "grad_norm": 1.8630637728476163, "language_loss": 0.70389718, "learning_rate": 9.410166701931225e-07, "loss": 0.78080058, "num_input_tokens_seen": 246769860, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.10742188, "step": 11431, "time_per_iteration": 2.5452988147735596 }, { "auxiliary_loss_clip": 0.06424034, "auxiliary_loss_mlp": 0.01265753, "balance_loss_clip": 0.06277555, "balance_loss_mlp": 0.01256079, "epoch": 0.6873290245002255, "flos": 25528014034560.0, "grad_norm": 1.6615478366374454, "language_loss": 0.80686843, "learning_rate": 9.406863040327355e-07, "loss": 0.88376629, "num_input_tokens_seen": 246789905, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.09667969, "step": 11432, "time_per_iteration": 2.5870635509490967 }, { "auxiliary_loss_clip": 0.06414412, "auxiliary_loss_mlp": 0.01267201, "balance_loss_clip": 0.06275643, "balance_loss_mlp": 0.01257575, "epoch": 0.6873891477528934, "flos": 25198418799360.0, "grad_norm": 1.4217474922458433, "language_loss": 0.68172705, "learning_rate": 9.403559780416295e-07, "loss": 0.75854313, "num_input_tokens_seen": 246808815, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09625244, "step": 11433, "time_per_iteration": 4.046096563339233 }, { "auxiliary_loss_clip": 0.06425832, "auxiliary_loss_mlp": 0.01266229, "balance_loss_clip": 0.06282717, "balance_loss_mlp": 0.01255744, "epoch": 0.6874492710055614, "flos": 35161034025600.0, "grad_norm": 1.9678444931661014, "language_loss": 0.73285395, "learning_rate": 9.400256922323309e-07, "loss": 0.80977458, "num_input_tokens_seen": 246829775, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.1048584, "step": 11434, "time_per_iteration": 2.687359094619751 }, { "auxiliary_loss_clip": 0.06422979, "auxiliary_loss_mlp": 0.01264986, "balance_loss_clip": 0.06281306, "balance_loss_mlp": 0.01255128, "epoch": 0.6875093942582293, "flos": 17828066136960.0, "grad_norm": 1.769421265727694, "language_loss": 0.80742264, "learning_rate": 9.396954466173657e-07, "loss": 0.88430232, "num_input_tokens_seen": 246848045, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09857178, "step": 11435, "time_per_iteration": 2.5671088695526123 }, { "auxiliary_loss_clip": 0.06425399, "auxiliary_loss_mlp": 0.01267084, "balance_loss_clip": 0.06278138, "balance_loss_mlp": 0.01255938, "epoch": 0.6875695175108973, "flos": 20710875133440.0, "grad_norm": 1.855790654343185, "language_loss": 0.80919659, "learning_rate": 9.393652412092538e-07, "loss": 0.88612145, "num_input_tokens_seen": 246866095, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.1114502, "step": 11436, "time_per_iteration": 2.6154866218566895 }, { "auxiliary_loss_clip": 0.064193, "auxiliary_loss_mlp": 0.01265901, "balance_loss_clip": 0.06282195, "balance_loss_mlp": 0.01256746, "epoch": 0.6876296407635654, "flos": 25381000846080.0, "grad_norm": 1.6536239216445827, "language_loss": 0.81942064, "learning_rate": 9.390350760205183e-07, "loss": 0.89627266, "num_input_tokens_seen": 246883975, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.0914917, "step": 11437, "time_per_iteration": 2.614103317260742 }, { "auxiliary_loss_clip": 0.06440911, "auxiliary_loss_mlp": 0.012693, "balance_loss_clip": 0.06286518, "balance_loss_mlp": 0.01257671, "epoch": 0.6876897640162333, "flos": 23229107015040.0, "grad_norm": 2.881967871497044, "language_loss": 0.77999532, "learning_rate": 9.387049510636793e-07, "loss": 0.85709739, "num_input_tokens_seen": 246901560, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.11639404, "step": 11438, "time_per_iteration": 2.5678327083587646 }, { "auxiliary_loss_clip": 0.06419526, "auxiliary_loss_mlp": 0.01270381, "balance_loss_clip": 0.06279924, "balance_loss_mlp": 0.0126085, "epoch": 0.6877498872689013, "flos": 27131448965760.0, "grad_norm": 1.6239297310910037, "language_loss": 0.72572541, "learning_rate": 9.383748663512554e-07, "loss": 0.80262458, "num_input_tokens_seen": 246922655, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09527588, "step": 11439, "time_per_iteration": 2.5925862789154053 }, { "auxiliary_loss_clip": 0.06420654, "auxiliary_loss_mlp": 0.01267604, "balance_loss_clip": 0.06277655, "balance_loss_mlp": 0.01257144, "epoch": 0.6878100105215692, "flos": 11586217063680.0, "grad_norm": 2.338282658630056, "language_loss": 0.75369072, "learning_rate": 9.380448218957623e-07, "loss": 0.83057332, "num_input_tokens_seen": 246940100, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10455322, "step": 11440, "time_per_iteration": 2.513495922088623 }, { "auxiliary_loss_clip": 0.06415233, "auxiliary_loss_mlp": 0.01267615, "balance_loss_clip": 0.06275326, "balance_loss_mlp": 0.01258162, "epoch": 0.6878701337742372, "flos": 20309429422080.0, "grad_norm": 1.594684920804541, "language_loss": 0.72139728, "learning_rate": 9.377148177097167e-07, "loss": 0.79822582, "num_input_tokens_seen": 246958545, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09454346, "step": 11441, "time_per_iteration": 2.6066980361938477 }, { "auxiliary_loss_clip": 0.06429908, "auxiliary_loss_mlp": 0.01265526, "balance_loss_clip": 0.06280731, "balance_loss_mlp": 0.0125407, "epoch": 0.6879302570269051, "flos": 13844398199040.0, "grad_norm": 1.6528640449853171, "language_loss": 0.67034382, "learning_rate": 9.373848538056317e-07, "loss": 0.74729818, "num_input_tokens_seen": 246974805, "router_z_loss_clip": 1.49023438, "router_z_loss_mlp": 0.11450195, "step": 11442, "time_per_iteration": 3.9930062294006348 }, { "auxiliary_loss_clip": 0.06424306, "auxiliary_loss_mlp": 0.01268176, "balance_loss_clip": 0.06280039, "balance_loss_mlp": 0.0125787, "epoch": 0.6879903802795732, "flos": 21331058728320.0, "grad_norm": 2.713793956791193, "language_loss": 0.69590324, "learning_rate": 9.370549301960189e-07, "loss": 0.77282804, "num_input_tokens_seen": 246992505, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10296631, "step": 11443, "time_per_iteration": 2.5376758575439453 }, { "auxiliary_loss_clip": 0.06432375, "auxiliary_loss_mlp": 0.01264708, "balance_loss_clip": 0.06285168, "balance_loss_mlp": 0.01254242, "epoch": 0.6880505035322411, "flos": 25158489528960.0, "grad_norm": 1.768048466474706, "language_loss": 0.76478732, "learning_rate": 9.367250468933893e-07, "loss": 0.84175807, "num_input_tokens_seen": 247013370, "router_z_loss_clip": 1.46972656, "router_z_loss_mlp": 0.10461426, "step": 11444, "time_per_iteration": 2.5841879844665527 }, { "auxiliary_loss_clip": 0.06422557, "auxiliary_loss_mlp": 0.01265469, "balance_loss_clip": 0.06281568, "balance_loss_mlp": 0.01255741, "epoch": 0.6881106267849091, "flos": 23221182804480.0, "grad_norm": 1.8911639575448673, "language_loss": 0.76771963, "learning_rate": 9.363952039102536e-07, "loss": 0.84459984, "num_input_tokens_seen": 247029855, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09729004, "step": 11445, "time_per_iteration": 2.532681465148926 }, { "auxiliary_loss_clip": 0.06313479, "auxiliary_loss_mlp": 0.01250765, "balance_loss_clip": 0.06255254, "balance_loss_mlp": 0.01249456, "epoch": 0.688170750037577, "flos": 48497741136000.0, "grad_norm": 0.8138998997621519, "language_loss": 0.58230942, "learning_rate": 9.360654012591183e-07, "loss": 0.65795189, "num_input_tokens_seen": 247085030, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.01309967, "step": 11446, "time_per_iteration": 4.644446849822998 }, { "auxiliary_loss_clip": 0.06427909, "auxiliary_loss_mlp": 0.01263181, "balance_loss_clip": 0.06279831, "balance_loss_mlp": 0.01252791, "epoch": 0.688230873290245, "flos": 22790205728640.0, "grad_norm": 1.4309637290769357, "language_loss": 0.75747883, "learning_rate": 9.357356389524886e-07, "loss": 0.83438969, "num_input_tokens_seen": 247104840, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.10394287, "step": 11447, "time_per_iteration": 2.5750670433044434 }, { "auxiliary_loss_clip": 0.0642219, "auxiliary_loss_mlp": 0.01265449, "balance_loss_clip": 0.06277969, "balance_loss_mlp": 0.01255251, "epoch": 0.6882909965429129, "flos": 22462245648000.0, "grad_norm": 1.9133358750977028, "language_loss": 0.73473561, "learning_rate": 9.354059170028705e-07, "loss": 0.81161201, "num_input_tokens_seen": 247121905, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10198975, "step": 11448, "time_per_iteration": 2.573444128036499 }, { "auxiliary_loss_clip": 0.06429768, "auxiliary_loss_mlp": 0.01267205, "balance_loss_clip": 0.06280258, "balance_loss_mlp": 0.01256125, "epoch": 0.688351119795581, "flos": 26221431697920.0, "grad_norm": 1.673723278541684, "language_loss": 0.74981284, "learning_rate": 9.350762354227673e-07, "loss": 0.82678258, "num_input_tokens_seen": 247142375, "router_z_loss_clip": 1.49511719, "router_z_loss_mlp": 0.11071777, "step": 11449, "time_per_iteration": 2.596569776535034 }, { "auxiliary_loss_clip": 0.06417702, "auxiliary_loss_mlp": 0.012649, "balance_loss_clip": 0.0627468, "balance_loss_mlp": 0.01254732, "epoch": 0.6884112430482489, "flos": 22571887115520.0, "grad_norm": 1.7081511813844616, "language_loss": 0.70070422, "learning_rate": 9.34746594224679e-07, "loss": 0.77753019, "num_input_tokens_seen": 247161095, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10168457, "step": 11450, "time_per_iteration": 2.5517444610595703 }, { "auxiliary_loss_clip": 0.0643393, "auxiliary_loss_mlp": 0.01268691, "balance_loss_clip": 0.06283517, "balance_loss_mlp": 0.01257366, "epoch": 0.6884713663009169, "flos": 17345671781760.0, "grad_norm": 1.9067253099224735, "language_loss": 0.7628125, "learning_rate": 9.344169934211068e-07, "loss": 0.83983874, "num_input_tokens_seen": 247178565, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.11328125, "step": 11451, "time_per_iteration": 2.5577478408813477 }, { "auxiliary_loss_clip": 0.06428864, "auxiliary_loss_mlp": 0.01262093, "balance_loss_clip": 0.06281683, "balance_loss_mlp": 0.01252628, "epoch": 0.6885314895535849, "flos": 26478379843200.0, "grad_norm": 1.268989309136781, "language_loss": 0.6901651, "learning_rate": 9.340874330245505e-07, "loss": 0.7670747, "num_input_tokens_seen": 247202345, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.09466553, "step": 11452, "time_per_iteration": 2.6184749603271484 }, { "auxiliary_loss_clip": 0.06420746, "auxiliary_loss_mlp": 0.01271183, "balance_loss_clip": 0.06277099, "balance_loss_mlp": 0.01259239, "epoch": 0.6885916128062528, "flos": 20527748035200.0, "grad_norm": 1.6501664260276314, "language_loss": 0.71808553, "learning_rate": 9.337579130475042e-07, "loss": 0.79500479, "num_input_tokens_seen": 247219240, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.1194458, "step": 11453, "time_per_iteration": 2.5712437629699707 }, { "auxiliary_loss_clip": 0.06318215, "auxiliary_loss_mlp": 0.01251739, "balance_loss_clip": 0.06259854, "balance_loss_mlp": 0.01250584, "epoch": 0.6886517360589208, "flos": 70734792136320.0, "grad_norm": 0.7630523510548985, "language_loss": 0.50507456, "learning_rate": 9.334284335024644e-07, "loss": 0.58077413, "num_input_tokens_seen": 247272010, "router_z_loss_clip": 0.58544922, "router_z_loss_mlp": 0.01153564, "step": 11454, "time_per_iteration": 3.025827646255493 }, { "auxiliary_loss_clip": 0.06410646, "auxiliary_loss_mlp": 0.01264702, "balance_loss_clip": 0.06273342, "balance_loss_mlp": 0.01255261, "epoch": 0.6887118593115887, "flos": 17899119999360.0, "grad_norm": 4.494767371934795, "language_loss": 0.75846469, "learning_rate": 9.330989944019263e-07, "loss": 0.83521819, "num_input_tokens_seen": 247290630, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09442139, "step": 11455, "time_per_iteration": 2.5574281215667725 }, { "auxiliary_loss_clip": 0.06423564, "auxiliary_loss_mlp": 0.01267253, "balance_loss_clip": 0.06275818, "balance_loss_mlp": 0.01255702, "epoch": 0.6887719825642568, "flos": 17458080433920.0, "grad_norm": 2.4712566139723293, "language_loss": 0.73263896, "learning_rate": 9.327695957583803e-07, "loss": 0.80954719, "num_input_tokens_seen": 247304800, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.11553955, "step": 11456, "time_per_iteration": 2.5484423637390137 }, { "auxiliary_loss_clip": 0.06421638, "auxiliary_loss_mlp": 0.01267626, "balance_loss_clip": 0.06280042, "balance_loss_mlp": 0.01257642, "epoch": 0.6888321058169247, "flos": 23075930551680.0, "grad_norm": 1.9704261502686968, "language_loss": 0.81257391, "learning_rate": 9.32440237584319e-07, "loss": 0.88946652, "num_input_tokens_seen": 247323450, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09979248, "step": 11457, "time_per_iteration": 2.5646402835845947 }, { "auxiliary_loss_clip": 0.06423344, "auxiliary_loss_mlp": 0.01269636, "balance_loss_clip": 0.0627618, "balance_loss_mlp": 0.01258562, "epoch": 0.6888922290695927, "flos": 23375742860160.0, "grad_norm": 1.7083475831706054, "language_loss": 0.76323652, "learning_rate": 9.321109198922301e-07, "loss": 0.84016633, "num_input_tokens_seen": 247343845, "router_z_loss_clip": 1.46972656, "router_z_loss_mlp": 0.11065674, "step": 11458, "time_per_iteration": 2.6431570053100586 }, { "auxiliary_loss_clip": 0.06420544, "auxiliary_loss_mlp": 0.01266483, "balance_loss_clip": 0.06275995, "balance_loss_mlp": 0.01255933, "epoch": 0.6889523523222606, "flos": 17636092433280.0, "grad_norm": 2.9556285198336205, "language_loss": 0.6832943, "learning_rate": 9.31781642694603e-07, "loss": 0.76016462, "num_input_tokens_seen": 247356650, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.10540771, "step": 11459, "time_per_iteration": 2.6540000438690186 }, { "auxiliary_loss_clip": 0.06425876, "auxiliary_loss_mlp": 0.01267258, "balance_loss_clip": 0.06279836, "balance_loss_mlp": 0.01257698, "epoch": 0.6890124755749286, "flos": 25235119687680.0, "grad_norm": 1.604945790998, "language_loss": 0.6864931, "learning_rate": 9.314524060039221e-07, "loss": 0.76342446, "num_input_tokens_seen": 247377340, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.09558105, "step": 11460, "time_per_iteration": 2.6533737182617188 }, { "auxiliary_loss_clip": 0.06437678, "auxiliary_loss_mlp": 0.01267088, "balance_loss_clip": 0.06282791, "balance_loss_mlp": 0.01255686, "epoch": 0.6890725988275965, "flos": 20236488842880.0, "grad_norm": 1.6670356391593169, "language_loss": 0.77157772, "learning_rate": 9.311232098326731e-07, "loss": 0.84862536, "num_input_tokens_seen": 247395805, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.11401367, "step": 11461, "time_per_iteration": 2.573376417160034 }, { "auxiliary_loss_clip": 0.06422657, "auxiliary_loss_mlp": 0.01267972, "balance_loss_clip": 0.06278501, "balance_loss_mlp": 0.01257314, "epoch": 0.6891327220802645, "flos": 14540079922560.0, "grad_norm": 1.6367076012595203, "language_loss": 0.69853222, "learning_rate": 9.307940541933401e-07, "loss": 0.77543849, "num_input_tokens_seen": 247413165, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10662842, "step": 11462, "time_per_iteration": 2.540402412414551 }, { "auxiliary_loss_clip": 0.06425682, "auxiliary_loss_mlp": 0.0126706, "balance_loss_clip": 0.06279425, "balance_loss_mlp": 0.01256713, "epoch": 0.6891928453329325, "flos": 21144996737280.0, "grad_norm": 2.4714922541537665, "language_loss": 0.87508941, "learning_rate": 9.304649390984034e-07, "loss": 0.95201683, "num_input_tokens_seen": 247433140, "router_z_loss_clip": 1.46289062, "router_z_loss_mlp": 0.10351562, "step": 11463, "time_per_iteration": 2.5754611492156982 }, { "auxiliary_loss_clip": 0.06415535, "auxiliary_loss_mlp": 0.01267002, "balance_loss_clip": 0.06276058, "balance_loss_mlp": 0.01257364, "epoch": 0.6892529685856005, "flos": 17864347754880.0, "grad_norm": 1.561299550698093, "language_loss": 0.68573809, "learning_rate": 9.301358645603428e-07, "loss": 0.76256347, "num_input_tokens_seen": 247451265, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09631348, "step": 11464, "time_per_iteration": 3.9961791038513184 }, { "auxiliary_loss_clip": 0.06422754, "auxiliary_loss_mlp": 0.01266496, "balance_loss_clip": 0.06278293, "balance_loss_mlp": 0.01255904, "epoch": 0.6893130918382685, "flos": 29942575194240.0, "grad_norm": 1.651858513485915, "language_loss": 0.65666997, "learning_rate": 9.298068305916373e-07, "loss": 0.73356247, "num_input_tokens_seen": 247471645, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.10601807, "step": 11465, "time_per_iteration": 2.586357593536377 }, { "auxiliary_loss_clip": 0.06428055, "auxiliary_loss_mlp": 0.01267493, "balance_loss_clip": 0.06278145, "balance_loss_mlp": 0.01255983, "epoch": 0.6893732150909364, "flos": 24395275814400.0, "grad_norm": 1.419182897692677, "language_loss": 0.72565067, "learning_rate": 9.294778372047649e-07, "loss": 0.80260611, "num_input_tokens_seen": 247491170, "router_z_loss_clip": 1.49707031, "router_z_loss_mlp": 0.11517334, "step": 11466, "time_per_iteration": 2.5820670127868652 }, { "auxiliary_loss_clip": 0.06422424, "auxiliary_loss_mlp": 0.01266365, "balance_loss_clip": 0.06277683, "balance_loss_mlp": 0.01256274, "epoch": 0.6894333383436044, "flos": 16988557680000.0, "grad_norm": 2.0185541701371363, "language_loss": 0.72294551, "learning_rate": 9.291488844121995e-07, "loss": 0.79983342, "num_input_tokens_seen": 247509005, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.10095215, "step": 11467, "time_per_iteration": 2.5205955505371094 }, { "auxiliary_loss_clip": 0.06430049, "auxiliary_loss_mlp": 0.01265693, "balance_loss_clip": 0.06280909, "balance_loss_mlp": 0.01253665, "epoch": 0.6894934615962723, "flos": 18990880773120.0, "grad_norm": 2.015635745328847, "language_loss": 0.81255817, "learning_rate": 9.288199722264156e-07, "loss": 0.88951558, "num_input_tokens_seen": 247527050, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.12036133, "step": 11468, "time_per_iteration": 2.5493123531341553 }, { "auxiliary_loss_clip": 0.06426185, "auxiliary_loss_mlp": 0.01265656, "balance_loss_clip": 0.062796, "balance_loss_mlp": 0.01255148, "epoch": 0.6895535848489404, "flos": 34540137671040.0, "grad_norm": 3.465317519281915, "language_loss": 0.66467249, "learning_rate": 9.284911006598875e-07, "loss": 0.74159098, "num_input_tokens_seen": 247547765, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.10510254, "step": 11469, "time_per_iteration": 2.6484146118164062 }, { "auxiliary_loss_clip": 0.06311818, "auxiliary_loss_mlp": 0.01253697, "balance_loss_clip": 0.06253447, "balance_loss_mlp": 0.01252351, "epoch": 0.6896137081016083, "flos": 50093237128320.0, "grad_norm": 0.7862009378148145, "language_loss": 0.54971701, "learning_rate": 9.281622697250824e-07, "loss": 0.62537217, "num_input_tokens_seen": 247603515, "router_z_loss_clip": 0.58544922, "router_z_loss_mlp": 0.01347351, "step": 11470, "time_per_iteration": 3.0641908645629883 }, { "auxiliary_loss_clip": 0.06423856, "auxiliary_loss_mlp": 0.01268455, "balance_loss_clip": 0.06282702, "balance_loss_mlp": 0.01259574, "epoch": 0.6896738313542763, "flos": 19944391109760.0, "grad_norm": 1.9270804595194377, "language_loss": 0.78196687, "learning_rate": 9.278334794344715e-07, "loss": 0.85889, "num_input_tokens_seen": 247622110, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.08880615, "step": 11471, "time_per_iteration": 2.5515880584716797 }, { "auxiliary_loss_clip": 0.06422389, "auxiliary_loss_mlp": 0.01269187, "balance_loss_clip": 0.06277685, "balance_loss_mlp": 0.01258291, "epoch": 0.6897339546069442, "flos": 21731875534080.0, "grad_norm": 1.7098336277679127, "language_loss": 0.78368533, "learning_rate": 9.275047298005232e-07, "loss": 0.86060101, "num_input_tokens_seen": 247641905, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.10900879, "step": 11472, "time_per_iteration": 2.5580027103424072 }, { "auxiliary_loss_clip": 0.06421657, "auxiliary_loss_mlp": 0.01265556, "balance_loss_clip": 0.06277779, "balance_loss_mlp": 0.01255226, "epoch": 0.6897940778596122, "flos": 19832275946880.0, "grad_norm": 1.7913542135748046, "language_loss": 0.7658779, "learning_rate": 9.271760208357024e-07, "loss": 0.84275001, "num_input_tokens_seen": 247660945, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10333252, "step": 11473, "time_per_iteration": 4.023810625076294 }, { "auxiliary_loss_clip": 0.06433785, "auxiliary_loss_mlp": 0.0126795, "balance_loss_clip": 0.06284698, "balance_loss_mlp": 0.01256637, "epoch": 0.6898542011122801, "flos": 17315595365760.0, "grad_norm": 1.9448523171500824, "language_loss": 0.75962651, "learning_rate": 9.268473525524751e-07, "loss": 0.83664393, "num_input_tokens_seen": 247678395, "router_z_loss_clip": 1.49023438, "router_z_loss_mlp": 0.11315918, "step": 11474, "time_per_iteration": 2.551622152328491 }, { "auxiliary_loss_clip": 0.06426181, "auxiliary_loss_mlp": 0.01269864, "balance_loss_clip": 0.06280611, "balance_loss_mlp": 0.01259266, "epoch": 0.6899143243649482, "flos": 24760984959360.0, "grad_norm": 1.473760413499796, "language_loss": 0.74783969, "learning_rate": 9.26518724963303e-07, "loss": 0.82480007, "num_input_tokens_seen": 247698380, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.10601807, "step": 11475, "time_per_iteration": 2.584383964538574 }, { "auxiliary_loss_clip": 0.06422989, "auxiliary_loss_mlp": 0.01265383, "balance_loss_clip": 0.06279332, "balance_loss_mlp": 0.01254535, "epoch": 0.6899744476176161, "flos": 17239636039680.0, "grad_norm": 2.057229107846959, "language_loss": 0.89280987, "learning_rate": 9.261901380806491e-07, "loss": 0.96969366, "num_input_tokens_seen": 247716370, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10845947, "step": 11476, "time_per_iteration": 2.5302083492279053 }, { "auxiliary_loss_clip": 0.06424862, "auxiliary_loss_mlp": 0.01268626, "balance_loss_clip": 0.06281952, "balance_loss_mlp": 0.01258571, "epoch": 0.6900345708702841, "flos": 25417701734400.0, "grad_norm": 1.3430357000017143, "language_loss": 0.70231801, "learning_rate": 9.258615919169724e-07, "loss": 0.77925289, "num_input_tokens_seen": 247737335, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.1005249, "step": 11477, "time_per_iteration": 2.605397939682007 }, { "auxiliary_loss_clip": 0.06439124, "auxiliary_loss_mlp": 0.01268367, "balance_loss_clip": 0.06288476, "balance_loss_mlp": 0.01256827, "epoch": 0.6900946941229521, "flos": 23439836833920.0, "grad_norm": 3.0692761108437603, "language_loss": 0.68530101, "learning_rate": 9.255330864847313e-07, "loss": 0.76237595, "num_input_tokens_seen": 247756680, "router_z_loss_clip": 1.50488281, "router_z_loss_mlp": 0.11547852, "step": 11478, "time_per_iteration": 2.625779390335083 }, { "auxiliary_loss_clip": 0.06432734, "auxiliary_loss_mlp": 0.01266424, "balance_loss_clip": 0.06285278, "balance_loss_mlp": 0.01256047, "epoch": 0.69015481737562, "flos": 17825592441600.0, "grad_norm": 1.891598715396875, "language_loss": 0.76693898, "learning_rate": 9.252046217963843e-07, "loss": 0.8439306, "num_input_tokens_seen": 247774265, "router_z_loss_clip": 1.47363281, "router_z_loss_mlp": 0.1036377, "step": 11479, "time_per_iteration": 2.5324411392211914 }, { "auxiliary_loss_clip": 0.06433407, "auxiliary_loss_mlp": 0.01266549, "balance_loss_clip": 0.06285588, "balance_loss_mlp": 0.01255373, "epoch": 0.690214940628288, "flos": 17462147356800.0, "grad_norm": 1.6826903931289898, "language_loss": 0.79033816, "learning_rate": 9.248761978643856e-07, "loss": 0.8673377, "num_input_tokens_seen": 247792395, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.11181641, "step": 11480, "time_per_iteration": 2.5150089263916016 }, { "auxiliary_loss_clip": 0.06427275, "auxiliary_loss_mlp": 0.01266321, "balance_loss_clip": 0.06282143, "balance_loss_mlp": 0.01255342, "epoch": 0.6902750638809559, "flos": 29573847302400.0, "grad_norm": 1.7119964668136822, "language_loss": 0.75669783, "learning_rate": 9.245478147011885e-07, "loss": 0.83363384, "num_input_tokens_seen": 247811985, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10980225, "step": 11481, "time_per_iteration": 2.6278674602508545 }, { "auxiliary_loss_clip": 0.06426598, "auxiliary_loss_mlp": 0.01267924, "balance_loss_clip": 0.06282584, "balance_loss_mlp": 0.01257177, "epoch": 0.690335187133624, "flos": 25564253725440.0, "grad_norm": 2.357738020624728, "language_loss": 0.69456005, "learning_rate": 9.24219472319246e-07, "loss": 0.77150536, "num_input_tokens_seen": 247831880, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10748291, "step": 11482, "time_per_iteration": 4.023549556732178 }, { "auxiliary_loss_clip": 0.06427468, "auxiliary_loss_mlp": 0.01266249, "balance_loss_clip": 0.06281275, "balance_loss_mlp": 0.0125558, "epoch": 0.6903953103862919, "flos": 22494418416000.0, "grad_norm": 3.913080181238989, "language_loss": 0.82942098, "learning_rate": 9.238911707310096e-07, "loss": 0.90635818, "num_input_tokens_seen": 247851170, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10675049, "step": 11483, "time_per_iteration": 2.56231427192688 }, { "auxiliary_loss_clip": 0.06431217, "auxiliary_loss_mlp": 0.01266386, "balance_loss_clip": 0.06282347, "balance_loss_mlp": 0.01256354, "epoch": 0.6904554336389599, "flos": 26107094401920.0, "grad_norm": 1.700570431977542, "language_loss": 0.65562004, "learning_rate": 9.235629099489273e-07, "loss": 0.73259604, "num_input_tokens_seen": 247868950, "router_z_loss_clip": 1.48730469, "router_z_loss_mlp": 0.1003418, "step": 11484, "time_per_iteration": 2.583745241165161 }, { "auxiliary_loss_clip": 0.06425284, "auxiliary_loss_mlp": 0.01267817, "balance_loss_clip": 0.06283271, "balance_loss_mlp": 0.01257601, "epoch": 0.6905155568916278, "flos": 31179127023360.0, "grad_norm": 1.83244532097592, "language_loss": 0.739012, "learning_rate": 9.232346899854479e-07, "loss": 0.815943, "num_input_tokens_seen": 247889805, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.10217285, "step": 11485, "time_per_iteration": 4.099102973937988 }, { "auxiliary_loss_clip": 0.06430405, "auxiliary_loss_mlp": 0.0126617, "balance_loss_clip": 0.06283395, "balance_loss_mlp": 0.01255441, "epoch": 0.6905756801442958, "flos": 17645484090240.0, "grad_norm": 1.7164688899535818, "language_loss": 0.85255861, "learning_rate": 9.22906510853017e-07, "loss": 0.9295243, "num_input_tokens_seen": 247908585, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.10736084, "step": 11486, "time_per_iteration": 2.5456981658935547 }, { "auxiliary_loss_clip": 0.06430697, "auxiliary_loss_mlp": 0.01268563, "balance_loss_clip": 0.06284578, "balance_loss_mlp": 0.01258514, "epoch": 0.6906358033969637, "flos": 22349836995840.0, "grad_norm": 1.5594570874490492, "language_loss": 0.73062593, "learning_rate": 9.225783725640786e-07, "loss": 0.8076185, "num_input_tokens_seen": 247928480, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.1005249, "step": 11487, "time_per_iteration": 2.5475144386291504 }, { "auxiliary_loss_clip": 0.06316654, "auxiliary_loss_mlp": 0.01251069, "balance_loss_clip": 0.06258553, "balance_loss_mlp": 0.01249775, "epoch": 0.6906959266496318, "flos": 69769485573120.0, "grad_norm": 0.8802223128919322, "language_loss": 0.66538268, "learning_rate": 9.222502751310759e-07, "loss": 0.7410599, "num_input_tokens_seen": 247988855, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.01294708, "step": 11488, "time_per_iteration": 3.1890218257904053 }, { "auxiliary_loss_clip": 0.06436887, "auxiliary_loss_mlp": 0.01272173, "balance_loss_clip": 0.06284069, "balance_loss_mlp": 0.01260419, "epoch": 0.6907560499022997, "flos": 21440700195840.0, "grad_norm": 1.7326508847539706, "language_loss": 0.75063789, "learning_rate": 9.219222185664519e-07, "loss": 0.82772845, "num_input_tokens_seen": 248007685, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.11737061, "step": 11489, "time_per_iteration": 2.526134967803955 }, { "auxiliary_loss_clip": 0.06429993, "auxiliary_loss_mlp": 0.01270001, "balance_loss_clip": 0.06282728, "balance_loss_mlp": 0.01258992, "epoch": 0.6908161731549677, "flos": 14397427146240.0, "grad_norm": 2.134180570591563, "language_loss": 0.62683928, "learning_rate": 9.215942028826445e-07, "loss": 0.70383918, "num_input_tokens_seen": 248025145, "router_z_loss_clip": 1.47363281, "router_z_loss_mlp": 0.11004639, "step": 11490, "time_per_iteration": 2.5509467124938965 }, { "auxiliary_loss_clip": 0.06436408, "auxiliary_loss_mlp": 0.0126626, "balance_loss_clip": 0.0628936, "balance_loss_mlp": 0.01256193, "epoch": 0.6908762964076357, "flos": 20017122053760.0, "grad_norm": 1.673888456405917, "language_loss": 0.72978091, "learning_rate": 9.212662280920937e-07, "loss": 0.80680752, "num_input_tokens_seen": 248043750, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.10076904, "step": 11491, "time_per_iteration": 2.546468496322632 }, { "auxiliary_loss_clip": 0.06421215, "auxiliary_loss_mlp": 0.01266736, "balance_loss_clip": 0.06279144, "balance_loss_mlp": 0.01255757, "epoch": 0.6909364196603036, "flos": 28776951446400.0, "grad_norm": 1.3442589592490546, "language_loss": 0.70497006, "learning_rate": 9.20938294207235e-07, "loss": 0.78184956, "num_input_tokens_seen": 248065765, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.10980225, "step": 11492, "time_per_iteration": 2.6252803802490234 }, { "auxiliary_loss_clip": 0.06440071, "auxiliary_loss_mlp": 0.01272151, "balance_loss_clip": 0.06286816, "balance_loss_mlp": 0.01260618, "epoch": 0.6909965429129716, "flos": 22534641175680.0, "grad_norm": 1.9413753260463082, "language_loss": 0.75236112, "learning_rate": 9.206104012405049e-07, "loss": 0.82948333, "num_input_tokens_seen": 248083810, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.11529541, "step": 11493, "time_per_iteration": 2.573373317718506 }, { "auxiliary_loss_clip": 0.06425241, "auxiliary_loss_mlp": 0.01267448, "balance_loss_clip": 0.06280611, "balance_loss_mlp": 0.01256892, "epoch": 0.6910566661656395, "flos": 18411884259840.0, "grad_norm": 1.8115550686947697, "language_loss": 0.74514604, "learning_rate": 9.20282549204336e-07, "loss": 0.82207298, "num_input_tokens_seen": 248103185, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.10546875, "step": 11494, "time_per_iteration": 2.531420946121216 }, { "auxiliary_loss_clip": 0.06427462, "auxiliary_loss_mlp": 0.01267469, "balance_loss_clip": 0.06282275, "balance_loss_mlp": 0.01257176, "epoch": 0.6911167894183076, "flos": 30781874016000.0, "grad_norm": 2.532440948219052, "language_loss": 0.6829561, "learning_rate": 9.19954738111161e-07, "loss": 0.7599054, "num_input_tokens_seen": 248125665, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.10290527, "step": 11495, "time_per_iteration": 2.6715824604034424 }, { "auxiliary_loss_clip": 0.06427284, "auxiliary_loss_mlp": 0.01266576, "balance_loss_clip": 0.06281693, "balance_loss_mlp": 0.0125546, "epoch": 0.6911769126709755, "flos": 13740878079360.0, "grad_norm": 1.634068494758555, "language_loss": 0.74362504, "learning_rate": 9.196269679734119e-07, "loss": 0.82056367, "num_input_tokens_seen": 248142545, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.11117554, "step": 11496, "time_per_iteration": 2.529391288757324 }, { "auxiliary_loss_clip": 0.06423086, "auxiliary_loss_mlp": 0.0126479, "balance_loss_clip": 0.06279685, "balance_loss_mlp": 0.01255438, "epoch": 0.6912370359236435, "flos": 17572669292160.0, "grad_norm": 1.5953950463694861, "language_loss": 0.80002737, "learning_rate": 9.19299238803515e-07, "loss": 0.87690616, "num_input_tokens_seen": 248160225, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.09356689, "step": 11497, "time_per_iteration": 2.526064157485962 }, { "auxiliary_loss_clip": 0.06432939, "auxiliary_loss_mlp": 0.01268244, "balance_loss_clip": 0.06282693, "balance_loss_mlp": 0.01256907, "epoch": 0.6912971591763114, "flos": 22097291189760.0, "grad_norm": 1.5349813342448342, "language_loss": 0.81077015, "learning_rate": 9.189715506138993e-07, "loss": 0.88778198, "num_input_tokens_seen": 248180430, "router_z_loss_clip": 1.50195312, "router_z_loss_mlp": 0.11340332, "step": 11498, "time_per_iteration": 2.580130100250244 }, { "auxiliary_loss_clip": 0.06421506, "auxiliary_loss_mlp": 0.01263417, "balance_loss_clip": 0.06279169, "balance_loss_mlp": 0.01252968, "epoch": 0.6913572824289794, "flos": 29979276082560.0, "grad_norm": 1.406986603164111, "language_loss": 0.8590712, "learning_rate": 9.186439034169915e-07, "loss": 0.93592042, "num_input_tokens_seen": 248202365, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10449219, "step": 11499, "time_per_iteration": 2.641296148300171 }, { "auxiliary_loss_clip": 0.06424283, "auxiliary_loss_mlp": 0.01264216, "balance_loss_clip": 0.0628318, "balance_loss_mlp": 0.01254125, "epoch": 0.6914174056816473, "flos": 20455184799360.0, "grad_norm": 1.5141244566193437, "language_loss": 0.75606084, "learning_rate": 9.183162972252145e-07, "loss": 0.83294582, "num_input_tokens_seen": 248221750, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.10095215, "step": 11500, "time_per_iteration": 2.5470781326293945 }, { "auxiliary_loss_clip": 0.06429473, "auxiliary_loss_mlp": 0.01265887, "balance_loss_clip": 0.0628389, "balance_loss_mlp": 0.01254681, "epoch": 0.6914775289343154, "flos": 21287984929920.0, "grad_norm": 1.7824956017006406, "language_loss": 0.77634692, "learning_rate": 9.179887320509921e-07, "loss": 0.85330057, "num_input_tokens_seen": 248239535, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.11199951, "step": 11501, "time_per_iteration": 2.5679144859313965 }, { "auxiliary_loss_clip": 0.06430455, "auxiliary_loss_mlp": 0.01267626, "balance_loss_clip": 0.06283008, "balance_loss_mlp": 0.01257326, "epoch": 0.6915376521869833, "flos": 23884859468160.0, "grad_norm": 1.816449050183039, "language_loss": 0.73794663, "learning_rate": 9.176612079067458e-07, "loss": 0.8149274, "num_input_tokens_seen": 248259055, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.10302734, "step": 11502, "time_per_iteration": 2.5832464694976807 }, { "auxiliary_loss_clip": 0.06425149, "auxiliary_loss_mlp": 0.01266816, "balance_loss_clip": 0.06278085, "balance_loss_mlp": 0.01255658, "epoch": 0.6915977754396513, "flos": 11515079347200.0, "grad_norm": 1.9361069002793136, "language_loss": 0.73788476, "learning_rate": 9.173337248048953e-07, "loss": 0.81480449, "num_input_tokens_seen": 248276765, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.11157227, "step": 11503, "time_per_iteration": 2.5617153644561768 }, { "auxiliary_loss_clip": 0.06421874, "auxiliary_loss_mlp": 0.01265184, "balance_loss_clip": 0.06277864, "balance_loss_mlp": 0.01254855, "epoch": 0.6916578986923193, "flos": 22607833317120.0, "grad_norm": 1.881465069974745, "language_loss": 0.77264351, "learning_rate": 9.170062827578575e-07, "loss": 0.84951407, "num_input_tokens_seen": 248295310, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10333252, "step": 11504, "time_per_iteration": 4.074572563171387 }, { "auxiliary_loss_clip": 0.0642703, "auxiliary_loss_mlp": 0.01263161, "balance_loss_clip": 0.06281722, "balance_loss_mlp": 0.01252724, "epoch": 0.6917180219449872, "flos": 23484126516480.0, "grad_norm": 1.7786435132963487, "language_loss": 0.74151754, "learning_rate": 9.166788817780499e-07, "loss": 0.81841946, "num_input_tokens_seen": 248315230, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10443115, "step": 11505, "time_per_iteration": 2.616088628768921 }, { "auxiliary_loss_clip": 0.06421912, "auxiliary_loss_mlp": 0.01267024, "balance_loss_clip": 0.06279109, "balance_loss_mlp": 0.01256808, "epoch": 0.6917781451976552, "flos": 23739313726080.0, "grad_norm": 1.8401170329808654, "language_loss": 0.87733042, "learning_rate": 9.163515218778886e-07, "loss": 0.95421982, "num_input_tokens_seen": 248332980, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.10229492, "step": 11506, "time_per_iteration": 2.736328125 }, { "auxiliary_loss_clip": 0.06429352, "auxiliary_loss_mlp": 0.01266106, "balance_loss_clip": 0.06285001, "balance_loss_mlp": 0.01255675, "epoch": 0.6918382684503231, "flos": 31474704700800.0, "grad_norm": 1.8535163408000865, "language_loss": 0.70424759, "learning_rate": 9.160242030697856e-07, "loss": 0.78120214, "num_input_tokens_seen": 248352865, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.10437012, "step": 11507, "time_per_iteration": 2.778352737426758 }, { "auxiliary_loss_clip": 0.06428374, "auxiliary_loss_mlp": 0.01264976, "balance_loss_clip": 0.06281152, "balance_loss_mlp": 0.01254432, "epoch": 0.6918983917029912, "flos": 21656503186560.0, "grad_norm": 1.8903458514615479, "language_loss": 0.77466214, "learning_rate": 9.156969253661538e-07, "loss": 0.85159564, "num_input_tokens_seen": 248371125, "router_z_loss_clip": 1.46972656, "router_z_loss_mlp": 0.10540771, "step": 11508, "time_per_iteration": 2.5767154693603516 }, { "auxiliary_loss_clip": 0.06416783, "auxiliary_loss_mlp": 0.01267122, "balance_loss_clip": 0.06277001, "balance_loss_mlp": 0.01257914, "epoch": 0.6919585149556591, "flos": 25556036025600.0, "grad_norm": 1.5872955119712764, "language_loss": 0.7475245, "learning_rate": 9.153696887794027e-07, "loss": 0.82436353, "num_input_tokens_seen": 248390455, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09216309, "step": 11509, "time_per_iteration": 2.5739641189575195 }, { "auxiliary_loss_clip": 0.06418951, "auxiliary_loss_mlp": 0.01264277, "balance_loss_clip": 0.06277663, "balance_loss_mlp": 0.01254668, "epoch": 0.6920186382083271, "flos": 23666582782080.0, "grad_norm": 2.224575507961604, "language_loss": 0.63960123, "learning_rate": 9.150424933219425e-07, "loss": 0.71643353, "num_input_tokens_seen": 248411305, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.0960083, "step": 11510, "time_per_iteration": 2.5631003379821777 }, { "auxiliary_loss_clip": 0.06433476, "auxiliary_loss_mlp": 0.01270659, "balance_loss_clip": 0.06282811, "balance_loss_mlp": 0.01259442, "epoch": 0.692078761460995, "flos": 19067888275200.0, "grad_norm": 1.5849531686268015, "language_loss": 0.7558741, "learning_rate": 9.147153390061788e-07, "loss": 0.83291543, "num_input_tokens_seen": 248430190, "router_z_loss_clip": 1.50488281, "router_z_loss_mlp": 0.11218262, "step": 11511, "time_per_iteration": 2.5289194583892822 }, { "auxiliary_loss_clip": 0.06423487, "auxiliary_loss_mlp": 0.01266326, "balance_loss_clip": 0.06281839, "balance_loss_mlp": 0.01256724, "epoch": 0.692138884713663, "flos": 29031006625920.0, "grad_norm": 1.4541271425105682, "language_loss": 0.62683249, "learning_rate": 9.143882258445184e-07, "loss": 0.70373058, "num_input_tokens_seen": 248450830, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09594727, "step": 11512, "time_per_iteration": 2.6102986335754395 }, { "auxiliary_loss_clip": 0.06424228, "auxiliary_loss_mlp": 0.01269034, "balance_loss_clip": 0.06278306, "balance_loss_mlp": 0.01258598, "epoch": 0.6921990079663309, "flos": 14763262072320.0, "grad_norm": 2.391241423877175, "language_loss": 0.83224571, "learning_rate": 9.140611538493666e-07, "loss": 0.90917832, "num_input_tokens_seen": 248468585, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10430908, "step": 11513, "time_per_iteration": 4.097963571548462 }, { "auxiliary_loss_clip": 0.06418414, "auxiliary_loss_mlp": 0.01264127, "balance_loss_clip": 0.06276786, "balance_loss_mlp": 0.01254852, "epoch": 0.692259131218999, "flos": 23848619777280.0, "grad_norm": 1.4153745915303364, "language_loss": 0.78082216, "learning_rate": 9.137341230331233e-07, "loss": 0.8576476, "num_input_tokens_seen": 248490535, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09265137, "step": 11514, "time_per_iteration": 2.5774147510528564 }, { "auxiliary_loss_clip": 0.06425049, "auxiliary_loss_mlp": 0.01265079, "balance_loss_clip": 0.06276892, "balance_loss_mlp": 0.01254881, "epoch": 0.6923192544716669, "flos": 19141038489600.0, "grad_norm": 2.0848549806114534, "language_loss": 0.75390303, "learning_rate": 9.134071334081907e-07, "loss": 0.83080435, "num_input_tokens_seen": 248508575, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.10198975, "step": 11515, "time_per_iteration": 2.5457725524902344 }, { "auxiliary_loss_clip": 0.06415517, "auxiliary_loss_mlp": 0.0126465, "balance_loss_clip": 0.0627564, "balance_loss_mlp": 0.01255506, "epoch": 0.6923793777243349, "flos": 28082192117760.0, "grad_norm": 2.7132284631640577, "language_loss": 0.53519535, "learning_rate": 9.130801849869694e-07, "loss": 0.61199701, "num_input_tokens_seen": 248527025, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09143066, "step": 11516, "time_per_iteration": 2.591193437576294 }, { "auxiliary_loss_clip": 0.06412347, "auxiliary_loss_mlp": 0.01267711, "balance_loss_clip": 0.06275615, "balance_loss_mlp": 0.01257113, "epoch": 0.6924395009770029, "flos": 16586818479360.0, "grad_norm": 1.656089670780939, "language_loss": 0.73397136, "learning_rate": 9.127532777818557e-07, "loss": 0.81077188, "num_input_tokens_seen": 248544275, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.105896, "step": 11517, "time_per_iteration": 2.5332300662994385 }, { "auxiliary_loss_clip": 0.06427693, "auxiliary_loss_mlp": 0.01272723, "balance_loss_clip": 0.06282794, "balance_loss_mlp": 0.01262096, "epoch": 0.6924996242296708, "flos": 16661058796800.0, "grad_norm": 1.8749242603997698, "language_loss": 0.76612175, "learning_rate": 9.124264118052465e-07, "loss": 0.84312588, "num_input_tokens_seen": 248561870, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.10626221, "step": 11518, "time_per_iteration": 2.528208017349243 }, { "auxiliary_loss_clip": 0.06433636, "auxiliary_loss_mlp": 0.01272933, "balance_loss_clip": 0.0628367, "balance_loss_mlp": 0.01261256, "epoch": 0.6925597474823388, "flos": 34763277893760.0, "grad_norm": 1.3844765819794425, "language_loss": 0.64624417, "learning_rate": 9.120995870695376e-07, "loss": 0.72330987, "num_input_tokens_seen": 248588190, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.11676025, "step": 11519, "time_per_iteration": 2.6978611946105957 }, { "auxiliary_loss_clip": 0.06418931, "auxiliary_loss_mlp": 0.01265385, "balance_loss_clip": 0.06274442, "balance_loss_mlp": 0.01254686, "epoch": 0.6926198707350067, "flos": 21878175962880.0, "grad_norm": 1.9184485427251834, "language_loss": 0.62895757, "learning_rate": 9.117728035871212e-07, "loss": 0.70580077, "num_input_tokens_seen": 248606460, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.10693359, "step": 11520, "time_per_iteration": 2.5320310592651367 }, { "auxiliary_loss_clip": 0.06430411, "auxiliary_loss_mlp": 0.01270814, "balance_loss_clip": 0.06279082, "balance_loss_mlp": 0.01259561, "epoch": 0.6926799939876748, "flos": 13011346506240.0, "grad_norm": 1.7439547959839137, "language_loss": 0.78191721, "learning_rate": 9.114460613703887e-07, "loss": 0.85892951, "num_input_tokens_seen": 248623715, "router_z_loss_clip": 1.51269531, "router_z_loss_mlp": 0.11248779, "step": 11521, "time_per_iteration": 3.956700563430786 }, { "auxiliary_loss_clip": 0.06424055, "auxiliary_loss_mlp": 0.01266427, "balance_loss_clip": 0.06276615, "balance_loss_mlp": 0.01254953, "epoch": 0.6927401172403427, "flos": 16766423706240.0, "grad_norm": 1.758171801971639, "language_loss": 0.8176868, "learning_rate": 9.111193604317304e-07, "loss": 0.89459163, "num_input_tokens_seen": 248640575, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.11468506, "step": 11522, "time_per_iteration": 2.5145206451416016 }, { "auxiliary_loss_clip": 0.06422603, "auxiliary_loss_mlp": 0.01263786, "balance_loss_clip": 0.06281329, "balance_loss_mlp": 0.01253868, "epoch": 0.6928002404930107, "flos": 25713237484800.0, "grad_norm": 1.323733436873897, "language_loss": 0.76896191, "learning_rate": 9.107927007835361e-07, "loss": 0.84582579, "num_input_tokens_seen": 248663535, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09918213, "step": 11523, "time_per_iteration": 2.631340742111206 }, { "auxiliary_loss_clip": 0.06419, "auxiliary_loss_mlp": 0.01264308, "balance_loss_clip": 0.06277382, "balance_loss_mlp": 0.01255362, "epoch": 0.6928603637456786, "flos": 18594214744320.0, "grad_norm": 1.7810076400189956, "language_loss": 0.68681443, "learning_rate": 9.104660824381915e-07, "loss": 0.7636475, "num_input_tokens_seen": 248681125, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.08935547, "step": 11524, "time_per_iteration": 2.524104595184326 }, { "auxiliary_loss_clip": 0.06427561, "auxiliary_loss_mlp": 0.01267311, "balance_loss_clip": 0.06279196, "balance_loss_mlp": 0.01256063, "epoch": 0.6929204869983466, "flos": 22207519635840.0, "grad_norm": 1.7398429923112673, "language_loss": 0.64421165, "learning_rate": 9.101395054080815e-07, "loss": 0.72116041, "num_input_tokens_seen": 248700555, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.11248779, "step": 11525, "time_per_iteration": 3.8933265209198 }, { "auxiliary_loss_clip": 0.06429759, "auxiliary_loss_mlp": 0.01270316, "balance_loss_clip": 0.06285679, "balance_loss_mlp": 0.01260094, "epoch": 0.6929806102510145, "flos": 17900545518720.0, "grad_norm": 1.9563969932385206, "language_loss": 0.7092129, "learning_rate": 9.098129697055907e-07, "loss": 0.78621364, "num_input_tokens_seen": 248716095, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10217285, "step": 11526, "time_per_iteration": 2.54545259475708 }, { "auxiliary_loss_clip": 0.06422652, "auxiliary_loss_mlp": 0.01266385, "balance_loss_clip": 0.06279407, "balance_loss_mlp": 0.01256073, "epoch": 0.6930407335036826, "flos": 19761222084480.0, "grad_norm": 1.5670361125543313, "language_loss": 0.76474822, "learning_rate": 9.094864753431022e-07, "loss": 0.84163857, "num_input_tokens_seen": 248735330, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.10296631, "step": 11527, "time_per_iteration": 2.5617737770080566 }, { "auxiliary_loss_clip": 0.06422982, "auxiliary_loss_mlp": 0.01264, "balance_loss_clip": 0.06278996, "balance_loss_mlp": 0.01254517, "epoch": 0.6931008567563505, "flos": 21550802860800.0, "grad_norm": 1.880733485944256, "language_loss": 0.79901016, "learning_rate": 9.091600223329952e-07, "loss": 0.87587988, "num_input_tokens_seen": 248754530, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.09484863, "step": 11528, "time_per_iteration": 2.5630407333374023 }, { "auxiliary_loss_clip": 0.06415845, "auxiliary_loss_mlp": 0.01268516, "balance_loss_clip": 0.06277852, "balance_loss_mlp": 0.01258764, "epoch": 0.6931609800090185, "flos": 26257210191360.0, "grad_norm": 1.3086717516737996, "language_loss": 0.75860989, "learning_rate": 9.088336106876491e-07, "loss": 0.83545345, "num_input_tokens_seen": 248775825, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09747314, "step": 11529, "time_per_iteration": 2.5871081352233887 }, { "auxiliary_loss_clip": 0.06421645, "auxiliary_loss_mlp": 0.01265919, "balance_loss_clip": 0.06280083, "balance_loss_mlp": 0.01256293, "epoch": 0.6932211032616865, "flos": 32351626805760.0, "grad_norm": 2.285913534283708, "language_loss": 0.72873425, "learning_rate": 9.085072404194436e-07, "loss": 0.80560988, "num_input_tokens_seen": 248796180, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09631348, "step": 11530, "time_per_iteration": 2.6502742767333984 }, { "auxiliary_loss_clip": 0.06430949, "auxiliary_loss_mlp": 0.01267814, "balance_loss_clip": 0.06280643, "balance_loss_mlp": 0.01256388, "epoch": 0.6932812265143544, "flos": 22054720515840.0, "grad_norm": 1.6257057044077543, "language_loss": 0.78327751, "learning_rate": 9.081809115407513e-07, "loss": 0.86026514, "num_input_tokens_seen": 248814735, "router_z_loss_clip": 1.50195312, "router_z_loss_mlp": 0.11425781, "step": 11531, "time_per_iteration": 2.5636935234069824 }, { "auxiliary_loss_clip": 0.06418259, "auxiliary_loss_mlp": 0.01263918, "balance_loss_clip": 0.06278257, "balance_loss_mlp": 0.01254429, "epoch": 0.6933413497670224, "flos": 26264924766720.0, "grad_norm": 1.331158185348222, "language_loss": 0.69421995, "learning_rate": 9.078546240639484e-07, "loss": 0.77104175, "num_input_tokens_seen": 248839140, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09484863, "step": 11532, "time_per_iteration": 2.6223368644714355 }, { "auxiliary_loss_clip": 0.06426986, "auxiliary_loss_mlp": 0.01263909, "balance_loss_clip": 0.06280558, "balance_loss_mlp": 0.01252995, "epoch": 0.6934014730196904, "flos": 19579059308160.0, "grad_norm": 1.2735189291949613, "language_loss": 0.67060781, "learning_rate": 9.075283780014082e-07, "loss": 0.74751675, "num_input_tokens_seen": 248858300, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.10913086, "step": 11533, "time_per_iteration": 2.5330114364624023 }, { "auxiliary_loss_clip": 0.06425419, "auxiliary_loss_mlp": 0.01265386, "balance_loss_clip": 0.06279034, "balance_loss_mlp": 0.0125452, "epoch": 0.6934615962723584, "flos": 22124432712960.0, "grad_norm": 3.0641551774907256, "language_loss": 0.59923494, "learning_rate": 9.072021733655007e-07, "loss": 0.67614299, "num_input_tokens_seen": 248876310, "router_z_loss_clip": 1.46289062, "router_z_loss_mlp": 0.10870361, "step": 11534, "time_per_iteration": 2.544207811355591 }, { "auxiliary_loss_clip": 0.06423485, "auxiliary_loss_mlp": 0.01265556, "balance_loss_clip": 0.0627955, "balance_loss_mlp": 0.01255108, "epoch": 0.6935217195250263, "flos": 21367172638080.0, "grad_norm": 1.9721248818088684, "language_loss": 0.71439755, "learning_rate": 9.068760101685971e-07, "loss": 0.79128802, "num_input_tokens_seen": 248895650, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10455322, "step": 11535, "time_per_iteration": 2.525709629058838 }, { "auxiliary_loss_clip": 0.06319766, "auxiliary_loss_mlp": 0.01252953, "balance_loss_clip": 0.06261224, "balance_loss_mlp": 0.0125144, "epoch": 0.6935818427776943, "flos": 64085864400000.0, "grad_norm": 0.6900770543129546, "language_loss": 0.58953834, "learning_rate": 9.065498884230638e-07, "loss": 0.6652655, "num_input_tokens_seen": 248963920, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.01512909, "step": 11536, "time_per_iteration": 3.272197961807251 }, { "auxiliary_loss_clip": 0.06426896, "auxiliary_loss_mlp": 0.01269, "balance_loss_clip": 0.06280232, "balance_loss_mlp": 0.01258277, "epoch": 0.6936419660303622, "flos": 20308716662400.0, "grad_norm": 1.490242372713176, "language_loss": 0.72910881, "learning_rate": 9.062238081412692e-07, "loss": 0.80606782, "num_input_tokens_seen": 248983380, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.10717773, "step": 11537, "time_per_iteration": 2.550913095474243 }, { "auxiliary_loss_clip": 0.06321108, "auxiliary_loss_mlp": 0.01252662, "balance_loss_clip": 0.06262383, "balance_loss_mlp": 0.01251293, "epoch": 0.6937020892830302, "flos": 67201974691200.0, "grad_norm": 0.7499518749308587, "language_loss": 0.55673599, "learning_rate": 9.058977693355767e-07, "loss": 0.63247371, "num_input_tokens_seen": 249044680, "router_z_loss_clip": 0.58642578, "router_z_loss_mlp": 0.01371765, "step": 11538, "time_per_iteration": 3.165757894515991 }, { "auxiliary_loss_clip": 0.06408979, "auxiliary_loss_mlp": 0.01262568, "balance_loss_clip": 0.06273776, "balance_loss_mlp": 0.01253472, "epoch": 0.6937622125356981, "flos": 23884943322240.0, "grad_norm": 1.7551032331499172, "language_loss": 0.77848768, "learning_rate": 9.055717720183505e-07, "loss": 0.85520303, "num_input_tokens_seen": 249061060, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.09094238, "step": 11539, "time_per_iteration": 2.5981993675231934 }, { "auxiliary_loss_clip": 0.06418183, "auxiliary_loss_mlp": 0.01262436, "balance_loss_clip": 0.06277309, "balance_loss_mlp": 0.01252923, "epoch": 0.6938223357883662, "flos": 28738154206080.0, "grad_norm": 1.7842630293201638, "language_loss": 0.64198667, "learning_rate": 9.05245816201953e-07, "loss": 0.71879286, "num_input_tokens_seen": 249081430, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09503174, "step": 11540, "time_per_iteration": 2.5934362411499023 }, { "auxiliary_loss_clip": 0.06417106, "auxiliary_loss_mlp": 0.01263909, "balance_loss_clip": 0.06276309, "balance_loss_mlp": 0.01254581, "epoch": 0.6938824590410341, "flos": 28662111025920.0, "grad_norm": 1.452639837590886, "language_loss": 0.86641723, "learning_rate": 9.049199018987437e-07, "loss": 0.94322735, "num_input_tokens_seen": 249103020, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09332275, "step": 11541, "time_per_iteration": 2.7221648693084717 }, { "auxiliary_loss_clip": 0.06422658, "auxiliary_loss_mlp": 0.01265226, "balance_loss_clip": 0.06278469, "balance_loss_mlp": 0.01255064, "epoch": 0.6939425822937021, "flos": 18987987807360.0, "grad_norm": 1.7972168861330209, "language_loss": 0.84300643, "learning_rate": 9.04594029121081e-07, "loss": 0.91988528, "num_input_tokens_seen": 249120810, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.1015625, "step": 11542, "time_per_iteration": 2.542616128921509 }, { "auxiliary_loss_clip": 0.06425156, "auxiliary_loss_mlp": 0.01265082, "balance_loss_clip": 0.0627792, "balance_loss_mlp": 0.01254044, "epoch": 0.6940027055463701, "flos": 23082513096960.0, "grad_norm": 1.828583487050809, "language_loss": 0.75847596, "learning_rate": 9.04268197881323e-07, "loss": 0.83537829, "num_input_tokens_seen": 249138050, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.11029053, "step": 11543, "time_per_iteration": 2.5786125659942627 }, { "auxiliary_loss_clip": 0.06416805, "auxiliary_loss_mlp": 0.01265674, "balance_loss_clip": 0.06273983, "balance_loss_mlp": 0.01255732, "epoch": 0.694062828799038, "flos": 18192391689600.0, "grad_norm": 1.5570968403015841, "language_loss": 0.76075518, "learning_rate": 9.039424081918241e-07, "loss": 0.83757997, "num_input_tokens_seen": 249155570, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.0994873, "step": 11544, "time_per_iteration": 3.9660415649414062 }, { "auxiliary_loss_clip": 0.06420632, "auxiliary_loss_mlp": 0.01268312, "balance_loss_clip": 0.06276426, "balance_loss_mlp": 0.01258007, "epoch": 0.694122952051706, "flos": 17827269523200.0, "grad_norm": 1.6601992068416347, "language_loss": 0.71283478, "learning_rate": 9.036166600649388e-07, "loss": 0.78972423, "num_input_tokens_seen": 249172960, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.10308838, "step": 11545, "time_per_iteration": 2.5210719108581543 }, { "auxiliary_loss_clip": 0.0641496, "auxiliary_loss_mlp": 0.01262163, "balance_loss_clip": 0.06277533, "balance_loss_mlp": 0.01253413, "epoch": 0.694183075304374, "flos": 21221710750080.0, "grad_norm": 1.5196980672904277, "language_loss": 0.79792678, "learning_rate": 9.0329095351302e-07, "loss": 0.87469798, "num_input_tokens_seen": 249192450, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.08746338, "step": 11546, "time_per_iteration": 2.54848313331604 }, { "auxiliary_loss_clip": 0.06422289, "auxiliary_loss_mlp": 0.01266302, "balance_loss_clip": 0.06279218, "balance_loss_mlp": 0.01256217, "epoch": 0.694243198557042, "flos": 24067273806720.0, "grad_norm": 1.455175118057219, "language_loss": 0.79058278, "learning_rate": 9.029652885484194e-07, "loss": 0.86746871, "num_input_tokens_seen": 249214320, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.10083008, "step": 11547, "time_per_iteration": 2.594287395477295 }, { "auxiliary_loss_clip": 0.06416617, "auxiliary_loss_mlp": 0.0126585, "balance_loss_clip": 0.06275858, "balance_loss_mlp": 0.01255741, "epoch": 0.6943033218097099, "flos": 21148183192320.0, "grad_norm": 1.8254083240575292, "language_loss": 0.81230456, "learning_rate": 9.026396651834834e-07, "loss": 0.88912928, "num_input_tokens_seen": 249230925, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.10107422, "step": 11548, "time_per_iteration": 2.5265495777130127 }, { "auxiliary_loss_clip": 0.06317414, "auxiliary_loss_mlp": 0.01250618, "balance_loss_clip": 0.06259078, "balance_loss_mlp": 0.01249369, "epoch": 0.6943634450623779, "flos": 57830892163200.0, "grad_norm": 0.8561043171542084, "language_loss": 0.53489161, "learning_rate": 9.023140834305613e-07, "loss": 0.61057192, "num_input_tokens_seen": 249293975, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.01248169, "step": 11549, "time_per_iteration": 3.15749454498291 }, { "auxiliary_loss_clip": 0.06417121, "auxiliary_loss_mlp": 0.01264017, "balance_loss_clip": 0.06274907, "balance_loss_mlp": 0.01253795, "epoch": 0.6944235683150458, "flos": 30598411501440.0, "grad_norm": 1.283468636344261, "language_loss": 0.73960602, "learning_rate": 9.01988543302e-07, "loss": 0.81641746, "num_input_tokens_seen": 249315285, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10229492, "step": 11550, "time_per_iteration": 2.634185791015625 }, { "auxiliary_loss_clip": 0.06424987, "auxiliary_loss_mlp": 0.01267369, "balance_loss_clip": 0.06277211, "balance_loss_mlp": 0.01256724, "epoch": 0.6944836915677138, "flos": 19725611299200.0, "grad_norm": 1.7044664538575935, "language_loss": 0.74650943, "learning_rate": 9.016630448101425e-07, "loss": 0.82343298, "num_input_tokens_seen": 249333505, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.10650635, "step": 11551, "time_per_iteration": 2.5387158393859863 }, { "auxiliary_loss_clip": 0.06423943, "auxiliary_loss_mlp": 0.01268896, "balance_loss_clip": 0.06279396, "balance_loss_mlp": 0.01258185, "epoch": 0.6945438148203817, "flos": 24870542572800.0, "grad_norm": 1.448094255944053, "language_loss": 0.8432042, "learning_rate": 9.01337587967333e-07, "loss": 0.92013258, "num_input_tokens_seen": 249354180, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.1071167, "step": 11552, "time_per_iteration": 4.034431219100952 }, { "auxiliary_loss_clip": 0.06422739, "auxiliary_loss_mlp": 0.01270681, "balance_loss_clip": 0.06278959, "balance_loss_mlp": 0.01260327, "epoch": 0.6946039380730498, "flos": 33334752360960.0, "grad_norm": 1.729986057701332, "language_loss": 0.67811483, "learning_rate": 9.010121727859117e-07, "loss": 0.75504899, "num_input_tokens_seen": 249377035, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10357666, "step": 11553, "time_per_iteration": 2.819788694381714 }, { "auxiliary_loss_clip": 0.06430331, "auxiliary_loss_mlp": 0.01265054, "balance_loss_clip": 0.06281553, "balance_loss_mlp": 0.01254302, "epoch": 0.6946640613257177, "flos": 20857385197440.0, "grad_norm": 2.090057669768035, "language_loss": 0.7958082, "learning_rate": 9.006867992782195e-07, "loss": 0.87276202, "num_input_tokens_seen": 249396155, "router_z_loss_clip": 1.48925781, "router_z_loss_mlp": 0.10742188, "step": 11554, "time_per_iteration": 2.729779005050659 }, { "auxiliary_loss_clip": 0.06421702, "auxiliary_loss_mlp": 0.01267692, "balance_loss_clip": 0.06276257, "balance_loss_mlp": 0.01257541, "epoch": 0.6947241845783857, "flos": 19360992257280.0, "grad_norm": 1.725006550615799, "language_loss": 0.72825873, "learning_rate": 9.003614674565934e-07, "loss": 0.80515265, "num_input_tokens_seen": 249414555, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.1015625, "step": 11555, "time_per_iteration": 2.56168794631958 }, { "auxiliary_loss_clip": 0.06416383, "auxiliary_loss_mlp": 0.01265901, "balance_loss_clip": 0.06274205, "balance_loss_mlp": 0.01255726, "epoch": 0.6947843078310536, "flos": 27126669283200.0, "grad_norm": 1.727255071915, "language_loss": 0.78025681, "learning_rate": 9.000361773333705e-07, "loss": 0.85707963, "num_input_tokens_seen": 249433570, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.10180664, "step": 11556, "time_per_iteration": 2.605041265487671 }, { "auxiliary_loss_clip": 0.0642114, "auxiliary_loss_mlp": 0.01263844, "balance_loss_clip": 0.06277321, "balance_loss_mlp": 0.01254367, "epoch": 0.6948444310837216, "flos": 28592692318080.0, "grad_norm": 2.0040763307341405, "language_loss": 0.60910296, "learning_rate": 8.997109289208869e-07, "loss": 0.68595278, "num_input_tokens_seen": 249453735, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.09472656, "step": 11557, "time_per_iteration": 2.6232714653015137 }, { "auxiliary_loss_clip": 0.06414646, "auxiliary_loss_mlp": 0.01265588, "balance_loss_clip": 0.06275278, "balance_loss_mlp": 0.01255998, "epoch": 0.6949045543363896, "flos": 15674704859520.0, "grad_norm": 1.6847171315007472, "language_loss": 0.85697788, "learning_rate": 8.993857222314752e-07, "loss": 0.93378019, "num_input_tokens_seen": 249470805, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09588623, "step": 11558, "time_per_iteration": 2.5230908393859863 }, { "auxiliary_loss_clip": 0.06425305, "auxiliary_loss_mlp": 0.01268848, "balance_loss_clip": 0.06278708, "balance_loss_mlp": 0.01258298, "epoch": 0.6949646775890576, "flos": 23266311027840.0, "grad_norm": 1.4411623253734163, "language_loss": 0.70766747, "learning_rate": 8.990605572774664e-07, "loss": 0.78460896, "num_input_tokens_seen": 249491150, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.10552979, "step": 11559, "time_per_iteration": 2.6072962284088135 }, { "auxiliary_loss_clip": 0.06423679, "auxiliary_loss_mlp": 0.01265921, "balance_loss_clip": 0.06281374, "balance_loss_mlp": 0.01256748, "epoch": 0.6950248008417256, "flos": 22389095433600.0, "grad_norm": 1.997537996913751, "language_loss": 0.79126179, "learning_rate": 8.987354340711921e-07, "loss": 0.86815774, "num_input_tokens_seen": 249511560, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.0916748, "step": 11560, "time_per_iteration": 2.5504651069641113 }, { "auxiliary_loss_clip": 0.06417383, "auxiliary_loss_mlp": 0.01265031, "balance_loss_clip": 0.06276105, "balance_loss_mlp": 0.01255863, "epoch": 0.6950849240943935, "flos": 23484126516480.0, "grad_norm": 1.6125455123532675, "language_loss": 0.76924729, "learning_rate": 8.9841035262498e-07, "loss": 0.84607136, "num_input_tokens_seen": 249531910, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09173584, "step": 11561, "time_per_iteration": 4.010195016860962 }, { "auxiliary_loss_clip": 0.06423666, "auxiliary_loss_mlp": 0.01271146, "balance_loss_clip": 0.06282607, "balance_loss_mlp": 0.01260917, "epoch": 0.6951450473470615, "flos": 17426285009280.0, "grad_norm": 1.782498631305868, "language_loss": 0.78982008, "learning_rate": 8.980853129511577e-07, "loss": 0.86676818, "num_input_tokens_seen": 249550300, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.10235596, "step": 11562, "time_per_iteration": 2.5215134620666504 }, { "auxiliary_loss_clip": 0.0642489, "auxiliary_loss_mlp": 0.01266688, "balance_loss_clip": 0.0628009, "balance_loss_mlp": 0.01256472, "epoch": 0.6952051705997294, "flos": 20492053395840.0, "grad_norm": 1.9633194459845091, "language_loss": 0.69418436, "learning_rate": 8.977603150620515e-07, "loss": 0.77110016, "num_input_tokens_seen": 249567740, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.10217285, "step": 11563, "time_per_iteration": 2.548715829849243 }, { "auxiliary_loss_clip": 0.06414977, "auxiliary_loss_mlp": 0.01266155, "balance_loss_clip": 0.06277061, "balance_loss_mlp": 0.01256756, "epoch": 0.6952652938523974, "flos": 13994472061440.0, "grad_norm": 2.5892834790984836, "language_loss": 0.74026859, "learning_rate": 8.974353589699846e-07, "loss": 0.8170799, "num_input_tokens_seen": 249582700, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09399414, "step": 11564, "time_per_iteration": 3.9543628692626953 }, { "auxiliary_loss_clip": 0.0644664, "auxiliary_loss_mlp": 0.0127095, "balance_loss_clip": 0.06289278, "balance_loss_mlp": 0.01258129, "epoch": 0.6953254171050653, "flos": 30961479242880.0, "grad_norm": 2.29864736853458, "language_loss": 0.72208595, "learning_rate": 8.971104446872785e-07, "loss": 0.79926181, "num_input_tokens_seen": 249602920, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.12841797, "step": 11565, "time_per_iteration": 2.628223419189453 }, { "auxiliary_loss_clip": 0.06324503, "auxiliary_loss_mlp": 0.01250918, "balance_loss_clip": 0.06266166, "balance_loss_mlp": 0.01249708, "epoch": 0.6953855403577334, "flos": 61688231671680.0, "grad_norm": 0.9611521024604339, "language_loss": 0.58345836, "learning_rate": 8.96785572226255e-07, "loss": 0.65921259, "num_input_tokens_seen": 249660400, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.01209259, "step": 11566, "time_per_iteration": 3.0227208137512207 }, { "auxiliary_loss_clip": 0.06427293, "auxiliary_loss_mlp": 0.01265269, "balance_loss_clip": 0.06280092, "balance_loss_mlp": 0.01254463, "epoch": 0.6954456636104013, "flos": 23045644500480.0, "grad_norm": 1.7953245754160598, "language_loss": 0.74049485, "learning_rate": 8.964607415992338e-07, "loss": 0.81742048, "num_input_tokens_seen": 249679335, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.1081543, "step": 11567, "time_per_iteration": 2.569514274597168 }, { "auxiliary_loss_clip": 0.06420903, "auxiliary_loss_mlp": 0.01264523, "balance_loss_clip": 0.06280518, "balance_loss_mlp": 0.01254628, "epoch": 0.6955057868630693, "flos": 23925920768640.0, "grad_norm": 1.2310023080791277, "language_loss": 0.76848173, "learning_rate": 8.961359528185313e-07, "loss": 0.84533596, "num_input_tokens_seen": 249701805, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09887695, "step": 11568, "time_per_iteration": 2.609351396560669 }, { "auxiliary_loss_clip": 0.06424, "auxiliary_loss_mlp": 0.01264519, "balance_loss_clip": 0.06282191, "balance_loss_mlp": 0.01255203, "epoch": 0.6955659101157372, "flos": 22600076814720.0, "grad_norm": 1.7641903831945147, "language_loss": 0.72498977, "learning_rate": 8.958112058964649e-07, "loss": 0.801875, "num_input_tokens_seen": 249720550, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09313965, "step": 11569, "time_per_iteration": 2.5695180892944336 }, { "auxiliary_loss_clip": 0.06421967, "auxiliary_loss_mlp": 0.01266162, "balance_loss_clip": 0.06278251, "balance_loss_mlp": 0.0125622, "epoch": 0.6956260333684052, "flos": 24579576869760.0, "grad_norm": 1.8038453788245499, "language_loss": 0.77144861, "learning_rate": 8.954865008453471e-07, "loss": 0.8483299, "num_input_tokens_seen": 249740325, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.0993042, "step": 11570, "time_per_iteration": 2.5826003551483154 }, { "auxiliary_loss_clip": 0.06428307, "auxiliary_loss_mlp": 0.0126592, "balance_loss_clip": 0.0628157, "balance_loss_mlp": 0.01256473, "epoch": 0.6956861566210732, "flos": 25852745733120.0, "grad_norm": 2.0291655577485113, "language_loss": 0.74498582, "learning_rate": 8.95161837677493e-07, "loss": 0.82192814, "num_input_tokens_seen": 249760570, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.09442139, "step": 11571, "time_per_iteration": 2.5903663635253906 }, { "auxiliary_loss_clip": 0.06414373, "auxiliary_loss_mlp": 0.01261652, "balance_loss_clip": 0.06277684, "balance_loss_mlp": 0.01252748, "epoch": 0.6957462798737412, "flos": 15306270456960.0, "grad_norm": 1.658244322944304, "language_loss": 0.749331, "learning_rate": 8.948372164052118e-07, "loss": 0.82609129, "num_input_tokens_seen": 249778290, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.08905029, "step": 11572, "time_per_iteration": 2.5338799953460693 }, { "auxiliary_loss_clip": 0.06421091, "auxiliary_loss_mlp": 0.01266216, "balance_loss_clip": 0.06277049, "balance_loss_mlp": 0.01256745, "epoch": 0.6958064031264092, "flos": 36255645838080.0, "grad_norm": 1.6779563190605336, "language_loss": 0.70364749, "learning_rate": 8.94512637040814e-07, "loss": 0.78052056, "num_input_tokens_seen": 249800925, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.09472656, "step": 11573, "time_per_iteration": 2.6725196838378906 }, { "auxiliary_loss_clip": 0.06433457, "auxiliary_loss_mlp": 0.01266161, "balance_loss_clip": 0.0628427, "balance_loss_mlp": 0.01255498, "epoch": 0.6958665263790771, "flos": 19214817609600.0, "grad_norm": 1.6407758704295046, "language_loss": 0.75052267, "learning_rate": 8.941880995966095e-07, "loss": 0.82751888, "num_input_tokens_seen": 249820500, "router_z_loss_clip": 1.49121094, "router_z_loss_mlp": 0.10662842, "step": 11574, "time_per_iteration": 2.5553925037384033 }, { "auxiliary_loss_clip": 0.06424478, "auxiliary_loss_mlp": 0.01262615, "balance_loss_clip": 0.06277837, "balance_loss_mlp": 0.0125315, "epoch": 0.6959266496317451, "flos": 21801797366400.0, "grad_norm": 1.5798893324539123, "language_loss": 0.74605882, "learning_rate": 8.938636040849014e-07, "loss": 0.82292974, "num_input_tokens_seen": 249839845, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.09460449, "step": 11575, "time_per_iteration": 2.5504369735717773 }, { "auxiliary_loss_clip": 0.06421746, "auxiliary_loss_mlp": 0.01266784, "balance_loss_clip": 0.06277344, "balance_loss_mlp": 0.01256663, "epoch": 0.695986772884413, "flos": 20564490850560.0, "grad_norm": 1.80249169435516, "language_loss": 0.78770459, "learning_rate": 8.935391505179966e-07, "loss": 0.86458981, "num_input_tokens_seen": 249857400, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10125732, "step": 11576, "time_per_iteration": 2.548293352127075 }, { "auxiliary_loss_clip": 0.06425677, "auxiliary_loss_mlp": 0.01262742, "balance_loss_clip": 0.06279029, "balance_loss_mlp": 0.01253062, "epoch": 0.696046896137081, "flos": 14940980582400.0, "grad_norm": 2.4254468349298466, "language_loss": 0.56806087, "learning_rate": 8.932147389081985e-07, "loss": 0.64494503, "num_input_tokens_seen": 249871645, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.09686279, "step": 11577, "time_per_iteration": 2.491748094558716 }, { "auxiliary_loss_clip": 0.06413992, "auxiliary_loss_mlp": 0.01264131, "balance_loss_clip": 0.06277332, "balance_loss_mlp": 0.01255649, "epoch": 0.696107019389749, "flos": 30748569217920.0, "grad_norm": 1.564413479919931, "language_loss": 0.76619077, "learning_rate": 8.928903692678081e-07, "loss": 0.84297198, "num_input_tokens_seen": 249894215, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.08483887, "step": 11578, "time_per_iteration": 2.6466987133026123 }, { "auxiliary_loss_clip": 0.06423637, "auxiliary_loss_mlp": 0.01262482, "balance_loss_clip": 0.06280299, "balance_loss_mlp": 0.01253446, "epoch": 0.696167142642417, "flos": 20782935244800.0, "grad_norm": 1.8286476883475578, "language_loss": 0.79768896, "learning_rate": 8.925660416091254e-07, "loss": 0.8745501, "num_input_tokens_seen": 249912850, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09033203, "step": 11579, "time_per_iteration": 2.5537567138671875 }, { "auxiliary_loss_clip": 0.06416234, "auxiliary_loss_mlp": 0.01263307, "balance_loss_clip": 0.06275886, "balance_loss_mlp": 0.01253788, "epoch": 0.6962272658950849, "flos": 22571761334400.0, "grad_norm": 1.7953248697976698, "language_loss": 0.72990566, "learning_rate": 8.922417559444502e-07, "loss": 0.80670106, "num_input_tokens_seen": 249932650, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09521484, "step": 11580, "time_per_iteration": 2.533130645751953 }, { "auxiliary_loss_clip": 0.0642433, "auxiliary_loss_mlp": 0.01269854, "balance_loss_clip": 0.06278438, "balance_loss_mlp": 0.01259233, "epoch": 0.6962873891477529, "flos": 22206681095040.0, "grad_norm": 1.8936334828947845, "language_loss": 0.65815759, "learning_rate": 8.919175122860787e-07, "loss": 0.73509943, "num_input_tokens_seen": 249951205, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10620117, "step": 11581, "time_per_iteration": 2.5457136631011963 }, { "auxiliary_loss_clip": 0.0642671, "auxiliary_loss_mlp": 0.0126387, "balance_loss_clip": 0.06283814, "balance_loss_mlp": 0.01255144, "epoch": 0.6963475124004208, "flos": 12493718709120.0, "grad_norm": 1.7680310885977064, "language_loss": 0.76729119, "learning_rate": 8.915933106463056e-07, "loss": 0.84419698, "num_input_tokens_seen": 249967045, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.0871582, "step": 11582, "time_per_iteration": 2.4934232234954834 }, { "auxiliary_loss_clip": 0.06419802, "auxiliary_loss_mlp": 0.01264935, "balance_loss_clip": 0.0627713, "balance_loss_mlp": 0.01255732, "epoch": 0.6964076356530888, "flos": 17170762383360.0, "grad_norm": 2.0873184056565193, "language_loss": 0.70249081, "learning_rate": 8.91269151037425e-07, "loss": 0.77933812, "num_input_tokens_seen": 249984565, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.09197998, "step": 11583, "time_per_iteration": 2.52125883102417 }, { "auxiliary_loss_clip": 0.06418928, "auxiliary_loss_mlp": 0.01268898, "balance_loss_clip": 0.06276216, "balance_loss_mlp": 0.01258754, "epoch": 0.6964677589057569, "flos": 19943342933760.0, "grad_norm": 1.7198198511953613, "language_loss": 0.82696319, "learning_rate": 8.909450334717301e-07, "loss": 0.90384144, "num_input_tokens_seen": 250004235, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.10144043, "step": 11584, "time_per_iteration": 3.9859211444854736 }, { "auxiliary_loss_clip": 0.06424944, "auxiliary_loss_mlp": 0.0127005, "balance_loss_clip": 0.06279075, "balance_loss_mlp": 0.01259589, "epoch": 0.6965278821584248, "flos": 22790708853120.0, "grad_norm": 2.1721209231452985, "language_loss": 0.79738355, "learning_rate": 8.906209579615107e-07, "loss": 0.8743335, "num_input_tokens_seen": 250017645, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10455322, "step": 11585, "time_per_iteration": 2.5322983264923096 }, { "auxiliary_loss_clip": 0.06413879, "auxiliary_loss_mlp": 0.01266672, "balance_loss_clip": 0.06275457, "balance_loss_mlp": 0.01257821, "epoch": 0.6965880054110928, "flos": 20053739088000.0, "grad_norm": 1.606139687236839, "language_loss": 0.77938867, "learning_rate": 8.90296924519055e-07, "loss": 0.85619414, "num_input_tokens_seen": 250037640, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.08850098, "step": 11586, "time_per_iteration": 2.550060987472534 }, { "auxiliary_loss_clip": 0.06415296, "auxiliary_loss_mlp": 0.01266842, "balance_loss_clip": 0.06279379, "balance_loss_mlp": 0.01258176, "epoch": 0.6966481286637607, "flos": 21914709143040.0, "grad_norm": 1.5508037674092314, "language_loss": 0.79064596, "learning_rate": 8.899729331566519e-07, "loss": 0.86746734, "num_input_tokens_seen": 250056490, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.08673096, "step": 11587, "time_per_iteration": 2.528764009475708 }, { "auxiliary_loss_clip": 0.06414387, "auxiliary_loss_mlp": 0.01265427, "balance_loss_clip": 0.06277622, "balance_loss_mlp": 0.01256248, "epoch": 0.6967082519164287, "flos": 15638674803840.0, "grad_norm": 2.0864460462702, "language_loss": 0.72736925, "learning_rate": 8.896489838865857e-07, "loss": 0.80416739, "num_input_tokens_seen": 250074285, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.09179688, "step": 11588, "time_per_iteration": 2.529214382171631 }, { "auxiliary_loss_clip": 0.06419466, "auxiliary_loss_mlp": 0.01260931, "balance_loss_clip": 0.06277384, "balance_loss_mlp": 0.01251954, "epoch": 0.6967683751690966, "flos": 24031453386240.0, "grad_norm": 1.650716095241274, "language_loss": 0.75775218, "learning_rate": 8.893250767211413e-07, "loss": 0.8345561, "num_input_tokens_seen": 250093350, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.08978271, "step": 11589, "time_per_iteration": 2.571201801300049 }, { "auxiliary_loss_clip": 0.06419446, "auxiliary_loss_mlp": 0.0126275, "balance_loss_clip": 0.06277041, "balance_loss_mlp": 0.01252725, "epoch": 0.6968284984217646, "flos": 31031862272640.0, "grad_norm": 1.9701983817685174, "language_loss": 0.63964969, "learning_rate": 8.890012116726012e-07, "loss": 0.71647167, "num_input_tokens_seen": 250114170, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10040283, "step": 11590, "time_per_iteration": 2.608572006225586 }, { "auxiliary_loss_clip": 0.06314269, "auxiliary_loss_mlp": 0.01250039, "balance_loss_clip": 0.06255841, "balance_loss_mlp": 0.01248895, "epoch": 0.6968886216744326, "flos": 67642888475520.0, "grad_norm": 0.7404980684029817, "language_loss": 0.6118716, "learning_rate": 8.88677388753248e-07, "loss": 0.68751472, "num_input_tokens_seen": 250178250, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.01144409, "step": 11591, "time_per_iteration": 3.271422863006592 }, { "auxiliary_loss_clip": 0.06422174, "auxiliary_loss_mlp": 0.01267443, "balance_loss_clip": 0.06280315, "balance_loss_mlp": 0.01257233, "epoch": 0.6969487449271006, "flos": 24870668353920.0, "grad_norm": 1.6146224339538493, "language_loss": 0.69550568, "learning_rate": 8.883536079753582e-07, "loss": 0.77240181, "num_input_tokens_seen": 250198420, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.10211182, "step": 11592, "time_per_iteration": 4.012571811676025 }, { "auxiliary_loss_clip": 0.0641796, "auxiliary_loss_mlp": 0.01265524, "balance_loss_clip": 0.06276777, "balance_loss_mlp": 0.01256798, "epoch": 0.6970088681797685, "flos": 28775525927040.0, "grad_norm": 1.3816725334620998, "language_loss": 0.6264503, "learning_rate": 8.880298693512109e-07, "loss": 0.7032851, "num_input_tokens_seen": 250220650, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.08721924, "step": 11593, "time_per_iteration": 2.6369731426239014 }, { "auxiliary_loss_clip": 0.06413803, "auxiliary_loss_mlp": 0.0126386, "balance_loss_clip": 0.06275922, "balance_loss_mlp": 0.01255473, "epoch": 0.6970689914324365, "flos": 27316001583360.0, "grad_norm": 1.4151138773352503, "language_loss": 0.5439955, "learning_rate": 8.877061728930832e-07, "loss": 0.62077212, "num_input_tokens_seen": 250241750, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.0838623, "step": 11594, "time_per_iteration": 2.6000099182128906 }, { "auxiliary_loss_clip": 0.06424284, "auxiliary_loss_mlp": 0.01262465, "balance_loss_clip": 0.06281397, "balance_loss_mlp": 0.01253477, "epoch": 0.6971291146851044, "flos": 19142422081920.0, "grad_norm": 1.9610617963475179, "language_loss": 0.77472568, "learning_rate": 8.87382518613248e-07, "loss": 0.85159314, "num_input_tokens_seen": 250259445, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.08990479, "step": 11595, "time_per_iteration": 2.5344197750091553 }, { "auxiliary_loss_clip": 0.06423241, "auxiliary_loss_mlp": 0.01264908, "balance_loss_clip": 0.06279334, "balance_loss_mlp": 0.01254542, "epoch": 0.6971892379377724, "flos": 14615661905280.0, "grad_norm": 2.404618481332192, "language_loss": 0.71730846, "learning_rate": 8.870589065239793e-07, "loss": 0.79418993, "num_input_tokens_seen": 250275640, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10375977, "step": 11596, "time_per_iteration": 2.502249240875244 }, { "auxiliary_loss_clip": 0.06420231, "auxiliary_loss_mlp": 0.01264086, "balance_loss_clip": 0.06278671, "balance_loss_mlp": 0.01254347, "epoch": 0.6972493611904405, "flos": 22313639232000.0, "grad_norm": 2.384300088525751, "language_loss": 0.76536691, "learning_rate": 8.867353366375492e-07, "loss": 0.84221011, "num_input_tokens_seen": 250296435, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09735107, "step": 11597, "time_per_iteration": 2.5629827976226807 }, { "auxiliary_loss_clip": 0.06416169, "auxiliary_loss_mlp": 0.01267873, "balance_loss_clip": 0.06274826, "balance_loss_mlp": 0.01258133, "epoch": 0.6973094844431084, "flos": 17426075374080.0, "grad_norm": 2.317103988181836, "language_loss": 0.75388205, "learning_rate": 8.864118089662267e-07, "loss": 0.83072245, "num_input_tokens_seen": 250314035, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09729004, "step": 11598, "time_per_iteration": 2.523932933807373 }, { "auxiliary_loss_clip": 0.06425637, "auxiliary_loss_mlp": 0.01266363, "balance_loss_clip": 0.06279842, "balance_loss_mlp": 0.01256296, "epoch": 0.6973696076957764, "flos": 27242767514880.0, "grad_norm": 1.7354913623413046, "language_loss": 0.89554441, "learning_rate": 8.860883235222791e-07, "loss": 0.97246444, "num_input_tokens_seen": 250332995, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10064697, "step": 11599, "time_per_iteration": 2.5880298614501953 }, { "auxiliary_loss_clip": 0.06435362, "auxiliary_loss_mlp": 0.01265782, "balance_loss_clip": 0.06286344, "balance_loss_mlp": 0.0125519, "epoch": 0.6974297309484443, "flos": 22024644099840.0, "grad_norm": 1.9885366820435477, "language_loss": 0.6997571, "learning_rate": 8.85764880317974e-07, "loss": 0.77676845, "num_input_tokens_seen": 250352120, "router_z_loss_clip": 1.49023438, "router_z_loss_mlp": 0.105896, "step": 11600, "time_per_iteration": 4.067085266113281 }, { "auxiliary_loss_clip": 0.06423165, "auxiliary_loss_mlp": 0.01264943, "balance_loss_clip": 0.06279242, "balance_loss_mlp": 0.0125493, "epoch": 0.6974898542011123, "flos": 28374038288640.0, "grad_norm": 1.5096047585556756, "language_loss": 0.76659608, "learning_rate": 8.854414793655771e-07, "loss": 0.84347713, "num_input_tokens_seen": 250371705, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10021973, "step": 11601, "time_per_iteration": 2.765144109725952 }, { "auxiliary_loss_clip": 0.06415333, "auxiliary_loss_mlp": 0.01264282, "balance_loss_clip": 0.06277184, "balance_loss_mlp": 0.01255433, "epoch": 0.6975499774537802, "flos": 15237522581760.0, "grad_norm": 2.1256848871476404, "language_loss": 0.72226369, "learning_rate": 8.851181206773508e-07, "loss": 0.79905981, "num_input_tokens_seen": 250390485, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.08853149, "step": 11602, "time_per_iteration": 2.633559465408325 }, { "auxiliary_loss_clip": 0.0642155, "auxiliary_loss_mlp": 0.01263068, "balance_loss_clip": 0.06280933, "balance_loss_mlp": 0.01253996, "epoch": 0.6976101007064482, "flos": 22162894536960.0, "grad_norm": 2.3978174075975955, "language_loss": 0.76769471, "learning_rate": 8.847948042655567e-07, "loss": 0.84454083, "num_input_tokens_seen": 250407020, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09069824, "step": 11603, "time_per_iteration": 2.7226245403289795 }, { "auxiliary_loss_clip": 0.06419882, "auxiliary_loss_mlp": 0.01266014, "balance_loss_clip": 0.06278364, "balance_loss_mlp": 0.01256829, "epoch": 0.6976702239591162, "flos": 22280124798720.0, "grad_norm": 1.545548360666107, "language_loss": 0.62528086, "learning_rate": 8.844715301424557e-07, "loss": 0.70213985, "num_input_tokens_seen": 250425880, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09185791, "step": 11604, "time_per_iteration": 3.9118130207061768 }, { "auxiliary_loss_clip": 0.06424659, "auxiliary_loss_mlp": 0.01266778, "balance_loss_clip": 0.06279753, "balance_loss_mlp": 0.01255924, "epoch": 0.6977303472117842, "flos": 25855722552960.0, "grad_norm": 2.6747555417761975, "language_loss": 0.82197821, "learning_rate": 8.841482983203057e-07, "loss": 0.89889258, "num_input_tokens_seen": 250442925, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.10852051, "step": 11605, "time_per_iteration": 2.5720314979553223 }, { "auxiliary_loss_clip": 0.06418909, "auxiliary_loss_mlp": 0.0126604, "balance_loss_clip": 0.06276896, "balance_loss_mlp": 0.01256771, "epoch": 0.6977904704644521, "flos": 20965894634880.0, "grad_norm": 1.957806911669474, "language_loss": 0.70771736, "learning_rate": 8.838251088113638e-07, "loss": 0.78456682, "num_input_tokens_seen": 250461220, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09265137, "step": 11606, "time_per_iteration": 2.5413198471069336 }, { "auxiliary_loss_clip": 0.06424289, "auxiliary_loss_mlp": 0.01263504, "balance_loss_clip": 0.06278474, "balance_loss_mlp": 0.01253359, "epoch": 0.6978505937171201, "flos": 22061680404480.0, "grad_norm": 1.8216802024408083, "language_loss": 0.82648844, "learning_rate": 8.835019616278856e-07, "loss": 0.90336633, "num_input_tokens_seen": 250480975, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.1015625, "step": 11607, "time_per_iteration": 2.649261474609375 }, { "auxiliary_loss_clip": 0.06429759, "auxiliary_loss_mlp": 0.01267366, "balance_loss_clip": 0.06281529, "balance_loss_mlp": 0.01256918, "epoch": 0.697910716969788, "flos": 20049252894720.0, "grad_norm": 1.7554913175137794, "language_loss": 0.79120696, "learning_rate": 8.831788567821265e-07, "loss": 0.86817819, "num_input_tokens_seen": 250497980, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.10461426, "step": 11608, "time_per_iteration": 2.5306622982025146 }, { "auxiliary_loss_clip": 0.06422156, "auxiliary_loss_mlp": 0.01264631, "balance_loss_clip": 0.06276776, "balance_loss_mlp": 0.01254993, "epoch": 0.697970840222456, "flos": 15893736232320.0, "grad_norm": 2.1173799705190586, "language_loss": 0.90328002, "learning_rate": 8.828557942863357e-07, "loss": 0.9801479, "num_input_tokens_seen": 250511910, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.09631348, "step": 11609, "time_per_iteration": 2.506696939468384 }, { "auxiliary_loss_clip": 0.06421509, "auxiliary_loss_mlp": 0.01261356, "balance_loss_clip": 0.06276043, "balance_loss_mlp": 0.01252344, "epoch": 0.698030963475124, "flos": 21222088093440.0, "grad_norm": 1.6350937780914314, "language_loss": 0.64276743, "learning_rate": 8.82532774152765e-07, "loss": 0.71959603, "num_input_tokens_seen": 250531090, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.09014893, "step": 11610, "time_per_iteration": 2.54725980758667 }, { "auxiliary_loss_clip": 0.06418063, "auxiliary_loss_mlp": 0.01265069, "balance_loss_clip": 0.06278299, "balance_loss_mlp": 0.0125558, "epoch": 0.698091086727792, "flos": 33767113029120.0, "grad_norm": 1.7963763334709197, "language_loss": 0.84782648, "learning_rate": 8.822097963936643e-07, "loss": 0.92465782, "num_input_tokens_seen": 250551565, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09490967, "step": 11611, "time_per_iteration": 2.6573026180267334 }, { "auxiliary_loss_clip": 0.06418907, "auxiliary_loss_mlp": 0.01263432, "balance_loss_clip": 0.06274295, "balance_loss_mlp": 0.01253162, "epoch": 0.69815120998046, "flos": 15893275034880.0, "grad_norm": 1.8021187590839398, "language_loss": 0.70744646, "learning_rate": 8.818868610212793e-07, "loss": 0.78426987, "num_input_tokens_seen": 250569625, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10266113, "step": 11612, "time_per_iteration": 2.518958568572998 }, { "auxiliary_loss_clip": 0.06417213, "auxiliary_loss_mlp": 0.01264709, "balance_loss_clip": 0.06277119, "balance_loss_mlp": 0.01254874, "epoch": 0.6982113332331279, "flos": 18952041605760.0, "grad_norm": 1.5461601808519363, "language_loss": 0.81055617, "learning_rate": 8.815639680478573e-07, "loss": 0.88737535, "num_input_tokens_seen": 250586960, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09838867, "step": 11613, "time_per_iteration": 2.5424201488494873 }, { "auxiliary_loss_clip": 0.06420355, "auxiliary_loss_mlp": 0.01265242, "balance_loss_clip": 0.06279071, "balance_loss_mlp": 0.01256272, "epoch": 0.6982714564857959, "flos": 24396533625600.0, "grad_norm": 1.8803870637214182, "language_loss": 0.75487745, "learning_rate": 8.812411174856411e-07, "loss": 0.83173347, "num_input_tokens_seen": 250605080, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.08978271, "step": 11614, "time_per_iteration": 2.5625829696655273 }, { "auxiliary_loss_clip": 0.06419528, "auxiliary_loss_mlp": 0.01267588, "balance_loss_clip": 0.06278613, "balance_loss_mlp": 0.01257938, "epoch": 0.6983315797384638, "flos": 20089852997760.0, "grad_norm": 2.3056255099421263, "language_loss": 0.7804873, "learning_rate": 8.809183093468746e-07, "loss": 0.85735857, "num_input_tokens_seen": 250623965, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09649658, "step": 11615, "time_per_iteration": 2.5599822998046875 }, { "auxiliary_loss_clip": 0.06411859, "auxiliary_loss_mlp": 0.01264645, "balance_loss_clip": 0.06274666, "balance_loss_mlp": 0.01255603, "epoch": 0.6983917029911318, "flos": 13516815461760.0, "grad_norm": 2.7535483763840607, "language_loss": 0.72779572, "learning_rate": 8.80595543643797e-07, "loss": 0.80456078, "num_input_tokens_seen": 250640675, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09039307, "step": 11616, "time_per_iteration": 2.532588481903076 }, { "auxiliary_loss_clip": 0.06420123, "auxiliary_loss_mlp": 0.01263064, "balance_loss_clip": 0.06281658, "balance_loss_mlp": 0.01253652, "epoch": 0.6984518262437998, "flos": 22025021443200.0, "grad_norm": 1.5413918012563255, "language_loss": 0.84433037, "learning_rate": 8.802728203886487e-07, "loss": 0.92116225, "num_input_tokens_seen": 250660295, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09411621, "step": 11617, "time_per_iteration": 2.5882163047790527 }, { "auxiliary_loss_clip": 0.06426422, "auxiliary_loss_mlp": 0.01268372, "balance_loss_clip": 0.06279948, "balance_loss_mlp": 0.01257995, "epoch": 0.6985119494964678, "flos": 18776587155840.0, "grad_norm": 2.0205649551622287, "language_loss": 0.59400928, "learning_rate": 8.799501395936682e-07, "loss": 0.67095721, "num_input_tokens_seen": 250678155, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.10375977, "step": 11618, "time_per_iteration": 2.5276055335998535 }, { "auxiliary_loss_clip": 0.06421424, "auxiliary_loss_mlp": 0.01262633, "balance_loss_clip": 0.06281754, "balance_loss_mlp": 0.01253764, "epoch": 0.6985720727491357, "flos": 22389430849920.0, "grad_norm": 1.8077898547373032, "language_loss": 0.83048248, "learning_rate": 8.796275012710903e-07, "loss": 0.90732306, "num_input_tokens_seen": 250697230, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.08874512, "step": 11619, "time_per_iteration": 2.5865540504455566 }, { "auxiliary_loss_clip": 0.06417587, "auxiliary_loss_mlp": 0.01267922, "balance_loss_clip": 0.0628042, "balance_loss_mlp": 0.01259506, "epoch": 0.6986321960018037, "flos": 39577398048000.0, "grad_norm": 1.627490325079995, "language_loss": 0.67636168, "learning_rate": 8.793049054331494e-07, "loss": 0.7532168, "num_input_tokens_seen": 250719865, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.08410645, "step": 11620, "time_per_iteration": 2.696690082550049 }, { "auxiliary_loss_clip": 0.06424949, "auxiliary_loss_mlp": 0.01264801, "balance_loss_clip": 0.06281076, "balance_loss_mlp": 0.01254501, "epoch": 0.6986923192544716, "flos": 17973528024960.0, "grad_norm": 2.015430206046444, "language_loss": 0.73322535, "learning_rate": 8.789823520920794e-07, "loss": 0.81012285, "num_input_tokens_seen": 250736565, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.10290527, "step": 11621, "time_per_iteration": 2.5247304439544678 }, { "auxiliary_loss_clip": 0.06423771, "auxiliary_loss_mlp": 0.01263069, "balance_loss_clip": 0.06278546, "balance_loss_mlp": 0.0125324, "epoch": 0.6987524425071396, "flos": 25601583519360.0, "grad_norm": 1.7513129796974256, "language_loss": 0.68587053, "learning_rate": 8.7865984126011e-07, "loss": 0.76273882, "num_input_tokens_seen": 250757235, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.0982666, "step": 11622, "time_per_iteration": 2.5879197120666504 }, { "auxiliary_loss_clip": 0.06420399, "auxiliary_loss_mlp": 0.01265835, "balance_loss_clip": 0.06282844, "balance_loss_mlp": 0.01256507, "epoch": 0.6988125657598077, "flos": 17535842622720.0, "grad_norm": 1.5164800255881723, "language_loss": 0.627599, "learning_rate": 8.783373729494721e-07, "loss": 0.70446134, "num_input_tokens_seen": 250775585, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09326172, "step": 11623, "time_per_iteration": 3.9803860187530518 }, { "auxiliary_loss_clip": 0.06428342, "auxiliary_loss_mlp": 0.01263204, "balance_loss_clip": 0.06279963, "balance_loss_mlp": 0.0125331, "epoch": 0.6988726890124756, "flos": 39175029941760.0, "grad_norm": 1.698562788992515, "language_loss": 0.6081568, "learning_rate": 8.780149471723932e-07, "loss": 0.68507224, "num_input_tokens_seen": 250795725, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.09893799, "step": 11624, "time_per_iteration": 2.718998432159424 }, { "auxiliary_loss_clip": 0.06424283, "auxiliary_loss_mlp": 0.01264688, "balance_loss_clip": 0.06278916, "balance_loss_mlp": 0.01254681, "epoch": 0.6989328122651436, "flos": 20199662173440.0, "grad_norm": 1.760023183088319, "language_loss": 0.78487313, "learning_rate": 8.776925639411017e-07, "loss": 0.86176276, "num_input_tokens_seen": 250814555, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10003662, "step": 11625, "time_per_iteration": 2.5444588661193848 }, { "auxiliary_loss_clip": 0.06418849, "auxiliary_loss_mlp": 0.01267016, "balance_loss_clip": 0.0628171, "balance_loss_mlp": 0.01257658, "epoch": 0.6989929355178115, "flos": 21841265439360.0, "grad_norm": 1.8131766273020389, "language_loss": 0.66502798, "learning_rate": 8.773702232678188e-07, "loss": 0.74188662, "num_input_tokens_seen": 250833105, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09362793, "step": 11626, "time_per_iteration": 2.549147605895996 }, { "auxiliary_loss_clip": 0.06424282, "auxiliary_loss_mlp": 0.0126465, "balance_loss_clip": 0.06282052, "balance_loss_mlp": 0.01255096, "epoch": 0.6990530587704795, "flos": 26330066916480.0, "grad_norm": 1.7434908180021165, "language_loss": 0.71202129, "learning_rate": 8.770479251647697e-07, "loss": 0.78891063, "num_input_tokens_seen": 250852570, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09564209, "step": 11627, "time_per_iteration": 2.5760841369628906 }, { "auxiliary_loss_clip": 0.06415321, "auxiliary_loss_mlp": 0.01264609, "balance_loss_clip": 0.06278393, "balance_loss_mlp": 0.01256074, "epoch": 0.6991131820231474, "flos": 19835168912640.0, "grad_norm": 1.7178176222950916, "language_loss": 0.62315643, "learning_rate": 8.767256696441768e-07, "loss": 0.69995576, "num_input_tokens_seen": 250870500, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.08532715, "step": 11628, "time_per_iteration": 2.541111946105957 }, { "auxiliary_loss_clip": 0.06421187, "auxiliary_loss_mlp": 0.01265145, "balance_loss_clip": 0.06275894, "balance_loss_mlp": 0.01255221, "epoch": 0.6991733052758154, "flos": 33993271998720.0, "grad_norm": 2.04208935499813, "language_loss": 0.68963718, "learning_rate": 8.764034567182581e-07, "loss": 0.76650053, "num_input_tokens_seen": 250892745, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.09924316, "step": 11629, "time_per_iteration": 2.6364846229553223 }, { "auxiliary_loss_clip": 0.06418522, "auxiliary_loss_mlp": 0.01267663, "balance_loss_clip": 0.06279388, "balance_loss_mlp": 0.012574, "epoch": 0.6992334285284834, "flos": 15638632876800.0, "grad_norm": 1.8249185306701097, "language_loss": 0.72778249, "learning_rate": 8.760812863992337e-07, "loss": 0.80464435, "num_input_tokens_seen": 250910225, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.10266113, "step": 11630, "time_per_iteration": 2.5205204486846924 }, { "auxiliary_loss_clip": 0.06424558, "auxiliary_loss_mlp": 0.0126533, "balance_loss_clip": 0.06285037, "balance_loss_mlp": 0.01255764, "epoch": 0.6992935517811514, "flos": 21732797928960.0, "grad_norm": 1.5242148409545355, "language_loss": 0.74357891, "learning_rate": 8.757591586993196e-07, "loss": 0.82047772, "num_input_tokens_seen": 250929715, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09564209, "step": 11631, "time_per_iteration": 3.9580423831939697 }, { "auxiliary_loss_clip": 0.06431951, "auxiliary_loss_mlp": 0.01269159, "balance_loss_clip": 0.06287332, "balance_loss_mlp": 0.0125871, "epoch": 0.6993536750338193, "flos": 20120558319360.0, "grad_norm": 1.9444217130300936, "language_loss": 0.89799607, "learning_rate": 8.7543707363073e-07, "loss": 0.97500718, "num_input_tokens_seen": 250944230, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.10449219, "step": 11632, "time_per_iteration": 2.5260612964630127 }, { "auxiliary_loss_clip": 0.06425171, "auxiliary_loss_mlp": 0.01265621, "balance_loss_clip": 0.06281406, "balance_loss_mlp": 0.01255929, "epoch": 0.6994137982864873, "flos": 22015839421440.0, "grad_norm": 1.6016265592677534, "language_loss": 0.79953146, "learning_rate": 8.751150312056792e-07, "loss": 0.87643939, "num_input_tokens_seen": 250961865, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.09692383, "step": 11633, "time_per_iteration": 2.5463130474090576 }, { "auxiliary_loss_clip": 0.06429112, "auxiliary_loss_mlp": 0.01268013, "balance_loss_clip": 0.06282387, "balance_loss_mlp": 0.01256981, "epoch": 0.6994739215391552, "flos": 25525875755520.0, "grad_norm": 2.3976449912917586, "language_loss": 0.67795682, "learning_rate": 8.747930314363794e-07, "loss": 0.75492811, "num_input_tokens_seen": 250982025, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.11029053, "step": 11634, "time_per_iteration": 2.583765983581543 }, { "auxiliary_loss_clip": 0.06324795, "auxiliary_loss_mlp": 0.01253156, "balance_loss_clip": 0.06266422, "balance_loss_mlp": 0.0125194, "epoch": 0.6995340447918232, "flos": 59147931438720.0, "grad_norm": 0.675407140254534, "language_loss": 0.53209555, "learning_rate": 8.744710743350412e-07, "loss": 0.60787499, "num_input_tokens_seen": 251046900, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.01213837, "step": 11635, "time_per_iteration": 3.3061954975128174 }, { "auxiliary_loss_clip": 0.06424846, "auxiliary_loss_mlp": 0.01263169, "balance_loss_clip": 0.06282715, "balance_loss_mlp": 0.01253299, "epoch": 0.6995941680444913, "flos": 17973653806080.0, "grad_norm": 1.6586129231663316, "language_loss": 0.82384241, "learning_rate": 8.741491599138726e-07, "loss": 0.9007225, "num_input_tokens_seen": 251065050, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09875488, "step": 11636, "time_per_iteration": 2.5374526977539062 }, { "auxiliary_loss_clip": 0.06426895, "auxiliary_loss_mlp": 0.01266354, "balance_loss_clip": 0.06282233, "balance_loss_mlp": 0.01256335, "epoch": 0.6996542912971592, "flos": 21986391911040.0, "grad_norm": 1.9360717043834283, "language_loss": 0.83164233, "learning_rate": 8.738272881850801e-07, "loss": 0.90857482, "num_input_tokens_seen": 251083355, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.10015869, "step": 11637, "time_per_iteration": 2.588693857192993 }, { "auxiliary_loss_clip": 0.06421098, "auxiliary_loss_mlp": 0.01267193, "balance_loss_clip": 0.06279931, "balance_loss_mlp": 0.01256929, "epoch": 0.6997144145498272, "flos": 11689904891520.0, "grad_norm": 1.9330234566785487, "language_loss": 0.68036211, "learning_rate": 8.735054591608704e-07, "loss": 0.75724506, "num_input_tokens_seen": 251096420, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.10266113, "step": 11638, "time_per_iteration": 2.5488290786743164 }, { "auxiliary_loss_clip": 0.06428879, "auxiliary_loss_mlp": 0.01266989, "balance_loss_clip": 0.06280155, "balance_loss_mlp": 0.01255461, "epoch": 0.6997745378024951, "flos": 29614992456960.0, "grad_norm": 1.8348043344965441, "language_loss": 0.78396648, "learning_rate": 8.731836728534459e-07, "loss": 0.8609252, "num_input_tokens_seen": 251115410, "router_z_loss_clip": 1.48730469, "router_z_loss_mlp": 0.11541748, "step": 11639, "time_per_iteration": 2.5921339988708496 }, { "auxiliary_loss_clip": 0.06426242, "auxiliary_loss_mlp": 0.01269711, "balance_loss_clip": 0.0628275, "balance_loss_mlp": 0.01259548, "epoch": 0.6998346610551631, "flos": 20892912128640.0, "grad_norm": 2.3750177390806577, "language_loss": 0.8309623, "learning_rate": 8.728619292750093e-07, "loss": 0.90792185, "num_input_tokens_seen": 251133530, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10162354, "step": 11640, "time_per_iteration": 2.5270884037017822 }, { "auxiliary_loss_clip": 0.06423143, "auxiliary_loss_mlp": 0.01265159, "balance_loss_clip": 0.06280995, "balance_loss_mlp": 0.01255789, "epoch": 0.699894784307831, "flos": 27170539695360.0, "grad_norm": 1.6986607568974719, "language_loss": 0.75920939, "learning_rate": 8.725402284377619e-07, "loss": 0.83609235, "num_input_tokens_seen": 251153985, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09368896, "step": 11641, "time_per_iteration": 4.079655408859253 }, { "auxiliary_loss_clip": 0.06423406, "auxiliary_loss_mlp": 0.0126547, "balance_loss_clip": 0.06280743, "balance_loss_mlp": 0.01255325, "epoch": 0.699954907560499, "flos": 20930032287360.0, "grad_norm": 1.8988998357019955, "language_loss": 0.78315985, "learning_rate": 8.722185703539022e-07, "loss": 0.86004853, "num_input_tokens_seen": 251173225, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.10150146, "step": 11642, "time_per_iteration": 2.5344440937042236 }, { "auxiliary_loss_clip": 0.06430027, "auxiliary_loss_mlp": 0.01266408, "balance_loss_clip": 0.0628238, "balance_loss_mlp": 0.01254666, "epoch": 0.700015030813167, "flos": 28665339408000.0, "grad_norm": 2.9440842343551754, "language_loss": 0.75084996, "learning_rate": 8.718969550356266e-07, "loss": 0.82781428, "num_input_tokens_seen": 251192485, "router_z_loss_clip": 1.47558594, "router_z_loss_mlp": 0.11749268, "step": 11643, "time_per_iteration": 2.595855712890625 }, { "auxiliary_loss_clip": 0.06426133, "auxiliary_loss_mlp": 0.01264861, "balance_loss_clip": 0.06280363, "balance_loss_mlp": 0.01254901, "epoch": 0.700075154065835, "flos": 29212959767040.0, "grad_norm": 1.4561980919400046, "language_loss": 0.60426939, "learning_rate": 8.715753824951315e-07, "loss": 0.68117934, "num_input_tokens_seen": 251214965, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.09960938, "step": 11644, "time_per_iteration": 4.017992258071899 }, { "auxiliary_loss_clip": 0.06416564, "auxiliary_loss_mlp": 0.01267846, "balance_loss_clip": 0.06276783, "balance_loss_mlp": 0.01257469, "epoch": 0.7001352773185029, "flos": 23119130131200.0, "grad_norm": 1.609425293589473, "language_loss": 0.81989861, "learning_rate": 8.712538527446119e-07, "loss": 0.8967427, "num_input_tokens_seen": 251234500, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.1038208, "step": 11645, "time_per_iteration": 2.5529041290283203 }, { "auxiliary_loss_clip": 0.06417949, "auxiliary_loss_mlp": 0.01265697, "balance_loss_clip": 0.06278372, "balance_loss_mlp": 0.01256387, "epoch": 0.7001954005711709, "flos": 21328962376320.0, "grad_norm": 1.741459871602976, "language_loss": 0.67992496, "learning_rate": 8.709323657962584e-07, "loss": 0.75676143, "num_input_tokens_seen": 251254360, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09313965, "step": 11646, "time_per_iteration": 2.53937029838562 }, { "auxiliary_loss_clip": 0.06421195, "auxiliary_loss_mlp": 0.01263681, "balance_loss_clip": 0.06279887, "balance_loss_mlp": 0.01254693, "epoch": 0.7002555238238388, "flos": 24542834054400.0, "grad_norm": 1.6888394429781264, "language_loss": 0.71357477, "learning_rate": 8.706109216622635e-07, "loss": 0.79042351, "num_input_tokens_seen": 251274790, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.08984375, "step": 11647, "time_per_iteration": 2.5807244777679443 }, { "auxiliary_loss_clip": 0.06426141, "auxiliary_loss_mlp": 0.01271671, "balance_loss_clip": 0.06279859, "balance_loss_mlp": 0.01260877, "epoch": 0.7003156470765068, "flos": 39065891598720.0, "grad_norm": 2.253194528926019, "language_loss": 0.71585017, "learning_rate": 8.702895203548155e-07, "loss": 0.79282832, "num_input_tokens_seen": 251296275, "router_z_loss_clip": 1.46289062, "router_z_loss_mlp": 0.10797119, "step": 11648, "time_per_iteration": 2.8455562591552734 }, { "auxiliary_loss_clip": 0.06415337, "auxiliary_loss_mlp": 0.01266993, "balance_loss_clip": 0.06275401, "balance_loss_mlp": 0.01256711, "epoch": 0.7003757703291749, "flos": 28811723690880.0, "grad_norm": 1.485700345762628, "language_loss": 0.77313167, "learning_rate": 8.699681618861014e-07, "loss": 0.84995496, "num_input_tokens_seen": 251317375, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.10284424, "step": 11649, "time_per_iteration": 2.7299413681030273 }, { "auxiliary_loss_clip": 0.06417616, "auxiliary_loss_mlp": 0.01264637, "balance_loss_clip": 0.06276229, "balance_loss_mlp": 0.01255529, "epoch": 0.7004358935818428, "flos": 15958123695360.0, "grad_norm": 2.738368820735876, "language_loss": 0.78925371, "learning_rate": 8.69646846268308e-07, "loss": 0.86607617, "num_input_tokens_seen": 251333570, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09106445, "step": 11650, "time_per_iteration": 2.699355125427246 }, { "auxiliary_loss_clip": 0.06419751, "auxiliary_loss_mlp": 0.0126233, "balance_loss_clip": 0.0627662, "balance_loss_mlp": 0.01253192, "epoch": 0.7004960168345108, "flos": 20418148494720.0, "grad_norm": 2.0307078484218715, "language_loss": 0.78671235, "learning_rate": 8.693255735136194e-07, "loss": 0.86353314, "num_input_tokens_seen": 251351070, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.09136963, "step": 11651, "time_per_iteration": 2.554388999938965 }, { "auxiliary_loss_clip": 0.0643505, "auxiliary_loss_mlp": 0.01267578, "balance_loss_clip": 0.06288221, "balance_loss_mlp": 0.01257636, "epoch": 0.7005561400871787, "flos": 17353260576000.0, "grad_norm": 1.6520280515495305, "language_loss": 0.69853592, "learning_rate": 8.690043436342198e-07, "loss": 0.77556217, "num_input_tokens_seen": 251370005, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.09936523, "step": 11652, "time_per_iteration": 2.5432772636413574 }, { "auxiliary_loss_clip": 0.06423499, "auxiliary_loss_mlp": 0.01263417, "balance_loss_clip": 0.06281053, "balance_loss_mlp": 0.01253677, "epoch": 0.7006162633398467, "flos": 25309276151040.0, "grad_norm": 1.3154352177706072, "language_loss": 0.74840599, "learning_rate": 8.686831566422874e-07, "loss": 0.82527518, "num_input_tokens_seen": 251391210, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.09729004, "step": 11653, "time_per_iteration": 2.5970020294189453 }, { "auxiliary_loss_clip": 0.06428275, "auxiliary_loss_mlp": 0.01262988, "balance_loss_clip": 0.06282505, "balance_loss_mlp": 0.01252677, "epoch": 0.7006763865925146, "flos": 20675473983360.0, "grad_norm": 2.025058245018905, "language_loss": 0.71063447, "learning_rate": 8.68362012550003e-07, "loss": 0.78754705, "num_input_tokens_seen": 251411505, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.10314941, "step": 11654, "time_per_iteration": 2.5548033714294434 }, { "auxiliary_loss_clip": 0.06424189, "auxiliary_loss_mlp": 0.01267435, "balance_loss_clip": 0.062795, "balance_loss_mlp": 0.01256087, "epoch": 0.7007365098451827, "flos": 20052439349760.0, "grad_norm": 2.1608158675978433, "language_loss": 0.73403955, "learning_rate": 8.680409113695453e-07, "loss": 0.81095576, "num_input_tokens_seen": 251428975, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.11352539, "step": 11655, "time_per_iteration": 2.5572574138641357 }, { "auxiliary_loss_clip": 0.06435616, "auxiliary_loss_mlp": 0.01271672, "balance_loss_clip": 0.06284335, "balance_loss_mlp": 0.01259846, "epoch": 0.7007966330978506, "flos": 20783689931520.0, "grad_norm": 2.0603688216948575, "language_loss": 0.70875645, "learning_rate": 8.677198531130889e-07, "loss": 0.78582931, "num_input_tokens_seen": 251446940, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.1182251, "step": 11656, "time_per_iteration": 2.5674562454223633 }, { "auxiliary_loss_clip": 0.06417171, "auxiliary_loss_mlp": 0.01268141, "balance_loss_clip": 0.06276799, "balance_loss_mlp": 0.01259081, "epoch": 0.7008567563505186, "flos": 29645110800000.0, "grad_norm": 1.575071451499183, "language_loss": 0.7794075, "learning_rate": 8.673988377928092e-07, "loss": 0.8562606, "num_input_tokens_seen": 251466205, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09057617, "step": 11657, "time_per_iteration": 2.6150784492492676 }, { "auxiliary_loss_clip": 0.06430131, "auxiliary_loss_mlp": 0.01268339, "balance_loss_clip": 0.06280495, "balance_loss_mlp": 0.01257646, "epoch": 0.7009168796031865, "flos": 17097654096000.0, "grad_norm": 2.096229551970787, "language_loss": 0.784042, "learning_rate": 8.670778654208797e-07, "loss": 0.8610267, "num_input_tokens_seen": 251484820, "router_z_loss_clip": 1.49414062, "router_z_loss_mlp": 0.10693359, "step": 11658, "time_per_iteration": 2.519850492477417 }, { "auxiliary_loss_clip": 0.06420459, "auxiliary_loss_mlp": 0.01264533, "balance_loss_clip": 0.06281362, "balance_loss_mlp": 0.0125502, "epoch": 0.7009770028558545, "flos": 20455226726400.0, "grad_norm": 1.7364113637375198, "language_loss": 0.8276993, "learning_rate": 8.667569360094713e-07, "loss": 0.90454924, "num_input_tokens_seen": 251502670, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09515381, "step": 11659, "time_per_iteration": 2.5283162593841553 }, { "auxiliary_loss_clip": 0.06413605, "auxiliary_loss_mlp": 0.01265141, "balance_loss_clip": 0.06274274, "balance_loss_mlp": 0.0125589, "epoch": 0.7010371261085224, "flos": 19251225008640.0, "grad_norm": 2.1661293851671783, "language_loss": 0.70060873, "learning_rate": 8.664360495707526e-07, "loss": 0.7773962, "num_input_tokens_seen": 251521630, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09246826, "step": 11660, "time_per_iteration": 2.538942575454712 }, { "auxiliary_loss_clip": 0.06426329, "auxiliary_loss_mlp": 0.01270031, "balance_loss_clip": 0.06278709, "balance_loss_mlp": 0.01259833, "epoch": 0.7010972493611904, "flos": 22134159786240.0, "grad_norm": 1.9670074392076677, "language_loss": 0.81232154, "learning_rate": 8.661152061168924e-07, "loss": 0.88928509, "num_input_tokens_seen": 251540105, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.10205078, "step": 11661, "time_per_iteration": 2.5552992820739746 }, { "auxiliary_loss_clip": 0.06418969, "auxiliary_loss_mlp": 0.01264547, "balance_loss_clip": 0.06276311, "balance_loss_mlp": 0.01255028, "epoch": 0.7011573726138585, "flos": 31398619593600.0, "grad_norm": 1.4894192530382757, "language_loss": 0.79181099, "learning_rate": 8.657944056600579e-07, "loss": 0.86864614, "num_input_tokens_seen": 251560530, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.09527588, "step": 11662, "time_per_iteration": 2.6051809787750244 }, { "auxiliary_loss_clip": 0.06426425, "auxiliary_loss_mlp": 0.01266978, "balance_loss_clip": 0.06280083, "balance_loss_mlp": 0.01255808, "epoch": 0.7012174958665264, "flos": 18156487415040.0, "grad_norm": 1.7545497111396502, "language_loss": 0.83658308, "learning_rate": 8.654736482124134e-07, "loss": 0.91351712, "num_input_tokens_seen": 251577930, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.11169434, "step": 11663, "time_per_iteration": 4.0031373500823975 }, { "auxiliary_loss_clip": 0.06328128, "auxiliary_loss_mlp": 0.01252911, "balance_loss_clip": 0.06269845, "balance_loss_mlp": 0.01251604, "epoch": 0.7012776191191944, "flos": 60669495331200.0, "grad_norm": 0.8267701631199313, "language_loss": 0.53795129, "learning_rate": 8.651529337861209e-07, "loss": 0.61376172, "num_input_tokens_seen": 251638820, "router_z_loss_clip": 0.58251953, "router_z_loss_mlp": 0.01308441, "step": 11664, "time_per_iteration": 3.1493823528289795 }, { "auxiliary_loss_clip": 0.06423533, "auxiliary_loss_mlp": 0.01268588, "balance_loss_clip": 0.06277668, "balance_loss_mlp": 0.01257997, "epoch": 0.7013377423718623, "flos": 27205940845440.0, "grad_norm": 2.066286121510633, "language_loss": 0.7911495, "learning_rate": 8.64832262393344e-07, "loss": 0.86807066, "num_input_tokens_seen": 251658070, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.105896, "step": 11665, "time_per_iteration": 2.578765630722046 }, { "auxiliary_loss_clip": 0.06421442, "auxiliary_loss_mlp": 0.01266312, "balance_loss_clip": 0.06280668, "balance_loss_mlp": 0.01256358, "epoch": 0.7013978656245303, "flos": 16548901706880.0, "grad_norm": 2.933404055465897, "language_loss": 0.76752508, "learning_rate": 8.645116340462404e-07, "loss": 0.84440261, "num_input_tokens_seen": 251671575, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09942627, "step": 11666, "time_per_iteration": 2.5037636756896973 }, { "auxiliary_loss_clip": 0.06427772, "auxiliary_loss_mlp": 0.01266341, "balance_loss_clip": 0.06283495, "balance_loss_mlp": 0.01256387, "epoch": 0.7014579888771982, "flos": 23149625817600.0, "grad_norm": 1.9012064469736814, "language_loss": 0.81364053, "learning_rate": 8.641910487569695e-07, "loss": 0.89058173, "num_input_tokens_seen": 251689350, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.09960938, "step": 11667, "time_per_iteration": 2.5564846992492676 }, { "auxiliary_loss_clip": 0.06420137, "auxiliary_loss_mlp": 0.0126872, "balance_loss_clip": 0.06277808, "balance_loss_mlp": 0.01258396, "epoch": 0.7015181121298663, "flos": 25089028894080.0, "grad_norm": 2.498451403274689, "language_loss": 0.65758568, "learning_rate": 8.638705065376879e-07, "loss": 0.73447424, "num_input_tokens_seen": 251704635, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.10333252, "step": 11668, "time_per_iteration": 2.569389581680298 }, { "auxiliary_loss_clip": 0.06426103, "auxiliary_loss_mlp": 0.01266773, "balance_loss_clip": 0.06278953, "balance_loss_mlp": 0.01256807, "epoch": 0.7015782353825342, "flos": 23334052654080.0, "grad_norm": 2.0590184428228233, "language_loss": 0.76645154, "learning_rate": 8.635500074005519e-07, "loss": 0.84338033, "num_input_tokens_seen": 251723035, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.09967041, "step": 11669, "time_per_iteration": 2.5738492012023926 }, { "auxiliary_loss_clip": 0.06320243, "auxiliary_loss_mlp": 0.01255197, "balance_loss_clip": 0.06261633, "balance_loss_mlp": 0.01253798, "epoch": 0.7016383586352022, "flos": 70417733086080.0, "grad_norm": 0.6796433210501602, "language_loss": 0.54477501, "learning_rate": 8.632295513577122e-07, "loss": 0.62052941, "num_input_tokens_seen": 251791630, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 0.01398468, "step": 11670, "time_per_iteration": 4.706883192062378 }, { "auxiliary_loss_clip": 0.06417987, "auxiliary_loss_mlp": 0.01268732, "balance_loss_clip": 0.06276412, "balance_loss_mlp": 0.01258713, "epoch": 0.7016984818878701, "flos": 19798426097280.0, "grad_norm": 1.6564088012639289, "language_loss": 0.81864619, "learning_rate": 8.629091384213218e-07, "loss": 0.89551342, "num_input_tokens_seen": 251809840, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.10021973, "step": 11671, "time_per_iteration": 2.549285411834717 }, { "auxiliary_loss_clip": 0.06426131, "auxiliary_loss_mlp": 0.01265293, "balance_loss_clip": 0.06280144, "balance_loss_mlp": 0.01254803, "epoch": 0.7017586051405381, "flos": 12901998528000.0, "grad_norm": 1.9029520420569177, "language_loss": 0.75412613, "learning_rate": 8.625887686035313e-07, "loss": 0.83104038, "num_input_tokens_seen": 251827550, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.10491943, "step": 11672, "time_per_iteration": 2.52602481842041 }, { "auxiliary_loss_clip": 0.06422748, "auxiliary_loss_mlp": 0.01268269, "balance_loss_clip": 0.06279026, "balance_loss_mlp": 0.01256933, "epoch": 0.701818728393206, "flos": 18338734045440.0, "grad_norm": 1.7435202293186904, "language_loss": 0.86985278, "learning_rate": 8.622684419164883e-07, "loss": 0.94676298, "num_input_tokens_seen": 251844880, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.11340332, "step": 11673, "time_per_iteration": 2.5370635986328125 }, { "auxiliary_loss_clip": 0.06419439, "auxiliary_loss_mlp": 0.01269886, "balance_loss_clip": 0.06279096, "balance_loss_mlp": 0.01259515, "epoch": 0.701878851645874, "flos": 17389961464320.0, "grad_norm": 1.801548863429534, "language_loss": 0.73385191, "learning_rate": 8.619481583723399e-07, "loss": 0.81074512, "num_input_tokens_seen": 251861025, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.10375977, "step": 11674, "time_per_iteration": 2.4975762367248535 }, { "auxiliary_loss_clip": 0.06418456, "auxiliary_loss_mlp": 0.0126649, "balance_loss_clip": 0.06278529, "balance_loss_mlp": 0.01256727, "epoch": 0.701938974898542, "flos": 23922398897280.0, "grad_norm": 1.7374678040407203, "language_loss": 0.72348261, "learning_rate": 8.616279179832329e-07, "loss": 0.80033207, "num_input_tokens_seen": 251880175, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09771729, "step": 11675, "time_per_iteration": 2.56727933883667 }, { "auxiliary_loss_clip": 0.06425697, "auxiliary_loss_mlp": 0.01268674, "balance_loss_clip": 0.06280395, "balance_loss_mlp": 0.01258303, "epoch": 0.70199909815121, "flos": 21801503877120.0, "grad_norm": 2.057043492371745, "language_loss": 0.51403153, "learning_rate": 8.613077207613078e-07, "loss": 0.59097528, "num_input_tokens_seen": 251899005, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.10388184, "step": 11676, "time_per_iteration": 2.5503084659576416 }, { "auxiliary_loss_clip": 0.06318352, "auxiliary_loss_mlp": 0.01253805, "balance_loss_clip": 0.06260015, "balance_loss_mlp": 0.01252457, "epoch": 0.702059221403878, "flos": 71736575224320.0, "grad_norm": 0.7242715928001012, "language_loss": 0.59173715, "learning_rate": 8.609875667187079e-07, "loss": 0.66745877, "num_input_tokens_seen": 251966790, "router_z_loss_clip": 0.58496094, "router_z_loss_mlp": 0.01350403, "step": 11677, "time_per_iteration": 3.222733736038208 }, { "auxiliary_loss_clip": 0.06423231, "auxiliary_loss_mlp": 0.0126561, "balance_loss_clip": 0.06278154, "balance_loss_mlp": 0.012554, "epoch": 0.7021193446565459, "flos": 28118599516800.0, "grad_norm": 2.012910934508799, "language_loss": 0.62682492, "learning_rate": 8.606674558675737e-07, "loss": 0.70371336, "num_input_tokens_seen": 251989315, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.10211182, "step": 11678, "time_per_iteration": 2.6381778717041016 }, { "auxiliary_loss_clip": 0.06419639, "auxiliary_loss_mlp": 0.01267387, "balance_loss_clip": 0.06278532, "balance_loss_mlp": 0.01257081, "epoch": 0.7021794679092139, "flos": 22930720225920.0, "grad_norm": 3.780324747087741, "language_loss": 0.79509091, "learning_rate": 8.603473882200444e-07, "loss": 0.87196112, "num_input_tokens_seen": 252006620, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.10302734, "step": 11679, "time_per_iteration": 2.5798940658569336 }, { "auxiliary_loss_clip": 0.06418078, "auxiliary_loss_mlp": 0.01263582, "balance_loss_clip": 0.06277639, "balance_loss_mlp": 0.01254606, "epoch": 0.7022395911618818, "flos": 18083756471040.0, "grad_norm": 2.6981755081026337, "language_loss": 0.70606416, "learning_rate": 8.600273637882567e-07, "loss": 0.78288072, "num_input_tokens_seen": 252024570, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.08990479, "step": 11680, "time_per_iteration": 3.9730756282806396 }, { "auxiliary_loss_clip": 0.06426324, "auxiliary_loss_mlp": 0.01266639, "balance_loss_clip": 0.06279077, "balance_loss_mlp": 0.01256238, "epoch": 0.7022997144145499, "flos": 16039827025920.0, "grad_norm": 1.6808711193093813, "language_loss": 0.75197023, "learning_rate": 8.597073825843446e-07, "loss": 0.82889986, "num_input_tokens_seen": 252042775, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.10400391, "step": 11681, "time_per_iteration": 2.5291638374328613 }, { "auxiliary_loss_clip": 0.06419529, "auxiliary_loss_mlp": 0.01265039, "balance_loss_clip": 0.06277193, "balance_loss_mlp": 0.01255348, "epoch": 0.7023598376672178, "flos": 26475864220800.0, "grad_norm": 1.7616421806188451, "language_loss": 0.77004778, "learning_rate": 8.593874446204434e-07, "loss": 0.84689355, "num_input_tokens_seen": 252063690, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09692383, "step": 11682, "time_per_iteration": 2.645090103149414 }, { "auxiliary_loss_clip": 0.06426082, "auxiliary_loss_mlp": 0.01266918, "balance_loss_clip": 0.06277967, "balance_loss_mlp": 0.01255421, "epoch": 0.7024199609198858, "flos": 17061624040320.0, "grad_norm": 2.1602984818286344, "language_loss": 0.7371192, "learning_rate": 8.590675499086841e-07, "loss": 0.81404924, "num_input_tokens_seen": 252080335, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.11486816, "step": 11683, "time_per_iteration": 2.5326528549194336 }, { "auxiliary_loss_clip": 0.06425963, "auxiliary_loss_mlp": 0.01268519, "balance_loss_clip": 0.06283925, "balance_loss_mlp": 0.01257158, "epoch": 0.7024800841725537, "flos": 25856225677440.0, "grad_norm": 2.020161669688035, "language_loss": 0.71639556, "learning_rate": 8.587476984611976e-07, "loss": 0.79334038, "num_input_tokens_seen": 252101075, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.11358643, "step": 11684, "time_per_iteration": 3.991097927093506 }, { "auxiliary_loss_clip": 0.06417795, "auxiliary_loss_mlp": 0.01268892, "balance_loss_clip": 0.06274316, "balance_loss_mlp": 0.01257793, "epoch": 0.7025402074252217, "flos": 23519653447680.0, "grad_norm": 2.017739949255569, "language_loss": 0.72267818, "learning_rate": 8.584278902901128e-07, "loss": 0.79954505, "num_input_tokens_seen": 252120510, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.11096191, "step": 11685, "time_per_iteration": 2.5549216270446777 }, { "auxiliary_loss_clip": 0.06420392, "auxiliary_loss_mlp": 0.01268143, "balance_loss_clip": 0.06276029, "balance_loss_mlp": 0.01258058, "epoch": 0.7026003306778896, "flos": 20156169104640.0, "grad_norm": 2.059453652809267, "language_loss": 0.84757382, "learning_rate": 8.581081254075582e-07, "loss": 0.92445922, "num_input_tokens_seen": 252137590, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10095215, "step": 11686, "time_per_iteration": 2.535184860229492 }, { "auxiliary_loss_clip": 0.06315318, "auxiliary_loss_mlp": 0.01250577, "balance_loss_clip": 0.06257191, "balance_loss_mlp": 0.01249186, "epoch": 0.7026604539305576, "flos": 64791036362880.0, "grad_norm": 0.9719322906574626, "language_loss": 0.69966656, "learning_rate": 8.577884038256566e-07, "loss": 0.77532554, "num_input_tokens_seen": 252199830, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.01392365, "step": 11687, "time_per_iteration": 3.306565761566162 }, { "auxiliary_loss_clip": 0.064211, "auxiliary_loss_mlp": 0.01266755, "balance_loss_clip": 0.0627729, "balance_loss_mlp": 0.01257075, "epoch": 0.7027205771832256, "flos": 21877882473600.0, "grad_norm": 2.028714410422561, "language_loss": 0.77368653, "learning_rate": 8.574687255565329e-07, "loss": 0.85056508, "num_input_tokens_seen": 252217200, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.09680176, "step": 11688, "time_per_iteration": 2.5407040119171143 }, { "auxiliary_loss_clip": 0.06420054, "auxiliary_loss_mlp": 0.01264243, "balance_loss_clip": 0.06277506, "balance_loss_mlp": 0.01253949, "epoch": 0.7027807004358936, "flos": 23374526976000.0, "grad_norm": 2.690689733434486, "language_loss": 0.68215394, "learning_rate": 8.571490906123107e-07, "loss": 0.75899696, "num_input_tokens_seen": 252236105, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.10290527, "step": 11689, "time_per_iteration": 2.5471956729888916 }, { "auxiliary_loss_clip": 0.06424963, "auxiliary_loss_mlp": 0.01271634, "balance_loss_clip": 0.06277999, "balance_loss_mlp": 0.01259314, "epoch": 0.7028408236885616, "flos": 15309624620160.0, "grad_norm": 2.022681730582381, "language_loss": 0.79775012, "learning_rate": 8.568294990051086e-07, "loss": 0.87471604, "num_input_tokens_seen": 252253315, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.12322998, "step": 11690, "time_per_iteration": 2.5456395149230957 }, { "auxiliary_loss_clip": 0.06422134, "auxiliary_loss_mlp": 0.01267644, "balance_loss_clip": 0.06279981, "balance_loss_mlp": 0.01257136, "epoch": 0.7029009469412295, "flos": 22024769880960.0, "grad_norm": 1.6019295486117573, "language_loss": 0.76231259, "learning_rate": 8.56509950747047e-07, "loss": 0.83921039, "num_input_tokens_seen": 252272765, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.1050415, "step": 11691, "time_per_iteration": 2.555204153060913 }, { "auxiliary_loss_clip": 0.06421255, "auxiliary_loss_mlp": 0.01263814, "balance_loss_clip": 0.06280823, "balance_loss_mlp": 0.01253758, "epoch": 0.7029610701938975, "flos": 21842020126080.0, "grad_norm": 1.7930533293929185, "language_loss": 0.82313204, "learning_rate": 8.561904458502429e-07, "loss": 0.89998269, "num_input_tokens_seen": 252290510, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.1005249, "step": 11692, "time_per_iteration": 2.5684216022491455 }, { "auxiliary_loss_clip": 0.06415872, "auxiliary_loss_mlp": 0.01265915, "balance_loss_clip": 0.06273101, "balance_loss_mlp": 0.0125487, "epoch": 0.7030211934465654, "flos": 19141709322240.0, "grad_norm": 1.7045572045667545, "language_loss": 0.76665127, "learning_rate": 8.558709843268111e-07, "loss": 0.84346914, "num_input_tokens_seen": 252309365, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.11035156, "step": 11693, "time_per_iteration": 2.5809707641601562 }, { "auxiliary_loss_clip": 0.06418154, "auxiliary_loss_mlp": 0.01267848, "balance_loss_clip": 0.06277157, "balance_loss_mlp": 0.01257513, "epoch": 0.7030813166992335, "flos": 38555307544320.0, "grad_norm": 1.57614093786438, "language_loss": 0.68365347, "learning_rate": 8.55551566188866e-07, "loss": 0.76051342, "num_input_tokens_seen": 252333010, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.10345459, "step": 11694, "time_per_iteration": 2.6990675926208496 }, { "auxiliary_loss_clip": 0.06424028, "auxiliary_loss_mlp": 0.01265618, "balance_loss_clip": 0.06279173, "balance_loss_mlp": 0.01255688, "epoch": 0.7031414399519014, "flos": 14726225767680.0, "grad_norm": 2.0595415704996443, "language_loss": 0.76460755, "learning_rate": 8.552321914485203e-07, "loss": 0.84150398, "num_input_tokens_seen": 252351330, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.0993042, "step": 11695, "time_per_iteration": 2.5613319873809814 }, { "auxiliary_loss_clip": 0.0643231, "auxiliary_loss_mlp": 0.01268019, "balance_loss_clip": 0.06285563, "balance_loss_mlp": 0.01257045, "epoch": 0.7032015632045694, "flos": 14032388833920.0, "grad_norm": 1.9800541618824317, "language_loss": 0.7365337, "learning_rate": 8.549128601178852e-07, "loss": 0.81353694, "num_input_tokens_seen": 252369580, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.10974121, "step": 11696, "time_per_iteration": 2.7319459915161133 }, { "auxiliary_loss_clip": 0.06421749, "auxiliary_loss_mlp": 0.01266527, "balance_loss_clip": 0.0627775, "balance_loss_mlp": 0.01255798, "epoch": 0.7032616864572373, "flos": 27644716350720.0, "grad_norm": 1.535721903236956, "language_loss": 0.75433397, "learning_rate": 8.545935722090693e-07, "loss": 0.83121669, "num_input_tokens_seen": 252390525, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.1072998, "step": 11697, "time_per_iteration": 2.878777027130127 }, { "auxiliary_loss_clip": 0.06421528, "auxiliary_loss_mlp": 0.01268569, "balance_loss_clip": 0.06276456, "balance_loss_mlp": 0.01256707, "epoch": 0.7033218097099053, "flos": 17973024900480.0, "grad_norm": 1.8398588728472998, "language_loss": 0.80473286, "learning_rate": 8.542743277341793e-07, "loss": 0.88163382, "num_input_tokens_seen": 252407470, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.11859131, "step": 11698, "time_per_iteration": 2.5624725818634033 }, { "auxiliary_loss_clip": 0.06419314, "auxiliary_loss_mlp": 0.01266455, "balance_loss_clip": 0.06273615, "balance_loss_mlp": 0.0125606, "epoch": 0.7033819329625732, "flos": 19508047372800.0, "grad_norm": 1.4252653375251898, "language_loss": 0.84608722, "learning_rate": 8.539551267053222e-07, "loss": 0.9229449, "num_input_tokens_seen": 252427025, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.10394287, "step": 11699, "time_per_iteration": 2.5694828033447266 }, { "auxiliary_loss_clip": 0.06416667, "auxiliary_loss_mlp": 0.01266832, "balance_loss_clip": 0.06276271, "balance_loss_mlp": 0.01256354, "epoch": 0.7034420562152413, "flos": 23994417081600.0, "grad_norm": 1.9001859800706398, "language_loss": 0.79319406, "learning_rate": 8.53635969134601e-07, "loss": 0.87002897, "num_input_tokens_seen": 252445410, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.1048584, "step": 11700, "time_per_iteration": 2.570707321166992 }, { "auxiliary_loss_clip": 0.06424876, "auxiliary_loss_mlp": 0.01264378, "balance_loss_clip": 0.06278524, "balance_loss_mlp": 0.01253416, "epoch": 0.7035021794679092, "flos": 35052147244800.0, "grad_norm": 1.7214882561666287, "language_loss": 0.75009573, "learning_rate": 8.533168550341186e-07, "loss": 0.82698828, "num_input_tokens_seen": 252463905, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.10961914, "step": 11701, "time_per_iteration": 2.665794610977173 }, { "auxiliary_loss_clip": 0.06426702, "auxiliary_loss_mlp": 0.01265552, "balance_loss_clip": 0.06280409, "balance_loss_mlp": 0.01254513, "epoch": 0.7035623027205772, "flos": 11001811962240.0, "grad_norm": 2.1724563899767335, "language_loss": 0.84172529, "learning_rate": 8.529977844159769e-07, "loss": 0.91864777, "num_input_tokens_seen": 252478655, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.1104126, "step": 11702, "time_per_iteration": 3.995469808578491 }, { "auxiliary_loss_clip": 0.06421162, "auxiliary_loss_mlp": 0.01265941, "balance_loss_clip": 0.06275591, "balance_loss_mlp": 0.01256279, "epoch": 0.7036224259732452, "flos": 23630594653440.0, "grad_norm": 1.5229696384081668, "language_loss": 0.6100921, "learning_rate": 8.526787572922738e-07, "loss": 0.68696314, "num_input_tokens_seen": 252498740, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.09661865, "step": 11703, "time_per_iteration": 2.569340705871582 }, { "auxiliary_loss_clip": 0.06423078, "auxiliary_loss_mlp": 0.01268183, "balance_loss_clip": 0.06277186, "balance_loss_mlp": 0.01257359, "epoch": 0.7036825492259131, "flos": 31694239198080.0, "grad_norm": 2.027055963204299, "language_loss": 0.6128819, "learning_rate": 8.523597736751067e-07, "loss": 0.68979442, "num_input_tokens_seen": 252517800, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10827637, "step": 11704, "time_per_iteration": 2.651242971420288 }, { "auxiliary_loss_clip": 0.06414817, "auxiliary_loss_mlp": 0.01268058, "balance_loss_clip": 0.06274191, "balance_loss_mlp": 0.01258348, "epoch": 0.7037426724785811, "flos": 30201116567040.0, "grad_norm": 1.639451577320681, "language_loss": 0.70966768, "learning_rate": 8.520408335765719e-07, "loss": 0.7864964, "num_input_tokens_seen": 252539620, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09710693, "step": 11705, "time_per_iteration": 2.6313090324401855 }, { "auxiliary_loss_clip": 0.06417467, "auxiliary_loss_mlp": 0.01264874, "balance_loss_clip": 0.06276202, "balance_loss_mlp": 0.01254717, "epoch": 0.703802795731249, "flos": 24317597479680.0, "grad_norm": 1.9097161620665286, "language_loss": 0.62305224, "learning_rate": 8.517219370087645e-07, "loss": 0.69987565, "num_input_tokens_seen": 252557300, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.10150146, "step": 11706, "time_per_iteration": 2.5571041107177734 }, { "auxiliary_loss_clip": 0.06420571, "auxiliary_loss_mlp": 0.01265753, "balance_loss_clip": 0.06275149, "balance_loss_mlp": 0.01254798, "epoch": 0.7038629189839171, "flos": 22535605497600.0, "grad_norm": 1.9430542020415944, "language_loss": 0.68239319, "learning_rate": 8.514030839837756e-07, "loss": 0.75925642, "num_input_tokens_seen": 252576715, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.10955811, "step": 11707, "time_per_iteration": 2.5428786277770996 }, { "auxiliary_loss_clip": 0.06417191, "auxiliary_loss_mlp": 0.01268465, "balance_loss_clip": 0.06275798, "balance_loss_mlp": 0.01258184, "epoch": 0.703923042236585, "flos": 26257755242880.0, "grad_norm": 1.8897472768262333, "language_loss": 0.76718462, "learning_rate": 8.510842745136974e-07, "loss": 0.84404111, "num_input_tokens_seen": 252596190, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.10272217, "step": 11708, "time_per_iteration": 2.594571590423584 }, { "auxiliary_loss_clip": 0.06419632, "auxiliary_loss_mlp": 0.01262436, "balance_loss_clip": 0.06278624, "balance_loss_mlp": 0.01252923, "epoch": 0.703983165489253, "flos": 19396225699200.0, "grad_norm": 1.7154267143987187, "language_loss": 0.72524667, "learning_rate": 8.50765508610619e-07, "loss": 0.80206728, "num_input_tokens_seen": 252613410, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09509277, "step": 11709, "time_per_iteration": 3.991274833679199 }, { "auxiliary_loss_clip": 0.06420308, "auxiliary_loss_mlp": 0.01267412, "balance_loss_clip": 0.06278248, "balance_loss_mlp": 0.01257845, "epoch": 0.7040432887419209, "flos": 16688032611840.0, "grad_norm": 2.3870976931819925, "language_loss": 0.79441559, "learning_rate": 8.504467862866267e-07, "loss": 0.87129283, "num_input_tokens_seen": 252629150, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09564209, "step": 11710, "time_per_iteration": 2.5258727073669434 }, { "auxiliary_loss_clip": 0.06425275, "auxiliary_loss_mlp": 0.01265291, "balance_loss_clip": 0.06281156, "balance_loss_mlp": 0.01254831, "epoch": 0.7041034119945889, "flos": 21147638140800.0, "grad_norm": 1.6534262729273081, "language_loss": 0.77564281, "learning_rate": 8.501281075538076e-07, "loss": 0.85254848, "num_input_tokens_seen": 252648225, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10461426, "step": 11711, "time_per_iteration": 2.5408637523651123 }, { "auxiliary_loss_clip": 0.06425463, "auxiliary_loss_mlp": 0.01267122, "balance_loss_clip": 0.06284525, "balance_loss_mlp": 0.01257007, "epoch": 0.7041635352472568, "flos": 16916036371200.0, "grad_norm": 2.229423252476614, "language_loss": 0.74166965, "learning_rate": 8.498094724242457e-07, "loss": 0.81859553, "num_input_tokens_seen": 252665380, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.10107422, "step": 11712, "time_per_iteration": 2.5381553173065186 }, { "auxiliary_loss_clip": 0.06324339, "auxiliary_loss_mlp": 0.0124947, "balance_loss_clip": 0.06266277, "balance_loss_mlp": 0.01248176, "epoch": 0.7042236584999249, "flos": 71703186572160.0, "grad_norm": 0.8578661150247059, "language_loss": 0.64634621, "learning_rate": 8.494908809100247e-07, "loss": 0.72208428, "num_input_tokens_seen": 252727950, "router_z_loss_clip": 0.58154297, "router_z_loss_mlp": 0.01294708, "step": 11713, "time_per_iteration": 3.2130236625671387 }, { "auxiliary_loss_clip": 0.06418432, "auxiliary_loss_mlp": 0.01263736, "balance_loss_clip": 0.06276491, "balance_loss_mlp": 0.01254199, "epoch": 0.7042837817525928, "flos": 28665800605440.0, "grad_norm": 1.8604508912011433, "language_loss": 0.72964096, "learning_rate": 8.49172333023225e-07, "loss": 0.80646265, "num_input_tokens_seen": 252746770, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09539795, "step": 11714, "time_per_iteration": 2.6180734634399414 }, { "auxiliary_loss_clip": 0.06420538, "auxiliary_loss_mlp": 0.01268532, "balance_loss_clip": 0.06278495, "balance_loss_mlp": 0.01257386, "epoch": 0.7043439050052608, "flos": 19759335367680.0, "grad_norm": 1.855800625248294, "language_loss": 0.801736, "learning_rate": 8.488538287759248e-07, "loss": 0.8786267, "num_input_tokens_seen": 252765610, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.1114502, "step": 11715, "time_per_iteration": 2.5630033016204834 }, { "auxiliary_loss_clip": 0.06425019, "auxiliary_loss_mlp": 0.01271486, "balance_loss_clip": 0.06279996, "balance_loss_mlp": 0.0126081, "epoch": 0.7044040282579288, "flos": 11541969308160.0, "grad_norm": 2.0869398837447752, "language_loss": 0.7144447, "learning_rate": 8.485353681802037e-07, "loss": 0.79140973, "num_input_tokens_seen": 252781610, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.10662842, "step": 11716, "time_per_iteration": 2.5434062480926514 }, { "auxiliary_loss_clip": 0.06432015, "auxiliary_loss_mlp": 0.01266902, "balance_loss_clip": 0.06284645, "balance_loss_mlp": 0.01255816, "epoch": 0.7044641515105967, "flos": 33664473377280.0, "grad_norm": 2.0533786579221984, "language_loss": 0.66684341, "learning_rate": 8.482169512481358e-07, "loss": 0.74383259, "num_input_tokens_seen": 252800600, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.11083984, "step": 11717, "time_per_iteration": 2.639310598373413 }, { "auxiliary_loss_clip": 0.06423268, "auxiliary_loss_mlp": 0.01267027, "balance_loss_clip": 0.06279524, "balance_loss_mlp": 0.01256715, "epoch": 0.7045242747632647, "flos": 26731051430400.0, "grad_norm": 1.4008719631442648, "language_loss": 0.74638367, "learning_rate": 8.478985779917967e-07, "loss": 0.82328653, "num_input_tokens_seen": 252822310, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.10308838, "step": 11718, "time_per_iteration": 2.5919086933135986 }, { "auxiliary_loss_clip": 0.06422155, "auxiliary_loss_mlp": 0.01265748, "balance_loss_clip": 0.06281129, "balance_loss_mlp": 0.01255931, "epoch": 0.7045843980159326, "flos": 26804998258560.0, "grad_norm": 1.5467209309330217, "language_loss": 0.80117202, "learning_rate": 8.475802484232606e-07, "loss": 0.87805104, "num_input_tokens_seen": 252842355, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09814453, "step": 11719, "time_per_iteration": 4.054445743560791 }, { "auxiliary_loss_clip": 0.0642622, "auxiliary_loss_mlp": 0.01265012, "balance_loss_clip": 0.06283475, "balance_loss_mlp": 0.01254373, "epoch": 0.7046445212686007, "flos": 41584710458880.0, "grad_norm": 1.6657686771574245, "language_loss": 0.65964198, "learning_rate": 8.472619625545951e-07, "loss": 0.73655432, "num_input_tokens_seen": 252866785, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10638428, "step": 11720, "time_per_iteration": 2.721766710281372 }, { "auxiliary_loss_clip": 0.0643774, "auxiliary_loss_mlp": 0.01264272, "balance_loss_clip": 0.06287668, "balance_loss_mlp": 0.01253763, "epoch": 0.7047046445212686, "flos": 15565650370560.0, "grad_norm": 1.9804762008952526, "language_loss": 0.80000675, "learning_rate": 8.46943720397872e-07, "loss": 0.87702686, "num_input_tokens_seen": 252881870, "router_z_loss_clip": 1.50097656, "router_z_loss_mlp": 0.10510254, "step": 11721, "time_per_iteration": 2.534616231918335 }, { "auxiliary_loss_clip": 0.06323805, "auxiliary_loss_mlp": 0.0125276, "balance_loss_clip": 0.0626591, "balance_loss_mlp": 0.01251454, "epoch": 0.7047647677739366, "flos": 70433036455680.0, "grad_norm": 0.7712750664733007, "language_loss": 0.64754105, "learning_rate": 8.466255219651582e-07, "loss": 0.72330666, "num_input_tokens_seen": 252951300, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.01306915, "step": 11722, "time_per_iteration": 3.3250746726989746 }, { "auxiliary_loss_clip": 0.06421354, "auxiliary_loss_mlp": 0.01266325, "balance_loss_clip": 0.06279628, "balance_loss_mlp": 0.0125621, "epoch": 0.7048248910266045, "flos": 23666876271360.0, "grad_norm": 3.481881876018049, "language_loss": 0.66054159, "learning_rate": 8.463073672685211e-07, "loss": 0.73741841, "num_input_tokens_seen": 252971400, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.10113525, "step": 11723, "time_per_iteration": 4.112760066986084 }, { "auxiliary_loss_clip": 0.06425076, "auxiliary_loss_mlp": 0.01266178, "balance_loss_clip": 0.0627985, "balance_loss_mlp": 0.01255484, "epoch": 0.7048850142792725, "flos": 21403496183040.0, "grad_norm": 1.699941777716038, "language_loss": 0.80755317, "learning_rate": 8.459892563200235e-07, "loss": 0.88446575, "num_input_tokens_seen": 252989475, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.10687256, "step": 11724, "time_per_iteration": 2.563779592514038 }, { "auxiliary_loss_clip": 0.06425464, "auxiliary_loss_mlp": 0.01262983, "balance_loss_clip": 0.06279469, "balance_loss_mlp": 0.01252325, "epoch": 0.7049451375319404, "flos": 21653736001920.0, "grad_norm": 1.8357649090358965, "language_loss": 0.72982937, "learning_rate": 8.456711891317296e-07, "loss": 0.80671382, "num_input_tokens_seen": 253007220, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.10668945, "step": 11725, "time_per_iteration": 2.5700933933258057 }, { "auxiliary_loss_clip": 0.06430411, "auxiliary_loss_mlp": 0.01267218, "balance_loss_clip": 0.06283279, "balance_loss_mlp": 0.01256561, "epoch": 0.7050052607846085, "flos": 14872148853120.0, "grad_norm": 1.9722783416180751, "language_loss": 0.77940291, "learning_rate": 8.453531657156998e-07, "loss": 0.85637915, "num_input_tokens_seen": 253025410, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.10656738, "step": 11726, "time_per_iteration": 2.533172845840454 }, { "auxiliary_loss_clip": 0.06425557, "auxiliary_loss_mlp": 0.01267618, "balance_loss_clip": 0.06283134, "balance_loss_mlp": 0.01257735, "epoch": 0.7050653840372764, "flos": 19247283866880.0, "grad_norm": 1.8612894502808552, "language_loss": 0.70691347, "learning_rate": 8.450351860839931e-07, "loss": 0.78384519, "num_input_tokens_seen": 253043305, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09881592, "step": 11727, "time_per_iteration": 2.558987855911255 }, { "auxiliary_loss_clip": 0.06413224, "auxiliary_loss_mlp": 0.01266962, "balance_loss_clip": 0.06277968, "balance_loss_mlp": 0.01258176, "epoch": 0.7051255072899444, "flos": 27787536835200.0, "grad_norm": 1.571512157290353, "language_loss": 0.68853533, "learning_rate": 8.44717250248668e-07, "loss": 0.76533717, "num_input_tokens_seen": 253062790, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.08789062, "step": 11728, "time_per_iteration": 2.5990302562713623 }, { "auxiliary_loss_clip": 0.06423648, "auxiliary_loss_mlp": 0.01262968, "balance_loss_clip": 0.06281845, "balance_loss_mlp": 0.01253295, "epoch": 0.7051856305426124, "flos": 27899526216960.0, "grad_norm": 1.6574327200570529, "language_loss": 0.73032629, "learning_rate": 8.443993582217803e-07, "loss": 0.80719244, "num_input_tokens_seen": 253082055, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09674072, "step": 11729, "time_per_iteration": 2.6144917011260986 }, { "auxiliary_loss_clip": 0.06434345, "auxiliary_loss_mlp": 0.01268132, "balance_loss_clip": 0.06283316, "balance_loss_mlp": 0.01257147, "epoch": 0.7052457537952803, "flos": 25050147799680.0, "grad_norm": 1.5279583684427898, "language_loss": 0.77794832, "learning_rate": 8.440815100153862e-07, "loss": 0.85497308, "num_input_tokens_seen": 253102575, "router_z_loss_clip": 1.50976562, "router_z_loss_mlp": 0.10992432, "step": 11730, "time_per_iteration": 2.596283197402954 }, { "auxiliary_loss_clip": 0.06423937, "auxiliary_loss_mlp": 0.01266293, "balance_loss_clip": 0.06277584, "balance_loss_mlp": 0.0125594, "epoch": 0.7053058770479483, "flos": 21878175962880.0, "grad_norm": 2.0819034991169647, "language_loss": 0.6356504, "learning_rate": 8.437637056415359e-07, "loss": 0.71255273, "num_input_tokens_seen": 253121290, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.10351562, "step": 11731, "time_per_iteration": 2.5443804264068604 }, { "auxiliary_loss_clip": 0.06429575, "auxiliary_loss_mlp": 0.01269987, "balance_loss_clip": 0.06283547, "balance_loss_mlp": 0.01259336, "epoch": 0.7053660003006162, "flos": 16404236432640.0, "grad_norm": 1.883301128228481, "language_loss": 0.74307424, "learning_rate": 8.434459451122815e-07, "loss": 0.82006991, "num_input_tokens_seen": 253139720, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10644531, "step": 11732, "time_per_iteration": 2.533090353012085 }, { "auxiliary_loss_clip": 0.06420103, "auxiliary_loss_mlp": 0.01266608, "balance_loss_clip": 0.06280215, "balance_loss_mlp": 0.01256815, "epoch": 0.7054261235532843, "flos": 22718271398400.0, "grad_norm": 1.3780784502127787, "language_loss": 0.7108829, "learning_rate": 8.431282284396735e-07, "loss": 0.78775001, "num_input_tokens_seen": 253160250, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09790039, "step": 11733, "time_per_iteration": 2.586733102798462 }, { "auxiliary_loss_clip": 0.06421255, "auxiliary_loss_mlp": 0.01268023, "balance_loss_clip": 0.06279072, "balance_loss_mlp": 0.01257759, "epoch": 0.7054862468059522, "flos": 13594829212800.0, "grad_norm": 1.7758221341397729, "language_loss": 0.73500425, "learning_rate": 8.428105556357583e-07, "loss": 0.81189704, "num_input_tokens_seen": 253178710, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.1026001, "step": 11734, "time_per_iteration": 2.52107572555542 }, { "auxiliary_loss_clip": 0.06434888, "auxiliary_loss_mlp": 0.01271352, "balance_loss_clip": 0.06284665, "balance_loss_mlp": 0.01260235, "epoch": 0.7055463700586202, "flos": 15884931553920.0, "grad_norm": 2.3219525527761458, "language_loss": 0.69359118, "learning_rate": 8.424929267125829e-07, "loss": 0.77065361, "num_input_tokens_seen": 253194805, "router_z_loss_clip": 1.50292969, "router_z_loss_mlp": 0.11114502, "step": 11735, "time_per_iteration": 2.542799472808838 }, { "auxiliary_loss_clip": 0.06425905, "auxiliary_loss_mlp": 0.01269508, "balance_loss_clip": 0.06280086, "balance_loss_mlp": 0.01258237, "epoch": 0.7056064933112881, "flos": 23082890440320.0, "grad_norm": 1.6058682067701577, "language_loss": 0.72263432, "learning_rate": 8.421753416821933e-07, "loss": 0.79958844, "num_input_tokens_seen": 253213895, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.11279297, "step": 11736, "time_per_iteration": 2.5879294872283936 }, { "auxiliary_loss_clip": 0.06423691, "auxiliary_loss_mlp": 0.01264667, "balance_loss_clip": 0.06284766, "balance_loss_mlp": 0.01255124, "epoch": 0.7056666165639561, "flos": 24063374592000.0, "grad_norm": 1.7198267849771527, "language_loss": 0.68742871, "learning_rate": 8.41857800556629e-07, "loss": 0.76431227, "num_input_tokens_seen": 253231620, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09545898, "step": 11737, "time_per_iteration": 2.6122398376464844 }, { "auxiliary_loss_clip": 0.06427561, "auxiliary_loss_mlp": 0.01267211, "balance_loss_clip": 0.06283569, "balance_loss_mlp": 0.01256202, "epoch": 0.705726739816624, "flos": 17498932099200.0, "grad_norm": 1.9780559068736134, "language_loss": 0.67718279, "learning_rate": 8.415403033479332e-07, "loss": 0.75413048, "num_input_tokens_seen": 253249590, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.11016846, "step": 11738, "time_per_iteration": 2.571038007736206 }, { "auxiliary_loss_clip": 0.06424393, "auxiliary_loss_mlp": 0.01266467, "balance_loss_clip": 0.06281444, "balance_loss_mlp": 0.01255607, "epoch": 0.7057868630692921, "flos": 51361515256320.0, "grad_norm": 1.593677506016029, "language_loss": 0.7540983, "learning_rate": 8.41222850068145e-07, "loss": 0.83100688, "num_input_tokens_seen": 253273870, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10858154, "step": 11739, "time_per_iteration": 2.8168423175811768 }, { "auxiliary_loss_clip": 0.0642258, "auxiliary_loss_mlp": 0.0126296, "balance_loss_clip": 0.06282448, "balance_loss_mlp": 0.01251999, "epoch": 0.70584698632196, "flos": 26109945440640.0, "grad_norm": 1.6473337755757356, "language_loss": 0.71883416, "learning_rate": 8.409054407293032e-07, "loss": 0.79568952, "num_input_tokens_seen": 253293720, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.10961914, "step": 11740, "time_per_iteration": 2.622230291366577 }, { "auxiliary_loss_clip": 0.06419963, "auxiliary_loss_mlp": 0.01268154, "balance_loss_clip": 0.06278975, "balance_loss_mlp": 0.01258361, "epoch": 0.705907109574628, "flos": 21549503122560.0, "grad_norm": 2.417468836835572, "language_loss": 0.82138497, "learning_rate": 8.405880753434434e-07, "loss": 0.89826614, "num_input_tokens_seen": 253313700, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09796143, "step": 11741, "time_per_iteration": 2.549564838409424 }, { "auxiliary_loss_clip": 0.06424814, "auxiliary_loss_mlp": 0.01264667, "balance_loss_clip": 0.06281401, "balance_loss_mlp": 0.0125392, "epoch": 0.705967232827296, "flos": 22717432857600.0, "grad_norm": 2.1678048181495524, "language_loss": 0.77813256, "learning_rate": 8.402707539225993e-07, "loss": 0.85502732, "num_input_tokens_seen": 253332425, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.10742188, "step": 11742, "time_per_iteration": 4.078226804733276 }, { "auxiliary_loss_clip": 0.06431597, "auxiliary_loss_mlp": 0.01267696, "balance_loss_clip": 0.06282535, "balance_loss_mlp": 0.01256782, "epoch": 0.7060273560799639, "flos": 28698266862720.0, "grad_norm": 1.4844064453069585, "language_loss": 0.64570838, "learning_rate": 8.39953476478805e-07, "loss": 0.72270131, "num_input_tokens_seen": 253353620, "router_z_loss_clip": 1.49023438, "router_z_loss_mlp": 0.10913086, "step": 11743, "time_per_iteration": 2.6834816932678223 }, { "auxiliary_loss_clip": 0.06427722, "auxiliary_loss_mlp": 0.01269242, "balance_loss_clip": 0.06280142, "balance_loss_mlp": 0.01258055, "epoch": 0.7060874793326319, "flos": 15711699237120.0, "grad_norm": 1.9369968323963387, "language_loss": 0.66185319, "learning_rate": 8.396362430240902e-07, "loss": 0.73882282, "num_input_tokens_seen": 253370930, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.11181641, "step": 11744, "time_per_iteration": 2.720139980316162 }, { "auxiliary_loss_clip": 0.06426705, "auxiliary_loss_mlp": 0.01269842, "balance_loss_clip": 0.06286401, "balance_loss_mlp": 0.01259161, "epoch": 0.7061476025852998, "flos": 21513137650560.0, "grad_norm": 1.7235562268821734, "language_loss": 0.6380533, "learning_rate": 8.393190535704857e-07, "loss": 0.71501875, "num_input_tokens_seen": 253389810, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.10687256, "step": 11745, "time_per_iteration": 2.560828685760498 }, { "auxiliary_loss_clip": 0.06421569, "auxiliary_loss_mlp": 0.01263166, "balance_loss_clip": 0.06277496, "balance_loss_mlp": 0.01252855, "epoch": 0.7062077258379679, "flos": 28189024473600.0, "grad_norm": 1.569697900867658, "language_loss": 0.71745545, "learning_rate": 8.390019081300188e-07, "loss": 0.79430282, "num_input_tokens_seen": 253408685, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10321045, "step": 11746, "time_per_iteration": 2.6388022899627686 }, { "auxiliary_loss_clip": 0.06425507, "auxiliary_loss_mlp": 0.01268135, "balance_loss_clip": 0.06282011, "balance_loss_mlp": 0.01257925, "epoch": 0.7062678490906358, "flos": 27860854757760.0, "grad_norm": 1.4482376448937317, "language_loss": 0.79848456, "learning_rate": 8.386848067147175e-07, "loss": 0.87542093, "num_input_tokens_seen": 253429685, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.10211182, "step": 11747, "time_per_iteration": 2.630887985229492 }, { "auxiliary_loss_clip": 0.06427909, "auxiliary_loss_mlp": 0.01262915, "balance_loss_clip": 0.06286546, "balance_loss_mlp": 0.01253277, "epoch": 0.7063279723433038, "flos": 23191483731840.0, "grad_norm": 1.5687841083624283, "language_loss": 0.65423, "learning_rate": 8.383677493366031e-07, "loss": 0.73113823, "num_input_tokens_seen": 253448260, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09643555, "step": 11748, "time_per_iteration": 2.5549156665802 }, { "auxiliary_loss_clip": 0.06427011, "auxiliary_loss_mlp": 0.01265207, "balance_loss_clip": 0.06284729, "balance_loss_mlp": 0.01254228, "epoch": 0.7063880955959717, "flos": 20194043950080.0, "grad_norm": 1.7684794500098477, "language_loss": 0.80025727, "learning_rate": 8.380507360077003e-07, "loss": 0.87717944, "num_input_tokens_seen": 253467725, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.10980225, "step": 11749, "time_per_iteration": 4.014606952667236 }, { "auxiliary_loss_clip": 0.06330822, "auxiliary_loss_mlp": 0.01252034, "balance_loss_clip": 0.06272735, "balance_loss_mlp": 0.01250834, "epoch": 0.7064482188486397, "flos": 63685020395520.0, "grad_norm": 0.7703625282821089, "language_loss": 0.5402379, "learning_rate": 8.377337667400304e-07, "loss": 0.61606646, "num_input_tokens_seen": 253526940, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 0.01198578, "step": 11750, "time_per_iteration": 3.116241455078125 }, { "auxiliary_loss_clip": 0.06424887, "auxiliary_loss_mlp": 0.01267375, "balance_loss_clip": 0.06281637, "balance_loss_mlp": 0.01256342, "epoch": 0.7065083421013076, "flos": 25198125310080.0, "grad_norm": 1.6384625802234176, "language_loss": 0.7861799, "learning_rate": 8.37416841545612e-07, "loss": 0.8631025, "num_input_tokens_seen": 253546160, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.11029053, "step": 11751, "time_per_iteration": 2.568432331085205 }, { "auxiliary_loss_clip": 0.06424694, "auxiliary_loss_mlp": 0.01265295, "balance_loss_clip": 0.0628338, "balance_loss_mlp": 0.01255586, "epoch": 0.7065684653539757, "flos": 22900392247680.0, "grad_norm": 1.8401236036905326, "language_loss": 0.68228769, "learning_rate": 8.370999604364634e-07, "loss": 0.75918758, "num_input_tokens_seen": 253565505, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09710693, "step": 11752, "time_per_iteration": 2.5704002380371094 }, { "auxiliary_loss_clip": 0.06422143, "auxiliary_loss_mlp": 0.01268835, "balance_loss_clip": 0.06281569, "balance_loss_mlp": 0.0125832, "epoch": 0.7066285886066436, "flos": 23557025168640.0, "grad_norm": 1.9656114768646777, "language_loss": 0.76790321, "learning_rate": 8.367831234246025e-07, "loss": 0.84481299, "num_input_tokens_seen": 253585125, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.10516357, "step": 11753, "time_per_iteration": 2.560554027557373 }, { "auxiliary_loss_clip": 0.06421445, "auxiliary_loss_mlp": 0.01264564, "balance_loss_clip": 0.06282808, "balance_loss_mlp": 0.01254485, "epoch": 0.7066887118593116, "flos": 21075661883520.0, "grad_norm": 1.4491732952551848, "language_loss": 0.70972693, "learning_rate": 8.364663305220405e-07, "loss": 0.78658706, "num_input_tokens_seen": 253604815, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.10083008, "step": 11754, "time_per_iteration": 2.5579073429107666 }, { "auxiliary_loss_clip": 0.06424969, "auxiliary_loss_mlp": 0.01267213, "balance_loss_clip": 0.06280921, "balance_loss_mlp": 0.01257247, "epoch": 0.7067488351119796, "flos": 21182284604160.0, "grad_norm": 2.0205569353377784, "language_loss": 0.89341307, "learning_rate": 8.361495817407919e-07, "loss": 0.97033483, "num_input_tokens_seen": 253622855, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.09967041, "step": 11755, "time_per_iteration": 2.5384883880615234 }, { "auxiliary_loss_clip": 0.06422025, "auxiliary_loss_mlp": 0.0126997, "balance_loss_clip": 0.06280876, "balance_loss_mlp": 0.01259461, "epoch": 0.7068089583646475, "flos": 20455520215680.0, "grad_norm": 1.7118047798378861, "language_loss": 0.80021501, "learning_rate": 8.358328770928678e-07, "loss": 0.87713498, "num_input_tokens_seen": 253642760, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.10510254, "step": 11756, "time_per_iteration": 2.5623300075531006 }, { "auxiliary_loss_clip": 0.06334139, "auxiliary_loss_mlp": 0.01249963, "balance_loss_clip": 0.06276096, "balance_loss_mlp": 0.01248727, "epoch": 0.7068690816173155, "flos": 59125542399360.0, "grad_norm": 0.8489039268994539, "language_loss": 0.6018135, "learning_rate": 8.355162165902785e-07, "loss": 0.67765456, "num_input_tokens_seen": 253695685, "router_z_loss_clip": 0.57861328, "router_z_loss_mlp": 0.01235962, "step": 11757, "time_per_iteration": 2.926708221435547 }, { "auxiliary_loss_clip": 0.06420422, "auxiliary_loss_mlp": 0.01266523, "balance_loss_clip": 0.06280074, "balance_loss_mlp": 0.01255967, "epoch": 0.7069292048699835, "flos": 16256845900800.0, "grad_norm": 1.5705182833842652, "language_loss": 0.80725163, "learning_rate": 8.351996002450307e-07, "loss": 0.88412112, "num_input_tokens_seen": 253713305, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.10559082, "step": 11758, "time_per_iteration": 2.54243540763855 }, { "auxiliary_loss_clip": 0.06423199, "auxiliary_loss_mlp": 0.01268495, "balance_loss_clip": 0.062813, "balance_loss_mlp": 0.01257403, "epoch": 0.7069893281226515, "flos": 41182468133760.0, "grad_norm": 1.8742675375737274, "language_loss": 0.77633536, "learning_rate": 8.348830280691304e-07, "loss": 0.85325229, "num_input_tokens_seen": 253736100, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.11083984, "step": 11759, "time_per_iteration": 4.164474248886108 }, { "auxiliary_loss_clip": 0.06424023, "auxiliary_loss_mlp": 0.01268098, "balance_loss_clip": 0.06281704, "balance_loss_mlp": 0.01257297, "epoch": 0.7070494513753194, "flos": 24214203141120.0, "grad_norm": 1.6809378322718707, "language_loss": 0.68166316, "learning_rate": 8.34566500074583e-07, "loss": 0.75858438, "num_input_tokens_seen": 253757350, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.10784912, "step": 11760, "time_per_iteration": 2.6072657108306885 }, { "auxiliary_loss_clip": 0.06430632, "auxiliary_loss_mlp": 0.01265023, "balance_loss_clip": 0.06285224, "balance_loss_mlp": 0.01254455, "epoch": 0.7071095746279874, "flos": 20190564005760.0, "grad_norm": 1.887470425189044, "language_loss": 0.80775815, "learning_rate": 8.342500162733899e-07, "loss": 0.88471466, "num_input_tokens_seen": 253772855, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10565186, "step": 11761, "time_per_iteration": 2.564000368118286 }, { "auxiliary_loss_clip": 0.06425913, "auxiliary_loss_mlp": 0.01269548, "balance_loss_clip": 0.06282225, "balance_loss_mlp": 0.01257949, "epoch": 0.7071696978806553, "flos": 18188282839680.0, "grad_norm": 2.4529567640976384, "language_loss": 0.75562179, "learning_rate": 8.33933576677553e-07, "loss": 0.83257639, "num_input_tokens_seen": 253790360, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.1159668, "step": 11762, "time_per_iteration": 2.537832260131836 }, { "auxiliary_loss_clip": 0.06420901, "auxiliary_loss_mlp": 0.0126895, "balance_loss_clip": 0.06279556, "balance_loss_mlp": 0.01258912, "epoch": 0.7072298211333233, "flos": 24138201888000.0, "grad_norm": 1.7331296755934953, "language_loss": 0.77076578, "learning_rate": 8.336171812990724e-07, "loss": 0.8476643, "num_input_tokens_seen": 253810585, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.1003418, "step": 11763, "time_per_iteration": 3.9661459922790527 }, { "auxiliary_loss_clip": 0.064224, "auxiliary_loss_mlp": 0.01266166, "balance_loss_clip": 0.06278399, "balance_loss_mlp": 0.01255055, "epoch": 0.7072899443859912, "flos": 27205731210240.0, "grad_norm": 2.161390688634522, "language_loss": 0.78887779, "learning_rate": 8.333008301499453e-07, "loss": 0.86576343, "num_input_tokens_seen": 253829080, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.11126709, "step": 11764, "time_per_iteration": 2.5970590114593506 }, { "auxiliary_loss_clip": 0.06431369, "auxiliary_loss_mlp": 0.01270769, "balance_loss_clip": 0.06283884, "balance_loss_mlp": 0.01259265, "epoch": 0.7073500676386593, "flos": 16441188883200.0, "grad_norm": 1.5527058831456333, "language_loss": 0.79530162, "learning_rate": 8.32984523242167e-07, "loss": 0.87232304, "num_input_tokens_seen": 253846780, "router_z_loss_clip": 1.47363281, "router_z_loss_mlp": 0.11499023, "step": 11765, "time_per_iteration": 2.539134979248047 }, { "auxiliary_loss_clip": 0.06420936, "auxiliary_loss_mlp": 0.012663, "balance_loss_clip": 0.06280273, "balance_loss_mlp": 0.01256888, "epoch": 0.7074101908913272, "flos": 27681291457920.0, "grad_norm": 1.5634622757329095, "language_loss": 0.6866678, "learning_rate": 8.326682605877324e-07, "loss": 0.76354009, "num_input_tokens_seen": 253867075, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09405518, "step": 11766, "time_per_iteration": 2.60121750831604 }, { "auxiliary_loss_clip": 0.06425337, "auxiliary_loss_mlp": 0.01266643, "balance_loss_clip": 0.06280567, "balance_loss_mlp": 0.01255569, "epoch": 0.7074703141439952, "flos": 22244849429760.0, "grad_norm": 2.9908693815434946, "language_loss": 0.64433324, "learning_rate": 8.323520421986352e-07, "loss": 0.7212531, "num_input_tokens_seen": 253885790, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.11083984, "step": 11767, "time_per_iteration": 2.5886292457580566 }, { "auxiliary_loss_clip": 0.06422083, "auxiliary_loss_mlp": 0.0126588, "balance_loss_clip": 0.06279078, "balance_loss_mlp": 0.01255878, "epoch": 0.7075304373966632, "flos": 29650980585600.0, "grad_norm": 1.5622476750787544, "language_loss": 0.53601825, "learning_rate": 8.320358680868646e-07, "loss": 0.61289793, "num_input_tokens_seen": 253907070, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10003662, "step": 11768, "time_per_iteration": 2.6410484313964844 }, { "auxiliary_loss_clip": 0.06420441, "auxiliary_loss_mlp": 0.01265402, "balance_loss_clip": 0.06278963, "balance_loss_mlp": 0.01255806, "epoch": 0.7075905606493311, "flos": 19761264011520.0, "grad_norm": 1.707518880500728, "language_loss": 0.7643643, "learning_rate": 8.317197382644119e-07, "loss": 0.84122276, "num_input_tokens_seen": 253927290, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09588623, "step": 11769, "time_per_iteration": 2.5609703063964844 }, { "auxiliary_loss_clip": 0.06334723, "auxiliary_loss_mlp": 0.01250649, "balance_loss_clip": 0.06277037, "balance_loss_mlp": 0.01249379, "epoch": 0.7076506839019991, "flos": 65734106866560.0, "grad_norm": 0.8292973090600521, "language_loss": 0.6196357, "learning_rate": 8.314036527432637e-07, "loss": 0.69548941, "num_input_tokens_seen": 253983440, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.01270294, "step": 11770, "time_per_iteration": 3.1048192977905273 }, { "auxiliary_loss_clip": 0.06428806, "auxiliary_loss_mlp": 0.0126849, "balance_loss_clip": 0.06282578, "balance_loss_mlp": 0.01257773, "epoch": 0.707710807154667, "flos": 23771444567040.0, "grad_norm": 1.64399705212526, "language_loss": 0.76186275, "learning_rate": 8.310876115354055e-07, "loss": 0.83883572, "num_input_tokens_seen": 254003825, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.10717773, "step": 11771, "time_per_iteration": 2.5706067085266113 }, { "auxiliary_loss_clip": 0.06420575, "auxiliary_loss_mlp": 0.01266777, "balance_loss_clip": 0.06282397, "balance_loss_mlp": 0.01256573, "epoch": 0.7077709304073351, "flos": 21257698878720.0, "grad_norm": 2.317654491793301, "language_loss": 0.71441126, "learning_rate": 8.307716146528221e-07, "loss": 0.79128474, "num_input_tokens_seen": 254023345, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.10198975, "step": 11772, "time_per_iteration": 2.5660812854766846 }, { "auxiliary_loss_clip": 0.06427851, "auxiliary_loss_mlp": 0.01266583, "balance_loss_clip": 0.06280367, "balance_loss_mlp": 0.01255342, "epoch": 0.707831053660003, "flos": 20747030970240.0, "grad_norm": 1.7713756810885166, "language_loss": 0.69981998, "learning_rate": 8.30455662107496e-07, "loss": 0.77676433, "num_input_tokens_seen": 254041815, "router_z_loss_clip": 1.47558594, "router_z_loss_mlp": 0.11236572, "step": 11773, "time_per_iteration": 2.5472118854522705 }, { "auxiliary_loss_clip": 0.06423366, "auxiliary_loss_mlp": 0.01267751, "balance_loss_clip": 0.06280343, "balance_loss_mlp": 0.01257385, "epoch": 0.707891176912671, "flos": 21987440087040.0, "grad_norm": 1.4150605979294664, "language_loss": 0.70396525, "learning_rate": 8.301397539114095e-07, "loss": 0.7808764, "num_input_tokens_seen": 254062065, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10375977, "step": 11774, "time_per_iteration": 2.5991013050079346 }, { "auxiliary_loss_clip": 0.06419148, "auxiliary_loss_mlp": 0.01264073, "balance_loss_clip": 0.06281055, "balance_loss_mlp": 0.01254268, "epoch": 0.7079513001653389, "flos": 21075284540160.0, "grad_norm": 1.600379763437487, "language_loss": 0.74846667, "learning_rate": 8.298238900765407e-07, "loss": 0.82529891, "num_input_tokens_seen": 254080605, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09802246, "step": 11775, "time_per_iteration": 2.5742595195770264 }, { "auxiliary_loss_clip": 0.0643105, "auxiliary_loss_mlp": 0.01266377, "balance_loss_clip": 0.0628659, "balance_loss_mlp": 0.01255827, "epoch": 0.7080114234180069, "flos": 18046468604160.0, "grad_norm": 1.5838087208042235, "language_loss": 0.87222505, "learning_rate": 8.295080706148665e-07, "loss": 0.94919932, "num_input_tokens_seen": 254098710, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10559082, "step": 11776, "time_per_iteration": 2.5439155101776123 }, { "auxiliary_loss_clip": 0.06419866, "auxiliary_loss_mlp": 0.01264784, "balance_loss_clip": 0.06277936, "balance_loss_mlp": 0.01254794, "epoch": 0.7080715466706748, "flos": 15127671479040.0, "grad_norm": 1.8283629138936297, "language_loss": 0.75286537, "learning_rate": 8.291922955383641e-07, "loss": 0.82971191, "num_input_tokens_seen": 254117200, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09985352, "step": 11777, "time_per_iteration": 2.5260214805603027 }, { "auxiliary_loss_clip": 0.06434874, "auxiliary_loss_mlp": 0.01267227, "balance_loss_clip": 0.06286396, "balance_loss_mlp": 0.01256379, "epoch": 0.7081316699233429, "flos": 14427042364800.0, "grad_norm": 1.9437041179739136, "language_loss": 0.82711482, "learning_rate": 8.288765648590066e-07, "loss": 0.90413588, "num_input_tokens_seen": 254132115, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.10845947, "step": 11778, "time_per_iteration": 2.5164542198181152 }, { "auxiliary_loss_clip": 0.0641859, "auxiliary_loss_mlp": 0.0126521, "balance_loss_clip": 0.06282154, "balance_loss_mlp": 0.01256227, "epoch": 0.7081917931760108, "flos": 23229190869120.0, "grad_norm": 1.6864616389445373, "language_loss": 0.85067743, "learning_rate": 8.285608785887673e-07, "loss": 0.92751539, "num_input_tokens_seen": 254152285, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.08984375, "step": 11779, "time_per_iteration": 2.548153877258301 }, { "auxiliary_loss_clip": 0.06428926, "auxiliary_loss_mlp": 0.01265326, "balance_loss_clip": 0.06284598, "balance_loss_mlp": 0.01255009, "epoch": 0.7082519164286788, "flos": 39317221520640.0, "grad_norm": 1.885632539112991, "language_loss": 0.72144365, "learning_rate": 8.28245236739618e-07, "loss": 0.79838616, "num_input_tokens_seen": 254172805, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.10314941, "step": 11780, "time_per_iteration": 2.7158002853393555 }, { "auxiliary_loss_clip": 0.06422918, "auxiliary_loss_mlp": 0.012679, "balance_loss_clip": 0.06281917, "balance_loss_mlp": 0.01257988, "epoch": 0.7083120396813467, "flos": 21657299800320.0, "grad_norm": 1.503784111324504, "language_loss": 0.73049033, "learning_rate": 8.279296393235256e-07, "loss": 0.80739844, "num_input_tokens_seen": 254191890, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09912109, "step": 11781, "time_per_iteration": 4.025541543960571 }, { "auxiliary_loss_clip": 0.06419923, "auxiliary_loss_mlp": 0.01263951, "balance_loss_clip": 0.06279649, "balance_loss_mlp": 0.01254307, "epoch": 0.7083721629340147, "flos": 17572878927360.0, "grad_norm": 1.6013204557918785, "language_loss": 0.77919745, "learning_rate": 8.276140863524585e-07, "loss": 0.85603625, "num_input_tokens_seen": 254210150, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09649658, "step": 11782, "time_per_iteration": 2.5407168865203857 }, { "auxiliary_loss_clip": 0.06422214, "auxiliary_loss_mlp": 0.01264499, "balance_loss_clip": 0.06281832, "balance_loss_mlp": 0.01255368, "epoch": 0.7084322861866827, "flos": 29358086238720.0, "grad_norm": 1.4297602517707817, "language_loss": 0.70067692, "learning_rate": 8.272985778383828e-07, "loss": 0.77754408, "num_input_tokens_seen": 254233015, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09124756, "step": 11783, "time_per_iteration": 2.664487361907959 }, { "auxiliary_loss_clip": 0.06429087, "auxiliary_loss_mlp": 0.01267363, "balance_loss_clip": 0.06282699, "balance_loss_mlp": 0.01257385, "epoch": 0.7084924094393507, "flos": 20200626495360.0, "grad_norm": 1.663167443875314, "language_loss": 0.79123032, "learning_rate": 8.269831137932632e-07, "loss": 0.86819476, "num_input_tokens_seen": 254251345, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.09985352, "step": 11784, "time_per_iteration": 2.572349786758423 }, { "auxiliary_loss_clip": 0.06417479, "auxiliary_loss_mlp": 0.01266241, "balance_loss_clip": 0.06275646, "balance_loss_mlp": 0.01256311, "epoch": 0.7085525326920187, "flos": 23483958808320.0, "grad_norm": 2.3690774418204836, "language_loss": 0.77770543, "learning_rate": 8.266676942290609e-07, "loss": 0.85454261, "num_input_tokens_seen": 254269905, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.0993042, "step": 11785, "time_per_iteration": 2.597888231277466 }, { "auxiliary_loss_clip": 0.0642564, "auxiliary_loss_mlp": 0.01269064, "balance_loss_clip": 0.06283502, "balance_loss_mlp": 0.01259188, "epoch": 0.7086126559446866, "flos": 25966076780160.0, "grad_norm": 1.5264328258459705, "language_loss": 0.78045571, "learning_rate": 8.26352319157738e-07, "loss": 0.85740274, "num_input_tokens_seen": 254289990, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.09875488, "step": 11786, "time_per_iteration": 2.5955348014831543 }, { "auxiliary_loss_clip": 0.06426975, "auxiliary_loss_mlp": 0.01269003, "balance_loss_clip": 0.06283306, "balance_loss_mlp": 0.01258704, "epoch": 0.7086727791973546, "flos": 26732141533440.0, "grad_norm": 2.0345154063204705, "language_loss": 0.79094124, "learning_rate": 8.260369885912526e-07, "loss": 0.86790097, "num_input_tokens_seen": 254309085, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10290527, "step": 11787, "time_per_iteration": 2.5843982696533203 }, { "auxiliary_loss_clip": 0.06423034, "auxiliary_loss_mlp": 0.01264878, "balance_loss_clip": 0.06281026, "balance_loss_mlp": 0.01254548, "epoch": 0.7087329024500225, "flos": 21688801735680.0, "grad_norm": 1.6896240758059937, "language_loss": 0.77574623, "learning_rate": 8.257217025415615e-07, "loss": 0.85262537, "num_input_tokens_seen": 254327045, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.10314941, "step": 11788, "time_per_iteration": 4.030010938644409 }, { "auxiliary_loss_clip": 0.06430233, "auxiliary_loss_mlp": 0.01267809, "balance_loss_clip": 0.06282843, "balance_loss_mlp": 0.01256257, "epoch": 0.7087930257026905, "flos": 17936827136640.0, "grad_norm": 1.9042458083706617, "language_loss": 0.68574846, "learning_rate": 8.254064610206212e-07, "loss": 0.76272887, "num_input_tokens_seen": 254344585, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.11560059, "step": 11789, "time_per_iteration": 2.5941991806030273 }, { "auxiliary_loss_clip": 0.06420718, "auxiliary_loss_mlp": 0.01269149, "balance_loss_clip": 0.0627533, "balance_loss_mlp": 0.01258903, "epoch": 0.7088531489553584, "flos": 18916682382720.0, "grad_norm": 1.5471090203465585, "language_loss": 0.77541178, "learning_rate": 8.250912640403858e-07, "loss": 0.85231048, "num_input_tokens_seen": 254362470, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10247803, "step": 11790, "time_per_iteration": 2.5472919940948486 }, { "auxiliary_loss_clip": 0.06428123, "auxiliary_loss_mlp": 0.01266784, "balance_loss_clip": 0.06278607, "balance_loss_mlp": 0.01255394, "epoch": 0.7089132722080265, "flos": 27388229402880.0, "grad_norm": 2.2299822317367184, "language_loss": 0.71373677, "learning_rate": 8.247761116128085e-07, "loss": 0.79068577, "num_input_tokens_seen": 254383190, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.11395264, "step": 11791, "time_per_iteration": 2.814227342605591 }, { "auxiliary_loss_clip": 0.06422192, "auxiliary_loss_mlp": 0.01268658, "balance_loss_clip": 0.06279515, "balance_loss_mlp": 0.01258448, "epoch": 0.7089733954606944, "flos": 22169309374080.0, "grad_norm": 1.5200998021422456, "language_loss": 0.82279366, "learning_rate": 8.244610037498376e-07, "loss": 0.89970213, "num_input_tokens_seen": 254403115, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.10205078, "step": 11792, "time_per_iteration": 2.7582321166992188 }, { "auxiliary_loss_clip": 0.06424225, "auxiliary_loss_mlp": 0.01267951, "balance_loss_clip": 0.06276215, "balance_loss_mlp": 0.01257681, "epoch": 0.7090335187133624, "flos": 24432731389440.0, "grad_norm": 1.8824771974513268, "language_loss": 0.64818978, "learning_rate": 8.241459404634232e-07, "loss": 0.72511154, "num_input_tokens_seen": 254421875, "router_z_loss_clip": 1.47949219, "router_z_loss_mlp": 0.10272217, "step": 11793, "time_per_iteration": 2.5967602729797363 }, { "auxiliary_loss_clip": 0.06419847, "auxiliary_loss_mlp": 0.01266762, "balance_loss_clip": 0.06277847, "balance_loss_mlp": 0.01257231, "epoch": 0.7090936419660303, "flos": 21841684709760.0, "grad_norm": 2.075501516735942, "language_loss": 0.70617932, "learning_rate": 8.238309217655133e-07, "loss": 0.78304541, "num_input_tokens_seen": 254440765, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09527588, "step": 11794, "time_per_iteration": 2.580162763595581 }, { "auxiliary_loss_clip": 0.06423833, "auxiliary_loss_mlp": 0.01264786, "balance_loss_clip": 0.06282482, "balance_loss_mlp": 0.012551, "epoch": 0.7091537652186983, "flos": 20088259770240.0, "grad_norm": 1.7903137511214644, "language_loss": 0.76503664, "learning_rate": 8.23515947668052e-07, "loss": 0.84192282, "num_input_tokens_seen": 254459480, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09692383, "step": 11795, "time_per_iteration": 2.5593416690826416 }, { "auxiliary_loss_clip": 0.06422403, "auxiliary_loss_mlp": 0.01268991, "balance_loss_clip": 0.0627809, "balance_loss_mlp": 0.01258113, "epoch": 0.7092138884713663, "flos": 13156556832000.0, "grad_norm": 1.9787468176694547, "language_loss": 0.75587964, "learning_rate": 8.232010181829838e-07, "loss": 0.83279359, "num_input_tokens_seen": 254473985, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10882568, "step": 11796, "time_per_iteration": 2.4959590435028076 }, { "auxiliary_loss_clip": 0.06430788, "auxiliary_loss_mlp": 0.01267048, "balance_loss_clip": 0.06282753, "balance_loss_mlp": 0.01255526, "epoch": 0.7092740117240343, "flos": 21651262306560.0, "grad_norm": 1.9716521696868348, "language_loss": 0.74630523, "learning_rate": 8.228861333222523e-07, "loss": 0.82328355, "num_input_tokens_seen": 254492135, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.11523438, "step": 11797, "time_per_iteration": 2.5834765434265137 }, { "auxiliary_loss_clip": 0.06419773, "auxiliary_loss_mlp": 0.01265997, "balance_loss_clip": 0.06277134, "balance_loss_mlp": 0.01255644, "epoch": 0.7093341349767023, "flos": 21038835214080.0, "grad_norm": 1.4209911314453354, "language_loss": 0.79419017, "learning_rate": 8.225712930977953e-07, "loss": 0.87104785, "num_input_tokens_seen": 254512865, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.10351562, "step": 11798, "time_per_iteration": 2.594419479370117 }, { "auxiliary_loss_clip": 0.06419817, "auxiliary_loss_mlp": 0.01265102, "balance_loss_clip": 0.06278569, "balance_loss_mlp": 0.012551, "epoch": 0.7093942582293702, "flos": 22024140975360.0, "grad_norm": 1.854460899048521, "language_loss": 0.6690926, "learning_rate": 8.222564975215529e-07, "loss": 0.74594176, "num_input_tokens_seen": 254532605, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09997559, "step": 11799, "time_per_iteration": 4.002288341522217 }, { "auxiliary_loss_clip": 0.0642163, "auxiliary_loss_mlp": 0.01266288, "balance_loss_clip": 0.06279524, "balance_loss_mlp": 0.01255518, "epoch": 0.7094543814820382, "flos": 27243019077120.0, "grad_norm": 1.6964762837131486, "language_loss": 0.82035542, "learning_rate": 8.219417466054622e-07, "loss": 0.89723456, "num_input_tokens_seen": 254553780, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.10778809, "step": 11800, "time_per_iteration": 2.6091184616088867 }, { "auxiliary_loss_clip": 0.06418818, "auxiliary_loss_mlp": 0.0126349, "balance_loss_clip": 0.06277908, "balance_loss_mlp": 0.01254263, "epoch": 0.7095145047347061, "flos": 12093237319680.0, "grad_norm": 2.0983383542638694, "language_loss": 0.86762559, "learning_rate": 8.21627040361459e-07, "loss": 0.94444865, "num_input_tokens_seen": 254567510, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09222412, "step": 11801, "time_per_iteration": 2.5422847270965576 }, { "auxiliary_loss_clip": 0.06424312, "auxiliary_loss_mlp": 0.0126539, "balance_loss_clip": 0.06280801, "balance_loss_mlp": 0.0125546, "epoch": 0.7095746279873741, "flos": 19388678832000.0, "grad_norm": 1.745901251099213, "language_loss": 0.76267934, "learning_rate": 8.213123788014758e-07, "loss": 0.83957636, "num_input_tokens_seen": 254585565, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.0993042, "step": 11802, "time_per_iteration": 2.5679514408111572 }, { "auxiliary_loss_clip": 0.06422087, "auxiliary_loss_mlp": 0.01271163, "balance_loss_clip": 0.06279442, "balance_loss_mlp": 0.01260678, "epoch": 0.709634751240042, "flos": 21366921075840.0, "grad_norm": 2.3705519991967834, "language_loss": 0.82018375, "learning_rate": 8.209977619374462e-07, "loss": 0.8971163, "num_input_tokens_seen": 254603465, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.1048584, "step": 11803, "time_per_iteration": 3.9200737476348877 }, { "auxiliary_loss_clip": 0.06422624, "auxiliary_loss_mlp": 0.01266183, "balance_loss_clip": 0.06277195, "balance_loss_mlp": 0.01254828, "epoch": 0.7096948744927101, "flos": 13922034606720.0, "grad_norm": 2.273043110472033, "language_loss": 0.67863148, "learning_rate": 8.206831897812995e-07, "loss": 0.75551957, "num_input_tokens_seen": 254620500, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.11346436, "step": 11804, "time_per_iteration": 2.5337743759155273 }, { "auxiliary_loss_clip": 0.06415456, "auxiliary_loss_mlp": 0.01267331, "balance_loss_clip": 0.06278963, "balance_loss_mlp": 0.01258176, "epoch": 0.709754997745378, "flos": 30305936424960.0, "grad_norm": 1.7839107286081541, "language_loss": 0.78613329, "learning_rate": 8.203686623449637e-07, "loss": 0.86296117, "num_input_tokens_seen": 254638565, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.09155273, "step": 11805, "time_per_iteration": 2.635023832321167 }, { "auxiliary_loss_clip": 0.06421471, "auxiliary_loss_mlp": 0.01265174, "balance_loss_clip": 0.0627829, "balance_loss_mlp": 0.01254856, "epoch": 0.709815120998046, "flos": 18521064529920.0, "grad_norm": 3.7315918043617424, "language_loss": 0.79205739, "learning_rate": 8.200541796403667e-07, "loss": 0.86892378, "num_input_tokens_seen": 254657505, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10314941, "step": 11806, "time_per_iteration": 2.536778688430786 }, { "auxiliary_loss_clip": 0.06418268, "auxiliary_loss_mlp": 0.01269313, "balance_loss_clip": 0.0627754, "balance_loss_mlp": 0.01259121, "epoch": 0.7098752442507139, "flos": 22279034695680.0, "grad_norm": 2.2869585026852906, "language_loss": 0.56828415, "learning_rate": 8.197397416794332e-07, "loss": 0.64515996, "num_input_tokens_seen": 254674730, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.10198975, "step": 11807, "time_per_iteration": 2.5988733768463135 }, { "auxiliary_loss_clip": 0.0642737, "auxiliary_loss_mlp": 0.01269687, "balance_loss_clip": 0.06278285, "balance_loss_mlp": 0.01258368, "epoch": 0.7099353675033819, "flos": 19280504810880.0, "grad_norm": 1.9324786721951237, "language_loss": 0.68451941, "learning_rate": 8.194253484740882e-07, "loss": 0.76148999, "num_input_tokens_seen": 254691665, "router_z_loss_clip": 1.49121094, "router_z_loss_mlp": 0.11315918, "step": 11808, "time_per_iteration": 2.5343751907348633 }, { "auxiliary_loss_clip": 0.06424873, "auxiliary_loss_mlp": 0.01265732, "balance_loss_clip": 0.06278828, "balance_loss_mlp": 0.01255129, "epoch": 0.70999549075605, "flos": 21915044559360.0, "grad_norm": 2.202966692400813, "language_loss": 0.71388155, "learning_rate": 8.191110000362513e-07, "loss": 0.79078758, "num_input_tokens_seen": 254711610, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.1060791, "step": 11809, "time_per_iteration": 2.5763354301452637 }, { "auxiliary_loss_clip": 0.06321247, "auxiliary_loss_mlp": 0.01252474, "balance_loss_clip": 0.06263201, "balance_loss_mlp": 0.0125106, "epoch": 0.7100556140087179, "flos": 70474280192640.0, "grad_norm": 1.628501835859013, "language_loss": 0.59149611, "learning_rate": 8.187966963778435e-07, "loss": 0.66723335, "num_input_tokens_seen": 254772615, "router_z_loss_clip": 0.58105469, "router_z_loss_mlp": 0.01413727, "step": 11810, "time_per_iteration": 3.257249116897583 }, { "auxiliary_loss_clip": 0.06425421, "auxiliary_loss_mlp": 0.01265922, "balance_loss_clip": 0.06281883, "balance_loss_mlp": 0.01255766, "epoch": 0.7101157372613859, "flos": 23046273406080.0, "grad_norm": 1.6849111002648194, "language_loss": 0.74563062, "learning_rate": 8.18482437510784e-07, "loss": 0.8225441, "num_input_tokens_seen": 254791375, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10162354, "step": 11811, "time_per_iteration": 2.5845248699188232 }, { "auxiliary_loss_clip": 0.06417633, "auxiliary_loss_mlp": 0.01263503, "balance_loss_clip": 0.06279039, "balance_loss_mlp": 0.01253597, "epoch": 0.7101758605140538, "flos": 23192028783360.0, "grad_norm": 1.7216137908335152, "language_loss": 0.83560395, "learning_rate": 8.181682234469882e-07, "loss": 0.91241527, "num_input_tokens_seen": 254809300, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09899902, "step": 11812, "time_per_iteration": 2.584961175918579 }, { "auxiliary_loss_clip": 0.06423466, "auxiliary_loss_mlp": 0.01271077, "balance_loss_clip": 0.06279561, "balance_loss_mlp": 0.0126095, "epoch": 0.7102359837667218, "flos": 23702906327040.0, "grad_norm": 1.5347074851695173, "language_loss": 0.7056036, "learning_rate": 8.178540541983716e-07, "loss": 0.78254902, "num_input_tokens_seen": 254829325, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.10119629, "step": 11813, "time_per_iteration": 2.6730241775512695 }, { "auxiliary_loss_clip": 0.06414536, "auxiliary_loss_mlp": 0.01263816, "balance_loss_clip": 0.06274615, "balance_loss_mlp": 0.01254315, "epoch": 0.7102961070193897, "flos": 19397231948160.0, "grad_norm": 1.8273322108726062, "language_loss": 0.81974226, "learning_rate": 8.175399297768495e-07, "loss": 0.89652574, "num_input_tokens_seen": 254847690, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09503174, "step": 11814, "time_per_iteration": 2.557558536529541 }, { "auxiliary_loss_clip": 0.06421077, "auxiliary_loss_mlp": 0.01265332, "balance_loss_clip": 0.06280029, "balance_loss_mlp": 0.01254764, "epoch": 0.7103562302720577, "flos": 21514018118400.0, "grad_norm": 2.2031225997172568, "language_loss": 0.76202905, "learning_rate": 8.172258501943301e-07, "loss": 0.83889318, "num_input_tokens_seen": 254865960, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.10559082, "step": 11815, "time_per_iteration": 2.5665669441223145 }, { "auxiliary_loss_clip": 0.0641934, "auxiliary_loss_mlp": 0.01265185, "balance_loss_clip": 0.06280081, "balance_loss_mlp": 0.01256006, "epoch": 0.7104163535247257, "flos": 14539786433280.0, "grad_norm": 1.7550817487225159, "language_loss": 0.7872104, "learning_rate": 8.16911815462725e-07, "loss": 0.86405563, "num_input_tokens_seen": 254882815, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.0916748, "step": 11816, "time_per_iteration": 2.523690938949585 }, { "auxiliary_loss_clip": 0.0642314, "auxiliary_loss_mlp": 0.01265324, "balance_loss_clip": 0.06280606, "balance_loss_mlp": 0.01255633, "epoch": 0.7104764767773937, "flos": 11405018609280.0, "grad_norm": 1.668239688199207, "language_loss": 0.86962652, "learning_rate": 8.165978255939426e-07, "loss": 0.94651115, "num_input_tokens_seen": 254898705, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.09692383, "step": 11817, "time_per_iteration": 2.5548923015594482 }, { "auxiliary_loss_clip": 0.06415993, "auxiliary_loss_mlp": 0.01264286, "balance_loss_clip": 0.06275932, "balance_loss_mlp": 0.01254964, "epoch": 0.7105366000300616, "flos": 11694894209280.0, "grad_norm": 2.1365187019664678, "language_loss": 0.85282671, "learning_rate": 8.162838805998897e-07, "loss": 0.92962956, "num_input_tokens_seen": 254913665, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09326172, "step": 11818, "time_per_iteration": 2.5231595039367676 }, { "auxiliary_loss_clip": 0.06419638, "auxiliary_loss_mlp": 0.01266555, "balance_loss_clip": 0.06276692, "balance_loss_mlp": 0.01256249, "epoch": 0.7105967232827296, "flos": 19360027935360.0, "grad_norm": 2.251307116827228, "language_loss": 0.75796318, "learning_rate": 8.159699804924709e-07, "loss": 0.8348251, "num_input_tokens_seen": 254932140, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10308838, "step": 11819, "time_per_iteration": 2.5393176078796387 }, { "auxiliary_loss_clip": 0.06418763, "auxiliary_loss_mlp": 0.01270412, "balance_loss_clip": 0.06277137, "balance_loss_mlp": 0.01259784, "epoch": 0.7106568465353975, "flos": 22937135063040.0, "grad_norm": 1.5475340963707682, "language_loss": 0.71074951, "learning_rate": 8.156561252835883e-07, "loss": 0.78764129, "num_input_tokens_seen": 254951580, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.10626221, "step": 11820, "time_per_iteration": 2.589354991912842 }, { "auxiliary_loss_clip": 0.06419557, "auxiliary_loss_mlp": 0.01267069, "balance_loss_clip": 0.06277338, "balance_loss_mlp": 0.01256894, "epoch": 0.7107169697880655, "flos": 19105805047680.0, "grad_norm": 1.722036937854371, "language_loss": 0.76026011, "learning_rate": 8.153423149851449e-07, "loss": 0.83712631, "num_input_tokens_seen": 254969425, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.10174561, "step": 11821, "time_per_iteration": 4.00352931022644 }, { "auxiliary_loss_clip": 0.06327522, "auxiliary_loss_mlp": 0.01250884, "balance_loss_clip": 0.06269506, "balance_loss_mlp": 0.0124944, "epoch": 0.7107770930407336, "flos": 63655950228480.0, "grad_norm": 0.7541745390264487, "language_loss": 0.55221832, "learning_rate": 8.150285496090388e-07, "loss": 0.62800235, "num_input_tokens_seen": 255032680, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.01441956, "step": 11822, "time_per_iteration": 3.1916165351867676 }, { "auxiliary_loss_clip": 0.064115, "auxiliary_loss_mlp": 0.01264255, "balance_loss_clip": 0.06276695, "balance_loss_mlp": 0.01254921, "epoch": 0.7108372162934015, "flos": 22061009571840.0, "grad_norm": 2.130640028230698, "language_loss": 0.60609353, "learning_rate": 8.147148291671688e-07, "loss": 0.68285108, "num_input_tokens_seen": 255054400, "router_z_loss_clip": 1.34863281, "router_z_loss_mlp": 0.09332275, "step": 11823, "time_per_iteration": 2.567342758178711 }, { "auxiliary_loss_clip": 0.06419601, "auxiliary_loss_mlp": 0.01264931, "balance_loss_clip": 0.06278192, "balance_loss_mlp": 0.01255603, "epoch": 0.7108973395460695, "flos": 19141122343680.0, "grad_norm": 1.9655772648221344, "language_loss": 0.71656758, "learning_rate": 8.144011536714322e-07, "loss": 0.79341292, "num_input_tokens_seen": 255072785, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09338379, "step": 11824, "time_per_iteration": 2.572727680206299 }, { "auxiliary_loss_clip": 0.06412472, "auxiliary_loss_mlp": 0.01266213, "balance_loss_clip": 0.0627603, "balance_loss_mlp": 0.01256891, "epoch": 0.7109574627987374, "flos": 17900168175360.0, "grad_norm": 1.5818909556006748, "language_loss": 0.73073626, "learning_rate": 8.140875231337223e-07, "loss": 0.80752307, "num_input_tokens_seen": 255091820, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09320068, "step": 11825, "time_per_iteration": 2.5447211265563965 }, { "auxiliary_loss_clip": 0.064238, "auxiliary_loss_mlp": 0.01266715, "balance_loss_clip": 0.06280722, "balance_loss_mlp": 0.01256553, "epoch": 0.7110175860514054, "flos": 28986129964800.0, "grad_norm": 1.8022373996024401, "language_loss": 0.80132872, "learning_rate": 8.137739375659321e-07, "loss": 0.87823391, "num_input_tokens_seen": 255111720, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10162354, "step": 11826, "time_per_iteration": 2.5965988636016846 }, { "auxiliary_loss_clip": 0.0641318, "auxiliary_loss_mlp": 0.01270074, "balance_loss_clip": 0.06274375, "balance_loss_mlp": 0.01260507, "epoch": 0.7110777093040733, "flos": 26179867272960.0, "grad_norm": 1.428231004475087, "language_loss": 0.83039504, "learning_rate": 8.134603969799527e-07, "loss": 0.90722758, "num_input_tokens_seen": 255133495, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09552002, "step": 11827, "time_per_iteration": 2.6056466102600098 }, { "auxiliary_loss_clip": 0.0642013, "auxiliary_loss_mlp": 0.01267102, "balance_loss_clip": 0.06275766, "balance_loss_mlp": 0.01256582, "epoch": 0.7111378325567413, "flos": 26877184151040.0, "grad_norm": 1.6209680920947358, "language_loss": 0.62499237, "learning_rate": 8.131469013876748e-07, "loss": 0.70186472, "num_input_tokens_seen": 255156880, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.10528564, "step": 11828, "time_per_iteration": 4.218080759048462 }, { "auxiliary_loss_clip": 0.06420662, "auxiliary_loss_mlp": 0.01265878, "balance_loss_clip": 0.062792, "balance_loss_mlp": 0.01255834, "epoch": 0.7111979558094093, "flos": 27279216840960.0, "grad_norm": 1.4455420274311392, "language_loss": 0.72042352, "learning_rate": 8.128334508009846e-07, "loss": 0.79728889, "num_input_tokens_seen": 255178920, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.10046387, "step": 11829, "time_per_iteration": 2.6420464515686035 }, { "auxiliary_loss_clip": 0.06415951, "auxiliary_loss_mlp": 0.01266745, "balance_loss_clip": 0.06275199, "balance_loss_mlp": 0.01257107, "epoch": 0.7112580790620773, "flos": 25054088941440.0, "grad_norm": 1.9965327252759868, "language_loss": 0.80561888, "learning_rate": 8.125200452317697e-07, "loss": 0.88244587, "num_input_tokens_seen": 255198095, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09643555, "step": 11830, "time_per_iteration": 2.6168153285980225 }, { "auxiliary_loss_clip": 0.0641844, "auxiliary_loss_mlp": 0.01263845, "balance_loss_clip": 0.06276935, "balance_loss_mlp": 0.01253921, "epoch": 0.7113182023147452, "flos": 21652016993280.0, "grad_norm": 1.669801192911215, "language_loss": 0.84715748, "learning_rate": 8.122066846919138e-07, "loss": 0.92398036, "num_input_tokens_seen": 255215860, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09918213, "step": 11831, "time_per_iteration": 2.5432493686676025 }, { "auxiliary_loss_clip": 0.06420197, "auxiliary_loss_mlp": 0.01263532, "balance_loss_clip": 0.06277382, "balance_loss_mlp": 0.01254013, "epoch": 0.7113783255674132, "flos": 21002637450240.0, "grad_norm": 1.6798082253960727, "language_loss": 0.76957023, "learning_rate": 8.118933691932985e-07, "loss": 0.84640747, "num_input_tokens_seen": 255235425, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.09515381, "step": 11832, "time_per_iteration": 2.580925226211548 }, { "auxiliary_loss_clip": 0.06325006, "auxiliary_loss_mlp": 0.01251271, "balance_loss_clip": 0.06267812, "balance_loss_mlp": 0.01249933, "epoch": 0.7114384488200811, "flos": 66788705554560.0, "grad_norm": 0.7835712615212499, "language_loss": 0.56558609, "learning_rate": 8.115800987478059e-07, "loss": 0.64134884, "num_input_tokens_seen": 255291680, "router_z_loss_clip": 0.57080078, "router_z_loss_mlp": 0.01339722, "step": 11833, "time_per_iteration": 3.085573673248291 }, { "auxiliary_loss_clip": 0.06417547, "auxiliary_loss_mlp": 0.01265512, "balance_loss_clip": 0.06276204, "balance_loss_mlp": 0.01256369, "epoch": 0.7114985720727491, "flos": 25017136490880.0, "grad_norm": 1.5098709875373693, "language_loss": 0.70948523, "learning_rate": 8.11266873367315e-07, "loss": 0.7863158, "num_input_tokens_seen": 255313880, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09143066, "step": 11834, "time_per_iteration": 2.638155221939087 }, { "auxiliary_loss_clip": 0.06422524, "auxiliary_loss_mlp": 0.01268705, "balance_loss_clip": 0.06278558, "balance_loss_mlp": 0.01257952, "epoch": 0.7115586953254172, "flos": 21476478689280.0, "grad_norm": 1.876693663720011, "language_loss": 0.79880035, "learning_rate": 8.10953693063704e-07, "loss": 0.87571269, "num_input_tokens_seen": 255332390, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10748291, "step": 11835, "time_per_iteration": 2.5683670043945312 }, { "auxiliary_loss_clip": 0.0641453, "auxiliary_loss_mlp": 0.01262965, "balance_loss_clip": 0.06275201, "balance_loss_mlp": 0.01253792, "epoch": 0.7116188185780851, "flos": 28630357528320.0, "grad_norm": 1.6053042291874784, "language_loss": 0.76355815, "learning_rate": 8.10640557848848e-07, "loss": 0.84033304, "num_input_tokens_seen": 255354025, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09173584, "step": 11836, "time_per_iteration": 2.682162284851074 }, { "auxiliary_loss_clip": 0.06417817, "auxiliary_loss_mlp": 0.0126459, "balance_loss_clip": 0.06278138, "balance_loss_mlp": 0.0125537, "epoch": 0.7116789418307531, "flos": 25299339442560.0, "grad_norm": 1.6337763874707492, "language_loss": 0.70126778, "learning_rate": 8.103274677346208e-07, "loss": 0.77809191, "num_input_tokens_seen": 255371400, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09222412, "step": 11837, "time_per_iteration": 2.5724403858184814 }, { "auxiliary_loss_clip": 0.06423999, "auxiliary_loss_mlp": 0.01265613, "balance_loss_clip": 0.06278366, "balance_loss_mlp": 0.01254503, "epoch": 0.711739065083421, "flos": 25564463360640.0, "grad_norm": 1.9907284120861684, "language_loss": 0.62163532, "learning_rate": 8.100144227328958e-07, "loss": 0.69853145, "num_input_tokens_seen": 255390710, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.11114502, "step": 11838, "time_per_iteration": 4.133758306503296 }, { "auxiliary_loss_clip": 0.06421721, "auxiliary_loss_mlp": 0.01266204, "balance_loss_clip": 0.06279622, "balance_loss_mlp": 0.01256697, "epoch": 0.711799188336089, "flos": 26148239556480.0, "grad_norm": 2.128497795898106, "language_loss": 0.67967439, "learning_rate": 8.097014228555426e-07, "loss": 0.75655365, "num_input_tokens_seen": 255408790, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09509277, "step": 11839, "time_per_iteration": 2.717573642730713 }, { "auxiliary_loss_clip": 0.06418766, "auxiliary_loss_mlp": 0.01264894, "balance_loss_clip": 0.06277724, "balance_loss_mlp": 0.01255208, "epoch": 0.7118593115887569, "flos": 21146757672960.0, "grad_norm": 1.9556405990860246, "language_loss": 0.84497011, "learning_rate": 8.093884681144305e-07, "loss": 0.92180669, "num_input_tokens_seen": 255426280, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09680176, "step": 11840, "time_per_iteration": 2.5630991458892822 }, { "auxiliary_loss_clip": 0.06425638, "auxiliary_loss_mlp": 0.01265877, "balance_loss_clip": 0.06280942, "balance_loss_mlp": 0.01256144, "epoch": 0.711919434841425, "flos": 14980951779840.0, "grad_norm": 2.010083394499666, "language_loss": 0.77523971, "learning_rate": 8.090755585214277e-07, "loss": 0.85215485, "num_input_tokens_seen": 255442935, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.09729004, "step": 11841, "time_per_iteration": 2.5360045433044434 }, { "auxiliary_loss_clip": 0.06419913, "auxiliary_loss_mlp": 0.01266455, "balance_loss_clip": 0.06277058, "balance_loss_mlp": 0.01256483, "epoch": 0.7119795580940929, "flos": 16514674513920.0, "grad_norm": 1.9314344155090857, "language_loss": 0.74807292, "learning_rate": 8.087626940883994e-07, "loss": 0.82493663, "num_input_tokens_seen": 255460925, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.09967041, "step": 11842, "time_per_iteration": 3.991332769393921 }, { "auxiliary_loss_clip": 0.0632735, "auxiliary_loss_mlp": 0.01254051, "balance_loss_clip": 0.06269833, "balance_loss_mlp": 0.01252925, "epoch": 0.7120396813467609, "flos": 66591434315520.0, "grad_norm": 0.7663395439781714, "language_loss": 0.61434042, "learning_rate": 8.084498748272082e-07, "loss": 0.69015443, "num_input_tokens_seen": 255521360, "router_z_loss_clip": 0.57666016, "router_z_loss_mlp": 0.01127625, "step": 11843, "time_per_iteration": 3.1354167461395264 }, { "auxiliary_loss_clip": 0.06415676, "auxiliary_loss_mlp": 0.01266094, "balance_loss_clip": 0.06275108, "balance_loss_mlp": 0.01256599, "epoch": 0.7120998045994288, "flos": 26440001873280.0, "grad_norm": 1.8118987311137027, "language_loss": 0.80146801, "learning_rate": 8.081371007497171e-07, "loss": 0.87828565, "num_input_tokens_seen": 255541435, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09503174, "step": 11844, "time_per_iteration": 2.596092700958252 }, { "auxiliary_loss_clip": 0.0641935, "auxiliary_loss_mlp": 0.01262345, "balance_loss_clip": 0.06276806, "balance_loss_mlp": 0.01252891, "epoch": 0.7121599278520968, "flos": 16432300350720.0, "grad_norm": 2.6725451875885464, "language_loss": 0.79215825, "learning_rate": 8.078243718677873e-07, "loss": 0.86897522, "num_input_tokens_seen": 255558505, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.09448242, "step": 11845, "time_per_iteration": 2.5108325481414795 }, { "auxiliary_loss_clip": 0.06412928, "auxiliary_loss_mlp": 0.01263734, "balance_loss_clip": 0.0627563, "balance_loss_mlp": 0.01254054, "epoch": 0.7122200511047647, "flos": 28957520995200.0, "grad_norm": 3.8622880353389304, "language_loss": 0.77517599, "learning_rate": 8.075116881932762e-07, "loss": 0.8519426, "num_input_tokens_seen": 255577815, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09686279, "step": 11846, "time_per_iteration": 2.6198980808258057 }, { "auxiliary_loss_clip": 0.06426154, "auxiliary_loss_mlp": 0.01266111, "balance_loss_clip": 0.06282942, "balance_loss_mlp": 0.01255692, "epoch": 0.7122801743574327, "flos": 16477428574080.0, "grad_norm": 1.951873345582831, "language_loss": 0.58806461, "learning_rate": 8.071990497380421e-07, "loss": 0.66498733, "num_input_tokens_seen": 255595885, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.10418701, "step": 11847, "time_per_iteration": 2.561380624771118 }, { "auxiliary_loss_clip": 0.06410946, "auxiliary_loss_mlp": 0.01265955, "balance_loss_clip": 0.06276128, "balance_loss_mlp": 0.01256227, "epoch": 0.7123402976101008, "flos": 20637263721600.0, "grad_norm": 1.4647437433143173, "language_loss": 0.71698666, "learning_rate": 8.068864565139395e-07, "loss": 0.79375565, "num_input_tokens_seen": 255616750, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.09729004, "step": 11848, "time_per_iteration": 2.599472761154175 }, { "auxiliary_loss_clip": 0.06325669, "auxiliary_loss_mlp": 0.01252222, "balance_loss_clip": 0.06268311, "balance_loss_mlp": 0.01251059, "epoch": 0.7124004208627687, "flos": 62343606781440.0, "grad_norm": 0.8261465091975249, "language_loss": 0.62717175, "learning_rate": 8.065739085328211e-07, "loss": 0.70295066, "num_input_tokens_seen": 255677900, "router_z_loss_clip": 0.57275391, "router_z_loss_mlp": 0.01161194, "step": 11849, "time_per_iteration": 3.1740496158599854 }, { "auxiliary_loss_clip": 0.06420369, "auxiliary_loss_mlp": 0.01263464, "balance_loss_clip": 0.06276949, "balance_loss_mlp": 0.01253736, "epoch": 0.7124605441154367, "flos": 39685278579840.0, "grad_norm": 1.4535200726411308, "language_loss": 0.64560151, "learning_rate": 8.0626140580654e-07, "loss": 0.72243977, "num_input_tokens_seen": 255699140, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.097229, "step": 11850, "time_per_iteration": 2.7325587272644043 }, { "auxiliary_loss_clip": 0.06417513, "auxiliary_loss_mlp": 0.01262708, "balance_loss_clip": 0.06275173, "balance_loss_mlp": 0.01252849, "epoch": 0.7125206673681046, "flos": 28189066400640.0, "grad_norm": 1.5365308726701508, "language_loss": 0.70144403, "learning_rate": 8.05948948346946e-07, "loss": 0.77824628, "num_input_tokens_seen": 255719640, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09857178, "step": 11851, "time_per_iteration": 2.6050713062286377 }, { "auxiliary_loss_clip": 0.0641825, "auxiliary_loss_mlp": 0.01264025, "balance_loss_clip": 0.0627946, "balance_loss_mlp": 0.01254811, "epoch": 0.7125807906207726, "flos": 26184101904000.0, "grad_norm": 1.8660317497416483, "language_loss": 0.83322287, "learning_rate": 8.056365361658882e-07, "loss": 0.91004562, "num_input_tokens_seen": 255740450, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09222412, "step": 11852, "time_per_iteration": 2.6426618099212646 }, { "auxiliary_loss_clip": 0.0642547, "auxiliary_loss_mlp": 0.01265683, "balance_loss_clip": 0.0628019, "balance_loss_mlp": 0.01255228, "epoch": 0.7126409138734405, "flos": 17161706142720.0, "grad_norm": 2.102225535240212, "language_loss": 0.72989416, "learning_rate": 8.053241692752126e-07, "loss": 0.80680561, "num_input_tokens_seen": 255758070, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10461426, "step": 11853, "time_per_iteration": 2.5294556617736816 }, { "auxiliary_loss_clip": 0.06414792, "auxiliary_loss_mlp": 0.01265107, "balance_loss_clip": 0.06280261, "balance_loss_mlp": 0.01256536, "epoch": 0.7127010371261085, "flos": 18775790542080.0, "grad_norm": 1.7357261026092938, "language_loss": 0.92561817, "learning_rate": 8.050118476867635e-07, "loss": 1.00241709, "num_input_tokens_seen": 255775685, "router_z_loss_clip": 1.34667969, "router_z_loss_mlp": 0.08569336, "step": 11854, "time_per_iteration": 2.5944225788116455 }, { "auxiliary_loss_clip": 0.06417709, "auxiliary_loss_mlp": 0.01264113, "balance_loss_clip": 0.06280299, "balance_loss_mlp": 0.0125573, "epoch": 0.7127611603787765, "flos": 20382747344640.0, "grad_norm": 1.7770761937185373, "language_loss": 0.79558676, "learning_rate": 8.046995714123856e-07, "loss": 0.87240493, "num_input_tokens_seen": 255794750, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.08377075, "step": 11855, "time_per_iteration": 2.5596251487731934 }, { "auxiliary_loss_clip": 0.0642025, "auxiliary_loss_mlp": 0.01265206, "balance_loss_clip": 0.06280665, "balance_loss_mlp": 0.01254614, "epoch": 0.7128212836314445, "flos": 20455268653440.0, "grad_norm": 1.5896003428968826, "language_loss": 0.72946465, "learning_rate": 8.043873404639192e-07, "loss": 0.80631924, "num_input_tokens_seen": 255813325, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.105896, "step": 11856, "time_per_iteration": 2.570497989654541 }, { "auxiliary_loss_clip": 0.06425372, "auxiliary_loss_mlp": 0.01266743, "balance_loss_clip": 0.06280676, "balance_loss_mlp": 0.01256562, "epoch": 0.7128814068841124, "flos": 23447593336320.0, "grad_norm": 1.7773567726439194, "language_loss": 0.70446992, "learning_rate": 8.040751548532046e-07, "loss": 0.78139108, "num_input_tokens_seen": 255832470, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.10186768, "step": 11857, "time_per_iteration": 2.619739532470703 }, { "auxiliary_loss_clip": 0.06417727, "auxiliary_loss_mlp": 0.01262345, "balance_loss_clip": 0.06278998, "balance_loss_mlp": 0.0125229, "epoch": 0.7129415301367804, "flos": 18228757161600.0, "grad_norm": 1.9619223916044972, "language_loss": 0.85541773, "learning_rate": 8.03763014592081e-07, "loss": 0.93221843, "num_input_tokens_seen": 255849740, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.1005249, "step": 11858, "time_per_iteration": 2.5545237064361572 }, { "auxiliary_loss_clip": 0.06426793, "auxiliary_loss_mlp": 0.01266809, "balance_loss_clip": 0.06281193, "balance_loss_mlp": 0.01256491, "epoch": 0.7130016533894483, "flos": 15529410679680.0, "grad_norm": 1.7876895237709014, "language_loss": 0.80480969, "learning_rate": 8.034509196923829e-07, "loss": 0.88174576, "num_input_tokens_seen": 255866975, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.10321045, "step": 11859, "time_per_iteration": 2.5387086868286133 }, { "auxiliary_loss_clip": 0.06415979, "auxiliary_loss_mlp": 0.01264597, "balance_loss_clip": 0.06277189, "balance_loss_mlp": 0.01255484, "epoch": 0.7130617766421163, "flos": 57127804081920.0, "grad_norm": 1.2147811389778256, "language_loss": 0.68902278, "learning_rate": 8.031388701659456e-07, "loss": 0.76582849, "num_input_tokens_seen": 255892915, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09112549, "step": 11860, "time_per_iteration": 4.353181600570679 }, { "auxiliary_loss_clip": 0.0642263, "auxiliary_loss_mlp": 0.01265653, "balance_loss_clip": 0.06280097, "balance_loss_mlp": 0.01254996, "epoch": 0.7131218998947844, "flos": 19793730268800.0, "grad_norm": 1.624913652268994, "language_loss": 0.64540303, "learning_rate": 8.028268660246023e-07, "loss": 0.72228587, "num_input_tokens_seen": 255911480, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.10650635, "step": 11861, "time_per_iteration": 2.559432029724121 }, { "auxiliary_loss_clip": 0.06427626, "auxiliary_loss_mlp": 0.01264756, "balance_loss_clip": 0.06283294, "balance_loss_mlp": 0.01254289, "epoch": 0.7131820231474523, "flos": 26659242881280.0, "grad_norm": 1.5648514272154668, "language_loss": 0.67129761, "learning_rate": 8.025149072801849e-07, "loss": 0.7482214, "num_input_tokens_seen": 255931140, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10467529, "step": 11862, "time_per_iteration": 2.6129839420318604 }, { "auxiliary_loss_clip": 0.06421925, "auxiliary_loss_mlp": 0.01262709, "balance_loss_clip": 0.062834, "balance_loss_mlp": 0.01253911, "epoch": 0.7132421464001203, "flos": 29213337110400.0, "grad_norm": 2.0430025364836317, "language_loss": 0.66314554, "learning_rate": 8.022029939445214e-07, "loss": 0.73999178, "num_input_tokens_seen": 255951665, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.08795166, "step": 11863, "time_per_iteration": 2.628603219985962 }, { "auxiliary_loss_clip": 0.06430373, "auxiliary_loss_mlp": 0.01266714, "balance_loss_clip": 0.06283019, "balance_loss_mlp": 0.01255985, "epoch": 0.7133022696527882, "flos": 23079913620480.0, "grad_norm": 1.9032912615592283, "language_loss": 0.65950465, "learning_rate": 8.018911260294414e-07, "loss": 0.73647547, "num_input_tokens_seen": 255970055, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.1072998, "step": 11864, "time_per_iteration": 2.600046157836914 }, { "auxiliary_loss_clip": 0.0642507, "auxiliary_loss_mlp": 0.01266133, "balance_loss_clip": 0.06280956, "balance_loss_mlp": 0.01255798, "epoch": 0.7133623929054562, "flos": 17462860116480.0, "grad_norm": 1.7900456738300166, "language_loss": 0.8591882, "learning_rate": 8.015793035467697e-07, "loss": 0.93610024, "num_input_tokens_seen": 255987720, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10327148, "step": 11865, "time_per_iteration": 2.5732100009918213 }, { "auxiliary_loss_clip": 0.06424809, "auxiliary_loss_mlp": 0.01264449, "balance_loss_clip": 0.06281815, "balance_loss_mlp": 0.01254107, "epoch": 0.7134225161581241, "flos": 19542609982080.0, "grad_norm": 2.673888105521146, "language_loss": 0.74778819, "learning_rate": 8.012675265083304e-07, "loss": 0.82468075, "num_input_tokens_seen": 256005490, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10339355, "step": 11866, "time_per_iteration": 2.580118417739868 }, { "auxiliary_loss_clip": 0.06427209, "auxiliary_loss_mlp": 0.01266799, "balance_loss_clip": 0.06284834, "balance_loss_mlp": 0.01256088, "epoch": 0.7134826394107922, "flos": 26257294045440.0, "grad_norm": 2.969781903902724, "language_loss": 0.70582098, "learning_rate": 8.009557949259464e-07, "loss": 0.7827611, "num_input_tokens_seen": 256026030, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10699463, "step": 11867, "time_per_iteration": 4.023592948913574 }, { "auxiliary_loss_clip": 0.06423378, "auxiliary_loss_mlp": 0.01265341, "balance_loss_clip": 0.06285356, "balance_loss_mlp": 0.012559, "epoch": 0.7135427626634601, "flos": 15820795653120.0, "grad_norm": 1.9861086726920416, "language_loss": 0.71750796, "learning_rate": 8.006441088114397e-07, "loss": 0.79439515, "num_input_tokens_seen": 256043680, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09442139, "step": 11868, "time_per_iteration": 2.555098533630371 }, { "auxiliary_loss_clip": 0.064277, "auxiliary_loss_mlp": 0.01267815, "balance_loss_clip": 0.06281416, "balance_loss_mlp": 0.01256442, "epoch": 0.7136028859161281, "flos": 18229302213120.0, "grad_norm": 3.12793515685221, "language_loss": 0.67024416, "learning_rate": 8.003324681766286e-07, "loss": 0.7471993, "num_input_tokens_seen": 256059705, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.1137085, "step": 11869, "time_per_iteration": 2.5287668704986572 }, { "auxiliary_loss_clip": 0.0642397, "auxiliary_loss_mlp": 0.01264143, "balance_loss_clip": 0.06281941, "balance_loss_mlp": 0.01254797, "epoch": 0.713663009168796, "flos": 24321454767360.0, "grad_norm": 1.784711713732724, "language_loss": 0.78145581, "learning_rate": 8.000208730333298e-07, "loss": 0.85833699, "num_input_tokens_seen": 256079785, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09350586, "step": 11870, "time_per_iteration": 2.5772624015808105 }, { "auxiliary_loss_clip": 0.06421983, "auxiliary_loss_mlp": 0.01265737, "balance_loss_clip": 0.06281244, "balance_loss_mlp": 0.01254215, "epoch": 0.713723132421464, "flos": 26545157147520.0, "grad_norm": 1.6740011321529846, "language_loss": 0.81248748, "learning_rate": 7.997093233933597e-07, "loss": 0.88936472, "num_input_tokens_seen": 256099000, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.11523438, "step": 11871, "time_per_iteration": 2.5912179946899414 }, { "auxiliary_loss_clip": 0.06424495, "auxiliary_loss_mlp": 0.01271667, "balance_loss_clip": 0.0627932, "balance_loss_mlp": 0.01261606, "epoch": 0.7137832556741319, "flos": 19871911728000.0, "grad_norm": 1.5829295770201834, "language_loss": 0.7882883, "learning_rate": 7.993978192685331e-07, "loss": 0.86524999, "num_input_tokens_seen": 256117985, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.10064697, "step": 11872, "time_per_iteration": 2.5886917114257812 }, { "auxiliary_loss_clip": 0.0642878, "auxiliary_loss_mlp": 0.01266068, "balance_loss_clip": 0.06282134, "balance_loss_mlp": 0.01255471, "epoch": 0.7138433789267999, "flos": 21695300426880.0, "grad_norm": 2.778673892784534, "language_loss": 0.83695388, "learning_rate": 7.990863606706606e-07, "loss": 0.91390234, "num_input_tokens_seen": 256134350, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.10601807, "step": 11873, "time_per_iteration": 2.5821800231933594 }, { "auxiliary_loss_clip": 0.06421348, "auxiliary_loss_mlp": 0.01264367, "balance_loss_clip": 0.06282504, "balance_loss_mlp": 0.01255295, "epoch": 0.713903502179468, "flos": 17608447785600.0, "grad_norm": 1.9468236794060132, "language_loss": 0.8621608, "learning_rate": 7.987749476115539e-07, "loss": 0.93901789, "num_input_tokens_seen": 256150610, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09075928, "step": 11874, "time_per_iteration": 2.5936667919158936 }, { "auxiliary_loss_clip": 0.06423455, "auxiliary_loss_mlp": 0.01266033, "balance_loss_clip": 0.06282119, "balance_loss_mlp": 0.01255602, "epoch": 0.7139636254321359, "flos": 18046091260800.0, "grad_norm": 1.8692702203579603, "language_loss": 0.82924867, "learning_rate": 7.984635801030228e-07, "loss": 0.90614355, "num_input_tokens_seen": 256168620, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.10437012, "step": 11875, "time_per_iteration": 2.575718402862549 }, { "auxiliary_loss_clip": 0.06429, "auxiliary_loss_mlp": 0.01269737, "balance_loss_clip": 0.06278695, "balance_loss_mlp": 0.01257638, "epoch": 0.7140237486848039, "flos": 23337826087680.0, "grad_norm": 1.7885073587360967, "language_loss": 0.69968235, "learning_rate": 7.981522581568721e-07, "loss": 0.77666968, "num_input_tokens_seen": 256186700, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.12091064, "step": 11876, "time_per_iteration": 2.595485210418701 }, { "auxiliary_loss_clip": 0.0642875, "auxiliary_loss_mlp": 0.01263351, "balance_loss_clip": 0.06283395, "balance_loss_mlp": 0.01253093, "epoch": 0.7140838719374718, "flos": 16842760375680.0, "grad_norm": 1.9859936453485965, "language_loss": 0.7825883, "learning_rate": 7.978409817849079e-07, "loss": 0.85950929, "num_input_tokens_seen": 256205390, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10266113, "step": 11877, "time_per_iteration": 2.5910892486572266 }, { "auxiliary_loss_clip": 0.0642576, "auxiliary_loss_mlp": 0.01264398, "balance_loss_clip": 0.06285246, "balance_loss_mlp": 0.01254986, "epoch": 0.7141439951901398, "flos": 21148350900480.0, "grad_norm": 1.7037101278088589, "language_loss": 0.69620568, "learning_rate": 7.97529750998934e-07, "loss": 0.77310729, "num_input_tokens_seen": 256224575, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09405518, "step": 11878, "time_per_iteration": 4.0183751583099365 }, { "auxiliary_loss_clip": 0.0642294, "auxiliary_loss_mlp": 0.01265839, "balance_loss_clip": 0.0628531, "balance_loss_mlp": 0.01256196, "epoch": 0.7142041184428077, "flos": 24724661414400.0, "grad_norm": 2.0242816802669643, "language_loss": 0.67786014, "learning_rate": 7.972185658107535e-07, "loss": 0.75474799, "num_input_tokens_seen": 256242130, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09637451, "step": 11879, "time_per_iteration": 2.648632526397705 }, { "auxiliary_loss_clip": 0.06425796, "auxiliary_loss_mlp": 0.01264403, "balance_loss_clip": 0.06283393, "balance_loss_mlp": 0.01254008, "epoch": 0.7142642416954758, "flos": 21914667216000.0, "grad_norm": 1.5834443128411841, "language_loss": 0.69209778, "learning_rate": 7.969074262321646e-07, "loss": 0.76899981, "num_input_tokens_seen": 256261920, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.10400391, "step": 11880, "time_per_iteration": 2.554896593093872 }, { "auxiliary_loss_clip": 0.06423704, "auxiliary_loss_mlp": 0.01266042, "balance_loss_clip": 0.06278691, "balance_loss_mlp": 0.01256047, "epoch": 0.7143243649481437, "flos": 20810579892480.0, "grad_norm": 2.114121016784874, "language_loss": 0.80429792, "learning_rate": 7.965963322749674e-07, "loss": 0.88119543, "num_input_tokens_seen": 256277970, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.09991455, "step": 11881, "time_per_iteration": 2.5548884868621826 }, { "auxiliary_loss_clip": 0.06421332, "auxiliary_loss_mlp": 0.01265677, "balance_loss_clip": 0.06279621, "balance_loss_mlp": 0.01256397, "epoch": 0.7143844882008117, "flos": 27242348244480.0, "grad_norm": 1.5395721609615236, "language_loss": 0.64330339, "learning_rate": 7.962852839509579e-07, "loss": 0.72017348, "num_input_tokens_seen": 256298205, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.09283447, "step": 11882, "time_per_iteration": 4.008605480194092 }, { "auxiliary_loss_clip": 0.06427464, "auxiliary_loss_mlp": 0.01265537, "balance_loss_clip": 0.06283346, "balance_loss_mlp": 0.01255344, "epoch": 0.7144446114534796, "flos": 17935150055040.0, "grad_norm": 1.7127589633592881, "language_loss": 0.69216663, "learning_rate": 7.959742812719304e-07, "loss": 0.76909661, "num_input_tokens_seen": 256316685, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10186768, "step": 11883, "time_per_iteration": 2.543919086456299 }, { "auxiliary_loss_clip": 0.06423996, "auxiliary_loss_mlp": 0.01265181, "balance_loss_clip": 0.06285951, "balance_loss_mlp": 0.01255061, "epoch": 0.7145047347061476, "flos": 20747282532480.0, "grad_norm": 1.8376610481602607, "language_loss": 0.78034389, "learning_rate": 7.956633242496788e-07, "loss": 0.85723561, "num_input_tokens_seen": 256334205, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.10125732, "step": 11884, "time_per_iteration": 2.550131320953369 }, { "auxiliary_loss_clip": 0.06430159, "auxiliary_loss_mlp": 0.01269306, "balance_loss_clip": 0.06281245, "balance_loss_mlp": 0.01257916, "epoch": 0.7145648579588155, "flos": 21184967934720.0, "grad_norm": 3.578268219487957, "language_loss": 0.73933828, "learning_rate": 7.953524128959954e-07, "loss": 0.81633294, "num_input_tokens_seen": 256353340, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.1138916, "step": 11885, "time_per_iteration": 2.613107681274414 }, { "auxiliary_loss_clip": 0.06323455, "auxiliary_loss_mlp": 0.01255995, "balance_loss_clip": 0.06265995, "balance_loss_mlp": 0.01254675, "epoch": 0.7146249812114835, "flos": 64805207702400.0, "grad_norm": 0.9434627683111589, "language_loss": 0.66433525, "learning_rate": 7.95041547222669e-07, "loss": 0.74012977, "num_input_tokens_seen": 256411550, "router_z_loss_clip": 0.57714844, "router_z_loss_mlp": 0.01320648, "step": 11886, "time_per_iteration": 3.301645278930664 }, { "auxiliary_loss_clip": 0.06421855, "auxiliary_loss_mlp": 0.01264042, "balance_loss_clip": 0.06280973, "balance_loss_mlp": 0.01254387, "epoch": 0.7146851044641516, "flos": 18119744599680.0, "grad_norm": 1.7845735641774423, "language_loss": 0.75109524, "learning_rate": 7.947307272414874e-07, "loss": 0.82795417, "num_input_tokens_seen": 256430360, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09649658, "step": 11887, "time_per_iteration": 2.56280255317688 }, { "auxiliary_loss_clip": 0.06425957, "auxiliary_loss_mlp": 0.0126436, "balance_loss_clip": 0.06282368, "balance_loss_mlp": 0.01254484, "epoch": 0.7147452277168195, "flos": 19249715635200.0, "grad_norm": 1.4907241239494378, "language_loss": 0.71994412, "learning_rate": 7.944199529642372e-07, "loss": 0.79684734, "num_input_tokens_seen": 256449750, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.09881592, "step": 11888, "time_per_iteration": 2.590287208557129 }, { "auxiliary_loss_clip": 0.06429073, "auxiliary_loss_mlp": 0.01266342, "balance_loss_clip": 0.06282561, "balance_loss_mlp": 0.01255667, "epoch": 0.7148053509694875, "flos": 23770773734400.0, "grad_norm": 2.230769534603448, "language_loss": 0.84491003, "learning_rate": 7.941092244027041e-07, "loss": 0.92186415, "num_input_tokens_seen": 256467330, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.10675049, "step": 11889, "time_per_iteration": 2.596134901046753 }, { "auxiliary_loss_clip": 0.06418861, "auxiliary_loss_mlp": 0.01265127, "balance_loss_clip": 0.06276277, "balance_loss_mlp": 0.01255316, "epoch": 0.7148654742221554, "flos": 22490770763520.0, "grad_norm": 1.6677832054674222, "language_loss": 0.76174784, "learning_rate": 7.937985415686695e-07, "loss": 0.8385877, "num_input_tokens_seen": 256485705, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.0980835, "step": 11890, "time_per_iteration": 2.585458278656006 }, { "auxiliary_loss_clip": 0.06417584, "auxiliary_loss_mlp": 0.01265451, "balance_loss_clip": 0.06277614, "balance_loss_mlp": 0.01255503, "epoch": 0.7149255974748234, "flos": 24685822247040.0, "grad_norm": 1.528670779751692, "language_loss": 0.74275267, "learning_rate": 7.934879044739147e-07, "loss": 0.81958306, "num_input_tokens_seen": 256504755, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.0994873, "step": 11891, "time_per_iteration": 2.5831973552703857 }, { "auxiliary_loss_clip": 0.06421439, "auxiliary_loss_mlp": 0.01269275, "balance_loss_clip": 0.06278056, "balance_loss_mlp": 0.01258451, "epoch": 0.7149857207274913, "flos": 18411464989440.0, "grad_norm": 1.682977330735754, "language_loss": 0.67861897, "learning_rate": 7.931773131302211e-07, "loss": 0.75552607, "num_input_tokens_seen": 256523670, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10827637, "step": 11892, "time_per_iteration": 2.550248861312866 }, { "auxiliary_loss_clip": 0.06423526, "auxiliary_loss_mlp": 0.01268732, "balance_loss_clip": 0.06276938, "balance_loss_mlp": 0.01257306, "epoch": 0.7150458439801594, "flos": 24975907482240.0, "grad_norm": 2.2837699122266373, "language_loss": 0.7394563, "learning_rate": 7.928667675493632e-07, "loss": 0.81637889, "num_input_tokens_seen": 256542225, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.11419678, "step": 11893, "time_per_iteration": 2.594336986541748 }, { "auxiliary_loss_clip": 0.0642622, "auxiliary_loss_mlp": 0.01265273, "balance_loss_clip": 0.06278225, "balance_loss_mlp": 0.01253548, "epoch": 0.7151059672328273, "flos": 16696376092800.0, "grad_norm": 1.9195223243130066, "language_loss": 0.6707831, "learning_rate": 7.925562677431185e-07, "loss": 0.74769807, "num_input_tokens_seen": 256560730, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.11724854, "step": 11894, "time_per_iteration": 2.5606822967529297 }, { "auxiliary_loss_clip": 0.06425279, "auxiliary_loss_mlp": 0.01265466, "balance_loss_clip": 0.06280109, "balance_loss_mlp": 0.01255888, "epoch": 0.7151660904854953, "flos": 27279216840960.0, "grad_norm": 1.647415994779245, "language_loss": 0.7798779, "learning_rate": 7.922458137232613e-07, "loss": 0.8567853, "num_input_tokens_seen": 256580505, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.09576416, "step": 11895, "time_per_iteration": 2.612175703048706 }, { "auxiliary_loss_clip": 0.06424628, "auxiliary_loss_mlp": 0.01263228, "balance_loss_clip": 0.06279144, "balance_loss_mlp": 0.01252952, "epoch": 0.7152262137381632, "flos": 18338063212800.0, "grad_norm": 2.320104164752345, "language_loss": 0.6982488, "learning_rate": 7.919354055015643e-07, "loss": 0.77512741, "num_input_tokens_seen": 256597330, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.1026001, "step": 11896, "time_per_iteration": 2.558387279510498 }, { "auxiliary_loss_clip": 0.06426386, "auxiliary_loss_mlp": 0.01269436, "balance_loss_clip": 0.06280282, "balance_loss_mlp": 0.01258141, "epoch": 0.7152863369908312, "flos": 21805822362240.0, "grad_norm": 1.9672411241822398, "language_loss": 0.86807519, "learning_rate": 7.91625043089798e-07, "loss": 0.94503343, "num_input_tokens_seen": 256616030, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.11297607, "step": 11897, "time_per_iteration": 2.5962791442871094 }, { "auxiliary_loss_clip": 0.06418338, "auxiliary_loss_mlp": 0.01264339, "balance_loss_clip": 0.06278045, "balance_loss_mlp": 0.01253777, "epoch": 0.7153464602434991, "flos": 22164068494080.0, "grad_norm": 3.6459524524787272, "language_loss": 0.7848177, "learning_rate": 7.913147264997304e-07, "loss": 0.86164439, "num_input_tokens_seen": 256635570, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.10559082, "step": 11898, "time_per_iteration": 2.5611376762390137 }, { "auxiliary_loss_clip": 0.06429429, "auxiliary_loss_mlp": 0.01263362, "balance_loss_clip": 0.06282657, "balance_loss_mlp": 0.01252866, "epoch": 0.7154065834961671, "flos": 24722732770560.0, "grad_norm": 1.6610967361275368, "language_loss": 0.73287153, "learning_rate": 7.910044557431302e-07, "loss": 0.80979943, "num_input_tokens_seen": 256655290, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.10498047, "step": 11899, "time_per_iteration": 2.5867559909820557 }, { "auxiliary_loss_clip": 0.06419364, "auxiliary_loss_mlp": 0.01266505, "balance_loss_clip": 0.06276614, "balance_loss_mlp": 0.01256003, "epoch": 0.7154667067488351, "flos": 22608084879360.0, "grad_norm": 1.8244209893896748, "language_loss": 0.75857794, "learning_rate": 7.906942308317614e-07, "loss": 0.8354367, "num_input_tokens_seen": 256671605, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.1050415, "step": 11900, "time_per_iteration": 4.006478548049927 }, { "auxiliary_loss_clip": 0.06420366, "auxiliary_loss_mlp": 0.01264527, "balance_loss_clip": 0.06277095, "balance_loss_mlp": 0.01254991, "epoch": 0.7155268300015031, "flos": 18777216061440.0, "grad_norm": 1.7579505967774267, "language_loss": 0.80941492, "learning_rate": 7.903840517773886e-07, "loss": 0.88626385, "num_input_tokens_seen": 256689680, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09545898, "step": 11901, "time_per_iteration": 2.528820037841797 }, { "auxiliary_loss_clip": 0.06426433, "auxiliary_loss_mlp": 0.01266843, "balance_loss_clip": 0.06277645, "balance_loss_mlp": 0.01255846, "epoch": 0.7155869532541711, "flos": 18302242792320.0, "grad_norm": 2.10267669011795, "language_loss": 0.82018697, "learning_rate": 7.900739185917744e-07, "loss": 0.89711976, "num_input_tokens_seen": 256707760, "router_z_loss_clip": 1.48925781, "router_z_loss_mlp": 0.10992432, "step": 11902, "time_per_iteration": 2.5189127922058105 }, { "auxiliary_loss_clip": 0.06417349, "auxiliary_loss_mlp": 0.0126625, "balance_loss_clip": 0.06274388, "balance_loss_mlp": 0.01256189, "epoch": 0.715647076506839, "flos": 11985063298560.0, "grad_norm": 1.7176792639575003, "language_loss": 0.68492067, "learning_rate": 7.897638312866785e-07, "loss": 0.76175666, "num_input_tokens_seen": 256724150, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.10064697, "step": 11903, "time_per_iteration": 2.542592763900757 }, { "auxiliary_loss_clip": 0.06420992, "auxiliary_loss_mlp": 0.01266908, "balance_loss_clip": 0.06280468, "balance_loss_mlp": 0.01257323, "epoch": 0.715707199759507, "flos": 18957408266880.0, "grad_norm": 1.5785889895115817, "language_loss": 0.75987262, "learning_rate": 7.894537898738589e-07, "loss": 0.83675158, "num_input_tokens_seen": 256742780, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.0958252, "step": 11904, "time_per_iteration": 2.5614612102508545 }, { "auxiliary_loss_clip": 0.06420548, "auxiliary_loss_mlp": 0.01266144, "balance_loss_clip": 0.06278312, "balance_loss_mlp": 0.01255153, "epoch": 0.7157673230121749, "flos": 15309792328320.0, "grad_norm": 2.1240534853789472, "language_loss": 0.72007704, "learning_rate": 7.891437943650727e-07, "loss": 0.79694396, "num_input_tokens_seen": 256761355, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.10998535, "step": 11905, "time_per_iteration": 2.5643417835235596 }, { "auxiliary_loss_clip": 0.06419225, "auxiliary_loss_mlp": 0.01265784, "balance_loss_clip": 0.06277489, "balance_loss_mlp": 0.01256092, "epoch": 0.715827446264843, "flos": 23228561963520.0, "grad_norm": 1.6091754983676376, "language_loss": 0.78391975, "learning_rate": 7.88833844772076e-07, "loss": 0.86076987, "num_input_tokens_seen": 256781335, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09698486, "step": 11906, "time_per_iteration": 2.549077033996582 }, { "auxiliary_loss_clip": 0.06311063, "auxiliary_loss_mlp": 0.01253387, "balance_loss_clip": 0.06253485, "balance_loss_mlp": 0.01252091, "epoch": 0.7158875695175109, "flos": 60993011145600.0, "grad_norm": 0.7163949416303262, "language_loss": 0.55231231, "learning_rate": 7.885239411066205e-07, "loss": 0.62795681, "num_input_tokens_seen": 256838890, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.01296234, "step": 11907, "time_per_iteration": 4.59168267250061 }, { "auxiliary_loss_clip": 0.06417299, "auxiliary_loss_mlp": 0.01263074, "balance_loss_clip": 0.06274653, "balance_loss_mlp": 0.01252995, "epoch": 0.7159476927701789, "flos": 17134480765440.0, "grad_norm": 1.8132498606229754, "language_loss": 0.69808698, "learning_rate": 7.882140833804593e-07, "loss": 0.77489066, "num_input_tokens_seen": 256858145, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.10083008, "step": 11908, "time_per_iteration": 2.536618947982788 }, { "auxiliary_loss_clip": 0.06425177, "auxiliary_loss_mlp": 0.0126509, "balance_loss_clip": 0.06281348, "balance_loss_mlp": 0.01254844, "epoch": 0.7160078160228468, "flos": 22496934038400.0, "grad_norm": 1.6081448173722779, "language_loss": 0.71487558, "learning_rate": 7.879042716053415e-07, "loss": 0.79177821, "num_input_tokens_seen": 256878545, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.10241699, "step": 11909, "time_per_iteration": 2.5889217853546143 }, { "auxiliary_loss_clip": 0.06423947, "auxiliary_loss_mlp": 0.01265722, "balance_loss_clip": 0.0627979, "balance_loss_mlp": 0.01254916, "epoch": 0.7160679392755148, "flos": 30598704990720.0, "grad_norm": 1.4642247124682968, "language_loss": 0.75039995, "learning_rate": 7.875945057930144e-07, "loss": 0.82729661, "num_input_tokens_seen": 256899920, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10809326, "step": 11910, "time_per_iteration": 2.6291489601135254 }, { "auxiliary_loss_clip": 0.06419504, "auxiliary_loss_mlp": 0.01266621, "balance_loss_clip": 0.06277362, "balance_loss_mlp": 0.01256435, "epoch": 0.7161280625281827, "flos": 21329884771200.0, "grad_norm": 1.4838778956404404, "language_loss": 0.76637864, "learning_rate": 7.872847859552251e-07, "loss": 0.84323996, "num_input_tokens_seen": 256918460, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10192871, "step": 11911, "time_per_iteration": 2.5470080375671387 }, { "auxiliary_loss_clip": 0.06424088, "auxiliary_loss_mlp": 0.01265722, "balance_loss_clip": 0.06280029, "balance_loss_mlp": 0.01255005, "epoch": 0.7161881857808508, "flos": 61873218288000.0, "grad_norm": 1.6670438904088398, "language_loss": 0.58982587, "learning_rate": 7.869751121037192e-07, "loss": 0.66672397, "num_input_tokens_seen": 256942015, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10723877, "step": 11912, "time_per_iteration": 2.9218618869781494 }, { "auxiliary_loss_clip": 0.06420351, "auxiliary_loss_mlp": 0.0126403, "balance_loss_clip": 0.06280626, "balance_loss_mlp": 0.0125388, "epoch": 0.7162483090335187, "flos": 20818126759680.0, "grad_norm": 1.7255306332370457, "language_loss": 0.78525233, "learning_rate": 7.866654842502376e-07, "loss": 0.86209619, "num_input_tokens_seen": 256961065, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.10137939, "step": 11913, "time_per_iteration": 2.58268141746521 }, { "auxiliary_loss_clip": 0.06418011, "auxiliary_loss_mlp": 0.01267567, "balance_loss_clip": 0.06278864, "balance_loss_mlp": 0.01257846, "epoch": 0.7163084322861867, "flos": 24104393965440.0, "grad_norm": 1.604958526038709, "language_loss": 0.74260056, "learning_rate": 7.863559024065234e-07, "loss": 0.81945634, "num_input_tokens_seen": 256982165, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.097229, "step": 11914, "time_per_iteration": 2.5820376873016357 }, { "auxiliary_loss_clip": 0.06413803, "auxiliary_loss_mlp": 0.01263887, "balance_loss_clip": 0.06275682, "balance_loss_mlp": 0.01254279, "epoch": 0.7163685555388547, "flos": 20086540761600.0, "grad_norm": 1.6811987482136386, "language_loss": 0.74259388, "learning_rate": 7.860463665843143e-07, "loss": 0.81937081, "num_input_tokens_seen": 256999825, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09613037, "step": 11915, "time_per_iteration": 2.5674514770507812 }, { "auxiliary_loss_clip": 0.06419455, "auxiliary_loss_mlp": 0.01265003, "balance_loss_clip": 0.06276312, "balance_loss_mlp": 0.01254792, "epoch": 0.7164286787915226, "flos": 17462692408320.0, "grad_norm": 1.7410673863357053, "language_loss": 0.81108159, "learning_rate": 7.85736876795349e-07, "loss": 0.88792616, "num_input_tokens_seen": 257017450, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10217285, "step": 11916, "time_per_iteration": 2.5451066493988037 }, { "auxiliary_loss_clip": 0.06419302, "auxiliary_loss_mlp": 0.012691, "balance_loss_clip": 0.06275651, "balance_loss_mlp": 0.01259027, "epoch": 0.7164888020441906, "flos": 19724982393600.0, "grad_norm": 2.3657217771431074, "language_loss": 0.68611813, "learning_rate": 7.854274330513626e-07, "loss": 0.76300216, "num_input_tokens_seen": 257035465, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10070801, "step": 11917, "time_per_iteration": 2.5671470165252686 }, { "auxiliary_loss_clip": 0.06417105, "auxiliary_loss_mlp": 0.01268937, "balance_loss_clip": 0.06275123, "balance_loss_mlp": 0.01258328, "epoch": 0.7165489252968585, "flos": 21476939886720.0, "grad_norm": 1.5422409459127875, "language_loss": 0.76236475, "learning_rate": 7.851180353640896e-07, "loss": 0.83922517, "num_input_tokens_seen": 257053750, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.1060791, "step": 11918, "time_per_iteration": 3.9865493774414062 }, { "auxiliary_loss_clip": 0.06318708, "auxiliary_loss_mlp": 0.01252564, "balance_loss_clip": 0.06261379, "balance_loss_mlp": 0.01251451, "epoch": 0.7166090485495266, "flos": 69949426216320.0, "grad_norm": 0.7276753626598385, "language_loss": 0.537808, "learning_rate": 7.848086837452639e-07, "loss": 0.61352074, "num_input_tokens_seen": 257121215, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.01114655, "step": 11919, "time_per_iteration": 3.2275726795196533 }, { "auxiliary_loss_clip": 0.06426968, "auxiliary_loss_mlp": 0.01264649, "balance_loss_clip": 0.06282441, "balance_loss_mlp": 0.01254826, "epoch": 0.7166691718021945, "flos": 27351151171200.0, "grad_norm": 2.5688772933638373, "language_loss": 0.69228017, "learning_rate": 7.844993782066132e-07, "loss": 0.76919639, "num_input_tokens_seen": 257143370, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.0980835, "step": 11920, "time_per_iteration": 2.6301443576812744 }, { "auxiliary_loss_clip": 0.06420311, "auxiliary_loss_mlp": 0.01264802, "balance_loss_clip": 0.06277922, "balance_loss_mlp": 0.01254788, "epoch": 0.7167292950548625, "flos": 30416667995520.0, "grad_norm": 1.9597366797049447, "language_loss": 0.75253868, "learning_rate": 7.841901187598678e-07, "loss": 0.82938981, "num_input_tokens_seen": 257162160, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.10009766, "step": 11921, "time_per_iteration": 2.6285324096679688 }, { "auxiliary_loss_clip": 0.06430703, "auxiliary_loss_mlp": 0.01267253, "balance_loss_clip": 0.06283048, "balance_loss_mlp": 0.01254373, "epoch": 0.7167894183075304, "flos": 14575942270080.0, "grad_norm": 2.0134818507182772, "language_loss": 0.75747657, "learning_rate": 7.83880905416755e-07, "loss": 0.83445609, "num_input_tokens_seen": 257179300, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.12890625, "step": 11922, "time_per_iteration": 4.08564019203186 }, { "auxiliary_loss_clip": 0.06323074, "auxiliary_loss_mlp": 0.01256751, "balance_loss_clip": 0.06265719, "balance_loss_mlp": 0.01255368, "epoch": 0.7168495415601984, "flos": 64128365948160.0, "grad_norm": 0.7378621782420304, "language_loss": 0.55145848, "learning_rate": 7.83571738189001e-07, "loss": 0.62725663, "num_input_tokens_seen": 257235470, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.01384735, "step": 11923, "time_per_iteration": 2.9819798469543457 }, { "auxiliary_loss_clip": 0.06422536, "auxiliary_loss_mlp": 0.01266803, "balance_loss_clip": 0.0627806, "balance_loss_mlp": 0.01256092, "epoch": 0.7169096648128663, "flos": 24688421723520.0, "grad_norm": 1.942460068335334, "language_loss": 0.77284062, "learning_rate": 7.832626170883279e-07, "loss": 0.84973395, "num_input_tokens_seen": 257255850, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.10717773, "step": 11924, "time_per_iteration": 2.5809130668640137 }, { "auxiliary_loss_clip": 0.06416043, "auxiliary_loss_mlp": 0.01264482, "balance_loss_clip": 0.06275989, "balance_loss_mlp": 0.01254928, "epoch": 0.7169697880655344, "flos": 20673754974720.0, "grad_norm": 2.0285993032167973, "language_loss": 0.68764478, "learning_rate": 7.829535421264588e-07, "loss": 0.76445007, "num_input_tokens_seen": 257275425, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09552002, "step": 11925, "time_per_iteration": 2.5669591426849365 }, { "auxiliary_loss_clip": 0.06411487, "auxiliary_loss_mlp": 0.01263981, "balance_loss_clip": 0.06274663, "balance_loss_mlp": 0.01255034, "epoch": 0.7170299113182023, "flos": 21039044849280.0, "grad_norm": 1.9296186011043792, "language_loss": 0.77644902, "learning_rate": 7.826445133151133e-07, "loss": 0.85320365, "num_input_tokens_seen": 257295740, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.08953857, "step": 11926, "time_per_iteration": 2.5791046619415283 }, { "auxiliary_loss_clip": 0.06427565, "auxiliary_loss_mlp": 0.01267204, "balance_loss_clip": 0.06279796, "balance_loss_mlp": 0.01256439, "epoch": 0.7170900345708703, "flos": 22899931050240.0, "grad_norm": 1.9825263904123391, "language_loss": 0.77169043, "learning_rate": 7.823355306660093e-07, "loss": 0.84863806, "num_input_tokens_seen": 257315970, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.10760498, "step": 11927, "time_per_iteration": 2.5799756050109863 }, { "auxiliary_loss_clip": 0.06417876, "auxiliary_loss_mlp": 0.01264133, "balance_loss_clip": 0.06279729, "balance_loss_mlp": 0.01253767, "epoch": 0.7171501578235383, "flos": 15523331258880.0, "grad_norm": 1.686881986164599, "language_loss": 0.69600445, "learning_rate": 7.820265941908642e-07, "loss": 0.77282453, "num_input_tokens_seen": 257334230, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.10375977, "step": 11928, "time_per_iteration": 2.5330090522766113 }, { "auxiliary_loss_clip": 0.06416397, "auxiliary_loss_mlp": 0.01264277, "balance_loss_clip": 0.06278019, "balance_loss_mlp": 0.01254401, "epoch": 0.7172102810762062, "flos": 26111496741120.0, "grad_norm": 1.8332535760191528, "language_loss": 0.65306008, "learning_rate": 7.817177039013931e-07, "loss": 0.72986686, "num_input_tokens_seen": 257352145, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09881592, "step": 11929, "time_per_iteration": 2.5903429985046387 }, { "auxiliary_loss_clip": 0.06424293, "auxiliary_loss_mlp": 0.01263382, "balance_loss_clip": 0.06281151, "balance_loss_mlp": 0.01252957, "epoch": 0.7172704043288742, "flos": 21513011869440.0, "grad_norm": 2.0040827448014165, "language_loss": 0.70432997, "learning_rate": 7.81408859809308e-07, "loss": 0.78120679, "num_input_tokens_seen": 257371460, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.10430908, "step": 11930, "time_per_iteration": 2.54923939704895 }, { "auxiliary_loss_clip": 0.06419601, "auxiliary_loss_mlp": 0.01266525, "balance_loss_clip": 0.06276052, "balance_loss_mlp": 0.01256345, "epoch": 0.7173305275815421, "flos": 18776964499200.0, "grad_norm": 1.69188712010125, "language_loss": 0.80781704, "learning_rate": 7.811000619263219e-07, "loss": 0.88467836, "num_input_tokens_seen": 257390800, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10186768, "step": 11931, "time_per_iteration": 2.552936553955078 }, { "auxiliary_loss_clip": 0.06415687, "auxiliary_loss_mlp": 0.0126339, "balance_loss_clip": 0.06275789, "balance_loss_mlp": 0.0125383, "epoch": 0.7173906508342102, "flos": 16185372768000.0, "grad_norm": 1.8814171012246383, "language_loss": 0.78710806, "learning_rate": 7.80791310264143e-07, "loss": 0.86389887, "num_input_tokens_seen": 257407495, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09558105, "step": 11932, "time_per_iteration": 2.5190181732177734 }, { "auxiliary_loss_clip": 0.06416994, "auxiliary_loss_mlp": 0.01265617, "balance_loss_clip": 0.06277675, "balance_loss_mlp": 0.01255753, "epoch": 0.7174507740868781, "flos": 26620948765440.0, "grad_norm": 1.4467229492007185, "language_loss": 0.75289458, "learning_rate": 7.804826048344803e-07, "loss": 0.82972074, "num_input_tokens_seen": 257429675, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09863281, "step": 11933, "time_per_iteration": 2.6314330101013184 }, { "auxiliary_loss_clip": 0.06433681, "auxiliary_loss_mlp": 0.01272044, "balance_loss_clip": 0.06285411, "balance_loss_mlp": 0.0126023, "epoch": 0.7175108973395461, "flos": 18437264847360.0, "grad_norm": 2.690154652329028, "language_loss": 0.69378746, "learning_rate": 7.801739456490388e-07, "loss": 0.77084464, "num_input_tokens_seen": 257442765, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.11816406, "step": 11934, "time_per_iteration": 2.6399247646331787 }, { "auxiliary_loss_clip": 0.06416622, "auxiliary_loss_mlp": 0.01265667, "balance_loss_clip": 0.06276181, "balance_loss_mlp": 0.01256249, "epoch": 0.717571020592214, "flos": 23921769991680.0, "grad_norm": 2.356729172630934, "language_loss": 0.86481005, "learning_rate": 7.798653327195237e-07, "loss": 0.94163287, "num_input_tokens_seen": 257459310, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09423828, "step": 11935, "time_per_iteration": 2.586251735687256 }, { "auxiliary_loss_clip": 0.06419093, "auxiliary_loss_mlp": 0.01263595, "balance_loss_clip": 0.06277147, "balance_loss_mlp": 0.01253337, "epoch": 0.717631143844882, "flos": 38266647828480.0, "grad_norm": 1.5380359590627197, "language_loss": 0.73917282, "learning_rate": 7.795567660576388e-07, "loss": 0.81599963, "num_input_tokens_seen": 257484750, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10253906, "step": 11936, "time_per_iteration": 2.713176965713501 }, { "auxiliary_loss_clip": 0.0632498, "auxiliary_loss_mlp": 0.01250383, "balance_loss_clip": 0.06267878, "balance_loss_mlp": 0.01249148, "epoch": 0.7176912670975499, "flos": 65536961408640.0, "grad_norm": 0.7467184210851202, "language_loss": 0.55817533, "learning_rate": 7.79248245675082e-07, "loss": 0.63392901, "num_input_tokens_seen": 257543110, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01234436, "step": 11937, "time_per_iteration": 3.1569294929504395 }, { "auxiliary_loss_clip": 0.06422479, "auxiliary_loss_mlp": 0.01266252, "balance_loss_clip": 0.06277969, "balance_loss_mlp": 0.01255398, "epoch": 0.717751390350218, "flos": 31288433074560.0, "grad_norm": 1.7264621419704793, "language_loss": 0.54791498, "learning_rate": 7.789397715835542e-07, "loss": 0.62480229, "num_input_tokens_seen": 257567410, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.10852051, "step": 11938, "time_per_iteration": 2.6534183025360107 }, { "auxiliary_loss_clip": 0.06414458, "auxiliary_loss_mlp": 0.01265226, "balance_loss_clip": 0.06277622, "balance_loss_mlp": 0.01255653, "epoch": 0.7178115136028859, "flos": 19864155225600.0, "grad_norm": 1.478254239358875, "language_loss": 0.76576126, "learning_rate": 7.786313437947527e-07, "loss": 0.84255815, "num_input_tokens_seen": 257586270, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.0958252, "step": 11939, "time_per_iteration": 4.02403450012207 }, { "auxiliary_loss_clip": 0.06324333, "auxiliary_loss_mlp": 0.01250805, "balance_loss_clip": 0.06267087, "balance_loss_mlp": 0.01249485, "epoch": 0.7178716368555539, "flos": 64369576725120.0, "grad_norm": 0.7388307694172891, "language_loss": 0.61200643, "learning_rate": 7.783229623203738e-07, "loss": 0.68775779, "num_input_tokens_seen": 257647415, "router_z_loss_clip": 0.57275391, "router_z_loss_mlp": 0.01321411, "step": 11940, "time_per_iteration": 3.1446313858032227 }, { "auxiliary_loss_clip": 0.06415515, "auxiliary_loss_mlp": 0.01265029, "balance_loss_clip": 0.0627779, "balance_loss_mlp": 0.01255439, "epoch": 0.7179317601082219, "flos": 26770184087040.0, "grad_norm": 1.6056930926795188, "language_loss": 0.58544397, "learning_rate": 7.780146271721097e-07, "loss": 0.66224945, "num_input_tokens_seen": 257669795, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09576416, "step": 11941, "time_per_iteration": 2.590951681137085 }, { "auxiliary_loss_clip": 0.06416456, "auxiliary_loss_mlp": 0.01263427, "balance_loss_clip": 0.06276822, "balance_loss_mlp": 0.01253813, "epoch": 0.7179918833608898, "flos": 23520575842560.0, "grad_norm": 1.7262125023727408, "language_loss": 0.79631871, "learning_rate": 7.777063383616543e-07, "loss": 0.87311757, "num_input_tokens_seen": 257687415, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09606934, "step": 11942, "time_per_iteration": 2.6011037826538086 }, { "auxiliary_loss_clip": 0.06417505, "auxiliary_loss_mlp": 0.0126816, "balance_loss_clip": 0.06277628, "balance_loss_mlp": 0.01257914, "epoch": 0.7180520066135578, "flos": 17171349361920.0, "grad_norm": 1.9404644854062572, "language_loss": 0.66069746, "learning_rate": 7.773980959006968e-07, "loss": 0.73755407, "num_input_tokens_seen": 257706215, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.10247803, "step": 11943, "time_per_iteration": 2.589560031890869 }, { "auxiliary_loss_clip": 0.06420151, "auxiliary_loss_mlp": 0.01268927, "balance_loss_clip": 0.06278977, "balance_loss_mlp": 0.01258973, "epoch": 0.7181121298662257, "flos": 17572417729920.0, "grad_norm": 1.7298238819367164, "language_loss": 0.79011059, "learning_rate": 7.770898998009254e-07, "loss": 0.86700141, "num_input_tokens_seen": 257724740, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09960938, "step": 11944, "time_per_iteration": 2.594104528427124 }, { "auxiliary_loss_clip": 0.06418174, "auxiliary_loss_mlp": 0.01268727, "balance_loss_clip": 0.06273721, "balance_loss_mlp": 0.012572, "epoch": 0.7181722531188938, "flos": 11952471260160.0, "grad_norm": 10.55015606243875, "language_loss": 0.62966323, "learning_rate": 7.767817500740277e-07, "loss": 0.70653224, "num_input_tokens_seen": 257742060, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.11529541, "step": 11945, "time_per_iteration": 2.521719217300415 }, { "auxiliary_loss_clip": 0.06322433, "auxiliary_loss_mlp": 0.01252142, "balance_loss_clip": 0.06265503, "balance_loss_mlp": 0.01250798, "epoch": 0.7182323763715617, "flos": 65522664288000.0, "grad_norm": 0.6895964403933581, "language_loss": 0.50889462, "learning_rate": 7.76473646731689e-07, "loss": 0.58464038, "num_input_tokens_seen": 257802250, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01345825, "step": 11946, "time_per_iteration": 4.544299364089966 }, { "auxiliary_loss_clip": 0.06421964, "auxiliary_loss_mlp": 0.0126682, "balance_loss_clip": 0.0627808, "balance_loss_mlp": 0.0125574, "epoch": 0.7182924996242297, "flos": 20637137940480.0, "grad_norm": 1.5739058850610672, "language_loss": 0.75127584, "learning_rate": 7.761655897855925e-07, "loss": 0.82816368, "num_input_tokens_seen": 257821155, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.11077881, "step": 11947, "time_per_iteration": 2.5721702575683594 }, { "auxiliary_loss_clip": 0.06415781, "auxiliary_loss_mlp": 0.01267567, "balance_loss_clip": 0.06277389, "balance_loss_mlp": 0.01258168, "epoch": 0.7183526228768976, "flos": 16221947875200.0, "grad_norm": 1.373880984536538, "language_loss": 0.72817802, "learning_rate": 7.758575792474187e-07, "loss": 0.80501151, "num_input_tokens_seen": 257839905, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09399414, "step": 11948, "time_per_iteration": 2.5294299125671387 }, { "auxiliary_loss_clip": 0.06422387, "auxiliary_loss_mlp": 0.01270072, "balance_loss_clip": 0.06280242, "balance_loss_mlp": 0.01259909, "epoch": 0.7184127461295656, "flos": 22238518446720.0, "grad_norm": 2.1489708725300947, "language_loss": 0.71707547, "learning_rate": 7.755496151288483e-07, "loss": 0.79400003, "num_input_tokens_seen": 257860055, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10168457, "step": 11949, "time_per_iteration": 2.5805513858795166 }, { "auxiliary_loss_clip": 0.06417675, "auxiliary_loss_mlp": 0.01265419, "balance_loss_clip": 0.06278852, "balance_loss_mlp": 0.01255918, "epoch": 0.7184728693822335, "flos": 27351863930880.0, "grad_norm": 1.9438866040329226, "language_loss": 0.76544851, "learning_rate": 7.752416974415598e-07, "loss": 0.84227949, "num_input_tokens_seen": 257879315, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.0949707, "step": 11950, "time_per_iteration": 2.5782601833343506 }, { "auxiliary_loss_clip": 0.06427149, "auxiliary_loss_mlp": 0.01269281, "balance_loss_clip": 0.06282941, "balance_loss_mlp": 0.01258207, "epoch": 0.7185329926349016, "flos": 16514129462400.0, "grad_norm": 2.383463484267243, "language_loss": 0.68010378, "learning_rate": 7.749338261972282e-07, "loss": 0.75706816, "num_input_tokens_seen": 257896570, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.11077881, "step": 11951, "time_per_iteration": 2.573011875152588 }, { "auxiliary_loss_clip": 0.06426248, "auxiliary_loss_mlp": 0.01270271, "balance_loss_clip": 0.06280978, "balance_loss_mlp": 0.01258088, "epoch": 0.7185931158875695, "flos": 23957800047360.0, "grad_norm": 1.631991712881482, "language_loss": 0.78561729, "learning_rate": 7.746260014075286e-07, "loss": 0.86258245, "num_input_tokens_seen": 257916855, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.12188721, "step": 11952, "time_per_iteration": 2.5739269256591797 }, { "auxiliary_loss_clip": 0.06427066, "auxiliary_loss_mlp": 0.01270614, "balance_loss_clip": 0.0628073, "balance_loss_mlp": 0.01259796, "epoch": 0.7186532391402375, "flos": 26549265997440.0, "grad_norm": 2.0318319401871534, "language_loss": 0.74931598, "learning_rate": 7.743182230841352e-07, "loss": 0.82629287, "num_input_tokens_seen": 257937140, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.1081543, "step": 11953, "time_per_iteration": 2.590081214904785 }, { "auxiliary_loss_clip": 0.06422085, "auxiliary_loss_mlp": 0.012669, "balance_loss_clip": 0.06279773, "balance_loss_mlp": 0.01256696, "epoch": 0.7187133623929055, "flos": 22389682412160.0, "grad_norm": 1.9567070524773225, "language_loss": 0.73368895, "learning_rate": 7.740104912387164e-07, "loss": 0.81057882, "num_input_tokens_seen": 257956785, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10211182, "step": 11954, "time_per_iteration": 2.6266262531280518 }, { "auxiliary_loss_clip": 0.06420903, "auxiliary_loss_mlp": 0.01275545, "balance_loss_clip": 0.06278502, "balance_loss_mlp": 0.01264894, "epoch": 0.7187734856455734, "flos": 15785184867840.0, "grad_norm": 1.5957407639417809, "language_loss": 0.74781311, "learning_rate": 7.737028058829425e-07, "loss": 0.8247776, "num_input_tokens_seen": 257975455, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10650635, "step": 11955, "time_per_iteration": 2.7072813510894775 }, { "auxiliary_loss_clip": 0.06427249, "auxiliary_loss_mlp": 0.0126598, "balance_loss_clip": 0.06282809, "balance_loss_mlp": 0.01255597, "epoch": 0.7188336088982414, "flos": 31767766755840.0, "grad_norm": 1.7312639569685604, "language_loss": 0.73334444, "learning_rate": 7.733951670284817e-07, "loss": 0.81027675, "num_input_tokens_seen": 257996850, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.10375977, "step": 11956, "time_per_iteration": 2.6784794330596924 }, { "auxiliary_loss_clip": 0.06422828, "auxiliary_loss_mlp": 0.01269814, "balance_loss_clip": 0.06277426, "balance_loss_mlp": 0.01259198, "epoch": 0.7188937321509093, "flos": 21470734684800.0, "grad_norm": 1.5678327305492472, "language_loss": 0.70766151, "learning_rate": 7.730875746869987e-07, "loss": 0.78458798, "num_input_tokens_seen": 258016145, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10614014, "step": 11957, "time_per_iteration": 4.046545505523682 }, { "auxiliary_loss_clip": 0.06427935, "auxiliary_loss_mlp": 0.01271816, "balance_loss_clip": 0.06281607, "balance_loss_mlp": 0.01260318, "epoch": 0.7189538554035774, "flos": 27278839497600.0, "grad_norm": 2.3744244524913585, "language_loss": 0.74046063, "learning_rate": 7.727800288701582e-07, "loss": 0.81745815, "num_input_tokens_seen": 258035420, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.11499023, "step": 11958, "time_per_iteration": 2.589637041091919 }, { "auxiliary_loss_clip": 0.06417455, "auxiliary_loss_mlp": 0.01267881, "balance_loss_clip": 0.06278381, "balance_loss_mlp": 0.01257945, "epoch": 0.7190139786562453, "flos": 21587168332800.0, "grad_norm": 2.3434693399252278, "language_loss": 0.83951688, "learning_rate": 7.724725295896215e-07, "loss": 0.91637015, "num_input_tokens_seen": 258053520, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09936523, "step": 11959, "time_per_iteration": 2.543095350265503 }, { "auxiliary_loss_clip": 0.06427101, "auxiliary_loss_mlp": 0.0126646, "balance_loss_clip": 0.06281851, "balance_loss_mlp": 0.01256208, "epoch": 0.7190741019089133, "flos": 26727990756480.0, "grad_norm": 1.4657582580892916, "language_loss": 0.82190293, "learning_rate": 7.7216507685705e-07, "loss": 0.89883852, "num_input_tokens_seen": 258073020, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.10253906, "step": 11960, "time_per_iteration": 2.5962607860565186 }, { "auxiliary_loss_clip": 0.06424358, "auxiliary_loss_mlp": 0.01266626, "balance_loss_clip": 0.06283239, "balance_loss_mlp": 0.0125607, "epoch": 0.7191342251615812, "flos": 26112041792640.0, "grad_norm": 1.5853414630604525, "language_loss": 0.78069746, "learning_rate": 7.718576706841013e-07, "loss": 0.85760736, "num_input_tokens_seen": 258093155, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.10559082, "step": 11961, "time_per_iteration": 4.001144886016846 }, { "auxiliary_loss_clip": 0.06417113, "auxiliary_loss_mlp": 0.01266507, "balance_loss_clip": 0.06280637, "balance_loss_mlp": 0.0125716, "epoch": 0.7191943484142492, "flos": 22973794024320.0, "grad_norm": 1.4850318760044439, "language_loss": 0.75197715, "learning_rate": 7.715503110824326e-07, "loss": 0.82881331, "num_input_tokens_seen": 258113905, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.09338379, "step": 11962, "time_per_iteration": 2.5843417644500732 }, { "auxiliary_loss_clip": 0.06421633, "auxiliary_loss_mlp": 0.01266986, "balance_loss_clip": 0.06280084, "balance_loss_mlp": 0.01256525, "epoch": 0.7192544716669171, "flos": 22571970969600.0, "grad_norm": 1.6373274175490504, "language_loss": 0.75633848, "learning_rate": 7.712429980637001e-07, "loss": 0.83322465, "num_input_tokens_seen": 258132820, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.10467529, "step": 11963, "time_per_iteration": 2.5919790267944336 }, { "auxiliary_loss_clip": 0.06433605, "auxiliary_loss_mlp": 0.01266217, "balance_loss_clip": 0.06285783, "balance_loss_mlp": 0.01254457, "epoch": 0.7193145949195852, "flos": 18986981558400.0, "grad_norm": 2.2705516881685512, "language_loss": 0.80851376, "learning_rate": 7.709357316395564e-07, "loss": 0.88551193, "num_input_tokens_seen": 258148055, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.11761475, "step": 11964, "time_per_iteration": 2.5298521518707275 }, { "auxiliary_loss_clip": 0.06420197, "auxiliary_loss_mlp": 0.01267496, "balance_loss_clip": 0.06280103, "balance_loss_mlp": 0.01257334, "epoch": 0.7193747181722531, "flos": 18010061205120.0, "grad_norm": 1.691071416579642, "language_loss": 0.75519478, "learning_rate": 7.70628511821652e-07, "loss": 0.83207172, "num_input_tokens_seen": 258165995, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.10168457, "step": 11965, "time_per_iteration": 2.59960675239563 }, { "auxiliary_loss_clip": 0.06427553, "auxiliary_loss_mlp": 0.01267873, "balance_loss_clip": 0.06282513, "balance_loss_mlp": 0.0125699, "epoch": 0.7194348414249211, "flos": 24396323990400.0, "grad_norm": 1.6538514637355335, "language_loss": 0.7784338, "learning_rate": 7.703213386216377e-07, "loss": 0.85538805, "num_input_tokens_seen": 258186165, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10876465, "step": 11966, "time_per_iteration": 2.5769448280334473 }, { "auxiliary_loss_clip": 0.06419435, "auxiliary_loss_mlp": 0.01270199, "balance_loss_clip": 0.06276838, "balance_loss_mlp": 0.01260209, "epoch": 0.7194949646775891, "flos": 22169938279680.0, "grad_norm": 2.2395456001784675, "language_loss": 0.73206198, "learning_rate": 7.700142120511619e-07, "loss": 0.80895835, "num_input_tokens_seen": 258204595, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.09985352, "step": 11967, "time_per_iteration": 2.5628418922424316 }, { "auxiliary_loss_clip": 0.06413846, "auxiliary_loss_mlp": 0.0126943, "balance_loss_clip": 0.06278516, "balance_loss_mlp": 0.01259851, "epoch": 0.719555087930257, "flos": 20272560825600.0, "grad_norm": 2.285005007487135, "language_loss": 0.82222795, "learning_rate": 7.6970713212187e-07, "loss": 0.89906073, "num_input_tokens_seen": 258223110, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.09588623, "step": 11968, "time_per_iteration": 2.548086166381836 }, { "auxiliary_loss_clip": 0.06417401, "auxiliary_loss_mlp": 0.01265597, "balance_loss_clip": 0.06278674, "balance_loss_mlp": 0.01255714, "epoch": 0.719615211182925, "flos": 24723026259840.0, "grad_norm": 2.193493461744649, "language_loss": 0.76758742, "learning_rate": 7.69400098845407e-07, "loss": 0.84441739, "num_input_tokens_seen": 258242660, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09887695, "step": 11969, "time_per_iteration": 2.6026384830474854 }, { "auxiliary_loss_clip": 0.06418712, "auxiliary_loss_mlp": 0.01266725, "balance_loss_clip": 0.06276897, "balance_loss_mlp": 0.01256562, "epoch": 0.719675334435593, "flos": 20015570753280.0, "grad_norm": 1.4977638334552552, "language_loss": 0.71399367, "learning_rate": 7.69093112233417e-07, "loss": 0.79084808, "num_input_tokens_seen": 258261850, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.10180664, "step": 11970, "time_per_iteration": 2.534616470336914 }, { "auxiliary_loss_clip": 0.0632671, "auxiliary_loss_mlp": 0.01250151, "balance_loss_clip": 0.06269018, "balance_loss_mlp": 0.01248802, "epoch": 0.719735457688261, "flos": 44215965169920.0, "grad_norm": 0.8905135501173604, "language_loss": 0.60567725, "learning_rate": 7.68786172297538e-07, "loss": 0.68144584, "num_input_tokens_seen": 258312570, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.01350403, "step": 11971, "time_per_iteration": 3.049238920211792 }, { "auxiliary_loss_clip": 0.06428286, "auxiliary_loss_mlp": 0.01264399, "balance_loss_clip": 0.06280266, "balance_loss_mlp": 0.01254207, "epoch": 0.7197955809409289, "flos": 16808952453120.0, "grad_norm": 1.909891920624225, "language_loss": 0.80377805, "learning_rate": 7.684792790494105e-07, "loss": 0.88070494, "num_input_tokens_seen": 258331600, "router_z_loss_clip": 1.47949219, "router_z_loss_mlp": 0.10198975, "step": 11972, "time_per_iteration": 2.5432419776916504 }, { "auxiliary_loss_clip": 0.06425757, "auxiliary_loss_mlp": 0.01266604, "balance_loss_clip": 0.06282748, "balance_loss_mlp": 0.01256144, "epoch": 0.7198557041935969, "flos": 24542330929920.0, "grad_norm": 1.611454450983125, "language_loss": 0.76213771, "learning_rate": 7.681724325006733e-07, "loss": 0.83906132, "num_input_tokens_seen": 258351785, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10467529, "step": 11973, "time_per_iteration": 2.5783588886260986 }, { "auxiliary_loss_clip": 0.06324412, "auxiliary_loss_mlp": 0.0125175, "balance_loss_clip": 0.06267031, "balance_loss_mlp": 0.01250439, "epoch": 0.7199158274462648, "flos": 70729006204800.0, "grad_norm": 0.8332562091271757, "language_loss": 0.57013959, "learning_rate": 7.6786563266296e-07, "loss": 0.6459012, "num_input_tokens_seen": 258404035, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.01312256, "step": 11974, "time_per_iteration": 3.0369019508361816 }, { "auxiliary_loss_clip": 0.06423876, "auxiliary_loss_mlp": 0.012667, "balance_loss_clip": 0.06279653, "balance_loss_mlp": 0.01256233, "epoch": 0.7199759506989328, "flos": 29355151345920.0, "grad_norm": 1.796716287339698, "language_loss": 0.61287928, "learning_rate": 7.675588795479062e-07, "loss": 0.68978506, "num_input_tokens_seen": 258424850, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10473633, "step": 11975, "time_per_iteration": 2.6054131984710693 }, { "auxiliary_loss_clip": 0.06425712, "auxiliary_loss_mlp": 0.0126884, "balance_loss_clip": 0.06284784, "balance_loss_mlp": 0.01258725, "epoch": 0.7200360739516007, "flos": 24646689590400.0, "grad_norm": 1.899237018492531, "language_loss": 0.67809355, "learning_rate": 7.672521731671425e-07, "loss": 0.7550391, "num_input_tokens_seen": 258445485, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.10113525, "step": 11976, "time_per_iteration": 2.605365753173828 }, { "auxiliary_loss_clip": 0.06424543, "auxiliary_loss_mlp": 0.01264163, "balance_loss_clip": 0.06282932, "balance_loss_mlp": 0.01254204, "epoch": 0.7200961972042688, "flos": 20819007227520.0, "grad_norm": 1.6923098362396278, "language_loss": 0.67484546, "learning_rate": 7.669455135323004e-07, "loss": 0.75173247, "num_input_tokens_seen": 258464505, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09960938, "step": 11977, "time_per_iteration": 2.5617361068725586 }, { "auxiliary_loss_clip": 0.06427317, "auxiliary_loss_mlp": 0.01269894, "balance_loss_clip": 0.06282863, "balance_loss_mlp": 0.01259421, "epoch": 0.7201563204569367, "flos": 31253493121920.0, "grad_norm": 1.6094412933141573, "language_loss": 0.75568855, "learning_rate": 7.666389006550074e-07, "loss": 0.83266068, "num_input_tokens_seen": 258487190, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10467529, "step": 11978, "time_per_iteration": 2.6608169078826904 }, { "auxiliary_loss_clip": 0.06421273, "auxiliary_loss_mlp": 0.01263682, "balance_loss_clip": 0.06281449, "balance_loss_mlp": 0.01253627, "epoch": 0.7202164437096047, "flos": 26658655902720.0, "grad_norm": 2.0767940387622614, "language_loss": 0.78785998, "learning_rate": 7.663323345468908e-07, "loss": 0.86470956, "num_input_tokens_seen": 258503790, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.1005249, "step": 11979, "time_per_iteration": 4.0762715339660645 }, { "auxiliary_loss_clip": 0.06423898, "auxiliary_loss_mlp": 0.01267902, "balance_loss_clip": 0.06279986, "balance_loss_mlp": 0.01257173, "epoch": 0.7202765669622727, "flos": 25966999175040.0, "grad_norm": 1.4773370817641327, "language_loss": 0.65273571, "learning_rate": 7.660258152195767e-07, "loss": 0.72965372, "num_input_tokens_seen": 258527335, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10723877, "step": 11980, "time_per_iteration": 2.687577247619629 }, { "auxiliary_loss_clip": 0.06426559, "auxiliary_loss_mlp": 0.01267133, "balance_loss_clip": 0.06283045, "balance_loss_mlp": 0.01256839, "epoch": 0.7203366902149406, "flos": 28519961374080.0, "grad_norm": 1.9152402468038932, "language_loss": 0.67630851, "learning_rate": 7.657193426846871e-07, "loss": 0.75324547, "num_input_tokens_seen": 258546690, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.10296631, "step": 11981, "time_per_iteration": 2.715675115585327 }, { "auxiliary_loss_clip": 0.06426597, "auxiliary_loss_mlp": 0.01267149, "balance_loss_clip": 0.06282556, "balance_loss_mlp": 0.01256497, "epoch": 0.7203968134676086, "flos": 21112446625920.0, "grad_norm": 1.5840470953139547, "language_loss": 0.73917115, "learning_rate": 7.65412916953843e-07, "loss": 0.81610864, "num_input_tokens_seen": 258566340, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10656738, "step": 11982, "time_per_iteration": 2.5694310665130615 }, { "auxiliary_loss_clip": 0.06421193, "auxiliary_loss_mlp": 0.01266632, "balance_loss_clip": 0.06278291, "balance_loss_mlp": 0.01257137, "epoch": 0.7204569367202766, "flos": 18337937431680.0, "grad_norm": 1.7531351781583582, "language_loss": 0.6567899, "learning_rate": 7.65106538038665e-07, "loss": 0.73366821, "num_input_tokens_seen": 258584455, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.09490967, "step": 11983, "time_per_iteration": 2.596494674682617 }, { "auxiliary_loss_clip": 0.06421781, "auxiliary_loss_mlp": 0.01265381, "balance_loss_clip": 0.0628024, "balance_loss_mlp": 0.01255063, "epoch": 0.7205170599729446, "flos": 23261279783040.0, "grad_norm": 1.5065631663084358, "language_loss": 0.66701823, "learning_rate": 7.648002059507715e-07, "loss": 0.74388981, "num_input_tokens_seen": 258604725, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.10321045, "step": 11984, "time_per_iteration": 2.564335584640503 }, { "auxiliary_loss_clip": 0.06428738, "auxiliary_loss_mlp": 0.01270534, "balance_loss_clip": 0.06283891, "balance_loss_mlp": 0.01259602, "epoch": 0.7205771832256125, "flos": 20127140864640.0, "grad_norm": 3.2869122848072583, "language_loss": 0.74433005, "learning_rate": 7.644939207017771e-07, "loss": 0.8213228, "num_input_tokens_seen": 258622885, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.10931396, "step": 11985, "time_per_iteration": 2.6549012660980225 }, { "auxiliary_loss_clip": 0.06422959, "auxiliary_loss_mlp": 0.01266279, "balance_loss_clip": 0.06281672, "balance_loss_mlp": 0.01256295, "epoch": 0.7206373064782805, "flos": 27709648865280.0, "grad_norm": 1.9699638293199289, "language_loss": 0.62839746, "learning_rate": 7.641876823032977e-07, "loss": 0.70528984, "num_input_tokens_seen": 258644305, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09985352, "step": 11986, "time_per_iteration": 4.073857069015503 }, { "auxiliary_loss_clip": 0.06432149, "auxiliary_loss_mlp": 0.01271044, "balance_loss_clip": 0.06288297, "balance_loss_mlp": 0.01259952, "epoch": 0.7206974297309484, "flos": 17974031149440.0, "grad_norm": 1.6634553636776108, "language_loss": 0.72831666, "learning_rate": 7.638814907669455e-07, "loss": 0.80534858, "num_input_tokens_seen": 258661775, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.11083984, "step": 11987, "time_per_iteration": 2.5763437747955322 }, { "auxiliary_loss_clip": 0.06427746, "auxiliary_loss_mlp": 0.01264815, "balance_loss_clip": 0.06282274, "balance_loss_mlp": 0.01254438, "epoch": 0.7207575529836164, "flos": 16988893096320.0, "grad_norm": 1.7866188990847467, "language_loss": 0.78597152, "learning_rate": 7.635753461043301e-07, "loss": 0.8628971, "num_input_tokens_seen": 258679830, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10369873, "step": 11988, "time_per_iteration": 2.5196964740753174 }, { "auxiliary_loss_clip": 0.06421538, "auxiliary_loss_mlp": 0.01264475, "balance_loss_clip": 0.06278674, "balance_loss_mlp": 0.0125436, "epoch": 0.7208176762362843, "flos": 18732465181440.0, "grad_norm": 2.3198705749423563, "language_loss": 0.7890923, "learning_rate": 7.632692483270618e-07, "loss": 0.86595249, "num_input_tokens_seen": 258697415, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10119629, "step": 11989, "time_per_iteration": 2.5348289012908936 }, { "auxiliary_loss_clip": 0.06412794, "auxiliary_loss_mlp": 0.01268651, "balance_loss_clip": 0.06275273, "balance_loss_mlp": 0.01258012, "epoch": 0.7208777994889524, "flos": 18740515173120.0, "grad_norm": 1.675076365112898, "language_loss": 0.83191669, "learning_rate": 7.629631974467481e-07, "loss": 0.9087311, "num_input_tokens_seen": 258716755, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.10650635, "step": 11990, "time_per_iteration": 2.5339276790618896 }, { "auxiliary_loss_clip": 0.06420313, "auxiliary_loss_mlp": 0.01271845, "balance_loss_clip": 0.0628077, "balance_loss_mlp": 0.01261999, "epoch": 0.7209379227416203, "flos": 14798705149440.0, "grad_norm": 2.035995081393507, "language_loss": 0.76609135, "learning_rate": 7.626571934749931e-07, "loss": 0.84301293, "num_input_tokens_seen": 258733270, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09844971, "step": 11991, "time_per_iteration": 2.518385171890259 }, { "auxiliary_loss_clip": 0.06418599, "auxiliary_loss_mlp": 0.01267605, "balance_loss_clip": 0.06280759, "balance_loss_mlp": 0.01257645, "epoch": 0.7209980459942883, "flos": 29643559499520.0, "grad_norm": 1.5503049448431765, "language_loss": 0.72855604, "learning_rate": 7.623512364234022e-07, "loss": 0.80541801, "num_input_tokens_seen": 258755270, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09960938, "step": 11992, "time_per_iteration": 2.628242015838623 }, { "auxiliary_loss_clip": 0.06427084, "auxiliary_loss_mlp": 0.01264625, "balance_loss_clip": 0.06281339, "balance_loss_mlp": 0.012542, "epoch": 0.7210581692469563, "flos": 23483916881280.0, "grad_norm": 1.5438961610521904, "language_loss": 0.66270328, "learning_rate": 7.620453263035755e-07, "loss": 0.73962033, "num_input_tokens_seen": 258775340, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10424805, "step": 11993, "time_per_iteration": 2.571768283843994 }, { "auxiliary_loss_clip": 0.06423177, "auxiliary_loss_mlp": 0.01268572, "balance_loss_clip": 0.06280604, "balance_loss_mlp": 0.01258231, "epoch": 0.7211182924996242, "flos": 26106297788160.0, "grad_norm": 2.3137249480624287, "language_loss": 0.65798229, "learning_rate": 7.61739463127115e-07, "loss": 0.73489976, "num_input_tokens_seen": 258794580, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.10339355, "step": 11994, "time_per_iteration": 2.604383707046509 }, { "auxiliary_loss_clip": 0.06422816, "auxiliary_loss_mlp": 0.01265286, "balance_loss_clip": 0.06279735, "balance_loss_mlp": 0.0125445, "epoch": 0.7211784157522922, "flos": 17717795763840.0, "grad_norm": 1.6138595055351603, "language_loss": 0.67174006, "learning_rate": 7.614336469056172e-07, "loss": 0.74862111, "num_input_tokens_seen": 258812330, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10839844, "step": 11995, "time_per_iteration": 2.5315613746643066 }, { "auxiliary_loss_clip": 0.06419285, "auxiliary_loss_mlp": 0.01264836, "balance_loss_clip": 0.06282245, "balance_loss_mlp": 0.0125487, "epoch": 0.7212385390049602, "flos": 24430173840000.0, "grad_norm": 5.9772153284709235, "language_loss": 0.79617345, "learning_rate": 7.6112787765068e-07, "loss": 0.87301469, "num_input_tokens_seen": 258831770, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09967041, "step": 11996, "time_per_iteration": 2.6049726009368896 }, { "auxiliary_loss_clip": 0.06426401, "auxiliary_loss_mlp": 0.012645, "balance_loss_clip": 0.06283593, "balance_loss_mlp": 0.01253652, "epoch": 0.7212986622576282, "flos": 28154755353600.0, "grad_norm": 1.9100627860587809, "language_loss": 0.81666929, "learning_rate": 7.60822155373899e-07, "loss": 0.89357829, "num_input_tokens_seen": 258849090, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.10845947, "step": 11997, "time_per_iteration": 4.0639119148254395 }, { "auxiliary_loss_clip": 0.06429826, "auxiliary_loss_mlp": 0.01267126, "balance_loss_clip": 0.06284458, "balance_loss_mlp": 0.01256481, "epoch": 0.7213587855102961, "flos": 21842313615360.0, "grad_norm": 1.9798837001397593, "language_loss": 0.68028235, "learning_rate": 7.605164800868646e-07, "loss": 0.75725186, "num_input_tokens_seen": 258868230, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10644531, "step": 11998, "time_per_iteration": 2.5548272132873535 }, { "auxiliary_loss_clip": 0.06421967, "auxiliary_loss_mlp": 0.01266306, "balance_loss_clip": 0.06279624, "balance_loss_mlp": 0.01256514, "epoch": 0.7214189087629641, "flos": 14616877789440.0, "grad_norm": 1.8303170845139356, "language_loss": 0.72686303, "learning_rate": 7.602108518011696e-07, "loss": 0.80374575, "num_input_tokens_seen": 258885525, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09796143, "step": 11999, "time_per_iteration": 2.523167610168457 }, { "auxiliary_loss_clip": 0.06423475, "auxiliary_loss_mlp": 0.01266986, "balance_loss_clip": 0.06281699, "balance_loss_mlp": 0.01256573, "epoch": 0.721479032015632, "flos": 19396938458880.0, "grad_norm": 2.3427493618607733, "language_loss": 0.83123052, "learning_rate": 7.599052705284039e-07, "loss": 0.90813512, "num_input_tokens_seen": 258903245, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.10418701, "step": 12000, "time_per_iteration": 2.542065382003784 }, { "auxiliary_loss_clip": 0.06430743, "auxiliary_loss_mlp": 0.01265077, "balance_loss_clip": 0.06287133, "balance_loss_mlp": 0.01254491, "epoch": 0.7215391552683, "flos": 18518423126400.0, "grad_norm": 3.0045127619167893, "language_loss": 0.77243614, "learning_rate": 7.59599736280154e-07, "loss": 0.84939432, "num_input_tokens_seen": 258921245, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.10583496, "step": 12001, "time_per_iteration": 3.821695566177368 }, { "auxiliary_loss_clip": 0.06418331, "auxiliary_loss_mlp": 0.0126629, "balance_loss_clip": 0.06280564, "balance_loss_mlp": 0.01256497, "epoch": 0.721599278520968, "flos": 23265514414080.0, "grad_norm": 1.8080703731483694, "language_loss": 0.81856728, "learning_rate": 7.592942490680066e-07, "loss": 0.8954134, "num_input_tokens_seen": 258939425, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09796143, "step": 12002, "time_per_iteration": 2.796354293823242 }, { "auxiliary_loss_clip": 0.0643111, "auxiliary_loss_mlp": 0.01267514, "balance_loss_clip": 0.06286643, "balance_loss_mlp": 0.01256935, "epoch": 0.721659401773636, "flos": 39207831615360.0, "grad_norm": 1.8547505690795458, "language_loss": 0.62339354, "learning_rate": 7.589888089035462e-07, "loss": 0.70037985, "num_input_tokens_seen": 258960710, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10583496, "step": 12003, "time_per_iteration": 2.7743167877197266 }, { "auxiliary_loss_clip": 0.06428675, "auxiliary_loss_mlp": 0.01269859, "balance_loss_clip": 0.06283763, "balance_loss_mlp": 0.01258927, "epoch": 0.7217195250263039, "flos": 14945299067520.0, "grad_norm": 2.4149237178268947, "language_loss": 0.68201941, "learning_rate": 7.586834157983544e-07, "loss": 0.75900471, "num_input_tokens_seen": 258978475, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.109375, "step": 12004, "time_per_iteration": 2.5469322204589844 }, { "auxiliary_loss_clip": 0.06326884, "auxiliary_loss_mlp": 0.01255729, "balance_loss_clip": 0.06269232, "balance_loss_mlp": 0.01254275, "epoch": 0.7217796482789719, "flos": 70889477973120.0, "grad_norm": 1.0090073663529908, "language_loss": 0.54096341, "learning_rate": 7.583780697640112e-07, "loss": 0.61678946, "num_input_tokens_seen": 259037520, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.014534, "step": 12005, "time_per_iteration": 3.145476818084717 }, { "auxiliary_loss_clip": 0.0642311, "auxiliary_loss_mlp": 0.01263027, "balance_loss_clip": 0.0628166, "balance_loss_mlp": 0.01252763, "epoch": 0.7218397715316398, "flos": 37460653804800.0, "grad_norm": 1.4495838739633684, "language_loss": 0.63266611, "learning_rate": 7.580727708120962e-07, "loss": 0.70952749, "num_input_tokens_seen": 259061325, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.10266113, "step": 12006, "time_per_iteration": 2.704357624053955 }, { "auxiliary_loss_clip": 0.06426807, "auxiliary_loss_mlp": 0.01264228, "balance_loss_clip": 0.0628166, "balance_loss_mlp": 0.01254578, "epoch": 0.7218998947843078, "flos": 22717223222400.0, "grad_norm": 1.6954284183586301, "language_loss": 0.91693115, "learning_rate": 7.577675189541865e-07, "loss": 0.99384153, "num_input_tokens_seen": 259078135, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.09649658, "step": 12007, "time_per_iteration": 2.695800542831421 }, { "auxiliary_loss_clip": 0.06428798, "auxiliary_loss_mlp": 0.01265204, "balance_loss_clip": 0.0628344, "balance_loss_mlp": 0.01254565, "epoch": 0.7219600180369758, "flos": 12172131538560.0, "grad_norm": 1.750884629974531, "language_loss": 0.64062953, "learning_rate": 7.574623142018568e-07, "loss": 0.71756947, "num_input_tokens_seen": 259095910, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.10644531, "step": 12008, "time_per_iteration": 2.5676887035369873 }, { "auxiliary_loss_clip": 0.06430679, "auxiliary_loss_mlp": 0.01266141, "balance_loss_clip": 0.06284161, "balance_loss_mlp": 0.01255329, "epoch": 0.7220201412896438, "flos": 22602340874880.0, "grad_norm": 2.299364621283438, "language_loss": 0.79494333, "learning_rate": 7.57157156566681e-07, "loss": 0.87191159, "num_input_tokens_seen": 259114225, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.10803223, "step": 12009, "time_per_iteration": 2.6152231693267822 }, { "auxiliary_loss_clip": 0.06428598, "auxiliary_loss_mlp": 0.01267648, "balance_loss_clip": 0.06282099, "balance_loss_mlp": 0.01256287, "epoch": 0.7220802645423118, "flos": 26724972009600.0, "grad_norm": 1.843687850814739, "language_loss": 0.64423519, "learning_rate": 7.568520460602297e-07, "loss": 0.72119766, "num_input_tokens_seen": 259134660, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.11358643, "step": 12010, "time_per_iteration": 2.6168253421783447 }, { "auxiliary_loss_clip": 0.06423503, "auxiliary_loss_mlp": 0.01265858, "balance_loss_clip": 0.06281678, "balance_loss_mlp": 0.01255856, "epoch": 0.7221403877949797, "flos": 24426568114560.0, "grad_norm": 2.0964902924955044, "language_loss": 0.77354991, "learning_rate": 7.565469826940742e-07, "loss": 0.85044348, "num_input_tokens_seen": 259153300, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09991455, "step": 12011, "time_per_iteration": 2.5875065326690674 }, { "auxiliary_loss_clip": 0.06427932, "auxiliary_loss_mlp": 0.01264299, "balance_loss_clip": 0.0628616, "balance_loss_mlp": 0.01255322, "epoch": 0.7222005110476477, "flos": 23521246675200.0, "grad_norm": 1.6131355931671127, "language_loss": 0.79508555, "learning_rate": 7.56241966479781e-07, "loss": 0.87200785, "num_input_tokens_seen": 259172115, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.08972168, "step": 12012, "time_per_iteration": 2.566286325454712 }, { "auxiliary_loss_clip": 0.064283, "auxiliary_loss_mlp": 0.0126353, "balance_loss_clip": 0.0628427, "balance_loss_mlp": 0.01254195, "epoch": 0.7222606343003156, "flos": 23119255912320.0, "grad_norm": 1.8303706552090877, "language_loss": 0.76080722, "learning_rate": 7.559369974289171e-07, "loss": 0.83772552, "num_input_tokens_seen": 259191345, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.09326172, "step": 12013, "time_per_iteration": 2.5532267093658447 }, { "auxiliary_loss_clip": 0.06419467, "auxiliary_loss_mlp": 0.01265069, "balance_loss_clip": 0.06280429, "balance_loss_mlp": 0.01255288, "epoch": 0.7223207575529836, "flos": 24357778312320.0, "grad_norm": 1.3596410022038132, "language_loss": 0.76144207, "learning_rate": 7.556320755530484e-07, "loss": 0.83828747, "num_input_tokens_seen": 259211700, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09783936, "step": 12014, "time_per_iteration": 2.603416919708252 }, { "auxiliary_loss_clip": 0.06428625, "auxiliary_loss_mlp": 0.01264356, "balance_loss_clip": 0.06284487, "balance_loss_mlp": 0.01254039, "epoch": 0.7223808808056515, "flos": 28337798597760.0, "grad_norm": 1.6349102686945312, "language_loss": 0.86800122, "learning_rate": 7.553272008637346e-07, "loss": 0.94493115, "num_input_tokens_seen": 259233825, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.10314941, "step": 12015, "time_per_iteration": 2.6087939739227295 }, { "auxiliary_loss_clip": 0.06425405, "auxiliary_loss_mlp": 0.01265749, "balance_loss_clip": 0.06284454, "balance_loss_mlp": 0.01256552, "epoch": 0.7224410040583196, "flos": 21075829591680.0, "grad_norm": 1.877659399586635, "language_loss": 0.78359127, "learning_rate": 7.55022373372538e-07, "loss": 0.86050284, "num_input_tokens_seen": 259253055, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09197998, "step": 12016, "time_per_iteration": 2.5371172428131104 }, { "auxiliary_loss_clip": 0.06424746, "auxiliary_loss_mlp": 0.01265765, "balance_loss_clip": 0.06283635, "balance_loss_mlp": 0.01256014, "epoch": 0.7225011273109875, "flos": 26802398782080.0, "grad_norm": 1.373472321712267, "language_loss": 0.77922982, "learning_rate": 7.547175930910186e-07, "loss": 0.85613495, "num_input_tokens_seen": 259273420, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09741211, "step": 12017, "time_per_iteration": 2.5763111114501953 }, { "auxiliary_loss_clip": 0.06422749, "auxiliary_loss_mlp": 0.01267344, "balance_loss_clip": 0.06283632, "balance_loss_mlp": 0.01257711, "epoch": 0.7225612505636555, "flos": 23589826842240.0, "grad_norm": 1.8293916902775904, "language_loss": 0.74306822, "learning_rate": 7.54412860030732e-07, "loss": 0.81996918, "num_input_tokens_seen": 259291000, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09625244, "step": 12018, "time_per_iteration": 2.5845608711242676 }, { "auxiliary_loss_clip": 0.06416456, "auxiliary_loss_mlp": 0.01270014, "balance_loss_clip": 0.06281091, "balance_loss_mlp": 0.01260537, "epoch": 0.7226213738163234, "flos": 20783983420800.0, "grad_norm": 1.5395116735821859, "language_loss": 0.7755549, "learning_rate": 7.541081742032347e-07, "loss": 0.85241961, "num_input_tokens_seen": 259312390, "router_z_loss_clip": 1.35449219, "router_z_loss_mlp": 0.09472656, "step": 12019, "time_per_iteration": 3.9955620765686035 }, { "auxiliary_loss_clip": 0.06420772, "auxiliary_loss_mlp": 0.01263285, "balance_loss_clip": 0.06280375, "balance_loss_mlp": 0.01253587, "epoch": 0.7226814970689914, "flos": 32644227663360.0, "grad_norm": 1.787894460942287, "language_loss": 0.743195, "learning_rate": 7.53803535620081e-07, "loss": 0.82003558, "num_input_tokens_seen": 259332645, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09698486, "step": 12020, "time_per_iteration": 2.631412982940674 }, { "auxiliary_loss_clip": 0.06429756, "auxiliary_loss_mlp": 0.01264102, "balance_loss_clip": 0.06285162, "balance_loss_mlp": 0.01253898, "epoch": 0.7227416203216595, "flos": 22460736274560.0, "grad_norm": 1.7492629727788553, "language_loss": 0.77222121, "learning_rate": 7.534989442928219e-07, "loss": 0.84915972, "num_input_tokens_seen": 259353810, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.10205078, "step": 12021, "time_per_iteration": 2.5748353004455566 }, { "auxiliary_loss_clip": 0.06425655, "auxiliary_loss_mlp": 0.01264564, "balance_loss_clip": 0.06285136, "balance_loss_mlp": 0.01254294, "epoch": 0.7228017435743274, "flos": 21658641465600.0, "grad_norm": 1.5466893871602532, "language_loss": 0.68477333, "learning_rate": 7.531944002330073e-07, "loss": 0.76167554, "num_input_tokens_seen": 259372460, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.1027832, "step": 12022, "time_per_iteration": 2.558023452758789 }, { "auxiliary_loss_clip": 0.06422959, "auxiliary_loss_mlp": 0.01266057, "balance_loss_clip": 0.06279865, "balance_loss_mlp": 0.01255805, "epoch": 0.7228618668269954, "flos": 29541171409920.0, "grad_norm": 1.784219995110002, "language_loss": 0.69746792, "learning_rate": 7.528899034521858e-07, "loss": 0.77435803, "num_input_tokens_seen": 259393275, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10241699, "step": 12023, "time_per_iteration": 2.6450018882751465 }, { "auxiliary_loss_clip": 0.06420604, "auxiliary_loss_mlp": 0.01262227, "balance_loss_clip": 0.06279787, "balance_loss_mlp": 0.01252565, "epoch": 0.7229219900796633, "flos": 27461169982080.0, "grad_norm": 1.6590596716621697, "language_loss": 0.71391618, "learning_rate": 7.525854539619052e-07, "loss": 0.79074448, "num_input_tokens_seen": 259416205, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09661865, "step": 12024, "time_per_iteration": 2.5957536697387695 }, { "auxiliary_loss_clip": 0.06426992, "auxiliary_loss_mlp": 0.01267911, "balance_loss_clip": 0.06285401, "balance_loss_mlp": 0.01257498, "epoch": 0.7229821133323313, "flos": 16294888454400.0, "grad_norm": 1.8432973087420523, "language_loss": 0.7551226, "learning_rate": 7.522810517737089e-07, "loss": 0.83207166, "num_input_tokens_seen": 259433115, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.10412598, "step": 12025, "time_per_iteration": 3.957407236099243 }, { "auxiliary_loss_clip": 0.06420341, "auxiliary_loss_mlp": 0.01266877, "balance_loss_clip": 0.0628169, "balance_loss_mlp": 0.0125712, "epoch": 0.7230422365849992, "flos": 20418567765120.0, "grad_norm": 1.8632120515010722, "language_loss": 0.76475489, "learning_rate": 7.519766968991395e-07, "loss": 0.841627, "num_input_tokens_seen": 259450475, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09753418, "step": 12026, "time_per_iteration": 2.5287938117980957 }, { "auxiliary_loss_clip": 0.06419287, "auxiliary_loss_mlp": 0.01263862, "balance_loss_clip": 0.06277352, "balance_loss_mlp": 0.01253783, "epoch": 0.7231023598376672, "flos": 25600619197440.0, "grad_norm": 4.5398222333262, "language_loss": 0.67756104, "learning_rate": 7.516723893497388e-07, "loss": 0.7543925, "num_input_tokens_seen": 259469355, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10083008, "step": 12027, "time_per_iteration": 2.58834171295166 }, { "auxiliary_loss_clip": 0.06425408, "auxiliary_loss_mlp": 0.01269223, "balance_loss_clip": 0.06281272, "balance_loss_mlp": 0.0125865, "epoch": 0.7231624830903352, "flos": 25155638490240.0, "grad_norm": 2.5241847282117287, "language_loss": 0.79560429, "learning_rate": 7.513681291370469e-07, "loss": 0.87255061, "num_input_tokens_seen": 259486565, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10552979, "step": 12028, "time_per_iteration": 2.705152988433838 }, { "auxiliary_loss_clip": 0.06423102, "auxiliary_loss_mlp": 0.01265152, "balance_loss_clip": 0.06280761, "balance_loss_mlp": 0.01255377, "epoch": 0.7232226063430032, "flos": 21732169023360.0, "grad_norm": 1.6907647772226486, "language_loss": 0.82561648, "learning_rate": 7.510639162726e-07, "loss": 0.90249908, "num_input_tokens_seen": 259505070, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09771729, "step": 12029, "time_per_iteration": 2.5615181922912598 }, { "auxiliary_loss_clip": 0.0633293, "auxiliary_loss_mlp": 0.01251699, "balance_loss_clip": 0.06275182, "balance_loss_mlp": 0.0125026, "epoch": 0.7232827295956711, "flos": 68458693426560.0, "grad_norm": 0.781355927241196, "language_loss": 0.61628503, "learning_rate": 7.507597507679347e-07, "loss": 0.69213128, "num_input_tokens_seen": 259569135, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 0.01436615, "step": 12030, "time_per_iteration": 3.2431418895721436 }, { "auxiliary_loss_clip": 0.06420524, "auxiliary_loss_mlp": 0.01263497, "balance_loss_clip": 0.06281225, "balance_loss_mlp": 0.01253758, "epoch": 0.7233428528483391, "flos": 20198697851520.0, "grad_norm": 1.6072209032570768, "language_loss": 0.78278601, "learning_rate": 7.504556326345859e-07, "loss": 0.85962617, "num_input_tokens_seen": 259587035, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09741211, "step": 12031, "time_per_iteration": 2.537497043609619 }, { "auxiliary_loss_clip": 0.06422544, "auxiliary_loss_mlp": 0.01266038, "balance_loss_clip": 0.06277578, "balance_loss_mlp": 0.01255345, "epoch": 0.723402976101007, "flos": 23955955257600.0, "grad_norm": 1.859000731461175, "language_loss": 0.81673568, "learning_rate": 7.501515618840834e-07, "loss": 0.8936215, "num_input_tokens_seen": 259606140, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.10693359, "step": 12032, "time_per_iteration": 2.584240436553955 }, { "auxiliary_loss_clip": 0.06427906, "auxiliary_loss_mlp": 0.01267377, "balance_loss_clip": 0.06278953, "balance_loss_mlp": 0.01256946, "epoch": 0.723463099353675, "flos": 20819636133120.0, "grad_norm": 1.7313800878732384, "language_loss": 0.75409555, "learning_rate": 7.498475385279592e-07, "loss": 0.83104837, "num_input_tokens_seen": 259624275, "router_z_loss_clip": 1.48925781, "router_z_loss_mlp": 0.10443115, "step": 12033, "time_per_iteration": 2.5319724082946777 }, { "auxiliary_loss_clip": 0.06416003, "auxiliary_loss_mlp": 0.01263344, "balance_loss_clip": 0.06277102, "balance_loss_mlp": 0.01254308, "epoch": 0.723523222606343, "flos": 19103876403840.0, "grad_norm": 1.588615530085053, "language_loss": 0.75128084, "learning_rate": 7.495435625777423e-07, "loss": 0.82807434, "num_input_tokens_seen": 259643465, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09039307, "step": 12034, "time_per_iteration": 2.5393803119659424 }, { "auxiliary_loss_clip": 0.06420216, "auxiliary_loss_mlp": 0.01263216, "balance_loss_clip": 0.06278884, "balance_loss_mlp": 0.012535, "epoch": 0.723583345859011, "flos": 26514493752960.0, "grad_norm": 1.6523553147487522, "language_loss": 0.80906844, "learning_rate": 7.492396340449578e-07, "loss": 0.88590276, "num_input_tokens_seen": 259662500, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09716797, "step": 12035, "time_per_iteration": 2.5892536640167236 }, { "auxiliary_loss_clip": 0.06426039, "auxiliary_loss_mlp": 0.01264055, "balance_loss_clip": 0.06281364, "balance_loss_mlp": 0.01254077, "epoch": 0.723643469111679, "flos": 16039323901440.0, "grad_norm": 2.241622701021826, "language_loss": 0.61468637, "learning_rate": 7.489357529411326e-07, "loss": 0.69158733, "num_input_tokens_seen": 259680140, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.09973145, "step": 12036, "time_per_iteration": 3.9467341899871826 }, { "auxiliary_loss_clip": 0.06414516, "auxiliary_loss_mlp": 0.0126586, "balance_loss_clip": 0.06276965, "balance_loss_mlp": 0.01256961, "epoch": 0.7237035923643469, "flos": 21952164718080.0, "grad_norm": 1.810804011217532, "language_loss": 0.67961395, "learning_rate": 7.486319192777883e-07, "loss": 0.75641775, "num_input_tokens_seen": 259700160, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.08898926, "step": 12037, "time_per_iteration": 2.554602861404419 }, { "auxiliary_loss_clip": 0.0642086, "auxiliary_loss_mlp": 0.01264562, "balance_loss_clip": 0.06280369, "balance_loss_mlp": 0.01253857, "epoch": 0.7237637156170149, "flos": 23589281790720.0, "grad_norm": 2.4141837530686336, "language_loss": 0.72328633, "learning_rate": 7.483281330664479e-07, "loss": 0.80014056, "num_input_tokens_seen": 259720525, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.10705566, "step": 12038, "time_per_iteration": 2.5631988048553467 }, { "auxiliary_loss_clip": 0.06423016, "auxiliary_loss_mlp": 0.01269221, "balance_loss_clip": 0.06279996, "balance_loss_mlp": 0.01257669, "epoch": 0.7238238388696828, "flos": 20600940176640.0, "grad_norm": 1.6047995928041565, "language_loss": 0.72555804, "learning_rate": 7.480243943186293e-07, "loss": 0.8024804, "num_input_tokens_seen": 259738680, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.11541748, "step": 12039, "time_per_iteration": 2.555644989013672 }, { "auxiliary_loss_clip": 0.06417446, "auxiliary_loss_mlp": 0.0126533, "balance_loss_clip": 0.06276394, "balance_loss_mlp": 0.01256354, "epoch": 0.7238839621223508, "flos": 24213909651840.0, "grad_norm": 1.841885284026903, "language_loss": 0.76282328, "learning_rate": 7.477207030458513e-07, "loss": 0.83965105, "num_input_tokens_seen": 259758790, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.08978271, "step": 12040, "time_per_iteration": 2.555396556854248 }, { "auxiliary_loss_clip": 0.0642462, "auxiliary_loss_mlp": 0.01266193, "balance_loss_clip": 0.06281135, "balance_loss_mlp": 0.01255667, "epoch": 0.7239440853750188, "flos": 14214928953600.0, "grad_norm": 1.5260252171082, "language_loss": 0.76812434, "learning_rate": 7.474170592596301e-07, "loss": 0.84503245, "num_input_tokens_seen": 259777370, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.10522461, "step": 12041, "time_per_iteration": 3.937822103500366 }, { "auxiliary_loss_clip": 0.06421365, "auxiliary_loss_mlp": 0.01262804, "balance_loss_clip": 0.0627777, "balance_loss_mlp": 0.01252958, "epoch": 0.7240042086276868, "flos": 21620976255360.0, "grad_norm": 2.4010781715113496, "language_loss": 0.63909483, "learning_rate": 7.471134629714797e-07, "loss": 0.71593654, "num_input_tokens_seen": 259794665, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.09838867, "step": 12042, "time_per_iteration": 2.522883176803589 }, { "auxiliary_loss_clip": 0.06426187, "auxiliary_loss_mlp": 0.01268365, "balance_loss_clip": 0.06281343, "balance_loss_mlp": 0.01256372, "epoch": 0.7240643318803547, "flos": 23338203431040.0, "grad_norm": 1.7408802179631846, "language_loss": 0.83460271, "learning_rate": 7.468099141929116e-07, "loss": 0.91154826, "num_input_tokens_seen": 259811110, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.11987305, "step": 12043, "time_per_iteration": 2.578481674194336 }, { "auxiliary_loss_clip": 0.06420276, "auxiliary_loss_mlp": 0.01265679, "balance_loss_clip": 0.0627639, "balance_loss_mlp": 0.01254002, "epoch": 0.7241244551330227, "flos": 24031746875520.0, "grad_norm": 1.5612459910514276, "language_loss": 0.64523047, "learning_rate": 7.465064129354379e-07, "loss": 0.72209001, "num_input_tokens_seen": 259831080, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.11669922, "step": 12044, "time_per_iteration": 2.5597567558288574 }, { "auxiliary_loss_clip": 0.0642374, "auxiliary_loss_mlp": 0.01266917, "balance_loss_clip": 0.06280015, "balance_loss_mlp": 0.01256111, "epoch": 0.7241845783856906, "flos": 18735651636480.0, "grad_norm": 1.5684974864248793, "language_loss": 0.82136625, "learning_rate": 7.462029592105658e-07, "loss": 0.89827287, "num_input_tokens_seen": 259850135, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.1081543, "step": 12045, "time_per_iteration": 2.540070056915283 }, { "auxiliary_loss_clip": 0.06418452, "auxiliary_loss_mlp": 0.0126818, "balance_loss_clip": 0.06280316, "balance_loss_mlp": 0.01257737, "epoch": 0.7242447016383586, "flos": 19504483574400.0, "grad_norm": 1.5167280719433252, "language_loss": 0.71862066, "learning_rate": 7.458995530298034e-07, "loss": 0.79548699, "num_input_tokens_seen": 259868185, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.10443115, "step": 12046, "time_per_iteration": 2.518847942352295 }, { "auxiliary_loss_clip": 0.06421321, "auxiliary_loss_mlp": 0.01265473, "balance_loss_clip": 0.06276907, "balance_loss_mlp": 0.01255174, "epoch": 0.7243048248910267, "flos": 22169980206720.0, "grad_norm": 7.407039172373825, "language_loss": 0.71561396, "learning_rate": 7.455961944046553e-07, "loss": 0.79248196, "num_input_tokens_seen": 259887055, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10296631, "step": 12047, "time_per_iteration": 2.5395548343658447 }, { "auxiliary_loss_clip": 0.06422774, "auxiliary_loss_mlp": 0.01265968, "balance_loss_clip": 0.06277138, "balance_loss_mlp": 0.01255406, "epoch": 0.7243649481436946, "flos": 27680159427840.0, "grad_norm": 2.3941184391132877, "language_loss": 0.70046198, "learning_rate": 7.45292883346627e-07, "loss": 0.77734947, "num_input_tokens_seen": 259908295, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10559082, "step": 12048, "time_per_iteration": 2.5865089893341064 }, { "auxiliary_loss_clip": 0.06315212, "auxiliary_loss_mlp": 0.01254071, "balance_loss_clip": 0.06257977, "balance_loss_mlp": 0.01252736, "epoch": 0.7244250713963626, "flos": 63263686538880.0, "grad_norm": 0.8283768578446677, "language_loss": 0.53721976, "learning_rate": 7.449896198672168e-07, "loss": 0.6129126, "num_input_tokens_seen": 259968475, "router_z_loss_clip": 0.57128906, "router_z_loss_mlp": 0.0133667, "step": 12049, "time_per_iteration": 3.223402976989746 }, { "auxiliary_loss_clip": 0.06429944, "auxiliary_loss_mlp": 0.01266373, "balance_loss_clip": 0.06279013, "balance_loss_mlp": 0.01254827, "epoch": 0.7244851946490305, "flos": 17972815265280.0, "grad_norm": 2.152980342104778, "language_loss": 0.59816134, "learning_rate": 7.446864039779258e-07, "loss": 0.67512453, "num_input_tokens_seen": 259984865, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.11553955, "step": 12050, "time_per_iteration": 2.6162514686584473 }, { "auxiliary_loss_clip": 0.0632075, "auxiliary_loss_mlp": 0.01254381, "balance_loss_clip": 0.0626335, "balance_loss_mlp": 0.01253106, "epoch": 0.7245453179016985, "flos": 70964179488000.0, "grad_norm": 0.7028651447916233, "language_loss": 0.53127873, "learning_rate": 7.443832356902528e-07, "loss": 0.60702997, "num_input_tokens_seen": 260046735, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.01275635, "step": 12051, "time_per_iteration": 3.1716768741607666 }, { "auxiliary_loss_clip": 0.06416806, "auxiliary_loss_mlp": 0.01262869, "balance_loss_clip": 0.06276891, "balance_loss_mlp": 0.01253713, "epoch": 0.7246054411543664, "flos": 24574839114240.0, "grad_norm": 1.4939593596733662, "language_loss": 0.72015882, "learning_rate": 7.440801150156927e-07, "loss": 0.79695559, "num_input_tokens_seen": 260067950, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.0914917, "step": 12052, "time_per_iteration": 2.649723768234253 }, { "auxiliary_loss_clip": 0.06419704, "auxiliary_loss_mlp": 0.01266548, "balance_loss_clip": 0.06277585, "balance_loss_mlp": 0.01256022, "epoch": 0.7246655644070344, "flos": 32345715093120.0, "grad_norm": 1.6384279390438854, "language_loss": 0.74252063, "learning_rate": 7.437770419657415e-07, "loss": 0.81938314, "num_input_tokens_seen": 260087730, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10516357, "step": 12053, "time_per_iteration": 2.659726858139038 }, { "auxiliary_loss_clip": 0.06419373, "auxiliary_loss_mlp": 0.01266988, "balance_loss_clip": 0.06278191, "balance_loss_mlp": 0.01256349, "epoch": 0.7247256876597024, "flos": 21879056430720.0, "grad_norm": 2.204432303528671, "language_loss": 0.78141683, "learning_rate": 7.434740165518898e-07, "loss": 0.85828042, "num_input_tokens_seen": 260107760, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.10644531, "step": 12054, "time_per_iteration": 2.583691358566284 }, { "auxiliary_loss_clip": 0.06420927, "auxiliary_loss_mlp": 0.0126469, "balance_loss_clip": 0.06279327, "balance_loss_mlp": 0.01254379, "epoch": 0.7247858109123704, "flos": 16218048660480.0, "grad_norm": 2.307987118721713, "language_loss": 0.68658483, "learning_rate": 7.431710387856301e-07, "loss": 0.76344097, "num_input_tokens_seen": 260123660, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.10308838, "step": 12055, "time_per_iteration": 2.5050904750823975 }, { "auxiliary_loss_clip": 0.06418802, "auxiliary_loss_mlp": 0.01265742, "balance_loss_clip": 0.06277379, "balance_loss_mlp": 0.0125658, "epoch": 0.7248459341650383, "flos": 20857091708160.0, "grad_norm": 2.1016609501367016, "language_loss": 0.74100494, "learning_rate": 7.428681086784496e-07, "loss": 0.81785041, "num_input_tokens_seen": 260142690, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09161377, "step": 12056, "time_per_iteration": 2.572913408279419 }, { "auxiliary_loss_clip": 0.06414533, "auxiliary_loss_mlp": 0.0126236, "balance_loss_clip": 0.06275797, "balance_loss_mlp": 0.01252471, "epoch": 0.7249060574177063, "flos": 25928956621440.0, "grad_norm": 1.4050981863743262, "language_loss": 0.70892167, "learning_rate": 7.425652262418368e-07, "loss": 0.78569067, "num_input_tokens_seen": 260162590, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09887695, "step": 12057, "time_per_iteration": 2.5728859901428223 }, { "auxiliary_loss_clip": 0.06424871, "auxiliary_loss_mlp": 0.01266837, "balance_loss_clip": 0.06278062, "balance_loss_mlp": 0.01255846, "epoch": 0.7249661806703742, "flos": 17350912661760.0, "grad_norm": 1.5967704459448941, "language_loss": 0.6286881, "learning_rate": 7.42262391487277e-07, "loss": 0.70560515, "num_input_tokens_seen": 260181065, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.10986328, "step": 12058, "time_per_iteration": 2.5352869033813477 }, { "auxiliary_loss_clip": 0.06419145, "auxiliary_loss_mlp": 0.01267352, "balance_loss_clip": 0.06278466, "balance_loss_mlp": 0.01257035, "epoch": 0.7250263039230422, "flos": 19580400973440.0, "grad_norm": 1.9020483322893411, "language_loss": 0.7511254, "learning_rate": 7.419596044262535e-07, "loss": 0.82799035, "num_input_tokens_seen": 260200330, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.10321045, "step": 12059, "time_per_iteration": 3.9914214611053467 }, { "auxiliary_loss_clip": 0.06413259, "auxiliary_loss_mlp": 0.01264549, "balance_loss_clip": 0.06275876, "balance_loss_mlp": 0.01255334, "epoch": 0.7250864271757103, "flos": 21982366915200.0, "grad_norm": 1.6230458515074815, "language_loss": 0.79637516, "learning_rate": 7.416568650702472e-07, "loss": 0.87315327, "num_input_tokens_seen": 260219975, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09216309, "step": 12060, "time_per_iteration": 2.6273560523986816 }, { "auxiliary_loss_clip": 0.06422964, "auxiliary_loss_mlp": 0.01265898, "balance_loss_clip": 0.0627944, "balance_loss_mlp": 0.01255616, "epoch": 0.7251465504283782, "flos": 25020113310720.0, "grad_norm": 1.7013119617630283, "language_loss": 0.76063406, "learning_rate": 7.413541734307393e-07, "loss": 0.83752269, "num_input_tokens_seen": 260242025, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10284424, "step": 12061, "time_per_iteration": 2.5972788333892822 }, { "auxiliary_loss_clip": 0.06413089, "auxiliary_loss_mlp": 0.01264265, "balance_loss_clip": 0.0627605, "balance_loss_mlp": 0.01254633, "epoch": 0.7252066736810462, "flos": 16695621406080.0, "grad_norm": 1.8585882335529156, "language_loss": 0.81591713, "learning_rate": 7.410515295192068e-07, "loss": 0.89269066, "num_input_tokens_seen": 260260015, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09631348, "step": 12062, "time_per_iteration": 2.51674747467041 }, { "auxiliary_loss_clip": 0.06428348, "auxiliary_loss_mlp": 0.01265635, "balance_loss_clip": 0.06281427, "balance_loss_mlp": 0.01254781, "epoch": 0.7252667969337141, "flos": 25710176810880.0, "grad_norm": 2.050743896779575, "language_loss": 0.69474638, "learning_rate": 7.407489333471262e-07, "loss": 0.7716862, "num_input_tokens_seen": 260278635, "router_z_loss_clip": 1.46972656, "router_z_loss_mlp": 0.10852051, "step": 12063, "time_per_iteration": 2.5757665634155273 }, { "auxiliary_loss_clip": 0.06415311, "auxiliary_loss_mlp": 0.01266335, "balance_loss_clip": 0.06277808, "balance_loss_mlp": 0.01256358, "epoch": 0.7253269201863821, "flos": 18265835393280.0, "grad_norm": 1.788335123801939, "language_loss": 0.70443827, "learning_rate": 7.40446384925973e-07, "loss": 0.78125471, "num_input_tokens_seen": 260298510, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09979248, "step": 12064, "time_per_iteration": 2.532087802886963 }, { "auxiliary_loss_clip": 0.06418076, "auxiliary_loss_mlp": 0.012684, "balance_loss_clip": 0.0627768, "balance_loss_mlp": 0.01257922, "epoch": 0.72538704343905, "flos": 20417938859520.0, "grad_norm": 1.6509232405312106, "language_loss": 0.90661907, "learning_rate": 7.401438842672192e-07, "loss": 0.98348379, "num_input_tokens_seen": 260317405, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.10479736, "step": 12065, "time_per_iteration": 4.020964622497559 }, { "auxiliary_loss_clip": 0.06310654, "auxiliary_loss_mlp": 0.0125102, "balance_loss_clip": 0.0625371, "balance_loss_mlp": 0.01249794, "epoch": 0.725447166691718, "flos": 70173321125760.0, "grad_norm": 0.6403215813238357, "language_loss": 0.56031895, "learning_rate": 7.398414313823349e-07, "loss": 0.63593566, "num_input_tokens_seen": 260388085, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01224518, "step": 12066, "time_per_iteration": 3.320293426513672 }, { "auxiliary_loss_clip": 0.06416653, "auxiliary_loss_mlp": 0.0126699, "balance_loss_clip": 0.06275915, "balance_loss_mlp": 0.01257591, "epoch": 0.725507289944386, "flos": 27059598489600.0, "grad_norm": 1.7572964341785078, "language_loss": 0.76829147, "learning_rate": 7.395390262827897e-07, "loss": 0.84512794, "num_input_tokens_seen": 260406165, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09405518, "step": 12067, "time_per_iteration": 2.573662519454956 }, { "auxiliary_loss_clip": 0.06312457, "auxiliary_loss_mlp": 0.01251443, "balance_loss_clip": 0.06255342, "balance_loss_mlp": 0.01250118, "epoch": 0.725567413197054, "flos": 62941973587200.0, "grad_norm": 0.7218501986878606, "language_loss": 0.56987548, "learning_rate": 7.392366689800515e-07, "loss": 0.64551443, "num_input_tokens_seen": 260461365, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01325989, "step": 12068, "time_per_iteration": 3.1012837886810303 }, { "auxiliary_loss_clip": 0.06317586, "auxiliary_loss_mlp": 0.01250836, "balance_loss_clip": 0.06260321, "balance_loss_mlp": 0.01249583, "epoch": 0.7256275364497219, "flos": 60315735392640.0, "grad_norm": 0.6481441075302604, "language_loss": 0.55370808, "learning_rate": 7.389343594855848e-07, "loss": 0.62939227, "num_input_tokens_seen": 260523795, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.01252747, "step": 12069, "time_per_iteration": 3.1888654232025146 }, { "auxiliary_loss_clip": 0.06416482, "auxiliary_loss_mlp": 0.01263687, "balance_loss_clip": 0.06279051, "balance_loss_mlp": 0.01254836, "epoch": 0.7256876597023899, "flos": 24505378479360.0, "grad_norm": 1.7080555161934772, "language_loss": 0.7955122, "learning_rate": 7.38632097810854e-07, "loss": 0.87231386, "num_input_tokens_seen": 260544765, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.08843994, "step": 12070, "time_per_iteration": 2.612107753753662 }, { "auxiliary_loss_clip": 0.06414576, "auxiliary_loss_mlp": 0.01264601, "balance_loss_clip": 0.06278966, "balance_loss_mlp": 0.01255273, "epoch": 0.7257477829550578, "flos": 24359623102080.0, "grad_norm": 1.7142086078130099, "language_loss": 0.72499603, "learning_rate": 7.383298839673197e-07, "loss": 0.80178785, "num_input_tokens_seen": 260564340, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.09320068, "step": 12071, "time_per_iteration": 2.58373761177063 }, { "auxiliary_loss_clip": 0.06415915, "auxiliary_loss_mlp": 0.0126834, "balance_loss_clip": 0.06276581, "balance_loss_mlp": 0.01258565, "epoch": 0.7258079062077258, "flos": 17208008323200.0, "grad_norm": 1.8283171917490595, "language_loss": 0.70352036, "learning_rate": 7.380277179664436e-07, "loss": 0.7803629, "num_input_tokens_seen": 260582565, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09777832, "step": 12072, "time_per_iteration": 2.549453020095825 }, { "auxiliary_loss_clip": 0.06420211, "auxiliary_loss_mlp": 0.01264693, "balance_loss_clip": 0.06274688, "balance_loss_mlp": 0.01253928, "epoch": 0.7258680294603939, "flos": 21586832916480.0, "grad_norm": 2.090724964115837, "language_loss": 0.78685278, "learning_rate": 7.377255998196821e-07, "loss": 0.86370182, "num_input_tokens_seen": 260601700, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10772705, "step": 12073, "time_per_iteration": 2.573674440383911 }, { "auxiliary_loss_clip": 0.06418694, "auxiliary_loss_mlp": 0.01269398, "balance_loss_clip": 0.06279856, "balance_loss_mlp": 0.01259062, "epoch": 0.7259281527130618, "flos": 34863150360960.0, "grad_norm": 1.4278408351178742, "language_loss": 0.70289493, "learning_rate": 7.374235295384923e-07, "loss": 0.7797758, "num_input_tokens_seen": 260623040, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.10339355, "step": 12074, "time_per_iteration": 2.7088406085968018 }, { "auxiliary_loss_clip": 0.06420553, "auxiliary_loss_mlp": 0.01263996, "balance_loss_clip": 0.0627654, "balance_loss_mlp": 0.01254591, "epoch": 0.7259882759657298, "flos": 25410657991680.0, "grad_norm": 2.0901655250090747, "language_loss": 0.74585545, "learning_rate": 7.371215071343302e-07, "loss": 0.82270098, "num_input_tokens_seen": 260642735, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.09405518, "step": 12075, "time_per_iteration": 2.731621503829956 }, { "auxiliary_loss_clip": 0.06422519, "auxiliary_loss_mlp": 0.01266638, "balance_loss_clip": 0.06280372, "balance_loss_mlp": 0.01255659, "epoch": 0.7260483992183977, "flos": 62966781924480.0, "grad_norm": 1.4044325614059046, "language_loss": 0.63895881, "learning_rate": 7.368195326186458e-07, "loss": 0.71585035, "num_input_tokens_seen": 260669935, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.10968018, "step": 12076, "time_per_iteration": 4.53315281867981 }, { "auxiliary_loss_clip": 0.06418569, "auxiliary_loss_mlp": 0.01265064, "balance_loss_clip": 0.06276487, "balance_loss_mlp": 0.01254627, "epoch": 0.7261085224710657, "flos": 26474522555520.0, "grad_norm": 3.3093833293792634, "language_loss": 0.79203033, "learning_rate": 7.365176060028912e-07, "loss": 0.86886668, "num_input_tokens_seen": 260689605, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.10437012, "step": 12077, "time_per_iteration": 2.5684359073638916 }, { "auxiliary_loss_clip": 0.06317729, "auxiliary_loss_mlp": 0.0125349, "balance_loss_clip": 0.06260862, "balance_loss_mlp": 0.0125219, "epoch": 0.7261686457237336, "flos": 66790634198400.0, "grad_norm": 0.8692491748311013, "language_loss": 0.65010792, "learning_rate": 7.362157272985163e-07, "loss": 0.72582012, "num_input_tokens_seen": 260748265, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01300812, "step": 12078, "time_per_iteration": 3.1583359241485596 }, { "auxiliary_loss_clip": 0.06322388, "auxiliary_loss_mlp": 0.01255442, "balance_loss_clip": 0.06265154, "balance_loss_mlp": 0.01254098, "epoch": 0.7262287689764017, "flos": 70020731640960.0, "grad_norm": 0.709372940720036, "language_loss": 0.59413737, "learning_rate": 7.359138965169671e-07, "loss": 0.66991568, "num_input_tokens_seen": 260816715, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01345825, "step": 12079, "time_per_iteration": 3.287677764892578 }, { "auxiliary_loss_clip": 0.06416763, "auxiliary_loss_mlp": 0.01266875, "balance_loss_clip": 0.06276257, "balance_loss_mlp": 0.01256927, "epoch": 0.7262888922290696, "flos": 23812212378240.0, "grad_norm": 1.9086205583454239, "language_loss": 0.64780545, "learning_rate": 7.356121136696895e-07, "loss": 0.7246418, "num_input_tokens_seen": 260836765, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.0994873, "step": 12080, "time_per_iteration": 4.019871711730957 }, { "auxiliary_loss_clip": 0.06426591, "auxiliary_loss_mlp": 0.01266625, "balance_loss_clip": 0.06284092, "balance_loss_mlp": 0.01256325, "epoch": 0.7263490154817376, "flos": 19506412218240.0, "grad_norm": 2.2298085922841744, "language_loss": 0.70540166, "learning_rate": 7.35310378768128e-07, "loss": 0.78233373, "num_input_tokens_seen": 260854610, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10296631, "step": 12081, "time_per_iteration": 2.5353922843933105 }, { "auxiliary_loss_clip": 0.0642564, "auxiliary_loss_mlp": 0.01264776, "balance_loss_clip": 0.06280012, "balance_loss_mlp": 0.01254739, "epoch": 0.7264091387344055, "flos": 16291240801920.0, "grad_norm": 1.6812451704754376, "language_loss": 0.81124765, "learning_rate": 7.350086918237237e-07, "loss": 0.88815182, "num_input_tokens_seen": 260871620, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.1003418, "step": 12082, "time_per_iteration": 2.53167986869812 }, { "auxiliary_loss_clip": 0.06432298, "auxiliary_loss_mlp": 0.01266339, "balance_loss_clip": 0.06281974, "balance_loss_mlp": 0.01255223, "epoch": 0.7264692619870735, "flos": 24358784561280.0, "grad_norm": 1.6272450115156312, "language_loss": 0.77267158, "learning_rate": 7.347070528479158e-07, "loss": 0.84965789, "num_input_tokens_seen": 260890490, "router_z_loss_clip": 1.50097656, "router_z_loss_mlp": 0.11114502, "step": 12083, "time_per_iteration": 2.6200764179229736 }, { "auxiliary_loss_clip": 0.06427339, "auxiliary_loss_mlp": 0.01264631, "balance_loss_clip": 0.06282485, "balance_loss_mlp": 0.01254522, "epoch": 0.7265293852397414, "flos": 25126568323200.0, "grad_norm": 1.7235072292696856, "language_loss": 0.73311865, "learning_rate": 7.344054618521433e-07, "loss": 0.81003833, "num_input_tokens_seen": 260909700, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.10113525, "step": 12084, "time_per_iteration": 2.5882575511932373 }, { "auxiliary_loss_clip": 0.06423993, "auxiliary_loss_mlp": 0.01268019, "balance_loss_clip": 0.0627972, "balance_loss_mlp": 0.01257707, "epoch": 0.7265895084924094, "flos": 22644869621760.0, "grad_norm": 1.9071611355160145, "language_loss": 0.77809358, "learning_rate": 7.34103918847843e-07, "loss": 0.85501367, "num_input_tokens_seen": 260929090, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10302734, "step": 12085, "time_per_iteration": 2.679142713546753 }, { "auxiliary_loss_clip": 0.06420538, "auxiliary_loss_mlp": 0.0126465, "balance_loss_clip": 0.06277332, "balance_loss_mlp": 0.01255173, "epoch": 0.7266496317450775, "flos": 23375030100480.0, "grad_norm": 3.378243833900218, "language_loss": 0.72409582, "learning_rate": 7.338024238464493e-07, "loss": 0.80094767, "num_input_tokens_seen": 260946615, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.0947876, "step": 12086, "time_per_iteration": 2.576894998550415 }, { "auxiliary_loss_clip": 0.06420502, "auxiliary_loss_mlp": 0.01263925, "balance_loss_clip": 0.06280558, "balance_loss_mlp": 0.01254108, "epoch": 0.7267097549977454, "flos": 28082150190720.0, "grad_norm": 1.6514818938910745, "language_loss": 0.70288932, "learning_rate": 7.335009768593938e-07, "loss": 0.77973366, "num_input_tokens_seen": 260968515, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.0980835, "step": 12087, "time_per_iteration": 2.6723861694335938 }, { "auxiliary_loss_clip": 0.0642488, "auxiliary_loss_mlp": 0.0126578, "balance_loss_clip": 0.06279361, "balance_loss_mlp": 0.01254437, "epoch": 0.7267698782504134, "flos": 22201272506880.0, "grad_norm": 1.7612470663280386, "language_loss": 0.79204488, "learning_rate": 7.331995778981088e-07, "loss": 0.8689515, "num_input_tokens_seen": 260986790, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.11334229, "step": 12088, "time_per_iteration": 2.5591211318969727 }, { "auxiliary_loss_clip": 0.06422956, "auxiliary_loss_mlp": 0.0126761, "balance_loss_clip": 0.0627941, "balance_loss_mlp": 0.01257656, "epoch": 0.7268300015030813, "flos": 18520729113600.0, "grad_norm": 1.6509174544807759, "language_loss": 0.73844731, "learning_rate": 7.328982269740221e-07, "loss": 0.81535304, "num_input_tokens_seen": 261004925, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.0994873, "step": 12089, "time_per_iteration": 2.5489697456359863 }, { "auxiliary_loss_clip": 0.06423363, "auxiliary_loss_mlp": 0.01268543, "balance_loss_clip": 0.06279401, "balance_loss_mlp": 0.01258851, "epoch": 0.7268901247557493, "flos": 23992530364800.0, "grad_norm": 1.645881496773101, "language_loss": 0.71335649, "learning_rate": 7.325969240985616e-07, "loss": 0.79027557, "num_input_tokens_seen": 261023895, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.09692383, "step": 12090, "time_per_iteration": 2.5759332180023193 }, { "auxiliary_loss_clip": 0.0642259, "auxiliary_loss_mlp": 0.01265016, "balance_loss_clip": 0.06278788, "balance_loss_mlp": 0.01254776, "epoch": 0.7269502480084172, "flos": 32096313815040.0, "grad_norm": 1.6705418529145553, "language_loss": 0.7718339, "learning_rate": 7.322956692831528e-07, "loss": 0.84870994, "num_input_tokens_seen": 261045445, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10247803, "step": 12091, "time_per_iteration": 2.6535301208496094 }, { "auxiliary_loss_clip": 0.06420334, "auxiliary_loss_mlp": 0.01263156, "balance_loss_clip": 0.0627819, "balance_loss_mlp": 0.01253303, "epoch": 0.7270103712610853, "flos": 19068852597120.0, "grad_norm": 1.6618649414639455, "language_loss": 0.71474642, "learning_rate": 7.319944625392205e-07, "loss": 0.79158127, "num_input_tokens_seen": 261064275, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09863281, "step": 12092, "time_per_iteration": 2.5338845252990723 }, { "auxiliary_loss_clip": 0.06417955, "auxiliary_loss_mlp": 0.01264694, "balance_loss_clip": 0.06277515, "balance_loss_mlp": 0.01254865, "epoch": 0.7270704945137532, "flos": 34541605117440.0, "grad_norm": 1.9276169662098073, "language_loss": 0.60826743, "learning_rate": 7.31693303878184e-07, "loss": 0.68509388, "num_input_tokens_seen": 261083310, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.0982666, "step": 12093, "time_per_iteration": 2.6641180515289307 }, { "auxiliary_loss_clip": 0.06419493, "auxiliary_loss_mlp": 0.01269423, "balance_loss_clip": 0.0627938, "balance_loss_mlp": 0.0125969, "epoch": 0.7271306177664212, "flos": 21514101972480.0, "grad_norm": 1.4328440708928352, "language_loss": 0.75572842, "learning_rate": 7.313921933114644e-07, "loss": 0.83261758, "num_input_tokens_seen": 261103460, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09735107, "step": 12094, "time_per_iteration": 2.616293430328369 }, { "auxiliary_loss_clip": 0.06409488, "auxiliary_loss_mlp": 0.01269203, "balance_loss_clip": 0.06272351, "balance_loss_mlp": 0.01260656, "epoch": 0.7271907410190891, "flos": 22278866987520.0, "grad_norm": 1.7238438783705505, "language_loss": 0.85097367, "learning_rate": 7.310911308504808e-07, "loss": 0.9277606, "num_input_tokens_seen": 261121375, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.08551025, "step": 12095, "time_per_iteration": 2.6072661876678467 }, { "auxiliary_loss_clip": 0.06416164, "auxiliary_loss_mlp": 0.01267081, "balance_loss_clip": 0.06273074, "balance_loss_mlp": 0.01257133, "epoch": 0.7272508642717571, "flos": 22899721415040.0, "grad_norm": 1.634402384004287, "language_loss": 0.78040588, "learning_rate": 7.307901165066479e-07, "loss": 0.85723835, "num_input_tokens_seen": 261141105, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.09954834, "step": 12096, "time_per_iteration": 2.732461929321289 }, { "auxiliary_loss_clip": 0.06418891, "auxiliary_loss_mlp": 0.01266208, "balance_loss_clip": 0.0627657, "balance_loss_mlp": 0.0125629, "epoch": 0.727310987524425, "flos": 11660667016320.0, "grad_norm": 1.9205539668911555, "language_loss": 0.7227174, "learning_rate": 7.30489150291381e-07, "loss": 0.79956841, "num_input_tokens_seen": 261159255, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.09918213, "step": 12097, "time_per_iteration": 2.6554009914398193 }, { "auxiliary_loss_clip": 0.06423144, "auxiliary_loss_mlp": 0.01266635, "balance_loss_clip": 0.06280427, "balance_loss_mlp": 0.0125599, "epoch": 0.727371110777093, "flos": 24542247075840.0, "grad_norm": 2.904448673611196, "language_loss": 0.76812559, "learning_rate": 7.301882322160935e-07, "loss": 0.84502339, "num_input_tokens_seen": 261177960, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10638428, "step": 12098, "time_per_iteration": 4.054180860519409 }, { "auxiliary_loss_clip": 0.06422148, "auxiliary_loss_mlp": 0.0126691, "balance_loss_clip": 0.06277414, "balance_loss_mlp": 0.01256688, "epoch": 0.7274312340297611, "flos": 74755175690880.0, "grad_norm": 1.7148010290917373, "language_loss": 0.67592323, "learning_rate": 7.298873622921952e-07, "loss": 0.75281388, "num_input_tokens_seen": 261205660, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.10223389, "step": 12099, "time_per_iteration": 2.979790210723877 }, { "auxiliary_loss_clip": 0.06427403, "auxiliary_loss_mlp": 0.01269597, "balance_loss_clip": 0.06279733, "balance_loss_mlp": 0.01258368, "epoch": 0.727491357282429, "flos": 22348872673920.0, "grad_norm": 1.8220932261447993, "language_loss": 0.72420496, "learning_rate": 7.29586540531095e-07, "loss": 0.801175, "num_input_tokens_seen": 261225185, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.11212158, "step": 12100, "time_per_iteration": 2.563567876815796 }, { "auxiliary_loss_clip": 0.064173, "auxiliary_loss_mlp": 0.01267035, "balance_loss_clip": 0.06275507, "balance_loss_mlp": 0.01257701, "epoch": 0.727551480535097, "flos": 23304730924800.0, "grad_norm": 1.2906241957193283, "language_loss": 0.74908221, "learning_rate": 7.292857669442005e-07, "loss": 0.82592553, "num_input_tokens_seen": 261247965, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09338379, "step": 12101, "time_per_iteration": 2.6287059783935547 }, { "auxiliary_loss_clip": 0.06417194, "auxiliary_loss_mlp": 0.01264759, "balance_loss_clip": 0.06279488, "balance_loss_mlp": 0.01255825, "epoch": 0.7276116037877649, "flos": 21476981813760.0, "grad_norm": 1.75095158229029, "language_loss": 0.82402587, "learning_rate": 7.289850415429177e-07, "loss": 0.90084529, "num_input_tokens_seen": 261267585, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.08929443, "step": 12102, "time_per_iteration": 2.556845188140869 }, { "auxiliary_loss_clip": 0.06418129, "auxiliary_loss_mlp": 0.01269542, "balance_loss_clip": 0.06278198, "balance_loss_mlp": 0.01260852, "epoch": 0.7276717270404329, "flos": 21469393019520.0, "grad_norm": 1.9372555444110993, "language_loss": 0.81717151, "learning_rate": 7.286843643386495e-07, "loss": 0.89404821, "num_input_tokens_seen": 261285200, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.08685303, "step": 12103, "time_per_iteration": 2.604043960571289 }, { "auxiliary_loss_clip": 0.06425067, "auxiliary_loss_mlp": 0.0126602, "balance_loss_clip": 0.06282283, "balance_loss_mlp": 0.0125563, "epoch": 0.7277318502931008, "flos": 16842928083840.0, "grad_norm": 1.7225487727732547, "language_loss": 0.66936988, "learning_rate": 7.283837353427968e-07, "loss": 0.74628073, "num_input_tokens_seen": 261303645, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10388184, "step": 12104, "time_per_iteration": 2.543692111968994 }, { "auxiliary_loss_clip": 0.06417582, "auxiliary_loss_mlp": 0.01268971, "balance_loss_clip": 0.0628049, "balance_loss_mlp": 0.01258886, "epoch": 0.7277919735457689, "flos": 33408824970240.0, "grad_norm": 2.7940881758071265, "language_loss": 0.65905315, "learning_rate": 7.280831545667611e-07, "loss": 0.73591864, "num_input_tokens_seen": 261323265, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.10089111, "step": 12105, "time_per_iteration": 4.09175443649292 }, { "auxiliary_loss_clip": 0.06423339, "auxiliary_loss_mlp": 0.01265996, "balance_loss_clip": 0.06283531, "balance_loss_mlp": 0.01256638, "epoch": 0.7278520967984368, "flos": 19212218133120.0, "grad_norm": 2.300209032726254, "language_loss": 0.75356644, "learning_rate": 7.27782622021939e-07, "loss": 0.83045977, "num_input_tokens_seen": 261339745, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09362793, "step": 12106, "time_per_iteration": 2.5365114212036133 }, { "auxiliary_loss_clip": 0.06424192, "auxiliary_loss_mlp": 0.0126656, "balance_loss_clip": 0.06278117, "balance_loss_mlp": 0.01255467, "epoch": 0.7279122200511048, "flos": 34103206955520.0, "grad_norm": 2.3546508609765526, "language_loss": 0.70508504, "learning_rate": 7.274821377197273e-07, "loss": 0.78199255, "num_input_tokens_seen": 261359310, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.11096191, "step": 12107, "time_per_iteration": 2.683701992034912 }, { "auxiliary_loss_clip": 0.06416875, "auxiliary_loss_mlp": 0.01264024, "balance_loss_clip": 0.06276494, "balance_loss_mlp": 0.01254571, "epoch": 0.7279723433037727, "flos": 54610913865600.0, "grad_norm": 1.3834248168143615, "language_loss": 0.75393975, "learning_rate": 7.271817016715205e-07, "loss": 0.83074874, "num_input_tokens_seen": 261384640, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09454346, "step": 12108, "time_per_iteration": 2.8466484546661377 }, { "auxiliary_loss_clip": 0.06424142, "auxiliary_loss_mlp": 0.01271629, "balance_loss_clip": 0.0628097, "balance_loss_mlp": 0.01261991, "epoch": 0.7280324665564407, "flos": 36146297859840.0, "grad_norm": 1.5001572241051382, "language_loss": 0.67298067, "learning_rate": 7.268813138887124e-07, "loss": 0.74993837, "num_input_tokens_seen": 261405290, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.09631348, "step": 12109, "time_per_iteration": 2.7105655670166016 }, { "auxiliary_loss_clip": 0.06421208, "auxiliary_loss_mlp": 0.01268429, "balance_loss_clip": 0.06280714, "balance_loss_mlp": 0.01257313, "epoch": 0.7280925898091086, "flos": 11623169514240.0, "grad_norm": 2.1648750103492342, "language_loss": 0.63902175, "learning_rate": 7.265809743826912e-07, "loss": 0.71591818, "num_input_tokens_seen": 261419710, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.11108398, "step": 12110, "time_per_iteration": 2.5259511470794678 }, { "auxiliary_loss_clip": 0.06421912, "auxiliary_loss_mlp": 0.01268412, "balance_loss_clip": 0.06274803, "balance_loss_mlp": 0.01258201, "epoch": 0.7281527130617766, "flos": 34285663221120.0, "grad_norm": 1.6449801316511607, "language_loss": 0.58525497, "learning_rate": 7.26280683164847e-07, "loss": 0.66215825, "num_input_tokens_seen": 261442385, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.10223389, "step": 12111, "time_per_iteration": 2.732961893081665 }, { "auxiliary_loss_clip": 0.06428744, "auxiliary_loss_mlp": 0.01267949, "balance_loss_clip": 0.06284779, "balance_loss_mlp": 0.01257482, "epoch": 0.7282128363144446, "flos": 13923208563840.0, "grad_norm": 2.92166098541679, "language_loss": 0.73889244, "learning_rate": 7.259804402465677e-07, "loss": 0.81585938, "num_input_tokens_seen": 261459805, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10479736, "step": 12112, "time_per_iteration": 2.52646803855896 }, { "auxiliary_loss_clip": 0.06418923, "auxiliary_loss_mlp": 0.01264556, "balance_loss_clip": 0.06278713, "balance_loss_mlp": 0.01255091, "epoch": 0.7282729595671126, "flos": 20783983420800.0, "grad_norm": 2.30770363334519, "language_loss": 0.66856903, "learning_rate": 7.25680245639237e-07, "loss": 0.74540377, "num_input_tokens_seen": 261477175, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09460449, "step": 12113, "time_per_iteration": 2.569315195083618 }, { "auxiliary_loss_clip": 0.06420563, "auxiliary_loss_mlp": 0.01266365, "balance_loss_clip": 0.06277627, "balance_loss_mlp": 0.01256196, "epoch": 0.7283330828197806, "flos": 16330876583040.0, "grad_norm": 1.6592968387071863, "language_loss": 0.73313081, "learning_rate": 7.253800993542399e-07, "loss": 0.81000018, "num_input_tokens_seen": 261494990, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10168457, "step": 12114, "time_per_iteration": 2.540316104888916 }, { "auxiliary_loss_clip": 0.06417923, "auxiliary_loss_mlp": 0.0126807, "balance_loss_clip": 0.06277119, "balance_loss_mlp": 0.01257878, "epoch": 0.7283932060724485, "flos": 27497535454080.0, "grad_norm": 3.1205208970448384, "language_loss": 0.68995655, "learning_rate": 7.250800014029564e-07, "loss": 0.7668165, "num_input_tokens_seen": 261514445, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.10192871, "step": 12115, "time_per_iteration": 4.040701866149902 }, { "auxiliary_loss_clip": 0.06423475, "auxiliary_loss_mlp": 0.01264749, "balance_loss_clip": 0.06278597, "balance_loss_mlp": 0.01254909, "epoch": 0.7284533293251165, "flos": 18373548216960.0, "grad_norm": 1.6457545882435487, "language_loss": 0.60723358, "learning_rate": 7.247799517967674e-07, "loss": 0.68411583, "num_input_tokens_seen": 261533565, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.09832764, "step": 12116, "time_per_iteration": 2.5794801712036133 }, { "auxiliary_loss_clip": 0.06422725, "auxiliary_loss_mlp": 0.01267428, "balance_loss_clip": 0.06282527, "balance_loss_mlp": 0.01257307, "epoch": 0.7285134525777844, "flos": 21731917461120.0, "grad_norm": 1.7614050097803347, "language_loss": 0.72818649, "learning_rate": 7.2447995054705e-07, "loss": 0.80508804, "num_input_tokens_seen": 261553795, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.10113525, "step": 12117, "time_per_iteration": 2.5635879039764404 }, { "auxiliary_loss_clip": 0.06422716, "auxiliary_loss_mlp": 0.01267176, "balance_loss_clip": 0.0628157, "balance_loss_mlp": 0.0125684, "epoch": 0.7285735758304525, "flos": 20747743729920.0, "grad_norm": 1.9328520771484818, "language_loss": 0.69674087, "learning_rate": 7.241799976651807e-07, "loss": 0.7736398, "num_input_tokens_seen": 261572565, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.10345459, "step": 12118, "time_per_iteration": 2.574033260345459 }, { "auxiliary_loss_clip": 0.0641463, "auxiliary_loss_mlp": 0.01268621, "balance_loss_clip": 0.06279614, "balance_loss_mlp": 0.0125971, "epoch": 0.7286336990831204, "flos": 17316643541760.0, "grad_norm": 1.7311246432592418, "language_loss": 0.84755743, "learning_rate": 7.238800931625346e-07, "loss": 0.9243899, "num_input_tokens_seen": 261590910, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.08917236, "step": 12119, "time_per_iteration": 2.5520219802856445 }, { "auxiliary_loss_clip": 0.06420988, "auxiliary_loss_mlp": 0.01267753, "balance_loss_clip": 0.06278975, "balance_loss_mlp": 0.01257954, "epoch": 0.7286938223357884, "flos": 19792724019840.0, "grad_norm": 2.6986725350423213, "language_loss": 0.8230499, "learning_rate": 7.235802370504831e-07, "loss": 0.89993733, "num_input_tokens_seen": 261606005, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.0980835, "step": 12120, "time_per_iteration": 3.9662415981292725 }, { "auxiliary_loss_clip": 0.06423441, "auxiliary_loss_mlp": 0.01266206, "balance_loss_clip": 0.06280871, "balance_loss_mlp": 0.01256449, "epoch": 0.7287539455884563, "flos": 15346241654400.0, "grad_norm": 3.363673382749807, "language_loss": 0.7872498, "learning_rate": 7.232804293403963e-07, "loss": 0.86414635, "num_input_tokens_seen": 261622305, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.09753418, "step": 12121, "time_per_iteration": 2.6626358032226562 }, { "auxiliary_loss_clip": 0.06423508, "auxiliary_loss_mlp": 0.01266311, "balance_loss_clip": 0.06276764, "balance_loss_mlp": 0.01255576, "epoch": 0.7288140688411243, "flos": 25199592756480.0, "grad_norm": 1.5467058450666529, "language_loss": 0.69527352, "learning_rate": 7.229806700436441e-07, "loss": 0.77217174, "num_input_tokens_seen": 261642465, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.10742188, "step": 12122, "time_per_iteration": 2.585179090499878 }, { "auxiliary_loss_clip": 0.06415781, "auxiliary_loss_mlp": 0.01267434, "balance_loss_clip": 0.06277178, "balance_loss_mlp": 0.01258547, "epoch": 0.7288741920937922, "flos": 23990350158720.0, "grad_norm": 1.8575289752884288, "language_loss": 0.87115276, "learning_rate": 7.226809591715923e-07, "loss": 0.94798505, "num_input_tokens_seen": 261661420, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.08886719, "step": 12123, "time_per_iteration": 2.568082809448242 }, { "auxiliary_loss_clip": 0.06418895, "auxiliary_loss_mlp": 0.01266334, "balance_loss_clip": 0.06279011, "balance_loss_mlp": 0.01256017, "epoch": 0.7289343153464602, "flos": 22751114999040.0, "grad_norm": 1.8633887401961102, "language_loss": 0.83045566, "learning_rate": 7.223812967356065e-07, "loss": 0.90730792, "num_input_tokens_seen": 261680865, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.10314941, "step": 12124, "time_per_iteration": 2.540499210357666 }, { "auxiliary_loss_clip": 0.06424376, "auxiliary_loss_mlp": 0.01268008, "balance_loss_clip": 0.06283747, "balance_loss_mlp": 0.01257559, "epoch": 0.7289944385991282, "flos": 24906991898880.0, "grad_norm": 1.6554312525760106, "language_loss": 0.67181784, "learning_rate": 7.220816827470499e-07, "loss": 0.74874169, "num_input_tokens_seen": 261701455, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.10443115, "step": 12125, "time_per_iteration": 2.61173152923584 }, { "auxiliary_loss_clip": 0.06428587, "auxiliary_loss_mlp": 0.01268378, "balance_loss_clip": 0.06281987, "balance_loss_mlp": 0.01257089, "epoch": 0.7290545618517962, "flos": 22973835951360.0, "grad_norm": 1.8687681692364744, "language_loss": 0.75341821, "learning_rate": 7.217821172172855e-07, "loss": 0.83038783, "num_input_tokens_seen": 261721260, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.11303711, "step": 12126, "time_per_iteration": 2.535503625869751 }, { "auxiliary_loss_clip": 0.06319293, "auxiliary_loss_mlp": 0.0125179, "balance_loss_clip": 0.06262323, "balance_loss_mlp": 0.0125054, "epoch": 0.7291146851044642, "flos": 61921602092160.0, "grad_norm": 0.7936210575784225, "language_loss": 0.58597171, "learning_rate": 7.2148260015767e-07, "loss": 0.66168261, "num_input_tokens_seen": 261779370, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01249695, "step": 12127, "time_per_iteration": 3.12725830078125 }, { "auxiliary_loss_clip": 0.06424724, "auxiliary_loss_mlp": 0.01267639, "balance_loss_clip": 0.06287157, "balance_loss_mlp": 0.01259023, "epoch": 0.7291748083571321, "flos": 23337616452480.0, "grad_norm": 2.3371677324053026, "language_loss": 0.68720067, "learning_rate": 7.21183131579562e-07, "loss": 0.76412427, "num_input_tokens_seen": 261798050, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.08618164, "step": 12128, "time_per_iteration": 2.565723419189453 }, { "auxiliary_loss_clip": 0.06422317, "auxiliary_loss_mlp": 0.01265788, "balance_loss_clip": 0.06279558, "balance_loss_mlp": 0.01254701, "epoch": 0.7292349316098001, "flos": 28337588962560.0, "grad_norm": 1.98313199895784, "language_loss": 0.65450096, "learning_rate": 7.20883711494319e-07, "loss": 0.73138201, "num_input_tokens_seen": 261817660, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.11090088, "step": 12129, "time_per_iteration": 2.6129820346832275 }, { "auxiliary_loss_clip": 0.06418169, "auxiliary_loss_mlp": 0.0126852, "balance_loss_clip": 0.06279933, "balance_loss_mlp": 0.01258835, "epoch": 0.729295054862468, "flos": 24138788866560.0, "grad_norm": 1.755383930984322, "language_loss": 0.74424869, "learning_rate": 7.205843399132927e-07, "loss": 0.82111555, "num_input_tokens_seen": 261837935, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09680176, "step": 12130, "time_per_iteration": 2.5671334266662598 }, { "auxiliary_loss_clip": 0.0642198, "auxiliary_loss_mlp": 0.01268362, "balance_loss_clip": 0.06280367, "balance_loss_mlp": 0.0125842, "epoch": 0.7293551781151361, "flos": 22822168861440.0, "grad_norm": 1.8444553024848052, "language_loss": 0.6993981, "learning_rate": 7.202850168478374e-07, "loss": 0.7763015, "num_input_tokens_seen": 261857575, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09942627, "step": 12131, "time_per_iteration": 2.5812220573425293 }, { "auxiliary_loss_clip": 0.06420524, "auxiliary_loss_mlp": 0.01265801, "balance_loss_clip": 0.06281821, "balance_loss_mlp": 0.01256372, "epoch": 0.729415301367804, "flos": 22133111610240.0, "grad_norm": 1.4910303016703186, "language_loss": 0.77777493, "learning_rate": 7.199857423093025e-07, "loss": 0.85463816, "num_input_tokens_seen": 261877265, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09429932, "step": 12132, "time_per_iteration": 2.5645751953125 }, { "auxiliary_loss_clip": 0.06421028, "auxiliary_loss_mlp": 0.01268472, "balance_loss_clip": 0.06280923, "balance_loss_mlp": 0.01258602, "epoch": 0.729475424620472, "flos": 12354587804160.0, "grad_norm": 2.0009376333944573, "language_loss": 0.79411989, "learning_rate": 7.196865163090358e-07, "loss": 0.87101483, "num_input_tokens_seen": 261893695, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09869385, "step": 12133, "time_per_iteration": 2.5630860328674316 }, { "auxiliary_loss_clip": 0.06418732, "auxiliary_loss_mlp": 0.01264716, "balance_loss_clip": 0.0627844, "balance_loss_mlp": 0.01255221, "epoch": 0.7295355478731399, "flos": 22201020944640.0, "grad_norm": 4.711689715155017, "language_loss": 0.72626346, "learning_rate": 7.193873388583846e-07, "loss": 0.8030979, "num_input_tokens_seen": 261911825, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09490967, "step": 12134, "time_per_iteration": 2.5710065364837646 }, { "auxiliary_loss_clip": 0.06421669, "auxiliary_loss_mlp": 0.01265974, "balance_loss_clip": 0.06278954, "balance_loss_mlp": 0.01256234, "epoch": 0.7295956711258079, "flos": 23228771598720.0, "grad_norm": 1.7047179675744426, "language_loss": 0.71672869, "learning_rate": 7.190882099686939e-07, "loss": 0.79360509, "num_input_tokens_seen": 261931190, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09729004, "step": 12135, "time_per_iteration": 2.576383113861084 }, { "auxiliary_loss_clip": 0.06426439, "auxiliary_loss_mlp": 0.01267055, "balance_loss_clip": 0.06282233, "balance_loss_mlp": 0.0125759, "epoch": 0.7296557943784758, "flos": 31877282442240.0, "grad_norm": 2.099505539023626, "language_loss": 0.6281482, "learning_rate": 7.187891296513075e-07, "loss": 0.70508313, "num_input_tokens_seen": 261951240, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.09472656, "step": 12136, "time_per_iteration": 2.6857664585113525 }, { "auxiliary_loss_clip": 0.06418292, "auxiliary_loss_mlp": 0.01267786, "balance_loss_clip": 0.06277569, "balance_loss_mlp": 0.01258637, "epoch": 0.7297159176311439, "flos": 26659033246080.0, "grad_norm": 1.8061817211530955, "language_loss": 0.75070369, "learning_rate": 7.184900979175654e-07, "loss": 0.82756454, "num_input_tokens_seen": 261971605, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.0914917, "step": 12137, "time_per_iteration": 2.5785443782806396 }, { "auxiliary_loss_clip": 0.06420065, "auxiliary_loss_mlp": 0.01267207, "balance_loss_clip": 0.06279199, "balance_loss_mlp": 0.01257491, "epoch": 0.7297760408838118, "flos": 24755744079360.0, "grad_norm": 1.688202171302544, "language_loss": 0.74390113, "learning_rate": 7.181911147788069e-07, "loss": 0.82077384, "num_input_tokens_seen": 261990830, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09716797, "step": 12138, "time_per_iteration": 3.9694063663482666 }, { "auxiliary_loss_clip": 0.06417879, "auxiliary_loss_mlp": 0.01267643, "balance_loss_clip": 0.06278148, "balance_loss_mlp": 0.01258166, "epoch": 0.7298361641364798, "flos": 18079018715520.0, "grad_norm": 2.1531903820538965, "language_loss": 0.72003233, "learning_rate": 7.178921802463702e-07, "loss": 0.79688758, "num_input_tokens_seen": 262008190, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09472656, "step": 12139, "time_per_iteration": 2.5300605297088623 }, { "auxiliary_loss_clip": 0.06411465, "auxiliary_loss_mlp": 0.01266328, "balance_loss_clip": 0.06275577, "balance_loss_mlp": 0.01257399, "epoch": 0.7298962873891478, "flos": 29902897486080.0, "grad_norm": 1.3944887275091737, "language_loss": 0.73828876, "learning_rate": 7.175932943315898e-07, "loss": 0.8150667, "num_input_tokens_seen": 262030460, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.0892334, "step": 12140, "time_per_iteration": 2.6261496543884277 }, { "auxiliary_loss_clip": 0.0642021, "auxiliary_loss_mlp": 0.01271175, "balance_loss_clip": 0.06278084, "balance_loss_mlp": 0.01260238, "epoch": 0.7299564106418157, "flos": 32273613054720.0, "grad_norm": 1.5279807249131547, "language_loss": 0.55892456, "learning_rate": 7.172944570458003e-07, "loss": 0.63583845, "num_input_tokens_seen": 262050830, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.109375, "step": 12141, "time_per_iteration": 2.635854959487915 }, { "auxiliary_loss_clip": 0.06420456, "auxiliary_loss_mlp": 0.01263064, "balance_loss_clip": 0.06283787, "balance_loss_mlp": 0.01254332, "epoch": 0.7300165338944837, "flos": 22937009281920.0, "grad_norm": 1.6616636285347386, "language_loss": 0.72868389, "learning_rate": 7.169956684003342e-07, "loss": 0.8055191, "num_input_tokens_seen": 262071245, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.08734131, "step": 12142, "time_per_iteration": 2.562450408935547 }, { "auxiliary_loss_clip": 0.06418702, "auxiliary_loss_mlp": 0.01264125, "balance_loss_clip": 0.06278939, "balance_loss_mlp": 0.01255375, "epoch": 0.7300766571471516, "flos": 19834959277440.0, "grad_norm": 1.8593429982147769, "language_loss": 0.73998308, "learning_rate": 7.16696928406521e-07, "loss": 0.81681132, "num_input_tokens_seen": 262087525, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.08746338, "step": 12143, "time_per_iteration": 2.576392412185669 }, { "auxiliary_loss_clip": 0.06419474, "auxiliary_loss_mlp": 0.01265339, "balance_loss_clip": 0.06277998, "balance_loss_mlp": 0.01255534, "epoch": 0.7301367803998197, "flos": 24353879097600.0, "grad_norm": 2.0313123532729023, "language_loss": 0.66987622, "learning_rate": 7.163982370756882e-07, "loss": 0.74672437, "num_input_tokens_seen": 262107355, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09796143, "step": 12144, "time_per_iteration": 2.6407463550567627 }, { "auxiliary_loss_clip": 0.0642321, "auxiliary_loss_mlp": 0.01265696, "balance_loss_clip": 0.06281708, "balance_loss_mlp": 0.01255641, "epoch": 0.7301969036524876, "flos": 15309918109440.0, "grad_norm": 1.6733485318918926, "language_loss": 0.79135311, "learning_rate": 7.160995944191627e-07, "loss": 0.86824214, "num_input_tokens_seen": 262125645, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.10058594, "step": 12145, "time_per_iteration": 3.9931883811950684 }, { "auxiliary_loss_clip": 0.06416672, "auxiliary_loss_mlp": 0.01266257, "balance_loss_clip": 0.06277978, "balance_loss_mlp": 0.01256262, "epoch": 0.7302570269051556, "flos": 23512945121280.0, "grad_norm": 1.6668595617951745, "language_loss": 0.91638476, "learning_rate": 7.158010004482702e-07, "loss": 0.99321401, "num_input_tokens_seen": 262144075, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.10003662, "step": 12146, "time_per_iteration": 2.5673940181732178 }, { "auxiliary_loss_clip": 0.06420632, "auxiliary_loss_mlp": 0.01262594, "balance_loss_clip": 0.06282325, "balance_loss_mlp": 0.01253337, "epoch": 0.7303171501578235, "flos": 20529508970880.0, "grad_norm": 1.9627752971962007, "language_loss": 0.62552238, "learning_rate": 7.155024551743316e-07, "loss": 0.70235461, "num_input_tokens_seen": 262165940, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09259033, "step": 12147, "time_per_iteration": 2.597196340560913 }, { "auxiliary_loss_clip": 0.06426387, "auxiliary_loss_mlp": 0.0126837, "balance_loss_clip": 0.06283604, "balance_loss_mlp": 0.01257802, "epoch": 0.7303772734104915, "flos": 18338482483200.0, "grad_norm": 1.904872049672741, "language_loss": 0.75483131, "learning_rate": 7.152039586086693e-07, "loss": 0.83177882, "num_input_tokens_seen": 262184520, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10565186, "step": 12148, "time_per_iteration": 2.558832883834839 }, { "auxiliary_loss_clip": 0.06324551, "auxiliary_loss_mlp": 0.01251868, "balance_loss_clip": 0.062673, "balance_loss_mlp": 0.01250615, "epoch": 0.7304373966631594, "flos": 60673604181120.0, "grad_norm": 0.674889241287975, "language_loss": 0.56571507, "learning_rate": 7.149055107626017e-07, "loss": 0.64147925, "num_input_tokens_seen": 262247070, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01252747, "step": 12149, "time_per_iteration": 3.159248113632202 }, { "auxiliary_loss_clip": 0.06418514, "auxiliary_loss_mlp": 0.01264983, "balance_loss_clip": 0.06275471, "balance_loss_mlp": 0.01255667, "epoch": 0.7304975199158275, "flos": 19834120736640.0, "grad_norm": 1.9152818340943722, "language_loss": 0.74018162, "learning_rate": 7.146071116474451e-07, "loss": 0.8170166, "num_input_tokens_seen": 262266605, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.09320068, "step": 12150, "time_per_iteration": 2.54667329788208 }, { "auxiliary_loss_clip": 0.06424005, "auxiliary_loss_mlp": 0.0126885, "balance_loss_clip": 0.06279665, "balance_loss_mlp": 0.0125889, "epoch": 0.7305576431684954, "flos": 13228910432640.0, "grad_norm": 2.303206088124699, "language_loss": 0.84021991, "learning_rate": 7.143087612745158e-07, "loss": 0.91714847, "num_input_tokens_seen": 262283880, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.09967041, "step": 12151, "time_per_iteration": 2.5702760219573975 }, { "auxiliary_loss_clip": 0.06422918, "auxiliary_loss_mlp": 0.01267179, "balance_loss_clip": 0.06281978, "balance_loss_mlp": 0.01257559, "epoch": 0.7306177664211634, "flos": 24067231879680.0, "grad_norm": 1.7939001014146247, "language_loss": 0.78416932, "learning_rate": 7.14010459655127e-07, "loss": 0.86107033, "num_input_tokens_seen": 262304155, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09625244, "step": 12152, "time_per_iteration": 2.588046073913574 }, { "auxiliary_loss_clip": 0.06420238, "auxiliary_loss_mlp": 0.01267788, "balance_loss_clip": 0.0627993, "balance_loss_mlp": 0.012579, "epoch": 0.7306778896738314, "flos": 27096425159040.0, "grad_norm": 2.155790526779896, "language_loss": 0.79646921, "learning_rate": 7.137122068005919e-07, "loss": 0.87334943, "num_input_tokens_seen": 262325660, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09887695, "step": 12153, "time_per_iteration": 2.6028056144714355 }, { "auxiliary_loss_clip": 0.06423081, "auxiliary_loss_mlp": 0.01268535, "balance_loss_clip": 0.06277724, "balance_loss_mlp": 0.01258599, "epoch": 0.7307380129264993, "flos": 16696250311680.0, "grad_norm": 1.8425645981135126, "language_loss": 0.67501903, "learning_rate": 7.134140027222173e-07, "loss": 0.75193518, "num_input_tokens_seen": 262344075, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.09924316, "step": 12154, "time_per_iteration": 2.5427358150482178 }, { "auxiliary_loss_clip": 0.06423654, "auxiliary_loss_mlp": 0.01267703, "balance_loss_clip": 0.06280361, "balance_loss_mlp": 0.01257314, "epoch": 0.7307981361791673, "flos": 21732169023360.0, "grad_norm": 1.8342121474138415, "language_loss": 0.66838825, "learning_rate": 7.131158474313128e-07, "loss": 0.74530184, "num_input_tokens_seen": 262363305, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10388184, "step": 12155, "time_per_iteration": 3.9892327785491943 }, { "auxiliary_loss_clip": 0.06414359, "auxiliary_loss_mlp": 0.01265139, "balance_loss_clip": 0.0627593, "balance_loss_mlp": 0.0125571, "epoch": 0.7308582594318352, "flos": 18046468604160.0, "grad_norm": 1.6771004703104966, "language_loss": 0.81971014, "learning_rate": 7.128177409391851e-07, "loss": 0.89650506, "num_input_tokens_seen": 262380730, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09429932, "step": 12156, "time_per_iteration": 2.5412111282348633 }, { "auxiliary_loss_clip": 0.06418288, "auxiliary_loss_mlp": 0.01265509, "balance_loss_clip": 0.06279573, "balance_loss_mlp": 0.01256694, "epoch": 0.7309183826845033, "flos": 13850100276480.0, "grad_norm": 3.8638569812240244, "language_loss": 0.75495279, "learning_rate": 7.125196832571367e-07, "loss": 0.83179069, "num_input_tokens_seen": 262395480, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.08807373, "step": 12157, "time_per_iteration": 2.515550136566162 }, { "auxiliary_loss_clip": 0.06415164, "auxiliary_loss_mlp": 0.01265443, "balance_loss_clip": 0.06278484, "balance_loss_mlp": 0.01256932, "epoch": 0.7309785059371712, "flos": 17024881224960.0, "grad_norm": 2.9670845513176674, "language_loss": 0.74143344, "learning_rate": 7.122216743964713e-07, "loss": 0.81823945, "num_input_tokens_seen": 262413340, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.08508301, "step": 12158, "time_per_iteration": 2.537161350250244 }, { "auxiliary_loss_clip": 0.06425887, "auxiliary_loss_mlp": 0.01263961, "balance_loss_clip": 0.06283663, "balance_loss_mlp": 0.01254299, "epoch": 0.7310386291898392, "flos": 26509127091840.0, "grad_norm": 1.6942300131330543, "language_loss": 0.85938716, "learning_rate": 7.119237143684896e-07, "loss": 0.93628561, "num_input_tokens_seen": 262433455, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09655762, "step": 12159, "time_per_iteration": 2.594712495803833 }, { "auxiliary_loss_clip": 0.06424144, "auxiliary_loss_mlp": 0.01265107, "balance_loss_clip": 0.06277224, "balance_loss_mlp": 0.01254069, "epoch": 0.7310987524425071, "flos": 16951521375360.0, "grad_norm": 1.952420001392578, "language_loss": 0.7386452, "learning_rate": 7.116258031844895e-07, "loss": 0.81553769, "num_input_tokens_seen": 262450335, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.11035156, "step": 12160, "time_per_iteration": 3.8823986053466797 }, { "auxiliary_loss_clip": 0.06427898, "auxiliary_loss_mlp": 0.01265939, "balance_loss_clip": 0.06282581, "balance_loss_mlp": 0.01255782, "epoch": 0.7311588756951751, "flos": 13850477619840.0, "grad_norm": 1.9661099296036835, "language_loss": 0.73155838, "learning_rate": 7.113279408557675e-07, "loss": 0.80849683, "num_input_tokens_seen": 262468240, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10162354, "step": 12161, "time_per_iteration": 2.525374174118042 }, { "auxiliary_loss_clip": 0.06434575, "auxiliary_loss_mlp": 0.01265333, "balance_loss_clip": 0.0628485, "balance_loss_mlp": 0.01254664, "epoch": 0.731218998947843, "flos": 28775567854080.0, "grad_norm": 1.7224498664379448, "language_loss": 0.69905525, "learning_rate": 7.110301273936192e-07, "loss": 0.77605432, "num_input_tokens_seen": 262487045, "router_z_loss_clip": 1.49707031, "router_z_loss_mlp": 0.10662842, "step": 12162, "time_per_iteration": 2.6183226108551025 }, { "auxiliary_loss_clip": 0.06425017, "auxiliary_loss_mlp": 0.01269399, "balance_loss_clip": 0.06280496, "balance_loss_mlp": 0.01259338, "epoch": 0.7312791222005111, "flos": 27096047815680.0, "grad_norm": 1.6286647887254562, "language_loss": 0.67042267, "learning_rate": 7.107323628093382e-07, "loss": 0.74736679, "num_input_tokens_seen": 262504855, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.1005249, "step": 12163, "time_per_iteration": 2.594918727874756 }, { "auxiliary_loss_clip": 0.06421512, "auxiliary_loss_mlp": 0.01269747, "balance_loss_clip": 0.06279487, "balance_loss_mlp": 0.01260192, "epoch": 0.731339245453179, "flos": 20930493484800.0, "grad_norm": 1.504908242337857, "language_loss": 0.69199634, "learning_rate": 7.104346471142153e-07, "loss": 0.76890892, "num_input_tokens_seen": 262524920, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09545898, "step": 12164, "time_per_iteration": 2.5569067001342773 }, { "auxiliary_loss_clip": 0.06416553, "auxiliary_loss_mlp": 0.01262263, "balance_loss_clip": 0.0627984, "balance_loss_mlp": 0.01253131, "epoch": 0.731399368705847, "flos": 23082345388800.0, "grad_norm": 1.680716609472864, "language_loss": 0.73697996, "learning_rate": 7.101369803195391e-07, "loss": 0.81376809, "num_input_tokens_seen": 262545725, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.09136963, "step": 12165, "time_per_iteration": 2.5856451988220215 }, { "auxiliary_loss_clip": 0.064224, "auxiliary_loss_mlp": 0.01264044, "balance_loss_clip": 0.06279573, "balance_loss_mlp": 0.01254173, "epoch": 0.731459491958515, "flos": 23588778666240.0, "grad_norm": 1.8368002369544383, "language_loss": 0.76987159, "learning_rate": 7.098393624365988e-07, "loss": 0.84673595, "num_input_tokens_seen": 262565480, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.09869385, "step": 12166, "time_per_iteration": 2.553823471069336 }, { "auxiliary_loss_clip": 0.06418973, "auxiliary_loss_mlp": 0.01265529, "balance_loss_clip": 0.06280427, "balance_loss_mlp": 0.01255998, "epoch": 0.7315196152111829, "flos": 22385280072960.0, "grad_norm": 1.724686298397232, "language_loss": 0.79885542, "learning_rate": 7.095417934766781e-07, "loss": 0.87570053, "num_input_tokens_seen": 262584145, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09527588, "step": 12167, "time_per_iteration": 2.5626375675201416 }, { "auxiliary_loss_clip": 0.06421906, "auxiliary_loss_mlp": 0.01267537, "balance_loss_clip": 0.06284283, "balance_loss_mlp": 0.01258323, "epoch": 0.7315797384638509, "flos": 26184227685120.0, "grad_norm": 1.8154417014869755, "language_loss": 0.77111375, "learning_rate": 7.092442734510622e-07, "loss": 0.84800816, "num_input_tokens_seen": 262604045, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09216309, "step": 12168, "time_per_iteration": 2.586759090423584 }, { "auxiliary_loss_clip": 0.06429337, "auxiliary_loss_mlp": 0.01264873, "balance_loss_clip": 0.06283404, "balance_loss_mlp": 0.01253965, "epoch": 0.7316398617165188, "flos": 21512634526080.0, "grad_norm": 1.4028740688013694, "language_loss": 0.82269394, "learning_rate": 7.089468023710326e-07, "loss": 0.89963603, "num_input_tokens_seen": 262624540, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10894775, "step": 12169, "time_per_iteration": 2.5735559463500977 }, { "auxiliary_loss_clip": 0.06427869, "auxiliary_loss_mlp": 0.01266102, "balance_loss_clip": 0.06283882, "balance_loss_mlp": 0.01256523, "epoch": 0.7316999849691869, "flos": 30490489042560.0, "grad_norm": 1.857631126004142, "language_loss": 0.70333529, "learning_rate": 7.08649380247871e-07, "loss": 0.78027499, "num_input_tokens_seen": 262644545, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.09576416, "step": 12170, "time_per_iteration": 2.6037192344665527 }, { "auxiliary_loss_clip": 0.06424629, "auxiliary_loss_mlp": 0.01264653, "balance_loss_clip": 0.06282465, "balance_loss_mlp": 0.01254127, "epoch": 0.7317601082218548, "flos": 21550257809280.0, "grad_norm": 2.059355299996237, "language_loss": 0.70194721, "learning_rate": 7.083520070928533e-07, "loss": 0.77884001, "num_input_tokens_seen": 262662570, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10516357, "step": 12171, "time_per_iteration": 2.5489675998687744 }, { "auxiliary_loss_clip": 0.06419321, "auxiliary_loss_mlp": 0.01265254, "balance_loss_clip": 0.06279035, "balance_loss_mlp": 0.01255342, "epoch": 0.7318202314745228, "flos": 33259338086400.0, "grad_norm": 1.5911517633828112, "language_loss": 0.65993905, "learning_rate": 7.080546829172564e-07, "loss": 0.73678482, "num_input_tokens_seen": 262683245, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09906006, "step": 12172, "time_per_iteration": 2.632388114929199 }, { "auxiliary_loss_clip": 0.06424607, "auxiliary_loss_mlp": 0.01265206, "balance_loss_clip": 0.06282842, "balance_loss_mlp": 0.01254912, "epoch": 0.7318803547271907, "flos": 20163254774400.0, "grad_norm": 2.728596496418244, "language_loss": 0.62061417, "learning_rate": 7.077574077323564e-07, "loss": 0.69751227, "num_input_tokens_seen": 262701585, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.10296631, "step": 12173, "time_per_iteration": 2.538412570953369 }, { "auxiliary_loss_clip": 0.06425925, "auxiliary_loss_mlp": 0.01263389, "balance_loss_clip": 0.06284402, "balance_loss_mlp": 0.01254162, "epoch": 0.7319404779798587, "flos": 20564826266880.0, "grad_norm": 1.824190596482806, "language_loss": 0.74429929, "learning_rate": 7.074601815494243e-07, "loss": 0.82119238, "num_input_tokens_seen": 262719295, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09234619, "step": 12174, "time_per_iteration": 2.534823179244995 }, { "auxiliary_loss_clip": 0.06418569, "auxiliary_loss_mlp": 0.01262418, "balance_loss_clip": 0.06280999, "balance_loss_mlp": 0.01253581, "epoch": 0.7320006012325266, "flos": 28703130399360.0, "grad_norm": 1.5976111109795632, "language_loss": 0.81121439, "learning_rate": 7.071630043797317e-07, "loss": 0.88802427, "num_input_tokens_seen": 262739995, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.08828735, "step": 12175, "time_per_iteration": 2.6164138317108154 }, { "auxiliary_loss_clip": 0.06421831, "auxiliary_loss_mlp": 0.01264448, "balance_loss_clip": 0.06280936, "balance_loss_mlp": 0.01254619, "epoch": 0.7320607244851947, "flos": 16368290231040.0, "grad_norm": 1.9455606413258708, "language_loss": 0.76985002, "learning_rate": 7.068658762345488e-07, "loss": 0.84671283, "num_input_tokens_seen": 262757680, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.0982666, "step": 12176, "time_per_iteration": 2.5288302898406982 }, { "auxiliary_loss_clip": 0.0641868, "auxiliary_loss_mlp": 0.01271173, "balance_loss_clip": 0.06279936, "balance_loss_mlp": 0.01261267, "epoch": 0.7321208477378626, "flos": 20960653754880.0, "grad_norm": 1.514254814185345, "language_loss": 0.76453805, "learning_rate": 7.065687971251399e-07, "loss": 0.84143662, "num_input_tokens_seen": 262776990, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09906006, "step": 12177, "time_per_iteration": 2.5845775604248047 }, { "auxiliary_loss_clip": 0.06419092, "auxiliary_loss_mlp": 0.01264133, "balance_loss_clip": 0.06279111, "balance_loss_mlp": 0.01254906, "epoch": 0.7321809709905306, "flos": 13850226057600.0, "grad_norm": 2.3745503482733583, "language_loss": 0.74893022, "learning_rate": 7.06271767062772e-07, "loss": 0.82576245, "num_input_tokens_seen": 262795440, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09228516, "step": 12178, "time_per_iteration": 3.9668219089508057 }, { "auxiliary_loss_clip": 0.06421173, "auxiliary_loss_mlp": 0.01264049, "balance_loss_clip": 0.06277238, "balance_loss_mlp": 0.01254334, "epoch": 0.7322410942431986, "flos": 26987286816000.0, "grad_norm": 3.5042205800951285, "language_loss": 0.82539266, "learning_rate": 7.059747860587084e-07, "loss": 0.90224493, "num_input_tokens_seen": 262816385, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.0970459, "step": 12179, "time_per_iteration": 2.588285207748413 }, { "auxiliary_loss_clip": 0.06416015, "auxiliary_loss_mlp": 0.0126401, "balance_loss_clip": 0.06281705, "balance_loss_mlp": 0.0125526, "epoch": 0.7323012174958665, "flos": 17645526017280.0, "grad_norm": 1.9289012949796556, "language_loss": 0.74696171, "learning_rate": 7.056778541242115e-07, "loss": 0.82376194, "num_input_tokens_seen": 262834955, "router_z_loss_clip": 1.34472656, "router_z_loss_mlp": 0.08752441, "step": 12180, "time_per_iteration": 2.5357489585876465 }, { "auxiliary_loss_clip": 0.06426669, "auxiliary_loss_mlp": 0.01266276, "balance_loss_clip": 0.06281054, "balance_loss_mlp": 0.01255899, "epoch": 0.7323613407485345, "flos": 32350914046080.0, "grad_norm": 2.1640814964358093, "language_loss": 0.80010974, "learning_rate": 7.053809712705396e-07, "loss": 0.87703913, "num_input_tokens_seen": 262853555, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10388184, "step": 12181, "time_per_iteration": 2.617389440536499 }, { "auxiliary_loss_clip": 0.06425171, "auxiliary_loss_mlp": 0.01270018, "balance_loss_clip": 0.06281147, "balance_loss_mlp": 0.01259855, "epoch": 0.7324214640012024, "flos": 18367594577280.0, "grad_norm": 2.376582835160608, "language_loss": 0.71861011, "learning_rate": 7.050841375089506e-07, "loss": 0.79556203, "num_input_tokens_seen": 262870975, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10168457, "step": 12182, "time_per_iteration": 2.53483510017395 }, { "auxiliary_loss_clip": 0.06426866, "auxiliary_loss_mlp": 0.01266271, "balance_loss_clip": 0.06285271, "balance_loss_mlp": 0.01256412, "epoch": 0.7324815872538705, "flos": 30820503548160.0, "grad_norm": 1.5865078330123683, "language_loss": 0.71334869, "learning_rate": 7.047873528507015e-07, "loss": 0.79028004, "num_input_tokens_seen": 262892635, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09857178, "step": 12183, "time_per_iteration": 2.624906063079834 }, { "auxiliary_loss_clip": 0.0642617, "auxiliary_loss_mlp": 0.01268707, "balance_loss_clip": 0.06281219, "balance_loss_mlp": 0.01258574, "epoch": 0.7325417105065384, "flos": 21511167079680.0, "grad_norm": 2.2944501609230987, "language_loss": 0.72806382, "learning_rate": 7.04490617307045e-07, "loss": 0.80501258, "num_input_tokens_seen": 262910725, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.10131836, "step": 12184, "time_per_iteration": 4.0171403884887695 }, { "auxiliary_loss_clip": 0.06334051, "auxiliary_loss_mlp": 0.01253117, "balance_loss_clip": 0.06276834, "balance_loss_mlp": 0.01251796, "epoch": 0.7326018337592064, "flos": 67277514746880.0, "grad_norm": 0.7438017305709381, "language_loss": 0.65146238, "learning_rate": 7.041939308892344e-07, "loss": 0.72733402, "num_input_tokens_seen": 262974150, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01322174, "step": 12185, "time_per_iteration": 3.136469841003418 }, { "auxiliary_loss_clip": 0.06424187, "auxiliary_loss_mlp": 0.01264906, "balance_loss_clip": 0.06279817, "balance_loss_mlp": 0.01254665, "epoch": 0.7326619570118743, "flos": 22863733286400.0, "grad_norm": 1.841138040343406, "language_loss": 0.80778009, "learning_rate": 7.038972936085197e-07, "loss": 0.88467097, "num_input_tokens_seen": 262993370, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10241699, "step": 12186, "time_per_iteration": 2.5559632778167725 }, { "auxiliary_loss_clip": 0.06424173, "auxiliary_loss_mlp": 0.01266774, "balance_loss_clip": 0.06280121, "balance_loss_mlp": 0.01256129, "epoch": 0.7327220802645423, "flos": 23333591456640.0, "grad_norm": 2.0946833776629763, "language_loss": 0.74024403, "learning_rate": 7.036007054761508e-07, "loss": 0.81715351, "num_input_tokens_seen": 263012665, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.10650635, "step": 12187, "time_per_iteration": 2.5857715606689453 }, { "auxiliary_loss_clip": 0.06430405, "auxiliary_loss_mlp": 0.01269135, "balance_loss_clip": 0.0628773, "balance_loss_mlp": 0.01258978, "epoch": 0.7327822035172102, "flos": 23186578268160.0, "grad_norm": 1.7585098939404733, "language_loss": 0.89172202, "learning_rate": 7.033041665033716e-07, "loss": 0.96871746, "num_input_tokens_seen": 263031475, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10162354, "step": 12188, "time_per_iteration": 2.5706255435943604 }, { "auxiliary_loss_clip": 0.0642371, "auxiliary_loss_mlp": 0.01267288, "balance_loss_clip": 0.06279458, "balance_loss_mlp": 0.01257012, "epoch": 0.7328423267698783, "flos": 21072517355520.0, "grad_norm": 2.013191633045659, "language_loss": 0.7480129, "learning_rate": 7.030076767014284e-07, "loss": 0.82492286, "num_input_tokens_seen": 263051445, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10272217, "step": 12189, "time_per_iteration": 2.589998483657837 }, { "auxiliary_loss_clip": 0.06426604, "auxiliary_loss_mlp": 0.01269306, "balance_loss_clip": 0.06282087, "balance_loss_mlp": 0.01259304, "epoch": 0.7329024500225462, "flos": 21696055113600.0, "grad_norm": 3.740074364696426, "language_loss": 0.82564789, "learning_rate": 7.027112360815648e-07, "loss": 0.90260696, "num_input_tokens_seen": 263070835, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10003662, "step": 12190, "time_per_iteration": 2.57161021232605 }, { "auxiliary_loss_clip": 0.06422551, "auxiliary_loss_mlp": 0.01265558, "balance_loss_clip": 0.06280394, "balance_loss_mlp": 0.01254901, "epoch": 0.7329625732752142, "flos": 24169829604480.0, "grad_norm": 1.8301266253247799, "language_loss": 0.72087431, "learning_rate": 7.024148446550204e-07, "loss": 0.79775536, "num_input_tokens_seen": 263090070, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10656738, "step": 12191, "time_per_iteration": 2.665109872817993 }, { "auxiliary_loss_clip": 0.06421617, "auxiliary_loss_mlp": 0.0126715, "balance_loss_clip": 0.0628015, "balance_loss_mlp": 0.01258096, "epoch": 0.7330226965278822, "flos": 30085227970560.0, "grad_norm": 1.4919566419810681, "language_loss": 0.69215131, "learning_rate": 7.021185024330361e-07, "loss": 0.76903892, "num_input_tokens_seen": 263110030, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09051514, "step": 12192, "time_per_iteration": 2.7970638275146484 }, { "auxiliary_loss_clip": 0.06419103, "auxiliary_loss_mlp": 0.01264261, "balance_loss_clip": 0.06278582, "balance_loss_mlp": 0.01254814, "epoch": 0.7330828197805501, "flos": 23375113954560.0, "grad_norm": 1.565059097403947, "language_loss": 0.73674905, "learning_rate": 7.01822209426848e-07, "loss": 0.81358272, "num_input_tokens_seen": 263129735, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09448242, "step": 12193, "time_per_iteration": 2.596585750579834 }, { "auxiliary_loss_clip": 0.06428155, "auxiliary_loss_mlp": 0.0126727, "balance_loss_clip": 0.06283336, "balance_loss_mlp": 0.01257251, "epoch": 0.7331429430332181, "flos": 21039170630400.0, "grad_norm": 1.6911972542405236, "language_loss": 0.77562577, "learning_rate": 7.015259656476911e-07, "loss": 0.85258007, "num_input_tokens_seen": 263149100, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.10021973, "step": 12194, "time_per_iteration": 2.5603160858154297 }, { "auxiliary_loss_clip": 0.06421103, "auxiliary_loss_mlp": 0.01263316, "balance_loss_clip": 0.06281333, "balance_loss_mlp": 0.01253196, "epoch": 0.733203066285886, "flos": 14653201334400.0, "grad_norm": 1.6316150033960324, "language_loss": 0.71072322, "learning_rate": 7.012297711067998e-07, "loss": 0.78756738, "num_input_tokens_seen": 263166620, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.10107422, "step": 12195, "time_per_iteration": 3.972494602203369 }, { "auxiliary_loss_clip": 0.06423526, "auxiliary_loss_mlp": 0.0126307, "balance_loss_clip": 0.06280573, "balance_loss_mlp": 0.01253414, "epoch": 0.7332631895385541, "flos": 17171013945600.0, "grad_norm": 1.8518305325781539, "language_loss": 0.72904187, "learning_rate": 7.009336258154057e-07, "loss": 0.80590785, "num_input_tokens_seen": 263184780, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.09643555, "step": 12196, "time_per_iteration": 2.54575514793396 }, { "auxiliary_loss_clip": 0.06421358, "auxiliary_loss_mlp": 0.01265111, "balance_loss_clip": 0.0628123, "balance_loss_mlp": 0.01255664, "epoch": 0.733323312791222, "flos": 28665758678400.0, "grad_norm": 1.5629442411645955, "language_loss": 0.71636736, "learning_rate": 7.006375297847394e-07, "loss": 0.79323208, "num_input_tokens_seen": 263204625, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09442139, "step": 12197, "time_per_iteration": 2.604520797729492 }, { "auxiliary_loss_clip": 0.0643124, "auxiliary_loss_mlp": 0.01268058, "balance_loss_clip": 0.06282432, "balance_loss_mlp": 0.01256954, "epoch": 0.73338343604389, "flos": 16624106346240.0, "grad_norm": 1.9849815547290828, "language_loss": 0.78297865, "learning_rate": 7.003414830260282e-07, "loss": 0.85997158, "num_input_tokens_seen": 263221565, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.11102295, "step": 12198, "time_per_iteration": 2.532038450241089 }, { "auxiliary_loss_clip": 0.06421028, "auxiliary_loss_mlp": 0.01267137, "balance_loss_clip": 0.06279036, "balance_loss_mlp": 0.01258017, "epoch": 0.7334435592965579, "flos": 21148434754560.0, "grad_norm": 2.1178716301166642, "language_loss": 0.74412286, "learning_rate": 7.000454855504974e-07, "loss": 0.82100451, "num_input_tokens_seen": 263240620, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.09112549, "step": 12199, "time_per_iteration": 4.070716857910156 }, { "auxiliary_loss_clip": 0.06427282, "auxiliary_loss_mlp": 0.01264504, "balance_loss_clip": 0.06280591, "balance_loss_mlp": 0.01254115, "epoch": 0.7335036825492259, "flos": 17130455769600.0, "grad_norm": 3.5507565855183643, "language_loss": 0.77277827, "learning_rate": 6.997495373693729e-07, "loss": 0.8496961, "num_input_tokens_seen": 263254365, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.10388184, "step": 12200, "time_per_iteration": 2.4966914653778076 }, { "auxiliary_loss_clip": 0.06422522, "auxiliary_loss_mlp": 0.01264418, "balance_loss_clip": 0.06280986, "balance_loss_mlp": 0.01254046, "epoch": 0.7335638058018938, "flos": 23738475185280.0, "grad_norm": 1.5866374151660825, "language_loss": 0.61587536, "learning_rate": 6.994536384938754e-07, "loss": 0.69274479, "num_input_tokens_seen": 263275880, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.1036377, "step": 12201, "time_per_iteration": 2.556981325149536 }, { "auxiliary_loss_clip": 0.06415134, "auxiliary_loss_mlp": 0.01265867, "balance_loss_clip": 0.06276917, "balance_loss_mlp": 0.01256796, "epoch": 0.7336239290545619, "flos": 34941876871680.0, "grad_norm": 1.766480813808245, "language_loss": 0.52239048, "learning_rate": 6.991577889352264e-07, "loss": 0.59920049, "num_input_tokens_seen": 263298315, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09063721, "step": 12202, "time_per_iteration": 2.6697769165039062 }, { "auxiliary_loss_clip": 0.06420352, "auxiliary_loss_mlp": 0.01262758, "balance_loss_clip": 0.06280367, "balance_loss_mlp": 0.01253198, "epoch": 0.7336840523072298, "flos": 21108966681600.0, "grad_norm": 2.055315206512443, "language_loss": 0.68888634, "learning_rate": 6.98861988704645e-07, "loss": 0.76571745, "num_input_tokens_seen": 263318615, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09558105, "step": 12203, "time_per_iteration": 2.5529627799987793 }, { "auxiliary_loss_clip": 0.06433337, "auxiliary_loss_mlp": 0.01270968, "balance_loss_clip": 0.06283714, "balance_loss_mlp": 0.01260305, "epoch": 0.7337441755598978, "flos": 24031243751040.0, "grad_norm": 1.9386869316194009, "language_loss": 0.65927309, "learning_rate": 6.985662378133474e-07, "loss": 0.73631608, "num_input_tokens_seen": 263336705, "router_z_loss_clip": 1.49707031, "router_z_loss_mlp": 0.10668945, "step": 12204, "time_per_iteration": 2.553419828414917 }, { "auxiliary_loss_clip": 0.06424128, "auxiliary_loss_mlp": 0.01263535, "balance_loss_clip": 0.06284513, "balance_loss_mlp": 0.01254082, "epoch": 0.7338042988125658, "flos": 22717977909120.0, "grad_norm": 1.9253511085567379, "language_loss": 0.77404296, "learning_rate": 6.982705362725479e-07, "loss": 0.8509196, "num_input_tokens_seen": 263355065, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09454346, "step": 12205, "time_per_iteration": 2.5646045207977295 }, { "auxiliary_loss_clip": 0.0641699, "auxiliary_loss_mlp": 0.01263669, "balance_loss_clip": 0.06279157, "balance_loss_mlp": 0.01254746, "epoch": 0.7338644220652337, "flos": 21367382273280.0, "grad_norm": 1.901478438421926, "language_loss": 0.79993331, "learning_rate": 6.979748840934601e-07, "loss": 0.87673992, "num_input_tokens_seen": 263374460, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.08917236, "step": 12206, "time_per_iteration": 2.567173719406128 }, { "auxiliary_loss_clip": 0.0642385, "auxiliary_loss_mlp": 0.01266648, "balance_loss_clip": 0.0628197, "balance_loss_mlp": 0.01257248, "epoch": 0.7339245453179017, "flos": 30928216371840.0, "grad_norm": 1.9237119146620485, "language_loss": 0.72122121, "learning_rate": 6.976792812872958e-07, "loss": 0.79812622, "num_input_tokens_seen": 263393610, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09405518, "step": 12207, "time_per_iteration": 2.642646074295044 }, { "auxiliary_loss_clip": 0.06329455, "auxiliary_loss_mlp": 0.01252337, "balance_loss_clip": 0.06271987, "balance_loss_mlp": 0.01251081, "epoch": 0.7339846685705697, "flos": 67916789873280.0, "grad_norm": 0.769336332202311, "language_loss": 0.54591864, "learning_rate": 6.97383727865263e-07, "loss": 0.62173653, "num_input_tokens_seen": 263450340, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.01255798, "step": 12208, "time_per_iteration": 3.241727352142334 }, { "auxiliary_loss_clip": 0.06424285, "auxiliary_loss_mlp": 0.01262267, "balance_loss_clip": 0.06282324, "balance_loss_mlp": 0.01252891, "epoch": 0.7340447918232377, "flos": 22243298129280.0, "grad_norm": 1.3300622446453791, "language_loss": 0.80396593, "learning_rate": 6.970882238385703e-07, "loss": 0.88083136, "num_input_tokens_seen": 263471735, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09368896, "step": 12209, "time_per_iteration": 2.5994350910186768 }, { "auxiliary_loss_clip": 0.06415413, "auxiliary_loss_mlp": 0.01264329, "balance_loss_clip": 0.06276711, "balance_loss_mlp": 0.01254995, "epoch": 0.7341049150759056, "flos": 23770857588480.0, "grad_norm": 1.4469368959917417, "language_loss": 0.79381561, "learning_rate": 6.96792769218423e-07, "loss": 0.87061304, "num_input_tokens_seen": 263493245, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09332275, "step": 12210, "time_per_iteration": 2.5743532180786133 }, { "auxiliary_loss_clip": 0.06422102, "auxiliary_loss_mlp": 0.01264171, "balance_loss_clip": 0.06283145, "balance_loss_mlp": 0.01254372, "epoch": 0.7341650383285736, "flos": 17241983953920.0, "grad_norm": 2.0618292869248593, "language_loss": 0.7675097, "learning_rate": 6.964973640160236e-07, "loss": 0.84437239, "num_input_tokens_seen": 263511660, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09796143, "step": 12211, "time_per_iteration": 2.538113832473755 }, { "auxiliary_loss_clip": 0.06423333, "auxiliary_loss_mlp": 0.01266675, "balance_loss_clip": 0.06280296, "balance_loss_mlp": 0.01257209, "epoch": 0.7342251615812415, "flos": 23410640885760.0, "grad_norm": 1.8816908057315467, "language_loss": 0.72173262, "learning_rate": 6.962020082425748e-07, "loss": 0.79863268, "num_input_tokens_seen": 263530875, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.09472656, "step": 12212, "time_per_iteration": 2.5383806228637695 }, { "auxiliary_loss_clip": 0.06422982, "auxiliary_loss_mlp": 0.01267025, "balance_loss_clip": 0.06282276, "balance_loss_mlp": 0.01256827, "epoch": 0.7342852848339095, "flos": 22753756402560.0, "grad_norm": 1.4354048802316637, "language_loss": 0.68821615, "learning_rate": 6.959067019092766e-07, "loss": 0.76511621, "num_input_tokens_seen": 263551585, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.10198975, "step": 12213, "time_per_iteration": 2.5550811290740967 }, { "auxiliary_loss_clip": 0.06327194, "auxiliary_loss_mlp": 0.01253187, "balance_loss_clip": 0.06269968, "balance_loss_mlp": 0.01251963, "epoch": 0.7343454080865774, "flos": 53960219856000.0, "grad_norm": 0.69574255657866, "language_loss": 0.54209983, "learning_rate": 6.956114450273276e-07, "loss": 0.61790365, "num_input_tokens_seen": 263609545, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01222229, "step": 12214, "time_per_iteration": 3.0676705837249756 }, { "auxiliary_loss_clip": 0.06429857, "auxiliary_loss_mlp": 0.01263705, "balance_loss_clip": 0.06283578, "balance_loss_mlp": 0.01253262, "epoch": 0.7344055313392455, "flos": 12171754195200.0, "grad_norm": 1.889319697373653, "language_loss": 0.710329, "learning_rate": 6.953162376079233e-07, "loss": 0.78726459, "num_input_tokens_seen": 263627880, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.10443115, "step": 12215, "time_per_iteration": 2.525698661804199 }, { "auxiliary_loss_clip": 0.06415097, "auxiliary_loss_mlp": 0.01267478, "balance_loss_clip": 0.06277983, "balance_loss_mlp": 0.01257935, "epoch": 0.7344656545919134, "flos": 18555710993280.0, "grad_norm": 1.5371572635530715, "language_loss": 0.73049831, "learning_rate": 6.950210796622573e-07, "loss": 0.80732405, "num_input_tokens_seen": 263645665, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09545898, "step": 12216, "time_per_iteration": 2.526526927947998 }, { "auxiliary_loss_clip": 0.06432717, "auxiliary_loss_mlp": 0.01268029, "balance_loss_clip": 0.06283675, "balance_loss_mlp": 0.01257062, "epoch": 0.7345257778445814, "flos": 23668762988160.0, "grad_norm": 1.9319237073057676, "language_loss": 0.78673816, "learning_rate": 6.947259712015236e-07, "loss": 0.86374557, "num_input_tokens_seen": 263668170, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.10968018, "step": 12217, "time_per_iteration": 4.094513416290283 }, { "auxiliary_loss_clip": 0.06419214, "auxiliary_loss_mlp": 0.01264616, "balance_loss_clip": 0.06281047, "balance_loss_mlp": 0.01255747, "epoch": 0.7345859010972494, "flos": 13813818658560.0, "grad_norm": 1.7113893304778414, "language_loss": 0.77959406, "learning_rate": 6.94430912236911e-07, "loss": 0.85643244, "num_input_tokens_seen": 263684190, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.08874512, "step": 12218, "time_per_iteration": 2.545422077178955 }, { "auxiliary_loss_clip": 0.06416238, "auxiliary_loss_mlp": 0.01266589, "balance_loss_clip": 0.06277947, "balance_loss_mlp": 0.01256749, "epoch": 0.7346460243499173, "flos": 22279202403840.0, "grad_norm": 1.8284137729056376, "language_loss": 0.72260463, "learning_rate": 6.941359027796092e-07, "loss": 0.79943287, "num_input_tokens_seen": 263702095, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09844971, "step": 12219, "time_per_iteration": 2.5724129676818848 }, { "auxiliary_loss_clip": 0.06415043, "auxiliary_loss_mlp": 0.01265069, "balance_loss_clip": 0.06277925, "balance_loss_mlp": 0.01256021, "epoch": 0.7347061476025853, "flos": 23261447491200.0, "grad_norm": 1.7911710145597743, "language_loss": 0.75017715, "learning_rate": 6.938409428408061e-07, "loss": 0.82697833, "num_input_tokens_seen": 263721385, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09057617, "step": 12220, "time_per_iteration": 2.5651588439941406 }, { "auxiliary_loss_clip": 0.06421605, "auxiliary_loss_mlp": 0.01267193, "balance_loss_clip": 0.06277052, "balance_loss_mlp": 0.01256316, "epoch": 0.7347662708552533, "flos": 15272881804800.0, "grad_norm": 2.046610004293495, "language_loss": 0.66045392, "learning_rate": 6.93546032431684e-07, "loss": 0.73734188, "num_input_tokens_seen": 263737835, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.10882568, "step": 12221, "time_per_iteration": 2.5249099731445312 }, { "auxiliary_loss_clip": 0.0642218, "auxiliary_loss_mlp": 0.01263369, "balance_loss_clip": 0.06280572, "balance_loss_mlp": 0.01254185, "epoch": 0.7348263941079213, "flos": 24866349868800.0, "grad_norm": 1.9627735918299511, "language_loss": 0.69080043, "learning_rate": 6.932511715634273e-07, "loss": 0.76765597, "num_input_tokens_seen": 263756480, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09191895, "step": 12222, "time_per_iteration": 2.5803372859954834 }, { "auxiliary_loss_clip": 0.06414026, "auxiliary_loss_mlp": 0.01264861, "balance_loss_clip": 0.06274594, "balance_loss_mlp": 0.01256629, "epoch": 0.7348865173605892, "flos": 24358868415360.0, "grad_norm": 2.0648541541245744, "language_loss": 0.66186839, "learning_rate": 6.92956360247217e-07, "loss": 0.73865724, "num_input_tokens_seen": 263776440, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.08227539, "step": 12223, "time_per_iteration": 2.572218179702759 }, { "auxiliary_loss_clip": 0.06416824, "auxiliary_loss_mlp": 0.01265113, "balance_loss_clip": 0.06276678, "balance_loss_mlp": 0.01255463, "epoch": 0.7349466406132572, "flos": 20009700967680.0, "grad_norm": 1.6068183741252111, "language_loss": 0.72634345, "learning_rate": 6.926615984942332e-07, "loss": 0.80316281, "num_input_tokens_seen": 263793700, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09649658, "step": 12224, "time_per_iteration": 4.11909031867981 }, { "auxiliary_loss_clip": 0.06424659, "auxiliary_loss_mlp": 0.01265214, "balance_loss_clip": 0.06281503, "balance_loss_mlp": 0.01255224, "epoch": 0.7350067638659251, "flos": 29832766018560.0, "grad_norm": 1.6649083699780436, "language_loss": 0.72770858, "learning_rate": 6.92366886315652e-07, "loss": 0.80460727, "num_input_tokens_seen": 263814620, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.09991455, "step": 12225, "time_per_iteration": 2.618501901626587 }, { "auxiliary_loss_clip": 0.06426996, "auxiliary_loss_mlp": 0.0126498, "balance_loss_clip": 0.06279679, "balance_loss_mlp": 0.01253995, "epoch": 0.7350668871185931, "flos": 21871677271680.0, "grad_norm": 1.634984145879845, "language_loss": 0.76517272, "learning_rate": 6.920722237226501e-07, "loss": 0.84209251, "num_input_tokens_seen": 263832725, "router_z_loss_clip": 1.47363281, "router_z_loss_mlp": 0.10992432, "step": 12226, "time_per_iteration": 2.5435163974761963 }, { "auxiliary_loss_clip": 0.06418286, "auxiliary_loss_mlp": 0.01263372, "balance_loss_clip": 0.06277185, "balance_loss_mlp": 0.01253788, "epoch": 0.735127010371261, "flos": 22572893364480.0, "grad_norm": 1.4542849730225658, "language_loss": 0.6724124, "learning_rate": 6.917776107264008e-07, "loss": 0.74922895, "num_input_tokens_seen": 263853850, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.0958252, "step": 12227, "time_per_iteration": 2.5709376335144043 }, { "auxiliary_loss_clip": 0.06420745, "auxiliary_loss_mlp": 0.0126218, "balance_loss_clip": 0.06278267, "balance_loss_mlp": 0.01252852, "epoch": 0.7351871336239291, "flos": 25891333338240.0, "grad_norm": 1.532083025138688, "language_loss": 0.63565123, "learning_rate": 6.914830473380749e-07, "loss": 0.71248049, "num_input_tokens_seen": 263874760, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09326172, "step": 12228, "time_per_iteration": 2.6045455932617188 }, { "auxiliary_loss_clip": 0.06418695, "auxiliary_loss_mlp": 0.01263653, "balance_loss_clip": 0.06277083, "balance_loss_mlp": 0.01254456, "epoch": 0.735247256876597, "flos": 17938126874880.0, "grad_norm": 1.4752873016118266, "language_loss": 0.63660997, "learning_rate": 6.911885335688427e-07, "loss": 0.7134335, "num_input_tokens_seen": 263893390, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09197998, "step": 12229, "time_per_iteration": 2.53515362739563 }, { "auxiliary_loss_clip": 0.06421131, "auxiliary_loss_mlp": 0.01264759, "balance_loss_clip": 0.06276426, "balance_loss_mlp": 0.01253988, "epoch": 0.735307380129265, "flos": 28882484064000.0, "grad_norm": 1.555322237188254, "language_loss": 0.73564583, "learning_rate": 6.908940694298726e-07, "loss": 0.81250477, "num_input_tokens_seen": 263911180, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.10766602, "step": 12230, "time_per_iteration": 2.59507417678833 }, { "auxiliary_loss_clip": 0.06420799, "auxiliary_loss_mlp": 0.01267158, "balance_loss_clip": 0.06278205, "balance_loss_mlp": 0.01256745, "epoch": 0.7353675033819329, "flos": 13630691560320.0, "grad_norm": 2.0272983392045227, "language_loss": 0.72403264, "learning_rate": 6.90599654932332e-07, "loss": 0.80091214, "num_input_tokens_seen": 263928975, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.10412598, "step": 12231, "time_per_iteration": 2.529949426651001 }, { "auxiliary_loss_clip": 0.06425554, "auxiliary_loss_mlp": 0.01270484, "balance_loss_clip": 0.06281437, "balance_loss_mlp": 0.01260136, "epoch": 0.7354276266346009, "flos": 19469040497280.0, "grad_norm": 2.114211390991936, "language_loss": 0.64309496, "learning_rate": 6.903052900873823e-07, "loss": 0.72005534, "num_input_tokens_seen": 263944495, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.10333252, "step": 12232, "time_per_iteration": 2.5322797298431396 }, { "auxiliary_loss_clip": 0.06418667, "auxiliary_loss_mlp": 0.01269567, "balance_loss_clip": 0.0627637, "balance_loss_mlp": 0.01259429, "epoch": 0.735487749887269, "flos": 15776170554240.0, "grad_norm": 1.7549661696126417, "language_loss": 0.75668287, "learning_rate": 6.900109749061874e-07, "loss": 0.83356524, "num_input_tokens_seen": 263961325, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.10137939, "step": 12233, "time_per_iteration": 2.53078031539917 }, { "auxiliary_loss_clip": 0.06420462, "auxiliary_loss_mlp": 0.01267799, "balance_loss_clip": 0.06278011, "balance_loss_mlp": 0.0125775, "epoch": 0.7355478731399369, "flos": 18266673934080.0, "grad_norm": 1.45262324051379, "language_loss": 0.73593301, "learning_rate": 6.897167093999079e-07, "loss": 0.81281561, "num_input_tokens_seen": 263980445, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10046387, "step": 12234, "time_per_iteration": 3.9513065814971924 }, { "auxiliary_loss_clip": 0.0642026, "auxiliary_loss_mlp": 0.01267399, "balance_loss_clip": 0.06277868, "balance_loss_mlp": 0.01257874, "epoch": 0.7356079963926049, "flos": 26549307924480.0, "grad_norm": 2.2101473662901108, "language_loss": 0.60465479, "learning_rate": 6.894224935797017e-07, "loss": 0.68153143, "num_input_tokens_seen": 263999330, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09521484, "step": 12235, "time_per_iteration": 2.5941054821014404 }, { "auxiliary_loss_clip": 0.06416449, "auxiliary_loss_mlp": 0.01264373, "balance_loss_clip": 0.06277706, "balance_loss_mlp": 0.01254914, "epoch": 0.7356681196452728, "flos": 10782990224640.0, "grad_norm": 2.190316823084192, "language_loss": 0.86431563, "learning_rate": 6.891283274567259e-07, "loss": 0.94112384, "num_input_tokens_seen": 264014150, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09460449, "step": 12236, "time_per_iteration": 2.511399269104004 }, { "auxiliary_loss_clip": 0.06419072, "auxiliary_loss_mlp": 0.01262281, "balance_loss_clip": 0.06276187, "balance_loss_mlp": 0.01252935, "epoch": 0.7357282428979408, "flos": 19724730831360.0, "grad_norm": 1.5902387523010195, "language_loss": 0.69341087, "learning_rate": 6.888342110421364e-07, "loss": 0.77022445, "num_input_tokens_seen": 264033140, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.09344482, "step": 12237, "time_per_iteration": 2.580277681350708 }, { "auxiliary_loss_clip": 0.06417593, "auxiliary_loss_mlp": 0.01263819, "balance_loss_clip": 0.06275326, "balance_loss_mlp": 0.01254109, "epoch": 0.7357883661506087, "flos": 19470130600320.0, "grad_norm": 1.5596710490673689, "language_loss": 0.7245754, "learning_rate": 6.885401443470839e-07, "loss": 0.80138952, "num_input_tokens_seen": 264052105, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09710693, "step": 12238, "time_per_iteration": 2.539108991622925 }, { "auxiliary_loss_clip": 0.06430542, "auxiliary_loss_mlp": 0.0126719, "balance_loss_clip": 0.06280716, "balance_loss_mlp": 0.01256909, "epoch": 0.7358484894032767, "flos": 27129897665280.0, "grad_norm": 1.823329728466741, "language_loss": 0.72449374, "learning_rate": 6.882461273827205e-07, "loss": 0.80147105, "num_input_tokens_seen": 264070690, "router_z_loss_clip": 1.49804688, "router_z_loss_mlp": 0.10284424, "step": 12239, "time_per_iteration": 4.023684740066528 }, { "auxiliary_loss_clip": 0.06413434, "auxiliary_loss_mlp": 0.01265488, "balance_loss_clip": 0.06276633, "balance_loss_mlp": 0.01256941, "epoch": 0.7359086126559446, "flos": 24509780818560.0, "grad_norm": 1.4637139702439836, "language_loss": 0.79201454, "learning_rate": 6.879521601601954e-07, "loss": 0.86880374, "num_input_tokens_seen": 264094225, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.08544922, "step": 12240, "time_per_iteration": 2.6412301063537598 }, { "auxiliary_loss_clip": 0.06422073, "auxiliary_loss_mlp": 0.0127043, "balance_loss_clip": 0.06281929, "balance_loss_mlp": 0.01260988, "epoch": 0.7359687359086127, "flos": 23337993795840.0, "grad_norm": 1.6579304244450601, "language_loss": 0.83169162, "learning_rate": 6.876582426906565e-07, "loss": 0.90861666, "num_input_tokens_seen": 264113190, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09442139, "step": 12241, "time_per_iteration": 2.5639162063598633 }, { "auxiliary_loss_clip": 0.06416931, "auxiliary_loss_mlp": 0.01264833, "balance_loss_clip": 0.06278896, "balance_loss_mlp": 0.01255493, "epoch": 0.7360288591612806, "flos": 20199578319360.0, "grad_norm": 1.837823729377287, "language_loss": 0.79406506, "learning_rate": 6.873643749852484e-07, "loss": 0.87088269, "num_input_tokens_seen": 264132050, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09338379, "step": 12242, "time_per_iteration": 2.530242919921875 }, { "auxiliary_loss_clip": 0.06414156, "auxiliary_loss_mlp": 0.01267173, "balance_loss_clip": 0.06275675, "balance_loss_mlp": 0.01258065, "epoch": 0.7360889824139486, "flos": 24979722842880.0, "grad_norm": 2.073797405081722, "language_loss": 0.80086577, "learning_rate": 6.870705570551145e-07, "loss": 0.87767905, "num_input_tokens_seen": 264152800, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09106445, "step": 12243, "time_per_iteration": 2.6041574478149414 }, { "auxiliary_loss_clip": 0.06419785, "auxiliary_loss_mlp": 0.01265659, "balance_loss_clip": 0.06274835, "balance_loss_mlp": 0.01255205, "epoch": 0.7361491056666165, "flos": 15017610741120.0, "grad_norm": 2.7016608344541893, "language_loss": 0.73852944, "learning_rate": 6.867767889113969e-07, "loss": 0.81538391, "num_input_tokens_seen": 264169650, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.10461426, "step": 12244, "time_per_iteration": 2.5385711193084717 }, { "auxiliary_loss_clip": 0.06419608, "auxiliary_loss_mlp": 0.01266282, "balance_loss_clip": 0.0627517, "balance_loss_mlp": 0.0125671, "epoch": 0.7362092289192845, "flos": 22937135063040.0, "grad_norm": 1.580295385908335, "language_loss": 0.69576269, "learning_rate": 6.864830705652347e-07, "loss": 0.77262163, "num_input_tokens_seen": 264190530, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.09576416, "step": 12245, "time_per_iteration": 2.571214437484741 }, { "auxiliary_loss_clip": 0.06410863, "auxiliary_loss_mlp": 0.01267292, "balance_loss_clip": 0.06275086, "balance_loss_mlp": 0.01257248, "epoch": 0.7362693521719526, "flos": 20708694927360.0, "grad_norm": 1.465110864161276, "language_loss": 0.73272151, "learning_rate": 6.861894020277658e-07, "loss": 0.80950308, "num_input_tokens_seen": 264210820, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.10040283, "step": 12246, "time_per_iteration": 2.5607995986938477 }, { "auxiliary_loss_clip": 0.0641012, "auxiliary_loss_mlp": 0.01269404, "balance_loss_clip": 0.06274663, "balance_loss_mlp": 0.01260941, "epoch": 0.7363294754246205, "flos": 13115747093760.0, "grad_norm": 1.9904849165038612, "language_loss": 0.73586863, "learning_rate": 6.858957833101266e-07, "loss": 0.81266391, "num_input_tokens_seen": 264227430, "router_z_loss_clip": 1.35253906, "router_z_loss_mlp": 0.08462524, "step": 12247, "time_per_iteration": 2.5091493129730225 }, { "auxiliary_loss_clip": 0.06416221, "auxiliary_loss_mlp": 0.01264762, "balance_loss_clip": 0.06278418, "balance_loss_mlp": 0.01254903, "epoch": 0.7363895986772885, "flos": 14032598469120.0, "grad_norm": 1.5627275279137198, "language_loss": 0.74190366, "learning_rate": 6.856022144234526e-07, "loss": 0.81871355, "num_input_tokens_seen": 264245230, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09857178, "step": 12248, "time_per_iteration": 2.531367778778076 }, { "auxiliary_loss_clip": 0.06419308, "auxiliary_loss_mlp": 0.01270168, "balance_loss_clip": 0.06275897, "balance_loss_mlp": 0.01259922, "epoch": 0.7364497219299564, "flos": 19726240204800.0, "grad_norm": 1.8019929156100452, "language_loss": 0.72776324, "learning_rate": 6.853086953788727e-07, "loss": 0.804658, "num_input_tokens_seen": 264263945, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10253906, "step": 12249, "time_per_iteration": 2.5404622554779053 }, { "auxiliary_loss_clip": 0.06417698, "auxiliary_loss_mlp": 0.01266937, "balance_loss_clip": 0.06276903, "balance_loss_mlp": 0.01257133, "epoch": 0.7365098451826244, "flos": 21367843470720.0, "grad_norm": 1.7492890658967906, "language_loss": 0.7726084, "learning_rate": 6.850152261875189e-07, "loss": 0.84945476, "num_input_tokens_seen": 264281500, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09802246, "step": 12250, "time_per_iteration": 2.5649802684783936 }, { "auxiliary_loss_clip": 0.0641971, "auxiliary_loss_mlp": 0.012658, "balance_loss_clip": 0.06276422, "balance_loss_mlp": 0.01255435, "epoch": 0.7365699684352923, "flos": 23375030100480.0, "grad_norm": 1.6096061389644267, "language_loss": 0.71090764, "learning_rate": 6.8472180686052e-07, "loss": 0.78776276, "num_input_tokens_seen": 264301625, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.10369873, "step": 12251, "time_per_iteration": 2.5756044387817383 }, { "auxiliary_loss_clip": 0.06416152, "auxiliary_loss_mlp": 0.01263458, "balance_loss_clip": 0.06276153, "balance_loss_mlp": 0.01254338, "epoch": 0.7366300916879603, "flos": 59537610380160.0, "grad_norm": 1.4580924010614669, "language_loss": 0.65713668, "learning_rate": 6.844284374090015e-07, "loss": 0.73393273, "num_input_tokens_seen": 264323975, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09118652, "step": 12252, "time_per_iteration": 2.9176721572875977 }, { "auxiliary_loss_clip": 0.06423207, "auxiliary_loss_mlp": 0.01266677, "balance_loss_clip": 0.06279293, "balance_loss_mlp": 0.01256968, "epoch": 0.7366902149406283, "flos": 20929445308800.0, "grad_norm": 1.6195278807183264, "language_loss": 0.79172879, "learning_rate": 6.841351178440884e-07, "loss": 0.86862767, "num_input_tokens_seen": 264343785, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.09710693, "step": 12253, "time_per_iteration": 2.5597941875457764 }, { "auxiliary_loss_clip": 0.06414798, "auxiliary_loss_mlp": 0.01262785, "balance_loss_clip": 0.06276107, "balance_loss_mlp": 0.01254035, "epoch": 0.7367503381932963, "flos": 17353973335680.0, "grad_norm": 1.9356712926209445, "language_loss": 0.76459748, "learning_rate": 6.83841848176905e-07, "loss": 0.84137332, "num_input_tokens_seen": 264361130, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.08746338, "step": 12254, "time_per_iteration": 2.5371451377868652 }, { "auxiliary_loss_clip": 0.06419605, "auxiliary_loss_mlp": 0.01266023, "balance_loss_clip": 0.06279281, "balance_loss_mlp": 0.01255747, "epoch": 0.7368104614459642, "flos": 17827017960960.0, "grad_norm": 2.4878982110580448, "language_loss": 0.69384134, "learning_rate": 6.835486284185692e-07, "loss": 0.77069759, "num_input_tokens_seen": 264376965, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.1027832, "step": 12255, "time_per_iteration": 2.516010284423828 }, { "auxiliary_loss_clip": 0.06424049, "auxiliary_loss_mlp": 0.01266469, "balance_loss_clip": 0.06281728, "balance_loss_mlp": 0.0125611, "epoch": 0.7368705846986322, "flos": 24612672032640.0, "grad_norm": 1.6920368584186192, "language_loss": 0.75454736, "learning_rate": 6.832554585802012e-07, "loss": 0.83145249, "num_input_tokens_seen": 264396310, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.10357666, "step": 12256, "time_per_iteration": 2.5767242908477783 }, { "auxiliary_loss_clip": 0.06420961, "auxiliary_loss_mlp": 0.01264047, "balance_loss_clip": 0.06278332, "balance_loss_mlp": 0.0125426, "epoch": 0.7369307079513001, "flos": 34978829322240.0, "grad_norm": 1.6986991190859138, "language_loss": 0.74162352, "learning_rate": 6.829623386729182e-07, "loss": 0.81847364, "num_input_tokens_seen": 264418085, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09796143, "step": 12257, "time_per_iteration": 4.137256622314453 }, { "auxiliary_loss_clip": 0.06416324, "auxiliary_loss_mlp": 0.0126442, "balance_loss_clip": 0.06276241, "balance_loss_mlp": 0.01255345, "epoch": 0.7369908312039681, "flos": 21220872209280.0, "grad_norm": 1.4750457853215617, "language_loss": 0.78292823, "learning_rate": 6.826692687078362e-07, "loss": 0.85973561, "num_input_tokens_seen": 264437595, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09072876, "step": 12258, "time_per_iteration": 2.5509212017059326 }, { "auxiliary_loss_clip": 0.06424864, "auxiliary_loss_mlp": 0.0126682, "balance_loss_clip": 0.0628041, "balance_loss_mlp": 0.0125736, "epoch": 0.7370509544566362, "flos": 23630510799360.0, "grad_norm": 1.4640147031329123, "language_loss": 0.66621178, "learning_rate": 6.823762486960674e-07, "loss": 0.7431286, "num_input_tokens_seen": 264457385, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.09454346, "step": 12259, "time_per_iteration": 2.601034164428711 }, { "auxiliary_loss_clip": 0.06420783, "auxiliary_loss_mlp": 0.01264331, "balance_loss_clip": 0.06278852, "balance_loss_mlp": 0.0125449, "epoch": 0.7371110777093041, "flos": 24834764079360.0, "grad_norm": 1.549868114526836, "language_loss": 0.73303711, "learning_rate": 6.820832786487225e-07, "loss": 0.80988824, "num_input_tokens_seen": 264477205, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.09838867, "step": 12260, "time_per_iteration": 2.5800225734710693 }, { "auxiliary_loss_clip": 0.06419109, "auxiliary_loss_mlp": 0.01266362, "balance_loss_clip": 0.06277283, "balance_loss_mlp": 0.01256366, "epoch": 0.7371712009619721, "flos": 23156292216960.0, "grad_norm": 2.368637782099124, "language_loss": 0.73794103, "learning_rate": 6.817903585769125e-07, "loss": 0.81479579, "num_input_tokens_seen": 264497195, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.10003662, "step": 12261, "time_per_iteration": 2.561117649078369 }, { "auxiliary_loss_clip": 0.06424304, "auxiliary_loss_mlp": 0.01267008, "balance_loss_clip": 0.06278354, "balance_loss_mlp": 0.01256268, "epoch": 0.73723132421464, "flos": 23119675182720.0, "grad_norm": 2.421433799640992, "language_loss": 0.67310023, "learning_rate": 6.814974884917438e-07, "loss": 0.75001335, "num_input_tokens_seen": 264516950, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10736084, "step": 12262, "time_per_iteration": 2.5621352195739746 }, { "auxiliary_loss_clip": 0.06421417, "auxiliary_loss_mlp": 0.01266399, "balance_loss_clip": 0.06278624, "balance_loss_mlp": 0.01256636, "epoch": 0.737291447467308, "flos": 19278031115520.0, "grad_norm": 1.9901680260870622, "language_loss": 0.88344693, "learning_rate": 6.81204668404322e-07, "loss": 0.96032506, "num_input_tokens_seen": 264532675, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.09765625, "step": 12263, "time_per_iteration": 4.0060577392578125 }, { "auxiliary_loss_clip": 0.06411244, "auxiliary_loss_mlp": 0.0126246, "balance_loss_clip": 0.06276671, "balance_loss_mlp": 0.01253913, "epoch": 0.7373515707199759, "flos": 25125142803840.0, "grad_norm": 1.72767641483648, "language_loss": 0.67674267, "learning_rate": 6.809118983257522e-07, "loss": 0.75347972, "num_input_tokens_seen": 264555635, "router_z_loss_clip": 1.34570312, "router_z_loss_mlp": 0.08544922, "step": 12264, "time_per_iteration": 2.60787034034729 }, { "auxiliary_loss_clip": 0.06412829, "auxiliary_loss_mlp": 0.01264486, "balance_loss_clip": 0.06274024, "balance_loss_mlp": 0.01255391, "epoch": 0.737411693972644, "flos": 32415427290240.0, "grad_norm": 1.664714392675998, "language_loss": 0.80071998, "learning_rate": 6.806191782671356e-07, "loss": 0.87749302, "num_input_tokens_seen": 264573140, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09094238, "step": 12265, "time_per_iteration": 2.6391069889068604 }, { "auxiliary_loss_clip": 0.06426036, "auxiliary_loss_mlp": 0.01264728, "balance_loss_clip": 0.06278405, "balance_loss_mlp": 0.01254506, "epoch": 0.7374718172253119, "flos": 24322586797440.0, "grad_norm": 1.688378506071776, "language_loss": 0.74672824, "learning_rate": 6.803265082395711e-07, "loss": 0.82363588, "num_input_tokens_seen": 264591610, "router_z_loss_clip": 1.47558594, "router_z_loss_mlp": 0.10223389, "step": 12266, "time_per_iteration": 2.580040454864502 }, { "auxiliary_loss_clip": 0.06419626, "auxiliary_loss_mlp": 0.01265156, "balance_loss_clip": 0.06277716, "balance_loss_mlp": 0.01255202, "epoch": 0.7375319404779799, "flos": 27162447776640.0, "grad_norm": 1.6917871181978128, "language_loss": 0.73659921, "learning_rate": 6.800338882541576e-07, "loss": 0.81344712, "num_input_tokens_seen": 264611170, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.0994873, "step": 12267, "time_per_iteration": 2.612699031829834 }, { "auxiliary_loss_clip": 0.06417261, "auxiliary_loss_mlp": 0.01263921, "balance_loss_clip": 0.06275497, "balance_loss_mlp": 0.0125513, "epoch": 0.7375920637306478, "flos": 18885977061120.0, "grad_norm": 1.9080943947566409, "language_loss": 0.83218288, "learning_rate": 6.797413183219923e-07, "loss": 0.90899479, "num_input_tokens_seen": 264629365, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.08789062, "step": 12268, "time_per_iteration": 2.5202670097351074 }, { "auxiliary_loss_clip": 0.06415407, "auxiliary_loss_mlp": 0.01268358, "balance_loss_clip": 0.06275976, "balance_loss_mlp": 0.01258559, "epoch": 0.7376521869833158, "flos": 15675291838080.0, "grad_norm": 1.7208158943916747, "language_loss": 0.73556465, "learning_rate": 6.794487984541677e-07, "loss": 0.81240225, "num_input_tokens_seen": 264647915, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09802246, "step": 12269, "time_per_iteration": 2.547368049621582 }, { "auxiliary_loss_clip": 0.06421763, "auxiliary_loss_mlp": 0.01266386, "balance_loss_clip": 0.06277274, "balance_loss_mlp": 0.01256092, "epoch": 0.7377123102359837, "flos": 36980146166400.0, "grad_norm": 1.9278714967817014, "language_loss": 0.70868409, "learning_rate": 6.791563286617776e-07, "loss": 0.78556561, "num_input_tokens_seen": 264669620, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.10302734, "step": 12270, "time_per_iteration": 2.6917684078216553 }, { "auxiliary_loss_clip": 0.06416279, "auxiliary_loss_mlp": 0.01265379, "balance_loss_clip": 0.0627685, "balance_loss_mlp": 0.01256969, "epoch": 0.7377724334886517, "flos": 24502779002880.0, "grad_norm": 1.6284154120311525, "language_loss": 0.69745994, "learning_rate": 6.788639089559119e-07, "loss": 0.77427649, "num_input_tokens_seen": 264689345, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.08404541, "step": 12271, "time_per_iteration": 2.618985414505005 }, { "auxiliary_loss_clip": 0.06416749, "auxiliary_loss_mlp": 0.01265578, "balance_loss_clip": 0.0627429, "balance_loss_mlp": 0.01255255, "epoch": 0.7378325567413198, "flos": 24397036750080.0, "grad_norm": 1.9439424613104233, "language_loss": 0.679232, "learning_rate": 6.785715393476586e-07, "loss": 0.7560553, "num_input_tokens_seen": 264707625, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10327148, "step": 12272, "time_per_iteration": 2.5783419609069824 }, { "auxiliary_loss_clip": 0.06414413, "auxiliary_loss_mlp": 0.01265674, "balance_loss_clip": 0.06276452, "balance_loss_mlp": 0.0125553, "epoch": 0.7378926799939877, "flos": 17421421472640.0, "grad_norm": 1.7869900799559002, "language_loss": 0.78538537, "learning_rate": 6.782792198481049e-07, "loss": 0.86218619, "num_input_tokens_seen": 264725575, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.10150146, "step": 12273, "time_per_iteration": 3.9612948894500732 }, { "auxiliary_loss_clip": 0.06414378, "auxiliary_loss_mlp": 0.01267993, "balance_loss_clip": 0.06273678, "balance_loss_mlp": 0.01258408, "epoch": 0.7379528032466557, "flos": 18479374323840.0, "grad_norm": 2.254774212068315, "language_loss": 0.83769727, "learning_rate": 6.779869504683355e-07, "loss": 0.91452098, "num_input_tokens_seen": 264742855, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09588623, "step": 12274, "time_per_iteration": 2.56597900390625 }, { "auxiliary_loss_clip": 0.06428609, "auxiliary_loss_mlp": 0.01270322, "balance_loss_clip": 0.06278851, "balance_loss_mlp": 0.01259945, "epoch": 0.7380129264993236, "flos": 17827814574720.0, "grad_norm": 2.283039297946445, "language_loss": 0.74084592, "learning_rate": 6.776947312194341e-07, "loss": 0.81783515, "num_input_tokens_seen": 264761155, "router_z_loss_clip": 1.49511719, "router_z_loss_mlp": 0.10375977, "step": 12275, "time_per_iteration": 2.527627468109131 }, { "auxiliary_loss_clip": 0.06422642, "auxiliary_loss_mlp": 0.01268687, "balance_loss_clip": 0.06277032, "balance_loss_mlp": 0.01258662, "epoch": 0.7380730497519916, "flos": 23003115753600.0, "grad_norm": 2.724632282819473, "language_loss": 0.73946035, "learning_rate": 6.774025621124813e-07, "loss": 0.81637359, "num_input_tokens_seen": 264780660, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.10028076, "step": 12276, "time_per_iteration": 2.5584988594055176 }, { "auxiliary_loss_clip": 0.06420486, "auxiliary_loss_mlp": 0.01261972, "balance_loss_clip": 0.06277563, "balance_loss_mlp": 0.01252501, "epoch": 0.7381331730046595, "flos": 20272435044480.0, "grad_norm": 2.0234662559442755, "language_loss": 0.77990156, "learning_rate": 6.771104431585551e-07, "loss": 0.85672617, "num_input_tokens_seen": 264798850, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.09466553, "step": 12277, "time_per_iteration": 2.5386016368865967 }, { "auxiliary_loss_clip": 0.06418034, "auxiliary_loss_mlp": 0.01270094, "balance_loss_clip": 0.06278776, "balance_loss_mlp": 0.01260558, "epoch": 0.7381932962573275, "flos": 19760467397760.0, "grad_norm": 1.956309523575068, "language_loss": 0.79178691, "learning_rate": 6.768183743687338e-07, "loss": 0.8686682, "num_input_tokens_seen": 264816795, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09533691, "step": 12278, "time_per_iteration": 3.9542031288146973 }, { "auxiliary_loss_clip": 0.06421681, "auxiliary_loss_mlp": 0.01264104, "balance_loss_clip": 0.06277096, "balance_loss_mlp": 0.01254549, "epoch": 0.7382534195099955, "flos": 17310060996480.0, "grad_norm": 3.1697695163653448, "language_loss": 0.72113758, "learning_rate": 6.765263557540921e-07, "loss": 0.79799539, "num_input_tokens_seen": 264834105, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.09558105, "step": 12279, "time_per_iteration": 2.5254805088043213 }, { "auxiliary_loss_clip": 0.06422442, "auxiliary_loss_mlp": 0.01264943, "balance_loss_clip": 0.06277265, "balance_loss_mlp": 0.0125487, "epoch": 0.7383135427626635, "flos": 18703269233280.0, "grad_norm": 2.075358907785993, "language_loss": 0.86161387, "learning_rate": 6.762343873257034e-07, "loss": 0.93848771, "num_input_tokens_seen": 264850895, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10064697, "step": 12280, "time_per_iteration": 2.5307278633117676 }, { "auxiliary_loss_clip": 0.06423583, "auxiliary_loss_mlp": 0.01264625, "balance_loss_clip": 0.06279501, "balance_loss_mlp": 0.01254153, "epoch": 0.7383736660153314, "flos": 20886706926720.0, "grad_norm": 1.7505532931644086, "language_loss": 0.72275519, "learning_rate": 6.759424690946408e-07, "loss": 0.79963732, "num_input_tokens_seen": 264869505, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10467529, "step": 12281, "time_per_iteration": 2.5458195209503174 }, { "auxiliary_loss_clip": 0.06423591, "auxiliary_loss_mlp": 0.01268087, "balance_loss_clip": 0.06279694, "balance_loss_mlp": 0.01257936, "epoch": 0.7384337892679994, "flos": 20668723729920.0, "grad_norm": 1.770168508905231, "language_loss": 0.61167073, "learning_rate": 6.756506010719711e-07, "loss": 0.68858755, "num_input_tokens_seen": 264886915, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10162354, "step": 12282, "time_per_iteration": 2.552976131439209 }, { "auxiliary_loss_clip": 0.0642647, "auxiliary_loss_mlp": 0.01266642, "balance_loss_clip": 0.06280763, "balance_loss_mlp": 0.01256498, "epoch": 0.7384939125206673, "flos": 29177432835840.0, "grad_norm": 1.607192221178587, "language_loss": 0.6788637, "learning_rate": 6.753587832687632e-07, "loss": 0.75579488, "num_input_tokens_seen": 264910350, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10144043, "step": 12283, "time_per_iteration": 2.6077864170074463 }, { "auxiliary_loss_clip": 0.06418867, "auxiliary_loss_mlp": 0.01264624, "balance_loss_clip": 0.06278142, "balance_loss_mlp": 0.0125489, "epoch": 0.7385540357733353, "flos": 36320494498560.0, "grad_norm": 1.6178689057555697, "language_loss": 0.76164472, "learning_rate": 6.750670156960832e-07, "loss": 0.83847964, "num_input_tokens_seen": 264930705, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09741211, "step": 12284, "time_per_iteration": 2.7151100635528564 }, { "auxiliary_loss_clip": 0.06422719, "auxiliary_loss_mlp": 0.01263126, "balance_loss_clip": 0.06278402, "balance_loss_mlp": 0.01253024, "epoch": 0.7386141590260034, "flos": 20308758589440.0, "grad_norm": 2.033225491743784, "language_loss": 0.69770867, "learning_rate": 6.747752983649954e-07, "loss": 0.77456713, "num_input_tokens_seen": 264946975, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10089111, "step": 12285, "time_per_iteration": 2.5606656074523926 }, { "auxiliary_loss_clip": 0.06429967, "auxiliary_loss_mlp": 0.01267902, "balance_loss_clip": 0.06280784, "balance_loss_mlp": 0.01256923, "epoch": 0.7386742822786713, "flos": 25490851948800.0, "grad_norm": 2.15741665525567, "language_loss": 0.80085051, "learning_rate": 6.744836312865602e-07, "loss": 0.87782919, "num_input_tokens_seen": 264967665, "router_z_loss_clip": 1.49121094, "router_z_loss_mlp": 0.10986328, "step": 12286, "time_per_iteration": 2.5988357067108154 }, { "auxiliary_loss_clip": 0.06418728, "auxiliary_loss_mlp": 0.0126567, "balance_loss_clip": 0.06278359, "balance_loss_mlp": 0.01256092, "epoch": 0.7387344055313393, "flos": 13777075843200.0, "grad_norm": 1.9885798110064619, "language_loss": 0.6616348, "learning_rate": 6.741920144718396e-07, "loss": 0.73847884, "num_input_tokens_seen": 264985480, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09576416, "step": 12287, "time_per_iteration": 2.76961350440979 }, { "auxiliary_loss_clip": 0.0641176, "auxiliary_loss_mlp": 0.01263733, "balance_loss_clip": 0.06276185, "balance_loss_mlp": 0.01254763, "epoch": 0.7387945287840072, "flos": 27862615693440.0, "grad_norm": 1.6997755334485238, "language_loss": 0.76484036, "learning_rate": 6.739004479318903e-07, "loss": 0.84159529, "num_input_tokens_seen": 265004790, "router_z_loss_clip": 1.35546875, "router_z_loss_mlp": 0.08966064, "step": 12288, "time_per_iteration": 2.710304021835327 }, { "auxiliary_loss_clip": 0.06425518, "auxiliary_loss_mlp": 0.01269983, "balance_loss_clip": 0.06279318, "balance_loss_mlp": 0.01259045, "epoch": 0.7388546520366752, "flos": 44242492515840.0, "grad_norm": 3.321615794163892, "language_loss": 0.58227241, "learning_rate": 6.736089316777684e-07, "loss": 0.65922737, "num_input_tokens_seen": 265028790, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.109375, "step": 12289, "time_per_iteration": 2.7838377952575684 }, { "auxiliary_loss_clip": 0.06323142, "auxiliary_loss_mlp": 0.01255896, "balance_loss_clip": 0.06266245, "balance_loss_mlp": 0.01254468, "epoch": 0.7389147752893431, "flos": 70700145672960.0, "grad_norm": 0.6265621694963516, "language_loss": 0.49189866, "learning_rate": 6.733174657205287e-07, "loss": 0.56768906, "num_input_tokens_seen": 265096660, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01426697, "step": 12290, "time_per_iteration": 3.2939107418060303 }, { "auxiliary_loss_clip": 0.06419828, "auxiliary_loss_mlp": 0.01270053, "balance_loss_clip": 0.06276739, "balance_loss_mlp": 0.01258508, "epoch": 0.7389748985420111, "flos": 26002190689920.0, "grad_norm": 1.9508608400264542, "language_loss": 0.6753881, "learning_rate": 6.730260500712237e-07, "loss": 0.75228691, "num_input_tokens_seen": 265116375, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.11553955, "step": 12291, "time_per_iteration": 2.611442804336548 }, { "auxiliary_loss_clip": 0.06318922, "auxiliary_loss_mlp": 0.0125832, "balance_loss_clip": 0.06262241, "balance_loss_mlp": 0.01256965, "epoch": 0.7390350217946791, "flos": 54419428558080.0, "grad_norm": 0.9529851854484599, "language_loss": 0.60766184, "learning_rate": 6.727346847409052e-07, "loss": 0.68343431, "num_input_tokens_seen": 265161230, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01357269, "step": 12292, "time_per_iteration": 2.8478996753692627 }, { "auxiliary_loss_clip": 0.06418037, "auxiliary_loss_mlp": 0.01267154, "balance_loss_clip": 0.06278518, "balance_loss_mlp": 0.01258288, "epoch": 0.7390951450473471, "flos": 32205116741760.0, "grad_norm": 1.8436752060652517, "language_loss": 0.66989505, "learning_rate": 6.724433697406191e-07, "loss": 0.7467469, "num_input_tokens_seen": 265182515, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.08865356, "step": 12293, "time_per_iteration": 2.668013334274292 }, { "auxiliary_loss_clip": 0.06421173, "auxiliary_loss_mlp": 0.01263412, "balance_loss_clip": 0.06281008, "balance_loss_mlp": 0.01253922, "epoch": 0.739155268300015, "flos": 16688745371520.0, "grad_norm": 1.9449120496293337, "language_loss": 0.83517826, "learning_rate": 6.721521050814134e-07, "loss": 0.91202414, "num_input_tokens_seen": 265198160, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09484863, "step": 12294, "time_per_iteration": 2.584284543991089 }, { "auxiliary_loss_clip": 0.06412376, "auxiliary_loss_mlp": 0.01265609, "balance_loss_clip": 0.06274863, "balance_loss_mlp": 0.01255894, "epoch": 0.739215391552683, "flos": 31657831799040.0, "grad_norm": 1.5415270560121102, "language_loss": 0.72642535, "learning_rate": 6.718608907743337e-07, "loss": 0.80320519, "num_input_tokens_seen": 265218480, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09716797, "step": 12295, "time_per_iteration": 2.65415358543396 }, { "auxiliary_loss_clip": 0.06413896, "auxiliary_loss_mlp": 0.0126592, "balance_loss_clip": 0.06277359, "balance_loss_mlp": 0.01257147, "epoch": 0.7392755148053509, "flos": 29726688349440.0, "grad_norm": 1.6881673513018738, "language_loss": 0.78899634, "learning_rate": 6.715697268304215e-07, "loss": 0.86579454, "num_input_tokens_seen": 265240165, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.08764648, "step": 12296, "time_per_iteration": 2.692183256149292 }, { "auxiliary_loss_clip": 0.06415159, "auxiliary_loss_mlp": 0.0126677, "balance_loss_clip": 0.0627492, "balance_loss_mlp": 0.01256136, "epoch": 0.7393356380580189, "flos": 37059585436800.0, "grad_norm": 2.063361296128602, "language_loss": 0.6702134, "learning_rate": 6.712786132607182e-07, "loss": 0.74703264, "num_input_tokens_seen": 265263295, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.10626221, "step": 12297, "time_per_iteration": 4.1717870235443115 }, { "auxiliary_loss_clip": 0.0642196, "auxiliary_loss_mlp": 0.01265107, "balance_loss_clip": 0.06279683, "balance_loss_mlp": 0.01254688, "epoch": 0.739395761310687, "flos": 19725820934400.0, "grad_norm": 1.5634315401696084, "language_loss": 0.68669844, "learning_rate": 6.709875500762645e-07, "loss": 0.76356912, "num_input_tokens_seen": 265282740, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.10430908, "step": 12298, "time_per_iteration": 2.554670572280884 }, { "auxiliary_loss_clip": 0.06421144, "auxiliary_loss_mlp": 0.01265624, "balance_loss_clip": 0.06279299, "balance_loss_mlp": 0.012557, "epoch": 0.7394558845633549, "flos": 11806254685440.0, "grad_norm": 1.7434056582570863, "language_loss": 0.74375159, "learning_rate": 6.706965372880946e-07, "loss": 0.82061929, "num_input_tokens_seen": 265300175, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.09912109, "step": 12299, "time_per_iteration": 2.533867835998535 }, { "auxiliary_loss_clip": 0.06319776, "auxiliary_loss_mlp": 0.01254108, "balance_loss_clip": 0.06262524, "balance_loss_mlp": 0.01252828, "epoch": 0.7395160078160229, "flos": 66214782213120.0, "grad_norm": 0.7066495133017208, "language_loss": 0.60670251, "learning_rate": 6.704055749072455e-07, "loss": 0.68244135, "num_input_tokens_seen": 265363275, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01280212, "step": 12300, "time_per_iteration": 3.234323501586914 }, { "auxiliary_loss_clip": 0.06422384, "auxiliary_loss_mlp": 0.0126526, "balance_loss_clip": 0.06278721, "balance_loss_mlp": 0.01254967, "epoch": 0.7395761310686908, "flos": 21255770234880.0, "grad_norm": 2.106610607409074, "language_loss": 0.80201983, "learning_rate": 6.7011466294475e-07, "loss": 0.8788963, "num_input_tokens_seen": 265382935, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10290527, "step": 12301, "time_per_iteration": 2.5467591285705566 }, { "auxiliary_loss_clip": 0.06421535, "auxiliary_loss_mlp": 0.01264408, "balance_loss_clip": 0.06281412, "balance_loss_mlp": 0.01254955, "epoch": 0.7396362543213588, "flos": 25961967930240.0, "grad_norm": 1.7665337070608969, "language_loss": 0.73549169, "learning_rate": 6.698238014116406e-07, "loss": 0.81235117, "num_input_tokens_seen": 265403245, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09460449, "step": 12302, "time_per_iteration": 2.5917704105377197 }, { "auxiliary_loss_clip": 0.06426276, "auxiliary_loss_mlp": 0.01265217, "balance_loss_clip": 0.0628105, "balance_loss_mlp": 0.01255769, "epoch": 0.7396963775740267, "flos": 27384791385600.0, "grad_norm": 2.3219720637966668, "language_loss": 0.73841232, "learning_rate": 6.695329903189451e-07, "loss": 0.81532729, "num_input_tokens_seen": 265423105, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.09448242, "step": 12303, "time_per_iteration": 4.0351362228393555 }, { "auxiliary_loss_clip": 0.06419942, "auxiliary_loss_mlp": 0.01264375, "balance_loss_clip": 0.06280514, "balance_loss_mlp": 0.01255238, "epoch": 0.7397565008266948, "flos": 25527175493760.0, "grad_norm": 1.6156586920038738, "language_loss": 0.5383454, "learning_rate": 6.692422296776927e-07, "loss": 0.61518854, "num_input_tokens_seen": 265443445, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09136963, "step": 12304, "time_per_iteration": 2.5899319648742676 }, { "auxiliary_loss_clip": 0.06423016, "auxiliary_loss_mlp": 0.01262961, "balance_loss_clip": 0.06281702, "balance_loss_mlp": 0.01253502, "epoch": 0.7398166240793627, "flos": 23733737429760.0, "grad_norm": 1.9710255050506422, "language_loss": 0.85218114, "learning_rate": 6.689515194989084e-07, "loss": 0.92904091, "num_input_tokens_seen": 265462085, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09466553, "step": 12305, "time_per_iteration": 2.5794882774353027 }, { "auxiliary_loss_clip": 0.06323811, "auxiliary_loss_mlp": 0.01253374, "balance_loss_clip": 0.06266795, "balance_loss_mlp": 0.01252114, "epoch": 0.7398767473320307, "flos": 67289002755840.0, "grad_norm": 0.8680312698009991, "language_loss": 0.57792336, "learning_rate": 6.68660859793615e-07, "loss": 0.65369523, "num_input_tokens_seen": 265521190, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01259613, "step": 12306, "time_per_iteration": 3.1734442710876465 }, { "auxiliary_loss_clip": 0.06424855, "auxiliary_loss_mlp": 0.01266369, "balance_loss_clip": 0.06281915, "balance_loss_mlp": 0.01255998, "epoch": 0.7399368705846986, "flos": 22025356859520.0, "grad_norm": 1.9482611786771316, "language_loss": 0.82172185, "learning_rate": 6.683702505728355e-07, "loss": 0.89863414, "num_input_tokens_seen": 265539705, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10369873, "step": 12307, "time_per_iteration": 2.577113151550293 }, { "auxiliary_loss_clip": 0.06417985, "auxiliary_loss_mlp": 0.01265209, "balance_loss_clip": 0.06281534, "balance_loss_mlp": 0.01256274, "epoch": 0.7399969938373666, "flos": 14179150460160.0, "grad_norm": 1.668195857167932, "language_loss": 0.69922739, "learning_rate": 6.680796918475893e-07, "loss": 0.77605933, "num_input_tokens_seen": 265555855, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.08935547, "step": 12308, "time_per_iteration": 2.5304934978485107 }, { "auxiliary_loss_clip": 0.06416874, "auxiliary_loss_mlp": 0.01263732, "balance_loss_clip": 0.06279842, "balance_loss_mlp": 0.01254404, "epoch": 0.7400571170900345, "flos": 25308521464320.0, "grad_norm": 2.0139661314334636, "language_loss": 0.82013494, "learning_rate": 6.67789183628896e-07, "loss": 0.89694095, "num_input_tokens_seen": 265575455, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09332275, "step": 12309, "time_per_iteration": 2.6113717555999756 }, { "auxiliary_loss_clip": 0.06424172, "auxiliary_loss_mlp": 0.01271229, "balance_loss_clip": 0.06277643, "balance_loss_mlp": 0.01260805, "epoch": 0.7401172403427025, "flos": 22718019836160.0, "grad_norm": 1.87725441693044, "language_loss": 0.73104197, "learning_rate": 6.674987259277692e-07, "loss": 0.80799598, "num_input_tokens_seen": 265595250, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.10424805, "step": 12310, "time_per_iteration": 2.605830669403076 }, { "auxiliary_loss_clip": 0.06421679, "auxiliary_loss_mlp": 0.01266791, "balance_loss_clip": 0.06280123, "balance_loss_mlp": 0.01256098, "epoch": 0.7401773635953706, "flos": 18071639556480.0, "grad_norm": 2.722165773159531, "language_loss": 0.88830185, "learning_rate": 6.672083187552239e-07, "loss": 0.96518654, "num_input_tokens_seen": 265606945, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.10699463, "step": 12311, "time_per_iteration": 2.5240955352783203 }, { "auxiliary_loss_clip": 0.06420654, "auxiliary_loss_mlp": 0.01263312, "balance_loss_clip": 0.0627982, "balance_loss_mlp": 0.01254097, "epoch": 0.7402374868480385, "flos": 22718942231040.0, "grad_norm": 1.7370829725716714, "language_loss": 0.80348134, "learning_rate": 6.669179621222738e-07, "loss": 0.88032097, "num_input_tokens_seen": 265626115, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09210205, "step": 12312, "time_per_iteration": 2.5916926860809326 }, { "auxiliary_loss_clip": 0.06421612, "auxiliary_loss_mlp": 0.01265297, "balance_loss_clip": 0.06282271, "balance_loss_mlp": 0.01255832, "epoch": 0.7402976101007065, "flos": 22863272088960.0, "grad_norm": 1.6681582065147014, "language_loss": 0.78701359, "learning_rate": 6.666276560399273e-07, "loss": 0.86388266, "num_input_tokens_seen": 265646520, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09472656, "step": 12313, "time_per_iteration": 4.014758110046387 }, { "auxiliary_loss_clip": 0.06419937, "auxiliary_loss_mlp": 0.01265666, "balance_loss_clip": 0.06275079, "balance_loss_mlp": 0.01255098, "epoch": 0.7403577333533744, "flos": 12350143537920.0, "grad_norm": 1.948090959585417, "language_loss": 0.78909737, "learning_rate": 6.663374005191937e-07, "loss": 0.86595339, "num_input_tokens_seen": 265661875, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10565186, "step": 12314, "time_per_iteration": 2.5271267890930176 }, { "auxiliary_loss_clip": 0.0632749, "auxiliary_loss_mlp": 0.01250995, "balance_loss_clip": 0.06270049, "balance_loss_mlp": 0.01249609, "epoch": 0.7404178566060424, "flos": 60346189152000.0, "grad_norm": 0.8019841562671813, "language_loss": 0.55231917, "learning_rate": 6.660471955710809e-07, "loss": 0.62810403, "num_input_tokens_seen": 265721255, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.01386261, "step": 12315, "time_per_iteration": 3.145721673965454 }, { "auxiliary_loss_clip": 0.06416633, "auxiliary_loss_mlp": 0.01270379, "balance_loss_clip": 0.06279466, "balance_loss_mlp": 0.0126111, "epoch": 0.7404779798587103, "flos": 32022786257280.0, "grad_norm": 1.508789443502888, "language_loss": 0.79706824, "learning_rate": 6.65757041206591e-07, "loss": 0.87393844, "num_input_tokens_seen": 265743970, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.0927124, "step": 12316, "time_per_iteration": 2.6660478115081787 }, { "auxiliary_loss_clip": 0.06418546, "auxiliary_loss_mlp": 0.01261907, "balance_loss_clip": 0.06277329, "balance_loss_mlp": 0.01252632, "epoch": 0.7405381031113784, "flos": 12893571192960.0, "grad_norm": 1.6977892652391346, "language_loss": 0.74751318, "learning_rate": 6.654669374367275e-07, "loss": 0.82431769, "num_input_tokens_seen": 265760890, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09277344, "step": 12317, "time_per_iteration": 2.574230432510376 }, { "auxiliary_loss_clip": 0.06411727, "auxiliary_loss_mlp": 0.01267723, "balance_loss_clip": 0.06277144, "balance_loss_mlp": 0.01258836, "epoch": 0.7405982263640463, "flos": 20235189104640.0, "grad_norm": 1.526633877162974, "language_loss": 0.81774676, "learning_rate": 6.651768842724917e-07, "loss": 0.89454114, "num_input_tokens_seen": 265779600, "router_z_loss_clip": 1.34667969, "router_z_loss_mlp": 0.08886719, "step": 12318, "time_per_iteration": 4.064401865005493 }, { "auxiliary_loss_clip": 0.06423347, "auxiliary_loss_mlp": 0.01266297, "balance_loss_clip": 0.06279521, "balance_loss_mlp": 0.01256004, "epoch": 0.7406583496167143, "flos": 17573088562560.0, "grad_norm": 1.8503819052680759, "language_loss": 0.77124351, "learning_rate": 6.648868817248827e-07, "loss": 0.84813994, "num_input_tokens_seen": 265797030, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.10296631, "step": 12319, "time_per_iteration": 2.5234298706054688 }, { "auxiliary_loss_clip": 0.06419543, "auxiliary_loss_mlp": 0.01263101, "balance_loss_clip": 0.06280033, "balance_loss_mlp": 0.01254286, "epoch": 0.7407184728693822, "flos": 18301530032640.0, "grad_norm": 2.2592973119334316, "language_loss": 0.64481646, "learning_rate": 6.64596929804897e-07, "loss": 0.72164285, "num_input_tokens_seen": 265815055, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.08807373, "step": 12320, "time_per_iteration": 2.5203263759613037 }, { "auxiliary_loss_clip": 0.06427357, "auxiliary_loss_mlp": 0.0126254, "balance_loss_clip": 0.06281488, "balance_loss_mlp": 0.01252378, "epoch": 0.7407785961220502, "flos": 16696124530560.0, "grad_norm": 2.9901488629011874, "language_loss": 0.83092415, "learning_rate": 6.643070285235288e-07, "loss": 0.90782309, "num_input_tokens_seen": 265828480, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10168457, "step": 12321, "time_per_iteration": 2.520336866378784 }, { "auxiliary_loss_clip": 0.06425333, "auxiliary_loss_mlp": 0.01271516, "balance_loss_clip": 0.06277335, "balance_loss_mlp": 0.01259827, "epoch": 0.7408387193747181, "flos": 22094440151040.0, "grad_norm": 3.6528604325944283, "language_loss": 0.7249316, "learning_rate": 6.640171778917727e-07, "loss": 0.80190009, "num_input_tokens_seen": 265845825, "router_z_loss_clip": 1.47949219, "router_z_loss_mlp": 0.11694336, "step": 12322, "time_per_iteration": 2.549067735671997 }, { "auxiliary_loss_clip": 0.06423791, "auxiliary_loss_mlp": 0.0126404, "balance_loss_clip": 0.06281795, "balance_loss_mlp": 0.01253818, "epoch": 0.7408988426273861, "flos": 24242476694400.0, "grad_norm": 1.9415673305034595, "language_loss": 0.64083087, "learning_rate": 6.637273779206183e-07, "loss": 0.71770918, "num_input_tokens_seen": 265866335, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.10211182, "step": 12323, "time_per_iteration": 2.6054866313934326 }, { "auxiliary_loss_clip": 0.06424734, "auxiliary_loss_mlp": 0.01265238, "balance_loss_clip": 0.0628061, "balance_loss_mlp": 0.01255105, "epoch": 0.7409589658800542, "flos": 29030671209600.0, "grad_norm": 1.374399580543986, "language_loss": 0.76054311, "learning_rate": 6.634376286210559e-07, "loss": 0.83744287, "num_input_tokens_seen": 265888945, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10131836, "step": 12324, "time_per_iteration": 2.6333014965057373 }, { "auxiliary_loss_clip": 0.06420071, "auxiliary_loss_mlp": 0.01264554, "balance_loss_clip": 0.06279578, "balance_loss_mlp": 0.01254457, "epoch": 0.7410190891327221, "flos": 19356925334400.0, "grad_norm": 2.600958174437463, "language_loss": 0.74643749, "learning_rate": 6.63147930004073e-07, "loss": 0.82328367, "num_input_tokens_seen": 265908030, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.10089111, "step": 12325, "time_per_iteration": 2.535104990005493 }, { "auxiliary_loss_clip": 0.06431052, "auxiliary_loss_mlp": 0.01266814, "balance_loss_clip": 0.06282742, "balance_loss_mlp": 0.01256973, "epoch": 0.7410792123853901, "flos": 22754301454080.0, "grad_norm": 2.167945337619082, "language_loss": 0.6850419, "learning_rate": 6.628582820806545e-07, "loss": 0.76202059, "num_input_tokens_seen": 265927030, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.09844971, "step": 12326, "time_per_iteration": 2.581860065460205 }, { "auxiliary_loss_clip": 0.06419161, "auxiliary_loss_mlp": 0.01264566, "balance_loss_clip": 0.0627737, "balance_loss_mlp": 0.01254922, "epoch": 0.741139335638058, "flos": 25379156056320.0, "grad_norm": 1.5725647089965693, "language_loss": 0.89780158, "learning_rate": 6.625686848617835e-07, "loss": 0.97463882, "num_input_tokens_seen": 265945490, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09643555, "step": 12327, "time_per_iteration": 2.5557615756988525 }, { "auxiliary_loss_clip": 0.06420018, "auxiliary_loss_mlp": 0.01270371, "balance_loss_clip": 0.0627905, "balance_loss_mlp": 0.01260089, "epoch": 0.741199458890726, "flos": 18591154070400.0, "grad_norm": 2.079499470360175, "language_loss": 0.85649848, "learning_rate": 6.62279138358442e-07, "loss": 0.93340236, "num_input_tokens_seen": 265963265, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.10284424, "step": 12328, "time_per_iteration": 2.5431036949157715 }, { "auxiliary_loss_clip": 0.0641596, "auxiliary_loss_mlp": 0.01265196, "balance_loss_clip": 0.06278127, "balance_loss_mlp": 0.01255779, "epoch": 0.7412595821433939, "flos": 22133572807680.0, "grad_norm": 1.7114590722247411, "language_loss": 0.67337024, "learning_rate": 6.619896425816103e-07, "loss": 0.75018179, "num_input_tokens_seen": 265982270, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09417725, "step": 12329, "time_per_iteration": 2.5565106868743896 }, { "auxiliary_loss_clip": 0.06429514, "auxiliary_loss_mlp": 0.01267642, "balance_loss_clip": 0.06281705, "balance_loss_mlp": 0.01257461, "epoch": 0.741319705396062, "flos": 29177516689920.0, "grad_norm": 1.6777407550330334, "language_loss": 0.6724214, "learning_rate": 6.617001975422647e-07, "loss": 0.74939299, "num_input_tokens_seen": 266003835, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.10180664, "step": 12330, "time_per_iteration": 2.6397905349731445 }, { "auxiliary_loss_clip": 0.06428992, "auxiliary_loss_mlp": 0.01264334, "balance_loss_clip": 0.06281897, "balance_loss_mlp": 0.0125348, "epoch": 0.7413798286487299, "flos": 20673713047680.0, "grad_norm": 1.9791568170230367, "language_loss": 0.8576867, "learning_rate": 6.614108032513823e-07, "loss": 0.93461996, "num_input_tokens_seen": 266021595, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.10852051, "step": 12331, "time_per_iteration": 2.564638614654541 }, { "auxiliary_loss_clip": 0.06425855, "auxiliary_loss_mlp": 0.01263615, "balance_loss_clip": 0.06281981, "balance_loss_mlp": 0.01253566, "epoch": 0.7414399519013979, "flos": 16404446067840.0, "grad_norm": 1.9577322204129632, "language_loss": 0.70346552, "learning_rate": 6.611214597199364e-07, "loss": 0.78036022, "num_input_tokens_seen": 266039860, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10040283, "step": 12332, "time_per_iteration": 2.56561279296875 }, { "auxiliary_loss_clip": 0.06422469, "auxiliary_loss_mlp": 0.01267792, "balance_loss_clip": 0.06281143, "balance_loss_mlp": 0.01257802, "epoch": 0.7415000751540658, "flos": 25637403939840.0, "grad_norm": 1.9232628876237485, "language_loss": 0.63433838, "learning_rate": 6.608321669588984e-07, "loss": 0.71124101, "num_input_tokens_seen": 266058050, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09991455, "step": 12333, "time_per_iteration": 2.5945589542388916 }, { "auxiliary_loss_clip": 0.06418108, "auxiliary_loss_mlp": 0.01266182, "balance_loss_clip": 0.06281752, "balance_loss_mlp": 0.01257245, "epoch": 0.7415601984067338, "flos": 24506803998720.0, "grad_norm": 1.6484119632428087, "language_loss": 0.71401572, "learning_rate": 6.605429249792387e-07, "loss": 0.79085869, "num_input_tokens_seen": 266078060, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.08926392, "step": 12334, "time_per_iteration": 2.694418430328369 }, { "auxiliary_loss_clip": 0.06415831, "auxiliary_loss_mlp": 0.01263642, "balance_loss_clip": 0.06275728, "balance_loss_mlp": 0.01254427, "epoch": 0.7416203216594017, "flos": 20893541034240.0, "grad_norm": 1.6226984093130852, "language_loss": 0.82699281, "learning_rate": 6.602537337919257e-07, "loss": 0.90378755, "num_input_tokens_seen": 266097110, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09216309, "step": 12335, "time_per_iteration": 2.704423666000366 }, { "auxiliary_loss_clip": 0.06418143, "auxiliary_loss_mlp": 0.01266614, "balance_loss_clip": 0.0627476, "balance_loss_mlp": 0.01256749, "epoch": 0.7416804449120697, "flos": 15628276897920.0, "grad_norm": 2.5671951133772724, "language_loss": 0.75178647, "learning_rate": 6.599645934079259e-07, "loss": 0.82863402, "num_input_tokens_seen": 266110870, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.09869385, "step": 12336, "time_per_iteration": 4.023822784423828 }, { "auxiliary_loss_clip": 0.06426984, "auxiliary_loss_mlp": 0.01264828, "balance_loss_clip": 0.06283576, "balance_loss_mlp": 0.01254677, "epoch": 0.7417405681647377, "flos": 17124795619200.0, "grad_norm": 1.728551699431042, "language_loss": 0.7371341, "learning_rate": 6.596755038382029e-07, "loss": 0.8140521, "num_input_tokens_seen": 266127845, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.10150146, "step": 12337, "time_per_iteration": 2.5553576946258545 }, { "auxiliary_loss_clip": 0.06421203, "auxiliary_loss_mlp": 0.01265026, "balance_loss_clip": 0.06283423, "balance_loss_mlp": 0.01255644, "epoch": 0.7418006914174057, "flos": 18886354404480.0, "grad_norm": 4.617206727398869, "language_loss": 0.76905966, "learning_rate": 6.593864650937186e-07, "loss": 0.84592193, "num_input_tokens_seen": 266145400, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09381104, "step": 12338, "time_per_iteration": 2.563286542892456 }, { "auxiliary_loss_clip": 0.06415506, "auxiliary_loss_mlp": 0.01266076, "balance_loss_clip": 0.06277007, "balance_loss_mlp": 0.0125707, "epoch": 0.7418608146700737, "flos": 21587294113920.0, "grad_norm": 1.6155196688085527, "language_loss": 0.72846574, "learning_rate": 6.590974771854345e-07, "loss": 0.80528158, "num_input_tokens_seen": 266164430, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09002686, "step": 12339, "time_per_iteration": 2.5885040760040283 }, { "auxiliary_loss_clip": 0.06418213, "auxiliary_loss_mlp": 0.01265066, "balance_loss_clip": 0.06277266, "balance_loss_mlp": 0.01254892, "epoch": 0.7419209379227416, "flos": 22346063562240.0, "grad_norm": 1.6719692141265243, "language_loss": 0.80172312, "learning_rate": 6.588085401243077e-07, "loss": 0.87855589, "num_input_tokens_seen": 266183855, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.10174561, "step": 12340, "time_per_iteration": 2.591165781021118 }, { "auxiliary_loss_clip": 0.06422132, "auxiliary_loss_mlp": 0.01263155, "balance_loss_clip": 0.06280377, "balance_loss_mlp": 0.01253535, "epoch": 0.7419810611754096, "flos": 16767639590400.0, "grad_norm": 1.4014576438229935, "language_loss": 0.76000738, "learning_rate": 6.585196539212958e-07, "loss": 0.83686024, "num_input_tokens_seen": 266202085, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09619141, "step": 12341, "time_per_iteration": 2.7354836463928223 }, { "auxiliary_loss_clip": 0.06414431, "auxiliary_loss_mlp": 0.01269172, "balance_loss_clip": 0.06282198, "balance_loss_mlp": 0.01259492, "epoch": 0.7420411844280775, "flos": 26220048105600.0, "grad_norm": 1.359803309749161, "language_loss": 0.80133426, "learning_rate": 6.582308185873535e-07, "loss": 0.87817025, "num_input_tokens_seen": 266223445, "router_z_loss_clip": 1.32226562, "router_z_loss_mlp": 0.09680176, "step": 12342, "time_per_iteration": 4.013925552368164 }, { "auxiliary_loss_clip": 0.06421284, "auxiliary_loss_mlp": 0.01267924, "balance_loss_clip": 0.06280607, "balance_loss_mlp": 0.01258047, "epoch": 0.7421013076807456, "flos": 68542354857600.0, "grad_norm": 1.653479804885031, "language_loss": 0.77797914, "learning_rate": 6.57942034133433e-07, "loss": 0.85487121, "num_input_tokens_seen": 266246575, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09875488, "step": 12343, "time_per_iteration": 2.947169542312622 }, { "auxiliary_loss_clip": 0.06415664, "auxiliary_loss_mlp": 0.01267106, "balance_loss_clip": 0.06274609, "balance_loss_mlp": 0.01257767, "epoch": 0.7421614309334135, "flos": 24432144410880.0, "grad_norm": 1.5260712708346877, "language_loss": 0.67699879, "learning_rate": 6.576533005704843e-07, "loss": 0.7538265, "num_input_tokens_seen": 266266055, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09338379, "step": 12344, "time_per_iteration": 2.580355405807495 }, { "auxiliary_loss_clip": 0.06421141, "auxiliary_loss_mlp": 0.01266196, "balance_loss_clip": 0.06278138, "balance_loss_mlp": 0.01255587, "epoch": 0.7422215541860815, "flos": 12315706709760.0, "grad_norm": 2.323891053531326, "language_loss": 0.81205237, "learning_rate": 6.573646179094572e-07, "loss": 0.88892579, "num_input_tokens_seen": 266282240, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.1060791, "step": 12345, "time_per_iteration": 2.5044267177581787 }, { "auxiliary_loss_clip": 0.0641839, "auxiliary_loss_mlp": 0.0126352, "balance_loss_clip": 0.06277518, "balance_loss_mlp": 0.0125393, "epoch": 0.7422816774387494, "flos": 19651580616960.0, "grad_norm": 2.0150532891677777, "language_loss": 0.70759499, "learning_rate": 6.570759861612988e-07, "loss": 0.78441405, "num_input_tokens_seen": 266300980, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09594727, "step": 12346, "time_per_iteration": 2.5627546310424805 }, { "auxiliary_loss_clip": 0.06421211, "auxiliary_loss_mlp": 0.01266077, "balance_loss_clip": 0.06278995, "balance_loss_mlp": 0.0125663, "epoch": 0.7423418006914174, "flos": 32024337557760.0, "grad_norm": 1.5737817399959613, "language_loss": 0.73710692, "learning_rate": 6.56787405336953e-07, "loss": 0.81397974, "num_input_tokens_seen": 266322215, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09454346, "step": 12347, "time_per_iteration": 2.632396936416626 }, { "auxiliary_loss_clip": 0.06424749, "auxiliary_loss_mlp": 0.01265996, "balance_loss_clip": 0.06277934, "balance_loss_mlp": 0.01255863, "epoch": 0.7424019239440853, "flos": 18923013365760.0, "grad_norm": 1.6729866605120385, "language_loss": 0.8108359, "learning_rate": 6.564988754473642e-07, "loss": 0.88774335, "num_input_tokens_seen": 266341600, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.10131836, "step": 12348, "time_per_iteration": 2.54392409324646 }, { "auxiliary_loss_clip": 0.06413192, "auxiliary_loss_mlp": 0.0126385, "balance_loss_clip": 0.06275921, "balance_loss_mlp": 0.01254456, "epoch": 0.7424620471967533, "flos": 35884360396800.0, "grad_norm": 1.5880371744757054, "language_loss": 0.72695178, "learning_rate": 6.562103965034724e-07, "loss": 0.8037222, "num_input_tokens_seen": 266362895, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09399414, "step": 12349, "time_per_iteration": 2.6711738109588623 }, { "auxiliary_loss_clip": 0.06428907, "auxiliary_loss_mlp": 0.01266107, "balance_loss_clip": 0.06281446, "balance_loss_mlp": 0.01255641, "epoch": 0.7425221704494213, "flos": 27023987704320.0, "grad_norm": 1.952403593103545, "language_loss": 0.79723322, "learning_rate": 6.559219685162165e-07, "loss": 0.87418336, "num_input_tokens_seen": 266384015, "router_z_loss_clip": 1.47558594, "router_z_loss_mlp": 0.10473633, "step": 12350, "time_per_iteration": 2.6680185794830322 }, { "auxiliary_loss_clip": 0.06415925, "auxiliary_loss_mlp": 0.01263187, "balance_loss_clip": 0.0627453, "balance_loss_mlp": 0.01253222, "epoch": 0.7425822937020893, "flos": 34175602483200.0, "grad_norm": 1.8692636842001045, "language_loss": 0.75347179, "learning_rate": 6.556335914965343e-07, "loss": 0.8302629, "num_input_tokens_seen": 266405990, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09967041, "step": 12351, "time_per_iteration": 2.642761468887329 }, { "auxiliary_loss_clip": 0.06421852, "auxiliary_loss_mlp": 0.01263112, "balance_loss_clip": 0.06281269, "balance_loss_mlp": 0.01253694, "epoch": 0.7426424169547573, "flos": 21289200814080.0, "grad_norm": 2.0688961202100016, "language_loss": 0.81742096, "learning_rate": 6.553452654553611e-07, "loss": 0.8942706, "num_input_tokens_seen": 266424260, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09429932, "step": 12352, "time_per_iteration": 3.977924108505249 }, { "auxiliary_loss_clip": 0.06426901, "auxiliary_loss_mlp": 0.0126392, "balance_loss_clip": 0.06284127, "balance_loss_mlp": 0.01254527, "epoch": 0.7427025402074252, "flos": 22453818312960.0, "grad_norm": 1.8187362842384638, "language_loss": 0.71762335, "learning_rate": 6.550569904036307e-07, "loss": 0.79453158, "num_input_tokens_seen": 266444580, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09399414, "step": 12353, "time_per_iteration": 2.611398458480835 }, { "auxiliary_loss_clip": 0.06418595, "auxiliary_loss_mlp": 0.01264135, "balance_loss_clip": 0.06278448, "balance_loss_mlp": 0.01255105, "epoch": 0.7427626634600932, "flos": 22530532325760.0, "grad_norm": 1.5782529492457142, "language_loss": 0.72248417, "learning_rate": 6.547687663522739e-07, "loss": 0.79931146, "num_input_tokens_seen": 266465640, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09033203, "step": 12354, "time_per_iteration": 2.6070287227630615 }, { "auxiliary_loss_clip": 0.06326497, "auxiliary_loss_mlp": 0.01251152, "balance_loss_clip": 0.06269433, "balance_loss_mlp": 0.01249822, "epoch": 0.7428227867127611, "flos": 67227271424640.0, "grad_norm": 0.6874472079352038, "language_loss": 0.59363079, "learning_rate": 6.544805933122199e-07, "loss": 0.66940725, "num_input_tokens_seen": 266531950, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01332092, "step": 12355, "time_per_iteration": 3.298386812210083 }, { "auxiliary_loss_clip": 0.06418478, "auxiliary_loss_mlp": 0.01264404, "balance_loss_clip": 0.06276511, "balance_loss_mlp": 0.01254593, "epoch": 0.7428829099654292, "flos": 14726603111040.0, "grad_norm": 1.8171335356090133, "language_loss": 0.6810075, "learning_rate": 6.541924712943971e-07, "loss": 0.75783634, "num_input_tokens_seen": 266550665, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.0980835, "step": 12356, "time_per_iteration": 2.5382981300354004 }, { "auxiliary_loss_clip": 0.06420181, "auxiliary_loss_mlp": 0.01263456, "balance_loss_clip": 0.06274968, "balance_loss_mlp": 0.01253187, "epoch": 0.7429430332180971, "flos": 48656466696960.0, "grad_norm": 1.5967825669536173, "language_loss": 0.72686887, "learning_rate": 6.539044003097301e-07, "loss": 0.80370522, "num_input_tokens_seen": 266572455, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.10272217, "step": 12357, "time_per_iteration": 2.7904860973358154 }, { "auxiliary_loss_clip": 0.06417002, "auxiliary_loss_mlp": 0.01264236, "balance_loss_clip": 0.06279963, "balance_loss_mlp": 0.01255438, "epoch": 0.7430031564707651, "flos": 16769735942400.0, "grad_norm": 2.018070648457953, "language_loss": 0.65605807, "learning_rate": 6.53616380369143e-07, "loss": 0.73287046, "num_input_tokens_seen": 266590895, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.08795166, "step": 12358, "time_per_iteration": 3.9634337425231934 }, { "auxiliary_loss_clip": 0.06421231, "auxiliary_loss_mlp": 0.01265942, "balance_loss_clip": 0.06275153, "balance_loss_mlp": 0.01255243, "epoch": 0.743063279723433, "flos": 23876054789760.0, "grad_norm": 1.6771061462648833, "language_loss": 0.81112343, "learning_rate": 6.533284114835591e-07, "loss": 0.88799524, "num_input_tokens_seen": 266607660, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10681152, "step": 12359, "time_per_iteration": 2.6048550605773926 }, { "auxiliary_loss_clip": 0.06418178, "auxiliary_loss_mlp": 0.01268817, "balance_loss_clip": 0.06275765, "balance_loss_mlp": 0.01258982, "epoch": 0.743123402976101, "flos": 14396840167680.0, "grad_norm": 2.046576823217641, "language_loss": 0.68253028, "learning_rate": 6.530404936638956e-07, "loss": 0.75940025, "num_input_tokens_seen": 266624260, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09844971, "step": 12360, "time_per_iteration": 2.572418212890625 }, { "auxiliary_loss_clip": 0.06415872, "auxiliary_loss_mlp": 0.01268368, "balance_loss_clip": 0.06276498, "balance_loss_mlp": 0.01258432, "epoch": 0.7431835262287689, "flos": 27461756960640.0, "grad_norm": 1.703270697358029, "language_loss": 0.72811866, "learning_rate": 6.527526269210715e-07, "loss": 0.80496109, "num_input_tokens_seen": 266644210, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09942627, "step": 12361, "time_per_iteration": 2.617760181427002 }, { "auxiliary_loss_clip": 0.06421661, "auxiliary_loss_mlp": 0.01264519, "balance_loss_clip": 0.06276628, "balance_loss_mlp": 0.01254136, "epoch": 0.743243649481437, "flos": 20965810780800.0, "grad_norm": 1.8760179240434591, "language_loss": 0.5640052, "learning_rate": 6.524648112660027e-07, "loss": 0.64086699, "num_input_tokens_seen": 266664230, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.1038208, "step": 12362, "time_per_iteration": 2.6315362453460693 }, { "auxiliary_loss_clip": 0.06417207, "auxiliary_loss_mlp": 0.01263781, "balance_loss_clip": 0.06276165, "balance_loss_mlp": 0.0125456, "epoch": 0.7433037727341049, "flos": 22789660677120.0, "grad_norm": 1.9020661843661575, "language_loss": 0.77760667, "learning_rate": 6.521770467096039e-07, "loss": 0.85441661, "num_input_tokens_seen": 266683270, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09222412, "step": 12363, "time_per_iteration": 2.5626096725463867 }, { "auxiliary_loss_clip": 0.06420011, "auxiliary_loss_mlp": 0.01262841, "balance_loss_clip": 0.06278227, "balance_loss_mlp": 0.01254109, "epoch": 0.7433638959867729, "flos": 22202656099200.0, "grad_norm": 1.6398391709982085, "language_loss": 0.77980733, "learning_rate": 6.518893332627862e-07, "loss": 0.85663581, "num_input_tokens_seen": 266701235, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.08728027, "step": 12364, "time_per_iteration": 2.5510809421539307 }, { "auxiliary_loss_clip": 0.06417936, "auxiliary_loss_mlp": 0.01263478, "balance_loss_clip": 0.06276309, "balance_loss_mlp": 0.01253715, "epoch": 0.7434240192394409, "flos": 23303808529920.0, "grad_norm": 1.6959428830835448, "language_loss": 0.78941423, "learning_rate": 6.516016709364604e-07, "loss": 0.86622834, "num_input_tokens_seen": 266721495, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.09765625, "step": 12365, "time_per_iteration": 2.582942485809326 }, { "auxiliary_loss_clip": 0.06421594, "auxiliary_loss_mlp": 0.01265152, "balance_loss_clip": 0.06275488, "balance_loss_mlp": 0.01254923, "epoch": 0.7434841424921088, "flos": 54020387416320.0, "grad_norm": 1.6324531805908011, "language_loss": 0.7735998, "learning_rate": 6.513140597415346e-07, "loss": 0.85046726, "num_input_tokens_seen": 266747400, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.10223389, "step": 12366, "time_per_iteration": 2.840075969696045 }, { "auxiliary_loss_clip": 0.06415167, "auxiliary_loss_mlp": 0.01266012, "balance_loss_clip": 0.06277897, "balance_loss_mlp": 0.01257727, "epoch": 0.7435442657447768, "flos": 21440364779520.0, "grad_norm": 1.328769078833671, "language_loss": 0.71218371, "learning_rate": 6.510264996889141e-07, "loss": 0.7889955, "num_input_tokens_seen": 266767630, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.08282471, "step": 12367, "time_per_iteration": 2.5541141033172607 }, { "auxiliary_loss_clip": 0.06421323, "auxiliary_loss_mlp": 0.01264187, "balance_loss_clip": 0.06275541, "balance_loss_mlp": 0.01254847, "epoch": 0.7436043889974447, "flos": 24506426655360.0, "grad_norm": 5.006357466265217, "language_loss": 0.74987185, "learning_rate": 6.507389907895038e-07, "loss": 0.82672703, "num_input_tokens_seen": 266788015, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.09344482, "step": 12368, "time_per_iteration": 2.601694107055664 }, { "auxiliary_loss_clip": 0.06413883, "auxiliary_loss_mlp": 0.01267207, "balance_loss_clip": 0.06275316, "balance_loss_mlp": 0.01258278, "epoch": 0.7436645122501128, "flos": 40707997989120.0, "grad_norm": 1.7012242279726988, "language_loss": 0.68853509, "learning_rate": 6.50451533054207e-07, "loss": 0.76534605, "num_input_tokens_seen": 266809010, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.08929443, "step": 12369, "time_per_iteration": 2.743699550628662 }, { "auxiliary_loss_clip": 0.06420533, "auxiliary_loss_mlp": 0.01268467, "balance_loss_clip": 0.06277877, "balance_loss_mlp": 0.0125865, "epoch": 0.7437246355027807, "flos": 18913537854720.0, "grad_norm": 1.8570975682712254, "language_loss": 0.75854909, "learning_rate": 6.501641264939233e-07, "loss": 0.83543909, "num_input_tokens_seen": 266825390, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.09820557, "step": 12370, "time_per_iteration": 2.5426905155181885 }, { "auxiliary_loss_clip": 0.0641086, "auxiliary_loss_mlp": 0.01265576, "balance_loss_clip": 0.06273356, "balance_loss_mlp": 0.01256021, "epoch": 0.7437847587554487, "flos": 21550299736320.0, "grad_norm": 1.3848770384970719, "language_loss": 0.78162777, "learning_rate": 6.498767711195503e-07, "loss": 0.85839218, "num_input_tokens_seen": 266844675, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09558105, "step": 12371, "time_per_iteration": 2.5554678440093994 }, { "auxiliary_loss_clip": 0.06418717, "auxiliary_loss_mlp": 0.01264444, "balance_loss_clip": 0.06277733, "balance_loss_mlp": 0.01254693, "epoch": 0.7438448820081166, "flos": 27789926676480.0, "grad_norm": 1.8322288863613754, "language_loss": 0.69461977, "learning_rate": 6.495894669419857e-07, "loss": 0.77145141, "num_input_tokens_seen": 266865160, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09747314, "step": 12372, "time_per_iteration": 2.6019654273986816 }, { "auxiliary_loss_clip": 0.06416579, "auxiliary_loss_mlp": 0.01263639, "balance_loss_clip": 0.06275778, "balance_loss_mlp": 0.01254079, "epoch": 0.7439050052607846, "flos": 17973653806080.0, "grad_norm": 1.9619644919735666, "language_loss": 0.75150859, "learning_rate": 6.493022139721245e-07, "loss": 0.82831079, "num_input_tokens_seen": 266883285, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09570312, "step": 12373, "time_per_iteration": 2.522754669189453 }, { "auxiliary_loss_clip": 0.0641892, "auxiliary_loss_mlp": 0.01264561, "balance_loss_clip": 0.06273434, "balance_loss_mlp": 0.01253755, "epoch": 0.7439651285134525, "flos": 22964066951040.0, "grad_norm": 2.665683217982895, "language_loss": 0.77686346, "learning_rate": 6.49015012220858e-07, "loss": 0.85369825, "num_input_tokens_seen": 266900960, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.1081543, "step": 12374, "time_per_iteration": 2.591163158416748 }, { "auxiliary_loss_clip": 0.06418341, "auxiliary_loss_mlp": 0.01266652, "balance_loss_clip": 0.06275838, "balance_loss_mlp": 0.01256985, "epoch": 0.7440252517661206, "flos": 18812701065600.0, "grad_norm": 1.9827192373059912, "language_loss": 0.7630108, "learning_rate": 6.487278616990774e-07, "loss": 0.8398608, "num_input_tokens_seen": 266917710, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09667969, "step": 12375, "time_per_iteration": 2.5480597019195557 }, { "auxiliary_loss_clip": 0.06412312, "auxiliary_loss_mlp": 0.01265779, "balance_loss_clip": 0.06274222, "balance_loss_mlp": 0.01256677, "epoch": 0.7440853750187885, "flos": 20272476971520.0, "grad_norm": 1.8214999610334566, "language_loss": 0.7774595, "learning_rate": 6.484407624176733e-07, "loss": 0.85424042, "num_input_tokens_seen": 266934220, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09100342, "step": 12376, "time_per_iteration": 4.022783517837524 }, { "auxiliary_loss_clip": 0.06421126, "auxiliary_loss_mlp": 0.01266186, "balance_loss_clip": 0.06277806, "balance_loss_mlp": 0.01255534, "epoch": 0.7441454982714565, "flos": 25344216103680.0, "grad_norm": 1.9802276791024962, "language_loss": 0.79799318, "learning_rate": 6.481537143875296e-07, "loss": 0.87486637, "num_input_tokens_seen": 266955210, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10644531, "step": 12377, "time_per_iteration": 2.5856614112854004 }, { "auxiliary_loss_clip": 0.06421739, "auxiliary_loss_mlp": 0.01263952, "balance_loss_clip": 0.06277886, "balance_loss_mlp": 0.0125398, "epoch": 0.7442056215241245, "flos": 64493460915840.0, "grad_norm": 1.9446695371894223, "language_loss": 0.67487305, "learning_rate": 6.478667176195322e-07, "loss": 0.75172997, "num_input_tokens_seen": 266976555, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.09960938, "step": 12378, "time_per_iteration": 2.9253861904144287 }, { "auxiliary_loss_clip": 0.06419756, "auxiliary_loss_mlp": 0.01269536, "balance_loss_clip": 0.06275651, "balance_loss_mlp": 0.01258855, "epoch": 0.7442657447767924, "flos": 31293464319360.0, "grad_norm": 1.8569811250754673, "language_loss": 0.7206232, "learning_rate": 6.475797721245648e-07, "loss": 0.79751611, "num_input_tokens_seen": 266997640, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10693359, "step": 12379, "time_per_iteration": 2.638772487640381 }, { "auxiliary_loss_clip": 0.06417397, "auxiliary_loss_mlp": 0.01263472, "balance_loss_clip": 0.06276207, "balance_loss_mlp": 0.01253676, "epoch": 0.7443258680294604, "flos": 20813221296000.0, "grad_norm": 1.6684176819861671, "language_loss": 0.65685356, "learning_rate": 6.472928779135085e-07, "loss": 0.73366231, "num_input_tokens_seen": 267016165, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09799194, "step": 12380, "time_per_iteration": 2.552347183227539 }, { "auxiliary_loss_clip": 0.0642146, "auxiliary_loss_mlp": 0.01264476, "balance_loss_clip": 0.06278223, "balance_loss_mlp": 0.01254283, "epoch": 0.7443859912821283, "flos": 22206303751680.0, "grad_norm": 1.9980257393731995, "language_loss": 0.79237789, "learning_rate": 6.470060349972411e-07, "loss": 0.8692373, "num_input_tokens_seen": 267034075, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10186768, "step": 12381, "time_per_iteration": 2.711379289627075 }, { "auxiliary_loss_clip": 0.06420907, "auxiliary_loss_mlp": 0.01267743, "balance_loss_clip": 0.06275934, "balance_loss_mlp": 0.01256621, "epoch": 0.7444461145347964, "flos": 22024350610560.0, "grad_norm": 1.940990519190268, "language_loss": 0.72789538, "learning_rate": 6.467192433866411e-07, "loss": 0.80478185, "num_input_tokens_seen": 267053645, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.11108398, "step": 12382, "time_per_iteration": 4.129249095916748 }, { "auxiliary_loss_clip": 0.06315701, "auxiliary_loss_mlp": 0.01251827, "balance_loss_clip": 0.06258827, "balance_loss_mlp": 0.01250594, "epoch": 0.7445062377874643, "flos": 70582313704320.0, "grad_norm": 0.7144797581046124, "language_loss": 0.54668915, "learning_rate": 6.464325030925831e-07, "loss": 0.6223644, "num_input_tokens_seen": 267121830, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01232147, "step": 12383, "time_per_iteration": 3.321314573287964 }, { "auxiliary_loss_clip": 0.06419423, "auxiliary_loss_mlp": 0.01265019, "balance_loss_clip": 0.06277743, "balance_loss_mlp": 0.01255608, "epoch": 0.7445663610401323, "flos": 22171070309760.0, "grad_norm": 1.9680383060075863, "language_loss": 0.76506269, "learning_rate": 6.461458141259395e-07, "loss": 0.84190714, "num_input_tokens_seen": 267141145, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.09411621, "step": 12384, "time_per_iteration": 2.575010061264038 }, { "auxiliary_loss_clip": 0.06414826, "auxiliary_loss_mlp": 0.0126722, "balance_loss_clip": 0.06273817, "balance_loss_mlp": 0.01257559, "epoch": 0.7446264842928002, "flos": 24177082982400.0, "grad_norm": 1.919383194677515, "language_loss": 0.79714012, "learning_rate": 6.458591764975823e-07, "loss": 0.87396061, "num_input_tokens_seen": 267159280, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09661865, "step": 12385, "time_per_iteration": 2.5856146812438965 }, { "auxiliary_loss_clip": 0.06422123, "auxiliary_loss_mlp": 0.01270124, "balance_loss_clip": 0.06276826, "balance_loss_mlp": 0.01258668, "epoch": 0.7446866075454682, "flos": 24141514124160.0, "grad_norm": 1.7216397710689209, "language_loss": 0.81849992, "learning_rate": 6.455725902183813e-07, "loss": 0.8954224, "num_input_tokens_seen": 267179390, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.11462402, "step": 12386, "time_per_iteration": 2.5795681476593018 }, { "auxiliary_loss_clip": 0.06410773, "auxiliary_loss_mlp": 0.01265405, "balance_loss_clip": 0.06273498, "balance_loss_mlp": 0.0125585, "epoch": 0.7447467307981361, "flos": 23554467619200.0, "grad_norm": 2.6365514740245506, "language_loss": 0.7112605, "learning_rate": 6.452860552992037e-07, "loss": 0.78802228, "num_input_tokens_seen": 267198165, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09552002, "step": 12387, "time_per_iteration": 2.5608372688293457 }, { "auxiliary_loss_clip": 0.06417093, "auxiliary_loss_mlp": 0.01264847, "balance_loss_clip": 0.06276847, "balance_loss_mlp": 0.01255382, "epoch": 0.7448068540508042, "flos": 19573021814400.0, "grad_norm": 1.9376214656239135, "language_loss": 0.70522332, "learning_rate": 6.449995717509138e-07, "loss": 0.78204274, "num_input_tokens_seen": 267214520, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09466553, "step": 12388, "time_per_iteration": 2.5301530361175537 }, { "auxiliary_loss_clip": 0.06415878, "auxiliary_loss_mlp": 0.01267906, "balance_loss_clip": 0.06274144, "balance_loss_mlp": 0.01258554, "epoch": 0.7448669773034721, "flos": 21846925589760.0, "grad_norm": 1.6469339405047376, "language_loss": 0.8532629, "learning_rate": 6.447131395843761e-07, "loss": 0.93010068, "num_input_tokens_seen": 267236555, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.09350586, "step": 12389, "time_per_iteration": 2.618539810180664 }, { "auxiliary_loss_clip": 0.06419627, "auxiliary_loss_mlp": 0.01266998, "balance_loss_clip": 0.06276654, "balance_loss_mlp": 0.01257062, "epoch": 0.7449271005561401, "flos": 25162388743680.0, "grad_norm": 1.7267627769627392, "language_loss": 0.79440683, "learning_rate": 6.444267588104526e-07, "loss": 0.8712731, "num_input_tokens_seen": 267254800, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.09936523, "step": 12390, "time_per_iteration": 2.5770838260650635 }, { "auxiliary_loss_clip": 0.06416535, "auxiliary_loss_mlp": 0.01263813, "balance_loss_clip": 0.0627258, "balance_loss_mlp": 0.01253973, "epoch": 0.7449872238088081, "flos": 22279915163520.0, "grad_norm": 1.6651484696965348, "language_loss": 0.85274601, "learning_rate": 6.441404294400014e-07, "loss": 0.92954946, "num_input_tokens_seen": 267274610, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.09851074, "step": 12391, "time_per_iteration": 3.987799882888794 }, { "auxiliary_loss_clip": 0.06416875, "auxiliary_loss_mlp": 0.01265824, "balance_loss_clip": 0.06276266, "balance_loss_mlp": 0.01256544, "epoch": 0.745047347061476, "flos": 20601065957760.0, "grad_norm": 1.8301265705409604, "language_loss": 0.74343085, "learning_rate": 6.438541514838811e-07, "loss": 0.82025778, "num_input_tokens_seen": 267292600, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09277344, "step": 12392, "time_per_iteration": 2.5489330291748047 }, { "auxiliary_loss_clip": 0.06410296, "auxiliary_loss_mlp": 0.01261805, "balance_loss_clip": 0.06273146, "balance_loss_mlp": 0.01252733, "epoch": 0.745107470314144, "flos": 22134117859200.0, "grad_norm": 1.6323134764872738, "language_loss": 0.76600921, "learning_rate": 6.435679249529487e-07, "loss": 0.84273022, "num_input_tokens_seen": 267311295, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09075928, "step": 12393, "time_per_iteration": 2.552452802658081 }, { "auxiliary_loss_clip": 0.06414489, "auxiliary_loss_mlp": 0.01264653, "balance_loss_clip": 0.0627497, "balance_loss_mlp": 0.01253715, "epoch": 0.745167593566812, "flos": 22243004640000.0, "grad_norm": 1.7271552515864566, "language_loss": 0.7261191, "learning_rate": 6.432817498580552e-07, "loss": 0.80291057, "num_input_tokens_seen": 267328390, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.10943604, "step": 12394, "time_per_iteration": 2.562407970428467 }, { "auxiliary_loss_clip": 0.06416523, "auxiliary_loss_mlp": 0.01267164, "balance_loss_clip": 0.06276613, "balance_loss_mlp": 0.01257014, "epoch": 0.74522771681948, "flos": 20672245601280.0, "grad_norm": 1.7447422774282528, "language_loss": 0.82104897, "learning_rate": 6.429956262100535e-07, "loss": 0.8978858, "num_input_tokens_seen": 267348185, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.10150146, "step": 12395, "time_per_iteration": 2.610147476196289 }, { "auxiliary_loss_clip": 0.06419044, "auxiliary_loss_mlp": 0.01265045, "balance_loss_clip": 0.06274301, "balance_loss_mlp": 0.0125533, "epoch": 0.7452878400721479, "flos": 21113578656000.0, "grad_norm": 3.284634032757954, "language_loss": 0.72127348, "learning_rate": 6.427095540197937e-07, "loss": 0.79811436, "num_input_tokens_seen": 267367010, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.09716797, "step": 12396, "time_per_iteration": 2.544802665710449 }, { "auxiliary_loss_clip": 0.06421502, "auxiliary_loss_mlp": 0.01269505, "balance_loss_clip": 0.06277122, "balance_loss_mlp": 0.0125927, "epoch": 0.7453479633248159, "flos": 26695356791040.0, "grad_norm": 1.9850707638992544, "language_loss": 0.68272281, "learning_rate": 6.424235332981245e-07, "loss": 0.75963289, "num_input_tokens_seen": 267386605, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.10229492, "step": 12397, "time_per_iteration": 2.614527702331543 }, { "auxiliary_loss_clip": 0.06411117, "auxiliary_loss_mlp": 0.01266133, "balance_loss_clip": 0.0627268, "balance_loss_mlp": 0.01257025, "epoch": 0.7454080865774838, "flos": 17021191645440.0, "grad_norm": 2.2317537471392086, "language_loss": 0.76816821, "learning_rate": 6.421375640558908e-07, "loss": 0.84494078, "num_input_tokens_seen": 267404135, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09106445, "step": 12398, "time_per_iteration": 3.880147695541382 }, { "auxiliary_loss_clip": 0.06412502, "auxiliary_loss_mlp": 0.01263444, "balance_loss_clip": 0.06274604, "balance_loss_mlp": 0.01254551, "epoch": 0.7454682098301518, "flos": 21330178260480.0, "grad_norm": 1.5430819899922394, "language_loss": 0.78267598, "learning_rate": 6.418516463039363e-07, "loss": 0.85943538, "num_input_tokens_seen": 267423120, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.08898926, "step": 12399, "time_per_iteration": 2.6430933475494385 }, { "auxiliary_loss_clip": 0.06408606, "auxiliary_loss_mlp": 0.01263866, "balance_loss_clip": 0.06273394, "balance_loss_mlp": 0.0125498, "epoch": 0.7455283330828197, "flos": 17864138119680.0, "grad_norm": 1.871363054751911, "language_loss": 0.74345434, "learning_rate": 6.415657800531038e-07, "loss": 0.8201791, "num_input_tokens_seen": 267441250, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.08892822, "step": 12400, "time_per_iteration": 2.5634095668792725 }, { "auxiliary_loss_clip": 0.06412885, "auxiliary_loss_mlp": 0.0126356, "balance_loss_clip": 0.06273837, "balance_loss_mlp": 0.01254071, "epoch": 0.7455884563354878, "flos": 30782209432320.0, "grad_norm": 1.6910902387687885, "language_loss": 0.82610798, "learning_rate": 6.412799653142327e-07, "loss": 0.90287244, "num_input_tokens_seen": 267462820, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09484863, "step": 12401, "time_per_iteration": 2.608616352081299 }, { "auxiliary_loss_clip": 0.06416927, "auxiliary_loss_mlp": 0.01262868, "balance_loss_clip": 0.06277423, "balance_loss_mlp": 0.0125326, "epoch": 0.7456485795881557, "flos": 23192280345600.0, "grad_norm": 1.8777543079031875, "language_loss": 0.65219879, "learning_rate": 6.409942020981611e-07, "loss": 0.72899669, "num_input_tokens_seen": 267483065, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09606934, "step": 12402, "time_per_iteration": 2.561105489730835 }, { "auxiliary_loss_clip": 0.06410847, "auxiliary_loss_mlp": 0.01264637, "balance_loss_clip": 0.06273028, "balance_loss_mlp": 0.01255703, "epoch": 0.7457087028408237, "flos": 38736254436480.0, "grad_norm": 1.5974840080738557, "language_loss": 0.73689306, "learning_rate": 6.407084904157265e-07, "loss": 0.81364787, "num_input_tokens_seen": 267504825, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.08935547, "step": 12403, "time_per_iteration": 2.7150769233703613 }, { "auxiliary_loss_clip": 0.06313049, "auxiliary_loss_mlp": 0.01252795, "balance_loss_clip": 0.06256115, "balance_loss_mlp": 0.01251447, "epoch": 0.7457688260934917, "flos": 56059480523520.0, "grad_norm": 0.8201374390499653, "language_loss": 0.5852828, "learning_rate": 6.404228302777621e-07, "loss": 0.66094124, "num_input_tokens_seen": 267559260, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.0134964, "step": 12404, "time_per_iteration": 3.005913019180298 }, { "auxiliary_loss_clip": 0.06415986, "auxiliary_loss_mlp": 0.01265371, "balance_loss_clip": 0.06275599, "balance_loss_mlp": 0.01255918, "epoch": 0.7458289493461596, "flos": 20121606495360.0, "grad_norm": 1.5247532794956937, "language_loss": 0.7784189, "learning_rate": 6.401372216950995e-07, "loss": 0.85523248, "num_input_tokens_seen": 267578720, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09460449, "step": 12405, "time_per_iteration": 2.5581560134887695 }, { "auxiliary_loss_clip": 0.06412078, "auxiliary_loss_mlp": 0.01271207, "balance_loss_clip": 0.06275514, "balance_loss_mlp": 0.01261998, "epoch": 0.7458890725988276, "flos": 20199200976000.0, "grad_norm": 1.4507143014499813, "language_loss": 0.69199562, "learning_rate": 6.398516646785698e-07, "loss": 0.76882851, "num_input_tokens_seen": 267598250, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.09197998, "step": 12406, "time_per_iteration": 2.5322647094726562 }, { "auxiliary_loss_clip": 0.06427275, "auxiliary_loss_mlp": 0.01265543, "balance_loss_clip": 0.0627956, "balance_loss_mlp": 0.01255351, "epoch": 0.7459491958514956, "flos": 17024336173440.0, "grad_norm": 1.7871000120521112, "language_loss": 0.65459222, "learning_rate": 6.39566159239002e-07, "loss": 0.73152041, "num_input_tokens_seen": 267615430, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.10198975, "step": 12407, "time_per_iteration": 2.509582281112671 }, { "auxiliary_loss_clip": 0.06419273, "auxiliary_loss_mlp": 0.01262086, "balance_loss_clip": 0.06275413, "balance_loss_mlp": 0.01251685, "epoch": 0.7460093191041636, "flos": 25085087752320.0, "grad_norm": 2.6753493582461743, "language_loss": 0.72530615, "learning_rate": 6.392807053872212e-07, "loss": 0.80211979, "num_input_tokens_seen": 267635075, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10400391, "step": 12408, "time_per_iteration": 2.5768826007843018 }, { "auxiliary_loss_clip": 0.06423597, "auxiliary_loss_mlp": 0.01269765, "balance_loss_clip": 0.06277476, "balance_loss_mlp": 0.01259215, "epoch": 0.7460694423568315, "flos": 21915044559360.0, "grad_norm": 1.9404000436476174, "language_loss": 0.72668231, "learning_rate": 6.38995303134053e-07, "loss": 0.80361593, "num_input_tokens_seen": 267654105, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10546875, "step": 12409, "time_per_iteration": 2.5508999824523926 }, { "auxiliary_loss_clip": 0.06408624, "auxiliary_loss_mlp": 0.01266214, "balance_loss_clip": 0.06272364, "balance_loss_mlp": 0.01257881, "epoch": 0.7461295656094995, "flos": 21222213874560.0, "grad_norm": 1.9384146184616697, "language_loss": 0.65990782, "learning_rate": 6.38709952490319e-07, "loss": 0.73665613, "num_input_tokens_seen": 267673090, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.08337402, "step": 12410, "time_per_iteration": 2.5397894382476807 }, { "auxiliary_loss_clip": 0.0641147, "auxiliary_loss_mlp": 0.01263507, "balance_loss_clip": 0.0627417, "balance_loss_mlp": 0.01253976, "epoch": 0.7461896888621674, "flos": 22353526575360.0, "grad_norm": 2.5563922174890252, "language_loss": 0.84630829, "learning_rate": 6.384246534668396e-07, "loss": 0.92305803, "num_input_tokens_seen": 267690605, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09533691, "step": 12411, "time_per_iteration": 2.5393104553222656 }, { "auxiliary_loss_clip": 0.06420353, "auxiliary_loss_mlp": 0.01266239, "balance_loss_clip": 0.06278491, "balance_loss_mlp": 0.01257066, "epoch": 0.7462498121148354, "flos": 25489845699840.0, "grad_norm": 1.4961736897019537, "language_loss": 0.78384268, "learning_rate": 6.381394060744339e-07, "loss": 0.86070859, "num_input_tokens_seen": 267710540, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.0916748, "step": 12412, "time_per_iteration": 2.5869905948638916 }, { "auxiliary_loss_clip": 0.06418853, "auxiliary_loss_mlp": 0.01264312, "balance_loss_clip": 0.06275877, "balance_loss_mlp": 0.01254859, "epoch": 0.7463099353675033, "flos": 33956319548160.0, "grad_norm": 2.3174635579114478, "language_loss": 0.62589312, "learning_rate": 6.378542103239188e-07, "loss": 0.70272481, "num_input_tokens_seen": 267730780, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.09454346, "step": 12413, "time_per_iteration": 2.6516921520233154 }, { "auxiliary_loss_clip": 0.06311691, "auxiliary_loss_mlp": 0.01250747, "balance_loss_clip": 0.06254695, "balance_loss_mlp": 0.01249615, "epoch": 0.7463700586201714, "flos": 62786365355520.0, "grad_norm": 0.7027039956859595, "language_loss": 0.54768342, "learning_rate": 6.375690662261082e-07, "loss": 0.62330782, "num_input_tokens_seen": 267794240, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01133728, "step": 12414, "time_per_iteration": 3.1898388862609863 }, { "auxiliary_loss_clip": 0.06416385, "auxiliary_loss_mlp": 0.01265099, "balance_loss_clip": 0.06273869, "balance_loss_mlp": 0.01255217, "epoch": 0.7464301818728393, "flos": 33440201124480.0, "grad_norm": 1.9888786095445063, "language_loss": 0.55134368, "learning_rate": 6.372839737918154e-07, "loss": 0.62815851, "num_input_tokens_seen": 267817190, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09881592, "step": 12415, "time_per_iteration": 4.174130439758301 }, { "auxiliary_loss_clip": 0.06416737, "auxiliary_loss_mlp": 0.01262538, "balance_loss_clip": 0.06275991, "balance_loss_mlp": 0.01252751, "epoch": 0.7464903051255073, "flos": 26877100296960.0, "grad_norm": 1.7017603310687814, "language_loss": 0.74980628, "learning_rate": 6.369989330318506e-07, "loss": 0.826599, "num_input_tokens_seen": 267836245, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09802246, "step": 12416, "time_per_iteration": 2.6264302730560303 }, { "auxiliary_loss_clip": 0.06415142, "auxiliary_loss_mlp": 0.01266426, "balance_loss_clip": 0.06274103, "balance_loss_mlp": 0.01255841, "epoch": 0.7465504283781753, "flos": 44096359795200.0, "grad_norm": 1.741094970061398, "language_loss": 0.69682002, "learning_rate": 6.367139439570233e-07, "loss": 0.77363569, "num_input_tokens_seen": 267858310, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.10583496, "step": 12417, "time_per_iteration": 2.7877135276794434 }, { "auxiliary_loss_clip": 0.06423958, "auxiliary_loss_mlp": 0.01264421, "balance_loss_clip": 0.06279478, "balance_loss_mlp": 0.01253942, "epoch": 0.7466105516308432, "flos": 19681111981440.0, "grad_norm": 1.8891357306800862, "language_loss": 0.73716843, "learning_rate": 6.364290065781392e-07, "loss": 0.81405228, "num_input_tokens_seen": 267876345, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.10473633, "step": 12418, "time_per_iteration": 2.5850062370300293 }, { "auxiliary_loss_clip": 0.06414947, "auxiliary_loss_mlp": 0.01264903, "balance_loss_clip": 0.06275053, "balance_loss_mlp": 0.01255152, "epoch": 0.7466706748835112, "flos": 20526783713280.0, "grad_norm": 1.5628064529259489, "language_loss": 0.69385099, "learning_rate": 6.361441209060039e-07, "loss": 0.77064955, "num_input_tokens_seen": 267896740, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09753418, "step": 12419, "time_per_iteration": 2.575124740600586 }, { "auxiliary_loss_clip": 0.06408805, "auxiliary_loss_mlp": 0.01268895, "balance_loss_clip": 0.06274278, "balance_loss_mlp": 0.01259842, "epoch": 0.7467307981361792, "flos": 21696851727360.0, "grad_norm": 1.6662872137929325, "language_loss": 0.752267, "learning_rate": 6.358592869514216e-07, "loss": 0.82904404, "num_input_tokens_seen": 267914765, "router_z_loss_clip": 1.34472656, "router_z_loss_mlp": 0.09051514, "step": 12420, "time_per_iteration": 2.5857765674591064 }, { "auxiliary_loss_clip": 0.06423333, "auxiliary_loss_mlp": 0.01266572, "balance_loss_clip": 0.06279978, "balance_loss_mlp": 0.01256028, "epoch": 0.7467909213888472, "flos": 19579855921920.0, "grad_norm": 1.731420809679806, "language_loss": 0.67551082, "learning_rate": 6.355745047251904e-07, "loss": 0.75240988, "num_input_tokens_seen": 267934085, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10534668, "step": 12421, "time_per_iteration": 3.980186700820923 }, { "auxiliary_loss_clip": 0.06420682, "auxiliary_loss_mlp": 0.01264787, "balance_loss_clip": 0.0627569, "balance_loss_mlp": 0.01254529, "epoch": 0.7468510446415151, "flos": 23701858151040.0, "grad_norm": 1.7157638258517829, "language_loss": 0.72636974, "learning_rate": 6.352897742381107e-07, "loss": 0.80322444, "num_input_tokens_seen": 267955170, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.10253906, "step": 12422, "time_per_iteration": 2.569708824157715 }, { "auxiliary_loss_clip": 0.06413906, "auxiliary_loss_mlp": 0.01265458, "balance_loss_clip": 0.06275126, "balance_loss_mlp": 0.01255885, "epoch": 0.7469111678941831, "flos": 29323649410560.0, "grad_norm": 1.8283600717539397, "language_loss": 0.75131387, "learning_rate": 6.350050955009796e-07, "loss": 0.82810754, "num_input_tokens_seen": 267974980, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09576416, "step": 12423, "time_per_iteration": 2.5811946392059326 }, { "auxiliary_loss_clip": 0.06413533, "auxiliary_loss_mlp": 0.01262807, "balance_loss_clip": 0.06275102, "balance_loss_mlp": 0.01253795, "epoch": 0.746971291146851, "flos": 21805067675520.0, "grad_norm": 2.7744173349991605, "language_loss": 0.68010831, "learning_rate": 6.347204685245929e-07, "loss": 0.7568717, "num_input_tokens_seen": 267994985, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09008789, "step": 12424, "time_per_iteration": 2.550135612487793 }, { "auxiliary_loss_clip": 0.0642447, "auxiliary_loss_mlp": 0.01271655, "balance_loss_clip": 0.06279474, "balance_loss_mlp": 0.01261456, "epoch": 0.747031414399519, "flos": 36253591413120.0, "grad_norm": 2.453452963746923, "language_loss": 0.74491215, "learning_rate": 6.344358933197418e-07, "loss": 0.82187343, "num_input_tokens_seen": 268014985, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.10192871, "step": 12425, "time_per_iteration": 2.6952035427093506 }, { "auxiliary_loss_clip": 0.06414167, "auxiliary_loss_mlp": 0.01265192, "balance_loss_clip": 0.06272523, "balance_loss_mlp": 0.01255399, "epoch": 0.7470915376521869, "flos": 19981133925120.0, "grad_norm": 2.542094828068436, "language_loss": 0.70396292, "learning_rate": 6.341513698972194e-07, "loss": 0.78075653, "num_input_tokens_seen": 268034395, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09790039, "step": 12426, "time_per_iteration": 2.554447889328003 }, { "auxiliary_loss_clip": 0.06415387, "auxiliary_loss_mlp": 0.01267673, "balance_loss_clip": 0.06277987, "balance_loss_mlp": 0.01258011, "epoch": 0.747151660904855, "flos": 20090523830400.0, "grad_norm": 1.4039713982877886, "language_loss": 0.65729141, "learning_rate": 6.338668982678139e-07, "loss": 0.73412198, "num_input_tokens_seen": 268054485, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09655762, "step": 12427, "time_per_iteration": 2.6058719158172607 }, { "auxiliary_loss_clip": 0.06419976, "auxiliary_loss_mlp": 0.01263657, "balance_loss_clip": 0.06277296, "balance_loss_mlp": 0.01254329, "epoch": 0.7472117841575229, "flos": 16296062411520.0, "grad_norm": 2.533313710630167, "language_loss": 0.75010914, "learning_rate": 6.335824784423118e-07, "loss": 0.82694542, "num_input_tokens_seen": 268072250, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09332275, "step": 12428, "time_per_iteration": 2.6658577919006348 }, { "auxiliary_loss_clip": 0.0642438, "auxiliary_loss_mlp": 0.01264621, "balance_loss_clip": 0.06276475, "balance_loss_mlp": 0.01253797, "epoch": 0.7472719074101909, "flos": 21395068848000.0, "grad_norm": 1.852656545328458, "language_loss": 0.58279002, "learning_rate": 6.33298110431499e-07, "loss": 0.65968001, "num_input_tokens_seen": 268089840, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.10827637, "step": 12429, "time_per_iteration": 2.7402327060699463 }, { "auxiliary_loss_clip": 0.06427397, "auxiliary_loss_mlp": 0.01267353, "balance_loss_clip": 0.06281554, "balance_loss_mlp": 0.0125719, "epoch": 0.7473320306628589, "flos": 29651064439680.0, "grad_norm": 1.7977619402104004, "language_loss": 0.60958785, "learning_rate": 6.330137942461595e-07, "loss": 0.68653536, "num_input_tokens_seen": 268109360, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10168457, "step": 12430, "time_per_iteration": 2.64809513092041 }, { "auxiliary_loss_clip": 0.0641069, "auxiliary_loss_mlp": 0.01265607, "balance_loss_clip": 0.06272928, "balance_loss_mlp": 0.01256308, "epoch": 0.7473921539155268, "flos": 24143316986880.0, "grad_norm": 1.4587176787477105, "language_loss": 0.75567579, "learning_rate": 6.327295298970734e-07, "loss": 0.83243877, "num_input_tokens_seen": 268131840, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09295654, "step": 12431, "time_per_iteration": 4.1087117195129395 }, { "auxiliary_loss_clip": 0.06415692, "auxiliary_loss_mlp": 0.01265672, "balance_loss_clip": 0.06274127, "balance_loss_mlp": 0.01255766, "epoch": 0.7474522771681948, "flos": 17492768824320.0, "grad_norm": 2.4320764643610575, "language_loss": 0.75599879, "learning_rate": 6.32445317395021e-07, "loss": 0.83281243, "num_input_tokens_seen": 268148300, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09912109, "step": 12432, "time_per_iteration": 2.5067806243896484 }, { "auxiliary_loss_clip": 0.06421483, "auxiliary_loss_mlp": 0.0126589, "balance_loss_clip": 0.06274995, "balance_loss_mlp": 0.01254851, "epoch": 0.7475124004208628, "flos": 16732909272960.0, "grad_norm": 2.3627512054415725, "language_loss": 0.7064451, "learning_rate": 6.321611567507787e-07, "loss": 0.78331882, "num_input_tokens_seen": 268166450, "router_z_loss_clip": 1.46289062, "router_z_loss_mlp": 0.1104126, "step": 12433, "time_per_iteration": 2.5343265533447266 }, { "auxiliary_loss_clip": 0.0641924, "auxiliary_loss_mlp": 0.01266946, "balance_loss_clip": 0.06277362, "balance_loss_mlp": 0.01256533, "epoch": 0.7475725236735308, "flos": 19726533694080.0, "grad_norm": 1.9102096090917684, "language_loss": 0.67535782, "learning_rate": 6.318770479751232e-07, "loss": 0.75221968, "num_input_tokens_seen": 268186165, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10412598, "step": 12434, "time_per_iteration": 2.5586273670196533 }, { "auxiliary_loss_clip": 0.06404856, "auxiliary_loss_mlp": 0.01266039, "balance_loss_clip": 0.06271392, "balance_loss_mlp": 0.01257421, "epoch": 0.7476326469261987, "flos": 26293114465920.0, "grad_norm": 1.4081511119964492, "language_loss": 0.7981413, "learning_rate": 6.315929910788263e-07, "loss": 0.87485027, "num_input_tokens_seen": 268208145, "router_z_loss_clip": 1.33300781, "router_z_loss_mlp": 0.08624268, "step": 12435, "time_per_iteration": 2.59708833694458 }, { "auxiliary_loss_clip": 0.06421205, "auxiliary_loss_mlp": 0.01266148, "balance_loss_clip": 0.0627626, "balance_loss_mlp": 0.01255973, "epoch": 0.7476927701788667, "flos": 31839868794240.0, "grad_norm": 1.7430359382175276, "language_loss": 0.67908919, "learning_rate": 6.313089860726604e-07, "loss": 0.75596267, "num_input_tokens_seen": 268228345, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10174561, "step": 12436, "time_per_iteration": 2.65633487701416 }, { "auxiliary_loss_clip": 0.06419985, "auxiliary_loss_mlp": 0.01265281, "balance_loss_clip": 0.06273896, "balance_loss_mlp": 0.01254785, "epoch": 0.7477528934315346, "flos": 31803545249280.0, "grad_norm": 1.45227468002823, "language_loss": 0.7105574, "learning_rate": 6.31025032967396e-07, "loss": 0.78741008, "num_input_tokens_seen": 268250260, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10491943, "step": 12437, "time_per_iteration": 4.011284828186035 }, { "auxiliary_loss_clip": 0.06409213, "auxiliary_loss_mlp": 0.01265613, "balance_loss_clip": 0.06272623, "balance_loss_mlp": 0.01256589, "epoch": 0.7478130166842026, "flos": 20377548391680.0, "grad_norm": 1.6791422781738707, "language_loss": 0.6722517, "learning_rate": 6.307411317737986e-07, "loss": 0.74899995, "num_input_tokens_seen": 268268440, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.09020996, "step": 12438, "time_per_iteration": 2.5956361293792725 }, { "auxiliary_loss_clip": 0.06411334, "auxiliary_loss_mlp": 0.01269826, "balance_loss_clip": 0.06271629, "balance_loss_mlp": 0.01260736, "epoch": 0.7478731399368705, "flos": 18154558771200.0, "grad_norm": 1.6674445076182618, "language_loss": 0.81000054, "learning_rate": 6.304572825026344e-07, "loss": 0.88681221, "num_input_tokens_seen": 268285765, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09088135, "step": 12439, "time_per_iteration": 2.519893169403076 }, { "auxiliary_loss_clip": 0.06413932, "auxiliary_loss_mlp": 0.01263543, "balance_loss_clip": 0.06276034, "balance_loss_mlp": 0.01254269, "epoch": 0.7479332631895386, "flos": 15273259148160.0, "grad_norm": 2.5922835575332526, "language_loss": 0.71129191, "learning_rate": 6.301734851646674e-07, "loss": 0.78806663, "num_input_tokens_seen": 268304015, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09283447, "step": 12440, "time_per_iteration": 2.560589551925659 }, { "auxiliary_loss_clip": 0.06412583, "auxiliary_loss_mlp": 0.0126472, "balance_loss_clip": 0.06275377, "balance_loss_mlp": 0.01255833, "epoch": 0.7479933864422065, "flos": 21148937879040.0, "grad_norm": 1.5617425993417786, "language_loss": 0.74012053, "learning_rate": 6.298897397706597e-07, "loss": 0.81689358, "num_input_tokens_seen": 268323290, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.08892822, "step": 12441, "time_per_iteration": 2.553867816925049 }, { "auxiliary_loss_clip": 0.06418731, "auxiliary_loss_mlp": 0.01267413, "balance_loss_clip": 0.06275138, "balance_loss_mlp": 0.01256893, "epoch": 0.7480535096948745, "flos": 14397217511040.0, "grad_norm": 1.8680957951072474, "language_loss": 0.83117795, "learning_rate": 6.296060463313698e-07, "loss": 0.90803933, "num_input_tokens_seen": 268339490, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10522461, "step": 12442, "time_per_iteration": 2.5209193229675293 }, { "auxiliary_loss_clip": 0.06421675, "auxiliary_loss_mlp": 0.01264569, "balance_loss_clip": 0.06276682, "balance_loss_mlp": 0.01253858, "epoch": 0.7481136329475425, "flos": 27352073566080.0, "grad_norm": 1.7225336887067382, "language_loss": 0.63125801, "learning_rate": 6.293224048575565e-07, "loss": 0.70812041, "num_input_tokens_seen": 268359865, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.10717773, "step": 12443, "time_per_iteration": 2.5943098068237305 }, { "auxiliary_loss_clip": 0.06414146, "auxiliary_loss_mlp": 0.01265048, "balance_loss_clip": 0.06274691, "balance_loss_mlp": 0.01255255, "epoch": 0.7481737562002104, "flos": 19536656342400.0, "grad_norm": 1.6808834528839376, "language_loss": 0.71433604, "learning_rate": 6.29038815359975e-07, "loss": 0.79112798, "num_input_tokens_seen": 268377065, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09796143, "step": 12444, "time_per_iteration": 2.5319623947143555 }, { "auxiliary_loss_clip": 0.06414102, "auxiliary_loss_mlp": 0.01265344, "balance_loss_clip": 0.06273127, "balance_loss_mlp": 0.0125582, "epoch": 0.7482338794528784, "flos": 21766102727040.0, "grad_norm": 1.4801757755645426, "language_loss": 0.69152403, "learning_rate": 6.287552778493786e-07, "loss": 0.76831853, "num_input_tokens_seen": 268396935, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09521484, "step": 12445, "time_per_iteration": 2.555018186569214 }, { "auxiliary_loss_clip": 0.06412056, "auxiliary_loss_mlp": 0.01264293, "balance_loss_clip": 0.06272554, "balance_loss_mlp": 0.01255054, "epoch": 0.7482940027055464, "flos": 18703269233280.0, "grad_norm": 1.6546961595474377, "language_loss": 0.74192894, "learning_rate": 6.28471792336519e-07, "loss": 0.81869245, "num_input_tokens_seen": 268414460, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09240723, "step": 12446, "time_per_iteration": 2.5581228733062744 }, { "auxiliary_loss_clip": 0.06421152, "auxiliary_loss_mlp": 0.01264832, "balance_loss_clip": 0.06276219, "balance_loss_mlp": 0.01254365, "epoch": 0.7483541259582144, "flos": 16003587335040.0, "grad_norm": 2.093031230950258, "language_loss": 0.72982663, "learning_rate": 6.281883588321475e-07, "loss": 0.80668646, "num_input_tokens_seen": 268432225, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.10473633, "step": 12447, "time_per_iteration": 2.5685689449310303 }, { "auxiliary_loss_clip": 0.06410928, "auxiliary_loss_mlp": 0.01262147, "balance_loss_clip": 0.0627178, "balance_loss_mlp": 0.01253141, "epoch": 0.7484142492108823, "flos": 25563289403520.0, "grad_norm": 2.7376404786590456, "language_loss": 0.71830869, "learning_rate": 6.279049773470109e-07, "loss": 0.79503942, "num_input_tokens_seen": 268449270, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09002686, "step": 12448, "time_per_iteration": 2.5821189880371094 }, { "auxiliary_loss_clip": 0.06419744, "auxiliary_loss_mlp": 0.01264175, "balance_loss_clip": 0.06276381, "balance_loss_mlp": 0.01254471, "epoch": 0.7484743724635503, "flos": 22893432359040.0, "grad_norm": 1.7059727044499606, "language_loss": 0.73363662, "learning_rate": 6.276216478918543e-07, "loss": 0.81047583, "num_input_tokens_seen": 268467250, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.0970459, "step": 12449, "time_per_iteration": 2.55167293548584 }, { "auxiliary_loss_clip": 0.06424865, "auxiliary_loss_mlp": 0.01265116, "balance_loss_clip": 0.06277789, "balance_loss_mlp": 0.01254899, "epoch": 0.7485344957162182, "flos": 25307137872000.0, "grad_norm": 2.280252942820604, "language_loss": 0.6167658, "learning_rate": 6.273383704774225e-07, "loss": 0.69366562, "num_input_tokens_seen": 268487270, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.10211182, "step": 12450, "time_per_iteration": 2.602647542953491 }, { "auxiliary_loss_clip": 0.06411125, "auxiliary_loss_mlp": 0.01262752, "balance_loss_clip": 0.06275027, "balance_loss_mlp": 0.01254228, "epoch": 0.7485946189688862, "flos": 27060395103360.0, "grad_norm": 2.1997705146407287, "language_loss": 0.70637977, "learning_rate": 6.270551451144577e-07, "loss": 0.78311861, "num_input_tokens_seen": 268508020, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.08526611, "step": 12451, "time_per_iteration": 2.5941500663757324 }, { "auxiliary_loss_clip": 0.06426397, "auxiliary_loss_mlp": 0.01266307, "balance_loss_clip": 0.06278601, "balance_loss_mlp": 0.01256007, "epoch": 0.7486547422215541, "flos": 26914052747520.0, "grad_norm": 6.594829663449951, "language_loss": 0.80537176, "learning_rate": 6.267719718136988e-07, "loss": 0.88229871, "num_input_tokens_seen": 268527375, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.10302734, "step": 12452, "time_per_iteration": 2.614593744277954 }, { "auxiliary_loss_clip": 0.06427065, "auxiliary_loss_mlp": 0.01267352, "balance_loss_clip": 0.06279267, "balance_loss_mlp": 0.01257052, "epoch": 0.7487148654742222, "flos": 22352855742720.0, "grad_norm": 2.147082578003022, "language_loss": 0.72112, "learning_rate": 6.264888505858843e-07, "loss": 0.79806417, "num_input_tokens_seen": 268544870, "router_z_loss_clip": 1.47558594, "router_z_loss_mlp": 0.10308838, "step": 12453, "time_per_iteration": 2.604820966720581 }, { "auxiliary_loss_clip": 0.06419325, "auxiliary_loss_mlp": 0.01265557, "balance_loss_clip": 0.06277053, "balance_loss_mlp": 0.01256384, "epoch": 0.7487749887268901, "flos": 23045392938240.0, "grad_norm": 1.980133688273624, "language_loss": 0.74294376, "learning_rate": 6.262057814417517e-07, "loss": 0.81979257, "num_input_tokens_seen": 268564580, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09179688, "step": 12454, "time_per_iteration": 2.5738701820373535 }, { "auxiliary_loss_clip": 0.06311305, "auxiliary_loss_mlp": 0.01251343, "balance_loss_clip": 0.06254306, "balance_loss_mlp": 0.01250346, "epoch": 0.7488351119795581, "flos": 71545565842560.0, "grad_norm": 0.7206015162033983, "language_loss": 0.59438586, "learning_rate": 6.259227643920322e-07, "loss": 0.67001235, "num_input_tokens_seen": 268629550, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.00997162, "step": 12455, "time_per_iteration": 4.6738176345825195 }, { "auxiliary_loss_clip": 0.06413591, "auxiliary_loss_mlp": 0.01260893, "balance_loss_clip": 0.0627447, "balance_loss_mlp": 0.01251994, "epoch": 0.748895235232226, "flos": 17201048434560.0, "grad_norm": 2.110074627662532, "language_loss": 0.7995097, "learning_rate": 6.256397994474592e-07, "loss": 0.87625444, "num_input_tokens_seen": 268646645, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.08892822, "step": 12456, "time_per_iteration": 2.5581724643707275 }, { "auxiliary_loss_clip": 0.06315151, "auxiliary_loss_mlp": 0.01250216, "balance_loss_clip": 0.06257907, "balance_loss_mlp": 0.01249029, "epoch": 0.748955358484894, "flos": 58998276846720.0, "grad_norm": 0.8216355516547262, "language_loss": 0.61377996, "learning_rate": 6.25356886618763e-07, "loss": 0.68943357, "num_input_tokens_seen": 268702275, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01185608, "step": 12457, "time_per_iteration": 3.102350950241089 }, { "auxiliary_loss_clip": 0.06419194, "auxiliary_loss_mlp": 0.01269095, "balance_loss_clip": 0.0627687, "balance_loss_mlp": 0.01259213, "epoch": 0.749015481737562, "flos": 11364544287360.0, "grad_norm": 2.869945297387917, "language_loss": 0.67353475, "learning_rate": 6.250740259166711e-07, "loss": 0.75041759, "num_input_tokens_seen": 268716265, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.09887695, "step": 12458, "time_per_iteration": 2.524522066116333 }, { "auxiliary_loss_clip": 0.0641441, "auxiliary_loss_mlp": 0.01266335, "balance_loss_clip": 0.0627541, "balance_loss_mlp": 0.01256268, "epoch": 0.74907560499023, "flos": 21112991677440.0, "grad_norm": 1.646676311046939, "language_loss": 0.79744804, "learning_rate": 6.247912173519106e-07, "loss": 0.87425542, "num_input_tokens_seen": 268734330, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.10064697, "step": 12459, "time_per_iteration": 2.548457145690918 }, { "auxiliary_loss_clip": 0.06412724, "auxiliary_loss_mlp": 0.01266684, "balance_loss_clip": 0.06273492, "balance_loss_mlp": 0.012571, "epoch": 0.749135728242898, "flos": 22273709961600.0, "grad_norm": 2.10414370365573, "language_loss": 0.8064459, "learning_rate": 6.245084609352043e-07, "loss": 0.88323998, "num_input_tokens_seen": 268753500, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09576416, "step": 12460, "time_per_iteration": 4.13334321975708 }, { "auxiliary_loss_clip": 0.0641571, "auxiliary_loss_mlp": 0.01265072, "balance_loss_clip": 0.06276087, "balance_loss_mlp": 0.01254916, "epoch": 0.7491958514955659, "flos": 24063793862400.0, "grad_norm": 1.6450404699597794, "language_loss": 0.86161536, "learning_rate": 6.242257566772755e-07, "loss": 0.93842322, "num_input_tokens_seen": 268772055, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.1015625, "step": 12461, "time_per_iteration": 2.6026525497436523 }, { "auxiliary_loss_clip": 0.0641093, "auxiliary_loss_mlp": 0.01263158, "balance_loss_clip": 0.06273749, "balance_loss_mlp": 0.01254033, "epoch": 0.7492559747482339, "flos": 24497915466240.0, "grad_norm": 1.8347279794346056, "language_loss": 0.69642526, "learning_rate": 6.239431045888435e-07, "loss": 0.77316612, "num_input_tokens_seen": 268792265, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09124756, "step": 12462, "time_per_iteration": 2.5689566135406494 }, { "auxiliary_loss_clip": 0.06416735, "auxiliary_loss_mlp": 0.01266231, "balance_loss_clip": 0.06276634, "balance_loss_mlp": 0.01256033, "epoch": 0.7493160980009018, "flos": 27752680736640.0, "grad_norm": 1.739817239376622, "language_loss": 0.70877481, "learning_rate": 6.236605046806267e-07, "loss": 0.78560448, "num_input_tokens_seen": 268812735, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.10195923, "step": 12463, "time_per_iteration": 2.6583735942840576 }, { "auxiliary_loss_clip": 0.06421118, "auxiliary_loss_mlp": 0.01265498, "balance_loss_clip": 0.06279525, "balance_loss_mlp": 0.0125558, "epoch": 0.7493762212535698, "flos": 30233918240640.0, "grad_norm": 1.6674754609459388, "language_loss": 0.77669752, "learning_rate": 6.233779569633419e-07, "loss": 0.85356373, "num_input_tokens_seen": 268833090, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.09918213, "step": 12464, "time_per_iteration": 2.6085193157196045 }, { "auxiliary_loss_clip": 0.06412518, "auxiliary_loss_mlp": 0.01265174, "balance_loss_clip": 0.06272562, "balance_loss_mlp": 0.01255744, "epoch": 0.7494363445062378, "flos": 21950906906880.0, "grad_norm": 1.5822084799699658, "language_loss": 0.78493631, "learning_rate": 6.230954614477034e-07, "loss": 0.86171317, "num_input_tokens_seen": 268851880, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09429932, "step": 12465, "time_per_iteration": 2.601685047149658 }, { "auxiliary_loss_clip": 0.06430501, "auxiliary_loss_mlp": 0.01268758, "balance_loss_clip": 0.06280892, "balance_loss_mlp": 0.01257528, "epoch": 0.7494964677589058, "flos": 12494473395840.0, "grad_norm": 2.397009772704601, "language_loss": 0.75456554, "learning_rate": 6.22813018144422e-07, "loss": 0.83155817, "num_input_tokens_seen": 268867910, "router_z_loss_clip": 1.49707031, "router_z_loss_mlp": 0.11242676, "step": 12466, "time_per_iteration": 2.5297505855560303 }, { "auxiliary_loss_clip": 0.06416787, "auxiliary_loss_mlp": 0.01261852, "balance_loss_clip": 0.06273504, "balance_loss_mlp": 0.01252434, "epoch": 0.7495565910115737, "flos": 21659521933440.0, "grad_norm": 1.9480599724592655, "language_loss": 0.66848493, "learning_rate": 6.22530627064209e-07, "loss": 0.74527133, "num_input_tokens_seen": 268887260, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.09417725, "step": 12467, "time_per_iteration": 2.5899837017059326 }, { "auxiliary_loss_clip": 0.0641894, "auxiliary_loss_mlp": 0.01264421, "balance_loss_clip": 0.06275182, "balance_loss_mlp": 0.01254353, "epoch": 0.7496167142642417, "flos": 15274013834880.0, "grad_norm": 2.3655940019602837, "language_loss": 0.76742452, "learning_rate": 6.222482882177735e-07, "loss": 0.84425813, "num_input_tokens_seen": 268902520, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.10064697, "step": 12468, "time_per_iteration": 2.5215442180633545 }, { "auxiliary_loss_clip": 0.06414282, "auxiliary_loss_mlp": 0.01265039, "balance_loss_clip": 0.06274542, "balance_loss_mlp": 0.01255121, "epoch": 0.7496768375169096, "flos": 22061554623360.0, "grad_norm": 2.493062510416533, "language_loss": 0.69801301, "learning_rate": 6.219660016158201e-07, "loss": 0.7748062, "num_input_tokens_seen": 268920970, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09918213, "step": 12469, "time_per_iteration": 2.5709388256073 }, { "auxiliary_loss_clip": 0.06418115, "auxiliary_loss_mlp": 0.01267218, "balance_loss_clip": 0.06275722, "balance_loss_mlp": 0.01257705, "epoch": 0.7497369607695776, "flos": 19062144270720.0, "grad_norm": 1.7518519637897887, "language_loss": 0.69790721, "learning_rate": 6.216837672690543e-07, "loss": 0.7747606, "num_input_tokens_seen": 268936600, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.09515381, "step": 12470, "time_per_iteration": 3.9472193717956543 }, { "auxiliary_loss_clip": 0.06425254, "auxiliary_loss_mlp": 0.01268686, "balance_loss_clip": 0.06275925, "balance_loss_mlp": 0.01257009, "epoch": 0.7497970840222457, "flos": 21624036929280.0, "grad_norm": 3.6252328807931384, "language_loss": 0.75433767, "learning_rate": 6.214015851881793e-07, "loss": 0.83127707, "num_input_tokens_seen": 268956560, "router_z_loss_clip": 1.49121094, "router_z_loss_mlp": 0.11669922, "step": 12471, "time_per_iteration": 2.587568521499634 }, { "auxiliary_loss_clip": 0.06422991, "auxiliary_loss_mlp": 0.01267728, "balance_loss_clip": 0.06280172, "balance_loss_mlp": 0.01258126, "epoch": 0.7498572072749136, "flos": 13740710371200.0, "grad_norm": 2.0275359941754396, "language_loss": 0.77163267, "learning_rate": 6.211194553838929e-07, "loss": 0.84853983, "num_input_tokens_seen": 268973945, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.09606934, "step": 12472, "time_per_iteration": 2.5453438758850098 }, { "auxiliary_loss_clip": 0.06416382, "auxiliary_loss_mlp": 0.01263252, "balance_loss_clip": 0.06277297, "balance_loss_mlp": 0.01254091, "epoch": 0.7499173305275816, "flos": 22973207045760.0, "grad_norm": 3.965783744813405, "language_loss": 0.84621739, "learning_rate": 6.208373778668951e-07, "loss": 0.92301381, "num_input_tokens_seen": 268993245, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09173584, "step": 12473, "time_per_iteration": 2.6196677684783936 }, { "auxiliary_loss_clip": 0.06421852, "auxiliary_loss_mlp": 0.01267867, "balance_loss_clip": 0.06277166, "balance_loss_mlp": 0.01256852, "epoch": 0.7499774537802495, "flos": 22745916046080.0, "grad_norm": 1.8474784780020344, "language_loss": 0.73918712, "learning_rate": 6.205553526478829e-07, "loss": 0.81608433, "num_input_tokens_seen": 269012125, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.11022949, "step": 12474, "time_per_iteration": 2.649477481842041 }, { "auxiliary_loss_clip": 0.06425723, "auxiliary_loss_mlp": 0.0126884, "balance_loss_clip": 0.06278797, "balance_loss_mlp": 0.01257706, "epoch": 0.7500375770329175, "flos": 18302494354560.0, "grad_norm": 1.7618684997093548, "language_loss": 0.74751383, "learning_rate": 6.202733797375492e-07, "loss": 0.82445943, "num_input_tokens_seen": 269030545, "router_z_loss_clip": 1.46972656, "router_z_loss_mlp": 0.11126709, "step": 12475, "time_per_iteration": 2.670360565185547 }, { "auxiliary_loss_clip": 0.06426585, "auxiliary_loss_mlp": 0.01270888, "balance_loss_clip": 0.06277543, "balance_loss_mlp": 0.01259897, "epoch": 0.7500977002855854, "flos": 19175684952960.0, "grad_norm": 2.0940520520377057, "language_loss": 0.80331612, "learning_rate": 6.199914591465878e-07, "loss": 0.88029087, "num_input_tokens_seen": 269048180, "router_z_loss_clip": 1.49023438, "router_z_loss_mlp": 0.10992432, "step": 12476, "time_per_iteration": 2.7510645389556885 }, { "auxiliary_loss_clip": 0.06418708, "auxiliary_loss_mlp": 0.0126551, "balance_loss_clip": 0.06276105, "balance_loss_mlp": 0.01255764, "epoch": 0.7501578235382534, "flos": 22170441404160.0, "grad_norm": 1.77073202243603, "language_loss": 0.77605975, "learning_rate": 6.19709590885688e-07, "loss": 0.85290194, "num_input_tokens_seen": 269068600, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09741211, "step": 12477, "time_per_iteration": 3.9709012508392334 }, { "auxiliary_loss_clip": 0.06317292, "auxiliary_loss_mlp": 0.01251228, "balance_loss_clip": 0.06260242, "balance_loss_mlp": 0.01250125, "epoch": 0.7502179467909214, "flos": 64481035783680.0, "grad_norm": 0.7850227505881942, "language_loss": 0.54408598, "learning_rate": 6.194277749655394e-07, "loss": 0.61977112, "num_input_tokens_seen": 269119045, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01104736, "step": 12478, "time_per_iteration": 3.1522181034088135 }, { "auxiliary_loss_clip": 0.06413282, "auxiliary_loss_mlp": 0.01267249, "balance_loss_clip": 0.06275617, "balance_loss_mlp": 0.01257558, "epoch": 0.7502780700435894, "flos": 20483332571520.0, "grad_norm": 1.7515319479331823, "language_loss": 0.80619693, "learning_rate": 6.191460113968272e-07, "loss": 0.88300216, "num_input_tokens_seen": 269136755, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09686279, "step": 12479, "time_per_iteration": 2.6631875038146973 }, { "auxiliary_loss_clip": 0.06426392, "auxiliary_loss_mlp": 0.01267238, "balance_loss_clip": 0.0627912, "balance_loss_mlp": 0.01256623, "epoch": 0.7503381932962573, "flos": 20450908241280.0, "grad_norm": 3.355896765394872, "language_loss": 0.62469739, "learning_rate": 6.188643001902369e-07, "loss": 0.70163369, "num_input_tokens_seen": 269156120, "router_z_loss_clip": 1.47363281, "router_z_loss_mlp": 0.10614014, "step": 12480, "time_per_iteration": 2.618129014968872 }, { "auxiliary_loss_clip": 0.06410791, "auxiliary_loss_mlp": 0.01265591, "balance_loss_clip": 0.06274463, "balance_loss_mlp": 0.01256418, "epoch": 0.7503983165489253, "flos": 22388382673920.0, "grad_norm": 1.5833682496198123, "language_loss": 0.78089565, "learning_rate": 6.185826413564512e-07, "loss": 0.85765946, "num_input_tokens_seen": 269175650, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.09173584, "step": 12481, "time_per_iteration": 2.55871319770813 }, { "auxiliary_loss_clip": 0.06419205, "auxiliary_loss_mlp": 0.01270339, "balance_loss_clip": 0.06275891, "balance_loss_mlp": 0.01259521, "epoch": 0.7504584398015932, "flos": 24906321066240.0, "grad_norm": 1.6272956980851232, "language_loss": 0.71509564, "learning_rate": 6.183010349061501e-07, "loss": 0.79199111, "num_input_tokens_seen": 269197080, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.10809326, "step": 12482, "time_per_iteration": 2.6025948524475098 }, { "auxiliary_loss_clip": 0.06417407, "auxiliary_loss_mlp": 0.01265683, "balance_loss_clip": 0.06276305, "balance_loss_mlp": 0.01255098, "epoch": 0.7505185630542612, "flos": 25892381514240.0, "grad_norm": 1.9150231528571031, "language_loss": 0.70564854, "learning_rate": 6.180194808500118e-07, "loss": 0.78247941, "num_input_tokens_seen": 269218600, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.10583496, "step": 12483, "time_per_iteration": 2.605135917663574 }, { "auxiliary_loss_clip": 0.06418757, "auxiliary_loss_mlp": 0.01266535, "balance_loss_clip": 0.06276945, "balance_loss_mlp": 0.01257684, "epoch": 0.7505786863069293, "flos": 23149709671680.0, "grad_norm": 1.6264222959784886, "language_loss": 0.74510348, "learning_rate": 6.177379791987131e-07, "loss": 0.8219564, "num_input_tokens_seen": 269239245, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.08856201, "step": 12484, "time_per_iteration": 2.5442540645599365 }, { "auxiliary_loss_clip": 0.06415094, "auxiliary_loss_mlp": 0.01265627, "balance_loss_clip": 0.06275363, "balance_loss_mlp": 0.01255566, "epoch": 0.7506388095595972, "flos": 16989144658560.0, "grad_norm": 2.0089990533315416, "language_loss": 0.85091174, "learning_rate": 6.174565299629295e-07, "loss": 0.92771894, "num_input_tokens_seen": 269258520, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.10070801, "step": 12485, "time_per_iteration": 2.5582096576690674 }, { "auxiliary_loss_clip": 0.0641554, "auxiliary_loss_mlp": 0.01263375, "balance_loss_clip": 0.06274719, "balance_loss_mlp": 0.01254398, "epoch": 0.7506989328122652, "flos": 22351346369280.0, "grad_norm": 1.4804790918723783, "language_loss": 0.78613693, "learning_rate": 6.171751331533323e-07, "loss": 0.86292607, "num_input_tokens_seen": 269278320, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.08978271, "step": 12486, "time_per_iteration": 2.547410249710083 }, { "auxiliary_loss_clip": 0.06421356, "auxiliary_loss_mlp": 0.0126395, "balance_loss_clip": 0.06277478, "balance_loss_mlp": 0.01254151, "epoch": 0.7507590560649331, "flos": 25783243171200.0, "grad_norm": 2.5075787853336773, "language_loss": 0.73159969, "learning_rate": 6.168937887805932e-07, "loss": 0.80845273, "num_input_tokens_seen": 269298025, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.09777832, "step": 12487, "time_per_iteration": 2.6225831508636475 }, { "auxiliary_loss_clip": 0.06415699, "auxiliary_loss_mlp": 0.0126284, "balance_loss_clip": 0.06271306, "balance_loss_mlp": 0.01253643, "epoch": 0.7508191793176011, "flos": 24286221325440.0, "grad_norm": 1.7943487234162012, "language_loss": 0.67702341, "learning_rate": 6.166124968553801e-07, "loss": 0.75380874, "num_input_tokens_seen": 269316770, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.09197998, "step": 12488, "time_per_iteration": 2.5986340045928955 }, { "auxiliary_loss_clip": 0.06413197, "auxiliary_loss_mlp": 0.01264582, "balance_loss_clip": 0.06273054, "balance_loss_mlp": 0.0125523, "epoch": 0.750879302570269, "flos": 19905384234240.0, "grad_norm": 1.882630773146164, "language_loss": 0.77284098, "learning_rate": 6.163312573883592e-07, "loss": 0.84961879, "num_input_tokens_seen": 269334755, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09356689, "step": 12489, "time_per_iteration": 2.6240487098693848 }, { "auxiliary_loss_clip": 0.06412821, "auxiliary_loss_mlp": 0.01264723, "balance_loss_clip": 0.06275193, "balance_loss_mlp": 0.01254947, "epoch": 0.750939425822937, "flos": 29213420964480.0, "grad_norm": 1.5965564234823317, "language_loss": 0.75182235, "learning_rate": 6.160500703901956e-07, "loss": 0.82859778, "num_input_tokens_seen": 269353810, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09777832, "step": 12490, "time_per_iteration": 2.6343064308166504 }, { "auxiliary_loss_clip": 0.06420797, "auxiliary_loss_mlp": 0.01266442, "balance_loss_clip": 0.06281167, "balance_loss_mlp": 0.01256357, "epoch": 0.750999549075605, "flos": 21148686316800.0, "grad_norm": 1.4765894065723526, "language_loss": 0.78699696, "learning_rate": 6.157689358715527e-07, "loss": 0.86386937, "num_input_tokens_seen": 269372910, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.10095215, "step": 12491, "time_per_iteration": 2.596560478210449 }, { "auxiliary_loss_clip": 0.06407578, "auxiliary_loss_mlp": 0.01266396, "balance_loss_clip": 0.06271362, "balance_loss_mlp": 0.01257282, "epoch": 0.751059672328273, "flos": 23554090275840.0, "grad_norm": 1.6615521332974248, "language_loss": 0.76402342, "learning_rate": 6.154878538430899e-07, "loss": 0.84076315, "num_input_tokens_seen": 269391545, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.09112549, "step": 12492, "time_per_iteration": 2.597545623779297 }, { "auxiliary_loss_clip": 0.06416883, "auxiliary_loss_mlp": 0.01267893, "balance_loss_clip": 0.06276111, "balance_loss_mlp": 0.01258272, "epoch": 0.7511197955809409, "flos": 18995786236800.0, "grad_norm": 1.9393764216215184, "language_loss": 0.7158643, "learning_rate": 6.152068243154671e-07, "loss": 0.79271203, "num_input_tokens_seen": 269408530, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09613037, "step": 12493, "time_per_iteration": 2.581571102142334 }, { "auxiliary_loss_clip": 0.06419924, "auxiliary_loss_mlp": 0.01268351, "balance_loss_clip": 0.06277714, "balance_loss_mlp": 0.01258064, "epoch": 0.7511799188336089, "flos": 22052246820480.0, "grad_norm": 1.6403964335624253, "language_loss": 0.80375427, "learning_rate": 6.149258472993395e-07, "loss": 0.88063705, "num_input_tokens_seen": 269425930, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.10290527, "step": 12494, "time_per_iteration": 2.5472354888916016 }, { "auxiliary_loss_clip": 0.06416639, "auxiliary_loss_mlp": 0.01266683, "balance_loss_clip": 0.06274421, "balance_loss_mlp": 0.01256014, "epoch": 0.7512400420862768, "flos": 16471894204800.0, "grad_norm": 2.6406617823188396, "language_loss": 0.79027319, "learning_rate": 6.146449228053634e-07, "loss": 0.86710644, "num_input_tokens_seen": 269443945, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.10668945, "step": 12495, "time_per_iteration": 3.9757535457611084 }, { "auxiliary_loss_clip": 0.06411789, "auxiliary_loss_mlp": 0.01263281, "balance_loss_clip": 0.0627313, "balance_loss_mlp": 0.01253894, "epoch": 0.7513001653389448, "flos": 20454472039680.0, "grad_norm": 1.8974788783292424, "language_loss": 0.71211165, "learning_rate": 6.143640508441898e-07, "loss": 0.78886235, "num_input_tokens_seen": 269463625, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09381104, "step": 12496, "time_per_iteration": 2.635751247406006 }, { "auxiliary_loss_clip": 0.06417964, "auxiliary_loss_mlp": 0.01262928, "balance_loss_clip": 0.06276831, "balance_loss_mlp": 0.01253594, "epoch": 0.7513602885916129, "flos": 23483497610880.0, "grad_norm": 1.751077970223294, "language_loss": 0.78477085, "learning_rate": 6.140832314264705e-07, "loss": 0.86157984, "num_input_tokens_seen": 269483415, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09332275, "step": 12497, "time_per_iteration": 2.58927059173584 }, { "auxiliary_loss_clip": 0.06416095, "auxiliary_loss_mlp": 0.01265322, "balance_loss_clip": 0.06274509, "balance_loss_mlp": 0.01255321, "epoch": 0.7514204118442808, "flos": 26804495134080.0, "grad_norm": 1.5551209774974337, "language_loss": 0.77391279, "learning_rate": 6.13802464562855e-07, "loss": 0.8507269, "num_input_tokens_seen": 269504635, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09997559, "step": 12498, "time_per_iteration": 2.6118907928466797 }, { "auxiliary_loss_clip": 0.0641293, "auxiliary_loss_mlp": 0.01262777, "balance_loss_clip": 0.06276249, "balance_loss_mlp": 0.01254253, "epoch": 0.7514805350969488, "flos": 19871869800960.0, "grad_norm": 1.634911036141691, "language_loss": 0.74013114, "learning_rate": 6.135217502639878e-07, "loss": 0.81688815, "num_input_tokens_seen": 269523955, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.08520508, "step": 12499, "time_per_iteration": 2.608484983444214 }, { "auxiliary_loss_clip": 0.06412161, "auxiliary_loss_mlp": 0.01263553, "balance_loss_clip": 0.06273136, "balance_loss_mlp": 0.01254743, "epoch": 0.7515406583496167, "flos": 24578444839680.0, "grad_norm": 1.9176814076782487, "language_loss": 0.79541427, "learning_rate": 6.132410885405148e-07, "loss": 0.8721714, "num_input_tokens_seen": 269544410, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.08813477, "step": 12500, "time_per_iteration": 4.007875204086304 }, { "auxiliary_loss_clip": 0.06432711, "auxiliary_loss_mlp": 0.0126513, "balance_loss_clip": 0.06281064, "balance_loss_mlp": 0.01253602, "epoch": 0.7516007816022847, "flos": 20126386177920.0, "grad_norm": 1.9599450190305794, "language_loss": 0.74064952, "learning_rate": 6.129604794030794e-07, "loss": 0.81762791, "num_input_tokens_seen": 269563315, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.11523438, "step": 12501, "time_per_iteration": 2.576622724533081 }, { "auxiliary_loss_clip": 0.0641559, "auxiliary_loss_mlp": 0.01266769, "balance_loss_clip": 0.06275262, "balance_loss_mlp": 0.0125722, "epoch": 0.7516609048549526, "flos": 22791379685760.0, "grad_norm": 1.6045837072132245, "language_loss": 0.78976476, "learning_rate": 6.126799228623207e-07, "loss": 0.86658829, "num_input_tokens_seen": 269583950, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09552002, "step": 12502, "time_per_iteration": 2.570216417312622 }, { "auxiliary_loss_clip": 0.06419889, "auxiliary_loss_mlp": 0.0126378, "balance_loss_clip": 0.06276768, "balance_loss_mlp": 0.01253576, "epoch": 0.7517210281076206, "flos": 10638576512640.0, "grad_norm": 2.142817329391781, "language_loss": 0.7070781, "learning_rate": 6.123994189288786e-07, "loss": 0.7839148, "num_input_tokens_seen": 269600120, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10205078, "step": 12503, "time_per_iteration": 2.528627634048462 }, { "auxiliary_loss_clip": 0.06320326, "auxiliary_loss_mlp": 0.01252033, "balance_loss_clip": 0.06263104, "balance_loss_mlp": 0.01250826, "epoch": 0.7517811513602886, "flos": 66071542458240.0, "grad_norm": 0.9737488707201916, "language_loss": 0.63860244, "learning_rate": 6.121189676133903e-07, "loss": 0.71432602, "num_input_tokens_seen": 269659815, "router_z_loss_clip": 0.57275391, "router_z_loss_mlp": 0.01205444, "step": 12504, "time_per_iteration": 3.0716543197631836 }, { "auxiliary_loss_clip": 0.06409764, "auxiliary_loss_mlp": 0.01267217, "balance_loss_clip": 0.06272906, "balance_loss_mlp": 0.01258681, "epoch": 0.7518412746129566, "flos": 37277317071360.0, "grad_norm": 1.6419190141973468, "language_loss": 0.68907785, "learning_rate": 6.118385689264896e-07, "loss": 0.76584768, "num_input_tokens_seen": 269684565, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.08532715, "step": 12505, "time_per_iteration": 2.7584006786346436 }, { "auxiliary_loss_clip": 0.06319496, "auxiliary_loss_mlp": 0.01253738, "balance_loss_clip": 0.06262085, "balance_loss_mlp": 0.01252482, "epoch": 0.7519013978656245, "flos": 60539001396480.0, "grad_norm": 0.6277300681439374, "language_loss": 0.54949802, "learning_rate": 6.11558222878809e-07, "loss": 0.62523031, "num_input_tokens_seen": 269752325, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 0.01255798, "step": 12506, "time_per_iteration": 3.319270133972168 }, { "auxiliary_loss_clip": 0.06425188, "auxiliary_loss_mlp": 0.01265226, "balance_loss_clip": 0.06281389, "balance_loss_mlp": 0.01255225, "epoch": 0.7519615211182925, "flos": 18812826846720.0, "grad_norm": 2.4249037445373283, "language_loss": 0.78475296, "learning_rate": 6.112779294809796e-07, "loss": 0.86165714, "num_input_tokens_seen": 269770630, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.09997559, "step": 12507, "time_per_iteration": 2.566612958908081 }, { "auxiliary_loss_clip": 0.06411098, "auxiliary_loss_mlp": 0.01266294, "balance_loss_clip": 0.0627308, "balance_loss_mlp": 0.01256966, "epoch": 0.7520216443709604, "flos": 14580596171520.0, "grad_norm": 1.6028459341897656, "language_loss": 0.71456468, "learning_rate": 6.10997688743631e-07, "loss": 0.79133868, "num_input_tokens_seen": 269787280, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09320068, "step": 12508, "time_per_iteration": 2.546268939971924 }, { "auxiliary_loss_clip": 0.06411768, "auxiliary_loss_mlp": 0.01265967, "balance_loss_clip": 0.06271546, "balance_loss_mlp": 0.01256895, "epoch": 0.7520817676236284, "flos": 17062420654080.0, "grad_norm": 1.6683451287968536, "language_loss": 0.72358197, "learning_rate": 6.107175006773885e-07, "loss": 0.80035931, "num_input_tokens_seen": 269805205, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09075928, "step": 12509, "time_per_iteration": 2.5546188354492188 }, { "auxiliary_loss_clip": 0.06426804, "auxiliary_loss_mlp": 0.01270924, "balance_loss_clip": 0.06279095, "balance_loss_mlp": 0.01259909, "epoch": 0.7521418908762965, "flos": 25673517849600.0, "grad_norm": 1.5823250069197623, "language_loss": 0.62632293, "learning_rate": 6.104373652928785e-07, "loss": 0.70330024, "num_input_tokens_seen": 269824820, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.11029053, "step": 12510, "time_per_iteration": 4.072866439819336 }, { "auxiliary_loss_clip": 0.06413922, "auxiliary_loss_mlp": 0.01265376, "balance_loss_clip": 0.06276827, "balance_loss_mlp": 0.01255917, "epoch": 0.7522020141289644, "flos": 20893079836800.0, "grad_norm": 1.6151075658641054, "language_loss": 0.8167429, "learning_rate": 6.10157282600722e-07, "loss": 0.89353597, "num_input_tokens_seen": 269842825, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09460449, "step": 12511, "time_per_iteration": 2.570094347000122 }, { "auxiliary_loss_clip": 0.06422184, "auxiliary_loss_mlp": 0.01267687, "balance_loss_clip": 0.06275574, "balance_loss_mlp": 0.01256737, "epoch": 0.7522621373816324, "flos": 12645134236800.0, "grad_norm": 2.1302574418646265, "language_loss": 0.76608056, "learning_rate": 6.098772526115412e-07, "loss": 0.84297919, "num_input_tokens_seen": 269859000, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.10943604, "step": 12512, "time_per_iteration": 2.5441811084747314 }, { "auxiliary_loss_clip": 0.06408872, "auxiliary_loss_mlp": 0.01264004, "balance_loss_clip": 0.06274396, "balance_loss_mlp": 0.01254902, "epoch": 0.7523222606343003, "flos": 25632624257280.0, "grad_norm": 1.6091722062647753, "language_loss": 0.82550418, "learning_rate": 6.095972753359537e-07, "loss": 0.90223289, "num_input_tokens_seen": 269878895, "router_z_loss_clip": 1.34667969, "router_z_loss_mlp": 0.09106445, "step": 12513, "time_per_iteration": 2.593069314956665 }, { "auxiliary_loss_clip": 0.06422614, "auxiliary_loss_mlp": 0.01265418, "balance_loss_clip": 0.06277306, "balance_loss_mlp": 0.01255285, "epoch": 0.7523823838869683, "flos": 20455142872320.0, "grad_norm": 1.922571006691088, "language_loss": 0.75112611, "learning_rate": 6.093173507845771e-07, "loss": 0.82800645, "num_input_tokens_seen": 269897280, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.10125732, "step": 12514, "time_per_iteration": 2.552924156188965 }, { "auxiliary_loss_clip": 0.06408775, "auxiliary_loss_mlp": 0.0126293, "balance_loss_clip": 0.06272336, "balance_loss_mlp": 0.01254764, "epoch": 0.7524425071396362, "flos": 14725890351360.0, "grad_norm": 1.7931151081211456, "language_loss": 0.68957508, "learning_rate": 6.090374789680271e-07, "loss": 0.7662921, "num_input_tokens_seen": 269914640, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.08166504, "step": 12515, "time_per_iteration": 2.5177834033966064 }, { "auxiliary_loss_clip": 0.06418663, "auxiliary_loss_mlp": 0.01267542, "balance_loss_clip": 0.06276499, "balance_loss_mlp": 0.01258065, "epoch": 0.7525026303923043, "flos": 30600004728960.0, "grad_norm": 1.5507278851634256, "language_loss": 0.70620215, "learning_rate": 6.087576598969137e-07, "loss": 0.78306419, "num_input_tokens_seen": 269934960, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09472656, "step": 12516, "time_per_iteration": 4.093014240264893 }, { "auxiliary_loss_clip": 0.06412437, "auxiliary_loss_mlp": 0.01265889, "balance_loss_clip": 0.06275612, "balance_loss_mlp": 0.0125714, "epoch": 0.7525627536449722, "flos": 24798901731840.0, "grad_norm": 1.5088080337536602, "language_loss": 0.89743447, "learning_rate": 6.084778935818495e-07, "loss": 0.97421771, "num_input_tokens_seen": 269956655, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.08752441, "step": 12517, "time_per_iteration": 2.6330244541168213 }, { "auxiliary_loss_clip": 0.06425385, "auxiliary_loss_mlp": 0.01264898, "balance_loss_clip": 0.06280257, "balance_loss_mlp": 0.01254998, "epoch": 0.7526228768976402, "flos": 20786499043200.0, "grad_norm": 1.7060170487886428, "language_loss": 0.74777377, "learning_rate": 6.081981800334437e-07, "loss": 0.82467657, "num_input_tokens_seen": 269976835, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.09906006, "step": 12518, "time_per_iteration": 2.591549873352051 }, { "auxiliary_loss_clip": 0.06313981, "auxiliary_loss_mlp": 0.01250605, "balance_loss_clip": 0.06257011, "balance_loss_mlp": 0.01249496, "epoch": 0.7526830001503081, "flos": 66578017662720.0, "grad_norm": 0.6915496902725843, "language_loss": 0.55491614, "learning_rate": 6.079185192623017e-07, "loss": 0.63056195, "num_input_tokens_seen": 270040630, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.0111084, "step": 12519, "time_per_iteration": 3.239090919494629 }, { "auxiliary_loss_clip": 0.06415042, "auxiliary_loss_mlp": 0.01264667, "balance_loss_clip": 0.06274414, "balance_loss_mlp": 0.01255386, "epoch": 0.7527431234029761, "flos": 23484755422080.0, "grad_norm": 1.4602613065743097, "language_loss": 0.78215313, "learning_rate": 6.07638911279029e-07, "loss": 0.8589502, "num_input_tokens_seen": 270059695, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09289551, "step": 12520, "time_per_iteration": 2.5873355865478516 }, { "auxiliary_loss_clip": 0.06416484, "auxiliary_loss_mlp": 0.01265104, "balance_loss_clip": 0.06277177, "balance_loss_mlp": 0.01255371, "epoch": 0.752803246655644, "flos": 22055265567360.0, "grad_norm": 2.099033675681221, "language_loss": 0.73960942, "learning_rate": 6.07359356094229e-07, "loss": 0.81642532, "num_input_tokens_seen": 270078420, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.097229, "step": 12521, "time_per_iteration": 2.5928611755371094 }, { "auxiliary_loss_clip": 0.0642871, "auxiliary_loss_mlp": 0.01267429, "balance_loss_clip": 0.06280392, "balance_loss_mlp": 0.01256187, "epoch": 0.752863369908312, "flos": 30161606567040.0, "grad_norm": 1.9757721923976064, "language_loss": 0.67515379, "learning_rate": 6.070798537185016e-07, "loss": 0.75211513, "num_input_tokens_seen": 270097040, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.11236572, "step": 12522, "time_per_iteration": 2.701714277267456 }, { "auxiliary_loss_clip": 0.06423192, "auxiliary_loss_mlp": 0.01270813, "balance_loss_clip": 0.06278672, "balance_loss_mlp": 0.01260883, "epoch": 0.7529234931609801, "flos": 24573874792320.0, "grad_norm": 1.650772652919492, "language_loss": 0.78413248, "learning_rate": 6.068004041624453e-07, "loss": 0.86107254, "num_input_tokens_seen": 270116365, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.09936523, "step": 12523, "time_per_iteration": 2.6767454147338867 }, { "auxiliary_loss_clip": 0.06412579, "auxiliary_loss_mlp": 0.01266085, "balance_loss_clip": 0.06273553, "balance_loss_mlp": 0.01256185, "epoch": 0.752983616413648, "flos": 23119088204160.0, "grad_norm": 1.8487108729325246, "language_loss": 0.80766892, "learning_rate": 6.065210074366571e-07, "loss": 0.88445556, "num_input_tokens_seen": 270135395, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09893799, "step": 12524, "time_per_iteration": 2.55692458152771 }, { "auxiliary_loss_clip": 0.06415547, "auxiliary_loss_mlp": 0.01269742, "balance_loss_clip": 0.06276383, "balance_loss_mlp": 0.0126045, "epoch": 0.753043739666316, "flos": 24323928462720.0, "grad_norm": 1.7270423315086283, "language_loss": 0.74248409, "learning_rate": 6.062416635517326e-07, "loss": 0.81933695, "num_input_tokens_seen": 270156425, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09289551, "step": 12525, "time_per_iteration": 2.59102725982666 }, { "auxiliary_loss_clip": 0.06412999, "auxiliary_loss_mlp": 0.01264911, "balance_loss_clip": 0.06275217, "balance_loss_mlp": 0.0125569, "epoch": 0.7531038629189839, "flos": 24250149342720.0, "grad_norm": 2.074847612344655, "language_loss": 0.72616303, "learning_rate": 6.059623725182641e-07, "loss": 0.80294216, "num_input_tokens_seen": 270176905, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09222412, "step": 12526, "time_per_iteration": 2.581191301345825 }, { "auxiliary_loss_clip": 0.06414688, "auxiliary_loss_mlp": 0.01263053, "balance_loss_clip": 0.06275683, "balance_loss_mlp": 0.01253952, "epoch": 0.7531639861716519, "flos": 30196378811520.0, "grad_norm": 2.0946140324101754, "language_loss": 0.7238701, "learning_rate": 6.056831343468414e-07, "loss": 0.8006475, "num_input_tokens_seen": 270196640, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09100342, "step": 12527, "time_per_iteration": 2.6069207191467285 }, { "auxiliary_loss_clip": 0.06412399, "auxiliary_loss_mlp": 0.01268859, "balance_loss_clip": 0.06273612, "balance_loss_mlp": 0.01259519, "epoch": 0.7532241094243198, "flos": 18229050650880.0, "grad_norm": 2.0914861432778618, "language_loss": 0.80888748, "learning_rate": 6.054039490480539e-07, "loss": 0.88569999, "num_input_tokens_seen": 270213905, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09344482, "step": 12528, "time_per_iteration": 2.513190269470215 }, { "auxiliary_loss_clip": 0.06418522, "auxiliary_loss_mlp": 0.01263653, "balance_loss_clip": 0.06274994, "balance_loss_mlp": 0.01253282, "epoch": 0.7532842326769879, "flos": 20886413437440.0, "grad_norm": 1.8054113378386163, "language_loss": 0.85326755, "learning_rate": 6.051248166324892e-07, "loss": 0.93008935, "num_input_tokens_seen": 270231995, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.10375977, "step": 12529, "time_per_iteration": 2.585261821746826 }, { "auxiliary_loss_clip": 0.06422807, "auxiliary_loss_mlp": 0.01265105, "balance_loss_clip": 0.06276299, "balance_loss_mlp": 0.01255122, "epoch": 0.7533443559296558, "flos": 18084762720000.0, "grad_norm": 1.8344182568451697, "language_loss": 0.7413637, "learning_rate": 6.048457371107303e-07, "loss": 0.81824279, "num_input_tokens_seen": 270251480, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.09979248, "step": 12530, "time_per_iteration": 2.5601353645324707 }, { "auxiliary_loss_clip": 0.06316496, "auxiliary_loss_mlp": 0.01250118, "balance_loss_clip": 0.06259719, "balance_loss_mlp": 0.01249089, "epoch": 0.7534044791823238, "flos": 50271668398080.0, "grad_norm": 0.8119522355365616, "language_loss": 0.63735878, "learning_rate": 6.045667104933612e-07, "loss": 0.71302491, "num_input_tokens_seen": 270306480, "router_z_loss_clip": 0.56835938, "router_z_loss_mlp": 0.01029205, "step": 12531, "time_per_iteration": 3.0187389850616455 }, { "auxiliary_loss_clip": 0.06420399, "auxiliary_loss_mlp": 0.01264609, "balance_loss_clip": 0.06275629, "balance_loss_mlp": 0.01254542, "epoch": 0.7534646024349917, "flos": 20856588583680.0, "grad_norm": 1.9746699513849184, "language_loss": 0.70174444, "learning_rate": 6.042877367909633e-07, "loss": 0.77859449, "num_input_tokens_seen": 270324595, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10058594, "step": 12532, "time_per_iteration": 2.5810844898223877 }, { "auxiliary_loss_clip": 0.06411978, "auxiliary_loss_mlp": 0.01266623, "balance_loss_clip": 0.06276006, "balance_loss_mlp": 0.01257557, "epoch": 0.7535247256876597, "flos": 23077775341440.0, "grad_norm": 1.6483260846504941, "language_loss": 0.77688956, "learning_rate": 6.040088160141132e-07, "loss": 0.85367548, "num_input_tokens_seen": 270344375, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.09075928, "step": 12533, "time_per_iteration": 2.56437611579895 }, { "auxiliary_loss_clip": 0.06321092, "auxiliary_loss_mlp": 0.01249655, "balance_loss_clip": 0.06264377, "balance_loss_mlp": 0.01248531, "epoch": 0.7535848489403276, "flos": 58643888002560.0, "grad_norm": 0.7751134976808849, "language_loss": 0.573421, "learning_rate": 6.037299481733886e-07, "loss": 0.6491285, "num_input_tokens_seen": 270405235, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01126099, "step": 12534, "time_per_iteration": 4.666095972061157 }, { "auxiliary_loss_clip": 0.06417258, "auxiliary_loss_mlp": 0.01268663, "balance_loss_clip": 0.06275925, "balance_loss_mlp": 0.01258846, "epoch": 0.7536449721929956, "flos": 26585044490880.0, "grad_norm": 1.3932531001418549, "language_loss": 0.71546233, "learning_rate": 6.03451133279365e-07, "loss": 0.79232156, "num_input_tokens_seen": 270425820, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09820557, "step": 12535, "time_per_iteration": 2.617175340652466 }, { "auxiliary_loss_clip": 0.06421055, "auxiliary_loss_mlp": 0.01265787, "balance_loss_clip": 0.06275879, "balance_loss_mlp": 0.01256018, "epoch": 0.7537050954456637, "flos": 25742559214080.0, "grad_norm": 1.5919404100693002, "language_loss": 0.80574214, "learning_rate": 6.031723713426135e-07, "loss": 0.88261056, "num_input_tokens_seen": 270447120, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.09765625, "step": 12536, "time_per_iteration": 2.5941548347473145 }, { "auxiliary_loss_clip": 0.06410874, "auxiliary_loss_mlp": 0.01265362, "balance_loss_clip": 0.06272522, "balance_loss_mlp": 0.01255635, "epoch": 0.7537652186983316, "flos": 30231863815680.0, "grad_norm": 2.3237525561912724, "language_loss": 0.75038332, "learning_rate": 6.028936623737067e-07, "loss": 0.8271457, "num_input_tokens_seen": 270468680, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09735107, "step": 12537, "time_per_iteration": 2.642709255218506 }, { "auxiliary_loss_clip": 0.06425163, "auxiliary_loss_mlp": 0.0126793, "balance_loss_clip": 0.06282024, "balance_loss_mlp": 0.01258244, "epoch": 0.7538253419509996, "flos": 12646224339840.0, "grad_norm": 1.8056692748954881, "language_loss": 0.74371839, "learning_rate": 6.026150063832111e-07, "loss": 0.82064933, "num_input_tokens_seen": 270486310, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.09692383, "step": 12538, "time_per_iteration": 2.5239756107330322 }, { "auxiliary_loss_clip": 0.06419498, "auxiliary_loss_mlp": 0.01267784, "balance_loss_clip": 0.06277985, "balance_loss_mlp": 0.01257073, "epoch": 0.7538854652036675, "flos": 23192783470080.0, "grad_norm": 1.5391260533509858, "language_loss": 0.67724705, "learning_rate": 6.023364033816956e-07, "loss": 0.75411987, "num_input_tokens_seen": 270507210, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.1071167, "step": 12539, "time_per_iteration": 2.596665620803833 }, { "auxiliary_loss_clip": 0.06415444, "auxiliary_loss_mlp": 0.01262845, "balance_loss_clip": 0.06278075, "balance_loss_mlp": 0.01253797, "epoch": 0.7539455884563355, "flos": 23193076959360.0, "grad_norm": 1.7469839582865279, "language_loss": 0.7501151, "learning_rate": 6.020578533797229e-07, "loss": 0.82689798, "num_input_tokens_seen": 270525250, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09051514, "step": 12540, "time_per_iteration": 4.001197814941406 }, { "auxiliary_loss_clip": 0.0641821, "auxiliary_loss_mlp": 0.01263743, "balance_loss_clip": 0.06274404, "balance_loss_mlp": 0.01253473, "epoch": 0.7540057117090034, "flos": 13184998093440.0, "grad_norm": 2.077562434520207, "language_loss": 0.73090655, "learning_rate": 6.017793563878566e-07, "loss": 0.80772609, "num_input_tokens_seen": 270539295, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10272217, "step": 12541, "time_per_iteration": 2.5219287872314453 }, { "auxiliary_loss_clip": 0.06417067, "auxiliary_loss_mlp": 0.01266788, "balance_loss_clip": 0.06275181, "balance_loss_mlp": 0.01257126, "epoch": 0.7540658349616715, "flos": 45488561783040.0, "grad_norm": 1.5158024101475684, "language_loss": 0.72285414, "learning_rate": 6.015009124166576e-07, "loss": 0.79969263, "num_input_tokens_seen": 270562815, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09661865, "step": 12542, "time_per_iteration": 2.7765090465545654 }, { "auxiliary_loss_clip": 0.0641018, "auxiliary_loss_mlp": 0.01264643, "balance_loss_clip": 0.06272432, "balance_loss_mlp": 0.01254964, "epoch": 0.7541259582143394, "flos": 19935754139520.0, "grad_norm": 2.256115856298067, "language_loss": 0.84772962, "learning_rate": 6.012225214766844e-07, "loss": 0.92447782, "num_input_tokens_seen": 270579055, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09680176, "step": 12543, "time_per_iteration": 2.5457584857940674 }, { "auxiliary_loss_clip": 0.06416073, "auxiliary_loss_mlp": 0.01264428, "balance_loss_clip": 0.06277815, "balance_loss_mlp": 0.01255034, "epoch": 0.7541860814670074, "flos": 27205521575040.0, "grad_norm": 2.0163659984420326, "language_loss": 0.73467195, "learning_rate": 6.009441835784927e-07, "loss": 0.81147695, "num_input_tokens_seen": 270599080, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09381104, "step": 12544, "time_per_iteration": 2.604526996612549 }, { "auxiliary_loss_clip": 0.06416294, "auxiliary_loss_mlp": 0.01265705, "balance_loss_clip": 0.06276621, "balance_loss_mlp": 0.01256145, "epoch": 0.7542462047196753, "flos": 21330471749760.0, "grad_norm": 2.356781919606349, "language_loss": 0.68175614, "learning_rate": 6.006658987326383e-07, "loss": 0.75857615, "num_input_tokens_seen": 270618715, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09564209, "step": 12545, "time_per_iteration": 2.5649731159210205 }, { "auxiliary_loss_clip": 0.06420074, "auxiliary_loss_mlp": 0.01263439, "balance_loss_clip": 0.06277372, "balance_loss_mlp": 0.01254368, "epoch": 0.7543063279723433, "flos": 11944630903680.0, "grad_norm": 1.8275104205748962, "language_loss": 0.69031531, "learning_rate": 6.003876669496728e-07, "loss": 0.76715046, "num_input_tokens_seen": 270635695, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09063721, "step": 12546, "time_per_iteration": 2.533332586288452 }, { "auxiliary_loss_clip": 0.0642417, "auxiliary_loss_mlp": 0.01271616, "balance_loss_clip": 0.06278761, "balance_loss_mlp": 0.01260309, "epoch": 0.7543664512250112, "flos": 22826529273600.0, "grad_norm": 2.093791309150485, "language_loss": 0.73228592, "learning_rate": 6.00109488240147e-07, "loss": 0.8092438, "num_input_tokens_seen": 270654325, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.11315918, "step": 12547, "time_per_iteration": 2.542019844055176 }, { "auxiliary_loss_clip": 0.06419422, "auxiliary_loss_mlp": 0.01267688, "balance_loss_clip": 0.06278479, "balance_loss_mlp": 0.01257514, "epoch": 0.7544265744776792, "flos": 20930283849600.0, "grad_norm": 1.779579943394838, "language_loss": 0.68174851, "learning_rate": 5.998313626146099e-07, "loss": 0.75861961, "num_input_tokens_seen": 270674260, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.10174561, "step": 12548, "time_per_iteration": 2.568249225616455 }, { "auxiliary_loss_clip": 0.06419081, "auxiliary_loss_mlp": 0.01266099, "balance_loss_clip": 0.06274945, "balance_loss_mlp": 0.01255692, "epoch": 0.7544866977303473, "flos": 15200947474560.0, "grad_norm": 4.754676651628715, "language_loss": 0.87398225, "learning_rate": 5.995532900836088e-07, "loss": 0.9508341, "num_input_tokens_seen": 270692200, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10406494, "step": 12549, "time_per_iteration": 3.973172426223755 }, { "auxiliary_loss_clip": 0.06408979, "auxiliary_loss_mlp": 0.01265333, "balance_loss_clip": 0.06273816, "balance_loss_mlp": 0.0125517, "epoch": 0.7545468209830152, "flos": 27090094176000.0, "grad_norm": 1.6414337200026368, "language_loss": 0.77476931, "learning_rate": 5.992752706576865e-07, "loss": 0.85151243, "num_input_tokens_seen": 270709675, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.1015625, "step": 12550, "time_per_iteration": 2.6000702381134033 }, { "auxiliary_loss_clip": 0.06420661, "auxiliary_loss_mlp": 0.01266974, "balance_loss_clip": 0.06277819, "balance_loss_mlp": 0.0125764, "epoch": 0.7546069442356832, "flos": 26879238576000.0, "grad_norm": 1.4456910815234503, "language_loss": 0.69770658, "learning_rate": 5.98997304347386e-07, "loss": 0.77458286, "num_input_tokens_seen": 270733055, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.09332275, "step": 12551, "time_per_iteration": 2.6231539249420166 }, { "auxiliary_loss_clip": 0.06417455, "auxiliary_loss_mlp": 0.01266593, "balance_loss_clip": 0.06276573, "balance_loss_mlp": 0.01256662, "epoch": 0.7546670674883511, "flos": 15748735541760.0, "grad_norm": 2.807408918743234, "language_loss": 0.86535001, "learning_rate": 5.987193911632487e-07, "loss": 0.94219053, "num_input_tokens_seen": 270749275, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.0993042, "step": 12552, "time_per_iteration": 2.514768600463867 }, { "auxiliary_loss_clip": 0.06420481, "auxiliary_loss_mlp": 0.01267422, "balance_loss_clip": 0.06278267, "balance_loss_mlp": 0.01257927, "epoch": 0.7547271907410191, "flos": 23484545786880.0, "grad_norm": 1.8324774799711019, "language_loss": 0.78500175, "learning_rate": 5.98441531115812e-07, "loss": 0.86188078, "num_input_tokens_seen": 270768230, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09490967, "step": 12553, "time_per_iteration": 2.5836827754974365 }, { "auxiliary_loss_clip": 0.06419539, "auxiliary_loss_mlp": 0.01263399, "balance_loss_clip": 0.06278122, "balance_loss_mlp": 0.01253701, "epoch": 0.754787313993687, "flos": 31730898159360.0, "grad_norm": 2.156402442219142, "language_loss": 0.63583648, "learning_rate": 5.981637242156135e-07, "loss": 0.7126658, "num_input_tokens_seen": 270786285, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09698486, "step": 12554, "time_per_iteration": 2.636873245239258 }, { "auxiliary_loss_clip": 0.0641543, "auxiliary_loss_mlp": 0.01265205, "balance_loss_clip": 0.0627574, "balance_loss_mlp": 0.01255793, "epoch": 0.7548474372463551, "flos": 27570392179200.0, "grad_norm": 1.6172639336585464, "language_loss": 0.73551583, "learning_rate": 5.978859704731864e-07, "loss": 0.81232214, "num_input_tokens_seen": 270805505, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09411621, "step": 12555, "time_per_iteration": 2.604004383087158 }, { "auxiliary_loss_clip": 0.06424394, "auxiliary_loss_mlp": 0.01264727, "balance_loss_clip": 0.06280317, "balance_loss_mlp": 0.01254534, "epoch": 0.754907560499023, "flos": 19324752566400.0, "grad_norm": 1.7904894915336378, "language_loss": 0.78601289, "learning_rate": 5.976082698990645e-07, "loss": 0.86290407, "num_input_tokens_seen": 270824610, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10186768, "step": 12556, "time_per_iteration": 3.954501152038574 }, { "auxiliary_loss_clip": 0.06321601, "auxiliary_loss_mlp": 0.01249914, "balance_loss_clip": 0.06264603, "balance_loss_mlp": 0.01248819, "epoch": 0.754967683751691, "flos": 69765795993600.0, "grad_norm": 0.6901457122748809, "language_loss": 0.5036217, "learning_rate": 5.973306225037769e-07, "loss": 0.57933688, "num_input_tokens_seen": 270886155, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01097107, "step": 12557, "time_per_iteration": 3.153017044067383 }, { "auxiliary_loss_clip": 0.06420904, "auxiliary_loss_mlp": 0.0126436, "balance_loss_clip": 0.06277552, "balance_loss_mlp": 0.01254233, "epoch": 0.7550278070043589, "flos": 24428161342080.0, "grad_norm": 1.7505565762282636, "language_loss": 0.71750939, "learning_rate": 5.970530282978525e-07, "loss": 0.79436201, "num_input_tokens_seen": 270905325, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.10125732, "step": 12558, "time_per_iteration": 2.5966482162475586 }, { "auxiliary_loss_clip": 0.06413867, "auxiliary_loss_mlp": 0.01264568, "balance_loss_clip": 0.06272079, "balance_loss_mlp": 0.01255598, "epoch": 0.7550879302570269, "flos": 32642802144000.0, "grad_norm": 1.7348486018749443, "language_loss": 0.80838192, "learning_rate": 5.967754872918187e-07, "loss": 0.88516629, "num_input_tokens_seen": 270927535, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.08966064, "step": 12559, "time_per_iteration": 2.6336865425109863 }, { "auxiliary_loss_clip": 0.06418956, "auxiliary_loss_mlp": 0.01266942, "balance_loss_clip": 0.06274022, "balance_loss_mlp": 0.01256786, "epoch": 0.7551480535096948, "flos": 21801461950080.0, "grad_norm": 1.9453362933194296, "language_loss": 0.78937161, "learning_rate": 5.96497999496199e-07, "loss": 0.86623061, "num_input_tokens_seen": 270946920, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10150146, "step": 12560, "time_per_iteration": 2.606966257095337 }, { "auxiliary_loss_clip": 0.06414136, "auxiliary_loss_mlp": 0.01266159, "balance_loss_clip": 0.0627522, "balance_loss_mlp": 0.01256646, "epoch": 0.7552081767623628, "flos": 18521022602880.0, "grad_norm": 1.6431191383249213, "language_loss": 0.71229196, "learning_rate": 5.96220564921515e-07, "loss": 0.78909492, "num_input_tokens_seen": 270965705, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09515381, "step": 12561, "time_per_iteration": 2.536505937576294 }, { "auxiliary_loss_clip": 0.06414644, "auxiliary_loss_mlp": 0.01268094, "balance_loss_clip": 0.06272289, "balance_loss_mlp": 0.01258248, "epoch": 0.7552683000150308, "flos": 27641949166080.0, "grad_norm": 1.6469388864756844, "language_loss": 0.75807554, "learning_rate": 5.959431835782889e-07, "loss": 0.83490288, "num_input_tokens_seen": 270986550, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09844971, "step": 12562, "time_per_iteration": 2.6162312030792236 }, { "auxiliary_loss_clip": 0.06413773, "auxiliary_loss_mlp": 0.01267981, "balance_loss_clip": 0.06274594, "balance_loss_mlp": 0.01257777, "epoch": 0.7553284232676988, "flos": 20309135932800.0, "grad_norm": 3.1193560737322845, "language_loss": 0.75973767, "learning_rate": 5.956658554770371e-07, "loss": 0.83655512, "num_input_tokens_seen": 271006250, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.10211182, "step": 12563, "time_per_iteration": 2.5811545848846436 }, { "auxiliary_loss_clip": 0.0643141, "auxiliary_loss_mlp": 0.01265946, "balance_loss_clip": 0.06280212, "balance_loss_mlp": 0.01253667, "epoch": 0.7553885465203668, "flos": 33263866206720.0, "grad_norm": 2.184190458966677, "language_loss": 0.67672908, "learning_rate": 5.953885806282768e-07, "loss": 0.75370264, "num_input_tokens_seen": 271025575, "router_z_loss_clip": 1.51269531, "router_z_loss_mlp": 0.12286377, "step": 12564, "time_per_iteration": 2.697533369064331 }, { "auxiliary_loss_clip": 0.06421477, "auxiliary_loss_mlp": 0.01265756, "balance_loss_clip": 0.06276716, "balance_loss_mlp": 0.01255063, "epoch": 0.7554486697730347, "flos": 21622653336960.0, "grad_norm": 2.471916415963898, "language_loss": 0.68733364, "learning_rate": 5.951113590425228e-07, "loss": 0.76420593, "num_input_tokens_seen": 271045805, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.10693359, "step": 12565, "time_per_iteration": 2.574742317199707 }, { "auxiliary_loss_clip": 0.06424801, "auxiliary_loss_mlp": 0.01267483, "balance_loss_clip": 0.0627545, "balance_loss_mlp": 0.01256605, "epoch": 0.7555087930257027, "flos": 27639810887040.0, "grad_norm": 1.4927923749254335, "language_loss": 0.75359046, "learning_rate": 5.94834190730287e-07, "loss": 0.8305133, "num_input_tokens_seen": 271066065, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.10888672, "step": 12566, "time_per_iteration": 2.5953423976898193 }, { "auxiliary_loss_clip": 0.06427621, "auxiliary_loss_mlp": 0.01267534, "balance_loss_clip": 0.06280214, "balance_loss_mlp": 0.01256728, "epoch": 0.7555689162783706, "flos": 23628162885120.0, "grad_norm": 2.4617777492615125, "language_loss": 0.74183416, "learning_rate": 5.945570757020789e-07, "loss": 0.81878573, "num_input_tokens_seen": 271085870, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.10821533, "step": 12567, "time_per_iteration": 2.5681350231170654 }, { "auxiliary_loss_clip": 0.06418313, "auxiliary_loss_mlp": 0.01264773, "balance_loss_clip": 0.06276175, "balance_loss_mlp": 0.01255218, "epoch": 0.7556290395310387, "flos": 24869955594240.0, "grad_norm": 3.84516671275107, "language_loss": 0.63323075, "learning_rate": 5.942800139684073e-07, "loss": 0.71006155, "num_input_tokens_seen": 271104260, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09558105, "step": 12568, "time_per_iteration": 2.609917640686035 }, { "auxiliary_loss_clip": 0.06413372, "auxiliary_loss_mlp": 0.01267141, "balance_loss_clip": 0.06272858, "balance_loss_mlp": 0.01257437, "epoch": 0.7556891627837066, "flos": 43553770680960.0, "grad_norm": 2.0309074540710403, "language_loss": 0.66468233, "learning_rate": 5.940030055397789e-07, "loss": 0.74148738, "num_input_tokens_seen": 271125745, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09692383, "step": 12569, "time_per_iteration": 2.742161512374878 }, { "auxiliary_loss_clip": 0.06421132, "auxiliary_loss_mlp": 0.01267076, "balance_loss_clip": 0.06274904, "balance_loss_mlp": 0.01256139, "epoch": 0.7557492860363746, "flos": 26658110851200.0, "grad_norm": 2.1828129938670315, "language_loss": 0.67876214, "learning_rate": 5.93726050426697e-07, "loss": 0.7556442, "num_input_tokens_seen": 271147145, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10931396, "step": 12570, "time_per_iteration": 2.7333202362060547 }, { "auxiliary_loss_clip": 0.0641975, "auxiliary_loss_mlp": 0.01266115, "balance_loss_clip": 0.06276609, "balance_loss_mlp": 0.01256298, "epoch": 0.7558094092890425, "flos": 55194857769600.0, "grad_norm": 1.999748533850613, "language_loss": 0.71439922, "learning_rate": 5.934491486396647e-07, "loss": 0.79125786, "num_input_tokens_seen": 271170865, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.0980835, "step": 12571, "time_per_iteration": 2.951896905899048 }, { "auxiliary_loss_clip": 0.06421781, "auxiliary_loss_mlp": 0.01265002, "balance_loss_clip": 0.06276041, "balance_loss_mlp": 0.01254994, "epoch": 0.7558695325417105, "flos": 23995171768320.0, "grad_norm": 1.7050074756678137, "language_loss": 0.74067485, "learning_rate": 5.931723001891811e-07, "loss": 0.81754267, "num_input_tokens_seen": 271191450, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10009766, "step": 12572, "time_per_iteration": 2.6455750465393066 }, { "auxiliary_loss_clip": 0.0642523, "auxiliary_loss_mlp": 0.01268908, "balance_loss_clip": 0.06278699, "balance_loss_mlp": 0.01258895, "epoch": 0.7559296557943784, "flos": 14616542373120.0, "grad_norm": 2.470620761801129, "language_loss": 0.76776409, "learning_rate": 5.928955050857456e-07, "loss": 0.84470552, "num_input_tokens_seen": 271207335, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.10015869, "step": 12573, "time_per_iteration": 2.513397455215454 }, { "auxiliary_loss_clip": 0.06417638, "auxiliary_loss_mlp": 0.01265181, "balance_loss_clip": 0.06271604, "balance_loss_mlp": 0.01254572, "epoch": 0.7559897790470465, "flos": 18556214117760.0, "grad_norm": 1.7541339113286427, "language_loss": 0.69695491, "learning_rate": 5.926187633398527e-07, "loss": 0.77378309, "num_input_tokens_seen": 271226895, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10614014, "step": 12574, "time_per_iteration": 3.995025634765625 }, { "auxiliary_loss_clip": 0.06415016, "auxiliary_loss_mlp": 0.01264907, "balance_loss_clip": 0.06274055, "balance_loss_mlp": 0.01255538, "epoch": 0.7560499022997144, "flos": 17973695733120.0, "grad_norm": 2.225346129726625, "language_loss": 0.72350138, "learning_rate": 5.923420749619974e-07, "loss": 0.8003006, "num_input_tokens_seen": 271244375, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09375, "step": 12575, "time_per_iteration": 2.5291996002197266 }, { "auxiliary_loss_clip": 0.06416781, "auxiliary_loss_mlp": 0.01262786, "balance_loss_clip": 0.06274502, "balance_loss_mlp": 0.01253476, "epoch": 0.7561100255523824, "flos": 15742530339840.0, "grad_norm": 2.7939445909320972, "language_loss": 0.71651173, "learning_rate": 5.92065439962673e-07, "loss": 0.79330742, "num_input_tokens_seen": 271259530, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09301758, "step": 12576, "time_per_iteration": 2.5310497283935547 }, { "auxiliary_loss_clip": 0.06415252, "auxiliary_loss_mlp": 0.01265874, "balance_loss_clip": 0.06274804, "balance_loss_mlp": 0.0125521, "epoch": 0.7561701488050504, "flos": 15893568524160.0, "grad_norm": 1.9793649111688925, "language_loss": 0.66868407, "learning_rate": 5.917888583523669e-07, "loss": 0.74549532, "num_input_tokens_seen": 271276835, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.10662842, "step": 12577, "time_per_iteration": 2.540797710418701 }, { "auxiliary_loss_clip": 0.06410509, "auxiliary_loss_mlp": 0.01264268, "balance_loss_clip": 0.06271195, "balance_loss_mlp": 0.01254433, "epoch": 0.7562302720577183, "flos": 20345333696640.0, "grad_norm": 1.5515886842316773, "language_loss": 0.7843377, "learning_rate": 5.915123301415685e-07, "loss": 0.86108547, "num_input_tokens_seen": 271296275, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.0982666, "step": 12578, "time_per_iteration": 2.535614490509033 }, { "auxiliary_loss_clip": 0.06417382, "auxiliary_loss_mlp": 0.01263373, "balance_loss_clip": 0.06273025, "balance_loss_mlp": 0.01253526, "epoch": 0.7562903953103863, "flos": 20818252540800.0, "grad_norm": 1.595428695752001, "language_loss": 0.75842381, "learning_rate": 5.912358553407641e-07, "loss": 0.8352313, "num_input_tokens_seen": 271315685, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.09844971, "step": 12579, "time_per_iteration": 4.061729669570923 }, { "auxiliary_loss_clip": 0.06422783, "auxiliary_loss_mlp": 0.01263457, "balance_loss_clip": 0.06274907, "balance_loss_mlp": 0.01252329, "epoch": 0.7563505185630542, "flos": 37606073765760.0, "grad_norm": 2.1591720313589784, "language_loss": 0.62980837, "learning_rate": 5.90959433960437e-07, "loss": 0.70667088, "num_input_tokens_seen": 271336790, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.11138916, "step": 12580, "time_per_iteration": 2.7319517135620117 }, { "auxiliary_loss_clip": 0.06415792, "auxiliary_loss_mlp": 0.01267567, "balance_loss_clip": 0.06275246, "balance_loss_mlp": 0.01257386, "epoch": 0.7564106418157223, "flos": 20237369310720.0, "grad_norm": 1.6054249738965185, "language_loss": 0.75244129, "learning_rate": 5.906830660110691e-07, "loss": 0.82927489, "num_input_tokens_seen": 271355470, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.10174561, "step": 12581, "time_per_iteration": 2.5609402656555176 }, { "auxiliary_loss_clip": 0.06422375, "auxiliary_loss_mlp": 0.01266608, "balance_loss_clip": 0.06277287, "balance_loss_mlp": 0.01256511, "epoch": 0.7564707650683902, "flos": 24761949281280.0, "grad_norm": 1.6094442680348713, "language_loss": 0.63261449, "learning_rate": 5.904067515031412e-07, "loss": 0.70950437, "num_input_tokens_seen": 271375810, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.10089111, "step": 12582, "time_per_iteration": 2.6284170150756836 }, { "auxiliary_loss_clip": 0.06320715, "auxiliary_loss_mlp": 0.01254069, "balance_loss_clip": 0.06263541, "balance_loss_mlp": 0.01252835, "epoch": 0.7565308883210582, "flos": 48544965711360.0, "grad_norm": 0.9172444595206075, "language_loss": 0.60662293, "learning_rate": 5.901304904471307e-07, "loss": 0.68237078, "num_input_tokens_seen": 271424775, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.0123291, "step": 12583, "time_per_iteration": 2.942283868789673 }, { "auxiliary_loss_clip": 0.06423321, "auxiliary_loss_mlp": 0.01264759, "balance_loss_clip": 0.06279947, "balance_loss_mlp": 0.01255279, "epoch": 0.7565910115737261, "flos": 12500007765120.0, "grad_norm": 1.863848783186054, "language_loss": 0.79302287, "learning_rate": 5.898542828535125e-07, "loss": 0.86990368, "num_input_tokens_seen": 271440500, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.09484863, "step": 12584, "time_per_iteration": 2.6040444374084473 }, { "auxiliary_loss_clip": 0.06416297, "auxiliary_loss_mlp": 0.01264748, "balance_loss_clip": 0.06277832, "balance_loss_mlp": 0.01255265, "epoch": 0.7566511348263941, "flos": 21178427316480.0, "grad_norm": 2.0291497362606843, "language_loss": 0.77467406, "learning_rate": 5.895781287327612e-07, "loss": 0.85148454, "num_input_tokens_seen": 271458180, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.0947876, "step": 12585, "time_per_iteration": 2.5898170471191406 }, { "auxiliary_loss_clip": 0.06418562, "auxiliary_loss_mlp": 0.0126469, "balance_loss_clip": 0.06273676, "balance_loss_mlp": 0.01254396, "epoch": 0.756711258079062, "flos": 21760023306240.0, "grad_norm": 1.9273984485091464, "language_loss": 0.8282913, "learning_rate": 5.893020280953493e-07, "loss": 0.90512383, "num_input_tokens_seen": 271475730, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.10284424, "step": 12586, "time_per_iteration": 2.586972236633301 }, { "auxiliary_loss_clip": 0.06418446, "auxiliary_loss_mlp": 0.01265854, "balance_loss_clip": 0.06273332, "balance_loss_mlp": 0.01256133, "epoch": 0.75677138133173, "flos": 22389514704000.0, "grad_norm": 1.9538558611504622, "language_loss": 0.83874679, "learning_rate": 5.890259809517459e-07, "loss": 0.91558975, "num_input_tokens_seen": 271495030, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.09716797, "step": 12587, "time_per_iteration": 2.6230428218841553 }, { "auxiliary_loss_clip": 0.06417542, "auxiliary_loss_mlp": 0.01263564, "balance_loss_clip": 0.06275178, "balance_loss_mlp": 0.01253771, "epoch": 0.756831504584398, "flos": 22715252651520.0, "grad_norm": 1.5196641243379299, "language_loss": 0.71256793, "learning_rate": 5.88749987312418e-07, "loss": 0.789379, "num_input_tokens_seen": 271515355, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09790039, "step": 12588, "time_per_iteration": 2.6067628860473633 }, { "auxiliary_loss_clip": 0.06421445, "auxiliary_loss_mlp": 0.01263742, "balance_loss_clip": 0.06275193, "balance_loss_mlp": 0.01253091, "epoch": 0.756891627837066, "flos": 24105358287360.0, "grad_norm": 1.6571988776483144, "language_loss": 0.69258165, "learning_rate": 5.884740471878327e-07, "loss": 0.7694335, "num_input_tokens_seen": 271535090, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.10644531, "step": 12589, "time_per_iteration": 4.0184166431427 }, { "auxiliary_loss_clip": 0.06416977, "auxiliary_loss_mlp": 0.01266298, "balance_loss_clip": 0.06274789, "balance_loss_mlp": 0.01256583, "epoch": 0.756951751089734, "flos": 19754010633600.0, "grad_norm": 1.6078831245767065, "language_loss": 0.92209721, "learning_rate": 5.881981605884522e-07, "loss": 0.99892998, "num_input_tokens_seen": 271551075, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.09716797, "step": 12590, "time_per_iteration": 2.5184826850891113 }, { "auxiliary_loss_clip": 0.06414318, "auxiliary_loss_mlp": 0.01265219, "balance_loss_clip": 0.06275774, "balance_loss_mlp": 0.01255212, "epoch": 0.7570118743424019, "flos": 35087883811200.0, "grad_norm": 1.8337740494992971, "language_loss": 0.655285, "learning_rate": 5.879223275247391e-07, "loss": 0.7320804, "num_input_tokens_seen": 271571035, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09997559, "step": 12591, "time_per_iteration": 2.6517255306243896 }, { "auxiliary_loss_clip": 0.06415063, "auxiliary_loss_mlp": 0.0126463, "balance_loss_clip": 0.06275074, "balance_loss_mlp": 0.01255528, "epoch": 0.7570719975950699, "flos": 25601835081600.0, "grad_norm": 1.5489715100109906, "language_loss": 0.73968089, "learning_rate": 5.876465480071528e-07, "loss": 0.81647778, "num_input_tokens_seen": 271592950, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09100342, "step": 12592, "time_per_iteration": 2.5756094455718994 }, { "auxiliary_loss_clip": 0.06418207, "auxiliary_loss_mlp": 0.01267777, "balance_loss_clip": 0.06273614, "balance_loss_mlp": 0.01257835, "epoch": 0.7571321208477378, "flos": 10820781216000.0, "grad_norm": 2.2708027350505975, "language_loss": 0.71591473, "learning_rate": 5.873708220461522e-07, "loss": 0.79277456, "num_input_tokens_seen": 271608835, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.09936523, "step": 12593, "time_per_iteration": 2.5051677227020264 }, { "auxiliary_loss_clip": 0.06419684, "auxiliary_loss_mlp": 0.01263189, "balance_loss_clip": 0.06275062, "balance_loss_mlp": 0.01253259, "epoch": 0.7571922441004059, "flos": 18266045028480.0, "grad_norm": 2.003420478329125, "language_loss": 0.66790056, "learning_rate": 5.870951496521903e-07, "loss": 0.74472922, "num_input_tokens_seen": 271627730, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.0993042, "step": 12594, "time_per_iteration": 2.539635419845581 }, { "auxiliary_loss_clip": 0.06423898, "auxiliary_loss_mlp": 0.01266229, "balance_loss_clip": 0.06276268, "balance_loss_mlp": 0.01255924, "epoch": 0.7572523673530738, "flos": 22896660741120.0, "grad_norm": 1.5284805518648297, "language_loss": 0.81123459, "learning_rate": 5.86819530835722e-07, "loss": 0.88813591, "num_input_tokens_seen": 271646415, "router_z_loss_clip": 1.47753906, "router_z_loss_mlp": 0.10302734, "step": 12595, "time_per_iteration": 3.9790375232696533 }, { "auxiliary_loss_clip": 0.06415025, "auxiliary_loss_mlp": 0.01266268, "balance_loss_clip": 0.06275466, "balance_loss_mlp": 0.0125691, "epoch": 0.7573124906057418, "flos": 21002679377280.0, "grad_norm": 2.2619529703736725, "language_loss": 0.72384727, "learning_rate": 5.865439656071993e-07, "loss": 0.80066013, "num_input_tokens_seen": 271666240, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09350586, "step": 12596, "time_per_iteration": 2.5503947734832764 }, { "auxiliary_loss_clip": 0.06409866, "auxiliary_loss_mlp": 0.0126506, "balance_loss_clip": 0.06271585, "balance_loss_mlp": 0.0125597, "epoch": 0.7573726138584097, "flos": 20892534785280.0, "grad_norm": 1.549894910125957, "language_loss": 0.80980498, "learning_rate": 5.862684539770706e-07, "loss": 0.88655424, "num_input_tokens_seen": 271686370, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09088135, "step": 12597, "time_per_iteration": 2.5353801250457764 }, { "auxiliary_loss_clip": 0.0642405, "auxiliary_loss_mlp": 0.01266753, "balance_loss_clip": 0.0627848, "balance_loss_mlp": 0.01256084, "epoch": 0.7574327371110777, "flos": 24536628852480.0, "grad_norm": 1.771616606522625, "language_loss": 0.83636838, "learning_rate": 5.859929959557835e-07, "loss": 0.91327637, "num_input_tokens_seen": 271705050, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.10668945, "step": 12598, "time_per_iteration": 2.5789718627929688 }, { "auxiliary_loss_clip": 0.06410736, "auxiliary_loss_mlp": 0.0126344, "balance_loss_clip": 0.06271306, "balance_loss_mlp": 0.01254094, "epoch": 0.7574928603637456, "flos": 23370711615360.0, "grad_norm": 1.5981153485047719, "language_loss": 0.62798697, "learning_rate": 5.857175915537845e-07, "loss": 0.70472872, "num_input_tokens_seen": 271724915, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09350586, "step": 12599, "time_per_iteration": 2.573014497756958 }, { "auxiliary_loss_clip": 0.06425378, "auxiliary_loss_mlp": 0.01265515, "balance_loss_clip": 0.06277487, "balance_loss_mlp": 0.01253606, "epoch": 0.7575529836164137, "flos": 13521301655040.0, "grad_norm": 2.206044427073462, "language_loss": 0.63937247, "learning_rate": 5.854422407815161e-07, "loss": 0.71628141, "num_input_tokens_seen": 271742410, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.11907959, "step": 12600, "time_per_iteration": 2.5421440601348877 }, { "auxiliary_loss_clip": 0.06413445, "auxiliary_loss_mlp": 0.01267292, "balance_loss_clip": 0.06275502, "balance_loss_mlp": 0.0125695, "epoch": 0.7576131068690816, "flos": 19652754574080.0, "grad_norm": 1.6337750377360505, "language_loss": 0.66837066, "learning_rate": 5.851669436494191e-07, "loss": 0.74517798, "num_input_tokens_seen": 271761425, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.10339355, "step": 12601, "time_per_iteration": 2.5371339321136475 }, { "auxiliary_loss_clip": 0.06410025, "auxiliary_loss_mlp": 0.0126468, "balance_loss_clip": 0.06271145, "balance_loss_mlp": 0.01255578, "epoch": 0.7576732301217496, "flos": 20054535701760.0, "grad_norm": 1.5257830986188943, "language_loss": 0.67820847, "learning_rate": 5.848917001679335e-07, "loss": 0.75495553, "num_input_tokens_seen": 271780875, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09106445, "step": 12602, "time_per_iteration": 2.545910120010376 }, { "auxiliary_loss_clip": 0.06416723, "auxiliary_loss_mlp": 0.01265272, "balance_loss_clip": 0.06274504, "balance_loss_mlp": 0.01254705, "epoch": 0.7577333533744176, "flos": 15382439418240.0, "grad_norm": 3.265635060608977, "language_loss": 0.67178196, "learning_rate": 5.846165103474967e-07, "loss": 0.74860191, "num_input_tokens_seen": 271799490, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.10565186, "step": 12603, "time_per_iteration": 2.5300650596618652 }, { "auxiliary_loss_clip": 0.06409091, "auxiliary_loss_mlp": 0.01266341, "balance_loss_clip": 0.06270398, "balance_loss_mlp": 0.01256679, "epoch": 0.7577934766270855, "flos": 17900671299840.0, "grad_norm": 2.0205234075624863, "language_loss": 0.61725438, "learning_rate": 5.843413741985439e-07, "loss": 0.69400871, "num_input_tokens_seen": 271817040, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09661865, "step": 12604, "time_per_iteration": 2.5367114543914795 }, { "auxiliary_loss_clip": 0.06417532, "auxiliary_loss_mlp": 0.01267459, "balance_loss_clip": 0.06276698, "balance_loss_mlp": 0.01257153, "epoch": 0.7578535998797535, "flos": 21619760371200.0, "grad_norm": 1.7970434075491202, "language_loss": 0.80360848, "learning_rate": 5.840662917315076e-07, "loss": 0.88045841, "num_input_tokens_seen": 271835480, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.10296631, "step": 12605, "time_per_iteration": 2.544260263442993 }, { "auxiliary_loss_clip": 0.06418474, "auxiliary_loss_mlp": 0.01268703, "balance_loss_clip": 0.06273299, "balance_loss_mlp": 0.01257622, "epoch": 0.7579137231324214, "flos": 18484237860480.0, "grad_norm": 2.158320927267174, "language_loss": 0.7960465, "learning_rate": 5.837912629568198e-07, "loss": 0.87291825, "num_input_tokens_seen": 271849835, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.11071777, "step": 12606, "time_per_iteration": 2.5369884967803955 }, { "auxiliary_loss_clip": 0.06410672, "auxiliary_loss_mlp": 0.01266712, "balance_loss_clip": 0.06276004, "balance_loss_mlp": 0.01258207, "epoch": 0.7579738463850895, "flos": 23261195928960.0, "grad_norm": 1.3343966627935655, "language_loss": 0.73128009, "learning_rate": 5.835162878849087e-07, "loss": 0.80805397, "num_input_tokens_seen": 271869560, "router_z_loss_clip": 1.34570312, "router_z_loss_mlp": 0.08508301, "step": 12607, "time_per_iteration": 2.5721733570098877 }, { "auxiliary_loss_clip": 0.06426591, "auxiliary_loss_mlp": 0.01269035, "balance_loss_clip": 0.06279708, "balance_loss_mlp": 0.0125929, "epoch": 0.7580339696377574, "flos": 14032137271680.0, "grad_norm": 1.9946277511234791, "language_loss": 0.74998575, "learning_rate": 5.83241366526202e-07, "loss": 0.82694197, "num_input_tokens_seen": 271887950, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.09741211, "step": 12608, "time_per_iteration": 2.5251221656799316 }, { "auxiliary_loss_clip": 0.06417955, "auxiliary_loss_mlp": 0.01265717, "balance_loss_clip": 0.06278314, "balance_loss_mlp": 0.01255251, "epoch": 0.7580940928904254, "flos": 25089825507840.0, "grad_norm": 1.6230588940688753, "language_loss": 0.71911526, "learning_rate": 5.829664988911245e-07, "loss": 0.79595196, "num_input_tokens_seen": 271907700, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.10467529, "step": 12609, "time_per_iteration": 2.5815775394439697 }, { "auxiliary_loss_clip": 0.06418081, "auxiliary_loss_mlp": 0.01265909, "balance_loss_clip": 0.06275815, "balance_loss_mlp": 0.01254685, "epoch": 0.7581542161430933, "flos": 23842288794240.0, "grad_norm": 1.616740972974165, "language_loss": 0.8174175, "learning_rate": 5.826916849901007e-07, "loss": 0.89425749, "num_input_tokens_seen": 271926840, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.11212158, "step": 12610, "time_per_iteration": 2.5503785610198975 }, { "auxiliary_loss_clip": 0.06421246, "auxiliary_loss_mlp": 0.01262193, "balance_loss_clip": 0.06274982, "balance_loss_mlp": 0.01252114, "epoch": 0.7582143393957613, "flos": 22243591618560.0, "grad_norm": 1.8037753321772145, "language_loss": 0.70973837, "learning_rate": 5.824169248335488e-07, "loss": 0.78657275, "num_input_tokens_seen": 271946465, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.10070801, "step": 12611, "time_per_iteration": 2.5617260932922363 }, { "auxiliary_loss_clip": 0.06416685, "auxiliary_loss_mlp": 0.0126752, "balance_loss_clip": 0.06276211, "balance_loss_mlp": 0.01257852, "epoch": 0.7582744626484292, "flos": 21112865896320.0, "grad_norm": 1.9740927150111316, "language_loss": 0.71077728, "learning_rate": 5.821422184318893e-07, "loss": 0.78761935, "num_input_tokens_seen": 271967295, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09661865, "step": 12612, "time_per_iteration": 2.5760819911956787 }, { "auxiliary_loss_clip": 0.06422618, "auxiliary_loss_mlp": 0.01264804, "balance_loss_clip": 0.06277781, "balance_loss_mlp": 0.01254624, "epoch": 0.7583345859010973, "flos": 24611120732160.0, "grad_norm": 1.3916740475217007, "language_loss": 0.60040474, "learning_rate": 5.818675657955397e-07, "loss": 0.67727894, "num_input_tokens_seen": 271987960, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.10186768, "step": 12613, "time_per_iteration": 2.5761382579803467 }, { "auxiliary_loss_clip": 0.06419129, "auxiliary_loss_mlp": 0.01266054, "balance_loss_clip": 0.06277339, "balance_loss_mlp": 0.01256452, "epoch": 0.7583947091537652, "flos": 33555167326080.0, "grad_norm": 1.7502187840576502, "language_loss": 0.59966671, "learning_rate": 5.815929669349135e-07, "loss": 0.67651856, "num_input_tokens_seen": 272011780, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.09594727, "step": 12614, "time_per_iteration": 4.0805344581604 }, { "auxiliary_loss_clip": 0.06421857, "auxiliary_loss_mlp": 0.01265757, "balance_loss_clip": 0.06275883, "balance_loss_mlp": 0.01256, "epoch": 0.7584548324064332, "flos": 20127266645760.0, "grad_norm": 2.0958947732384043, "language_loss": 0.73416078, "learning_rate": 5.813184218604246e-07, "loss": 0.81103694, "num_input_tokens_seen": 272030825, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.09759521, "step": 12615, "time_per_iteration": 2.5406951904296875 }, { "auxiliary_loss_clip": 0.06322561, "auxiliary_loss_mlp": 0.01251012, "balance_loss_clip": 0.06265842, "balance_loss_mlp": 0.01249815, "epoch": 0.7585149556591012, "flos": 70424064069120.0, "grad_norm": 0.791533889048051, "language_loss": 0.67467809, "learning_rate": 5.810439305824828e-07, "loss": 0.75041389, "num_input_tokens_seen": 272095825, "router_z_loss_clip": 0.56982422, "router_z_loss_mlp": 0.01195526, "step": 12616, "time_per_iteration": 3.189840078353882 }, { "auxiliary_loss_clip": 0.06420819, "auxiliary_loss_mlp": 0.01265658, "balance_loss_clip": 0.06274291, "balance_loss_mlp": 0.01255865, "epoch": 0.7585750789117691, "flos": 16149342712320.0, "grad_norm": 1.6450694866346547, "language_loss": 0.84771329, "learning_rate": 5.807694931114979e-07, "loss": 0.92457807, "num_input_tokens_seen": 272113950, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.09783936, "step": 12617, "time_per_iteration": 2.550503969192505 }, { "auxiliary_loss_clip": 0.06420488, "auxiliary_loss_mlp": 0.01262659, "balance_loss_clip": 0.06277627, "balance_loss_mlp": 0.01253022, "epoch": 0.7586352021644371, "flos": 17498848245120.0, "grad_norm": 2.344192394476786, "language_loss": 0.74655819, "learning_rate": 5.804951094578757e-07, "loss": 0.82338965, "num_input_tokens_seen": 272130315, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.09637451, "step": 12618, "time_per_iteration": 2.6276495456695557 }, { "auxiliary_loss_clip": 0.06424931, "auxiliary_loss_mlp": 0.01264832, "balance_loss_clip": 0.06276459, "balance_loss_mlp": 0.01253942, "epoch": 0.758695325417105, "flos": 17280990829440.0, "grad_norm": 2.8855611413504243, "language_loss": 0.77226031, "learning_rate": 5.802207796320209e-07, "loss": 0.84915799, "num_input_tokens_seen": 272149080, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.10876465, "step": 12619, "time_per_iteration": 3.981928586959839 }, { "auxiliary_loss_clip": 0.06417463, "auxiliary_loss_mlp": 0.01267204, "balance_loss_clip": 0.06278153, "balance_loss_mlp": 0.01257101, "epoch": 0.7587554486697731, "flos": 29503128856320.0, "grad_norm": 1.664937471190813, "language_loss": 0.82356203, "learning_rate": 5.79946503644337e-07, "loss": 0.90040863, "num_input_tokens_seen": 272168285, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.10101318, "step": 12620, "time_per_iteration": 2.615499258041382 }, { "auxiliary_loss_clip": 0.06423149, "auxiliary_loss_mlp": 0.01271266, "balance_loss_clip": 0.06275733, "balance_loss_mlp": 0.01259393, "epoch": 0.758815571922441, "flos": 16105262664960.0, "grad_norm": 2.3829384475792885, "language_loss": 0.82052195, "learning_rate": 5.796722815052242e-07, "loss": 0.89746618, "num_input_tokens_seen": 272184585, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.11865234, "step": 12621, "time_per_iteration": 2.534242868423462 }, { "auxiliary_loss_clip": 0.06417637, "auxiliary_loss_mlp": 0.0126756, "balance_loss_clip": 0.06276291, "balance_loss_mlp": 0.01257647, "epoch": 0.758875695175109, "flos": 16149258858240.0, "grad_norm": 2.211096183621296, "language_loss": 0.74444419, "learning_rate": 5.7939811322508e-07, "loss": 0.82129616, "num_input_tokens_seen": 272200205, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09918213, "step": 12622, "time_per_iteration": 2.538707971572876 }, { "auxiliary_loss_clip": 0.06318134, "auxiliary_loss_mlp": 0.01250711, "balance_loss_clip": 0.06261508, "balance_loss_mlp": 0.01249463, "epoch": 0.7589358184277769, "flos": 68482019589120.0, "grad_norm": 0.8232618818147506, "language_loss": 0.60857344, "learning_rate": 5.791239988143024e-07, "loss": 0.68426192, "num_input_tokens_seen": 272259670, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.01247406, "step": 12623, "time_per_iteration": 3.206406831741333 }, { "auxiliary_loss_clip": 0.06414809, "auxiliary_loss_mlp": 0.01266212, "balance_loss_clip": 0.06275964, "balance_loss_mlp": 0.01256889, "epoch": 0.7589959416804449, "flos": 20053445598720.0, "grad_norm": 1.725849781107491, "language_loss": 0.6775254, "learning_rate": 5.788499382832847e-07, "loss": 0.75433558, "num_input_tokens_seen": 272277925, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09332275, "step": 12624, "time_per_iteration": 2.6218178272247314 }, { "auxiliary_loss_clip": 0.06414462, "auxiliary_loss_mlp": 0.01268873, "balance_loss_clip": 0.0627427, "balance_loss_mlp": 0.01258675, "epoch": 0.7590560649331128, "flos": 18777970748160.0, "grad_norm": 1.72420809226978, "language_loss": 0.75524318, "learning_rate": 5.785759316424196e-07, "loss": 0.83207655, "num_input_tokens_seen": 272296010, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.10205078, "step": 12625, "time_per_iteration": 2.6416127681732178 }, { "auxiliary_loss_clip": 0.06413646, "auxiliary_loss_mlp": 0.01265962, "balance_loss_clip": 0.06275226, "balance_loss_mlp": 0.01255859, "epoch": 0.7591161881857809, "flos": 29833017580800.0, "grad_norm": 1.6477075967811483, "language_loss": 0.62829888, "learning_rate": 5.783019789020977e-07, "loss": 0.70509499, "num_input_tokens_seen": 272318330, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.10101318, "step": 12626, "time_per_iteration": 2.6975300312042236 }, { "auxiliary_loss_clip": 0.06423371, "auxiliary_loss_mlp": 0.01267032, "balance_loss_clip": 0.06279208, "balance_loss_mlp": 0.01256643, "epoch": 0.7591763114384488, "flos": 20308884370560.0, "grad_norm": 1.86028188947197, "language_loss": 0.74245048, "learning_rate": 5.780280800727084e-07, "loss": 0.81935447, "num_input_tokens_seen": 272335265, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.10375977, "step": 12627, "time_per_iteration": 2.614882469177246 }, { "auxiliary_loss_clip": 0.0642038, "auxiliary_loss_mlp": 0.0126705, "balance_loss_clip": 0.06276558, "balance_loss_mlp": 0.01257609, "epoch": 0.7592364346911168, "flos": 20819887695360.0, "grad_norm": 2.044281502930688, "language_loss": 0.69037157, "learning_rate": 5.777542351646356e-07, "loss": 0.76724589, "num_input_tokens_seen": 272354795, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.09436035, "step": 12628, "time_per_iteration": 4.041961908340454 }, { "auxiliary_loss_clip": 0.06432675, "auxiliary_loss_mlp": 0.01265409, "balance_loss_clip": 0.06282465, "balance_loss_mlp": 0.01254787, "epoch": 0.7592965579437848, "flos": 21257866586880.0, "grad_norm": 1.8274085978658257, "language_loss": 0.63291931, "learning_rate": 5.774804441882648e-07, "loss": 0.70990014, "num_input_tokens_seen": 272372875, "router_z_loss_clip": 1.50292969, "router_z_loss_mlp": 0.10614014, "step": 12629, "time_per_iteration": 2.6020164489746094 }, { "auxiliary_loss_clip": 0.06414372, "auxiliary_loss_mlp": 0.01263288, "balance_loss_clip": 0.06277146, "balance_loss_mlp": 0.01253549, "epoch": 0.7593566811964527, "flos": 26220802792320.0, "grad_norm": 1.4876095815335861, "language_loss": 0.77571452, "learning_rate": 5.772067071539786e-07, "loss": 0.85249114, "num_input_tokens_seen": 272394715, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09741211, "step": 12630, "time_per_iteration": 2.598618745803833 }, { "auxiliary_loss_clip": 0.06323494, "auxiliary_loss_mlp": 0.01250811, "balance_loss_clip": 0.06266813, "balance_loss_mlp": 0.0124968, "epoch": 0.7594168044491207, "flos": 71258122010880.0, "grad_norm": 0.8111032317454603, "language_loss": 0.61530787, "learning_rate": 5.769330240721562e-07, "loss": 0.69105089, "num_input_tokens_seen": 272458775, "router_z_loss_clip": 0.56738281, "router_z_loss_mlp": 0.01132202, "step": 12631, "time_per_iteration": 3.292389392852783 }, { "auxiliary_loss_clip": 0.06429487, "auxiliary_loss_mlp": 0.01267898, "balance_loss_clip": 0.06280749, "balance_loss_mlp": 0.01255947, "epoch": 0.7594769277017887, "flos": 26620319859840.0, "grad_norm": 1.636880421630885, "language_loss": 0.73984867, "learning_rate": 5.766593949531767e-07, "loss": 0.81682247, "num_input_tokens_seen": 272479355, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.1194458, "step": 12632, "time_per_iteration": 2.6126716136932373 }, { "auxiliary_loss_clip": 0.06419823, "auxiliary_loss_mlp": 0.01263187, "balance_loss_clip": 0.06277735, "balance_loss_mlp": 0.01252941, "epoch": 0.7595370509544567, "flos": 17600523575040.0, "grad_norm": 1.8554244076919575, "language_loss": 0.7488119, "learning_rate": 5.763858198074154e-07, "loss": 0.82564199, "num_input_tokens_seen": 272493555, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.10247803, "step": 12633, "time_per_iteration": 2.5251505374908447 }, { "auxiliary_loss_clip": 0.06418329, "auxiliary_loss_mlp": 0.01267031, "balance_loss_clip": 0.0627661, "balance_loss_mlp": 0.01257661, "epoch": 0.7595971742071246, "flos": 18008551831680.0, "grad_norm": 2.106706482082797, "language_loss": 0.73908085, "learning_rate": 5.76112298645246e-07, "loss": 0.81593448, "num_input_tokens_seen": 272508925, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09362793, "step": 12634, "time_per_iteration": 2.5113744735717773 }, { "auxiliary_loss_clip": 0.06415919, "auxiliary_loss_mlp": 0.01267073, "balance_loss_clip": 0.06274488, "balance_loss_mlp": 0.01257602, "epoch": 0.7596572974597926, "flos": 28847921454720.0, "grad_norm": 1.7040153748108848, "language_loss": 0.64841425, "learning_rate": 5.758388314770408e-07, "loss": 0.72524416, "num_input_tokens_seen": 272528805, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.0947876, "step": 12635, "time_per_iteration": 3.9535157680511475 }, { "auxiliary_loss_clip": 0.06421033, "auxiliary_loss_mlp": 0.01265987, "balance_loss_clip": 0.06275565, "balance_loss_mlp": 0.01255419, "epoch": 0.7597174207124605, "flos": 14288037240960.0, "grad_norm": 1.850956129705252, "language_loss": 0.68836725, "learning_rate": 5.7556541831317e-07, "loss": 0.76523745, "num_input_tokens_seen": 272546655, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10565186, "step": 12636, "time_per_iteration": 2.5616912841796875 }, { "auxiliary_loss_clip": 0.06425287, "auxiliary_loss_mlp": 0.01263525, "balance_loss_clip": 0.06279618, "balance_loss_mlp": 0.01252916, "epoch": 0.7597775439651285, "flos": 21695300426880.0, "grad_norm": 2.298143461099052, "language_loss": 0.81402296, "learning_rate": 5.752920591640018e-07, "loss": 0.8909111, "num_input_tokens_seen": 272564010, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.10614014, "step": 12637, "time_per_iteration": 2.556835412979126 }, { "auxiliary_loss_clip": 0.06420077, "auxiliary_loss_mlp": 0.012647, "balance_loss_clip": 0.06275921, "balance_loss_mlp": 0.01255312, "epoch": 0.7598376672177964, "flos": 36110100096000.0, "grad_norm": 1.7336837652928143, "language_loss": 0.66615206, "learning_rate": 5.750187540399017e-07, "loss": 0.74299979, "num_input_tokens_seen": 272585840, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.09387207, "step": 12638, "time_per_iteration": 2.694528818130493 }, { "auxiliary_loss_clip": 0.06419973, "auxiliary_loss_mlp": 0.01267504, "balance_loss_clip": 0.06276236, "balance_loss_mlp": 0.01256226, "epoch": 0.7598977904704645, "flos": 18338147066880.0, "grad_norm": 1.998464316502097, "language_loss": 0.65836185, "learning_rate": 5.747455029512323e-07, "loss": 0.73523664, "num_input_tokens_seen": 272602300, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.11279297, "step": 12639, "time_per_iteration": 2.5173263549804688 }, { "auxiliary_loss_clip": 0.06420903, "auxiliary_loss_mlp": 0.01265202, "balance_loss_clip": 0.06279442, "balance_loss_mlp": 0.01255027, "epoch": 0.7599579137231324, "flos": 20198697851520.0, "grad_norm": 2.023852061541159, "language_loss": 0.70309234, "learning_rate": 5.744723059083572e-07, "loss": 0.7799533, "num_input_tokens_seen": 272619595, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.10174561, "step": 12640, "time_per_iteration": 2.564039707183838 }, { "auxiliary_loss_clip": 0.06421011, "auxiliary_loss_mlp": 0.01269576, "balance_loss_clip": 0.06275769, "balance_loss_mlp": 0.01258466, "epoch": 0.7600180369758004, "flos": 24031746875520.0, "grad_norm": 1.9323625034081, "language_loss": 0.67058301, "learning_rate": 5.741991629216343e-07, "loss": 0.74748892, "num_input_tokens_seen": 272638825, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.11120605, "step": 12641, "time_per_iteration": 2.564523696899414 }, { "auxiliary_loss_clip": 0.06419981, "auxiliary_loss_mlp": 0.01266316, "balance_loss_clip": 0.06274153, "balance_loss_mlp": 0.01255617, "epoch": 0.7600781602284684, "flos": 18995534674560.0, "grad_norm": 1.9000539280222906, "language_loss": 0.66946059, "learning_rate": 5.73926074001422e-07, "loss": 0.74632353, "num_input_tokens_seen": 272657240, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10699463, "step": 12642, "time_per_iteration": 2.532073736190796 }, { "auxiliary_loss_clip": 0.06418468, "auxiliary_loss_mlp": 0.01267477, "balance_loss_clip": 0.06281, "balance_loss_mlp": 0.01258021, "epoch": 0.7601382834811363, "flos": 26074670071680.0, "grad_norm": 2.085752039378456, "language_loss": 0.76324153, "learning_rate": 5.736530391580765e-07, "loss": 0.840101, "num_input_tokens_seen": 272677520, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.0944519, "step": 12643, "time_per_iteration": 2.579354763031006 }, { "auxiliary_loss_clip": 0.06425294, "auxiliary_loss_mlp": 0.01265812, "balance_loss_clip": 0.06281117, "balance_loss_mlp": 0.01254582, "epoch": 0.7601984067338043, "flos": 18850324348800.0, "grad_norm": 2.3166112894594595, "language_loss": 0.7869845, "learning_rate": 5.733800584019508e-07, "loss": 0.8638956, "num_input_tokens_seen": 272696770, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.11224365, "step": 12644, "time_per_iteration": 2.5629327297210693 }, { "auxiliary_loss_clip": 0.06416804, "auxiliary_loss_mlp": 0.01264307, "balance_loss_clip": 0.06273903, "balance_loss_mlp": 0.01254806, "epoch": 0.7602585299864723, "flos": 24653607552000.0, "grad_norm": 1.4996358452047127, "language_loss": 0.80874252, "learning_rate": 5.731071317433957e-07, "loss": 0.8855536, "num_input_tokens_seen": 272718340, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.09503174, "step": 12645, "time_per_iteration": 2.5979347229003906 }, { "auxiliary_loss_clip": 0.06422459, "auxiliary_loss_mlp": 0.01267987, "balance_loss_clip": 0.06278332, "balance_loss_mlp": 0.01257669, "epoch": 0.7603186532391403, "flos": 23848913266560.0, "grad_norm": 1.5861561033600549, "language_loss": 0.73059732, "learning_rate": 5.728342591927611e-07, "loss": 0.80750179, "num_input_tokens_seen": 272739575, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10327148, "step": 12646, "time_per_iteration": 2.5830886363983154 }, { "auxiliary_loss_clip": 0.06420639, "auxiliary_loss_mlp": 0.01268087, "balance_loss_clip": 0.06281225, "balance_loss_mlp": 0.01259206, "epoch": 0.7603787764918082, "flos": 22206387605760.0, "grad_norm": 1.8917924074230474, "language_loss": 0.67463428, "learning_rate": 5.725614407603949e-07, "loss": 0.75152159, "num_input_tokens_seen": 272758710, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.08880615, "step": 12647, "time_per_iteration": 2.5338385105133057 }, { "auxiliary_loss_clip": 0.06323643, "auxiliary_loss_mlp": 0.01250205, "balance_loss_clip": 0.0626677, "balance_loss_mlp": 0.01249021, "epoch": 0.7604388997444762, "flos": 54104549713920.0, "grad_norm": 0.662376200823093, "language_loss": 0.48926401, "learning_rate": 5.722886764566415e-07, "loss": 0.5650025, "num_input_tokens_seen": 272814855, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.0118103, "step": 12648, "time_per_iteration": 3.133915662765503 }, { "auxiliary_loss_clip": 0.06413001, "auxiliary_loss_mlp": 0.01263581, "balance_loss_clip": 0.06275257, "balance_loss_mlp": 0.01254945, "epoch": 0.7604990229971441, "flos": 19687904161920.0, "grad_norm": 1.7745526963363345, "language_loss": 0.76375258, "learning_rate": 5.720159662918451e-07, "loss": 0.84051847, "num_input_tokens_seen": 272834400, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.08630371, "step": 12649, "time_per_iteration": 2.556328296661377 }, { "auxiliary_loss_clip": 0.06415241, "auxiliary_loss_mlp": 0.01262804, "balance_loss_clip": 0.06274687, "balance_loss_mlp": 0.01253559, "epoch": 0.7605591462498121, "flos": 25234993906560.0, "grad_norm": 1.5449950458122386, "language_loss": 0.68736476, "learning_rate": 5.717433102763462e-07, "loss": 0.7641452, "num_input_tokens_seen": 272854760, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.0925293, "step": 12650, "time_per_iteration": 2.5942916870117188 }, { "auxiliary_loss_clip": 0.06318928, "auxiliary_loss_mlp": 0.01251843, "balance_loss_clip": 0.06262083, "balance_loss_mlp": 0.01250667, "epoch": 0.76061926950248, "flos": 66803505799680.0, "grad_norm": 0.7477489872788098, "language_loss": 0.62893021, "learning_rate": 5.714707084204838e-07, "loss": 0.70463789, "num_input_tokens_seen": 272919030, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01174927, "step": 12651, "time_per_iteration": 3.188476085662842 }, { "auxiliary_loss_clip": 0.06414989, "auxiliary_loss_mlp": 0.01267629, "balance_loss_clip": 0.06274624, "balance_loss_mlp": 0.01257693, "epoch": 0.7606793927551481, "flos": 25345473914880.0, "grad_norm": 1.5322486873291323, "language_loss": 0.71358049, "learning_rate": 5.711981607345951e-07, "loss": 0.7904067, "num_input_tokens_seen": 272938925, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09942627, "step": 12652, "time_per_iteration": 2.601417064666748 }, { "auxiliary_loss_clip": 0.06418161, "auxiliary_loss_mlp": 0.01267151, "balance_loss_clip": 0.0627596, "balance_loss_mlp": 0.01257155, "epoch": 0.760739516007816, "flos": 18229553775360.0, "grad_norm": 2.044016819836694, "language_loss": 0.80397332, "learning_rate": 5.709256672290152e-07, "loss": 0.88082635, "num_input_tokens_seen": 272954945, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09991455, "step": 12653, "time_per_iteration": 4.0032501220703125 }, { "auxiliary_loss_clip": 0.06422682, "auxiliary_loss_mlp": 0.01268787, "balance_loss_clip": 0.06276104, "balance_loss_mlp": 0.01258475, "epoch": 0.760799639260484, "flos": 22564717591680.0, "grad_norm": 1.526846146938569, "language_loss": 0.80263382, "learning_rate": 5.706532279140785e-07, "loss": 0.87954849, "num_input_tokens_seen": 272972855, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.10308838, "step": 12654, "time_per_iteration": 2.590688705444336 }, { "auxiliary_loss_clip": 0.06421334, "auxiliary_loss_mlp": 0.0126561, "balance_loss_clip": 0.06277366, "balance_loss_mlp": 0.01255119, "epoch": 0.760859762513152, "flos": 22315819438080.0, "grad_norm": 2.361826602532196, "language_loss": 0.79863036, "learning_rate": 5.703808428001136e-07, "loss": 0.87549973, "num_input_tokens_seen": 272989895, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10491943, "step": 12655, "time_per_iteration": 2.5318169593811035 }, { "auxiliary_loss_clip": 0.06416696, "auxiliary_loss_mlp": 0.01263125, "balance_loss_clip": 0.06278214, "balance_loss_mlp": 0.01254554, "epoch": 0.7609198857658199, "flos": 24870919916160.0, "grad_norm": 5.123217312004112, "language_loss": 0.68825465, "learning_rate": 5.701085118974505e-07, "loss": 0.76505291, "num_input_tokens_seen": 273011695, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.08575439, "step": 12656, "time_per_iteration": 2.602588176727295 }, { "auxiliary_loss_clip": 0.06424671, "auxiliary_loss_mlp": 0.01266331, "balance_loss_clip": 0.06277094, "balance_loss_mlp": 0.01256139, "epoch": 0.760980009018488, "flos": 16842424959360.0, "grad_norm": 3.9052007775411917, "language_loss": 0.73296523, "learning_rate": 5.698362352164164e-07, "loss": 0.80987531, "num_input_tokens_seen": 273028815, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.10192871, "step": 12657, "time_per_iteration": 2.5242247581481934 }, { "auxiliary_loss_clip": 0.06323163, "auxiliary_loss_mlp": 0.01250737, "balance_loss_clip": 0.06266097, "balance_loss_mlp": 0.01249595, "epoch": 0.7610401322711559, "flos": 61248198355200.0, "grad_norm": 0.8309016989855555, "language_loss": 0.64875346, "learning_rate": 5.695640127673347e-07, "loss": 0.72449249, "num_input_tokens_seen": 273084080, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.0114212, "step": 12658, "time_per_iteration": 4.554612636566162 }, { "auxiliary_loss_clip": 0.06409996, "auxiliary_loss_mlp": 0.0126778, "balance_loss_clip": 0.06273426, "balance_loss_mlp": 0.01257402, "epoch": 0.7611002555238239, "flos": 19645920466560.0, "grad_norm": 3.5453500887188394, "language_loss": 0.80010045, "learning_rate": 5.692918445605293e-07, "loss": 0.8768782, "num_input_tokens_seen": 273102295, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.1038208, "step": 12659, "time_per_iteration": 2.546443223953247 }, { "auxiliary_loss_clip": 0.06416991, "auxiliary_loss_mlp": 0.01263503, "balance_loss_clip": 0.06275643, "balance_loss_mlp": 0.01254378, "epoch": 0.7611603787764918, "flos": 26879825554560.0, "grad_norm": 1.476825025046982, "language_loss": 0.69174296, "learning_rate": 5.690197306063209e-07, "loss": 0.76854795, "num_input_tokens_seen": 273123400, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09118652, "step": 12660, "time_per_iteration": 2.6214075088500977 }, { "auxiliary_loss_clip": 0.06418523, "auxiliary_loss_mlp": 0.01265485, "balance_loss_clip": 0.06276947, "balance_loss_mlp": 0.0125633, "epoch": 0.7612205020291598, "flos": 27351570441600.0, "grad_norm": 1.7317638448705668, "language_loss": 0.70693028, "learning_rate": 5.687476709150281e-07, "loss": 0.78377038, "num_input_tokens_seen": 273145150, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09155273, "step": 12661, "time_per_iteration": 2.606205463409424 }, { "auxiliary_loss_clip": 0.06418528, "auxiliary_loss_mlp": 0.01265474, "balance_loss_clip": 0.06276127, "balance_loss_mlp": 0.01255907, "epoch": 0.7612806252818277, "flos": 29322265818240.0, "grad_norm": 1.4581679519058326, "language_loss": 0.83738488, "learning_rate": 5.68475665496966e-07, "loss": 0.91422486, "num_input_tokens_seen": 273165180, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09570312, "step": 12662, "time_per_iteration": 2.6254405975341797 }, { "auxiliary_loss_clip": 0.06421962, "auxiliary_loss_mlp": 0.0126858, "balance_loss_clip": 0.06279306, "balance_loss_mlp": 0.01259329, "epoch": 0.7613407485344957, "flos": 19032067854720.0, "grad_norm": 1.6740858507593912, "language_loss": 0.69428343, "learning_rate": 5.682037143624505e-07, "loss": 0.77118886, "num_input_tokens_seen": 273184005, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09246826, "step": 12663, "time_per_iteration": 2.538541555404663 }, { "auxiliary_loss_clip": 0.06415728, "auxiliary_loss_mlp": 0.01263662, "balance_loss_clip": 0.06278585, "balance_loss_mlp": 0.01254591, "epoch": 0.7614008717871636, "flos": 23262369886080.0, "grad_norm": 1.707793618200062, "language_loss": 0.70379347, "learning_rate": 5.67931817521794e-07, "loss": 0.78058738, "num_input_tokens_seen": 273203565, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09063721, "step": 12664, "time_per_iteration": 2.5872628688812256 }, { "auxiliary_loss_clip": 0.0642547, "auxiliary_loss_mlp": 0.01270368, "balance_loss_clip": 0.06278904, "balance_loss_mlp": 0.01259716, "epoch": 0.7614609950398317, "flos": 21586329792000.0, "grad_norm": 2.2185172346322397, "language_loss": 0.79572546, "learning_rate": 5.676599749853066e-07, "loss": 0.87268382, "num_input_tokens_seen": 273221645, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.10650635, "step": 12665, "time_per_iteration": 2.8351833820343018 }, { "auxiliary_loss_clip": 0.06416704, "auxiliary_loss_mlp": 0.01268175, "balance_loss_clip": 0.06279337, "balance_loss_mlp": 0.01258287, "epoch": 0.7615211182924996, "flos": 29285523002880.0, "grad_norm": 1.8209011160632866, "language_loss": 0.87866032, "learning_rate": 5.673881867632959e-07, "loss": 0.95550919, "num_input_tokens_seen": 273242040, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09881592, "step": 12666, "time_per_iteration": 2.6239395141601562 }, { "auxiliary_loss_clip": 0.06424421, "auxiliary_loss_mlp": 0.0126651, "balance_loss_clip": 0.06280705, "balance_loss_mlp": 0.01256336, "epoch": 0.7615812415451676, "flos": 13266156372480.0, "grad_norm": 2.4080725803119436, "language_loss": 0.83070451, "learning_rate": 5.671164528660693e-07, "loss": 0.90761387, "num_input_tokens_seen": 273257365, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.10186768, "step": 12667, "time_per_iteration": 2.554356336593628 }, { "auxiliary_loss_clip": 0.06416681, "auxiliary_loss_mlp": 0.01265857, "balance_loss_clip": 0.06279359, "balance_loss_mlp": 0.01256987, "epoch": 0.7616413647978356, "flos": 18590105894400.0, "grad_norm": 1.6312460272133733, "language_loss": 0.78526157, "learning_rate": 5.668447733039296e-07, "loss": 0.86208695, "num_input_tokens_seen": 273274710, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.08868408, "step": 12668, "time_per_iteration": 4.010918617248535 }, { "auxiliary_loss_clip": 0.06415779, "auxiliary_loss_mlp": 0.01264709, "balance_loss_clip": 0.06274664, "balance_loss_mlp": 0.01255107, "epoch": 0.7617014880505035, "flos": 18522280414080.0, "grad_norm": 1.8593454716307516, "language_loss": 0.64645457, "learning_rate": 5.6657314808718e-07, "loss": 0.72325945, "num_input_tokens_seen": 273292870, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09613037, "step": 12669, "time_per_iteration": 2.537757158279419 }, { "auxiliary_loss_clip": 0.06424364, "auxiliary_loss_mlp": 0.01265186, "balance_loss_clip": 0.06280048, "balance_loss_mlp": 0.01254165, "epoch": 0.7617616113031715, "flos": 24980184040320.0, "grad_norm": 1.9765898525471193, "language_loss": 0.66756964, "learning_rate": 5.663015772261202e-07, "loss": 0.74446523, "num_input_tokens_seen": 273312375, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.11029053, "step": 12670, "time_per_iteration": 2.577510356903076 }, { "auxiliary_loss_clip": 0.06424799, "auxiliary_loss_mlp": 0.01265623, "balance_loss_clip": 0.06279821, "balance_loss_mlp": 0.01255824, "epoch": 0.7618217345558395, "flos": 23301796032000.0, "grad_norm": 1.9297712637893871, "language_loss": 0.732382, "learning_rate": 5.660300607310493e-07, "loss": 0.80928624, "num_input_tokens_seen": 273332590, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.09796143, "step": 12671, "time_per_iteration": 2.5809431076049805 }, { "auxiliary_loss_clip": 0.06414181, "auxiliary_loss_mlp": 0.01264942, "balance_loss_clip": 0.06274053, "balance_loss_mlp": 0.01256001, "epoch": 0.7618818578085075, "flos": 25489803772800.0, "grad_norm": 1.5461867262925448, "language_loss": 0.73765099, "learning_rate": 5.657585986122613e-07, "loss": 0.81444228, "num_input_tokens_seen": 273352885, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.08947754, "step": 12672, "time_per_iteration": 2.5922458171844482 }, { "auxiliary_loss_clip": 0.06320387, "auxiliary_loss_mlp": 0.01252385, "balance_loss_clip": 0.06263435, "balance_loss_mlp": 0.01251096, "epoch": 0.7619419810611754, "flos": 61168633303680.0, "grad_norm": 0.7529209271703785, "language_loss": 0.56605101, "learning_rate": 5.654871908800506e-07, "loss": 0.64177871, "num_input_tokens_seen": 273411730, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 0.01289368, "step": 12673, "time_per_iteration": 3.154937982559204 }, { "auxiliary_loss_clip": 0.06416529, "auxiliary_loss_mlp": 0.01264902, "balance_loss_clip": 0.06273963, "balance_loss_mlp": 0.01254417, "epoch": 0.7620021043138434, "flos": 23265430560000.0, "grad_norm": 1.8687346126591153, "language_loss": 0.74973065, "learning_rate": 5.652158375447102e-07, "loss": 0.826545, "num_input_tokens_seen": 273430020, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.10491943, "step": 12674, "time_per_iteration": 2.600234031677246 }, { "auxiliary_loss_clip": 0.06411868, "auxiliary_loss_mlp": 0.01262474, "balance_loss_clip": 0.06273611, "balance_loss_mlp": 0.01254046, "epoch": 0.7620622275665113, "flos": 25089490091520.0, "grad_norm": 2.2852684139477826, "language_loss": 0.7214393, "learning_rate": 5.649445386165286e-07, "loss": 0.79818273, "num_input_tokens_seen": 273448690, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.08410645, "step": 12675, "time_per_iteration": 4.030980587005615 }, { "auxiliary_loss_clip": 0.06416461, "auxiliary_loss_mlp": 0.01263452, "balance_loss_clip": 0.06277567, "balance_loss_mlp": 0.01254148, "epoch": 0.7621223508191793, "flos": 20160864933120.0, "grad_norm": 2.052488585716011, "language_loss": 0.72545385, "learning_rate": 5.646732941057936e-07, "loss": 0.80225301, "num_input_tokens_seen": 273465190, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09301758, "step": 12676, "time_per_iteration": 2.5440094470977783 }, { "auxiliary_loss_clip": 0.06432973, "auxiliary_loss_mlp": 0.01265872, "balance_loss_clip": 0.06283569, "balance_loss_mlp": 0.01255375, "epoch": 0.7621824740718472, "flos": 18005323449600.0, "grad_norm": 2.573970274827895, "language_loss": 0.53701186, "learning_rate": 5.644021040227927e-07, "loss": 0.61400032, "num_input_tokens_seen": 273478620, "router_z_loss_clip": 1.49511719, "router_z_loss_mlp": 0.10498047, "step": 12677, "time_per_iteration": 2.5019032955169678 }, { "auxiliary_loss_clip": 0.06421727, "auxiliary_loss_mlp": 0.01263355, "balance_loss_clip": 0.06280202, "balance_loss_mlp": 0.01252871, "epoch": 0.7622425973245153, "flos": 21732085169280.0, "grad_norm": 2.0749575757843726, "language_loss": 0.79641616, "learning_rate": 5.641309683778064e-07, "loss": 0.87326694, "num_input_tokens_seen": 273497635, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.1048584, "step": 12678, "time_per_iteration": 2.5531399250030518 }, { "auxiliary_loss_clip": 0.06417671, "auxiliary_loss_mlp": 0.01262392, "balance_loss_clip": 0.0627545, "balance_loss_mlp": 0.01253368, "epoch": 0.7623027205771832, "flos": 19724563123200.0, "grad_norm": 1.9781438984075008, "language_loss": 0.77249193, "learning_rate": 5.638598871811175e-07, "loss": 0.84929252, "num_input_tokens_seen": 273513955, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.09020996, "step": 12679, "time_per_iteration": 2.5162971019744873 }, { "auxiliary_loss_clip": 0.06420477, "auxiliary_loss_mlp": 0.01264791, "balance_loss_clip": 0.06278422, "balance_loss_mlp": 0.01255237, "epoch": 0.7623628438298512, "flos": 23995800673920.0, "grad_norm": 1.2877730658763136, "language_loss": 0.80037886, "learning_rate": 5.635888604430059e-07, "loss": 0.87723148, "num_input_tokens_seen": 273533970, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09545898, "step": 12680, "time_per_iteration": 2.571707010269165 }, { "auxiliary_loss_clip": 0.06419255, "auxiliary_loss_mlp": 0.01264972, "balance_loss_clip": 0.06278115, "balance_loss_mlp": 0.01255304, "epoch": 0.7624229670825191, "flos": 22352184910080.0, "grad_norm": 1.7498131101695218, "language_loss": 0.63257647, "learning_rate": 5.633178881737493e-07, "loss": 0.70941877, "num_input_tokens_seen": 273553090, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09667969, "step": 12681, "time_per_iteration": 2.54414439201355 }, { "auxiliary_loss_clip": 0.06415135, "auxiliary_loss_mlp": 0.01266361, "balance_loss_clip": 0.06276731, "balance_loss_mlp": 0.01257218, "epoch": 0.7624830903351871, "flos": 22718522960640.0, "grad_norm": 1.8405557970460187, "language_loss": 0.76290315, "learning_rate": 5.63046970383622e-07, "loss": 0.8397181, "num_input_tokens_seen": 273572460, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09136963, "step": 12682, "time_per_iteration": 2.5829811096191406 }, { "auxiliary_loss_clip": 0.06415683, "auxiliary_loss_mlp": 0.0126824, "balance_loss_clip": 0.06277302, "balance_loss_mlp": 0.0125927, "epoch": 0.7625432135878552, "flos": 25600870759680.0, "grad_norm": 1.4194210889676588, "language_loss": 0.67922634, "learning_rate": 5.627761070828974e-07, "loss": 0.75606561, "num_input_tokens_seen": 273592815, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.08978271, "step": 12683, "time_per_iteration": 2.5857021808624268 }, { "auxiliary_loss_clip": 0.06414004, "auxiliary_loss_mlp": 0.01264814, "balance_loss_clip": 0.06273706, "balance_loss_mlp": 0.01255134, "epoch": 0.7626033368405231, "flos": 23994417081600.0, "grad_norm": 3.1870418096773987, "language_loss": 0.83562493, "learning_rate": 5.625052982818472e-07, "loss": 0.91241312, "num_input_tokens_seen": 273611790, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09680176, "step": 12684, "time_per_iteration": 2.5709075927734375 }, { "auxiliary_loss_clip": 0.06423561, "auxiliary_loss_mlp": 0.01267582, "balance_loss_clip": 0.0627969, "balance_loss_mlp": 0.01257068, "epoch": 0.7626634600931911, "flos": 12603150541440.0, "grad_norm": 1.7832024353280338, "language_loss": 0.83143574, "learning_rate": 5.622345439907396e-07, "loss": 0.90834713, "num_input_tokens_seen": 273628340, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10516357, "step": 12685, "time_per_iteration": 2.5252344608306885 }, { "auxiliary_loss_clip": 0.06421445, "auxiliary_loss_mlp": 0.01265308, "balance_loss_clip": 0.06279258, "balance_loss_mlp": 0.01255449, "epoch": 0.762723583345859, "flos": 26329731500160.0, "grad_norm": 1.9169453696512988, "language_loss": 0.77475286, "learning_rate": 5.619638442198422e-07, "loss": 0.85162038, "num_input_tokens_seen": 273646585, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09857178, "step": 12686, "time_per_iteration": 2.5807154178619385 }, { "auxiliary_loss_clip": 0.0642145, "auxiliary_loss_mlp": 0.01265074, "balance_loss_clip": 0.06276497, "balance_loss_mlp": 0.01254637, "epoch": 0.762783706598527, "flos": 21913325550720.0, "grad_norm": 1.619560855804878, "language_loss": 0.72066218, "learning_rate": 5.616931989794198e-07, "loss": 0.79752743, "num_input_tokens_seen": 273665410, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.10418701, "step": 12687, "time_per_iteration": 2.564939022064209 }, { "auxiliary_loss_clip": 0.06418756, "auxiliary_loss_mlp": 0.01267645, "balance_loss_clip": 0.06277438, "balance_loss_mlp": 0.01257656, "epoch": 0.7628438298511949, "flos": 15344983843200.0, "grad_norm": 1.9432058137731423, "language_loss": 0.6480478, "learning_rate": 5.614226082797369e-07, "loss": 0.72491181, "num_input_tokens_seen": 273683035, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09991455, "step": 12688, "time_per_iteration": 2.6623637676239014 }, { "auxiliary_loss_clip": 0.06412715, "auxiliary_loss_mlp": 0.0126552, "balance_loss_clip": 0.06274262, "balance_loss_mlp": 0.01256686, "epoch": 0.7629039531038629, "flos": 13011388433280.0, "grad_norm": 1.9398388771312194, "language_loss": 0.7119813, "learning_rate": 5.611520721310515e-07, "loss": 0.78876364, "num_input_tokens_seen": 273700130, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.08837891, "step": 12689, "time_per_iteration": 2.533459424972534 }, { "auxiliary_loss_clip": 0.06427463, "auxiliary_loss_mlp": 0.01263783, "balance_loss_clip": 0.06279553, "balance_loss_mlp": 0.0125424, "epoch": 0.7629640763565309, "flos": 26177938629120.0, "grad_norm": 2.717853023976747, "language_loss": 0.69758177, "learning_rate": 5.608815905436238e-07, "loss": 0.77449429, "num_input_tokens_seen": 273720310, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.09539795, "step": 12690, "time_per_iteration": 2.600489854812622 }, { "auxiliary_loss_clip": 0.06416465, "auxiliary_loss_mlp": 0.01265726, "balance_loss_clip": 0.06275386, "balance_loss_mlp": 0.01256166, "epoch": 0.7630241996091989, "flos": 36802553437440.0, "grad_norm": 1.7694308253475841, "language_loss": 0.69625854, "learning_rate": 5.606111635277109e-07, "loss": 0.77308047, "num_input_tokens_seen": 273744475, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09564209, "step": 12691, "time_per_iteration": 2.6920740604400635 }, { "auxiliary_loss_clip": 0.06413958, "auxiliary_loss_mlp": 0.01262701, "balance_loss_clip": 0.06273947, "balance_loss_mlp": 0.01253701, "epoch": 0.7630843228618668, "flos": 21841600855680.0, "grad_norm": 1.6520558234333649, "language_loss": 0.81858343, "learning_rate": 5.603407910935662e-07, "loss": 0.89534998, "num_input_tokens_seen": 273764635, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09002686, "step": 12692, "time_per_iteration": 2.5407516956329346 }, { "auxiliary_loss_clip": 0.06421688, "auxiliary_loss_mlp": 0.01268404, "balance_loss_clip": 0.06279427, "balance_loss_mlp": 0.01258933, "epoch": 0.7631444461145348, "flos": 12645385799040.0, "grad_norm": 3.6695157779233747, "language_loss": 0.77059436, "learning_rate": 5.600704732514438e-07, "loss": 0.84749532, "num_input_tokens_seen": 273780115, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.09466553, "step": 12693, "time_per_iteration": 3.9190406799316406 }, { "auxiliary_loss_clip": 0.06418563, "auxiliary_loss_mlp": 0.01265925, "balance_loss_clip": 0.06276146, "balance_loss_mlp": 0.01255888, "epoch": 0.7632045693672027, "flos": 16842215324160.0, "grad_norm": 2.208360912234172, "language_loss": 0.73151207, "learning_rate": 5.598002100115933e-07, "loss": 0.808357, "num_input_tokens_seen": 273796605, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.1003418, "step": 12694, "time_per_iteration": 2.506876230239868 }, { "auxiliary_loss_clip": 0.06417108, "auxiliary_loss_mlp": 0.01263733, "balance_loss_clip": 0.06276482, "balance_loss_mlp": 0.01253898, "epoch": 0.7632646926198707, "flos": 22023763632000.0, "grad_norm": 1.7610842596696779, "language_loss": 0.70729512, "learning_rate": 5.595300013842625e-07, "loss": 0.78410357, "num_input_tokens_seen": 273816515, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09832764, "step": 12695, "time_per_iteration": 2.582087278366089 }, { "auxiliary_loss_clip": 0.06417672, "auxiliary_loss_mlp": 0.01266299, "balance_loss_clip": 0.06276217, "balance_loss_mlp": 0.01256238, "epoch": 0.7633248158725388, "flos": 23120974920960.0, "grad_norm": 1.4297823394498856, "language_loss": 0.72364819, "learning_rate": 5.592598473796985e-07, "loss": 0.80048794, "num_input_tokens_seen": 273837060, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.10058594, "step": 12696, "time_per_iteration": 2.660799503326416 }, { "auxiliary_loss_clip": 0.06415615, "auxiliary_loss_mlp": 0.01264302, "balance_loss_clip": 0.06272723, "balance_loss_mlp": 0.01254616, "epoch": 0.7633849391252067, "flos": 10894518408960.0, "grad_norm": 2.3464057327742407, "language_loss": 0.7201823, "learning_rate": 5.589897480081453e-07, "loss": 0.79698145, "num_input_tokens_seen": 273853365, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.09692383, "step": 12697, "time_per_iteration": 2.5100557804107666 }, { "auxiliary_loss_clip": 0.06413107, "auxiliary_loss_mlp": 0.01262997, "balance_loss_clip": 0.06274301, "balance_loss_mlp": 0.01253651, "epoch": 0.7634450623778747, "flos": 21000163754880.0, "grad_norm": 2.4517556463770642, "language_loss": 0.66703415, "learning_rate": 5.587197032798461e-07, "loss": 0.74379528, "num_input_tokens_seen": 273870750, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09344482, "step": 12698, "time_per_iteration": 4.0406928062438965 }, { "auxiliary_loss_clip": 0.06418069, "auxiliary_loss_mlp": 0.01263935, "balance_loss_clip": 0.06276356, "balance_loss_mlp": 0.01254035, "epoch": 0.7635051856305426, "flos": 18888366902400.0, "grad_norm": 1.6439007301465798, "language_loss": 0.72441, "learning_rate": 5.5844971320504e-07, "loss": 0.80123001, "num_input_tokens_seen": 273890890, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09899902, "step": 12699, "time_per_iteration": 2.5828187465667725 }, { "auxiliary_loss_clip": 0.06415372, "auxiliary_loss_mlp": 0.01265225, "balance_loss_clip": 0.0627777, "balance_loss_mlp": 0.01256368, "epoch": 0.7635653088832106, "flos": 34795492588800.0, "grad_norm": 2.495884101600511, "language_loss": 0.73306459, "learning_rate": 5.581797777939648e-07, "loss": 0.80987054, "num_input_tokens_seen": 273914015, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.08856201, "step": 12700, "time_per_iteration": 2.681140422821045 }, { "auxiliary_loss_clip": 0.06417051, "auxiliary_loss_mlp": 0.01268525, "balance_loss_clip": 0.06274809, "balance_loss_mlp": 0.01258798, "epoch": 0.7636254321358785, "flos": 23183978791680.0, "grad_norm": 2.2670642408528248, "language_loss": 0.69274867, "learning_rate": 5.579098970568574e-07, "loss": 0.76960444, "num_input_tokens_seen": 273927415, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.097229, "step": 12701, "time_per_iteration": 2.516998529434204 }, { "auxiliary_loss_clip": 0.06415502, "auxiliary_loss_mlp": 0.01264269, "balance_loss_clip": 0.06275399, "balance_loss_mlp": 0.01254881, "epoch": 0.7636855553885465, "flos": 21331729560960.0, "grad_norm": 1.5002895116975041, "language_loss": 0.64505726, "learning_rate": 5.576400710039508e-07, "loss": 0.72185493, "num_input_tokens_seen": 273946690, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09387207, "step": 12702, "time_per_iteration": 2.562946319580078 }, { "auxiliary_loss_clip": 0.06417522, "auxiliary_loss_mlp": 0.01264894, "balance_loss_clip": 0.0627546, "balance_loss_mlp": 0.01255459, "epoch": 0.7637456786412145, "flos": 28665674824320.0, "grad_norm": 1.943922567960492, "language_loss": 0.65742731, "learning_rate": 5.57370299645477e-07, "loss": 0.7342515, "num_input_tokens_seen": 273966870, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09436035, "step": 12703, "time_per_iteration": 2.6479194164276123 }, { "auxiliary_loss_clip": 0.06419702, "auxiliary_loss_mlp": 0.01263247, "balance_loss_clip": 0.0627833, "balance_loss_mlp": 0.01254264, "epoch": 0.7638058018938825, "flos": 21913577112960.0, "grad_norm": 2.0090097521253747, "language_loss": 0.83505058, "learning_rate": 5.571005829916668e-07, "loss": 0.91188008, "num_input_tokens_seen": 273986360, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.08984375, "step": 12704, "time_per_iteration": 2.5324957370758057 }, { "auxiliary_loss_clip": 0.06416295, "auxiliary_loss_mlp": 0.01266859, "balance_loss_clip": 0.06275466, "balance_loss_mlp": 0.01257417, "epoch": 0.7638659251465504, "flos": 29651777199360.0, "grad_norm": 1.5683289046786455, "language_loss": 0.68249738, "learning_rate": 5.568309210527469e-07, "loss": 0.75932896, "num_input_tokens_seen": 274009745, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09448242, "step": 12705, "time_per_iteration": 2.612349033355713 }, { "auxiliary_loss_clip": 0.06412639, "auxiliary_loss_mlp": 0.01264661, "balance_loss_clip": 0.0627438, "balance_loss_mlp": 0.01255577, "epoch": 0.7639260483992184, "flos": 26148449191680.0, "grad_norm": 1.6979550189614205, "language_loss": 0.74118245, "learning_rate": 5.565613138389427e-07, "loss": 0.81795549, "num_input_tokens_seen": 274028775, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09088135, "step": 12706, "time_per_iteration": 2.593877077102661 }, { "auxiliary_loss_clip": 0.06416699, "auxiliary_loss_mlp": 0.01266109, "balance_loss_clip": 0.0627687, "balance_loss_mlp": 0.01256822, "epoch": 0.7639861716518863, "flos": 20162835504000.0, "grad_norm": 1.8112405420474513, "language_loss": 0.78662896, "learning_rate": 5.562917613604781e-07, "loss": 0.86345702, "num_input_tokens_seen": 274047520, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09289551, "step": 12707, "time_per_iteration": 2.5455377101898193 }, { "auxiliary_loss_clip": 0.06413092, "auxiliary_loss_mlp": 0.01264359, "balance_loss_clip": 0.06272257, "balance_loss_mlp": 0.01254465, "epoch": 0.7640462949045543, "flos": 18588219177600.0, "grad_norm": 1.7193271981716178, "language_loss": 0.80571461, "learning_rate": 5.560222636275751e-07, "loss": 0.88248903, "num_input_tokens_seen": 274065350, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09906006, "step": 12708, "time_per_iteration": 3.9692108631134033 }, { "auxiliary_loss_clip": 0.06322177, "auxiliary_loss_mlp": 0.01251099, "balance_loss_clip": 0.06265758, "balance_loss_mlp": 0.01249848, "epoch": 0.7641064181572224, "flos": 68342972538240.0, "grad_norm": 0.7941676028014655, "language_loss": 0.55446005, "learning_rate": 5.557528206504521e-07, "loss": 0.63019288, "num_input_tokens_seen": 274122315, "router_z_loss_clip": 0.56347656, "router_z_loss_mlp": 0.01250458, "step": 12709, "time_per_iteration": 3.197835683822632 }, { "auxiliary_loss_clip": 0.06421676, "auxiliary_loss_mlp": 0.01268012, "balance_loss_clip": 0.0627795, "balance_loss_mlp": 0.01257635, "epoch": 0.7641665414098903, "flos": 17974995471360.0, "grad_norm": 1.7460489198762066, "language_loss": 0.63691491, "learning_rate": 5.554834324393271e-07, "loss": 0.71381181, "num_input_tokens_seen": 274140555, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.1038208, "step": 12710, "time_per_iteration": 2.5269882678985596 }, { "auxiliary_loss_clip": 0.06417829, "auxiliary_loss_mlp": 0.01264273, "balance_loss_clip": 0.06273881, "balance_loss_mlp": 0.01254123, "epoch": 0.7642266646625583, "flos": 21258537419520.0, "grad_norm": 1.912230297572164, "language_loss": 0.65082425, "learning_rate": 5.552140990044154e-07, "loss": 0.72764528, "num_input_tokens_seen": 274161125, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.10150146, "step": 12711, "time_per_iteration": 2.564969301223755 }, { "auxiliary_loss_clip": 0.06415703, "auxiliary_loss_mlp": 0.01266754, "balance_loss_clip": 0.06274637, "balance_loss_mlp": 0.01257587, "epoch": 0.7642867879152262, "flos": 22754469162240.0, "grad_norm": 1.731914167331031, "language_loss": 0.73010671, "learning_rate": 5.549448203559293e-07, "loss": 0.80693132, "num_input_tokens_seen": 274180835, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09161377, "step": 12712, "time_per_iteration": 2.696211338043213 }, { "auxiliary_loss_clip": 0.06413436, "auxiliary_loss_mlp": 0.01263768, "balance_loss_clip": 0.06275298, "balance_loss_mlp": 0.01254572, "epoch": 0.7643469111678942, "flos": 23339000044800.0, "grad_norm": 1.516316185649364, "language_loss": 0.80887675, "learning_rate": 5.546755965040804e-07, "loss": 0.88564879, "num_input_tokens_seen": 274201190, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09197998, "step": 12713, "time_per_iteration": 2.6813910007476807 }, { "auxiliary_loss_clip": 0.06424829, "auxiliary_loss_mlp": 0.01267222, "balance_loss_clip": 0.0628031, "balance_loss_mlp": 0.01256314, "epoch": 0.7644070344205621, "flos": 19861891165440.0, "grad_norm": 1.9647990366305492, "language_loss": 0.83630788, "learning_rate": 5.544064274590776e-07, "loss": 0.91322839, "num_input_tokens_seen": 274217595, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10900879, "step": 12714, "time_per_iteration": 3.9679417610168457 }, { "auxiliary_loss_clip": 0.0642112, "auxiliary_loss_mlp": 0.01266191, "balance_loss_clip": 0.06277979, "balance_loss_mlp": 0.01256249, "epoch": 0.7644671576732301, "flos": 22097123481600.0, "grad_norm": 1.6015403680937823, "language_loss": 0.73111308, "learning_rate": 5.541373132311287e-07, "loss": 0.80798614, "num_input_tokens_seen": 274237885, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.09936523, "step": 12715, "time_per_iteration": 2.5935885906219482 }, { "auxiliary_loss_clip": 0.06413013, "auxiliary_loss_mlp": 0.012657, "balance_loss_clip": 0.06271297, "balance_loss_mlp": 0.01256694, "epoch": 0.7645272809258981, "flos": 25488084764160.0, "grad_norm": 1.7734135405779918, "language_loss": 0.63519311, "learning_rate": 5.538682538304376e-07, "loss": 0.71198022, "num_input_tokens_seen": 274258820, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09002686, "step": 12716, "time_per_iteration": 2.6192209720611572 }, { "auxiliary_loss_clip": 0.06423903, "auxiliary_loss_mlp": 0.01264118, "balance_loss_clip": 0.06277413, "balance_loss_mlp": 0.01253986, "epoch": 0.7645874041785661, "flos": 21548035676160.0, "grad_norm": 1.5141081370872516, "language_loss": 0.800201, "learning_rate": 5.535992492672068e-07, "loss": 0.87708122, "num_input_tokens_seen": 274278835, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.10144043, "step": 12717, "time_per_iteration": 2.555312395095825 }, { "auxiliary_loss_clip": 0.06412531, "auxiliary_loss_mlp": 0.01264629, "balance_loss_clip": 0.06274518, "balance_loss_mlp": 0.01255355, "epoch": 0.764647527431234, "flos": 20637096013440.0, "grad_norm": 2.6865958841254804, "language_loss": 0.66639292, "learning_rate": 5.53330299551638e-07, "loss": 0.74316454, "num_input_tokens_seen": 274297110, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09277344, "step": 12718, "time_per_iteration": 2.580915927886963 }, { "auxiliary_loss_clip": 0.06412395, "auxiliary_loss_mlp": 0.01265538, "balance_loss_clip": 0.06273723, "balance_loss_mlp": 0.01256395, "epoch": 0.764707650683902, "flos": 21440490560640.0, "grad_norm": 1.8158748717195354, "language_loss": 0.77459693, "learning_rate": 5.530614046939286e-07, "loss": 0.8513763, "num_input_tokens_seen": 274315610, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09143066, "step": 12719, "time_per_iteration": 2.5610294342041016 }, { "auxiliary_loss_clip": 0.06416751, "auxiliary_loss_mlp": 0.01262518, "balance_loss_clip": 0.06274916, "balance_loss_mlp": 0.0125325, "epoch": 0.7647677739365699, "flos": 22717852128000.0, "grad_norm": 3.1463627502058795, "language_loss": 0.70178914, "learning_rate": 5.527925647042754e-07, "loss": 0.77858186, "num_input_tokens_seen": 274333975, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.09265137, "step": 12720, "time_per_iteration": 2.6123569011688232 }, { "auxiliary_loss_clip": 0.06421033, "auxiliary_loss_mlp": 0.01263799, "balance_loss_clip": 0.06280205, "balance_loss_mlp": 0.01254, "epoch": 0.7648278971892379, "flos": 21330429822720.0, "grad_norm": 1.778924227315587, "language_loss": 0.74275661, "learning_rate": 5.52523779592875e-07, "loss": 0.81960493, "num_input_tokens_seen": 274353695, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09796143, "step": 12721, "time_per_iteration": 2.5673630237579346 }, { "auxiliary_loss_clip": 0.06418274, "auxiliary_loss_mlp": 0.01264618, "balance_loss_clip": 0.06275078, "balance_loss_mlp": 0.0125501, "epoch": 0.764888020441906, "flos": 20673545339520.0, "grad_norm": 1.7244805027862815, "language_loss": 0.73902279, "learning_rate": 5.522550493699163e-07, "loss": 0.81585169, "num_input_tokens_seen": 274371120, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.09606934, "step": 12722, "time_per_iteration": 2.5473005771636963 }, { "auxiliary_loss_clip": 0.06413732, "auxiliary_loss_mlp": 0.0126621, "balance_loss_clip": 0.06274639, "balance_loss_mlp": 0.0125656, "epoch": 0.7649481436945739, "flos": 25089532018560.0, "grad_norm": 1.9767167653774675, "language_loss": 0.74534929, "learning_rate": 5.519863740455912e-07, "loss": 0.82214868, "num_input_tokens_seen": 274389665, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09649658, "step": 12723, "time_per_iteration": 2.568502187728882 }, { "auxiliary_loss_clip": 0.06418331, "auxiliary_loss_mlp": 0.01263472, "balance_loss_clip": 0.06273004, "balance_loss_mlp": 0.01253464, "epoch": 0.7650082669472419, "flos": 24907998147840.0, "grad_norm": 2.0807469126871294, "language_loss": 0.7312274, "learning_rate": 5.517177536300881e-07, "loss": 0.80804539, "num_input_tokens_seen": 274408750, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.09997559, "step": 12724, "time_per_iteration": 2.573145866394043 }, { "auxiliary_loss_clip": 0.06412205, "auxiliary_loss_mlp": 0.0126647, "balance_loss_clip": 0.06272992, "balance_loss_mlp": 0.01256832, "epoch": 0.7650683901999098, "flos": 14652614355840.0, "grad_norm": 1.8145760832407687, "language_loss": 0.84325731, "learning_rate": 5.514491881335935e-07, "loss": 0.92004406, "num_input_tokens_seen": 274424600, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09637451, "step": 12725, "time_per_iteration": 2.496466875076294 }, { "auxiliary_loss_clip": 0.06418191, "auxiliary_loss_mlp": 0.01269907, "balance_loss_clip": 0.06279654, "balance_loss_mlp": 0.01259232, "epoch": 0.7651285134525778, "flos": 26358466250880.0, "grad_norm": 1.6205414510531322, "language_loss": 0.77562857, "learning_rate": 5.511806775662901e-07, "loss": 0.8525095, "num_input_tokens_seen": 274443075, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.10668945, "step": 12726, "time_per_iteration": 2.58561110496521 }, { "auxiliary_loss_clip": 0.06416528, "auxiliary_loss_mlp": 0.01264936, "balance_loss_clip": 0.06276301, "balance_loss_mlp": 0.012559, "epoch": 0.7651886367052457, "flos": 26653373095680.0, "grad_norm": 1.8672805740652043, "language_loss": 0.70727187, "learning_rate": 5.509122219383615e-07, "loss": 0.78408647, "num_input_tokens_seen": 274463240, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09039307, "step": 12727, "time_per_iteration": 2.587804079055786 }, { "auxiliary_loss_clip": 0.06408501, "auxiliary_loss_mlp": 0.01262461, "balance_loss_clip": 0.0627096, "balance_loss_mlp": 0.01253258, "epoch": 0.7652487599579137, "flos": 25709967175680.0, "grad_norm": 1.5984183897500854, "language_loss": 0.79554307, "learning_rate": 5.506438212599864e-07, "loss": 0.8722527, "num_input_tokens_seen": 274482750, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09197998, "step": 12728, "time_per_iteration": 2.6740529537200928 }, { "auxiliary_loss_clip": 0.0642083, "auxiliary_loss_mlp": 0.01266566, "balance_loss_clip": 0.06277955, "balance_loss_mlp": 0.01256314, "epoch": 0.7653088832105817, "flos": 28593237369600.0, "grad_norm": 2.496859328289993, "language_loss": 0.55518836, "learning_rate": 5.503754755413424e-07, "loss": 0.63206232, "num_input_tokens_seen": 274503545, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.1026001, "step": 12729, "time_per_iteration": 2.6369428634643555 }, { "auxiliary_loss_clip": 0.06412964, "auxiliary_loss_mlp": 0.01268028, "balance_loss_clip": 0.06273389, "balance_loss_mlp": 0.01257168, "epoch": 0.7653690064632497, "flos": 23373311091840.0, "grad_norm": 3.449233542736839, "language_loss": 0.77848351, "learning_rate": 5.501071847926055e-07, "loss": 0.85529345, "num_input_tokens_seen": 274523825, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.10870361, "step": 12730, "time_per_iteration": 2.5966739654541016 }, { "auxiliary_loss_clip": 0.06423707, "auxiliary_loss_mlp": 0.01263974, "balance_loss_clip": 0.06279893, "balance_loss_mlp": 0.01253471, "epoch": 0.7654291297159176, "flos": 15778560395520.0, "grad_norm": 1.7227819554063424, "language_loss": 0.69268596, "learning_rate": 5.498389490239495e-07, "loss": 0.76956278, "num_input_tokens_seen": 274541625, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.10510254, "step": 12731, "time_per_iteration": 2.539896249771118 }, { "auxiliary_loss_clip": 0.06421433, "auxiliary_loss_mlp": 0.01264469, "balance_loss_clip": 0.06278597, "balance_loss_mlp": 0.01254789, "epoch": 0.7654892529685856, "flos": 18038460539520.0, "grad_norm": 2.8203120485192334, "language_loss": 0.70511115, "learning_rate": 5.495707682455471e-07, "loss": 0.78197014, "num_input_tokens_seen": 274557580, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.09686279, "step": 12732, "time_per_iteration": 2.545440196990967 }, { "auxiliary_loss_clip": 0.06420912, "auxiliary_loss_mlp": 0.01267708, "balance_loss_clip": 0.06277116, "balance_loss_mlp": 0.01257546, "epoch": 0.7655493762212535, "flos": 27243522201600.0, "grad_norm": 1.65847376602568, "language_loss": 0.78611922, "learning_rate": 5.493026424675653e-07, "loss": 0.8630054, "num_input_tokens_seen": 274578135, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10162354, "step": 12733, "time_per_iteration": 4.0010576248168945 }, { "auxiliary_loss_clip": 0.0641752, "auxiliary_loss_mlp": 0.01264784, "balance_loss_clip": 0.06279401, "balance_loss_mlp": 0.01255152, "epoch": 0.7656094994739215, "flos": 20779706862720.0, "grad_norm": 1.838754921011946, "language_loss": 0.77701437, "learning_rate": 5.490345717001726e-07, "loss": 0.85383743, "num_input_tokens_seen": 274595655, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09631348, "step": 12734, "time_per_iteration": 2.5346713066101074 }, { "auxiliary_loss_clip": 0.06423667, "auxiliary_loss_mlp": 0.01266399, "balance_loss_clip": 0.06279016, "balance_loss_mlp": 0.01255998, "epoch": 0.7656696227265896, "flos": 23045896062720.0, "grad_norm": 1.788263128451083, "language_loss": 0.732777, "learning_rate": 5.48766555953535e-07, "loss": 0.80967772, "num_input_tokens_seen": 274616305, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10406494, "step": 12735, "time_per_iteration": 2.6011204719543457 }, { "auxiliary_loss_clip": 0.06418547, "auxiliary_loss_mlp": 0.01265744, "balance_loss_clip": 0.06275886, "balance_loss_mlp": 0.01256416, "epoch": 0.7657297459792575, "flos": 27532810823040.0, "grad_norm": 1.778470111610184, "language_loss": 0.73209536, "learning_rate": 5.484985952378145e-07, "loss": 0.80893826, "num_input_tokens_seen": 274638110, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09338379, "step": 12736, "time_per_iteration": 2.6032748222351074 }, { "auxiliary_loss_clip": 0.06425664, "auxiliary_loss_mlp": 0.01268826, "balance_loss_clip": 0.06279959, "balance_loss_mlp": 0.01257472, "epoch": 0.7657898692319255, "flos": 17134103422080.0, "grad_norm": 2.1419749675541992, "language_loss": 0.77957696, "learning_rate": 5.482306895631728e-07, "loss": 0.85652184, "num_input_tokens_seen": 274656565, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.11358643, "step": 12737, "time_per_iteration": 3.966278314590454 }, { "auxiliary_loss_clip": 0.0641682, "auxiliary_loss_mlp": 0.01266745, "balance_loss_clip": 0.06275108, "balance_loss_mlp": 0.01256368, "epoch": 0.7658499924845934, "flos": 21471363590400.0, "grad_norm": 1.797188636775861, "language_loss": 0.76470613, "learning_rate": 5.479628389397699e-07, "loss": 0.84154177, "num_input_tokens_seen": 274674215, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.1038208, "step": 12738, "time_per_iteration": 2.614213705062866 }, { "auxiliary_loss_clip": 0.06424162, "auxiliary_loss_mlp": 0.01265799, "balance_loss_clip": 0.06278997, "balance_loss_mlp": 0.01255314, "epoch": 0.7659101157372614, "flos": 29504302813440.0, "grad_norm": 2.2236066663577407, "language_loss": 0.63292527, "learning_rate": 5.476950433777603e-07, "loss": 0.70982492, "num_input_tokens_seen": 274693445, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.10491943, "step": 12739, "time_per_iteration": 2.623121976852417 }, { "auxiliary_loss_clip": 0.06419854, "auxiliary_loss_mlp": 0.01267853, "balance_loss_clip": 0.06278248, "balance_loss_mlp": 0.01257762, "epoch": 0.7659702389899293, "flos": 18557765418240.0, "grad_norm": 1.916947266227244, "language_loss": 0.79143018, "learning_rate": 5.474273028873004e-07, "loss": 0.86830723, "num_input_tokens_seen": 274712815, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.10089111, "step": 12740, "time_per_iteration": 2.560011386871338 }, { "auxiliary_loss_clip": 0.06419093, "auxiliary_loss_mlp": 0.01263413, "balance_loss_clip": 0.06277964, "balance_loss_mlp": 0.01252863, "epoch": 0.7660303622425974, "flos": 23555767357440.0, "grad_norm": 2.0732199524718293, "language_loss": 0.65696502, "learning_rate": 5.471596174785429e-07, "loss": 0.7337901, "num_input_tokens_seen": 274732690, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.10546875, "step": 12741, "time_per_iteration": 2.580723285675049 }, { "auxiliary_loss_clip": 0.06417534, "auxiliary_loss_mlp": 0.01265601, "balance_loss_clip": 0.06277063, "balance_loss_mlp": 0.0125554, "epoch": 0.7660904854952653, "flos": 18922761803520.0, "grad_norm": 1.8457157959517239, "language_loss": 0.76091969, "learning_rate": 5.468919871616386e-07, "loss": 0.83775103, "num_input_tokens_seen": 274752460, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.10064697, "step": 12742, "time_per_iteration": 2.5986576080322266 }, { "auxiliary_loss_clip": 0.06409672, "auxiliary_loss_mlp": 0.01262797, "balance_loss_clip": 0.06272885, "balance_loss_mlp": 0.01254024, "epoch": 0.7661506087479333, "flos": 23153986229760.0, "grad_norm": 1.3833040494861137, "language_loss": 0.76776719, "learning_rate": 5.46624411946736e-07, "loss": 0.84449196, "num_input_tokens_seen": 274773070, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.08776855, "step": 12743, "time_per_iteration": 2.5648269653320312 }, { "auxiliary_loss_clip": 0.06418072, "auxiliary_loss_mlp": 0.01265338, "balance_loss_clip": 0.06276235, "balance_loss_mlp": 0.01255921, "epoch": 0.7662107320006012, "flos": 17571411480960.0, "grad_norm": 2.9089785360071554, "language_loss": 0.75277889, "learning_rate": 5.463568918439805e-07, "loss": 0.82961303, "num_input_tokens_seen": 274790220, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.09423828, "step": 12744, "time_per_iteration": 2.5260236263275146 }, { "auxiliary_loss_clip": 0.0642014, "auxiliary_loss_mlp": 0.01266924, "balance_loss_clip": 0.06277215, "balance_loss_mlp": 0.0125669, "epoch": 0.7662708552532692, "flos": 22308524133120.0, "grad_norm": 2.1938439024909164, "language_loss": 0.70996797, "learning_rate": 5.460894268635181e-07, "loss": 0.78683865, "num_input_tokens_seen": 274805095, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10235596, "step": 12745, "time_per_iteration": 2.5297255516052246 }, { "auxiliary_loss_clip": 0.06415535, "auxiliary_loss_mlp": 0.01264216, "balance_loss_clip": 0.06274235, "balance_loss_mlp": 0.01254232, "epoch": 0.7663309785059371, "flos": 15747477730560.0, "grad_norm": 2.5707575722038203, "language_loss": 0.77017963, "learning_rate": 5.458220170154896e-07, "loss": 0.84697717, "num_input_tokens_seen": 274821800, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09979248, "step": 12746, "time_per_iteration": 2.514404773712158 }, { "auxiliary_loss_clip": 0.0632099, "auxiliary_loss_mlp": 0.01252573, "balance_loss_clip": 0.0626464, "balance_loss_mlp": 0.01251178, "epoch": 0.7663911017586051, "flos": 62184503877120.0, "grad_norm": 0.6646465240003405, "language_loss": 0.56716466, "learning_rate": 5.455546623100362e-07, "loss": 0.64290029, "num_input_tokens_seen": 274886970, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01396179, "step": 12747, "time_per_iteration": 4.659748315811157 }, { "auxiliary_loss_clip": 0.06414509, "auxiliary_loss_mlp": 0.01264676, "balance_loss_clip": 0.06276302, "balance_loss_mlp": 0.01256093, "epoch": 0.7664512250112732, "flos": 26513361722880.0, "grad_norm": 1.5038488676621171, "language_loss": 0.72687507, "learning_rate": 5.452873627572956e-07, "loss": 0.80366695, "num_input_tokens_seen": 274907240, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.08575439, "step": 12748, "time_per_iteration": 2.594836473464966 }, { "auxiliary_loss_clip": 0.064215, "auxiliary_loss_mlp": 0.01268636, "balance_loss_clip": 0.06280436, "balance_loss_mlp": 0.01259022, "epoch": 0.7665113482639411, "flos": 16254497986560.0, "grad_norm": 1.6589183386527042, "language_loss": 0.6951648, "learning_rate": 5.450201183674052e-07, "loss": 0.77206618, "num_input_tokens_seen": 274924650, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09613037, "step": 12749, "time_per_iteration": 2.5294320583343506 }, { "auxiliary_loss_clip": 0.0641868, "auxiliary_loss_mlp": 0.01265966, "balance_loss_clip": 0.06276026, "balance_loss_mlp": 0.01255464, "epoch": 0.7665714715166091, "flos": 27205102304640.0, "grad_norm": 1.6358786304235724, "language_loss": 0.73672581, "learning_rate": 5.447529291504967e-07, "loss": 0.81357229, "num_input_tokens_seen": 274944550, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.1050415, "step": 12750, "time_per_iteration": 2.6040806770324707 }, { "auxiliary_loss_clip": 0.06415505, "auxiliary_loss_mlp": 0.01265698, "balance_loss_clip": 0.06278637, "balance_loss_mlp": 0.01256596, "epoch": 0.766631594769277, "flos": 21073900947840.0, "grad_norm": 2.0649492999777266, "language_loss": 0.76274037, "learning_rate": 5.444857951167026e-07, "loss": 0.8395524, "num_input_tokens_seen": 274961330, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09094238, "step": 12751, "time_per_iteration": 2.569941282272339 }, { "auxiliary_loss_clip": 0.06419306, "auxiliary_loss_mlp": 0.01266404, "balance_loss_clip": 0.06279401, "balance_loss_mlp": 0.01256712, "epoch": 0.766691718021945, "flos": 24104897089920.0, "grad_norm": 1.8290827237207552, "language_loss": 0.61264938, "learning_rate": 5.442187162761537e-07, "loss": 0.68950653, "num_input_tokens_seen": 274981655, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09692383, "step": 12752, "time_per_iteration": 2.6929454803466797 }, { "auxiliary_loss_clip": 0.06422368, "auxiliary_loss_mlp": 0.01266566, "balance_loss_clip": 0.06279729, "balance_loss_mlp": 0.01255975, "epoch": 0.7667518412746129, "flos": 23447383701120.0, "grad_norm": 2.6404837669805508, "language_loss": 0.69105327, "learning_rate": 5.439516926389767e-07, "loss": 0.76794261, "num_input_tokens_seen": 274999970, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.105896, "step": 12753, "time_per_iteration": 2.5753331184387207 }, { "auxiliary_loss_clip": 0.0641721, "auxiliary_loss_mlp": 0.01265905, "balance_loss_clip": 0.06276967, "balance_loss_mlp": 0.01255778, "epoch": 0.766811964527281, "flos": 18154391063040.0, "grad_norm": 2.3174402333606947, "language_loss": 0.61967438, "learning_rate": 5.436847242152971e-07, "loss": 0.69650549, "num_input_tokens_seen": 275015805, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.10125732, "step": 12754, "time_per_iteration": 3.9434096813201904 }, { "auxiliary_loss_clip": 0.06416047, "auxiliary_loss_mlp": 0.0126424, "balance_loss_clip": 0.06277511, "balance_loss_mlp": 0.01254673, "epoch": 0.7668720877799489, "flos": 19542023003520.0, "grad_norm": 2.189366356595298, "language_loss": 0.80270362, "learning_rate": 5.434178110152401e-07, "loss": 0.87950647, "num_input_tokens_seen": 275031810, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09558105, "step": 12755, "time_per_iteration": 2.522589683532715 }, { "auxiliary_loss_clip": 0.06416921, "auxiliary_loss_mlp": 0.01267099, "balance_loss_clip": 0.06277739, "balance_loss_mlp": 0.01257121, "epoch": 0.7669322110326169, "flos": 22680899677440.0, "grad_norm": 2.933824105935251, "language_loss": 0.70682973, "learning_rate": 5.431509530489242e-07, "loss": 0.78366989, "num_input_tokens_seen": 275049325, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09985352, "step": 12756, "time_per_iteration": 2.544177293777466 }, { "auxiliary_loss_clip": 0.0642231, "auxiliary_loss_mlp": 0.01266357, "balance_loss_clip": 0.06280199, "balance_loss_mlp": 0.01256909, "epoch": 0.7669923342852848, "flos": 26476702761600.0, "grad_norm": 1.5828351301435553, "language_loss": 0.70111704, "learning_rate": 5.428841503264706e-07, "loss": 0.77800369, "num_input_tokens_seen": 275070865, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09442139, "step": 12757, "time_per_iteration": 2.5825278759002686 }, { "auxiliary_loss_clip": 0.06416842, "auxiliary_loss_mlp": 0.01265228, "balance_loss_clip": 0.06277329, "balance_loss_mlp": 0.01254618, "epoch": 0.7670524575379528, "flos": 22862643183360.0, "grad_norm": 1.8600571760110314, "language_loss": 0.7622205, "learning_rate": 5.426174028579955e-07, "loss": 0.83904123, "num_input_tokens_seen": 275088015, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.10614014, "step": 12758, "time_per_iteration": 2.526986598968506 }, { "auxiliary_loss_clip": 0.06414017, "auxiliary_loss_mlp": 0.01267431, "balance_loss_clip": 0.06276228, "balance_loss_mlp": 0.01257823, "epoch": 0.7671125807906207, "flos": 22458136798080.0, "grad_norm": 1.7232311541977339, "language_loss": 0.76497084, "learning_rate": 5.423507106536156e-07, "loss": 0.84178531, "num_input_tokens_seen": 275106975, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09606934, "step": 12759, "time_per_iteration": 2.6233041286468506 }, { "auxiliary_loss_clip": 0.06419712, "auxiliary_loss_mlp": 0.01263479, "balance_loss_clip": 0.06275416, "balance_loss_mlp": 0.01254473, "epoch": 0.7671727040432887, "flos": 35380275033600.0, "grad_norm": 5.309648035931793, "language_loss": 0.68483728, "learning_rate": 5.420840737234425e-07, "loss": 0.76166916, "num_input_tokens_seen": 275129560, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.09014893, "step": 12760, "time_per_iteration": 2.8535895347595215 }, { "auxiliary_loss_clip": 0.06419646, "auxiliary_loss_mlp": 0.0126407, "balance_loss_clip": 0.06276961, "balance_loss_mlp": 0.01253985, "epoch": 0.7672328272959568, "flos": 22502007210240.0, "grad_norm": 1.5159111817113309, "language_loss": 0.79294336, "learning_rate": 5.418174920775871e-07, "loss": 0.86978048, "num_input_tokens_seen": 275151180, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.10083008, "step": 12761, "time_per_iteration": 2.638519048690796 }, { "auxiliary_loss_clip": 0.0641218, "auxiliary_loss_mlp": 0.0126761, "balance_loss_clip": 0.06275117, "balance_loss_mlp": 0.01257882, "epoch": 0.7672929505486247, "flos": 22821372247680.0, "grad_norm": 1.5460071234440722, "language_loss": 0.66071308, "learning_rate": 5.415509657261589e-07, "loss": 0.73751092, "num_input_tokens_seen": 275170605, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09735107, "step": 12762, "time_per_iteration": 2.571284770965576 }, { "auxiliary_loss_clip": 0.06421287, "auxiliary_loss_mlp": 0.01264825, "balance_loss_clip": 0.06278982, "balance_loss_mlp": 0.01255419, "epoch": 0.7673530738012927, "flos": 20344956353280.0, "grad_norm": 1.6832392538274876, "language_loss": 0.74226236, "learning_rate": 5.412844946792639e-07, "loss": 0.81912345, "num_input_tokens_seen": 275188750, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09405518, "step": 12763, "time_per_iteration": 2.5310287475585938 }, { "auxiliary_loss_clip": 0.06416477, "auxiliary_loss_mlp": 0.01264604, "balance_loss_clip": 0.06276949, "balance_loss_mlp": 0.01254596, "epoch": 0.7674131970539606, "flos": 34942212288000.0, "grad_norm": 1.344287772720094, "language_loss": 0.70859832, "learning_rate": 5.410180789470067e-07, "loss": 0.78540909, "num_input_tokens_seen": 275211365, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.10009766, "step": 12764, "time_per_iteration": 2.7313780784606934 }, { "auxiliary_loss_clip": 0.06415581, "auxiliary_loss_mlp": 0.01266417, "balance_loss_clip": 0.06277237, "balance_loss_mlp": 0.01257327, "epoch": 0.7674733203066286, "flos": 28336247297280.0, "grad_norm": 1.5882317628169291, "language_loss": 0.69272029, "learning_rate": 5.40751718539491e-07, "loss": 0.76954025, "num_input_tokens_seen": 275231670, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09082031, "step": 12765, "time_per_iteration": 2.6241776943206787 }, { "auxiliary_loss_clip": 0.06410491, "auxiliary_loss_mlp": 0.01264645, "balance_loss_clip": 0.06272124, "balance_loss_mlp": 0.01255293, "epoch": 0.7675334435592965, "flos": 16295307724800.0, "grad_norm": 1.906349518457467, "language_loss": 0.60923409, "learning_rate": 5.404854134668162e-07, "loss": 0.68598545, "num_input_tokens_seen": 275249425, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09350586, "step": 12766, "time_per_iteration": 2.528245687484741 }, { "auxiliary_loss_clip": 0.06325853, "auxiliary_loss_mlp": 0.01250123, "balance_loss_clip": 0.06269279, "balance_loss_mlp": 0.01248769, "epoch": 0.7675935668119646, "flos": 64847778376320.0, "grad_norm": 0.7203215158358007, "language_loss": 0.60684395, "learning_rate": 5.402191637390803e-07, "loss": 0.68260372, "num_input_tokens_seen": 275312485, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.01355743, "step": 12767, "time_per_iteration": 3.283768653869629 }, { "auxiliary_loss_clip": 0.06413597, "auxiliary_loss_mlp": 0.0126488, "balance_loss_clip": 0.06274661, "balance_loss_mlp": 0.01255689, "epoch": 0.7676536900646325, "flos": 22682157488640.0, "grad_norm": 1.637409741636717, "language_loss": 0.69420576, "learning_rate": 5.399529693663801e-07, "loss": 0.77099055, "num_input_tokens_seen": 275331680, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09185791, "step": 12768, "time_per_iteration": 2.5735435485839844 }, { "auxiliary_loss_clip": 0.06422757, "auxiliary_loss_mlp": 0.01270161, "balance_loss_clip": 0.06276288, "balance_loss_mlp": 0.0125917, "epoch": 0.7677138133173005, "flos": 26946393223680.0, "grad_norm": 1.8177903134586888, "language_loss": 0.70950544, "learning_rate": 5.3968683035881e-07, "loss": 0.78643465, "num_input_tokens_seen": 275351615, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.10998535, "step": 12769, "time_per_iteration": 2.584031820297241 }, { "auxiliary_loss_clip": 0.06424207, "auxiliary_loss_mlp": 0.01270258, "balance_loss_clip": 0.06279714, "balance_loss_mlp": 0.01259839, "epoch": 0.7677739365699684, "flos": 23805336343680.0, "grad_norm": 2.1101848841440125, "language_loss": 0.80675066, "learning_rate": 5.394207467264611e-07, "loss": 0.88369524, "num_input_tokens_seen": 275368815, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10418701, "step": 12770, "time_per_iteration": 2.5555319786071777 }, { "auxiliary_loss_clip": 0.06413297, "auxiliary_loss_mlp": 0.01264546, "balance_loss_clip": 0.06275787, "balance_loss_mlp": 0.01255856, "epoch": 0.7678340598226364, "flos": 34463423658240.0, "grad_norm": 3.709060499109123, "language_loss": 0.78773785, "learning_rate": 5.391547184794245e-07, "loss": 0.86451626, "num_input_tokens_seen": 275389345, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.08691406, "step": 12771, "time_per_iteration": 2.664888620376587 }, { "auxiliary_loss_clip": 0.0641668, "auxiliary_loss_mlp": 0.01264143, "balance_loss_clip": 0.062755, "balance_loss_mlp": 0.01254344, "epoch": 0.7678941830753043, "flos": 23848493996160.0, "grad_norm": 1.4059405666729632, "language_loss": 0.68840003, "learning_rate": 5.388887456277876e-07, "loss": 0.76520824, "num_input_tokens_seen": 275411240, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09802246, "step": 12772, "time_per_iteration": 3.9999709129333496 }, { "auxiliary_loss_clip": 0.06411625, "auxiliary_loss_mlp": 0.01265972, "balance_loss_clip": 0.06277183, "balance_loss_mlp": 0.0125665, "epoch": 0.7679543063279723, "flos": 25417995223680.0, "grad_norm": 3.5023227428099015, "language_loss": 0.73839581, "learning_rate": 5.386228281816349e-07, "loss": 0.81517172, "num_input_tokens_seen": 275432010, "router_z_loss_clip": 1.34570312, "router_z_loss_mlp": 0.09320068, "step": 12773, "time_per_iteration": 2.6077373027801514 }, { "auxiliary_loss_clip": 0.06407349, "auxiliary_loss_mlp": 0.01265671, "balance_loss_clip": 0.06270986, "balance_loss_mlp": 0.01257565, "epoch": 0.7680144295806404, "flos": 27969448049280.0, "grad_norm": 1.5788232878729045, "language_loss": 0.81079137, "learning_rate": 5.383569661510512e-07, "loss": 0.88752162, "num_input_tokens_seen": 275453710, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.08105469, "step": 12774, "time_per_iteration": 2.6080644130706787 }, { "auxiliary_loss_clip": 0.06413664, "auxiliary_loss_mlp": 0.01264979, "balance_loss_clip": 0.06276217, "balance_loss_mlp": 0.01255288, "epoch": 0.7680745528333083, "flos": 20419112816640.0, "grad_norm": 1.62316931537584, "language_loss": 0.70236176, "learning_rate": 5.380911595461177e-07, "loss": 0.77914822, "num_input_tokens_seen": 275472915, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09692383, "step": 12775, "time_per_iteration": 2.5515213012695312 }, { "auxiliary_loss_clip": 0.06325826, "auxiliary_loss_mlp": 0.01252118, "balance_loss_clip": 0.0626926, "balance_loss_mlp": 0.01250816, "epoch": 0.7681346760859763, "flos": 68423124568320.0, "grad_norm": 0.6770185509946666, "language_loss": 0.56821907, "learning_rate": 5.378254083769147e-07, "loss": 0.6439985, "num_input_tokens_seen": 275534785, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.01303864, "step": 12776, "time_per_iteration": 3.2338972091674805 }, { "auxiliary_loss_clip": 0.06415379, "auxiliary_loss_mlp": 0.01265283, "balance_loss_clip": 0.06275129, "balance_loss_mlp": 0.01256366, "epoch": 0.7681947993386442, "flos": 21257824659840.0, "grad_norm": 1.8149056890988908, "language_loss": 0.74326301, "learning_rate": 5.375597126535188e-07, "loss": 0.82006967, "num_input_tokens_seen": 275553205, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.08917236, "step": 12777, "time_per_iteration": 3.9992380142211914 }, { "auxiliary_loss_clip": 0.06421781, "auxiliary_loss_mlp": 0.01264945, "balance_loss_clip": 0.06281818, "balance_loss_mlp": 0.01255849, "epoch": 0.7682549225913122, "flos": 21404125088640.0, "grad_norm": 2.1459393886000293, "language_loss": 0.70404422, "learning_rate": 5.372940723860043e-07, "loss": 0.78091145, "num_input_tokens_seen": 275571490, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09100342, "step": 12778, "time_per_iteration": 2.564025640487671 }, { "auxiliary_loss_clip": 0.06415875, "auxiliary_loss_mlp": 0.01263096, "balance_loss_clip": 0.0627723, "balance_loss_mlp": 0.0125375, "epoch": 0.7683150458439801, "flos": 23045518719360.0, "grad_norm": 1.6744497990690423, "language_loss": 0.69981229, "learning_rate": 5.37028487584446e-07, "loss": 0.77660197, "num_input_tokens_seen": 275589665, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09338379, "step": 12779, "time_per_iteration": 2.555799722671509 }, { "auxiliary_loss_clip": 0.06418613, "auxiliary_loss_mlp": 0.01266632, "balance_loss_clip": 0.06278107, "balance_loss_mlp": 0.01256785, "epoch": 0.7683751690966482, "flos": 67346361204480.0, "grad_norm": 1.5799386690197985, "language_loss": 0.58633387, "learning_rate": 5.367629582589133e-07, "loss": 0.66318637, "num_input_tokens_seen": 275615605, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09844971, "step": 12780, "time_per_iteration": 2.969148635864258 }, { "auxiliary_loss_clip": 0.06423146, "auxiliary_loss_mlp": 0.0127106, "balance_loss_clip": 0.06276919, "balance_loss_mlp": 0.0126085, "epoch": 0.7684352923493161, "flos": 21805361164800.0, "grad_norm": 1.757858559699524, "language_loss": 0.68321157, "learning_rate": 5.364974844194759e-07, "loss": 0.76015365, "num_input_tokens_seen": 275634965, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10205078, "step": 12781, "time_per_iteration": 2.6093368530273438 }, { "auxiliary_loss_clip": 0.06417574, "auxiliary_loss_mlp": 0.01263605, "balance_loss_clip": 0.06275691, "balance_loss_mlp": 0.01254372, "epoch": 0.7684954156019841, "flos": 25854548595840.0, "grad_norm": 1.4849459422698716, "language_loss": 0.79371619, "learning_rate": 5.362320660762016e-07, "loss": 0.87052792, "num_input_tokens_seen": 275655785, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09222412, "step": 12782, "time_per_iteration": 2.610477924346924 }, { "auxiliary_loss_clip": 0.06419699, "auxiliary_loss_mlp": 0.01266358, "balance_loss_clip": 0.06276917, "balance_loss_mlp": 0.0125554, "epoch": 0.768555538854652, "flos": 25454444549760.0, "grad_norm": 1.7326261350354537, "language_loss": 0.67116952, "learning_rate": 5.35966703239153e-07, "loss": 0.74803007, "num_input_tokens_seen": 275676160, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10821533, "step": 12783, "time_per_iteration": 2.5951321125030518 }, { "auxiliary_loss_clip": 0.06420647, "auxiliary_loss_mlp": 0.01263522, "balance_loss_clip": 0.0627871, "balance_loss_mlp": 0.01253973, "epoch": 0.76861566210732, "flos": 19652503011840.0, "grad_norm": 1.8025729419113863, "language_loss": 0.69259417, "learning_rate": 5.357013959183938e-07, "loss": 0.76943588, "num_input_tokens_seen": 275695660, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09545898, "step": 12784, "time_per_iteration": 2.537168502807617 }, { "auxiliary_loss_clip": 0.06417409, "auxiliary_loss_mlp": 0.0126415, "balance_loss_clip": 0.0627705, "balance_loss_mlp": 0.01255477, "epoch": 0.7686757853599879, "flos": 22425586686720.0, "grad_norm": 1.6669598165303696, "language_loss": 0.80491579, "learning_rate": 5.354361441239843e-07, "loss": 0.88173139, "num_input_tokens_seen": 275714025, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.08673096, "step": 12785, "time_per_iteration": 2.5712528228759766 }, { "auxiliary_loss_clip": 0.0641723, "auxiliary_loss_mlp": 0.01268037, "balance_loss_clip": 0.06275722, "balance_loss_mlp": 0.01257087, "epoch": 0.768735908612656, "flos": 47784659690880.0, "grad_norm": 1.6666888760339904, "language_loss": 0.77428544, "learning_rate": 5.351709478659836e-07, "loss": 0.85113811, "num_input_tokens_seen": 275737300, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.10943604, "step": 12786, "time_per_iteration": 2.811969757080078 }, { "auxiliary_loss_clip": 0.06414931, "auxiliary_loss_mlp": 0.01263503, "balance_loss_clip": 0.06274853, "balance_loss_mlp": 0.01254103, "epoch": 0.7687960318653239, "flos": 30270996472320.0, "grad_norm": 1.810108566129326, "language_loss": 0.58999884, "learning_rate": 5.349058071544468e-07, "loss": 0.66678321, "num_input_tokens_seen": 275757895, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09399414, "step": 12787, "time_per_iteration": 4.047062873840332 }, { "auxiliary_loss_clip": 0.06409016, "auxiliary_loss_mlp": 0.01263181, "balance_loss_clip": 0.06272902, "balance_loss_mlp": 0.01254181, "epoch": 0.7688561551179919, "flos": 19579562432640.0, "grad_norm": 1.5299040938949247, "language_loss": 0.76294208, "learning_rate": 5.346407219994292e-07, "loss": 0.83966398, "num_input_tokens_seen": 275776745, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.09002686, "step": 12788, "time_per_iteration": 2.701097011566162 }, { "auxiliary_loss_clip": 0.06419954, "auxiliary_loss_mlp": 0.01268109, "balance_loss_clip": 0.06278472, "balance_loss_mlp": 0.01258077, "epoch": 0.7689162783706599, "flos": 22790373436800.0, "grad_norm": 1.7668691345414007, "language_loss": 0.67290491, "learning_rate": 5.343756924109821e-07, "loss": 0.74978554, "num_input_tokens_seen": 275797205, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.10028076, "step": 12789, "time_per_iteration": 2.5556981563568115 }, { "auxiliary_loss_clip": 0.0642129, "auxiliary_loss_mlp": 0.01268651, "balance_loss_clip": 0.06278306, "balance_loss_mlp": 0.01258768, "epoch": 0.7689764016233278, "flos": 34212764568960.0, "grad_norm": 2.076599433882247, "language_loss": 0.69086277, "learning_rate": 5.341107183991553e-07, "loss": 0.76776218, "num_input_tokens_seen": 275817935, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.09887695, "step": 12790, "time_per_iteration": 2.6502723693847656 }, { "auxiliary_loss_clip": 0.06415365, "auxiliary_loss_mlp": 0.01265549, "balance_loss_clip": 0.06276583, "balance_loss_mlp": 0.01256387, "epoch": 0.7690365248759958, "flos": 17280152288640.0, "grad_norm": 1.6017059804677842, "language_loss": 0.68720603, "learning_rate": 5.338457999739969e-07, "loss": 0.7640152, "num_input_tokens_seen": 275837145, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09161377, "step": 12791, "time_per_iteration": 2.536367177963257 }, { "auxiliary_loss_clip": 0.06413934, "auxiliary_loss_mlp": 0.0126427, "balance_loss_clip": 0.06275397, "balance_loss_mlp": 0.01255419, "epoch": 0.7690966481286637, "flos": 18229008723840.0, "grad_norm": 1.758310401625906, "language_loss": 0.80197132, "learning_rate": 5.335809371455526e-07, "loss": 0.87875336, "num_input_tokens_seen": 275855705, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.08850098, "step": 12792, "time_per_iteration": 2.517840623855591 }, { "auxiliary_loss_clip": 0.06423398, "auxiliary_loss_mlp": 0.01266665, "balance_loss_clip": 0.06278482, "balance_loss_mlp": 0.01257015, "epoch": 0.7691567713813318, "flos": 21543004431360.0, "grad_norm": 1.863457945128159, "language_loss": 0.7356692, "learning_rate": 5.333161299238673e-07, "loss": 0.8125698, "num_input_tokens_seen": 275873930, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.09649658, "step": 12793, "time_per_iteration": 2.541229486465454 }, { "auxiliary_loss_clip": 0.06420483, "auxiliary_loss_mlp": 0.01266024, "balance_loss_clip": 0.06276573, "balance_loss_mlp": 0.0125607, "epoch": 0.7692168946339997, "flos": 39388568872320.0, "grad_norm": 1.5741145530237317, "language_loss": 0.64011669, "learning_rate": 5.330513783189803e-07, "loss": 0.71698177, "num_input_tokens_seen": 275895895, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.09954834, "step": 12794, "time_per_iteration": 4.128928184509277 }, { "auxiliary_loss_clip": 0.06420957, "auxiliary_loss_mlp": 0.01265465, "balance_loss_clip": 0.06276879, "balance_loss_mlp": 0.01255809, "epoch": 0.7692770178866677, "flos": 25017010709760.0, "grad_norm": 1.4629518587326196, "language_loss": 0.76453769, "learning_rate": 5.327866823409319e-07, "loss": 0.84140193, "num_input_tokens_seen": 275917825, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.09655762, "step": 12795, "time_per_iteration": 2.595264434814453 }, { "auxiliary_loss_clip": 0.0641771, "auxiliary_loss_mlp": 0.01263066, "balance_loss_clip": 0.06273881, "balance_loss_mlp": 0.01253154, "epoch": 0.7693371411393356, "flos": 24722984332800.0, "grad_norm": 1.5144621788745642, "language_loss": 0.71962696, "learning_rate": 5.325220419997601e-07, "loss": 0.7964347, "num_input_tokens_seen": 275937890, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.09918213, "step": 12796, "time_per_iteration": 2.6023545265197754 }, { "auxiliary_loss_clip": 0.06415959, "auxiliary_loss_mlp": 0.01264844, "balance_loss_clip": 0.0627555, "balance_loss_mlp": 0.01254807, "epoch": 0.7693972643920036, "flos": 15930311339520.0, "grad_norm": 2.3626152018384925, "language_loss": 0.65262687, "learning_rate": 5.32257457305499e-07, "loss": 0.72943497, "num_input_tokens_seen": 275954495, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.1003418, "step": 12797, "time_per_iteration": 2.5177948474884033 }, { "auxiliary_loss_clip": 0.06422238, "auxiliary_loss_mlp": 0.01272422, "balance_loss_clip": 0.06280433, "balance_loss_mlp": 0.01261944, "epoch": 0.7694573876446715, "flos": 25412125438080.0, "grad_norm": 1.7721317206709943, "language_loss": 0.91653013, "learning_rate": 5.319929282681823e-07, "loss": 0.99347675, "num_input_tokens_seen": 275972395, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.1048584, "step": 12798, "time_per_iteration": 2.55914568901062 }, { "auxiliary_loss_clip": 0.06414866, "auxiliary_loss_mlp": 0.01268839, "balance_loss_clip": 0.06272598, "balance_loss_mlp": 0.01259279, "epoch": 0.7695175108973396, "flos": 16659800985600.0, "grad_norm": 1.885196258458535, "language_loss": 0.82947665, "learning_rate": 5.317284548978418e-07, "loss": 0.90631366, "num_input_tokens_seen": 275989020, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09558105, "step": 12799, "time_per_iteration": 2.510721206665039 }, { "auxiliary_loss_clip": 0.06417271, "auxiliary_loss_mlp": 0.01266554, "balance_loss_clip": 0.06275111, "balance_loss_mlp": 0.01256284, "epoch": 0.7695776341500075, "flos": 13631697809280.0, "grad_norm": 1.972582432663231, "language_loss": 0.78358459, "learning_rate": 5.314640372045045e-07, "loss": 0.86042285, "num_input_tokens_seen": 276006525, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10272217, "step": 12800, "time_per_iteration": 2.510012149810791 }, { "auxiliary_loss_clip": 0.06423097, "auxiliary_loss_mlp": 0.01268052, "balance_loss_clip": 0.06275524, "balance_loss_mlp": 0.01257251, "epoch": 0.7696377574026755, "flos": 24283034870400.0, "grad_norm": 1.8900178915476729, "language_loss": 0.84094739, "learning_rate": 5.31199675198198e-07, "loss": 0.9178589, "num_input_tokens_seen": 276027130, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.10797119, "step": 12801, "time_per_iteration": 2.6424825191497803 }, { "auxiliary_loss_clip": 0.06418993, "auxiliary_loss_mlp": 0.01265764, "balance_loss_clip": 0.06277348, "balance_loss_mlp": 0.0125612, "epoch": 0.7696978806553435, "flos": 20929445308800.0, "grad_norm": 1.9173879854017457, "language_loss": 0.72101152, "learning_rate": 5.30935368888947e-07, "loss": 0.79785913, "num_input_tokens_seen": 276045715, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09643555, "step": 12802, "time_per_iteration": 2.5407958030700684 }, { "auxiliary_loss_clip": 0.06413694, "auxiliary_loss_mlp": 0.01263227, "balance_loss_clip": 0.06276593, "balance_loss_mlp": 0.01253529, "epoch": 0.7697580039080114, "flos": 22936212668160.0, "grad_norm": 1.7459460172143182, "language_loss": 0.765248, "learning_rate": 5.306711182867747e-07, "loss": 0.84201717, "num_input_tokens_seen": 276065375, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09698486, "step": 12803, "time_per_iteration": 2.587442398071289 }, { "auxiliary_loss_clip": 0.06313176, "auxiliary_loss_mlp": 0.01251464, "balance_loss_clip": 0.06256669, "balance_loss_mlp": 0.01250193, "epoch": 0.7698181271606794, "flos": 68737751850240.0, "grad_norm": 0.7254819841544858, "language_loss": 0.55860966, "learning_rate": 5.304069234017001e-07, "loss": 0.63425606, "num_input_tokens_seen": 276131405, "router_z_loss_clip": 0.56298828, "router_z_loss_mlp": 0.01270294, "step": 12804, "time_per_iteration": 3.17899751663208 }, { "auxiliary_loss_clip": 0.06312536, "auxiliary_loss_mlp": 0.01253188, "balance_loss_clip": 0.06256152, "balance_loss_mlp": 0.01251967, "epoch": 0.7698782504133473, "flos": 67430523502080.0, "grad_norm": 0.7460096990154175, "language_loss": 0.53888571, "learning_rate": 5.301427842437429e-07, "loss": 0.61454296, "num_input_tokens_seen": 276200755, "router_z_loss_clip": 0.56298828, "router_z_loss_mlp": 0.01220703, "step": 12805, "time_per_iteration": 3.3180480003356934 }, { "auxiliary_loss_clip": 0.06418407, "auxiliary_loss_mlp": 0.01267209, "balance_loss_clip": 0.06278355, "balance_loss_mlp": 0.0125763, "epoch": 0.7699383736660154, "flos": 22494879613440.0, "grad_norm": 2.615105464442247, "language_loss": 0.73228097, "learning_rate": 5.298787008229187e-07, "loss": 0.80913711, "num_input_tokens_seen": 276217880, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09576416, "step": 12806, "time_per_iteration": 2.583350658416748 }, { "auxiliary_loss_clip": 0.06413228, "auxiliary_loss_mlp": 0.01265783, "balance_loss_clip": 0.06273173, "balance_loss_mlp": 0.01256354, "epoch": 0.7699984969186833, "flos": 21545520053760.0, "grad_norm": 2.0290401327481136, "language_loss": 0.75129342, "learning_rate": 5.296146731492408e-07, "loss": 0.82808352, "num_input_tokens_seen": 276234810, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09436035, "step": 12807, "time_per_iteration": 2.6828434467315674 }, { "auxiliary_loss_clip": 0.06418877, "auxiliary_loss_mlp": 0.01266764, "balance_loss_clip": 0.06273824, "balance_loss_mlp": 0.0125619, "epoch": 0.7700586201713513, "flos": 21724412520960.0, "grad_norm": 2.0720397249399953, "language_loss": 0.80944347, "learning_rate": 5.293507012327218e-07, "loss": 0.88629997, "num_input_tokens_seen": 276252850, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10577393, "step": 12808, "time_per_iteration": 2.5996196269989014 }, { "auxiliary_loss_clip": 0.06421416, "auxiliary_loss_mlp": 0.01267248, "balance_loss_clip": 0.06276142, "balance_loss_mlp": 0.01257091, "epoch": 0.7701187434240192, "flos": 27863580015360.0, "grad_norm": 2.0658114642681555, "language_loss": 0.79062223, "learning_rate": 5.290867850833718e-07, "loss": 0.86750889, "num_input_tokens_seen": 276272525, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.1015625, "step": 12809, "time_per_iteration": 2.614989995956421 }, { "auxiliary_loss_clip": 0.06412601, "auxiliary_loss_mlp": 0.01262427, "balance_loss_clip": 0.06275893, "balance_loss_mlp": 0.01253671, "epoch": 0.7701788666766872, "flos": 28628848154880.0, "grad_norm": 1.4238347618599652, "language_loss": 0.70435369, "learning_rate": 5.288229247111993e-07, "loss": 0.78110391, "num_input_tokens_seen": 276294210, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.08746338, "step": 12810, "time_per_iteration": 2.63643741607666 }, { "auxiliary_loss_clip": 0.06419785, "auxiliary_loss_mlp": 0.01270438, "balance_loss_clip": 0.06274389, "balance_loss_mlp": 0.01258565, "epoch": 0.7702389899293551, "flos": 14251671768960.0, "grad_norm": 2.4323907915886376, "language_loss": 0.78819644, "learning_rate": 5.285591201262079e-07, "loss": 0.86509866, "num_input_tokens_seen": 276310290, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.11865234, "step": 12811, "time_per_iteration": 2.5121734142303467 }, { "auxiliary_loss_clip": 0.06312905, "auxiliary_loss_mlp": 0.01251461, "balance_loss_clip": 0.06256405, "balance_loss_mlp": 0.01250267, "epoch": 0.7702991131820232, "flos": 70593816441600.0, "grad_norm": 0.7931732320184819, "language_loss": 0.5669167, "learning_rate": 5.28295371338402e-07, "loss": 0.64256036, "num_input_tokens_seen": 276371715, "router_z_loss_clip": 0.56542969, "router_z_loss_mlp": 0.01191711, "step": 12812, "time_per_iteration": 4.604060173034668 }, { "auxiliary_loss_clip": 0.06419688, "auxiliary_loss_mlp": 0.01266103, "balance_loss_clip": 0.06277503, "balance_loss_mlp": 0.01255774, "epoch": 0.7703592364346911, "flos": 25486449609600.0, "grad_norm": 1.6935404308489737, "language_loss": 0.71986628, "learning_rate": 5.280316783577836e-07, "loss": 0.79672426, "num_input_tokens_seen": 276389895, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10327148, "step": 12813, "time_per_iteration": 2.5701372623443604 }, { "auxiliary_loss_clip": 0.06420171, "auxiliary_loss_mlp": 0.01266126, "balance_loss_clip": 0.06276422, "balance_loss_mlp": 0.0125594, "epoch": 0.7704193596873591, "flos": 19286877720960.0, "grad_norm": 1.6599662213195807, "language_loss": 0.67035043, "learning_rate": 5.27768041194351e-07, "loss": 0.74721336, "num_input_tokens_seen": 276408990, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10192871, "step": 12814, "time_per_iteration": 2.56476092338562 }, { "auxiliary_loss_clip": 0.06419022, "auxiliary_loss_mlp": 0.01265754, "balance_loss_clip": 0.06278442, "balance_loss_mlp": 0.01256551, "epoch": 0.7704794829400271, "flos": 23665031481600.0, "grad_norm": 1.8972361785764666, "language_loss": 0.65676045, "learning_rate": 5.275044598581018e-07, "loss": 0.73360819, "num_input_tokens_seen": 276428190, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09210205, "step": 12815, "time_per_iteration": 2.578017473220825 }, { "auxiliary_loss_clip": 0.06416976, "auxiliary_loss_mlp": 0.01264479, "balance_loss_clip": 0.06276022, "balance_loss_mlp": 0.01254316, "epoch": 0.770539606192695, "flos": 18995283112320.0, "grad_norm": 2.2059372606119854, "language_loss": 0.65182585, "learning_rate": 5.272409343590322e-07, "loss": 0.72864038, "num_input_tokens_seen": 276446855, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.10162354, "step": 12816, "time_per_iteration": 3.9828600883483887 }, { "auxiliary_loss_clip": 0.06423847, "auxiliary_loss_mlp": 0.01268247, "balance_loss_clip": 0.06280771, "balance_loss_mlp": 0.01258484, "epoch": 0.770599729445363, "flos": 11833605843840.0, "grad_norm": 2.06997467530103, "language_loss": 0.72211736, "learning_rate": 5.26977464707133e-07, "loss": 0.79903829, "num_input_tokens_seen": 276462000, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.09771729, "step": 12817, "time_per_iteration": 2.573080062866211 }, { "auxiliary_loss_clip": 0.06419596, "auxiliary_loss_mlp": 0.0126513, "balance_loss_clip": 0.06278944, "balance_loss_mlp": 0.01255593, "epoch": 0.770659852698031, "flos": 17828527334400.0, "grad_norm": 1.7026940881083519, "language_loss": 0.61767471, "learning_rate": 5.267140509123957e-07, "loss": 0.69452196, "num_input_tokens_seen": 276481190, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09539795, "step": 12818, "time_per_iteration": 2.5403473377227783 }, { "auxiliary_loss_clip": 0.06413285, "auxiliary_loss_mlp": 0.01261243, "balance_loss_clip": 0.06275123, "balance_loss_mlp": 0.01252803, "epoch": 0.770719975950699, "flos": 21878469452160.0, "grad_norm": 1.6557638950232212, "language_loss": 0.67247301, "learning_rate": 5.264506929848093e-07, "loss": 0.74921829, "num_input_tokens_seen": 276499520, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.08435059, "step": 12819, "time_per_iteration": 2.6107351779937744 }, { "auxiliary_loss_clip": 0.06419364, "auxiliary_loss_mlp": 0.01265101, "balance_loss_clip": 0.0627604, "balance_loss_mlp": 0.01255063, "epoch": 0.7707800992033669, "flos": 21331519925760.0, "grad_norm": 1.678096621537256, "language_loss": 0.5765506, "learning_rate": 5.261873909343608e-07, "loss": 0.65339524, "num_input_tokens_seen": 276519110, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.10040283, "step": 12820, "time_per_iteration": 2.58206844329834 }, { "auxiliary_loss_clip": 0.06418591, "auxiliary_loss_mlp": 0.01262921, "balance_loss_clip": 0.0627614, "balance_loss_mlp": 0.01253134, "epoch": 0.7708402224560349, "flos": 28186215361920.0, "grad_norm": 1.5763571251902866, "language_loss": 0.81117719, "learning_rate": 5.259241447710343e-07, "loss": 0.88799238, "num_input_tokens_seen": 276538805, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09790039, "step": 12821, "time_per_iteration": 2.621065139770508 }, { "auxiliary_loss_clip": 0.06420203, "auxiliary_loss_mlp": 0.01265206, "balance_loss_clip": 0.06278669, "balance_loss_mlp": 0.01255329, "epoch": 0.7709003457087028, "flos": 15382397491200.0, "grad_norm": 1.9624355980104207, "language_loss": 0.69038904, "learning_rate": 5.256609545048114e-07, "loss": 0.76724315, "num_input_tokens_seen": 276554770, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09881592, "step": 12822, "time_per_iteration": 2.51367449760437 }, { "auxiliary_loss_clip": 0.0641213, "auxiliary_loss_mlp": 0.01264952, "balance_loss_clip": 0.06273951, "balance_loss_mlp": 0.01255481, "epoch": 0.7709604689613708, "flos": 30628697552640.0, "grad_norm": 1.6221298888281315, "language_loss": 0.72160709, "learning_rate": 5.253978201456733e-07, "loss": 0.79837787, "num_input_tokens_seen": 276574535, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09472656, "step": 12823, "time_per_iteration": 2.622380256652832 }, { "auxiliary_loss_clip": 0.06423587, "auxiliary_loss_mlp": 0.0126513, "balance_loss_clip": 0.06277269, "balance_loss_mlp": 0.01254514, "epoch": 0.7710205922140387, "flos": 20307207288960.0, "grad_norm": 3.525268921293007, "language_loss": 0.76956499, "learning_rate": 5.251347417035969e-07, "loss": 0.84645212, "num_input_tokens_seen": 276592925, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.1060791, "step": 12824, "time_per_iteration": 2.5409302711486816 }, { "auxiliary_loss_clip": 0.06418409, "auxiliary_loss_mlp": 0.01265183, "balance_loss_clip": 0.06277367, "balance_loss_mlp": 0.01255944, "epoch": 0.7710807154667068, "flos": 19649987389440.0, "grad_norm": 1.9394168207640061, "language_loss": 0.72699034, "learning_rate": 5.248717191885592e-07, "loss": 0.80382627, "num_input_tokens_seen": 276610540, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09240723, "step": 12825, "time_per_iteration": 2.533080577850342 }, { "auxiliary_loss_clip": 0.0641192, "auxiliary_loss_mlp": 0.01266415, "balance_loss_clip": 0.06276873, "balance_loss_mlp": 0.01257427, "epoch": 0.7711408387193747, "flos": 20011713465600.0, "grad_norm": 1.3309699130608603, "language_loss": 0.73879981, "learning_rate": 5.246087526105343e-07, "loss": 0.81558311, "num_input_tokens_seen": 276629200, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.08990479, "step": 12826, "time_per_iteration": 3.9753551483154297 }, { "auxiliary_loss_clip": 0.06418037, "auxiliary_loss_mlp": 0.01264965, "balance_loss_clip": 0.0627538, "balance_loss_mlp": 0.01254505, "epoch": 0.7712009619720427, "flos": 24977794199040.0, "grad_norm": 1.6230684729218603, "language_loss": 0.81536502, "learning_rate": 5.243458419794933e-07, "loss": 0.89219511, "num_input_tokens_seen": 276648655, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10455322, "step": 12827, "time_per_iteration": 2.569596529006958 }, { "auxiliary_loss_clip": 0.06320922, "auxiliary_loss_mlp": 0.01250209, "balance_loss_clip": 0.06264329, "balance_loss_mlp": 0.0124909, "epoch": 0.7712610852247107, "flos": 63269682105600.0, "grad_norm": 0.8385300981842947, "language_loss": 0.5523001, "learning_rate": 5.240829873054051e-07, "loss": 0.62801147, "num_input_tokens_seen": 276716500, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.01121521, "step": 12828, "time_per_iteration": 3.311169147491455 }, { "auxiliary_loss_clip": 0.06408533, "auxiliary_loss_mlp": 0.0126474, "balance_loss_clip": 0.06272049, "balance_loss_mlp": 0.01255757, "epoch": 0.7713212084773786, "flos": 18703856211840.0, "grad_norm": 1.6797452275601463, "language_loss": 0.69844997, "learning_rate": 5.23820188598238e-07, "loss": 0.77518272, "num_input_tokens_seen": 276733535, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.08978271, "step": 12829, "time_per_iteration": 2.517570734024048 }, { "auxiliary_loss_clip": 0.06423356, "auxiliary_loss_mlp": 0.01264413, "balance_loss_clip": 0.0627775, "balance_loss_mlp": 0.01254114, "epoch": 0.7713813317300466, "flos": 14178563481600.0, "grad_norm": 3.508799722354771, "language_loss": 0.79873133, "learning_rate": 5.235574458679579e-07, "loss": 0.87560892, "num_input_tokens_seen": 276749575, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10302734, "step": 12830, "time_per_iteration": 2.5034167766571045 }, { "auxiliary_loss_clip": 0.06421555, "auxiliary_loss_mlp": 0.01264942, "balance_loss_clip": 0.06277082, "balance_loss_mlp": 0.01253778, "epoch": 0.7714414549827145, "flos": 25711266913920.0, "grad_norm": 1.554896278616418, "language_loss": 0.7815212, "learning_rate": 5.232947591245269e-07, "loss": 0.85838622, "num_input_tokens_seen": 276769460, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.11169434, "step": 12831, "time_per_iteration": 2.5743765830993652 }, { "auxiliary_loss_clip": 0.06416401, "auxiliary_loss_mlp": 0.01264999, "balance_loss_clip": 0.06275982, "balance_loss_mlp": 0.01255545, "epoch": 0.7715015782353826, "flos": 30563219986560.0, "grad_norm": 1.7068527139709508, "language_loss": 0.61182308, "learning_rate": 5.230321283779071e-07, "loss": 0.68863708, "num_input_tokens_seen": 276790820, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09454346, "step": 12832, "time_per_iteration": 2.618415594100952 }, { "auxiliary_loss_clip": 0.06423672, "auxiliary_loss_mlp": 0.01268262, "balance_loss_clip": 0.06279127, "balance_loss_mlp": 0.01258153, "epoch": 0.7715617014880505, "flos": 20235440666880.0, "grad_norm": 1.9208128328099434, "language_loss": 0.79074502, "learning_rate": 5.227695536380572e-07, "loss": 0.86766434, "num_input_tokens_seen": 276811345, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.10113525, "step": 12833, "time_per_iteration": 4.001746654510498 }, { "auxiliary_loss_clip": 0.06322362, "auxiliary_loss_mlp": 0.01253041, "balance_loss_clip": 0.06265968, "balance_loss_mlp": 0.01251872, "epoch": 0.7716218247407185, "flos": 63681037326720.0, "grad_norm": 0.8585593853728065, "language_loss": 0.55121976, "learning_rate": 5.22507034914933e-07, "loss": 0.62697375, "num_input_tokens_seen": 276870950, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.0116806, "step": 12834, "time_per_iteration": 3.150380849838257 }, { "auxiliary_loss_clip": 0.06422976, "auxiliary_loss_mlp": 0.01264231, "balance_loss_clip": 0.06279542, "balance_loss_mlp": 0.01254003, "epoch": 0.7716819479933864, "flos": 19797881045760.0, "grad_norm": 1.990419983871301, "language_loss": 0.73115528, "learning_rate": 5.222445722184903e-07, "loss": 0.80802733, "num_input_tokens_seen": 276890760, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.10229492, "step": 12835, "time_per_iteration": 2.5411336421966553 }, { "auxiliary_loss_clip": 0.06421153, "auxiliary_loss_mlp": 0.01266843, "balance_loss_clip": 0.06278542, "balance_loss_mlp": 0.01257348, "epoch": 0.7717420712460544, "flos": 18448082023680.0, "grad_norm": 1.8831668560826034, "language_loss": 0.70225346, "learning_rate": 5.219821655586814e-07, "loss": 0.77913344, "num_input_tokens_seen": 276909625, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09503174, "step": 12836, "time_per_iteration": 2.5179216861724854 }, { "auxiliary_loss_clip": 0.06412403, "auxiliary_loss_mlp": 0.01268886, "balance_loss_clip": 0.06275094, "balance_loss_mlp": 0.0125951, "epoch": 0.7718021944987223, "flos": 35198238038400.0, "grad_norm": 1.6450687406180482, "language_loss": 0.59973633, "learning_rate": 5.217198149454575e-07, "loss": 0.67654926, "num_input_tokens_seen": 276930760, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09375, "step": 12837, "time_per_iteration": 2.648601531982422 }, { "auxiliary_loss_clip": 0.06321535, "auxiliary_loss_mlp": 0.01252387, "balance_loss_clip": 0.06264947, "balance_loss_mlp": 0.01251101, "epoch": 0.7718623177513904, "flos": 67944503646720.0, "grad_norm": 0.847285301282435, "language_loss": 0.55682498, "learning_rate": 5.214575203887666e-07, "loss": 0.63256419, "num_input_tokens_seen": 276989580, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.01287079, "step": 12838, "time_per_iteration": 3.149155378341675 }, { "auxiliary_loss_clip": 0.06417932, "auxiliary_loss_mlp": 0.01265103, "balance_loss_clip": 0.06278822, "balance_loss_mlp": 0.01255906, "epoch": 0.7719224410040583, "flos": 18586206679680.0, "grad_norm": 2.8570813747464285, "language_loss": 0.69786417, "learning_rate": 5.211952818985538e-07, "loss": 0.7746945, "num_input_tokens_seen": 277005450, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09197998, "step": 12839, "time_per_iteration": 2.5047693252563477 }, { "auxiliary_loss_clip": 0.06414045, "auxiliary_loss_mlp": 0.01265856, "balance_loss_clip": 0.06275059, "balance_loss_mlp": 0.0125657, "epoch": 0.7719825642567263, "flos": 23082471169920.0, "grad_norm": 1.7629043288389903, "language_loss": 0.80112708, "learning_rate": 5.209330994847647e-07, "loss": 0.87792611, "num_input_tokens_seen": 277023055, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09283447, "step": 12840, "time_per_iteration": 2.5714850425720215 }, { "auxiliary_loss_clip": 0.06418402, "auxiliary_loss_mlp": 0.01264684, "balance_loss_clip": 0.0627581, "balance_loss_mlp": 0.01254742, "epoch": 0.7720426875093943, "flos": 20345249842560.0, "grad_norm": 1.9259961431749573, "language_loss": 0.80266714, "learning_rate": 5.206709731573402e-07, "loss": 0.879498, "num_input_tokens_seen": 277041150, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.09942627, "step": 12841, "time_per_iteration": 2.538346290588379 }, { "auxiliary_loss_clip": 0.06418931, "auxiliary_loss_mlp": 0.01263468, "balance_loss_clip": 0.06277042, "balance_loss_mlp": 0.01254295, "epoch": 0.7721028107620622, "flos": 23887878215040.0, "grad_norm": 1.5644146382626398, "language_loss": 0.76291311, "learning_rate": 5.204089029262208e-07, "loss": 0.83973712, "num_input_tokens_seen": 277063895, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.0916748, "step": 12842, "time_per_iteration": 2.5884461402893066 }, { "auxiliary_loss_clip": 0.06424196, "auxiliary_loss_mlp": 0.01266956, "balance_loss_clip": 0.062801, "balance_loss_mlp": 0.01257413, "epoch": 0.7721629340147302, "flos": 26658865537920.0, "grad_norm": 1.5165843691162741, "language_loss": 0.69109559, "learning_rate": 5.201468888013445e-07, "loss": 0.7680071, "num_input_tokens_seen": 277084045, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.09552002, "step": 12843, "time_per_iteration": 2.5653560161590576 }, { "auxiliary_loss_clip": 0.06422985, "auxiliary_loss_mlp": 0.01264834, "balance_loss_clip": 0.0627754, "balance_loss_mlp": 0.01255321, "epoch": 0.7722230572673981, "flos": 21185261424000.0, "grad_norm": 2.1594829027543536, "language_loss": 0.74295121, "learning_rate": 5.198849307926465e-07, "loss": 0.81982934, "num_input_tokens_seen": 277102625, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.09509277, "step": 12844, "time_per_iteration": 2.5441603660583496 }, { "auxiliary_loss_clip": 0.06413579, "auxiliary_loss_mlp": 0.0126209, "balance_loss_clip": 0.06274329, "balance_loss_mlp": 0.01252947, "epoch": 0.7722831805200662, "flos": 27972089452800.0, "grad_norm": 1.402697111846333, "language_loss": 0.7170471, "learning_rate": 5.196230289100596e-07, "loss": 0.79380381, "num_input_tokens_seen": 277123210, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09143066, "step": 12845, "time_per_iteration": 2.593825340270996 }, { "auxiliary_loss_clip": 0.06414297, "auxiliary_loss_mlp": 0.01268328, "balance_loss_clip": 0.06276147, "balance_loss_mlp": 0.01259555, "epoch": 0.7723433037727341, "flos": 33884049801600.0, "grad_norm": 1.946192698110705, "language_loss": 0.64981174, "learning_rate": 5.193611831635159e-07, "loss": 0.72663802, "num_input_tokens_seen": 277144895, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.08770752, "step": 12846, "time_per_iteration": 2.661280632019043 }, { "auxiliary_loss_clip": 0.0631955, "auxiliary_loss_mlp": 0.01253982, "balance_loss_clip": 0.06263202, "balance_loss_mlp": 0.01252797, "epoch": 0.7724034270254021, "flos": 62868194467200.0, "grad_norm": 0.7126374815798193, "language_loss": 0.6144588, "learning_rate": 5.19099393562945e-07, "loss": 0.69019413, "num_input_tokens_seen": 277205160, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01182556, "step": 12847, "time_per_iteration": 3.0984795093536377 }, { "auxiliary_loss_clip": 0.06415987, "auxiliary_loss_mlp": 0.01266141, "balance_loss_clip": 0.06273912, "balance_loss_mlp": 0.01256557, "epoch": 0.77246355027807, "flos": 23302299156480.0, "grad_norm": 1.7557021478704953, "language_loss": 0.79049975, "learning_rate": 5.188376601182732e-07, "loss": 0.86732101, "num_input_tokens_seen": 277223005, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09588623, "step": 12848, "time_per_iteration": 2.5578768253326416 }, { "auxiliary_loss_clip": 0.06417223, "auxiliary_loss_mlp": 0.01267171, "balance_loss_clip": 0.06272767, "balance_loss_mlp": 0.01257396, "epoch": 0.772523673530738, "flos": 20127602062080.0, "grad_norm": 1.9435908563728834, "language_loss": 0.72898209, "learning_rate": 5.185759828394261e-07, "loss": 0.80582607, "num_input_tokens_seen": 277241785, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.09783936, "step": 12849, "time_per_iteration": 2.518901824951172 }, { "auxiliary_loss_clip": 0.06417111, "auxiliary_loss_mlp": 0.0126644, "balance_loss_clip": 0.06275753, "balance_loss_mlp": 0.01256975, "epoch": 0.7725837967834059, "flos": 17825592441600.0, "grad_norm": 2.328116275959929, "language_loss": 0.78557122, "learning_rate": 5.183143617363261e-07, "loss": 0.86240673, "num_input_tokens_seen": 277259050, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09472656, "step": 12850, "time_per_iteration": 2.5208539962768555 }, { "auxiliary_loss_clip": 0.06419103, "auxiliary_loss_mlp": 0.01266291, "balance_loss_clip": 0.06274326, "balance_loss_mlp": 0.01256177, "epoch": 0.772643920036074, "flos": 27206318188800.0, "grad_norm": 1.4649799200623344, "language_loss": 0.79837549, "learning_rate": 5.180527968188935e-07, "loss": 0.87522942, "num_input_tokens_seen": 277278235, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.10125732, "step": 12851, "time_per_iteration": 3.9650847911834717 }, { "auxiliary_loss_clip": 0.06415578, "auxiliary_loss_mlp": 0.01265603, "balance_loss_clip": 0.06275875, "balance_loss_mlp": 0.01255696, "epoch": 0.7727040432887419, "flos": 21585868594560.0, "grad_norm": 1.4834934515986549, "language_loss": 0.73957342, "learning_rate": 5.177912880970474e-07, "loss": 0.81638521, "num_input_tokens_seen": 277298355, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09906006, "step": 12852, "time_per_iteration": 2.5602283477783203 }, { "auxiliary_loss_clip": 0.06412308, "auxiliary_loss_mlp": 0.01264964, "balance_loss_clip": 0.062738, "balance_loss_mlp": 0.01255969, "epoch": 0.7727641665414099, "flos": 22243172348160.0, "grad_norm": 2.7444326065134823, "language_loss": 0.82741439, "learning_rate": 5.17529835580704e-07, "loss": 0.90418708, "num_input_tokens_seen": 277316095, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.08990479, "step": 12853, "time_per_iteration": 2.5465402603149414 }, { "auxiliary_loss_clip": 0.06317827, "auxiliary_loss_mlp": 0.01251213, "balance_loss_clip": 0.06261243, "balance_loss_mlp": 0.01249916, "epoch": 0.7728242897940779, "flos": 54852613038720.0, "grad_norm": 0.7888352882640784, "language_loss": 0.54192257, "learning_rate": 5.172684392797786e-07, "loss": 0.61761296, "num_input_tokens_seen": 277380130, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.0129776, "step": 12854, "time_per_iteration": 3.273829698562622 }, { "auxiliary_loss_clip": 0.06423797, "auxiliary_loss_mlp": 0.01265812, "balance_loss_clip": 0.06279753, "balance_loss_mlp": 0.01255602, "epoch": 0.7728844130467458, "flos": 34470970525440.0, "grad_norm": 1.5447754864702499, "language_loss": 0.72418714, "learning_rate": 5.170070992041826e-07, "loss": 0.80108321, "num_input_tokens_seen": 277404015, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10217285, "step": 12855, "time_per_iteration": 2.670032501220703 }, { "auxiliary_loss_clip": 0.06419552, "auxiliary_loss_mlp": 0.0126925, "balance_loss_clip": 0.06279369, "balance_loss_mlp": 0.01259421, "epoch": 0.7729445362994138, "flos": 18922300606080.0, "grad_norm": 1.6892734445170106, "language_loss": 0.68429941, "learning_rate": 5.167458153638254e-07, "loss": 0.76118743, "num_input_tokens_seen": 277421375, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.0982666, "step": 12856, "time_per_iteration": 3.9742043018341064 }, { "auxiliary_loss_clip": 0.06414592, "auxiliary_loss_mlp": 0.01266673, "balance_loss_clip": 0.06274244, "balance_loss_mlp": 0.01257095, "epoch": 0.7730046595520818, "flos": 22206555313920.0, "grad_norm": 1.7625666300126557, "language_loss": 0.79275906, "learning_rate": 5.164845877686162e-07, "loss": 0.86957169, "num_input_tokens_seen": 277440170, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.0958252, "step": 12857, "time_per_iteration": 2.55112624168396 }, { "auxiliary_loss_clip": 0.06415831, "auxiliary_loss_mlp": 0.01266202, "balance_loss_clip": 0.06278011, "balance_loss_mlp": 0.01256868, "epoch": 0.7730647828047498, "flos": 13557289783680.0, "grad_norm": 1.9379021698607029, "language_loss": 0.78398275, "learning_rate": 5.162234164284591e-07, "loss": 0.86080301, "num_input_tokens_seen": 277456880, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09332275, "step": 12858, "time_per_iteration": 2.498863458633423 }, { "auxiliary_loss_clip": 0.06421599, "auxiliary_loss_mlp": 0.01266763, "balance_loss_clip": 0.062785, "balance_loss_mlp": 0.01256917, "epoch": 0.7731249060574177, "flos": 21981654155520.0, "grad_norm": 2.6396193684574323, "language_loss": 0.77488983, "learning_rate": 5.159623013532591e-07, "loss": 0.85177344, "num_input_tokens_seen": 277475365, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.09851074, "step": 12859, "time_per_iteration": 2.5405569076538086 }, { "auxiliary_loss_clip": 0.0641885, "auxiliary_loss_mlp": 0.01264456, "balance_loss_clip": 0.06282346, "balance_loss_mlp": 0.01255694, "epoch": 0.7731850293100857, "flos": 22608462222720.0, "grad_norm": 1.4240881895533695, "language_loss": 0.68259668, "learning_rate": 5.157012425529186e-07, "loss": 0.75942981, "num_input_tokens_seen": 277494975, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.08764648, "step": 12860, "time_per_iteration": 2.543994665145874 }, { "auxiliary_loss_clip": 0.0642121, "auxiliary_loss_mlp": 0.01267986, "balance_loss_clip": 0.06276256, "balance_loss_mlp": 0.01257746, "epoch": 0.7732451525627536, "flos": 14103274988160.0, "grad_norm": 4.06843930915317, "language_loss": 0.74362826, "learning_rate": 5.154402400373343e-07, "loss": 0.82052028, "num_input_tokens_seen": 277510520, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.10241699, "step": 12861, "time_per_iteration": 2.5023083686828613 }, { "auxiliary_loss_clip": 0.06423457, "auxiliary_loss_mlp": 0.01263887, "balance_loss_clip": 0.06278451, "balance_loss_mlp": 0.01253527, "epoch": 0.7733052758154216, "flos": 21476352908160.0, "grad_norm": 1.7870691002720198, "language_loss": 0.75095904, "learning_rate": 5.15179293816405e-07, "loss": 0.82783246, "num_input_tokens_seen": 277530505, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.1036377, "step": 12862, "time_per_iteration": 2.585352659225464 }, { "auxiliary_loss_clip": 0.06415419, "auxiliary_loss_mlp": 0.01267573, "balance_loss_clip": 0.06278104, "balance_loss_mlp": 0.01258495, "epoch": 0.7733653990680895, "flos": 21400142019840.0, "grad_norm": 1.4507382010793017, "language_loss": 0.83243716, "learning_rate": 5.149184039000256e-07, "loss": 0.90926707, "num_input_tokens_seen": 277550810, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09082031, "step": 12863, "time_per_iteration": 2.547102451324463 }, { "auxiliary_loss_clip": 0.06415729, "auxiliary_loss_mlp": 0.01269404, "balance_loss_clip": 0.06276942, "balance_loss_mlp": 0.01259677, "epoch": 0.7734255223207576, "flos": 17681849562240.0, "grad_norm": 1.547114658579074, "language_loss": 0.73405522, "learning_rate": 5.146575702980898e-07, "loss": 0.81090659, "num_input_tokens_seen": 277567680, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.097229, "step": 12864, "time_per_iteration": 2.523983955383301 }, { "auxiliary_loss_clip": 0.06416157, "auxiliary_loss_mlp": 0.01263371, "balance_loss_clip": 0.06276102, "balance_loss_mlp": 0.01254377, "epoch": 0.7734856455734255, "flos": 25238264215680.0, "grad_norm": 1.6023430977039244, "language_loss": 0.82078075, "learning_rate": 5.143967930204871e-07, "loss": 0.89757597, "num_input_tokens_seen": 277588970, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.08996582, "step": 12865, "time_per_iteration": 2.5824832916259766 }, { "auxiliary_loss_clip": 0.06423934, "auxiliary_loss_mlp": 0.01263423, "balance_loss_clip": 0.06278454, "balance_loss_mlp": 0.01253212, "epoch": 0.7735457688260935, "flos": 23438579022720.0, "grad_norm": 2.161704941741027, "language_loss": 0.72288537, "learning_rate": 5.141360720771077e-07, "loss": 0.79975891, "num_input_tokens_seen": 277605450, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10211182, "step": 12866, "time_per_iteration": 4.012588024139404 }, { "auxiliary_loss_clip": 0.06421738, "auxiliary_loss_mlp": 0.0126863, "balance_loss_clip": 0.06278876, "balance_loss_mlp": 0.01258026, "epoch": 0.7736058920787615, "flos": 18734393825280.0, "grad_norm": 2.113582106134426, "language_loss": 0.65087968, "learning_rate": 5.138754074778371e-07, "loss": 0.72778332, "num_input_tokens_seen": 277622530, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10595703, "step": 12867, "time_per_iteration": 2.494431257247925 }, { "auxiliary_loss_clip": 0.06412955, "auxiliary_loss_mlp": 0.01263912, "balance_loss_clip": 0.06274616, "balance_loss_mlp": 0.0125471, "epoch": 0.7736660153314294, "flos": 22899931050240.0, "grad_norm": 1.5940028476165242, "language_loss": 0.71065104, "learning_rate": 5.136147992325595e-07, "loss": 0.78741974, "num_input_tokens_seen": 277642700, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09204102, "step": 12868, "time_per_iteration": 2.559030771255493 }, { "auxiliary_loss_clip": 0.06421334, "auxiliary_loss_mlp": 0.01264231, "balance_loss_clip": 0.0627794, "balance_loss_mlp": 0.01254426, "epoch": 0.7737261385840974, "flos": 13804762417920.0, "grad_norm": 3.481312363687961, "language_loss": 0.78034276, "learning_rate": 5.133542473511578e-07, "loss": 0.85719836, "num_input_tokens_seen": 277660005, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.0980835, "step": 12869, "time_per_iteration": 2.516214609146118 }, { "auxiliary_loss_clip": 0.06409948, "auxiliary_loss_mlp": 0.01264595, "balance_loss_clip": 0.06273976, "balance_loss_mlp": 0.0125504, "epoch": 0.7737862618367654, "flos": 28738279987200.0, "grad_norm": 1.5216311888929397, "language_loss": 0.73557079, "learning_rate": 5.130937518435124e-07, "loss": 0.81231618, "num_input_tokens_seen": 277682890, "router_z_loss_clip": 1.35742188, "router_z_loss_mlp": 0.09552002, "step": 12870, "time_per_iteration": 2.5999715328216553 }, { "auxiliary_loss_clip": 0.06420803, "auxiliary_loss_mlp": 0.01265983, "balance_loss_clip": 0.06278621, "balance_loss_mlp": 0.01256148, "epoch": 0.7738463850894334, "flos": 17024126538240.0, "grad_norm": 1.9301474305582298, "language_loss": 0.75627792, "learning_rate": 5.12833312719501e-07, "loss": 0.83314574, "num_input_tokens_seen": 277699330, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09838867, "step": 12871, "time_per_iteration": 2.5242319107055664 }, { "auxiliary_loss_clip": 0.0641661, "auxiliary_loss_mlp": 0.01263205, "balance_loss_clip": 0.06277474, "balance_loss_mlp": 0.01253763, "epoch": 0.7739065083421013, "flos": 20710246227840.0, "grad_norm": 1.7475840012839574, "language_loss": 0.69610822, "learning_rate": 5.12572929988999e-07, "loss": 0.77290636, "num_input_tokens_seen": 277718750, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09429932, "step": 12872, "time_per_iteration": 2.578115701675415 }, { "auxiliary_loss_clip": 0.06419522, "auxiliary_loss_mlp": 0.01264374, "balance_loss_clip": 0.06276868, "balance_loss_mlp": 0.01253747, "epoch": 0.7739666315947693, "flos": 20702322017280.0, "grad_norm": 2.5024272125648963, "language_loss": 0.85741562, "learning_rate": 5.123126036618804e-07, "loss": 0.93425465, "num_input_tokens_seen": 277734645, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.10626221, "step": 12873, "time_per_iteration": 3.8832039833068848 }, { "auxiliary_loss_clip": 0.06418642, "auxiliary_loss_mlp": 0.01270519, "balance_loss_clip": 0.06277719, "balance_loss_mlp": 0.01260482, "epoch": 0.7740267548474372, "flos": 29578501203840.0, "grad_norm": 3.290929361345544, "language_loss": 0.66152906, "learning_rate": 5.120523337480174e-07, "loss": 0.73842072, "num_input_tokens_seen": 277755535, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.10046387, "step": 12874, "time_per_iteration": 2.587949514389038 }, { "auxiliary_loss_clip": 0.06416997, "auxiliary_loss_mlp": 0.01265889, "balance_loss_clip": 0.06278138, "balance_loss_mlp": 0.01256072, "epoch": 0.7740868781001052, "flos": 23665786168320.0, "grad_norm": 1.6045373011750834, "language_loss": 0.62732935, "learning_rate": 5.117921202572785e-07, "loss": 0.70415819, "num_input_tokens_seen": 277775585, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09814453, "step": 12875, "time_per_iteration": 2.554626226425171 }, { "auxiliary_loss_clip": 0.06420463, "auxiliary_loss_mlp": 0.01264679, "balance_loss_clip": 0.06277452, "balance_loss_mlp": 0.01254654, "epoch": 0.7741470013527731, "flos": 24724200216960.0, "grad_norm": 1.8133875271089561, "language_loss": 0.65858132, "learning_rate": 5.115319631995318e-07, "loss": 0.73543274, "num_input_tokens_seen": 277794795, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10021973, "step": 12876, "time_per_iteration": 2.55326771736145 }, { "auxiliary_loss_clip": 0.06413545, "auxiliary_loss_mlp": 0.01262612, "balance_loss_clip": 0.062758, "balance_loss_mlp": 0.01253641, "epoch": 0.7742071246054412, "flos": 21878092108800.0, "grad_norm": 1.854703722093291, "language_loss": 0.71331775, "learning_rate": 5.112718625846433e-07, "loss": 0.79007936, "num_input_tokens_seen": 277813235, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.08978271, "step": 12877, "time_per_iteration": 2.518174648284912 }, { "auxiliary_loss_clip": 0.06424806, "auxiliary_loss_mlp": 0.01265724, "balance_loss_clip": 0.06278589, "balance_loss_mlp": 0.01255591, "epoch": 0.7742672478581091, "flos": 22680815823360.0, "grad_norm": 1.6817196543451136, "language_loss": 0.83273029, "learning_rate": 5.110118184224736e-07, "loss": 0.9096356, "num_input_tokens_seen": 277832560, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.10125732, "step": 12878, "time_per_iteration": 2.5420548915863037 }, { "auxiliary_loss_clip": 0.06418552, "auxiliary_loss_mlp": 0.01267964, "balance_loss_clip": 0.06275888, "balance_loss_mlp": 0.01257652, "epoch": 0.7743273711107771, "flos": 18846425134080.0, "grad_norm": 1.9143304521209363, "language_loss": 0.73864287, "learning_rate": 5.10751830722885e-07, "loss": 0.81550801, "num_input_tokens_seen": 277850120, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10314941, "step": 12879, "time_per_iteration": 2.5125319957733154 }, { "auxiliary_loss_clip": 0.06412041, "auxiliary_loss_mlp": 0.01266545, "balance_loss_clip": 0.06275794, "balance_loss_mlp": 0.01257121, "epoch": 0.7743874943634451, "flos": 28736644832640.0, "grad_norm": 2.620404590901647, "language_loss": 0.79781759, "learning_rate": 5.104918994957364e-07, "loss": 0.87460345, "num_input_tokens_seen": 277871020, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.09429932, "step": 12880, "time_per_iteration": 2.5865230560302734 }, { "auxiliary_loss_clip": 0.06413479, "auxiliary_loss_mlp": 0.0126743, "balance_loss_clip": 0.06274666, "balance_loss_mlp": 0.01257369, "epoch": 0.774447617616113, "flos": 21916344297600.0, "grad_norm": 1.5177451215725029, "language_loss": 0.70462584, "learning_rate": 5.102320247508847e-07, "loss": 0.78143501, "num_input_tokens_seen": 277891525, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.10064697, "step": 12881, "time_per_iteration": 2.5586211681365967 }, { "auxiliary_loss_clip": 0.06421669, "auxiliary_loss_mlp": 0.01266806, "balance_loss_clip": 0.06275503, "balance_loss_mlp": 0.01255184, "epoch": 0.774507740868781, "flos": 19506789561600.0, "grad_norm": 2.134576699314677, "language_loss": 0.85137159, "learning_rate": 5.099722064981832e-07, "loss": 0.92825639, "num_input_tokens_seen": 277910425, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.11627197, "step": 12882, "time_per_iteration": 2.5099191665649414 }, { "auxiliary_loss_clip": 0.06321892, "auxiliary_loss_mlp": 0.01255433, "balance_loss_clip": 0.06265451, "balance_loss_mlp": 0.01254169, "epoch": 0.774567864121449, "flos": 59447240622720.0, "grad_norm": 1.0346272890050274, "language_loss": 0.60050845, "learning_rate": 5.097124447474858e-07, "loss": 0.67628169, "num_input_tokens_seen": 277972795, "router_z_loss_clip": 0.56396484, "router_z_loss_mlp": 0.01264191, "step": 12883, "time_per_iteration": 3.0975611209869385 }, { "auxiliary_loss_clip": 0.06420091, "auxiliary_loss_mlp": 0.01265304, "balance_loss_clip": 0.06277961, "balance_loss_mlp": 0.01254725, "epoch": 0.774627987374117, "flos": 13230461733120.0, "grad_norm": 1.7007979627713472, "language_loss": 0.73458755, "learning_rate": 5.094527395086416e-07, "loss": 0.81144148, "num_input_tokens_seen": 277990675, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10571289, "step": 12884, "time_per_iteration": 2.549825429916382 }, { "auxiliary_loss_clip": 0.06413858, "auxiliary_loss_mlp": 0.01268606, "balance_loss_clip": 0.06276108, "balance_loss_mlp": 0.01259553, "epoch": 0.7746881106267849, "flos": 21399848530560.0, "grad_norm": 1.547067411205729, "language_loss": 0.81623018, "learning_rate": 5.091930907914986e-07, "loss": 0.89305484, "num_input_tokens_seen": 278010050, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09051514, "step": 12885, "time_per_iteration": 2.550427198410034 }, { "auxiliary_loss_clip": 0.06415209, "auxiliary_loss_mlp": 0.01263192, "balance_loss_clip": 0.0627811, "balance_loss_mlp": 0.01253893, "epoch": 0.7747482338794529, "flos": 25636355763840.0, "grad_norm": 1.6820987817019686, "language_loss": 0.64089066, "learning_rate": 5.089334986059029e-07, "loss": 0.71767467, "num_input_tokens_seen": 278030660, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09295654, "step": 12886, "time_per_iteration": 2.5891873836517334 }, { "auxiliary_loss_clip": 0.06421649, "auxiliary_loss_mlp": 0.01264959, "balance_loss_clip": 0.06279211, "balance_loss_mlp": 0.01256393, "epoch": 0.7748083571321208, "flos": 11551780235520.0, "grad_norm": 1.9957586731795733, "language_loss": 0.69324291, "learning_rate": 5.086739629616987e-07, "loss": 0.770109, "num_input_tokens_seen": 278047645, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.08569336, "step": 12887, "time_per_iteration": 2.5220561027526855 }, { "auxiliary_loss_clip": 0.06411242, "auxiliary_loss_mlp": 0.01265408, "balance_loss_clip": 0.0627366, "balance_loss_mlp": 0.01256199, "epoch": 0.7748684803847888, "flos": 19068433326720.0, "grad_norm": 1.8408706862835533, "language_loss": 0.70980281, "learning_rate": 5.084144838687275e-07, "loss": 0.78656936, "num_input_tokens_seen": 278066170, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09216309, "step": 12888, "time_per_iteration": 2.5265278816223145 }, { "auxiliary_loss_clip": 0.06422335, "auxiliary_loss_mlp": 0.0126807, "balance_loss_clip": 0.06278381, "balance_loss_mlp": 0.01257824, "epoch": 0.7749286036374567, "flos": 22279705528320.0, "grad_norm": 1.7724878116152931, "language_loss": 0.82203478, "learning_rate": 5.081550613368279e-07, "loss": 0.89893883, "num_input_tokens_seen": 278085545, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10241699, "step": 12889, "time_per_iteration": 2.5334742069244385 }, { "auxiliary_loss_clip": 0.0641769, "auxiliary_loss_mlp": 0.01267023, "balance_loss_clip": 0.06278048, "balance_loss_mlp": 0.01257558, "epoch": 0.7749887268901248, "flos": 20198488216320.0, "grad_norm": 1.997704045358383, "language_loss": 0.79595804, "learning_rate": 5.07895695375838e-07, "loss": 0.87280512, "num_input_tokens_seen": 278102995, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09460449, "step": 12890, "time_per_iteration": 2.5112528800964355 }, { "auxiliary_loss_clip": 0.06422181, "auxiliary_loss_mlp": 0.01270481, "balance_loss_clip": 0.06280109, "balance_loss_mlp": 0.01259931, "epoch": 0.7750488501427927, "flos": 20343446979840.0, "grad_norm": 1.8427198646896321, "language_loss": 0.66939467, "learning_rate": 5.076363859955932e-07, "loss": 0.74632126, "num_input_tokens_seen": 278121460, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.10540771, "step": 12891, "time_per_iteration": 3.9631154537200928 }, { "auxiliary_loss_clip": 0.06422245, "auxiliary_loss_mlp": 0.01267277, "balance_loss_clip": 0.06280252, "balance_loss_mlp": 0.01257699, "epoch": 0.7751089733954607, "flos": 28371229176960.0, "grad_norm": 1.7574981495995738, "language_loss": 0.79291189, "learning_rate": 5.073771332059257e-07, "loss": 0.869807, "num_input_tokens_seen": 278143905, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.0958252, "step": 12892, "time_per_iteration": 2.6329867839813232 }, { "auxiliary_loss_clip": 0.06428, "auxiliary_loss_mlp": 0.01265846, "balance_loss_clip": 0.06283151, "balance_loss_mlp": 0.01255272, "epoch": 0.7751690966481286, "flos": 16949047680000.0, "grad_norm": 1.88297795004145, "language_loss": 0.67433339, "learning_rate": 5.071179370166669e-07, "loss": 0.75127184, "num_input_tokens_seen": 278160850, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.10577393, "step": 12893, "time_per_iteration": 2.4913511276245117 }, { "auxiliary_loss_clip": 0.06319425, "auxiliary_loss_mlp": 0.0125303, "balance_loss_clip": 0.06262873, "balance_loss_mlp": 0.01251639, "epoch": 0.7752292199007966, "flos": 65690179799040.0, "grad_norm": 0.7948769586207126, "language_loss": 0.58574116, "learning_rate": 5.068587974376468e-07, "loss": 0.6614657, "num_input_tokens_seen": 278219950, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.01391602, "step": 12894, "time_per_iteration": 3.2134006023406982 }, { "auxiliary_loss_clip": 0.06420858, "auxiliary_loss_mlp": 0.01264402, "balance_loss_clip": 0.06277519, "balance_loss_mlp": 0.01253918, "epoch": 0.7752893431534646, "flos": 20600898249600.0, "grad_norm": 2.247714730406159, "language_loss": 0.78017795, "learning_rate": 5.065997144786895e-07, "loss": 0.85703057, "num_input_tokens_seen": 278237805, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10491943, "step": 12895, "time_per_iteration": 4.030488967895508 }, { "auxiliary_loss_clip": 0.06416801, "auxiliary_loss_mlp": 0.01265326, "balance_loss_clip": 0.06276911, "balance_loss_mlp": 0.01255318, "epoch": 0.7753494664061326, "flos": 20491592198400.0, "grad_norm": 1.6285837521782471, "language_loss": 0.67771333, "learning_rate": 5.063406881496209e-07, "loss": 0.7545346, "num_input_tokens_seen": 278257660, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.10009766, "step": 12896, "time_per_iteration": 2.5340802669525146 }, { "auxiliary_loss_clip": 0.06416346, "auxiliary_loss_mlp": 0.01263239, "balance_loss_clip": 0.06274633, "balance_loss_mlp": 0.01253845, "epoch": 0.7754095896588006, "flos": 20272015774080.0, "grad_norm": 1.7285511708633687, "language_loss": 0.68823379, "learning_rate": 5.060817184602629e-07, "loss": 0.76502961, "num_input_tokens_seen": 278275110, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09399414, "step": 12897, "time_per_iteration": 2.5489346981048584 }, { "auxiliary_loss_clip": 0.06421149, "auxiliary_loss_mlp": 0.01267566, "balance_loss_clip": 0.06278736, "balance_loss_mlp": 0.01257147, "epoch": 0.7754697129114685, "flos": 23337784160640.0, "grad_norm": 1.6854894112584484, "language_loss": 0.74916112, "learning_rate": 5.058228054204364e-07, "loss": 0.82604831, "num_input_tokens_seen": 278293035, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.10412598, "step": 12898, "time_per_iteration": 2.566925525665283 }, { "auxiliary_loss_clip": 0.06422628, "auxiliary_loss_mlp": 0.01265602, "balance_loss_clip": 0.06280125, "balance_loss_mlp": 0.01255267, "epoch": 0.7755298361641365, "flos": 17353344430080.0, "grad_norm": 1.8845662439914708, "language_loss": 0.70261765, "learning_rate": 5.055639490399588e-07, "loss": 0.77950001, "num_input_tokens_seen": 278311010, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10333252, "step": 12899, "time_per_iteration": 2.5175297260284424 }, { "auxiliary_loss_clip": 0.06421796, "auxiliary_loss_mlp": 0.01267797, "balance_loss_clip": 0.06280373, "balance_loss_mlp": 0.01257163, "epoch": 0.7755899594168044, "flos": 19651916033280.0, "grad_norm": 1.954991457028906, "language_loss": 0.75604522, "learning_rate": 5.053051493286453e-07, "loss": 0.83294117, "num_input_tokens_seen": 278329900, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.10638428, "step": 12900, "time_per_iteration": 2.5164108276367188 }, { "auxiliary_loss_clip": 0.06412835, "auxiliary_loss_mlp": 0.01264875, "balance_loss_clip": 0.0627577, "balance_loss_mlp": 0.01256214, "epoch": 0.7756500826694724, "flos": 27421324565760.0, "grad_norm": 2.557471194220847, "language_loss": 0.77856809, "learning_rate": 5.050464062963113e-07, "loss": 0.85534513, "num_input_tokens_seen": 278349980, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.08660889, "step": 12901, "time_per_iteration": 2.5882322788238525 }, { "auxiliary_loss_clip": 0.06415069, "auxiliary_loss_mlp": 0.01266959, "balance_loss_clip": 0.06275909, "balance_loss_mlp": 0.01256689, "epoch": 0.7757102059221404, "flos": 28738028424960.0, "grad_norm": 1.3897784917113318, "language_loss": 0.77181512, "learning_rate": 5.047877199527666e-07, "loss": 0.84863544, "num_input_tokens_seen": 278372485, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.10266113, "step": 12902, "time_per_iteration": 2.6259140968322754 }, { "auxiliary_loss_clip": 0.06419668, "auxiliary_loss_mlp": 0.01265851, "balance_loss_clip": 0.06279482, "balance_loss_mlp": 0.01256201, "epoch": 0.7757703291748084, "flos": 22492489772160.0, "grad_norm": 1.8001905479320253, "language_loss": 0.73356307, "learning_rate": 5.045290903078215e-07, "loss": 0.81041825, "num_input_tokens_seen": 278391660, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09655762, "step": 12903, "time_per_iteration": 2.5635664463043213 }, { "auxiliary_loss_clip": 0.06414372, "auxiliary_loss_mlp": 0.01263041, "balance_loss_clip": 0.06274726, "balance_loss_mlp": 0.01252986, "epoch": 0.7758304524274763, "flos": 21435920513280.0, "grad_norm": 2.180214874714215, "language_loss": 0.76326573, "learning_rate": 5.042705173712835e-07, "loss": 0.84003985, "num_input_tokens_seen": 278409125, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.10058594, "step": 12904, "time_per_iteration": 2.505640745162964 }, { "auxiliary_loss_clip": 0.06412455, "auxiliary_loss_mlp": 0.0126541, "balance_loss_clip": 0.06276286, "balance_loss_mlp": 0.01256678, "epoch": 0.7758905756801443, "flos": 23665953876480.0, "grad_norm": 2.9049276792706857, "language_loss": 0.69172299, "learning_rate": 5.040120011529576e-07, "loss": 0.76850164, "num_input_tokens_seen": 278429450, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.08728027, "step": 12905, "time_per_iteration": 2.5487194061279297 }, { "auxiliary_loss_clip": 0.0641187, "auxiliary_loss_mlp": 0.01268223, "balance_loss_clip": 0.06276082, "balance_loss_mlp": 0.01258901, "epoch": 0.7759506989328122, "flos": 28372906258560.0, "grad_norm": 1.6388135263756736, "language_loss": 0.67509353, "learning_rate": 5.037535416626459e-07, "loss": 0.75189453, "num_input_tokens_seen": 278449925, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.09320068, "step": 12906, "time_per_iteration": 4.057203769683838 }, { "auxiliary_loss_clip": 0.06416391, "auxiliary_loss_mlp": 0.01264916, "balance_loss_clip": 0.06276116, "balance_loss_mlp": 0.01255296, "epoch": 0.7760108221854802, "flos": 14908053127680.0, "grad_norm": 1.9138859936585306, "language_loss": 0.81507504, "learning_rate": 5.034951389101498e-07, "loss": 0.89188814, "num_input_tokens_seen": 278467255, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09619141, "step": 12907, "time_per_iteration": 2.514061689376831 }, { "auxiliary_loss_clip": 0.06412184, "auxiliary_loss_mlp": 0.01269784, "balance_loss_clip": 0.06276806, "balance_loss_mlp": 0.01260378, "epoch": 0.7760709454381483, "flos": 14797615046400.0, "grad_norm": 2.207454476558175, "language_loss": 0.67951941, "learning_rate": 5.032367929052685e-07, "loss": 0.75633913, "num_input_tokens_seen": 278484250, "router_z_loss_clip": 1.35546875, "router_z_loss_mlp": 0.09411621, "step": 12908, "time_per_iteration": 2.5049824714660645 }, { "auxiliary_loss_clip": 0.06421664, "auxiliary_loss_mlp": 0.01270638, "balance_loss_clip": 0.06279069, "balance_loss_mlp": 0.01260428, "epoch": 0.7761310686908162, "flos": 17384846365440.0, "grad_norm": 1.5572877354344594, "language_loss": 0.70540446, "learning_rate": 5.029785036577976e-07, "loss": 0.78232747, "num_input_tokens_seen": 278502740, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10211182, "step": 12909, "time_per_iteration": 2.5233519077301025 }, { "auxiliary_loss_clip": 0.06417383, "auxiliary_loss_mlp": 0.01268417, "balance_loss_clip": 0.06279628, "balance_loss_mlp": 0.01259339, "epoch": 0.7761911919434842, "flos": 25563582892800.0, "grad_norm": 2.082628590865171, "language_loss": 0.68213862, "learning_rate": 5.027202711775324e-07, "loss": 0.75899661, "num_input_tokens_seen": 278523890, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09082031, "step": 12910, "time_per_iteration": 2.5766873359680176 }, { "auxiliary_loss_clip": 0.06414923, "auxiliary_loss_mlp": 0.01265744, "balance_loss_clip": 0.06272968, "balance_loss_mlp": 0.01256088, "epoch": 0.7762513151961521, "flos": 23185530092160.0, "grad_norm": 1.5200918768543918, "language_loss": 0.71870226, "learning_rate": 5.024620954742646e-07, "loss": 0.79550892, "num_input_tokens_seen": 278543185, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.09661865, "step": 12911, "time_per_iteration": 2.562772512435913 }, { "auxiliary_loss_clip": 0.06419618, "auxiliary_loss_mlp": 0.01267755, "balance_loss_clip": 0.06277575, "balance_loss_mlp": 0.01257127, "epoch": 0.7763114384488201, "flos": 21696097040640.0, "grad_norm": 2.362186627988047, "language_loss": 0.62995607, "learning_rate": 5.022039765577836e-07, "loss": 0.70682979, "num_input_tokens_seen": 278559220, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.10620117, "step": 12912, "time_per_iteration": 3.9756972789764404 }, { "auxiliary_loss_clip": 0.06321572, "auxiliary_loss_mlp": 0.01249362, "balance_loss_clip": 0.06265251, "balance_loss_mlp": 0.01247965, "epoch": 0.776371561701488, "flos": 69048381335040.0, "grad_norm": 0.7590362721115791, "language_loss": 0.53188944, "learning_rate": 5.019459144378779e-07, "loss": 0.60759872, "num_input_tokens_seen": 278618185, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01396942, "step": 12913, "time_per_iteration": 3.2265353202819824 }, { "auxiliary_loss_clip": 0.06420673, "auxiliary_loss_mlp": 0.01264429, "balance_loss_clip": 0.06279747, "balance_loss_mlp": 0.01255143, "epoch": 0.776431684954156, "flos": 22900643809920.0, "grad_norm": 1.807856701939676, "language_loss": 0.62520659, "learning_rate": 5.016879091243338e-07, "loss": 0.7020576, "num_input_tokens_seen": 278636210, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09289551, "step": 12914, "time_per_iteration": 2.5442421436309814 }, { "auxiliary_loss_clip": 0.06417537, "auxiliary_loss_mlp": 0.01263984, "balance_loss_clip": 0.06277866, "balance_loss_mlp": 0.01254161, "epoch": 0.776491808206824, "flos": 20266942602240.0, "grad_norm": 1.8744026448460018, "language_loss": 0.82743156, "learning_rate": 5.014299606269339e-07, "loss": 0.90424675, "num_input_tokens_seen": 278653305, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09820557, "step": 12915, "time_per_iteration": 2.560025691986084 }, { "auxiliary_loss_clip": 0.06420301, "auxiliary_loss_mlp": 0.01264324, "balance_loss_clip": 0.06275586, "balance_loss_mlp": 0.01254436, "epoch": 0.776551931459492, "flos": 26766033310080.0, "grad_norm": 1.7767304638509127, "language_loss": 0.75491202, "learning_rate": 5.011720689554603e-07, "loss": 0.83175826, "num_input_tokens_seen": 278671850, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.09887695, "step": 12916, "time_per_iteration": 2.5730013847351074 }, { "auxiliary_loss_clip": 0.0642266, "auxiliary_loss_mlp": 0.01263754, "balance_loss_clip": 0.06280366, "balance_loss_mlp": 0.01254074, "epoch": 0.7766120547121599, "flos": 52676583960960.0, "grad_norm": 2.3264315666107858, "language_loss": 0.65552098, "learning_rate": 5.009142341196919e-07, "loss": 0.73238516, "num_input_tokens_seen": 278697860, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09674072, "step": 12917, "time_per_iteration": 2.834749221801758 }, { "auxiliary_loss_clip": 0.06417406, "auxiliary_loss_mlp": 0.01266937, "balance_loss_clip": 0.06275145, "balance_loss_mlp": 0.01257198, "epoch": 0.7766721779648279, "flos": 25163353065600.0, "grad_norm": 1.5409461647831366, "language_loss": 0.64509547, "learning_rate": 5.006564561294065e-07, "loss": 0.72193897, "num_input_tokens_seen": 278720655, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09747314, "step": 12918, "time_per_iteration": 2.596237897872925 }, { "auxiliary_loss_clip": 0.06417007, "auxiliary_loss_mlp": 0.0126558, "balance_loss_clip": 0.06277468, "balance_loss_mlp": 0.01256651, "epoch": 0.7767323012174958, "flos": 23766161760000.0, "grad_norm": 2.505384236764518, "language_loss": 0.73800921, "learning_rate": 5.003987349943777e-07, "loss": 0.81483507, "num_input_tokens_seen": 278737375, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.08929443, "step": 12919, "time_per_iteration": 2.518965721130371 }, { "auxiliary_loss_clip": 0.0642004, "auxiliary_loss_mlp": 0.01267018, "balance_loss_clip": 0.06277665, "balance_loss_mlp": 0.01256462, "epoch": 0.7767924244701638, "flos": 22092469580160.0, "grad_norm": 1.7684802833750237, "language_loss": 0.79429591, "learning_rate": 5.001410707243792e-07, "loss": 0.87116653, "num_input_tokens_seen": 278756510, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10552979, "step": 12920, "time_per_iteration": 2.5519604682922363 }, { "auxiliary_loss_clip": 0.06423067, "auxiliary_loss_mlp": 0.01264771, "balance_loss_clip": 0.06279911, "balance_loss_mlp": 0.01254764, "epoch": 0.7768525477228319, "flos": 21988194773760.0, "grad_norm": 1.5638064142751547, "language_loss": 0.7092098, "learning_rate": 4.998834633291829e-07, "loss": 0.78608817, "num_input_tokens_seen": 278775410, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10015869, "step": 12921, "time_per_iteration": 2.534127712249756 }, { "auxiliary_loss_clip": 0.06425349, "auxiliary_loss_mlp": 0.01268303, "balance_loss_clip": 0.06280098, "balance_loss_mlp": 0.01258313, "epoch": 0.7769126709754998, "flos": 21800329920000.0, "grad_norm": 4.676315347507834, "language_loss": 0.7618857, "learning_rate": 4.996259128185547e-07, "loss": 0.83882225, "num_input_tokens_seen": 278794260, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.09985352, "step": 12922, "time_per_iteration": 2.538950204849243 }, { "auxiliary_loss_clip": 0.06419387, "auxiliary_loss_mlp": 0.0126636, "balance_loss_clip": 0.06280135, "balance_loss_mlp": 0.01256048, "epoch": 0.7769727942281678, "flos": 20054242212480.0, "grad_norm": 1.6872283088858704, "language_loss": 0.80555928, "learning_rate": 4.993684192022625e-07, "loss": 0.88241678, "num_input_tokens_seen": 278813290, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.10302734, "step": 12923, "time_per_iteration": 2.508018732070923 }, { "auxiliary_loss_clip": 0.06420756, "auxiliary_loss_mlp": 0.01266818, "balance_loss_clip": 0.0628148, "balance_loss_mlp": 0.01257514, "epoch": 0.7770329174808357, "flos": 21692784804480.0, "grad_norm": 1.84305392031589, "language_loss": 0.9251703, "learning_rate": 4.991109824900699e-07, "loss": 1.00204611, "num_input_tokens_seen": 278830610, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09301758, "step": 12924, "time_per_iteration": 2.5276412963867188 }, { "auxiliary_loss_clip": 0.06413962, "auxiliary_loss_mlp": 0.01262904, "balance_loss_clip": 0.06271809, "balance_loss_mlp": 0.01253069, "epoch": 0.7770930407335037, "flos": 25856477239680.0, "grad_norm": 2.423586644107709, "language_loss": 0.66209918, "learning_rate": 4.988536026917401e-07, "loss": 0.73886782, "num_input_tokens_seen": 278849530, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09832764, "step": 12925, "time_per_iteration": 2.624986171722412 }, { "auxiliary_loss_clip": 0.06420735, "auxiliary_loss_mlp": 0.01268863, "balance_loss_clip": 0.06277238, "balance_loss_mlp": 0.01258701, "epoch": 0.7771531639861716, "flos": 24353921024640.0, "grad_norm": 2.1638841818942205, "language_loss": 0.72131366, "learning_rate": 4.985962798170314e-07, "loss": 0.79820967, "num_input_tokens_seen": 278869005, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.10168457, "step": 12926, "time_per_iteration": 2.5744597911834717 }, { "auxiliary_loss_clip": 0.06420536, "auxiliary_loss_mlp": 0.01264181, "balance_loss_clip": 0.06277266, "balance_loss_mlp": 0.01253738, "epoch": 0.7772132872388396, "flos": 25637068523520.0, "grad_norm": 1.7687974360260734, "language_loss": 0.66262889, "learning_rate": 4.983390138757027e-07, "loss": 0.73947608, "num_input_tokens_seen": 278888790, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10449219, "step": 12927, "time_per_iteration": 2.565424919128418 }, { "auxiliary_loss_clip": 0.06419247, "auxiliary_loss_mlp": 0.01267933, "balance_loss_clip": 0.06277849, "balance_loss_mlp": 0.01257627, "epoch": 0.7772734104915076, "flos": 26074544290560.0, "grad_norm": 1.8834294327920504, "language_loss": 0.72523779, "learning_rate": 4.980818048775093e-07, "loss": 0.8021096, "num_input_tokens_seen": 278908150, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.10302734, "step": 12928, "time_per_iteration": 2.5687007904052734 }, { "auxiliary_loss_clip": 0.06411803, "auxiliary_loss_mlp": 0.01265944, "balance_loss_clip": 0.06273372, "balance_loss_mlp": 0.01256723, "epoch": 0.7773335337441756, "flos": 22930887934080.0, "grad_norm": 1.6499119001841163, "language_loss": 0.74762547, "learning_rate": 4.978246528322036e-07, "loss": 0.82440293, "num_input_tokens_seen": 278927425, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09216309, "step": 12929, "time_per_iteration": 2.532776355743408 }, { "auxiliary_loss_clip": 0.06416285, "auxiliary_loss_mlp": 0.0126802, "balance_loss_clip": 0.06274734, "balance_loss_mlp": 0.01258298, "epoch": 0.7773936569968435, "flos": 20782977171840.0, "grad_norm": 1.7199356342745045, "language_loss": 0.77746356, "learning_rate": 4.975675577495377e-07, "loss": 0.85430658, "num_input_tokens_seen": 278946475, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.097229, "step": 12930, "time_per_iteration": 2.5224485397338867 }, { "auxiliary_loss_clip": 0.06417018, "auxiliary_loss_mlp": 0.01267436, "balance_loss_clip": 0.06276151, "balance_loss_mlp": 0.01257327, "epoch": 0.7774537802495115, "flos": 20377883808000.0, "grad_norm": 1.785769535908186, "language_loss": 0.80190802, "learning_rate": 4.973105196392613e-07, "loss": 0.87875259, "num_input_tokens_seen": 278964345, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.10107422, "step": 12931, "time_per_iteration": 3.9478039741516113 }, { "auxiliary_loss_clip": 0.06316656, "auxiliary_loss_mlp": 0.01251138, "balance_loss_clip": 0.06260474, "balance_loss_mlp": 0.01249817, "epoch": 0.7775139035021794, "flos": 53930981980800.0, "grad_norm": 0.7767593386917065, "language_loss": 0.5959065, "learning_rate": 4.970535385111199e-07, "loss": 0.67158437, "num_input_tokens_seen": 279022380, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01322174, "step": 12932, "time_per_iteration": 3.115964651107788 }, { "auxiliary_loss_clip": 0.06421924, "auxiliary_loss_mlp": 0.01263024, "balance_loss_clip": 0.06278004, "balance_loss_mlp": 0.01253684, "epoch": 0.7775740267548474, "flos": 28850437077120.0, "grad_norm": 1.8095426577534814, "language_loss": 0.76482952, "learning_rate": 4.967966143748595e-07, "loss": 0.84167904, "num_input_tokens_seen": 279044275, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.09338379, "step": 12933, "time_per_iteration": 2.606945276260376 }, { "auxiliary_loss_clip": 0.06415509, "auxiliary_loss_mlp": 0.01265451, "balance_loss_clip": 0.06274841, "balance_loss_mlp": 0.01254811, "epoch": 0.7776341500075155, "flos": 21879056430720.0, "grad_norm": 1.9518950834063333, "language_loss": 0.7387737, "learning_rate": 4.965397472402215e-07, "loss": 0.81558335, "num_input_tokens_seen": 279063375, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.10650635, "step": 12934, "time_per_iteration": 2.5497725009918213 }, { "auxiliary_loss_clip": 0.06416868, "auxiliary_loss_mlp": 0.0126441, "balance_loss_clip": 0.06274313, "balance_loss_mlp": 0.01253758, "epoch": 0.7776942732601834, "flos": 20236027645440.0, "grad_norm": 1.7792776921734308, "language_loss": 0.70505702, "learning_rate": 4.962829371169475e-07, "loss": 0.78186977, "num_input_tokens_seen": 279082680, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.10650635, "step": 12935, "time_per_iteration": 3.9846694469451904 }, { "auxiliary_loss_clip": 0.06423596, "auxiliary_loss_mlp": 0.01265932, "balance_loss_clip": 0.06279741, "balance_loss_mlp": 0.01256134, "epoch": 0.7777543965128514, "flos": 22237554124800.0, "grad_norm": 1.551801991580359, "language_loss": 0.83631432, "learning_rate": 4.960261840147746e-07, "loss": 0.91320956, "num_input_tokens_seen": 279099805, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.09796143, "step": 12936, "time_per_iteration": 2.558753490447998 }, { "auxiliary_loss_clip": 0.06423777, "auxiliary_loss_mlp": 0.012638, "balance_loss_clip": 0.06276497, "balance_loss_mlp": 0.0125452, "epoch": 0.7778145197655193, "flos": 14507236321920.0, "grad_norm": 1.9264979951556553, "language_loss": 0.67649353, "learning_rate": 4.957694879434397e-07, "loss": 0.75336933, "num_input_tokens_seen": 279117975, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.09283447, "step": 12937, "time_per_iteration": 2.5220141410827637 }, { "auxiliary_loss_clip": 0.06417869, "auxiliary_loss_mlp": 0.01262585, "balance_loss_clip": 0.06275729, "balance_loss_mlp": 0.01253024, "epoch": 0.7778746430181873, "flos": 21146338402560.0, "grad_norm": 1.4818880109075345, "language_loss": 0.87435651, "learning_rate": 4.955128489126777e-07, "loss": 0.95116103, "num_input_tokens_seen": 279137255, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09576416, "step": 12938, "time_per_iteration": 2.529528856277466 }, { "auxiliary_loss_clip": 0.06415501, "auxiliary_loss_mlp": 0.01264842, "balance_loss_clip": 0.06274901, "balance_loss_mlp": 0.01254912, "epoch": 0.7779347662708552, "flos": 20272560825600.0, "grad_norm": 3.1851944779406076, "language_loss": 0.8515774, "learning_rate": 4.95256266932218e-07, "loss": 0.92838085, "num_input_tokens_seen": 279154500, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.0993042, "step": 12939, "time_per_iteration": 2.5544872283935547 }, { "auxiliary_loss_clip": 0.06411627, "auxiliary_loss_mlp": 0.01266217, "balance_loss_clip": 0.0627453, "balance_loss_mlp": 0.01256877, "epoch": 0.7779948895235232, "flos": 19215153025920.0, "grad_norm": 1.7494160588225727, "language_loss": 0.6932146, "learning_rate": 4.949997420117915e-07, "loss": 0.76999307, "num_input_tokens_seen": 279173635, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09338379, "step": 12940, "time_per_iteration": 2.537104845046997 }, { "auxiliary_loss_clip": 0.06418616, "auxiliary_loss_mlp": 0.01269314, "balance_loss_clip": 0.06277287, "balance_loss_mlp": 0.01259526, "epoch": 0.7780550127761912, "flos": 23921476502400.0, "grad_norm": 2.9905337038646533, "language_loss": 0.77969968, "learning_rate": 4.947432741611255e-07, "loss": 0.85657895, "num_input_tokens_seen": 279194430, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09790039, "step": 12941, "time_per_iteration": 2.578181266784668 }, { "auxiliary_loss_clip": 0.06425577, "auxiliary_loss_mlp": 0.01267511, "balance_loss_clip": 0.06280504, "balance_loss_mlp": 0.01256454, "epoch": 0.7781151360288592, "flos": 32424148114560.0, "grad_norm": 2.117107659608534, "language_loss": 0.73007476, "learning_rate": 4.944868633899462e-07, "loss": 0.80700564, "num_input_tokens_seen": 279212920, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.1105957, "step": 12942, "time_per_iteration": 2.6237523555755615 }, { "auxiliary_loss_clip": 0.06414177, "auxiliary_loss_mlp": 0.01267941, "balance_loss_clip": 0.06276955, "balance_loss_mlp": 0.01258815, "epoch": 0.7781752592815271, "flos": 22352981523840.0, "grad_norm": 1.9910427527449088, "language_loss": 0.67991185, "learning_rate": 4.942305097079751e-07, "loss": 0.75673294, "num_input_tokens_seen": 279232310, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09124756, "step": 12943, "time_per_iteration": 2.5686700344085693 }, { "auxiliary_loss_clip": 0.06314798, "auxiliary_loss_mlp": 0.01251134, "balance_loss_clip": 0.06258622, "balance_loss_mlp": 0.01249972, "epoch": 0.7782353825341951, "flos": 70479101802240.0, "grad_norm": 0.7628086932120579, "language_loss": 0.584867, "learning_rate": 4.939742131249347e-07, "loss": 0.66052634, "num_input_tokens_seen": 279295375, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01160431, "step": 12944, "time_per_iteration": 3.3035941123962402 }, { "auxiliary_loss_clip": 0.06425013, "auxiliary_loss_mlp": 0.01267335, "balance_loss_clip": 0.06280123, "balance_loss_mlp": 0.012561, "epoch": 0.778295505786863, "flos": 19068601034880.0, "grad_norm": 2.2322466825697584, "language_loss": 0.67869943, "learning_rate": 4.937179736505428e-07, "loss": 0.75562286, "num_input_tokens_seen": 279313660, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.11224365, "step": 12945, "time_per_iteration": 4.015491962432861 }, { "auxiliary_loss_clip": 0.06416672, "auxiliary_loss_mlp": 0.01263899, "balance_loss_clip": 0.0627505, "balance_loss_mlp": 0.0125391, "epoch": 0.778355629039531, "flos": 21006662446080.0, "grad_norm": 1.6561283842459533, "language_loss": 0.69327617, "learning_rate": 4.93461791294516e-07, "loss": 0.77008194, "num_input_tokens_seen": 279334495, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09991455, "step": 12946, "time_per_iteration": 2.5655040740966797 }, { "auxiliary_loss_clip": 0.06417631, "auxiliary_loss_mlp": 0.01263318, "balance_loss_clip": 0.06276254, "balance_loss_mlp": 0.01253418, "epoch": 0.7784157522921991, "flos": 21404586286080.0, "grad_norm": 1.9389986168863225, "language_loss": 0.65506554, "learning_rate": 4.932056660665689e-07, "loss": 0.731875, "num_input_tokens_seen": 279352985, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09899902, "step": 12947, "time_per_iteration": 2.527451276779175 }, { "auxiliary_loss_clip": 0.06415529, "auxiliary_loss_mlp": 0.01267014, "balance_loss_clip": 0.06275247, "balance_loss_mlp": 0.01257704, "epoch": 0.778475875544867, "flos": 20820181184640.0, "grad_norm": 2.271285323513296, "language_loss": 0.64897406, "learning_rate": 4.929495979764147e-07, "loss": 0.7257995, "num_input_tokens_seen": 279371360, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09307861, "step": 12948, "time_per_iteration": 2.517745018005371 }, { "auxiliary_loss_clip": 0.06414529, "auxiliary_loss_mlp": 0.01264035, "balance_loss_clip": 0.06275074, "balance_loss_mlp": 0.01253908, "epoch": 0.778535998797535, "flos": 14360516622720.0, "grad_norm": 1.8073550175097395, "language_loss": 0.75974929, "learning_rate": 4.926935870337625e-07, "loss": 0.83653492, "num_input_tokens_seen": 279389400, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.10131836, "step": 12949, "time_per_iteration": 2.5030405521392822 }, { "auxiliary_loss_clip": 0.06421725, "auxiliary_loss_mlp": 0.01265015, "balance_loss_clip": 0.06276461, "balance_loss_mlp": 0.01254453, "epoch": 0.7785961220502029, "flos": 19215781931520.0, "grad_norm": 2.0897825899692766, "language_loss": 0.68811357, "learning_rate": 4.924376332483202e-07, "loss": 0.76498097, "num_input_tokens_seen": 279409715, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.10571289, "step": 12950, "time_per_iteration": 2.607940673828125 }, { "auxiliary_loss_clip": 0.06420113, "auxiliary_loss_mlp": 0.01266381, "balance_loss_clip": 0.06275722, "balance_loss_mlp": 0.01256767, "epoch": 0.7786562453028709, "flos": 25745787596160.0, "grad_norm": 1.8689869765944265, "language_loss": 0.72343969, "learning_rate": 4.921817366297938e-07, "loss": 0.80030465, "num_input_tokens_seen": 279427705, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.09619141, "step": 12951, "time_per_iteration": 2.5686450004577637 }, { "auxiliary_loss_clip": 0.06415143, "auxiliary_loss_mlp": 0.01262106, "balance_loss_clip": 0.06275547, "balance_loss_mlp": 0.01252093, "epoch": 0.7787163685555388, "flos": 25746584209920.0, "grad_norm": 1.6220805607870812, "language_loss": 0.65969986, "learning_rate": 4.919258971878877e-07, "loss": 0.73647237, "num_input_tokens_seen": 279448215, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.10009766, "step": 12952, "time_per_iteration": 3.943279266357422 }, { "auxiliary_loss_clip": 0.06408368, "auxiliary_loss_mlp": 0.01264999, "balance_loss_clip": 0.06275645, "balance_loss_mlp": 0.01256255, "epoch": 0.7787764918082068, "flos": 22754385308160.0, "grad_norm": 1.4382904640464158, "language_loss": 0.81322622, "learning_rate": 4.916701149323022e-07, "loss": 0.88995987, "num_input_tokens_seen": 279466260, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.08746338, "step": 12953, "time_per_iteration": 2.5307483673095703 }, { "auxiliary_loss_clip": 0.06422429, "auxiliary_loss_mlp": 0.01267392, "balance_loss_clip": 0.06277762, "balance_loss_mlp": 0.01257015, "epoch": 0.7788366150608748, "flos": 15195538886400.0, "grad_norm": 3.4905149520252494, "language_loss": 0.76728857, "learning_rate": 4.91414389872737e-07, "loss": 0.84418678, "num_input_tokens_seen": 279484520, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.10375977, "step": 12954, "time_per_iteration": 2.533329725265503 }, { "auxiliary_loss_clip": 0.06422198, "auxiliary_loss_mlp": 0.01264265, "balance_loss_clip": 0.06276328, "balance_loss_mlp": 0.01254496, "epoch": 0.7788967383135428, "flos": 21215799037440.0, "grad_norm": 1.6415436383416173, "language_loss": 0.73064303, "learning_rate": 4.911587220188905e-07, "loss": 0.80750763, "num_input_tokens_seen": 279503130, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.09777832, "step": 12955, "time_per_iteration": 2.5408921241760254 }, { "auxiliary_loss_clip": 0.06416383, "auxiliary_loss_mlp": 0.01266272, "balance_loss_clip": 0.06274582, "balance_loss_mlp": 0.01255889, "epoch": 0.7789568615662107, "flos": 21688340538240.0, "grad_norm": 1.3585846431516642, "language_loss": 0.6882056, "learning_rate": 4.909031113804551e-07, "loss": 0.76503217, "num_input_tokens_seen": 279521930, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.1038208, "step": 12956, "time_per_iteration": 2.5710678100585938 }, { "auxiliary_loss_clip": 0.06416765, "auxiliary_loss_mlp": 0.01262984, "balance_loss_clip": 0.0627621, "balance_loss_mlp": 0.01253507, "epoch": 0.7790169848188787, "flos": 26367732126720.0, "grad_norm": 1.533017526660625, "language_loss": 0.76410913, "learning_rate": 4.906475579671252e-07, "loss": 0.84090668, "num_input_tokens_seen": 279542375, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09472656, "step": 12957, "time_per_iteration": 2.5786139965057373 }, { "auxiliary_loss_clip": 0.06416795, "auxiliary_loss_mlp": 0.01266218, "balance_loss_clip": 0.06275603, "balance_loss_mlp": 0.01255948, "epoch": 0.7790771080715466, "flos": 25522563519360.0, "grad_norm": 2.4980859805405475, "language_loss": 0.77958596, "learning_rate": 4.903920617885917e-07, "loss": 0.85641611, "num_input_tokens_seen": 279561885, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.10266113, "step": 12958, "time_per_iteration": 2.5595662593841553 }, { "auxiliary_loss_clip": 0.0641937, "auxiliary_loss_mlp": 0.01271533, "balance_loss_clip": 0.06277608, "balance_loss_mlp": 0.01261031, "epoch": 0.7791372313242146, "flos": 16039701244800.0, "grad_norm": 1.905329019355201, "language_loss": 0.72048825, "learning_rate": 4.901366228545418e-07, "loss": 0.79739726, "num_input_tokens_seen": 279579965, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.10510254, "step": 12959, "time_per_iteration": 2.5322017669677734 }, { "auxiliary_loss_clip": 0.06417985, "auxiliary_loss_mlp": 0.01266839, "balance_loss_clip": 0.06275634, "balance_loss_mlp": 0.01256623, "epoch": 0.7791973545768827, "flos": 23849039047680.0, "grad_norm": 1.556003089050988, "language_loss": 0.78056353, "learning_rate": 4.898812411746632e-07, "loss": 0.8574118, "num_input_tokens_seen": 279599030, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10217285, "step": 12960, "time_per_iteration": 2.5496270656585693 }, { "auxiliary_loss_clip": 0.06418136, "auxiliary_loss_mlp": 0.01267754, "balance_loss_clip": 0.06276178, "balance_loss_mlp": 0.01257222, "epoch": 0.7792574778295506, "flos": 24174902776320.0, "grad_norm": 2.0372341433356747, "language_loss": 0.75454855, "learning_rate": 4.896259167586385e-07, "loss": 0.83140749, "num_input_tokens_seen": 279614400, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.10534668, "step": 12961, "time_per_iteration": 2.5368380546569824 }, { "auxiliary_loss_clip": 0.06412394, "auxiliary_loss_mlp": 0.01266688, "balance_loss_clip": 0.06277823, "balance_loss_mlp": 0.01257521, "epoch": 0.7793176010822186, "flos": 21470399268480.0, "grad_norm": 1.6460565922985753, "language_loss": 0.73639482, "learning_rate": 4.893706496161511e-07, "loss": 0.81318569, "num_input_tokens_seen": 279633745, "router_z_loss_clip": 1.34570312, "router_z_loss_mlp": 0.09161377, "step": 12962, "time_per_iteration": 2.550656795501709 }, { "auxiliary_loss_clip": 0.06410656, "auxiliary_loss_mlp": 0.01265514, "balance_loss_clip": 0.06273303, "balance_loss_mlp": 0.01255804, "epoch": 0.7793777243348865, "flos": 20672790652800.0, "grad_norm": 1.8521546433515717, "language_loss": 0.70106739, "learning_rate": 4.891154397568795e-07, "loss": 0.77782905, "num_input_tokens_seen": 279651165, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.0970459, "step": 12963, "time_per_iteration": 2.523592710494995 }, { "auxiliary_loss_clip": 0.0641617, "auxiliary_loss_mlp": 0.01265784, "balance_loss_clip": 0.06279206, "balance_loss_mlp": 0.01256474, "epoch": 0.7794378475875545, "flos": 27133126047360.0, "grad_norm": 1.7590370093723262, "language_loss": 0.63547492, "learning_rate": 4.888602871905019e-07, "loss": 0.71229446, "num_input_tokens_seen": 279671175, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09307861, "step": 12964, "time_per_iteration": 2.5942258834838867 }, { "auxiliary_loss_clip": 0.06418495, "auxiliary_loss_mlp": 0.01264273, "balance_loss_clip": 0.06275995, "balance_loss_mlp": 0.01254432, "epoch": 0.7794979708402224, "flos": 28081605139200.0, "grad_norm": 1.5196965291808322, "language_loss": 0.7646578, "learning_rate": 4.88605191926694e-07, "loss": 0.8414855, "num_input_tokens_seen": 279688675, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09838867, "step": 12965, "time_per_iteration": 2.602695941925049 }, { "auxiliary_loss_clip": 0.06407395, "auxiliary_loss_mlp": 0.01262434, "balance_loss_clip": 0.06274064, "balance_loss_mlp": 0.01253452, "epoch": 0.7795580940928905, "flos": 26876722953600.0, "grad_norm": 1.4746599048585196, "language_loss": 0.72779876, "learning_rate": 4.883501539751289e-07, "loss": 0.80449712, "num_input_tokens_seen": 279710245, "router_z_loss_clip": 1.33398438, "router_z_loss_mlp": 0.08972168, "step": 12966, "time_per_iteration": 2.614591598510742 }, { "auxiliary_loss_clip": 0.06413316, "auxiliary_loss_mlp": 0.01263069, "balance_loss_clip": 0.06277651, "balance_loss_mlp": 0.01254641, "epoch": 0.7796182173455584, "flos": 23841072910080.0, "grad_norm": 1.604508306703638, "language_loss": 0.7433461, "learning_rate": 4.880951733454768e-07, "loss": 0.8201099, "num_input_tokens_seen": 279729045, "router_z_loss_clip": 1.35742188, "router_z_loss_mlp": 0.08428955, "step": 12967, "time_per_iteration": 2.5941030979156494 }, { "auxiliary_loss_clip": 0.06417505, "auxiliary_loss_mlp": 0.01262407, "balance_loss_clip": 0.06273925, "balance_loss_mlp": 0.01252864, "epoch": 0.7796783405982264, "flos": 19798384170240.0, "grad_norm": 2.605148862768311, "language_loss": 0.72569895, "learning_rate": 4.878402500474073e-07, "loss": 0.8024981, "num_input_tokens_seen": 279748350, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.09539795, "step": 12968, "time_per_iteration": 2.5360894203186035 }, { "auxiliary_loss_clip": 0.06411891, "auxiliary_loss_mlp": 0.01267421, "balance_loss_clip": 0.06272711, "balance_loss_mlp": 0.0125749, "epoch": 0.7797384638508943, "flos": 15455589632640.0, "grad_norm": 4.064188015426953, "language_loss": 0.61171389, "learning_rate": 4.875853840905874e-07, "loss": 0.68850696, "num_input_tokens_seen": 279765620, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09936523, "step": 12969, "time_per_iteration": 2.505993366241455 }, { "auxiliary_loss_clip": 0.06410787, "auxiliary_loss_mlp": 0.01267946, "balance_loss_clip": 0.06277208, "balance_loss_mlp": 0.01259315, "epoch": 0.7797985871035623, "flos": 20928984111360.0, "grad_norm": 1.6929300989720761, "language_loss": 0.69946539, "learning_rate": 4.873305754846811e-07, "loss": 0.77625263, "num_input_tokens_seen": 279782485, "router_z_loss_clip": 1.3359375, "router_z_loss_mlp": 0.08630371, "step": 12970, "time_per_iteration": 3.982135772705078 }, { "auxiliary_loss_clip": 0.06419554, "auxiliary_loss_mlp": 0.01269022, "balance_loss_clip": 0.06279563, "balance_loss_mlp": 0.01258686, "epoch": 0.7798587103562302, "flos": 36945667411200.0, "grad_norm": 1.5653805632447957, "language_loss": 0.72265059, "learning_rate": 4.870758242393507e-07, "loss": 0.79953635, "num_input_tokens_seen": 279804170, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.10345459, "step": 12971, "time_per_iteration": 2.6611995697021484 }, { "auxiliary_loss_clip": 0.06421944, "auxiliary_loss_mlp": 0.01264276, "balance_loss_clip": 0.06275412, "balance_loss_mlp": 0.01253565, "epoch": 0.7799188336088982, "flos": 22425880176000.0, "grad_norm": 1.843351593804704, "language_loss": 0.74506921, "learning_rate": 4.868211303642578e-07, "loss": 0.82193148, "num_input_tokens_seen": 279823730, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.10705566, "step": 12972, "time_per_iteration": 2.548402786254883 }, { "auxiliary_loss_clip": 0.06419714, "auxiliary_loss_mlp": 0.01263416, "balance_loss_clip": 0.06277652, "balance_loss_mlp": 0.01253486, "epoch": 0.7799789568615663, "flos": 18886522112640.0, "grad_norm": 1.9274554883825243, "language_loss": 0.71627146, "learning_rate": 4.865664938690584e-07, "loss": 0.79310274, "num_input_tokens_seen": 279843035, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09936523, "step": 12973, "time_per_iteration": 2.622314214706421 }, { "auxiliary_loss_clip": 0.06417179, "auxiliary_loss_mlp": 0.01266271, "balance_loss_clip": 0.06278658, "balance_loss_mlp": 0.01257241, "epoch": 0.7800390801142342, "flos": 20267781143040.0, "grad_norm": 2.071634670567458, "language_loss": 0.77693069, "learning_rate": 4.863119147634089e-07, "loss": 0.85376513, "num_input_tokens_seen": 279861450, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.090271, "step": 12974, "time_per_iteration": 4.089869260787964 }, { "auxiliary_loss_clip": 0.06418125, "auxiliary_loss_mlp": 0.01265776, "balance_loss_clip": 0.06278564, "balance_loss_mlp": 0.01256043, "epoch": 0.7800992033669022, "flos": 16695831041280.0, "grad_norm": 1.5326586164627116, "language_loss": 0.6983254, "learning_rate": 4.86057393056964e-07, "loss": 0.77516437, "num_input_tokens_seen": 279878660, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09741211, "step": 12975, "time_per_iteration": 2.512787103652954 }, { "auxiliary_loss_clip": 0.0641809, "auxiliary_loss_mlp": 0.01264419, "balance_loss_clip": 0.06278968, "balance_loss_mlp": 0.01254901, "epoch": 0.7801593266195701, "flos": 18590650945920.0, "grad_norm": 2.1696790837928583, "language_loss": 0.81976032, "learning_rate": 4.858029287593739e-07, "loss": 0.89658546, "num_input_tokens_seen": 279895685, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09509277, "step": 12976, "time_per_iteration": 2.5102827548980713 }, { "auxiliary_loss_clip": 0.06417379, "auxiliary_loss_mlp": 0.0126318, "balance_loss_clip": 0.06274448, "balance_loss_mlp": 0.01253542, "epoch": 0.7802194498722381, "flos": 25492193614080.0, "grad_norm": 1.406314960752321, "language_loss": 0.66167712, "learning_rate": 4.85548521880289e-07, "loss": 0.73848271, "num_input_tokens_seen": 279917240, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.09643555, "step": 12977, "time_per_iteration": 2.5795466899871826 }, { "auxiliary_loss_clip": 0.064134, "auxiliary_loss_mlp": 0.01267734, "balance_loss_clip": 0.06274838, "balance_loss_mlp": 0.01258293, "epoch": 0.780279573124906, "flos": 31184451757440.0, "grad_norm": 1.4285468718915553, "language_loss": 0.74694777, "learning_rate": 4.852941724293554e-07, "loss": 0.82375908, "num_input_tokens_seen": 279938665, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09436035, "step": 12978, "time_per_iteration": 2.5954935550689697 }, { "auxiliary_loss_clip": 0.06424832, "auxiliary_loss_mlp": 0.01268197, "balance_loss_clip": 0.06279602, "balance_loss_mlp": 0.01257301, "epoch": 0.780339696377574, "flos": 26951466395520.0, "grad_norm": 2.074297034011264, "language_loss": 0.62225121, "learning_rate": 4.85039880416219e-07, "loss": 0.69918156, "num_input_tokens_seen": 279957965, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.10900879, "step": 12979, "time_per_iteration": 2.6095006465911865 }, { "auxiliary_loss_clip": 0.06417452, "auxiliary_loss_mlp": 0.01263659, "balance_loss_clip": 0.06277909, "balance_loss_mlp": 0.01253985, "epoch": 0.780399819630242, "flos": 27963662117760.0, "grad_norm": 2.126289841286343, "language_loss": 0.77804416, "learning_rate": 4.847856458505217e-07, "loss": 0.85485524, "num_input_tokens_seen": 279977490, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09674072, "step": 12980, "time_per_iteration": 2.5918257236480713 }, { "auxiliary_loss_clip": 0.06420317, "auxiliary_loss_mlp": 0.01269369, "balance_loss_clip": 0.06276002, "balance_loss_mlp": 0.01259403, "epoch": 0.78045994288291, "flos": 22492489772160.0, "grad_norm": 2.1020866292119034, "language_loss": 0.77880442, "learning_rate": 4.845314687419046e-07, "loss": 0.85570133, "num_input_tokens_seen": 279994220, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.09954834, "step": 12981, "time_per_iteration": 2.558230400085449 }, { "auxiliary_loss_clip": 0.06419678, "auxiliary_loss_mlp": 0.01273244, "balance_loss_clip": 0.06277627, "balance_loss_mlp": 0.01263958, "epoch": 0.7805200661355779, "flos": 20857259416320.0, "grad_norm": 1.8608462157794887, "language_loss": 0.73570102, "learning_rate": 4.842773491000067e-07, "loss": 0.8126303, "num_input_tokens_seen": 280012590, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09289551, "step": 12982, "time_per_iteration": 2.564084529876709 }, { "auxiliary_loss_clip": 0.06416799, "auxiliary_loss_mlp": 0.01261757, "balance_loss_clip": 0.06274775, "balance_loss_mlp": 0.01251827, "epoch": 0.7805801893882459, "flos": 25673014725120.0, "grad_norm": 1.3290223206120217, "language_loss": 0.73857903, "learning_rate": 4.840232869344636e-07, "loss": 0.8153646, "num_input_tokens_seen": 280033700, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.09936523, "step": 12983, "time_per_iteration": 2.5777697563171387 }, { "auxiliary_loss_clip": 0.06420898, "auxiliary_loss_mlp": 0.0126525, "balance_loss_clip": 0.06279115, "balance_loss_mlp": 0.0125532, "epoch": 0.7806403126409138, "flos": 11332581154560.0, "grad_norm": 1.866525716063957, "language_loss": 0.7508918, "learning_rate": 4.837692822549086e-07, "loss": 0.82775331, "num_input_tokens_seen": 280052215, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.0993042, "step": 12984, "time_per_iteration": 2.5182557106018066 }, { "auxiliary_loss_clip": 0.06414588, "auxiliary_loss_mlp": 0.01263273, "balance_loss_clip": 0.06274931, "balance_loss_mlp": 0.01254022, "epoch": 0.7807004358935818, "flos": 19579478578560.0, "grad_norm": 2.140299748245573, "language_loss": 0.81308508, "learning_rate": 4.835153350709746e-07, "loss": 0.88986361, "num_input_tokens_seen": 280070525, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.0925293, "step": 12985, "time_per_iteration": 4.002557754516602 }, { "auxiliary_loss_clip": 0.06418603, "auxiliary_loss_mlp": 0.01269768, "balance_loss_clip": 0.0627929, "balance_loss_mlp": 0.01259224, "epoch": 0.7807605591462499, "flos": 19141918957440.0, "grad_norm": 1.7531679439782255, "language_loss": 0.77276552, "learning_rate": 4.832614453922915e-07, "loss": 0.84964919, "num_input_tokens_seen": 280089855, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.10552979, "step": 12986, "time_per_iteration": 2.5372116565704346 }, { "auxiliary_loss_clip": 0.0641968, "auxiliary_loss_mlp": 0.01264488, "balance_loss_clip": 0.06278016, "balance_loss_mlp": 0.01254576, "epoch": 0.7808206823989178, "flos": 32382038638080.0, "grad_norm": 1.5466452468348393, "language_loss": 0.74043381, "learning_rate": 4.830076132284859e-07, "loss": 0.81727552, "num_input_tokens_seen": 280109960, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09918213, "step": 12987, "time_per_iteration": 2.6447696685791016 }, { "auxiliary_loss_clip": 0.06312667, "auxiliary_loss_mlp": 0.01250721, "balance_loss_clip": 0.06256156, "balance_loss_mlp": 0.01249489, "epoch": 0.7808808056515858, "flos": 55070512381440.0, "grad_norm": 0.7360270486703959, "language_loss": 0.55080712, "learning_rate": 4.82753838589184e-07, "loss": 0.626441, "num_input_tokens_seen": 280169805, "router_z_loss_clip": 0.56542969, "router_z_loss_mlp": 0.01231384, "step": 12988, "time_per_iteration": 3.1791832447052 }, { "auxiliary_loss_clip": 0.06410693, "auxiliary_loss_mlp": 0.01271952, "balance_loss_clip": 0.0627488, "balance_loss_mlp": 0.01262874, "epoch": 0.7809409289042537, "flos": 12864375244800.0, "grad_norm": 2.703453109829829, "language_loss": 0.81098932, "learning_rate": 4.82500121484009e-07, "loss": 0.88781577, "num_input_tokens_seen": 280184630, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.09082031, "step": 12989, "time_per_iteration": 2.5233168601989746 }, { "auxiliary_loss_clip": 0.06412596, "auxiliary_loss_mlp": 0.01263247, "balance_loss_clip": 0.06274165, "balance_loss_mlp": 0.01253788, "epoch": 0.7810010521569217, "flos": 21693329856000.0, "grad_norm": 1.9491734073133968, "language_loss": 0.70520735, "learning_rate": 4.822464619225806e-07, "loss": 0.78196573, "num_input_tokens_seen": 280203880, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09460449, "step": 12990, "time_per_iteration": 2.5258355140686035 }, { "auxiliary_loss_clip": 0.06415074, "auxiliary_loss_mlp": 0.01266385, "balance_loss_clip": 0.06273573, "balance_loss_mlp": 0.01255293, "epoch": 0.7810611754095896, "flos": 16761560169600.0, "grad_norm": 2.346266594093292, "language_loss": 0.78080803, "learning_rate": 4.819928599145184e-07, "loss": 0.85762262, "num_input_tokens_seen": 280220460, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.11096191, "step": 12991, "time_per_iteration": 3.8835105895996094 }, { "auxiliary_loss_clip": 0.0641861, "auxiliary_loss_mlp": 0.01267204, "balance_loss_clip": 0.06277684, "balance_loss_mlp": 0.01256689, "epoch": 0.7811212986622577, "flos": 43517489063040.0, "grad_norm": 1.504411443214962, "language_loss": 0.66560435, "learning_rate": 4.817393154694398e-07, "loss": 0.74246246, "num_input_tokens_seen": 280242680, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.10516357, "step": 12992, "time_per_iteration": 2.721191167831421 }, { "auxiliary_loss_clip": 0.06422891, "auxiliary_loss_mlp": 0.01264468, "balance_loss_clip": 0.06279498, "balance_loss_mlp": 0.01254604, "epoch": 0.7811814219149256, "flos": 21763377469440.0, "grad_norm": 1.7149100847760874, "language_loss": 0.61605036, "learning_rate": 4.814858285969578e-07, "loss": 0.69292402, "num_input_tokens_seen": 280260655, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.09869385, "step": 12993, "time_per_iteration": 2.5321407318115234 }, { "auxiliary_loss_clip": 0.0641265, "auxiliary_loss_mlp": 0.01263654, "balance_loss_clip": 0.06274086, "balance_loss_mlp": 0.01253581, "epoch": 0.7812415451675936, "flos": 24068447763840.0, "grad_norm": 1.478427448138883, "language_loss": 0.68719316, "learning_rate": 4.812323993066862e-07, "loss": 0.76395619, "num_input_tokens_seen": 280281185, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.10070801, "step": 12994, "time_per_iteration": 2.5517213344573975 }, { "auxiliary_loss_clip": 0.0642037, "auxiliary_loss_mlp": 0.01265089, "balance_loss_clip": 0.062806, "balance_loss_mlp": 0.01255713, "epoch": 0.7813016684202615, "flos": 18995744309760.0, "grad_norm": 2.4478794959710877, "language_loss": 0.69385153, "learning_rate": 4.809790276082335e-07, "loss": 0.77070618, "num_input_tokens_seen": 280298255, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09381104, "step": 12995, "time_per_iteration": 2.5093369483947754 }, { "auxiliary_loss_clip": 0.06409758, "auxiliary_loss_mlp": 0.0126236, "balance_loss_clip": 0.06274755, "balance_loss_mlp": 0.01253455, "epoch": 0.7813617916729295, "flos": 25267124747520.0, "grad_norm": 1.654060942616901, "language_loss": 0.74921143, "learning_rate": 4.807257135112088e-07, "loss": 0.82593268, "num_input_tokens_seen": 280319000, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.08911133, "step": 12996, "time_per_iteration": 2.5782511234283447 }, { "auxiliary_loss_clip": 0.06426015, "auxiliary_loss_mlp": 0.01267021, "balance_loss_clip": 0.06279363, "balance_loss_mlp": 0.01256018, "epoch": 0.7814219149255974, "flos": 17971557454080.0, "grad_norm": 2.633295292440728, "language_loss": 0.69001937, "learning_rate": 4.804724570252167e-07, "loss": 0.76694977, "num_input_tokens_seen": 280336375, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.11010742, "step": 12997, "time_per_iteration": 2.505911111831665 }, { "auxiliary_loss_clip": 0.06424491, "auxiliary_loss_mlp": 0.0126849, "balance_loss_clip": 0.06277806, "balance_loss_mlp": 0.01257332, "epoch": 0.7814820381782654, "flos": 25783368952320.0, "grad_norm": 1.6517979938396776, "language_loss": 0.82229602, "learning_rate": 4.802192581598614e-07, "loss": 0.89922583, "num_input_tokens_seen": 280358760, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.1114502, "step": 12998, "time_per_iteration": 2.6247222423553467 }, { "auxiliary_loss_clip": 0.06426118, "auxiliary_loss_mlp": 0.01266022, "balance_loss_clip": 0.06284195, "balance_loss_mlp": 0.01255865, "epoch": 0.7815421614309335, "flos": 20525442048000.0, "grad_norm": 2.3831696891062495, "language_loss": 0.74870962, "learning_rate": 4.799661169247453e-07, "loss": 0.82563102, "num_input_tokens_seen": 280377085, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.1015625, "step": 12999, "time_per_iteration": 2.529816150665283 }, { "auxiliary_loss_clip": 0.06420924, "auxiliary_loss_mlp": 0.01263031, "balance_loss_clip": 0.06275353, "balance_loss_mlp": 0.01252564, "epoch": 0.7816022846836014, "flos": 21293980496640.0, "grad_norm": 1.4990863194017157, "language_loss": 0.84657419, "learning_rate": 4.797130333294652e-07, "loss": 0.92341375, "num_input_tokens_seen": 280395465, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10473633, "step": 13000, "time_per_iteration": 2.5286731719970703 }, { "auxiliary_loss_clip": 0.06421963, "auxiliary_loss_mlp": 0.01263335, "balance_loss_clip": 0.06282042, "balance_loss_mlp": 0.01252928, "epoch": 0.7816624079362694, "flos": 19214440266240.0, "grad_norm": 1.916952450228389, "language_loss": 0.66271055, "learning_rate": 4.794600073836192e-07, "loss": 0.73956347, "num_input_tokens_seen": 280412775, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.10400391, "step": 13001, "time_per_iteration": 2.536261558532715 }, { "auxiliary_loss_clip": 0.06419887, "auxiliary_loss_mlp": 0.01263833, "balance_loss_clip": 0.06276601, "balance_loss_mlp": 0.01253903, "epoch": 0.7817225311889373, "flos": 26111957938560.0, "grad_norm": 1.4421729875428086, "language_loss": 0.66707206, "learning_rate": 4.792070390968027e-07, "loss": 0.74390924, "num_input_tokens_seen": 280432905, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09924316, "step": 13002, "time_per_iteration": 2.589947462081909 }, { "auxiliary_loss_clip": 0.06423885, "auxiliary_loss_mlp": 0.01267675, "balance_loss_clip": 0.06281203, "balance_loss_mlp": 0.01256255, "epoch": 0.7817826544416053, "flos": 21257195754240.0, "grad_norm": 2.1660112114860497, "language_loss": 0.73805654, "learning_rate": 4.78954128478607e-07, "loss": 0.81497204, "num_input_tokens_seen": 280450785, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.11419678, "step": 13003, "time_per_iteration": 2.5433034896850586 }, { "auxiliary_loss_clip": 0.06415685, "auxiliary_loss_mlp": 0.01264963, "balance_loss_clip": 0.06276245, "balance_loss_mlp": 0.01255075, "epoch": 0.7818427776942732, "flos": 19937347367040.0, "grad_norm": 2.19010246659068, "language_loss": 0.62700903, "learning_rate": 4.787012755386233e-07, "loss": 0.70381552, "num_input_tokens_seen": 280468400, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09881592, "step": 13004, "time_per_iteration": 2.5719645023345947 }, { "auxiliary_loss_clip": 0.06412355, "auxiliary_loss_mlp": 0.01262528, "balance_loss_clip": 0.06277241, "balance_loss_mlp": 0.01253456, "epoch": 0.7819029009469413, "flos": 11368443502080.0, "grad_norm": 1.7602356147663492, "language_loss": 0.83033282, "learning_rate": 4.784484802864403e-07, "loss": 0.90708166, "num_input_tokens_seen": 280483930, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.09075928, "step": 13005, "time_per_iteration": 2.542332887649536 }, { "auxiliary_loss_clip": 0.06417058, "auxiliary_loss_mlp": 0.01266759, "balance_loss_clip": 0.06275932, "balance_loss_mlp": 0.01256369, "epoch": 0.7819630241996092, "flos": 24286053617280.0, "grad_norm": 1.9896689693431553, "language_loss": 0.73421228, "learning_rate": 4.781957427316432e-07, "loss": 0.81105042, "num_input_tokens_seen": 280503465, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.10388184, "step": 13006, "time_per_iteration": 2.5567829608917236 }, { "auxiliary_loss_clip": 0.06418569, "auxiliary_loss_mlp": 0.01264174, "balance_loss_clip": 0.0627521, "balance_loss_mlp": 0.01254017, "epoch": 0.7820231474522772, "flos": 22715168797440.0, "grad_norm": 1.5760169276544107, "language_loss": 0.71954334, "learning_rate": 4.779430628838157e-07, "loss": 0.79637074, "num_input_tokens_seen": 280523375, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.10150146, "step": 13007, "time_per_iteration": 2.570849657058716 }, { "auxiliary_loss_clip": 0.06420384, "auxiliary_loss_mlp": 0.01268499, "balance_loss_clip": 0.06276654, "balance_loss_mlp": 0.01258068, "epoch": 0.7820832707049451, "flos": 20053571379840.0, "grad_norm": 1.9734061449961806, "language_loss": 0.69289541, "learning_rate": 4.776904407525397e-07, "loss": 0.76978427, "num_input_tokens_seen": 280542920, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.10430908, "step": 13008, "time_per_iteration": 2.550365924835205 }, { "auxiliary_loss_clip": 0.06416301, "auxiliary_loss_mlp": 0.01267265, "balance_loss_clip": 0.06273814, "balance_loss_mlp": 0.01255636, "epoch": 0.7821433939576131, "flos": 27170246206080.0, "grad_norm": 1.6895188264252305, "language_loss": 0.7000668, "learning_rate": 4.774378763473954e-07, "loss": 0.7769025, "num_input_tokens_seen": 280561700, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.11633301, "step": 13009, "time_per_iteration": 3.99527645111084 }, { "auxiliary_loss_clip": 0.06414004, "auxiliary_loss_mlp": 0.01263679, "balance_loss_clip": 0.06274232, "balance_loss_mlp": 0.0125419, "epoch": 0.782203517210281, "flos": 22608755712000.0, "grad_norm": 2.2676489481886737, "language_loss": 0.81962931, "learning_rate": 4.771853696779586e-07, "loss": 0.89640617, "num_input_tokens_seen": 280580605, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09484863, "step": 13010, "time_per_iteration": 2.5304794311523438 }, { "auxiliary_loss_clip": 0.06418589, "auxiliary_loss_mlp": 0.01264567, "balance_loss_clip": 0.06280456, "balance_loss_mlp": 0.01254542, "epoch": 0.782263640462949, "flos": 29067539806080.0, "grad_norm": 1.4432657842940555, "language_loss": 0.62344015, "learning_rate": 4.76932920753806e-07, "loss": 0.70027167, "num_input_tokens_seen": 280601495, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.10028076, "step": 13011, "time_per_iteration": 2.6024816036224365 }, { "auxiliary_loss_clip": 0.06414035, "auxiliary_loss_mlp": 0.01265506, "balance_loss_clip": 0.06275965, "balance_loss_mlp": 0.01255898, "epoch": 0.782323763715617, "flos": 25306215477120.0, "grad_norm": 1.6092106171656202, "language_loss": 0.69870341, "learning_rate": 4.7668052958450913e-07, "loss": 0.77549881, "num_input_tokens_seen": 280622760, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09606934, "step": 13012, "time_per_iteration": 2.619828462600708 }, { "auxiliary_loss_clip": 0.06318425, "auxiliary_loss_mlp": 0.01250412, "balance_loss_clip": 0.06261855, "balance_loss_mlp": 0.01249131, "epoch": 0.782383886968285, "flos": 65216548195200.0, "grad_norm": 0.6956669195435657, "language_loss": 0.55057263, "learning_rate": 4.764281961796395e-07, "loss": 0.626261, "num_input_tokens_seen": 280687115, "router_z_loss_clip": 0.56591797, "router_z_loss_mlp": 0.01280975, "step": 13013, "time_per_iteration": 3.274096965789795 }, { "auxiliary_loss_clip": 0.06426166, "auxiliary_loss_mlp": 0.01266049, "balance_loss_clip": 0.06280579, "balance_loss_mlp": 0.0125594, "epoch": 0.782444010220953, "flos": 18411297281280.0, "grad_norm": 2.124780404600937, "language_loss": 0.65510321, "learning_rate": 4.76175920548765e-07, "loss": 0.73202538, "num_input_tokens_seen": 280705000, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10107422, "step": 13014, "time_per_iteration": 3.9414126873016357 }, { "auxiliary_loss_clip": 0.06324216, "auxiliary_loss_mlp": 0.01250689, "balance_loss_clip": 0.0626767, "balance_loss_mlp": 0.01249525, "epoch": 0.7825041334736209, "flos": 63977145327360.0, "grad_norm": 0.7142313470841622, "language_loss": 0.58247888, "learning_rate": 4.759237027014524e-07, "loss": 0.65822792, "num_input_tokens_seen": 280773525, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 0.0116272, "step": 13015, "time_per_iteration": 3.240997076034546 }, { "auxiliary_loss_clip": 0.0641925, "auxiliary_loss_mlp": 0.01267966, "balance_loss_clip": 0.06278467, "balance_loss_mlp": 0.01258299, "epoch": 0.7825642567262889, "flos": 20345585258880.0, "grad_norm": 1.7656156779469687, "language_loss": 0.75007176, "learning_rate": 4.756715426472666e-07, "loss": 0.82694393, "num_input_tokens_seen": 280791915, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09667969, "step": 13016, "time_per_iteration": 2.5377087593078613 }, { "auxiliary_loss_clip": 0.06420582, "auxiliary_loss_mlp": 0.01266171, "balance_loss_clip": 0.06277961, "balance_loss_mlp": 0.01254685, "epoch": 0.7826243799789568, "flos": 20268577756800.0, "grad_norm": 1.571785330765008, "language_loss": 0.75080657, "learning_rate": 4.7541944039576766e-07, "loss": 0.82767415, "num_input_tokens_seen": 280811460, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.11486816, "step": 13017, "time_per_iteration": 2.539702892303467 }, { "auxiliary_loss_clip": 0.06417505, "auxiliary_loss_mlp": 0.01267181, "balance_loss_clip": 0.06274296, "balance_loss_mlp": 0.01256733, "epoch": 0.7826845032316249, "flos": 21137743359360.0, "grad_norm": 2.1141730820115376, "language_loss": 0.76365954, "learning_rate": 4.7516739595651636e-07, "loss": 0.84050637, "num_input_tokens_seen": 280825415, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.10449219, "step": 13018, "time_per_iteration": 2.507554054260254 }, { "auxiliary_loss_clip": 0.06414375, "auxiliary_loss_mlp": 0.01269684, "balance_loss_clip": 0.06274229, "balance_loss_mlp": 0.01259057, "epoch": 0.7827446264842928, "flos": 22498862682240.0, "grad_norm": 1.3485547909027478, "language_loss": 0.7755934, "learning_rate": 4.749154093390708e-07, "loss": 0.85243398, "num_input_tokens_seen": 280845335, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.10620117, "step": 13019, "time_per_iteration": 2.5574889183044434 }, { "auxiliary_loss_clip": 0.06417127, "auxiliary_loss_mlp": 0.01262695, "balance_loss_clip": 0.06277531, "balance_loss_mlp": 0.01252872, "epoch": 0.7828047497369608, "flos": 28848298798080.0, "grad_norm": 1.5036165643798607, "language_loss": 0.68026066, "learning_rate": 4.746634805529852e-07, "loss": 0.75705892, "num_input_tokens_seen": 280867145, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09820557, "step": 13020, "time_per_iteration": 2.6489365100860596 }, { "auxiliary_loss_clip": 0.06417394, "auxiliary_loss_mlp": 0.01269293, "balance_loss_clip": 0.06276383, "balance_loss_mlp": 0.01258457, "epoch": 0.7828648729896287, "flos": 23264298529920.0, "grad_norm": 2.0283986027205034, "language_loss": 0.62791955, "learning_rate": 4.7441160960781325e-07, "loss": 0.70478642, "num_input_tokens_seen": 280886185, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.10827637, "step": 13021, "time_per_iteration": 2.6277108192443848 }, { "auxiliary_loss_clip": 0.06416968, "auxiliary_loss_mlp": 0.01265056, "balance_loss_clip": 0.06276505, "balance_loss_mlp": 0.01256276, "epoch": 0.7829249962422967, "flos": 25272826824960.0, "grad_norm": 1.648358071364048, "language_loss": 0.69501519, "learning_rate": 4.7415979651310636e-07, "loss": 0.77183545, "num_input_tokens_seen": 280907665, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.08789062, "step": 13022, "time_per_iteration": 2.572547435760498 }, { "auxiliary_loss_clip": 0.06322555, "auxiliary_loss_mlp": 0.01250835, "balance_loss_clip": 0.06266362, "balance_loss_mlp": 0.01249717, "epoch": 0.7829851194949646, "flos": 70742087441280.0, "grad_norm": 0.6577364673848773, "language_loss": 0.56177068, "learning_rate": 4.739080412784131e-07, "loss": 0.63750458, "num_input_tokens_seen": 280971405, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01119995, "step": 13023, "time_per_iteration": 3.2848291397094727 }, { "auxiliary_loss_clip": 0.06406708, "auxiliary_loss_mlp": 0.01268311, "balance_loss_clip": 0.06272051, "balance_loss_mlp": 0.0125915, "epoch": 0.7830452427476327, "flos": 25666977231360.0, "grad_norm": 1.5479486917452818, "language_loss": 0.67025435, "learning_rate": 4.736563439132792e-07, "loss": 0.74700451, "num_input_tokens_seen": 280989615, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.0916748, "step": 13024, "time_per_iteration": 2.5439465045928955 }, { "auxiliary_loss_clip": 0.06417692, "auxiliary_loss_mlp": 0.01264178, "balance_loss_clip": 0.06275285, "balance_loss_mlp": 0.01253324, "epoch": 0.7831053660003006, "flos": 22791002342400.0, "grad_norm": 1.5193206788722975, "language_loss": 0.77814239, "learning_rate": 4.734047044272498e-07, "loss": 0.85496104, "num_input_tokens_seen": 281009450, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.10858154, "step": 13025, "time_per_iteration": 4.027703046798706 }, { "auxiliary_loss_clip": 0.06415503, "auxiliary_loss_mlp": 0.01269711, "balance_loss_clip": 0.0627498, "balance_loss_mlp": 0.01259834, "epoch": 0.7831654892529686, "flos": 25819399008000.0, "grad_norm": 1.673517267271332, "language_loss": 0.78380466, "learning_rate": 4.731531228298673e-07, "loss": 0.86065674, "num_input_tokens_seen": 281028120, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09875488, "step": 13026, "time_per_iteration": 2.5686299800872803 }, { "auxiliary_loss_clip": 0.06416882, "auxiliary_loss_mlp": 0.01264756, "balance_loss_clip": 0.0627729, "balance_loss_mlp": 0.01255183, "epoch": 0.7832256125056366, "flos": 20776897751040.0, "grad_norm": 2.0691993415801244, "language_loss": 0.76086217, "learning_rate": 4.729015991306715e-07, "loss": 0.83767855, "num_input_tokens_seen": 281042130, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09564209, "step": 13027, "time_per_iteration": 2.505575180053711 }, { "auxiliary_loss_clip": 0.06409918, "auxiliary_loss_mlp": 0.01264509, "balance_loss_clip": 0.06272839, "balance_loss_mlp": 0.01254793, "epoch": 0.7832857357583045, "flos": 21512886088320.0, "grad_norm": 1.9942936475273663, "language_loss": 0.70758939, "learning_rate": 4.726501333391997e-07, "loss": 0.78433365, "num_input_tokens_seen": 281060945, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09716797, "step": 13028, "time_per_iteration": 2.5216970443725586 }, { "auxiliary_loss_clip": 0.06420402, "auxiliary_loss_mlp": 0.01268725, "balance_loss_clip": 0.06277075, "balance_loss_mlp": 0.01258992, "epoch": 0.7833458590109725, "flos": 18083714544000.0, "grad_norm": 2.160971687796165, "language_loss": 0.68676502, "learning_rate": 4.7239872546498774e-07, "loss": 0.76365626, "num_input_tokens_seen": 281079270, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.09735107, "step": 13029, "time_per_iteration": 2.502833843231201 }, { "auxiliary_loss_clip": 0.06417443, "auxiliary_loss_mlp": 0.01268853, "balance_loss_clip": 0.06272602, "balance_loss_mlp": 0.01258648, "epoch": 0.7834059822636404, "flos": 28295521413120.0, "grad_norm": 1.694023501817475, "language_loss": 0.80961251, "learning_rate": 4.721473755175698e-07, "loss": 0.88647544, "num_input_tokens_seen": 281099500, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.10211182, "step": 13030, "time_per_iteration": 3.9349772930145264 }, { "auxiliary_loss_clip": 0.06421214, "auxiliary_loss_mlp": 0.0126304, "balance_loss_clip": 0.06276122, "balance_loss_mlp": 0.01252961, "epoch": 0.7834661055163085, "flos": 31694281125120.0, "grad_norm": 1.8332141115036815, "language_loss": 0.70844406, "learning_rate": 4.71896083506476e-07, "loss": 0.78528655, "num_input_tokens_seen": 281121250, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.10083008, "step": 13031, "time_per_iteration": 2.642518997192383 }, { "auxiliary_loss_clip": 0.06420746, "auxiliary_loss_mlp": 0.01267311, "balance_loss_clip": 0.06276862, "balance_loss_mlp": 0.01257238, "epoch": 0.7835262287689764, "flos": 12938238218880.0, "grad_norm": 1.7305324529449082, "language_loss": 0.78765917, "learning_rate": 4.7164484944123574e-07, "loss": 0.86453968, "num_input_tokens_seen": 281138760, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.10070801, "step": 13032, "time_per_iteration": 2.521381378173828 }, { "auxiliary_loss_clip": 0.06418142, "auxiliary_loss_mlp": 0.01269889, "balance_loss_clip": 0.06274459, "balance_loss_mlp": 0.0125981, "epoch": 0.7835863520216444, "flos": 16148671879680.0, "grad_norm": 1.7549741640903531, "language_loss": 0.6358763, "learning_rate": 4.7139367333137726e-07, "loss": 0.71275663, "num_input_tokens_seen": 281157420, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10083008, "step": 13033, "time_per_iteration": 2.515493154525757 }, { "auxiliary_loss_clip": 0.06419344, "auxiliary_loss_mlp": 0.01264651, "balance_loss_clip": 0.06277658, "balance_loss_mlp": 0.01254375, "epoch": 0.7836464752743123, "flos": 11514660076800.0, "grad_norm": 1.43101353798403, "language_loss": 0.72259772, "learning_rate": 4.7114255518642255e-07, "loss": 0.79943764, "num_input_tokens_seen": 281174620, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.10272217, "step": 13034, "time_per_iteration": 2.508197546005249 }, { "auxiliary_loss_clip": 0.06418715, "auxiliary_loss_mlp": 0.01271308, "balance_loss_clip": 0.06275766, "balance_loss_mlp": 0.012608, "epoch": 0.7837065985269803, "flos": 18229637629440.0, "grad_norm": 1.7010280410393883, "language_loss": 0.72330046, "learning_rate": 4.7089149501589555e-07, "loss": 0.8002007, "num_input_tokens_seen": 281193865, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10510254, "step": 13035, "time_per_iteration": 2.553375005722046 }, { "auxiliary_loss_clip": 0.06418517, "auxiliary_loss_mlp": 0.01269464, "balance_loss_clip": 0.06275636, "balance_loss_mlp": 0.01258103, "epoch": 0.7837667217796482, "flos": 24761404229760.0, "grad_norm": 2.200493841803745, "language_loss": 0.66494632, "learning_rate": 4.7064049282931664e-07, "loss": 0.74182606, "num_input_tokens_seen": 281212250, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.11364746, "step": 13036, "time_per_iteration": 2.5706379413604736 }, { "auxiliary_loss_clip": 0.06426957, "auxiliary_loss_mlp": 0.01271732, "balance_loss_clip": 0.06279223, "balance_loss_mlp": 0.01260496, "epoch": 0.7838268450323163, "flos": 22389766266240.0, "grad_norm": 5.755283906507303, "language_loss": 0.73103559, "learning_rate": 4.703895486362031e-07, "loss": 0.8080225, "num_input_tokens_seen": 281230850, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.11230469, "step": 13037, "time_per_iteration": 2.545093059539795 }, { "auxiliary_loss_clip": 0.0641687, "auxiliary_loss_mlp": 0.0126494, "balance_loss_clip": 0.06275187, "balance_loss_mlp": 0.01254688, "epoch": 0.7838869682849842, "flos": 19506370291200.0, "grad_norm": 1.805777939687171, "language_loss": 0.60168391, "learning_rate": 4.701386624460717e-07, "loss": 0.67850208, "num_input_tokens_seen": 281249810, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.10253906, "step": 13038, "time_per_iteration": 2.504697799682617 }, { "auxiliary_loss_clip": 0.0641492, "auxiliary_loss_mlp": 0.01265109, "balance_loss_clip": 0.06275465, "balance_loss_mlp": 0.01255268, "epoch": 0.7839470915376522, "flos": 32901553152000.0, "grad_norm": 1.5536729553570006, "language_loss": 0.68111169, "learning_rate": 4.698878342684349e-07, "loss": 0.75791198, "num_input_tokens_seen": 281273730, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09838867, "step": 13039, "time_per_iteration": 2.6545164585113525 }, { "auxiliary_loss_clip": 0.0641236, "auxiliary_loss_mlp": 0.01261487, "balance_loss_clip": 0.06274924, "balance_loss_mlp": 0.01253088, "epoch": 0.7840072147903202, "flos": 29683153353600.0, "grad_norm": 1.760286940921342, "language_loss": 0.69714671, "learning_rate": 4.6963706411280537e-07, "loss": 0.77388513, "num_input_tokens_seen": 281293670, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.08398438, "step": 13040, "time_per_iteration": 2.5804269313812256 }, { "auxiliary_loss_clip": 0.06420173, "auxiliary_loss_mlp": 0.01268314, "balance_loss_clip": 0.06275981, "balance_loss_mlp": 0.0125755, "epoch": 0.7840673380429881, "flos": 18192601324800.0, "grad_norm": 1.4662706819799132, "language_loss": 0.67814028, "learning_rate": 4.6938635198869116e-07, "loss": 0.75502509, "num_input_tokens_seen": 281313070, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10766602, "step": 13041, "time_per_iteration": 2.5209314823150635 }, { "auxiliary_loss_clip": 0.06318741, "auxiliary_loss_mlp": 0.01251807, "balance_loss_clip": 0.06262255, "balance_loss_mlp": 0.01250688, "epoch": 0.7841274612956561, "flos": 66365694616320.0, "grad_norm": 0.8701570378089928, "language_loss": 0.57425404, "learning_rate": 4.691356979055998e-07, "loss": 0.6499595, "num_input_tokens_seen": 281374880, "router_z_loss_clip": 0.56298828, "router_z_loss_mlp": 0.01120758, "step": 13042, "time_per_iteration": 3.1133291721343994 }, { "auxiliary_loss_clip": 0.06420572, "auxiliary_loss_mlp": 0.01267802, "balance_loss_clip": 0.06277427, "balance_loss_mlp": 0.01258253, "epoch": 0.784187584548324, "flos": 26655259812480.0, "grad_norm": 1.882196489404152, "language_loss": 0.84500813, "learning_rate": 4.688851018730369e-07, "loss": 0.92189193, "num_input_tokens_seen": 281392620, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.09552002, "step": 13043, "time_per_iteration": 2.5653364658355713 }, { "auxiliary_loss_clip": 0.0641198, "auxiliary_loss_mlp": 0.01267092, "balance_loss_clip": 0.06274189, "balance_loss_mlp": 0.01257251, "epoch": 0.7842477078009921, "flos": 25747422750720.0, "grad_norm": 1.4860851600156904, "language_loss": 0.88447428, "learning_rate": 4.6863456390050425e-07, "loss": 0.96126497, "num_input_tokens_seen": 281413140, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09844971, "step": 13044, "time_per_iteration": 2.5559141635894775 }, { "auxiliary_loss_clip": 0.06428771, "auxiliary_loss_mlp": 0.0127402, "balance_loss_clip": 0.0628046, "balance_loss_mlp": 0.0126366, "epoch": 0.78430783105366, "flos": 21987398160000.0, "grad_norm": 1.6210749279672951, "language_loss": 0.79072797, "learning_rate": 4.6838408399750195e-07, "loss": 0.86775589, "num_input_tokens_seen": 281430860, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.10357666, "step": 13045, "time_per_iteration": 2.5487332344055176 }, { "auxiliary_loss_clip": 0.06415124, "auxiliary_loss_mlp": 0.01264496, "balance_loss_clip": 0.06274527, "balance_loss_mlp": 0.01254798, "epoch": 0.784367954306328, "flos": 23849122901760.0, "grad_norm": 1.3791178353929838, "language_loss": 0.72253156, "learning_rate": 4.6813366217352925e-07, "loss": 0.79932773, "num_input_tokens_seen": 281451385, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.0970459, "step": 13046, "time_per_iteration": 2.6249277591705322 }, { "auxiliary_loss_clip": 0.06416234, "auxiliary_loss_mlp": 0.01263434, "balance_loss_clip": 0.06278698, "balance_loss_mlp": 0.01253778, "epoch": 0.7844280775589959, "flos": 24833548195200.0, "grad_norm": 1.4367829696383632, "language_loss": 0.63011527, "learning_rate": 4.678832984380809e-07, "loss": 0.70691186, "num_input_tokens_seen": 281472255, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09661865, "step": 13047, "time_per_iteration": 2.582331657409668 }, { "auxiliary_loss_clip": 0.06410176, "auxiliary_loss_mlp": 0.01264487, "balance_loss_clip": 0.06273121, "balance_loss_mlp": 0.01254914, "epoch": 0.7844882008116639, "flos": 22462245648000.0, "grad_norm": 2.078950885573373, "language_loss": 0.73204613, "learning_rate": 4.676329928006515e-07, "loss": 0.80879277, "num_input_tokens_seen": 281492860, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09570312, "step": 13048, "time_per_iteration": 2.5517899990081787 }, { "auxiliary_loss_clip": 0.06421742, "auxiliary_loss_mlp": 0.01264583, "balance_loss_clip": 0.06276299, "balance_loss_mlp": 0.01253837, "epoch": 0.7845483240643318, "flos": 26111203251840.0, "grad_norm": 1.7901814435424852, "language_loss": 0.74828684, "learning_rate": 4.6738274527073243e-07, "loss": 0.82515001, "num_input_tokens_seen": 281511815, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10754395, "step": 13049, "time_per_iteration": 4.016519546508789 }, { "auxiliary_loss_clip": 0.06421074, "auxiliary_loss_mlp": 0.01264651, "balance_loss_clip": 0.06274801, "balance_loss_mlp": 0.01253004, "epoch": 0.7846084473169999, "flos": 19360363351680.0, "grad_norm": 1.7347025028809868, "language_loss": 0.72957164, "learning_rate": 4.6713255585781454e-07, "loss": 0.80642885, "num_input_tokens_seen": 281530090, "router_z_loss_clip": 1.46289062, "router_z_loss_mlp": 0.11657715, "step": 13050, "time_per_iteration": 2.527062177658081 }, { "auxiliary_loss_clip": 0.06417764, "auxiliary_loss_mlp": 0.01264191, "balance_loss_clip": 0.06276152, "balance_loss_mlp": 0.01254624, "epoch": 0.7846685705696678, "flos": 23331620885760.0, "grad_norm": 2.0291982940805133, "language_loss": 0.73865342, "learning_rate": 4.668824245713825e-07, "loss": 0.81547296, "num_input_tokens_seen": 281547075, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09564209, "step": 13051, "time_per_iteration": 2.547637939453125 }, { "auxiliary_loss_clip": 0.06422035, "auxiliary_loss_mlp": 0.01271417, "balance_loss_clip": 0.06278516, "balance_loss_mlp": 0.01260432, "epoch": 0.7847286938223358, "flos": 35818379706240.0, "grad_norm": 1.85829325577968, "language_loss": 0.72895849, "learning_rate": 4.666323514209227e-07, "loss": 0.80589306, "num_input_tokens_seen": 281568080, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10986328, "step": 13052, "time_per_iteration": 2.6705331802368164 }, { "auxiliary_loss_clip": 0.06409611, "auxiliary_loss_mlp": 0.01263983, "balance_loss_clip": 0.06274121, "balance_loss_mlp": 0.01254751, "epoch": 0.7847888170750038, "flos": 18483986298240.0, "grad_norm": 1.8048996048845694, "language_loss": 0.69400132, "learning_rate": 4.663823364159183e-07, "loss": 0.77073717, "num_input_tokens_seen": 281586925, "router_z_loss_clip": 1.35449219, "router_z_loss_mlp": 0.09234619, "step": 13053, "time_per_iteration": 2.5090620517730713 }, { "auxiliary_loss_clip": 0.06415418, "auxiliary_loss_mlp": 0.01264085, "balance_loss_clip": 0.06276038, "balance_loss_mlp": 0.01254935, "epoch": 0.7848489403276717, "flos": 25126190979840.0, "grad_norm": 1.864693840702884, "language_loss": 0.70260799, "learning_rate": 4.6613237956584893e-07, "loss": 0.77940297, "num_input_tokens_seen": 281603915, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.0914917, "step": 13054, "time_per_iteration": 4.007905006408691 }, { "auxiliary_loss_clip": 0.06421, "auxiliary_loss_mlp": 0.01264916, "balance_loss_clip": 0.06275867, "balance_loss_mlp": 0.01254699, "epoch": 0.7849090635803397, "flos": 26509169018880.0, "grad_norm": 1.8381107879846719, "language_loss": 0.75774914, "learning_rate": 4.658824808801938e-07, "loss": 0.83460832, "num_input_tokens_seen": 281624220, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.10211182, "step": 13055, "time_per_iteration": 2.5666277408599854 }, { "auxiliary_loss_clip": 0.06424683, "auxiliary_loss_mlp": 0.01263487, "balance_loss_clip": 0.06278484, "balance_loss_mlp": 0.0125289, "epoch": 0.7849691868330076, "flos": 20965978488960.0, "grad_norm": 1.7506059551344926, "language_loss": 0.75136542, "learning_rate": 4.656326403684283e-07, "loss": 0.82824719, "num_input_tokens_seen": 281642325, "router_z_loss_clip": 1.46289062, "router_z_loss_mlp": 0.10601807, "step": 13056, "time_per_iteration": 2.5387094020843506 }, { "auxiliary_loss_clip": 0.06414592, "auxiliary_loss_mlp": 0.01264584, "balance_loss_clip": 0.06274337, "balance_loss_mlp": 0.01255083, "epoch": 0.7850293100856757, "flos": 26074628144640.0, "grad_norm": 1.6415147002603183, "language_loss": 0.70546883, "learning_rate": 4.6538285804002744e-07, "loss": 0.78226066, "num_input_tokens_seen": 281663065, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.0949707, "step": 13057, "time_per_iteration": 2.566943407058716 }, { "auxiliary_loss_clip": 0.06422165, "auxiliary_loss_mlp": 0.01267731, "balance_loss_clip": 0.06278193, "balance_loss_mlp": 0.01257974, "epoch": 0.7850894333383436, "flos": 22498443411840.0, "grad_norm": 1.8269919220145527, "language_loss": 0.76859522, "learning_rate": 4.6513313390446175e-07, "loss": 0.84549421, "num_input_tokens_seen": 281681005, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.09759521, "step": 13058, "time_per_iteration": 2.5291683673858643 }, { "auxiliary_loss_clip": 0.06417432, "auxiliary_loss_mlp": 0.01265052, "balance_loss_clip": 0.0627529, "balance_loss_mlp": 0.01254544, "epoch": 0.7851495565910116, "flos": 20564952048000.0, "grad_norm": 2.013021519305239, "language_loss": 0.7109344, "learning_rate": 4.6488346797120146e-07, "loss": 0.78775924, "num_input_tokens_seen": 281697965, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.1050415, "step": 13059, "time_per_iteration": 2.555431365966797 }, { "auxiliary_loss_clip": 0.0642727, "auxiliary_loss_mlp": 0.01267905, "balance_loss_clip": 0.06277432, "balance_loss_mlp": 0.01256747, "epoch": 0.7852096798436795, "flos": 15930353266560.0, "grad_norm": 1.7396036142521527, "language_loss": 0.76576757, "learning_rate": 4.646338602497144e-07, "loss": 0.84271938, "num_input_tokens_seen": 281716035, "router_z_loss_clip": 1.49707031, "router_z_loss_mlp": 0.11169434, "step": 13060, "time_per_iteration": 2.5137009620666504 }, { "auxiliary_loss_clip": 0.06416243, "auxiliary_loss_mlp": 0.01265029, "balance_loss_clip": 0.06274231, "balance_loss_mlp": 0.01254861, "epoch": 0.7852698030963475, "flos": 19068265618560.0, "grad_norm": 1.949550671478908, "language_loss": 0.76980865, "learning_rate": 4.643843107494654e-07, "loss": 0.84662139, "num_input_tokens_seen": 281732815, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.10168457, "step": 13061, "time_per_iteration": 2.518465995788574 }, { "auxiliary_loss_clip": 0.0642086, "auxiliary_loss_mlp": 0.01267122, "balance_loss_clip": 0.06277492, "balance_loss_mlp": 0.0125665, "epoch": 0.7853299263490154, "flos": 24651259637760.0, "grad_norm": 2.1561761309927587, "language_loss": 0.74339348, "learning_rate": 4.641348194799164e-07, "loss": 0.82027334, "num_input_tokens_seen": 281751980, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.10473633, "step": 13062, "time_per_iteration": 2.5505740642547607 }, { "auxiliary_loss_clip": 0.06415182, "auxiliary_loss_mlp": 0.01263558, "balance_loss_clip": 0.06276324, "balance_loss_mlp": 0.01254415, "epoch": 0.7853900496016835, "flos": 22024518318720.0, "grad_norm": 1.8294078326472225, "language_loss": 0.68923402, "learning_rate": 4.638853864505297e-07, "loss": 0.76602149, "num_input_tokens_seen": 281772670, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09136963, "step": 13063, "time_per_iteration": 2.5491538047790527 }, { "auxiliary_loss_clip": 0.06418088, "auxiliary_loss_mlp": 0.01263892, "balance_loss_clip": 0.06279279, "balance_loss_mlp": 0.01254278, "epoch": 0.7854501728543514, "flos": 30235343760000.0, "grad_norm": 1.8250584656811224, "language_loss": 0.73348713, "learning_rate": 4.636360116707625e-07, "loss": 0.81030691, "num_input_tokens_seen": 281792930, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09625244, "step": 13064, "time_per_iteration": 4.027876377105713 }, { "auxiliary_loss_clip": 0.06417358, "auxiliary_loss_mlp": 0.01265501, "balance_loss_clip": 0.06273046, "balance_loss_mlp": 0.01255368, "epoch": 0.7855102961070194, "flos": 18849695443200.0, "grad_norm": 1.8886059478305939, "language_loss": 0.68025649, "learning_rate": 4.633866951500718e-07, "loss": 0.75708508, "num_input_tokens_seen": 281811805, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10144043, "step": 13065, "time_per_iteration": 2.4995267391204834 }, { "auxiliary_loss_clip": 0.06421427, "auxiliary_loss_mlp": 0.0126459, "balance_loss_clip": 0.06280211, "balance_loss_mlp": 0.01254624, "epoch": 0.7855704193596874, "flos": 22316574124800.0, "grad_norm": 1.7441793268628998, "language_loss": 0.76012826, "learning_rate": 4.6313743689791196e-07, "loss": 0.83698845, "num_input_tokens_seen": 281831885, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09960938, "step": 13066, "time_per_iteration": 2.5623302459716797 }, { "auxiliary_loss_clip": 0.0631414, "auxiliary_loss_mlp": 0.01250963, "balance_loss_clip": 0.06257718, "balance_loss_mlp": 0.01249743, "epoch": 0.7856305426123553, "flos": 60024224638080.0, "grad_norm": 0.6914972347288603, "language_loss": 0.53391159, "learning_rate": 4.628882369237346e-07, "loss": 0.60956258, "num_input_tokens_seen": 281900310, "router_z_loss_clip": 0.56494141, "router_z_loss_mlp": 0.01219177, "step": 13067, "time_per_iteration": 3.2263412475585938 }, { "auxiliary_loss_clip": 0.06417099, "auxiliary_loss_mlp": 0.01265202, "balance_loss_clip": 0.06273651, "balance_loss_mlp": 0.01254962, "epoch": 0.7856906658650233, "flos": 21874528310400.0, "grad_norm": 1.5388129754458169, "language_loss": 0.67544276, "learning_rate": 4.62639095236989e-07, "loss": 0.75226581, "num_input_tokens_seen": 281918870, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.10241699, "step": 13068, "time_per_iteration": 2.6400339603424072 }, { "auxiliary_loss_clip": 0.06411258, "auxiliary_loss_mlp": 0.0126386, "balance_loss_clip": 0.06272146, "balance_loss_mlp": 0.01254473, "epoch": 0.7857507891176913, "flos": 23629672258560.0, "grad_norm": 1.7691790237671, "language_loss": 0.68218172, "learning_rate": 4.6239001184712267e-07, "loss": 0.75893289, "num_input_tokens_seen": 281936905, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09399414, "step": 13069, "time_per_iteration": 2.5786306858062744 }, { "auxiliary_loss_clip": 0.06419815, "auxiliary_loss_mlp": 0.01264313, "balance_loss_clip": 0.06278355, "balance_loss_mlp": 0.01254145, "epoch": 0.7858109123703593, "flos": 25527091639680.0, "grad_norm": 1.576303206287746, "language_loss": 0.76959723, "learning_rate": 4.6214098676358195e-07, "loss": 0.84643853, "num_input_tokens_seen": 281955625, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.10162354, "step": 13070, "time_per_iteration": 3.880463123321533 }, { "auxiliary_loss_clip": 0.06411015, "auxiliary_loss_mlp": 0.01263117, "balance_loss_clip": 0.0627269, "balance_loss_mlp": 0.01253926, "epoch": 0.7858710356230272, "flos": 17463195532800.0, "grad_norm": 1.6211993448070872, "language_loss": 0.66320968, "learning_rate": 4.618920199958083e-07, "loss": 0.73995101, "num_input_tokens_seen": 281973285, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09197998, "step": 13071, "time_per_iteration": 2.493189811706543 }, { "auxiliary_loss_clip": 0.06422415, "auxiliary_loss_mlp": 0.01263407, "balance_loss_clip": 0.06278136, "balance_loss_mlp": 0.01253429, "epoch": 0.7859311588756952, "flos": 24686367298560.0, "grad_norm": 1.5318234447770778, "language_loss": 0.74133754, "learning_rate": 4.616431115532442e-07, "loss": 0.8181957, "num_input_tokens_seen": 281991410, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.09979248, "step": 13072, "time_per_iteration": 2.580970525741577 }, { "auxiliary_loss_clip": 0.06417383, "auxiliary_loss_mlp": 0.01265606, "balance_loss_clip": 0.06274673, "balance_loss_mlp": 0.01254859, "epoch": 0.7859912821283631, "flos": 21805654654080.0, "grad_norm": 5.387795299815506, "language_loss": 0.71447027, "learning_rate": 4.613942614453268e-07, "loss": 0.79130018, "num_input_tokens_seen": 282010845, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10748291, "step": 13073, "time_per_iteration": 2.5309150218963623 }, { "auxiliary_loss_clip": 0.06420229, "auxiliary_loss_mlp": 0.01269148, "balance_loss_clip": 0.06278132, "balance_loss_mlp": 0.01258383, "epoch": 0.7860514053810311, "flos": 20853108639360.0, "grad_norm": 1.5595091971819224, "language_loss": 0.7694521, "learning_rate": 4.611454696814938e-07, "loss": 0.84634584, "num_input_tokens_seen": 282029635, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10766602, "step": 13074, "time_per_iteration": 2.5308048725128174 }, { "auxiliary_loss_clip": 0.06410009, "auxiliary_loss_mlp": 0.01266224, "balance_loss_clip": 0.06272233, "balance_loss_mlp": 0.01256437, "epoch": 0.786111528633699, "flos": 24322461016320.0, "grad_norm": 1.666465380370139, "language_loss": 0.75416172, "learning_rate": 4.608967362711782e-07, "loss": 0.83092403, "num_input_tokens_seen": 282050285, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09796143, "step": 13075, "time_per_iteration": 2.5443038940429688 }, { "auxiliary_loss_clip": 0.06415707, "auxiliary_loss_mlp": 0.01262897, "balance_loss_clip": 0.06274832, "balance_loss_mlp": 0.01253748, "epoch": 0.7861716518863671, "flos": 24360126226560.0, "grad_norm": 1.5015783437903858, "language_loss": 0.69030416, "learning_rate": 4.6064806122381283e-07, "loss": 0.7670902, "num_input_tokens_seen": 282071040, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.0914917, "step": 13076, "time_per_iteration": 2.5719408988952637 }, { "auxiliary_loss_clip": 0.06414139, "auxiliary_loss_mlp": 0.01268581, "balance_loss_clip": 0.06275868, "balance_loss_mlp": 0.01258871, "epoch": 0.786231775139035, "flos": 14026728683520.0, "grad_norm": 2.0062580233040106, "language_loss": 0.80619872, "learning_rate": 4.603994445488282e-07, "loss": 0.88302588, "num_input_tokens_seen": 282086610, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09710693, "step": 13077, "time_per_iteration": 2.5067951679229736 }, { "auxiliary_loss_clip": 0.06415136, "auxiliary_loss_mlp": 0.01267246, "balance_loss_clip": 0.06275202, "balance_loss_mlp": 0.01257167, "epoch": 0.786291898391703, "flos": 33731795733120.0, "grad_norm": 1.6647305197698252, "language_loss": 0.70795739, "learning_rate": 4.6015088625564956e-07, "loss": 0.78478134, "num_input_tokens_seen": 282107440, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.10076904, "step": 13078, "time_per_iteration": 2.681349277496338 }, { "auxiliary_loss_clip": 0.06414531, "auxiliary_loss_mlp": 0.01265282, "balance_loss_clip": 0.06275711, "balance_loss_mlp": 0.01255572, "epoch": 0.786352021644371, "flos": 25818476613120.0, "grad_norm": 1.4612009783421502, "language_loss": 0.81351626, "learning_rate": 4.599023863537039e-07, "loss": 0.8903144, "num_input_tokens_seen": 282127290, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.0970459, "step": 13079, "time_per_iteration": 2.5640084743499756 }, { "auxiliary_loss_clip": 0.06407483, "auxiliary_loss_mlp": 0.01268405, "balance_loss_clip": 0.06273205, "balance_loss_mlp": 0.01258987, "epoch": 0.7864121448970389, "flos": 28918010995200.0, "grad_norm": 1.9571733101148105, "language_loss": 0.68795538, "learning_rate": 4.596539448524146e-07, "loss": 0.76471424, "num_input_tokens_seen": 282147505, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.09417725, "step": 13080, "time_per_iteration": 2.632690906524658 }, { "auxiliary_loss_clip": 0.06414615, "auxiliary_loss_mlp": 0.01264604, "balance_loss_clip": 0.06273714, "balance_loss_mlp": 0.01253917, "epoch": 0.7864722681497069, "flos": 19214943390720.0, "grad_norm": 1.992541011208538, "language_loss": 0.69923609, "learning_rate": 4.594055617612016e-07, "loss": 0.77602828, "num_input_tokens_seen": 282166450, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.10675049, "step": 13081, "time_per_iteration": 2.535670518875122 }, { "auxiliary_loss_clip": 0.06421679, "auxiliary_loss_mlp": 0.01265735, "balance_loss_clip": 0.06278408, "balance_loss_mlp": 0.01255918, "epoch": 0.7865323914023749, "flos": 21878008254720.0, "grad_norm": 1.6718992101557308, "language_loss": 0.68763846, "learning_rate": 4.591572370894838e-07, "loss": 0.76451266, "num_input_tokens_seen": 282186465, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.0980835, "step": 13082, "time_per_iteration": 2.55197811126709 }, { "auxiliary_loss_clip": 0.06413787, "auxiliary_loss_mlp": 0.0126567, "balance_loss_clip": 0.06275102, "balance_loss_mlp": 0.01255973, "epoch": 0.7865925146550429, "flos": 25527385128960.0, "grad_norm": 1.8296949270652292, "language_loss": 0.66623771, "learning_rate": 4.589089708466789e-07, "loss": 0.74303234, "num_input_tokens_seen": 282207180, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.0970459, "step": 13083, "time_per_iteration": 2.5734152793884277 }, { "auxiliary_loss_clip": 0.06423616, "auxiliary_loss_mlp": 0.01267731, "balance_loss_clip": 0.06278157, "balance_loss_mlp": 0.01256144, "epoch": 0.7866526379077108, "flos": 19103121717120.0, "grad_norm": 2.8636219722005745, "language_loss": 0.75255829, "learning_rate": 4.5866076304220015e-07, "loss": 0.82947171, "num_input_tokens_seen": 282225865, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.11578369, "step": 13084, "time_per_iteration": 2.5139212608337402 }, { "auxiliary_loss_clip": 0.06412815, "auxiliary_loss_mlp": 0.01268218, "balance_loss_clip": 0.06273844, "balance_loss_mlp": 0.01258372, "epoch": 0.7867127611603788, "flos": 16178245171200.0, "grad_norm": 1.895386237259159, "language_loss": 0.70226467, "learning_rate": 4.584126136854591e-07, "loss": 0.77907497, "num_input_tokens_seen": 282242895, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09838867, "step": 13085, "time_per_iteration": 2.481931209564209 }, { "auxiliary_loss_clip": 0.06419562, "auxiliary_loss_mlp": 0.01267506, "balance_loss_clip": 0.06272945, "balance_loss_mlp": 0.01256461, "epoch": 0.7867728844130467, "flos": 20779329519360.0, "grad_norm": 1.7058364110336768, "language_loss": 0.72613096, "learning_rate": 4.5816452278586617e-07, "loss": 0.80300164, "num_input_tokens_seen": 282260425, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.1104126, "step": 13086, "time_per_iteration": 2.532674789428711 }, { "auxiliary_loss_clip": 0.06419727, "auxiliary_loss_mlp": 0.01266449, "balance_loss_clip": 0.06278694, "balance_loss_mlp": 0.01257144, "epoch": 0.7868330076657147, "flos": 21766186581120.0, "grad_norm": 1.7507119763302896, "language_loss": 0.7486093, "learning_rate": 4.5791649035282965e-07, "loss": 0.8254711, "num_input_tokens_seen": 282279335, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09301758, "step": 13087, "time_per_iteration": 2.5216732025146484 }, { "auxiliary_loss_clip": 0.06413069, "auxiliary_loss_mlp": 0.01267179, "balance_loss_clip": 0.06274083, "balance_loss_mlp": 0.01257536, "epoch": 0.7868931309183826, "flos": 25707451553280.0, "grad_norm": 2.4039091941988935, "language_loss": 0.71549773, "learning_rate": 4.5766851639575456e-07, "loss": 0.79230022, "num_input_tokens_seen": 282299905, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09643555, "step": 13088, "time_per_iteration": 2.559548854827881 }, { "auxiliary_loss_clip": 0.06315652, "auxiliary_loss_mlp": 0.01250761, "balance_loss_clip": 0.06259474, "balance_loss_mlp": 0.01249401, "epoch": 0.7869532541710507, "flos": 64666579921920.0, "grad_norm": 0.6614471506556646, "language_loss": 0.55082965, "learning_rate": 4.574206009240431e-07, "loss": 0.62649369, "num_input_tokens_seen": 282367620, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.0136261, "step": 13089, "time_per_iteration": 4.661330938339233 }, { "auxiliary_loss_clip": 0.06316496, "auxiliary_loss_mlp": 0.01250925, "balance_loss_clip": 0.06260327, "balance_loss_mlp": 0.01249609, "epoch": 0.7870133774237186, "flos": 67475651725440.0, "grad_norm": 0.7085975552877544, "language_loss": 0.49561042, "learning_rate": 4.571727439470976e-07, "loss": 0.57128459, "num_input_tokens_seen": 282435695, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01316833, "step": 13090, "time_per_iteration": 3.2508184909820557 }, { "auxiliary_loss_clip": 0.064114, "auxiliary_loss_mlp": 0.0126428, "balance_loss_clip": 0.06273969, "balance_loss_mlp": 0.01254839, "epoch": 0.7870735006763866, "flos": 26075592466560.0, "grad_norm": 1.4280796608223087, "language_loss": 0.83945596, "learning_rate": 4.5692494547431583e-07, "loss": 0.9162128, "num_input_tokens_seen": 282456025, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09442139, "step": 13091, "time_per_iteration": 2.597332239151001 }, { "auxiliary_loss_clip": 0.06315254, "auxiliary_loss_mlp": 0.01252026, "balance_loss_clip": 0.06259339, "balance_loss_mlp": 0.01250779, "epoch": 0.7871336239290546, "flos": 70310439532800.0, "grad_norm": 0.6975751224390392, "language_loss": 0.63943034, "learning_rate": 4.566772055150947e-07, "loss": 0.71510315, "num_input_tokens_seen": 282520995, "router_z_loss_clip": 0.56054688, "router_z_loss_mlp": 0.01246643, "step": 13092, "time_per_iteration": 3.2398548126220703 }, { "auxiliary_loss_clip": 0.06417941, "auxiliary_loss_mlp": 0.01265942, "balance_loss_clip": 0.06276549, "balance_loss_mlp": 0.012556, "epoch": 0.7871937471817225, "flos": 15784010910720.0, "grad_norm": 1.9289188632974368, "language_loss": 0.79834282, "learning_rate": 4.564295240788285e-07, "loss": 0.87518173, "num_input_tokens_seen": 282539355, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.10339355, "step": 13093, "time_per_iteration": 4.116144180297852 }, { "auxiliary_loss_clip": 0.06413057, "auxiliary_loss_mlp": 0.01263461, "balance_loss_clip": 0.06274468, "balance_loss_mlp": 0.01254229, "epoch": 0.7872538704343905, "flos": 20491466417280.0, "grad_norm": 1.9874579949457496, "language_loss": 0.76053119, "learning_rate": 4.561819011749106e-07, "loss": 0.83729637, "num_input_tokens_seen": 282555735, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09234619, "step": 13094, "time_per_iteration": 2.5993340015411377 }, { "auxiliary_loss_clip": 0.06421779, "auxiliary_loss_mlp": 0.01266388, "balance_loss_clip": 0.06278543, "balance_loss_mlp": 0.01256356, "epoch": 0.7873139936870585, "flos": 25089699726720.0, "grad_norm": 2.386277367208037, "language_loss": 0.79655504, "learning_rate": 4.5593433681272884e-07, "loss": 0.87343669, "num_input_tokens_seen": 282574550, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.1003418, "step": 13095, "time_per_iteration": 2.6041297912597656 }, { "auxiliary_loss_clip": 0.06420369, "auxiliary_loss_mlp": 0.0126526, "balance_loss_clip": 0.06275047, "balance_loss_mlp": 0.01254895, "epoch": 0.7873741169397265, "flos": 30891054286080.0, "grad_norm": 2.211815315609535, "language_loss": 0.68030119, "learning_rate": 4.556868310016715e-07, "loss": 0.7571575, "num_input_tokens_seen": 282596520, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.10375977, "step": 13096, "time_per_iteration": 2.674670696258545 }, { "auxiliary_loss_clip": 0.06408206, "auxiliary_loss_mlp": 0.01263823, "balance_loss_clip": 0.06272028, "balance_loss_mlp": 0.0125471, "epoch": 0.7874342401923944, "flos": 46802666165760.0, "grad_norm": 1.4199957339747293, "language_loss": 0.70986098, "learning_rate": 4.55439383751125e-07, "loss": 0.78658128, "num_input_tokens_seen": 282620560, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09106445, "step": 13097, "time_per_iteration": 2.801823616027832 }, { "auxiliary_loss_clip": 0.06426987, "auxiliary_loss_mlp": 0.01272832, "balance_loss_clip": 0.062815, "balance_loss_mlp": 0.01262288, "epoch": 0.7874943634450624, "flos": 23590958872320.0, "grad_norm": 1.7468497721534122, "language_loss": 0.80990088, "learning_rate": 4.5519199507047126e-07, "loss": 0.88689911, "num_input_tokens_seen": 282639830, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.10540771, "step": 13098, "time_per_iteration": 2.600625991821289 }, { "auxiliary_loss_clip": 0.06414886, "auxiliary_loss_mlp": 0.01264537, "balance_loss_clip": 0.06273808, "balance_loss_mlp": 0.01255102, "epoch": 0.7875544866977303, "flos": 20196978842880.0, "grad_norm": 1.6128388015537853, "language_loss": 0.74304605, "learning_rate": 4.5494466496909177e-07, "loss": 0.81984025, "num_input_tokens_seen": 282660130, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09436035, "step": 13099, "time_per_iteration": 2.5723350048065186 }, { "auxiliary_loss_clip": 0.06418964, "auxiliary_loss_mlp": 0.01265281, "balance_loss_clip": 0.06278609, "balance_loss_mlp": 0.01255345, "epoch": 0.7876146099503983, "flos": 22609342690560.0, "grad_norm": 1.616155085881381, "language_loss": 0.78264201, "learning_rate": 4.5469739345636603e-07, "loss": 0.85948443, "num_input_tokens_seen": 282681125, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09936523, "step": 13100, "time_per_iteration": 2.5853374004364014 }, { "auxiliary_loss_clip": 0.0642667, "auxiliary_loss_mlp": 0.01265752, "balance_loss_clip": 0.06278475, "balance_loss_mlp": 0.01254862, "epoch": 0.7876747332030662, "flos": 10710217353600.0, "grad_norm": 3.2738124574389347, "language_loss": 0.66202581, "learning_rate": 4.5445018054167007e-07, "loss": 0.73895001, "num_input_tokens_seen": 282696690, "router_z_loss_clip": 1.48242188, "router_z_loss_mlp": 0.10882568, "step": 13101, "time_per_iteration": 2.5283634662628174 }, { "auxiliary_loss_clip": 0.06420481, "auxiliary_loss_mlp": 0.01263934, "balance_loss_clip": 0.06278804, "balance_loss_mlp": 0.01254159, "epoch": 0.7877348564557343, "flos": 38408462064000.0, "grad_norm": 1.3913229183334428, "language_loss": 0.78079927, "learning_rate": 4.5420302623437745e-07, "loss": 0.85764343, "num_input_tokens_seen": 282721210, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09777832, "step": 13102, "time_per_iteration": 2.6768198013305664 }, { "auxiliary_loss_clip": 0.06415643, "auxiliary_loss_mlp": 0.01266458, "balance_loss_clip": 0.06274473, "balance_loss_mlp": 0.01256962, "epoch": 0.7877949797084022, "flos": 18334876757760.0, "grad_norm": 1.8413248052759636, "language_loss": 0.82679349, "learning_rate": 4.5395593054386093e-07, "loss": 0.90361452, "num_input_tokens_seen": 282738505, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09490967, "step": 13103, "time_per_iteration": 2.525400161743164 }, { "auxiliary_loss_clip": 0.06423663, "auxiliary_loss_mlp": 0.0126388, "balance_loss_clip": 0.06279309, "balance_loss_mlp": 0.01254051, "epoch": 0.7878551029610702, "flos": 25812942243840.0, "grad_norm": 1.9044000016490272, "language_loss": 0.81178963, "learning_rate": 4.537088934794913e-07, "loss": 0.88866508, "num_input_tokens_seen": 282756895, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.0982666, "step": 13104, "time_per_iteration": 4.01233172416687 }, { "auxiliary_loss_clip": 0.06418113, "auxiliary_loss_mlp": 0.01267754, "balance_loss_clip": 0.0627607, "balance_loss_mlp": 0.01257085, "epoch": 0.7879152262137382, "flos": 22348663038720.0, "grad_norm": 1.5307359852873754, "language_loss": 0.74079913, "learning_rate": 4.5346191505063515e-07, "loss": 0.81765783, "num_input_tokens_seen": 282774955, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.10675049, "step": 13105, "time_per_iteration": 2.526228904724121 }, { "auxiliary_loss_clip": 0.0642284, "auxiliary_loss_mlp": 0.01266794, "balance_loss_clip": 0.06277008, "balance_loss_mlp": 0.01256565, "epoch": 0.7879753494664061, "flos": 24791396791680.0, "grad_norm": 1.6923243343653265, "language_loss": 0.75364983, "learning_rate": 4.5321499526665776e-07, "loss": 0.83054614, "num_input_tokens_seen": 282793165, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10229492, "step": 13106, "time_per_iteration": 2.593798875808716 }, { "auxiliary_loss_clip": 0.06417888, "auxiliary_loss_mlp": 0.01265778, "balance_loss_clip": 0.06275098, "balance_loss_mlp": 0.01255484, "epoch": 0.7880354727190741, "flos": 16914610851840.0, "grad_norm": 1.9619709830717302, "language_loss": 0.73230517, "learning_rate": 4.5296813413692337e-07, "loss": 0.80914176, "num_input_tokens_seen": 282809820, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.10296631, "step": 13107, "time_per_iteration": 2.5115435123443604 }, { "auxiliary_loss_clip": 0.06414932, "auxiliary_loss_mlp": 0.01265416, "balance_loss_clip": 0.062755, "balance_loss_mlp": 0.01255432, "epoch": 0.7880955959717421, "flos": 22236002824320.0, "grad_norm": 2.170434058329103, "language_loss": 0.73589015, "learning_rate": 4.5272133167079165e-07, "loss": 0.81269366, "num_input_tokens_seen": 282828600, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09997559, "step": 13108, "time_per_iteration": 2.591219663619995 }, { "auxiliary_loss_clip": 0.06312449, "auxiliary_loss_mlp": 0.0125157, "balance_loss_clip": 0.06256323, "balance_loss_mlp": 0.01250249, "epoch": 0.7881557192244101, "flos": 69201907943040.0, "grad_norm": 0.87056558712613, "language_loss": 0.60293025, "learning_rate": 4.5247458787762216e-07, "loss": 0.67857039, "num_input_tokens_seen": 282882775, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01321411, "step": 13109, "time_per_iteration": 3.1129400730133057 }, { "auxiliary_loss_clip": 0.06413642, "auxiliary_loss_mlp": 0.01263394, "balance_loss_clip": 0.0627664, "balance_loss_mlp": 0.01253833, "epoch": 0.788215842477078, "flos": 24942225340800.0, "grad_norm": 1.5846297785532917, "language_loss": 0.72296512, "learning_rate": 4.5222790276677126e-07, "loss": 0.79973555, "num_input_tokens_seen": 282902680, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09552002, "step": 13110, "time_per_iteration": 3.9691007137298584 }, { "auxiliary_loss_clip": 0.0641021, "auxiliary_loss_mlp": 0.01264168, "balance_loss_clip": 0.06273024, "balance_loss_mlp": 0.01254762, "epoch": 0.788275965729746, "flos": 26114054290560.0, "grad_norm": 1.3297127446781778, "language_loss": 0.75339365, "learning_rate": 4.5198127634759455e-07, "loss": 0.83013743, "num_input_tokens_seen": 282923625, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09411621, "step": 13111, "time_per_iteration": 2.6138932704925537 }, { "auxiliary_loss_clip": 0.06416693, "auxiliary_loss_mlp": 0.01268701, "balance_loss_clip": 0.06275984, "balance_loss_mlp": 0.01258831, "epoch": 0.7883360889824139, "flos": 21221123771520.0, "grad_norm": 1.9613625229385059, "language_loss": 0.62087744, "learning_rate": 4.5173470862944206e-07, "loss": 0.69773138, "num_input_tokens_seen": 282941955, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09869385, "step": 13112, "time_per_iteration": 2.550601005554199 }, { "auxiliary_loss_clip": 0.06415869, "auxiliary_loss_mlp": 0.01269154, "balance_loss_clip": 0.06274973, "balance_loss_mlp": 0.01258884, "epoch": 0.7883962122350819, "flos": 21148979806080.0, "grad_norm": 1.6770150886270925, "language_loss": 0.67312568, "learning_rate": 4.514881996216644e-07, "loss": 0.74997598, "num_input_tokens_seen": 282961280, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.10272217, "step": 13113, "time_per_iteration": 2.545370101928711 }, { "auxiliary_loss_clip": 0.06412554, "auxiliary_loss_mlp": 0.01265783, "balance_loss_clip": 0.06273371, "balance_loss_mlp": 0.01255984, "epoch": 0.7884563354877498, "flos": 15308031392640.0, "grad_norm": 2.277182721146161, "language_loss": 0.58740205, "learning_rate": 4.5124174933361e-07, "loss": 0.6641854, "num_input_tokens_seen": 282978210, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09790039, "step": 13114, "time_per_iteration": 2.5034852027893066 }, { "auxiliary_loss_clip": 0.0641963, "auxiliary_loss_mlp": 0.01265174, "balance_loss_clip": 0.06274758, "balance_loss_mlp": 0.01255167, "epoch": 0.7885164587404179, "flos": 24395024252160.0, "grad_norm": 1.5831368992852257, "language_loss": 0.67135298, "learning_rate": 4.5099535777462306e-07, "loss": 0.74820101, "num_input_tokens_seen": 282998845, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.10009766, "step": 13115, "time_per_iteration": 2.650559902191162 }, { "auxiliary_loss_clip": 0.06419607, "auxiliary_loss_mlp": 0.0126783, "balance_loss_clip": 0.06278527, "balance_loss_mlp": 0.01257315, "epoch": 0.7885765819930858, "flos": 14390047987200.0, "grad_norm": 1.909934056538016, "language_loss": 0.88613087, "learning_rate": 4.50749024954048e-07, "loss": 0.96300519, "num_input_tokens_seen": 283015200, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.10522461, "step": 13116, "time_per_iteration": 2.5855116844177246 }, { "auxiliary_loss_clip": 0.06429234, "auxiliary_loss_mlp": 0.01271384, "balance_loss_clip": 0.06277809, "balance_loss_mlp": 0.01259434, "epoch": 0.7886367052457538, "flos": 18265835393280.0, "grad_norm": 2.913642529841651, "language_loss": 0.73331535, "learning_rate": 4.505027508812245e-07, "loss": 0.81032151, "num_input_tokens_seen": 283033680, "router_z_loss_clip": 1.51269531, "router_z_loss_mlp": 0.11962891, "step": 13117, "time_per_iteration": 2.5468263626098633 }, { "auxiliary_loss_clip": 0.06413744, "auxiliary_loss_mlp": 0.01264387, "balance_loss_clip": 0.06275852, "balance_loss_mlp": 0.01255035, "epoch": 0.7886968284984217, "flos": 15310588942080.0, "grad_norm": 1.5715917513366484, "language_loss": 0.80140555, "learning_rate": 4.502565355654926e-07, "loss": 0.87818682, "num_input_tokens_seen": 283050620, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09350586, "step": 13118, "time_per_iteration": 2.5026519298553467 }, { "auxiliary_loss_clip": 0.06415774, "auxiliary_loss_mlp": 0.01265242, "balance_loss_clip": 0.06275433, "balance_loss_mlp": 0.0125573, "epoch": 0.7887569517510897, "flos": 21221878458240.0, "grad_norm": 1.8109303685666875, "language_loss": 0.7314471, "learning_rate": 4.500103790161878e-07, "loss": 0.80825722, "num_input_tokens_seen": 283070215, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09509277, "step": 13119, "time_per_iteration": 2.5587801933288574 }, { "auxiliary_loss_clip": 0.0641742, "auxiliary_loss_mlp": 0.01265373, "balance_loss_clip": 0.06274071, "balance_loss_mlp": 0.01255669, "epoch": 0.7888170750037578, "flos": 22717894055040.0, "grad_norm": 1.4778560488742218, "language_loss": 0.72101367, "learning_rate": 4.4976428124264454e-07, "loss": 0.79784155, "num_input_tokens_seen": 283091485, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.09698486, "step": 13120, "time_per_iteration": 2.5593037605285645 }, { "auxiliary_loss_clip": 0.06416614, "auxiliary_loss_mlp": 0.01273192, "balance_loss_clip": 0.06276245, "balance_loss_mlp": 0.01262517, "epoch": 0.7888771982564257, "flos": 36437976322560.0, "grad_norm": 1.4972137643232175, "language_loss": 0.78946453, "learning_rate": 4.4951824225419564e-07, "loss": 0.86636257, "num_input_tokens_seen": 283115040, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.10681152, "step": 13121, "time_per_iteration": 2.6799709796905518 }, { "auxiliary_loss_clip": 0.06412953, "auxiliary_loss_mlp": 0.01267215, "balance_loss_clip": 0.06274765, "balance_loss_mlp": 0.01257035, "epoch": 0.7889373215090937, "flos": 27317678664960.0, "grad_norm": 1.3860972242752276, "language_loss": 0.80261946, "learning_rate": 4.4927226206017057e-07, "loss": 0.87942111, "num_input_tokens_seen": 283136925, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.10174561, "step": 13122, "time_per_iteration": 2.6370956897735596 }, { "auxiliary_loss_clip": 0.06416637, "auxiliary_loss_mlp": 0.0126448, "balance_loss_clip": 0.06272872, "balance_loss_mlp": 0.01255223, "epoch": 0.7889974447617616, "flos": 19835210839680.0, "grad_norm": 2.275843216380512, "language_loss": 0.78071427, "learning_rate": 4.4902634066989597e-07, "loss": 0.85752541, "num_input_tokens_seen": 283155725, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.0925293, "step": 13123, "time_per_iteration": 2.5609891414642334 }, { "auxiliary_loss_clip": 0.06421667, "auxiliary_loss_mlp": 0.01264492, "balance_loss_clip": 0.0627699, "balance_loss_mlp": 0.01254449, "epoch": 0.7890575680144296, "flos": 17276336928000.0, "grad_norm": 1.9908508061214434, "language_loss": 0.67321062, "learning_rate": 4.487804780926985e-07, "loss": 0.75007224, "num_input_tokens_seen": 283173845, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.10040283, "step": 13124, "time_per_iteration": 2.522127628326416 }, { "auxiliary_loss_clip": 0.06426652, "auxiliary_loss_mlp": 0.0126782, "balance_loss_clip": 0.06280687, "balance_loss_mlp": 0.01257717, "epoch": 0.7891176912670975, "flos": 27607596192000.0, "grad_norm": 1.9849018986401448, "language_loss": 0.73186016, "learning_rate": 4.4853467433790036e-07, "loss": 0.80880487, "num_input_tokens_seen": 283191985, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.10107422, "step": 13125, "time_per_iteration": 2.624738931655884 }, { "auxiliary_loss_clip": 0.06418303, "auxiliary_loss_mlp": 0.01263556, "balance_loss_clip": 0.06273712, "balance_loss_mlp": 0.01253895, "epoch": 0.7891778145197655, "flos": 22718397179520.0, "grad_norm": 1.7929209726973883, "language_loss": 0.73348939, "learning_rate": 4.4828892941482267e-07, "loss": 0.81030798, "num_input_tokens_seen": 283210855, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.09661865, "step": 13126, "time_per_iteration": 2.56337571144104 }, { "auxiliary_loss_clip": 0.06422598, "auxiliary_loss_mlp": 0.01264264, "balance_loss_clip": 0.06277259, "balance_loss_mlp": 0.01254239, "epoch": 0.7892379377724335, "flos": 17316433906560.0, "grad_norm": 1.6447514910223193, "language_loss": 0.77065629, "learning_rate": 4.480432433327845e-07, "loss": 0.84752488, "num_input_tokens_seen": 283229665, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10028076, "step": 13127, "time_per_iteration": 2.5211424827575684 }, { "auxiliary_loss_clip": 0.06411798, "auxiliary_loss_mlp": 0.01266573, "balance_loss_clip": 0.06274119, "balance_loss_mlp": 0.0125616, "epoch": 0.7892980610251015, "flos": 25782781973760.0, "grad_norm": 1.603814358464319, "language_loss": 0.85755593, "learning_rate": 4.47797616101103e-07, "loss": 0.93433964, "num_input_tokens_seen": 283248615, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.10406494, "step": 13128, "time_per_iteration": 4.053072929382324 }, { "auxiliary_loss_clip": 0.06412332, "auxiliary_loss_mlp": 0.01266323, "balance_loss_clip": 0.06272663, "balance_loss_mlp": 0.01256548, "epoch": 0.7893581842777694, "flos": 21586371719040.0, "grad_norm": 2.8035720800282538, "language_loss": 0.69507086, "learning_rate": 4.475520477290904e-07, "loss": 0.77185738, "num_input_tokens_seen": 283267135, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09771729, "step": 13129, "time_per_iteration": 2.542914867401123 }, { "auxiliary_loss_clip": 0.06315685, "auxiliary_loss_mlp": 0.01250293, "balance_loss_clip": 0.06259278, "balance_loss_mlp": 0.01249081, "epoch": 0.7894183075304374, "flos": 69037773793920.0, "grad_norm": 0.7009093306592546, "language_loss": 0.61464071, "learning_rate": 4.473065382260597e-07, "loss": 0.69030046, "num_input_tokens_seen": 283328940, "router_z_loss_clip": 0.56298828, "router_z_loss_mlp": 0.01211548, "step": 13130, "time_per_iteration": 3.166541576385498 }, { "auxiliary_loss_clip": 0.06419681, "auxiliary_loss_mlp": 0.01264311, "balance_loss_clip": 0.06276863, "balance_loss_mlp": 0.01254375, "epoch": 0.7894784307831053, "flos": 24250107415680.0, "grad_norm": 1.521994355624245, "language_loss": 0.74108547, "learning_rate": 4.4706108760132124e-07, "loss": 0.81792533, "num_input_tokens_seen": 283350000, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.0993042, "step": 13131, "time_per_iteration": 2.6040759086608887 }, { "auxiliary_loss_clip": 0.06433714, "auxiliary_loss_mlp": 0.01267787, "balance_loss_clip": 0.0627995, "balance_loss_mlp": 0.01255699, "epoch": 0.7895385540357733, "flos": 20272770460800.0, "grad_norm": 2.8291908331283735, "language_loss": 0.69203103, "learning_rate": 4.4681569586418153e-07, "loss": 0.76904607, "num_input_tokens_seen": 283368020, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.12103271, "step": 13132, "time_per_iteration": 2.568220376968384 }, { "auxiliary_loss_clip": 0.0641786, "auxiliary_loss_mlp": 0.01265657, "balance_loss_clip": 0.06274803, "balance_loss_mlp": 0.01255077, "epoch": 0.7895986772884414, "flos": 21002972866560.0, "grad_norm": 2.0717009919156704, "language_loss": 0.61823893, "learning_rate": 4.465703630239468e-07, "loss": 0.69507414, "num_input_tokens_seen": 283387030, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10583496, "step": 13133, "time_per_iteration": 3.9634528160095215 }, { "auxiliary_loss_clip": 0.06423786, "auxiliary_loss_mlp": 0.01270931, "balance_loss_clip": 0.0627883, "balance_loss_mlp": 0.01259678, "epoch": 0.7896588005411093, "flos": 18663423816960.0, "grad_norm": 2.169135178183476, "language_loss": 0.80571222, "learning_rate": 4.463250890899195e-07, "loss": 0.88265944, "num_input_tokens_seen": 283402090, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.11254883, "step": 13134, "time_per_iteration": 2.5110530853271484 }, { "auxiliary_loss_clip": 0.06415519, "auxiliary_loss_mlp": 0.01267284, "balance_loss_clip": 0.06272072, "balance_loss_mlp": 0.01257962, "epoch": 0.7897189237937773, "flos": 18411842332800.0, "grad_norm": 1.7425259528540915, "language_loss": 0.80988467, "learning_rate": 4.460798740713998e-07, "loss": 0.88671267, "num_input_tokens_seen": 283421035, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.09326172, "step": 13135, "time_per_iteration": 2.5230486392974854 }, { "auxiliary_loss_clip": 0.0641265, "auxiliary_loss_mlp": 0.0126271, "balance_loss_clip": 0.06272787, "balance_loss_mlp": 0.01252696, "epoch": 0.7897790470464452, "flos": 23738223623040.0, "grad_norm": 1.4964773613919962, "language_loss": 0.72630924, "learning_rate": 4.4583471797768733e-07, "loss": 0.80306286, "num_input_tokens_seen": 283441830, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.10028076, "step": 13136, "time_per_iteration": 2.564168691635132 }, { "auxiliary_loss_clip": 0.06426799, "auxiliary_loss_mlp": 0.0126721, "balance_loss_clip": 0.06277933, "balance_loss_mlp": 0.01255915, "epoch": 0.7898391702991132, "flos": 15923477232000.0, "grad_norm": 2.7221752012616167, "language_loss": 0.70996153, "learning_rate": 4.455896208180778e-07, "loss": 0.78690159, "num_input_tokens_seen": 283459540, "router_z_loss_clip": 1.48828125, "router_z_loss_mlp": 0.11297607, "step": 13137, "time_per_iteration": 2.536048173904419 }, { "auxiliary_loss_clip": 0.06415395, "auxiliary_loss_mlp": 0.01266695, "balance_loss_clip": 0.06276554, "balance_loss_mlp": 0.01255942, "epoch": 0.7898992935517811, "flos": 19835252766720.0, "grad_norm": 1.6345511024774957, "language_loss": 0.73995972, "learning_rate": 4.4534458260186645e-07, "loss": 0.81678063, "num_input_tokens_seen": 283478790, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.10760498, "step": 13138, "time_per_iteration": 2.54270601272583 }, { "auxiliary_loss_clip": 0.06415083, "auxiliary_loss_mlp": 0.01267081, "balance_loss_clip": 0.06274016, "balance_loss_mlp": 0.01256996, "epoch": 0.7899594168044491, "flos": 16221738240000.0, "grad_norm": 2.011217208079657, "language_loss": 0.68740094, "learning_rate": 4.4509960333834426e-07, "loss": 0.76422262, "num_input_tokens_seen": 283495720, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.10083008, "step": 13139, "time_per_iteration": 2.5148112773895264 }, { "auxiliary_loss_clip": 0.06317016, "auxiliary_loss_mlp": 0.01251098, "balance_loss_clip": 0.06260669, "balance_loss_mlp": 0.01249911, "epoch": 0.790019540057117, "flos": 68353496225280.0, "grad_norm": 1.5004664411212465, "language_loss": 0.60156053, "learning_rate": 4.448546830368003e-07, "loss": 0.67724168, "num_input_tokens_seen": 283558795, "router_z_loss_clip": 0.56298828, "router_z_loss_mlp": 0.01185608, "step": 13140, "time_per_iteration": 3.2805047035217285 }, { "auxiliary_loss_clip": 0.06421377, "auxiliary_loss_mlp": 0.0126561, "balance_loss_clip": 0.06278155, "balance_loss_mlp": 0.0125559, "epoch": 0.7900796633097851, "flos": 30340037836800.0, "grad_norm": 1.9088412981720377, "language_loss": 0.76201355, "learning_rate": 4.4460982170652304e-07, "loss": 0.8388834, "num_input_tokens_seen": 283579305, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10015869, "step": 13141, "time_per_iteration": 2.70029354095459 }, { "auxiliary_loss_clip": 0.06418674, "auxiliary_loss_mlp": 0.01268438, "balance_loss_clip": 0.06275775, "balance_loss_mlp": 0.01257769, "epoch": 0.790139786562453, "flos": 22133237391360.0, "grad_norm": 2.074664153726498, "language_loss": 0.68655944, "learning_rate": 4.4436501935679694e-07, "loss": 0.76343054, "num_input_tokens_seen": 283597840, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10675049, "step": 13142, "time_per_iteration": 2.5658957958221436 }, { "auxiliary_loss_clip": 0.06314674, "auxiliary_loss_mlp": 0.01252586, "balance_loss_clip": 0.06258532, "balance_loss_mlp": 0.01251419, "epoch": 0.790199909815121, "flos": 58225210277760.0, "grad_norm": 0.8090670331627088, "language_loss": 0.59826493, "learning_rate": 4.441202759969049e-07, "loss": 0.67393756, "num_input_tokens_seen": 283647950, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01165009, "step": 13143, "time_per_iteration": 4.43589973449707 }, { "auxiliary_loss_clip": 0.06421839, "auxiliary_loss_mlp": 0.01269644, "balance_loss_clip": 0.06276405, "balance_loss_mlp": 0.01259225, "epoch": 0.7902600330677889, "flos": 34542066314880.0, "grad_norm": 1.4553688108963887, "language_loss": 0.74504149, "learning_rate": 4.4387559163612875e-07, "loss": 0.82195628, "num_input_tokens_seen": 283670645, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.10412598, "step": 13144, "time_per_iteration": 2.6684162616729736 }, { "auxiliary_loss_clip": 0.06417523, "auxiliary_loss_mlp": 0.01270118, "balance_loss_clip": 0.06273906, "balance_loss_mlp": 0.01259377, "epoch": 0.7903201563204569, "flos": 22352981523840.0, "grad_norm": 1.9799077903093947, "language_loss": 0.83445477, "learning_rate": 4.4363096628374605e-07, "loss": 0.91133118, "num_input_tokens_seen": 283688830, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.10742188, "step": 13145, "time_per_iteration": 2.5521607398986816 }, { "auxiliary_loss_clip": 0.06406268, "auxiliary_loss_mlp": 0.01262206, "balance_loss_clip": 0.06269971, "balance_loss_mlp": 0.0125264, "epoch": 0.790380279573125, "flos": 22059919468800.0, "grad_norm": 1.847123086318638, "language_loss": 0.73415339, "learning_rate": 4.4338639994903235e-07, "loss": 0.81083816, "num_input_tokens_seen": 283708625, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.09570312, "step": 13146, "time_per_iteration": 2.5605368614196777 }, { "auxiliary_loss_clip": 0.06419541, "auxiliary_loss_mlp": 0.01261868, "balance_loss_clip": 0.06275219, "balance_loss_mlp": 0.01251807, "epoch": 0.7904404028257929, "flos": 20308758589440.0, "grad_norm": 1.9271663040396738, "language_loss": 0.75996912, "learning_rate": 4.4314189264126246e-07, "loss": 0.83678317, "num_input_tokens_seen": 283725710, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10058594, "step": 13147, "time_per_iteration": 2.5453145503997803 }, { "auxiliary_loss_clip": 0.06414595, "auxiliary_loss_mlp": 0.01269999, "balance_loss_clip": 0.06274915, "balance_loss_mlp": 0.0125927, "epoch": 0.7905005260784609, "flos": 20014732212480.0, "grad_norm": 1.5635276204902047, "language_loss": 0.72215044, "learning_rate": 4.428974443697087e-07, "loss": 0.79899639, "num_input_tokens_seen": 283744150, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.10736084, "step": 13148, "time_per_iteration": 2.5441880226135254 }, { "auxiliary_loss_clip": 0.06419401, "auxiliary_loss_mlp": 0.01265014, "balance_loss_clip": 0.06275463, "balance_loss_mlp": 0.01253945, "epoch": 0.7905606493311288, "flos": 26913088425600.0, "grad_norm": 2.7661963134660454, "language_loss": 0.71199512, "learning_rate": 4.4265305514363913e-07, "loss": 0.78883928, "num_input_tokens_seen": 283764170, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.11071777, "step": 13149, "time_per_iteration": 3.9526216983795166 }, { "auxiliary_loss_clip": 0.06422471, "auxiliary_loss_mlp": 0.01266451, "balance_loss_clip": 0.06277028, "balance_loss_mlp": 0.01255043, "epoch": 0.7906207725837968, "flos": 23703032108160.0, "grad_norm": 1.9316430518784329, "language_loss": 0.6499427, "learning_rate": 4.424087249723225e-07, "loss": 0.72683191, "num_input_tokens_seen": 283784305, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.11407471, "step": 13150, "time_per_iteration": 2.5464961528778076 }, { "auxiliary_loss_clip": 0.06412613, "auxiliary_loss_mlp": 0.01265683, "balance_loss_clip": 0.06272489, "balance_loss_mlp": 0.01255949, "epoch": 0.7906808958364647, "flos": 20854911502080.0, "grad_norm": 2.789931931271643, "language_loss": 0.70296389, "learning_rate": 4.421644538650231e-07, "loss": 0.77974683, "num_input_tokens_seen": 283804040, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09735107, "step": 13151, "time_per_iteration": 2.558595657348633 }, { "auxiliary_loss_clip": 0.06425534, "auxiliary_loss_mlp": 0.01267847, "balance_loss_clip": 0.06279483, "balance_loss_mlp": 0.01257267, "epoch": 0.7907410190891327, "flos": 40744866585600.0, "grad_norm": 1.8651174683008216, "language_loss": 0.70406508, "learning_rate": 4.4192024183100306e-07, "loss": 0.78099889, "num_input_tokens_seen": 283827120, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.10571289, "step": 13152, "time_per_iteration": 2.723304271697998 }, { "auxiliary_loss_clip": 0.06416922, "auxiliary_loss_mlp": 0.01267381, "balance_loss_clip": 0.06275976, "balance_loss_mlp": 0.01257451, "epoch": 0.7908011423418007, "flos": 13266198299520.0, "grad_norm": 1.765489623491604, "language_loss": 0.73358214, "learning_rate": 4.4167608887952367e-07, "loss": 0.81042516, "num_input_tokens_seen": 283844820, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09936523, "step": 13153, "time_per_iteration": 2.529989719390869 }, { "auxiliary_loss_clip": 0.06414433, "auxiliary_loss_mlp": 0.01267905, "balance_loss_clip": 0.06272537, "balance_loss_mlp": 0.01257385, "epoch": 0.7908612655944687, "flos": 19760718960000.0, "grad_norm": 1.6720968761192065, "language_loss": 0.7879554, "learning_rate": 4.4143199501984306e-07, "loss": 0.86477876, "num_input_tokens_seen": 283862870, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.10522461, "step": 13154, "time_per_iteration": 2.5287435054779053 }, { "auxiliary_loss_clip": 0.06423904, "auxiliary_loss_mlp": 0.01263628, "balance_loss_clip": 0.06274132, "balance_loss_mlp": 0.01252828, "epoch": 0.7909213888471366, "flos": 21294064350720.0, "grad_norm": 2.0949716682123403, "language_loss": 0.70660579, "learning_rate": 4.411879602612185e-07, "loss": 0.78348112, "num_input_tokens_seen": 283882405, "router_z_loss_clip": 1.49609375, "router_z_loss_mlp": 0.10797119, "step": 13155, "time_per_iteration": 2.5601608753204346 }, { "auxiliary_loss_clip": 0.06419359, "auxiliary_loss_mlp": 0.01268964, "balance_loss_clip": 0.06275547, "balance_loss_mlp": 0.01258688, "epoch": 0.7909815120998046, "flos": 22535521643520.0, "grad_norm": 1.83462990568128, "language_loss": 0.77383149, "learning_rate": 4.4094398461290174e-07, "loss": 0.85071474, "num_input_tokens_seen": 283902070, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.10272217, "step": 13156, "time_per_iteration": 2.563610792160034 }, { "auxiliary_loss_clip": 0.06412385, "auxiliary_loss_mlp": 0.01265104, "balance_loss_clip": 0.0627162, "balance_loss_mlp": 0.01255687, "epoch": 0.7910416353524725, "flos": 26735537623680.0, "grad_norm": 1.6330373087278007, "language_loss": 0.65852249, "learning_rate": 4.4070006808414526e-07, "loss": 0.73529732, "num_input_tokens_seen": 283924100, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09417725, "step": 13157, "time_per_iteration": 2.6339640617370605 }, { "auxiliary_loss_clip": 0.0642058, "auxiliary_loss_mlp": 0.01265885, "balance_loss_clip": 0.06275436, "balance_loss_mlp": 0.01255026, "epoch": 0.7911017586051405, "flos": 24651804689280.0, "grad_norm": 2.0087807014246977, "language_loss": 0.7419706, "learning_rate": 4.4045621068419894e-07, "loss": 0.81883526, "num_input_tokens_seen": 283944955, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.10858154, "step": 13158, "time_per_iteration": 2.571526050567627 }, { "auxiliary_loss_clip": 0.06408761, "auxiliary_loss_mlp": 0.01266094, "balance_loss_clip": 0.06271917, "balance_loss_mlp": 0.01257481, "epoch": 0.7911618818578086, "flos": 17571076064640.0, "grad_norm": 2.143042854299572, "language_loss": 0.6771419, "learning_rate": 4.40212412422309e-07, "loss": 0.75389051, "num_input_tokens_seen": 283963125, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.08612061, "step": 13159, "time_per_iteration": 2.542767286300659 }, { "auxiliary_loss_clip": 0.06413545, "auxiliary_loss_mlp": 0.01268395, "balance_loss_clip": 0.06274164, "balance_loss_mlp": 0.01257821, "epoch": 0.7912220051104765, "flos": 16726326727680.0, "grad_norm": 1.8950721449919326, "language_loss": 0.67955977, "learning_rate": 4.399686733077206e-07, "loss": 0.75637925, "num_input_tokens_seen": 283982850, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.10565186, "step": 13160, "time_per_iteration": 2.5470573902130127 }, { "auxiliary_loss_clip": 0.06407391, "auxiliary_loss_mlp": 0.01260513, "balance_loss_clip": 0.06273004, "balance_loss_mlp": 0.01252216, "epoch": 0.7912821283631445, "flos": 13703799847680.0, "grad_norm": 1.919201989064395, "language_loss": 0.72822726, "learning_rate": 4.3972499334967694e-07, "loss": 0.80490631, "num_input_tokens_seen": 283998275, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.08288574, "step": 13161, "time_per_iteration": 2.518815279006958 }, { "auxiliary_loss_clip": 0.06408291, "auxiliary_loss_mlp": 0.01265512, "balance_loss_clip": 0.06270996, "balance_loss_mlp": 0.01255707, "epoch": 0.7913422516158124, "flos": 23775804979200.0, "grad_norm": 1.819120080593599, "language_loss": 0.73455572, "learning_rate": 4.39481372557418e-07, "loss": 0.81129372, "num_input_tokens_seen": 284018750, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09796143, "step": 13162, "time_per_iteration": 2.558720588684082 }, { "auxiliary_loss_clip": 0.06421453, "auxiliary_loss_mlp": 0.01268615, "balance_loss_clip": 0.06277072, "balance_loss_mlp": 0.01257475, "epoch": 0.7914023748684804, "flos": 19944433036800.0, "grad_norm": 1.6955941990489494, "language_loss": 0.72058749, "learning_rate": 4.392378109401811e-07, "loss": 0.79748821, "num_input_tokens_seen": 284037850, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.1114502, "step": 13163, "time_per_iteration": 2.6996402740478516 }, { "auxiliary_loss_clip": 0.06417755, "auxiliary_loss_mlp": 0.01265196, "balance_loss_clip": 0.06277163, "balance_loss_mlp": 0.01254318, "epoch": 0.7914624981211483, "flos": 20601065957760.0, "grad_norm": 1.8551832148942835, "language_loss": 0.70039278, "learning_rate": 4.3899430850720296e-07, "loss": 0.77722228, "num_input_tokens_seen": 284056380, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.10876465, "step": 13164, "time_per_iteration": 2.5520401000976562 }, { "auxiliary_loss_clip": 0.06414558, "auxiliary_loss_mlp": 0.01263045, "balance_loss_clip": 0.06274472, "balance_loss_mlp": 0.0125299, "epoch": 0.7915226213738163, "flos": 21806031997440.0, "grad_norm": 1.953889315762281, "language_loss": 0.66447216, "learning_rate": 4.387508652677177e-07, "loss": 0.74124819, "num_input_tokens_seen": 284074945, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.1005249, "step": 13165, "time_per_iteration": 2.6528799533843994 }, { "auxiliary_loss_clip": 0.06408562, "auxiliary_loss_mlp": 0.01266062, "balance_loss_clip": 0.06271897, "balance_loss_mlp": 0.01256913, "epoch": 0.7915827446264843, "flos": 16293714497280.0, "grad_norm": 2.0589831494213726, "language_loss": 0.72767794, "learning_rate": 4.385074812309557e-07, "loss": 0.80442417, "num_input_tokens_seen": 284092070, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.09155273, "step": 13166, "time_per_iteration": 2.521840810775757 }, { "auxiliary_loss_clip": 0.06413326, "auxiliary_loss_mlp": 0.01268671, "balance_loss_clip": 0.06273666, "balance_loss_mlp": 0.01257913, "epoch": 0.7916428678791523, "flos": 25709673686400.0, "grad_norm": 2.982234798668548, "language_loss": 0.77650815, "learning_rate": 4.382641564061462e-07, "loss": 0.85332811, "num_input_tokens_seen": 284112255, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.10760498, "step": 13167, "time_per_iteration": 4.022733926773071 }, { "auxiliary_loss_clip": 0.06413521, "auxiliary_loss_mlp": 0.01267533, "balance_loss_clip": 0.06275614, "balance_loss_mlp": 0.01258277, "epoch": 0.7917029911318202, "flos": 23885320665600.0, "grad_norm": 1.5184590977542607, "language_loss": 0.84082341, "learning_rate": 4.3802089080251713e-07, "loss": 0.91763389, "num_input_tokens_seen": 284132330, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09259033, "step": 13168, "time_per_iteration": 2.5850157737731934 }, { "auxiliary_loss_clip": 0.0641712, "auxiliary_loss_mlp": 0.01264487, "balance_loss_clip": 0.06275691, "balance_loss_mlp": 0.01254682, "epoch": 0.7917631143844882, "flos": 21651975066240.0, "grad_norm": 1.480875360716, "language_loss": 0.72778696, "learning_rate": 4.3777768442929155e-07, "loss": 0.80460298, "num_input_tokens_seen": 284150640, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09820557, "step": 13169, "time_per_iteration": 2.5671427249908447 }, { "auxiliary_loss_clip": 0.06416734, "auxiliary_loss_mlp": 0.01264153, "balance_loss_clip": 0.06271647, "balance_loss_mlp": 0.0125417, "epoch": 0.7918232376371561, "flos": 38883519187200.0, "grad_norm": 1.9330946395531496, "language_loss": 0.67320424, "learning_rate": 4.3753453729569287e-07, "loss": 0.75001311, "num_input_tokens_seen": 284171910, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.09985352, "step": 13170, "time_per_iteration": 2.7066125869750977 }, { "auxiliary_loss_clip": 0.06413917, "auxiliary_loss_mlp": 0.01263912, "balance_loss_clip": 0.06272351, "balance_loss_mlp": 0.01254751, "epoch": 0.7918833608898241, "flos": 20781551652480.0, "grad_norm": 1.7525823060233217, "language_loss": 0.70853698, "learning_rate": 4.372914494109412e-07, "loss": 0.78531528, "num_input_tokens_seen": 284191340, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09155273, "step": 13171, "time_per_iteration": 2.537743091583252 }, { "auxiliary_loss_clip": 0.06414189, "auxiliary_loss_mlp": 0.01268515, "balance_loss_clip": 0.0627317, "balance_loss_mlp": 0.01258597, "epoch": 0.7919434841424922, "flos": 33918276994560.0, "grad_norm": 5.192800971125288, "language_loss": 0.6706093, "learning_rate": 4.370484207842553e-07, "loss": 0.74743634, "num_input_tokens_seen": 284212495, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09918213, "step": 13172, "time_per_iteration": 4.217901945114136 }, { "auxiliary_loss_clip": 0.06413569, "auxiliary_loss_mlp": 0.01265088, "balance_loss_clip": 0.062728, "balance_loss_mlp": 0.01254711, "epoch": 0.7920036073951601, "flos": 21070253295360.0, "grad_norm": 1.6241103218252002, "language_loss": 0.79577255, "learning_rate": 4.3680545142484893e-07, "loss": 0.87255913, "num_input_tokens_seen": 284230825, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.10388184, "step": 13173, "time_per_iteration": 2.554351806640625 }, { "auxiliary_loss_clip": 0.06413631, "auxiliary_loss_mlp": 0.01263369, "balance_loss_clip": 0.0627202, "balance_loss_mlp": 0.01254255, "epoch": 0.7920637306478281, "flos": 23662138515840.0, "grad_norm": 1.7842676571808276, "language_loss": 0.76951575, "learning_rate": 4.365625413419365e-07, "loss": 0.8462857, "num_input_tokens_seen": 284250365, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09118652, "step": 13174, "time_per_iteration": 2.553990125656128 }, { "auxiliary_loss_clip": 0.06411932, "auxiliary_loss_mlp": 0.01261702, "balance_loss_clip": 0.06274341, "balance_loss_mlp": 0.0125263, "epoch": 0.792123853900496, "flos": 27202251265920.0, "grad_norm": 1.6621590589037691, "language_loss": 0.72095698, "learning_rate": 4.363196905447297e-07, "loss": 0.79769337, "num_input_tokens_seen": 284269635, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09069824, "step": 13175, "time_per_iteration": 2.5847673416137695 }, { "auxiliary_loss_clip": 0.0641274, "auxiliary_loss_mlp": 0.01265402, "balance_loss_clip": 0.06271734, "balance_loss_mlp": 0.0125571, "epoch": 0.792183977153164, "flos": 19104631090560.0, "grad_norm": 1.9036744963370091, "language_loss": 0.59724391, "learning_rate": 4.360768990424364e-07, "loss": 0.67402536, "num_input_tokens_seen": 284288380, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09686279, "step": 13176, "time_per_iteration": 2.52878475189209 }, { "auxiliary_loss_clip": 0.06412107, "auxiliary_loss_mlp": 0.01269913, "balance_loss_clip": 0.06274629, "balance_loss_mlp": 0.01260096, "epoch": 0.7922441004058319, "flos": 17134564619520.0, "grad_norm": 1.9673316568318624, "language_loss": 0.73819071, "learning_rate": 4.3583416684426376e-07, "loss": 0.81501091, "num_input_tokens_seen": 284306920, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09814453, "step": 13177, "time_per_iteration": 2.5562610626220703 }, { "auxiliary_loss_clip": 0.06412287, "auxiliary_loss_mlp": 0.01262468, "balance_loss_clip": 0.06273616, "balance_loss_mlp": 0.01253211, "epoch": 0.7923042236585, "flos": 17827395304320.0, "grad_norm": 1.783928529210857, "language_loss": 0.64015114, "learning_rate": 4.355914939594174e-07, "loss": 0.71689874, "num_input_tokens_seen": 284324700, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.0925293, "step": 13178, "time_per_iteration": 2.5260236263275146 }, { "auxiliary_loss_clip": 0.06414763, "auxiliary_loss_mlp": 0.01265628, "balance_loss_clip": 0.0627277, "balance_loss_mlp": 0.01256789, "epoch": 0.7923643469111679, "flos": 29943036391680.0, "grad_norm": 1.6043111519457014, "language_loss": 0.68687421, "learning_rate": 4.3534888039709726e-07, "loss": 0.76367813, "num_input_tokens_seen": 284345985, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.08837891, "step": 13179, "time_per_iteration": 2.61881685256958 }, { "auxiliary_loss_clip": 0.06413794, "auxiliary_loss_mlp": 0.01267104, "balance_loss_clip": 0.06275471, "balance_loss_mlp": 0.01257615, "epoch": 0.7924244701638359, "flos": 22681360874880.0, "grad_norm": 1.7364785266329237, "language_loss": 0.73948646, "learning_rate": 4.3510632616650444e-07, "loss": 0.81629539, "num_input_tokens_seen": 284364475, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09484863, "step": 13180, "time_per_iteration": 2.5330870151519775 }, { "auxiliary_loss_clip": 0.06416831, "auxiliary_loss_mlp": 0.01266522, "balance_loss_clip": 0.06274359, "balance_loss_mlp": 0.01255799, "epoch": 0.7924845934165038, "flos": 17974031149440.0, "grad_norm": 2.080762021755403, "language_loss": 0.81963527, "learning_rate": 4.3486383127683646e-07, "loss": 0.89646888, "num_input_tokens_seen": 284382125, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.10717773, "step": 13181, "time_per_iteration": 2.5352509021759033 }, { "auxiliary_loss_clip": 0.06411074, "auxiliary_loss_mlp": 0.01266274, "balance_loss_clip": 0.0627261, "balance_loss_mlp": 0.01255605, "epoch": 0.7925447166691718, "flos": 23483665319040.0, "grad_norm": 1.9524403353216948, "language_loss": 0.77631783, "learning_rate": 4.346213957372895e-07, "loss": 0.8530913, "num_input_tokens_seen": 284401585, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.10662842, "step": 13182, "time_per_iteration": 2.5628182888031006 }, { "auxiliary_loss_clip": 0.06423585, "auxiliary_loss_mlp": 0.01265139, "balance_loss_clip": 0.06277694, "balance_loss_mlp": 0.01253915, "epoch": 0.7926048399218397, "flos": 20453591571840.0, "grad_norm": 2.051099498714017, "language_loss": 0.74516326, "learning_rate": 4.34379019557056e-07, "loss": 0.82205051, "num_input_tokens_seen": 284419125, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.11230469, "step": 13183, "time_per_iteration": 4.033412933349609 }, { "auxiliary_loss_clip": 0.06416985, "auxiliary_loss_mlp": 0.01264489, "balance_loss_clip": 0.0627742, "balance_loss_mlp": 0.01254714, "epoch": 0.7926649631745077, "flos": 37169184977280.0, "grad_norm": 1.6186064698697094, "language_loss": 0.68212569, "learning_rate": 4.341367027453264e-07, "loss": 0.75894046, "num_input_tokens_seen": 284440445, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09771729, "step": 13184, "time_per_iteration": 2.691523313522339 }, { "auxiliary_loss_clip": 0.06416672, "auxiliary_loss_mlp": 0.01264889, "balance_loss_clip": 0.06273748, "balance_loss_mlp": 0.01255114, "epoch": 0.7927250864271758, "flos": 17024168465280.0, "grad_norm": 2.647136762714042, "language_loss": 0.71217442, "learning_rate": 4.338944453112907e-07, "loss": 0.78899002, "num_input_tokens_seen": 284459370, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.09777832, "step": 13185, "time_per_iteration": 2.577120304107666 }, { "auxiliary_loss_clip": 0.06415582, "auxiliary_loss_mlp": 0.01263637, "balance_loss_clip": 0.06272928, "balance_loss_mlp": 0.01253874, "epoch": 0.7927852096798437, "flos": 17755041703680.0, "grad_norm": 2.103750425123843, "language_loss": 0.65711594, "learning_rate": 4.3365224726413375e-07, "loss": 0.73390812, "num_input_tokens_seen": 284477525, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09747314, "step": 13186, "time_per_iteration": 2.526294231414795 }, { "auxiliary_loss_clip": 0.06411249, "auxiliary_loss_mlp": 0.01263759, "balance_loss_clip": 0.06273918, "balance_loss_mlp": 0.01254616, "epoch": 0.7928453329325117, "flos": 23844636708480.0, "grad_norm": 1.5322902456231018, "language_loss": 0.76889139, "learning_rate": 4.334101086130408e-07, "loss": 0.84564149, "num_input_tokens_seen": 284496590, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.0914917, "step": 13187, "time_per_iteration": 2.5564093589782715 }, { "auxiliary_loss_clip": 0.06410512, "auxiliary_loss_mlp": 0.01264881, "balance_loss_clip": 0.06270568, "balance_loss_mlp": 0.01255326, "epoch": 0.7929054561851796, "flos": 17460302567040.0, "grad_norm": 2.173901339222646, "language_loss": 0.7327643, "learning_rate": 4.3316802936719334e-07, "loss": 0.80951822, "num_input_tokens_seen": 284511470, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09552002, "step": 13188, "time_per_iteration": 2.54756498336792 }, { "auxiliary_loss_clip": 0.06415804, "auxiliary_loss_mlp": 0.01265585, "balance_loss_clip": 0.0627181, "balance_loss_mlp": 0.01254654, "epoch": 0.7929655794378476, "flos": 21987775503360.0, "grad_norm": 3.9260562583779253, "language_loss": 0.63201809, "learning_rate": 4.329260095357725e-07, "loss": 0.70883191, "num_input_tokens_seen": 284531125, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.109375, "step": 13189, "time_per_iteration": 4.100865602493286 }, { "auxiliary_loss_clip": 0.0641887, "auxiliary_loss_mlp": 0.01267746, "balance_loss_clip": 0.06278408, "balance_loss_mlp": 0.01257678, "epoch": 0.7930257026905155, "flos": 17279523383040.0, "grad_norm": 2.8810315758470346, "language_loss": 0.73399049, "learning_rate": 4.3268404912795307e-07, "loss": 0.81085664, "num_input_tokens_seen": 284549340, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.10076904, "step": 13190, "time_per_iteration": 2.5324060916900635 }, { "auxiliary_loss_clip": 0.06410711, "auxiliary_loss_mlp": 0.01262018, "balance_loss_clip": 0.06275183, "balance_loss_mlp": 0.01253727, "epoch": 0.7930858259431836, "flos": 27306693780480.0, "grad_norm": 1.91979631774231, "language_loss": 0.73600721, "learning_rate": 4.3244214815291166e-07, "loss": 0.81273448, "num_input_tokens_seen": 284567060, "router_z_loss_clip": 1.35253906, "router_z_loss_mlp": 0.08282471, "step": 13191, "time_per_iteration": 2.6165578365325928 }, { "auxiliary_loss_clip": 0.06415392, "auxiliary_loss_mlp": 0.01267157, "balance_loss_clip": 0.06274887, "balance_loss_mlp": 0.01257287, "epoch": 0.7931459491958515, "flos": 19869647667840.0, "grad_norm": 1.8369481838424542, "language_loss": 0.69162142, "learning_rate": 4.322003066198219e-07, "loss": 0.76844692, "num_input_tokens_seen": 284586600, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09869385, "step": 13192, "time_per_iteration": 2.5315887928009033 }, { "auxiliary_loss_clip": 0.06416228, "auxiliary_loss_mlp": 0.01264782, "balance_loss_clip": 0.062741, "balance_loss_mlp": 0.012549, "epoch": 0.7932060724485195, "flos": 23153525032320.0, "grad_norm": 1.7672090888512506, "language_loss": 0.75447834, "learning_rate": 4.3195852453785274e-07, "loss": 0.83128846, "num_input_tokens_seen": 284605715, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09887695, "step": 13193, "time_per_iteration": 2.550612688064575 }, { "auxiliary_loss_clip": 0.06412418, "auxiliary_loss_mlp": 0.01265685, "balance_loss_clip": 0.06271487, "balance_loss_mlp": 0.01255189, "epoch": 0.7932661957011874, "flos": 29942617121280.0, "grad_norm": 1.501917850949579, "language_loss": 0.72291243, "learning_rate": 4.317168019161741e-07, "loss": 0.79969347, "num_input_tokens_seen": 284628540, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.10491943, "step": 13194, "time_per_iteration": 2.6101555824279785 }, { "auxiliary_loss_clip": 0.06422512, "auxiliary_loss_mlp": 0.01263386, "balance_loss_clip": 0.0627688, "balance_loss_mlp": 0.01253295, "epoch": 0.7933263189538554, "flos": 22564717591680.0, "grad_norm": 2.2252782527201624, "language_loss": 0.70723462, "learning_rate": 4.314751387639517e-07, "loss": 0.78409362, "num_input_tokens_seen": 284646040, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.10089111, "step": 13195, "time_per_iteration": 2.548694610595703 }, { "auxiliary_loss_clip": 0.06414202, "auxiliary_loss_mlp": 0.01269238, "balance_loss_clip": 0.06274585, "balance_loss_mlp": 0.01259355, "epoch": 0.7933864422065233, "flos": 25485317579520.0, "grad_norm": 1.4050257860554478, "language_loss": 0.773395, "learning_rate": 4.3123353509034844e-07, "loss": 0.85022938, "num_input_tokens_seen": 284665110, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09881592, "step": 13196, "time_per_iteration": 2.5784974098205566 }, { "auxiliary_loss_clip": 0.06420249, "auxiliary_loss_mlp": 0.01272259, "balance_loss_clip": 0.06277755, "balance_loss_mlp": 0.01261888, "epoch": 0.7934465654591913, "flos": 33591490871040.0, "grad_norm": 2.6510172293870884, "language_loss": 0.69181228, "learning_rate": 4.309919909045268e-07, "loss": 0.76873732, "num_input_tokens_seen": 284686515, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10369873, "step": 13197, "time_per_iteration": 2.6674654483795166 }, { "auxiliary_loss_clip": 0.06414271, "auxiliary_loss_mlp": 0.01267377, "balance_loss_clip": 0.06273323, "balance_loss_mlp": 0.01257459, "epoch": 0.7935066887118594, "flos": 31440854851200.0, "grad_norm": 1.5811943660892755, "language_loss": 0.64955044, "learning_rate": 4.30750506215646e-07, "loss": 0.72636688, "num_input_tokens_seen": 284707300, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09924316, "step": 13198, "time_per_iteration": 2.6321136951446533 }, { "auxiliary_loss_clip": 0.06420058, "auxiliary_loss_mlp": 0.01266486, "balance_loss_clip": 0.06276009, "balance_loss_mlp": 0.01256151, "epoch": 0.7935668119645273, "flos": 14687638162560.0, "grad_norm": 1.9955721242327675, "language_loss": 0.72287112, "learning_rate": 4.30509081032864e-07, "loss": 0.79973656, "num_input_tokens_seen": 284723545, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.10333252, "step": 13199, "time_per_iteration": 2.5263876914978027 }, { "auxiliary_loss_clip": 0.06416456, "auxiliary_loss_mlp": 0.0126488, "balance_loss_clip": 0.06274011, "balance_loss_mlp": 0.01255177, "epoch": 0.7936269352171953, "flos": 18010061205120.0, "grad_norm": 1.793299687391567, "language_loss": 0.80919796, "learning_rate": 4.302677153653349e-07, "loss": 0.88601136, "num_input_tokens_seen": 284742650, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09692383, "step": 13200, "time_per_iteration": 2.5387237071990967 }, { "auxiliary_loss_clip": 0.06410532, "auxiliary_loss_mlp": 0.01265879, "balance_loss_clip": 0.06275015, "balance_loss_mlp": 0.01256164, "epoch": 0.7936870584698632, "flos": 18886228623360.0, "grad_norm": 1.644776495331187, "language_loss": 0.77799487, "learning_rate": 4.3002640922221077e-07, "loss": 0.85475898, "num_input_tokens_seen": 284760955, "router_z_loss_clip": 1.35546875, "router_z_loss_mlp": 0.09716797, "step": 13201, "time_per_iteration": 2.5264322757720947 }, { "auxiliary_loss_clip": 0.06409697, "auxiliary_loss_mlp": 0.01268109, "balance_loss_clip": 0.06271484, "balance_loss_mlp": 0.01258162, "epoch": 0.7937471817225312, "flos": 23373604581120.0, "grad_norm": 1.5173077993954553, "language_loss": 0.67462331, "learning_rate": 4.2978516261264296e-07, "loss": 0.75140136, "num_input_tokens_seen": 284780745, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.0994873, "step": 13202, "time_per_iteration": 2.573446273803711 }, { "auxiliary_loss_clip": 0.06414606, "auxiliary_loss_mlp": 0.01268406, "balance_loss_clip": 0.06271978, "balance_loss_mlp": 0.01258118, "epoch": 0.7938073049751991, "flos": 22681025458560.0, "grad_norm": 2.3030021325305605, "language_loss": 0.75421274, "learning_rate": 4.2954397554577884e-07, "loss": 0.83104289, "num_input_tokens_seen": 284799000, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.10290527, "step": 13203, "time_per_iteration": 2.536807060241699 }, { "auxiliary_loss_clip": 0.06415709, "auxiliary_loss_mlp": 0.01262176, "balance_loss_clip": 0.06275079, "balance_loss_mlp": 0.01252592, "epoch": 0.7938674282278672, "flos": 22857150741120.0, "grad_norm": 1.6451940811494654, "language_loss": 0.66608739, "learning_rate": 4.293028480307643e-07, "loss": 0.74286622, "num_input_tokens_seen": 284817450, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.0958252, "step": 13204, "time_per_iteration": 2.5470566749572754 }, { "auxiliary_loss_clip": 0.0641336, "auxiliary_loss_mlp": 0.01264036, "balance_loss_clip": 0.06274214, "balance_loss_mlp": 0.01254887, "epoch": 0.7939275514805351, "flos": 27019208021760.0, "grad_norm": 1.420654085666604, "language_loss": 0.7940433, "learning_rate": 4.290617800767438e-07, "loss": 0.8708173, "num_input_tokens_seen": 284838865, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09143066, "step": 13205, "time_per_iteration": 2.58245587348938 }, { "auxiliary_loss_clip": 0.06409225, "auxiliary_loss_mlp": 0.01265696, "balance_loss_clip": 0.06271547, "balance_loss_mlp": 0.01255915, "epoch": 0.7939876747332031, "flos": 21149315222400.0, "grad_norm": 1.6074015491391704, "language_loss": 0.77923656, "learning_rate": 4.28820771692858e-07, "loss": 0.85598576, "num_input_tokens_seen": 284857975, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09771729, "step": 13206, "time_per_iteration": 2.5368165969848633 }, { "auxiliary_loss_clip": 0.06418513, "auxiliary_loss_mlp": 0.01265642, "balance_loss_clip": 0.0627553, "balance_loss_mlp": 0.01255253, "epoch": 0.794047797985871, "flos": 23294836143360.0, "grad_norm": 2.1373676525038063, "language_loss": 0.79312432, "learning_rate": 4.285798228882456e-07, "loss": 0.86996585, "num_input_tokens_seen": 284877145, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10394287, "step": 13207, "time_per_iteration": 4.004126310348511 }, { "auxiliary_loss_clip": 0.0641181, "auxiliary_loss_mlp": 0.01266848, "balance_loss_clip": 0.06272261, "balance_loss_mlp": 0.01256894, "epoch": 0.794107921238539, "flos": 24614978019840.0, "grad_norm": 1.5614762367813309, "language_loss": 0.84215629, "learning_rate": 4.2833893367204375e-07, "loss": 0.91894293, "num_input_tokens_seen": 284895560, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.0994873, "step": 13208, "time_per_iteration": 2.6020212173461914 }, { "auxiliary_loss_clip": 0.06314683, "auxiliary_loss_mlp": 0.01250794, "balance_loss_clip": 0.06258292, "balance_loss_mlp": 0.01249644, "epoch": 0.7941680444912069, "flos": 64114641077760.0, "grad_norm": 0.7260055247366026, "language_loss": 0.58389199, "learning_rate": 4.280981040533875e-07, "loss": 0.65954679, "num_input_tokens_seen": 284963135, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.0114975, "step": 13209, "time_per_iteration": 3.245102882385254 }, { "auxiliary_loss_clip": 0.06421498, "auxiliary_loss_mlp": 0.01265467, "balance_loss_clip": 0.06275339, "balance_loss_mlp": 0.01254827, "epoch": 0.794228167743875, "flos": 24395653157760.0, "grad_norm": 2.074426064983562, "language_loss": 0.63469493, "learning_rate": 4.2785733404140825e-07, "loss": 0.7115646, "num_input_tokens_seen": 284981755, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.10632324, "step": 13210, "time_per_iteration": 2.5726609230041504 }, { "auxiliary_loss_clip": 0.06413935, "auxiliary_loss_mlp": 0.01265945, "balance_loss_clip": 0.0627471, "balance_loss_mlp": 0.01256319, "epoch": 0.794288290996543, "flos": 28520129082240.0, "grad_norm": 2.4440381242019975, "language_loss": 0.69624764, "learning_rate": 4.2761662364523676e-07, "loss": 0.77304637, "num_input_tokens_seen": 285003060, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09631348, "step": 13211, "time_per_iteration": 2.68219256401062 }, { "auxiliary_loss_clip": 0.06421436, "auxiliary_loss_mlp": 0.01265709, "balance_loss_clip": 0.06276646, "balance_loss_mlp": 0.01254867, "epoch": 0.7943484142492109, "flos": 25929333964800.0, "grad_norm": 1.427174239918852, "language_loss": 0.72498465, "learning_rate": 4.2737597287400074e-07, "loss": 0.80185604, "num_input_tokens_seen": 285021640, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.1083374, "step": 13212, "time_per_iteration": 4.088605880737305 }, { "auxiliary_loss_clip": 0.06409656, "auxiliary_loss_mlp": 0.01265628, "balance_loss_clip": 0.06273778, "balance_loss_mlp": 0.0125605, "epoch": 0.7944085375018789, "flos": 23922147335040.0, "grad_norm": 1.6154293213310154, "language_loss": 0.80898607, "learning_rate": 4.271353817368246e-07, "loss": 0.88573897, "num_input_tokens_seen": 285040490, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.09576416, "step": 13213, "time_per_iteration": 2.576835870742798 }, { "auxiliary_loss_clip": 0.06421087, "auxiliary_loss_mlp": 0.01264662, "balance_loss_clip": 0.06276258, "balance_loss_mlp": 0.01254201, "epoch": 0.7944686607545468, "flos": 20236153426560.0, "grad_norm": 2.3456580222495784, "language_loss": 0.68036383, "learning_rate": 4.268948502428327e-07, "loss": 0.75722134, "num_input_tokens_seen": 285059270, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.10461426, "step": 13214, "time_per_iteration": 2.5455527305603027 }, { "auxiliary_loss_clip": 0.06411916, "auxiliary_loss_mlp": 0.01263659, "balance_loss_clip": 0.06275252, "balance_loss_mlp": 0.01253901, "epoch": 0.7945287840072148, "flos": 21987440087040.0, "grad_norm": 1.826108866309459, "language_loss": 0.72568226, "learning_rate": 4.2665437840114535e-07, "loss": 0.80243796, "num_input_tokens_seen": 285075390, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09759521, "step": 13215, "time_per_iteration": 2.559706687927246 }, { "auxiliary_loss_clip": 0.06409609, "auxiliary_loss_mlp": 0.01266014, "balance_loss_clip": 0.06273623, "balance_loss_mlp": 0.01255959, "epoch": 0.7945889072598827, "flos": 26405229628800.0, "grad_norm": 3.1083916702964625, "language_loss": 0.79362082, "learning_rate": 4.2641396622088253e-07, "loss": 0.870377, "num_input_tokens_seen": 285096290, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.10058594, "step": 13216, "time_per_iteration": 2.5806121826171875 }, { "auxiliary_loss_clip": 0.06416772, "auxiliary_loss_mlp": 0.01265319, "balance_loss_clip": 0.06274283, "balance_loss_mlp": 0.01254769, "epoch": 0.7946490305125508, "flos": 25817051093760.0, "grad_norm": 1.5994831114621415, "language_loss": 0.7413733, "learning_rate": 4.261736137111598e-07, "loss": 0.81819427, "num_input_tokens_seen": 285116020, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.10552979, "step": 13217, "time_per_iteration": 2.593871831893921 }, { "auxiliary_loss_clip": 0.06413839, "auxiliary_loss_mlp": 0.01264457, "balance_loss_clip": 0.06278086, "balance_loss_mlp": 0.01254963, "epoch": 0.7947091537652187, "flos": 15966425249280.0, "grad_norm": 1.863015328570721, "language_loss": 0.74124444, "learning_rate": 4.259333208810907e-07, "loss": 0.81802744, "num_input_tokens_seen": 285133510, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.09494019, "step": 13218, "time_per_iteration": 2.5316238403320312 }, { "auxiliary_loss_clip": 0.0642032, "auxiliary_loss_mlp": 0.01265231, "balance_loss_clip": 0.06275372, "balance_loss_mlp": 0.01255599, "epoch": 0.7947692770178867, "flos": 18593753546880.0, "grad_norm": 1.8970195324000103, "language_loss": 0.83798146, "learning_rate": 4.2569308773978817e-07, "loss": 0.914837, "num_input_tokens_seen": 285151690, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.09637451, "step": 13219, "time_per_iteration": 2.521436929702759 }, { "auxiliary_loss_clip": 0.06421633, "auxiliary_loss_mlp": 0.01266372, "balance_loss_clip": 0.06275839, "balance_loss_mlp": 0.0125646, "epoch": 0.7948294002705546, "flos": 20447344442880.0, "grad_norm": 2.0810888926489106, "language_loss": 0.75734091, "learning_rate": 4.2545291429636123e-07, "loss": 0.83422095, "num_input_tokens_seen": 285170485, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.09912109, "step": 13220, "time_per_iteration": 2.5439741611480713 }, { "auxiliary_loss_clip": 0.06423405, "auxiliary_loss_mlp": 0.01262422, "balance_loss_clip": 0.06278923, "balance_loss_mlp": 0.01252021, "epoch": 0.7948895235232226, "flos": 38190436940160.0, "grad_norm": 1.8217580387788843, "language_loss": 0.72486866, "learning_rate": 4.252128005599176e-07, "loss": 0.80172694, "num_input_tokens_seen": 285191050, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10394287, "step": 13221, "time_per_iteration": 2.691411018371582 }, { "auxiliary_loss_clip": 0.06413496, "auxiliary_loss_mlp": 0.01265297, "balance_loss_clip": 0.06275424, "balance_loss_mlp": 0.01255766, "epoch": 0.7949496467758905, "flos": 15565231100160.0, "grad_norm": 1.8675966767630692, "language_loss": 0.7461189, "learning_rate": 4.249727465395634e-07, "loss": 0.82290685, "num_input_tokens_seen": 285208750, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09533691, "step": 13222, "time_per_iteration": 3.9623191356658936 }, { "auxiliary_loss_clip": 0.06314003, "auxiliary_loss_mlp": 0.01251403, "balance_loss_clip": 0.06257971, "balance_loss_mlp": 0.01250353, "epoch": 0.7950097700285585, "flos": 70915864809600.0, "grad_norm": 0.7515826182307743, "language_loss": 0.66343117, "learning_rate": 4.247327522443993e-07, "loss": 0.7390852, "num_input_tokens_seen": 285264605, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01051331, "step": 13223, "time_per_iteration": 3.011457920074463 }, { "auxiliary_loss_clip": 0.06414515, "auxiliary_loss_mlp": 0.01266393, "balance_loss_clip": 0.06273128, "balance_loss_mlp": 0.01255551, "epoch": 0.7950698932812266, "flos": 23958470880000.0, "grad_norm": 1.6599840118817104, "language_loss": 0.71101767, "learning_rate": 4.2449281768352717e-07, "loss": 0.78782672, "num_input_tokens_seen": 285283940, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.10839844, "step": 13224, "time_per_iteration": 2.567164897918701 }, { "auxiliary_loss_clip": 0.063153, "auxiliary_loss_mlp": 0.012492, "balance_loss_clip": 0.06259164, "balance_loss_mlp": 0.01248115, "epoch": 0.7951300165338945, "flos": 60300096606720.0, "grad_norm": 0.6607898309284022, "language_loss": 0.55024403, "learning_rate": 4.2425294286604527e-07, "loss": 0.62588906, "num_input_tokens_seen": 285349525, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01086426, "step": 13225, "time_per_iteration": 3.2122647762298584 }, { "auxiliary_loss_clip": 0.0641389, "auxiliary_loss_mlp": 0.01262495, "balance_loss_clip": 0.06277726, "balance_loss_mlp": 0.01253358, "epoch": 0.7951901397865625, "flos": 22825397243520.0, "grad_norm": 2.2916793423934414, "language_loss": 0.64803672, "learning_rate": 4.2401312780105034e-07, "loss": 0.72480059, "num_input_tokens_seen": 285367355, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.09136963, "step": 13226, "time_per_iteration": 2.5374915599823 }, { "auxiliary_loss_clip": 0.06418458, "auxiliary_loss_mlp": 0.01266908, "balance_loss_clip": 0.06276971, "balance_loss_mlp": 0.01256912, "epoch": 0.7952502630392304, "flos": 35703748920960.0, "grad_norm": 4.683866596375493, "language_loss": 0.70584846, "learning_rate": 4.237733724976349e-07, "loss": 0.78270209, "num_input_tokens_seen": 285386190, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09991455, "step": 13227, "time_per_iteration": 2.665520668029785 }, { "auxiliary_loss_clip": 0.06409013, "auxiliary_loss_mlp": 0.01263079, "balance_loss_clip": 0.06271786, "balance_loss_mlp": 0.01254443, "epoch": 0.7953103862918984, "flos": 25636942742400.0, "grad_norm": 1.7639587138694508, "language_loss": 0.69052511, "learning_rate": 4.2353367696489184e-07, "loss": 0.76724601, "num_input_tokens_seen": 285406150, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.08630371, "step": 13228, "time_per_iteration": 4.060528039932251 }, { "auxiliary_loss_clip": 0.06413243, "auxiliary_loss_mlp": 0.01269277, "balance_loss_clip": 0.06271301, "balance_loss_mlp": 0.01259246, "epoch": 0.7953705095445663, "flos": 40561487925120.0, "grad_norm": 1.3064048606706924, "language_loss": 0.7100448, "learning_rate": 4.232940412119095e-07, "loss": 0.78687, "num_input_tokens_seen": 285429900, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.10028076, "step": 13229, "time_per_iteration": 2.792799949645996 }, { "auxiliary_loss_clip": 0.06425332, "auxiliary_loss_mlp": 0.01267852, "balance_loss_clip": 0.06279305, "balance_loss_mlp": 0.01257475, "epoch": 0.7954306327972344, "flos": 27644129372160.0, "grad_norm": 1.8339958822415168, "language_loss": 0.71682924, "learning_rate": 4.2305446524777457e-07, "loss": 0.79376107, "num_input_tokens_seen": 285452555, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.1038208, "step": 13230, "time_per_iteration": 2.615058422088623 }, { "auxiliary_loss_clip": 0.06318744, "auxiliary_loss_mlp": 0.01249731, "balance_loss_clip": 0.06262751, "balance_loss_mlp": 0.0124859, "epoch": 0.7954907560499023, "flos": 59525505936000.0, "grad_norm": 0.8718642767614394, "language_loss": 0.63559473, "learning_rate": 4.2281494908157247e-07, "loss": 0.71127945, "num_input_tokens_seen": 285515700, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01141357, "step": 13231, "time_per_iteration": 3.2043724060058594 }, { "auxiliary_loss_clip": 0.06415884, "auxiliary_loss_mlp": 0.01265053, "balance_loss_clip": 0.06276517, "balance_loss_mlp": 0.01255171, "epoch": 0.7955508793025703, "flos": 20126721594240.0, "grad_norm": 1.6095822879487036, "language_loss": 0.70113391, "learning_rate": 4.2257549272238566e-07, "loss": 0.77794331, "num_input_tokens_seen": 285533910, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09887695, "step": 13232, "time_per_iteration": 2.5460658073425293 }, { "auxiliary_loss_clip": 0.06411354, "auxiliary_loss_mlp": 0.01263686, "balance_loss_clip": 0.0627164, "balance_loss_mlp": 0.01254036, "epoch": 0.7956110025552382, "flos": 26512607036160.0, "grad_norm": 2.135222772063426, "language_loss": 0.78236032, "learning_rate": 4.223360961792952e-07, "loss": 0.85911071, "num_input_tokens_seen": 285554080, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09649658, "step": 13233, "time_per_iteration": 2.581739664077759 }, { "auxiliary_loss_clip": 0.0641702, "auxiliary_loss_mlp": 0.01267611, "balance_loss_clip": 0.06274939, "balance_loss_mlp": 0.01257574, "epoch": 0.7956711258079062, "flos": 22572138677760.0, "grad_norm": 1.9943462615287124, "language_loss": 0.78972292, "learning_rate": 4.220967594613769e-07, "loss": 0.86656916, "num_input_tokens_seen": 285572325, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10040283, "step": 13234, "time_per_iteration": 2.5505077838897705 }, { "auxiliary_loss_clip": 0.06417312, "auxiliary_loss_mlp": 0.01264875, "balance_loss_clip": 0.06277791, "balance_loss_mlp": 0.01256143, "epoch": 0.7957312490605741, "flos": 17383882043520.0, "grad_norm": 1.5693687710201463, "language_loss": 0.70301449, "learning_rate": 4.218574825777077e-07, "loss": 0.77983636, "num_input_tokens_seen": 285589770, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.08728027, "step": 13235, "time_per_iteration": 2.623070478439331 }, { "auxiliary_loss_clip": 0.06416941, "auxiliary_loss_mlp": 0.0126751, "balance_loss_clip": 0.06274806, "balance_loss_mlp": 0.01256799, "epoch": 0.7957913723132422, "flos": 22497898360320.0, "grad_norm": 1.538270287201797, "language_loss": 0.68187642, "learning_rate": 4.2161826553736145e-07, "loss": 0.75872093, "num_input_tokens_seen": 285610065, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10723877, "step": 13236, "time_per_iteration": 2.6179306507110596 }, { "auxiliary_loss_clip": 0.06415348, "auxiliary_loss_mlp": 0.01264616, "balance_loss_clip": 0.06277195, "balance_loss_mlp": 0.01255568, "epoch": 0.7958514955659101, "flos": 22644701913600.0, "grad_norm": 1.6966074288850181, "language_loss": 0.75491846, "learning_rate": 4.2137910834940826e-07, "loss": 0.83171809, "num_input_tokens_seen": 285628480, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09051514, "step": 13237, "time_per_iteration": 2.55220103263855 }, { "auxiliary_loss_clip": 0.06420124, "auxiliary_loss_mlp": 0.01268922, "balance_loss_clip": 0.06278526, "balance_loss_mlp": 0.01257985, "epoch": 0.7959116188185781, "flos": 20710497790080.0, "grad_norm": 1.8973792182341316, "language_loss": 0.71617031, "learning_rate": 4.211400110229175e-07, "loss": 0.79306078, "num_input_tokens_seen": 285647805, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.10943604, "step": 13238, "time_per_iteration": 2.519747495651245 }, { "auxiliary_loss_clip": 0.06415199, "auxiliary_loss_mlp": 0.01265301, "balance_loss_clip": 0.06273183, "balance_loss_mlp": 0.01256152, "epoch": 0.7959717420712461, "flos": 19030474627200.0, "grad_norm": 1.6269512860260669, "language_loss": 0.74101985, "learning_rate": 4.2090097356695684e-07, "loss": 0.81782484, "num_input_tokens_seen": 285665505, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.0914917, "step": 13239, "time_per_iteration": 2.5568745136260986 }, { "auxiliary_loss_clip": 0.06420609, "auxiliary_loss_mlp": 0.01264357, "balance_loss_clip": 0.06277365, "balance_loss_mlp": 0.01254409, "epoch": 0.796031865323914, "flos": 26363371714560.0, "grad_norm": 2.05918393561227, "language_loss": 0.69621503, "learning_rate": 4.2066199599058814e-07, "loss": 0.77306467, "num_input_tokens_seen": 285685855, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09954834, "step": 13240, "time_per_iteration": 2.5876858234405518 }, { "auxiliary_loss_clip": 0.06317154, "auxiliary_loss_mlp": 0.01251677, "balance_loss_clip": 0.06261192, "balance_loss_mlp": 0.01250675, "epoch": 0.796091988576582, "flos": 62087119833600.0, "grad_norm": 0.9245913171172362, "language_loss": 0.5853048, "learning_rate": 4.2042307830287526e-07, "loss": 0.6609931, "num_input_tokens_seen": 285735710, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.0100174, "step": 13241, "time_per_iteration": 2.956055164337158 }, { "auxiliary_loss_clip": 0.06416236, "auxiliary_loss_mlp": 0.0127109, "balance_loss_clip": 0.06275113, "balance_loss_mlp": 0.01261684, "epoch": 0.7961521118292499, "flos": 39029442272640.0, "grad_norm": 1.7639095492141466, "language_loss": 0.64766693, "learning_rate": 4.201842205128772e-07, "loss": 0.72454023, "num_input_tokens_seen": 285757045, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09405518, "step": 13242, "time_per_iteration": 2.740760326385498 }, { "auxiliary_loss_clip": 0.06413794, "auxiliary_loss_mlp": 0.0127222, "balance_loss_clip": 0.06272281, "balance_loss_mlp": 0.01261503, "epoch": 0.796212235081918, "flos": 21769373036160.0, "grad_norm": 1.8828262892482852, "language_loss": 0.76070321, "learning_rate": 4.199454226296526e-07, "loss": 0.83756334, "num_input_tokens_seen": 285776050, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.1071167, "step": 13243, "time_per_iteration": 2.5628745555877686 }, { "auxiliary_loss_clip": 0.06415635, "auxiliary_loss_mlp": 0.01266849, "balance_loss_clip": 0.06272706, "balance_loss_mlp": 0.01256389, "epoch": 0.7962723583345859, "flos": 21185261424000.0, "grad_norm": 2.1588861794607266, "language_loss": 0.79441929, "learning_rate": 4.1970668466225565e-07, "loss": 0.87124407, "num_input_tokens_seen": 285796830, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.10461426, "step": 13244, "time_per_iteration": 2.609738826751709 }, { "auxiliary_loss_clip": 0.06422594, "auxiliary_loss_mlp": 0.01266726, "balance_loss_clip": 0.06277342, "balance_loss_mlp": 0.01256456, "epoch": 0.7963324815872539, "flos": 17134313057280.0, "grad_norm": 2.163395314193167, "language_loss": 0.68876314, "learning_rate": 4.1946800661973934e-07, "loss": 0.76565629, "num_input_tokens_seen": 285814755, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.10272217, "step": 13245, "time_per_iteration": 2.517948865890503 }, { "auxiliary_loss_clip": 0.06417422, "auxiliary_loss_mlp": 0.01267107, "balance_loss_clip": 0.06275916, "balance_loss_mlp": 0.01256849, "epoch": 0.7963926048399218, "flos": 21403873526400.0, "grad_norm": 1.554745925150313, "language_loss": 0.79119444, "learning_rate": 4.192293885111549e-07, "loss": 0.86803973, "num_input_tokens_seen": 285834255, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.10266113, "step": 13246, "time_per_iteration": 2.54606294631958 }, { "auxiliary_loss_clip": 0.06417271, "auxiliary_loss_mlp": 0.01265444, "balance_loss_clip": 0.06273053, "balance_loss_mlp": 0.01254596, "epoch": 0.7964527280925898, "flos": 25189907610240.0, "grad_norm": 1.6483499713715595, "language_loss": 0.66402888, "learning_rate": 4.1899083034555007e-07, "loss": 0.74085611, "num_input_tokens_seen": 285853540, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10852051, "step": 13247, "time_per_iteration": 3.9820635318756104 }, { "auxiliary_loss_clip": 0.06412058, "auxiliary_loss_mlp": 0.01262859, "balance_loss_clip": 0.06274125, "balance_loss_mlp": 0.01254265, "epoch": 0.7965128513452577, "flos": 27023149163520.0, "grad_norm": 2.0187673405340254, "language_loss": 0.71503377, "learning_rate": 4.1875233213197123e-07, "loss": 0.79178286, "num_input_tokens_seen": 285872705, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.0859375, "step": 13248, "time_per_iteration": 2.5823171138763428 }, { "auxiliary_loss_clip": 0.06416875, "auxiliary_loss_mlp": 0.01266289, "balance_loss_clip": 0.06273143, "balance_loss_mlp": 0.01256103, "epoch": 0.7965729745979258, "flos": 24425436084480.0, "grad_norm": 1.9897094250813998, "language_loss": 0.76275218, "learning_rate": 4.1851389387946255e-07, "loss": 0.83958387, "num_input_tokens_seen": 285890290, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10180664, "step": 13249, "time_per_iteration": 2.564666509628296 }, { "auxiliary_loss_clip": 0.06410915, "auxiliary_loss_mlp": 0.01262203, "balance_loss_clip": 0.06272754, "balance_loss_mlp": 0.01253054, "epoch": 0.7966330978505937, "flos": 18845838155520.0, "grad_norm": 2.78478446020036, "language_loss": 0.62314677, "learning_rate": 4.1827551559706674e-07, "loss": 0.69987798, "num_input_tokens_seen": 285909190, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.0914917, "step": 13250, "time_per_iteration": 2.528268575668335 }, { "auxiliary_loss_clip": 0.06412859, "auxiliary_loss_mlp": 0.01263862, "balance_loss_clip": 0.06273773, "balance_loss_mlp": 0.01253568, "epoch": 0.7966932211032617, "flos": 13157437299840.0, "grad_norm": 2.1941574377318527, "language_loss": 0.72556263, "learning_rate": 4.180371972938206e-07, "loss": 0.80232984, "num_input_tokens_seen": 285927570, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.10296631, "step": 13251, "time_per_iteration": 3.980060338973999 }, { "auxiliary_loss_clip": 0.06425986, "auxiliary_loss_mlp": 0.0126563, "balance_loss_clip": 0.06280573, "balance_loss_mlp": 0.0125437, "epoch": 0.7967533443559297, "flos": 23956290673920.0, "grad_norm": 2.3488182622803366, "language_loss": 0.7300604, "learning_rate": 4.177989389787624e-07, "loss": 0.80697656, "num_input_tokens_seen": 285945810, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.11254883, "step": 13252, "time_per_iteration": 2.5606133937835693 }, { "auxiliary_loss_clip": 0.0641012, "auxiliary_loss_mlp": 0.0126595, "balance_loss_clip": 0.06274147, "balance_loss_mlp": 0.01255793, "epoch": 0.7968134676085976, "flos": 30375984038400.0, "grad_norm": 2.8301883339987493, "language_loss": 0.66982353, "learning_rate": 4.175607406609278e-07, "loss": 0.74658418, "num_input_tokens_seen": 285964235, "router_z_loss_clip": 1.35742188, "router_z_loss_mlp": 0.10153198, "step": 13253, "time_per_iteration": 2.618215322494507 }, { "auxiliary_loss_clip": 0.06417719, "auxiliary_loss_mlp": 0.01266506, "balance_loss_clip": 0.06275842, "balance_loss_mlp": 0.01256212, "epoch": 0.7968735908612656, "flos": 23081590702080.0, "grad_norm": 1.418889041497863, "language_loss": 0.67853582, "learning_rate": 4.1732260234934767e-07, "loss": 0.75537813, "num_input_tokens_seen": 285983710, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.10302734, "step": 13254, "time_per_iteration": 2.5496389865875244 }, { "auxiliary_loss_clip": 0.06415579, "auxiliary_loss_mlp": 0.01266602, "balance_loss_clip": 0.06275044, "balance_loss_mlp": 0.01257102, "epoch": 0.7969337141139335, "flos": 23588275541760.0, "grad_norm": 1.6929906510922037, "language_loss": 0.69277149, "learning_rate": 4.1708452405305314e-07, "loss": 0.76959336, "num_input_tokens_seen": 286003425, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09490967, "step": 13255, "time_per_iteration": 2.57954740524292 }, { "auxiliary_loss_clip": 0.06413844, "auxiliary_loss_mlp": 0.01266968, "balance_loss_clip": 0.06274697, "balance_loss_mlp": 0.01257378, "epoch": 0.7969938373666016, "flos": 19762018698240.0, "grad_norm": 1.8159491248734188, "language_loss": 0.79063344, "learning_rate": 4.168465057810733e-07, "loss": 0.86744153, "num_input_tokens_seen": 286020130, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09588623, "step": 13256, "time_per_iteration": 2.526076555252075 }, { "auxiliary_loss_clip": 0.06418223, "auxiliary_loss_mlp": 0.01266248, "balance_loss_clip": 0.06275204, "balance_loss_mlp": 0.01255925, "epoch": 0.7970539606192695, "flos": 24140969072640.0, "grad_norm": 1.754157967119615, "language_loss": 0.65866148, "learning_rate": 4.166085475424315e-07, "loss": 0.73550624, "num_input_tokens_seen": 286040230, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10321045, "step": 13257, "time_per_iteration": 2.586456060409546 }, { "auxiliary_loss_clip": 0.0642457, "auxiliary_loss_mlp": 0.01267033, "balance_loss_clip": 0.0627746, "balance_loss_mlp": 0.01257055, "epoch": 0.7971140838719375, "flos": 17974576200960.0, "grad_norm": 1.7296969834440872, "language_loss": 0.71998715, "learning_rate": 4.163706493461523e-07, "loss": 0.79690319, "num_input_tokens_seen": 286059475, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.09979248, "step": 13258, "time_per_iteration": 2.661987781524658 }, { "auxiliary_loss_clip": 0.06417264, "auxiliary_loss_mlp": 0.01265439, "balance_loss_clip": 0.06274439, "balance_loss_mlp": 0.01254937, "epoch": 0.7971742071246054, "flos": 19175181828480.0, "grad_norm": 1.9670321658244336, "language_loss": 0.69228983, "learning_rate": 4.1613281120125655e-07, "loss": 0.76911682, "num_input_tokens_seen": 286077820, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.1050415, "step": 13259, "time_per_iteration": 2.5345304012298584 }, { "auxiliary_loss_clip": 0.06410424, "auxiliary_loss_mlp": 0.01267089, "balance_loss_clip": 0.06273405, "balance_loss_mlp": 0.01257922, "epoch": 0.7972343303772734, "flos": 27133335682560.0, "grad_norm": 1.5994833256836576, "language_loss": 0.74014592, "learning_rate": 4.158950331167641e-07, "loss": 0.816921, "num_input_tokens_seen": 286097285, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.0916748, "step": 13260, "time_per_iteration": 2.5887205600738525 }, { "auxiliary_loss_clip": 0.06411608, "auxiliary_loss_mlp": 0.01263478, "balance_loss_clip": 0.06273845, "balance_loss_mlp": 0.01254072, "epoch": 0.7972944536299413, "flos": 21003056720640.0, "grad_norm": 1.6851793072247472, "language_loss": 0.78652507, "learning_rate": 4.1565731510169065e-07, "loss": 0.86327595, "num_input_tokens_seen": 286116000, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09411621, "step": 13261, "time_per_iteration": 2.5920281410217285 }, { "auxiliary_loss_clip": 0.06411314, "auxiliary_loss_mlp": 0.01262171, "balance_loss_clip": 0.06276833, "balance_loss_mlp": 0.01253785, "epoch": 0.7973545768826094, "flos": 21586455573120.0, "grad_norm": 1.3856919756199435, "language_loss": 0.75884318, "learning_rate": 4.154196571650501e-07, "loss": 0.83557796, "num_input_tokens_seen": 286135110, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.08380127, "step": 13262, "time_per_iteration": 4.1162426471710205 }, { "auxiliary_loss_clip": 0.06422109, "auxiliary_loss_mlp": 0.01269517, "balance_loss_clip": 0.06276758, "balance_loss_mlp": 0.01258383, "epoch": 0.7974147001352773, "flos": 20564826266880.0, "grad_norm": 2.8767339734714032, "language_loss": 0.7110436, "learning_rate": 4.1518205931585524e-07, "loss": 0.78795981, "num_input_tokens_seen": 286152835, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.11132812, "step": 13263, "time_per_iteration": 2.5608344078063965 }, { "auxiliary_loss_clip": 0.06426992, "auxiliary_loss_mlp": 0.01265795, "balance_loss_clip": 0.06279416, "balance_loss_mlp": 0.01255173, "epoch": 0.7974748233879453, "flos": 21003224428800.0, "grad_norm": 1.7480982326802104, "language_loss": 0.71285588, "learning_rate": 4.149445215631153e-07, "loss": 0.78978372, "num_input_tokens_seen": 286171785, "router_z_loss_clip": 1.47558594, "router_z_loss_mlp": 0.10632324, "step": 13264, "time_per_iteration": 2.5726964473724365 }, { "auxiliary_loss_clip": 0.06410147, "auxiliary_loss_mlp": 0.01266843, "balance_loss_clip": 0.0627343, "balance_loss_mlp": 0.01258462, "epoch": 0.7975349466406133, "flos": 22571803261440.0, "grad_norm": 1.5955829925636422, "language_loss": 0.76958805, "learning_rate": 4.1470704391583776e-07, "loss": 0.84635794, "num_input_tokens_seen": 286190420, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.08392334, "step": 13265, "time_per_iteration": 2.545189619064331 }, { "auxiliary_loss_clip": 0.06419956, "auxiliary_loss_mlp": 0.01271307, "balance_loss_clip": 0.06277093, "balance_loss_mlp": 0.0126152, "epoch": 0.7975950698932812, "flos": 21696013186560.0, "grad_norm": 1.7206436790575463, "language_loss": 0.75802213, "learning_rate": 4.144696263830285e-07, "loss": 0.83493471, "num_input_tokens_seen": 286210105, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.09783936, "step": 13266, "time_per_iteration": 2.5474648475646973 }, { "auxiliary_loss_clip": 0.06411786, "auxiliary_loss_mlp": 0.01267629, "balance_loss_clip": 0.06272162, "balance_loss_mlp": 0.0125842, "epoch": 0.7976551931459492, "flos": 19609806556800.0, "grad_norm": 1.5287355977514594, "language_loss": 0.84438032, "learning_rate": 4.1423226897369015e-07, "loss": 0.92117447, "num_input_tokens_seen": 286228180, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09210205, "step": 13267, "time_per_iteration": 2.53594970703125 }, { "auxiliary_loss_clip": 0.06414928, "auxiliary_loss_mlp": 0.01265536, "balance_loss_clip": 0.06275354, "balance_loss_mlp": 0.01256071, "epoch": 0.7977153163986171, "flos": 21693749126400.0, "grad_norm": 1.45458896148967, "language_loss": 0.76002157, "learning_rate": 4.139949716968223e-07, "loss": 0.83682621, "num_input_tokens_seen": 286247305, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09472656, "step": 13268, "time_per_iteration": 3.926762580871582 }, { "auxiliary_loss_clip": 0.06416744, "auxiliary_loss_mlp": 0.01267583, "balance_loss_clip": 0.06277388, "balance_loss_mlp": 0.01257993, "epoch": 0.7977754396512852, "flos": 23483455683840.0, "grad_norm": 1.6129339317634632, "language_loss": 0.78218502, "learning_rate": 4.1375773456142403e-07, "loss": 0.85902828, "num_input_tokens_seen": 286268145, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09594727, "step": 13269, "time_per_iteration": 2.5880627632141113 }, { "auxiliary_loss_clip": 0.06411636, "auxiliary_loss_mlp": 0.01262609, "balance_loss_clip": 0.06274761, "balance_loss_mlp": 0.01253919, "epoch": 0.7978355629039531, "flos": 22388718090240.0, "grad_norm": 2.4712464177974347, "language_loss": 0.82096499, "learning_rate": 4.135205575764922e-07, "loss": 0.8977074, "num_input_tokens_seen": 286286775, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.08685303, "step": 13270, "time_per_iteration": 2.539257764816284 }, { "auxiliary_loss_clip": 0.06414872, "auxiliary_loss_mlp": 0.01265566, "balance_loss_clip": 0.06274498, "balance_loss_mlp": 0.01256036, "epoch": 0.7978956861566211, "flos": 20272518898560.0, "grad_norm": 25.19021121998038, "language_loss": 0.5997265, "learning_rate": 4.1328344075101905e-07, "loss": 0.67653084, "num_input_tokens_seen": 286305590, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09539795, "step": 13271, "time_per_iteration": 2.5342419147491455 }, { "auxiliary_loss_clip": 0.06423867, "auxiliary_loss_mlp": 0.01266211, "balance_loss_clip": 0.06278292, "balance_loss_mlp": 0.01256001, "epoch": 0.797955809409289, "flos": 28120192744320.0, "grad_norm": 1.4683819979141193, "language_loss": 0.73498553, "learning_rate": 4.130463840939975e-07, "loss": 0.81188631, "num_input_tokens_seen": 286328050, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10223389, "step": 13272, "time_per_iteration": 2.615485429763794 }, { "auxiliary_loss_clip": 0.06413274, "auxiliary_loss_mlp": 0.01266958, "balance_loss_clip": 0.06274557, "balance_loss_mlp": 0.01257099, "epoch": 0.798015932661957, "flos": 15564979537920.0, "grad_norm": 1.7740863750428155, "language_loss": 0.72163093, "learning_rate": 4.128093876144161e-07, "loss": 0.7984333, "num_input_tokens_seen": 286345265, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09863281, "step": 13273, "time_per_iteration": 2.5239269733428955 }, { "auxiliary_loss_clip": 0.06419755, "auxiliary_loss_mlp": 0.01263949, "balance_loss_clip": 0.06275915, "balance_loss_mlp": 0.01254138, "epoch": 0.7980760559146249, "flos": 23957967755520.0, "grad_norm": 1.63154495209981, "language_loss": 0.76248777, "learning_rate": 4.1257245132126117e-07, "loss": 0.83932471, "num_input_tokens_seen": 286364465, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.0980835, "step": 13274, "time_per_iteration": 2.5748164653778076 }, { "auxiliary_loss_clip": 0.06406562, "auxiliary_loss_mlp": 0.0126446, "balance_loss_clip": 0.06271529, "balance_loss_mlp": 0.01255734, "epoch": 0.798136179167293, "flos": 28045617010560.0, "grad_norm": 3.462920150089584, "language_loss": 0.78102189, "learning_rate": 4.12335575223518e-07, "loss": 0.85773218, "num_input_tokens_seen": 286385565, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.08721924, "step": 13275, "time_per_iteration": 2.608133554458618 }, { "auxiliary_loss_clip": 0.06420787, "auxiliary_loss_mlp": 0.01264337, "balance_loss_clip": 0.06276625, "balance_loss_mlp": 0.01254091, "epoch": 0.7981963024199609, "flos": 35992157074560.0, "grad_norm": 1.7452665555765092, "language_loss": 0.6385169, "learning_rate": 4.1209875933016877e-07, "loss": 0.71536815, "num_input_tokens_seen": 286403950, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10247803, "step": 13276, "time_per_iteration": 2.6600425243377686 }, { "auxiliary_loss_clip": 0.06414236, "auxiliary_loss_mlp": 0.01266507, "balance_loss_clip": 0.06277701, "balance_loss_mlp": 0.01257339, "epoch": 0.7982564256726289, "flos": 25892004170880.0, "grad_norm": 1.6335711718029329, "language_loss": 0.6115467, "learning_rate": 4.118620036501945e-07, "loss": 0.68835413, "num_input_tokens_seen": 286426160, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.0916748, "step": 13277, "time_per_iteration": 2.6499476432800293 }, { "auxiliary_loss_clip": 0.06422823, "auxiliary_loss_mlp": 0.01268049, "balance_loss_clip": 0.06279692, "balance_loss_mlp": 0.01258084, "epoch": 0.7983165489252969, "flos": 25746248793600.0, "grad_norm": 2.550308654616045, "language_loss": 0.79879236, "learning_rate": 4.1162530819257227e-07, "loss": 0.87570107, "num_input_tokens_seen": 286446610, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.09960938, "step": 13278, "time_per_iteration": 2.6049559116363525 }, { "auxiliary_loss_clip": 0.06417488, "auxiliary_loss_mlp": 0.01267085, "balance_loss_clip": 0.06274822, "balance_loss_mlp": 0.01257405, "epoch": 0.7983766721779648, "flos": 21914667216000.0, "grad_norm": 1.8005456174932761, "language_loss": 0.64197141, "learning_rate": 4.113886729662768e-07, "loss": 0.71881711, "num_input_tokens_seen": 286465460, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09680176, "step": 13279, "time_per_iteration": 2.5946602821350098 }, { "auxiliary_loss_clip": 0.06404649, "auxiliary_loss_mlp": 0.01266795, "balance_loss_clip": 0.0627274, "balance_loss_mlp": 0.0125839, "epoch": 0.7984367954306328, "flos": 29354480513280.0, "grad_norm": 1.5208397033545482, "language_loss": 0.71146363, "learning_rate": 4.111520979802825e-07, "loss": 0.78817809, "num_input_tokens_seen": 286485720, "router_z_loss_clip": 1.31933594, "router_z_loss_mlp": 0.08404541, "step": 13280, "time_per_iteration": 2.5938382148742676 }, { "auxiliary_loss_clip": 0.06422485, "auxiliary_loss_mlp": 0.01266862, "balance_loss_clip": 0.06278776, "balance_loss_mlp": 0.0125698, "epoch": 0.7984969186833007, "flos": 31365775992960.0, "grad_norm": 1.5959310965014435, "language_loss": 0.63017362, "learning_rate": 4.1091558324355955e-07, "loss": 0.70706707, "num_input_tokens_seen": 286507465, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.09881592, "step": 13281, "time_per_iteration": 2.6600029468536377 }, { "auxiliary_loss_clip": 0.06418475, "auxiliary_loss_mlp": 0.01266462, "balance_loss_clip": 0.06272538, "balance_loss_mlp": 0.01255423, "epoch": 0.7985570419359688, "flos": 24319232634240.0, "grad_norm": 1.64552430222875, "language_loss": 0.8031249, "learning_rate": 4.1067912876507683e-07, "loss": 0.87997425, "num_input_tokens_seen": 286526345, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.1104126, "step": 13282, "time_per_iteration": 2.566059112548828 }, { "auxiliary_loss_clip": 0.06418109, "auxiliary_loss_mlp": 0.01264287, "balance_loss_clip": 0.06275439, "balance_loss_mlp": 0.01254613, "epoch": 0.7986171651886367, "flos": 15747687365760.0, "grad_norm": 2.1525361502881877, "language_loss": 0.71863675, "learning_rate": 4.10442734553802e-07, "loss": 0.7954607, "num_input_tokens_seen": 286544095, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.09667969, "step": 13283, "time_per_iteration": 2.533567190170288 }, { "auxiliary_loss_clip": 0.06413408, "auxiliary_loss_mlp": 0.01263685, "balance_loss_clip": 0.0627537, "balance_loss_mlp": 0.01254423, "epoch": 0.7986772884413047, "flos": 11624175763200.0, "grad_norm": 2.1358942948563264, "language_loss": 0.73835361, "learning_rate": 4.102064006186967e-07, "loss": 0.81512457, "num_input_tokens_seen": 286560960, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09259033, "step": 13284, "time_per_iteration": 2.6518402099609375 }, { "auxiliary_loss_clip": 0.06413867, "auxiliary_loss_mlp": 0.012639, "balance_loss_clip": 0.06275013, "balance_loss_mlp": 0.012549, "epoch": 0.7987374116939726, "flos": 22097626606080.0, "grad_norm": 1.7580295255936955, "language_loss": 0.70786923, "learning_rate": 4.0997012696872415e-07, "loss": 0.78464687, "num_input_tokens_seen": 286579865, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09008789, "step": 13285, "time_per_iteration": 2.546607732772827 }, { "auxiliary_loss_clip": 0.06412869, "auxiliary_loss_mlp": 0.01263269, "balance_loss_clip": 0.06271966, "balance_loss_mlp": 0.01253857, "epoch": 0.7987975349466406, "flos": 17895807763200.0, "grad_norm": 1.5471943717042433, "language_loss": 0.73739898, "learning_rate": 4.097339136128437e-07, "loss": 0.81416035, "num_input_tokens_seen": 286597295, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09411621, "step": 13286, "time_per_iteration": 2.5208446979522705 }, { "auxiliary_loss_clip": 0.06411818, "auxiliary_loss_mlp": 0.01263527, "balance_loss_clip": 0.06272654, "balance_loss_mlp": 0.01254002, "epoch": 0.7988576581993085, "flos": 19725359736960.0, "grad_norm": 2.0493902876067356, "language_loss": 0.75433433, "learning_rate": 4.0949776056001296e-07, "loss": 0.83108777, "num_input_tokens_seen": 286616270, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09521484, "step": 13287, "time_per_iteration": 3.9508326053619385 }, { "auxiliary_loss_clip": 0.06414275, "auxiliary_loss_mlp": 0.01264434, "balance_loss_clip": 0.06275059, "balance_loss_mlp": 0.01254886, "epoch": 0.7989177814519766, "flos": 28043604512640.0, "grad_norm": 1.6196674873094652, "language_loss": 0.62095529, "learning_rate": 4.092616678191863e-07, "loss": 0.69774234, "num_input_tokens_seen": 286638315, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09545898, "step": 13288, "time_per_iteration": 2.606692314147949 }, { "auxiliary_loss_clip": 0.0641285, "auxiliary_loss_mlp": 0.01264177, "balance_loss_clip": 0.06276561, "balance_loss_mlp": 0.01255517, "epoch": 0.7989779047046445, "flos": 28877662454400.0, "grad_norm": 1.947334126597935, "language_loss": 0.71100152, "learning_rate": 4.090256353993169e-07, "loss": 0.78777182, "num_input_tokens_seen": 286658630, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.08660889, "step": 13289, "time_per_iteration": 2.587331771850586 }, { "auxiliary_loss_clip": 0.06412485, "auxiliary_loss_mlp": 0.01262118, "balance_loss_clip": 0.06278778, "balance_loss_mlp": 0.01253005, "epoch": 0.7990380279573125, "flos": 18192769032960.0, "grad_norm": 2.145263805445607, "language_loss": 0.6284833, "learning_rate": 4.0878966330935506e-07, "loss": 0.70522928, "num_input_tokens_seen": 286676870, "router_z_loss_clip": 1.33496094, "router_z_loss_mlp": 0.09118652, "step": 13290, "time_per_iteration": 2.532341718673706 }, { "auxiliary_loss_clip": 0.06418005, "auxiliary_loss_mlp": 0.01265757, "balance_loss_clip": 0.06277287, "balance_loss_mlp": 0.01255606, "epoch": 0.7990981512099805, "flos": 20885113699200.0, "grad_norm": 1.8390765391396091, "language_loss": 0.7174412, "learning_rate": 4.08553751558248e-07, "loss": 0.79427874, "num_input_tokens_seen": 286694300, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.1015625, "step": 13291, "time_per_iteration": 4.063623428344727 }, { "auxiliary_loss_clip": 0.06411133, "auxiliary_loss_mlp": 0.0126416, "balance_loss_clip": 0.06274483, "balance_loss_mlp": 0.01255261, "epoch": 0.7991582744626484, "flos": 26106381642240.0, "grad_norm": 1.454813051522508, "language_loss": 0.63814366, "learning_rate": 4.083179001549422e-07, "loss": 0.71489656, "num_input_tokens_seen": 286714545, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.08892822, "step": 13292, "time_per_iteration": 2.5776901245117188 }, { "auxiliary_loss_clip": 0.06410509, "auxiliary_loss_mlp": 0.01265461, "balance_loss_clip": 0.06272531, "balance_loss_mlp": 0.01255984, "epoch": 0.7992183977153164, "flos": 35304106072320.0, "grad_norm": 1.575028647820667, "language_loss": 0.55752099, "learning_rate": 4.0808210910838105e-07, "loss": 0.63428062, "num_input_tokens_seen": 286734525, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09472656, "step": 13293, "time_per_iteration": 2.6695518493652344 }, { "auxiliary_loss_clip": 0.06416897, "auxiliary_loss_mlp": 0.01264321, "balance_loss_clip": 0.06278324, "balance_loss_mlp": 0.01255035, "epoch": 0.7992785209679844, "flos": 51863294632320.0, "grad_norm": 2.589589785387776, "language_loss": 0.71926206, "learning_rate": 4.0784637842750704e-07, "loss": 0.79607421, "num_input_tokens_seen": 286753430, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09283447, "step": 13294, "time_per_iteration": 2.793778419494629 }, { "auxiliary_loss_clip": 0.06414285, "auxiliary_loss_mlp": 0.01263645, "balance_loss_clip": 0.06274439, "balance_loss_mlp": 0.0125362, "epoch": 0.7993386442206524, "flos": 22571719407360.0, "grad_norm": 2.150238106998554, "language_loss": 0.73188961, "learning_rate": 4.0761070812125675e-07, "loss": 0.80866885, "num_input_tokens_seen": 286771915, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.10015869, "step": 13295, "time_per_iteration": 2.55366849899292 }, { "auxiliary_loss_clip": 0.0641332, "auxiliary_loss_mlp": 0.01272422, "balance_loss_clip": 0.06276548, "balance_loss_mlp": 0.0126357, "epoch": 0.7993987674733203, "flos": 18805112271360.0, "grad_norm": 1.8602812616738287, "language_loss": 0.76527107, "learning_rate": 4.0737509819856797e-07, "loss": 0.84212846, "num_input_tokens_seen": 286789835, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.08856201, "step": 13296, "time_per_iteration": 2.523728132247925 }, { "auxiliary_loss_clip": 0.06316963, "auxiliary_loss_mlp": 0.01252087, "balance_loss_clip": 0.06261076, "balance_loss_mlp": 0.01250817, "epoch": 0.7994588907259883, "flos": 69443747625600.0, "grad_norm": 0.6767047402156189, "language_loss": 0.60821527, "learning_rate": 4.0713954866837573e-07, "loss": 0.68390578, "num_input_tokens_seen": 286855580, "router_z_loss_clip": 0.56054688, "router_z_loss_mlp": 0.01270294, "step": 13297, "time_per_iteration": 3.227952718734741 }, { "auxiliary_loss_clip": 0.06415139, "auxiliary_loss_mlp": 0.01268546, "balance_loss_clip": 0.06276707, "balance_loss_mlp": 0.01258997, "epoch": 0.7995190139786562, "flos": 13485439307520.0, "grad_norm": 1.9833181055929132, "language_loss": 0.70435703, "learning_rate": 4.0690405953961073e-07, "loss": 0.78119385, "num_input_tokens_seen": 286874360, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09539795, "step": 13298, "time_per_iteration": 2.518066644668579 }, { "auxiliary_loss_clip": 0.06421146, "auxiliary_loss_mlp": 0.01264374, "balance_loss_clip": 0.06278725, "balance_loss_mlp": 0.01254319, "epoch": 0.7995791372313242, "flos": 21659270371200.0, "grad_norm": 2.154951620722367, "language_loss": 0.76264107, "learning_rate": 4.066686308212037e-07, "loss": 0.83949625, "num_input_tokens_seen": 286891950, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.10058594, "step": 13299, "time_per_iteration": 2.5323128700256348 }, { "auxiliary_loss_clip": 0.06410374, "auxiliary_loss_mlp": 0.01266841, "balance_loss_clip": 0.06274171, "balance_loss_mlp": 0.01258156, "epoch": 0.7996392604839921, "flos": 26075382831360.0, "grad_norm": 1.7481047282193456, "language_loss": 0.77644098, "learning_rate": 4.064332625220828e-07, "loss": 0.85321307, "num_input_tokens_seen": 286911725, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.0869751, "step": 13300, "time_per_iteration": 2.5733096599578857 }, { "auxiliary_loss_clip": 0.06420866, "auxiliary_loss_mlp": 0.01263059, "balance_loss_clip": 0.06277847, "balance_loss_mlp": 0.01253391, "epoch": 0.7996993837366602, "flos": 24613594427520.0, "grad_norm": 1.9350247363363946, "language_loss": 0.64311337, "learning_rate": 4.0619795465117115e-07, "loss": 0.71995258, "num_input_tokens_seen": 286931400, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.09661865, "step": 13301, "time_per_iteration": 2.5620930194854736 }, { "auxiliary_loss_clip": 0.06410108, "auxiliary_loss_mlp": 0.01268244, "balance_loss_clip": 0.06273611, "balance_loss_mlp": 0.01259166, "epoch": 0.7997595069893281, "flos": 20997690059520.0, "grad_norm": 1.5811198008192717, "language_loss": 0.7194885, "learning_rate": 4.059627072173928e-07, "loss": 0.79627204, "num_input_tokens_seen": 286949795, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09082031, "step": 13302, "time_per_iteration": 4.001516580581665 }, { "auxiliary_loss_clip": 0.06422415, "auxiliary_loss_mlp": 0.01265727, "balance_loss_clip": 0.06278593, "balance_loss_mlp": 0.0125619, "epoch": 0.7998196302419961, "flos": 24433528003200.0, "grad_norm": 1.8067848336967631, "language_loss": 0.8356061, "learning_rate": 4.057275202296684e-07, "loss": 0.91248751, "num_input_tokens_seen": 286968805, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.09527588, "step": 13303, "time_per_iteration": 2.5877931118011475 }, { "auxiliary_loss_clip": 0.06411477, "auxiliary_loss_mlp": 0.01262651, "balance_loss_clip": 0.06276336, "balance_loss_mlp": 0.01254086, "epoch": 0.7998797534946641, "flos": 30272715480960.0, "grad_norm": 1.6947463008274264, "language_loss": 0.59541005, "learning_rate": 4.054923936969166e-07, "loss": 0.67215133, "num_input_tokens_seen": 286990235, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.08569336, "step": 13304, "time_per_iteration": 2.6288857460021973 }, { "auxiliary_loss_clip": 0.06418687, "auxiliary_loss_mlp": 0.01262951, "balance_loss_clip": 0.06276228, "balance_loss_mlp": 0.01253736, "epoch": 0.799939876747332, "flos": 23520785477760.0, "grad_norm": 1.6333043399708596, "language_loss": 0.69210958, "learning_rate": 4.0525732762805265e-07, "loss": 0.76892602, "num_input_tokens_seen": 287011060, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09222412, "step": 13305, "time_per_iteration": 2.5970335006713867 }, { "auxiliary_loss_clip": 0.06410612, "auxiliary_loss_mlp": 0.01263664, "balance_loss_clip": 0.06273945, "balance_loss_mlp": 0.01255096, "epoch": 0.8, "flos": 19324207514880.0, "grad_norm": 1.5790528997372555, "language_loss": 0.69387197, "learning_rate": 4.0502232203199107e-07, "loss": 0.7706148, "num_input_tokens_seen": 287029215, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.08572388, "step": 13306, "time_per_iteration": 2.709233522415161 }, { "auxiliary_loss_clip": 0.06415363, "auxiliary_loss_mlp": 0.01263472, "balance_loss_clip": 0.06274317, "balance_loss_mlp": 0.01254024, "epoch": 0.800060123252668, "flos": 32420039264640.0, "grad_norm": 1.4967096438874081, "language_loss": 0.69809127, "learning_rate": 4.0478737691764286e-07, "loss": 0.77487957, "num_input_tokens_seen": 287050855, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09448242, "step": 13307, "time_per_iteration": 2.670459508895874 }, { "auxiliary_loss_clip": 0.06414492, "auxiliary_loss_mlp": 0.01266701, "balance_loss_clip": 0.06275323, "balance_loss_mlp": 0.01257581, "epoch": 0.800120246505336, "flos": 20016702783360.0, "grad_norm": 2.137958962467121, "language_loss": 0.77079999, "learning_rate": 4.0455249229391677e-07, "loss": 0.84761196, "num_input_tokens_seen": 287069915, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09118652, "step": 13308, "time_per_iteration": 3.9209065437316895 }, { "auxiliary_loss_clip": 0.06417856, "auxiliary_loss_mlp": 0.01268018, "balance_loss_clip": 0.06273504, "balance_loss_mlp": 0.01256944, "epoch": 0.8001803697580039, "flos": 31876318120320.0, "grad_norm": 1.5450617750293332, "language_loss": 0.79083699, "learning_rate": 4.0431766816972e-07, "loss": 0.86769569, "num_input_tokens_seen": 287091450, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.11083984, "step": 13309, "time_per_iteration": 2.63693904876709 }, { "auxiliary_loss_clip": 0.06318359, "auxiliary_loss_mlp": 0.01251631, "balance_loss_clip": 0.06262209, "balance_loss_mlp": 0.01250517, "epoch": 0.8002404930106719, "flos": 63411496341120.0, "grad_norm": 0.9112871909464575, "language_loss": 0.64671421, "learning_rate": 4.040829045539571e-07, "loss": 0.72241414, "num_input_tokens_seen": 287148365, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01115417, "step": 13310, "time_per_iteration": 3.087937355041504 }, { "auxiliary_loss_clip": 0.06418805, "auxiliary_loss_mlp": 0.01271483, "balance_loss_clip": 0.06279929, "balance_loss_mlp": 0.01261679, "epoch": 0.8003006162633398, "flos": 27862951109760.0, "grad_norm": 2.3775243632537557, "language_loss": 0.83033222, "learning_rate": 4.0384820145553156e-07, "loss": 0.90723515, "num_input_tokens_seen": 287168280, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09796143, "step": 13311, "time_per_iteration": 2.607879400253296 }, { "auxiliary_loss_clip": 0.06414789, "auxiliary_loss_mlp": 0.0126545, "balance_loss_clip": 0.06274948, "balance_loss_mlp": 0.01255925, "epoch": 0.8003607395160078, "flos": 18229218359040.0, "grad_norm": 1.9761156704333034, "language_loss": 0.66362786, "learning_rate": 4.0361355888334116e-07, "loss": 0.7404303, "num_input_tokens_seen": 287185980, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09521484, "step": 13312, "time_per_iteration": 2.506814956665039 }, { "auxiliary_loss_clip": 0.06419519, "auxiliary_loss_mlp": 0.01264962, "balance_loss_clip": 0.06276524, "balance_loss_mlp": 0.01254752, "epoch": 0.8004208627686757, "flos": 20893331399040.0, "grad_norm": 1.7061032309058834, "language_loss": 0.75681025, "learning_rate": 4.033789768462843e-07, "loss": 0.83365512, "num_input_tokens_seen": 287203875, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10211182, "step": 13313, "time_per_iteration": 2.5619866847991943 }, { "auxiliary_loss_clip": 0.06416498, "auxiliary_loss_mlp": 0.01267276, "balance_loss_clip": 0.06276049, "balance_loss_mlp": 0.0125753, "epoch": 0.8004809860213438, "flos": 26443984942080.0, "grad_norm": 1.3481249289206387, "language_loss": 0.7586416, "learning_rate": 4.031444553532575e-07, "loss": 0.83547932, "num_input_tokens_seen": 287226445, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09747314, "step": 13314, "time_per_iteration": 2.6207234859466553 }, { "auxiliary_loss_clip": 0.06318752, "auxiliary_loss_mlp": 0.01253856, "balance_loss_clip": 0.06262803, "balance_loss_mlp": 0.01252773, "epoch": 0.8005411092740117, "flos": 63668276778240.0, "grad_norm": 0.7677758034742129, "language_loss": 0.53737885, "learning_rate": 4.029099944131522e-07, "loss": 0.61310494, "num_input_tokens_seen": 287286240, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01084137, "step": 13315, "time_per_iteration": 3.1032776832580566 }, { "auxiliary_loss_clip": 0.06411403, "auxiliary_loss_mlp": 0.01268323, "balance_loss_clip": 0.06273301, "balance_loss_mlp": 0.01258625, "epoch": 0.8006012325266797, "flos": 36146968692480.0, "grad_norm": 1.6645256803859974, "language_loss": 0.71527052, "learning_rate": 4.026755940348603e-07, "loss": 0.79206777, "num_input_tokens_seen": 287310265, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09698486, "step": 13316, "time_per_iteration": 2.6843814849853516 }, { "auxiliary_loss_clip": 0.06419438, "auxiliary_loss_mlp": 0.01266113, "balance_loss_clip": 0.06276837, "balance_loss_mlp": 0.01256708, "epoch": 0.8006613557793477, "flos": 33847390840320.0, "grad_norm": 3.3023934676501496, "language_loss": 0.65109044, "learning_rate": 4.024412542272706e-07, "loss": 0.72794592, "num_input_tokens_seen": 287331610, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09411621, "step": 13317, "time_per_iteration": 2.668557643890381 }, { "auxiliary_loss_clip": 0.06317816, "auxiliary_loss_mlp": 0.01252602, "balance_loss_clip": 0.0626183, "balance_loss_mlp": 0.0125148, "epoch": 0.8007214790320156, "flos": 67371041502720.0, "grad_norm": 0.745287755949428, "language_loss": 0.58857292, "learning_rate": 4.0220697499926783e-07, "loss": 0.66427714, "num_input_tokens_seen": 287394795, "router_z_loss_clip": 0.56054688, "router_z_loss_mlp": 0.01123047, "step": 13318, "time_per_iteration": 3.2445552349090576 }, { "auxiliary_loss_clip": 0.0641102, "auxiliary_loss_mlp": 0.01261962, "balance_loss_clip": 0.062708, "balance_loss_mlp": 0.01253325, "epoch": 0.8007816022846836, "flos": 23192406126720.0, "grad_norm": 1.6391037856503607, "language_loss": 0.66316617, "learning_rate": 4.019727563597366e-07, "loss": 0.739896, "num_input_tokens_seen": 287414595, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.08636475, "step": 13319, "time_per_iteration": 2.5826663970947266 }, { "auxiliary_loss_clip": 0.06414811, "auxiliary_loss_mlp": 0.01264968, "balance_loss_clip": 0.06272765, "balance_loss_mlp": 0.01254847, "epoch": 0.8008417255373516, "flos": 21987901284480.0, "grad_norm": 2.161398060841538, "language_loss": 0.74188101, "learning_rate": 4.0173859831755873e-07, "loss": 0.8186788, "num_input_tokens_seen": 287434395, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.10119629, "step": 13320, "time_per_iteration": 2.5695724487304688 }, { "auxiliary_loss_clip": 0.06417429, "auxiliary_loss_mlp": 0.01264536, "balance_loss_clip": 0.0627629, "balance_loss_mlp": 0.01254332, "epoch": 0.8009018487900196, "flos": 16732951200000.0, "grad_norm": 2.394096846690577, "language_loss": 0.80817449, "learning_rate": 4.015045008816138e-07, "loss": 0.88499415, "num_input_tokens_seen": 287450590, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.10205078, "step": 13321, "time_per_iteration": 2.515460968017578 }, { "auxiliary_loss_clip": 0.0640949, "auxiliary_loss_mlp": 0.0126201, "balance_loss_clip": 0.0627379, "balance_loss_mlp": 0.01253427, "epoch": 0.8009619720426875, "flos": 20819887695360.0, "grad_norm": 1.961351230519313, "language_loss": 0.662193, "learning_rate": 4.0127046406077825e-07, "loss": 0.73890799, "num_input_tokens_seen": 287468455, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.08581543, "step": 13322, "time_per_iteration": 2.5651121139526367 }, { "auxiliary_loss_clip": 0.06410534, "auxiliary_loss_mlp": 0.01264478, "balance_loss_clip": 0.06271048, "balance_loss_mlp": 0.01254733, "epoch": 0.8010220952953555, "flos": 17936869063680.0, "grad_norm": 1.6222984148222812, "language_loss": 0.78121734, "learning_rate": 4.010364878639265e-07, "loss": 0.8579675, "num_input_tokens_seen": 287486485, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09753418, "step": 13323, "time_per_iteration": 2.538581609725952 }, { "auxiliary_loss_clip": 0.06418975, "auxiliary_loss_mlp": 0.01264588, "balance_loss_clip": 0.06275766, "balance_loss_mlp": 0.01254813, "epoch": 0.8010822185480234, "flos": 24579241453440.0, "grad_norm": 2.248143274698165, "language_loss": 0.72441232, "learning_rate": 4.00802572299932e-07, "loss": 0.80124801, "num_input_tokens_seen": 287503940, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09771729, "step": 13324, "time_per_iteration": 2.5913138389587402 }, { "auxiliary_loss_clip": 0.06416909, "auxiliary_loss_mlp": 0.01264621, "balance_loss_clip": 0.06273375, "balance_loss_mlp": 0.01254065, "epoch": 0.8011423418006914, "flos": 21835563361920.0, "grad_norm": 1.608131976360617, "language_loss": 0.76394486, "learning_rate": 4.005687173776635e-07, "loss": 0.84076023, "num_input_tokens_seen": 287521660, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.10552979, "step": 13325, "time_per_iteration": 4.003183126449585 }, { "auxiliary_loss_clip": 0.06403865, "auxiliary_loss_mlp": 0.01263508, "balance_loss_clip": 0.06271075, "balance_loss_mlp": 0.01255276, "epoch": 0.8012024650533593, "flos": 23922021553920.0, "grad_norm": 1.4684731580812682, "language_loss": 0.79805481, "learning_rate": 4.003349231059898e-07, "loss": 0.87472856, "num_input_tokens_seen": 287541505, "router_z_loss_clip": 1.32519531, "router_z_loss_mlp": 0.08227539, "step": 13326, "time_per_iteration": 2.6745896339416504 }, { "auxiliary_loss_clip": 0.06410684, "auxiliary_loss_mlp": 0.01264868, "balance_loss_clip": 0.06274585, "balance_loss_mlp": 0.01256112, "epoch": 0.8012625883060274, "flos": 23593893765120.0, "grad_norm": 2.1163490823543256, "language_loss": 0.66161287, "learning_rate": 4.001011894937765e-07, "loss": 0.73836839, "num_input_tokens_seen": 287560015, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.08752441, "step": 13327, "time_per_iteration": 2.5725507736206055 }, { "auxiliary_loss_clip": 0.06409915, "auxiliary_loss_mlp": 0.01264965, "balance_loss_clip": 0.06274023, "balance_loss_mlp": 0.01255422, "epoch": 0.8013227115586953, "flos": 20820265038720.0, "grad_norm": 1.7476710884106912, "language_loss": 0.74138004, "learning_rate": 3.9986751654988636e-07, "loss": 0.81812882, "num_input_tokens_seen": 287579150, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.09539795, "step": 13328, "time_per_iteration": 2.5811476707458496 }, { "auxiliary_loss_clip": 0.06419186, "auxiliary_loss_mlp": 0.01266692, "balance_loss_clip": 0.06276837, "balance_loss_mlp": 0.01256732, "epoch": 0.8013828348113633, "flos": 15893820086400.0, "grad_norm": 2.134730072177286, "language_loss": 0.74898428, "learning_rate": 3.996339042831798e-07, "loss": 0.8258431, "num_input_tokens_seen": 287597420, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09960938, "step": 13329, "time_per_iteration": 2.547121524810791 }, { "auxiliary_loss_clip": 0.06315236, "auxiliary_loss_mlp": 0.01249815, "balance_loss_clip": 0.06259284, "balance_loss_mlp": 0.01248738, "epoch": 0.8014429580640313, "flos": 71085183183360.0, "grad_norm": 0.6813191976583153, "language_loss": 0.52517045, "learning_rate": 3.9940035270251605e-07, "loss": 0.60082096, "num_input_tokens_seen": 287667280, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01078796, "step": 13330, "time_per_iteration": 4.715204477310181 }, { "auxiliary_loss_clip": 0.06418328, "auxiliary_loss_mlp": 0.01266225, "balance_loss_clip": 0.0627427, "balance_loss_mlp": 0.0125558, "epoch": 0.8015030813166992, "flos": 23083100075520.0, "grad_norm": 2.043609958938855, "language_loss": 0.73623419, "learning_rate": 3.991668618167519e-07, "loss": 0.81307971, "num_input_tokens_seen": 287687375, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10644531, "step": 13331, "time_per_iteration": 2.573659658432007 }, { "auxiliary_loss_clip": 0.06412245, "auxiliary_loss_mlp": 0.01264368, "balance_loss_clip": 0.06273274, "balance_loss_mlp": 0.0125504, "epoch": 0.8015632045693672, "flos": 21878888722560.0, "grad_norm": 1.6972289632895832, "language_loss": 0.77797908, "learning_rate": 3.989334316347401e-07, "loss": 0.85474527, "num_input_tokens_seen": 287707895, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09326172, "step": 13332, "time_per_iteration": 2.569746494293213 }, { "auxiliary_loss_clip": 0.06416397, "auxiliary_loss_mlp": 0.0127287, "balance_loss_clip": 0.06275226, "balance_loss_mlp": 0.01262814, "epoch": 0.8016233278220352, "flos": 23663018983680.0, "grad_norm": 1.9028493789259828, "language_loss": 0.83593744, "learning_rate": 3.987000621653338e-07, "loss": 0.91283005, "num_input_tokens_seen": 287723990, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.10058594, "step": 13333, "time_per_iteration": 2.5654683113098145 }, { "auxiliary_loss_clip": 0.06419148, "auxiliary_loss_mlp": 0.01264963, "balance_loss_clip": 0.06277196, "balance_loss_mlp": 0.012543, "epoch": 0.8016834510747032, "flos": 16258732617600.0, "grad_norm": 1.7842413983254108, "language_loss": 0.73770523, "learning_rate": 3.9846675341738133e-07, "loss": 0.81454641, "num_input_tokens_seen": 287742380, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10668945, "step": 13334, "time_per_iteration": 2.5265564918518066 }, { "auxiliary_loss_clip": 0.06410147, "auxiliary_loss_mlp": 0.01264798, "balance_loss_clip": 0.06273945, "balance_loss_mlp": 0.01255768, "epoch": 0.8017435743273711, "flos": 12280892538240.0, "grad_norm": 2.045527236227539, "language_loss": 0.7492466, "learning_rate": 3.9823350539972967e-07, "loss": 0.8259961, "num_input_tokens_seen": 287760130, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.09039307, "step": 13335, "time_per_iteration": 2.5662851333618164 }, { "auxiliary_loss_clip": 0.06410295, "auxiliary_loss_mlp": 0.01264646, "balance_loss_clip": 0.06271878, "balance_loss_mlp": 0.01254752, "epoch": 0.8018036975800391, "flos": 17200880726400.0, "grad_norm": 1.6838199115026278, "language_loss": 0.75498039, "learning_rate": 3.9800031812122416e-07, "loss": 0.83172977, "num_input_tokens_seen": 287777565, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09881592, "step": 13336, "time_per_iteration": 2.54791259765625 }, { "auxiliary_loss_clip": 0.0642433, "auxiliary_loss_mlp": 0.01264192, "balance_loss_clip": 0.06276581, "balance_loss_mlp": 0.01253016, "epoch": 0.801863820832707, "flos": 20638228043520.0, "grad_norm": 2.037338267161042, "language_loss": 0.74845153, "learning_rate": 3.977671915907068e-07, "loss": 0.82533669, "num_input_tokens_seen": 287796310, "router_z_loss_clip": 1.47558594, "router_z_loss_mlp": 0.11175537, "step": 13337, "time_per_iteration": 2.5745341777801514 }, { "auxiliary_loss_clip": 0.06418414, "auxiliary_loss_mlp": 0.0126622, "balance_loss_clip": 0.06273019, "balance_loss_mlp": 0.01255581, "epoch": 0.801923944085375, "flos": 30453410810880.0, "grad_norm": 1.7682020724929708, "language_loss": 0.80043685, "learning_rate": 3.9753412581701883e-07, "loss": 0.87728316, "num_input_tokens_seen": 287817330, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10644531, "step": 13338, "time_per_iteration": 2.623164415359497 }, { "auxiliary_loss_clip": 0.06418486, "auxiliary_loss_mlp": 0.01265866, "balance_loss_clip": 0.0627465, "balance_loss_mlp": 0.01254702, "epoch": 0.801984067338043, "flos": 20016660856320.0, "grad_norm": 1.7561787909839206, "language_loss": 0.74755853, "learning_rate": 3.9730112080899733e-07, "loss": 0.82440203, "num_input_tokens_seen": 287835095, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.11169434, "step": 13339, "time_per_iteration": 2.5922961235046387 }, { "auxiliary_loss_clip": 0.06413326, "auxiliary_loss_mlp": 0.01263198, "balance_loss_clip": 0.06275221, "balance_loss_mlp": 0.01253888, "epoch": 0.802044190590711, "flos": 22790666926080.0, "grad_norm": 1.5773287380963108, "language_loss": 0.79150331, "learning_rate": 3.970681765754775e-07, "loss": 0.86826849, "num_input_tokens_seen": 287854595, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09313965, "step": 13340, "time_per_iteration": 4.136452674865723 }, { "auxiliary_loss_clip": 0.06421296, "auxiliary_loss_mlp": 0.01262919, "balance_loss_clip": 0.06280004, "balance_loss_mlp": 0.01253931, "epoch": 0.8021043138433789, "flos": 27607554264960.0, "grad_norm": 1.8118166894840708, "language_loss": 0.68368894, "learning_rate": 3.968352931252936e-07, "loss": 0.76053107, "num_input_tokens_seen": 287876960, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.08990479, "step": 13341, "time_per_iteration": 2.634608507156372 }, { "auxiliary_loss_clip": 0.0631488, "auxiliary_loss_mlp": 0.0125058, "balance_loss_clip": 0.06258912, "balance_loss_mlp": 0.01249473, "epoch": 0.8021644370960469, "flos": 62080453434240.0, "grad_norm": 0.8696799820789047, "language_loss": 0.61256087, "learning_rate": 3.9660247046727547e-07, "loss": 0.68821549, "num_input_tokens_seen": 287936530, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01109314, "step": 13342, "time_per_iteration": 3.151254177093506 }, { "auxiliary_loss_clip": 0.06418538, "auxiliary_loss_mlp": 0.01264576, "balance_loss_clip": 0.0627727, "balance_loss_mlp": 0.01254759, "epoch": 0.8022245603487148, "flos": 23367525160320.0, "grad_norm": 1.8427250942800635, "language_loss": 0.6417762, "learning_rate": 3.963697086102522e-07, "loss": 0.71860731, "num_input_tokens_seen": 287954285, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09814453, "step": 13343, "time_per_iteration": 2.5869505405426025 }, { "auxiliary_loss_clip": 0.0640666, "auxiliary_loss_mlp": 0.01265515, "balance_loss_clip": 0.06272312, "balance_loss_mlp": 0.01256699, "epoch": 0.8022846836013828, "flos": 10858027155840.0, "grad_norm": 1.8747176898083275, "language_loss": 0.68848938, "learning_rate": 3.96137007563051e-07, "loss": 0.76521111, "num_input_tokens_seen": 287971595, "router_z_loss_clip": 1.34472656, "router_z_loss_mlp": 0.0881958, "step": 13344, "time_per_iteration": 2.5297975540161133 }, { "auxiliary_loss_clip": 0.06412995, "auxiliary_loss_mlp": 0.01265054, "balance_loss_clip": 0.06273167, "balance_loss_mlp": 0.01255726, "epoch": 0.8023448068540509, "flos": 29247899719680.0, "grad_norm": 1.5193057635240452, "language_loss": 0.70543444, "learning_rate": 3.9590436733449506e-07, "loss": 0.78221488, "num_input_tokens_seen": 287992540, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09329224, "step": 13345, "time_per_iteration": 2.651991844177246 }, { "auxiliary_loss_clip": 0.06318992, "auxiliary_loss_mlp": 0.01250458, "balance_loss_clip": 0.06262971, "balance_loss_mlp": 0.01249234, "epoch": 0.8024049301067188, "flos": 64172362141440.0, "grad_norm": 0.8598215156076664, "language_loss": 0.62909544, "learning_rate": 3.956717879334059e-07, "loss": 0.70479, "num_input_tokens_seen": 288052810, "router_z_loss_clip": 0.56201172, "router_z_loss_mlp": 0.01222992, "step": 13346, "time_per_iteration": 3.2537970542907715 }, { "auxiliary_loss_clip": 0.06411515, "auxiliary_loss_mlp": 0.01265068, "balance_loss_clip": 0.06275515, "balance_loss_mlp": 0.01255126, "epoch": 0.8024650533593868, "flos": 28592985807360.0, "grad_norm": 1.9481018565471786, "language_loss": 0.72866303, "learning_rate": 3.9543926936860327e-07, "loss": 0.80542886, "num_input_tokens_seen": 288073045, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.09942627, "step": 13347, "time_per_iteration": 4.075297117233276 }, { "auxiliary_loss_clip": 0.06417604, "auxiliary_loss_mlp": 0.01266783, "balance_loss_clip": 0.06275675, "balance_loss_mlp": 0.01256436, "epoch": 0.8025251766120547, "flos": 16987844920320.0, "grad_norm": 1.9537510418387238, "language_loss": 0.72846431, "learning_rate": 3.9520681164890493e-07, "loss": 0.80530816, "num_input_tokens_seen": 288091165, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10339355, "step": 13348, "time_per_iteration": 2.5286777019500732 }, { "auxiliary_loss_clip": 0.06416091, "auxiliary_loss_mlp": 0.01263342, "balance_loss_clip": 0.0627707, "balance_loss_mlp": 0.01254336, "epoch": 0.8025852998647227, "flos": 22170189841920.0, "grad_norm": 1.877344924015296, "language_loss": 0.75894874, "learning_rate": 3.9497441478312444e-07, "loss": 0.83574307, "num_input_tokens_seen": 288110595, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09002686, "step": 13349, "time_per_iteration": 2.558551549911499 }, { "auxiliary_loss_clip": 0.06414419, "auxiliary_loss_mlp": 0.01264344, "balance_loss_clip": 0.06275237, "balance_loss_mlp": 0.01254957, "epoch": 0.8026454231173906, "flos": 22023386288640.0, "grad_norm": 2.1834607735739895, "language_loss": 0.83833486, "learning_rate": 3.947420787800755e-07, "loss": 0.91512245, "num_input_tokens_seen": 288128995, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09387207, "step": 13350, "time_per_iteration": 2.5646564960479736 }, { "auxiliary_loss_clip": 0.06415309, "auxiliary_loss_mlp": 0.01268269, "balance_loss_clip": 0.06276291, "balance_loss_mlp": 0.01258899, "epoch": 0.8027055463700586, "flos": 22497772579200.0, "grad_norm": 1.8512771050709054, "language_loss": 0.71628612, "learning_rate": 3.945098036485679e-07, "loss": 0.79312193, "num_input_tokens_seen": 288149265, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09368896, "step": 13351, "time_per_iteration": 2.6099469661712646 }, { "auxiliary_loss_clip": 0.064126, "auxiliary_loss_mlp": 0.01268526, "balance_loss_clip": 0.06274886, "balance_loss_mlp": 0.01258834, "epoch": 0.8027656696227266, "flos": 28920442763520.0, "grad_norm": 1.8615552240927622, "language_loss": 0.6194737, "learning_rate": 3.9427758939740885e-07, "loss": 0.69628501, "num_input_tokens_seen": 288170745, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09692383, "step": 13352, "time_per_iteration": 2.742249011993408 }, { "auxiliary_loss_clip": 0.06417726, "auxiliary_loss_mlp": 0.01266686, "balance_loss_clip": 0.06277615, "balance_loss_mlp": 0.01256809, "epoch": 0.8028257928753946, "flos": 18595514482560.0, "grad_norm": 1.8255585685152378, "language_loss": 0.77620304, "learning_rate": 3.940454360354046e-07, "loss": 0.85304719, "num_input_tokens_seen": 288189415, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09881592, "step": 13353, "time_per_iteration": 2.6193902492523193 }, { "auxiliary_loss_clip": 0.06426689, "auxiliary_loss_mlp": 0.01268269, "balance_loss_clip": 0.06278002, "balance_loss_mlp": 0.01257516, "epoch": 0.8028859161280625, "flos": 19135126776960.0, "grad_norm": 2.799698442308267, "language_loss": 0.73495191, "learning_rate": 3.938133435713582e-07, "loss": 0.81190151, "num_input_tokens_seen": 288206900, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.10754395, "step": 13354, "time_per_iteration": 2.6551437377929688 }, { "auxiliary_loss_clip": 0.0641678, "auxiliary_loss_mlp": 0.01262822, "balance_loss_clip": 0.06273788, "balance_loss_mlp": 0.01253643, "epoch": 0.8029460393807305, "flos": 20236069572480.0, "grad_norm": 1.8628990478406526, "language_loss": 0.65719825, "learning_rate": 3.935813120140714e-07, "loss": 0.73399425, "num_input_tokens_seen": 288224800, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.09179688, "step": 13355, "time_per_iteration": 2.5786292552948 }, { "auxiliary_loss_clip": 0.06420119, "auxiliary_loss_mlp": 0.01267204, "balance_loss_clip": 0.06275134, "balance_loss_mlp": 0.01256612, "epoch": 0.8030061626333984, "flos": 49794445797120.0, "grad_norm": 2.8762099863789974, "language_loss": 0.69084394, "learning_rate": 3.9334934137234235e-07, "loss": 0.76771712, "num_input_tokens_seen": 288249400, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10595703, "step": 13356, "time_per_iteration": 2.8259432315826416 }, { "auxiliary_loss_clip": 0.06411839, "auxiliary_loss_mlp": 0.01263381, "balance_loss_clip": 0.06272946, "balance_loss_mlp": 0.01254858, "epoch": 0.8030662858860664, "flos": 21621479379840.0, "grad_norm": 1.52642995854032, "language_loss": 0.77534819, "learning_rate": 3.931174316549666e-07, "loss": 0.85210037, "num_input_tokens_seen": 288268780, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.08514404, "step": 13357, "time_per_iteration": 2.57578182220459 }, { "auxiliary_loss_clip": 0.06419969, "auxiliary_loss_mlp": 0.01263714, "balance_loss_clip": 0.06274009, "balance_loss_mlp": 0.01253599, "epoch": 0.8031264091387345, "flos": 25637194304640.0, "grad_norm": 1.3577895357441232, "language_loss": 0.77528489, "learning_rate": 3.9288558287073937e-07, "loss": 0.85212183, "num_input_tokens_seen": 288290830, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10113525, "step": 13358, "time_per_iteration": 2.5925943851470947 }, { "auxiliary_loss_clip": 0.06411544, "auxiliary_loss_mlp": 0.01263787, "balance_loss_clip": 0.06272691, "balance_loss_mlp": 0.01254345, "epoch": 0.8031865323914024, "flos": 19652335303680.0, "grad_norm": 1.5062970768250954, "language_loss": 0.84844959, "learning_rate": 3.9265379502845143e-07, "loss": 0.92520291, "num_input_tokens_seen": 288308865, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09442139, "step": 13359, "time_per_iteration": 2.5339059829711914 }, { "auxiliary_loss_clip": 0.06415437, "auxiliary_loss_mlp": 0.01264011, "balance_loss_clip": 0.06277248, "balance_loss_mlp": 0.0125513, "epoch": 0.8032466556440704, "flos": 26174961809280.0, "grad_norm": 1.767762274676764, "language_loss": 0.73532194, "learning_rate": 3.924220681368928e-07, "loss": 0.81211638, "num_input_tokens_seen": 288327325, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.08886719, "step": 13360, "time_per_iteration": 2.576730489730835 }, { "auxiliary_loss_clip": 0.06413816, "auxiliary_loss_mlp": 0.01264097, "balance_loss_clip": 0.06273641, "balance_loss_mlp": 0.01254656, "epoch": 0.8033067788967383, "flos": 25527049712640.0, "grad_norm": 1.8636929304584982, "language_loss": 0.70036519, "learning_rate": 3.921904022048512e-07, "loss": 0.77714431, "num_input_tokens_seen": 288347285, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09442139, "step": 13361, "time_per_iteration": 2.5747151374816895 }, { "auxiliary_loss_clip": 0.06422925, "auxiliary_loss_mlp": 0.01266179, "balance_loss_clip": 0.06279829, "balance_loss_mlp": 0.01255921, "epoch": 0.8033669021494063, "flos": 24031076042880.0, "grad_norm": 1.546723573340257, "language_loss": 0.70226377, "learning_rate": 3.919587972411098e-07, "loss": 0.77915484, "num_input_tokens_seen": 288367785, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.1026001, "step": 13362, "time_per_iteration": 2.569636821746826 }, { "auxiliary_loss_clip": 0.06426996, "auxiliary_loss_mlp": 0.0126894, "balance_loss_clip": 0.06277451, "balance_loss_mlp": 0.01257675, "epoch": 0.8034270254020742, "flos": 13592900568960.0, "grad_norm": 2.3702071727755687, "language_loss": 0.79014808, "learning_rate": 3.91727253254452e-07, "loss": 0.86710739, "num_input_tokens_seen": 288384135, "router_z_loss_clip": 1.49511719, "router_z_loss_mlp": 0.1126709, "step": 13363, "time_per_iteration": 2.513659954071045 }, { "auxiliary_loss_clip": 0.06416233, "auxiliary_loss_mlp": 0.01268456, "balance_loss_clip": 0.06274733, "balance_loss_mlp": 0.01258377, "epoch": 0.8034871486547422, "flos": 27419228213760.0, "grad_norm": 1.842885146074545, "language_loss": 0.75021386, "learning_rate": 3.9149577025365787e-07, "loss": 0.82706076, "num_input_tokens_seen": 288403805, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.10083008, "step": 13364, "time_per_iteration": 2.5986125469207764 }, { "auxiliary_loss_clip": 0.06413135, "auxiliary_loss_mlp": 0.01263606, "balance_loss_clip": 0.06274937, "balance_loss_mlp": 0.01254433, "epoch": 0.8035472719074102, "flos": 32606855942400.0, "grad_norm": 2.025643082212105, "language_loss": 0.60554302, "learning_rate": 3.9126434824750596e-07, "loss": 0.68231046, "num_input_tokens_seen": 288424895, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09173584, "step": 13365, "time_per_iteration": 4.110854148864746 }, { "auxiliary_loss_clip": 0.06420311, "auxiliary_loss_mlp": 0.01265247, "balance_loss_clip": 0.06276883, "balance_loss_mlp": 0.01254679, "epoch": 0.8036073951600782, "flos": 21294357840000.0, "grad_norm": 2.145791810366102, "language_loss": 0.66663313, "learning_rate": 3.910329872447706e-07, "loss": 0.74348873, "num_input_tokens_seen": 288443865, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10571289, "step": 13366, "time_per_iteration": 2.5575876235961914 }, { "auxiliary_loss_clip": 0.06413087, "auxiliary_loss_mlp": 0.01262935, "balance_loss_clip": 0.06273737, "balance_loss_mlp": 0.01253899, "epoch": 0.8036675184127461, "flos": 18119702672640.0, "grad_norm": 2.3092267468855425, "language_loss": 0.74963677, "learning_rate": 3.908016872542259e-07, "loss": 0.82639706, "num_input_tokens_seen": 288461065, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09033203, "step": 13367, "time_per_iteration": 2.566923141479492 }, { "auxiliary_loss_clip": 0.06417711, "auxiliary_loss_mlp": 0.01264735, "balance_loss_clip": 0.06278884, "balance_loss_mlp": 0.01255145, "epoch": 0.8037276416654141, "flos": 26037298350720.0, "grad_norm": 1.6177227121072526, "language_loss": 0.7414006, "learning_rate": 3.905704482846428e-07, "loss": 0.81822515, "num_input_tokens_seen": 288481865, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.0960083, "step": 13368, "time_per_iteration": 2.5830445289611816 }, { "auxiliary_loss_clip": 0.06420602, "auxiliary_loss_mlp": 0.01265489, "balance_loss_clip": 0.06276958, "balance_loss_mlp": 0.0125563, "epoch": 0.803787764918082, "flos": 18807334404480.0, "grad_norm": 1.9435011629646874, "language_loss": 0.70045269, "learning_rate": 3.90339270344789e-07, "loss": 0.77731359, "num_input_tokens_seen": 288499345, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.09863281, "step": 13369, "time_per_iteration": 4.011995553970337 }, { "auxiliary_loss_clip": 0.06412733, "auxiliary_loss_mlp": 0.01265132, "balance_loss_clip": 0.06274221, "balance_loss_mlp": 0.01256382, "epoch": 0.80384788817075, "flos": 20231289889920.0, "grad_norm": 1.758486533587698, "language_loss": 0.73737878, "learning_rate": 3.901081534434312e-07, "loss": 0.81415749, "num_input_tokens_seen": 288517660, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.08746338, "step": 13370, "time_per_iteration": 2.5961215496063232 }, { "auxiliary_loss_clip": 0.06419718, "auxiliary_loss_mlp": 0.01267849, "balance_loss_clip": 0.06273919, "balance_loss_mlp": 0.01258289, "epoch": 0.8039080114234181, "flos": 18521232238080.0, "grad_norm": 10.152043108373928, "language_loss": 0.87493587, "learning_rate": 3.898770975893342e-07, "loss": 0.95181149, "num_input_tokens_seen": 288534180, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.09570312, "step": 13371, "time_per_iteration": 2.5146172046661377 }, { "auxiliary_loss_clip": 0.06418256, "auxiliary_loss_mlp": 0.01266335, "balance_loss_clip": 0.06272655, "balance_loss_mlp": 0.01255827, "epoch": 0.803968134676086, "flos": 22389053506560.0, "grad_norm": 1.853159950095116, "language_loss": 0.74724638, "learning_rate": 3.89646102791259e-07, "loss": 0.82409227, "num_input_tokens_seen": 288553350, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.10510254, "step": 13372, "time_per_iteration": 2.571868658065796 }, { "auxiliary_loss_clip": 0.06412863, "auxiliary_loss_mlp": 0.01267032, "balance_loss_clip": 0.06273559, "balance_loss_mlp": 0.01256637, "epoch": 0.804028257928754, "flos": 23849458318080.0, "grad_norm": 2.451838240106416, "language_loss": 0.79333007, "learning_rate": 3.894151690579646e-07, "loss": 0.87012899, "num_input_tokens_seen": 288571325, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.10400391, "step": 13373, "time_per_iteration": 2.564603805541992 }, { "auxiliary_loss_clip": 0.06414047, "auxiliary_loss_mlp": 0.01262456, "balance_loss_clip": 0.06276263, "balance_loss_mlp": 0.01254022, "epoch": 0.8040883811814219, "flos": 23557570220160.0, "grad_norm": 1.4422710596452628, "language_loss": 0.74676788, "learning_rate": 3.8918429639820815e-07, "loss": 0.82353294, "num_input_tokens_seen": 288592100, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.08435059, "step": 13374, "time_per_iteration": 2.628202438354492 }, { "auxiliary_loss_clip": 0.06418824, "auxiliary_loss_mlp": 0.01266075, "balance_loss_clip": 0.06273493, "balance_loss_mlp": 0.01254714, "epoch": 0.8041485044340899, "flos": 19032319416960.0, "grad_norm": 2.228827650734072, "language_loss": 0.69070446, "learning_rate": 3.889534848207452e-07, "loss": 0.76755345, "num_input_tokens_seen": 288612305, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.11364746, "step": 13375, "time_per_iteration": 2.6570022106170654 }, { "auxiliary_loss_clip": 0.06314483, "auxiliary_loss_mlp": 0.01249872, "balance_loss_clip": 0.06258705, "balance_loss_mlp": 0.01248802, "epoch": 0.8042086276867578, "flos": 70027817310720.0, "grad_norm": 0.7214428883671226, "language_loss": 0.55566514, "learning_rate": 3.887227343343271e-07, "loss": 0.63130867, "num_input_tokens_seen": 288676015, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.01071167, "step": 13376, "time_per_iteration": 3.241219997406006 }, { "auxiliary_loss_clip": 0.06417269, "auxiliary_loss_mlp": 0.01268374, "balance_loss_clip": 0.06273684, "balance_loss_mlp": 0.01258414, "epoch": 0.8042687509394258, "flos": 21879014503680.0, "grad_norm": 1.581435114272018, "language_loss": 0.73613781, "learning_rate": 3.8849204494770425e-07, "loss": 0.81299424, "num_input_tokens_seen": 288696455, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.09967041, "step": 13377, "time_per_iteration": 2.542726516723633 }, { "auxiliary_loss_clip": 0.06413697, "auxiliary_loss_mlp": 0.01264813, "balance_loss_clip": 0.06272268, "balance_loss_mlp": 0.01254239, "epoch": 0.8043288741920938, "flos": 26622122722560.0, "grad_norm": 1.8077809066044428, "language_loss": 0.70766371, "learning_rate": 3.8826141666962567e-07, "loss": 0.7844488, "num_input_tokens_seen": 288715560, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.10583496, "step": 13378, "time_per_iteration": 2.56247878074646 }, { "auxiliary_loss_clip": 0.06416445, "auxiliary_loss_mlp": 0.0126454, "balance_loss_clip": 0.06273805, "balance_loss_mlp": 0.01255069, "epoch": 0.8043889974447618, "flos": 33412137206400.0, "grad_norm": 1.4557093030020978, "language_loss": 0.69770634, "learning_rate": 3.880308495088347e-07, "loss": 0.77451611, "num_input_tokens_seen": 288739485, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.09466553, "step": 13379, "time_per_iteration": 2.645026206970215 }, { "auxiliary_loss_clip": 0.06424125, "auxiliary_loss_mlp": 0.0126553, "balance_loss_clip": 0.06278206, "balance_loss_mlp": 0.0125467, "epoch": 0.8044491206974297, "flos": 20382202293120.0, "grad_norm": 1.7904610476530496, "language_loss": 0.76524156, "learning_rate": 3.8780034347407533e-07, "loss": 0.84213817, "num_input_tokens_seen": 288757420, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10864258, "step": 13380, "time_per_iteration": 4.040886402130127 }, { "auxiliary_loss_clip": 0.06413913, "auxiliary_loss_mlp": 0.01263325, "balance_loss_clip": 0.06274693, "balance_loss_mlp": 0.01254074, "epoch": 0.8045092439500977, "flos": 23410473177600.0, "grad_norm": 1.7274804731826718, "language_loss": 0.68711239, "learning_rate": 3.875698985740887e-07, "loss": 0.76388478, "num_input_tokens_seen": 288775535, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.0925293, "step": 13381, "time_per_iteration": 2.548623561859131 }, { "auxiliary_loss_clip": 0.06417625, "auxiliary_loss_mlp": 0.01266178, "balance_loss_clip": 0.06277204, "balance_loss_mlp": 0.0125654, "epoch": 0.8045693672027656, "flos": 24104058549120.0, "grad_norm": 1.76330173894952, "language_loss": 0.64047718, "learning_rate": 3.873395148176135e-07, "loss": 0.7173152, "num_input_tokens_seen": 288795035, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09643555, "step": 13382, "time_per_iteration": 2.5508873462677 }, { "auxiliary_loss_clip": 0.064105, "auxiliary_loss_mlp": 0.01265579, "balance_loss_clip": 0.06271531, "balance_loss_mlp": 0.01256365, "epoch": 0.8046294904554336, "flos": 27714218912640.0, "grad_norm": 3.6711601068983053, "language_loss": 0.76651788, "learning_rate": 3.8710919221338487e-07, "loss": 0.84327865, "num_input_tokens_seen": 288816270, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09210205, "step": 13383, "time_per_iteration": 2.621509313583374 }, { "auxiliary_loss_clip": 0.06412679, "auxiliary_loss_mlp": 0.01267893, "balance_loss_clip": 0.06273195, "balance_loss_mlp": 0.01258267, "epoch": 0.8046896137081017, "flos": 24979974405120.0, "grad_norm": 1.8200274954606785, "language_loss": 0.70293367, "learning_rate": 3.868789307701381e-07, "loss": 0.77973944, "num_input_tokens_seen": 288836050, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09619141, "step": 13384, "time_per_iteration": 2.567521572113037 }, { "auxiliary_loss_clip": 0.06418008, "auxiliary_loss_mlp": 0.01265913, "balance_loss_clip": 0.06274085, "balance_loss_mlp": 0.01255065, "epoch": 0.8047497369607696, "flos": 17681178729600.0, "grad_norm": 2.6917061067256034, "language_loss": 0.80126327, "learning_rate": 3.8664873049660375e-07, "loss": 0.87810248, "num_input_tokens_seen": 288852900, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10845947, "step": 13385, "time_per_iteration": 2.537553548812866 }, { "auxiliary_loss_clip": 0.06418253, "auxiliary_loss_mlp": 0.01269667, "balance_loss_clip": 0.06275702, "balance_loss_mlp": 0.01258831, "epoch": 0.8048098602134376, "flos": 22388550382080.0, "grad_norm": 38.97112100162316, "language_loss": 0.72494495, "learning_rate": 3.864185914015108e-07, "loss": 0.80182415, "num_input_tokens_seen": 288872625, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.1083374, "step": 13386, "time_per_iteration": 2.536362648010254 }, { "auxiliary_loss_clip": 0.06316391, "auxiliary_loss_mlp": 0.01250429, "balance_loss_clip": 0.06260345, "balance_loss_mlp": 0.01249396, "epoch": 0.8048699834661055, "flos": 71221840392960.0, "grad_norm": 0.6629349045931154, "language_loss": 0.51245725, "learning_rate": 3.861885134935865e-07, "loss": 0.58812535, "num_input_tokens_seen": 288939180, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01033783, "step": 13387, "time_per_iteration": 4.6205809116363525 }, { "auxiliary_loss_clip": 0.06417208, "auxiliary_loss_mlp": 0.01262853, "balance_loss_clip": 0.06275453, "balance_loss_mlp": 0.01253037, "epoch": 0.8049301067187735, "flos": 23667211687680.0, "grad_norm": 1.6617923639015897, "language_loss": 0.74396539, "learning_rate": 3.859584967815559e-07, "loss": 0.82076609, "num_input_tokens_seen": 288958925, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.09820557, "step": 13388, "time_per_iteration": 2.6069624423980713 }, { "auxiliary_loss_clip": 0.06416203, "auxiliary_loss_mlp": 0.01265699, "balance_loss_clip": 0.06277505, "balance_loss_mlp": 0.01256211, "epoch": 0.8049902299714414, "flos": 24433318368000.0, "grad_norm": 1.53703630107572, "language_loss": 0.71803045, "learning_rate": 3.857285412741411e-07, "loss": 0.79484951, "num_input_tokens_seen": 288980935, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09487915, "step": 13389, "time_per_iteration": 2.5979366302490234 }, { "auxiliary_loss_clip": 0.06412119, "auxiliary_loss_mlp": 0.01263797, "balance_loss_clip": 0.06274188, "balance_loss_mlp": 0.01254165, "epoch": 0.8050503532241094, "flos": 17498219339520.0, "grad_norm": 1.8939387545636042, "language_loss": 0.82885021, "learning_rate": 3.8549864698006097e-07, "loss": 0.90560937, "num_input_tokens_seen": 288996780, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09631348, "step": 13390, "time_per_iteration": 2.567960262298584 }, { "auxiliary_loss_clip": 0.06312059, "auxiliary_loss_mlp": 0.01250163, "balance_loss_clip": 0.06256139, "balance_loss_mlp": 0.01249121, "epoch": 0.8051104764767774, "flos": 57675535493760.0, "grad_norm": 0.7633192936012175, "language_loss": 0.5557344, "learning_rate": 3.8526881390803424e-07, "loss": 0.6313566, "num_input_tokens_seen": 289057590, "router_z_loss_clip": 0.55957031, "router_z_loss_mlp": 0.01042938, "step": 13391, "time_per_iteration": 3.1655452251434326 }, { "auxiliary_loss_clip": 0.06408736, "auxiliary_loss_mlp": 0.01268826, "balance_loss_clip": 0.06274118, "balance_loss_mlp": 0.01259695, "epoch": 0.8051705997294454, "flos": 18009138810240.0, "grad_norm": 1.494831941086965, "language_loss": 0.84993666, "learning_rate": 3.850390420667762e-07, "loss": 0.92671227, "num_input_tokens_seen": 289076285, "router_z_loss_clip": 1.34570312, "router_z_loss_mlp": 0.09130859, "step": 13392, "time_per_iteration": 2.587181568145752 }, { "auxiliary_loss_clip": 0.06414111, "auxiliary_loss_mlp": 0.01264449, "balance_loss_clip": 0.06272777, "balance_loss_mlp": 0.01254835, "epoch": 0.8052307229821133, "flos": 26405271555840.0, "grad_norm": 1.505226936080568, "language_loss": 0.70619273, "learning_rate": 3.8480933146499914e-07, "loss": 0.7829783, "num_input_tokens_seen": 289097585, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09613037, "step": 13393, "time_per_iteration": 2.5962843894958496 }, { "auxiliary_loss_clip": 0.0641509, "auxiliary_loss_mlp": 0.01265744, "balance_loss_clip": 0.06271256, "balance_loss_mlp": 0.01255778, "epoch": 0.8052908462347813, "flos": 21762580855680.0, "grad_norm": 1.9512668562935336, "language_loss": 0.77009249, "learning_rate": 3.84579682111414e-07, "loss": 0.84690082, "num_input_tokens_seen": 289116890, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.09973145, "step": 13394, "time_per_iteration": 2.637338876724243 }, { "auxiliary_loss_clip": 0.06418028, "auxiliary_loss_mlp": 0.01265669, "balance_loss_clip": 0.06276904, "balance_loss_mlp": 0.01256555, "epoch": 0.8053509694874492, "flos": 25448490910080.0, "grad_norm": 1.4501624134480722, "language_loss": 0.6503129, "learning_rate": 3.843500940147304e-07, "loss": 0.72714984, "num_input_tokens_seen": 289136670, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09112549, "step": 13395, "time_per_iteration": 2.57987642288208 }, { "auxiliary_loss_clip": 0.06312329, "auxiliary_loss_mlp": 0.01252052, "balance_loss_clip": 0.06256124, "balance_loss_mlp": 0.01250861, "epoch": 0.8054110927401172, "flos": 57687316992000.0, "grad_norm": 0.7372665485096641, "language_loss": 0.57256877, "learning_rate": 3.8412056718365206e-07, "loss": 0.64821255, "num_input_tokens_seen": 289200150, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01189423, "step": 13396, "time_per_iteration": 3.299489736557007 }, { "auxiliary_loss_clip": 0.06417488, "auxiliary_loss_mlp": 0.01266455, "balance_loss_clip": 0.06276564, "balance_loss_mlp": 0.01255851, "epoch": 0.8054712159927853, "flos": 19281385278720.0, "grad_norm": 1.6126517152623965, "language_loss": 0.77553558, "learning_rate": 3.8389110162688353e-07, "loss": 0.85237503, "num_input_tokens_seen": 289218125, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.1060791, "step": 13397, "time_per_iteration": 2.5880212783813477 }, { "auxiliary_loss_clip": 0.06418611, "auxiliary_loss_mlp": 0.01266396, "balance_loss_clip": 0.06278326, "balance_loss_mlp": 0.01256621, "epoch": 0.8055313392454532, "flos": 17973402243840.0, "grad_norm": 1.6481848211291152, "language_loss": 0.70488417, "learning_rate": 3.836616973531266e-07, "loss": 0.78173429, "num_input_tokens_seen": 289237115, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09765625, "step": 13398, "time_per_iteration": 2.5418927669525146 }, { "auxiliary_loss_clip": 0.06412661, "auxiliary_loss_mlp": 0.01266015, "balance_loss_clip": 0.06272724, "balance_loss_mlp": 0.01257253, "epoch": 0.8055914624981212, "flos": 13483133320320.0, "grad_norm": 2.00615956717813, "language_loss": 0.69860131, "learning_rate": 3.834323543710805e-07, "loss": 0.77538812, "num_input_tokens_seen": 289253635, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.08764648, "step": 13399, "time_per_iteration": 2.639796733856201 }, { "auxiliary_loss_clip": 0.06416617, "auxiliary_loss_mlp": 0.01267873, "balance_loss_clip": 0.06276435, "balance_loss_mlp": 0.01258241, "epoch": 0.8056515857507891, "flos": 13229832827520.0, "grad_norm": 2.4240044463068395, "language_loss": 0.72458458, "learning_rate": 3.8320307268944153e-07, "loss": 0.80142945, "num_input_tokens_seen": 289270085, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09625244, "step": 13400, "time_per_iteration": 2.529698371887207 }, { "auxiliary_loss_clip": 0.06413037, "auxiliary_loss_mlp": 0.01263663, "balance_loss_clip": 0.06274428, "balance_loss_mlp": 0.01254186, "epoch": 0.8057117090034571, "flos": 23884943322240.0, "grad_norm": 1.9033524543027396, "language_loss": 0.63838053, "learning_rate": 3.829738523169037e-07, "loss": 0.71514755, "num_input_tokens_seen": 289289645, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09472656, "step": 13401, "time_per_iteration": 2.6259663105010986 }, { "auxiliary_loss_clip": 0.0641534, "auxiliary_loss_mlp": 0.01266144, "balance_loss_clip": 0.06274943, "balance_loss_mlp": 0.01256965, "epoch": 0.805771832256125, "flos": 21220536792960.0, "grad_norm": 2.2688243364802023, "language_loss": 0.8386724, "learning_rate": 3.8274469326215985e-07, "loss": 0.91548729, "num_input_tokens_seen": 289306630, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09173584, "step": 13402, "time_per_iteration": 2.594648838043213 }, { "auxiliary_loss_clip": 0.06416237, "auxiliary_loss_mlp": 0.01264019, "balance_loss_clip": 0.0627519, "balance_loss_mlp": 0.0125453, "epoch": 0.805831955508793, "flos": 17572627365120.0, "grad_norm": 2.035463513251157, "language_loss": 0.68329513, "learning_rate": 3.8251559553389876e-07, "loss": 0.76009774, "num_input_tokens_seen": 289324960, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09484863, "step": 13403, "time_per_iteration": 2.5536980628967285 }, { "auxiliary_loss_clip": 0.06411768, "auxiliary_loss_mlp": 0.01263866, "balance_loss_clip": 0.06275822, "balance_loss_mlp": 0.01253948, "epoch": 0.805892078761461, "flos": 26914975142400.0, "grad_norm": 1.4884738750823159, "language_loss": 0.84995902, "learning_rate": 3.822865591408084e-07, "loss": 0.92671537, "num_input_tokens_seen": 289344980, "router_z_loss_clip": 1.36035156, "router_z_loss_mlp": 0.09918213, "step": 13404, "time_per_iteration": 2.600093364715576 }, { "auxiliary_loss_clip": 0.06409248, "auxiliary_loss_mlp": 0.01263222, "balance_loss_clip": 0.06274137, "balance_loss_mlp": 0.01253906, "epoch": 0.805952202014129, "flos": 31514927460480.0, "grad_norm": 1.4747352645821263, "language_loss": 0.70503098, "learning_rate": 3.820575840915743e-07, "loss": 0.78175569, "num_input_tokens_seen": 289367500, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.09307861, "step": 13405, "time_per_iteration": 4.030258417129517 }, { "auxiliary_loss_clip": 0.0641361, "auxiliary_loss_mlp": 0.01262592, "balance_loss_clip": 0.06275506, "balance_loss_mlp": 0.01253395, "epoch": 0.8060123252667969, "flos": 24396952896000.0, "grad_norm": 2.9318689444357258, "language_loss": 0.75757319, "learning_rate": 3.818286703948788e-07, "loss": 0.83433527, "num_input_tokens_seen": 289385930, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09191895, "step": 13406, "time_per_iteration": 2.5775623321533203 }, { "auxiliary_loss_clip": 0.06415589, "auxiliary_loss_mlp": 0.01263768, "balance_loss_clip": 0.06275163, "balance_loss_mlp": 0.01254035, "epoch": 0.8060724485194649, "flos": 23487145263360.0, "grad_norm": 1.3982032926572896, "language_loss": 0.76680738, "learning_rate": 3.815998180594018e-07, "loss": 0.84360099, "num_input_tokens_seen": 289408025, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09741211, "step": 13407, "time_per_iteration": 2.569474697113037 }, { "auxiliary_loss_clip": 0.06408711, "auxiliary_loss_mlp": 0.01268735, "balance_loss_clip": 0.06269303, "balance_loss_mlp": 0.01259138, "epoch": 0.8061325717721328, "flos": 18630412508160.0, "grad_norm": 2.1756858660353022, "language_loss": 0.74225092, "learning_rate": 3.81371027093822e-07, "loss": 0.81902528, "num_input_tokens_seen": 289426575, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09594727, "step": 13408, "time_per_iteration": 2.535055637359619 }, { "auxiliary_loss_clip": 0.06410815, "auxiliary_loss_mlp": 0.01266897, "balance_loss_clip": 0.06272074, "balance_loss_mlp": 0.01256472, "epoch": 0.8061926950248008, "flos": 23588862520320.0, "grad_norm": 1.8453736248787258, "language_loss": 0.70932972, "learning_rate": 3.8114229750681523e-07, "loss": 0.78610682, "num_input_tokens_seen": 289447760, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.10418701, "step": 13409, "time_per_iteration": 4.00121283531189 }, { "auxiliary_loss_clip": 0.06415662, "auxiliary_loss_mlp": 0.0126269, "balance_loss_clip": 0.06273389, "balance_loss_mlp": 0.01252832, "epoch": 0.8062528182774689, "flos": 11147735047680.0, "grad_norm": 2.259837901101939, "language_loss": 0.77283567, "learning_rate": 3.809136293070545e-07, "loss": 0.84961921, "num_input_tokens_seen": 289463920, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09869385, "step": 13410, "time_per_iteration": 2.542146921157837 }, { "auxiliary_loss_clip": 0.06411776, "auxiliary_loss_mlp": 0.01267468, "balance_loss_clip": 0.06272803, "balance_loss_mlp": 0.01257198, "epoch": 0.8063129415301368, "flos": 22353484648320.0, "grad_norm": 2.0498915600551637, "language_loss": 0.68661833, "learning_rate": 3.806850225032117e-07, "loss": 0.76341081, "num_input_tokens_seen": 289482635, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.1027832, "step": 13411, "time_per_iteration": 2.53010630607605 }, { "auxiliary_loss_clip": 0.0641382, "auxiliary_loss_mlp": 0.01266009, "balance_loss_clip": 0.0627504, "balance_loss_mlp": 0.01256108, "epoch": 0.8063730647828048, "flos": 23995297549440.0, "grad_norm": 1.6652760727425635, "language_loss": 0.68413699, "learning_rate": 3.804564771039551e-07, "loss": 0.76093525, "num_input_tokens_seen": 289502040, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09899902, "step": 13412, "time_per_iteration": 2.5694031715393066 }, { "auxiliary_loss_clip": 0.06419609, "auxiliary_loss_mlp": 0.0126846, "balance_loss_clip": 0.06274637, "balance_loss_mlp": 0.01258083, "epoch": 0.8064331880354727, "flos": 21327369148800.0, "grad_norm": 1.6040484743393935, "language_loss": 0.81872624, "learning_rate": 3.8022799311795064e-07, "loss": 0.89560694, "num_input_tokens_seen": 289520740, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.1038208, "step": 13413, "time_per_iteration": 2.5818519592285156 }, { "auxiliary_loss_clip": 0.06413433, "auxiliary_loss_mlp": 0.01263511, "balance_loss_clip": 0.06274803, "balance_loss_mlp": 0.01253539, "epoch": 0.8064933112881407, "flos": 19689036192000.0, "grad_norm": 1.9274141971757768, "language_loss": 0.85314888, "learning_rate": 3.7999957055386303e-07, "loss": 0.92991829, "num_input_tokens_seen": 289535840, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09973145, "step": 13414, "time_per_iteration": 2.597064733505249 }, { "auxiliary_loss_clip": 0.06408602, "auxiliary_loss_mlp": 0.01260966, "balance_loss_clip": 0.06271799, "balance_loss_mlp": 0.01252353, "epoch": 0.8065534345408086, "flos": 19285494128640.0, "grad_norm": 1.650517995221073, "language_loss": 0.66976672, "learning_rate": 3.7977120942035467e-07, "loss": 0.74646235, "num_input_tokens_seen": 289555205, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.08618164, "step": 13415, "time_per_iteration": 2.5232133865356445 }, { "auxiliary_loss_clip": 0.06408155, "auxiliary_loss_mlp": 0.01264042, "balance_loss_clip": 0.06272489, "balance_loss_mlp": 0.01255233, "epoch": 0.8066135577934767, "flos": 19682998698240.0, "grad_norm": 1.5923718114269199, "language_loss": 0.76313448, "learning_rate": 3.7954290972608383e-07, "loss": 0.83985639, "num_input_tokens_seen": 289573000, "router_z_loss_clip": 1.35546875, "router_z_loss_mlp": 0.08813477, "step": 13416, "time_per_iteration": 2.5575954914093018 }, { "auxiliary_loss_clip": 0.06419665, "auxiliary_loss_mlp": 0.01264143, "balance_loss_clip": 0.06273969, "balance_loss_mlp": 0.01254875, "epoch": 0.8066736810461446, "flos": 21150195690240.0, "grad_norm": 1.4698066537618073, "language_loss": 0.65269315, "learning_rate": 3.793146714797086e-07, "loss": 0.72953123, "num_input_tokens_seen": 289592625, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.09259033, "step": 13417, "time_per_iteration": 2.629908800125122 }, { "auxiliary_loss_clip": 0.06417423, "auxiliary_loss_mlp": 0.01265101, "balance_loss_clip": 0.06273913, "balance_loss_mlp": 0.01255356, "epoch": 0.8067338042988126, "flos": 22604311445760.0, "grad_norm": 1.6289575811214732, "language_loss": 0.81048298, "learning_rate": 3.7908649468988306e-07, "loss": 0.88730824, "num_input_tokens_seen": 289610780, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.09747314, "step": 13418, "time_per_iteration": 2.562941789627075 }, { "auxiliary_loss_clip": 0.06416889, "auxiliary_loss_mlp": 0.01267049, "balance_loss_clip": 0.06274556, "balance_loss_mlp": 0.01256731, "epoch": 0.8067939275514805, "flos": 16514003681280.0, "grad_norm": 2.009394419290704, "language_loss": 0.84987688, "learning_rate": 3.7885837936526066e-07, "loss": 0.92671627, "num_input_tokens_seen": 289628890, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.10314941, "step": 13419, "time_per_iteration": 2.553828239440918 }, { "auxiliary_loss_clip": 0.06415967, "auxiliary_loss_mlp": 0.01264996, "balance_loss_clip": 0.06273369, "balance_loss_mlp": 0.01254571, "epoch": 0.8068540508041485, "flos": 28548276854400.0, "grad_norm": 2.0308205827667036, "language_loss": 0.76394665, "learning_rate": 3.7863032551449047e-07, "loss": 0.8407563, "num_input_tokens_seen": 289647220, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.10424805, "step": 13420, "time_per_iteration": 4.007458209991455 }, { "auxiliary_loss_clip": 0.06408711, "auxiliary_loss_mlp": 0.0126051, "balance_loss_clip": 0.06270367, "balance_loss_mlp": 0.01252148, "epoch": 0.8069141740568164, "flos": 21658851100800.0, "grad_norm": 1.7669714238763048, "language_loss": 0.78517759, "learning_rate": 3.784023331462207e-07, "loss": 0.86186981, "num_input_tokens_seen": 289665800, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.0836792, "step": 13421, "time_per_iteration": 2.553535223007202 }, { "auxiliary_loss_clip": 0.06414448, "auxiliary_loss_mlp": 0.01262857, "balance_loss_clip": 0.06273516, "balance_loss_mlp": 0.012536, "epoch": 0.8069742973094844, "flos": 17534962154880.0, "grad_norm": 1.7847434793550974, "language_loss": 0.80115688, "learning_rate": 3.78174402269098e-07, "loss": 0.87792993, "num_input_tokens_seen": 289682705, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.0925293, "step": 13422, "time_per_iteration": 2.4974515438079834 }, { "auxiliary_loss_clip": 0.06413312, "auxiliary_loss_mlp": 0.01264864, "balance_loss_clip": 0.06273196, "balance_loss_mlp": 0.0125485, "epoch": 0.8070344205621525, "flos": 23373646508160.0, "grad_norm": 2.011647997007425, "language_loss": 0.68298662, "learning_rate": 3.7794653289176347e-07, "loss": 0.75976837, "num_input_tokens_seen": 289702920, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.10015869, "step": 13423, "time_per_iteration": 2.532383680343628 }, { "auxiliary_loss_clip": 0.06417051, "auxiliary_loss_mlp": 0.01265285, "balance_loss_clip": 0.06272416, "balance_loss_mlp": 0.01255003, "epoch": 0.8070945438148204, "flos": 22936883500800.0, "grad_norm": 1.9684717232173876, "language_loss": 0.80461645, "learning_rate": 3.7771872502285904e-07, "loss": 0.88143981, "num_input_tokens_seen": 289723280, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.1027832, "step": 13424, "time_per_iteration": 2.5403308868408203 }, { "auxiliary_loss_clip": 0.06424372, "auxiliary_loss_mlp": 0.01263875, "balance_loss_clip": 0.06279142, "balance_loss_mlp": 0.01254506, "epoch": 0.8071546670674884, "flos": 25307599069440.0, "grad_norm": 1.402274846142853, "language_loss": 0.78609383, "learning_rate": 3.774909786710232e-07, "loss": 0.86297631, "num_input_tokens_seen": 289743475, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.09368896, "step": 13425, "time_per_iteration": 2.591736316680908 }, { "auxiliary_loss_clip": 0.06414378, "auxiliary_loss_mlp": 0.01263599, "balance_loss_clip": 0.06274505, "balance_loss_mlp": 0.01254724, "epoch": 0.8072147903201563, "flos": 18119534964480.0, "grad_norm": 2.494477366660835, "language_loss": 0.75386369, "learning_rate": 3.772632938448923e-07, "loss": 0.83064353, "num_input_tokens_seen": 289761400, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.08880615, "step": 13426, "time_per_iteration": 3.964552640914917 }, { "auxiliary_loss_clip": 0.06414819, "auxiliary_loss_mlp": 0.01265882, "balance_loss_clip": 0.06274259, "balance_loss_mlp": 0.01255994, "epoch": 0.8072749135728243, "flos": 26695482572160.0, "grad_norm": 1.8126302751324939, "language_loss": 0.7340759, "learning_rate": 3.770356705530997e-07, "loss": 0.81088287, "num_input_tokens_seen": 289781025, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09881592, "step": 13427, "time_per_iteration": 2.5776326656341553 }, { "auxiliary_loss_clip": 0.06412178, "auxiliary_loss_mlp": 0.0126668, "balance_loss_clip": 0.06272732, "balance_loss_mlp": 0.01256928, "epoch": 0.8073350368254922, "flos": 19245564858240.0, "grad_norm": 1.5543541079200136, "language_loss": 0.70515835, "learning_rate": 3.768081088042774e-07, "loss": 0.7819469, "num_input_tokens_seen": 289798380, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09753418, "step": 13428, "time_per_iteration": 2.5650634765625 }, { "auxiliary_loss_clip": 0.06416318, "auxiliary_loss_mlp": 0.01262099, "balance_loss_clip": 0.06275804, "balance_loss_mlp": 0.01253189, "epoch": 0.8073951600781603, "flos": 13339642003200.0, "grad_norm": 1.7358507882571124, "language_loss": 0.75079191, "learning_rate": 3.765806086070544e-07, "loss": 0.82757604, "num_input_tokens_seen": 289814515, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.08905029, "step": 13429, "time_per_iteration": 2.536032199859619 }, { "auxiliary_loss_clip": 0.06407435, "auxiliary_loss_mlp": 0.01271086, "balance_loss_clip": 0.06272171, "balance_loss_mlp": 0.01261925, "epoch": 0.8074552833308282, "flos": 22859205166080.0, "grad_norm": 1.585652574291889, "language_loss": 0.67128652, "learning_rate": 3.763531699700568e-07, "loss": 0.74807173, "num_input_tokens_seen": 289834315, "router_z_loss_clip": 1.35253906, "router_z_loss_mlp": 0.09155273, "step": 13430, "time_per_iteration": 2.558011293411255 }, { "auxiliary_loss_clip": 0.06411426, "auxiliary_loss_mlp": 0.01264816, "balance_loss_clip": 0.06273033, "balance_loss_mlp": 0.01255464, "epoch": 0.8075154065834962, "flos": 20345627185920.0, "grad_norm": 1.7338475476936452, "language_loss": 0.80318183, "learning_rate": 3.7612579290190994e-07, "loss": 0.87994421, "num_input_tokens_seen": 289853770, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09344482, "step": 13431, "time_per_iteration": 2.5512399673461914 }, { "auxiliary_loss_clip": 0.06407716, "auxiliary_loss_mlp": 0.01264631, "balance_loss_clip": 0.06271444, "balance_loss_mlp": 0.01254522, "epoch": 0.8075755298361641, "flos": 21914499507840.0, "grad_norm": 1.7293337855132904, "language_loss": 0.80498171, "learning_rate": 3.7589847741123593e-07, "loss": 0.88170516, "num_input_tokens_seen": 289870480, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.10119629, "step": 13432, "time_per_iteration": 2.5663013458251953 }, { "auxiliary_loss_clip": 0.06421839, "auxiliary_loss_mlp": 0.01266208, "balance_loss_clip": 0.06274607, "balance_loss_mlp": 0.01256516, "epoch": 0.8076356530888321, "flos": 15674746786560.0, "grad_norm": 1.8812792747966478, "language_loss": 0.71003813, "learning_rate": 3.7567122350665415e-07, "loss": 0.78691864, "num_input_tokens_seen": 289888275, "router_z_loss_clip": 1.46972656, "router_z_loss_mlp": 0.09686279, "step": 13433, "time_per_iteration": 2.530333995819092 }, { "auxiliary_loss_clip": 0.06412602, "auxiliary_loss_mlp": 0.01263766, "balance_loss_clip": 0.06272505, "balance_loss_mlp": 0.01254546, "epoch": 0.8076957763415, "flos": 37786182117120.0, "grad_norm": 13.917819363747347, "language_loss": 0.72416598, "learning_rate": 3.754440311967828e-07, "loss": 0.80092967, "num_input_tokens_seen": 289911495, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09222412, "step": 13434, "time_per_iteration": 2.687812089920044 }, { "auxiliary_loss_clip": 0.06415071, "auxiliary_loss_mlp": 0.01262427, "balance_loss_clip": 0.06274822, "balance_loss_mlp": 0.01253165, "epoch": 0.807755899594168, "flos": 19617059934720.0, "grad_norm": 1.943662493038118, "language_loss": 0.68458271, "learning_rate": 3.752169004902361e-07, "loss": 0.76135767, "num_input_tokens_seen": 289930045, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09265137, "step": 13435, "time_per_iteration": 2.554163932800293 }, { "auxiliary_loss_clip": 0.06415877, "auxiliary_loss_mlp": 0.0126606, "balance_loss_clip": 0.06273113, "balance_loss_mlp": 0.01255612, "epoch": 0.8078160228468361, "flos": 23301628323840.0, "grad_norm": 1.449145618810242, "language_loss": 0.75165778, "learning_rate": 3.749898313956279e-07, "loss": 0.82847714, "num_input_tokens_seen": 289950815, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10455322, "step": 13436, "time_per_iteration": 2.5472707748413086 }, { "auxiliary_loss_clip": 0.06408231, "auxiliary_loss_mlp": 0.01263509, "balance_loss_clip": 0.06272744, "balance_loss_mlp": 0.01254599, "epoch": 0.807876146099504, "flos": 27170078497920.0, "grad_norm": 1.640898321629096, "language_loss": 0.70360231, "learning_rate": 3.747628239215674e-07, "loss": 0.78031969, "num_input_tokens_seen": 289971730, "router_z_loss_clip": 1.35449219, "router_z_loss_mlp": 0.08911133, "step": 13437, "time_per_iteration": 2.5977320671081543 }, { "auxiliary_loss_clip": 0.06412692, "auxiliary_loss_mlp": 0.01264967, "balance_loss_clip": 0.06275737, "balance_loss_mlp": 0.01255978, "epoch": 0.807936269352172, "flos": 27167017824000.0, "grad_norm": 1.5204578058497853, "language_loss": 0.72651112, "learning_rate": 3.745358780766636e-07, "loss": 0.80328774, "num_input_tokens_seen": 289992995, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.08990479, "step": 13438, "time_per_iteration": 2.592980146408081 }, { "auxiliary_loss_clip": 0.06412897, "auxiliary_loss_mlp": 0.01265259, "balance_loss_clip": 0.06275111, "balance_loss_mlp": 0.01255871, "epoch": 0.8079963926048399, "flos": 20746653626880.0, "grad_norm": 1.9271435496875233, "language_loss": 0.7721771, "learning_rate": 3.7430899386952344e-07, "loss": 0.84895861, "num_input_tokens_seen": 290009405, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09393311, "step": 13439, "time_per_iteration": 2.532705307006836 }, { "auxiliary_loss_clip": 0.06408682, "auxiliary_loss_mlp": 0.0126728, "balance_loss_clip": 0.06271691, "balance_loss_mlp": 0.01258023, "epoch": 0.8080565158575079, "flos": 25016675293440.0, "grad_norm": 1.381758032698003, "language_loss": 0.78881145, "learning_rate": 3.7408217130874786e-07, "loss": 0.86557114, "num_input_tokens_seen": 290031085, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.09246826, "step": 13440, "time_per_iteration": 2.6280906200408936 }, { "auxiliary_loss_clip": 0.06415741, "auxiliary_loss_mlp": 0.01265171, "balance_loss_clip": 0.06273732, "balance_loss_mlp": 0.01255384, "epoch": 0.8081166391101758, "flos": 18704107774080.0, "grad_norm": 1.772481273851728, "language_loss": 0.58964288, "learning_rate": 3.7385541040293946e-07, "loss": 0.66645199, "num_input_tokens_seen": 290048670, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.09790039, "step": 13441, "time_per_iteration": 2.511873722076416 }, { "auxiliary_loss_clip": 0.06411418, "auxiliary_loss_mlp": 0.01267977, "balance_loss_clip": 0.06272383, "balance_loss_mlp": 0.01257552, "epoch": 0.8081767623628439, "flos": 19834791569280.0, "grad_norm": 3.0423351252422535, "language_loss": 0.7661612, "learning_rate": 3.7362871116069684e-07, "loss": 0.84295511, "num_input_tokens_seen": 290064085, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.10430908, "step": 13442, "time_per_iteration": 2.546874523162842 }, { "auxiliary_loss_clip": 0.0641319, "auxiliary_loss_mlp": 0.01265883, "balance_loss_clip": 0.06273699, "balance_loss_mlp": 0.01256555, "epoch": 0.8082368856155118, "flos": 35781762672000.0, "grad_norm": 1.421673182707561, "language_loss": 0.70686507, "learning_rate": 3.734020735906169e-07, "loss": 0.78365582, "num_input_tokens_seen": 290086255, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09313965, "step": 13443, "time_per_iteration": 2.6530587673187256 }, { "auxiliary_loss_clip": 0.06412265, "auxiliary_loss_mlp": 0.01261683, "balance_loss_clip": 0.06275633, "balance_loss_mlp": 0.01252271, "epoch": 0.8082970088681798, "flos": 17203102859520.0, "grad_norm": 1.8439180699861812, "language_loss": 0.82439613, "learning_rate": 3.7317549770129286e-07, "loss": 0.90113562, "num_input_tokens_seen": 290103995, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.09411621, "step": 13444, "time_per_iteration": 4.007241249084473 }, { "auxiliary_loss_clip": 0.06319608, "auxiliary_loss_mlp": 0.01254581, "balance_loss_clip": 0.06263492, "balance_loss_mlp": 0.0125339, "epoch": 0.8083571321208477, "flos": 63571437786240.0, "grad_norm": 0.8074024727452034, "language_loss": 0.53471124, "learning_rate": 3.7294898350131754e-07, "loss": 0.61045313, "num_input_tokens_seen": 290157245, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01189423, "step": 13445, "time_per_iteration": 3.018742084503174 }, { "auxiliary_loss_clip": 0.06414432, "auxiliary_loss_mlp": 0.01266155, "balance_loss_clip": 0.06276309, "balance_loss_mlp": 0.0125638, "epoch": 0.8084172553735157, "flos": 17936407866240.0, "grad_norm": 2.052372744668105, "language_loss": 0.72729182, "learning_rate": 3.7272253099927964e-07, "loss": 0.80409765, "num_input_tokens_seen": 290174970, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09777832, "step": 13446, "time_per_iteration": 2.573512315750122 }, { "auxiliary_loss_clip": 0.06417593, "auxiliary_loss_mlp": 0.0126876, "balance_loss_clip": 0.06274794, "balance_loss_mlp": 0.01258275, "epoch": 0.8084773786261836, "flos": 24104939016960.0, "grad_norm": 2.315732005492084, "language_loss": 0.71945965, "learning_rate": 3.7249614020376606e-07, "loss": 0.79632318, "num_input_tokens_seen": 290194395, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.1048584, "step": 13447, "time_per_iteration": 2.7486350536346436 }, { "auxiliary_loss_clip": 0.06422713, "auxiliary_loss_mlp": 0.01263415, "balance_loss_clip": 0.06279324, "balance_loss_mlp": 0.01252943, "epoch": 0.8085375018788516, "flos": 15592288769280.0, "grad_norm": 2.502922339903016, "language_loss": 0.75415289, "learning_rate": 3.7226981112336197e-07, "loss": 0.83101416, "num_input_tokens_seen": 290209200, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.10479736, "step": 13448, "time_per_iteration": 3.988257646560669 }, { "auxiliary_loss_clip": 0.06319378, "auxiliary_loss_mlp": 0.01260974, "balance_loss_clip": 0.06263307, "balance_loss_mlp": 0.01259838, "epoch": 0.8085976251315197, "flos": 67583071059840.0, "grad_norm": 0.7237591865408194, "language_loss": 0.63846987, "learning_rate": 3.7204354376665024e-07, "loss": 0.71427339, "num_input_tokens_seen": 290274565, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01136017, "step": 13449, "time_per_iteration": 3.3018720149993896 }, { "auxiliary_loss_clip": 0.06415187, "auxiliary_loss_mlp": 0.01266148, "balance_loss_clip": 0.06277238, "balance_loss_mlp": 0.01256111, "epoch": 0.8086577483841876, "flos": 22567442849280.0, "grad_norm": 1.5541517747007476, "language_loss": 0.74143875, "learning_rate": 3.718173381422105e-07, "loss": 0.81825209, "num_input_tokens_seen": 290293630, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.10046387, "step": 13450, "time_per_iteration": 2.5635366439819336 }, { "auxiliary_loss_clip": 0.06414254, "auxiliary_loss_mlp": 0.01266031, "balance_loss_clip": 0.06274265, "balance_loss_mlp": 0.01256608, "epoch": 0.8087178716368556, "flos": 17973947295360.0, "grad_norm": 1.7831550647647099, "language_loss": 0.73856002, "learning_rate": 3.7159119425861986e-07, "loss": 0.81536287, "num_input_tokens_seen": 290311450, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09429932, "step": 13451, "time_per_iteration": 2.506342887878418 }, { "auxiliary_loss_clip": 0.06418929, "auxiliary_loss_mlp": 0.01266518, "balance_loss_clip": 0.06274043, "balance_loss_mlp": 0.01255509, "epoch": 0.8087779948895235, "flos": 21724915645440.0, "grad_norm": 1.5844542955289407, "language_loss": 0.80281651, "learning_rate": 3.713651121244543e-07, "loss": 0.87967098, "num_input_tokens_seen": 290330165, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.11016846, "step": 13452, "time_per_iteration": 2.562954902648926 }, { "auxiliary_loss_clip": 0.06417401, "auxiliary_loss_mlp": 0.01265245, "balance_loss_clip": 0.06274949, "balance_loss_mlp": 0.0125553, "epoch": 0.8088381181421915, "flos": 29100047990400.0, "grad_norm": 1.6517727156061204, "language_loss": 0.78963518, "learning_rate": 3.711390917482875e-07, "loss": 0.86646169, "num_input_tokens_seen": 290350815, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09710693, "step": 13453, "time_per_iteration": 2.6057512760162354 }, { "auxiliary_loss_clip": 0.06418359, "auxiliary_loss_mlp": 0.01265813, "balance_loss_clip": 0.06278038, "balance_loss_mlp": 0.01256175, "epoch": 0.8088982413948594, "flos": 22204668597120.0, "grad_norm": 2.47600159669237, "language_loss": 0.77311164, "learning_rate": 3.709131331386892e-07, "loss": 0.84995335, "num_input_tokens_seen": 290367380, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09631348, "step": 13454, "time_per_iteration": 2.5880396366119385 }, { "auxiliary_loss_clip": 0.06413639, "auxiliary_loss_mlp": 0.0126428, "balance_loss_clip": 0.06274715, "balance_loss_mlp": 0.01253909, "epoch": 0.8089583646475275, "flos": 28044023783040.0, "grad_norm": 1.9588311685657698, "language_loss": 0.76792502, "learning_rate": 3.7068723630422795e-07, "loss": 0.84470427, "num_input_tokens_seen": 290387965, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.10369873, "step": 13455, "time_per_iteration": 2.6284313201904297 }, { "auxiliary_loss_clip": 0.06413293, "auxiliary_loss_mlp": 0.01265602, "balance_loss_clip": 0.06272772, "balance_loss_mlp": 0.01256346, "epoch": 0.8090184879001954, "flos": 16623309732480.0, "grad_norm": 1.728004672869853, "language_loss": 0.78631115, "learning_rate": 3.70461401253471e-07, "loss": 0.86310017, "num_input_tokens_seen": 290404150, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.0925293, "step": 13456, "time_per_iteration": 2.5033342838287354 }, { "auxiliary_loss_clip": 0.06416467, "auxiliary_loss_mlp": 0.01264043, "balance_loss_clip": 0.06278348, "balance_loss_mlp": 0.01254733, "epoch": 0.8090786111528634, "flos": 27347545445760.0, "grad_norm": 2.005203800839834, "language_loss": 0.71580136, "learning_rate": 3.702356279949801e-07, "loss": 0.79260647, "num_input_tokens_seen": 290422370, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09301758, "step": 13457, "time_per_iteration": 2.578852891921997 }, { "auxiliary_loss_clip": 0.06411077, "auxiliary_loss_mlp": 0.01263363, "balance_loss_clip": 0.06272221, "balance_loss_mlp": 0.01254083, "epoch": 0.8091387344055313, "flos": 21112111209600.0, "grad_norm": 1.7794379968362652, "language_loss": 0.72982788, "learning_rate": 3.700099165373176e-07, "loss": 0.8065722, "num_input_tokens_seen": 290442645, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09277344, "step": 13458, "time_per_iteration": 2.5524299144744873 }, { "auxiliary_loss_clip": 0.06413972, "auxiliary_loss_mlp": 0.01264637, "balance_loss_clip": 0.06274348, "balance_loss_mlp": 0.01254987, "epoch": 0.8091988576581993, "flos": 11659702694400.0, "grad_norm": 3.003496557955809, "language_loss": 0.79326296, "learning_rate": 3.6978426688904275e-07, "loss": 0.87004912, "num_input_tokens_seen": 290458520, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09649658, "step": 13459, "time_per_iteration": 3.988370895385742 }, { "auxiliary_loss_clip": 0.06417106, "auxiliary_loss_mlp": 0.0126632, "balance_loss_clip": 0.0627283, "balance_loss_mlp": 0.01256598, "epoch": 0.8092589809108672, "flos": 22969475539200.0, "grad_norm": 2.1060875405815023, "language_loss": 0.80327952, "learning_rate": 3.695586790587113e-07, "loss": 0.88011378, "num_input_tokens_seen": 290474465, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.09716797, "step": 13460, "time_per_iteration": 2.531615734100342 }, { "auxiliary_loss_clip": 0.06416786, "auxiliary_loss_mlp": 0.01265388, "balance_loss_clip": 0.06275263, "balance_loss_mlp": 0.01255238, "epoch": 0.8093191041635353, "flos": 13265988664320.0, "grad_norm": 1.6790307569248946, "language_loss": 0.84864551, "learning_rate": 3.693331530548789e-07, "loss": 0.92546725, "num_input_tokens_seen": 290492060, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.10144043, "step": 13461, "time_per_iteration": 2.5393824577331543 }, { "auxiliary_loss_clip": 0.06418101, "auxiliary_loss_mlp": 0.01268822, "balance_loss_clip": 0.06275526, "balance_loss_mlp": 0.0125913, "epoch": 0.8093792274162032, "flos": 25522353884160.0, "grad_norm": 1.7951017159262301, "language_loss": 0.76402736, "learning_rate": 3.69107688886096e-07, "loss": 0.84089655, "num_input_tokens_seen": 290511510, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09692383, "step": 13462, "time_per_iteration": 2.5861921310424805 }, { "auxiliary_loss_clip": 0.06417079, "auxiliary_loss_mlp": 0.01265376, "balance_loss_clip": 0.06275445, "balance_loss_mlp": 0.01254803, "epoch": 0.8094393506688712, "flos": 23552622829440.0, "grad_norm": 1.5982178878259108, "language_loss": 0.83178157, "learning_rate": 3.6888228656091357e-07, "loss": 0.90860611, "num_input_tokens_seen": 290530035, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.10565186, "step": 13463, "time_per_iteration": 2.64949631690979 }, { "auxiliary_loss_clip": 0.06411718, "auxiliary_loss_mlp": 0.01263844, "balance_loss_clip": 0.06273669, "balance_loss_mlp": 0.01254915, "epoch": 0.8094994739215392, "flos": 17061624040320.0, "grad_norm": 1.8823961195166505, "language_loss": 0.62041038, "learning_rate": 3.686569460878779e-07, "loss": 0.69716597, "num_input_tokens_seen": 290548245, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.0892334, "step": 13464, "time_per_iteration": 2.508944511413574 }, { "auxiliary_loss_clip": 0.06412397, "auxiliary_loss_mlp": 0.01270243, "balance_loss_clip": 0.06275176, "balance_loss_mlp": 0.01261094, "epoch": 0.8095595971742071, "flos": 23558157198720.0, "grad_norm": 1.487560792648819, "language_loss": 0.61922175, "learning_rate": 3.684316674755341e-07, "loss": 0.69604814, "num_input_tokens_seen": 290568625, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.0914917, "step": 13465, "time_per_iteration": 2.543098211288452 }, { "auxiliary_loss_clip": 0.06419504, "auxiliary_loss_mlp": 0.01266156, "balance_loss_clip": 0.06280927, "balance_loss_mlp": 0.01257328, "epoch": 0.8096197204268751, "flos": 20378973911040.0, "grad_norm": 1.8401140067921968, "language_loss": 0.82419908, "learning_rate": 3.682064507324256e-07, "loss": 0.90105563, "num_input_tokens_seen": 290586575, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.08837891, "step": 13466, "time_per_iteration": 3.7697947025299072 }, { "auxiliary_loss_clip": 0.06420463, "auxiliary_loss_mlp": 0.01268024, "balance_loss_clip": 0.06277816, "balance_loss_mlp": 0.01257998, "epoch": 0.809679843679543, "flos": 27826208294400.0, "grad_norm": 1.8918712467384928, "language_loss": 0.76460773, "learning_rate": 3.6798129586709204e-07, "loss": 0.84149259, "num_input_tokens_seen": 290606790, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.1003418, "step": 13467, "time_per_iteration": 2.572537660598755 }, { "auxiliary_loss_clip": 0.06417713, "auxiliary_loss_mlp": 0.0126252, "balance_loss_clip": 0.0627763, "balance_loss_mlp": 0.01253252, "epoch": 0.8097399669322111, "flos": 22019990198400.0, "grad_norm": 2.2616339343757863, "language_loss": 0.79256088, "learning_rate": 3.6775620288807073e-07, "loss": 0.86936319, "num_input_tokens_seen": 290625525, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.0927124, "step": 13468, "time_per_iteration": 2.5311672687530518 }, { "auxiliary_loss_clip": 0.06411163, "auxiliary_loss_mlp": 0.01263884, "balance_loss_clip": 0.06274164, "balance_loss_mlp": 0.01254634, "epoch": 0.809800090184879, "flos": 18994905768960.0, "grad_norm": 1.7981720200154592, "language_loss": 0.67562771, "learning_rate": 3.675311718038978e-07, "loss": 0.75237823, "num_input_tokens_seen": 290644935, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.0925293, "step": 13469, "time_per_iteration": 2.5175669193267822 }, { "auxiliary_loss_clip": 0.06314515, "auxiliary_loss_mlp": 0.01253311, "balance_loss_clip": 0.06258589, "balance_loss_mlp": 0.01252258, "epoch": 0.809860213437547, "flos": 66120653750400.0, "grad_norm": 0.6784525623038228, "language_loss": 0.54428661, "learning_rate": 3.6730620262310683e-07, "loss": 0.6199649, "num_input_tokens_seen": 290710735, "router_z_loss_clip": 0.56201172, "router_z_loss_mlp": 0.01053619, "step": 13470, "time_per_iteration": 3.241851568222046 }, { "auxiliary_loss_clip": 0.06413327, "auxiliary_loss_mlp": 0.01264689, "balance_loss_clip": 0.06273835, "balance_loss_mlp": 0.01255195, "epoch": 0.8099203366902149, "flos": 20888090519040.0, "grad_norm": 1.7631228651502484, "language_loss": 0.69620466, "learning_rate": 3.670812953542279e-07, "loss": 0.77298486, "num_input_tokens_seen": 290729565, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09490967, "step": 13471, "time_per_iteration": 2.5431430339813232 }, { "auxiliary_loss_clip": 0.0641605, "auxiliary_loss_mlp": 0.0126458, "balance_loss_clip": 0.06275478, "balance_loss_mlp": 0.01255043, "epoch": 0.8099804599428829, "flos": 26038053037440.0, "grad_norm": 1.8681588102019038, "language_loss": 0.79696715, "learning_rate": 3.6685645000579003e-07, "loss": 0.87377346, "num_input_tokens_seen": 290749360, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09539795, "step": 13472, "time_per_iteration": 2.579331874847412 }, { "auxiliary_loss_clip": 0.06315151, "auxiliary_loss_mlp": 0.01255836, "balance_loss_clip": 0.06259274, "balance_loss_mlp": 0.01254728, "epoch": 0.8100405831955508, "flos": 69324127522560.0, "grad_norm": 0.7253154127866817, "language_loss": 0.57403576, "learning_rate": 3.666316665863201e-07, "loss": 0.6497457, "num_input_tokens_seen": 290812145, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.01109314, "step": 13473, "time_per_iteration": 3.123443365097046 }, { "auxiliary_loss_clip": 0.06420974, "auxiliary_loss_mlp": 0.01264927, "balance_loss_clip": 0.06277993, "balance_loss_mlp": 0.01255355, "epoch": 0.8101007064482189, "flos": 15017820376320.0, "grad_norm": 1.723567475092239, "language_loss": 0.74483025, "learning_rate": 3.664069451043399e-07, "loss": 0.82168931, "num_input_tokens_seen": 290829845, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.09576416, "step": 13474, "time_per_iteration": 2.6151089668273926 }, { "auxiliary_loss_clip": 0.06419709, "auxiliary_loss_mlp": 0.01268756, "balance_loss_clip": 0.06275533, "balance_loss_mlp": 0.01258856, "epoch": 0.8101608297008868, "flos": 21073230115200.0, "grad_norm": 1.5913413738927482, "language_loss": 0.78689218, "learning_rate": 3.661822855683723e-07, "loss": 0.8637768, "num_input_tokens_seen": 290848815, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.09899902, "step": 13475, "time_per_iteration": 2.5589544773101807 }, { "auxiliary_loss_clip": 0.06413235, "auxiliary_loss_mlp": 0.01266404, "balance_loss_clip": 0.0627422, "balance_loss_mlp": 0.01256766, "epoch": 0.8102209529535548, "flos": 23737846279680.0, "grad_norm": 1.5613617556964754, "language_loss": 0.75832313, "learning_rate": 3.659576879869364e-07, "loss": 0.83511955, "num_input_tokens_seen": 290868580, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09637451, "step": 13476, "time_per_iteration": 2.607797384262085 }, { "auxiliary_loss_clip": 0.06424347, "auxiliary_loss_mlp": 0.01268509, "balance_loss_clip": 0.06279524, "balance_loss_mlp": 0.01258037, "epoch": 0.8102810762062228, "flos": 10959408996480.0, "grad_norm": 2.7165276058415206, "language_loss": 0.73892808, "learning_rate": 3.657331523685485e-07, "loss": 0.8158567, "num_input_tokens_seen": 290883540, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.1048584, "step": 13477, "time_per_iteration": 2.5439112186431885 }, { "auxiliary_loss_clip": 0.06411033, "auxiliary_loss_mlp": 0.01261994, "balance_loss_clip": 0.06271949, "balance_loss_mlp": 0.01252714, "epoch": 0.8103411994588907, "flos": 14654291437440.0, "grad_norm": 2.089694404191423, "language_loss": 0.69886237, "learning_rate": 3.6550867872172365e-07, "loss": 0.77559257, "num_input_tokens_seen": 290901560, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.0927124, "step": 13478, "time_per_iteration": 2.5558249950408936 }, { "auxiliary_loss_clip": 0.06317299, "auxiliary_loss_mlp": 0.01253587, "balance_loss_clip": 0.06261541, "balance_loss_mlp": 0.01252588, "epoch": 0.8104013227115587, "flos": 59170964112000.0, "grad_norm": 0.6698104503319293, "language_loss": 0.52145743, "learning_rate": 3.6528426705497293e-07, "loss": 0.5971663, "num_input_tokens_seen": 290959185, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.00998688, "step": 13479, "time_per_iteration": 3.106112241744995 }, { "auxiliary_loss_clip": 0.06418186, "auxiliary_loss_mlp": 0.01265816, "balance_loss_clip": 0.06277984, "balance_loss_mlp": 0.01256244, "epoch": 0.8104614459642266, "flos": 19834833496320.0, "grad_norm": 1.8137794846716644, "language_loss": 0.71591794, "learning_rate": 3.650599173768072e-07, "loss": 0.79275799, "num_input_tokens_seen": 290979585, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09576416, "step": 13480, "time_per_iteration": 2.5847625732421875 }, { "auxiliary_loss_clip": 0.06414861, "auxiliary_loss_mlp": 0.01262356, "balance_loss_clip": 0.06274207, "balance_loss_mlp": 0.01253391, "epoch": 0.8105215692168947, "flos": 25381294335360.0, "grad_norm": 1.694823666323494, "language_loss": 0.79599512, "learning_rate": 3.648356296957327e-07, "loss": 0.87276727, "num_input_tokens_seen": 291000865, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.08978271, "step": 13481, "time_per_iteration": 2.602691650390625 }, { "auxiliary_loss_clip": 0.06414474, "auxiliary_loss_mlp": 0.01268197, "balance_loss_clip": 0.06275237, "balance_loss_mlp": 0.01258756, "epoch": 0.8105816924695626, "flos": 20487357567360.0, "grad_norm": 2.016838986968953, "language_loss": 0.72609627, "learning_rate": 3.646114040202548e-07, "loss": 0.80292296, "num_input_tokens_seen": 291018285, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09448242, "step": 13482, "time_per_iteration": 2.565072774887085 }, { "auxiliary_loss_clip": 0.06420526, "auxiliary_loss_mlp": 0.01267672, "balance_loss_clip": 0.06278617, "balance_loss_mlp": 0.01257724, "epoch": 0.8106418157222306, "flos": 14544021064320.0, "grad_norm": 2.6388497655183403, "language_loss": 0.65822113, "learning_rate": 3.6438724035887705e-07, "loss": 0.73510313, "num_input_tokens_seen": 291035745, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09936523, "step": 13483, "time_per_iteration": 3.971825361251831 }, { "auxiliary_loss_clip": 0.06414603, "auxiliary_loss_mlp": 0.01266627, "balance_loss_clip": 0.06275277, "balance_loss_mlp": 0.012565, "epoch": 0.8107019389748985, "flos": 22570964720640.0, "grad_norm": 1.595749314058968, "language_loss": 0.76148134, "learning_rate": 3.641631387200992e-07, "loss": 0.83829367, "num_input_tokens_seen": 291053280, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.10131836, "step": 13484, "time_per_iteration": 2.5789027214050293 }, { "auxiliary_loss_clip": 0.06423977, "auxiliary_loss_mlp": 0.01267431, "balance_loss_clip": 0.06278013, "balance_loss_mlp": 0.01256458, "epoch": 0.8107620622275665, "flos": 19615634415360.0, "grad_norm": 1.4389746435102941, "language_loss": 0.72192591, "learning_rate": 3.639390991124183e-07, "loss": 0.79883999, "num_input_tokens_seen": 291072855, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.10980225, "step": 13485, "time_per_iteration": 2.5797781944274902 }, { "auxiliary_loss_clip": 0.06408149, "auxiliary_loss_mlp": 0.01264308, "balance_loss_clip": 0.06273, "balance_loss_mlp": 0.01255802, "epoch": 0.8108221854802344, "flos": 16149007296000.0, "grad_norm": 1.8024154126017746, "language_loss": 0.75868154, "learning_rate": 3.637151215443308e-07, "loss": 0.83540606, "num_input_tokens_seen": 291090285, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.08508301, "step": 13486, "time_per_iteration": 2.5528831481933594 }, { "auxiliary_loss_clip": 0.06422481, "auxiliary_loss_mlp": 0.01264076, "balance_loss_clip": 0.06275406, "balance_loss_mlp": 0.01253758, "epoch": 0.8108823087329025, "flos": 21112656261120.0, "grad_norm": 1.9807845016269274, "language_loss": 0.72225201, "learning_rate": 3.6349120602433045e-07, "loss": 0.79911757, "num_input_tokens_seen": 291107675, "router_z_loss_clip": 1.47167969, "router_z_loss_mlp": 0.10327148, "step": 13487, "time_per_iteration": 2.525214910507202 }, { "auxiliary_loss_clip": 0.06412667, "auxiliary_loss_mlp": 0.01267172, "balance_loss_clip": 0.06276602, "balance_loss_mlp": 0.01258053, "epoch": 0.8109424319855704, "flos": 29206377221760.0, "grad_norm": 1.5465669298590439, "language_loss": 0.84315056, "learning_rate": 3.6326735256090715e-07, "loss": 0.91994894, "num_input_tokens_seen": 291126900, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.09118652, "step": 13488, "time_per_iteration": 3.9622862339019775 }, { "auxiliary_loss_clip": 0.06418583, "auxiliary_loss_mlp": 0.01264295, "balance_loss_clip": 0.06276707, "balance_loss_mlp": 0.01254687, "epoch": 0.8110025552382384, "flos": 23118459298560.0, "grad_norm": 1.991605975028871, "language_loss": 0.74301332, "learning_rate": 3.630435611625502e-07, "loss": 0.8198421, "num_input_tokens_seen": 291145285, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.0960083, "step": 13489, "time_per_iteration": 2.5495107173919678 }, { "auxiliary_loss_clip": 0.06415349, "auxiliary_loss_mlp": 0.0126414, "balance_loss_clip": 0.06278552, "balance_loss_mlp": 0.01255205, "epoch": 0.8110626784909064, "flos": 22386076686720.0, "grad_norm": 24.87384930558749, "language_loss": 0.71930307, "learning_rate": 3.628198318377453e-07, "loss": 0.79609793, "num_input_tokens_seen": 291163485, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.08935547, "step": 13490, "time_per_iteration": 2.5665249824523926 }, { "auxiliary_loss_clip": 0.06417957, "auxiliary_loss_mlp": 0.01265263, "balance_loss_clip": 0.06275387, "balance_loss_mlp": 0.01254749, "epoch": 0.8111228017435743, "flos": 23374820465280.0, "grad_norm": 3.8622100346403285, "language_loss": 0.71572822, "learning_rate": 3.625961645949762e-07, "loss": 0.7925604, "num_input_tokens_seen": 291182215, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10516357, "step": 13491, "time_per_iteration": 2.5451107025146484 }, { "auxiliary_loss_clip": 0.06416646, "auxiliary_loss_mlp": 0.01265761, "balance_loss_clip": 0.06276313, "balance_loss_mlp": 0.01256422, "epoch": 0.8111829249962423, "flos": 21292680758400.0, "grad_norm": 1.4013092940886664, "language_loss": 0.67719495, "learning_rate": 3.623725594427245e-07, "loss": 0.75401902, "num_input_tokens_seen": 291203145, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09338379, "step": 13492, "time_per_iteration": 2.563246011734009 }, { "auxiliary_loss_clip": 0.06419151, "auxiliary_loss_mlp": 0.01266413, "balance_loss_clip": 0.06276585, "balance_loss_mlp": 0.0125659, "epoch": 0.8112430482489102, "flos": 22352017201920.0, "grad_norm": 1.690462877454, "language_loss": 0.7210021, "learning_rate": 3.6214901638947006e-07, "loss": 0.79785776, "num_input_tokens_seen": 291220600, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.0982666, "step": 13493, "time_per_iteration": 2.5469648838043213 }, { "auxiliary_loss_clip": 0.06415547, "auxiliary_loss_mlp": 0.01268613, "balance_loss_clip": 0.06274298, "balance_loss_mlp": 0.01258563, "epoch": 0.8113031715015783, "flos": 31146199568640.0, "grad_norm": 1.574757820591558, "language_loss": 0.71303666, "learning_rate": 3.619255354436885e-07, "loss": 0.78987825, "num_input_tokens_seen": 291241195, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.1005249, "step": 13494, "time_per_iteration": 2.6981067657470703 }, { "auxiliary_loss_clip": 0.06420313, "auxiliary_loss_mlp": 0.01267373, "balance_loss_clip": 0.06276287, "balance_loss_mlp": 0.01256906, "epoch": 0.8113632947542462, "flos": 25342077824640.0, "grad_norm": 2.029887817830868, "language_loss": 0.76791775, "learning_rate": 3.6170211661385543e-07, "loss": 0.84479463, "num_input_tokens_seen": 291258715, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10473633, "step": 13495, "time_per_iteration": 2.5966238975524902 }, { "auxiliary_loss_clip": 0.06420197, "auxiliary_loss_mlp": 0.01269365, "balance_loss_clip": 0.06276879, "balance_loss_mlp": 0.01259483, "epoch": 0.8114234180069142, "flos": 28446727305600.0, "grad_norm": 1.93350308169045, "language_loss": 0.79833657, "learning_rate": 3.614787599084417e-07, "loss": 0.87523222, "num_input_tokens_seen": 291278030, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.09881592, "step": 13496, "time_per_iteration": 2.6090731620788574 }, { "auxiliary_loss_clip": 0.06414594, "auxiliary_loss_mlp": 0.01263893, "balance_loss_clip": 0.06273845, "balance_loss_mlp": 0.01253641, "epoch": 0.8114835412595821, "flos": 20344998280320.0, "grad_norm": 1.6992915243752222, "language_loss": 0.71639836, "learning_rate": 3.6125546533591787e-07, "loss": 0.79318321, "num_input_tokens_seen": 291296740, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.10253906, "step": 13497, "time_per_iteration": 2.5944485664367676 }, { "auxiliary_loss_clip": 0.0642083, "auxiliary_loss_mlp": 0.0126563, "balance_loss_clip": 0.06276896, "balance_loss_mlp": 0.01256796, "epoch": 0.8115436645122501, "flos": 22497269454720.0, "grad_norm": 1.602600406574153, "language_loss": 0.76931465, "learning_rate": 3.610322329047508e-07, "loss": 0.84617931, "num_input_tokens_seen": 291318730, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.08831787, "step": 13498, "time_per_iteration": 4.121120929718018 }, { "auxiliary_loss_clip": 0.06412056, "auxiliary_loss_mlp": 0.01263938, "balance_loss_clip": 0.06271894, "balance_loss_mlp": 0.01254765, "epoch": 0.811603787764918, "flos": 13850477619840.0, "grad_norm": 1.7860422126080404, "language_loss": 0.84292316, "learning_rate": 3.608090626234055e-07, "loss": 0.9196831, "num_input_tokens_seen": 291336755, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09173584, "step": 13499, "time_per_iteration": 2.5407116413116455 }, { "auxiliary_loss_clip": 0.06416822, "auxiliary_loss_mlp": 0.01265411, "balance_loss_clip": 0.06278192, "balance_loss_mlp": 0.01254014, "epoch": 0.8116639110175861, "flos": 21620766620160.0, "grad_norm": 1.4706704850004018, "language_loss": 0.7641477, "learning_rate": 3.6058595450034603e-07, "loss": 0.84097004, "num_input_tokens_seen": 291356795, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.11407471, "step": 13500, "time_per_iteration": 2.55971097946167 }, { "auxiliary_loss_clip": 0.0631173, "auxiliary_loss_mlp": 0.01253194, "balance_loss_clip": 0.06256258, "balance_loss_mlp": 0.01252089, "epoch": 0.811724034270254, "flos": 64481021055360.0, "grad_norm": 0.790933226605589, "language_loss": 0.59742379, "learning_rate": 3.603629085440303e-07, "loss": 0.67307299, "num_input_tokens_seen": 291416005, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01107025, "step": 13501, "time_per_iteration": 3.179126739501953 }, { "auxiliary_loss_clip": 0.06407882, "auxiliary_loss_mlp": 0.01263714, "balance_loss_clip": 0.06272994, "balance_loss_mlp": 0.0125469, "epoch": 0.811784157522922, "flos": 24761068813440.0, "grad_norm": 1.5463789115367412, "language_loss": 0.79793781, "learning_rate": 3.6013992476291753e-07, "loss": 0.87465376, "num_input_tokens_seen": 291434870, "router_z_loss_clip": 1.34570312, "router_z_loss_mlp": 0.09020996, "step": 13502, "time_per_iteration": 2.5716803073883057 }, { "auxiliary_loss_clip": 0.06413288, "auxiliary_loss_mlp": 0.01262936, "balance_loss_clip": 0.06275621, "balance_loss_mlp": 0.01253596, "epoch": 0.81184428077559, "flos": 12172089611520.0, "grad_norm": 1.8661363644717524, "language_loss": 0.71205556, "learning_rate": 3.599170031654635e-07, "loss": 0.78881776, "num_input_tokens_seen": 291452230, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09344482, "step": 13503, "time_per_iteration": 2.5295400619506836 }, { "auxiliary_loss_clip": 0.06414791, "auxiliary_loss_mlp": 0.01264631, "balance_loss_clip": 0.06273927, "balance_loss_mlp": 0.01254272, "epoch": 0.8119044040282579, "flos": 44432621429760.0, "grad_norm": 2.2157552294174754, "language_loss": 0.68025959, "learning_rate": 3.5969414376012065e-07, "loss": 0.75705373, "num_input_tokens_seen": 291477425, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.10345459, "step": 13504, "time_per_iteration": 2.745567560195923 }, { "auxiliary_loss_clip": 0.06416563, "auxiliary_loss_mlp": 0.01264648, "balance_loss_clip": 0.06275582, "balance_loss_mlp": 0.01254438, "epoch": 0.8119645272809259, "flos": 52167131936640.0, "grad_norm": 1.9161425615143577, "language_loss": 0.74770802, "learning_rate": 3.594713465553403e-07, "loss": 0.82452011, "num_input_tokens_seen": 291501070, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.10205078, "step": 13505, "time_per_iteration": 4.275745868682861 }, { "auxiliary_loss_clip": 0.06418148, "auxiliary_loss_mlp": 0.01264968, "balance_loss_clip": 0.06277528, "balance_loss_mlp": 0.01254972, "epoch": 0.8120246505335939, "flos": 30241842451200.0, "grad_norm": 1.9605860933803698, "language_loss": 0.72675097, "learning_rate": 3.5924861155957123e-07, "loss": 0.80358213, "num_input_tokens_seen": 291524945, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09997559, "step": 13506, "time_per_iteration": 2.6576762199401855 }, { "auxiliary_loss_clip": 0.064246, "auxiliary_loss_mlp": 0.01265237, "balance_loss_clip": 0.06277674, "balance_loss_mlp": 0.01254949, "epoch": 0.8120847737862619, "flos": 22134243640320.0, "grad_norm": 2.060251027009897, "language_loss": 0.76654571, "learning_rate": 3.590259387812593e-07, "loss": 0.84344405, "num_input_tokens_seen": 291544605, "router_z_loss_clip": 1.46777344, "router_z_loss_mlp": 0.10284424, "step": 13507, "time_per_iteration": 2.549072504043579 }, { "auxiliary_loss_clip": 0.06424163, "auxiliary_loss_mlp": 0.01263464, "balance_loss_clip": 0.06275973, "balance_loss_mlp": 0.01253319, "epoch": 0.8121448970389298, "flos": 23301963740160.0, "grad_norm": 1.9287697633064111, "language_loss": 0.70312333, "learning_rate": 3.5880332822884783e-07, "loss": 0.77999961, "num_input_tokens_seen": 291563850, "router_z_loss_clip": 1.48242188, "router_z_loss_mlp": 0.1015625, "step": 13508, "time_per_iteration": 2.5512001514434814 }, { "auxiliary_loss_clip": 0.06412427, "auxiliary_loss_mlp": 0.0126529, "balance_loss_clip": 0.06273575, "balance_loss_mlp": 0.01255742, "epoch": 0.8122050202915978, "flos": 22170734893440.0, "grad_norm": 2.013185898857593, "language_loss": 0.76301348, "learning_rate": 3.585807799107785e-07, "loss": 0.8397907, "num_input_tokens_seen": 291581730, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09545898, "step": 13509, "time_per_iteration": 2.5435121059417725 }, { "auxiliary_loss_clip": 0.06416316, "auxiliary_loss_mlp": 0.01262636, "balance_loss_clip": 0.06273472, "balance_loss_mlp": 0.01253093, "epoch": 0.8122651435442657, "flos": 23265765976320.0, "grad_norm": 1.7175689860041348, "language_loss": 0.77262068, "learning_rate": 3.58358293835491e-07, "loss": 0.84941018, "num_input_tokens_seen": 291601225, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.09545898, "step": 13510, "time_per_iteration": 2.612243175506592 }, { "auxiliary_loss_clip": 0.06420889, "auxiliary_loss_mlp": 0.01266321, "balance_loss_clip": 0.06275845, "balance_loss_mlp": 0.01256361, "epoch": 0.8123252667969337, "flos": 16144940373120.0, "grad_norm": 2.5154416144011935, "language_loss": 0.69933122, "learning_rate": 3.581358700114212e-07, "loss": 0.77620333, "num_input_tokens_seen": 291616995, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.09954834, "step": 13511, "time_per_iteration": 2.5042972564697266 }, { "auxiliary_loss_clip": 0.06417225, "auxiliary_loss_mlp": 0.01264721, "balance_loss_clip": 0.06274094, "balance_loss_mlp": 0.01254863, "epoch": 0.8123853900496016, "flos": 21250738990080.0, "grad_norm": 1.773674059637532, "language_loss": 0.79439163, "learning_rate": 3.57913508447004e-07, "loss": 0.87121111, "num_input_tokens_seen": 291636145, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09857178, "step": 13512, "time_per_iteration": 2.5307767391204834 }, { "auxiliary_loss_clip": 0.06416208, "auxiliary_loss_mlp": 0.01265952, "balance_loss_clip": 0.06276144, "balance_loss_mlp": 0.01256672, "epoch": 0.8124455133022697, "flos": 64391156680320.0, "grad_norm": 1.8670878107394948, "language_loss": 0.63931513, "learning_rate": 3.5769120915067076e-07, "loss": 0.71613669, "num_input_tokens_seen": 291662440, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09277344, "step": 13513, "time_per_iteration": 2.929100275039673 }, { "auxiliary_loss_clip": 0.06420553, "auxiliary_loss_mlp": 0.01268975, "balance_loss_clip": 0.06276114, "balance_loss_mlp": 0.01258604, "epoch": 0.8125056365549376, "flos": 23849039047680.0, "grad_norm": 1.7843169548045832, "language_loss": 0.71788776, "learning_rate": 3.5746897213085194e-07, "loss": 0.794783, "num_input_tokens_seen": 291680950, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10375977, "step": 13514, "time_per_iteration": 2.555525779724121 }, { "auxiliary_loss_clip": 0.06414519, "auxiliary_loss_mlp": 0.01266514, "balance_loss_clip": 0.06275458, "balance_loss_mlp": 0.01256841, "epoch": 0.8125657598076056, "flos": 23557109022720.0, "grad_norm": 1.550156410883778, "language_loss": 0.63421112, "learning_rate": 3.5724679739597364e-07, "loss": 0.71102142, "num_input_tokens_seen": 291702395, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09680176, "step": 13515, "time_per_iteration": 2.566857099533081 }, { "auxiliary_loss_clip": 0.06408031, "auxiliary_loss_mlp": 0.0126775, "balance_loss_clip": 0.06274514, "balance_loss_mlp": 0.01259012, "epoch": 0.8126258830602736, "flos": 20710497790080.0, "grad_norm": 1.925870588820927, "language_loss": 0.75286406, "learning_rate": 3.570246849544616e-07, "loss": 0.82962191, "num_input_tokens_seen": 291721135, "router_z_loss_clip": 1.33496094, "router_z_loss_mlp": 0.08740234, "step": 13516, "time_per_iteration": 2.5206189155578613 }, { "auxiliary_loss_clip": 0.06418666, "auxiliary_loss_mlp": 0.01263649, "balance_loss_clip": 0.06275281, "balance_loss_mlp": 0.01253767, "epoch": 0.8126860063129415, "flos": 23624095962240.0, "grad_norm": 1.624114439777265, "language_loss": 0.91595268, "learning_rate": 3.5680263481473907e-07, "loss": 0.99277586, "num_input_tokens_seen": 291741235, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09881592, "step": 13517, "time_per_iteration": 2.569974899291992 }, { "auxiliary_loss_clip": 0.06419298, "auxiliary_loss_mlp": 0.01266149, "balance_loss_clip": 0.06277005, "balance_loss_mlp": 0.0125669, "epoch": 0.8127461295656095, "flos": 25013740400640.0, "grad_norm": 1.4898506510121805, "language_loss": 0.78828382, "learning_rate": 3.565806469852244e-07, "loss": 0.86513829, "num_input_tokens_seen": 291761430, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09466553, "step": 13518, "time_per_iteration": 2.572688579559326 }, { "auxiliary_loss_clip": 0.06412782, "auxiliary_loss_mlp": 0.01266176, "balance_loss_clip": 0.06273689, "balance_loss_mlp": 0.01257169, "epoch": 0.8128062528182775, "flos": 27349138673280.0, "grad_norm": 1.6473208857619832, "language_loss": 0.79161036, "learning_rate": 3.56358721474336e-07, "loss": 0.86839998, "num_input_tokens_seen": 291781755, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09008789, "step": 13519, "time_per_iteration": 2.5770773887634277 }, { "auxiliary_loss_clip": 0.06422389, "auxiliary_loss_mlp": 0.01263653, "balance_loss_clip": 0.06279979, "balance_loss_mlp": 0.01254745, "epoch": 0.8128663760709455, "flos": 26513697139200.0, "grad_norm": 1.501954055190095, "language_loss": 0.71001369, "learning_rate": 3.561368582904905e-07, "loss": 0.78687406, "num_input_tokens_seen": 291804410, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.08908081, "step": 13520, "time_per_iteration": 2.610966920852661 }, { "auxiliary_loss_clip": 0.06420256, "auxiliary_loss_mlp": 0.01265874, "balance_loss_clip": 0.06278172, "balance_loss_mlp": 0.0125567, "epoch": 0.8129264993236134, "flos": 17937036771840.0, "grad_norm": 2.787232600091414, "language_loss": 0.72843224, "learning_rate": 3.5591505744209925e-07, "loss": 0.80529356, "num_input_tokens_seen": 291823285, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.10198975, "step": 13521, "time_per_iteration": 2.5356690883636475 }, { "auxiliary_loss_clip": 0.06416481, "auxiliary_loss_mlp": 0.01265433, "balance_loss_clip": 0.06275386, "balance_loss_mlp": 0.01255682, "epoch": 0.8129866225762814, "flos": 26184982371840.0, "grad_norm": 1.6834485112306288, "language_loss": 0.70137215, "learning_rate": 3.5569331893757394e-07, "loss": 0.77819127, "num_input_tokens_seen": 291845305, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09747314, "step": 13522, "time_per_iteration": 4.0455451011657715 }, { "auxiliary_loss_clip": 0.06408082, "auxiliary_loss_mlp": 0.01265839, "balance_loss_clip": 0.06272381, "balance_loss_mlp": 0.01256196, "epoch": 0.8130467458289493, "flos": 21038457870720.0, "grad_norm": 1.8179746734112587, "language_loss": 0.70742792, "learning_rate": 3.554716427853233e-07, "loss": 0.78416717, "num_input_tokens_seen": 291863715, "router_z_loss_clip": 1.35742188, "router_z_loss_mlp": 0.09643555, "step": 13523, "time_per_iteration": 2.5381953716278076 }, { "auxiliary_loss_clip": 0.06413223, "auxiliary_loss_mlp": 0.0126315, "balance_loss_clip": 0.06273852, "balance_loss_mlp": 0.01253238, "epoch": 0.8131068690816173, "flos": 15492500156160.0, "grad_norm": 2.0759596782074383, "language_loss": 0.71009398, "learning_rate": 3.5525002899375256e-07, "loss": 0.78685772, "num_input_tokens_seen": 291880735, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09912109, "step": 13524, "time_per_iteration": 2.504857063293457 }, { "auxiliary_loss_clip": 0.06411266, "auxiliary_loss_mlp": 0.01264453, "balance_loss_clip": 0.06272335, "balance_loss_mlp": 0.01254982, "epoch": 0.8131669923342852, "flos": 29358924779520.0, "grad_norm": 1.9697838038076925, "language_loss": 0.62997699, "learning_rate": 3.550284775712653e-07, "loss": 0.70673418, "num_input_tokens_seen": 291900535, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09472656, "step": 13525, "time_per_iteration": 2.60036301612854 }, { "auxiliary_loss_clip": 0.06410298, "auxiliary_loss_mlp": 0.01262662, "balance_loss_clip": 0.06271506, "balance_loss_mlp": 0.01253549, "epoch": 0.8132271155869533, "flos": 35263883312640.0, "grad_norm": 1.5651136620668922, "language_loss": 0.65601248, "learning_rate": 3.548069885262628e-07, "loss": 0.73274207, "num_input_tokens_seen": 291919760, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09112549, "step": 13526, "time_per_iteration": 2.663879156112671 }, { "auxiliary_loss_clip": 0.06412032, "auxiliary_loss_mlp": 0.01263079, "balance_loss_clip": 0.06272965, "balance_loss_mlp": 0.01254126, "epoch": 0.8132872388396212, "flos": 27789255843840.0, "grad_norm": 1.6604151607773028, "language_loss": 0.75580144, "learning_rate": 3.5458556186714473e-07, "loss": 0.83255255, "num_input_tokens_seen": 291938915, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.08959961, "step": 13527, "time_per_iteration": 4.068870544433594 }, { "auxiliary_loss_clip": 0.06412016, "auxiliary_loss_mlp": 0.01265424, "balance_loss_clip": 0.06271786, "balance_loss_mlp": 0.01255887, "epoch": 0.8133473620922892, "flos": 27827172616320.0, "grad_norm": 2.6286752733112686, "language_loss": 0.70970649, "learning_rate": 3.5436419760230706e-07, "loss": 0.7864809, "num_input_tokens_seen": 291958145, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09533691, "step": 13528, "time_per_iteration": 2.580470323562622 }, { "auxiliary_loss_clip": 0.06414503, "auxiliary_loss_mlp": 0.01263783, "balance_loss_clip": 0.06273268, "balance_loss_mlp": 0.01254378, "epoch": 0.8134074853449572, "flos": 18995534674560.0, "grad_norm": 1.8052969967907244, "language_loss": 0.6893419, "learning_rate": 3.5414289574014357e-07, "loss": 0.76612473, "num_input_tokens_seen": 291976860, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09399414, "step": 13529, "time_per_iteration": 2.5137808322906494 }, { "auxiliary_loss_clip": 0.06411897, "auxiliary_loss_mlp": 0.0126212, "balance_loss_clip": 0.06274553, "balance_loss_mlp": 0.01253013, "epoch": 0.8134676085976251, "flos": 24249646218240.0, "grad_norm": 1.263789091062251, "language_loss": 0.77702272, "learning_rate": 3.5392165628904635e-07, "loss": 0.85376292, "num_input_tokens_seen": 291998085, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09106445, "step": 13530, "time_per_iteration": 2.561438798904419 }, { "auxiliary_loss_clip": 0.06411147, "auxiliary_loss_mlp": 0.0126435, "balance_loss_clip": 0.06272931, "balance_loss_mlp": 0.01255057, "epoch": 0.8135277318502931, "flos": 19068391399680.0, "grad_norm": 1.733126949786858, "language_loss": 0.82076544, "learning_rate": 3.537004792574052e-07, "loss": 0.89752042, "num_input_tokens_seen": 292016585, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09283447, "step": 13531, "time_per_iteration": 2.4983909130096436 }, { "auxiliary_loss_clip": 0.06418823, "auxiliary_loss_mlp": 0.01268585, "balance_loss_clip": 0.06276616, "balance_loss_mlp": 0.0125872, "epoch": 0.813587855102961, "flos": 17274617919360.0, "grad_norm": 2.012664214794293, "language_loss": 0.72011626, "learning_rate": 3.534793646536065e-07, "loss": 0.79699033, "num_input_tokens_seen": 292033255, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09863281, "step": 13532, "time_per_iteration": 2.4800314903259277 }, { "auxiliary_loss_clip": 0.06413336, "auxiliary_loss_mlp": 0.01266355, "balance_loss_clip": 0.06275105, "balance_loss_mlp": 0.01257445, "epoch": 0.8136479783556291, "flos": 20163883680000.0, "grad_norm": 1.8623062862879791, "language_loss": 0.77039421, "learning_rate": 3.5325831248603533e-07, "loss": 0.84719115, "num_input_tokens_seen": 292051800, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.08911133, "step": 13533, "time_per_iteration": 2.51930832862854 }, { "auxiliary_loss_clip": 0.06421815, "auxiliary_loss_mlp": 0.01263218, "balance_loss_clip": 0.06275836, "balance_loss_mlp": 0.01252501, "epoch": 0.813708101608297, "flos": 22058535876480.0, "grad_norm": 2.468637848157058, "language_loss": 0.7641204, "learning_rate": 3.5303732276307495e-07, "loss": 0.84097075, "num_input_tokens_seen": 292072215, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10717773, "step": 13534, "time_per_iteration": 2.539846897125244 }, { "auxiliary_loss_clip": 0.06412193, "auxiliary_loss_mlp": 0.01265919, "balance_loss_clip": 0.06272154, "balance_loss_mlp": 0.01257253, "epoch": 0.813768224860965, "flos": 16177825900800.0, "grad_norm": 2.1587682086043336, "language_loss": 0.92987967, "learning_rate": 3.5281639549310336e-07, "loss": 1.0066607, "num_input_tokens_seen": 292088830, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.08673096, "step": 13535, "time_per_iteration": 2.4892280101776123 }, { "auxiliary_loss_clip": 0.06411225, "auxiliary_loss_mlp": 0.01265331, "balance_loss_clip": 0.06275137, "balance_loss_mlp": 0.01256116, "epoch": 0.8138283481136329, "flos": 24359119977600.0, "grad_norm": 1.7461132505820305, "language_loss": 0.70601887, "learning_rate": 3.52595530684499e-07, "loss": 0.78278446, "num_input_tokens_seen": 292109225, "router_z_loss_clip": 1.36035156, "router_z_loss_mlp": 0.09222412, "step": 13536, "time_per_iteration": 2.58070707321167 }, { "auxiliary_loss_clip": 0.06411482, "auxiliary_loss_mlp": 0.01266749, "balance_loss_clip": 0.06272297, "balance_loss_mlp": 0.01256735, "epoch": 0.8138884713663009, "flos": 25522773154560.0, "grad_norm": 1.5553095743995322, "language_loss": 0.75776458, "learning_rate": 3.5237472834563775e-07, "loss": 0.83454692, "num_input_tokens_seen": 292129660, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.10009766, "step": 13537, "time_per_iteration": 2.5599582195281982 }, { "auxiliary_loss_clip": 0.06413594, "auxiliary_loss_mlp": 0.01263007, "balance_loss_clip": 0.06276184, "balance_loss_mlp": 0.01253881, "epoch": 0.8139485946189688, "flos": 22460736274560.0, "grad_norm": 1.4723686942975653, "language_loss": 0.76014864, "learning_rate": 3.5215398848489163e-07, "loss": 0.83691466, "num_input_tokens_seen": 292149090, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09124756, "step": 13538, "time_per_iteration": 4.022353172302246 }, { "auxiliary_loss_clip": 0.06415947, "auxiliary_loss_mlp": 0.01263353, "balance_loss_clip": 0.06274886, "balance_loss_mlp": 0.0125424, "epoch": 0.8140087178716369, "flos": 21256566848640.0, "grad_norm": 1.5048046367649948, "language_loss": 0.78353333, "learning_rate": 3.5193331111063176e-07, "loss": 0.86032641, "num_input_tokens_seen": 292169260, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09115601, "step": 13539, "time_per_iteration": 2.527195453643799 }, { "auxiliary_loss_clip": 0.06411983, "auxiliary_loss_mlp": 0.01269397, "balance_loss_clip": 0.06276438, "balance_loss_mlp": 0.01260445, "epoch": 0.8140688411243048, "flos": 39424179657600.0, "grad_norm": 2.1374831167775814, "language_loss": 0.66503096, "learning_rate": 3.5171269623122533e-07, "loss": 0.74184477, "num_input_tokens_seen": 292188145, "router_z_loss_clip": 1.35546875, "router_z_loss_mlp": 0.08959961, "step": 13540, "time_per_iteration": 2.659670114517212 }, { "auxiliary_loss_clip": 0.06413974, "auxiliary_loss_mlp": 0.01263195, "balance_loss_clip": 0.06274566, "balance_loss_mlp": 0.01254314, "epoch": 0.8141289643769728, "flos": 25423781155200.0, "grad_norm": 1.3810361448596702, "language_loss": 0.67611778, "learning_rate": 3.5149214385503913e-07, "loss": 0.75288945, "num_input_tokens_seen": 292212135, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.08874512, "step": 13541, "time_per_iteration": 2.610520839691162 }, { "auxiliary_loss_clip": 0.06414053, "auxiliary_loss_mlp": 0.01264752, "balance_loss_clip": 0.06275427, "balance_loss_mlp": 0.01254709, "epoch": 0.8141890876296408, "flos": 12572990271360.0, "grad_norm": 1.9891058542421902, "language_loss": 0.69015479, "learning_rate": 3.512716539904355e-07, "loss": 0.7669428, "num_input_tokens_seen": 292230645, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.10040283, "step": 13542, "time_per_iteration": 2.5400333404541016 }, { "auxiliary_loss_clip": 0.06418294, "auxiliary_loss_mlp": 0.01264653, "balance_loss_clip": 0.06273052, "balance_loss_mlp": 0.01254246, "epoch": 0.8142492108823087, "flos": 14971015071360.0, "grad_norm": 2.8651450973843846, "language_loss": 0.79430741, "learning_rate": 3.5105122664577613e-07, "loss": 0.87113684, "num_input_tokens_seen": 292243540, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.10406494, "step": 13543, "time_per_iteration": 2.4607620239257812 }, { "auxiliary_loss_clip": 0.06421374, "auxiliary_loss_mlp": 0.01264496, "balance_loss_clip": 0.06274714, "balance_loss_mlp": 0.01253857, "epoch": 0.8143093341349767, "flos": 12426899477760.0, "grad_norm": 2.302992480490172, "language_loss": 0.78319126, "learning_rate": 3.5083086182942003e-07, "loss": 0.86005002, "num_input_tokens_seen": 292261715, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.10638428, "step": 13544, "time_per_iteration": 2.4946844577789307 }, { "auxiliary_loss_clip": 0.06427935, "auxiliary_loss_mlp": 0.01264805, "balance_loss_clip": 0.0627955, "balance_loss_mlp": 0.01254177, "epoch": 0.8143694573876447, "flos": 11915267247360.0, "grad_norm": 3.3575892797568083, "language_loss": 0.73897302, "learning_rate": 3.5061055954972264e-07, "loss": 0.81590044, "num_input_tokens_seen": 292275080, "router_z_loss_clip": 1.48144531, "router_z_loss_mlp": 0.10620117, "step": 13545, "time_per_iteration": 3.9362199306488037 }, { "auxiliary_loss_clip": 0.06408042, "auxiliary_loss_mlp": 0.01265318, "balance_loss_clip": 0.06272526, "balance_loss_mlp": 0.01256091, "epoch": 0.8144295806403127, "flos": 21218901638400.0, "grad_norm": 1.5179500431269262, "language_loss": 0.76772642, "learning_rate": 3.5039031981503776e-07, "loss": 0.84446001, "num_input_tokens_seen": 292294635, "router_z_loss_clip": 1.35546875, "router_z_loss_mlp": 0.09228516, "step": 13546, "time_per_iteration": 2.527601718902588 }, { "auxiliary_loss_clip": 0.0642108, "auxiliary_loss_mlp": 0.01263789, "balance_loss_clip": 0.06278193, "balance_loss_mlp": 0.01254908, "epoch": 0.8144897038929806, "flos": 19871450530560.0, "grad_norm": 2.1501765165504234, "language_loss": 0.70688689, "learning_rate": 3.501701426337178e-07, "loss": 0.78373557, "num_input_tokens_seen": 292312695, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.08880615, "step": 13547, "time_per_iteration": 2.5229735374450684 }, { "auxiliary_loss_clip": 0.06419702, "auxiliary_loss_mlp": 0.01265725, "balance_loss_clip": 0.06276602, "balance_loss_mlp": 0.01255283, "epoch": 0.8145498271456486, "flos": 24578654474880.0, "grad_norm": 2.2556225701143444, "language_loss": 0.71015584, "learning_rate": 3.49950028014111e-07, "loss": 0.78701019, "num_input_tokens_seen": 292332005, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10455322, "step": 13548, "time_per_iteration": 2.5949764251708984 }, { "auxiliary_loss_clip": 0.06418914, "auxiliary_loss_mlp": 0.01261932, "balance_loss_clip": 0.06276855, "balance_loss_mlp": 0.01251853, "epoch": 0.8146099503983165, "flos": 20199159048960.0, "grad_norm": 1.9921365847605474, "language_loss": 0.7694459, "learning_rate": 3.4972997596456444e-07, "loss": 0.84625435, "num_input_tokens_seen": 292348365, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10089111, "step": 13549, "time_per_iteration": 2.5234334468841553 }, { "auxiliary_loss_clip": 0.0641816, "auxiliary_loss_mlp": 0.01265877, "balance_loss_clip": 0.06275564, "balance_loss_mlp": 0.01255887, "epoch": 0.8146700736509845, "flos": 19543071179520.0, "grad_norm": 1.9443718780845651, "language_loss": 0.71665812, "learning_rate": 3.4950998649342233e-07, "loss": 0.79349846, "num_input_tokens_seen": 292368050, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.09991455, "step": 13550, "time_per_iteration": 2.5395052433013916 }, { "auxiliary_loss_clip": 0.06409825, "auxiliary_loss_mlp": 0.01264338, "balance_loss_clip": 0.06273779, "balance_loss_mlp": 0.01255558, "epoch": 0.8147301969036524, "flos": 18047265217920.0, "grad_norm": 1.6453288849614303, "language_loss": 0.71685719, "learning_rate": 3.4929005960902826e-07, "loss": 0.79359877, "num_input_tokens_seen": 292385315, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.08776855, "step": 13551, "time_per_iteration": 2.540027141571045 }, { "auxiliary_loss_clip": 0.06423447, "auxiliary_loss_mlp": 0.01266255, "balance_loss_clip": 0.06277394, "balance_loss_mlp": 0.0125611, "epoch": 0.8147903201563205, "flos": 18010606256640.0, "grad_norm": 1.8519071075280205, "language_loss": 0.69039387, "learning_rate": 3.4907019531971926e-07, "loss": 0.76729083, "num_input_tokens_seen": 292403375, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10144043, "step": 13552, "time_per_iteration": 2.5200581550598145 }, { "auxiliary_loss_clip": 0.0641745, "auxiliary_loss_mlp": 0.01263865, "balance_loss_clip": 0.06276025, "balance_loss_mlp": 0.01254477, "epoch": 0.8148504434089884, "flos": 20264343125760.0, "grad_norm": 1.8864162313539434, "language_loss": 0.82413983, "learning_rate": 3.4885039363383407e-07, "loss": 0.90095294, "num_input_tokens_seen": 292419260, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09381104, "step": 13553, "time_per_iteration": 2.5338170528411865 }, { "auxiliary_loss_clip": 0.06417758, "auxiliary_loss_mlp": 0.01265087, "balance_loss_clip": 0.06276299, "balance_loss_mlp": 0.01255657, "epoch": 0.8149105666616564, "flos": 12499588494720.0, "grad_norm": 1.8735350216815458, "language_loss": 0.68120128, "learning_rate": 3.4863065455970795e-07, "loss": 0.75802976, "num_input_tokens_seen": 292436095, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09436035, "step": 13554, "time_per_iteration": 2.504023551940918 }, { "auxiliary_loss_clip": 0.06418492, "auxiliary_loss_mlp": 0.01266496, "balance_loss_clip": 0.06278048, "balance_loss_mlp": 0.01256972, "epoch": 0.8149706899143244, "flos": 32531609376000.0, "grad_norm": 1.6378039088158916, "language_loss": 0.66383839, "learning_rate": 3.484109781056723e-07, "loss": 0.7406882, "num_input_tokens_seen": 292457190, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09521484, "step": 13555, "time_per_iteration": 2.612238883972168 }, { "auxiliary_loss_clip": 0.06419256, "auxiliary_loss_mlp": 0.0126284, "balance_loss_clip": 0.06273547, "balance_loss_mlp": 0.01252374, "epoch": 0.8150308131669923, "flos": 19391362162560.0, "grad_norm": 1.7280895624612969, "language_loss": 0.73768473, "learning_rate": 3.4819136428005844e-07, "loss": 0.8145057, "num_input_tokens_seen": 292474300, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.10467529, "step": 13556, "time_per_iteration": 2.5266799926757812 }, { "auxiliary_loss_clip": 0.06416865, "auxiliary_loss_mlp": 0.01266081, "balance_loss_clip": 0.06278772, "balance_loss_mlp": 0.01257177, "epoch": 0.8150909364196604, "flos": 17427249331200.0, "grad_norm": 1.634353414346751, "language_loss": 0.80682534, "learning_rate": 3.4797181309119307e-07, "loss": 0.88365483, "num_input_tokens_seen": 292492420, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.08911133, "step": 13557, "time_per_iteration": 2.517915964126587 }, { "auxiliary_loss_clip": 0.0642074, "auxiliary_loss_mlp": 0.01266338, "balance_loss_clip": 0.06276883, "balance_loss_mlp": 0.01255889, "epoch": 0.8151510596723283, "flos": 27170246206080.0, "grad_norm": 1.7568679033003463, "language_loss": 0.65746558, "learning_rate": 3.4775232454740255e-07, "loss": 0.73433638, "num_input_tokens_seen": 292512895, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.10443115, "step": 13558, "time_per_iteration": 2.5920779705047607 }, { "auxiliary_loss_clip": 0.06313112, "auxiliary_loss_mlp": 0.01253423, "balance_loss_clip": 0.06257229, "balance_loss_mlp": 0.01252208, "epoch": 0.8152111829249963, "flos": 64236581896320.0, "grad_norm": 0.807557904325532, "language_loss": 0.56761891, "learning_rate": 3.4753289865700896e-07, "loss": 0.64328426, "num_input_tokens_seen": 292566580, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.01213837, "step": 13559, "time_per_iteration": 3.0955398082733154 }, { "auxiliary_loss_clip": 0.0631565, "auxiliary_loss_mlp": 0.01250307, "balance_loss_clip": 0.062596, "balance_loss_mlp": 0.01249178, "epoch": 0.8152713061776642, "flos": 67091201193600.0, "grad_norm": 0.6675642595464171, "language_loss": 0.55294883, "learning_rate": 3.473135354283334e-07, "loss": 0.62860841, "num_input_tokens_seen": 292621490, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01130676, "step": 13560, "time_per_iteration": 3.021084785461426 }, { "auxiliary_loss_clip": 0.06417195, "auxiliary_loss_mlp": 0.01264918, "balance_loss_clip": 0.06277835, "balance_loss_mlp": 0.0125587, "epoch": 0.8153314294303322, "flos": 14396169335040.0, "grad_norm": 1.6702576639260185, "language_loss": 0.67957085, "learning_rate": 3.470942348696948e-07, "loss": 0.756392, "num_input_tokens_seen": 292638660, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09039307, "step": 13561, "time_per_iteration": 2.498706579208374 }, { "auxiliary_loss_clip": 0.06420641, "auxiliary_loss_mlp": 0.01265025, "balance_loss_clip": 0.06276216, "balance_loss_mlp": 0.01254803, "epoch": 0.8153915526830001, "flos": 25629563583360.0, "grad_norm": 1.493316563904684, "language_loss": 0.81887352, "learning_rate": 3.468749969894085e-07, "loss": 0.8957302, "num_input_tokens_seen": 292658545, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.10235596, "step": 13562, "time_per_iteration": 4.019538164138794 }, { "auxiliary_loss_clip": 0.06415647, "auxiliary_loss_mlp": 0.01265483, "balance_loss_clip": 0.06275274, "balance_loss_mlp": 0.01255958, "epoch": 0.8154516759356681, "flos": 23376120203520.0, "grad_norm": 1.478312459944706, "language_loss": 0.7185899, "learning_rate": 3.4665582179578734e-07, "loss": 0.79540122, "num_input_tokens_seen": 292678460, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09527588, "step": 13563, "time_per_iteration": 2.59629225730896 }, { "auxiliary_loss_clip": 0.06412227, "auxiliary_loss_mlp": 0.01269824, "balance_loss_clip": 0.06272196, "balance_loss_mlp": 0.01259608, "epoch": 0.815511799188336, "flos": 28157019413760.0, "grad_norm": 1.532694867127297, "language_loss": 0.70432734, "learning_rate": 3.4643670929714387e-07, "loss": 0.78114784, "num_input_tokens_seen": 292699815, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.10217285, "step": 13564, "time_per_iteration": 2.590348243713379 }, { "auxiliary_loss_clip": 0.06414902, "auxiliary_loss_mlp": 0.01264666, "balance_loss_clip": 0.06275147, "balance_loss_mlp": 0.01255659, "epoch": 0.8155719224410041, "flos": 16989186585600.0, "grad_norm": 1.8120717420807142, "language_loss": 0.70684248, "learning_rate": 3.462176595017854e-07, "loss": 0.78363818, "num_input_tokens_seen": 292717370, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09014893, "step": 13565, "time_per_iteration": 2.5274477005004883 }, { "auxiliary_loss_clip": 0.06411088, "auxiliary_loss_mlp": 0.01264016, "balance_loss_clip": 0.06272012, "balance_loss_mlp": 0.01254694, "epoch": 0.815632045693672, "flos": 24688757139840.0, "grad_norm": 1.8044990032664088, "language_loss": 0.79323149, "learning_rate": 3.459986724180188e-07, "loss": 0.86998248, "num_input_tokens_seen": 292737110, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09326172, "step": 13566, "time_per_iteration": 2.573667287826538 }, { "auxiliary_loss_clip": 0.06409349, "auxiliary_loss_mlp": 0.01262025, "balance_loss_clip": 0.06273751, "balance_loss_mlp": 0.01253203, "epoch": 0.81569216894634, "flos": 19944516890880.0, "grad_norm": 1.57861297608212, "language_loss": 0.82940638, "learning_rate": 3.457797480541491e-07, "loss": 0.90612006, "num_input_tokens_seen": 292756510, "router_z_loss_clip": 1.35644531, "router_z_loss_mlp": 0.0881958, "step": 13567, "time_per_iteration": 4.007611513137817 }, { "auxiliary_loss_clip": 0.06411035, "auxiliary_loss_mlp": 0.01261905, "balance_loss_clip": 0.06274419, "balance_loss_mlp": 0.01253381, "epoch": 0.8157522921990079, "flos": 21805948143360.0, "grad_norm": 2.087511234903991, "language_loss": 0.80328029, "learning_rate": 3.455608864184771e-07, "loss": 0.88000965, "num_input_tokens_seen": 292776710, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.08517456, "step": 13568, "time_per_iteration": 2.645308017730713 }, { "auxiliary_loss_clip": 0.06409884, "auxiliary_loss_mlp": 0.01264044, "balance_loss_clip": 0.06273518, "balance_loss_mlp": 0.01255139, "epoch": 0.8158124154516759, "flos": 18513098392320.0, "grad_norm": 1.9827326922937025, "language_loss": 0.77726442, "learning_rate": 3.453420875193016e-07, "loss": 0.85400367, "num_input_tokens_seen": 292794350, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.08911133, "step": 13569, "time_per_iteration": 2.5258429050445557 }, { "auxiliary_loss_clip": 0.06415972, "auxiliary_loss_mlp": 0.01263685, "balance_loss_clip": 0.06278129, "balance_loss_mlp": 0.01254333, "epoch": 0.815872538704344, "flos": 26837590296960.0, "grad_norm": 2.742534595955965, "language_loss": 0.59133291, "learning_rate": 3.451233513649199e-07, "loss": 0.66812944, "num_input_tokens_seen": 292814005, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09350586, "step": 13570, "time_per_iteration": 2.636136531829834 }, { "auxiliary_loss_clip": 0.0642073, "auxiliary_loss_mlp": 0.01264763, "balance_loss_clip": 0.06276688, "balance_loss_mlp": 0.01254577, "epoch": 0.8159326619570119, "flos": 21732127096320.0, "grad_norm": 1.8298320831280286, "language_loss": 0.82482052, "learning_rate": 3.4490467796362687e-07, "loss": 0.90167546, "num_input_tokens_seen": 292833485, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10192871, "step": 13571, "time_per_iteration": 2.579369068145752 }, { "auxiliary_loss_clip": 0.0641534, "auxiliary_loss_mlp": 0.01266603, "balance_loss_clip": 0.06275546, "balance_loss_mlp": 0.01256226, "epoch": 0.8159927852096799, "flos": 13845152885760.0, "grad_norm": 2.1327855853516064, "language_loss": 0.78869128, "learning_rate": 3.446860673237142e-07, "loss": 0.8655107, "num_input_tokens_seen": 292848045, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.1036377, "step": 13572, "time_per_iteration": 2.486907720565796 }, { "auxiliary_loss_clip": 0.06416076, "auxiliary_loss_mlp": 0.01264314, "balance_loss_clip": 0.0627453, "balance_loss_mlp": 0.0125526, "epoch": 0.8160529084623478, "flos": 24506552436480.0, "grad_norm": 1.5526043920137924, "language_loss": 0.65189803, "learning_rate": 3.4446751945347186e-07, "loss": 0.72870195, "num_input_tokens_seen": 292869965, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09051514, "step": 13573, "time_per_iteration": 2.58073091506958 }, { "auxiliary_loss_clip": 0.06409882, "auxiliary_loss_mlp": 0.01265892, "balance_loss_clip": 0.06272936, "balance_loss_mlp": 0.01257267, "epoch": 0.8161130317150158, "flos": 24833170851840.0, "grad_norm": 1.508822042546265, "language_loss": 0.75437647, "learning_rate": 3.442490343611868e-07, "loss": 0.8311342, "num_input_tokens_seen": 292889680, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.08624268, "step": 13574, "time_per_iteration": 2.5545501708984375 }, { "auxiliary_loss_clip": 0.06416955, "auxiliary_loss_mlp": 0.01264642, "balance_loss_clip": 0.06275566, "balance_loss_mlp": 0.01254873, "epoch": 0.8161731549676837, "flos": 30964497989760.0, "grad_norm": 2.1766951318046126, "language_loss": 0.59585726, "learning_rate": 3.4403061205514485e-07, "loss": 0.67267323, "num_input_tokens_seen": 292912360, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09771729, "step": 13575, "time_per_iteration": 2.640449285507202 }, { "auxiliary_loss_clip": 0.06411052, "auxiliary_loss_mlp": 0.01263653, "balance_loss_clip": 0.06272577, "balance_loss_mlp": 0.01253783, "epoch": 0.8162332782203517, "flos": 18557975053440.0, "grad_norm": 1.815656366344603, "language_loss": 0.74348491, "learning_rate": 3.4381225254362736e-07, "loss": 0.82023191, "num_input_tokens_seen": 292928325, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09863281, "step": 13576, "time_per_iteration": 2.509256601333618 }, { "auxiliary_loss_clip": 0.06315254, "auxiliary_loss_mlp": 0.01250747, "balance_loss_clip": 0.06259259, "balance_loss_mlp": 0.01249659, "epoch": 0.8162934014730197, "flos": 70405700025600.0, "grad_norm": 0.8094698583454976, "language_loss": 0.58583361, "learning_rate": 3.435939558349155e-07, "loss": 0.6614936, "num_input_tokens_seen": 292992795, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01089478, "step": 13577, "time_per_iteration": 4.595527648925781 }, { "auxiliary_loss_clip": 0.06408317, "auxiliary_loss_mlp": 0.01264888, "balance_loss_clip": 0.06273084, "balance_loss_mlp": 0.01256001, "epoch": 0.8163535247256877, "flos": 21221165698560.0, "grad_norm": 1.5275159274143617, "language_loss": 0.71195191, "learning_rate": 3.4337572193728747e-07, "loss": 0.78868389, "num_input_tokens_seen": 293011950, "router_z_loss_clip": 1.35253906, "router_z_loss_mlp": 0.08886719, "step": 13578, "time_per_iteration": 2.5581376552581787 }, { "auxiliary_loss_clip": 0.06415817, "auxiliary_loss_mlp": 0.01264125, "balance_loss_clip": 0.06276181, "balance_loss_mlp": 0.01255137, "epoch": 0.8164136479783556, "flos": 21104061217920.0, "grad_norm": 1.5765635962216966, "language_loss": 0.7346921, "learning_rate": 3.431575508590172e-07, "loss": 0.81149155, "num_input_tokens_seen": 293030175, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.08990479, "step": 13579, "time_per_iteration": 2.558192491531372 }, { "auxiliary_loss_clip": 0.06416408, "auxiliary_loss_mlp": 0.01263169, "balance_loss_clip": 0.06274836, "balance_loss_mlp": 0.01254187, "epoch": 0.8164737712310236, "flos": 21726215383680.0, "grad_norm": 1.828236627553928, "language_loss": 0.79255056, "learning_rate": 3.4293944260837873e-07, "loss": 0.86934638, "num_input_tokens_seen": 293047980, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.08984375, "step": 13580, "time_per_iteration": 2.542912006378174 }, { "auxiliary_loss_clip": 0.06408301, "auxiliary_loss_mlp": 0.01267937, "balance_loss_clip": 0.06272174, "balance_loss_mlp": 0.01259116, "epoch": 0.8165338944836915, "flos": 19542903471360.0, "grad_norm": 3.1099409096890778, "language_loss": 0.69554752, "learning_rate": 3.4272139719364314e-07, "loss": 0.7723099, "num_input_tokens_seen": 293067030, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.08816528, "step": 13581, "time_per_iteration": 2.536541700363159 }, { "auxiliary_loss_clip": 0.0641311, "auxiliary_loss_mlp": 0.01263157, "balance_loss_clip": 0.06272452, "balance_loss_mlp": 0.0125433, "epoch": 0.8165940177363595, "flos": 22934996784000.0, "grad_norm": 2.3428355368128146, "language_loss": 0.60206902, "learning_rate": 3.4250341462307786e-07, "loss": 0.6788317, "num_input_tokens_seen": 293085575, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.08831787, "step": 13582, "time_per_iteration": 2.539072036743164 }, { "auxiliary_loss_clip": 0.06405816, "auxiliary_loss_mlp": 0.012646, "balance_loss_clip": 0.06272984, "balance_loss_mlp": 0.01256243, "epoch": 0.8166541409890276, "flos": 23377545722880.0, "grad_norm": 1.341928182344274, "language_loss": 0.82426572, "learning_rate": 3.4228549490494897e-07, "loss": 0.90096986, "num_input_tokens_seen": 293108200, "router_z_loss_clip": 1.32910156, "router_z_loss_mlp": 0.08349609, "step": 13583, "time_per_iteration": 2.6130175590515137 }, { "auxiliary_loss_clip": 0.06413643, "auxiliary_loss_mlp": 0.01263007, "balance_loss_clip": 0.06275166, "balance_loss_mlp": 0.01253381, "epoch": 0.8167142642416955, "flos": 18447872388480.0, "grad_norm": 2.658076888102971, "language_loss": 0.7453267, "learning_rate": 3.4206763804752093e-07, "loss": 0.82209319, "num_input_tokens_seen": 293126020, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09631348, "step": 13584, "time_per_iteration": 4.0395684242248535 }, { "auxiliary_loss_clip": 0.06421751, "auxiliary_loss_mlp": 0.01263754, "balance_loss_clip": 0.0628055, "balance_loss_mlp": 0.01253621, "epoch": 0.8167743874943635, "flos": 21221333406720.0, "grad_norm": 1.890098610522503, "language_loss": 0.74747366, "learning_rate": 3.4184984405905405e-07, "loss": 0.82432872, "num_input_tokens_seen": 293144620, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.10131836, "step": 13585, "time_per_iteration": 2.550875663757324 }, { "auxiliary_loss_clip": 0.06417634, "auxiliary_loss_mlp": 0.01267052, "balance_loss_clip": 0.06278563, "balance_loss_mlp": 0.01257127, "epoch": 0.8168345107470314, "flos": 18703646576640.0, "grad_norm": 1.6051882930421941, "language_loss": 0.69683135, "learning_rate": 3.416321129478068e-07, "loss": 0.77367818, "num_input_tokens_seen": 293162850, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09918213, "step": 13586, "time_per_iteration": 2.525355577468872 }, { "auxiliary_loss_clip": 0.06414162, "auxiliary_loss_mlp": 0.01266529, "balance_loss_clip": 0.0627393, "balance_loss_mlp": 0.01257749, "epoch": 0.8168946339996994, "flos": 16258648763520.0, "grad_norm": 1.4735143201238887, "language_loss": 0.60942566, "learning_rate": 3.4141444472203594e-07, "loss": 0.68623257, "num_input_tokens_seen": 293181620, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.08770752, "step": 13587, "time_per_iteration": 2.5332911014556885 }, { "auxiliary_loss_clip": 0.0642019, "auxiliary_loss_mlp": 0.01265832, "balance_loss_clip": 0.06275989, "balance_loss_mlp": 0.01256045, "epoch": 0.8169547572523673, "flos": 26948615356800.0, "grad_norm": 2.2551899652256355, "language_loss": 0.69249392, "learning_rate": 3.4119683938999624e-07, "loss": 0.76935422, "num_input_tokens_seen": 293200270, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.09790039, "step": 13588, "time_per_iteration": 2.5848071575164795 }, { "auxiliary_loss_clip": 0.06417783, "auxiliary_loss_mlp": 0.01270706, "balance_loss_clip": 0.06276704, "balance_loss_mlp": 0.01260972, "epoch": 0.8170148805050353, "flos": 18958204880640.0, "grad_norm": 1.5297616896217068, "language_loss": 0.73212671, "learning_rate": 3.4097929695993854e-07, "loss": 0.80901164, "num_input_tokens_seen": 293218960, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09741211, "step": 13589, "time_per_iteration": 2.5743672847747803 }, { "auxiliary_loss_clip": 0.06413083, "auxiliary_loss_mlp": 0.01267886, "balance_loss_clip": 0.0627471, "balance_loss_mlp": 0.01258505, "epoch": 0.8170750037577033, "flos": 21841307366400.0, "grad_norm": 1.7175387560445263, "language_loss": 0.737005, "learning_rate": 3.4076181744011166e-07, "loss": 0.8138147, "num_input_tokens_seen": 293236450, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09375, "step": 13590, "time_per_iteration": 2.6420202255249023 }, { "auxiliary_loss_clip": 0.06421782, "auxiliary_loss_mlp": 0.01266331, "balance_loss_clip": 0.06276608, "balance_loss_mlp": 0.01255889, "epoch": 0.8171351270103713, "flos": 33514986493440.0, "grad_norm": 1.7544348600057504, "language_loss": 0.65209877, "learning_rate": 3.4054440083876345e-07, "loss": 0.72897989, "num_input_tokens_seen": 293256480, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.10449219, "step": 13591, "time_per_iteration": 2.6540215015411377 }, { "auxiliary_loss_clip": 0.06419598, "auxiliary_loss_mlp": 0.0126663, "balance_loss_clip": 0.06274898, "balance_loss_mlp": 0.01256181, "epoch": 0.8171952502630392, "flos": 22714330256640.0, "grad_norm": 2.0559145170017565, "language_loss": 0.68542576, "learning_rate": 3.403270471641373e-07, "loss": 0.76228797, "num_input_tokens_seen": 293274960, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.10449219, "step": 13592, "time_per_iteration": 2.5677566528320312 }, { "auxiliary_loss_clip": 0.06414236, "auxiliary_loss_mlp": 0.01263525, "balance_loss_clip": 0.06272422, "balance_loss_mlp": 0.01254013, "epoch": 0.8172553735157072, "flos": 26730883722240.0, "grad_norm": 1.6247164505312908, "language_loss": 0.66705406, "learning_rate": 3.401097564244759e-07, "loss": 0.74383169, "num_input_tokens_seen": 293295945, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.09521484, "step": 13593, "time_per_iteration": 2.596264600753784 }, { "auxiliary_loss_clip": 0.06413756, "auxiliary_loss_mlp": 0.01264104, "balance_loss_clip": 0.06275584, "balance_loss_mlp": 0.01255492, "epoch": 0.8173154967683751, "flos": 15966551030400.0, "grad_norm": 2.0392143889819585, "language_loss": 0.70051366, "learning_rate": 3.398925286280188e-07, "loss": 0.77729225, "num_input_tokens_seen": 293313300, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.08605957, "step": 13594, "time_per_iteration": 2.532014846801758 }, { "auxiliary_loss_clip": 0.06418804, "auxiliary_loss_mlp": 0.01261972, "balance_loss_clip": 0.06274764, "balance_loss_mlp": 0.0125281, "epoch": 0.8173756200210431, "flos": 25992547470720.0, "grad_norm": 1.686657866378134, "language_loss": 0.66161698, "learning_rate": 3.3967536378300456e-07, "loss": 0.73842478, "num_input_tokens_seen": 293333085, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.0914917, "step": 13595, "time_per_iteration": 2.5679335594177246 }, { "auxiliary_loss_clip": 0.06423163, "auxiliary_loss_mlp": 0.01269866, "balance_loss_clip": 0.06277162, "balance_loss_mlp": 0.01260115, "epoch": 0.8174357432737112, "flos": 25671211862400.0, "grad_norm": 2.3426349400792925, "language_loss": 0.79164469, "learning_rate": 3.394582618976658e-07, "loss": 0.86857498, "num_input_tokens_seen": 293351895, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.09747314, "step": 13596, "time_per_iteration": 2.572453498840332 }, { "auxiliary_loss_clip": 0.06413764, "auxiliary_loss_mlp": 0.01262288, "balance_loss_clip": 0.06274876, "balance_loss_mlp": 0.01252907, "epoch": 0.8174958665263791, "flos": 21841517001600.0, "grad_norm": 2.211541001669009, "language_loss": 0.57833892, "learning_rate": 3.392412229802362e-07, "loss": 0.65509939, "num_input_tokens_seen": 293371165, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09387207, "step": 13597, "time_per_iteration": 2.558102607727051 }, { "auxiliary_loss_clip": 0.06412686, "auxiliary_loss_mlp": 0.0126716, "balance_loss_clip": 0.06275024, "balance_loss_mlp": 0.01258148, "epoch": 0.8175559897790471, "flos": 22462077939840.0, "grad_norm": 1.445230497453521, "language_loss": 0.82601804, "learning_rate": 3.390242470389462e-07, "loss": 0.90281653, "num_input_tokens_seen": 293391150, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09008789, "step": 13598, "time_per_iteration": 2.5672106742858887 }, { "auxiliary_loss_clip": 0.06414202, "auxiliary_loss_mlp": 0.01266999, "balance_loss_clip": 0.06272346, "balance_loss_mlp": 0.01257868, "epoch": 0.817616113031715, "flos": 23621328777600.0, "grad_norm": 2.0075579205351257, "language_loss": 0.82658064, "learning_rate": 3.3880733408202277e-07, "loss": 0.90339261, "num_input_tokens_seen": 293409440, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.09130859, "step": 13599, "time_per_iteration": 2.5922939777374268 }, { "auxiliary_loss_clip": 0.06410255, "auxiliary_loss_mlp": 0.0126554, "balance_loss_clip": 0.0627276, "balance_loss_mlp": 0.0125682, "epoch": 0.817676236284383, "flos": 27679572449280.0, "grad_norm": 1.7616764830178364, "language_loss": 0.83953631, "learning_rate": 3.3859048411769186e-07, "loss": 0.91629428, "num_input_tokens_seen": 293428995, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.08721924, "step": 13600, "time_per_iteration": 2.5758984088897705 }, { "auxiliary_loss_clip": 0.06416255, "auxiliary_loss_mlp": 0.01264707, "balance_loss_clip": 0.06273098, "balance_loss_mlp": 0.0125498, "epoch": 0.8177363595370509, "flos": 24687918599040.0, "grad_norm": 1.7280634991584365, "language_loss": 0.74069393, "learning_rate": 3.383736971541766e-07, "loss": 0.81750345, "num_input_tokens_seen": 293449155, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.09735107, "step": 13601, "time_per_iteration": 4.015136003494263 }, { "auxiliary_loss_clip": 0.06422819, "auxiliary_loss_mlp": 0.01262787, "balance_loss_clip": 0.06276595, "balance_loss_mlp": 0.01253114, "epoch": 0.817796482789719, "flos": 17351835056640.0, "grad_norm": 9.039730181307384, "language_loss": 0.68504429, "learning_rate": 3.3815697319969737e-07, "loss": 0.76190042, "num_input_tokens_seen": 293466125, "router_z_loss_clip": 1.46191406, "router_z_loss_mlp": 0.09674072, "step": 13602, "time_per_iteration": 2.5374081134796143 }, { "auxiliary_loss_clip": 0.06409505, "auxiliary_loss_mlp": 0.01264288, "balance_loss_clip": 0.06273323, "balance_loss_mlp": 0.01255193, "epoch": 0.8178566060423869, "flos": 17783105621760.0, "grad_norm": 2.0935026521369364, "language_loss": 0.83222276, "learning_rate": 3.379403122624718e-07, "loss": 0.90896064, "num_input_tokens_seen": 293481345, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09094238, "step": 13603, "time_per_iteration": 2.5133795738220215 }, { "auxiliary_loss_clip": 0.06414706, "auxiliary_loss_mlp": 0.01265492, "balance_loss_clip": 0.06274101, "balance_loss_mlp": 0.01255962, "epoch": 0.8179167292950549, "flos": 24980267894400.0, "grad_norm": 1.6343908409143164, "language_loss": 0.69266963, "learning_rate": 3.377237143507159e-07, "loss": 0.76947165, "num_input_tokens_seen": 293502330, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09521484, "step": 13604, "time_per_iteration": 2.5802996158599854 }, { "auxiliary_loss_clip": 0.06415023, "auxiliary_loss_mlp": 0.01265708, "balance_loss_clip": 0.06276045, "balance_loss_mlp": 0.01256708, "epoch": 0.8179768525477228, "flos": 22863397870080.0, "grad_norm": 1.600621195250874, "language_loss": 0.74176335, "learning_rate": 3.3750717947264406e-07, "loss": 0.81857073, "num_input_tokens_seen": 293521415, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09002686, "step": 13605, "time_per_iteration": 2.5471134185791016 }, { "auxiliary_loss_clip": 0.06412199, "auxiliary_loss_mlp": 0.0126752, "balance_loss_clip": 0.06275402, "balance_loss_mlp": 0.01256702, "epoch": 0.8180369758003908, "flos": 18521064529920.0, "grad_norm": 1.7721124482100516, "language_loss": 0.74993348, "learning_rate": 3.372907076364666e-07, "loss": 0.82673067, "num_input_tokens_seen": 293539245, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.10827637, "step": 13606, "time_per_iteration": 3.9619855880737305 }, { "auxiliary_loss_clip": 0.06412678, "auxiliary_loss_mlp": 0.01268558, "balance_loss_clip": 0.0627511, "balance_loss_mlp": 0.0125932, "epoch": 0.8180970990530587, "flos": 33190422503040.0, "grad_norm": 2.0692487013933154, "language_loss": 0.66233891, "learning_rate": 3.370742988503916e-07, "loss": 0.73915124, "num_input_tokens_seen": 293560640, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09228516, "step": 13607, "time_per_iteration": 2.69338321685791 }, { "auxiliary_loss_clip": 0.0641562, "auxiliary_loss_mlp": 0.01265336, "balance_loss_clip": 0.06275061, "balance_loss_mlp": 0.01255519, "epoch": 0.8181572223057267, "flos": 25017094563840.0, "grad_norm": 1.914179188818776, "language_loss": 0.70518392, "learning_rate": 3.3685795312262634e-07, "loss": 0.78199351, "num_input_tokens_seen": 293579465, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09820557, "step": 13608, "time_per_iteration": 2.553755760192871 }, { "auxiliary_loss_clip": 0.06411297, "auxiliary_loss_mlp": 0.01264886, "balance_loss_clip": 0.06272159, "balance_loss_mlp": 0.01255397, "epoch": 0.8182173455583948, "flos": 28556326846080.0, "grad_norm": 1.8909778558940538, "language_loss": 0.79726744, "learning_rate": 3.366416704613735e-07, "loss": 0.87402928, "num_input_tokens_seen": 293600540, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09490967, "step": 13609, "time_per_iteration": 2.5969386100769043 }, { "auxiliary_loss_clip": 0.06315054, "auxiliary_loss_mlp": 0.01254635, "balance_loss_clip": 0.06259283, "balance_loss_mlp": 0.0125358, "epoch": 0.8182774688110627, "flos": 72047051729280.0, "grad_norm": 0.740729689859858, "language_loss": 0.55908853, "learning_rate": 3.3642545087483544e-07, "loss": 0.63478541, "num_input_tokens_seen": 293665160, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.01055908, "step": 13610, "time_per_iteration": 3.2002816200256348 }, { "auxiliary_loss_clip": 0.06406484, "auxiliary_loss_mlp": 0.01265517, "balance_loss_clip": 0.06272732, "balance_loss_mlp": 0.01256522, "epoch": 0.8183375920637307, "flos": 19761431719680.0, "grad_norm": 1.8615352853226772, "language_loss": 0.78330886, "learning_rate": 3.362092943712107e-07, "loss": 0.86002886, "num_input_tokens_seen": 293683995, "router_z_loss_clip": 1.33886719, "router_z_loss_mlp": 0.08990479, "step": 13611, "time_per_iteration": 2.532656192779541 }, { "auxiliary_loss_clip": 0.06426415, "auxiliary_loss_mlp": 0.0126677, "balance_loss_clip": 0.06277777, "balance_loss_mlp": 0.01256316, "epoch": 0.8183977153163986, "flos": 22347740643840.0, "grad_norm": 2.018191658829253, "language_loss": 0.77429706, "learning_rate": 3.3599320095869745e-07, "loss": 0.85122889, "num_input_tokens_seen": 293704115, "router_z_loss_clip": 1.48632812, "router_z_loss_mlp": 0.10461426, "step": 13612, "time_per_iteration": 2.5645484924316406 }, { "auxiliary_loss_clip": 0.06413095, "auxiliary_loss_mlp": 0.01265827, "balance_loss_clip": 0.06276095, "balance_loss_mlp": 0.01256482, "epoch": 0.8184578385690666, "flos": 17718256961280.0, "grad_norm": 2.9563603315165667, "language_loss": 0.8661111, "learning_rate": 3.3577717064548793e-07, "loss": 0.9429003, "num_input_tokens_seen": 293722225, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09344482, "step": 13613, "time_per_iteration": 2.529799222946167 }, { "auxiliary_loss_clip": 0.06418055, "auxiliary_loss_mlp": 0.01266343, "balance_loss_clip": 0.06280024, "balance_loss_mlp": 0.01256776, "epoch": 0.8185179618217345, "flos": 25707996604800.0, "grad_norm": 1.7229346853917744, "language_loss": 0.73046076, "learning_rate": 3.355612034397746e-07, "loss": 0.80730474, "num_input_tokens_seen": 293743995, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09558105, "step": 13614, "time_per_iteration": 2.5987424850463867 }, { "auxiliary_loss_clip": 0.06414371, "auxiliary_loss_mlp": 0.01265204, "balance_loss_clip": 0.06272543, "balance_loss_mlp": 0.01255798, "epoch": 0.8185780850744026, "flos": 25967837715840.0, "grad_norm": 1.504921901284088, "language_loss": 0.81246585, "learning_rate": 3.353452993497479e-07, "loss": 0.8892616, "num_input_tokens_seen": 293764935, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.09411621, "step": 13615, "time_per_iteration": 2.642932176589966 }, { "auxiliary_loss_clip": 0.06416669, "auxiliary_loss_mlp": 0.01267562, "balance_loss_clip": 0.06276327, "balance_loss_mlp": 0.01257465, "epoch": 0.8186382083270705, "flos": 25235455104000.0, "grad_norm": 1.9678687023124313, "language_loss": 0.75937605, "learning_rate": 3.3512945838359375e-07, "loss": 0.83621842, "num_input_tokens_seen": 293784035, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.10107422, "step": 13616, "time_per_iteration": 2.6125364303588867 }, { "auxiliary_loss_clip": 0.06416252, "auxiliary_loss_mlp": 0.01264386, "balance_loss_clip": 0.06279732, "balance_loss_mlp": 0.01254384, "epoch": 0.8186983315797385, "flos": 22420890858240.0, "grad_norm": 1.7408806979388842, "language_loss": 0.7541554, "learning_rate": 3.349136805494979e-07, "loss": 0.83096176, "num_input_tokens_seen": 293803360, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.10003662, "step": 13617, "time_per_iteration": 4.049147605895996 }, { "auxiliary_loss_clip": 0.06413956, "auxiliary_loss_mlp": 0.01265512, "balance_loss_clip": 0.06276555, "balance_loss_mlp": 0.01256059, "epoch": 0.8187584548324064, "flos": 22024560245760.0, "grad_norm": 1.8831021890164563, "language_loss": 0.68247736, "learning_rate": 3.346979658556415e-07, "loss": 0.7592721, "num_input_tokens_seen": 293821325, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09454346, "step": 13618, "time_per_iteration": 2.5391292572021484 }, { "auxiliary_loss_clip": 0.06424335, "auxiliary_loss_mlp": 0.01263489, "balance_loss_clip": 0.06278208, "balance_loss_mlp": 0.01253506, "epoch": 0.8188185780850744, "flos": 29249325239040.0, "grad_norm": 1.7850638945212574, "language_loss": 0.69782162, "learning_rate": 3.344823143102058e-07, "loss": 0.77469993, "num_input_tokens_seen": 293840315, "router_z_loss_clip": 1.4609375, "router_z_loss_mlp": 0.09985352, "step": 13619, "time_per_iteration": 2.6070547103881836 }, { "auxiliary_loss_clip": 0.06421065, "auxiliary_loss_mlp": 0.01266154, "balance_loss_clip": 0.06279813, "balance_loss_mlp": 0.01256332, "epoch": 0.8188787013377423, "flos": 20701483476480.0, "grad_norm": 1.8395734595663913, "language_loss": 0.74413025, "learning_rate": 3.3426672592136694e-07, "loss": 0.82100242, "num_input_tokens_seen": 293855685, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09820557, "step": 13620, "time_per_iteration": 2.521611213684082 }, { "auxiliary_loss_clip": 0.06410391, "auxiliary_loss_mlp": 0.01265384, "balance_loss_clip": 0.06273913, "balance_loss_mlp": 0.01256282, "epoch": 0.8189388245904103, "flos": 23739816850560.0, "grad_norm": 1.5582379878308381, "language_loss": 0.76316512, "learning_rate": 3.340512006973011e-07, "loss": 0.83992285, "num_input_tokens_seen": 293875540, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.09100342, "step": 13621, "time_per_iteration": 2.5632846355438232 }, { "auxiliary_loss_clip": 0.0641508, "auxiliary_loss_mlp": 0.01262664, "balance_loss_clip": 0.06275851, "balance_loss_mlp": 0.01252233, "epoch": 0.8189989478430784, "flos": 28262342396160.0, "grad_norm": 1.9543778919360495, "language_loss": 0.66096151, "learning_rate": 3.3383573864618076e-07, "loss": 0.73773897, "num_input_tokens_seen": 293896570, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.10430908, "step": 13622, "time_per_iteration": 2.580676555633545 }, { "auxiliary_loss_clip": 0.06425329, "auxiliary_loss_mlp": 0.01265689, "balance_loss_clip": 0.06284198, "balance_loss_mlp": 0.01255902, "epoch": 0.8190590710957463, "flos": 21404125088640.0, "grad_norm": 1.7129481334742227, "language_loss": 0.75385869, "learning_rate": 3.3362033977617653e-07, "loss": 0.83076888, "num_input_tokens_seen": 293914680, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09783936, "step": 13623, "time_per_iteration": 2.551410675048828 }, { "auxiliary_loss_clip": 0.06416999, "auxiliary_loss_mlp": 0.01266408, "balance_loss_clip": 0.06275134, "balance_loss_mlp": 0.01257008, "epoch": 0.8191191943484143, "flos": 38804960384640.0, "grad_norm": 1.9093358350330356, "language_loss": 0.63326597, "learning_rate": 3.3340500409545527e-07, "loss": 0.71010005, "num_input_tokens_seen": 293936480, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.09393311, "step": 13624, "time_per_iteration": 4.029195785522461 }, { "auxiliary_loss_clip": 0.06415062, "auxiliary_loss_mlp": 0.01265712, "balance_loss_clip": 0.06279723, "balance_loss_mlp": 0.01256592, "epoch": 0.8191793176010822, "flos": 25453438300800.0, "grad_norm": 1.5982256846676757, "language_loss": 0.78538072, "learning_rate": 3.3318973161218386e-07, "loss": 0.86218846, "num_input_tokens_seen": 293957815, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.09112549, "step": 13625, "time_per_iteration": 2.5901710987091064 }, { "auxiliary_loss_clip": 0.06423342, "auxiliary_loss_mlp": 0.01265616, "balance_loss_clip": 0.06274191, "balance_loss_mlp": 0.01255882, "epoch": 0.8192394408537502, "flos": 25090118997120.0, "grad_norm": 1.7976501392878212, "language_loss": 0.75907457, "learning_rate": 3.329745223345244e-07, "loss": 0.83596414, "num_input_tokens_seen": 293975440, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.09735107, "step": 13626, "time_per_iteration": 2.5509226322174072 }, { "auxiliary_loss_clip": 0.06417213, "auxiliary_loss_mlp": 0.01261729, "balance_loss_clip": 0.06278122, "balance_loss_mlp": 0.01252454, "epoch": 0.8192995641064181, "flos": 27681291457920.0, "grad_norm": 1.4540461251167358, "language_loss": 0.74065006, "learning_rate": 3.3275937627063823e-07, "loss": 0.81743944, "num_input_tokens_seen": 293997540, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09277344, "step": 13627, "time_per_iteration": 2.6296355724334717 }, { "auxiliary_loss_clip": 0.06419677, "auxiliary_loss_mlp": 0.0126601, "balance_loss_clip": 0.06276594, "balance_loss_mlp": 0.01256378, "epoch": 0.8193596873590862, "flos": 21294944818560.0, "grad_norm": 1.6167490359231955, "language_loss": 0.69411594, "learning_rate": 3.3254429342868353e-07, "loss": 0.77097273, "num_input_tokens_seen": 294017030, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.09637451, "step": 13628, "time_per_iteration": 2.5574610233306885 }, { "auxiliary_loss_clip": 0.06424864, "auxiliary_loss_mlp": 0.01265495, "balance_loss_clip": 0.06279582, "balance_loss_mlp": 0.01255368, "epoch": 0.8194198106117541, "flos": 17498219339520.0, "grad_norm": 1.778466018403348, "language_loss": 0.85619867, "learning_rate": 3.323292738168171e-07, "loss": 0.93310225, "num_input_tokens_seen": 294035700, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.10125732, "step": 13629, "time_per_iteration": 2.5514369010925293 }, { "auxiliary_loss_clip": 0.06416489, "auxiliary_loss_mlp": 0.01263269, "balance_loss_clip": 0.06276046, "balance_loss_mlp": 0.01253774, "epoch": 0.8194799338644221, "flos": 15273301075200.0, "grad_norm": 1.9937159996178477, "language_loss": 0.74391007, "learning_rate": 3.3211431744319084e-07, "loss": 0.82070768, "num_input_tokens_seen": 294049730, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.0949707, "step": 13630, "time_per_iteration": 2.522930860519409 }, { "auxiliary_loss_clip": 0.06418662, "auxiliary_loss_mlp": 0.01263864, "balance_loss_clip": 0.06277149, "balance_loss_mlp": 0.01253958, "epoch": 0.81954005711709, "flos": 14723793999360.0, "grad_norm": 1.8730292557798793, "language_loss": 0.72851598, "learning_rate": 3.31899424315957e-07, "loss": 0.80534124, "num_input_tokens_seen": 294066545, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09899902, "step": 13631, "time_per_iteration": 2.525170087814331 }, { "auxiliary_loss_clip": 0.06416491, "auxiliary_loss_mlp": 0.0126574, "balance_loss_clip": 0.06275991, "balance_loss_mlp": 0.01256805, "epoch": 0.819600180369758, "flos": 23080416744960.0, "grad_norm": 1.6905040830363116, "language_loss": 0.76634991, "learning_rate": 3.3168459444326447e-07, "loss": 0.84317219, "num_input_tokens_seen": 294087455, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.08935547, "step": 13632, "time_per_iteration": 2.536292314529419 }, { "auxiliary_loss_clip": 0.06417034, "auxiliary_loss_mlp": 0.01264226, "balance_loss_clip": 0.06278483, "balance_loss_mlp": 0.01255411, "epoch": 0.8196603036224259, "flos": 27607176921600.0, "grad_norm": 1.7670347605248171, "language_loss": 0.66263258, "learning_rate": 3.314698278332588e-07, "loss": 0.73944515, "num_input_tokens_seen": 294107480, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.08813477, "step": 13633, "time_per_iteration": 2.5884618759155273 }, { "auxiliary_loss_clip": 0.06411768, "auxiliary_loss_mlp": 0.01263085, "balance_loss_clip": 0.06274559, "balance_loss_mlp": 0.01254126, "epoch": 0.8197204268750939, "flos": 28589086592640.0, "grad_norm": 1.4633176475052763, "language_loss": 0.76154208, "learning_rate": 3.3125512449408513e-07, "loss": 0.83829063, "num_input_tokens_seen": 294130115, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.08947754, "step": 13634, "time_per_iteration": 2.591649055480957 }, { "auxiliary_loss_clip": 0.06413197, "auxiliary_loss_mlp": 0.01266352, "balance_loss_clip": 0.06278162, "balance_loss_mlp": 0.01256905, "epoch": 0.819780550127762, "flos": 23265011289600.0, "grad_norm": 2.0370148516271915, "language_loss": 0.81842291, "learning_rate": 3.310404844338841e-07, "loss": 0.89521837, "num_input_tokens_seen": 294148495, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.09448242, "step": 13635, "time_per_iteration": 2.560720682144165 }, { "auxiliary_loss_clip": 0.0642069, "auxiliary_loss_mlp": 0.01266932, "balance_loss_clip": 0.06279178, "balance_loss_mlp": 0.01256287, "epoch": 0.8198406733804299, "flos": 26692086481920.0, "grad_norm": 1.5087208834572696, "language_loss": 0.76275241, "learning_rate": 3.308259076607949e-07, "loss": 0.8396287, "num_input_tokens_seen": 294169595, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.10644531, "step": 13636, "time_per_iteration": 2.5907559394836426 }, { "auxiliary_loss_clip": 0.06413593, "auxiliary_loss_mlp": 0.01264496, "balance_loss_clip": 0.06275646, "balance_loss_mlp": 0.0125521, "epoch": 0.8199007966330979, "flos": 20090272268160.0, "grad_norm": 1.8121444910403863, "language_loss": 0.81283098, "learning_rate": 3.3061139418295445e-07, "loss": 0.88961184, "num_input_tokens_seen": 294183885, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09283447, "step": 13637, "time_per_iteration": 2.529123067855835 }, { "auxiliary_loss_clip": 0.06412072, "auxiliary_loss_mlp": 0.0126408, "balance_loss_clip": 0.0627472, "balance_loss_mlp": 0.0125471, "epoch": 0.8199609198857658, "flos": 31910503386240.0, "grad_norm": 2.0359564719549423, "language_loss": 0.71230817, "learning_rate": 3.3039694400849725e-07, "loss": 0.78906977, "num_input_tokens_seen": 294200150, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09375, "step": 13638, "time_per_iteration": 2.686846971511841 }, { "auxiliary_loss_clip": 0.06421828, "auxiliary_loss_mlp": 0.01263963, "balance_loss_clip": 0.06276774, "balance_loss_mlp": 0.01253002, "epoch": 0.8200210431384338, "flos": 26477583229440.0, "grad_norm": 2.3081045654303796, "language_loss": 0.79987848, "learning_rate": 3.3018255714555564e-07, "loss": 0.8767364, "num_input_tokens_seen": 294220385, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.10955811, "step": 13639, "time_per_iteration": 2.5999252796173096 }, { "auxiliary_loss_clip": 0.06411196, "auxiliary_loss_mlp": 0.01262911, "balance_loss_clip": 0.06272733, "balance_loss_mlp": 0.01254096, "epoch": 0.8200811663911017, "flos": 22098087803520.0, "grad_norm": 6.1093628839243745, "language_loss": 0.7937603, "learning_rate": 3.299682336022589e-07, "loss": 0.8705014, "num_input_tokens_seen": 294239355, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.0881958, "step": 13640, "time_per_iteration": 2.5502429008483887 }, { "auxiliary_loss_clip": 0.06426308, "auxiliary_loss_mlp": 0.01266729, "balance_loss_clip": 0.06278713, "balance_loss_mlp": 0.01256489, "epoch": 0.8201412896437698, "flos": 37602174551040.0, "grad_norm": 1.9464490496592661, "language_loss": 0.63160849, "learning_rate": 3.297539733867336e-07, "loss": 0.70853889, "num_input_tokens_seen": 294259395, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.10241699, "step": 13641, "time_per_iteration": 4.093515157699585 }, { "auxiliary_loss_clip": 0.06415968, "auxiliary_loss_mlp": 0.01267137, "balance_loss_clip": 0.06277252, "balance_loss_mlp": 0.01257213, "epoch": 0.8202014128964377, "flos": 19652461084800.0, "grad_norm": 1.70880240997748, "language_loss": 0.73908234, "learning_rate": 3.295397765071055e-07, "loss": 0.81591344, "num_input_tokens_seen": 294277365, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09924316, "step": 13642, "time_per_iteration": 2.5185670852661133 }, { "auxiliary_loss_clip": 0.06414589, "auxiliary_loss_mlp": 0.01267571, "balance_loss_clip": 0.06275752, "balance_loss_mlp": 0.01258141, "epoch": 0.8202615361491057, "flos": 31475375533440.0, "grad_norm": 1.60020359441694, "language_loss": 0.70909178, "learning_rate": 3.2932564297149615e-07, "loss": 0.78591335, "num_input_tokens_seen": 294297555, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09436035, "step": 13643, "time_per_iteration": 2.671037197113037 }, { "auxiliary_loss_clip": 0.06417639, "auxiliary_loss_mlp": 0.01267988, "balance_loss_clip": 0.06280192, "balance_loss_mlp": 0.01258898, "epoch": 0.8203216594017736, "flos": 24722145792000.0, "grad_norm": 1.7178815638327887, "language_loss": 0.65889537, "learning_rate": 3.291115727880256e-07, "loss": 0.73575169, "num_input_tokens_seen": 294317600, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09088135, "step": 13644, "time_per_iteration": 2.573659658432007 }, { "auxiliary_loss_clip": 0.06417495, "auxiliary_loss_mlp": 0.01266325, "balance_loss_clip": 0.06275641, "balance_loss_mlp": 0.01256561, "epoch": 0.8203817826544416, "flos": 26039101213440.0, "grad_norm": 1.3914850558903853, "language_loss": 0.70685631, "learning_rate": 3.2889756596481234e-07, "loss": 0.78369451, "num_input_tokens_seen": 294340215, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09759521, "step": 13645, "time_per_iteration": 4.073330402374268 }, { "auxiliary_loss_clip": 0.06412034, "auxiliary_loss_mlp": 0.01264718, "balance_loss_clip": 0.06274448, "balance_loss_mlp": 0.01255718, "epoch": 0.8204419059071095, "flos": 25961087462400.0, "grad_norm": 2.238842590032847, "language_loss": 0.71862662, "learning_rate": 3.286836225099707e-07, "loss": 0.79539406, "num_input_tokens_seen": 294358590, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09002686, "step": 13646, "time_per_iteration": 2.5596511363983154 }, { "auxiliary_loss_clip": 0.06424527, "auxiliary_loss_mlp": 0.01268515, "balance_loss_clip": 0.06281405, "balance_loss_mlp": 0.0125871, "epoch": 0.8205020291597775, "flos": 23585717992320.0, "grad_norm": 2.2762025952963505, "language_loss": 0.79095894, "learning_rate": 3.284697424316132e-07, "loss": 0.86788934, "num_input_tokens_seen": 294375825, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.09802246, "step": 13647, "time_per_iteration": 2.5861287117004395 }, { "auxiliary_loss_clip": 0.06411259, "auxiliary_loss_mlp": 0.01265453, "balance_loss_clip": 0.06276567, "balance_loss_mlp": 0.01256298, "epoch": 0.8205621524124456, "flos": 26806759194240.0, "grad_norm": 1.3918080945347653, "language_loss": 0.68079543, "learning_rate": 3.2825592573785034e-07, "loss": 0.75756252, "num_input_tokens_seen": 294398500, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.09161377, "step": 13648, "time_per_iteration": 2.6201846599578857 }, { "auxiliary_loss_clip": 0.06413631, "auxiliary_loss_mlp": 0.01268176, "balance_loss_clip": 0.06271863, "balance_loss_mlp": 0.01258186, "epoch": 0.8206222756651135, "flos": 27535410299520.0, "grad_norm": 1.645135552985211, "language_loss": 0.80168366, "learning_rate": 3.28042172436791e-07, "loss": 0.87850171, "num_input_tokens_seen": 294418840, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09985352, "step": 13649, "time_per_iteration": 2.6148641109466553 }, { "auxiliary_loss_clip": 0.06417155, "auxiliary_loss_mlp": 0.01267179, "balance_loss_clip": 0.06277134, "balance_loss_mlp": 0.01257208, "epoch": 0.8206823989177815, "flos": 21184967934720.0, "grad_norm": 1.58670407119409, "language_loss": 0.69338405, "learning_rate": 3.278284825365396e-07, "loss": 0.77022737, "num_input_tokens_seen": 294438215, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09979248, "step": 13650, "time_per_iteration": 2.568804979324341 }, { "auxiliary_loss_clip": 0.06415059, "auxiliary_loss_mlp": 0.01265861, "balance_loss_clip": 0.06275585, "balance_loss_mlp": 0.01255592, "epoch": 0.8207425221704494, "flos": 11514324660480.0, "grad_norm": 2.1740917001741673, "language_loss": 0.609505, "learning_rate": 3.276148560452001e-07, "loss": 0.68631417, "num_input_tokens_seen": 294455260, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.10272217, "step": 13651, "time_per_iteration": 2.5554473400115967 }, { "auxiliary_loss_clip": 0.06419761, "auxiliary_loss_mlp": 0.01264877, "balance_loss_clip": 0.0627687, "balance_loss_mlp": 0.01254816, "epoch": 0.8208026454231174, "flos": 19798090680960.0, "grad_norm": 1.7731790804600052, "language_loss": 0.72423023, "learning_rate": 3.2740129297087293e-07, "loss": 0.80107665, "num_input_tokens_seen": 294473205, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10064697, "step": 13652, "time_per_iteration": 2.5520431995391846 }, { "auxiliary_loss_clip": 0.06410711, "auxiliary_loss_mlp": 0.01266037, "balance_loss_clip": 0.06277038, "balance_loss_mlp": 0.0125715, "epoch": 0.8208627686757853, "flos": 15672692361600.0, "grad_norm": 1.6646909764796176, "language_loss": 0.73062569, "learning_rate": 3.271877933216558e-07, "loss": 0.80739319, "num_input_tokens_seen": 294490645, "router_z_loss_clip": 1.3359375, "router_z_loss_mlp": 0.08892822, "step": 13653, "time_per_iteration": 2.5355093479156494 }, { "auxiliary_loss_clip": 0.06431574, "auxiliary_loss_mlp": 0.01272283, "balance_loss_clip": 0.06284863, "balance_loss_mlp": 0.0126109, "epoch": 0.8209228919284534, "flos": 37490897928960.0, "grad_norm": 2.5170164672529425, "language_loss": 0.62975866, "learning_rate": 3.269743571056451e-07, "loss": 0.70679724, "num_input_tokens_seen": 294513500, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.11199951, "step": 13654, "time_per_iteration": 2.6874775886535645 }, { "auxiliary_loss_clip": 0.06415288, "auxiliary_loss_mlp": 0.01263229, "balance_loss_clip": 0.06272905, "balance_loss_mlp": 0.01253543, "epoch": 0.8209830151811213, "flos": 23119759036800.0, "grad_norm": 1.8263620758700188, "language_loss": 0.70338798, "learning_rate": 3.2676098433093447e-07, "loss": 0.78017312, "num_input_tokens_seen": 294535710, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09680176, "step": 13655, "time_per_iteration": 2.569349765777588 }, { "auxiliary_loss_clip": 0.06411314, "auxiliary_loss_mlp": 0.01263806, "balance_loss_clip": 0.06274031, "balance_loss_mlp": 0.01254687, "epoch": 0.8210431384337893, "flos": 21294567475200.0, "grad_norm": 1.9412197044104236, "language_loss": 0.82286382, "learning_rate": 3.265476750056162e-07, "loss": 0.89961505, "num_input_tokens_seen": 294554055, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09118652, "step": 13656, "time_per_iteration": 3.9978692531585693 }, { "auxiliary_loss_clip": 0.06410316, "auxiliary_loss_mlp": 0.01265212, "balance_loss_clip": 0.06276061, "balance_loss_mlp": 0.01255479, "epoch": 0.8211032616864572, "flos": 11505897325440.0, "grad_norm": 2.2996086560898004, "language_loss": 0.73771477, "learning_rate": 3.2633442913777654e-07, "loss": 0.81447005, "num_input_tokens_seen": 294570390, "router_z_loss_clip": 1.34179688, "router_z_loss_mlp": 0.09735107, "step": 13657, "time_per_iteration": 2.5066399574279785 }, { "auxiliary_loss_clip": 0.06414495, "auxiliary_loss_mlp": 0.01264123, "balance_loss_clip": 0.06274778, "balance_loss_mlp": 0.01255349, "epoch": 0.8211633849391252, "flos": 29828573314560.0, "grad_norm": 1.5985838161332833, "language_loss": 0.55785871, "learning_rate": 3.2612124673550325e-07, "loss": 0.63464487, "num_input_tokens_seen": 294593050, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.08764648, "step": 13658, "time_per_iteration": 2.6618990898132324 }, { "auxiliary_loss_clip": 0.064146, "auxiliary_loss_mlp": 0.01265564, "balance_loss_clip": 0.06272829, "balance_loss_mlp": 0.01255979, "epoch": 0.8212235081917931, "flos": 13120484849280.0, "grad_norm": 2.012936899693933, "language_loss": 0.79229259, "learning_rate": 3.259081278068805e-07, "loss": 0.86909425, "num_input_tokens_seen": 294608550, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.09588623, "step": 13659, "time_per_iteration": 2.515076160430908 }, { "auxiliary_loss_clip": 0.06411625, "auxiliary_loss_mlp": 0.01265591, "balance_loss_clip": 0.06275757, "balance_loss_mlp": 0.01256919, "epoch": 0.8212836314444611, "flos": 40524828963840.0, "grad_norm": 2.082156437974094, "language_loss": 0.59879923, "learning_rate": 3.256950723599887e-07, "loss": 0.67557132, "num_input_tokens_seen": 294630380, "router_z_loss_clip": 1.35644531, "router_z_loss_mlp": 0.08673096, "step": 13660, "time_per_iteration": 2.7346901893615723 }, { "auxiliary_loss_clip": 0.06423372, "auxiliary_loss_mlp": 0.01267309, "balance_loss_clip": 0.06280763, "balance_loss_mlp": 0.01257391, "epoch": 0.8213437546971292, "flos": 18776503301760.0, "grad_norm": 1.835575431633917, "language_loss": 0.732207, "learning_rate": 3.254820804029075e-07, "loss": 0.8091138, "num_input_tokens_seen": 294648655, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09918213, "step": 13661, "time_per_iteration": 2.5324866771698 }, { "auxiliary_loss_clip": 0.06422274, "auxiliary_loss_mlp": 0.01268973, "balance_loss_clip": 0.06279013, "balance_loss_mlp": 0.01258876, "epoch": 0.8214038779497971, "flos": 19688323432320.0, "grad_norm": 2.566595372497912, "language_loss": 0.7550658, "learning_rate": 3.252691519437143e-07, "loss": 0.83197826, "num_input_tokens_seen": 294666915, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.10095215, "step": 13662, "time_per_iteration": 2.5443196296691895 }, { "auxiliary_loss_clip": 0.06316642, "auxiliary_loss_mlp": 0.01251963, "balance_loss_clip": 0.06260827, "balance_loss_mlp": 0.01250924, "epoch": 0.8214640012024651, "flos": 71624040791040.0, "grad_norm": 0.7388431336850375, "language_loss": 0.54053074, "learning_rate": 3.250562869904825e-07, "loss": 0.61621684, "num_input_tokens_seen": 294731545, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.01039124, "step": 13663, "time_per_iteration": 4.6270787715911865 }, { "auxiliary_loss_clip": 0.06413825, "auxiliary_loss_mlp": 0.01268145, "balance_loss_clip": 0.0627367, "balance_loss_mlp": 0.01258066, "epoch": 0.821524124455133, "flos": 14762507385600.0, "grad_norm": 1.9549573239721267, "language_loss": 0.6634804, "learning_rate": 3.248434855512838e-07, "loss": 0.74030006, "num_input_tokens_seen": 294748745, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.10070801, "step": 13664, "time_per_iteration": 2.5055201053619385 }, { "auxiliary_loss_clip": 0.06413396, "auxiliary_loss_mlp": 0.01262884, "balance_loss_clip": 0.06275806, "balance_loss_mlp": 0.01253926, "epoch": 0.821584247707801, "flos": 25089238529280.0, "grad_norm": 1.3586602494898967, "language_loss": 0.74993968, "learning_rate": 3.246307476341881e-07, "loss": 0.82670242, "num_input_tokens_seen": 294768955, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.08959961, "step": 13665, "time_per_iteration": 2.5796618461608887 }, { "auxiliary_loss_clip": 0.06415752, "auxiliary_loss_mlp": 0.01265156, "balance_loss_clip": 0.06275284, "balance_loss_mlp": 0.01256114, "epoch": 0.8216443709604689, "flos": 36839631669120.0, "grad_norm": 1.97735070248974, "language_loss": 0.66113079, "learning_rate": 3.2441807324726256e-07, "loss": 0.73793983, "num_input_tokens_seen": 294789250, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09033203, "step": 13666, "time_per_iteration": 2.659191846847534 }, { "auxiliary_loss_clip": 0.06414335, "auxiliary_loss_mlp": 0.01265316, "balance_loss_clip": 0.06274094, "balance_loss_mlp": 0.01256113, "epoch": 0.821704494213137, "flos": 25088693477760.0, "grad_norm": 1.6774884766905351, "language_loss": 0.77045906, "learning_rate": 3.2420546239857174e-07, "loss": 0.84725553, "num_input_tokens_seen": 294809760, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09197998, "step": 13667, "time_per_iteration": 2.595442295074463 }, { "auxiliary_loss_clip": 0.06419806, "auxiliary_loss_mlp": 0.01265475, "balance_loss_clip": 0.06277411, "balance_loss_mlp": 0.01255671, "epoch": 0.8217646174658049, "flos": 14361397090560.0, "grad_norm": 1.7907848238056499, "language_loss": 0.77284342, "learning_rate": 3.239929150961773e-07, "loss": 0.84969622, "num_input_tokens_seen": 294826495, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.0980835, "step": 13668, "time_per_iteration": 2.5214996337890625 }, { "auxiliary_loss_clip": 0.06413939, "auxiliary_loss_mlp": 0.01265883, "balance_loss_clip": 0.06276514, "balance_loss_mlp": 0.01256608, "epoch": 0.8218247407184729, "flos": 22097039627520.0, "grad_norm": 1.8078028603993161, "language_loss": 0.74092901, "learning_rate": 3.2378043134813984e-07, "loss": 0.81772721, "num_input_tokens_seen": 294845370, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09277344, "step": 13669, "time_per_iteration": 2.568298816680908 }, { "auxiliary_loss_clip": 0.0641326, "auxiliary_loss_mlp": 0.01266593, "balance_loss_clip": 0.06274317, "balance_loss_mlp": 0.01257342, "epoch": 0.8218848639711408, "flos": 16769694015360.0, "grad_norm": 1.7878391666107427, "language_loss": 0.79110527, "learning_rate": 3.235680111625161e-07, "loss": 0.86790371, "num_input_tokens_seen": 294863740, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09246826, "step": 13670, "time_per_iteration": 2.5341110229492188 }, { "auxiliary_loss_clip": 0.06421915, "auxiliary_loss_mlp": 0.0126503, "balance_loss_clip": 0.06278232, "balance_loss_mlp": 0.01255255, "epoch": 0.8219449872238088, "flos": 26001981054720.0, "grad_norm": 2.1566663733061837, "language_loss": 0.74948555, "learning_rate": 3.2335565454736123e-07, "loss": 0.82635504, "num_input_tokens_seen": 294882815, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.09771729, "step": 13671, "time_per_iteration": 2.5739097595214844 }, { "auxiliary_loss_clip": 0.06421565, "auxiliary_loss_mlp": 0.01269014, "balance_loss_clip": 0.06275074, "balance_loss_mlp": 0.01258029, "epoch": 0.8220051104764767, "flos": 20784528472320.0, "grad_norm": 1.7447786866796433, "language_loss": 0.76579463, "learning_rate": 3.23143361510728e-07, "loss": 0.84270042, "num_input_tokens_seen": 294901985, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.10986328, "step": 13672, "time_per_iteration": 2.5759451389312744 }, { "auxiliary_loss_clip": 0.06418113, "auxiliary_loss_mlp": 0.01264724, "balance_loss_clip": 0.06276956, "balance_loss_mlp": 0.01254239, "epoch": 0.8220652337291448, "flos": 14580134974080.0, "grad_norm": 2.33776680463824, "language_loss": 0.74973869, "learning_rate": 3.2293113206066733e-07, "loss": 0.82656705, "num_input_tokens_seen": 294919705, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.10473633, "step": 13673, "time_per_iteration": 2.5552725791931152 }, { "auxiliary_loss_clip": 0.06422873, "auxiliary_loss_mlp": 0.0126522, "balance_loss_clip": 0.06280197, "balance_loss_mlp": 0.01255302, "epoch": 0.8221253569818128, "flos": 23812715502720.0, "grad_norm": 1.6110139689066028, "language_loss": 0.79662782, "learning_rate": 3.227189662052254e-07, "loss": 0.87350881, "num_input_tokens_seen": 294939900, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.09918213, "step": 13674, "time_per_iteration": 2.5895586013793945 }, { "auxiliary_loss_clip": 0.06419201, "auxiliary_loss_mlp": 0.01265361, "balance_loss_clip": 0.06281143, "balance_loss_mlp": 0.01255103, "epoch": 0.8221854802344807, "flos": 21294651329280.0, "grad_norm": 2.0547543070265886, "language_loss": 0.70814699, "learning_rate": 3.225068639524484e-07, "loss": 0.78499264, "num_input_tokens_seen": 294959110, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.10253906, "step": 13675, "time_per_iteration": 2.5403494834899902 }, { "auxiliary_loss_clip": 0.06408211, "auxiliary_loss_mlp": 0.01266266, "balance_loss_clip": 0.06272727, "balance_loss_mlp": 0.01256992, "epoch": 0.8222456034871487, "flos": 20962624325760.0, "grad_norm": 1.5930896112624986, "language_loss": 0.74761462, "learning_rate": 3.2229482531037965e-07, "loss": 0.82435942, "num_input_tokens_seen": 294978660, "router_z_loss_clip": 1.35253906, "router_z_loss_mlp": 0.0927124, "step": 13676, "time_per_iteration": 2.547600269317627 }, { "auxiliary_loss_clip": 0.06417362, "auxiliary_loss_mlp": 0.01267025, "balance_loss_clip": 0.06277363, "balance_loss_mlp": 0.01257769, "epoch": 0.8223057267398166, "flos": 21403915453440.0, "grad_norm": 1.7081602566409457, "language_loss": 0.80908883, "learning_rate": 3.2208285028705893e-07, "loss": 0.88593268, "num_input_tokens_seen": 294998075, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09259033, "step": 13677, "time_per_iteration": 2.5192949771881104 }, { "auxiliary_loss_clip": 0.06416659, "auxiliary_loss_mlp": 0.01265095, "balance_loss_clip": 0.06275121, "balance_loss_mlp": 0.01255678, "epoch": 0.8223658499924846, "flos": 15273636491520.0, "grad_norm": 1.8202691358495517, "language_loss": 0.70319247, "learning_rate": 3.218709388905245e-07, "loss": 0.78000998, "num_input_tokens_seen": 295015950, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09411621, "step": 13678, "time_per_iteration": 2.510516405105591 }, { "auxiliary_loss_clip": 0.06412949, "auxiliary_loss_mlp": 0.01266425, "balance_loss_clip": 0.06273959, "balance_loss_mlp": 0.01257371, "epoch": 0.8224259732451525, "flos": 31257727752960.0, "grad_norm": 1.4805239728092623, "language_loss": 0.71516812, "learning_rate": 3.216590911288133e-07, "loss": 0.79196185, "num_input_tokens_seen": 295036800, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09057617, "step": 13679, "time_per_iteration": 2.6112630367279053 }, { "auxiliary_loss_clip": 0.06408485, "auxiliary_loss_mlp": 0.01263974, "balance_loss_clip": 0.06271669, "balance_loss_mlp": 0.01254455, "epoch": 0.8224860964978206, "flos": 21580166517120.0, "grad_norm": 1.8061189563521696, "language_loss": 0.69546771, "learning_rate": 3.214473070099564e-07, "loss": 0.7721923, "num_input_tokens_seen": 295055300, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.09521484, "step": 13680, "time_per_iteration": 2.5237574577331543 }, { "auxiliary_loss_clip": 0.06416117, "auxiliary_loss_mlp": 0.01262696, "balance_loss_clip": 0.06277528, "balance_loss_mlp": 0.0125354, "epoch": 0.8225462197504885, "flos": 25490181116160.0, "grad_norm": 1.5981408286462506, "language_loss": 0.59985542, "learning_rate": 3.21235586541986e-07, "loss": 0.67664361, "num_input_tokens_seen": 295076420, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09155273, "step": 13681, "time_per_iteration": 3.9918081760406494 }, { "auxiliary_loss_clip": 0.06421886, "auxiliary_loss_mlp": 0.01267926, "balance_loss_clip": 0.06277357, "balance_loss_mlp": 0.01257996, "epoch": 0.8226063430031565, "flos": 39394941782400.0, "grad_norm": 1.6877920296330715, "language_loss": 0.69732732, "learning_rate": 3.2102392973293047e-07, "loss": 0.77422547, "num_input_tokens_seen": 295100540, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.0993042, "step": 13682, "time_per_iteration": 2.724766492843628 }, { "auxiliary_loss_clip": 0.06415685, "auxiliary_loss_mlp": 0.01264733, "balance_loss_clip": 0.0627612, "balance_loss_mlp": 0.01254308, "epoch": 0.8226664662558244, "flos": 22821036831360.0, "grad_norm": 1.6599192096213546, "language_loss": 0.79014516, "learning_rate": 3.20812336590816e-07, "loss": 0.86694938, "num_input_tokens_seen": 295120180, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.10430908, "step": 13683, "time_per_iteration": 2.5666518211364746 }, { "auxiliary_loss_clip": 0.0641011, "auxiliary_loss_mlp": 0.01263558, "balance_loss_clip": 0.06275572, "balance_loss_mlp": 0.01254892, "epoch": 0.8227265895084924, "flos": 25672595454720.0, "grad_norm": 1.8751695158455923, "language_loss": 0.86595666, "learning_rate": 3.206008071236661e-07, "loss": 0.94269335, "num_input_tokens_seen": 295138530, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.08673096, "step": 13684, "time_per_iteration": 2.5735204219818115 }, { "auxiliary_loss_clip": 0.06411944, "auxiliary_loss_mlp": 0.01264389, "balance_loss_clip": 0.06277469, "balance_loss_mlp": 0.01255133, "epoch": 0.8227867127611603, "flos": 26186827161600.0, "grad_norm": 1.5508536135916264, "language_loss": 0.7999292, "learning_rate": 3.2038934133950157e-07, "loss": 0.87669253, "num_input_tokens_seen": 295160260, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.09259033, "step": 13685, "time_per_iteration": 4.1706883907318115 }, { "auxiliary_loss_clip": 0.06418794, "auxiliary_loss_mlp": 0.01266042, "balance_loss_clip": 0.06278745, "balance_loss_mlp": 0.01256613, "epoch": 0.8228468360138284, "flos": 22024602172800.0, "grad_norm": 1.9554818219407337, "language_loss": 0.68915421, "learning_rate": 3.2017793924634194e-07, "loss": 0.76600254, "num_input_tokens_seen": 295177055, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09429932, "step": 13686, "time_per_iteration": 2.566110849380493 }, { "auxiliary_loss_clip": 0.06421533, "auxiliary_loss_mlp": 0.01264856, "balance_loss_clip": 0.06280833, "balance_loss_mlp": 0.01255451, "epoch": 0.8229069592664963, "flos": 14908723960320.0, "grad_norm": 1.9969520954705673, "language_loss": 0.78767169, "learning_rate": 3.1996660085220263e-07, "loss": 0.86453557, "num_input_tokens_seen": 295193870, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09411621, "step": 13687, "time_per_iteration": 2.5212607383728027 }, { "auxiliary_loss_clip": 0.06416136, "auxiliary_loss_mlp": 0.01263579, "balance_loss_clip": 0.06276232, "balance_loss_mlp": 0.01253852, "epoch": 0.8229670825191643, "flos": 15674956421760.0, "grad_norm": 1.7078312527008537, "language_loss": 0.72540247, "learning_rate": 3.1975532616509825e-07, "loss": 0.80219966, "num_input_tokens_seen": 295211040, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09735107, "step": 13688, "time_per_iteration": 2.5223894119262695 }, { "auxiliary_loss_clip": 0.06418304, "auxiliary_loss_mlp": 0.01267254, "balance_loss_clip": 0.06278095, "balance_loss_mlp": 0.01258116, "epoch": 0.8230272057718323, "flos": 23189890504320.0, "grad_norm": 1.6107910070803246, "language_loss": 0.73967552, "learning_rate": 3.1954411519304025e-07, "loss": 0.81653112, "num_input_tokens_seen": 295231300, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09124756, "step": 13689, "time_per_iteration": 2.5790531635284424 }, { "auxiliary_loss_clip": 0.06420518, "auxiliary_loss_mlp": 0.01263531, "balance_loss_clip": 0.06278262, "balance_loss_mlp": 0.01254167, "epoch": 0.8230873290245002, "flos": 21038709432960.0, "grad_norm": 1.9315872183070057, "language_loss": 0.69464576, "learning_rate": 3.1933296794403887e-07, "loss": 0.77148628, "num_input_tokens_seen": 295251045, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09362793, "step": 13690, "time_per_iteration": 2.5473763942718506 }, { "auxiliary_loss_clip": 0.06417018, "auxiliary_loss_mlp": 0.01266547, "balance_loss_clip": 0.06275598, "balance_loss_mlp": 0.01257225, "epoch": 0.8231474522771682, "flos": 21256273359360.0, "grad_norm": 1.6048683345900936, "language_loss": 0.85703516, "learning_rate": 3.191218844260988e-07, "loss": 0.93387091, "num_input_tokens_seen": 295270225, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09320068, "step": 13691, "time_per_iteration": 2.5439181327819824 }, { "auxiliary_loss_clip": 0.06419933, "auxiliary_loss_mlp": 0.01265995, "balance_loss_clip": 0.06278149, "balance_loss_mlp": 0.01256172, "epoch": 0.8232075755298361, "flos": 23848829412480.0, "grad_norm": 1.7137086544667015, "language_loss": 0.76999104, "learning_rate": 3.189108646472252e-07, "loss": 0.84685028, "num_input_tokens_seen": 295288950, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.0982666, "step": 13692, "time_per_iteration": 2.54357647895813 }, { "auxiliary_loss_clip": 0.06412187, "auxiliary_loss_mlp": 0.01265244, "balance_loss_clip": 0.06273589, "balance_loss_mlp": 0.01255612, "epoch": 0.8232676987825042, "flos": 21660570109440.0, "grad_norm": 1.4769654751451935, "language_loss": 0.71793085, "learning_rate": 3.186999086154205e-07, "loss": 0.79470515, "num_input_tokens_seen": 295309405, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09625244, "step": 13693, "time_per_iteration": 2.5353829860687256 }, { "auxiliary_loss_clip": 0.06411754, "auxiliary_loss_mlp": 0.01264692, "balance_loss_clip": 0.06276213, "balance_loss_mlp": 0.01255686, "epoch": 0.8233278220351721, "flos": 26329857281280.0, "grad_norm": 1.3547291257621583, "language_loss": 0.8386451, "learning_rate": 3.1848901633868355e-07, "loss": 0.91540956, "num_input_tokens_seen": 295331115, "router_z_loss_clip": 1.35546875, "router_z_loss_mlp": 0.09008789, "step": 13694, "time_per_iteration": 2.5700244903564453 }, { "auxiliary_loss_clip": 0.06418965, "auxiliary_loss_mlp": 0.0126502, "balance_loss_clip": 0.06275763, "balance_loss_mlp": 0.0125565, "epoch": 0.8233879452878401, "flos": 21732252877440.0, "grad_norm": 1.5567771713951293, "language_loss": 0.77272487, "learning_rate": 3.182781878250118e-07, "loss": 0.84956467, "num_input_tokens_seen": 295350495, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09368896, "step": 13695, "time_per_iteration": 2.5443599224090576 }, { "auxiliary_loss_clip": 0.06419247, "auxiliary_loss_mlp": 0.01262814, "balance_loss_clip": 0.06279978, "balance_loss_mlp": 0.01254201, "epoch": 0.823448068540508, "flos": 20563903872000.0, "grad_norm": 1.9981528248252212, "language_loss": 0.81565475, "learning_rate": 3.1806742308239985e-07, "loss": 0.89247537, "num_input_tokens_seen": 295368225, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.08612061, "step": 13696, "time_per_iteration": 3.966369867324829 }, { "auxiliary_loss_clip": 0.06316712, "auxiliary_loss_mlp": 0.01250513, "balance_loss_clip": 0.0626059, "balance_loss_mlp": 0.01249418, "epoch": 0.823508191793176, "flos": 67296130352640.0, "grad_norm": 0.7222036983976556, "language_loss": 0.63630491, "learning_rate": 3.178567221188393e-07, "loss": 0.71197718, "num_input_tokens_seen": 295430035, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01097107, "step": 13697, "time_per_iteration": 3.2190396785736084 }, { "auxiliary_loss_clip": 0.06409907, "auxiliary_loss_mlp": 0.01264778, "balance_loss_clip": 0.06275275, "balance_loss_mlp": 0.01256088, "epoch": 0.8235683150458439, "flos": 17933724535680.0, "grad_norm": 1.5610756119742768, "language_loss": 0.72975385, "learning_rate": 3.1764608494232037e-07, "loss": 0.80650067, "num_input_tokens_seen": 295447765, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.08691406, "step": 13698, "time_per_iteration": 2.5242204666137695 }, { "auxiliary_loss_clip": 0.0642076, "auxiliary_loss_mlp": 0.01270362, "balance_loss_clip": 0.06280477, "balance_loss_mlp": 0.01260384, "epoch": 0.823628438298512, "flos": 18922007116800.0, "grad_norm": 1.8720753003651502, "language_loss": 0.72578156, "learning_rate": 3.174355115608305e-07, "loss": 0.80269277, "num_input_tokens_seen": 295464810, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09973145, "step": 13699, "time_per_iteration": 2.5249385833740234 }, { "auxiliary_loss_clip": 0.06411584, "auxiliary_loss_mlp": 0.01264972, "balance_loss_clip": 0.06273839, "balance_loss_mlp": 0.01255268, "epoch": 0.8236885615511799, "flos": 18702221057280.0, "grad_norm": 1.96339357110946, "language_loss": 0.82257611, "learning_rate": 3.1722500198235526e-07, "loss": 0.89934164, "num_input_tokens_seen": 295482605, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09698486, "step": 13700, "time_per_iteration": 2.5023438930511475 }, { "auxiliary_loss_clip": 0.06417637, "auxiliary_loss_mlp": 0.01262957, "balance_loss_clip": 0.06274825, "balance_loss_mlp": 0.01253731, "epoch": 0.8237486848038479, "flos": 23701606588800.0, "grad_norm": 1.693087233690757, "language_loss": 0.73278511, "learning_rate": 3.170145562148763e-07, "loss": 0.80959105, "num_input_tokens_seen": 295503780, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09234619, "step": 13701, "time_per_iteration": 2.548555612564087 }, { "auxiliary_loss_clip": 0.06416296, "auxiliary_loss_mlp": 0.01264814, "balance_loss_clip": 0.06274818, "balance_loss_mlp": 0.01254246, "epoch": 0.8238088080565159, "flos": 23448138387840.0, "grad_norm": 1.9370208701419707, "language_loss": 0.69716156, "learning_rate": 3.1680417426637384e-07, "loss": 0.77397263, "num_input_tokens_seen": 295522035, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.10565186, "step": 13702, "time_per_iteration": 3.940242290496826 }, { "auxiliary_loss_clip": 0.06420182, "auxiliary_loss_mlp": 0.01263691, "balance_loss_clip": 0.06281101, "balance_loss_mlp": 0.01253528, "epoch": 0.8238689313091838, "flos": 22753001715840.0, "grad_norm": 1.8024583807069927, "language_loss": 0.74907219, "learning_rate": 3.1659385614482603e-07, "loss": 0.82591081, "num_input_tokens_seen": 295541190, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.10162354, "step": 13703, "time_per_iteration": 2.542210340499878 }, { "auxiliary_loss_clip": 0.06422855, "auxiliary_loss_mlp": 0.01266937, "balance_loss_clip": 0.06275816, "balance_loss_mlp": 0.01256286, "epoch": 0.8239290545618518, "flos": 25637236231680.0, "grad_norm": 1.764629874918471, "language_loss": 0.70282197, "learning_rate": 3.1638360185820755e-07, "loss": 0.77971989, "num_input_tokens_seen": 295558860, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.10650635, "step": 13704, "time_per_iteration": 2.5836780071258545 }, { "auxiliary_loss_clip": 0.0641629, "auxiliary_loss_mlp": 0.01265468, "balance_loss_clip": 0.06277091, "balance_loss_mlp": 0.01256391, "epoch": 0.8239891778145197, "flos": 26032854084480.0, "grad_norm": 1.8488607921779807, "language_loss": 0.6393342, "learning_rate": 3.161734114144916e-07, "loss": 0.71615171, "num_input_tokens_seen": 295578155, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09075928, "step": 13705, "time_per_iteration": 2.590015411376953 }, { "auxiliary_loss_clip": 0.06419446, "auxiliary_loss_mlp": 0.01265714, "balance_loss_clip": 0.06276588, "balance_loss_mlp": 0.01255956, "epoch": 0.8240493010671878, "flos": 21839378722560.0, "grad_norm": 1.968143604518926, "language_loss": 0.69633007, "learning_rate": 3.1596328482164915e-07, "loss": 0.77318168, "num_input_tokens_seen": 295599170, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.09747314, "step": 13706, "time_per_iteration": 2.5612659454345703 }, { "auxiliary_loss_clip": 0.0642203, "auxiliary_loss_mlp": 0.01265896, "balance_loss_clip": 0.06280705, "balance_loss_mlp": 0.01255829, "epoch": 0.8241094243198557, "flos": 18557891199360.0, "grad_norm": 1.82778162321119, "language_loss": 0.70247251, "learning_rate": 3.157532220876475e-07, "loss": 0.77935171, "num_input_tokens_seen": 295617465, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.10064697, "step": 13707, "time_per_iteration": 2.534383773803711 }, { "auxiliary_loss_clip": 0.06418969, "auxiliary_loss_mlp": 0.01263263, "balance_loss_clip": 0.06276803, "balance_loss_mlp": 0.01253708, "epoch": 0.8241695475725237, "flos": 25454192987520.0, "grad_norm": 1.9068747885742396, "language_loss": 0.79089618, "learning_rate": 3.1554322322045226e-07, "loss": 0.86771846, "num_input_tokens_seen": 295634960, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09558105, "step": 13708, "time_per_iteration": 2.5842881202697754 }, { "auxiliary_loss_clip": 0.06412528, "auxiliary_loss_mlp": 0.01264232, "balance_loss_clip": 0.06271493, "balance_loss_mlp": 0.01253909, "epoch": 0.8242296708251916, "flos": 18995702382720.0, "grad_norm": 2.2576511897197427, "language_loss": 0.68837452, "learning_rate": 3.1533328822802664e-07, "loss": 0.76514208, "num_input_tokens_seen": 295652725, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.10308838, "step": 13709, "time_per_iteration": 2.5236215591430664 }, { "auxiliary_loss_clip": 0.06420268, "auxiliary_loss_mlp": 0.01267126, "balance_loss_clip": 0.06278523, "balance_loss_mlp": 0.01256981, "epoch": 0.8242897940778596, "flos": 22607372119680.0, "grad_norm": 1.7455191349402739, "language_loss": 0.82490569, "learning_rate": 3.151234171183319e-07, "loss": 0.90177965, "num_input_tokens_seen": 295671195, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.10144043, "step": 13710, "time_per_iteration": 2.545246124267578 }, { "auxiliary_loss_clip": 0.06418354, "auxiliary_loss_mlp": 0.01265511, "balance_loss_clip": 0.06278874, "balance_loss_mlp": 0.01255247, "epoch": 0.8243499173305275, "flos": 21474172702080.0, "grad_norm": 1.9012401726712418, "language_loss": 0.78642428, "learning_rate": 3.149136098993257e-07, "loss": 0.86326289, "num_input_tokens_seen": 295689130, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.1026001, "step": 13711, "time_per_iteration": 2.617445468902588 }, { "auxiliary_loss_clip": 0.06416032, "auxiliary_loss_mlp": 0.01265068, "balance_loss_clip": 0.0627632, "balance_loss_mlp": 0.0125509, "epoch": 0.8244100405831956, "flos": 20016409294080.0, "grad_norm": 1.8237329631751444, "language_loss": 0.66371888, "learning_rate": 3.1470386657896473e-07, "loss": 0.74052989, "num_input_tokens_seen": 295706385, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09979248, "step": 13712, "time_per_iteration": 2.524545431137085 }, { "auxiliary_loss_clip": 0.06416452, "auxiliary_loss_mlp": 0.01267055, "balance_loss_clip": 0.06275673, "balance_loss_mlp": 0.0125756, "epoch": 0.8244701638358635, "flos": 26437612032000.0, "grad_norm": 1.5810470293098655, "language_loss": 0.74659985, "learning_rate": 3.14494187165202e-07, "loss": 0.82343495, "num_input_tokens_seen": 295727925, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09490967, "step": 13713, "time_per_iteration": 2.562734603881836 }, { "auxiliary_loss_clip": 0.06418192, "auxiliary_loss_mlp": 0.01267901, "balance_loss_clip": 0.06276894, "balance_loss_mlp": 0.01258424, "epoch": 0.8245302870885315, "flos": 17645861433600.0, "grad_norm": 1.80912418745845, "language_loss": 0.81750917, "learning_rate": 3.1428457166598833e-07, "loss": 0.89437008, "num_input_tokens_seen": 295744420, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09484863, "step": 13714, "time_per_iteration": 2.506723403930664 }, { "auxiliary_loss_clip": 0.06419159, "auxiliary_loss_mlp": 0.01265601, "balance_loss_clip": 0.06280918, "balance_loss_mlp": 0.01255504, "epoch": 0.8245904103411995, "flos": 26216023109760.0, "grad_norm": 1.7813079721059162, "language_loss": 0.66312158, "learning_rate": 3.1407502008927235e-07, "loss": 0.73996919, "num_input_tokens_seen": 295765105, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.10101318, "step": 13715, "time_per_iteration": 2.603837728500366 }, { "auxiliary_loss_clip": 0.06418934, "auxiliary_loss_mlp": 0.01262605, "balance_loss_clip": 0.06275377, "balance_loss_mlp": 0.01253128, "epoch": 0.8246505335938674, "flos": 24211645591680.0, "grad_norm": 1.764073044209542, "language_loss": 0.75323004, "learning_rate": 3.1386553244300086e-07, "loss": 0.83004546, "num_input_tokens_seen": 295784200, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.09472656, "step": 13716, "time_per_iteration": 2.539452314376831 }, { "auxiliary_loss_clip": 0.06321331, "auxiliary_loss_mlp": 0.01250873, "balance_loss_clip": 0.06265014, "balance_loss_mlp": 0.01249869, "epoch": 0.8247106568465354, "flos": 67114764190080.0, "grad_norm": 0.747918580155696, "language_loss": 0.58952367, "learning_rate": 3.136561087351175e-07, "loss": 0.66524571, "num_input_tokens_seen": 295846555, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01004028, "step": 13717, "time_per_iteration": 3.268791913986206 }, { "auxiliary_loss_clip": 0.06414348, "auxiliary_loss_mlp": 0.01266209, "balance_loss_clip": 0.06275256, "balance_loss_mlp": 0.01257304, "epoch": 0.8247707800992033, "flos": 12573199906560.0, "grad_norm": 2.98212521881067, "language_loss": 0.80049264, "learning_rate": 3.1344674897356373e-07, "loss": 0.87729824, "num_input_tokens_seen": 295863425, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.08905029, "step": 13718, "time_per_iteration": 2.5059406757354736 }, { "auxiliary_loss_clip": 0.06414987, "auxiliary_loss_mlp": 0.01264927, "balance_loss_clip": 0.06278126, "balance_loss_mlp": 0.01255617, "epoch": 0.8248309033518714, "flos": 15928927747200.0, "grad_norm": 1.6052529525714736, "language_loss": 0.68869859, "learning_rate": 3.132374531662778e-07, "loss": 0.76549768, "num_input_tokens_seen": 295880925, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09320068, "step": 13719, "time_per_iteration": 2.512380838394165 }, { "auxiliary_loss_clip": 0.0641766, "auxiliary_loss_mlp": 0.01267433, "balance_loss_clip": 0.06276602, "balance_loss_mlp": 0.01257467, "epoch": 0.8248910266045393, "flos": 17570195596800.0, "grad_norm": 2.2891825013999654, "language_loss": 0.70120287, "learning_rate": 3.13028221321197e-07, "loss": 0.77805376, "num_input_tokens_seen": 295898205, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09960938, "step": 13720, "time_per_iteration": 3.9145712852478027 }, { "auxiliary_loss_clip": 0.06417923, "auxiliary_loss_mlp": 0.01266517, "balance_loss_clip": 0.0627394, "balance_loss_mlp": 0.01257225, "epoch": 0.8249511498572073, "flos": 28626919511040.0, "grad_norm": 1.6196952463986463, "language_loss": 0.76116127, "learning_rate": 3.1281905344625467e-07, "loss": 0.83800566, "num_input_tokens_seen": 295918130, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.09301758, "step": 13721, "time_per_iteration": 2.5896317958831787 }, { "auxiliary_loss_clip": 0.0641343, "auxiliary_loss_mlp": 0.01265024, "balance_loss_clip": 0.06275065, "balance_loss_mlp": 0.0125606, "epoch": 0.8250112731098752, "flos": 25563624819840.0, "grad_norm": 7.1309588465628755, "language_loss": 0.77528119, "learning_rate": 3.1260994954938305e-07, "loss": 0.85206574, "num_input_tokens_seen": 295937760, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.08959961, "step": 13722, "time_per_iteration": 2.5757534503936768 }, { "auxiliary_loss_clip": 0.06410801, "auxiliary_loss_mlp": 0.01265202, "balance_loss_clip": 0.06273761, "balance_loss_mlp": 0.01256321, "epoch": 0.8250713963625432, "flos": 27753645058560.0, "grad_norm": 1.6352813211802224, "language_loss": 0.63166916, "learning_rate": 3.1240090963851205e-07, "loss": 0.70842922, "num_input_tokens_seen": 295957585, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.08880615, "step": 13723, "time_per_iteration": 2.5810978412628174 }, { "auxiliary_loss_clip": 0.06415919, "auxiliary_loss_mlp": 0.01266108, "balance_loss_clip": 0.06274918, "balance_loss_mlp": 0.01256536, "epoch": 0.8251315196152111, "flos": 21616070791680.0, "grad_norm": 1.422881093519289, "language_loss": 0.74196899, "learning_rate": 3.121919337215666e-07, "loss": 0.81878924, "num_input_tokens_seen": 295977135, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09576416, "step": 13724, "time_per_iteration": 2.5458335876464844 }, { "auxiliary_loss_clip": 0.06415274, "auxiliary_loss_mlp": 0.01266621, "balance_loss_clip": 0.0627448, "balance_loss_mlp": 0.01256405, "epoch": 0.8251916428678792, "flos": 28585983991680.0, "grad_norm": 2.6846202859519197, "language_loss": 0.64819229, "learning_rate": 3.1198302180647253e-07, "loss": 0.72501123, "num_input_tokens_seen": 295996265, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.10217285, "step": 13725, "time_per_iteration": 4.092242002487183 }, { "auxiliary_loss_clip": 0.06414399, "auxiliary_loss_mlp": 0.01263891, "balance_loss_clip": 0.06275962, "balance_loss_mlp": 0.01254468, "epoch": 0.8252517661205471, "flos": 23081758410240.0, "grad_norm": 1.4914413730265899, "language_loss": 0.82087672, "learning_rate": 3.1177417390115125e-07, "loss": 0.89765954, "num_input_tokens_seen": 296014745, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09423828, "step": 13726, "time_per_iteration": 2.526902437210083 }, { "auxiliary_loss_clip": 0.06407741, "auxiliary_loss_mlp": 0.01265521, "balance_loss_clip": 0.06273217, "balance_loss_mlp": 0.01256491, "epoch": 0.8253118893732151, "flos": 31767724828800.0, "grad_norm": 2.39298194983824, "language_loss": 0.70679319, "learning_rate": 3.1156539001352286e-07, "loss": 0.78352576, "num_input_tokens_seen": 296036960, "router_z_loss_clip": 1.34472656, "router_z_loss_mlp": 0.090271, "step": 13727, "time_per_iteration": 2.60807728767395 }, { "auxiliary_loss_clip": 0.06418175, "auxiliary_loss_mlp": 0.01268156, "balance_loss_clip": 0.06276052, "balance_loss_mlp": 0.01257445, "epoch": 0.8253720126258831, "flos": 18302326646400.0, "grad_norm": 2.380500871174016, "language_loss": 0.63097405, "learning_rate": 3.113566701515036e-07, "loss": 0.70783734, "num_input_tokens_seen": 296056540, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.1071167, "step": 13728, "time_per_iteration": 2.530285358428955 }, { "auxiliary_loss_clip": 0.06422041, "auxiliary_loss_mlp": 0.01267113, "balance_loss_clip": 0.06277003, "balance_loss_mlp": 0.01256545, "epoch": 0.825432135878551, "flos": 26804620915200.0, "grad_norm": 1.6448006923651495, "language_loss": 0.71689951, "learning_rate": 3.111480143230092e-07, "loss": 0.79379106, "num_input_tokens_seen": 296077950, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.10571289, "step": 13729, "time_per_iteration": 2.578049421310425 }, { "auxiliary_loss_clip": 0.06312086, "auxiliary_loss_mlp": 0.01250457, "balance_loss_clip": 0.06256042, "balance_loss_mlp": 0.01249493, "epoch": 0.825492259131219, "flos": 54234498597120.0, "grad_norm": 0.8530531034441776, "language_loss": 0.62499928, "learning_rate": 3.109394225359514e-07, "loss": 0.7006247, "num_input_tokens_seen": 296127060, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.0096283, "step": 13730, "time_per_iteration": 2.963263988494873 }, { "auxiliary_loss_clip": 0.06416472, "auxiliary_loss_mlp": 0.01266919, "balance_loss_clip": 0.06276862, "balance_loss_mlp": 0.01257197, "epoch": 0.825552382383887, "flos": 43765087478400.0, "grad_norm": 1.9694317886537065, "language_loss": 0.63975382, "learning_rate": 3.1073089479823945e-07, "loss": 0.71658772, "num_input_tokens_seen": 296147775, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.097229, "step": 13731, "time_per_iteration": 2.714858293533325 }, { "auxiliary_loss_clip": 0.06422817, "auxiliary_loss_mlp": 0.01266213, "balance_loss_clip": 0.06275784, "balance_loss_mlp": 0.01255883, "epoch": 0.825612505636555, "flos": 12607469026560.0, "grad_norm": 3.695338885291431, "language_loss": 0.70401657, "learning_rate": 3.105224311177812e-07, "loss": 0.78090686, "num_input_tokens_seen": 296163560, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.10321045, "step": 13732, "time_per_iteration": 2.483642578125 }, { "auxiliary_loss_clip": 0.06422526, "auxiliary_loss_mlp": 0.01266963, "balance_loss_clip": 0.06277065, "balance_loss_mlp": 0.01256264, "epoch": 0.8256726288892229, "flos": 17600146231680.0, "grad_norm": 2.368532421200561, "language_loss": 0.71371895, "learning_rate": 3.103140315024817e-07, "loss": 0.79061389, "num_input_tokens_seen": 296178730, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10699463, "step": 13733, "time_per_iteration": 2.562931537628174 }, { "auxiliary_loss_clip": 0.06411442, "auxiliary_loss_mlp": 0.01264814, "balance_loss_clip": 0.06273865, "balance_loss_mlp": 0.01255748, "epoch": 0.8257327521418909, "flos": 23812631648640.0, "grad_norm": 1.5446143648121586, "language_loss": 0.82465196, "learning_rate": 3.1010569596024437e-07, "loss": 0.90141439, "num_input_tokens_seen": 296200175, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09069824, "step": 13734, "time_per_iteration": 2.5992045402526855 }, { "auxiliary_loss_clip": 0.06414669, "auxiliary_loss_mlp": 0.01264073, "balance_loss_clip": 0.06277125, "balance_loss_mlp": 0.01254507, "epoch": 0.8257928753945588, "flos": 19287129283200.0, "grad_norm": 2.115969184533196, "language_loss": 0.83290726, "learning_rate": 3.098974244989676e-07, "loss": 0.90969467, "num_input_tokens_seen": 296219305, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09570312, "step": 13735, "time_per_iteration": 2.5135138034820557 }, { "auxiliary_loss_clip": 0.06416985, "auxiliary_loss_mlp": 0.01266281, "balance_loss_clip": 0.06276964, "balance_loss_mlp": 0.01257031, "epoch": 0.8258529986472268, "flos": 18484782912000.0, "grad_norm": 2.1595436484712556, "language_loss": 0.71343291, "learning_rate": 3.096892171265497e-07, "loss": 0.79026556, "num_input_tokens_seen": 296236945, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09259033, "step": 13736, "time_per_iteration": 3.949894428253174 }, { "auxiliary_loss_clip": 0.06308544, "auxiliary_loss_mlp": 0.01251508, "balance_loss_clip": 0.06252466, "balance_loss_mlp": 0.01250443, "epoch": 0.8259131218998947, "flos": 62154903386880.0, "grad_norm": 0.8575248000251515, "language_loss": 0.67867362, "learning_rate": 3.0948107385088665e-07, "loss": 0.75427413, "num_input_tokens_seen": 296294685, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.01065826, "step": 13737, "time_per_iteration": 3.1722519397735596 }, { "auxiliary_loss_clip": 0.06416585, "auxiliary_loss_mlp": 0.01264546, "balance_loss_clip": 0.06275798, "balance_loss_mlp": 0.01254884, "epoch": 0.8259732451525628, "flos": 22164781253760.0, "grad_norm": 1.7258373856122844, "language_loss": 0.6974172, "learning_rate": 3.0927299467987e-07, "loss": 0.77422857, "num_input_tokens_seen": 296314790, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09661865, "step": 13738, "time_per_iteration": 2.5947227478027344 }, { "auxiliary_loss_clip": 0.06417262, "auxiliary_loss_mlp": 0.01268417, "balance_loss_clip": 0.06276365, "balance_loss_mlp": 0.01257563, "epoch": 0.8260333684052307, "flos": 38370587218560.0, "grad_norm": 2.095594346160027, "language_loss": 0.63659108, "learning_rate": 3.090649796213911e-07, "loss": 0.71344793, "num_input_tokens_seen": 296335355, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.10852051, "step": 13739, "time_per_iteration": 2.711702346801758 }, { "auxiliary_loss_clip": 0.06313919, "auxiliary_loss_mlp": 0.01253051, "balance_loss_clip": 0.06257796, "balance_loss_mlp": 0.01251956, "epoch": 0.8260934916578987, "flos": 62204433949440.0, "grad_norm": 0.8222108075097934, "language_loss": 0.59171176, "learning_rate": 3.0885702868333853e-07, "loss": 0.66738153, "num_input_tokens_seen": 296399885, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 0.0109787, "step": 13740, "time_per_iteration": 3.213183641433716 }, { "auxiliary_loss_clip": 0.06426826, "auxiliary_loss_mlp": 0.01266738, "balance_loss_clip": 0.06279934, "balance_loss_mlp": 0.01256152, "epoch": 0.8261536149105667, "flos": 22572138677760.0, "grad_norm": 10.823694632477276, "language_loss": 0.75699008, "learning_rate": 3.086491418735959e-07, "loss": 0.83392572, "num_input_tokens_seen": 296417660, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.10577393, "step": 13741, "time_per_iteration": 2.543879985809326 }, { "auxiliary_loss_clip": 0.06415395, "auxiliary_loss_mlp": 0.01264604, "balance_loss_clip": 0.06274617, "balance_loss_mlp": 0.01254633, "epoch": 0.8262137381632346, "flos": 32533705728000.0, "grad_norm": 2.2009347876684062, "language_loss": 0.63121408, "learning_rate": 3.0844131920004726e-07, "loss": 0.70801413, "num_input_tokens_seen": 296438255, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09979248, "step": 13742, "time_per_iteration": 3.9840259552001953 }, { "auxiliary_loss_clip": 0.06426267, "auxiliary_loss_mlp": 0.01267879, "balance_loss_clip": 0.06278388, "balance_loss_mlp": 0.01255505, "epoch": 0.8262738614159026, "flos": 14141569104000.0, "grad_norm": 2.3447866451241914, "language_loss": 0.66446871, "learning_rate": 3.0823356067057327e-07, "loss": 0.74141014, "num_input_tokens_seen": 296454485, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.12371826, "step": 13743, "time_per_iteration": 2.5122456550598145 }, { "auxiliary_loss_clip": 0.06420101, "auxiliary_loss_mlp": 0.01267312, "balance_loss_clip": 0.06279074, "balance_loss_mlp": 0.0125756, "epoch": 0.8263339846685706, "flos": 19830934281600.0, "grad_norm": 1.7877899270864634, "language_loss": 0.67125332, "learning_rate": 3.0802586629305283e-07, "loss": 0.74812746, "num_input_tokens_seen": 296473740, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09753418, "step": 13744, "time_per_iteration": 2.5264787673950195 }, { "auxiliary_loss_clip": 0.06414808, "auxiliary_loss_mlp": 0.012676, "balance_loss_clip": 0.06274994, "balance_loss_mlp": 0.01258409, "epoch": 0.8263941079212386, "flos": 22752330883200.0, "grad_norm": 1.8138301083109203, "language_loss": 0.7566371, "learning_rate": 3.078182360753612e-07, "loss": 0.83346117, "num_input_tokens_seen": 296493355, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09191895, "step": 13745, "time_per_iteration": 2.5599613189697266 }, { "auxiliary_loss_clip": 0.06411891, "auxiliary_loss_mlp": 0.01264407, "balance_loss_clip": 0.06276416, "balance_loss_mlp": 0.01255306, "epoch": 0.8264542311739065, "flos": 20126847375360.0, "grad_norm": 1.8576515716954236, "language_loss": 0.79003125, "learning_rate": 3.076106700253709e-07, "loss": 0.86679423, "num_input_tokens_seen": 296510520, "router_z_loss_clip": 1.35546875, "router_z_loss_mlp": 0.09100342, "step": 13746, "time_per_iteration": 2.508892774581909 }, { "auxiliary_loss_clip": 0.06427507, "auxiliary_loss_mlp": 0.01265663, "balance_loss_clip": 0.06280938, "balance_loss_mlp": 0.0125531, "epoch": 0.8265143544265745, "flos": 16842844229760.0, "grad_norm": 1.9674388804534817, "language_loss": 0.69477212, "learning_rate": 3.0740316815095415e-07, "loss": 0.77170384, "num_input_tokens_seen": 296528265, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.10351562, "step": 13747, "time_per_iteration": 2.500126838684082 }, { "auxiliary_loss_clip": 0.06418775, "auxiliary_loss_mlp": 0.01265835, "balance_loss_clip": 0.06276666, "balance_loss_mlp": 0.01255482, "epoch": 0.8265744776792424, "flos": 22025231078400.0, "grad_norm": 1.878919266125991, "language_loss": 0.7544502, "learning_rate": 3.0719573045997835e-07, "loss": 0.83129632, "num_input_tokens_seen": 296547810, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.10357666, "step": 13748, "time_per_iteration": 2.52771258354187 }, { "auxiliary_loss_clip": 0.06411287, "auxiliary_loss_mlp": 0.01264972, "balance_loss_clip": 0.06275504, "balance_loss_mlp": 0.01255793, "epoch": 0.8266346009319104, "flos": 19250889592320.0, "grad_norm": 2.9426789252949557, "language_loss": 0.63981009, "learning_rate": 3.069883569603102e-07, "loss": 0.7165727, "num_input_tokens_seen": 296565940, "router_z_loss_clip": 1.35546875, "router_z_loss_mlp": 0.09179688, "step": 13749, "time_per_iteration": 2.520566940307617 }, { "auxiliary_loss_clip": 0.06413254, "auxiliary_loss_mlp": 0.0126567, "balance_loss_clip": 0.06273124, "balance_loss_mlp": 0.01256043, "epoch": 0.8266947241845783, "flos": 24173016059520.0, "grad_norm": 1.586948864715548, "language_loss": 0.73847789, "learning_rate": 3.067810476598132e-07, "loss": 0.81526709, "num_input_tokens_seen": 296585090, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09619141, "step": 13750, "time_per_iteration": 2.5589003562927246 }, { "auxiliary_loss_clip": 0.0642181, "auxiliary_loss_mlp": 0.01266926, "balance_loss_clip": 0.06279553, "balance_loss_mlp": 0.01256817, "epoch": 0.8267548474372464, "flos": 21112195063680.0, "grad_norm": 2.000700982398774, "language_loss": 0.66233718, "learning_rate": 3.065738025663496e-07, "loss": 0.73922455, "num_input_tokens_seen": 296604950, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.10113525, "step": 13751, "time_per_iteration": 2.5593836307525635 }, { "auxiliary_loss_clip": 0.0641344, "auxiliary_loss_mlp": 0.01262102, "balance_loss_clip": 0.06275535, "balance_loss_mlp": 0.01253346, "epoch": 0.8268149706899143, "flos": 39977711729280.0, "grad_norm": 1.5152490318156167, "language_loss": 0.60402882, "learning_rate": 3.0636662168777607e-07, "loss": 0.68078423, "num_input_tokens_seen": 296627780, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.08758545, "step": 13752, "time_per_iteration": 2.69034481048584 }, { "auxiliary_loss_clip": 0.06311554, "auxiliary_loss_mlp": 0.01251987, "balance_loss_clip": 0.06255701, "balance_loss_mlp": 0.01250949, "epoch": 0.8268750939425823, "flos": 65799290943360.0, "grad_norm": 0.9278700339739318, "language_loss": 0.57337385, "learning_rate": 3.0615950503194986e-07, "loss": 0.64900923, "num_input_tokens_seen": 296683850, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.01038361, "step": 13753, "time_per_iteration": 3.1959292888641357 }, { "auxiliary_loss_clip": 0.06317682, "auxiliary_loss_mlp": 0.01255428, "balance_loss_clip": 0.06261773, "balance_loss_mlp": 0.01254362, "epoch": 0.8269352171952503, "flos": 52997108227200.0, "grad_norm": 0.7000414764919304, "language_loss": 0.5482372, "learning_rate": 3.0595245260672563e-07, "loss": 0.6239683, "num_input_tokens_seen": 296741420, "router_z_loss_clip": 0.56201172, "router_z_loss_mlp": 0.01067352, "step": 13754, "time_per_iteration": 3.2491140365600586 }, { "auxiliary_loss_clip": 0.06412349, "auxiliary_loss_mlp": 0.01263629, "balance_loss_clip": 0.06273713, "balance_loss_mlp": 0.01254509, "epoch": 0.8269953404479182, "flos": 23082848513280.0, "grad_norm": 1.7527967592329534, "language_loss": 0.69571805, "learning_rate": 3.0574546441995354e-07, "loss": 0.77247781, "num_input_tokens_seen": 296759620, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09112549, "step": 13755, "time_per_iteration": 2.5445735454559326 }, { "auxiliary_loss_clip": 0.06413017, "auxiliary_loss_mlp": 0.01263447, "balance_loss_clip": 0.06275418, "balance_loss_mlp": 0.01255281, "epoch": 0.8270554637005862, "flos": 14215222442880.0, "grad_norm": 2.2798454294304342, "language_loss": 0.70188451, "learning_rate": 3.0553854047948324e-07, "loss": 0.77864909, "num_input_tokens_seen": 296777275, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.08166504, "step": 13756, "time_per_iteration": 2.5429606437683105 }, { "auxiliary_loss_clip": 0.06416149, "auxiliary_loss_mlp": 0.01265572, "balance_loss_clip": 0.06276052, "balance_loss_mlp": 0.01255869, "epoch": 0.8271155869532542, "flos": 21768450641280.0, "grad_norm": 1.662151144613569, "language_loss": 0.72382164, "learning_rate": 3.053316807931623e-07, "loss": 0.80063891, "num_input_tokens_seen": 296796655, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09698486, "step": 13757, "time_per_iteration": 2.5263121128082275 }, { "auxiliary_loss_clip": 0.06422366, "auxiliary_loss_mlp": 0.01267221, "balance_loss_clip": 0.06278258, "balance_loss_mlp": 0.01256063, "epoch": 0.8271757102059222, "flos": 15125575127040.0, "grad_norm": 3.0456802214390235, "language_loss": 0.69179738, "learning_rate": 3.0512488536883283e-07, "loss": 0.76869321, "num_input_tokens_seen": 296813705, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.11157227, "step": 13758, "time_per_iteration": 2.6418862342834473 }, { "auxiliary_loss_clip": 0.06410502, "auxiliary_loss_mlp": 0.01268264, "balance_loss_clip": 0.06274357, "balance_loss_mlp": 0.01259257, "epoch": 0.8272358334585901, "flos": 24140549802240.0, "grad_norm": 1.5374906002654807, "language_loss": 0.69437456, "learning_rate": 3.0491815421433775e-07, "loss": 0.77116227, "num_input_tokens_seen": 296833985, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.09002686, "step": 13759, "time_per_iteration": 2.6014487743377686 }, { "auxiliary_loss_clip": 0.06413739, "auxiliary_loss_mlp": 0.0126265, "balance_loss_clip": 0.06275395, "balance_loss_mlp": 0.01253018, "epoch": 0.8272959567112581, "flos": 18996918266880.0, "grad_norm": 1.6029341696560708, "language_loss": 0.71149874, "learning_rate": 3.047114873375161e-07, "loss": 0.78826261, "num_input_tokens_seen": 296850150, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09631348, "step": 13760, "time_per_iteration": 3.965933084487915 }, { "auxiliary_loss_clip": 0.06411694, "auxiliary_loss_mlp": 0.0126586, "balance_loss_clip": 0.06275778, "balance_loss_mlp": 0.01256657, "epoch": 0.827356079963926, "flos": 20637934554240.0, "grad_norm": 1.6641057984878842, "language_loss": 0.77818221, "learning_rate": 3.0450488474620505e-07, "loss": 0.85495776, "num_input_tokens_seen": 296869585, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.09204102, "step": 13761, "time_per_iteration": 2.5373375415802 }, { "auxiliary_loss_clip": 0.06411894, "auxiliary_loss_mlp": 0.01267879, "balance_loss_clip": 0.06275113, "balance_loss_mlp": 0.01258694, "epoch": 0.827416203216594, "flos": 22422777575040.0, "grad_norm": 1.6335793898840791, "language_loss": 0.70174372, "learning_rate": 3.042983464482387e-07, "loss": 0.77854145, "num_input_tokens_seen": 296887710, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.09191895, "step": 13762, "time_per_iteration": 2.5191023349761963 }, { "auxiliary_loss_clip": 0.06411321, "auxiliary_loss_mlp": 0.01264293, "balance_loss_clip": 0.06273273, "balance_loss_mlp": 0.01255287, "epoch": 0.827476326469262, "flos": 19032235562880.0, "grad_norm": 1.8833115365948334, "language_loss": 0.70527649, "learning_rate": 3.0409187245144853e-07, "loss": 0.78203261, "num_input_tokens_seen": 296906265, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09002686, "step": 13763, "time_per_iteration": 2.503591775894165 }, { "auxiliary_loss_clip": 0.06314402, "auxiliary_loss_mlp": 0.01253448, "balance_loss_clip": 0.0625876, "balance_loss_mlp": 0.01252321, "epoch": 0.82753644972193, "flos": 68520942610560.0, "grad_norm": 0.8535053814590937, "language_loss": 0.65113384, "learning_rate": 3.038854627636651e-07, "loss": 0.72681236, "num_input_tokens_seen": 296971290, "router_z_loss_clip": 0.55664062, "router_z_loss_mlp": 0.01129913, "step": 13764, "time_per_iteration": 4.732106685638428 }, { "auxiliary_loss_clip": 0.06412229, "auxiliary_loss_mlp": 0.01265698, "balance_loss_clip": 0.06273654, "balance_loss_mlp": 0.01255559, "epoch": 0.8275965729745979, "flos": 18411255354240.0, "grad_norm": 5.511382024996849, "language_loss": 0.78206575, "learning_rate": 3.0367911739271423e-07, "loss": 0.85884506, "num_input_tokens_seen": 296989060, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.10131836, "step": 13765, "time_per_iteration": 2.5991666316986084 }, { "auxiliary_loss_clip": 0.06421856, "auxiliary_loss_mlp": 0.01265512, "balance_loss_clip": 0.06278545, "balance_loss_mlp": 0.01255474, "epoch": 0.8276566962272659, "flos": 28519625957760.0, "grad_norm": 1.7717536069024287, "language_loss": 0.62716538, "learning_rate": 3.034728363464214e-07, "loss": 0.7040391, "num_input_tokens_seen": 297011300, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10040283, "step": 13766, "time_per_iteration": 2.6111409664154053 }, { "auxiliary_loss_clip": 0.06418578, "auxiliary_loss_mlp": 0.01264046, "balance_loss_clip": 0.06278357, "balance_loss_mlp": 0.01253764, "epoch": 0.8277168194799339, "flos": 20236488842880.0, "grad_norm": 1.5809626961095928, "language_loss": 0.82481194, "learning_rate": 3.03266619632609e-07, "loss": 0.90163815, "num_input_tokens_seen": 297030350, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.10272217, "step": 13767, "time_per_iteration": 2.539872884750366 }, { "auxiliary_loss_clip": 0.06419558, "auxiliary_loss_mlp": 0.01265718, "balance_loss_clip": 0.06277398, "balance_loss_mlp": 0.01256235, "epoch": 0.8277769427326018, "flos": 28484350588800.0, "grad_norm": 1.7825597066350714, "language_loss": 0.69045717, "learning_rate": 3.030604672590964e-07, "loss": 0.7673099, "num_input_tokens_seen": 297049710, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09484863, "step": 13768, "time_per_iteration": 2.607652425765991 }, { "auxiliary_loss_clip": 0.06408721, "auxiliary_loss_mlp": 0.01266051, "balance_loss_clip": 0.0627231, "balance_loss_mlp": 0.01257045, "epoch": 0.8278370659852698, "flos": 27204808815360.0, "grad_norm": 1.7322150776115657, "language_loss": 0.74494016, "learning_rate": 3.028543792337006e-07, "loss": 0.82168788, "num_input_tokens_seen": 297070510, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09008789, "step": 13769, "time_per_iteration": 2.58831787109375 }, { "auxiliary_loss_clip": 0.06415284, "auxiliary_loss_mlp": 0.01267795, "balance_loss_clip": 0.06274206, "balance_loss_mlp": 0.01258026, "epoch": 0.8278971892379378, "flos": 37825272846720.0, "grad_norm": 1.869475521414948, "language_loss": 0.74496281, "learning_rate": 3.0264835556423675e-07, "loss": 0.8217935, "num_input_tokens_seen": 297092585, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09777832, "step": 13770, "time_per_iteration": 2.663250207901001 }, { "auxiliary_loss_clip": 0.064173, "auxiliary_loss_mlp": 0.01264828, "balance_loss_clip": 0.06274951, "balance_loss_mlp": 0.01254779, "epoch": 0.8279573124906058, "flos": 22565933475840.0, "grad_norm": 1.6535033172303655, "language_loss": 0.75898165, "learning_rate": 3.0244239625851785e-07, "loss": 0.83580291, "num_input_tokens_seen": 297110055, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.1005249, "step": 13771, "time_per_iteration": 2.526038408279419 }, { "auxiliary_loss_clip": 0.06412185, "auxiliary_loss_mlp": 0.01266093, "balance_loss_clip": 0.06271792, "balance_loss_mlp": 0.01256943, "epoch": 0.8280174357432737, "flos": 36073441134720.0, "grad_norm": 1.4181177486637457, "language_loss": 0.72491241, "learning_rate": 3.0223650132435284e-07, "loss": 0.80169523, "num_input_tokens_seen": 297132170, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09155273, "step": 13772, "time_per_iteration": 2.6568076610565186 }, { "auxiliary_loss_clip": 0.06412959, "auxiliary_loss_mlp": 0.01264308, "balance_loss_clip": 0.06276375, "balance_loss_mlp": 0.01254444, "epoch": 0.8280775589959417, "flos": 22966834135680.0, "grad_norm": 2.239752859529487, "language_loss": 0.74593413, "learning_rate": 3.0203067076955035e-07, "loss": 0.82270682, "num_input_tokens_seen": 297149515, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.09875488, "step": 13773, "time_per_iteration": 2.540867805480957 }, { "auxiliary_loss_clip": 0.06412728, "auxiliary_loss_mlp": 0.01264137, "balance_loss_clip": 0.06276324, "balance_loss_mlp": 0.01254684, "epoch": 0.8281376822486096, "flos": 26069722680960.0, "grad_norm": 1.8415825587175345, "language_loss": 0.76123047, "learning_rate": 3.01824904601915e-07, "loss": 0.83799911, "num_input_tokens_seen": 297170320, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09460449, "step": 13774, "time_per_iteration": 2.5665533542633057 }, { "auxiliary_loss_clip": 0.06425832, "auxiliary_loss_mlp": 0.0126367, "balance_loss_clip": 0.06279992, "balance_loss_mlp": 0.01253471, "epoch": 0.8281978055012776, "flos": 20674048464000.0, "grad_norm": 1.800212662803587, "language_loss": 0.75038207, "learning_rate": 3.01619202829249e-07, "loss": 0.82727706, "num_input_tokens_seen": 297189935, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10198975, "step": 13775, "time_per_iteration": 3.972846031188965 }, { "auxiliary_loss_clip": 0.064227, "auxiliary_loss_mlp": 0.01266184, "balance_loss_clip": 0.06276851, "balance_loss_mlp": 0.01254775, "epoch": 0.8282579287539455, "flos": 29323062432000.0, "grad_norm": 2.4840110468067693, "language_loss": 0.73869878, "learning_rate": 3.01413565459353e-07, "loss": 0.81558758, "num_input_tokens_seen": 297210885, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.11407471, "step": 13776, "time_per_iteration": 2.6118009090423584 }, { "auxiliary_loss_clip": 0.06414748, "auxiliary_loss_mlp": 0.0126516, "balance_loss_clip": 0.0627272, "balance_loss_mlp": 0.01255403, "epoch": 0.8283180520066136, "flos": 15711699237120.0, "grad_norm": 2.3723788450989374, "language_loss": 0.7798245, "learning_rate": 3.0120799250002483e-07, "loss": 0.85662353, "num_input_tokens_seen": 297228500, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09753418, "step": 13777, "time_per_iteration": 2.524665594100952 }, { "auxiliary_loss_clip": 0.06413296, "auxiliary_loss_mlp": 0.01266717, "balance_loss_clip": 0.06277311, "balance_loss_mlp": 0.01257657, "epoch": 0.8283781752592815, "flos": 24798566315520.0, "grad_norm": 1.4948636114669818, "language_loss": 0.83070755, "learning_rate": 3.010024839590604e-07, "loss": 0.90750772, "num_input_tokens_seen": 297249470, "router_z_loss_clip": 1.36035156, "router_z_loss_mlp": 0.09051514, "step": 13778, "time_per_iteration": 2.560063123703003 }, { "auxiliary_loss_clip": 0.06406035, "auxiliary_loss_mlp": 0.01265729, "balance_loss_clip": 0.06271774, "balance_loss_mlp": 0.01256598, "epoch": 0.8284382985119495, "flos": 18987694318080.0, "grad_norm": 1.930139266320469, "language_loss": 0.74715745, "learning_rate": 3.0079703984425187e-07, "loss": 0.82387507, "num_input_tokens_seen": 297265970, "router_z_loss_clip": 1.34277344, "router_z_loss_mlp": 0.09136963, "step": 13779, "time_per_iteration": 2.5341684818267822 }, { "auxiliary_loss_clip": 0.06317662, "auxiliary_loss_mlp": 0.01254573, "balance_loss_clip": 0.06262025, "balance_loss_mlp": 0.01253436, "epoch": 0.8284984217646175, "flos": 61055832579840.0, "grad_norm": 0.7844916486215341, "language_loss": 0.56659102, "learning_rate": 3.0059166016338954e-07, "loss": 0.64231336, "num_input_tokens_seen": 297325525, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01138306, "step": 13780, "time_per_iteration": 3.2400519847869873 }, { "auxiliary_loss_clip": 0.06412326, "auxiliary_loss_mlp": 0.01264415, "balance_loss_clip": 0.06273703, "balance_loss_mlp": 0.01254836, "epoch": 0.8285585450172854, "flos": 19719993075840.0, "grad_norm": 1.662143294804392, "language_loss": 0.80272353, "learning_rate": 3.0038634492426205e-07, "loss": 0.87949097, "num_input_tokens_seen": 297345025, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09576416, "step": 13781, "time_per_iteration": 2.6466708183288574 }, { "auxiliary_loss_clip": 0.0641868, "auxiliary_loss_mlp": 0.01265265, "balance_loss_clip": 0.06278355, "balance_loss_mlp": 0.01254745, "epoch": 0.8286186682699535, "flos": 21695258499840.0, "grad_norm": 2.2535337865501046, "language_loss": 0.76080614, "learning_rate": 3.001810941346543e-07, "loss": 0.83764559, "num_input_tokens_seen": 297363570, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.10510254, "step": 13782, "time_per_iteration": 3.9368810653686523 }, { "auxiliary_loss_clip": 0.06415702, "auxiliary_loss_mlp": 0.01263046, "balance_loss_clip": 0.06273766, "balance_loss_mlp": 0.01253605, "epoch": 0.8286787915226214, "flos": 25782656192640.0, "grad_norm": 1.7306749064850668, "language_loss": 0.76379335, "learning_rate": 2.9997590780234983e-07, "loss": 0.84058082, "num_input_tokens_seen": 297385385, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09442139, "step": 13783, "time_per_iteration": 2.627617597579956 }, { "auxiliary_loss_clip": 0.0641509, "auxiliary_loss_mlp": 0.01266128, "balance_loss_clip": 0.06275037, "balance_loss_mlp": 0.01256741, "epoch": 0.8287389147752894, "flos": 21294777110400.0, "grad_norm": 1.6149610163847579, "language_loss": 0.7436561, "learning_rate": 2.997707859351304e-07, "loss": 0.82046825, "num_input_tokens_seen": 297403950, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09387207, "step": 13784, "time_per_iteration": 2.53644061088562 }, { "auxiliary_loss_clip": 0.06417403, "auxiliary_loss_mlp": 0.01269029, "balance_loss_clip": 0.0627305, "balance_loss_mlp": 0.01258211, "epoch": 0.8287990380279573, "flos": 33552903265920.0, "grad_norm": 1.5293769109028756, "language_loss": 0.69606215, "learning_rate": 2.99565728540772e-07, "loss": 0.77292645, "num_input_tokens_seen": 297424565, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10827637, "step": 13785, "time_per_iteration": 2.614213228225708 }, { "auxiliary_loss_clip": 0.06418446, "auxiliary_loss_mlp": 0.01265698, "balance_loss_clip": 0.06278577, "balance_loss_mlp": 0.01256232, "epoch": 0.8288591612806253, "flos": 22972997410560.0, "grad_norm": 1.426702650428751, "language_loss": 0.68728638, "learning_rate": 2.993607356270516e-07, "loss": 0.76412773, "num_input_tokens_seen": 297445180, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09466553, "step": 13786, "time_per_iteration": 2.617079496383667 }, { "auxiliary_loss_clip": 0.06423671, "auxiliary_loss_mlp": 0.01264304, "balance_loss_clip": 0.06277564, "balance_loss_mlp": 0.01254249, "epoch": 0.8289192845332932, "flos": 18595053285120.0, "grad_norm": 3.466046949358533, "language_loss": 0.76997817, "learning_rate": 2.991558072017426e-07, "loss": 0.84685785, "num_input_tokens_seen": 297463790, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10064697, "step": 13787, "time_per_iteration": 2.5075645446777344 }, { "auxiliary_loss_clip": 0.06412305, "auxiliary_loss_mlp": 0.01266056, "balance_loss_clip": 0.06275334, "balance_loss_mlp": 0.01256513, "epoch": 0.8289794077859612, "flos": 15455841194880.0, "grad_norm": 1.688973734679615, "language_loss": 0.80690658, "learning_rate": 2.989509432726163e-07, "loss": 0.88369012, "num_input_tokens_seen": 297480100, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09539795, "step": 13788, "time_per_iteration": 2.4841387271881104 }, { "auxiliary_loss_clip": 0.06414616, "auxiliary_loss_mlp": 0.01260684, "balance_loss_clip": 0.06275864, "balance_loss_mlp": 0.01251541, "epoch": 0.8290395310386292, "flos": 28885628592000.0, "grad_norm": 1.4578577790835767, "language_loss": 0.71534014, "learning_rate": 2.9874614384744014e-07, "loss": 0.7920931, "num_input_tokens_seen": 297499890, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09143066, "step": 13789, "time_per_iteration": 2.5897936820983887 }, { "auxiliary_loss_clip": 0.06415217, "auxiliary_loss_mlp": 0.01265056, "balance_loss_clip": 0.06273049, "balance_loss_mlp": 0.0125543, "epoch": 0.8290996542912972, "flos": 36585324927360.0, "grad_norm": 1.6351836965449242, "language_loss": 0.68150818, "learning_rate": 2.985414089339813e-07, "loss": 0.75831091, "num_input_tokens_seen": 297521440, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09631348, "step": 13790, "time_per_iteration": 2.644531726837158 }, { "auxiliary_loss_clip": 0.0641799, "auxiliary_loss_mlp": 0.01269176, "balance_loss_clip": 0.0627615, "balance_loss_mlp": 0.01258394, "epoch": 0.8291597775439651, "flos": 23629756112640.0, "grad_norm": 1.6304808922894076, "language_loss": 0.77553141, "learning_rate": 2.9833673854000265e-07, "loss": 0.85240304, "num_input_tokens_seen": 297539920, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.10778809, "step": 13791, "time_per_iteration": 2.5280988216400146 }, { "auxiliary_loss_clip": 0.06411911, "auxiliary_loss_mlp": 0.01267327, "balance_loss_clip": 0.06275932, "balance_loss_mlp": 0.01257748, "epoch": 0.8292199007966331, "flos": 21403873526400.0, "grad_norm": 1.4210813778340612, "language_loss": 0.70285201, "learning_rate": 2.981321326732651e-07, "loss": 0.77964437, "num_input_tokens_seen": 297560000, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.09588623, "step": 13792, "time_per_iteration": 2.5898795127868652 }, { "auxiliary_loss_clip": 0.0641698, "auxiliary_loss_mlp": 0.0126544, "balance_loss_clip": 0.06275217, "balance_loss_mlp": 0.01255182, "epoch": 0.829280024049301, "flos": 28775232437760.0, "grad_norm": 1.5427446850705138, "language_loss": 0.65003085, "learning_rate": 2.9792759134152736e-07, "loss": 0.72685504, "num_input_tokens_seen": 297579300, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.10253906, "step": 13793, "time_per_iteration": 2.5998592376708984 }, { "auxiliary_loss_clip": 0.06420286, "auxiliary_loss_mlp": 0.01265887, "balance_loss_clip": 0.06277432, "balance_loss_mlp": 0.01254603, "epoch": 0.829340147301969, "flos": 19944223401600.0, "grad_norm": 1.8050715026428312, "language_loss": 0.66203475, "learning_rate": 2.977231145525461e-07, "loss": 0.73889649, "num_input_tokens_seen": 297598095, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.112854, "step": 13794, "time_per_iteration": 2.5266835689544678 }, { "auxiliary_loss_clip": 0.06417197, "auxiliary_loss_mlp": 0.01262755, "balance_loss_clip": 0.06274471, "balance_loss_mlp": 0.01252986, "epoch": 0.829400270554637, "flos": 25235622812160.0, "grad_norm": 1.8688488656190807, "language_loss": 0.66360176, "learning_rate": 2.975187023140757e-07, "loss": 0.74040127, "num_input_tokens_seen": 297615955, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.09765625, "step": 13795, "time_per_iteration": 2.5878872871398926 }, { "auxiliary_loss_clip": 0.06410889, "auxiliary_loss_mlp": 0.01264864, "balance_loss_clip": 0.06277683, "balance_loss_mlp": 0.01255434, "epoch": 0.829460393807305, "flos": 24470690088960.0, "grad_norm": 1.8741512159826197, "language_loss": 0.66796744, "learning_rate": 2.973143546338661e-07, "loss": 0.74472493, "num_input_tokens_seen": 297636285, "router_z_loss_clip": 1.33300781, "router_z_loss_mlp": 0.09429932, "step": 13796, "time_per_iteration": 2.581531047821045 }, { "auxiliary_loss_clip": 0.06412426, "auxiliary_loss_mlp": 0.01262223, "balance_loss_clip": 0.06274746, "balance_loss_mlp": 0.01252907, "epoch": 0.829520517059973, "flos": 15127923041280.0, "grad_norm": 1.748988732683904, "language_loss": 0.71715844, "learning_rate": 2.971100715196666e-07, "loss": 0.79390496, "num_input_tokens_seen": 297653315, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09313965, "step": 13797, "time_per_iteration": 2.5193934440612793 }, { "auxiliary_loss_clip": 0.06415657, "auxiliary_loss_mlp": 0.01265113, "balance_loss_clip": 0.0627572, "balance_loss_mlp": 0.01255803, "epoch": 0.8295806403126409, "flos": 21586413646080.0, "grad_norm": 2.2770737526828677, "language_loss": 0.72511017, "learning_rate": 2.969058529792243e-07, "loss": 0.80191791, "num_input_tokens_seen": 297673480, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09307861, "step": 13798, "time_per_iteration": 2.5696303844451904 }, { "auxiliary_loss_clip": 0.06410246, "auxiliary_loss_mlp": 0.01265276, "balance_loss_clip": 0.06275561, "balance_loss_mlp": 0.01255823, "epoch": 0.8296407635653089, "flos": 21733133345280.0, "grad_norm": 1.568063459639598, "language_loss": 0.76595473, "learning_rate": 2.967016990202822e-07, "loss": 0.8427099, "num_input_tokens_seen": 297693250, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.09454346, "step": 13799, "time_per_iteration": 3.9955027103424072 }, { "auxiliary_loss_clip": 0.06420309, "auxiliary_loss_mlp": 0.01270055, "balance_loss_clip": 0.06282324, "balance_loss_mlp": 0.01259618, "epoch": 0.8297008868179768, "flos": 11185777601280.0, "grad_norm": 1.8399876264126749, "language_loss": 0.67738724, "learning_rate": 2.9649760965058245e-07, "loss": 0.75429082, "num_input_tokens_seen": 297710975, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.10437012, "step": 13800, "time_per_iteration": 2.510993003845215 }, { "auxiliary_loss_clip": 0.06426647, "auxiliary_loss_mlp": 0.01267443, "balance_loss_clip": 0.06282602, "balance_loss_mlp": 0.01257591, "epoch": 0.8297610100706448, "flos": 20669688051840.0, "grad_norm": 1.7476913695151772, "language_loss": 0.74990225, "learning_rate": 2.9629358487786515e-07, "loss": 0.82684314, "num_input_tokens_seen": 297730860, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.09851074, "step": 13801, "time_per_iteration": 2.5671029090881348 }, { "auxiliary_loss_clip": 0.06419174, "auxiliary_loss_mlp": 0.01261764, "balance_loss_clip": 0.06277008, "balance_loss_mlp": 0.0125268, "epoch": 0.8298211333233128, "flos": 20382621563520.0, "grad_norm": 1.8214500151400304, "language_loss": 0.73997408, "learning_rate": 2.9608962470986476e-07, "loss": 0.81678343, "num_input_tokens_seen": 297749765, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09088135, "step": 13802, "time_per_iteration": 2.5193979740142822 }, { "auxiliary_loss_clip": 0.06420012, "auxiliary_loss_mlp": 0.01268949, "balance_loss_clip": 0.06278478, "balance_loss_mlp": 0.01259179, "epoch": 0.8298812565759808, "flos": 21515401710720.0, "grad_norm": 1.4554511358159603, "language_loss": 0.7488631, "learning_rate": 2.9588572915431644e-07, "loss": 0.82575274, "num_input_tokens_seen": 297770380, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09765625, "step": 13803, "time_per_iteration": 2.5832529067993164 }, { "auxiliary_loss_clip": 0.06415512, "auxiliary_loss_mlp": 0.01266409, "balance_loss_clip": 0.06275317, "balance_loss_mlp": 0.01256967, "epoch": 0.8299413798286487, "flos": 22825019900160.0, "grad_norm": 1.8878765422532167, "language_loss": 0.7938171, "learning_rate": 2.9568189821895215e-07, "loss": 0.87063634, "num_input_tokens_seen": 297789440, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09436035, "step": 13804, "time_per_iteration": 4.045304298400879 }, { "auxiliary_loss_clip": 0.06414916, "auxiliary_loss_mlp": 0.01265267, "balance_loss_clip": 0.06275646, "balance_loss_mlp": 0.01256514, "epoch": 0.8300015030813167, "flos": 29686884860160.0, "grad_norm": 1.5675669274523458, "language_loss": 0.73453701, "learning_rate": 2.954781319115016e-07, "loss": 0.81133878, "num_input_tokens_seen": 297810425, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.08755493, "step": 13805, "time_per_iteration": 2.7211382389068604 }, { "auxiliary_loss_clip": 0.06423149, "auxiliary_loss_mlp": 0.01266659, "balance_loss_clip": 0.06280847, "balance_loss_mlp": 0.01257277, "epoch": 0.8300616263339846, "flos": 19725653226240.0, "grad_norm": 2.7573778807292353, "language_loss": 0.77521574, "learning_rate": 2.952744302396906e-07, "loss": 0.85211384, "num_input_tokens_seen": 297827680, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.09387207, "step": 13806, "time_per_iteration": 2.5322279930114746 }, { "auxiliary_loss_clip": 0.06421569, "auxiliary_loss_mlp": 0.01268241, "balance_loss_clip": 0.06277955, "balance_loss_mlp": 0.01257459, "epoch": 0.8301217495866526, "flos": 19908151418880.0, "grad_norm": 2.7335721684132084, "language_loss": 0.63968343, "learning_rate": 2.950707932112444e-07, "loss": 0.71658152, "num_input_tokens_seen": 297848005, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.10778809, "step": 13807, "time_per_iteration": 2.5485591888427734 }, { "auxiliary_loss_clip": 0.06415353, "auxiliary_loss_mlp": 0.01264797, "balance_loss_clip": 0.06276278, "balance_loss_mlp": 0.01255123, "epoch": 0.8301818728393207, "flos": 19721334741120.0, "grad_norm": 1.7815462456836453, "language_loss": 0.73487437, "learning_rate": 2.948672208338847e-07, "loss": 0.81167585, "num_input_tokens_seen": 297866730, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09667969, "step": 13808, "time_per_iteration": 2.520756721496582 }, { "auxiliary_loss_clip": 0.06423382, "auxiliary_loss_mlp": 0.0127029, "balance_loss_clip": 0.06278358, "balance_loss_mlp": 0.01259042, "epoch": 0.8302419960919886, "flos": 28301265417600.0, "grad_norm": 1.6378256110001284, "language_loss": 0.66484833, "learning_rate": 2.9466371311533046e-07, "loss": 0.74178505, "num_input_tokens_seen": 297886390, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.11248779, "step": 13809, "time_per_iteration": 2.598032236099243 }, { "auxiliary_loss_clip": 0.06419884, "auxiliary_loss_mlp": 0.01267472, "balance_loss_clip": 0.06278171, "balance_loss_mlp": 0.01257798, "epoch": 0.8303021193446566, "flos": 18229344140160.0, "grad_norm": 1.9748183119831475, "language_loss": 0.74370068, "learning_rate": 2.9446027006329896e-07, "loss": 0.82057422, "num_input_tokens_seen": 297905110, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09667969, "step": 13810, "time_per_iteration": 2.5047829151153564 }, { "auxiliary_loss_clip": 0.06411735, "auxiliary_loss_mlp": 0.01262092, "balance_loss_clip": 0.06276414, "balance_loss_mlp": 0.01253461, "epoch": 0.8303622425973245, "flos": 23117956174080.0, "grad_norm": 1.5354689455912935, "language_loss": 0.81311089, "learning_rate": 2.94256891685505e-07, "loss": 0.88984919, "num_input_tokens_seen": 297925460, "router_z_loss_clip": 1.35449219, "router_z_loss_mlp": 0.08630371, "step": 13811, "time_per_iteration": 2.5523674488067627 }, { "auxiliary_loss_clip": 0.0642225, "auxiliary_loss_mlp": 0.01264667, "balance_loss_clip": 0.06278777, "balance_loss_mlp": 0.01255291, "epoch": 0.8304223658499925, "flos": 19578891600000.0, "grad_norm": 2.0578946042302104, "language_loss": 0.73606038, "learning_rate": 2.9405357798966156e-07, "loss": 0.81292951, "num_input_tokens_seen": 297941760, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.09375, "step": 13812, "time_per_iteration": 2.5444676876068115 }, { "auxiliary_loss_clip": 0.06411768, "auxiliary_loss_mlp": 0.01263808, "balance_loss_clip": 0.06276378, "balance_loss_mlp": 0.01253955, "epoch": 0.8304824891026604, "flos": 24433066805760.0, "grad_norm": 1.5302738031200853, "language_loss": 0.78785115, "learning_rate": 2.9385032898347664e-07, "loss": 0.86460692, "num_input_tokens_seen": 297959745, "router_z_loss_clip": 1.35253906, "router_z_loss_mlp": 0.09851074, "step": 13813, "time_per_iteration": 2.5870201587677 }, { "auxiliary_loss_clip": 0.06418292, "auxiliary_loss_mlp": 0.01266682, "balance_loss_clip": 0.06275949, "balance_loss_mlp": 0.01256597, "epoch": 0.8305426123553284, "flos": 22388214965760.0, "grad_norm": 1.906797300442254, "language_loss": 0.70951062, "learning_rate": 2.93647144674658e-07, "loss": 0.78636032, "num_input_tokens_seen": 297977665, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.10083008, "step": 13814, "time_per_iteration": 2.54891300201416 }, { "auxiliary_loss_clip": 0.06435198, "auxiliary_loss_mlp": 0.01265088, "balance_loss_clip": 0.06283104, "balance_loss_mlp": 0.01253107, "epoch": 0.8306027356079964, "flos": 14908975522560.0, "grad_norm": 2.5602285693546594, "language_loss": 0.68245673, "learning_rate": 2.9344402507091116e-07, "loss": 0.75945961, "num_input_tokens_seen": 297993525, "router_z_loss_clip": 1.51757812, "router_z_loss_mlp": 0.11975098, "step": 13815, "time_per_iteration": 4.044780492782593 }, { "auxiliary_loss_clip": 0.06420758, "auxiliary_loss_mlp": 0.01268597, "balance_loss_clip": 0.06281178, "balance_loss_mlp": 0.01258983, "epoch": 0.8306628588606644, "flos": 19650406659840.0, "grad_norm": 2.2240695985680006, "language_loss": 0.76491815, "learning_rate": 2.9324097017993745e-07, "loss": 0.84181172, "num_input_tokens_seen": 298012920, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09613037, "step": 13816, "time_per_iteration": 2.512345790863037 }, { "auxiliary_loss_clip": 0.06415637, "auxiliary_loss_mlp": 0.01267414, "balance_loss_clip": 0.06276589, "balance_loss_mlp": 0.0125771, "epoch": 0.8307229821133323, "flos": 24396701333760.0, "grad_norm": 1.6503073397424008, "language_loss": 0.81736851, "learning_rate": 2.930379800094371e-07, "loss": 0.89419901, "num_input_tokens_seen": 298033310, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09710693, "step": 13817, "time_per_iteration": 2.5587878227233887 }, { "auxiliary_loss_clip": 0.06418907, "auxiliary_loss_mlp": 0.01264518, "balance_loss_clip": 0.06277291, "balance_loss_mlp": 0.01254332, "epoch": 0.8307831053660003, "flos": 21003392136960.0, "grad_norm": 1.691256552103454, "language_loss": 0.78327626, "learning_rate": 2.9283505456710875e-07, "loss": 0.86011052, "num_input_tokens_seen": 298053530, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.10186768, "step": 13818, "time_per_iteration": 2.553152322769165 }, { "auxiliary_loss_clip": 0.06421309, "auxiliary_loss_mlp": 0.01266092, "balance_loss_clip": 0.06279197, "balance_loss_mlp": 0.01256132, "epoch": 0.8308432286186682, "flos": 21403663891200.0, "grad_norm": 2.2306507217762968, "language_loss": 0.81906259, "learning_rate": 2.926321938606453e-07, "loss": 0.89593655, "num_input_tokens_seen": 298069305, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09967041, "step": 13819, "time_per_iteration": 2.5400094985961914 }, { "auxiliary_loss_clip": 0.06318625, "auxiliary_loss_mlp": 0.01253183, "balance_loss_clip": 0.06262721, "balance_loss_mlp": 0.01252018, "epoch": 0.8309033518713362, "flos": 62549724625920.0, "grad_norm": 0.7496209708060528, "language_loss": 0.56211996, "learning_rate": 2.924293978977399e-07, "loss": 0.63783807, "num_input_tokens_seen": 298125830, "router_z_loss_clip": 0.56152344, "router_z_loss_mlp": 0.01165009, "step": 13820, "time_per_iteration": 3.1608848571777344 }, { "auxiliary_loss_clip": 0.06414415, "auxiliary_loss_mlp": 0.01265249, "balance_loss_clip": 0.06277475, "balance_loss_mlp": 0.01255391, "epoch": 0.8309634751240043, "flos": 16984155340800.0, "grad_norm": 1.8399422171970465, "language_loss": 0.68604189, "learning_rate": 2.922266666860831e-07, "loss": 0.76283848, "num_input_tokens_seen": 298142320, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09857178, "step": 13821, "time_per_iteration": 3.84267520904541 }, { "auxiliary_loss_clip": 0.06427775, "auxiliary_loss_mlp": 0.01271414, "balance_loss_clip": 0.06281489, "balance_loss_mlp": 0.01261079, "epoch": 0.8310235983766722, "flos": 22681067385600.0, "grad_norm": 1.7559743792116151, "language_loss": 0.69370621, "learning_rate": 2.920240002333625e-07, "loss": 0.77069807, "num_input_tokens_seen": 298161845, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.10339355, "step": 13822, "time_per_iteration": 2.554013729095459 }, { "auxiliary_loss_clip": 0.06412963, "auxiliary_loss_mlp": 0.01266427, "balance_loss_clip": 0.06275622, "balance_loss_mlp": 0.01257034, "epoch": 0.8310837216293402, "flos": 30819539226240.0, "grad_norm": 2.1933605659928386, "language_loss": 0.62074316, "learning_rate": 2.918213985472631e-07, "loss": 0.69753712, "num_input_tokens_seen": 298184165, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09387207, "step": 13823, "time_per_iteration": 2.6088662147521973 }, { "auxiliary_loss_clip": 0.06318228, "auxiliary_loss_mlp": 0.01250309, "balance_loss_clip": 0.06262441, "balance_loss_mlp": 0.01249197, "epoch": 0.8311438448820081, "flos": 71297338521600.0, "grad_norm": 0.9083776931604486, "language_loss": 0.61841154, "learning_rate": 2.916188616354669e-07, "loss": 0.69409698, "num_input_tokens_seen": 298251720, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.01114655, "step": 13824, "time_per_iteration": 3.2496304512023926 }, { "auxiliary_loss_clip": 0.06420529, "auxiliary_loss_mlp": 0.01265218, "balance_loss_clip": 0.06279129, "balance_loss_mlp": 0.01255717, "epoch": 0.8312039681346761, "flos": 20893457180160.0, "grad_norm": 1.5039452839294047, "language_loss": 0.74609476, "learning_rate": 2.914163895056552e-07, "loss": 0.82295227, "num_input_tokens_seen": 298271910, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.0949707, "step": 13825, "time_per_iteration": 2.565335273742676 }, { "auxiliary_loss_clip": 0.06422096, "auxiliary_loss_mlp": 0.01266134, "balance_loss_clip": 0.06278904, "balance_loss_mlp": 0.01255811, "epoch": 0.831264091387344, "flos": 17022910654080.0, "grad_norm": 2.065292562091687, "language_loss": 0.8033278, "learning_rate": 2.9121398216550486e-07, "loss": 0.88021016, "num_input_tokens_seen": 298288105, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10327148, "step": 13826, "time_per_iteration": 2.515012741088867 }, { "auxiliary_loss_clip": 0.06421362, "auxiliary_loss_mlp": 0.01267517, "balance_loss_clip": 0.06280389, "balance_loss_mlp": 0.01257032, "epoch": 0.831324214640012, "flos": 24425436084480.0, "grad_norm": 1.508071815635594, "language_loss": 0.68286818, "learning_rate": 2.910116396226914e-07, "loss": 0.75975692, "num_input_tokens_seen": 298307600, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.10491943, "step": 13827, "time_per_iteration": 2.5886693000793457 }, { "auxiliary_loss_clip": 0.06413895, "auxiliary_loss_mlp": 0.01263361, "balance_loss_clip": 0.06274264, "balance_loss_mlp": 0.01254838, "epoch": 0.83138433789268, "flos": 13549407500160.0, "grad_norm": 1.7183987902195272, "language_loss": 0.74359262, "learning_rate": 2.9080936188488834e-07, "loss": 0.82036519, "num_input_tokens_seen": 298323055, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.08526611, "step": 13828, "time_per_iteration": 2.7812507152557373 }, { "auxiliary_loss_clip": 0.06419156, "auxiliary_loss_mlp": 0.01267392, "balance_loss_clip": 0.06275945, "balance_loss_mlp": 0.01257974, "epoch": 0.831444461145348, "flos": 44502543262080.0, "grad_norm": 1.603985535115327, "language_loss": 0.67121238, "learning_rate": 2.906071489597657e-07, "loss": 0.74807787, "num_input_tokens_seen": 298346950, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.09411621, "step": 13829, "time_per_iteration": 2.72916841506958 }, { "auxiliary_loss_clip": 0.0642091, "auxiliary_loss_mlp": 0.01265445, "balance_loss_clip": 0.06276666, "balance_loss_mlp": 0.01255544, "epoch": 0.8315045843980159, "flos": 22710640677120.0, "grad_norm": 1.542554283719476, "language_loss": 0.83041072, "learning_rate": 2.9040500085499054e-07, "loss": 0.90727425, "num_input_tokens_seen": 298366315, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.09906006, "step": 13830, "time_per_iteration": 2.5488109588623047 }, { "auxiliary_loss_clip": 0.0641849, "auxiliary_loss_mlp": 0.01266024, "balance_loss_clip": 0.06277533, "balance_loss_mlp": 0.01256022, "epoch": 0.8315647076506839, "flos": 16879167774720.0, "grad_norm": 2.0475904201669564, "language_loss": 0.74625236, "learning_rate": 2.9020291757822925e-07, "loss": 0.82309747, "num_input_tokens_seen": 298385185, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.10003662, "step": 13831, "time_per_iteration": 2.513939619064331 }, { "auxiliary_loss_clip": 0.06418288, "auxiliary_loss_mlp": 0.01266311, "balance_loss_clip": 0.06276669, "balance_loss_mlp": 0.01256041, "epoch": 0.8316248309033518, "flos": 13813902512640.0, "grad_norm": 1.7510100227478504, "language_loss": 0.71439326, "learning_rate": 2.9000089913714523e-07, "loss": 0.79123926, "num_input_tokens_seen": 298402335, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.10272217, "step": 13832, "time_per_iteration": 2.500544309616089 }, { "auxiliary_loss_clip": 0.06417312, "auxiliary_loss_mlp": 0.01266834, "balance_loss_clip": 0.06276552, "balance_loss_mlp": 0.01256677, "epoch": 0.8316849541560198, "flos": 23519066469120.0, "grad_norm": 1.6792364700158329, "language_loss": 0.84722161, "learning_rate": 2.897989455393979e-07, "loss": 0.92406315, "num_input_tokens_seen": 298423370, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.10162354, "step": 13833, "time_per_iteration": 2.558521270751953 }, { "auxiliary_loss_clip": 0.06421848, "auxiliary_loss_mlp": 0.01269474, "balance_loss_clip": 0.06277438, "balance_loss_mlp": 0.01259061, "epoch": 0.8317450774086879, "flos": 23778530236800.0, "grad_norm": 1.377342555770946, "language_loss": 0.76124954, "learning_rate": 2.8959705679264625e-07, "loss": 0.83816278, "num_input_tokens_seen": 298444835, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10418701, "step": 13834, "time_per_iteration": 2.563706636428833 }, { "auxiliary_loss_clip": 0.06408152, "auxiliary_loss_mlp": 0.01265481, "balance_loss_clip": 0.06272182, "balance_loss_mlp": 0.01255754, "epoch": 0.8318052006613558, "flos": 16220899699200.0, "grad_norm": 2.729843456885449, "language_loss": 0.80503762, "learning_rate": 2.893952329045459e-07, "loss": 0.88177395, "num_input_tokens_seen": 298461845, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.09729004, "step": 13835, "time_per_iteration": 2.5762319564819336 }, { "auxiliary_loss_clip": 0.06420414, "auxiliary_loss_mlp": 0.01266947, "balance_loss_clip": 0.06277442, "balance_loss_mlp": 0.01256194, "epoch": 0.8318653239140238, "flos": 19980714654720.0, "grad_norm": 1.9431486983131128, "language_loss": 0.80993068, "learning_rate": 2.8919347388274905e-07, "loss": 0.88680434, "num_input_tokens_seen": 298479095, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10754395, "step": 13836, "time_per_iteration": 2.5135583877563477 }, { "auxiliary_loss_clip": 0.06413116, "auxiliary_loss_mlp": 0.0126649, "balance_loss_clip": 0.062758, "balance_loss_mlp": 0.01257258, "epoch": 0.8319254471666917, "flos": 17709200720640.0, "grad_norm": 1.8652786618499089, "language_loss": 0.77359974, "learning_rate": 2.8899177973490727e-07, "loss": 0.8503958, "num_input_tokens_seen": 298494475, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09234619, "step": 13837, "time_per_iteration": 2.526279926300049 }, { "auxiliary_loss_clip": 0.064225, "auxiliary_loss_mlp": 0.01265379, "balance_loss_clip": 0.0627619, "balance_loss_mlp": 0.01254364, "epoch": 0.8319855704193597, "flos": 19542609982080.0, "grad_norm": 1.7075637608195453, "language_loss": 0.83390033, "learning_rate": 2.887901504686685e-07, "loss": 0.91077924, "num_input_tokens_seen": 298513185, "router_z_loss_clip": 1.46289062, "router_z_loss_mlp": 0.11022949, "step": 13838, "time_per_iteration": 2.532346248626709 }, { "auxiliary_loss_clip": 0.06415538, "auxiliary_loss_mlp": 0.01268506, "balance_loss_clip": 0.06277275, "balance_loss_mlp": 0.012586, "epoch": 0.8320456936720276, "flos": 21184339029120.0, "grad_norm": 1.756070356932545, "language_loss": 0.74672532, "learning_rate": 2.885885860916795e-07, "loss": 0.82356572, "num_input_tokens_seen": 298531885, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09906006, "step": 13839, "time_per_iteration": 3.9666764736175537 }, { "auxiliary_loss_clip": 0.06416252, "auxiliary_loss_mlp": 0.01267557, "balance_loss_clip": 0.06276084, "balance_loss_mlp": 0.01257609, "epoch": 0.8321058169246957, "flos": 33258499545600.0, "grad_norm": 1.4262212434116037, "language_loss": 0.67987573, "learning_rate": 2.8838708661158253e-07, "loss": 0.75671375, "num_input_tokens_seen": 298554905, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09942627, "step": 13840, "time_per_iteration": 2.6563467979431152 }, { "auxiliary_loss_clip": 0.064154, "auxiliary_loss_mlp": 0.01267002, "balance_loss_clip": 0.0627442, "balance_loss_mlp": 0.01257155, "epoch": 0.8321659401773636, "flos": 14213042236800.0, "grad_norm": 2.170435099289606, "language_loss": 0.79475975, "learning_rate": 2.8818565203601843e-07, "loss": 0.87158382, "num_input_tokens_seen": 298571185, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09844971, "step": 13841, "time_per_iteration": 2.4967246055603027 }, { "auxiliary_loss_clip": 0.06417745, "auxiliary_loss_mlp": 0.01264185, "balance_loss_clip": 0.0627885, "balance_loss_mlp": 0.01254046, "epoch": 0.8322260634300316, "flos": 15163575753600.0, "grad_norm": 1.7050399630117412, "language_loss": 0.68291414, "learning_rate": 2.879842823726262e-07, "loss": 0.75973344, "num_input_tokens_seen": 298588505, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.10131836, "step": 13842, "time_per_iteration": 2.562960147857666 }, { "auxiliary_loss_clip": 0.06414464, "auxiliary_loss_mlp": 0.01264168, "balance_loss_clip": 0.06275554, "balance_loss_mlp": 0.01254155, "epoch": 0.8322861866826995, "flos": 25307766777600.0, "grad_norm": 2.23798616872946, "language_loss": 0.73098171, "learning_rate": 2.8778297762904124e-07, "loss": 0.80776799, "num_input_tokens_seen": 298609295, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.10015869, "step": 13843, "time_per_iteration": 4.086496114730835 }, { "auxiliary_loss_clip": 0.06418423, "auxiliary_loss_mlp": 0.01267179, "balance_loss_clip": 0.06279922, "balance_loss_mlp": 0.01256904, "epoch": 0.8323463099353675, "flos": 17025048933120.0, "grad_norm": 2.516412848261258, "language_loss": 0.77704465, "learning_rate": 2.875817378128975e-07, "loss": 0.85390067, "num_input_tokens_seen": 298625765, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.1027832, "step": 13844, "time_per_iteration": 2.5128626823425293 }, { "auxiliary_loss_clip": 0.06316154, "auxiliary_loss_mlp": 0.01252387, "balance_loss_clip": 0.0626034, "balance_loss_mlp": 0.01251381, "epoch": 0.8324064331880354, "flos": 55623891473280.0, "grad_norm": 0.7620849943000426, "language_loss": 0.55200732, "learning_rate": 2.8738056293182624e-07, "loss": 0.6276927, "num_input_tokens_seen": 298683005, "router_z_loss_clip": 0.55908203, "router_z_loss_mlp": 0.01006317, "step": 13845, "time_per_iteration": 3.0635809898376465 }, { "auxiliary_loss_clip": 0.0642059, "auxiliary_loss_mlp": 0.01273815, "balance_loss_clip": 0.06277671, "balance_loss_mlp": 0.01263063, "epoch": 0.8324665564407034, "flos": 26145472371840.0, "grad_norm": 2.046352623493661, "language_loss": 0.7571938, "learning_rate": 2.871794529934555e-07, "loss": 0.83413786, "num_input_tokens_seen": 298703060, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10754395, "step": 13846, "time_per_iteration": 2.5740551948547363 }, { "auxiliary_loss_clip": 0.06423213, "auxiliary_loss_mlp": 0.01264925, "balance_loss_clip": 0.06277519, "balance_loss_mlp": 0.01254446, "epoch": 0.8325266796933715, "flos": 22054846296960.0, "grad_norm": 1.911492961816883, "language_loss": 0.78941274, "learning_rate": 2.8697840800541115e-07, "loss": 0.86629409, "num_input_tokens_seen": 298721765, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.1048584, "step": 13847, "time_per_iteration": 2.610563039779663 }, { "auxiliary_loss_clip": 0.06410903, "auxiliary_loss_mlp": 0.01265, "balance_loss_clip": 0.06272735, "balance_loss_mlp": 0.01255726, "epoch": 0.8325868029460394, "flos": 22822630058880.0, "grad_norm": 1.5562650073957847, "language_loss": 0.74579877, "learning_rate": 2.867774279753175e-07, "loss": 0.82255775, "num_input_tokens_seen": 298740825, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.0927124, "step": 13848, "time_per_iteration": 2.5678834915161133 }, { "auxiliary_loss_clip": 0.06412911, "auxiliary_loss_mlp": 0.01264706, "balance_loss_clip": 0.06273098, "balance_loss_mlp": 0.01255032, "epoch": 0.8326469261987074, "flos": 14762800874880.0, "grad_norm": 1.8011528027085126, "language_loss": 0.63652843, "learning_rate": 2.8657651291079554e-07, "loss": 0.71330464, "num_input_tokens_seen": 298758515, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09674072, "step": 13849, "time_per_iteration": 2.5195260047912598 }, { "auxiliary_loss_clip": 0.06420317, "auxiliary_loss_mlp": 0.01263975, "balance_loss_clip": 0.06276871, "balance_loss_mlp": 0.01253991, "epoch": 0.8327070494513753, "flos": 22932145745280.0, "grad_norm": 9.351102604982819, "language_loss": 0.79440504, "learning_rate": 2.863756628194638e-07, "loss": 0.87124795, "num_input_tokens_seen": 298776375, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.09973145, "step": 13850, "time_per_iteration": 2.5449655055999756 }, { "auxiliary_loss_clip": 0.06408681, "auxiliary_loss_mlp": 0.01265242, "balance_loss_clip": 0.06273869, "balance_loss_mlp": 0.01255866, "epoch": 0.8327671727040433, "flos": 20671197425280.0, "grad_norm": 1.4905296090025137, "language_loss": 0.78348833, "learning_rate": 2.8617487770893877e-07, "loss": 0.86022758, "num_input_tokens_seen": 298795135, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.09375, "step": 13851, "time_per_iteration": 2.5761542320251465 }, { "auxiliary_loss_clip": 0.06313826, "auxiliary_loss_mlp": 0.01250119, "balance_loss_clip": 0.06258142, "balance_loss_mlp": 0.01248944, "epoch": 0.8328272959567112, "flos": 56079353940480.0, "grad_norm": 0.7550119614261925, "language_loss": 0.55904555, "learning_rate": 2.859741575868344e-07, "loss": 0.63468504, "num_input_tokens_seen": 298855475, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.01173401, "step": 13852, "time_per_iteration": 3.149547576904297 }, { "auxiliary_loss_clip": 0.06410864, "auxiliary_loss_mlp": 0.01263704, "balance_loss_clip": 0.06272595, "balance_loss_mlp": 0.01253965, "epoch": 0.8328874192093793, "flos": 32310691286400.0, "grad_norm": 1.6167703392736206, "language_loss": 0.67326969, "learning_rate": 2.8577350246076125e-07, "loss": 0.75001544, "num_input_tokens_seen": 298875875, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09741211, "step": 13853, "time_per_iteration": 2.665940999984741 }, { "auxiliary_loss_clip": 0.06417549, "auxiliary_loss_mlp": 0.0126548, "balance_loss_clip": 0.062765, "balance_loss_mlp": 0.01256152, "epoch": 0.8329475424620472, "flos": 23519276104320.0, "grad_norm": 1.4925093098025188, "language_loss": 0.78371692, "learning_rate": 2.855729123383286e-07, "loss": 0.86054718, "num_input_tokens_seen": 298895950, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09320068, "step": 13854, "time_per_iteration": 4.004335165023804 }, { "auxiliary_loss_clip": 0.06310772, "auxiliary_loss_mlp": 0.01252124, "balance_loss_clip": 0.06255148, "balance_loss_mlp": 0.01251096, "epoch": 0.8330076657147152, "flos": 67860410474880.0, "grad_norm": 0.7342788513656154, "language_loss": 0.58570898, "learning_rate": 2.8537238722714295e-07, "loss": 0.66133785, "num_input_tokens_seen": 298955770, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01027679, "step": 13855, "time_per_iteration": 3.048689603805542 }, { "auxiliary_loss_clip": 0.06417969, "auxiliary_loss_mlp": 0.01268985, "balance_loss_clip": 0.0627806, "balance_loss_mlp": 0.01259341, "epoch": 0.8330677889673831, "flos": 22899344071680.0, "grad_norm": 1.6262817560015994, "language_loss": 0.71927339, "learning_rate": 2.8517192713480853e-07, "loss": 0.79614288, "num_input_tokens_seen": 298976545, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09649658, "step": 13856, "time_per_iteration": 2.5817034244537354 }, { "auxiliary_loss_clip": 0.06413718, "auxiliary_loss_mlp": 0.01266148, "balance_loss_clip": 0.06274319, "balance_loss_mlp": 0.01256677, "epoch": 0.8331279122200511, "flos": 27352492836480.0, "grad_norm": 1.5696000110972583, "language_loss": 0.75661564, "learning_rate": 2.8497153206892677e-07, "loss": 0.83341426, "num_input_tokens_seen": 298996750, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09466553, "step": 13857, "time_per_iteration": 2.5907368659973145 }, { "auxiliary_loss_clip": 0.06409281, "auxiliary_loss_mlp": 0.01264721, "balance_loss_clip": 0.06274958, "balance_loss_mlp": 0.01256418, "epoch": 0.833188035472719, "flos": 19944349182720.0, "grad_norm": 1.586687132172935, "language_loss": 0.7357235, "learning_rate": 2.847712020370958e-07, "loss": 0.81246358, "num_input_tokens_seen": 299014895, "router_z_loss_clip": 1.34277344, "router_z_loss_mlp": 0.08300781, "step": 13858, "time_per_iteration": 2.556840181350708 }, { "auxiliary_loss_clip": 0.06423618, "auxiliary_loss_mlp": 0.01267004, "balance_loss_clip": 0.06276975, "balance_loss_mlp": 0.01256257, "epoch": 0.833248158725387, "flos": 15238193414400.0, "grad_norm": 2.656250573467059, "language_loss": 0.73261297, "learning_rate": 2.8457093704691316e-07, "loss": 0.80951923, "num_input_tokens_seen": 299032855, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.10742188, "step": 13859, "time_per_iteration": 2.5402417182922363 }, { "auxiliary_loss_clip": 0.06410824, "auxiliary_loss_mlp": 0.01263013, "balance_loss_clip": 0.06275214, "balance_loss_mlp": 0.01253923, "epoch": 0.8333082819780551, "flos": 24542498638080.0, "grad_norm": 1.640970764434659, "language_loss": 0.79633439, "learning_rate": 2.8437073710597205e-07, "loss": 0.8730728, "num_input_tokens_seen": 299052055, "router_z_loss_clip": 1.35449219, "router_z_loss_mlp": 0.09082031, "step": 13860, "time_per_iteration": 3.973562479019165 }, { "auxiliary_loss_clip": 0.06411149, "auxiliary_loss_mlp": 0.01264806, "balance_loss_clip": 0.06273171, "balance_loss_mlp": 0.01255246, "epoch": 0.833368405230723, "flos": 31475459387520.0, "grad_norm": 1.3524198849110636, "language_loss": 0.82285798, "learning_rate": 2.841706022218644e-07, "loss": 0.89961755, "num_input_tokens_seen": 299075285, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09564209, "step": 13861, "time_per_iteration": 2.6597213745117188 }, { "auxiliary_loss_clip": 0.06420922, "auxiliary_loss_mlp": 0.01263639, "balance_loss_clip": 0.06278788, "balance_loss_mlp": 0.012535, "epoch": 0.833428528483391, "flos": 14907969273600.0, "grad_norm": 1.8546835646886008, "language_loss": 0.79350424, "learning_rate": 2.839705324021806e-07, "loss": 0.87034982, "num_input_tokens_seen": 299092520, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.10137939, "step": 13862, "time_per_iteration": 2.4971213340759277 }, { "auxiliary_loss_clip": 0.06421296, "auxiliary_loss_mlp": 0.0126495, "balance_loss_clip": 0.06277689, "balance_loss_mlp": 0.01255098, "epoch": 0.8334886517360589, "flos": 22206303751680.0, "grad_norm": 1.7076631453254794, "language_loss": 0.75005245, "learning_rate": 2.83770527654505e-07, "loss": 0.82691497, "num_input_tokens_seen": 299109450, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.09851074, "step": 13863, "time_per_iteration": 2.5304670333862305 }, { "auxiliary_loss_clip": 0.06413786, "auxiliary_loss_mlp": 0.01264553, "balance_loss_clip": 0.06278363, "balance_loss_mlp": 0.01255493, "epoch": 0.8335487749887269, "flos": 30380386377600.0, "grad_norm": 3.473919465656018, "language_loss": 0.75249833, "learning_rate": 2.835705879864232e-07, "loss": 0.82928181, "num_input_tokens_seen": 299129540, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.09057617, "step": 13864, "time_per_iteration": 2.6032440662384033 }, { "auxiliary_loss_clip": 0.06418487, "auxiliary_loss_mlp": 0.01268204, "balance_loss_clip": 0.06277318, "balance_loss_mlp": 0.0125766, "epoch": 0.8336088982413948, "flos": 24688086307200.0, "grad_norm": 1.6060264569183424, "language_loss": 0.69591588, "learning_rate": 2.833707134055168e-07, "loss": 0.7727828, "num_input_tokens_seen": 299148670, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.10540771, "step": 13865, "time_per_iteration": 2.5712292194366455 }, { "auxiliary_loss_clip": 0.06416859, "auxiliary_loss_mlp": 0.01263858, "balance_loss_clip": 0.06276664, "balance_loss_mlp": 0.01253397, "epoch": 0.8336690214940629, "flos": 38185783038720.0, "grad_norm": 1.6674503524091921, "language_loss": 0.75794846, "learning_rate": 2.831709039193653e-07, "loss": 0.83475566, "num_input_tokens_seen": 299169330, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.10461426, "step": 13866, "time_per_iteration": 2.692453622817993 }, { "auxiliary_loss_clip": 0.06313025, "auxiliary_loss_mlp": 0.01252254, "balance_loss_clip": 0.06257306, "balance_loss_mlp": 0.01251157, "epoch": 0.8337291447467308, "flos": 55580062988160.0, "grad_norm": 0.8497963531132299, "language_loss": 0.62937242, "learning_rate": 2.8297115953554465e-07, "loss": 0.7050252, "num_input_tokens_seen": 299220980, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.01098633, "step": 13867, "time_per_iteration": 3.0553572177886963 }, { "auxiliary_loss_clip": 0.06414701, "auxiliary_loss_mlp": 0.01265998, "balance_loss_clip": 0.0627574, "balance_loss_mlp": 0.01256366, "epoch": 0.8337892679993988, "flos": 24140340167040.0, "grad_norm": 1.9924643195996583, "language_loss": 0.72405457, "learning_rate": 2.827714802616301e-07, "loss": 0.80086154, "num_input_tokens_seen": 299240130, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09637451, "step": 13868, "time_per_iteration": 2.5543570518493652 }, { "auxiliary_loss_clip": 0.06418757, "auxiliary_loss_mlp": 0.01264512, "balance_loss_clip": 0.06277999, "balance_loss_mlp": 0.01254886, "epoch": 0.8338493912520667, "flos": 28191456241920.0, "grad_norm": 1.4391825534456733, "language_loss": 0.80231464, "learning_rate": 2.8257186610519325e-07, "loss": 0.87914729, "num_input_tokens_seen": 299260705, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09631348, "step": 13869, "time_per_iteration": 2.607215404510498 }, { "auxiliary_loss_clip": 0.06417632, "auxiliary_loss_mlp": 0.01268511, "balance_loss_clip": 0.06278235, "balance_loss_mlp": 0.01258695, "epoch": 0.8339095145047347, "flos": 22163984640000.0, "grad_norm": 2.2934413155786775, "language_loss": 0.82565093, "learning_rate": 2.823723170738028e-07, "loss": 0.90251237, "num_input_tokens_seen": 299278925, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09820557, "step": 13870, "time_per_iteration": 2.5827183723449707 }, { "auxiliary_loss_clip": 0.06418709, "auxiliary_loss_mlp": 0.01266377, "balance_loss_clip": 0.06273706, "balance_loss_mlp": 0.0125625, "epoch": 0.8339696377574026, "flos": 17312157348480.0, "grad_norm": 2.739783460864936, "language_loss": 0.70976281, "learning_rate": 2.821728331750264e-07, "loss": 0.7866137, "num_input_tokens_seen": 299291580, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10125732, "step": 13871, "time_per_iteration": 2.5151045322418213 }, { "auxiliary_loss_clip": 0.06413093, "auxiliary_loss_mlp": 0.01270339, "balance_loss_clip": 0.06276526, "balance_loss_mlp": 0.01260832, "epoch": 0.8340297610100706, "flos": 20674719296640.0, "grad_norm": 1.7179139277879345, "language_loss": 0.69352365, "learning_rate": 2.8197341441642853e-07, "loss": 0.77035797, "num_input_tokens_seen": 299310385, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.09509277, "step": 13872, "time_per_iteration": 2.5891668796539307 }, { "auxiliary_loss_clip": 0.06415494, "auxiliary_loss_mlp": 0.01264255, "balance_loss_clip": 0.0627528, "balance_loss_mlp": 0.01255064, "epoch": 0.8340898842627387, "flos": 20520620438400.0, "grad_norm": 1.7365137614296904, "language_loss": 0.73286688, "learning_rate": 2.817740608055712e-07, "loss": 0.80966437, "num_input_tokens_seen": 299327660, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09191895, "step": 13873, "time_per_iteration": 2.606389045715332 }, { "auxiliary_loss_clip": 0.06420831, "auxiliary_loss_mlp": 0.01265315, "balance_loss_clip": 0.06275815, "balance_loss_mlp": 0.01253579, "epoch": 0.8341500075154066, "flos": 21430889268480.0, "grad_norm": 2.2598380395349045, "language_loss": 0.75420624, "learning_rate": 2.81574772350013e-07, "loss": 0.83106768, "num_input_tokens_seen": 299343685, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.11737061, "step": 13874, "time_per_iteration": 2.6075189113616943 }, { "auxiliary_loss_clip": 0.06413724, "auxiliary_loss_mlp": 0.01263879, "balance_loss_clip": 0.06275205, "balance_loss_mlp": 0.01254241, "epoch": 0.8342101307680746, "flos": 22097542752000.0, "grad_norm": 1.999489745622499, "language_loss": 0.66286826, "learning_rate": 2.813755490573118e-07, "loss": 0.73964429, "num_input_tokens_seen": 299363305, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09643555, "step": 13875, "time_per_iteration": 2.619696855545044 }, { "auxiliary_loss_clip": 0.06417032, "auxiliary_loss_mlp": 0.0126589, "balance_loss_clip": 0.06278032, "balance_loss_mlp": 0.01256746, "epoch": 0.8342702540207425, "flos": 21877882473600.0, "grad_norm": 1.628522002887832, "language_loss": 0.79841137, "learning_rate": 2.8117639093502243e-07, "loss": 0.87524062, "num_input_tokens_seen": 299382630, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09143066, "step": 13876, "time_per_iteration": 2.662485361099243 }, { "auxiliary_loss_clip": 0.06413656, "auxiliary_loss_mlp": 0.01264165, "balance_loss_clip": 0.06276116, "balance_loss_mlp": 0.01254676, "epoch": 0.8343303772734105, "flos": 22535060446080.0, "grad_norm": 1.7233465904963214, "language_loss": 0.87455791, "learning_rate": 2.8097729799069615e-07, "loss": 0.95133615, "num_input_tokens_seen": 299402385, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09490967, "step": 13877, "time_per_iteration": 2.554457426071167 }, { "auxiliary_loss_clip": 0.06416635, "auxiliary_loss_mlp": 0.01263302, "balance_loss_clip": 0.06276628, "balance_loss_mlp": 0.01254278, "epoch": 0.8343905005260784, "flos": 14945131359360.0, "grad_norm": 1.606011762344904, "language_loss": 0.6991232, "learning_rate": 2.807782702318828e-07, "loss": 0.7759226, "num_input_tokens_seen": 299419820, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09020996, "step": 13878, "time_per_iteration": 2.5106303691864014 }, { "auxiliary_loss_clip": 0.06416906, "auxiliary_loss_mlp": 0.01264393, "balance_loss_clip": 0.06277303, "balance_loss_mlp": 0.01254928, "epoch": 0.8344506237787465, "flos": 15017778449280.0, "grad_norm": 2.9918377365665036, "language_loss": 0.79592299, "learning_rate": 2.805793076661309e-07, "loss": 0.87273604, "num_input_tokens_seen": 299436265, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09466553, "step": 13879, "time_per_iteration": 3.9285969734191895 }, { "auxiliary_loss_clip": 0.06410763, "auxiliary_loss_mlp": 0.01265194, "balance_loss_clip": 0.06272236, "balance_loss_mlp": 0.01255985, "epoch": 0.8345107470314144, "flos": 17565122424960.0, "grad_norm": 2.323970561982866, "language_loss": 0.83548075, "learning_rate": 2.803804103009828e-07, "loss": 0.91224027, "num_input_tokens_seen": 299451660, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09210205, "step": 13880, "time_per_iteration": 2.490313768386841 }, { "auxiliary_loss_clip": 0.0641773, "auxiliary_loss_mlp": 0.01262261, "balance_loss_clip": 0.06275973, "balance_loss_mlp": 0.01252528, "epoch": 0.8345708702840824, "flos": 25193513335680.0, "grad_norm": 2.04250466099096, "language_loss": 0.78448319, "learning_rate": 2.80181578143982e-07, "loss": 0.86128312, "num_input_tokens_seen": 299472070, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.09729004, "step": 13881, "time_per_iteration": 2.6027612686157227 }, { "auxiliary_loss_clip": 0.06410664, "auxiliary_loss_mlp": 0.012675, "balance_loss_clip": 0.06276409, "balance_loss_mlp": 0.01258726, "epoch": 0.8346309935367503, "flos": 15088580749440.0, "grad_norm": 2.4742897444827756, "language_loss": 0.78431237, "learning_rate": 2.7998281120266807e-07, "loss": 0.86109406, "num_input_tokens_seen": 299486725, "router_z_loss_clip": 1.34179688, "router_z_loss_mlp": 0.08770752, "step": 13882, "time_per_iteration": 2.507779598236084 }, { "auxiliary_loss_clip": 0.06413763, "auxiliary_loss_mlp": 0.01266441, "balance_loss_clip": 0.06274681, "balance_loss_mlp": 0.01256327, "epoch": 0.8346911167894183, "flos": 22937386625280.0, "grad_norm": 1.7522483961214599, "language_loss": 0.80894661, "learning_rate": 2.79784109484579e-07, "loss": 0.88574874, "num_input_tokens_seen": 299505435, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.10107422, "step": 13883, "time_per_iteration": 3.980365753173828 }, { "auxiliary_loss_clip": 0.06419411, "auxiliary_loss_mlp": 0.0126449, "balance_loss_clip": 0.06277318, "balance_loss_mlp": 0.01254882, "epoch": 0.8347512400420862, "flos": 20199159048960.0, "grad_norm": 1.8796960859922291, "language_loss": 0.74242032, "learning_rate": 2.795854729972482e-07, "loss": 0.81925935, "num_input_tokens_seen": 299523555, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09606934, "step": 13884, "time_per_iteration": 2.5280895233154297 }, { "auxiliary_loss_clip": 0.06426078, "auxiliary_loss_mlp": 0.01267929, "balance_loss_clip": 0.06277201, "balance_loss_mlp": 0.01256944, "epoch": 0.8348113632947542, "flos": 25961422878720.0, "grad_norm": 1.642153446337459, "language_loss": 0.70509857, "learning_rate": 2.7938690174820913e-07, "loss": 0.78203869, "num_input_tokens_seen": 299541660, "router_z_loss_clip": 1.48925781, "router_z_loss_mlp": 0.10986328, "step": 13885, "time_per_iteration": 2.609358310699463 }, { "auxiliary_loss_clip": 0.06416913, "auxiliary_loss_mlp": 0.01264199, "balance_loss_clip": 0.06275891, "balance_loss_mlp": 0.01254615, "epoch": 0.8348714865474223, "flos": 34213183839360.0, "grad_norm": 1.86113126210745, "language_loss": 0.69977552, "learning_rate": 2.791883957449912e-07, "loss": 0.77658665, "num_input_tokens_seen": 299562465, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09588623, "step": 13886, "time_per_iteration": 2.678363800048828 }, { "auxiliary_loss_clip": 0.06415973, "auxiliary_loss_mlp": 0.01268015, "balance_loss_clip": 0.06277531, "balance_loss_mlp": 0.01258103, "epoch": 0.8349316098000902, "flos": 24397162531200.0, "grad_norm": 1.5565230888533172, "language_loss": 0.79483759, "learning_rate": 2.7898995499512134e-07, "loss": 0.87167752, "num_input_tokens_seen": 299582700, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09918213, "step": 13887, "time_per_iteration": 2.5935401916503906 }, { "auxiliary_loss_clip": 0.06427828, "auxiliary_loss_mlp": 0.0126648, "balance_loss_clip": 0.06280875, "balance_loss_mlp": 0.0125568, "epoch": 0.8349917330527582, "flos": 23038307268480.0, "grad_norm": 2.4155945764150277, "language_loss": 0.64343202, "learning_rate": 2.7879157950612467e-07, "loss": 0.72037506, "num_input_tokens_seen": 299600310, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.10803223, "step": 13888, "time_per_iteration": 2.583861827850342 }, { "auxiliary_loss_clip": 0.06418264, "auxiliary_loss_mlp": 0.01266555, "balance_loss_clip": 0.06273456, "balance_loss_mlp": 0.01256648, "epoch": 0.8350518563054261, "flos": 13630943122560.0, "grad_norm": 3.2274365571168007, "language_loss": 0.67360866, "learning_rate": 2.785932692855244e-07, "loss": 0.75045681, "num_input_tokens_seen": 299617025, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.09912109, "step": 13889, "time_per_iteration": 2.5303664207458496 }, { "auxiliary_loss_clip": 0.06412028, "auxiliary_loss_mlp": 0.0126542, "balance_loss_clip": 0.06273782, "balance_loss_mlp": 0.01255573, "epoch": 0.8351119795580941, "flos": 21586204010880.0, "grad_norm": 1.819091124126335, "language_loss": 0.69065499, "learning_rate": 2.783950243408399e-07, "loss": 0.76742947, "num_input_tokens_seen": 299633050, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09844971, "step": 13890, "time_per_iteration": 2.587547540664673 }, { "auxiliary_loss_clip": 0.0642, "auxiliary_loss_mlp": 0.01268439, "balance_loss_clip": 0.06278534, "balance_loss_mlp": 0.01257931, "epoch": 0.835172102810762, "flos": 20042921911680.0, "grad_norm": 2.3006645781661255, "language_loss": 0.59205997, "learning_rate": 2.7819684467958817e-07, "loss": 0.66894436, "num_input_tokens_seen": 299646445, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.10516357, "step": 13891, "time_per_iteration": 2.505070209503174 }, { "auxiliary_loss_clip": 0.06415866, "auxiliary_loss_mlp": 0.01265421, "balance_loss_clip": 0.06275611, "balance_loss_mlp": 0.0125629, "epoch": 0.8352322260634301, "flos": 25117344374400.0, "grad_norm": 2.083277816820674, "language_loss": 0.71854621, "learning_rate": 2.779987303092846e-07, "loss": 0.79535908, "num_input_tokens_seen": 299662665, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09124756, "step": 13892, "time_per_iteration": 2.594355344772339 }, { "auxiliary_loss_clip": 0.06411736, "auxiliary_loss_mlp": 0.01265348, "balance_loss_clip": 0.06275111, "balance_loss_mlp": 0.01255693, "epoch": 0.835292349316098, "flos": 24870752208000.0, "grad_norm": 1.6034871439767187, "language_loss": 0.66236717, "learning_rate": 2.7780068123744207e-07, "loss": 0.73913807, "num_input_tokens_seen": 299683585, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09649658, "step": 13893, "time_per_iteration": 2.5756959915161133 }, { "auxiliary_loss_clip": 0.06413822, "auxiliary_loss_mlp": 0.01263828, "balance_loss_clip": 0.06272987, "balance_loss_mlp": 0.01253981, "epoch": 0.835352472568766, "flos": 19871785946880.0, "grad_norm": 2.03696623344354, "language_loss": 0.78673041, "learning_rate": 2.7760269747156996e-07, "loss": 0.86350685, "num_input_tokens_seen": 299702680, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09838867, "step": 13894, "time_per_iteration": 4.001600742340088 }, { "auxiliary_loss_clip": 0.06411889, "auxiliary_loss_mlp": 0.01267979, "balance_loss_clip": 0.06278603, "balance_loss_mlp": 0.01258621, "epoch": 0.8354125958214339, "flos": 22061344988160.0, "grad_norm": 1.7643947805904272, "language_loss": 0.7292251, "learning_rate": 2.7740477901917625e-07, "loss": 0.80602384, "num_input_tokens_seen": 299721050, "router_z_loss_clip": 1.33300781, "router_z_loss_mlp": 0.09356689, "step": 13895, "time_per_iteration": 2.5408802032470703 }, { "auxiliary_loss_clip": 0.06420128, "auxiliary_loss_mlp": 0.01269955, "balance_loss_clip": 0.06275454, "balance_loss_mlp": 0.01258833, "epoch": 0.8354727190741019, "flos": 21404250869760.0, "grad_norm": 1.946595671209662, "language_loss": 0.72220051, "learning_rate": 2.772069258877667e-07, "loss": 0.79910135, "num_input_tokens_seen": 299738255, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.11120605, "step": 13896, "time_per_iteration": 2.5920755863189697 }, { "auxiliary_loss_clip": 0.064109, "auxiliary_loss_mlp": 0.01268634, "balance_loss_clip": 0.06274258, "balance_loss_mlp": 0.01259043, "epoch": 0.8355328423267698, "flos": 50852230940160.0, "grad_norm": 2.3716966976346487, "language_loss": 0.58968544, "learning_rate": 2.770091380848423e-07, "loss": 0.66648078, "num_input_tokens_seen": 299761315, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09594727, "step": 13897, "time_per_iteration": 2.8039917945861816 }, { "auxiliary_loss_clip": 0.06321254, "auxiliary_loss_mlp": 0.01254711, "balance_loss_clip": 0.06265451, "balance_loss_mlp": 0.0125353, "epoch": 0.8355929655794379, "flos": 65571901361280.0, "grad_norm": 0.6840382761588055, "language_loss": 0.57739627, "learning_rate": 2.7681141561790423e-07, "loss": 0.65315592, "num_input_tokens_seen": 299828735, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.01180267, "step": 13898, "time_per_iteration": 3.2089641094207764 }, { "auxiliary_loss_clip": 0.06422073, "auxiliary_loss_mlp": 0.01269726, "balance_loss_clip": 0.06277023, "balance_loss_mlp": 0.01259408, "epoch": 0.8356530888321058, "flos": 19176313858560.0, "grad_norm": 1.854185729563994, "language_loss": 0.80396056, "learning_rate": 2.7661375849444967e-07, "loss": 0.88087851, "num_input_tokens_seen": 299848395, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.10314941, "step": 13899, "time_per_iteration": 2.5344369411468506 }, { "auxiliary_loss_clip": 0.06416728, "auxiliary_loss_mlp": 0.01265886, "balance_loss_clip": 0.06274624, "balance_loss_mlp": 0.0125663, "epoch": 0.8357132120847738, "flos": 44136624481920.0, "grad_norm": 1.5319919617432227, "language_loss": 0.68848598, "learning_rate": 2.764161667219749e-07, "loss": 0.76531208, "num_input_tokens_seen": 299871665, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09246826, "step": 13900, "time_per_iteration": 4.184235572814941 }, { "auxiliary_loss_clip": 0.06419549, "auxiliary_loss_mlp": 0.01269692, "balance_loss_clip": 0.06279598, "balance_loss_mlp": 0.01260119, "epoch": 0.8357733353374418, "flos": 24396659406720.0, "grad_norm": 1.390795816431263, "language_loss": 0.71251428, "learning_rate": 2.762186403079716e-07, "loss": 0.78940672, "num_input_tokens_seen": 299891960, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09576416, "step": 13901, "time_per_iteration": 2.602454423904419 }, { "auxiliary_loss_clip": 0.06421539, "auxiliary_loss_mlp": 0.01265573, "balance_loss_clip": 0.06275762, "balance_loss_mlp": 0.01255017, "epoch": 0.8358334585901097, "flos": 20921479171200.0, "grad_norm": 2.1086599936442805, "language_loss": 0.80285418, "learning_rate": 2.7602117925992963e-07, "loss": 0.87972528, "num_input_tokens_seen": 299905070, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10559082, "step": 13902, "time_per_iteration": 2.518820285797119 }, { "auxiliary_loss_clip": 0.06411929, "auxiliary_loss_mlp": 0.01264659, "balance_loss_clip": 0.06274832, "balance_loss_mlp": 0.01254722, "epoch": 0.8358935818427777, "flos": 19250092978560.0, "grad_norm": 1.3811858949008449, "language_loss": 0.62716532, "learning_rate": 2.758237835853379e-07, "loss": 0.70393121, "num_input_tokens_seen": 299925130, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.0993042, "step": 13903, "time_per_iteration": 2.5634334087371826 }, { "auxiliary_loss_clip": 0.06417363, "auxiliary_loss_mlp": 0.01269077, "balance_loss_clip": 0.0627796, "balance_loss_mlp": 0.01259463, "epoch": 0.8359537050954456, "flos": 24140927145600.0, "grad_norm": 1.981551243012371, "language_loss": 0.74434, "learning_rate": 2.7562645329168054e-07, "loss": 0.82120442, "num_input_tokens_seen": 299943845, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09613037, "step": 13904, "time_per_iteration": 2.567164897918701 }, { "auxiliary_loss_clip": 0.06411076, "auxiliary_loss_mlp": 0.01263738, "balance_loss_clip": 0.0627479, "balance_loss_mlp": 0.012541, "epoch": 0.8360138283481137, "flos": 16186001673600.0, "grad_norm": 1.6488350626986417, "language_loss": 0.72878706, "learning_rate": 2.7542918838644104e-07, "loss": 0.80553526, "num_input_tokens_seen": 299961620, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.09643555, "step": 13905, "time_per_iteration": 2.5087785720825195 }, { "auxiliary_loss_clip": 0.06409331, "auxiliary_loss_mlp": 0.01262244, "balance_loss_clip": 0.06274392, "balance_loss_mlp": 0.01253208, "epoch": 0.8360739516007816, "flos": 22205213648640.0, "grad_norm": 1.5111532202108873, "language_loss": 0.66573095, "learning_rate": 2.752319888771e-07, "loss": 0.74244666, "num_input_tokens_seen": 299982170, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.0904541, "step": 13906, "time_per_iteration": 2.552743911743164 }, { "auxiliary_loss_clip": 0.06414224, "auxiliary_loss_mlp": 0.01265254, "balance_loss_clip": 0.06274228, "balance_loss_mlp": 0.01255545, "epoch": 0.8361340748534496, "flos": 20929445308800.0, "grad_norm": 1.42909410572435, "language_loss": 0.74524534, "learning_rate": 2.7503485477113475e-07, "loss": 0.82204014, "num_input_tokens_seen": 300001330, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09710693, "step": 13907, "time_per_iteration": 2.5372185707092285 }, { "auxiliary_loss_clip": 0.06422819, "auxiliary_loss_mlp": 0.01267282, "balance_loss_clip": 0.06278024, "balance_loss_mlp": 0.01256857, "epoch": 0.8361941981061175, "flos": 26180202689280.0, "grad_norm": 1.6325858370985455, "language_loss": 0.75315332, "learning_rate": 2.7483778607602005e-07, "loss": 0.83005428, "num_input_tokens_seen": 300020645, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.10418701, "step": 13908, "time_per_iteration": 2.576322555541992 }, { "auxiliary_loss_clip": 0.06413982, "auxiliary_loss_mlp": 0.01263238, "balance_loss_clip": 0.06272788, "balance_loss_mlp": 0.01252975, "epoch": 0.8362543213587855, "flos": 24425184522240.0, "grad_norm": 2.167001952764991, "language_loss": 0.71768141, "learning_rate": 2.7464078279922964e-07, "loss": 0.79445356, "num_input_tokens_seen": 300039945, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.10253906, "step": 13909, "time_per_iteration": 2.56785249710083 }, { "auxiliary_loss_clip": 0.06420837, "auxiliary_loss_mlp": 0.01263339, "balance_loss_clip": 0.0627653, "balance_loss_mlp": 0.01253266, "epoch": 0.8363144446114534, "flos": 17208217958400.0, "grad_norm": 1.7855696751728618, "language_loss": 0.7384367, "learning_rate": 2.744438449482338e-07, "loss": 0.81527853, "num_input_tokens_seen": 300058260, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10076904, "step": 13910, "time_per_iteration": 2.5220212936401367 }, { "auxiliary_loss_clip": 0.0642028, "auxiliary_loss_mlp": 0.01265474, "balance_loss_clip": 0.06278706, "balance_loss_mlp": 0.01256098, "epoch": 0.8363745678641215, "flos": 19285116785280.0, "grad_norm": 1.8150227055763661, "language_loss": 0.73412359, "learning_rate": 2.742469725305001e-07, "loss": 0.81098115, "num_input_tokens_seen": 300076720, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09375, "step": 13911, "time_per_iteration": 2.552640676498413 }, { "auxiliary_loss_clip": 0.06417251, "auxiliary_loss_mlp": 0.01265384, "balance_loss_clip": 0.06274418, "balance_loss_mlp": 0.01255716, "epoch": 0.8364346911167894, "flos": 11879698389120.0, "grad_norm": 2.1230283696917995, "language_loss": 0.79067695, "learning_rate": 2.740501655534946e-07, "loss": 0.86750329, "num_input_tokens_seen": 300092950, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.09661865, "step": 13912, "time_per_iteration": 2.523164987564087 }, { "auxiliary_loss_clip": 0.06410239, "auxiliary_loss_mlp": 0.0126367, "balance_loss_clip": 0.06270919, "balance_loss_mlp": 0.01254396, "epoch": 0.8364948143694574, "flos": 20230619057280.0, "grad_norm": 1.795506237318452, "language_loss": 0.79082119, "learning_rate": 2.738534240246797e-07, "loss": 0.86756027, "num_input_tokens_seen": 300110950, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09277344, "step": 13913, "time_per_iteration": 2.531759738922119 }, { "auxiliary_loss_clip": 0.06417473, "auxiliary_loss_mlp": 0.01268672, "balance_loss_clip": 0.06274393, "balance_loss_mlp": 0.01257758, "epoch": 0.8365549376221254, "flos": 21618754122240.0, "grad_norm": 1.8322217643101566, "language_loss": 0.73773897, "learning_rate": 2.736567479515153e-07, "loss": 0.81460047, "num_input_tokens_seen": 300128705, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.10913086, "step": 13914, "time_per_iteration": 2.551565647125244 }, { "auxiliary_loss_clip": 0.06416466, "auxiliary_loss_mlp": 0.01268186, "balance_loss_clip": 0.06275252, "balance_loss_mlp": 0.01257457, "epoch": 0.8366150608747933, "flos": 23300831710080.0, "grad_norm": 1.589975204360552, "language_loss": 0.71548903, "learning_rate": 2.7346013734146025e-07, "loss": 0.79233551, "num_input_tokens_seen": 300148635, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.10726929, "step": 13915, "time_per_iteration": 2.5453906059265137 }, { "auxiliary_loss_clip": 0.06418355, "auxiliary_loss_mlp": 0.01267004, "balance_loss_clip": 0.06276339, "balance_loss_mlp": 0.012573, "epoch": 0.8366751841274613, "flos": 15273007585920.0, "grad_norm": 1.907359137144456, "language_loss": 0.7261101, "learning_rate": 2.7326359220197035e-07, "loss": 0.80296373, "num_input_tokens_seen": 300165490, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.0970459, "step": 13916, "time_per_iteration": 2.5319957733154297 }, { "auxiliary_loss_clip": 0.06418161, "auxiliary_loss_mlp": 0.01261684, "balance_loss_clip": 0.06276582, "balance_loss_mlp": 0.01251986, "epoch": 0.8367353073801292, "flos": 13230000535680.0, "grad_norm": 2.58408196647458, "language_loss": 0.74941754, "learning_rate": 2.7306711254049755e-07, "loss": 0.82621598, "num_input_tokens_seen": 300182130, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09698486, "step": 13917, "time_per_iteration": 2.527437925338745 }, { "auxiliary_loss_clip": 0.06413034, "auxiliary_loss_mlp": 0.01267392, "balance_loss_clip": 0.06278418, "balance_loss_mlp": 0.01258201, "epoch": 0.8367954306327973, "flos": 24211645591680.0, "grad_norm": 1.431888631422494, "language_loss": 0.79523069, "learning_rate": 2.728706983644933e-07, "loss": 0.87203503, "num_input_tokens_seen": 300203050, "router_z_loss_clip": 1.34472656, "router_z_loss_mlp": 0.09191895, "step": 13918, "time_per_iteration": 2.5762486457824707 }, { "auxiliary_loss_clip": 0.06413496, "auxiliary_loss_mlp": 0.01264248, "balance_loss_clip": 0.06273865, "balance_loss_mlp": 0.0125442, "epoch": 0.8368555538854652, "flos": 24541576243200.0, "grad_norm": 1.5381367907881076, "language_loss": 0.68189621, "learning_rate": 2.7267434968140457e-07, "loss": 0.75867361, "num_input_tokens_seen": 300224380, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09832764, "step": 13919, "time_per_iteration": 4.087156295776367 }, { "auxiliary_loss_clip": 0.06411761, "auxiliary_loss_mlp": 0.01266517, "balance_loss_clip": 0.06273123, "balance_loss_mlp": 0.01257564, "epoch": 0.8369156771381332, "flos": 20264385052800.0, "grad_norm": 1.857509179306677, "language_loss": 0.74180818, "learning_rate": 2.7247806649867835e-07, "loss": 0.818591, "num_input_tokens_seen": 300242915, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.08959961, "step": 13920, "time_per_iteration": 2.529735803604126 }, { "auxiliary_loss_clip": 0.06413664, "auxiliary_loss_mlp": 0.01265106, "balance_loss_clip": 0.06273348, "balance_loss_mlp": 0.01255468, "epoch": 0.8369758003908011, "flos": 21842062053120.0, "grad_norm": 1.7267798696710155, "language_loss": 0.69325209, "learning_rate": 2.722818488237566e-07, "loss": 0.77003986, "num_input_tokens_seen": 300261905, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09637451, "step": 13921, "time_per_iteration": 2.555521249771118 }, { "auxiliary_loss_clip": 0.06423631, "auxiliary_loss_mlp": 0.01267963, "balance_loss_clip": 0.0627827, "balance_loss_mlp": 0.01258283, "epoch": 0.8370359236434691, "flos": 21724664083200.0, "grad_norm": 2.9192053425771634, "language_loss": 0.85946035, "learning_rate": 2.720856966640801e-07, "loss": 0.93637633, "num_input_tokens_seen": 300281145, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.09680176, "step": 13922, "time_per_iteration": 2.5445003509521484 }, { "auxiliary_loss_clip": 0.06413593, "auxiliary_loss_mlp": 0.0126627, "balance_loss_clip": 0.06275412, "balance_loss_mlp": 0.0125634, "epoch": 0.837096046896137, "flos": 23155579457280.0, "grad_norm": 1.5617361222464827, "language_loss": 0.72093326, "learning_rate": 2.71889610027088e-07, "loss": 0.79773188, "num_input_tokens_seen": 300301610, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09924316, "step": 13923, "time_per_iteration": 3.9874141216278076 }, { "auxiliary_loss_clip": 0.06412676, "auxiliary_loss_mlp": 0.01267632, "balance_loss_clip": 0.06276988, "balance_loss_mlp": 0.01257481, "epoch": 0.8371561701488051, "flos": 24498795934080.0, "grad_norm": 1.8795248151093715, "language_loss": 0.76489937, "learning_rate": 2.7169358892021433e-07, "loss": 0.8417024, "num_input_tokens_seen": 300319420, "router_z_loss_clip": 1.35742188, "router_z_loss_mlp": 0.10144043, "step": 13924, "time_per_iteration": 2.688873767852783 }, { "auxiliary_loss_clip": 0.0641465, "auxiliary_loss_mlp": 0.01266672, "balance_loss_clip": 0.06275919, "balance_loss_mlp": 0.01257183, "epoch": 0.837216293401473, "flos": 29214636848640.0, "grad_norm": 1.4950145184216277, "language_loss": 0.64747536, "learning_rate": 2.7149763335089293e-07, "loss": 0.72428858, "num_input_tokens_seen": 300341325, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.0947876, "step": 13925, "time_per_iteration": 2.6331732273101807 }, { "auxiliary_loss_clip": 0.06419541, "auxiliary_loss_mlp": 0.01264326, "balance_loss_clip": 0.06277462, "balance_loss_mlp": 0.01254157, "epoch": 0.837276416654141, "flos": 25272365627520.0, "grad_norm": 1.5315074730437872, "language_loss": 0.74453795, "learning_rate": 2.713017433265543e-07, "loss": 0.82137662, "num_input_tokens_seen": 300361620, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.10168457, "step": 13926, "time_per_iteration": 2.545940637588501 }, { "auxiliary_loss_clip": 0.06419023, "auxiliary_loss_mlp": 0.01266439, "balance_loss_clip": 0.06277274, "balance_loss_mlp": 0.01256288, "epoch": 0.837336539906809, "flos": 13887262362240.0, "grad_norm": 1.7213329901905319, "language_loss": 0.71492076, "learning_rate": 2.711059188546274e-07, "loss": 0.79177547, "num_input_tokens_seen": 300378675, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.10140991, "step": 13927, "time_per_iteration": 2.4935367107391357 }, { "auxiliary_loss_clip": 0.06318618, "auxiliary_loss_mlp": 0.01251375, "balance_loss_clip": 0.06262903, "balance_loss_mlp": 0.01250321, "epoch": 0.8373966631594769, "flos": 68891892635520.0, "grad_norm": 0.6890017063942271, "language_loss": 0.58692026, "learning_rate": 2.7091015994253695e-07, "loss": 0.66262019, "num_input_tokens_seen": 300449740, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.01054382, "step": 13928, "time_per_iteration": 3.276862144470215 }, { "auxiliary_loss_clip": 0.06416611, "auxiliary_loss_mlp": 0.01265599, "balance_loss_clip": 0.06276812, "balance_loss_mlp": 0.01255275, "epoch": 0.8374567864121449, "flos": 20455226726400.0, "grad_norm": 1.7357908990983728, "language_loss": 0.69994926, "learning_rate": 2.707144665977068e-07, "loss": 0.77677143, "num_input_tokens_seen": 300470000, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.10321045, "step": 13929, "time_per_iteration": 2.5451245307922363 }, { "auxiliary_loss_clip": 0.06422561, "auxiliary_loss_mlp": 0.01266316, "balance_loss_clip": 0.06278466, "balance_loss_mlp": 0.01255921, "epoch": 0.8375169096648128, "flos": 41914305694080.0, "grad_norm": 1.4950892431879883, "language_loss": 0.67421889, "learning_rate": 2.705188388275574e-07, "loss": 0.75110769, "num_input_tokens_seen": 300494975, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10388184, "step": 13930, "time_per_iteration": 2.7207744121551514 }, { "auxiliary_loss_clip": 0.06410357, "auxiliary_loss_mlp": 0.01263189, "balance_loss_clip": 0.0627284, "balance_loss_mlp": 0.01253724, "epoch": 0.8375770329174809, "flos": 20015235336960.0, "grad_norm": 1.5405165350157743, "language_loss": 0.71421212, "learning_rate": 2.703232766395067e-07, "loss": 0.79094756, "num_input_tokens_seen": 300513175, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09460449, "step": 13931, "time_per_iteration": 2.5512125492095947 }, { "auxiliary_loss_clip": 0.06414213, "auxiliary_loss_mlp": 0.01263038, "balance_loss_clip": 0.06275971, "balance_loss_mlp": 0.01253323, "epoch": 0.8376371561701488, "flos": 22790163801600.0, "grad_norm": 1.5649489173160094, "language_loss": 0.71947956, "learning_rate": 2.701277800409705e-07, "loss": 0.79625213, "num_input_tokens_seen": 300533770, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09710693, "step": 13932, "time_per_iteration": 2.5629031658172607 }, { "auxiliary_loss_clip": 0.06413964, "auxiliary_loss_mlp": 0.01264493, "balance_loss_clip": 0.06275029, "balance_loss_mlp": 0.01255099, "epoch": 0.8376972794228168, "flos": 23921183013120.0, "grad_norm": 2.168271421509211, "language_loss": 0.66821641, "learning_rate": 2.699323490393628e-07, "loss": 0.74500096, "num_input_tokens_seen": 300552995, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09393311, "step": 13933, "time_per_iteration": 4.01313853263855 }, { "auxiliary_loss_clip": 0.0641008, "auxiliary_loss_mlp": 0.01266008, "balance_loss_clip": 0.06272922, "balance_loss_mlp": 0.01256239, "epoch": 0.8377574026754847, "flos": 13739704122240.0, "grad_norm": 1.963590589340699, "language_loss": 0.7618084, "learning_rate": 2.697369836420933e-07, "loss": 0.83856928, "num_input_tokens_seen": 300570275, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09759521, "step": 13934, "time_per_iteration": 2.5590927600860596 }, { "auxiliary_loss_clip": 0.06416097, "auxiliary_loss_mlp": 0.01264179, "balance_loss_clip": 0.06277847, "balance_loss_mlp": 0.01254672, "epoch": 0.8378175259281527, "flos": 21657509435520.0, "grad_norm": 1.6033517280035763, "language_loss": 0.77446806, "learning_rate": 2.6954168385657115e-07, "loss": 0.85127085, "num_input_tokens_seen": 300590875, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09509277, "step": 13935, "time_per_iteration": 2.5899240970611572 }, { "auxiliary_loss_clip": 0.06413545, "auxiliary_loss_mlp": 0.01262647, "balance_loss_clip": 0.062737, "balance_loss_mlp": 0.01252878, "epoch": 0.8378776491808206, "flos": 15453954478080.0, "grad_norm": 3.041314761312603, "language_loss": 0.57045448, "learning_rate": 2.6934644969020135e-07, "loss": 0.64721632, "num_input_tokens_seen": 300607490, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09771729, "step": 13936, "time_per_iteration": 2.544152021408081 }, { "auxiliary_loss_clip": 0.06412605, "auxiliary_loss_mlp": 0.01263235, "balance_loss_clip": 0.06272192, "balance_loss_mlp": 0.01253794, "epoch": 0.8379377724334887, "flos": 14725638789120.0, "grad_norm": 1.59422019938778, "language_loss": 0.89774072, "learning_rate": 2.691512811503882e-07, "loss": 0.97449911, "num_input_tokens_seen": 300623635, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09442139, "step": 13937, "time_per_iteration": 2.5442001819610596 }, { "auxiliary_loss_clip": 0.06416936, "auxiliary_loss_mlp": 0.01263198, "balance_loss_clip": 0.06277381, "balance_loss_mlp": 0.01253512, "epoch": 0.8379978956861566, "flos": 24542163221760.0, "grad_norm": 1.779050224824203, "language_loss": 0.8193742, "learning_rate": 2.689561782445313e-07, "loss": 0.8961755, "num_input_tokens_seen": 300643835, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09680176, "step": 13938, "time_per_iteration": 2.59957218170166 }, { "auxiliary_loss_clip": 0.06421178, "auxiliary_loss_mlp": 0.01265377, "balance_loss_clip": 0.06277983, "balance_loss_mlp": 0.01254785, "epoch": 0.8380580189388246, "flos": 18958540296960.0, "grad_norm": 1.5961362540480137, "language_loss": 0.70955569, "learning_rate": 2.6876114098002965e-07, "loss": 0.78642124, "num_input_tokens_seen": 300662500, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.10601807, "step": 13939, "time_per_iteration": 4.063822984695435 }, { "auxiliary_loss_clip": 0.06419729, "auxiliary_loss_mlp": 0.01269141, "balance_loss_clip": 0.06276386, "balance_loss_mlp": 0.01258185, "epoch": 0.8381181421914926, "flos": 26547253499520.0, "grad_norm": 1.66650198376073, "language_loss": 0.762528, "learning_rate": 2.6856616936428e-07, "loss": 0.83941668, "num_input_tokens_seen": 300681480, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.10961914, "step": 13940, "time_per_iteration": 2.611445188522339 }, { "auxiliary_loss_clip": 0.06412643, "auxiliary_loss_mlp": 0.01263433, "balance_loss_clip": 0.06274512, "balance_loss_mlp": 0.01253896, "epoch": 0.8381782654441605, "flos": 23297645255040.0, "grad_norm": 1.6156022348145014, "language_loss": 0.76683009, "learning_rate": 2.6837126340467374e-07, "loss": 0.84359086, "num_input_tokens_seen": 300699165, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09539795, "step": 13941, "time_per_iteration": 2.5720438957214355 }, { "auxiliary_loss_clip": 0.06422989, "auxiliary_loss_mlp": 0.01263482, "balance_loss_clip": 0.06277209, "balance_loss_mlp": 0.01253909, "epoch": 0.8382383886968285, "flos": 26765739820800.0, "grad_norm": 2.3928301717959948, "language_loss": 0.7364682, "learning_rate": 2.6817642310860276e-07, "loss": 0.81333292, "num_input_tokens_seen": 300714615, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.09570312, "step": 13942, "time_per_iteration": 2.6359052658081055 }, { "auxiliary_loss_clip": 0.06427661, "auxiliary_loss_mlp": 0.01263141, "balance_loss_clip": 0.06278751, "balance_loss_mlp": 0.01252287, "epoch": 0.8382985119494964, "flos": 26111790230400.0, "grad_norm": 1.4863892477809795, "language_loss": 0.79706734, "learning_rate": 2.679816484834554e-07, "loss": 0.8739754, "num_input_tokens_seen": 300734860, "router_z_loss_clip": 1.49023438, "router_z_loss_mlp": 0.10845947, "step": 13943, "time_per_iteration": 2.6327006816864014 }, { "auxiliary_loss_clip": 0.06418011, "auxiliary_loss_mlp": 0.01264504, "balance_loss_clip": 0.06278354, "balance_loss_mlp": 0.01254723, "epoch": 0.8383586352021645, "flos": 16440643831680.0, "grad_norm": 1.7866530062070973, "language_loss": 0.85737348, "learning_rate": 2.6778693953661766e-07, "loss": 0.93419862, "num_input_tokens_seen": 300752735, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09783936, "step": 13944, "time_per_iteration": 2.5269477367401123 }, { "auxiliary_loss_clip": 0.06316328, "auxiliary_loss_mlp": 0.01251039, "balance_loss_clip": 0.06260615, "balance_loss_mlp": 0.01249986, "epoch": 0.8384187584548324, "flos": 64215226304640.0, "grad_norm": 0.6104541613868393, "language_loss": 0.50279921, "learning_rate": 2.6759229627547263e-07, "loss": 0.57847297, "num_input_tokens_seen": 300820760, "router_z_loss_clip": 0.55810547, "router_z_loss_mlp": 0.01053619, "step": 13945, "time_per_iteration": 3.2948110103607178 }, { "auxiliary_loss_clip": 0.06415089, "auxiliary_loss_mlp": 0.01262623, "balance_loss_clip": 0.06278047, "balance_loss_mlp": 0.01252854, "epoch": 0.8384788817075004, "flos": 22389514704000.0, "grad_norm": 1.7704853291900653, "language_loss": 0.65025246, "learning_rate": 2.673977187074017e-07, "loss": 0.72702956, "num_input_tokens_seen": 300840025, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09765625, "step": 13946, "time_per_iteration": 2.5452001094818115 }, { "auxiliary_loss_clip": 0.06417122, "auxiliary_loss_mlp": 0.01264375, "balance_loss_clip": 0.06275132, "balance_loss_mlp": 0.01254385, "epoch": 0.8385390049601683, "flos": 29504512448640.0, "grad_norm": 1.4643797862145749, "language_loss": 0.67718649, "learning_rate": 2.672032068397829e-07, "loss": 0.7540015, "num_input_tokens_seen": 300860380, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09991455, "step": 13947, "time_per_iteration": 2.6348021030426025 }, { "auxiliary_loss_clip": 0.06418347, "auxiliary_loss_mlp": 0.01265832, "balance_loss_clip": 0.06276054, "balance_loss_mlp": 0.01255592, "epoch": 0.8385991282128363, "flos": 32716036212480.0, "grad_norm": 1.4470561802521813, "language_loss": 0.69947779, "learning_rate": 2.6700876067999176e-07, "loss": 0.7763195, "num_input_tokens_seen": 300881895, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.10235596, "step": 13948, "time_per_iteration": 2.712416410446167 }, { "auxiliary_loss_clip": 0.06410545, "auxiliary_loss_mlp": 0.0126293, "balance_loss_clip": 0.06274579, "balance_loss_mlp": 0.01254073, "epoch": 0.8386592514655042, "flos": 25447023463680.0, "grad_norm": 1.7003137407999163, "language_loss": 0.85254407, "learning_rate": 2.6681438023540194e-07, "loss": 0.92927885, "num_input_tokens_seen": 300901575, "router_z_loss_clip": 1.36035156, "router_z_loss_mlp": 0.08856201, "step": 13949, "time_per_iteration": 2.6308090686798096 }, { "auxiliary_loss_clip": 0.06410111, "auxiliary_loss_mlp": 0.01264373, "balance_loss_clip": 0.06273943, "balance_loss_mlp": 0.01255152, "epoch": 0.8387193747181723, "flos": 22022086550400.0, "grad_norm": 1.6904320323511643, "language_loss": 0.70834804, "learning_rate": 2.66620065513385e-07, "loss": 0.78509295, "num_input_tokens_seen": 300919735, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.09222412, "step": 13950, "time_per_iteration": 2.5838332176208496 }, { "auxiliary_loss_clip": 0.0641323, "auxiliary_loss_mlp": 0.01267204, "balance_loss_clip": 0.06275287, "balance_loss_mlp": 0.01257303, "epoch": 0.8387794979708402, "flos": 18156068144640.0, "grad_norm": 1.8014435244350575, "language_loss": 0.65075433, "learning_rate": 2.6642581652130913e-07, "loss": 0.72755873, "num_input_tokens_seen": 300939150, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09899902, "step": 13951, "time_per_iteration": 2.5732688903808594 }, { "auxiliary_loss_clip": 0.06417781, "auxiliary_loss_mlp": 0.01263145, "balance_loss_clip": 0.06277283, "balance_loss_mlp": 0.01253322, "epoch": 0.8388396212235082, "flos": 25418330640000.0, "grad_norm": 1.492978620410312, "language_loss": 0.70361394, "learning_rate": 2.662316332665393e-07, "loss": 0.78042316, "num_input_tokens_seen": 300959730, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09832764, "step": 13952, "time_per_iteration": 2.5944528579711914 }, { "auxiliary_loss_clip": 0.06409977, "auxiliary_loss_mlp": 0.01264565, "balance_loss_clip": 0.06271805, "balance_loss_mlp": 0.01255487, "epoch": 0.8388997444761762, "flos": 22279579747200.0, "grad_norm": 1.7585515993610532, "language_loss": 0.72985017, "learning_rate": 2.6603751575643987e-07, "loss": 0.80659562, "num_input_tokens_seen": 300976120, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09082031, "step": 13953, "time_per_iteration": 2.5251808166503906 }, { "auxiliary_loss_clip": 0.06415101, "auxiliary_loss_mlp": 0.01262975, "balance_loss_clip": 0.06276439, "balance_loss_mlp": 0.01253492, "epoch": 0.8389598677288441, "flos": 19579310870400.0, "grad_norm": 1.8551417942639779, "language_loss": 0.6855917, "learning_rate": 2.6584346399837176e-07, "loss": 0.76237243, "num_input_tokens_seen": 300995080, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09472656, "step": 13954, "time_per_iteration": 2.535381555557251 }, { "auxiliary_loss_clip": 0.06412043, "auxiliary_loss_mlp": 0.01266826, "balance_loss_clip": 0.06273127, "balance_loss_mlp": 0.01257987, "epoch": 0.8390199909815121, "flos": 17390548442880.0, "grad_norm": 1.8219741492003383, "language_loss": 0.73610085, "learning_rate": 2.656494779996932e-07, "loss": 0.81288952, "num_input_tokens_seen": 301012920, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.08837891, "step": 13955, "time_per_iteration": 2.5156197547912598 }, { "auxiliary_loss_clip": 0.06417848, "auxiliary_loss_mlp": 0.01265343, "balance_loss_clip": 0.06277134, "balance_loss_mlp": 0.01255747, "epoch": 0.83908011423418, "flos": 24645725268480.0, "grad_norm": 3.7871310410679566, "language_loss": 0.6631164, "learning_rate": 2.6545555776775995e-07, "loss": 0.73994839, "num_input_tokens_seen": 301028875, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09594727, "step": 13956, "time_per_iteration": 2.5823564529418945 }, { "auxiliary_loss_clip": 0.06421758, "auxiliary_loss_mlp": 0.01264624, "balance_loss_clip": 0.06278332, "balance_loss_mlp": 0.01254801, "epoch": 0.8391402374868481, "flos": 24725416101120.0, "grad_norm": 1.9729306102557693, "language_loss": 0.80070013, "learning_rate": 2.6526170330992667e-07, "loss": 0.87756395, "num_input_tokens_seen": 301050115, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.09814453, "step": 13957, "time_per_iteration": 2.571903705596924 }, { "auxiliary_loss_clip": 0.06311761, "auxiliary_loss_mlp": 0.01252017, "balance_loss_clip": 0.06256086, "balance_loss_mlp": 0.01250874, "epoch": 0.839200360739516, "flos": 56891804728320.0, "grad_norm": 0.734115574584878, "language_loss": 0.53310955, "learning_rate": 2.6506791463354283e-07, "loss": 0.6087473, "num_input_tokens_seen": 301114155, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01142883, "step": 13958, "time_per_iteration": 4.708091735839844 }, { "auxiliary_loss_clip": 0.06413297, "auxiliary_loss_mlp": 0.01265995, "balance_loss_clip": 0.06275215, "balance_loss_mlp": 0.01256399, "epoch": 0.839260483992184, "flos": 18338692118400.0, "grad_norm": 1.816187101749097, "language_loss": 0.73579711, "learning_rate": 2.648741917459574e-07, "loss": 0.81259, "num_input_tokens_seen": 301133150, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09606934, "step": 13959, "time_per_iteration": 2.529301881790161 }, { "auxiliary_loss_clip": 0.0641328, "auxiliary_loss_mlp": 0.01264477, "balance_loss_clip": 0.06278417, "balance_loss_mlp": 0.01254815, "epoch": 0.8393206072448519, "flos": 27095041566720.0, "grad_norm": 1.714087912838853, "language_loss": 0.55937815, "learning_rate": 2.646805346545169e-07, "loss": 0.63615578, "num_input_tokens_seen": 301153600, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.09649658, "step": 13960, "time_per_iteration": 2.574711322784424 }, { "auxiliary_loss_clip": 0.06315163, "auxiliary_loss_mlp": 0.01250574, "balance_loss_clip": 0.06259699, "balance_loss_mlp": 0.0124947, "epoch": 0.8393807304975199, "flos": 61538619006720.0, "grad_norm": 0.7567705845288271, "language_loss": 0.60820401, "learning_rate": 2.6448694336656397e-07, "loss": 0.68386143, "num_input_tokens_seen": 301214335, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01105499, "step": 13961, "time_per_iteration": 3.270188331604004 }, { "auxiliary_loss_clip": 0.0641626, "auxiliary_loss_mlp": 0.01263553, "balance_loss_clip": 0.06276011, "balance_loss_mlp": 0.01254475, "epoch": 0.8394408537501878, "flos": 14898787251840.0, "grad_norm": 2.574590017359406, "language_loss": 0.6854111, "learning_rate": 2.642934178894405e-07, "loss": 0.76220924, "num_input_tokens_seen": 301228960, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09082031, "step": 13962, "time_per_iteration": 2.5547492504119873 }, { "auxiliary_loss_clip": 0.06417078, "auxiliary_loss_mlp": 0.01266415, "balance_loss_clip": 0.06273364, "balance_loss_mlp": 0.01256622, "epoch": 0.8395009770028559, "flos": 17416516008960.0, "grad_norm": 1.9867673010039306, "language_loss": 0.73219919, "learning_rate": 2.640999582304841e-07, "loss": 0.80903411, "num_input_tokens_seen": 301245875, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.09796143, "step": 13963, "time_per_iteration": 4.027752161026001 }, { "auxiliary_loss_clip": 0.06415775, "auxiliary_loss_mlp": 0.01265538, "balance_loss_clip": 0.06275955, "balance_loss_mlp": 0.01256687, "epoch": 0.8395611002555238, "flos": 27931615130880.0, "grad_norm": 1.6780266432027415, "language_loss": 0.76587355, "learning_rate": 2.6390656439703173e-07, "loss": 0.84268671, "num_input_tokens_seen": 301265550, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.08850098, "step": 13964, "time_per_iteration": 2.702054500579834 }, { "auxiliary_loss_clip": 0.06419721, "auxiliary_loss_mlp": 0.01264389, "balance_loss_clip": 0.06274799, "balance_loss_mlp": 0.01252837, "epoch": 0.8396212235081918, "flos": 11104325832960.0, "grad_norm": 1.873343991127017, "language_loss": 0.79003853, "learning_rate": 2.637132363964161e-07, "loss": 0.86687964, "num_input_tokens_seen": 301282035, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.11553955, "step": 13965, "time_per_iteration": 2.5852696895599365 }, { "auxiliary_loss_clip": 0.06410956, "auxiliary_loss_mlp": 0.01263292, "balance_loss_clip": 0.0627344, "balance_loss_mlp": 0.01254101, "epoch": 0.8396813467608598, "flos": 35744307096960.0, "grad_norm": 1.5921949504988224, "language_loss": 0.65710473, "learning_rate": 2.635199742359684e-07, "loss": 0.73384726, "num_input_tokens_seen": 301305210, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09185791, "step": 13966, "time_per_iteration": 2.672983169555664 }, { "auxiliary_loss_clip": 0.06410741, "auxiliary_loss_mlp": 0.01262861, "balance_loss_clip": 0.06272326, "balance_loss_mlp": 0.0125373, "epoch": 0.8397414700135277, "flos": 26183850341760.0, "grad_norm": 1.4960629579276683, "language_loss": 0.74629319, "learning_rate": 2.633267779230177e-07, "loss": 0.82302922, "num_input_tokens_seen": 301324885, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09130859, "step": 13967, "time_per_iteration": 2.5781710147857666 }, { "auxiliary_loss_clip": 0.06412207, "auxiliary_loss_mlp": 0.01263567, "balance_loss_clip": 0.06273361, "balance_loss_mlp": 0.01254096, "epoch": 0.8398015932661957, "flos": 18339069461760.0, "grad_norm": 2.2484110124992776, "language_loss": 0.83425713, "learning_rate": 2.6313364746488974e-07, "loss": 0.9110148, "num_input_tokens_seen": 301343070, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09472656, "step": 13968, "time_per_iteration": 2.548293113708496 }, { "auxiliary_loss_clip": 0.06418605, "auxiliary_loss_mlp": 0.01264781, "balance_loss_clip": 0.06275691, "balance_loss_mlp": 0.01255113, "epoch": 0.8398617165188637, "flos": 17384469022080.0, "grad_norm": 3.3425799902666746, "language_loss": 0.77362341, "learning_rate": 2.629405828689075e-07, "loss": 0.85045719, "num_input_tokens_seen": 301359280, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.09661865, "step": 13969, "time_per_iteration": 2.518155097961426 }, { "auxiliary_loss_clip": 0.0641713, "auxiliary_loss_mlp": 0.01265215, "balance_loss_clip": 0.0627382, "balance_loss_mlp": 0.01254939, "epoch": 0.8399218397715317, "flos": 22936296522240.0, "grad_norm": 2.192912075782233, "language_loss": 0.77583039, "learning_rate": 2.627475841423923e-07, "loss": 0.8526538, "num_input_tokens_seen": 301376465, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.1027832, "step": 13970, "time_per_iteration": 2.5698342323303223 }, { "auxiliary_loss_clip": 0.06414423, "auxiliary_loss_mlp": 0.01265875, "balance_loss_clip": 0.06273292, "balance_loss_mlp": 0.01255933, "epoch": 0.8399819630241996, "flos": 23156376071040.0, "grad_norm": 3.0949490785046914, "language_loss": 0.72265065, "learning_rate": 2.625546512926633e-07, "loss": 0.79945374, "num_input_tokens_seen": 301396000, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09942627, "step": 13971, "time_per_iteration": 2.6236138343811035 }, { "auxiliary_loss_clip": 0.06414806, "auxiliary_loss_mlp": 0.01266741, "balance_loss_clip": 0.06274092, "balance_loss_mlp": 0.01257013, "epoch": 0.8400420862768676, "flos": 16402727059200.0, "grad_norm": 2.0851771444603013, "language_loss": 0.7809974, "learning_rate": 2.623617843270358e-07, "loss": 0.85781288, "num_input_tokens_seen": 301413160, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09729004, "step": 13972, "time_per_iteration": 2.526864767074585 }, { "auxiliary_loss_clip": 0.06407365, "auxiliary_loss_mlp": 0.01266071, "balance_loss_clip": 0.0627078, "balance_loss_mlp": 0.01256886, "epoch": 0.8401022095295355, "flos": 21293770861440.0, "grad_norm": 1.2473467214832683, "language_loss": 0.68341947, "learning_rate": 2.6216898325282333e-07, "loss": 0.76015377, "num_input_tokens_seen": 301433325, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09185791, "step": 13973, "time_per_iteration": 4.071845054626465 }, { "auxiliary_loss_clip": 0.06413224, "auxiliary_loss_mlp": 0.01268034, "balance_loss_clip": 0.06273854, "balance_loss_mlp": 0.01258426, "epoch": 0.8401623327822035, "flos": 17317062812160.0, "grad_norm": 3.6967464125098903, "language_loss": 0.78321868, "learning_rate": 2.619762480773382e-07, "loss": 0.86003125, "num_input_tokens_seen": 301450265, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.0960083, "step": 13974, "time_per_iteration": 2.5275416374206543 }, { "auxiliary_loss_clip": 0.06415117, "auxiliary_loss_mlp": 0.01263913, "balance_loss_clip": 0.0627412, "balance_loss_mlp": 0.01254305, "epoch": 0.8402224560348714, "flos": 22243214275200.0, "grad_norm": 1.423914837222711, "language_loss": 0.72698224, "learning_rate": 2.617835788078868e-07, "loss": 0.80377257, "num_input_tokens_seen": 301470760, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09606934, "step": 13975, "time_per_iteration": 2.583815097808838 }, { "auxiliary_loss_clip": 0.06412346, "auxiliary_loss_mlp": 0.0126423, "balance_loss_clip": 0.06274045, "balance_loss_mlp": 0.01254127, "epoch": 0.8402825792875395, "flos": 20236153426560.0, "grad_norm": 1.7333090961300213, "language_loss": 0.72153854, "learning_rate": 2.6159097545177645e-07, "loss": 0.79830432, "num_input_tokens_seen": 301489425, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.10101318, "step": 13976, "time_per_iteration": 2.5436673164367676 }, { "auxiliary_loss_clip": 0.06412093, "auxiliary_loss_mlp": 0.01262424, "balance_loss_clip": 0.06272134, "balance_loss_mlp": 0.01253484, "epoch": 0.8403427025402074, "flos": 23295884319360.0, "grad_norm": 4.549372209529117, "language_loss": 0.72295523, "learning_rate": 2.61398438016311e-07, "loss": 0.79970038, "num_input_tokens_seen": 301508885, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.0894165, "step": 13977, "time_per_iteration": 2.639371633529663 }, { "auxiliary_loss_clip": 0.06419237, "auxiliary_loss_mlp": 0.0126329, "balance_loss_clip": 0.06276716, "balance_loss_mlp": 0.0125358, "epoch": 0.8404028257928754, "flos": 32684534277120.0, "grad_norm": 1.4009439774313484, "language_loss": 0.68542302, "learning_rate": 2.6120596650879043e-07, "loss": 0.76224828, "num_input_tokens_seen": 301533780, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09710693, "step": 13978, "time_per_iteration": 2.690227746963501 }, { "auxiliary_loss_clip": 0.06410163, "auxiliary_loss_mlp": 0.01265516, "balance_loss_clip": 0.06275354, "balance_loss_mlp": 0.01256027, "epoch": 0.8404629490455434, "flos": 16186127454720.0, "grad_norm": 1.6239465715535446, "language_loss": 0.78044647, "learning_rate": 2.610135609365145e-07, "loss": 0.8572033, "num_input_tokens_seen": 301551775, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.09484863, "step": 13979, "time_per_iteration": 3.8971409797668457 }, { "auxiliary_loss_clip": 0.06418702, "auxiliary_loss_mlp": 0.01265696, "balance_loss_clip": 0.06277162, "balance_loss_mlp": 0.01255695, "epoch": 0.8405230722982113, "flos": 15199731590400.0, "grad_norm": 1.8150562358629174, "language_loss": 0.78111708, "learning_rate": 2.60821221306778e-07, "loss": 0.85796106, "num_input_tokens_seen": 301570495, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09991455, "step": 13980, "time_per_iteration": 2.543429374694824 }, { "auxiliary_loss_clip": 0.06408946, "auxiliary_loss_mlp": 0.01269402, "balance_loss_clip": 0.06273728, "balance_loss_mlp": 0.01260282, "epoch": 0.8405831955508793, "flos": 27818787208320.0, "grad_norm": 2.202587349756622, "language_loss": 0.86941504, "learning_rate": 2.606289476268757e-07, "loss": 0.94619852, "num_input_tokens_seen": 301591705, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.09124756, "step": 13981, "time_per_iteration": 2.6214308738708496 }, { "auxiliary_loss_clip": 0.06415603, "auxiliary_loss_mlp": 0.01268124, "balance_loss_clip": 0.06277061, "balance_loss_mlp": 0.01258253, "epoch": 0.8406433188035473, "flos": 23776308103680.0, "grad_norm": 2.017395929747543, "language_loss": 0.67765725, "learning_rate": 2.6043673990409745e-07, "loss": 0.75449449, "num_input_tokens_seen": 301611670, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09875488, "step": 13982, "time_per_iteration": 2.568979263305664 }, { "auxiliary_loss_clip": 0.06418069, "auxiliary_loss_mlp": 0.01263037, "balance_loss_clip": 0.0627488, "balance_loss_mlp": 0.01252802, "epoch": 0.8407034420562153, "flos": 29213420964480.0, "grad_norm": 1.7211261441735035, "language_loss": 0.68482053, "learning_rate": 2.602445981457324e-07, "loss": 0.76163161, "num_input_tokens_seen": 301632540, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.10235596, "step": 13983, "time_per_iteration": 2.598759889602661 }, { "auxiliary_loss_clip": 0.06414188, "auxiliary_loss_mlp": 0.01263468, "balance_loss_clip": 0.06273313, "balance_loss_mlp": 0.012542, "epoch": 0.8407635653088832, "flos": 26367396710400.0, "grad_norm": 1.7819289795122972, "language_loss": 0.79227692, "learning_rate": 2.6005252235906684e-07, "loss": 0.86905348, "num_input_tokens_seen": 301651480, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.0927124, "step": 13984, "time_per_iteration": 2.6031911373138428 }, { "auxiliary_loss_clip": 0.06411003, "auxiliary_loss_mlp": 0.01264489, "balance_loss_clip": 0.06271861, "balance_loss_mlp": 0.01254804, "epoch": 0.8408236885615512, "flos": 21474927388800.0, "grad_norm": 1.886879277661292, "language_loss": 0.60707253, "learning_rate": 2.598605125513842e-07, "loss": 0.6838274, "num_input_tokens_seen": 301670010, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09686279, "step": 13985, "time_per_iteration": 2.5524723529815674 }, { "auxiliary_loss_clip": 0.06414166, "auxiliary_loss_mlp": 0.01265249, "balance_loss_clip": 0.0627252, "balance_loss_mlp": 0.01255372, "epoch": 0.8408838118142191, "flos": 22969936736640.0, "grad_norm": 2.241177426045833, "language_loss": 0.82092512, "learning_rate": 2.5966856872996467e-07, "loss": 0.89771926, "num_input_tokens_seen": 301689785, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09869385, "step": 13986, "time_per_iteration": 2.59757661819458 }, { "auxiliary_loss_clip": 0.06411547, "auxiliary_loss_mlp": 0.01264811, "balance_loss_clip": 0.0627252, "balance_loss_mlp": 0.01255489, "epoch": 0.8409439350668871, "flos": 26807765443200.0, "grad_norm": 1.6513896131751713, "language_loss": 0.66029322, "learning_rate": 2.5947669090208755e-07, "loss": 0.73705679, "num_input_tokens_seen": 301712225, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09320068, "step": 13987, "time_per_iteration": 2.6251726150512695 }, { "auxiliary_loss_clip": 0.06414717, "auxiliary_loss_mlp": 0.01268089, "balance_loss_clip": 0.06276439, "balance_loss_mlp": 0.01258916, "epoch": 0.841004058319555, "flos": 26585966885760.0, "grad_norm": 1.8779394746962856, "language_loss": 0.67890215, "learning_rate": 2.5928487907502906e-07, "loss": 0.75573015, "num_input_tokens_seen": 301730955, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.0916748, "step": 13988, "time_per_iteration": 2.636021852493286 }, { "auxiliary_loss_clip": 0.06415877, "auxiliary_loss_mlp": 0.01267547, "balance_loss_clip": 0.06273429, "balance_loss_mlp": 0.01256884, "epoch": 0.8410641815722231, "flos": 14507152467840.0, "grad_norm": 1.9786194111465534, "language_loss": 0.81533039, "learning_rate": 2.590931332560622e-07, "loss": 0.89216465, "num_input_tokens_seen": 301746930, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.10662842, "step": 13989, "time_per_iteration": 2.5002121925354004 }, { "auxiliary_loss_clip": 0.06421117, "auxiliary_loss_mlp": 0.01263729, "balance_loss_clip": 0.06278497, "balance_loss_mlp": 0.01253912, "epoch": 0.841124304824891, "flos": 29173994818560.0, "grad_norm": 2.086010726147049, "language_loss": 0.75486004, "learning_rate": 2.5890145345245826e-07, "loss": 0.83170843, "num_input_tokens_seen": 301766945, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.09814453, "step": 13990, "time_per_iteration": 2.64216685295105 }, { "auxiliary_loss_clip": 0.06406547, "auxiliary_loss_mlp": 0.0126858, "balance_loss_clip": 0.06270865, "balance_loss_mlp": 0.01258703, "epoch": 0.841184428077559, "flos": 22417410913920.0, "grad_norm": 1.7103470666626772, "language_loss": 0.80954111, "learning_rate": 2.5870983967148597e-07, "loss": 0.88629234, "num_input_tokens_seen": 301785460, "router_z_loss_clip": 1.35644531, "router_z_loss_mlp": 0.09881592, "step": 13991, "time_per_iteration": 2.549624443054199 }, { "auxiliary_loss_clip": 0.06412663, "auxiliary_loss_mlp": 0.01263906, "balance_loss_clip": 0.06273536, "balance_loss_mlp": 0.0125487, "epoch": 0.841244551330227, "flos": 22968846633600.0, "grad_norm": 2.121859411349284, "language_loss": 0.70400327, "learning_rate": 2.585182919204105e-07, "loss": 0.78076899, "num_input_tokens_seen": 301804180, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09033203, "step": 13992, "time_per_iteration": 2.619840621948242 }, { "auxiliary_loss_clip": 0.06413821, "auxiliary_loss_mlp": 0.01265152, "balance_loss_clip": 0.06272568, "balance_loss_mlp": 0.01255972, "epoch": 0.8413046745828949, "flos": 21039086776320.0, "grad_norm": 1.7949877448245224, "language_loss": 0.76721191, "learning_rate": 2.583268102064959e-07, "loss": 0.84400165, "num_input_tokens_seen": 301823670, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09179688, "step": 13993, "time_per_iteration": 2.5348730087280273 }, { "auxiliary_loss_clip": 0.06425073, "auxiliary_loss_mlp": 0.01267883, "balance_loss_clip": 0.06277283, "balance_loss_mlp": 0.01256863, "epoch": 0.841364797835563, "flos": 27059305000320.0, "grad_norm": 1.7849429319944607, "language_loss": 0.7422936, "learning_rate": 2.5813539453700393e-07, "loss": 0.81922317, "num_input_tokens_seen": 301845890, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.11016846, "step": 13994, "time_per_iteration": 2.640373945236206 }, { "auxiliary_loss_clip": 0.06409732, "auxiliary_loss_mlp": 0.01263483, "balance_loss_clip": 0.06274951, "balance_loss_mlp": 0.01254727, "epoch": 0.8414249210882309, "flos": 17901635621760.0, "grad_norm": 1.7756660974232796, "language_loss": 0.5942961, "learning_rate": 2.5794404491919163e-07, "loss": 0.6710282, "num_input_tokens_seen": 301863985, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.08752441, "step": 13995, "time_per_iteration": 2.525118112564087 }, { "auxiliary_loss_clip": 0.06416859, "auxiliary_loss_mlp": 0.01265013, "balance_loss_clip": 0.06277823, "balance_loss_mlp": 0.01255596, "epoch": 0.8414850443408989, "flos": 25447233098880.0, "grad_norm": 1.6821778636394824, "language_loss": 0.7203424, "learning_rate": 2.577527613603163e-07, "loss": 0.7971611, "num_input_tokens_seen": 301882765, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09417725, "step": 13996, "time_per_iteration": 2.726494550704956 }, { "auxiliary_loss_clip": 0.06415121, "auxiliary_loss_mlp": 0.01268581, "balance_loss_clip": 0.06275577, "balance_loss_mlp": 0.01259999, "epoch": 0.8415451675935668, "flos": 23226465611520.0, "grad_norm": 1.8193665102451055, "language_loss": 0.64093345, "learning_rate": 2.5756154386763017e-07, "loss": 0.71777052, "num_input_tokens_seen": 301902720, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.08581543, "step": 13997, "time_per_iteration": 2.5664255619049072 }, { "auxiliary_loss_clip": 0.0642138, "auxiliary_loss_mlp": 0.01266668, "balance_loss_clip": 0.06275736, "balance_loss_mlp": 0.01255844, "epoch": 0.8416052908462348, "flos": 18551560216320.0, "grad_norm": 1.7693399077954268, "language_loss": 0.82535964, "learning_rate": 2.5737039244838565e-07, "loss": 0.90224016, "num_input_tokens_seen": 301921245, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.10827637, "step": 13998, "time_per_iteration": 3.970432758331299 }, { "auxiliary_loss_clip": 0.06415649, "auxiliary_loss_mlp": 0.01267647, "balance_loss_clip": 0.06274579, "balance_loss_mlp": 0.01257067, "epoch": 0.8416654140989027, "flos": 26112544917120.0, "grad_norm": 1.4842194940904878, "language_loss": 0.80405855, "learning_rate": 2.5717930710982984e-07, "loss": 0.88089144, "num_input_tokens_seen": 301942320, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.10577393, "step": 13999, "time_per_iteration": 2.6458895206451416 }, { "auxiliary_loss_clip": 0.06420834, "auxiliary_loss_mlp": 0.01264088, "balance_loss_clip": 0.06277613, "balance_loss_mlp": 0.01253854, "epoch": 0.8417255373515707, "flos": 26440630778880.0, "grad_norm": 1.852069025890027, "language_loss": 0.66920471, "learning_rate": 2.569882878592096e-07, "loss": 0.74605387, "num_input_tokens_seen": 301963110, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10235596, "step": 14000, "time_per_iteration": 2.6096291542053223 }, { "auxiliary_loss_clip": 0.06418371, "auxiliary_loss_mlp": 0.01269116, "balance_loss_clip": 0.06274584, "balance_loss_mlp": 0.01258995, "epoch": 0.8417856606042387, "flos": 24724703341440.0, "grad_norm": 1.3658569891266186, "language_loss": 0.79493296, "learning_rate": 2.5679733470376885e-07, "loss": 0.87180781, "num_input_tokens_seen": 301984915, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.10107422, "step": 14001, "time_per_iteration": 2.596897840499878 }, { "auxiliary_loss_clip": 0.06412339, "auxiliary_loss_mlp": 0.01263564, "balance_loss_clip": 0.06273119, "balance_loss_mlp": 0.01254379, "epoch": 0.8418457838569067, "flos": 20857259416320.0, "grad_norm": 1.822161804418279, "language_loss": 0.78838098, "learning_rate": 2.5660644765074703e-07, "loss": 0.86514002, "num_input_tokens_seen": 302004095, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09191895, "step": 14002, "time_per_iteration": 4.0256476402282715 }, { "auxiliary_loss_clip": 0.06410971, "auxiliary_loss_mlp": 0.01264924, "balance_loss_clip": 0.06273591, "balance_loss_mlp": 0.01255262, "epoch": 0.8419059071095746, "flos": 28668651644160.0, "grad_norm": 1.3760590608331544, "language_loss": 0.78257424, "learning_rate": 2.5641562670738334e-07, "loss": 0.85933322, "num_input_tokens_seen": 302027250, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09661865, "step": 14003, "time_per_iteration": 2.620387554168701 }, { "auxiliary_loss_clip": 0.06413697, "auxiliary_loss_mlp": 0.01266679, "balance_loss_clip": 0.06273182, "balance_loss_mlp": 0.01257166, "epoch": 0.8419660303622426, "flos": 21660150839040.0, "grad_norm": 1.6154741853761043, "language_loss": 0.65834385, "learning_rate": 2.5622487188091436e-07, "loss": 0.7351476, "num_input_tokens_seen": 302046950, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09509277, "step": 14004, "time_per_iteration": 2.5853047370910645 }, { "auxiliary_loss_clip": 0.0642052, "auxiliary_loss_mlp": 0.01265094, "balance_loss_clip": 0.06276534, "balance_loss_mlp": 0.01254866, "epoch": 0.8420261536149106, "flos": 25308102193920.0, "grad_norm": 2.084451579074731, "language_loss": 0.76333892, "learning_rate": 2.560341831785724e-07, "loss": 0.84019506, "num_input_tokens_seen": 302065470, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.10235596, "step": 14005, "time_per_iteration": 2.596379518508911 }, { "auxiliary_loss_clip": 0.06420501, "auxiliary_loss_mlp": 0.0126627, "balance_loss_clip": 0.06277351, "balance_loss_mlp": 0.01256328, "epoch": 0.8420862768675785, "flos": 18768159820800.0, "grad_norm": 1.8097447821597874, "language_loss": 0.7780782, "learning_rate": 2.5584356060758906e-07, "loss": 0.85494578, "num_input_tokens_seen": 302083190, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.09936523, "step": 14006, "time_per_iteration": 2.6779637336730957 }, { "auxiliary_loss_clip": 0.0641403, "auxiliary_loss_mlp": 0.01261662, "balance_loss_clip": 0.06275031, "balance_loss_mlp": 0.01251976, "epoch": 0.8421464001202466, "flos": 18333157749120.0, "grad_norm": 1.8020161255331544, "language_loss": 0.77372122, "learning_rate": 2.556530041751932e-07, "loss": 0.85047817, "num_input_tokens_seen": 302098820, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09686279, "step": 14007, "time_per_iteration": 2.5500361919403076 }, { "auxiliary_loss_clip": 0.06415797, "auxiliary_loss_mlp": 0.01264537, "balance_loss_clip": 0.06273349, "balance_loss_mlp": 0.01254708, "epoch": 0.8422065233729145, "flos": 31544710387200.0, "grad_norm": 2.5124311867389233, "language_loss": 0.66266453, "learning_rate": 2.554625138886102e-07, "loss": 0.73946792, "num_input_tokens_seen": 302117075, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.09832764, "step": 14008, "time_per_iteration": 2.6887004375457764 }, { "auxiliary_loss_clip": 0.06315158, "auxiliary_loss_mlp": 0.01254667, "balance_loss_clip": 0.0625945, "balance_loss_mlp": 0.01253623, "epoch": 0.8422666466255825, "flos": 64316691999360.0, "grad_norm": 0.7072860970695984, "language_loss": 0.56779999, "learning_rate": 2.552720897550631e-07, "loss": 0.6434983, "num_input_tokens_seen": 302179735, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 0.01044464, "step": 14009, "time_per_iteration": 3.241641044616699 }, { "auxiliary_loss_clip": 0.06412525, "auxiliary_loss_mlp": 0.01264377, "balance_loss_clip": 0.06275851, "balance_loss_mlp": 0.01255526, "epoch": 0.8423267698782504, "flos": 24323676900480.0, "grad_norm": 1.198825002533116, "language_loss": 0.78274506, "learning_rate": 2.5508173178177304e-07, "loss": 0.85951406, "num_input_tokens_seen": 302202055, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.08850098, "step": 14010, "time_per_iteration": 2.6144256591796875 }, { "auxiliary_loss_clip": 0.06418002, "auxiliary_loss_mlp": 0.01266002, "balance_loss_clip": 0.06275383, "balance_loss_mlp": 0.01255113, "epoch": 0.8423868931309184, "flos": 18301949303040.0, "grad_norm": 1.9111376717205462, "language_loss": 0.72709966, "learning_rate": 2.548914399759592e-07, "loss": 0.8039397, "num_input_tokens_seen": 302221360, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.10894775, "step": 14011, "time_per_iteration": 2.5534169673919678 }, { "auxiliary_loss_clip": 0.06414528, "auxiliary_loss_mlp": 0.01265312, "balance_loss_clip": 0.06275057, "balance_loss_mlp": 0.01255847, "epoch": 0.8424470163835863, "flos": 23556983241600.0, "grad_norm": 1.8663316068976326, "language_loss": 0.84615213, "learning_rate": 2.5470121434483636e-07, "loss": 0.92295051, "num_input_tokens_seen": 302240715, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09460449, "step": 14012, "time_per_iteration": 2.591337203979492 }, { "auxiliary_loss_clip": 0.06403828, "auxiliary_loss_mlp": 0.01265639, "balance_loss_clip": 0.06272388, "balance_loss_mlp": 0.01256913, "epoch": 0.8425071396362543, "flos": 23776350030720.0, "grad_norm": 1.667268815839991, "language_loss": 0.67851508, "learning_rate": 2.5451105489561884e-07, "loss": 0.75520974, "num_input_tokens_seen": 302260950, "router_z_loss_clip": 1.31445312, "router_z_loss_mlp": 0.08721924, "step": 14013, "time_per_iteration": 4.01805567741394 }, { "auxiliary_loss_clip": 0.06419077, "auxiliary_loss_mlp": 0.01264944, "balance_loss_clip": 0.06274994, "balance_loss_mlp": 0.01254638, "epoch": 0.8425672628889223, "flos": 16184240737920.0, "grad_norm": 2.078777119526905, "language_loss": 0.78940761, "learning_rate": 2.5432096163551644e-07, "loss": 0.86624777, "num_input_tokens_seen": 302277500, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10296631, "step": 14014, "time_per_iteration": 2.5007858276367188 }, { "auxiliary_loss_clip": 0.06414527, "auxiliary_loss_mlp": 0.01263804, "balance_loss_clip": 0.06275138, "balance_loss_mlp": 0.01254339, "epoch": 0.8426273861415903, "flos": 23155872946560.0, "grad_norm": 1.6481941638081306, "language_loss": 0.67563915, "learning_rate": 2.5413093457173884e-07, "loss": 0.75242245, "num_input_tokens_seen": 302297930, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09466553, "step": 14015, "time_per_iteration": 2.585158348083496 }, { "auxiliary_loss_clip": 0.06416821, "auxiliary_loss_mlp": 0.0126464, "balance_loss_clip": 0.06276392, "balance_loss_mlp": 0.01254882, "epoch": 0.8426875093942582, "flos": 17463614803200.0, "grad_norm": 2.0514251953055886, "language_loss": 0.76421881, "learning_rate": 2.5394097371149036e-07, "loss": 0.84103346, "num_input_tokens_seen": 302315735, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09753418, "step": 14016, "time_per_iteration": 2.532362461090088 }, { "auxiliary_loss_clip": 0.06416042, "auxiliary_loss_mlp": 0.01268127, "balance_loss_clip": 0.0627594, "balance_loss_mlp": 0.01258, "epoch": 0.8427476326469262, "flos": 19645710831360.0, "grad_norm": 2.139433314909083, "language_loss": 0.79412675, "learning_rate": 2.5375107906197544e-07, "loss": 0.8709684, "num_input_tokens_seen": 302332790, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.10125732, "step": 14017, "time_per_iteration": 2.5503060817718506 }, { "auxiliary_loss_clip": 0.06415951, "auxiliary_loss_mlp": 0.01263138, "balance_loss_clip": 0.06274937, "balance_loss_mlp": 0.0125384, "epoch": 0.8428077558995941, "flos": 11944882465920.0, "grad_norm": 1.923728779402201, "language_loss": 0.62984693, "learning_rate": 2.5356125063039525e-07, "loss": 0.70663786, "num_input_tokens_seen": 302346490, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09301758, "step": 14018, "time_per_iteration": 3.9440603256225586 }, { "auxiliary_loss_clip": 0.06417286, "auxiliary_loss_mlp": 0.01268798, "balance_loss_clip": 0.0627638, "balance_loss_mlp": 0.01259744, "epoch": 0.8428678791522621, "flos": 10456287955200.0, "grad_norm": 1.7328168717640076, "language_loss": 0.79689658, "learning_rate": 2.5337148842394687e-07, "loss": 0.87375736, "num_input_tokens_seen": 302363235, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09057617, "step": 14019, "time_per_iteration": 2.568286418914795 }, { "auxiliary_loss_clip": 0.06417502, "auxiliary_loss_mlp": 0.0126702, "balance_loss_clip": 0.06275666, "balance_loss_mlp": 0.01257114, "epoch": 0.8429280024049302, "flos": 28774813167360.0, "grad_norm": 11.250057041054463, "language_loss": 0.7877425, "learning_rate": 2.531817924498265e-07, "loss": 0.86458766, "num_input_tokens_seen": 302383270, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.09899902, "step": 14020, "time_per_iteration": 2.6609702110290527 }, { "auxiliary_loss_clip": 0.0641517, "auxiliary_loss_mlp": 0.0126464, "balance_loss_clip": 0.06274867, "balance_loss_mlp": 0.01255014, "epoch": 0.8429881256575981, "flos": 19543238887680.0, "grad_norm": 1.9436177089034994, "language_loss": 0.71578217, "learning_rate": 2.5299216271522805e-07, "loss": 0.79258025, "num_input_tokens_seen": 302401355, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09625244, "step": 14021, "time_per_iteration": 2.5667941570281982 }, { "auxiliary_loss_clip": 0.06418814, "auxiliary_loss_mlp": 0.01269315, "balance_loss_clip": 0.0627576, "balance_loss_mlp": 0.01259522, "epoch": 0.8430482489102661, "flos": 24797937409920.0, "grad_norm": 1.6397286455276734, "language_loss": 0.69825208, "learning_rate": 2.5280259922734125e-07, "loss": 0.77513337, "num_input_tokens_seen": 302419515, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.09790039, "step": 14022, "time_per_iteration": 2.6038718223571777 }, { "auxiliary_loss_clip": 0.0641896, "auxiliary_loss_mlp": 0.01265677, "balance_loss_clip": 0.06275927, "balance_loss_mlp": 0.01255694, "epoch": 0.843108372162934, "flos": 21550802860800.0, "grad_norm": 1.7452094076641766, "language_loss": 0.72396976, "learning_rate": 2.526131019933553e-07, "loss": 0.80081612, "num_input_tokens_seen": 302438280, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.09991455, "step": 14023, "time_per_iteration": 2.5536186695098877 }, { "auxiliary_loss_clip": 0.06414658, "auxiliary_loss_mlp": 0.0126305, "balance_loss_clip": 0.06275694, "balance_loss_mlp": 0.0125303, "epoch": 0.843168495415602, "flos": 24615816560640.0, "grad_norm": 1.529288284949878, "language_loss": 0.67086905, "learning_rate": 2.524236710204559e-07, "loss": 0.74764609, "num_input_tokens_seen": 302460860, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.10015869, "step": 14024, "time_per_iteration": 2.7075839042663574 }, { "auxiliary_loss_clip": 0.06412241, "auxiliary_loss_mlp": 0.01263893, "balance_loss_clip": 0.06274849, "balance_loss_mlp": 0.01253898, "epoch": 0.8432286186682699, "flos": 15128216530560.0, "grad_norm": 2.1222844421037874, "language_loss": 0.81164533, "learning_rate": 2.522343063158261e-07, "loss": 0.88840669, "num_input_tokens_seen": 302476980, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.10003662, "step": 14025, "time_per_iteration": 2.5832924842834473 }, { "auxiliary_loss_clip": 0.06412647, "auxiliary_loss_mlp": 0.01261884, "balance_loss_clip": 0.06275874, "balance_loss_mlp": 0.01253515, "epoch": 0.843288741920938, "flos": 20307920048640.0, "grad_norm": 1.42959737687503, "language_loss": 0.78083402, "learning_rate": 2.5204500788664606e-07, "loss": 0.85757935, "num_input_tokens_seen": 302496380, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.08355713, "step": 14026, "time_per_iteration": 2.644418716430664 }, { "auxiliary_loss_clip": 0.06410515, "auxiliary_loss_mlp": 0.01264196, "balance_loss_clip": 0.06273896, "balance_loss_mlp": 0.01254791, "epoch": 0.8433488651736059, "flos": 23338958117760.0, "grad_norm": 2.5640281803283465, "language_loss": 0.8250078, "learning_rate": 2.518557757400945e-07, "loss": 0.90175486, "num_input_tokens_seen": 302516845, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09405518, "step": 14027, "time_per_iteration": 2.5842676162719727 }, { "auxiliary_loss_clip": 0.06414527, "auxiliary_loss_mlp": 0.01262309, "balance_loss_clip": 0.0627594, "balance_loss_mlp": 0.01253053, "epoch": 0.8434089884262739, "flos": 39467546945280.0, "grad_norm": 1.4634235932060973, "language_loss": 0.56549466, "learning_rate": 2.5166660988334754e-07, "loss": 0.64226305, "num_input_tokens_seen": 302538865, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.0925293, "step": 14028, "time_per_iteration": 2.7246592044830322 }, { "auxiliary_loss_clip": 0.06415097, "auxiliary_loss_mlp": 0.01267794, "balance_loss_clip": 0.06276703, "balance_loss_mlp": 0.01258633, "epoch": 0.8434691116789418, "flos": 23775595344000.0, "grad_norm": 2.2794563602041404, "language_loss": 0.63927114, "learning_rate": 2.51477510323578e-07, "loss": 0.71610004, "num_input_tokens_seen": 302557970, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09155273, "step": 14029, "time_per_iteration": 2.5560572147369385 }, { "auxiliary_loss_clip": 0.06409878, "auxiliary_loss_mlp": 0.01264623, "balance_loss_clip": 0.0627468, "balance_loss_mlp": 0.01255688, "epoch": 0.8435292349316098, "flos": 22677503587200.0, "grad_norm": 1.596935546531201, "language_loss": 0.75348312, "learning_rate": 2.51288477067956e-07, "loss": 0.83022809, "num_input_tokens_seen": 302578915, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.08935547, "step": 14030, "time_per_iteration": 2.605057716369629 }, { "auxiliary_loss_clip": 0.06410868, "auxiliary_loss_mlp": 0.01268494, "balance_loss_clip": 0.06274232, "balance_loss_mlp": 0.01259118, "epoch": 0.8435893581842777, "flos": 18849611589120.0, "grad_norm": 1.7459373530647597, "language_loss": 0.83658469, "learning_rate": 2.510995101236502e-07, "loss": 0.91337824, "num_input_tokens_seen": 302596300, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09381104, "step": 14031, "time_per_iteration": 2.5375075340270996 }, { "auxiliary_loss_clip": 0.0641164, "auxiliary_loss_mlp": 0.01262935, "balance_loss_clip": 0.06275156, "balance_loss_mlp": 0.01253776, "epoch": 0.8436494814369457, "flos": 20710497790080.0, "grad_norm": 1.7287928088585482, "language_loss": 0.80411255, "learning_rate": 2.509106094978266e-07, "loss": 0.8808583, "num_input_tokens_seen": 302614975, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.09158325, "step": 14032, "time_per_iteration": 2.573421001434326 }, { "auxiliary_loss_clip": 0.06414919, "auxiliary_loss_mlp": 0.0126693, "balance_loss_clip": 0.0627393, "balance_loss_mlp": 0.01256547, "epoch": 0.8437096046896138, "flos": 22680731969280.0, "grad_norm": 1.3400330239889766, "language_loss": 0.75587708, "learning_rate": 2.507217751976478e-07, "loss": 0.83269548, "num_input_tokens_seen": 302636415, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.1038208, "step": 14033, "time_per_iteration": 2.549929618835449 }, { "auxiliary_loss_clip": 0.06414617, "auxiliary_loss_mlp": 0.01268031, "balance_loss_clip": 0.06274105, "balance_loss_mlp": 0.01259126, "epoch": 0.8437697279422817, "flos": 16185666257280.0, "grad_norm": 1.7272594497007936, "language_loss": 0.83698404, "learning_rate": 2.505330072302743e-07, "loss": 0.91381049, "num_input_tokens_seen": 302653605, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.08898926, "step": 14034, "time_per_iteration": 2.546165704727173 }, { "auxiliary_loss_clip": 0.06415005, "auxiliary_loss_mlp": 0.01263609, "balance_loss_clip": 0.06275155, "balance_loss_mlp": 0.01253631, "epoch": 0.8438298511949497, "flos": 28773178012800.0, "grad_norm": 1.3923039813039397, "language_loss": 0.78336108, "learning_rate": 2.503443056028656e-07, "loss": 0.86014724, "num_input_tokens_seen": 302673965, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09973145, "step": 14035, "time_per_iteration": 2.587204933166504 }, { "auxiliary_loss_clip": 0.06415467, "auxiliary_loss_mlp": 0.01263589, "balance_loss_clip": 0.06275962, "balance_loss_mlp": 0.01254362, "epoch": 0.8438899744476176, "flos": 33731837660160.0, "grad_norm": 1.5682095446612523, "language_loss": 0.72419667, "learning_rate": 2.501556703225751e-07, "loss": 0.80098724, "num_input_tokens_seen": 302695560, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09228516, "step": 14036, "time_per_iteration": 2.672020435333252 }, { "auxiliary_loss_clip": 0.06410711, "auxiliary_loss_mlp": 0.01262013, "balance_loss_clip": 0.06275418, "balance_loss_mlp": 0.01253752, "epoch": 0.8439500977002856, "flos": 25116421979520.0, "grad_norm": 1.6409431457907073, "language_loss": 0.70020366, "learning_rate": 2.49967101396557e-07, "loss": 0.77693093, "num_input_tokens_seen": 302713480, "router_z_loss_clip": 1.35253906, "router_z_loss_mlp": 0.08258057, "step": 14037, "time_per_iteration": 4.103602647781372 }, { "auxiliary_loss_clip": 0.06412596, "auxiliary_loss_mlp": 0.01265702, "balance_loss_clip": 0.06275013, "balance_loss_mlp": 0.01256928, "epoch": 0.8440102209529535, "flos": 32858060083200.0, "grad_norm": 1.6149699072839883, "language_loss": 0.69033432, "learning_rate": 2.4977859883196227e-07, "loss": 0.76711726, "num_input_tokens_seen": 302736860, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.08776855, "step": 14038, "time_per_iteration": 2.6382055282592773 }, { "auxiliary_loss_clip": 0.0641728, "auxiliary_loss_mlp": 0.01267081, "balance_loss_clip": 0.06275566, "balance_loss_mlp": 0.01257717, "epoch": 0.8440703442056215, "flos": 23736588468480.0, "grad_norm": 1.5979306465085221, "language_loss": 0.76349288, "learning_rate": 2.49590162635938e-07, "loss": 0.8403365, "num_input_tokens_seen": 302757745, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.09356689, "step": 14039, "time_per_iteration": 2.590860366821289 }, { "auxiliary_loss_clip": 0.06424356, "auxiliary_loss_mlp": 0.01262303, "balance_loss_clip": 0.06279033, "balance_loss_mlp": 0.01252337, "epoch": 0.8441304674582895, "flos": 20199955662720.0, "grad_norm": 1.974258689357277, "language_loss": 0.79824007, "learning_rate": 2.4940179281563046e-07, "loss": 0.87510663, "num_input_tokens_seen": 302774885, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.09967041, "step": 14040, "time_per_iteration": 2.573317766189575 }, { "auxiliary_loss_clip": 0.06413785, "auxiliary_loss_mlp": 0.01267776, "balance_loss_clip": 0.06274743, "balance_loss_mlp": 0.01257989, "epoch": 0.8441905907109575, "flos": 20224413855360.0, "grad_norm": 2.1396324283213772, "language_loss": 0.69615763, "learning_rate": 2.492134893781821e-07, "loss": 0.77297318, "num_input_tokens_seen": 302791035, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09790039, "step": 14041, "time_per_iteration": 2.5910048484802246 }, { "auxiliary_loss_clip": 0.06417377, "auxiliary_loss_mlp": 0.01266033, "balance_loss_clip": 0.06275186, "balance_loss_mlp": 0.01256151, "epoch": 0.8442507139636254, "flos": 13521511290240.0, "grad_norm": 1.7902754169313602, "language_loss": 0.69374663, "learning_rate": 2.490252523307341e-07, "loss": 0.77058071, "num_input_tokens_seen": 302808650, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09887695, "step": 14042, "time_per_iteration": 4.089637994766235 }, { "auxiliary_loss_clip": 0.06411648, "auxiliary_loss_mlp": 0.01265505, "balance_loss_clip": 0.06274858, "balance_loss_mlp": 0.01256231, "epoch": 0.8443108372162934, "flos": 18225570706560.0, "grad_norm": 1.677270610757823, "language_loss": 0.75462621, "learning_rate": 2.4883708168042373e-07, "loss": 0.83139777, "num_input_tokens_seen": 302824605, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09265137, "step": 14043, "time_per_iteration": 2.618328094482422 }, { "auxiliary_loss_clip": 0.06412156, "auxiliary_loss_mlp": 0.01263528, "balance_loss_clip": 0.06273052, "balance_loss_mlp": 0.01253925, "epoch": 0.8443709604689613, "flos": 16110293909760.0, "grad_norm": 2.157426318083925, "language_loss": 0.72612697, "learning_rate": 2.486489774343865e-07, "loss": 0.80288386, "num_input_tokens_seen": 302840170, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09606934, "step": 14044, "time_per_iteration": 2.5694377422332764 }, { "auxiliary_loss_clip": 0.06408851, "auxiliary_loss_mlp": 0.01262715, "balance_loss_clip": 0.06272385, "balance_loss_mlp": 0.01253756, "epoch": 0.8444310837216293, "flos": 18517542658560.0, "grad_norm": 1.5132738171786806, "language_loss": 0.7527414, "learning_rate": 2.484609395997559e-07, "loss": 0.82945704, "num_input_tokens_seen": 302858320, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.08966064, "step": 14045, "time_per_iteration": 2.5377323627471924 }, { "auxiliary_loss_clip": 0.06413536, "auxiliary_loss_mlp": 0.01266127, "balance_loss_clip": 0.06275082, "balance_loss_mlp": 0.01256948, "epoch": 0.8444912069742974, "flos": 14945215213440.0, "grad_norm": 1.6807612311471705, "language_loss": 0.78524393, "learning_rate": 2.4827296818366216e-07, "loss": 0.86204058, "num_input_tokens_seen": 302875255, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09179688, "step": 14046, "time_per_iteration": 2.5271975994110107 }, { "auxiliary_loss_clip": 0.06414462, "auxiliary_loss_mlp": 0.01266108, "balance_loss_clip": 0.06274101, "balance_loss_mlp": 0.01255803, "epoch": 0.8445513302269653, "flos": 20126470032000.0, "grad_norm": 2.629680916383611, "language_loss": 0.78474361, "learning_rate": 2.4808506319323255e-07, "loss": 0.86154926, "num_input_tokens_seen": 302894690, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.10302734, "step": 14047, "time_per_iteration": 2.6089813709259033 }, { "auxiliary_loss_clip": 0.06414671, "auxiliary_loss_mlp": 0.01267038, "balance_loss_clip": 0.0627616, "balance_loss_mlp": 0.01257674, "epoch": 0.8446114534796333, "flos": 31178162701440.0, "grad_norm": 1.7610286888618853, "language_loss": 0.72457349, "learning_rate": 2.478972246355935e-07, "loss": 0.80139059, "num_input_tokens_seen": 302912405, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09356689, "step": 14048, "time_per_iteration": 2.633047580718994 }, { "auxiliary_loss_clip": 0.0641534, "auxiliary_loss_mlp": 0.01264874, "balance_loss_clip": 0.06276774, "balance_loss_mlp": 0.01255557, "epoch": 0.8446715767323012, "flos": 23954613592320.0, "grad_norm": 1.4034408895325974, "language_loss": 0.73224622, "learning_rate": 2.477094525178667e-07, "loss": 0.80904835, "num_input_tokens_seen": 302932525, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09320068, "step": 14049, "time_per_iteration": 2.568056344985962 }, { "auxiliary_loss_clip": 0.06319219, "auxiliary_loss_mlp": 0.01254221, "balance_loss_clip": 0.06263451, "balance_loss_mlp": 0.01253086, "epoch": 0.8447316999849692, "flos": 68004362989440.0, "grad_norm": 0.7915591813535062, "language_loss": 0.60576051, "learning_rate": 2.475217468471729e-07, "loss": 0.68149495, "num_input_tokens_seen": 302991285, "router_z_loss_clip": 0.55615234, "router_z_loss_mlp": 0.0113678, "step": 14050, "time_per_iteration": 3.1268954277038574 }, { "auxiliary_loss_clip": 0.06413335, "auxiliary_loss_mlp": 0.01264971, "balance_loss_clip": 0.06274705, "balance_loss_mlp": 0.01254654, "epoch": 0.8447918232376371, "flos": 22425460905600.0, "grad_norm": 2.1957621128743408, "language_loss": 0.72304171, "learning_rate": 2.473341076306303e-07, "loss": 0.79982477, "num_input_tokens_seen": 303009515, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.10321045, "step": 14051, "time_per_iteration": 2.584775686264038 }, { "auxiliary_loss_clip": 0.0640948, "auxiliary_loss_mlp": 0.01266595, "balance_loss_clip": 0.06272162, "balance_loss_mlp": 0.01257446, "epoch": 0.8448519464903052, "flos": 23700600339840.0, "grad_norm": 1.7699842949373188, "language_loss": 0.74811971, "learning_rate": 2.471465348753547e-07, "loss": 0.82488042, "num_input_tokens_seen": 303026905, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09155273, "step": 14052, "time_per_iteration": 4.071548223495483 }, { "auxiliary_loss_clip": 0.06407554, "auxiliary_loss_mlp": 0.01267477, "balance_loss_clip": 0.06275551, "balance_loss_mlp": 0.01258679, "epoch": 0.8449120697429731, "flos": 13741087714560.0, "grad_norm": 1.7650786795579791, "language_loss": 0.74253547, "learning_rate": 2.469590285884575e-07, "loss": 0.81928581, "num_input_tokens_seen": 303045245, "router_z_loss_clip": 1.32128906, "router_z_loss_mlp": 0.08795166, "step": 14053, "time_per_iteration": 2.5663628578186035 }, { "auxiliary_loss_clip": 0.06414659, "auxiliary_loss_mlp": 0.01266605, "balance_loss_clip": 0.06276993, "balance_loss_mlp": 0.0125751, "epoch": 0.8449721929956411, "flos": 20893121763840.0, "grad_norm": 1.8064843345935901, "language_loss": 0.73834622, "learning_rate": 2.467715887770494e-07, "loss": 0.81515884, "num_input_tokens_seen": 303065205, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09094238, "step": 14054, "time_per_iteration": 2.5487799644470215 }, { "auxiliary_loss_clip": 0.06421559, "auxiliary_loss_mlp": 0.01262575, "balance_loss_clip": 0.0627528, "balance_loss_mlp": 0.01252764, "epoch": 0.845032316248309, "flos": 33224985112320.0, "grad_norm": 1.4821862856791028, "language_loss": 0.78446293, "learning_rate": 2.4658421544823895e-07, "loss": 0.86130422, "num_input_tokens_seen": 303088250, "router_z_loss_clip": 1.46289062, "router_z_loss_mlp": 0.0980835, "step": 14055, "time_per_iteration": 2.6990082263946533 }, { "auxiliary_loss_clip": 0.06407236, "auxiliary_loss_mlp": 0.01266166, "balance_loss_clip": 0.06270271, "balance_loss_mlp": 0.01257118, "epoch": 0.845092439500977, "flos": 23591755486080.0, "grad_norm": 1.7776442518810005, "language_loss": 0.73093343, "learning_rate": 2.463969086091302e-07, "loss": 0.80766749, "num_input_tokens_seen": 303109280, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.0904541, "step": 14056, "time_per_iteration": 2.5821306705474854 }, { "auxiliary_loss_clip": 0.06421112, "auxiliary_loss_mlp": 0.01265217, "balance_loss_clip": 0.06275417, "balance_loss_mlp": 0.01254798, "epoch": 0.8451525627536449, "flos": 13338929243520.0, "grad_norm": 2.2861950597782617, "language_loss": 0.68066967, "learning_rate": 2.4620966826682686e-07, "loss": 0.75753295, "num_input_tokens_seen": 303126075, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10412598, "step": 14057, "time_per_iteration": 2.527601480484009 }, { "auxiliary_loss_clip": 0.06412806, "auxiliary_loss_mlp": 0.01263514, "balance_loss_clip": 0.06272186, "balance_loss_mlp": 0.01253668, "epoch": 0.8452126860063129, "flos": 27825285899520.0, "grad_norm": 1.7193044280825605, "language_loss": 0.77662611, "learning_rate": 2.460224944284284e-07, "loss": 0.85338932, "num_input_tokens_seen": 303146920, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09844971, "step": 14058, "time_per_iteration": 4.027530670166016 }, { "auxiliary_loss_clip": 0.06416088, "auxiliary_loss_mlp": 0.0126439, "balance_loss_clip": 0.06275448, "balance_loss_mlp": 0.01254829, "epoch": 0.845272809258981, "flos": 27131868236160.0, "grad_norm": 1.3952545533588225, "language_loss": 0.69574571, "learning_rate": 2.45835387101033e-07, "loss": 0.77255046, "num_input_tokens_seen": 303167885, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09558105, "step": 14059, "time_per_iteration": 2.5787649154663086 }, { "auxiliary_loss_clip": 0.06423339, "auxiliary_loss_mlp": 0.01265401, "balance_loss_clip": 0.06277311, "balance_loss_mlp": 0.01254917, "epoch": 0.8453329325116489, "flos": 18338440556160.0, "grad_norm": 1.9791742561543222, "language_loss": 0.57974243, "learning_rate": 2.4564834629173516e-07, "loss": 0.6566298, "num_input_tokens_seen": 303185000, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10479736, "step": 14060, "time_per_iteration": 2.514842987060547 }, { "auxiliary_loss_clip": 0.06420592, "auxiliary_loss_mlp": 0.01265928, "balance_loss_clip": 0.06275606, "balance_loss_mlp": 0.01254621, "epoch": 0.8453930557643169, "flos": 22681989780480.0, "grad_norm": 1.5801535807952485, "language_loss": 0.75916994, "learning_rate": 2.454613720076277e-07, "loss": 0.83603513, "num_input_tokens_seen": 303205210, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.11303711, "step": 14061, "time_per_iteration": 2.5564684867858887 }, { "auxiliary_loss_clip": 0.06416095, "auxiliary_loss_mlp": 0.0126448, "balance_loss_clip": 0.06271911, "balance_loss_mlp": 0.01254187, "epoch": 0.8454531790169848, "flos": 22493034823680.0, "grad_norm": 2.099208590799927, "language_loss": 0.70778084, "learning_rate": 2.452744642558013e-07, "loss": 0.78458655, "num_input_tokens_seen": 303224655, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10302734, "step": 14062, "time_per_iteration": 2.5571212768554688 }, { "auxiliary_loss_clip": 0.06315921, "auxiliary_loss_mlp": 0.01252568, "balance_loss_clip": 0.06260198, "balance_loss_mlp": 0.01251484, "epoch": 0.8455133022696528, "flos": 58295383672320.0, "grad_norm": 0.6259121318341521, "language_loss": 0.52644992, "learning_rate": 2.450876230433432e-07, "loss": 0.60213482, "num_input_tokens_seen": 303289645, "router_z_loss_clip": 0.55761719, "router_z_loss_mlp": 0.01085663, "step": 14063, "time_per_iteration": 3.2320079803466797 }, { "auxiliary_loss_clip": 0.06407285, "auxiliary_loss_mlp": 0.01266561, "balance_loss_clip": 0.06272434, "balance_loss_mlp": 0.0125793, "epoch": 0.8455734255223207, "flos": 21367717689600.0, "grad_norm": 1.9176063806122419, "language_loss": 0.82673013, "learning_rate": 2.449008483773378e-07, "loss": 0.90346861, "num_input_tokens_seen": 303308350, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.08630371, "step": 14064, "time_per_iteration": 2.547774076461792 }, { "auxiliary_loss_clip": 0.06423263, "auxiliary_loss_mlp": 0.0126454, "balance_loss_clip": 0.06280117, "balance_loss_mlp": 0.01254539, "epoch": 0.8456335487749888, "flos": 20455562142720.0, "grad_norm": 2.139416471713105, "language_loss": 0.72699553, "learning_rate": 2.447141402648685e-07, "loss": 0.80387354, "num_input_tokens_seen": 303325230, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.09991455, "step": 14065, "time_per_iteration": 2.5458056926727295 }, { "auxiliary_loss_clip": 0.06408454, "auxiliary_loss_mlp": 0.01264152, "balance_loss_clip": 0.06272522, "balance_loss_mlp": 0.01255783, "epoch": 0.8456936720276567, "flos": 28848592287360.0, "grad_norm": 1.4203644652216647, "language_loss": 0.77456367, "learning_rate": 2.445274987130146e-07, "loss": 0.85128975, "num_input_tokens_seen": 303345810, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.0836792, "step": 14066, "time_per_iteration": 2.6179983615875244 }, { "auxiliary_loss_clip": 0.06419183, "auxiliary_loss_mlp": 0.01265017, "balance_loss_clip": 0.06279752, "balance_loss_mlp": 0.01255093, "epoch": 0.8457537952803247, "flos": 22679222595840.0, "grad_norm": 1.4919626782410838, "language_loss": 0.69768023, "learning_rate": 2.4434092372885363e-07, "loss": 0.77452219, "num_input_tokens_seen": 303365140, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09918213, "step": 14067, "time_per_iteration": 2.5671629905700684 }, { "auxiliary_loss_clip": 0.0641208, "auxiliary_loss_mlp": 0.01265085, "balance_loss_clip": 0.06275162, "balance_loss_mlp": 0.01255154, "epoch": 0.8458139185329926, "flos": 33811444638720.0, "grad_norm": 1.919842128473789, "language_loss": 0.70935118, "learning_rate": 2.4415441531946144e-07, "loss": 0.7861228, "num_input_tokens_seen": 303386150, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.0993042, "step": 14068, "time_per_iteration": 2.6604673862457275 }, { "auxiliary_loss_clip": 0.06314332, "auxiliary_loss_mlp": 0.01254612, "balance_loss_clip": 0.06258714, "balance_loss_mlp": 0.01253514, "epoch": 0.8458740417856606, "flos": 70317860618880.0, "grad_norm": 0.6809565027727057, "language_loss": 0.60444099, "learning_rate": 2.4396797349190976e-07, "loss": 0.68013042, "num_input_tokens_seen": 303453770, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01100159, "step": 14069, "time_per_iteration": 3.2878968715667725 }, { "auxiliary_loss_clip": 0.06417592, "auxiliary_loss_mlp": 0.01263099, "balance_loss_clip": 0.06276701, "balance_loss_mlp": 0.01253985, "epoch": 0.8459341650383285, "flos": 24177795742080.0, "grad_norm": 1.4172532426782916, "language_loss": 0.74759662, "learning_rate": 2.4378159825326804e-07, "loss": 0.82440352, "num_input_tokens_seen": 303474520, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09118652, "step": 14070, "time_per_iteration": 2.568695068359375 }, { "auxiliary_loss_clip": 0.06412966, "auxiliary_loss_mlp": 0.01265174, "balance_loss_clip": 0.06274235, "balance_loss_mlp": 0.01255065, "epoch": 0.8459942882909965, "flos": 38190395013120.0, "grad_norm": 1.667496448592589, "language_loss": 0.66655165, "learning_rate": 2.435952896106039e-07, "loss": 0.74333304, "num_input_tokens_seen": 303497345, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.10101318, "step": 14071, "time_per_iteration": 2.7015151977539062 }, { "auxiliary_loss_clip": 0.06317912, "auxiliary_loss_mlp": 0.01253176, "balance_loss_clip": 0.06262139, "balance_loss_mlp": 0.01251958, "epoch": 0.8460544115436646, "flos": 64137212553600.0, "grad_norm": 0.7236586009256984, "language_loss": 0.61077249, "learning_rate": 2.4340904757098313e-07, "loss": 0.68648338, "num_input_tokens_seen": 303554890, "router_z_loss_clip": 0.55810547, "router_z_loss_mlp": 0.01216888, "step": 14072, "time_per_iteration": 3.039991855621338 }, { "auxiliary_loss_clip": 0.06416904, "auxiliary_loss_mlp": 0.01264996, "balance_loss_clip": 0.06274531, "balance_loss_mlp": 0.0125475, "epoch": 0.8461145347963325, "flos": 24177753815040.0, "grad_norm": 1.591410757293458, "language_loss": 0.72690117, "learning_rate": 2.4322287214146664e-07, "loss": 0.80372024, "num_input_tokens_seen": 303574380, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.10241699, "step": 14073, "time_per_iteration": 2.598400115966797 }, { "auxiliary_loss_clip": 0.06428108, "auxiliary_loss_mlp": 0.01266377, "balance_loss_clip": 0.06279176, "balance_loss_mlp": 0.01255052, "epoch": 0.8461746580490005, "flos": 34901863747200.0, "grad_norm": 1.9114771360565423, "language_loss": 0.78185457, "learning_rate": 2.430367633291155e-07, "loss": 0.8587994, "num_input_tokens_seen": 303594910, "router_z_loss_clip": 1.49023438, "router_z_loss_mlp": 0.11322021, "step": 14074, "time_per_iteration": 2.710479736328125 }, { "auxiliary_loss_clip": 0.06416339, "auxiliary_loss_mlp": 0.01263955, "balance_loss_clip": 0.06278087, "balance_loss_mlp": 0.01254102, "epoch": 0.8462347813016684, "flos": 25564127944320.0, "grad_norm": 1.9204793154925286, "language_loss": 0.75694275, "learning_rate": 2.4285072114098583e-07, "loss": 0.83374578, "num_input_tokens_seen": 303613520, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09857178, "step": 14075, "time_per_iteration": 2.586520195007324 }, { "auxiliary_loss_clip": 0.06416322, "auxiliary_loss_mlp": 0.01265899, "balance_loss_clip": 0.06277838, "balance_loss_mlp": 0.01256309, "epoch": 0.8462949045543364, "flos": 21331855342080.0, "grad_norm": 2.3470017177830362, "language_loss": 0.73094606, "learning_rate": 2.4266474558413355e-07, "loss": 0.80776829, "num_input_tokens_seen": 303631225, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09594727, "step": 14076, "time_per_iteration": 2.575617551803589 }, { "auxiliary_loss_clip": 0.06424204, "auxiliary_loss_mlp": 0.01266695, "balance_loss_clip": 0.06279975, "balance_loss_mlp": 0.01256408, "epoch": 0.8463550278070043, "flos": 22643947226880.0, "grad_norm": 7.565001637683039, "language_loss": 0.77761036, "learning_rate": 2.4247883666560945e-07, "loss": 0.85451937, "num_input_tokens_seen": 303649175, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10290527, "step": 14077, "time_per_iteration": 3.9842045307159424 }, { "auxiliary_loss_clip": 0.0642177, "auxiliary_loss_mlp": 0.01266074, "balance_loss_clip": 0.06278278, "balance_loss_mlp": 0.0125615, "epoch": 0.8464151510596724, "flos": 13010549892480.0, "grad_norm": 1.9597022469678913, "language_loss": 0.7553798, "learning_rate": 2.422929943924643e-07, "loss": 0.83225822, "num_input_tokens_seen": 303665915, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.09912109, "step": 14078, "time_per_iteration": 2.5619919300079346 }, { "auxiliary_loss_clip": 0.06410495, "auxiliary_loss_mlp": 0.01266222, "balance_loss_clip": 0.06273215, "balance_loss_mlp": 0.01256131, "epoch": 0.8464752743123403, "flos": 15710231790720.0, "grad_norm": 2.1537525366271133, "language_loss": 0.85069388, "learning_rate": 2.4210721877174565e-07, "loss": 0.92746109, "num_input_tokens_seen": 303679985, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.10095215, "step": 14079, "time_per_iteration": 2.5037665367126465 }, { "auxiliary_loss_clip": 0.0643018, "auxiliary_loss_mlp": 0.01264207, "balance_loss_clip": 0.06280546, "balance_loss_mlp": 0.01253275, "epoch": 0.8465353975650083, "flos": 21660570109440.0, "grad_norm": 2.4385555024715977, "language_loss": 0.58987963, "learning_rate": 2.419215098104965e-07, "loss": 0.66682351, "num_input_tokens_seen": 303698470, "router_z_loss_clip": 1.49316406, "router_z_loss_mlp": 0.109375, "step": 14080, "time_per_iteration": 2.538637399673462 }, { "auxiliary_loss_clip": 0.06422727, "auxiliary_loss_mlp": 0.012648, "balance_loss_clip": 0.06276894, "balance_loss_mlp": 0.01254679, "epoch": 0.8465955208176762, "flos": 18521651508480.0, "grad_norm": 1.9701950159460688, "language_loss": 0.6645689, "learning_rate": 2.4173586751576014e-07, "loss": 0.74144411, "num_input_tokens_seen": 303716415, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.10119629, "step": 14081, "time_per_iteration": 2.5286948680877686 }, { "auxiliary_loss_clip": 0.06418839, "auxiliary_loss_mlp": 0.01264322, "balance_loss_clip": 0.06276719, "balance_loss_mlp": 0.0125516, "epoch": 0.8466556440703442, "flos": 24206362784640.0, "grad_norm": 1.9505478680416004, "language_loss": 0.73211455, "learning_rate": 2.41550291894576e-07, "loss": 0.80894613, "num_input_tokens_seen": 303734490, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.09155273, "step": 14082, "time_per_iteration": 4.056720018386841 }, { "auxiliary_loss_clip": 0.06419756, "auxiliary_loss_mlp": 0.01263564, "balance_loss_clip": 0.062765, "balance_loss_mlp": 0.01253908, "epoch": 0.8467157673230121, "flos": 20382118439040.0, "grad_norm": 1.802477971288278, "language_loss": 0.75960195, "learning_rate": 2.413647829539809e-07, "loss": 0.83643508, "num_input_tokens_seen": 303752310, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.09649658, "step": 14083, "time_per_iteration": 2.5761592388153076 }, { "auxiliary_loss_clip": 0.0642163, "auxiliary_loss_mlp": 0.012679, "balance_loss_clip": 0.06275894, "balance_loss_mlp": 0.01257618, "epoch": 0.8467758905756801, "flos": 28480870644480.0, "grad_norm": 1.809207896732485, "language_loss": 0.66150486, "learning_rate": 2.411793407010092e-07, "loss": 0.73840022, "num_input_tokens_seen": 303776065, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.10284424, "step": 14084, "time_per_iteration": 2.672759771347046 }, { "auxiliary_loss_clip": 0.06417152, "auxiliary_loss_mlp": 0.01264637, "balance_loss_clip": 0.06277272, "balance_loss_mlp": 0.01254874, "epoch": 0.8468360138283482, "flos": 11697367904640.0, "grad_norm": 2.3655341778731627, "language_loss": 0.70212835, "learning_rate": 2.409939651426938e-07, "loss": 0.77894622, "num_input_tokens_seen": 303793500, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09765625, "step": 14085, "time_per_iteration": 2.5420336723327637 }, { "auxiliary_loss_clip": 0.06415852, "auxiliary_loss_mlp": 0.01263739, "balance_loss_clip": 0.06275756, "balance_loss_mlp": 0.0125372, "epoch": 0.8468961370810161, "flos": 24614726457600.0, "grad_norm": 1.5923853482054564, "language_loss": 0.71190494, "learning_rate": 2.408086562860634e-07, "loss": 0.78870082, "num_input_tokens_seen": 303814835, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.10021973, "step": 14086, "time_per_iteration": 2.5663673877716064 }, { "auxiliary_loss_clip": 0.06417788, "auxiliary_loss_mlp": 0.01264848, "balance_loss_clip": 0.06277645, "balance_loss_mlp": 0.01255436, "epoch": 0.8469562603336841, "flos": 19615927904640.0, "grad_norm": 1.6763092135933455, "language_loss": 0.75582588, "learning_rate": 2.4062341413814445e-07, "loss": 0.83265221, "num_input_tokens_seen": 303834505, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09411621, "step": 14087, "time_per_iteration": 2.5533080101013184 }, { "auxiliary_loss_clip": 0.06418215, "auxiliary_loss_mlp": 0.01264433, "balance_loss_clip": 0.06278433, "balance_loss_mlp": 0.01254569, "epoch": 0.847016383586352, "flos": 22645708162560.0, "grad_norm": 1.3164453498444846, "language_loss": 0.73759407, "learning_rate": 2.4043823870596227e-07, "loss": 0.81442058, "num_input_tokens_seen": 303855050, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09863281, "step": 14088, "time_per_iteration": 2.549142599105835 }, { "auxiliary_loss_clip": 0.06415033, "auxiliary_loss_mlp": 0.01265965, "balance_loss_clip": 0.06274632, "balance_loss_mlp": 0.01256136, "epoch": 0.84707650683902, "flos": 20966565467520.0, "grad_norm": 1.8210714612702743, "language_loss": 0.72122067, "learning_rate": 2.402531299965387e-07, "loss": 0.79803073, "num_input_tokens_seen": 303875635, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09838867, "step": 14089, "time_per_iteration": 2.5470893383026123 }, { "auxiliary_loss_clip": 0.06410772, "auxiliary_loss_mlp": 0.01264066, "balance_loss_clip": 0.06276263, "balance_loss_mlp": 0.01255048, "epoch": 0.8471366300916879, "flos": 24099111158400.0, "grad_norm": 1.4136669552143921, "language_loss": 0.796287, "learning_rate": 2.400680880168928e-07, "loss": 0.87303543, "num_input_tokens_seen": 303896750, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.09020996, "step": 14090, "time_per_iteration": 2.570037364959717 }, { "auxiliary_loss_clip": 0.06420946, "auxiliary_loss_mlp": 0.01270288, "balance_loss_clip": 0.0627799, "balance_loss_mlp": 0.01259375, "epoch": 0.847196753344356, "flos": 18338817899520.0, "grad_norm": 2.8590700024630387, "language_loss": 0.771249, "learning_rate": 2.3988311277404085e-07, "loss": 0.84816134, "num_input_tokens_seen": 303915435, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10919189, "step": 14091, "time_per_iteration": 2.603670835494995 }, { "auxiliary_loss_clip": 0.06319931, "auxiliary_loss_mlp": 0.01251824, "balance_loss_clip": 0.06264326, "balance_loss_mlp": 0.01250712, "epoch": 0.8472568765970239, "flos": 49585252550400.0, "grad_norm": 0.7962572565688505, "language_loss": 0.59369165, "learning_rate": 2.396982042749982e-07, "loss": 0.66940916, "num_input_tokens_seen": 303977245, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01113129, "step": 14092, "time_per_iteration": 4.6673338413238525 }, { "auxiliary_loss_clip": 0.06416713, "auxiliary_loss_mlp": 0.01268672, "balance_loss_clip": 0.06274758, "balance_loss_mlp": 0.01258492, "epoch": 0.8473169998496919, "flos": 19284739441920.0, "grad_norm": 2.0995380243391013, "language_loss": 0.70498264, "learning_rate": 2.395133625267756e-07, "loss": 0.78183651, "num_input_tokens_seen": 303996055, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10180664, "step": 14093, "time_per_iteration": 2.523543357849121 }, { "auxiliary_loss_clip": 0.06415655, "auxiliary_loss_mlp": 0.0126388, "balance_loss_clip": 0.0627872, "balance_loss_mlp": 0.01254606, "epoch": 0.8473771231023598, "flos": 17681262583680.0, "grad_norm": 2.063015475156046, "language_loss": 0.83434653, "learning_rate": 2.3932858753638263e-07, "loss": 0.91114187, "num_input_tokens_seen": 304012205, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09283447, "step": 14094, "time_per_iteration": 2.513599395751953 }, { "auxiliary_loss_clip": 0.0640931, "auxiliary_loss_mlp": 0.01266944, "balance_loss_clip": 0.06273995, "balance_loss_mlp": 0.01257598, "epoch": 0.8474372463550278, "flos": 26367019367040.0, "grad_norm": 1.622571571440646, "language_loss": 0.71611154, "learning_rate": 2.3914387931082626e-07, "loss": 0.7928741, "num_input_tokens_seen": 304033475, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.09338379, "step": 14095, "time_per_iteration": 2.5936830043792725 }, { "auxiliary_loss_clip": 0.06413996, "auxiliary_loss_mlp": 0.01265773, "balance_loss_clip": 0.06276371, "balance_loss_mlp": 0.01255843, "epoch": 0.8474973696076957, "flos": 23408418752640.0, "grad_norm": 1.8079355916179087, "language_loss": 0.80902731, "learning_rate": 2.3895923785711105e-07, "loss": 0.88582504, "num_input_tokens_seen": 304051845, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.0993042, "step": 14096, "time_per_iteration": 2.5787246227264404 }, { "auxiliary_loss_clip": 0.0642824, "auxiliary_loss_mlp": 0.01269318, "balance_loss_clip": 0.06282482, "balance_loss_mlp": 0.01258673, "epoch": 0.8475574928603637, "flos": 25081523953920.0, "grad_norm": 1.8924253555661477, "language_loss": 0.77645481, "learning_rate": 2.387746631822374e-07, "loss": 0.85343039, "num_input_tokens_seen": 304069965, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10650635, "step": 14097, "time_per_iteration": 4.049326658248901 }, { "auxiliary_loss_clip": 0.06412008, "auxiliary_loss_mlp": 0.01266576, "balance_loss_clip": 0.06274481, "balance_loss_mlp": 0.01257265, "epoch": 0.8476176161130318, "flos": 19971532632960.0, "grad_norm": 1.634965710833686, "language_loss": 0.80247939, "learning_rate": 2.385901552932048e-07, "loss": 0.87926525, "num_input_tokens_seen": 304086805, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09307861, "step": 14098, "time_per_iteration": 2.5772197246551514 }, { "auxiliary_loss_clip": 0.06420425, "auxiliary_loss_mlp": 0.01268615, "balance_loss_clip": 0.06281014, "balance_loss_mlp": 0.01258369, "epoch": 0.8476777393656997, "flos": 21291842217600.0, "grad_norm": 2.1112340285288083, "language_loss": 0.71706563, "learning_rate": 2.3840571419701062e-07, "loss": 0.79395604, "num_input_tokens_seen": 304105865, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.10247803, "step": 14099, "time_per_iteration": 2.5619871616363525 }, { "auxiliary_loss_clip": 0.0641692, "auxiliary_loss_mlp": 0.01263554, "balance_loss_clip": 0.06277126, "balance_loss_mlp": 0.01253159, "epoch": 0.8477378626183677, "flos": 29979276082560.0, "grad_norm": 1.8257017125830943, "language_loss": 0.63769943, "learning_rate": 2.3822133990064787e-07, "loss": 0.71450424, "num_input_tokens_seen": 304128300, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.10394287, "step": 14100, "time_per_iteration": 2.6657660007476807 }, { "auxiliary_loss_clip": 0.06419348, "auxiliary_loss_mlp": 0.012634, "balance_loss_clip": 0.06275377, "balance_loss_mlp": 0.01252957, "epoch": 0.8477979858710356, "flos": 24243650651520.0, "grad_norm": 2.409479597308328, "language_loss": 0.74003208, "learning_rate": 2.380370324111085e-07, "loss": 0.8168596, "num_input_tokens_seen": 304143695, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.10449219, "step": 14101, "time_per_iteration": 2.559368133544922 }, { "auxiliary_loss_clip": 0.06414896, "auxiliary_loss_mlp": 0.01263758, "balance_loss_clip": 0.06275311, "balance_loss_mlp": 0.01253762, "epoch": 0.8478581091237036, "flos": 25600828832640.0, "grad_norm": 1.6517906613380369, "language_loss": 0.72205019, "learning_rate": 2.3785279173538163e-07, "loss": 0.79883683, "num_input_tokens_seen": 304165800, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.10003662, "step": 14102, "time_per_iteration": 2.581969976425171 }, { "auxiliary_loss_clip": 0.06421757, "auxiliary_loss_mlp": 0.0126653, "balance_loss_clip": 0.06277019, "balance_loss_mlp": 0.01256027, "epoch": 0.8479182323763715, "flos": 12061945019520.0, "grad_norm": 1.924265832966116, "language_loss": 0.82422858, "learning_rate": 2.3766861788045366e-07, "loss": 0.90111148, "num_input_tokens_seen": 304182910, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10510254, "step": 14103, "time_per_iteration": 2.504532814025879 }, { "auxiliary_loss_clip": 0.06412962, "auxiliary_loss_mlp": 0.01263557, "balance_loss_clip": 0.06275202, "balance_loss_mlp": 0.01253829, "epoch": 0.8479783556290396, "flos": 21439693946880.0, "grad_norm": 2.3187230197047644, "language_loss": 0.78692698, "learning_rate": 2.374845108533079e-07, "loss": 0.86369216, "num_input_tokens_seen": 304200175, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09716797, "step": 14104, "time_per_iteration": 2.5143821239471436 }, { "auxiliary_loss_clip": 0.0642264, "auxiliary_loss_mlp": 0.01266009, "balance_loss_clip": 0.06279673, "balance_loss_mlp": 0.01255453, "epoch": 0.8480384788817075, "flos": 19648142599680.0, "grad_norm": 2.038517893750288, "language_loss": 0.79353082, "learning_rate": 2.3730047066092607e-07, "loss": 0.8704173, "num_input_tokens_seen": 304217775, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10559082, "step": 14105, "time_per_iteration": 2.524244785308838 }, { "auxiliary_loss_clip": 0.06423024, "auxiliary_loss_mlp": 0.01266046, "balance_loss_clip": 0.06277475, "balance_loss_mlp": 0.01255055, "epoch": 0.8480986021343755, "flos": 22495298883840.0, "grad_norm": 1.759370348592855, "language_loss": 0.50860739, "learning_rate": 2.3711649731028749e-07, "loss": 0.58549809, "num_input_tokens_seen": 304235760, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10992432, "step": 14106, "time_per_iteration": 2.543025493621826 }, { "auxiliary_loss_clip": 0.06411801, "auxiliary_loss_mlp": 0.01264487, "balance_loss_clip": 0.0627222, "balance_loss_mlp": 0.01254444, "epoch": 0.8481587253870434, "flos": 22097039627520.0, "grad_norm": 1.705501055194238, "language_loss": 0.75507939, "learning_rate": 2.3693259080836792e-07, "loss": 0.8318423, "num_input_tokens_seen": 304253985, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.10046387, "step": 14107, "time_per_iteration": 2.539440155029297 }, { "auxiliary_loss_clip": 0.06419021, "auxiliary_loss_mlp": 0.01266661, "balance_loss_clip": 0.06280009, "balance_loss_mlp": 0.01257226, "epoch": 0.8482188486397114, "flos": 33590945819520.0, "grad_norm": 1.698532191266301, "language_loss": 0.73831499, "learning_rate": 2.3674875116214087e-07, "loss": 0.81517178, "num_input_tokens_seen": 304276785, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09436035, "step": 14108, "time_per_iteration": 2.634509801864624 }, { "auxiliary_loss_clip": 0.06410293, "auxiliary_loss_mlp": 0.01264743, "balance_loss_clip": 0.06274529, "balance_loss_mlp": 0.01254628, "epoch": 0.8482789718923793, "flos": 20925084896640.0, "grad_norm": 1.7075428791089624, "language_loss": 0.72917205, "learning_rate": 2.3656497837857836e-07, "loss": 0.80592239, "num_input_tokens_seen": 304296310, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.10119629, "step": 14109, "time_per_iteration": 2.5314579010009766 }, { "auxiliary_loss_clip": 0.06413841, "auxiliary_loss_mlp": 0.01265395, "balance_loss_clip": 0.06275777, "balance_loss_mlp": 0.01255721, "epoch": 0.8483390951450474, "flos": 12901159987200.0, "grad_norm": 2.1677358201106465, "language_loss": 0.74175417, "learning_rate": 2.3638127246464811e-07, "loss": 0.81854653, "num_input_tokens_seen": 304311715, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09680176, "step": 14110, "time_per_iteration": 2.506075620651245 }, { "auxiliary_loss_clip": 0.06412649, "auxiliary_loss_mlp": 0.0126637, "balance_loss_clip": 0.06273304, "balance_loss_mlp": 0.01257066, "epoch": 0.8483992183977154, "flos": 25088483842560.0, "grad_norm": 1.6726553693679118, "language_loss": 0.76267517, "learning_rate": 2.3619763342731658e-07, "loss": 0.83946538, "num_input_tokens_seen": 304331910, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09301758, "step": 14111, "time_per_iteration": 2.552241325378418 }, { "auxiliary_loss_clip": 0.06414545, "auxiliary_loss_mlp": 0.01265244, "balance_loss_clip": 0.06277923, "balance_loss_mlp": 0.01256214, "epoch": 0.8484593416503833, "flos": 25564631068800.0, "grad_norm": 2.4725129870105813, "language_loss": 0.67138952, "learning_rate": 2.3601406127354772e-07, "loss": 0.74818742, "num_input_tokens_seen": 304351405, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.09033203, "step": 14112, "time_per_iteration": 2.556095600128174 }, { "auxiliary_loss_clip": 0.06417829, "auxiliary_loss_mlp": 0.01267692, "balance_loss_clip": 0.0627708, "balance_loss_mlp": 0.01258287, "epoch": 0.8485194649030513, "flos": 27205773137280.0, "grad_norm": 1.3320145134584302, "language_loss": 0.73630667, "learning_rate": 2.3583055601030312e-07, "loss": 0.81316185, "num_input_tokens_seen": 304372935, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09411621, "step": 14113, "time_per_iteration": 2.6017980575561523 }, { "auxiliary_loss_clip": 0.06413101, "auxiliary_loss_mlp": 0.01266721, "balance_loss_clip": 0.06275493, "balance_loss_mlp": 0.01257488, "epoch": 0.8485795881557192, "flos": 24212609913600.0, "grad_norm": 2.0314987578582286, "language_loss": 0.67211455, "learning_rate": 2.3564711764454003e-07, "loss": 0.74891275, "num_input_tokens_seen": 304393070, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09240723, "step": 14114, "time_per_iteration": 2.5432465076446533 }, { "auxiliary_loss_clip": 0.0641661, "auxiliary_loss_mlp": 0.01265154, "balance_loss_clip": 0.06275271, "balance_loss_mlp": 0.0125563, "epoch": 0.8486397114083872, "flos": 21147931630080.0, "grad_norm": 1.6022752936847005, "language_loss": 0.79147291, "learning_rate": 2.3546374618321495e-07, "loss": 0.86829054, "num_input_tokens_seen": 304411195, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09521484, "step": 14115, "time_per_iteration": 2.5312247276306152 }, { "auxiliary_loss_clip": 0.064179, "auxiliary_loss_mlp": 0.0126366, "balance_loss_clip": 0.06277017, "balance_loss_mlp": 0.0125354, "epoch": 0.8486998346610551, "flos": 19980966216960.0, "grad_norm": 1.9185126694741312, "language_loss": 0.79296654, "learning_rate": 2.3528044163328187e-07, "loss": 0.86978209, "num_input_tokens_seen": 304429425, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.10125732, "step": 14116, "time_per_iteration": 3.943643569946289 }, { "auxiliary_loss_clip": 0.06420444, "auxiliary_loss_mlp": 0.01264427, "balance_loss_clip": 0.06276798, "balance_loss_mlp": 0.01254896, "epoch": 0.8487599579137232, "flos": 19798468024320.0, "grad_norm": 3.2026307116378265, "language_loss": 0.68890601, "learning_rate": 2.3509720400169076e-07, "loss": 0.7657547, "num_input_tokens_seen": 304447460, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.09539795, "step": 14117, "time_per_iteration": 2.5290369987487793 }, { "auxiliary_loss_clip": 0.06416756, "auxiliary_loss_mlp": 0.01262348, "balance_loss_clip": 0.06273256, "balance_loss_mlp": 0.01252645, "epoch": 0.8488200811663911, "flos": 26403259057920.0, "grad_norm": 1.8096802394229627, "language_loss": 0.64887726, "learning_rate": 2.3491403329539096e-07, "loss": 0.72566831, "num_input_tokens_seen": 304468230, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.09710693, "step": 14118, "time_per_iteration": 2.561192035675049 }, { "auxiliary_loss_clip": 0.06414779, "auxiliary_loss_mlp": 0.0126417, "balance_loss_clip": 0.06277509, "balance_loss_mlp": 0.01255182, "epoch": 0.8488802044190591, "flos": 16364307162240.0, "grad_norm": 1.5933147605203923, "language_loss": 0.73227787, "learning_rate": 2.3473092952132757e-07, "loss": 0.80906737, "num_input_tokens_seen": 304484860, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.08984375, "step": 14119, "time_per_iteration": 2.515439748764038 }, { "auxiliary_loss_clip": 0.06418759, "auxiliary_loss_mlp": 0.01265127, "balance_loss_clip": 0.0627751, "balance_loss_mlp": 0.01254785, "epoch": 0.848940327671727, "flos": 19214985317760.0, "grad_norm": 1.7504630467987452, "language_loss": 0.78191108, "learning_rate": 2.345478926864446e-07, "loss": 0.85874999, "num_input_tokens_seen": 304503575, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.10345459, "step": 14120, "time_per_iteration": 2.5142879486083984 }, { "auxiliary_loss_clip": 0.06416248, "auxiliary_loss_mlp": 0.01263619, "balance_loss_clip": 0.06273668, "balance_loss_mlp": 0.0125357, "epoch": 0.849000450924395, "flos": 21877547057280.0, "grad_norm": 1.9923538841027941, "language_loss": 0.76062894, "learning_rate": 2.3436492279768227e-07, "loss": 0.83742762, "num_input_tokens_seen": 304525005, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.10046387, "step": 14121, "time_per_iteration": 2.56290602684021 }, { "auxiliary_loss_clip": 0.06316094, "auxiliary_loss_mlp": 0.01252611, "balance_loss_clip": 0.06260693, "balance_loss_mlp": 0.01251566, "epoch": 0.8490605741770629, "flos": 71187697054080.0, "grad_norm": 0.8027181985418301, "language_loss": 0.60160935, "learning_rate": 2.3418201986197883e-07, "loss": 0.6772964, "num_input_tokens_seen": 304585220, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01045227, "step": 14122, "time_per_iteration": 4.5223915576934814 }, { "auxiliary_loss_clip": 0.06417796, "auxiliary_loss_mlp": 0.01267662, "balance_loss_clip": 0.06277272, "balance_loss_mlp": 0.01258692, "epoch": 0.849120697429731, "flos": 24980393675520.0, "grad_norm": 1.6597597251384373, "language_loss": 0.79992378, "learning_rate": 2.3399918388627048e-07, "loss": 0.87677836, "num_input_tokens_seen": 304604665, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.08966064, "step": 14123, "time_per_iteration": 2.596856117248535 }, { "auxiliary_loss_clip": 0.06412658, "auxiliary_loss_mlp": 0.01265203, "balance_loss_clip": 0.06277485, "balance_loss_mlp": 0.01255934, "epoch": 0.8491808206823989, "flos": 23037762216960.0, "grad_norm": 2.1695426182605604, "language_loss": 0.83134317, "learning_rate": 2.3381641487749016e-07, "loss": 0.90812176, "num_input_tokens_seen": 304620600, "router_z_loss_clip": 1.35253906, "router_z_loss_mlp": 0.09265137, "step": 14124, "time_per_iteration": 2.552582025527954 }, { "auxiliary_loss_clip": 0.06417156, "auxiliary_loss_mlp": 0.01264583, "balance_loss_clip": 0.0627795, "balance_loss_mlp": 0.01255058, "epoch": 0.8492409439350669, "flos": 23885362592640.0, "grad_norm": 1.8895708781804266, "language_loss": 0.71635854, "learning_rate": 2.3363371284256805e-07, "loss": 0.793176, "num_input_tokens_seen": 304639540, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09527588, "step": 14125, "time_per_iteration": 2.555387020111084 }, { "auxiliary_loss_clip": 0.064289, "auxiliary_loss_mlp": 0.0126693, "balance_loss_clip": 0.06282033, "balance_loss_mlp": 0.01255777, "epoch": 0.8493010671877349, "flos": 22426592935680.0, "grad_norm": 1.507808622244618, "language_loss": 0.74113566, "learning_rate": 2.3345107778843288e-07, "loss": 0.81809396, "num_input_tokens_seen": 304660595, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.11151123, "step": 14126, "time_per_iteration": 2.5616302490234375 }, { "auxiliary_loss_clip": 0.06416699, "auxiliary_loss_mlp": 0.01266817, "balance_loss_clip": 0.06278406, "balance_loss_mlp": 0.01257423, "epoch": 0.8493611904404028, "flos": 17535087936000.0, "grad_norm": 1.457748536391072, "language_loss": 0.67369151, "learning_rate": 2.3326850972200928e-07, "loss": 0.75052667, "num_input_tokens_seen": 304679580, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09387207, "step": 14127, "time_per_iteration": 2.5209109783172607 }, { "auxiliary_loss_clip": 0.06421991, "auxiliary_loss_mlp": 0.0126504, "balance_loss_clip": 0.06279236, "balance_loss_mlp": 0.0125502, "epoch": 0.8494213136930708, "flos": 19468872789120.0, "grad_norm": 2.1969973811515042, "language_loss": 0.69301766, "learning_rate": 2.330860086502211e-07, "loss": 0.76988792, "num_input_tokens_seen": 304698385, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10021973, "step": 14128, "time_per_iteration": 2.5349104404449463 }, { "auxiliary_loss_clip": 0.06414733, "auxiliary_loss_mlp": 0.01265505, "balance_loss_clip": 0.06275159, "balance_loss_mlp": 0.01256034, "epoch": 0.8494814369457387, "flos": 18776209812480.0, "grad_norm": 2.5969572413619804, "language_loss": 0.78266704, "learning_rate": 2.3290357457998855e-07, "loss": 0.85946941, "num_input_tokens_seen": 304715430, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09472656, "step": 14129, "time_per_iteration": 2.5250730514526367 }, { "auxiliary_loss_clip": 0.06417145, "auxiliary_loss_mlp": 0.01263758, "balance_loss_clip": 0.06275702, "balance_loss_mlp": 0.01254466, "epoch": 0.8495415601984068, "flos": 23338245358080.0, "grad_norm": 1.7176719484576917, "language_loss": 0.68249571, "learning_rate": 2.3272120751823031e-07, "loss": 0.75930476, "num_input_tokens_seen": 304734345, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09289551, "step": 14130, "time_per_iteration": 2.573594570159912 }, { "auxiliary_loss_clip": 0.06414284, "auxiliary_loss_mlp": 0.01264896, "balance_loss_clip": 0.06274189, "balance_loss_mlp": 0.01255347, "epoch": 0.8496016834510747, "flos": 26619774808320.0, "grad_norm": 1.801092108176554, "language_loss": 0.71457517, "learning_rate": 2.3253890747186e-07, "loss": 0.79136699, "num_input_tokens_seen": 304755030, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09545898, "step": 14131, "time_per_iteration": 4.095928907394409 }, { "auxiliary_loss_clip": 0.06421137, "auxiliary_loss_mlp": 0.01266037, "balance_loss_clip": 0.0627728, "balance_loss_mlp": 0.01256495, "epoch": 0.8496618067037427, "flos": 25486868880000.0, "grad_norm": 1.7711335468144298, "language_loss": 0.6870895, "learning_rate": 2.3235667444779162e-07, "loss": 0.7639612, "num_input_tokens_seen": 304774320, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.09545898, "step": 14132, "time_per_iteration": 2.5631165504455566 }, { "auxiliary_loss_clip": 0.06413053, "auxiliary_loss_mlp": 0.01266247, "balance_loss_clip": 0.0627557, "balance_loss_mlp": 0.01257217, "epoch": 0.8497219299564106, "flos": 25381671678720.0, "grad_norm": 2.0896263744495145, "language_loss": 0.70556849, "learning_rate": 2.3217450845293564e-07, "loss": 0.78236151, "num_input_tokens_seen": 304795355, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09033203, "step": 14133, "time_per_iteration": 2.5678563117980957 }, { "auxiliary_loss_clip": 0.06313254, "auxiliary_loss_mlp": 0.01253136, "balance_loss_clip": 0.06257728, "balance_loss_mlp": 0.01252148, "epoch": 0.8497820532090786, "flos": 67802102432640.0, "grad_norm": 0.7193184356995662, "language_loss": 0.5764569, "learning_rate": 2.3199240949419918e-07, "loss": 0.65212083, "num_input_tokens_seen": 304863915, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00988007, "step": 14134, "time_per_iteration": 3.2366690635681152 }, { "auxiliary_loss_clip": 0.06420092, "auxiliary_loss_mlp": 0.01263284, "balance_loss_clip": 0.06277049, "balance_loss_mlp": 0.01253283, "epoch": 0.8498421764617465, "flos": 23447257920000.0, "grad_norm": 1.9406051739344181, "language_loss": 0.79139113, "learning_rate": 2.3181037757848787e-07, "loss": 0.86822486, "num_input_tokens_seen": 304881555, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10003662, "step": 14135, "time_per_iteration": 2.5522067546844482 }, { "auxiliary_loss_clip": 0.06422541, "auxiliary_loss_mlp": 0.01265175, "balance_loss_clip": 0.06278309, "balance_loss_mlp": 0.01254691, "epoch": 0.8499022997144146, "flos": 17718424669440.0, "grad_norm": 1.730369672777004, "language_loss": 0.6373353, "learning_rate": 2.316284127127044e-07, "loss": 0.71421242, "num_input_tokens_seen": 304898760, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.10491943, "step": 14136, "time_per_iteration": 2.631338596343994 }, { "auxiliary_loss_clip": 0.06421863, "auxiliary_loss_mlp": 0.01267171, "balance_loss_clip": 0.06278226, "balance_loss_mlp": 0.01256603, "epoch": 0.8499624229670825, "flos": 18594508233600.0, "grad_norm": 1.7846949920178714, "language_loss": 0.84586608, "learning_rate": 2.3144651490374835e-07, "loss": 0.92275649, "num_input_tokens_seen": 304915465, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10565186, "step": 14137, "time_per_iteration": 3.9745850563049316 }, { "auxiliary_loss_clip": 0.06412227, "auxiliary_loss_mlp": 0.01265432, "balance_loss_clip": 0.06274877, "balance_loss_mlp": 0.01256164, "epoch": 0.8500225462197505, "flos": 24351573110400.0, "grad_norm": 2.7184078798968345, "language_loss": 0.78740084, "learning_rate": 2.3126468415851773e-07, "loss": 0.86417747, "num_input_tokens_seen": 304933190, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09259033, "step": 14138, "time_per_iteration": 2.6197757720947266 }, { "auxiliary_loss_clip": 0.06415872, "auxiliary_loss_mlp": 0.01265247, "balance_loss_clip": 0.06276036, "balance_loss_mlp": 0.01255073, "epoch": 0.8500826694724185, "flos": 16551207694080.0, "grad_norm": 1.5731080480813548, "language_loss": 0.64678991, "learning_rate": 2.310829204839073e-07, "loss": 0.72360104, "num_input_tokens_seen": 304951110, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.10180664, "step": 14139, "time_per_iteration": 2.5577921867370605 }, { "auxiliary_loss_clip": 0.06414174, "auxiliary_loss_mlp": 0.01263067, "balance_loss_clip": 0.06275075, "balance_loss_mlp": 0.0125359, "epoch": 0.8501427927250864, "flos": 16294930381440.0, "grad_norm": 1.5551931133767412, "language_loss": 0.710163, "learning_rate": 2.3090122388681043e-07, "loss": 0.78693539, "num_input_tokens_seen": 304969095, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09472656, "step": 14140, "time_per_iteration": 2.5463662147521973 }, { "auxiliary_loss_clip": 0.06420578, "auxiliary_loss_mlp": 0.01266894, "balance_loss_clip": 0.06275846, "balance_loss_mlp": 0.012561, "epoch": 0.8502029159777544, "flos": 26695189082880.0, "grad_norm": 2.5535042402290524, "language_loss": 0.64689028, "learning_rate": 2.3071959437411648e-07, "loss": 0.72376502, "num_input_tokens_seen": 304989315, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.10797119, "step": 14141, "time_per_iteration": 2.591463327407837 }, { "auxiliary_loss_clip": 0.06411754, "auxiliary_loss_mlp": 0.01267524, "balance_loss_clip": 0.06273971, "balance_loss_mlp": 0.01258005, "epoch": 0.8502630392304223, "flos": 35599599895680.0, "grad_norm": 1.6566421624732113, "language_loss": 0.71115577, "learning_rate": 2.3053803195271214e-07, "loss": 0.78794855, "num_input_tokens_seen": 305011020, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09515381, "step": 14142, "time_per_iteration": 2.6650776863098145 }, { "auxiliary_loss_clip": 0.06418386, "auxiliary_loss_mlp": 0.01263964, "balance_loss_clip": 0.06276338, "balance_loss_mlp": 0.01255416, "epoch": 0.8503231624830904, "flos": 21655329229440.0, "grad_norm": 1.5121577183763553, "language_loss": 0.65356743, "learning_rate": 2.3035653662948375e-07, "loss": 0.73039097, "num_input_tokens_seen": 305033550, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.08551025, "step": 14143, "time_per_iteration": 2.611405849456787 }, { "auxiliary_loss_clip": 0.06420299, "auxiliary_loss_mlp": 0.01267129, "balance_loss_clip": 0.06275812, "balance_loss_mlp": 0.01257181, "epoch": 0.8503832857357583, "flos": 22423741896960.0, "grad_norm": 2.7262534137901326, "language_loss": 0.67841291, "learning_rate": 2.3017510841131216e-07, "loss": 0.75528717, "num_input_tokens_seen": 305052885, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.09942627, "step": 14144, "time_per_iteration": 2.5577735900878906 }, { "auxiliary_loss_clip": 0.06408423, "auxiliary_loss_mlp": 0.01266425, "balance_loss_clip": 0.06272864, "balance_loss_mlp": 0.01256567, "epoch": 0.8504434089884263, "flos": 18703981992960.0, "grad_norm": 2.045234963506442, "language_loss": 0.65447885, "learning_rate": 2.299937473050777e-07, "loss": 0.73122728, "num_input_tokens_seen": 305071995, "router_z_loss_clip": 1.35546875, "router_z_loss_mlp": 0.09857178, "step": 14145, "time_per_iteration": 2.5343949794769287 }, { "auxiliary_loss_clip": 0.0641498, "auxiliary_loss_mlp": 0.01263899, "balance_loss_clip": 0.0627494, "balance_loss_mlp": 0.01253677, "epoch": 0.8505035322410942, "flos": 20013642109440.0, "grad_norm": 2.1570488604705287, "language_loss": 0.85497534, "learning_rate": 2.2981245331765842e-07, "loss": 0.93176413, "num_input_tokens_seen": 305090190, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.10229492, "step": 14146, "time_per_iteration": 2.575287342071533 }, { "auxiliary_loss_clip": 0.06412671, "auxiliary_loss_mlp": 0.01264622, "balance_loss_clip": 0.06273561, "balance_loss_mlp": 0.01255961, "epoch": 0.8505636554937622, "flos": 20818210613760.0, "grad_norm": 1.6889941237473576, "language_loss": 0.84300256, "learning_rate": 2.2963122645592814e-07, "loss": 0.91977555, "num_input_tokens_seen": 305109355, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.08654785, "step": 14147, "time_per_iteration": 2.532209873199463 }, { "auxiliary_loss_clip": 0.06420762, "auxiliary_loss_mlp": 0.01264003, "balance_loss_clip": 0.06275349, "balance_loss_mlp": 0.01254031, "epoch": 0.8506237787464301, "flos": 14179821292800.0, "grad_norm": 2.3673591989105063, "language_loss": 0.86294293, "learning_rate": 2.2945006672675894e-07, "loss": 0.93979061, "num_input_tokens_seen": 305124165, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.09973145, "step": 14148, "time_per_iteration": 2.5010101795196533 }, { "auxiliary_loss_clip": 0.06416839, "auxiliary_loss_mlp": 0.01266878, "balance_loss_clip": 0.062791, "balance_loss_mlp": 0.01257431, "epoch": 0.8506839019990982, "flos": 23265095143680.0, "grad_norm": 2.3636158591571363, "language_loss": 0.72724539, "learning_rate": 2.292689741370204e-07, "loss": 0.80408251, "num_input_tokens_seen": 305143940, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09448242, "step": 14149, "time_per_iteration": 2.5358128547668457 }, { "auxiliary_loss_clip": 0.06415181, "auxiliary_loss_mlp": 0.01264346, "balance_loss_clip": 0.06275703, "balance_loss_mlp": 0.0125419, "epoch": 0.8507440252517661, "flos": 23665911949440.0, "grad_norm": 1.6658653811035335, "language_loss": 0.76345956, "learning_rate": 2.290879486935804e-07, "loss": 0.84025484, "num_input_tokens_seen": 305163505, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.10162354, "step": 14150, "time_per_iteration": 2.5753607749938965 }, { "auxiliary_loss_clip": 0.06410914, "auxiliary_loss_mlp": 0.0126451, "balance_loss_clip": 0.06273252, "balance_loss_mlp": 0.0125514, "epoch": 0.8508041485044341, "flos": 18667323031680.0, "grad_norm": 1.6085783241521363, "language_loss": 0.7261138, "learning_rate": 2.2890699040330231e-07, "loss": 0.80286801, "num_input_tokens_seen": 305182325, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09368896, "step": 14151, "time_per_iteration": 2.516327142715454 }, { "auxiliary_loss_clip": 0.06312083, "auxiliary_loss_mlp": 0.01249989, "balance_loss_clip": 0.06256629, "balance_loss_mlp": 0.01248975, "epoch": 0.8508642717571021, "flos": 52527124275840.0, "grad_norm": 0.8561813232430509, "language_loss": 0.59553564, "learning_rate": 2.2872609927304909e-07, "loss": 0.67115629, "num_input_tokens_seen": 305230775, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01013947, "step": 14152, "time_per_iteration": 2.932328462600708 }, { "auxiliary_loss_clip": 0.06314747, "auxiliary_loss_mlp": 0.01252853, "balance_loss_clip": 0.06259154, "balance_loss_mlp": 0.01251629, "epoch": 0.85092439500977, "flos": 69316622582400.0, "grad_norm": 0.867132596226419, "language_loss": 0.60686415, "learning_rate": 2.285452753096797e-07, "loss": 0.68254018, "num_input_tokens_seen": 305296000, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01222992, "step": 14153, "time_per_iteration": 3.191765546798706 }, { "auxiliary_loss_clip": 0.06411317, "auxiliary_loss_mlp": 0.01265376, "balance_loss_clip": 0.06273591, "balance_loss_mlp": 0.01255702, "epoch": 0.850984518262438, "flos": 24396701333760.0, "grad_norm": 1.6018834130867734, "language_loss": 0.80977285, "learning_rate": 2.2836451852005067e-07, "loss": 0.88653982, "num_input_tokens_seen": 305314705, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09680176, "step": 14154, "time_per_iteration": 2.5457262992858887 }, { "auxiliary_loss_clip": 0.06406697, "auxiliary_loss_mlp": 0.01262504, "balance_loss_clip": 0.06271912, "balance_loss_mlp": 0.01254076, "epoch": 0.851044641515106, "flos": 23301544469760.0, "grad_norm": 1.6211245614317902, "language_loss": 0.80039281, "learning_rate": 2.281838289110165e-07, "loss": 0.87708485, "num_input_tokens_seen": 305333870, "router_z_loss_clip": 1.34570312, "router_z_loss_mlp": 0.08428955, "step": 14155, "time_per_iteration": 2.5527641773223877 }, { "auxiliary_loss_clip": 0.06418806, "auxiliary_loss_mlp": 0.01268133, "balance_loss_clip": 0.06274565, "balance_loss_mlp": 0.01258018, "epoch": 0.851104764767774, "flos": 22055894472960.0, "grad_norm": 2.1986159457452916, "language_loss": 0.70534581, "learning_rate": 2.2800320648942904e-07, "loss": 0.78221524, "num_input_tokens_seen": 305352780, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10119629, "step": 14156, "time_per_iteration": 3.9455208778381348 }, { "auxiliary_loss_clip": 0.06411512, "auxiliary_loss_mlp": 0.01266098, "balance_loss_clip": 0.06274404, "balance_loss_mlp": 0.01256627, "epoch": 0.8511648880204419, "flos": 20711084768640.0, "grad_norm": 1.8404318687135661, "language_loss": 0.73972297, "learning_rate": 2.278226512621386e-07, "loss": 0.81649905, "num_input_tokens_seen": 305371370, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09466553, "step": 14157, "time_per_iteration": 2.5629334449768066 }, { "auxiliary_loss_clip": 0.06407864, "auxiliary_loss_mlp": 0.01263823, "balance_loss_clip": 0.06272279, "balance_loss_mlp": 0.0125524, "epoch": 0.8512250112731099, "flos": 24031537240320.0, "grad_norm": 1.9275630261789107, "language_loss": 0.79495215, "learning_rate": 2.2764216323598995e-07, "loss": 0.87166905, "num_input_tokens_seen": 305387955, "router_z_loss_clip": 1.35742188, "router_z_loss_mlp": 0.08587646, "step": 14158, "time_per_iteration": 2.551668643951416 }, { "auxiliary_loss_clip": 0.06415109, "auxiliary_loss_mlp": 0.01267211, "balance_loss_clip": 0.06275019, "balance_loss_mlp": 0.01257483, "epoch": 0.8512851345257778, "flos": 22021583425920.0, "grad_norm": 1.9732560225226985, "language_loss": 0.79065794, "learning_rate": 2.27461742417828e-07, "loss": 0.86748111, "num_input_tokens_seen": 305406285, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.097229, "step": 14159, "time_per_iteration": 2.54669189453125 }, { "auxiliary_loss_clip": 0.06415258, "auxiliary_loss_mlp": 0.01263784, "balance_loss_clip": 0.06275646, "balance_loss_mlp": 0.01253616, "epoch": 0.8513452577784458, "flos": 14835531818880.0, "grad_norm": 1.7940079966028826, "language_loss": 0.71513045, "learning_rate": 2.2728138881449488e-07, "loss": 0.7919209, "num_input_tokens_seen": 305424500, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.10162354, "step": 14160, "time_per_iteration": 2.5388107299804688 }, { "auxiliary_loss_clip": 0.06423698, "auxiliary_loss_mlp": 0.01267771, "balance_loss_clip": 0.06277931, "balance_loss_mlp": 0.01256881, "epoch": 0.8514053810311137, "flos": 33043870512000.0, "grad_norm": 2.279049069063754, "language_loss": 0.71021295, "learning_rate": 2.2710110243282866e-07, "loss": 0.78712761, "num_input_tokens_seen": 305442990, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10894775, "step": 14161, "time_per_iteration": 4.07213568687439 }, { "auxiliary_loss_clip": 0.06415614, "auxiliary_loss_mlp": 0.01265047, "balance_loss_clip": 0.06272171, "balance_loss_mlp": 0.01255081, "epoch": 0.8514655042837818, "flos": 27572027333760.0, "grad_norm": 2.7205167634480487, "language_loss": 0.7831465, "learning_rate": 2.2692088327966653e-07, "loss": 0.85995311, "num_input_tokens_seen": 305463065, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.09973145, "step": 14162, "time_per_iteration": 2.6016411781311035 }, { "auxiliary_loss_clip": 0.0641064, "auxiliary_loss_mlp": 0.01265817, "balance_loss_clip": 0.0627239, "balance_loss_mlp": 0.01255738, "epoch": 0.8515256275364497, "flos": 35565163067520.0, "grad_norm": 2.170540469060299, "language_loss": 0.77219617, "learning_rate": 2.2674073136184235e-07, "loss": 0.8489607, "num_input_tokens_seen": 305489070, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.10083008, "step": 14163, "time_per_iteration": 2.683957576751709 }, { "auxiliary_loss_clip": 0.06308577, "auxiliary_loss_mlp": 0.01251326, "balance_loss_clip": 0.06253044, "balance_loss_mlp": 0.01250324, "epoch": 0.8515857507891177, "flos": 70226681777280.0, "grad_norm": 0.6992549114689064, "language_loss": 0.54906476, "learning_rate": 2.2656064668618735e-07, "loss": 0.62466383, "num_input_tokens_seen": 305551490, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01000977, "step": 14164, "time_per_iteration": 3.241954803466797 }, { "auxiliary_loss_clip": 0.06417403, "auxiliary_loss_mlp": 0.01268539, "balance_loss_clip": 0.06277582, "balance_loss_mlp": 0.01258346, "epoch": 0.8516458740417857, "flos": 22682031707520.0, "grad_norm": 1.9542915003913042, "language_loss": 0.73008728, "learning_rate": 2.2638062925953005e-07, "loss": 0.80694664, "num_input_tokens_seen": 305570535, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.10198975, "step": 14165, "time_per_iteration": 2.5761470794677734 }, { "auxiliary_loss_clip": 0.06409167, "auxiliary_loss_mlp": 0.01263453, "balance_loss_clip": 0.06272683, "balance_loss_mlp": 0.01254202, "epoch": 0.8517059972944536, "flos": 22754049891840.0, "grad_norm": 1.6056860563978115, "language_loss": 0.67751521, "learning_rate": 2.26200679088697e-07, "loss": 0.75424147, "num_input_tokens_seen": 305590800, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.0925293, "step": 14166, "time_per_iteration": 2.564535140991211 }, { "auxiliary_loss_clip": 0.06419723, "auxiliary_loss_mlp": 0.01265004, "balance_loss_clip": 0.06277668, "balance_loss_mlp": 0.0125539, "epoch": 0.8517661205471216, "flos": 21695551989120.0, "grad_norm": 1.7273161217404607, "language_loss": 0.74045843, "learning_rate": 2.260207961805125e-07, "loss": 0.81730568, "num_input_tokens_seen": 305609495, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.09613037, "step": 14167, "time_per_iteration": 2.5550906658172607 }, { "auxiliary_loss_clip": 0.0641304, "auxiliary_loss_mlp": 0.01262669, "balance_loss_clip": 0.06274129, "balance_loss_mlp": 0.01253055, "epoch": 0.8518262437997896, "flos": 25381965168000.0, "grad_norm": 1.5490923630139424, "language_loss": 0.80842447, "learning_rate": 2.258409805417969e-07, "loss": 0.88518155, "num_input_tokens_seen": 305629420, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09613037, "step": 14168, "time_per_iteration": 2.604206085205078 }, { "auxiliary_loss_clip": 0.0641251, "auxiliary_loss_mlp": 0.01264859, "balance_loss_clip": 0.06273552, "balance_loss_mlp": 0.01255495, "epoch": 0.8518863670524576, "flos": 27242809441920.0, "grad_norm": 1.7895251618408143, "language_loss": 0.76112229, "learning_rate": 2.2566123217936893e-07, "loss": 0.83789599, "num_input_tokens_seen": 305649835, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09375, "step": 14169, "time_per_iteration": 2.5770504474639893 }, { "auxiliary_loss_clip": 0.06420095, "auxiliary_loss_mlp": 0.01264395, "balance_loss_clip": 0.06278253, "balance_loss_mlp": 0.01254602, "epoch": 0.8519464903051255, "flos": 20965810780800.0, "grad_norm": 1.5910603210158443, "language_loss": 0.64083314, "learning_rate": 2.254815511000452e-07, "loss": 0.71767801, "num_input_tokens_seen": 305668840, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.09796143, "step": 14170, "time_per_iteration": 2.5294454097747803 }, { "auxiliary_loss_clip": 0.06413679, "auxiliary_loss_mlp": 0.01264788, "balance_loss_clip": 0.0627522, "balance_loss_mlp": 0.01255561, "epoch": 0.8520066135577935, "flos": 18447578899200.0, "grad_norm": 3.140763468236584, "language_loss": 0.87697238, "learning_rate": 2.253019373106384e-07, "loss": 0.95375705, "num_input_tokens_seen": 305686955, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09222412, "step": 14171, "time_per_iteration": 3.9458236694335938 }, { "auxiliary_loss_clip": 0.06414357, "auxiliary_loss_mlp": 0.01264465, "balance_loss_clip": 0.06273429, "balance_loss_mlp": 0.012546, "epoch": 0.8520667368104614, "flos": 29137545492480.0, "grad_norm": 1.770998253292113, "language_loss": 0.54907894, "learning_rate": 2.2512239081796003e-07, "loss": 0.62586713, "num_input_tokens_seen": 305706290, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09863281, "step": 14172, "time_per_iteration": 2.591245651245117 }, { "auxiliary_loss_clip": 0.0640975, "auxiliary_loss_mlp": 0.01263012, "balance_loss_clip": 0.06273835, "balance_loss_mlp": 0.0125434, "epoch": 0.8521268600631294, "flos": 16039910880000.0, "grad_norm": 2.1103273956216784, "language_loss": 0.69719803, "learning_rate": 2.2494291162881862e-07, "loss": 0.7739256, "num_input_tokens_seen": 305723835, "router_z_loss_clip": 1.36035156, "router_z_loss_mlp": 0.08666992, "step": 14173, "time_per_iteration": 2.5481083393096924 }, { "auxiliary_loss_clip": 0.06416817, "auxiliary_loss_mlp": 0.01268467, "balance_loss_clip": 0.06276223, "balance_loss_mlp": 0.01257351, "epoch": 0.8521869833157973, "flos": 22461323253120.0, "grad_norm": 2.2448723425163, "language_loss": 0.77430427, "learning_rate": 2.247634997500205e-07, "loss": 0.85115713, "num_input_tokens_seen": 305741655, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.11108398, "step": 14174, "time_per_iteration": 2.5287089347839355 }, { "auxiliary_loss_clip": 0.0641931, "auxiliary_loss_mlp": 0.01264336, "balance_loss_clip": 0.06277676, "balance_loss_mlp": 0.0125434, "epoch": 0.8522471065684654, "flos": 24978842375040.0, "grad_norm": 1.597879966949291, "language_loss": 0.81844664, "learning_rate": 2.245841551883676e-07, "loss": 0.89528304, "num_input_tokens_seen": 305761890, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09991455, "step": 14175, "time_per_iteration": 2.5683515071868896 }, { "auxiliary_loss_clip": 0.06425238, "auxiliary_loss_mlp": 0.01264836, "balance_loss_clip": 0.0628062, "balance_loss_mlp": 0.01254936, "epoch": 0.8523072298211333, "flos": 17716076755200.0, "grad_norm": 2.4968401467758183, "language_loss": 0.66048568, "learning_rate": 2.2440487795066153e-07, "loss": 0.73738641, "num_input_tokens_seen": 305779190, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.09899902, "step": 14176, "time_per_iteration": 3.9033288955688477 }, { "auxiliary_loss_clip": 0.06408971, "auxiliary_loss_mlp": 0.01263534, "balance_loss_clip": 0.06273577, "balance_loss_mlp": 0.01254129, "epoch": 0.8523673530738013, "flos": 25453060957440.0, "grad_norm": 1.6630445395116613, "language_loss": 0.78994203, "learning_rate": 2.2422566804370068e-07, "loss": 0.86666703, "num_input_tokens_seen": 305799870, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.09411621, "step": 14177, "time_per_iteration": 2.588212013244629 }, { "auxiliary_loss_clip": 0.06414276, "auxiliary_loss_mlp": 0.01267389, "balance_loss_clip": 0.0627408, "balance_loss_mlp": 0.01257167, "epoch": 0.8524274763264693, "flos": 31437416833920.0, "grad_norm": 1.5826959638947375, "language_loss": 0.73519319, "learning_rate": 2.2404652547428026e-07, "loss": 0.81200981, "num_input_tokens_seen": 305819695, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.10223389, "step": 14178, "time_per_iteration": 2.60996150970459 }, { "auxiliary_loss_clip": 0.06416744, "auxiliary_loss_mlp": 0.01262949, "balance_loss_clip": 0.06275362, "balance_loss_mlp": 0.01252972, "epoch": 0.8524875995791372, "flos": 17718466596480.0, "grad_norm": 1.6989849780139044, "language_loss": 0.75364339, "learning_rate": 2.238674502491935e-07, "loss": 0.83044034, "num_input_tokens_seen": 305837270, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09973145, "step": 14179, "time_per_iteration": 2.508847713470459 }, { "auxiliary_loss_clip": 0.06410749, "auxiliary_loss_mlp": 0.01264457, "balance_loss_clip": 0.06274289, "balance_loss_mlp": 0.01255302, "epoch": 0.8525477228318052, "flos": 21693413710080.0, "grad_norm": 2.0745785771958607, "language_loss": 0.81559426, "learning_rate": 2.2368844237523165e-07, "loss": 0.89234632, "num_input_tokens_seen": 305855250, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09161377, "step": 14180, "time_per_iteration": 2.545863389968872 }, { "auxiliary_loss_clip": 0.06416993, "auxiliary_loss_mlp": 0.012673, "balance_loss_clip": 0.06277533, "balance_loss_mlp": 0.0125737, "epoch": 0.8526078460844732, "flos": 24834009392640.0, "grad_norm": 2.877637873683178, "language_loss": 0.61755645, "learning_rate": 2.235095018591815e-07, "loss": 0.69439942, "num_input_tokens_seen": 305875660, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.0993042, "step": 14181, "time_per_iteration": 2.577763795852661 }, { "auxiliary_loss_clip": 0.06410862, "auxiliary_loss_mlp": 0.01265178, "balance_loss_clip": 0.06273852, "balance_loss_mlp": 0.01255719, "epoch": 0.8526679693371412, "flos": 13521469363200.0, "grad_norm": 2.0502978439899904, "language_loss": 0.72522956, "learning_rate": 2.2333062870782894e-07, "loss": 0.80198991, "num_input_tokens_seen": 305892415, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09460449, "step": 14182, "time_per_iteration": 2.499600648880005 }, { "auxiliary_loss_clip": 0.06411555, "auxiliary_loss_mlp": 0.01266733, "balance_loss_clip": 0.06274091, "balance_loss_mlp": 0.01257143, "epoch": 0.8527280925898091, "flos": 23520911258880.0, "grad_norm": 1.4956678720039596, "language_loss": 0.70861435, "learning_rate": 2.2315182292795697e-07, "loss": 0.78539729, "num_input_tokens_seen": 305912665, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.0958252, "step": 14183, "time_per_iteration": 2.599982976913452 }, { "auxiliary_loss_clip": 0.06410232, "auxiliary_loss_mlp": 0.01264569, "balance_loss_clip": 0.06273814, "balance_loss_mlp": 0.01255187, "epoch": 0.8527882158424771, "flos": 20309261713920.0, "grad_norm": 2.2782537191287244, "language_loss": 0.73136276, "learning_rate": 2.2297308452634644e-07, "loss": 0.80811077, "num_input_tokens_seen": 305931515, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09381104, "step": 14184, "time_per_iteration": 2.610309600830078 }, { "auxiliary_loss_clip": 0.06418348, "auxiliary_loss_mlp": 0.0126374, "balance_loss_clip": 0.06279452, "balance_loss_mlp": 0.01254102, "epoch": 0.852848339095145, "flos": 17208343739520.0, "grad_norm": 1.6129061703636285, "language_loss": 0.77196884, "learning_rate": 2.2279441350977457e-07, "loss": 0.84878975, "num_input_tokens_seen": 305949965, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09631348, "step": 14185, "time_per_iteration": 2.5573997497558594 }, { "auxiliary_loss_clip": 0.06419531, "auxiliary_loss_mlp": 0.01262141, "balance_loss_clip": 0.06275898, "balance_loss_mlp": 0.01252723, "epoch": 0.852908462347813, "flos": 18374847955200.0, "grad_norm": 1.9680149004003524, "language_loss": 0.79900861, "learning_rate": 2.2261580988501637e-07, "loss": 0.87582535, "num_input_tokens_seen": 305967820, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.09417725, "step": 14186, "time_per_iteration": 2.5366387367248535 }, { "auxiliary_loss_clip": 0.06414491, "auxiliary_loss_mlp": 0.0126341, "balance_loss_clip": 0.06272571, "balance_loss_mlp": 0.01253438, "epoch": 0.8529685856004809, "flos": 18630873705600.0, "grad_norm": 2.3671288898334315, "language_loss": 0.62771702, "learning_rate": 2.224372736588449e-07, "loss": 0.70449603, "num_input_tokens_seen": 305985505, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09960938, "step": 14187, "time_per_iteration": 2.590599775314331 }, { "auxiliary_loss_clip": 0.06420873, "auxiliary_loss_mlp": 0.01264249, "balance_loss_clip": 0.06275807, "balance_loss_mlp": 0.01254325, "epoch": 0.853028708853149, "flos": 29615579435520.0, "grad_norm": 1.5162363824152318, "language_loss": 0.76580662, "learning_rate": 2.2225880483803005e-07, "loss": 0.84265786, "num_input_tokens_seen": 306005220, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.0993042, "step": 14188, "time_per_iteration": 2.622284173965454 }, { "auxiliary_loss_clip": 0.06417986, "auxiliary_loss_mlp": 0.01263552, "balance_loss_clip": 0.06275287, "balance_loss_mlp": 0.01252716, "epoch": 0.8530888321058169, "flos": 26359304791680.0, "grad_norm": 6.70176036061413, "language_loss": 0.78477776, "learning_rate": 2.2208040342933932e-07, "loss": 0.86159313, "num_input_tokens_seen": 306023785, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.1083374, "step": 14189, "time_per_iteration": 2.588470697402954 }, { "auxiliary_loss_clip": 0.06417812, "auxiliary_loss_mlp": 0.01265322, "balance_loss_clip": 0.06276655, "balance_loss_mlp": 0.01255261, "epoch": 0.8531489553584849, "flos": 20528251159680.0, "grad_norm": 1.7886345783254896, "language_loss": 0.79981387, "learning_rate": 2.2190206943953793e-07, "loss": 0.87664521, "num_input_tokens_seen": 306041600, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.10058594, "step": 14190, "time_per_iteration": 2.585649013519287 }, { "auxiliary_loss_clip": 0.06415918, "auxiliary_loss_mlp": 0.01269941, "balance_loss_clip": 0.06277825, "balance_loss_mlp": 0.01259779, "epoch": 0.8532090786111529, "flos": 20710581644160.0, "grad_norm": 2.0776992486852572, "language_loss": 0.76384491, "learning_rate": 2.2172380287538894e-07, "loss": 0.84070349, "num_input_tokens_seen": 306060345, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.10162354, "step": 14191, "time_per_iteration": 2.679236888885498 }, { "auxiliary_loss_clip": 0.06415138, "auxiliary_loss_mlp": 0.0126482, "balance_loss_clip": 0.06276214, "balance_loss_mlp": 0.01254902, "epoch": 0.8532692018638208, "flos": 19835085058560.0, "grad_norm": 1.8958142758728815, "language_loss": 0.69573212, "learning_rate": 2.2154560374365073e-07, "loss": 0.77253163, "num_input_tokens_seen": 306078285, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09912109, "step": 14192, "time_per_iteration": 2.536572217941284 }, { "auxiliary_loss_clip": 0.0642277, "auxiliary_loss_mlp": 0.01268278, "balance_loss_clip": 0.06275329, "balance_loss_mlp": 0.01256298, "epoch": 0.8533293251164888, "flos": 21003224428800.0, "grad_norm": 2.2229339250014375, "language_loss": 0.62588173, "learning_rate": 2.2136747205108164e-07, "loss": 0.70279217, "num_input_tokens_seen": 306093760, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.11968994, "step": 14193, "time_per_iteration": 2.546351909637451 }, { "auxiliary_loss_clip": 0.06414243, "auxiliary_loss_mlp": 0.01262432, "balance_loss_clip": 0.06275493, "balance_loss_mlp": 0.0125274, "epoch": 0.8533894483691568, "flos": 22426257519360.0, "grad_norm": 1.7163476870795602, "language_loss": 0.77219164, "learning_rate": 2.211894078044365e-07, "loss": 0.84895843, "num_input_tokens_seen": 306112595, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09680176, "step": 14194, "time_per_iteration": 2.562391519546509 }, { "auxiliary_loss_clip": 0.06414623, "auxiliary_loss_mlp": 0.01264476, "balance_loss_clip": 0.06272955, "balance_loss_mlp": 0.01255488, "epoch": 0.8534495716218248, "flos": 21622988753280.0, "grad_norm": 1.8624434221937138, "language_loss": 0.70320368, "learning_rate": 2.2101141101046705e-07, "loss": 0.77999467, "num_input_tokens_seen": 306131800, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.08984375, "step": 14195, "time_per_iteration": 3.955108404159546 }, { "auxiliary_loss_clip": 0.06412565, "auxiliary_loss_mlp": 0.01268771, "balance_loss_clip": 0.06272069, "balance_loss_mlp": 0.01258757, "epoch": 0.8535096948744927, "flos": 22352855742720.0, "grad_norm": 2.468697818537601, "language_loss": 0.85994482, "learning_rate": 2.2083348167592343e-07, "loss": 0.9367581, "num_input_tokens_seen": 306150590, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.10015869, "step": 14196, "time_per_iteration": 2.544677734375 }, { "auxiliary_loss_clip": 0.06313202, "auxiliary_loss_mlp": 0.01251938, "balance_loss_clip": 0.06257905, "balance_loss_mlp": 0.01250932, "epoch": 0.8535698181271607, "flos": 52778118781440.0, "grad_norm": 0.7475533887095541, "language_loss": 0.55136919, "learning_rate": 2.2065561980755243e-07, "loss": 0.6270206, "num_input_tokens_seen": 306205850, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01006317, "step": 14197, "time_per_iteration": 3.0851809978485107 }, { "auxiliary_loss_clip": 0.0641119, "auxiliary_loss_mlp": 0.01261956, "balance_loss_clip": 0.06275006, "balance_loss_mlp": 0.01252407, "epoch": 0.8536299413798286, "flos": 19068978378240.0, "grad_norm": 1.5088257916541492, "language_loss": 0.81670558, "learning_rate": 2.2047782541209826e-07, "loss": 0.89343703, "num_input_tokens_seen": 306225220, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.09539795, "step": 14198, "time_per_iteration": 2.535313129425049 }, { "auxiliary_loss_clip": 0.06414152, "auxiliary_loss_mlp": 0.01265077, "balance_loss_clip": 0.06274261, "balance_loss_mlp": 0.01255815, "epoch": 0.8536900646324966, "flos": 49355670291840.0, "grad_norm": 2.460581502563969, "language_loss": 0.6877284, "learning_rate": 2.203000984963035e-07, "loss": 0.76452065, "num_input_tokens_seen": 306249865, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09265137, "step": 14199, "time_per_iteration": 2.7915477752685547 }, { "auxiliary_loss_clip": 0.06405913, "auxiliary_loss_mlp": 0.01265985, "balance_loss_clip": 0.06271343, "balance_loss_mlp": 0.01256758, "epoch": 0.8537501878851645, "flos": 21768786057600.0, "grad_norm": 2.0410804836156804, "language_loss": 0.86467385, "learning_rate": 2.201224390669072e-07, "loss": 0.9413929, "num_input_tokens_seen": 306270215, "router_z_loss_clip": 1.34472656, "router_z_loss_mlp": 0.09228516, "step": 14200, "time_per_iteration": 2.533048391342163 }, { "auxiliary_loss_clip": 0.0641549, "auxiliary_loss_mlp": 0.01264605, "balance_loss_clip": 0.0627567, "balance_loss_mlp": 0.01255384, "epoch": 0.8538103111378326, "flos": 22275051626880.0, "grad_norm": 1.907466133572942, "language_loss": 0.77919143, "learning_rate": 2.1994484713064666e-07, "loss": 0.85599244, "num_input_tokens_seen": 306288960, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09216309, "step": 14201, "time_per_iteration": 3.967698335647583 }, { "auxiliary_loss_clip": 0.0641377, "auxiliary_loss_mlp": 0.01265643, "balance_loss_clip": 0.06276341, "balance_loss_mlp": 0.01257298, "epoch": 0.8538704343905005, "flos": 20310309889920.0, "grad_norm": 1.90161986546287, "language_loss": 0.6877597, "learning_rate": 2.19767322694256e-07, "loss": 0.76455384, "num_input_tokens_seen": 306308735, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.08337402, "step": 14202, "time_per_iteration": 2.606799602508545 }, { "auxiliary_loss_clip": 0.06420836, "auxiliary_loss_mlp": 0.0126753, "balance_loss_clip": 0.06279564, "balance_loss_mlp": 0.01257934, "epoch": 0.8539305576431685, "flos": 24762284697600.0, "grad_norm": 1.6362835078740374, "language_loss": 0.80288994, "learning_rate": 2.195898657644666e-07, "loss": 0.87977362, "num_input_tokens_seen": 306329015, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09588623, "step": 14203, "time_per_iteration": 2.576802968978882 }, { "auxiliary_loss_clip": 0.06419571, "auxiliary_loss_mlp": 0.01266914, "balance_loss_clip": 0.062769, "balance_loss_mlp": 0.01256292, "epoch": 0.8539906808958365, "flos": 26694853666560.0, "grad_norm": 2.2214488260295275, "language_loss": 0.65874922, "learning_rate": 2.1941247634800808e-07, "loss": 0.73561406, "num_input_tokens_seen": 306349085, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.10626221, "step": 14204, "time_per_iteration": 2.5850679874420166 }, { "auxiliary_loss_clip": 0.06421737, "auxiliary_loss_mlp": 0.01266228, "balance_loss_clip": 0.06278755, "balance_loss_mlp": 0.01256012, "epoch": 0.8540508041485044, "flos": 13369718419200.0, "grad_norm": 1.9669680700626249, "language_loss": 0.60264838, "learning_rate": 2.1923515445160667e-07, "loss": 0.67952806, "num_input_tokens_seen": 306365385, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10223389, "step": 14205, "time_per_iteration": 2.5255420207977295 }, { "auxiliary_loss_clip": 0.06411985, "auxiliary_loss_mlp": 0.0126073, "balance_loss_clip": 0.06274509, "balance_loss_mlp": 0.01251557, "epoch": 0.8541109274011724, "flos": 32789144499840.0, "grad_norm": 2.966326672660012, "language_loss": 0.72551918, "learning_rate": 2.1905790008198655e-07, "loss": 0.80224633, "num_input_tokens_seen": 306384585, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.0916748, "step": 14206, "time_per_iteration": 2.655914545059204 }, { "auxiliary_loss_clip": 0.06417751, "auxiliary_loss_mlp": 0.01268817, "balance_loss_clip": 0.06275601, "balance_loss_mlp": 0.01258982, "epoch": 0.8541710506538404, "flos": 17645022892800.0, "grad_norm": 3.0030329592977094, "language_loss": 0.7634939, "learning_rate": 2.1888071324586987e-07, "loss": 0.84035957, "num_input_tokens_seen": 306401565, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09838867, "step": 14207, "time_per_iteration": 2.535428285598755 }, { "auxiliary_loss_clip": 0.06417821, "auxiliary_loss_mlp": 0.01264394, "balance_loss_clip": 0.06275886, "balance_loss_mlp": 0.01254416, "epoch": 0.8542311739065084, "flos": 20268703537920.0, "grad_norm": 1.665720159409514, "language_loss": 0.85189998, "learning_rate": 2.1870359394997485e-07, "loss": 0.92872208, "num_input_tokens_seen": 306419995, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.09967041, "step": 14208, "time_per_iteration": 2.5749528408050537 }, { "auxiliary_loss_clip": 0.0641266, "auxiliary_loss_mlp": 0.01264239, "balance_loss_clip": 0.062728, "balance_loss_mlp": 0.01254541, "epoch": 0.8542912971591763, "flos": 17791491029760.0, "grad_norm": 1.4595722500281338, "language_loss": 0.66694391, "learning_rate": 2.1852654220101785e-07, "loss": 0.74371284, "num_input_tokens_seen": 306439240, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09698486, "step": 14209, "time_per_iteration": 2.516331195831299 }, { "auxiliary_loss_clip": 0.06410746, "auxiliary_loss_mlp": 0.01264175, "balance_loss_clip": 0.06272929, "balance_loss_mlp": 0.01254858, "epoch": 0.8543514204118443, "flos": 26986783691520.0, "grad_norm": 2.0393912183881455, "language_loss": 0.70408058, "learning_rate": 2.1834955800571287e-07, "loss": 0.78082979, "num_input_tokens_seen": 306458425, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09313965, "step": 14210, "time_per_iteration": 4.127155780792236 }, { "auxiliary_loss_clip": 0.06418161, "auxiliary_loss_mlp": 0.01267237, "balance_loss_clip": 0.06277673, "balance_loss_mlp": 0.01257563, "epoch": 0.8544115436645122, "flos": 24031453386240.0, "grad_norm": 1.4685760373873562, "language_loss": 0.70521849, "learning_rate": 2.1817264137077141e-07, "loss": 0.78207242, "num_input_tokens_seen": 306477210, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09674072, "step": 14211, "time_per_iteration": 2.6477677822113037 }, { "auxiliary_loss_clip": 0.06416775, "auxiliary_loss_mlp": 0.01264344, "balance_loss_clip": 0.06275433, "balance_loss_mlp": 0.01254289, "epoch": 0.8544716669171802, "flos": 16623603221760.0, "grad_norm": 2.5398010488374108, "language_loss": 0.815795, "learning_rate": 2.1799579230290166e-07, "loss": 0.8926062, "num_input_tokens_seen": 306495820, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.1005249, "step": 14212, "time_per_iteration": 2.5322823524475098 }, { "auxiliary_loss_clip": 0.06412193, "auxiliary_loss_mlp": 0.01264748, "balance_loss_clip": 0.0627149, "balance_loss_mlp": 0.01254097, "epoch": 0.8545317901698481, "flos": 40015376939520.0, "grad_norm": 2.5415349077661524, "language_loss": 0.66706944, "learning_rate": 2.178190108088105e-07, "loss": 0.74383879, "num_input_tokens_seen": 306516420, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.10656738, "step": 14213, "time_per_iteration": 2.7314839363098145 }, { "auxiliary_loss_clip": 0.06414168, "auxiliary_loss_mlp": 0.01263981, "balance_loss_clip": 0.06275824, "balance_loss_mlp": 0.0125445, "epoch": 0.8545919134225162, "flos": 19908822251520.0, "grad_norm": 3.1764286905143866, "language_loss": 0.78017437, "learning_rate": 2.1764229689520098e-07, "loss": 0.85695589, "num_input_tokens_seen": 306534785, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09533691, "step": 14214, "time_per_iteration": 2.54150128364563 }, { "auxiliary_loss_clip": 0.06420153, "auxiliary_loss_mlp": 0.0126605, "balance_loss_clip": 0.06276105, "balance_loss_mlp": 0.01255095, "epoch": 0.8546520366751841, "flos": 18958959567360.0, "grad_norm": 3.8696648822159734, "language_loss": 0.67136884, "learning_rate": 2.1746565056877397e-07, "loss": 0.74823081, "num_input_tokens_seen": 306552440, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10949707, "step": 14215, "time_per_iteration": 3.836559534072876 }, { "auxiliary_loss_clip": 0.06412166, "auxiliary_loss_mlp": 0.0126414, "balance_loss_clip": 0.06274211, "balance_loss_mlp": 0.01255092, "epoch": 0.8547121599278521, "flos": 35629298968320.0, "grad_norm": 8.04172056205364, "language_loss": 0.62395656, "learning_rate": 2.172890718362279e-07, "loss": 0.70071965, "num_input_tokens_seen": 306573600, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09051514, "step": 14216, "time_per_iteration": 2.7196762561798096 }, { "auxiliary_loss_clip": 0.0641829, "auxiliary_loss_mlp": 0.01263176, "balance_loss_clip": 0.06276573, "balance_loss_mlp": 0.01253413, "epoch": 0.8547722831805201, "flos": 16915742881920.0, "grad_norm": 1.9555529696004297, "language_loss": 0.6567049, "learning_rate": 2.17112560704259e-07, "loss": 0.73351943, "num_input_tokens_seen": 306592840, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09759521, "step": 14217, "time_per_iteration": 2.571322441101074 }, { "auxiliary_loss_clip": 0.06412337, "auxiliary_loss_mlp": 0.01264245, "balance_loss_clip": 0.06274627, "balance_loss_mlp": 0.01255477, "epoch": 0.854832406433188, "flos": 23009237101440.0, "grad_norm": 1.5741518347966283, "language_loss": 0.64991528, "learning_rate": 2.1693611717956072e-07, "loss": 0.72668111, "num_input_tokens_seen": 306613210, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.08764648, "step": 14218, "time_per_iteration": 2.6040050983428955 }, { "auxiliary_loss_clip": 0.06418926, "auxiliary_loss_mlp": 0.01268624, "balance_loss_clip": 0.06275702, "balance_loss_mlp": 0.0125901, "epoch": 0.854892529685856, "flos": 20418861254400.0, "grad_norm": 1.644326689162391, "language_loss": 0.70462656, "learning_rate": 2.167597412688238e-07, "loss": 0.78150213, "num_input_tokens_seen": 306631620, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09613037, "step": 14219, "time_per_iteration": 2.5922162532806396 }, { "auxiliary_loss_clip": 0.06420027, "auxiliary_loss_mlp": 0.01264357, "balance_loss_clip": 0.06276295, "balance_loss_mlp": 0.01254051, "epoch": 0.854952652938524, "flos": 16404236432640.0, "grad_norm": 2.266030061410146, "language_loss": 0.68054289, "learning_rate": 2.1658343297873549e-07, "loss": 0.75738668, "num_input_tokens_seen": 306646695, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10308838, "step": 14220, "time_per_iteration": 2.529036045074463 }, { "auxiliary_loss_clip": 0.06407659, "auxiliary_loss_mlp": 0.01261755, "balance_loss_clip": 0.06271348, "balance_loss_mlp": 0.01252576, "epoch": 0.855012776191192, "flos": 21185051788800.0, "grad_norm": 1.874243727062503, "language_loss": 0.71864021, "learning_rate": 2.164071923159827e-07, "loss": 0.79533434, "num_input_tokens_seen": 306665465, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.09185791, "step": 14221, "time_per_iteration": 2.541855812072754 }, { "auxiliary_loss_clip": 0.06412984, "auxiliary_loss_mlp": 0.0126502, "balance_loss_clip": 0.06272303, "balance_loss_mlp": 0.01255209, "epoch": 0.8550728994438599, "flos": 26148239556480.0, "grad_norm": 1.8089822742213055, "language_loss": 0.59662247, "learning_rate": 2.1623101928724763e-07, "loss": 0.67340249, "num_input_tokens_seen": 306685950, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.0980835, "step": 14222, "time_per_iteration": 2.568873167037964 }, { "auxiliary_loss_clip": 0.06413953, "auxiliary_loss_mlp": 0.01263433, "balance_loss_clip": 0.06276256, "balance_loss_mlp": 0.01254104, "epoch": 0.8551330226965279, "flos": 22793895308160.0, "grad_norm": 1.530332253261509, "language_loss": 0.84413129, "learning_rate": 2.1605491389921093e-07, "loss": 0.92090511, "num_input_tokens_seen": 306705740, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09326172, "step": 14223, "time_per_iteration": 2.5662636756896973 }, { "auxiliary_loss_clip": 0.06412077, "auxiliary_loss_mlp": 0.01264966, "balance_loss_clip": 0.06273639, "balance_loss_mlp": 0.0125531, "epoch": 0.8551931459491958, "flos": 22425586686720.0, "grad_norm": 1.540472427678053, "language_loss": 0.73945153, "learning_rate": 2.158788761585515e-07, "loss": 0.81622195, "num_input_tokens_seen": 306725065, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09667969, "step": 14224, "time_per_iteration": 2.550832748413086 }, { "auxiliary_loss_clip": 0.06416184, "auxiliary_loss_mlp": 0.01265056, "balance_loss_clip": 0.06276889, "balance_loss_mlp": 0.01255746, "epoch": 0.8552532692018638, "flos": 19579268943360.0, "grad_norm": 1.9269950458552245, "language_loss": 0.75539625, "learning_rate": 2.1570290607194307e-07, "loss": 0.83220863, "num_input_tokens_seen": 306743630, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09307861, "step": 14225, "time_per_iteration": 2.512772560119629 }, { "auxiliary_loss_clip": 0.06414564, "auxiliary_loss_mlp": 0.01264021, "balance_loss_clip": 0.06276575, "balance_loss_mlp": 0.01254919, "epoch": 0.8553133924545318, "flos": 26440043800320.0, "grad_norm": 1.6782524753962498, "language_loss": 0.7729615, "learning_rate": 2.1552700364605925e-07, "loss": 0.84974736, "num_input_tokens_seen": 306763105, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09100342, "step": 14226, "time_per_iteration": 2.606653928756714 }, { "auxiliary_loss_clip": 0.06423593, "auxiliary_loss_mlp": 0.01264736, "balance_loss_clip": 0.06278544, "balance_loss_mlp": 0.01254943, "epoch": 0.8553735157071998, "flos": 16367996741760.0, "grad_norm": 1.8922421567851122, "language_loss": 0.55108774, "learning_rate": 2.153511688875702e-07, "loss": 0.62797099, "num_input_tokens_seen": 306779875, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.09790039, "step": 14227, "time_per_iteration": 2.519007682800293 }, { "auxiliary_loss_clip": 0.06411887, "auxiliary_loss_mlp": 0.01265548, "balance_loss_clip": 0.06274596, "balance_loss_mlp": 0.0125588, "epoch": 0.8554336389598677, "flos": 20893750669440.0, "grad_norm": 1.9382860919812173, "language_loss": 0.6601966, "learning_rate": 2.151754018031442e-07, "loss": 0.73697096, "num_input_tokens_seen": 306800015, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09680176, "step": 14228, "time_per_iteration": 2.5519211292266846 }, { "auxiliary_loss_clip": 0.06418584, "auxiliary_loss_mlp": 0.01263899, "balance_loss_clip": 0.06275509, "balance_loss_mlp": 0.01253533, "epoch": 0.8554937622125357, "flos": 21290542479360.0, "grad_norm": 2.0639656855220894, "language_loss": 0.74219304, "learning_rate": 2.1499970239944542e-07, "loss": 0.81901789, "num_input_tokens_seen": 306814160, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10369873, "step": 14229, "time_per_iteration": 2.544097423553467 }, { "auxiliary_loss_clip": 0.06413168, "auxiliary_loss_mlp": 0.01263432, "balance_loss_clip": 0.06275794, "balance_loss_mlp": 0.01254479, "epoch": 0.8555538854652037, "flos": 22418752579200.0, "grad_norm": 1.7145186686311926, "language_loss": 0.73353618, "learning_rate": 2.1482407068313724e-07, "loss": 0.81030214, "num_input_tokens_seen": 306833310, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.08959961, "step": 14230, "time_per_iteration": 2.5708274841308594 }, { "auxiliary_loss_clip": 0.064174, "auxiliary_loss_mlp": 0.01264325, "balance_loss_clip": 0.06277852, "balance_loss_mlp": 0.01254806, "epoch": 0.8556140087178716, "flos": 20199955662720.0, "grad_norm": 1.769810917081937, "language_loss": 0.83178449, "learning_rate": 2.1464850666087897e-07, "loss": 0.9086017, "num_input_tokens_seen": 306851345, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09521484, "step": 14231, "time_per_iteration": 2.566772699356079 }, { "auxiliary_loss_clip": 0.06421743, "auxiliary_loss_mlp": 0.01265813, "balance_loss_clip": 0.0627913, "balance_loss_mlp": 0.01255192, "epoch": 0.8556741319705397, "flos": 22644743840640.0, "grad_norm": 1.9311339660300508, "language_loss": 0.68302304, "learning_rate": 2.1447301033932796e-07, "loss": 0.75989866, "num_input_tokens_seen": 306871040, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10620117, "step": 14232, "time_per_iteration": 2.6703364849090576 }, { "auxiliary_loss_clip": 0.06416844, "auxiliary_loss_mlp": 0.0126604, "balance_loss_clip": 0.0627393, "balance_loss_mlp": 0.01255907, "epoch": 0.8557342552232076, "flos": 23555935065600.0, "grad_norm": 1.5503455112096396, "language_loss": 0.67382592, "learning_rate": 2.1429758172513955e-07, "loss": 0.7506547, "num_input_tokens_seen": 306891625, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10131836, "step": 14233, "time_per_iteration": 2.5847620964050293 }, { "auxiliary_loss_clip": 0.06409214, "auxiliary_loss_mlp": 0.01262924, "balance_loss_clip": 0.06271602, "balance_loss_mlp": 0.01253966, "epoch": 0.8557943784758756, "flos": 19616011758720.0, "grad_norm": 1.6621611565722219, "language_loss": 0.77055812, "learning_rate": 2.1412222082496556e-07, "loss": 0.84727955, "num_input_tokens_seen": 306910020, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.08959961, "step": 14234, "time_per_iteration": 2.5635457038879395 }, { "auxiliary_loss_clip": 0.0631211, "auxiliary_loss_mlp": 0.01252944, "balance_loss_clip": 0.06256786, "balance_loss_mlp": 0.01251874, "epoch": 0.8558545017285435, "flos": 70660719527040.0, "grad_norm": 0.7453221649306829, "language_loss": 0.58092618, "learning_rate": 2.1394692764545684e-07, "loss": 0.65657669, "num_input_tokens_seen": 306969505, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01071167, "step": 14235, "time_per_iteration": 4.604978084564209 }, { "auxiliary_loss_clip": 0.06309026, "auxiliary_loss_mlp": 0.01252217, "balance_loss_clip": 0.06253843, "balance_loss_mlp": 0.01251248, "epoch": 0.8559146249812115, "flos": 56669586900480.0, "grad_norm": 0.7671191564234447, "language_loss": 0.56621063, "learning_rate": 2.1377170219325858e-07, "loss": 0.64182305, "num_input_tokens_seen": 307027710, "router_z_loss_clip": 0.55419922, "router_z_loss_mlp": 0.00967407, "step": 14236, "time_per_iteration": 3.065890312194824 }, { "auxiliary_loss_clip": 0.06414605, "auxiliary_loss_mlp": 0.01263632, "balance_loss_clip": 0.06275389, "balance_loss_mlp": 0.01253976, "epoch": 0.8559747482338794, "flos": 22894019337600.0, "grad_norm": 1.7579067579018668, "language_loss": 0.70794171, "learning_rate": 2.1359654447501673e-07, "loss": 0.78472412, "num_input_tokens_seen": 307045515, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09661865, "step": 14237, "time_per_iteration": 2.5689313411712646 }, { "auxiliary_loss_clip": 0.06413673, "auxiliary_loss_mlp": 0.01261684, "balance_loss_clip": 0.06274252, "balance_loss_mlp": 0.01252743, "epoch": 0.8560348714865474, "flos": 22608588003840.0, "grad_norm": 2.410912239253838, "language_loss": 0.63970876, "learning_rate": 2.1342145449737314e-07, "loss": 0.71646231, "num_input_tokens_seen": 307064470, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.0894165, "step": 14238, "time_per_iteration": 2.5761260986328125 }, { "auxiliary_loss_clip": 0.06408276, "auxiliary_loss_mlp": 0.01263669, "balance_loss_clip": 0.06272401, "balance_loss_mlp": 0.01254812, "epoch": 0.8560949947392154, "flos": 17937288334080.0, "grad_norm": 1.5217105442463752, "language_loss": 0.69348866, "learning_rate": 2.1324643226696648e-07, "loss": 0.77020806, "num_input_tokens_seen": 307083900, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.08862305, "step": 14239, "time_per_iteration": 2.5635693073272705 }, { "auxiliary_loss_clip": 0.06421224, "auxiliary_loss_mlp": 0.01265991, "balance_loss_clip": 0.06277072, "balance_loss_mlp": 0.01256204, "epoch": 0.8561551179918834, "flos": 31033623208320.0, "grad_norm": 3.723767773789602, "language_loss": 0.66607654, "learning_rate": 2.1307147779043455e-07, "loss": 0.74294865, "num_input_tokens_seen": 307104590, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.09790039, "step": 14240, "time_per_iteration": 4.019199371337891 }, { "auxiliary_loss_clip": 0.06413992, "auxiliary_loss_mlp": 0.01265637, "balance_loss_clip": 0.06272134, "balance_loss_mlp": 0.0125501, "epoch": 0.8562152412445513, "flos": 30673196870400.0, "grad_norm": 1.7632910514406193, "language_loss": 0.61981148, "learning_rate": 2.1289659107441182e-07, "loss": 0.69660777, "num_input_tokens_seen": 307125580, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10620117, "step": 14241, "time_per_iteration": 2.598151445388794 }, { "auxiliary_loss_clip": 0.0642009, "auxiliary_loss_mlp": 0.01267091, "balance_loss_clip": 0.06273752, "balance_loss_mlp": 0.01256362, "epoch": 0.8562753644972193, "flos": 31584094606080.0, "grad_norm": 1.4853822173547642, "language_loss": 0.74497259, "learning_rate": 2.1272177212552855e-07, "loss": 0.82184446, "num_input_tokens_seen": 307147625, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.10736084, "step": 14242, "time_per_iteration": 2.611501693725586 }, { "auxiliary_loss_clip": 0.06413826, "auxiliary_loss_mlp": 0.01268034, "balance_loss_clip": 0.06273032, "balance_loss_mlp": 0.01257943, "epoch": 0.8563354877498872, "flos": 26220844719360.0, "grad_norm": 1.8803761382555673, "language_loss": 0.76388597, "learning_rate": 2.1254702095041498e-07, "loss": 0.84070462, "num_input_tokens_seen": 307164665, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.10101318, "step": 14243, "time_per_iteration": 2.5690765380859375 }, { "auxiliary_loss_clip": 0.0641724, "auxiliary_loss_mlp": 0.01264998, "balance_loss_clip": 0.06276231, "balance_loss_mlp": 0.01255747, "epoch": 0.8563956110025552, "flos": 24141262561920.0, "grad_norm": 1.92110343675081, "language_loss": 0.68134117, "learning_rate": 2.123723375556974e-07, "loss": 0.75816351, "num_input_tokens_seen": 307182530, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.0925293, "step": 14244, "time_per_iteration": 2.572141408920288 }, { "auxiliary_loss_clip": 0.06309348, "auxiliary_loss_mlp": 0.0125132, "balance_loss_clip": 0.06253894, "balance_loss_mlp": 0.01250261, "epoch": 0.8564557342552233, "flos": 56289329072640.0, "grad_norm": 0.7472825178131345, "language_loss": 0.58398449, "learning_rate": 2.1219772194800046e-07, "loss": 0.65959126, "num_input_tokens_seen": 307241240, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01060486, "step": 14245, "time_per_iteration": 3.062880516052246 }, { "auxiliary_loss_clip": 0.06421906, "auxiliary_loss_mlp": 0.0126805, "balance_loss_clip": 0.06275992, "balance_loss_mlp": 0.01257529, "epoch": 0.8565158575078912, "flos": 23447341774080.0, "grad_norm": 1.9860875055081357, "language_loss": 0.77711749, "learning_rate": 2.1202317413394488e-07, "loss": 0.85401708, "num_input_tokens_seen": 307261485, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10516357, "step": 14246, "time_per_iteration": 2.5496366024017334 }, { "auxiliary_loss_clip": 0.06411048, "auxiliary_loss_mlp": 0.01262653, "balance_loss_clip": 0.06271669, "balance_loss_mlp": 0.01252997, "epoch": 0.8565759807605592, "flos": 20382160366080.0, "grad_norm": 2.3977744054066257, "language_loss": 0.82127392, "learning_rate": 2.1184869412014938e-07, "loss": 0.89801091, "num_input_tokens_seen": 307279160, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09655762, "step": 14247, "time_per_iteration": 2.531761407852173 }, { "auxiliary_loss_clip": 0.06415785, "auxiliary_loss_mlp": 0.01264815, "balance_loss_clip": 0.06275816, "balance_loss_mlp": 0.01254646, "epoch": 0.8566361040132271, "flos": 18813078408960.0, "grad_norm": 1.845968422113334, "language_loss": 0.77630007, "learning_rate": 2.1167428191323112e-07, "loss": 0.85310602, "num_input_tokens_seen": 307297920, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.10168457, "step": 14248, "time_per_iteration": 2.55375599861145 }, { "auxiliary_loss_clip": 0.06414859, "auxiliary_loss_mlp": 0.01263339, "balance_loss_clip": 0.0627303, "balance_loss_mlp": 0.0125354, "epoch": 0.8566962272658951, "flos": 24542289002880.0, "grad_norm": 1.838074273503159, "language_loss": 0.77937078, "learning_rate": 2.1149993751980278e-07, "loss": 0.85615277, "num_input_tokens_seen": 307318320, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.09802246, "step": 14249, "time_per_iteration": 4.0502119064331055 }, { "auxiliary_loss_clip": 0.06412359, "auxiliary_loss_mlp": 0.01264657, "balance_loss_clip": 0.06274602, "balance_loss_mlp": 0.01255371, "epoch": 0.856756350518563, "flos": 23184062645760.0, "grad_norm": 1.6427187271698425, "language_loss": 0.79040694, "learning_rate": 2.1132566094647597e-07, "loss": 0.86717713, "num_input_tokens_seen": 307336720, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09283447, "step": 14250, "time_per_iteration": 2.5509822368621826 }, { "auxiliary_loss_clip": 0.06410486, "auxiliary_loss_mlp": 0.01264479, "balance_loss_clip": 0.06275177, "balance_loss_mlp": 0.01256314, "epoch": 0.856816473771231, "flos": 20814017909760.0, "grad_norm": 1.8599487810641744, "language_loss": 0.80014527, "learning_rate": 2.1115145219985942e-07, "loss": 0.87689489, "num_input_tokens_seen": 307354120, "router_z_loss_clip": 1.35449219, "router_z_loss_mlp": 0.081604, "step": 14251, "time_per_iteration": 2.6025922298431396 }, { "auxiliary_loss_clip": 0.06413789, "auxiliary_loss_mlp": 0.01266058, "balance_loss_clip": 0.06273918, "balance_loss_mlp": 0.01256467, "epoch": 0.856876597023899, "flos": 20234057074560.0, "grad_norm": 1.8483905238980862, "language_loss": 0.61788118, "learning_rate": 2.1097731128656005e-07, "loss": 0.69467962, "num_input_tokens_seen": 307373165, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09588623, "step": 14252, "time_per_iteration": 2.575282573699951 }, { "auxiliary_loss_clip": 0.06416298, "auxiliary_loss_mlp": 0.01268815, "balance_loss_clip": 0.06273529, "balance_loss_mlp": 0.01258623, "epoch": 0.856936720276567, "flos": 18301991230080.0, "grad_norm": 1.6801301583239703, "language_loss": 0.69829804, "learning_rate": 2.1080323821317924e-07, "loss": 0.77514917, "num_input_tokens_seen": 307391000, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10198975, "step": 14253, "time_per_iteration": 2.526642322540283 }, { "auxiliary_loss_clip": 0.06307977, "auxiliary_loss_mlp": 0.0125225, "balance_loss_clip": 0.06252631, "balance_loss_mlp": 0.01251326, "epoch": 0.8569968435292349, "flos": 69897547739520.0, "grad_norm": 0.7815314003948485, "language_loss": 0.59254527, "learning_rate": 2.1062923298631907e-07, "loss": 0.66814756, "num_input_tokens_seen": 307452865, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00922394, "step": 14254, "time_per_iteration": 3.208101511001587 }, { "auxiliary_loss_clip": 0.06408529, "auxiliary_loss_mlp": 0.01265979, "balance_loss_clip": 0.06272152, "balance_loss_mlp": 0.01255286, "epoch": 0.8570569667819029, "flos": 25855680625920.0, "grad_norm": 1.7748914569964458, "language_loss": 0.81503803, "learning_rate": 2.1045529561257825e-07, "loss": 0.89178312, "num_input_tokens_seen": 307471940, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.10687256, "step": 14255, "time_per_iteration": 4.104235887527466 }, { "auxiliary_loss_clip": 0.06412682, "auxiliary_loss_mlp": 0.01261274, "balance_loss_clip": 0.06276542, "balance_loss_mlp": 0.01252166, "epoch": 0.8571170900345708, "flos": 23263627697280.0, "grad_norm": 2.0298994010265927, "language_loss": 0.67261273, "learning_rate": 2.1028142609855126e-07, "loss": 0.74935222, "num_input_tokens_seen": 307488745, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.09100342, "step": 14256, "time_per_iteration": 2.5675854682922363 }, { "auxiliary_loss_clip": 0.06418609, "auxiliary_loss_mlp": 0.01266302, "balance_loss_clip": 0.06276885, "balance_loss_mlp": 0.01256617, "epoch": 0.8571772132872388, "flos": 18923851906560.0, "grad_norm": 1.8168816799548773, "language_loss": 0.70101225, "learning_rate": 2.1010762445083218e-07, "loss": 0.7778613, "num_input_tokens_seen": 307506855, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.09674072, "step": 14257, "time_per_iteration": 2.535477638244629 }, { "auxiliary_loss_clip": 0.06414475, "auxiliary_loss_mlp": 0.01263074, "balance_loss_clip": 0.06275727, "balance_loss_mlp": 0.01253412, "epoch": 0.8572373365399069, "flos": 33257619077760.0, "grad_norm": 1.7493992807411944, "language_loss": 0.77460396, "learning_rate": 2.0993389067601197e-07, "loss": 0.85137939, "num_input_tokens_seen": 307526115, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09661865, "step": 14258, "time_per_iteration": 2.6220977306365967 }, { "auxiliary_loss_clip": 0.06417497, "auxiliary_loss_mlp": 0.01263996, "balance_loss_clip": 0.06281128, "balance_loss_mlp": 0.01254287, "epoch": 0.8572974597925748, "flos": 23333633383680.0, "grad_norm": 1.4543652930958793, "language_loss": 0.68064559, "learning_rate": 2.0976022478067735e-07, "loss": 0.75746047, "num_input_tokens_seen": 307545230, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09710693, "step": 14259, "time_per_iteration": 2.549281358718872 }, { "auxiliary_loss_clip": 0.06412989, "auxiliary_loss_mlp": 0.01264784, "balance_loss_clip": 0.06273451, "balance_loss_mlp": 0.01254723, "epoch": 0.8573575830452428, "flos": 24542875981440.0, "grad_norm": 1.7346282170456588, "language_loss": 0.76938802, "learning_rate": 2.0958662677141437e-07, "loss": 0.84616578, "num_input_tokens_seen": 307564900, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.10064697, "step": 14260, "time_per_iteration": 2.602891445159912 }, { "auxiliary_loss_clip": 0.06415632, "auxiliary_loss_mlp": 0.01264411, "balance_loss_clip": 0.06273559, "balance_loss_mlp": 0.01254612, "epoch": 0.8574177062979107, "flos": 24171422832000.0, "grad_norm": 1.754726512125019, "language_loss": 0.74478447, "learning_rate": 2.09413096654806e-07, "loss": 0.82158494, "num_input_tokens_seen": 307583500, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09783936, "step": 14261, "time_per_iteration": 2.5547592639923096 }, { "auxiliary_loss_clip": 0.06417315, "auxiliary_loss_mlp": 0.01265363, "balance_loss_clip": 0.0627429, "balance_loss_mlp": 0.0125473, "epoch": 0.8574778295505787, "flos": 17936449793280.0, "grad_norm": 1.9865392138015046, "language_loss": 0.79328656, "learning_rate": 2.0923963443743276e-07, "loss": 0.87011337, "num_input_tokens_seen": 307601430, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10632324, "step": 14262, "time_per_iteration": 2.5057804584503174 }, { "auxiliary_loss_clip": 0.06410415, "auxiliary_loss_mlp": 0.01266017, "balance_loss_clip": 0.06274004, "balance_loss_mlp": 0.01256647, "epoch": 0.8575379528032466, "flos": 21587252186880.0, "grad_norm": 1.7950166854769198, "language_loss": 0.68419367, "learning_rate": 2.0906624012587203e-07, "loss": 0.76095802, "num_input_tokens_seen": 307621495, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09375, "step": 14263, "time_per_iteration": 2.554340362548828 }, { "auxiliary_loss_clip": 0.0641516, "auxiliary_loss_mlp": 0.01262218, "balance_loss_clip": 0.06273675, "balance_loss_mlp": 0.01252401, "epoch": 0.8575980760559146, "flos": 21767905589760.0, "grad_norm": 1.521557552047708, "language_loss": 0.79909647, "learning_rate": 2.088929137266986e-07, "loss": 0.87587035, "num_input_tokens_seen": 307640840, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09820557, "step": 14264, "time_per_iteration": 2.5750837326049805 }, { "auxiliary_loss_clip": 0.06417279, "auxiliary_loss_mlp": 0.01267229, "balance_loss_clip": 0.06277379, "balance_loss_mlp": 0.01258241, "epoch": 0.8576581993085826, "flos": 34395011199360.0, "grad_norm": 1.226961056509452, "language_loss": 0.69733489, "learning_rate": 2.0871965524648582e-07, "loss": 0.77417994, "num_input_tokens_seen": 307663820, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.08990479, "step": 14265, "time_per_iteration": 2.666660785675049 }, { "auxiliary_loss_clip": 0.06408727, "auxiliary_loss_mlp": 0.012624, "balance_loss_clip": 0.06273684, "balance_loss_mlp": 0.01252929, "epoch": 0.8577183225612506, "flos": 23229316650240.0, "grad_norm": 1.7443784419767552, "language_loss": 0.66246372, "learning_rate": 2.085464646918027e-07, "loss": 0.73917496, "num_input_tokens_seen": 307682385, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.09472656, "step": 14266, "time_per_iteration": 2.6119072437286377 }, { "auxiliary_loss_clip": 0.06410129, "auxiliary_loss_mlp": 0.01265668, "balance_loss_clip": 0.0627261, "balance_loss_mlp": 0.01256507, "epoch": 0.8577784458139185, "flos": 28811807544960.0, "grad_norm": 2.564154021140919, "language_loss": 0.75800365, "learning_rate": 2.0837334206921731e-07, "loss": 0.83476162, "num_input_tokens_seen": 307704680, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.0916748, "step": 14267, "time_per_iteration": 2.627362012863159 }, { "auxiliary_loss_clip": 0.06410894, "auxiliary_loss_mlp": 0.01264739, "balance_loss_clip": 0.06273793, "balance_loss_mlp": 0.01255309, "epoch": 0.8578385690665865, "flos": 19761683281920.0, "grad_norm": 2.134534071015895, "language_loss": 0.88089889, "learning_rate": 2.082002873852946e-07, "loss": 0.95765519, "num_input_tokens_seen": 307723245, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09423828, "step": 14268, "time_per_iteration": 2.5357134342193604 }, { "auxiliary_loss_clip": 0.06421314, "auxiliary_loss_mlp": 0.01266868, "balance_loss_clip": 0.06278615, "balance_loss_mlp": 0.01256747, "epoch": 0.8578986923192544, "flos": 20710330081920.0, "grad_norm": 1.7130125361369313, "language_loss": 0.73245502, "learning_rate": 2.0802730064659667e-07, "loss": 0.80933684, "num_input_tokens_seen": 307742510, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10119629, "step": 14269, "time_per_iteration": 2.5679705142974854 }, { "auxiliary_loss_clip": 0.06421274, "auxiliary_loss_mlp": 0.01265114, "balance_loss_clip": 0.06279477, "balance_loss_mlp": 0.01255255, "epoch": 0.8579588155719224, "flos": 36110645147520.0, "grad_norm": 1.5529000512546658, "language_loss": 0.668504, "learning_rate": 2.0785438185968252e-07, "loss": 0.74536788, "num_input_tokens_seen": 307766030, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.09851074, "step": 14270, "time_per_iteration": 2.6848177909851074 }, { "auxiliary_loss_clip": 0.06410742, "auxiliary_loss_mlp": 0.01266743, "balance_loss_clip": 0.0627297, "balance_loss_mlp": 0.01257182, "epoch": 0.8580189388245905, "flos": 22859540582400.0, "grad_norm": 1.5316214467474236, "language_loss": 0.73973536, "learning_rate": 2.0768153103110997e-07, "loss": 0.8165102, "num_input_tokens_seen": 307785800, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09558105, "step": 14271, "time_per_iteration": 2.5702054500579834 }, { "auxiliary_loss_clip": 0.06309575, "auxiliary_loss_mlp": 0.01252439, "balance_loss_clip": 0.06254199, "balance_loss_mlp": 0.01251404, "epoch": 0.8580790620772584, "flos": 69664414152960.0, "grad_norm": 0.7777311440122044, "language_loss": 0.57781899, "learning_rate": 2.0750874816743358e-07, "loss": 0.65343916, "num_input_tokens_seen": 307850995, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01035309, "step": 14272, "time_per_iteration": 3.2031381130218506 }, { "auxiliary_loss_clip": 0.06420095, "auxiliary_loss_mlp": 0.01263805, "balance_loss_clip": 0.06274334, "balance_loss_mlp": 0.01253964, "epoch": 0.8581391853299264, "flos": 13339306586880.0, "grad_norm": 1.759194789306104, "language_loss": 0.75977087, "learning_rate": 2.0733603327520499e-07, "loss": 0.83660984, "num_input_tokens_seen": 307868585, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.09838867, "step": 14273, "time_per_iteration": 2.514240026473999 }, { "auxiliary_loss_clip": 0.06413415, "auxiliary_loss_mlp": 0.01265261, "balance_loss_clip": 0.06275091, "balance_loss_mlp": 0.01256213, "epoch": 0.8581993085825943, "flos": 19651664471040.0, "grad_norm": 1.7477179734626367, "language_loss": 0.8182739, "learning_rate": 2.0716338636097385e-07, "loss": 0.89506066, "num_input_tokens_seen": 307886820, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09051514, "step": 14274, "time_per_iteration": 2.5303351879119873 }, { "auxiliary_loss_clip": 0.0630926, "auxiliary_loss_mlp": 0.01251897, "balance_loss_clip": 0.06253839, "balance_loss_mlp": 0.01250945, "epoch": 0.8582594318352623, "flos": 55840826494080.0, "grad_norm": 0.7827965605181735, "language_loss": 0.60898429, "learning_rate": 2.0699080743128672e-07, "loss": 0.68459582, "num_input_tokens_seen": 307944020, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00950623, "step": 14275, "time_per_iteration": 4.607723236083984 }, { "auxiliary_loss_clip": 0.0641632, "auxiliary_loss_mlp": 0.01262667, "balance_loss_clip": 0.06274582, "balance_loss_mlp": 0.01252761, "epoch": 0.8583195550879302, "flos": 24286389033600.0, "grad_norm": 1.9226709343061197, "language_loss": 0.59624743, "learning_rate": 2.0681829649268768e-07, "loss": 0.67303729, "num_input_tokens_seen": 307961055, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09912109, "step": 14276, "time_per_iteration": 2.564465045928955 }, { "auxiliary_loss_clip": 0.06411573, "auxiliary_loss_mlp": 0.01263468, "balance_loss_clip": 0.06272458, "balance_loss_mlp": 0.01253663, "epoch": 0.8583796783405983, "flos": 13449283470720.0, "grad_norm": 1.9263158441087107, "language_loss": 0.76387239, "learning_rate": 2.0664585355171838e-07, "loss": 0.84062278, "num_input_tokens_seen": 307978690, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.0980835, "step": 14277, "time_per_iteration": 2.5258078575134277 }, { "auxiliary_loss_clip": 0.064136, "auxiliary_loss_mlp": 0.01268216, "balance_loss_clip": 0.06274546, "balance_loss_mlp": 0.01257887, "epoch": 0.8584398015932662, "flos": 16185833965440.0, "grad_norm": 1.5240033041496697, "language_loss": 0.83708262, "learning_rate": 2.0647347861491803e-07, "loss": 0.91390079, "num_input_tokens_seen": 307995870, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.10327148, "step": 14278, "time_per_iteration": 2.5429909229278564 }, { "auxiliary_loss_clip": 0.06419709, "auxiliary_loss_mlp": 0.01267445, "balance_loss_clip": 0.06275229, "balance_loss_mlp": 0.01256806, "epoch": 0.8584999248459342, "flos": 17455061687040.0, "grad_norm": 1.9476370760246788, "language_loss": 0.75086868, "learning_rate": 2.0630117168882366e-07, "loss": 0.82774019, "num_input_tokens_seen": 308013645, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10644531, "step": 14279, "time_per_iteration": 2.6117186546325684 }, { "auxiliary_loss_clip": 0.06410453, "auxiliary_loss_mlp": 0.01265097, "balance_loss_clip": 0.06272767, "balance_loss_mlp": 0.01256335, "epoch": 0.8585600480986021, "flos": 23447802971520.0, "grad_norm": 3.391073977644334, "language_loss": 0.67017925, "learning_rate": 2.0612893277996845e-07, "loss": 0.74693477, "num_input_tokens_seen": 308032490, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.08752441, "step": 14280, "time_per_iteration": 4.00246787071228 }, { "auxiliary_loss_clip": 0.06410781, "auxiliary_loss_mlp": 0.01263779, "balance_loss_clip": 0.06272824, "balance_loss_mlp": 0.012546, "epoch": 0.8586201713512701, "flos": 19944055693440.0, "grad_norm": 1.622683189286268, "language_loss": 0.62534249, "learning_rate": 2.0595676189488343e-07, "loss": 0.702088, "num_input_tokens_seen": 308052110, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09179688, "step": 14281, "time_per_iteration": 2.6337196826934814 }, { "auxiliary_loss_clip": 0.06419203, "auxiliary_loss_mlp": 0.01265152, "balance_loss_clip": 0.06278975, "balance_loss_mlp": 0.01255281, "epoch": 0.858680294603938, "flos": 15310211598720.0, "grad_norm": 1.6419236406766917, "language_loss": 0.73406112, "learning_rate": 2.0578465904009845e-07, "loss": 0.81090462, "num_input_tokens_seen": 308070660, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09863281, "step": 14282, "time_per_iteration": 2.53110933303833 }, { "auxiliary_loss_clip": 0.06408425, "auxiliary_loss_mlp": 0.01263697, "balance_loss_clip": 0.06270134, "balance_loss_mlp": 0.01254816, "epoch": 0.858740417856606, "flos": 22717894055040.0, "grad_norm": 2.268924740810928, "language_loss": 0.75893825, "learning_rate": 2.0561262422213832e-07, "loss": 0.83565944, "num_input_tokens_seen": 308089520, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.08874512, "step": 14283, "time_per_iteration": 2.5577752590179443 }, { "auxiliary_loss_clip": 0.06411567, "auxiliary_loss_mlp": 0.01262322, "balance_loss_clip": 0.06272614, "balance_loss_mlp": 0.01253179, "epoch": 0.8588005411092741, "flos": 34062187582080.0, "grad_norm": 1.8828754620197676, "language_loss": 0.60128349, "learning_rate": 2.0544065744752736e-07, "loss": 0.67802238, "num_input_tokens_seen": 308111545, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.0914917, "step": 14284, "time_per_iteration": 2.727745294570923 }, { "auxiliary_loss_clip": 0.06410971, "auxiliary_loss_mlp": 0.01264944, "balance_loss_clip": 0.06274852, "balance_loss_mlp": 0.01255777, "epoch": 0.858860664361942, "flos": 28921239377280.0, "grad_norm": 1.6889948212090975, "language_loss": 0.75602841, "learning_rate": 2.0526875872278749e-07, "loss": 0.83278751, "num_input_tokens_seen": 308129690, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.0916748, "step": 14285, "time_per_iteration": 2.6000869274139404 }, { "auxiliary_loss_clip": 0.06415698, "auxiliary_loss_mlp": 0.01265411, "balance_loss_clip": 0.06274126, "balance_loss_mlp": 0.01255684, "epoch": 0.85892078761461, "flos": 19798719586560.0, "grad_norm": 1.7990759116901982, "language_loss": 0.74329853, "learning_rate": 2.0509692805443524e-07, "loss": 0.82010961, "num_input_tokens_seen": 308147410, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.097229, "step": 14286, "time_per_iteration": 2.5505800247192383 }, { "auxiliary_loss_clip": 0.06311392, "auxiliary_loss_mlp": 0.01254082, "balance_loss_clip": 0.06256045, "balance_loss_mlp": 0.0125302, "epoch": 0.8589809108672779, "flos": 67125512240640.0, "grad_norm": 0.7514239831032667, "language_loss": 0.49311721, "learning_rate": 2.0492516544898718e-07, "loss": 0.56877196, "num_input_tokens_seen": 308204875, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01063538, "step": 14287, "time_per_iteration": 3.1274526119232178 }, { "auxiliary_loss_clip": 0.06413671, "auxiliary_loss_mlp": 0.01264573, "balance_loss_clip": 0.0627323, "balance_loss_mlp": 0.01254714, "epoch": 0.8590410341199459, "flos": 29724046945920.0, "grad_norm": 1.6536529474194734, "language_loss": 0.79526746, "learning_rate": 2.0475347091295704e-07, "loss": 0.87204999, "num_input_tokens_seen": 308225690, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09857178, "step": 14288, "time_per_iteration": 2.6220703125 }, { "auxiliary_loss_clip": 0.0641991, "auxiliary_loss_mlp": 0.01266038, "balance_loss_clip": 0.0627832, "balance_loss_mlp": 0.01255625, "epoch": 0.8591011573726138, "flos": 23994165519360.0, "grad_norm": 1.9400535758274366, "language_loss": 0.80813885, "learning_rate": 2.045818444528553e-07, "loss": 0.88499832, "num_input_tokens_seen": 308245255, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.10400391, "step": 14289, "time_per_iteration": 4.064080238342285 }, { "auxiliary_loss_clip": 0.06415579, "auxiliary_loss_mlp": 0.0126445, "balance_loss_clip": 0.06276228, "balance_loss_mlp": 0.012548, "epoch": 0.8591612806252819, "flos": 14433876472320.0, "grad_norm": 1.7837015908673939, "language_loss": 0.65523827, "learning_rate": 2.0441028607518973e-07, "loss": 0.73203856, "num_input_tokens_seen": 308261755, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09649658, "step": 14290, "time_per_iteration": 2.5542006492614746 }, { "auxiliary_loss_clip": 0.0641858, "auxiliary_loss_mlp": 0.0126204, "balance_loss_clip": 0.06274732, "balance_loss_mlp": 0.01252164, "epoch": 0.8592214038779498, "flos": 31585268563200.0, "grad_norm": 1.7961883365785734, "language_loss": 0.55855465, "learning_rate": 2.0423879578646642e-07, "loss": 0.63536084, "num_input_tokens_seen": 308285145, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.09869385, "step": 14291, "time_per_iteration": 2.6262528896331787 }, { "auxiliary_loss_clip": 0.06416568, "auxiliary_loss_mlp": 0.01263041, "balance_loss_clip": 0.06276543, "balance_loss_mlp": 0.01253373, "epoch": 0.8592815271306178, "flos": 17463069751680.0, "grad_norm": 2.1356096100790976, "language_loss": 0.71882081, "learning_rate": 2.0406737359318792e-07, "loss": 0.79561692, "num_input_tokens_seen": 308304130, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09667969, "step": 14292, "time_per_iteration": 2.5443315505981445 }, { "auxiliary_loss_clip": 0.06412055, "auxiliary_loss_mlp": 0.01267118, "balance_loss_clip": 0.06270824, "balance_loss_mlp": 0.01257855, "epoch": 0.8593416503832857, "flos": 25418498348160.0, "grad_norm": 1.486300399123454, "language_loss": 0.71188551, "learning_rate": 2.038960195018542e-07, "loss": 0.78867722, "num_input_tokens_seen": 308324670, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.0927124, "step": 14293, "time_per_iteration": 2.5790112018585205 }, { "auxiliary_loss_clip": 0.06411408, "auxiliary_loss_mlp": 0.01262367, "balance_loss_clip": 0.06273632, "balance_loss_mlp": 0.01253403, "epoch": 0.8594017736359537, "flos": 21003056720640.0, "grad_norm": 1.8252552698194702, "language_loss": 0.68735766, "learning_rate": 2.0372473351896358e-07, "loss": 0.76409543, "num_input_tokens_seen": 308344215, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.08959961, "step": 14294, "time_per_iteration": 2.550305128097534 }, { "auxiliary_loss_clip": 0.06408083, "auxiliary_loss_mlp": 0.01263113, "balance_loss_clip": 0.06271939, "balance_loss_mlp": 0.01253892, "epoch": 0.8594618968886216, "flos": 22097626606080.0, "grad_norm": 1.9947295236000555, "language_loss": 0.78173405, "learning_rate": 2.0355351565101087e-07, "loss": 0.858446, "num_input_tokens_seen": 308360520, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.09222412, "step": 14295, "time_per_iteration": 4.00639796257019 }, { "auxiliary_loss_clip": 0.06422812, "auxiliary_loss_mlp": 0.01265522, "balance_loss_clip": 0.06278118, "balance_loss_mlp": 0.01254233, "epoch": 0.8595220201412896, "flos": 11661086286720.0, "grad_norm": 6.109829074833062, "language_loss": 0.68826878, "learning_rate": 2.0338236590448975e-07, "loss": 0.7651521, "num_input_tokens_seen": 308376865, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.112854, "step": 14296, "time_per_iteration": 2.519380569458008 }, { "auxiliary_loss_clip": 0.06415439, "auxiliary_loss_mlp": 0.01265689, "balance_loss_clip": 0.06276022, "balance_loss_mlp": 0.01255681, "epoch": 0.8595821433939577, "flos": 25046416293120.0, "grad_norm": 2.085032078281635, "language_loss": 0.79365849, "learning_rate": 2.0321128428588842e-07, "loss": 0.87046975, "num_input_tokens_seen": 308395870, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.10015869, "step": 14297, "time_per_iteration": 2.605316400527954 }, { "auxiliary_loss_clip": 0.06408617, "auxiliary_loss_mlp": 0.01267453, "balance_loss_clip": 0.06271341, "balance_loss_mlp": 0.01258798, "epoch": 0.8596422666466256, "flos": 28518997052160.0, "grad_norm": 1.621834296444293, "language_loss": 0.68223166, "learning_rate": 2.030402708016954e-07, "loss": 0.75899237, "num_input_tokens_seen": 308417250, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.08654785, "step": 14298, "time_per_iteration": 2.6042473316192627 }, { "auxiliary_loss_clip": 0.06413101, "auxiliary_loss_mlp": 0.01261767, "balance_loss_clip": 0.06276837, "balance_loss_mlp": 0.01252612, "epoch": 0.8597023898992936, "flos": 13594158380160.0, "grad_norm": 1.909253445066946, "language_loss": 0.6835264, "learning_rate": 2.0286932545839576e-07, "loss": 0.76027507, "num_input_tokens_seen": 308434565, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.0914917, "step": 14299, "time_per_iteration": 2.5067954063415527 }, { "auxiliary_loss_clip": 0.06416532, "auxiliary_loss_mlp": 0.01264953, "balance_loss_clip": 0.06275004, "balance_loss_mlp": 0.01254821, "epoch": 0.8597625131519615, "flos": 32308049882880.0, "grad_norm": 2.438928330178296, "language_loss": 0.71448767, "learning_rate": 2.0269844826247096e-07, "loss": 0.79130244, "num_input_tokens_seen": 308450040, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.10137939, "step": 14300, "time_per_iteration": 2.5916807651519775 }, { "auxiliary_loss_clip": 0.0641398, "auxiliary_loss_mlp": 0.01268548, "balance_loss_clip": 0.06275558, "balance_loss_mlp": 0.01259327, "epoch": 0.8598226364046295, "flos": 28737860716800.0, "grad_norm": 1.4586694691420763, "language_loss": 0.69456613, "learning_rate": 2.0252763922040116e-07, "loss": 0.77139139, "num_input_tokens_seen": 308470545, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09222412, "step": 14301, "time_per_iteration": 2.5992746353149414 }, { "auxiliary_loss_clip": 0.06416556, "auxiliary_loss_mlp": 0.01265154, "balance_loss_clip": 0.06276515, "balance_loss_mlp": 0.0125632, "epoch": 0.8598827596572974, "flos": 21878301744000.0, "grad_norm": 1.6642128235829867, "language_loss": 0.74357706, "learning_rate": 2.023568983386641e-07, "loss": 0.82039416, "num_input_tokens_seen": 308490020, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.08825684, "step": 14302, "time_per_iteration": 2.532212972640991 }, { "auxiliary_loss_clip": 0.06412241, "auxiliary_loss_mlp": 0.01260864, "balance_loss_clip": 0.06275802, "balance_loss_mlp": 0.01251762, "epoch": 0.8599428829099655, "flos": 23773792481280.0, "grad_norm": 1.6944235183482135, "language_loss": 0.8389284, "learning_rate": 2.02186225623733e-07, "loss": 0.91565943, "num_input_tokens_seen": 308509065, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09106445, "step": 14303, "time_per_iteration": 2.5583019256591797 }, { "auxiliary_loss_clip": 0.0641797, "auxiliary_loss_mlp": 0.01265735, "balance_loss_clip": 0.06275935, "balance_loss_mlp": 0.01255489, "epoch": 0.8600030061626334, "flos": 16217671317120.0, "grad_norm": 2.812135289427334, "language_loss": 0.77198756, "learning_rate": 2.0201562108208025e-07, "loss": 0.84882456, "num_input_tokens_seen": 308524725, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10247803, "step": 14304, "time_per_iteration": 2.5102686882019043 }, { "auxiliary_loss_clip": 0.06417072, "auxiliary_loss_mlp": 0.01265956, "balance_loss_clip": 0.06276476, "balance_loss_mlp": 0.01255662, "epoch": 0.8600631294153014, "flos": 15674830640640.0, "grad_norm": 2.146301809530819, "language_loss": 0.5491159, "learning_rate": 2.0184508472017537e-07, "loss": 0.62594616, "num_input_tokens_seen": 308543525, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.1027832, "step": 14305, "time_per_iteration": 2.499819755554199 }, { "auxiliary_loss_clip": 0.06412756, "auxiliary_loss_mlp": 0.01267198, "balance_loss_clip": 0.06274457, "balance_loss_mlp": 0.01257233, "epoch": 0.8601232526679693, "flos": 17498764391040.0, "grad_norm": 1.8979784295911812, "language_loss": 0.83554816, "learning_rate": 2.0167461654448558e-07, "loss": 0.91234773, "num_input_tokens_seen": 308557995, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09967041, "step": 14306, "time_per_iteration": 2.4862258434295654 }, { "auxiliary_loss_clip": 0.06410141, "auxiliary_loss_mlp": 0.01264842, "balance_loss_clip": 0.06273222, "balance_loss_mlp": 0.01256051, "epoch": 0.8601833759206373, "flos": 26994288631680.0, "grad_norm": 1.3301499427888734, "language_loss": 0.71527976, "learning_rate": 2.01504216561474e-07, "loss": 0.79202962, "num_input_tokens_seen": 308582750, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.0880127, "step": 14307, "time_per_iteration": 2.641495704650879 }, { "auxiliary_loss_clip": 0.0642516, "auxiliary_loss_mlp": 0.01267935, "balance_loss_clip": 0.06280294, "balance_loss_mlp": 0.01257212, "epoch": 0.8602434991733052, "flos": 25237006404480.0, "grad_norm": 1.6098013837413108, "language_loss": 0.63687289, "learning_rate": 2.0133388477760316e-07, "loss": 0.71380377, "num_input_tokens_seen": 308603770, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.1071167, "step": 14308, "time_per_iteration": 2.578606605529785 }, { "auxiliary_loss_clip": 0.06310692, "auxiliary_loss_mlp": 0.01253923, "balance_loss_clip": 0.06255388, "balance_loss_mlp": 0.01253047, "epoch": 0.8603036224259732, "flos": 71035694547840.0, "grad_norm": 0.7640908234935522, "language_loss": 0.48267841, "learning_rate": 2.0116362119933172e-07, "loss": 0.55832458, "num_input_tokens_seen": 308667735, "router_z_loss_clip": 0.55419922, "router_z_loss_mlp": 0.0087738, "step": 14309, "time_per_iteration": 3.282708168029785 }, { "auxiliary_loss_clip": 0.06420079, "auxiliary_loss_mlp": 0.01265998, "balance_loss_clip": 0.06278304, "balance_loss_mlp": 0.0125515, "epoch": 0.8603637456786413, "flos": 20306452602240.0, "grad_norm": 2.2137174735227005, "language_loss": 0.67413002, "learning_rate": 2.0099342583311563e-07, "loss": 0.75099075, "num_input_tokens_seen": 308686300, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.10845947, "step": 14310, "time_per_iteration": 2.5476317405700684 }, { "auxiliary_loss_clip": 0.06412244, "auxiliary_loss_mlp": 0.01262618, "balance_loss_clip": 0.06270984, "balance_loss_mlp": 0.01253874, "epoch": 0.8604238689313092, "flos": 21842397469440.0, "grad_norm": 1.7875152314766551, "language_loss": 0.7918222, "learning_rate": 2.0082329868540905e-07, "loss": 0.86857086, "num_input_tokens_seen": 308705825, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.08740234, "step": 14311, "time_per_iteration": 2.571974039077759 }, { "auxiliary_loss_clip": 0.06413357, "auxiliary_loss_mlp": 0.01262672, "balance_loss_clip": 0.06273923, "balance_loss_mlp": 0.01253464, "epoch": 0.8604839921839772, "flos": 18010019278080.0, "grad_norm": 2.7355775799493047, "language_loss": 0.72143775, "learning_rate": 2.006532397626639e-07, "loss": 0.79819804, "num_input_tokens_seen": 308723340, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09204102, "step": 14312, "time_per_iteration": 2.515775442123413 }, { "auxiliary_loss_clip": 0.0641167, "auxiliary_loss_mlp": 0.01264479, "balance_loss_clip": 0.06272811, "balance_loss_mlp": 0.01254894, "epoch": 0.8605441154366451, "flos": 16257558660480.0, "grad_norm": 1.9094554502733216, "language_loss": 0.78246897, "learning_rate": 2.0048324907132797e-07, "loss": 0.85923046, "num_input_tokens_seen": 308741280, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09588623, "step": 14313, "time_per_iteration": 2.557354688644409 }, { "auxiliary_loss_clip": 0.06414323, "auxiliary_loss_mlp": 0.01265978, "balance_loss_clip": 0.06277224, "balance_loss_mlp": 0.01255517, "epoch": 0.8606042386893131, "flos": 32274745084800.0, "grad_norm": 1.3984353855270182, "language_loss": 0.72781813, "learning_rate": 2.003133266178474e-07, "loss": 0.80462116, "num_input_tokens_seen": 308762875, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.10467529, "step": 14314, "time_per_iteration": 4.099895000457764 }, { "auxiliary_loss_clip": 0.06411016, "auxiliary_loss_mlp": 0.01265006, "balance_loss_clip": 0.06272157, "balance_loss_mlp": 0.0125552, "epoch": 0.860664361941981, "flos": 20235943791360.0, "grad_norm": 1.8216181464872805, "language_loss": 0.69328403, "learning_rate": 2.001434724086657e-07, "loss": 0.77004427, "num_input_tokens_seen": 308780315, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09481812, "step": 14315, "time_per_iteration": 2.5302608013153076 }, { "auxiliary_loss_clip": 0.06413917, "auxiliary_loss_mlp": 0.01266926, "balance_loss_clip": 0.06275365, "balance_loss_mlp": 0.012573, "epoch": 0.8607244851946491, "flos": 25198586507520.0, "grad_norm": 1.6869453380630834, "language_loss": 0.72154945, "learning_rate": 1.9997368645022418e-07, "loss": 0.7983579, "num_input_tokens_seen": 308799435, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09631348, "step": 14316, "time_per_iteration": 2.5729100704193115 }, { "auxiliary_loss_clip": 0.06418843, "auxiliary_loss_mlp": 0.0126855, "balance_loss_clip": 0.06277655, "balance_loss_mlp": 0.01258656, "epoch": 0.860784608447317, "flos": 20487776837760.0, "grad_norm": 3.4096424479336607, "language_loss": 0.82866251, "learning_rate": 1.9980396874896056e-07, "loss": 0.90553653, "num_input_tokens_seen": 308817730, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09887695, "step": 14317, "time_per_iteration": 2.5395219326019287 }, { "auxiliary_loss_clip": 0.06410518, "auxiliary_loss_mlp": 0.01264647, "balance_loss_clip": 0.06273268, "balance_loss_mlp": 0.01255444, "epoch": 0.860844731699985, "flos": 50487192627840.0, "grad_norm": 1.8263584350166688, "language_loss": 0.67504519, "learning_rate": 1.996343193113108e-07, "loss": 0.75179684, "num_input_tokens_seen": 308841735, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09204102, "step": 14318, "time_per_iteration": 2.780439615249634 }, { "auxiliary_loss_clip": 0.06411888, "auxiliary_loss_mlp": 0.01262604, "balance_loss_clip": 0.06275444, "balance_loss_mlp": 0.01253866, "epoch": 0.8609048549526529, "flos": 41182468133760.0, "grad_norm": 1.7625886723843547, "language_loss": 0.71634185, "learning_rate": 1.9946473814370911e-07, "loss": 0.79308677, "num_input_tokens_seen": 308865050, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.08734131, "step": 14319, "time_per_iteration": 4.1640002727508545 }, { "auxiliary_loss_clip": 0.06418197, "auxiliary_loss_mlp": 0.01264513, "balance_loss_clip": 0.06276813, "balance_loss_mlp": 0.01254452, "epoch": 0.8609649782053209, "flos": 23957967755520.0, "grad_norm": 1.5788922215307757, "language_loss": 0.67079973, "learning_rate": 1.992952252525839e-07, "loss": 0.74762678, "num_input_tokens_seen": 308885375, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.10058594, "step": 14320, "time_per_iteration": 2.5540904998779297 }, { "auxiliary_loss_clip": 0.06417942, "auxiliary_loss_mlp": 0.01263156, "balance_loss_clip": 0.06275924, "balance_loss_mlp": 0.01252934, "epoch": 0.8610251014579888, "flos": 23119297839360.0, "grad_norm": 1.9102730329333746, "language_loss": 0.79888248, "learning_rate": 1.9912578064436446e-07, "loss": 0.87569344, "num_input_tokens_seen": 308904700, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10223389, "step": 14321, "time_per_iteration": 2.536015510559082 }, { "auxiliary_loss_clip": 0.06408156, "auxiliary_loss_mlp": 0.01265506, "balance_loss_clip": 0.06274563, "balance_loss_mlp": 0.01256404, "epoch": 0.8610852247106568, "flos": 19432800806400.0, "grad_norm": 2.023286200412068, "language_loss": 0.71907771, "learning_rate": 1.9895640432547567e-07, "loss": 0.79581428, "num_input_tokens_seen": 308922985, "router_z_loss_clip": 1.33496094, "router_z_loss_mlp": 0.09100342, "step": 14322, "time_per_iteration": 2.507040500640869 }, { "auxiliary_loss_clip": 0.06422131, "auxiliary_loss_mlp": 0.01266467, "balance_loss_clip": 0.0627653, "balance_loss_mlp": 0.01255792, "epoch": 0.8611453479633249, "flos": 19317163772160.0, "grad_norm": 1.707255364884784, "language_loss": 0.56402683, "learning_rate": 1.9878709630234102e-07, "loss": 0.64091277, "num_input_tokens_seen": 308940765, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.10668945, "step": 14323, "time_per_iteration": 2.669546365737915 }, { "auxiliary_loss_clip": 0.06413662, "auxiliary_loss_mlp": 0.01264816, "balance_loss_clip": 0.06275198, "balance_loss_mlp": 0.01255828, "epoch": 0.8612054712159928, "flos": 23259602701440.0, "grad_norm": 1.8647726010744303, "language_loss": 0.75765049, "learning_rate": 1.986178565813801e-07, "loss": 0.83443528, "num_input_tokens_seen": 308960110, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.08990479, "step": 14324, "time_per_iteration": 2.5576014518737793 }, { "auxiliary_loss_clip": 0.06413452, "auxiliary_loss_mlp": 0.01265289, "balance_loss_clip": 0.06273311, "balance_loss_mlp": 0.01254954, "epoch": 0.8612655944686608, "flos": 16032992918400.0, "grad_norm": 2.1040184027779993, "language_loss": 0.66765541, "learning_rate": 1.9844868516901036e-07, "loss": 0.74444282, "num_input_tokens_seen": 308976665, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.10333252, "step": 14325, "time_per_iteration": 2.5042054653167725 }, { "auxiliary_loss_clip": 0.06416157, "auxiliary_loss_mlp": 0.01263861, "balance_loss_clip": 0.06276546, "balance_loss_mlp": 0.01254134, "epoch": 0.8613257177213287, "flos": 22499407733760.0, "grad_norm": 2.673219452309101, "language_loss": 0.65134555, "learning_rate": 1.982795820716472e-07, "loss": 0.72814572, "num_input_tokens_seen": 308997015, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.097229, "step": 14326, "time_per_iteration": 2.6165060997009277 }, { "auxiliary_loss_clip": 0.06416098, "auxiliary_loss_mlp": 0.01265905, "balance_loss_clip": 0.06275886, "balance_loss_mlp": 0.01256172, "epoch": 0.8613858409739967, "flos": 17243744889600.0, "grad_norm": 1.9496130080944458, "language_loss": 0.84752548, "learning_rate": 1.9811054729570253e-07, "loss": 0.92434555, "num_input_tokens_seen": 309015250, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09735107, "step": 14327, "time_per_iteration": 2.546067714691162 }, { "auxiliary_loss_clip": 0.06414236, "auxiliary_loss_mlp": 0.01267494, "balance_loss_clip": 0.06273274, "balance_loss_mlp": 0.01257278, "epoch": 0.8614459642266646, "flos": 22827870938880.0, "grad_norm": 2.0786561777008847, "language_loss": 0.75394762, "learning_rate": 1.9794158084758661e-07, "loss": 0.83076501, "num_input_tokens_seen": 309034140, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.10229492, "step": 14328, "time_per_iteration": 2.577221632003784 }, { "auxiliary_loss_clip": 0.06412881, "auxiliary_loss_mlp": 0.01264007, "balance_loss_clip": 0.06275643, "balance_loss_mlp": 0.01254476, "epoch": 0.8615060874793327, "flos": 26511349224960.0, "grad_norm": 1.7069807437264843, "language_loss": 0.79911232, "learning_rate": 1.9777268273370673e-07, "loss": 0.8758812, "num_input_tokens_seen": 309055075, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09527588, "step": 14329, "time_per_iteration": 4.119786739349365 }, { "auxiliary_loss_clip": 0.06414106, "auxiliary_loss_mlp": 0.01268333, "balance_loss_clip": 0.06274422, "balance_loss_mlp": 0.0125904, "epoch": 0.8615662107320006, "flos": 24067860785280.0, "grad_norm": 2.1079669597441644, "language_loss": 0.77305079, "learning_rate": 1.9760385296046757e-07, "loss": 0.84987515, "num_input_tokens_seen": 309074650, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09289551, "step": 14330, "time_per_iteration": 2.554558038711548 }, { "auxiliary_loss_clip": 0.06412935, "auxiliary_loss_mlp": 0.01264818, "balance_loss_clip": 0.06275447, "balance_loss_mlp": 0.01255472, "epoch": 0.8616263339846686, "flos": 24171003561600.0, "grad_norm": 2.2646860563649964, "language_loss": 0.65428972, "learning_rate": 1.974350915342702e-07, "loss": 0.73106724, "num_input_tokens_seen": 309094385, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09350586, "step": 14331, "time_per_iteration": 2.5458784103393555 }, { "auxiliary_loss_clip": 0.06411564, "auxiliary_loss_mlp": 0.01266407, "balance_loss_clip": 0.06274426, "balance_loss_mlp": 0.01257788, "epoch": 0.8616864572373365, "flos": 21730533868800.0, "grad_norm": 1.8274755042391742, "language_loss": 0.76438797, "learning_rate": 1.9726639846151506e-07, "loss": 0.84116769, "num_input_tokens_seen": 309111815, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.08612061, "step": 14332, "time_per_iteration": 2.5229885578155518 }, { "auxiliary_loss_clip": 0.06419599, "auxiliary_loss_mlp": 0.01264968, "balance_loss_clip": 0.06276813, "balance_loss_mlp": 0.01254412, "epoch": 0.8617465804900045, "flos": 23773037794560.0, "grad_norm": 1.6297003764948712, "language_loss": 0.67567801, "learning_rate": 1.9709777374859904e-07, "loss": 0.75252372, "num_input_tokens_seen": 309131385, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.10559082, "step": 14333, "time_per_iteration": 2.558367967605591 }, { "auxiliary_loss_clip": 0.06423981, "auxiliary_loss_mlp": 0.01265697, "balance_loss_clip": 0.06276613, "balance_loss_mlp": 0.01254944, "epoch": 0.8618067037426724, "flos": 37712612632320.0, "grad_norm": 2.6231419375678526, "language_loss": 0.62313652, "learning_rate": 1.969292174019157e-07, "loss": 0.70003331, "num_input_tokens_seen": 309155020, "router_z_loss_clip": 1.47363281, "router_z_loss_mlp": 0.10760498, "step": 14334, "time_per_iteration": 4.064363479614258 }, { "auxiliary_loss_clip": 0.06424868, "auxiliary_loss_mlp": 0.0126551, "balance_loss_clip": 0.06281066, "balance_loss_mlp": 0.01255484, "epoch": 0.8618668269953405, "flos": 21477526865280.0, "grad_norm": 2.4607827862987555, "language_loss": 0.69694328, "learning_rate": 1.967607294278577e-07, "loss": 0.77384698, "num_input_tokens_seen": 309172865, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10028076, "step": 14335, "time_per_iteration": 2.5127344131469727 }, { "auxiliary_loss_clip": 0.06418428, "auxiliary_loss_mlp": 0.01267194, "balance_loss_clip": 0.06277192, "balance_loss_mlp": 0.01257198, "epoch": 0.8619269502480085, "flos": 22238560373760.0, "grad_norm": 1.4672498207734457, "language_loss": 0.83423185, "learning_rate": 1.965923098328135e-07, "loss": 0.91108805, "num_input_tokens_seen": 309193575, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09991455, "step": 14336, "time_per_iteration": 2.5509541034698486 }, { "auxiliary_loss_clip": 0.06421571, "auxiliary_loss_mlp": 0.012666, "balance_loss_clip": 0.06275239, "balance_loss_mlp": 0.01256658, "epoch": 0.8619870735006764, "flos": 22717181295360.0, "grad_norm": 2.2324736100296123, "language_loss": 0.67547786, "learning_rate": 1.9642395862316907e-07, "loss": 0.75235951, "num_input_tokens_seen": 309212680, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.0993042, "step": 14337, "time_per_iteration": 2.54146671295166 }, { "auxiliary_loss_clip": 0.06413649, "auxiliary_loss_mlp": 0.0126876, "balance_loss_clip": 0.06273621, "balance_loss_mlp": 0.01259163, "epoch": 0.8620471967533444, "flos": 37528730847360.0, "grad_norm": 1.5259159496287191, "language_loss": 0.67288584, "learning_rate": 1.962556758053089e-07, "loss": 0.74970996, "num_input_tokens_seen": 309234485, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.0960083, "step": 14338, "time_per_iteration": 2.686591863632202 }, { "auxiliary_loss_clip": 0.06416774, "auxiliary_loss_mlp": 0.01266283, "balance_loss_clip": 0.06276771, "balance_loss_mlp": 0.01257116, "epoch": 0.8621073200060123, "flos": 19688533067520.0, "grad_norm": 1.7699936051584284, "language_loss": 0.62290877, "learning_rate": 1.9608746138561448e-07, "loss": 0.69973934, "num_input_tokens_seen": 309253630, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09161377, "step": 14339, "time_per_iteration": 2.536451578140259 }, { "auxiliary_loss_clip": 0.06407481, "auxiliary_loss_mlp": 0.01261389, "balance_loss_clip": 0.06270475, "balance_loss_mlp": 0.0125199, "epoch": 0.8621674432586803, "flos": 14541882785280.0, "grad_norm": 1.992500532985096, "language_loss": 0.62673497, "learning_rate": 1.9591931537046458e-07, "loss": 0.70342374, "num_input_tokens_seen": 309270950, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09399414, "step": 14340, "time_per_iteration": 2.530670642852783 }, { "auxiliary_loss_clip": 0.06408132, "auxiliary_loss_mlp": 0.01269495, "balance_loss_clip": 0.06277589, "balance_loss_mlp": 0.01260829, "epoch": 0.8622275665113482, "flos": 20746276283520.0, "grad_norm": 1.5730972601961262, "language_loss": 0.80236024, "learning_rate": 1.9575123776623493e-07, "loss": 0.8791365, "num_input_tokens_seen": 309288780, "router_z_loss_clip": 1.30566406, "router_z_loss_mlp": 0.08660889, "step": 14341, "time_per_iteration": 2.5324699878692627 }, { "auxiliary_loss_clip": 0.06409863, "auxiliary_loss_mlp": 0.01264143, "balance_loss_clip": 0.06273139, "balance_loss_mlp": 0.01255006, "epoch": 0.8622876897640163, "flos": 24722565062400.0, "grad_norm": 1.6558250861375572, "language_loss": 0.75004411, "learning_rate": 1.9558322857929887e-07, "loss": 0.82678419, "num_input_tokens_seen": 309310875, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09130859, "step": 14342, "time_per_iteration": 2.6228044033050537 }, { "auxiliary_loss_clip": 0.06415843, "auxiliary_loss_mlp": 0.01264794, "balance_loss_clip": 0.0627457, "balance_loss_mlp": 0.01255031, "epoch": 0.8623478130166842, "flos": 17463153605760.0, "grad_norm": 1.948696433583812, "language_loss": 0.68539143, "learning_rate": 1.95415287816028e-07, "loss": 0.76219779, "num_input_tokens_seen": 309329900, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09765625, "step": 14343, "time_per_iteration": 2.5272583961486816 }, { "auxiliary_loss_clip": 0.06414364, "auxiliary_loss_mlp": 0.01268479, "balance_loss_clip": 0.06273881, "balance_loss_mlp": 0.01258525, "epoch": 0.8624079362693522, "flos": 18114252157440.0, "grad_norm": 1.6431879308941104, "language_loss": 0.68381238, "learning_rate": 1.9524741548278967e-07, "loss": 0.7606408, "num_input_tokens_seen": 309347870, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09954834, "step": 14344, "time_per_iteration": 2.5634355545043945 }, { "auxiliary_loss_clip": 0.06420957, "auxiliary_loss_mlp": 0.01264635, "balance_loss_clip": 0.06277791, "balance_loss_mlp": 0.01254604, "epoch": 0.8624680595220201, "flos": 30674664316800.0, "grad_norm": 1.380938645369285, "language_loss": 0.81298316, "learning_rate": 1.9507961158595054e-07, "loss": 0.88983905, "num_input_tokens_seen": 309371695, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10028076, "step": 14345, "time_per_iteration": 2.6388683319091797 }, { "auxiliary_loss_clip": 0.06416919, "auxiliary_loss_mlp": 0.01266679, "balance_loss_clip": 0.06274655, "balance_loss_mlp": 0.01256993, "epoch": 0.8625281827746881, "flos": 38007771039360.0, "grad_norm": 2.286180361955911, "language_loss": 0.51023865, "learning_rate": 1.9491187613187355e-07, "loss": 0.58707464, "num_input_tokens_seen": 309394645, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09680176, "step": 14346, "time_per_iteration": 2.6738157272338867 }, { "auxiliary_loss_clip": 0.06412701, "auxiliary_loss_mlp": 0.01266541, "balance_loss_clip": 0.06274062, "balance_loss_mlp": 0.01256951, "epoch": 0.862588306027356, "flos": 26256874775040.0, "grad_norm": 1.5722281247379193, "language_loss": 0.74830449, "learning_rate": 1.9474420912691913e-07, "loss": 0.82509691, "num_input_tokens_seen": 309413170, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09588623, "step": 14347, "time_per_iteration": 2.6496646404266357 }, { "auxiliary_loss_clip": 0.06418314, "auxiliary_loss_mlp": 0.01267636, "balance_loss_clip": 0.06278062, "balance_loss_mlp": 0.01257116, "epoch": 0.862648429280024, "flos": 25884876574080.0, "grad_norm": 1.824238429421765, "language_loss": 0.80588925, "learning_rate": 1.945766105774449e-07, "loss": 0.88274872, "num_input_tokens_seen": 309431315, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.10516357, "step": 14348, "time_per_iteration": 2.562674045562744 }, { "auxiliary_loss_clip": 0.06407662, "auxiliary_loss_mlp": 0.01262777, "balance_loss_clip": 0.06271818, "balance_loss_mlp": 0.01253419, "epoch": 0.862708552532692, "flos": 37825608263040.0, "grad_norm": 1.6433133154191186, "language_loss": 0.66348898, "learning_rate": 1.9440908048980665e-07, "loss": 0.74019337, "num_input_tokens_seen": 309453020, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.09350586, "step": 14349, "time_per_iteration": 2.7048537731170654 }, { "auxiliary_loss_clip": 0.0641215, "auxiliary_loss_mlp": 0.0126747, "balance_loss_clip": 0.06274392, "balance_loss_mlp": 0.01257379, "epoch": 0.86276867578536, "flos": 19096623025920.0, "grad_norm": 2.106333120296997, "language_loss": 0.69863462, "learning_rate": 1.942416188703573e-07, "loss": 0.77543086, "num_input_tokens_seen": 309469780, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.10089111, "step": 14350, "time_per_iteration": 2.4866387844085693 }, { "auxiliary_loss_clip": 0.06412849, "auxiliary_loss_mlp": 0.01267277, "balance_loss_clip": 0.06272816, "balance_loss_mlp": 0.0125724, "epoch": 0.862828799038028, "flos": 22170902601600.0, "grad_norm": 1.7797839832949292, "language_loss": 0.77016139, "learning_rate": 1.9407422572544618e-07, "loss": 0.84696269, "num_input_tokens_seen": 309489610, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.10028076, "step": 14351, "time_per_iteration": 2.5572948455810547 }, { "auxiliary_loss_clip": 0.06412366, "auxiliary_loss_mlp": 0.01266178, "balance_loss_clip": 0.06272372, "balance_loss_mlp": 0.0125688, "epoch": 0.8628889222906959, "flos": 23151722169600.0, "grad_norm": 2.7585865768815956, "language_loss": 0.85291958, "learning_rate": 1.9390690106142204e-07, "loss": 0.92970502, "num_input_tokens_seen": 309508295, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09301758, "step": 14352, "time_per_iteration": 2.5585389137268066 }, { "auxiliary_loss_clip": 0.06313597, "auxiliary_loss_mlp": 0.01249792, "balance_loss_clip": 0.0625823, "balance_loss_mlp": 0.01248894, "epoch": 0.8629490455433639, "flos": 57837600489600.0, "grad_norm": 0.7746144807146574, "language_loss": 0.61892796, "learning_rate": 1.9373964488462913e-07, "loss": 0.6945619, "num_input_tokens_seen": 309567960, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00897217, "step": 14353, "time_per_iteration": 3.152954578399658 }, { "auxiliary_loss_clip": 0.06411737, "auxiliary_loss_mlp": 0.01266591, "balance_loss_clip": 0.06275834, "balance_loss_mlp": 0.01257608, "epoch": 0.8630091687960318, "flos": 15924315772800.0, "grad_norm": 1.5786669005483955, "language_loss": 0.81902206, "learning_rate": 1.9357245720140948e-07, "loss": 0.89580536, "num_input_tokens_seen": 309586050, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.08984375, "step": 14354, "time_per_iteration": 3.937073230743408 }, { "auxiliary_loss_clip": 0.06414357, "auxiliary_loss_mlp": 0.01262881, "balance_loss_clip": 0.06274888, "balance_loss_mlp": 0.01252861, "epoch": 0.8630692920486999, "flos": 17966484282240.0, "grad_norm": 2.0513885198977833, "language_loss": 0.86008608, "learning_rate": 1.934053380181031e-07, "loss": 0.93685842, "num_input_tokens_seen": 309602910, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.10021973, "step": 14355, "time_per_iteration": 2.501631259918213 }, { "auxiliary_loss_clip": 0.06420669, "auxiliary_loss_mlp": 0.0126414, "balance_loss_clip": 0.06279584, "balance_loss_mlp": 0.01253769, "epoch": 0.8631294153013678, "flos": 22461658669440.0, "grad_norm": 2.3452630467752424, "language_loss": 0.58902699, "learning_rate": 1.9323828734104763e-07, "loss": 0.66587508, "num_input_tokens_seen": 309621175, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.10375977, "step": 14356, "time_per_iteration": 2.6815576553344727 }, { "auxiliary_loss_clip": 0.0642283, "auxiliary_loss_mlp": 0.01264339, "balance_loss_clip": 0.06276907, "balance_loss_mlp": 0.01253616, "epoch": 0.8631895385540358, "flos": 16842676521600.0, "grad_norm": 1.5290215670034135, "language_loss": 0.7711482, "learning_rate": 1.9307130517657756e-07, "loss": 0.84801984, "num_input_tokens_seen": 309639395, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10723877, "step": 14357, "time_per_iteration": 2.5699148178100586 }, { "auxiliary_loss_clip": 0.06415825, "auxiliary_loss_mlp": 0.01266967, "balance_loss_clip": 0.06275855, "balance_loss_mlp": 0.01256876, "epoch": 0.8632496618067037, "flos": 18703101525120.0, "grad_norm": 2.2804110881599895, "language_loss": 0.78181958, "learning_rate": 1.9290439153102468e-07, "loss": 0.85864753, "num_input_tokens_seen": 309657265, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.10101318, "step": 14358, "time_per_iteration": 2.5968680381774902 }, { "auxiliary_loss_clip": 0.0641541, "auxiliary_loss_mlp": 0.01263022, "balance_loss_clip": 0.06274754, "balance_loss_mlp": 0.01253759, "epoch": 0.8633097850593717, "flos": 24286808304000.0, "grad_norm": 1.2687224888186723, "language_loss": 0.75286186, "learning_rate": 1.9273754641071816e-07, "loss": 0.82964611, "num_input_tokens_seen": 309678610, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09265137, "step": 14359, "time_per_iteration": 4.038345813751221 }, { "auxiliary_loss_clip": 0.06405921, "auxiliary_loss_mlp": 0.01264593, "balance_loss_clip": 0.06271661, "balance_loss_mlp": 0.01255957, "epoch": 0.8633699083120396, "flos": 21184926007680.0, "grad_norm": 1.7997101844121817, "language_loss": 0.70877427, "learning_rate": 1.9257076982198517e-07, "loss": 0.78547943, "num_input_tokens_seen": 309697710, "router_z_loss_clip": 1.34179688, "router_z_loss_mlp": 0.08636475, "step": 14360, "time_per_iteration": 2.5831520557403564 }, { "auxiliary_loss_clip": 0.06422292, "auxiliary_loss_mlp": 0.01265206, "balance_loss_clip": 0.06280158, "balance_loss_mlp": 0.0125508, "epoch": 0.8634300315647077, "flos": 19250931519360.0, "grad_norm": 1.6884400997899505, "language_loss": 0.76375985, "learning_rate": 1.9240406177114953e-07, "loss": 0.84063482, "num_input_tokens_seen": 309715985, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.10131836, "step": 14361, "time_per_iteration": 2.5546042919158936 }, { "auxiliary_loss_clip": 0.06310996, "auxiliary_loss_mlp": 0.0125161, "balance_loss_clip": 0.0625595, "balance_loss_mlp": 0.01250623, "epoch": 0.8634901548173756, "flos": 66214572577920.0, "grad_norm": 0.8662648568810292, "language_loss": 0.58689928, "learning_rate": 1.922374222645329e-07, "loss": 0.66252536, "num_input_tokens_seen": 309779930, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.00985718, "step": 14362, "time_per_iteration": 3.19610595703125 }, { "auxiliary_loss_clip": 0.06422545, "auxiliary_loss_mlp": 0.01268457, "balance_loss_clip": 0.06280363, "balance_loss_mlp": 0.01257913, "epoch": 0.8635502780700436, "flos": 24796302255360.0, "grad_norm": 1.7996167017186628, "language_loss": 0.81073546, "learning_rate": 1.9207085130845524e-07, "loss": 0.88764542, "num_input_tokens_seen": 309800580, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.10546875, "step": 14363, "time_per_iteration": 2.6461215019226074 }, { "auxiliary_loss_clip": 0.0641985, "auxiliary_loss_mlp": 0.01264551, "balance_loss_clip": 0.06277813, "balance_loss_mlp": 0.01253953, "epoch": 0.8636104013227116, "flos": 25196657863680.0, "grad_norm": 2.5306124160080574, "language_loss": 0.7256754, "learning_rate": 1.9190434890923112e-07, "loss": 0.80251944, "num_input_tokens_seen": 309821725, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10601807, "step": 14364, "time_per_iteration": 2.625365972518921 }, { "auxiliary_loss_clip": 0.06420454, "auxiliary_loss_mlp": 0.01264586, "balance_loss_clip": 0.06277673, "balance_loss_mlp": 0.01254924, "epoch": 0.8636705245753795, "flos": 23885236811520.0, "grad_norm": 1.4973960526535472, "language_loss": 0.71357197, "learning_rate": 1.917379150731755e-07, "loss": 0.79042238, "num_input_tokens_seen": 309841565, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.09661865, "step": 14365, "time_per_iteration": 2.6158175468444824 }, { "auxiliary_loss_clip": 0.06423289, "auxiliary_loss_mlp": 0.01269492, "balance_loss_clip": 0.06278899, "balance_loss_mlp": 0.01258167, "epoch": 0.8637306478280475, "flos": 23116824144000.0, "grad_norm": 3.6920475237824895, "language_loss": 0.71211541, "learning_rate": 1.915715498065993e-07, "loss": 0.78904319, "num_input_tokens_seen": 309858635, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.11309814, "step": 14366, "time_per_iteration": 2.6248443126678467 }, { "auxiliary_loss_clip": 0.06408781, "auxiliary_loss_mlp": 0.01266932, "balance_loss_clip": 0.06272098, "balance_loss_mlp": 0.01257777, "epoch": 0.8637907710807154, "flos": 21913032061440.0, "grad_norm": 1.6164295516334635, "language_loss": 0.82300401, "learning_rate": 1.9140525311581146e-07, "loss": 0.8997612, "num_input_tokens_seen": 309877885, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.09161377, "step": 14367, "time_per_iteration": 2.6190500259399414 }, { "auxiliary_loss_clip": 0.06416065, "auxiliary_loss_mlp": 0.01264327, "balance_loss_clip": 0.06275348, "balance_loss_mlp": 0.01253807, "epoch": 0.8638508943333835, "flos": 23586263043840.0, "grad_norm": 2.1888747417829295, "language_loss": 0.62143141, "learning_rate": 1.9123902500711743e-07, "loss": 0.69823533, "num_input_tokens_seen": 309893140, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.10528564, "step": 14368, "time_per_iteration": 4.113758325576782 }, { "auxiliary_loss_clip": 0.06413099, "auxiliary_loss_mlp": 0.01265881, "balance_loss_clip": 0.06274254, "balance_loss_mlp": 0.0125591, "epoch": 0.8639110175860514, "flos": 25782991608960.0, "grad_norm": 1.845396487437779, "language_loss": 0.76370347, "learning_rate": 1.91072865486821e-07, "loss": 0.8404932, "num_input_tokens_seen": 309914175, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09967041, "step": 14369, "time_per_iteration": 2.6215062141418457 }, { "auxiliary_loss_clip": 0.06418592, "auxiliary_loss_mlp": 0.0126786, "balance_loss_clip": 0.0627428, "balance_loss_mlp": 0.01257471, "epoch": 0.8639711408387194, "flos": 23376455619840.0, "grad_norm": 1.8370962954424115, "language_loss": 0.64416552, "learning_rate": 1.9090677456122294e-07, "loss": 0.72103006, "num_input_tokens_seen": 309932395, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10388184, "step": 14370, "time_per_iteration": 2.7102620601654053 }, { "auxiliary_loss_clip": 0.06416771, "auxiliary_loss_mlp": 0.01264093, "balance_loss_clip": 0.0627688, "balance_loss_mlp": 0.01254347, "epoch": 0.8640312640913873, "flos": 22133740515840.0, "grad_norm": 1.827716392798189, "language_loss": 0.66420925, "learning_rate": 1.907407522366209e-07, "loss": 0.74101788, "num_input_tokens_seen": 309951720, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09747314, "step": 14371, "time_per_iteration": 2.6645846366882324 }, { "auxiliary_loss_clip": 0.06310559, "auxiliary_loss_mlp": 0.01251366, "balance_loss_clip": 0.06255531, "balance_loss_mlp": 0.0125035, "epoch": 0.8640913873440553, "flos": 57586998055680.0, "grad_norm": 0.8481340896053218, "language_loss": 0.56971955, "learning_rate": 1.905747985193107e-07, "loss": 0.64533877, "num_input_tokens_seen": 310006120, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.01015472, "step": 14372, "time_per_iteration": 3.1378109455108643 }, { "auxiliary_loss_clip": 0.06412186, "auxiliary_loss_mlp": 0.01264142, "balance_loss_clip": 0.06276917, "balance_loss_mlp": 0.01254081, "epoch": 0.8641515105967232, "flos": 23994165519360.0, "grad_norm": 1.9456304620953617, "language_loss": 0.79783618, "learning_rate": 1.9040891341558597e-07, "loss": 0.87459946, "num_input_tokens_seen": 310026740, "router_z_loss_clip": 1.35253906, "router_z_loss_mlp": 0.10070801, "step": 14373, "time_per_iteration": 2.7466611862182617 }, { "auxiliary_loss_clip": 0.06416918, "auxiliary_loss_mlp": 0.01265294, "balance_loss_clip": 0.06275818, "balance_loss_mlp": 0.01255548, "epoch": 0.8642116338493913, "flos": 19068810670080.0, "grad_norm": 1.6894587336063847, "language_loss": 0.64335978, "learning_rate": 1.9024309693173656e-07, "loss": 0.72018194, "num_input_tokens_seen": 310044135, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09753418, "step": 14374, "time_per_iteration": 4.213371992111206 }, { "auxiliary_loss_clip": 0.06411344, "auxiliary_loss_mlp": 0.01263364, "balance_loss_clip": 0.06274469, "balance_loss_mlp": 0.01254471, "epoch": 0.8642717571020592, "flos": 18259085139840.0, "grad_norm": 1.7640596542192193, "language_loss": 0.77742028, "learning_rate": 1.9007734907404993e-07, "loss": 0.85416734, "num_input_tokens_seen": 310061560, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.08892822, "step": 14375, "time_per_iteration": 2.665024757385254 }, { "auxiliary_loss_clip": 0.06415182, "auxiliary_loss_mlp": 0.01264426, "balance_loss_clip": 0.06275208, "balance_loss_mlp": 0.01254823, "epoch": 0.8643318803547272, "flos": 57675550222080.0, "grad_norm": 1.5279299684624794, "language_loss": 0.60850739, "learning_rate": 1.899116698488117e-07, "loss": 0.68530345, "num_input_tokens_seen": 310087310, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09606934, "step": 14376, "time_per_iteration": 2.932905912399292 }, { "auxiliary_loss_clip": 0.06415159, "auxiliary_loss_mlp": 0.01265925, "balance_loss_clip": 0.06277008, "balance_loss_mlp": 0.01256156, "epoch": 0.8643920036073952, "flos": 19615592488320.0, "grad_norm": 1.3798847142080284, "language_loss": 0.6659174, "learning_rate": 1.8974605926230457e-07, "loss": 0.74272829, "num_input_tokens_seen": 310106260, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09759521, "step": 14377, "time_per_iteration": 2.5604612827301025 }, { "auxiliary_loss_clip": 0.06417409, "auxiliary_loss_mlp": 0.01266021, "balance_loss_clip": 0.06275386, "balance_loss_mlp": 0.01256115, "epoch": 0.8644521268600631, "flos": 20856672437760.0, "grad_norm": 1.4997438711480748, "language_loss": 0.70767426, "learning_rate": 1.8958051732080804e-07, "loss": 0.78450859, "num_input_tokens_seen": 310125305, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09906006, "step": 14378, "time_per_iteration": 2.5497257709503174 }, { "auxiliary_loss_clip": 0.06307942, "auxiliary_loss_mlp": 0.01253522, "balance_loss_clip": 0.06252827, "balance_loss_mlp": 0.01252597, "epoch": 0.8645122501127311, "flos": 66740753491200.0, "grad_norm": 0.802996804943413, "language_loss": 0.60289824, "learning_rate": 1.894150440305995e-07, "loss": 0.67851281, "num_input_tokens_seen": 310189270, "router_z_loss_clip": 0.55224609, "router_z_loss_mlp": 0.00922394, "step": 14379, "time_per_iteration": 3.1786651611328125 }, { "auxiliary_loss_clip": 0.06413866, "auxiliary_loss_mlp": 0.01265125, "balance_loss_clip": 0.06276542, "balance_loss_mlp": 0.01255934, "epoch": 0.864572373365399, "flos": 21696558238080.0, "grad_norm": 1.5523111785049446, "language_loss": 0.74402857, "learning_rate": 1.8924963939795478e-07, "loss": 0.82081854, "num_input_tokens_seen": 310208395, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09185791, "step": 14380, "time_per_iteration": 2.605332136154175 }, { "auxiliary_loss_clip": 0.06422585, "auxiliary_loss_mlp": 0.01265876, "balance_loss_clip": 0.0627647, "balance_loss_mlp": 0.01255833, "epoch": 0.8646324966180671, "flos": 20272602752640.0, "grad_norm": 1.833968898746059, "language_loss": 0.75523365, "learning_rate": 1.8908430342914473e-07, "loss": 0.83211827, "num_input_tokens_seen": 310227415, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.1005249, "step": 14381, "time_per_iteration": 2.589980363845825 }, { "auxiliary_loss_clip": 0.0641534, "auxiliary_loss_mlp": 0.0126331, "balance_loss_clip": 0.06276846, "balance_loss_mlp": 0.01254339, "epoch": 0.864692619870735, "flos": 11950752251520.0, "grad_norm": 2.9509669565128145, "language_loss": 0.84876454, "learning_rate": 1.8891903613043892e-07, "loss": 0.92555106, "num_input_tokens_seen": 310242625, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.08963013, "step": 14382, "time_per_iteration": 2.5511014461517334 }, { "auxiliary_loss_clip": 0.06417377, "auxiliary_loss_mlp": 0.01263773, "balance_loss_clip": 0.06276792, "balance_loss_mlp": 0.01253944, "epoch": 0.864752743123403, "flos": 21477149521920.0, "grad_norm": 1.5819151191596694, "language_loss": 0.76036739, "learning_rate": 1.8875383750810504e-07, "loss": 0.83717883, "num_input_tokens_seen": 310260585, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.0982666, "step": 14383, "time_per_iteration": 2.5668439865112305 }, { "auxiliary_loss_clip": 0.06411634, "auxiliary_loss_mlp": 0.01268447, "balance_loss_clip": 0.06275547, "balance_loss_mlp": 0.01259065, "epoch": 0.8648128663760709, "flos": 19534979260800.0, "grad_norm": 1.6858144923332519, "language_loss": 0.85606265, "learning_rate": 1.8858870756840738e-07, "loss": 0.93286347, "num_input_tokens_seen": 310277210, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.09381104, "step": 14384, "time_per_iteration": 2.5632894039154053 }, { "auxiliary_loss_clip": 0.06411591, "auxiliary_loss_mlp": 0.01265539, "balance_loss_clip": 0.06273569, "balance_loss_mlp": 0.01256312, "epoch": 0.8648729896287389, "flos": 21294315912960.0, "grad_norm": 1.8620372315412983, "language_loss": 0.8106879, "learning_rate": 1.884236463176072e-07, "loss": 0.88745922, "num_input_tokens_seen": 310296610, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09228516, "step": 14385, "time_per_iteration": 2.5694973468780518 }, { "auxiliary_loss_clip": 0.06418505, "auxiliary_loss_mlp": 0.01270185, "balance_loss_clip": 0.06276108, "balance_loss_mlp": 0.0126004, "epoch": 0.8649331128814068, "flos": 24610785315840.0, "grad_norm": 2.0361778430783026, "language_loss": 0.72617733, "learning_rate": 1.8825865376196437e-07, "loss": 0.80306423, "num_input_tokens_seen": 310316830, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.10137939, "step": 14386, "time_per_iteration": 2.6205246448516846 }, { "auxiliary_loss_clip": 0.0641461, "auxiliary_loss_mlp": 0.01264041, "balance_loss_clip": 0.06276003, "balance_loss_mlp": 0.01254493, "epoch": 0.8649932361340749, "flos": 15383277959040.0, "grad_norm": 4.428826700508717, "language_loss": 0.82559454, "learning_rate": 1.8809372990773476e-07, "loss": 0.90238112, "num_input_tokens_seen": 310334355, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09552002, "step": 14387, "time_per_iteration": 2.550567626953125 }, { "auxiliary_loss_clip": 0.06415574, "auxiliary_loss_mlp": 0.01264919, "balance_loss_clip": 0.06279317, "balance_loss_mlp": 0.01255441, "epoch": 0.8650533593867428, "flos": 19907312878080.0, "grad_norm": 1.8693369707815812, "language_loss": 0.68613672, "learning_rate": 1.8792887476117224e-07, "loss": 0.76294172, "num_input_tokens_seen": 310352900, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09484863, "step": 14388, "time_per_iteration": 2.6237776279449463 }, { "auxiliary_loss_clip": 0.0641226, "auxiliary_loss_mlp": 0.01265464, "balance_loss_clip": 0.06278157, "balance_loss_mlp": 0.01256571, "epoch": 0.8651134826394108, "flos": 25633546652160.0, "grad_norm": 2.0177779525348494, "language_loss": 0.90484464, "learning_rate": 1.877640883285283e-07, "loss": 0.98162186, "num_input_tokens_seen": 310372855, "router_z_loss_clip": 1.34082031, "router_z_loss_mlp": 0.08886719, "step": 14389, "time_per_iteration": 2.5983712673187256 }, { "auxiliary_loss_clip": 0.06412219, "auxiliary_loss_mlp": 0.01266451, "balance_loss_clip": 0.06275094, "balance_loss_mlp": 0.01257421, "epoch": 0.8651736058920788, "flos": 18740557100160.0, "grad_norm": 1.5587216461030988, "language_loss": 0.70939779, "learning_rate": 1.8759937061605212e-07, "loss": 0.78618455, "num_input_tokens_seen": 310391595, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.090271, "step": 14390, "time_per_iteration": 2.5815353393554688 }, { "auxiliary_loss_clip": 0.06418069, "auxiliary_loss_mlp": 0.01267429, "balance_loss_clip": 0.06275631, "balance_loss_mlp": 0.01258006, "epoch": 0.8652337291447467, "flos": 20782977171840.0, "grad_norm": 1.6647522985204088, "language_loss": 0.82225251, "learning_rate": 1.8743472162998941e-07, "loss": 0.89910752, "num_input_tokens_seen": 310410090, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09429932, "step": 14391, "time_per_iteration": 2.5500566959381104 }, { "auxiliary_loss_clip": 0.06311043, "auxiliary_loss_mlp": 0.0125252, "balance_loss_clip": 0.06255832, "balance_loss_mlp": 0.01251555, "epoch": 0.8652938523974147, "flos": 64246895948160.0, "grad_norm": 0.7832031570052924, "language_loss": 0.67747462, "learning_rate": 1.8727014137658337e-07, "loss": 0.75311023, "num_input_tokens_seen": 310470055, "router_z_loss_clip": 0.55224609, "router_z_loss_mlp": 0.00964355, "step": 14392, "time_per_iteration": 3.0774123668670654 }, { "auxiliary_loss_clip": 0.06423189, "auxiliary_loss_mlp": 0.01264632, "balance_loss_clip": 0.0627857, "balance_loss_mlp": 0.01253868, "epoch": 0.8653539756500827, "flos": 18046384750080.0, "grad_norm": 1.9105732447082124, "language_loss": 0.75685918, "learning_rate": 1.8710562986207523e-07, "loss": 0.83373737, "num_input_tokens_seen": 310487665, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.10766602, "step": 14393, "time_per_iteration": 3.9677672386169434 }, { "auxiliary_loss_clip": 0.0641757, "auxiliary_loss_mlp": 0.01264436, "balance_loss_clip": 0.0627455, "balance_loss_mlp": 0.01254452, "epoch": 0.8654140989027507, "flos": 17387865112320.0, "grad_norm": 1.8918409723875846, "language_loss": 0.74174541, "learning_rate": 1.8694118709270357e-07, "loss": 0.81856543, "num_input_tokens_seen": 310506130, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.09985352, "step": 14394, "time_per_iteration": 2.541696786880493 }, { "auxiliary_loss_clip": 0.06414182, "auxiliary_loss_mlp": 0.01264719, "balance_loss_clip": 0.06272774, "balance_loss_mlp": 0.01254478, "epoch": 0.8654742221554186, "flos": 53296390212480.0, "grad_norm": 2.1087624219082537, "language_loss": 0.65537357, "learning_rate": 1.867768130747036e-07, "loss": 0.73216259, "num_input_tokens_seen": 310532445, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.10235596, "step": 14395, "time_per_iteration": 2.8423421382904053 }, { "auxiliary_loss_clip": 0.06415728, "auxiliary_loss_mlp": 0.01264012, "balance_loss_clip": 0.06277362, "balance_loss_mlp": 0.01254374, "epoch": 0.8655343454080866, "flos": 23921476502400.0, "grad_norm": 1.7109520231721005, "language_loss": 0.68325037, "learning_rate": 1.8661250781430838e-07, "loss": 0.76004779, "num_input_tokens_seen": 310552300, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09637451, "step": 14396, "time_per_iteration": 2.562720775604248 }, { "auxiliary_loss_clip": 0.06422326, "auxiliary_loss_mlp": 0.01268587, "balance_loss_clip": 0.06278838, "balance_loss_mlp": 0.01258061, "epoch": 0.8655944686607545, "flos": 24104016622080.0, "grad_norm": 2.1172097786139767, "language_loss": 0.69835711, "learning_rate": 1.8644827131774954e-07, "loss": 0.77526629, "num_input_tokens_seen": 310572710, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.10528564, "step": 14397, "time_per_iteration": 2.6101577281951904 }, { "auxiliary_loss_clip": 0.06414239, "auxiliary_loss_mlp": 0.01265162, "balance_loss_clip": 0.06274379, "balance_loss_mlp": 0.01255035, "epoch": 0.8656545919134225, "flos": 23119465547520.0, "grad_norm": 3.5122727490004237, "language_loss": 0.6378966, "learning_rate": 1.86284103591253e-07, "loss": 0.71469057, "num_input_tokens_seen": 310592460, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.10131836, "step": 14398, "time_per_iteration": 2.5802865028381348 }, { "auxiliary_loss_clip": 0.06417081, "auxiliary_loss_mlp": 0.0126817, "balance_loss_clip": 0.06277706, "balance_loss_mlp": 0.01258615, "epoch": 0.8657147151660904, "flos": 21148057411200.0, "grad_norm": 1.922157031421049, "language_loss": 0.7591278, "learning_rate": 1.8612000464104517e-07, "loss": 0.8359803, "num_input_tokens_seen": 310609375, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09552002, "step": 14399, "time_per_iteration": 4.125915050506592 }, { "auxiliary_loss_clip": 0.06409845, "auxiliary_loss_mlp": 0.0126289, "balance_loss_clip": 0.06272529, "balance_loss_mlp": 0.01254266, "epoch": 0.8657748384187585, "flos": 16294972308480.0, "grad_norm": 1.9672117451913547, "language_loss": 0.93855679, "learning_rate": 1.8595597447334855e-07, "loss": 1.01528418, "num_input_tokens_seen": 310627405, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.08624268, "step": 14400, "time_per_iteration": 2.5490331649780273 }, { "auxiliary_loss_clip": 0.06416325, "auxiliary_loss_mlp": 0.01263414, "balance_loss_clip": 0.06275484, "balance_loss_mlp": 0.01254139, "epoch": 0.8658349616714264, "flos": 30851292723840.0, "grad_norm": 1.6881009173270498, "language_loss": 0.67612612, "learning_rate": 1.8579201309438353e-07, "loss": 0.75292349, "num_input_tokens_seen": 310649945, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.0927124, "step": 14401, "time_per_iteration": 2.6300861835479736 }, { "auxiliary_loss_clip": 0.06419007, "auxiliary_loss_mlp": 0.01266697, "balance_loss_clip": 0.06275925, "balance_loss_mlp": 0.01257214, "epoch": 0.8658950849240944, "flos": 18958833786240.0, "grad_norm": 2.3974896520389906, "language_loss": 0.7450124, "learning_rate": 1.8562812051036714e-07, "loss": 0.82186943, "num_input_tokens_seen": 310668285, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.09490967, "step": 14402, "time_per_iteration": 2.547163486480713 }, { "auxiliary_loss_clip": 0.06407954, "auxiliary_loss_mlp": 0.01262055, "balance_loss_clip": 0.06271622, "balance_loss_mlp": 0.01253341, "epoch": 0.8659552081767624, "flos": 23370501980160.0, "grad_norm": 1.694999877332329, "language_loss": 0.751818, "learning_rate": 1.8546429672751397e-07, "loss": 0.82851809, "num_input_tokens_seen": 310687015, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.0871582, "step": 14403, "time_per_iteration": 2.5470571517944336 }, { "auxiliary_loss_clip": 0.06417162, "auxiliary_loss_mlp": 0.01264503, "balance_loss_clip": 0.06274859, "balance_loss_mlp": 0.01254496, "epoch": 0.8660153314294303, "flos": 23848787485440.0, "grad_norm": 1.7869591803387666, "language_loss": 0.733989, "learning_rate": 1.853005417520368e-07, "loss": 0.81080556, "num_input_tokens_seen": 310707580, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.10009766, "step": 14404, "time_per_iteration": 2.6033859252929688 }, { "auxiliary_loss_clip": 0.06411292, "auxiliary_loss_mlp": 0.0126388, "balance_loss_clip": 0.06274952, "balance_loss_mlp": 0.01254046, "epoch": 0.8660754546820983, "flos": 23119172058240.0, "grad_norm": 1.5933567276280804, "language_loss": 0.70897913, "learning_rate": 1.851368555901447e-07, "loss": 0.78573084, "num_input_tokens_seen": 310727300, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.09844971, "step": 14405, "time_per_iteration": 2.55584454536438 }, { "auxiliary_loss_clip": 0.06416273, "auxiliary_loss_mlp": 0.01263447, "balance_loss_clip": 0.06273185, "balance_loss_mlp": 0.01253321, "epoch": 0.8661355779347663, "flos": 14397175584000.0, "grad_norm": 3.502164852090018, "language_loss": 0.6680572, "learning_rate": 1.8497323824804467e-07, "loss": 0.74485445, "num_input_tokens_seen": 310744935, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.10131836, "step": 14406, "time_per_iteration": 2.5560462474823 }, { "auxiliary_loss_clip": 0.06413805, "auxiliary_loss_mlp": 0.01268929, "balance_loss_clip": 0.06274239, "balance_loss_mlp": 0.01260417, "epoch": 0.8661957011874343, "flos": 21876331173120.0, "grad_norm": 1.5593623559544099, "language_loss": 0.83223689, "learning_rate": 1.8480968973194177e-07, "loss": 0.90906417, "num_input_tokens_seen": 310765085, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.08514404, "step": 14407, "time_per_iteration": 2.5494368076324463 }, { "auxiliary_loss_clip": 0.06412782, "auxiliary_loss_mlp": 0.01266369, "balance_loss_clip": 0.06275143, "balance_loss_mlp": 0.01256755, "epoch": 0.8662558244401022, "flos": 21841600855680.0, "grad_norm": 1.8835182058015447, "language_loss": 0.69995505, "learning_rate": 1.8464621004803748e-07, "loss": 0.77674657, "num_input_tokens_seen": 310783260, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09619141, "step": 14408, "time_per_iteration": 4.0152428150177 }, { "auxiliary_loss_clip": 0.06409234, "auxiliary_loss_mlp": 0.01265161, "balance_loss_clip": 0.06274021, "balance_loss_mlp": 0.01256256, "epoch": 0.8663159476927702, "flos": 17389835683200.0, "grad_norm": 2.535939359074333, "language_loss": 0.77535295, "learning_rate": 1.844827992025304e-07, "loss": 0.85209692, "num_input_tokens_seen": 310801970, "router_z_loss_clip": 1.35253906, "router_z_loss_mlp": 0.08911133, "step": 14409, "time_per_iteration": 2.5230860710144043 }, { "auxiliary_loss_clip": 0.06420127, "auxiliary_loss_mlp": 0.0126517, "balance_loss_clip": 0.06277479, "balance_loss_mlp": 0.01255061, "epoch": 0.8663760709454381, "flos": 22754385308160.0, "grad_norm": 1.9102395082148662, "language_loss": 0.76972187, "learning_rate": 1.8431945720161757e-07, "loss": 0.84657484, "num_input_tokens_seen": 310822070, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10101318, "step": 14410, "time_per_iteration": 2.5580546855926514 }, { "auxiliary_loss_clip": 0.06420723, "auxiliary_loss_mlp": 0.01265362, "balance_loss_clip": 0.06280664, "balance_loss_mlp": 0.01255116, "epoch": 0.8664361941981061, "flos": 17381366421120.0, "grad_norm": 1.8632698631786717, "language_loss": 0.77873242, "learning_rate": 1.8415618405149315e-07, "loss": 0.85559332, "num_input_tokens_seen": 310838355, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.10247803, "step": 14411, "time_per_iteration": 2.50893235206604 }, { "auxiliary_loss_clip": 0.06409872, "auxiliary_loss_mlp": 0.0126433, "balance_loss_clip": 0.06271757, "balance_loss_mlp": 0.0125533, "epoch": 0.866496317450774, "flos": 16039994734080.0, "grad_norm": 1.586516403465073, "language_loss": 0.73903221, "learning_rate": 1.8399297975834794e-07, "loss": 0.8157742, "num_input_tokens_seen": 310856055, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09002686, "step": 14412, "time_per_iteration": 2.5528955459594727 }, { "auxiliary_loss_clip": 0.06410959, "auxiliary_loss_mlp": 0.01265753, "balance_loss_clip": 0.06275305, "balance_loss_mlp": 0.01257241, "epoch": 0.8665564407034421, "flos": 20821313214720.0, "grad_norm": 1.703413629368011, "language_loss": 0.69663095, "learning_rate": 1.83829844328371e-07, "loss": 0.77339804, "num_input_tokens_seen": 310876695, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.08520508, "step": 14413, "time_per_iteration": 3.9877769947052 }, { "auxiliary_loss_clip": 0.06414519, "auxiliary_loss_mlp": 0.01264759, "balance_loss_clip": 0.06273939, "balance_loss_mlp": 0.01254966, "epoch": 0.86661656395611, "flos": 15820627944960.0, "grad_norm": 2.764264000197011, "language_loss": 0.63295197, "learning_rate": 1.8366677776774874e-07, "loss": 0.70974469, "num_input_tokens_seen": 310893880, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09796143, "step": 14414, "time_per_iteration": 2.5334067344665527 }, { "auxiliary_loss_clip": 0.06413573, "auxiliary_loss_mlp": 0.01265617, "balance_loss_clip": 0.06276324, "balance_loss_mlp": 0.01256181, "epoch": 0.866676687208778, "flos": 23043170805120.0, "grad_norm": 1.6304197911698395, "language_loss": 0.63808328, "learning_rate": 1.8350378008266377e-07, "loss": 0.71487522, "num_input_tokens_seen": 310914145, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09442139, "step": 14415, "time_per_iteration": 2.5764496326446533 }, { "auxiliary_loss_clip": 0.06311893, "auxiliary_loss_mlp": 0.0125065, "balance_loss_clip": 0.06256884, "balance_loss_mlp": 0.01249807, "epoch": 0.866736810461446, "flos": 63823256104320.0, "grad_norm": 0.782797068657021, "language_loss": 0.60319287, "learning_rate": 1.8334085127929754e-07, "loss": 0.67881829, "num_input_tokens_seen": 310972825, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.00844574, "step": 14416, "time_per_iteration": 3.2203643321990967 }, { "auxiliary_loss_clip": 0.0641865, "auxiliary_loss_mlp": 0.01265489, "balance_loss_clip": 0.06275539, "balance_loss_mlp": 0.0125538, "epoch": 0.8667969337141139, "flos": 20455687923840.0, "grad_norm": 1.6316614489661103, "language_loss": 0.74704981, "learning_rate": 1.831779913638285e-07, "loss": 0.82389116, "num_input_tokens_seen": 310992050, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10113525, "step": 14417, "time_per_iteration": 2.549008846282959 }, { "auxiliary_loss_clip": 0.06411475, "auxiliary_loss_mlp": 0.01265663, "balance_loss_clip": 0.06272569, "balance_loss_mlp": 0.01256013, "epoch": 0.866857056966782, "flos": 21660276620160.0, "grad_norm": 1.4612935951158297, "language_loss": 0.75190818, "learning_rate": 1.830152003424319e-07, "loss": 0.8286795, "num_input_tokens_seen": 311011105, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09643555, "step": 14418, "time_per_iteration": 2.539577007293701 }, { "auxiliary_loss_clip": 0.06407592, "auxiliary_loss_mlp": 0.01265435, "balance_loss_clip": 0.06270157, "balance_loss_mlp": 0.01256077, "epoch": 0.8669171802194499, "flos": 22858785895680.0, "grad_norm": 1.333829377825241, "language_loss": 0.67994499, "learning_rate": 1.8285247822128126e-07, "loss": 0.7566753, "num_input_tokens_seen": 311032080, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09362793, "step": 14419, "time_per_iteration": 2.5984344482421875 }, { "auxiliary_loss_clip": 0.06414191, "auxiliary_loss_mlp": 0.01265293, "balance_loss_clip": 0.06274061, "balance_loss_mlp": 0.01255769, "epoch": 0.8669773034721179, "flos": 18740137829760.0, "grad_norm": 1.670117897114449, "language_loss": 0.78827941, "learning_rate": 1.826898250065465e-07, "loss": 0.86507422, "num_input_tokens_seen": 311049735, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09539795, "step": 14420, "time_per_iteration": 2.5396876335144043 }, { "auxiliary_loss_clip": 0.06414199, "auxiliary_loss_mlp": 0.01265506, "balance_loss_clip": 0.06274706, "balance_loss_mlp": 0.01256178, "epoch": 0.8670374267247858, "flos": 18921923262720.0, "grad_norm": 1.4232799666368394, "language_loss": 0.83565283, "learning_rate": 1.8252724070439586e-07, "loss": 0.91244984, "num_input_tokens_seen": 311067675, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09338379, "step": 14421, "time_per_iteration": 2.6188154220581055 }, { "auxiliary_loss_clip": 0.06311827, "auxiliary_loss_mlp": 0.012522, "balance_loss_clip": 0.06256589, "balance_loss_mlp": 0.01251215, "epoch": 0.8670975499774538, "flos": 48834323458560.0, "grad_norm": 0.7853603721289825, "language_loss": 0.4875192, "learning_rate": 1.823647253209941e-07, "loss": 0.56315947, "num_input_tokens_seen": 311126605, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00984192, "step": 14422, "time_per_iteration": 3.2007293701171875 }, { "auxiliary_loss_clip": 0.06411058, "auxiliary_loss_mlp": 0.01267431, "balance_loss_clip": 0.06272693, "balance_loss_mlp": 0.01258264, "epoch": 0.8671576732301217, "flos": 26142579406080.0, "grad_norm": 1.5685341393114591, "language_loss": 0.74002492, "learning_rate": 1.8220227886250417e-07, "loss": 0.81680983, "num_input_tokens_seen": 311147325, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.0916748, "step": 14423, "time_per_iteration": 2.678065299987793 }, { "auxiliary_loss_clip": 0.06408738, "auxiliary_loss_mlp": 0.01262441, "balance_loss_clip": 0.06276425, "balance_loss_mlp": 0.01254097, "epoch": 0.8672177964827897, "flos": 18373045092480.0, "grad_norm": 1.4970646917450356, "language_loss": 0.77314442, "learning_rate": 1.8203990133508684e-07, "loss": 0.8498562, "num_input_tokens_seen": 311165385, "router_z_loss_clip": 1.32421875, "router_z_loss_mlp": 0.08343506, "step": 14424, "time_per_iteration": 2.5619070529937744 }, { "auxiliary_loss_clip": 0.06405605, "auxiliary_loss_mlp": 0.01263381, "balance_loss_clip": 0.06272225, "balance_loss_mlp": 0.0125481, "epoch": 0.8672779197354576, "flos": 28552385704320.0, "grad_norm": 1.5860360114734584, "language_loss": 0.71901286, "learning_rate": 1.8187759274489767e-07, "loss": 0.7957027, "num_input_tokens_seen": 311185860, "router_z_loss_clip": 1.33398438, "router_z_loss_mlp": 0.08569336, "step": 14425, "time_per_iteration": 2.609276056289673 }, { "auxiliary_loss_clip": 0.06417391, "auxiliary_loss_mlp": 0.01265148, "balance_loss_clip": 0.0627388, "balance_loss_mlp": 0.01254741, "epoch": 0.8673380429881257, "flos": 22389011579520.0, "grad_norm": 1.6146682642317154, "language_loss": 0.68096489, "learning_rate": 1.817153530980926e-07, "loss": 0.75779033, "num_input_tokens_seen": 311205810, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10400391, "step": 14426, "time_per_iteration": 2.5612168312072754 }, { "auxiliary_loss_clip": 0.0641467, "auxiliary_loss_mlp": 0.0126381, "balance_loss_clip": 0.06275351, "balance_loss_mlp": 0.01253707, "epoch": 0.8673981662407936, "flos": 21002805158400.0, "grad_norm": 1.6729502131511238, "language_loss": 0.70676458, "learning_rate": 1.815531824008234e-07, "loss": 0.78354943, "num_input_tokens_seen": 311226080, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.10089111, "step": 14427, "time_per_iteration": 2.5565714836120605 }, { "auxiliary_loss_clip": 0.06414162, "auxiliary_loss_mlp": 0.0126237, "balance_loss_clip": 0.0627562, "balance_loss_mlp": 0.01252696, "epoch": 0.8674582894934616, "flos": 24433863419520.0, "grad_norm": 1.9485768831936403, "language_loss": 0.68296778, "learning_rate": 1.8139108065924004e-07, "loss": 0.75973308, "num_input_tokens_seen": 311246380, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09680176, "step": 14428, "time_per_iteration": 2.61460542678833 }, { "auxiliary_loss_clip": 0.06411411, "auxiliary_loss_mlp": 0.01266916, "balance_loss_clip": 0.0627355, "balance_loss_mlp": 0.01257826, "epoch": 0.8675184127461296, "flos": 20743257536640.0, "grad_norm": 1.7141008914716118, "language_loss": 0.71195269, "learning_rate": 1.812290478794889e-07, "loss": 0.78873599, "num_input_tokens_seen": 311266465, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09100342, "step": 14429, "time_per_iteration": 2.629164457321167 }, { "auxiliary_loss_clip": 0.06411915, "auxiliary_loss_mlp": 0.0126391, "balance_loss_clip": 0.06274523, "balance_loss_mlp": 0.01254427, "epoch": 0.8675785359987975, "flos": 19141709322240.0, "grad_norm": 1.868956923782894, "language_loss": 0.66688311, "learning_rate": 1.810670840677151e-07, "loss": 0.74364138, "num_input_tokens_seen": 311285075, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.0947876, "step": 14430, "time_per_iteration": 2.543139934539795 }, { "auxiliary_loss_clip": 0.06413546, "auxiliary_loss_mlp": 0.01265818, "balance_loss_clip": 0.06272557, "balance_loss_mlp": 0.01256037, "epoch": 0.8676386592514655, "flos": 22717223222400.0, "grad_norm": 1.855153417338149, "language_loss": 0.69101959, "learning_rate": 1.8090518923005948e-07, "loss": 0.76781321, "num_input_tokens_seen": 311303230, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09783936, "step": 14431, "time_per_iteration": 2.565058946609497 }, { "auxiliary_loss_clip": 0.06419699, "auxiliary_loss_mlp": 0.01267903, "balance_loss_clip": 0.06278408, "balance_loss_mlp": 0.0125811, "epoch": 0.8676987825041335, "flos": 14215054734720.0, "grad_norm": 2.3122414857545137, "language_loss": 0.6370635, "learning_rate": 1.8074336337266116e-07, "loss": 0.71393949, "num_input_tokens_seen": 311318070, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09790039, "step": 14432, "time_per_iteration": 4.003607511520386 }, { "auxiliary_loss_clip": 0.06417419, "auxiliary_loss_mlp": 0.01265029, "balance_loss_clip": 0.06277514, "balance_loss_mlp": 0.0125585, "epoch": 0.8677589057568015, "flos": 13595080775040.0, "grad_norm": 1.9971912627686914, "language_loss": 0.78492594, "learning_rate": 1.8058160650165656e-07, "loss": 0.86175048, "num_input_tokens_seen": 311334885, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09173584, "step": 14433, "time_per_iteration": 2.5743155479431152 }, { "auxiliary_loss_clip": 0.06311241, "auxiliary_loss_mlp": 0.01252719, "balance_loss_clip": 0.06255896, "balance_loss_mlp": 0.01251698, "epoch": 0.8678190290094694, "flos": 68953303278720.0, "grad_norm": 0.6980743605752923, "language_loss": 0.58419818, "learning_rate": 1.804199186231805e-07, "loss": 0.65983778, "num_input_tokens_seen": 311399780, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01021576, "step": 14434, "time_per_iteration": 3.2399675846099854 }, { "auxiliary_loss_clip": 0.06406906, "auxiliary_loss_mlp": 0.01267847, "balance_loss_clip": 0.06272789, "balance_loss_mlp": 0.01259049, "epoch": 0.8678791522621374, "flos": 32565249590400.0, "grad_norm": 1.568502832123237, "language_loss": 0.80373287, "learning_rate": 1.802582997433628e-07, "loss": 0.88048041, "num_input_tokens_seen": 311419610, "router_z_loss_clip": 1.34179688, "router_z_loss_mlp": 0.0880127, "step": 14435, "time_per_iteration": 2.6104040145874023 }, { "auxiliary_loss_clip": 0.0641416, "auxiliary_loss_mlp": 0.01264672, "balance_loss_clip": 0.06272288, "balance_loss_mlp": 0.01254075, "epoch": 0.8679392755148053, "flos": 35051224849920.0, "grad_norm": 1.8511869282027003, "language_loss": 0.6193347, "learning_rate": 1.8009674986833322e-07, "loss": 0.69612306, "num_input_tokens_seen": 311440045, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.1060791, "step": 14436, "time_per_iteration": 2.6436495780944824 }, { "auxiliary_loss_clip": 0.06416244, "auxiliary_loss_mlp": 0.01263434, "balance_loss_clip": 0.06276043, "balance_loss_mlp": 0.01253629, "epoch": 0.8679993987674733, "flos": 18558562032000.0, "grad_norm": 2.0240346328332417, "language_loss": 0.71379954, "learning_rate": 1.7993526900421706e-07, "loss": 0.79059631, "num_input_tokens_seen": 311456660, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.0980835, "step": 14437, "time_per_iteration": 2.512103319168091 }, { "auxiliary_loss_clip": 0.06415751, "auxiliary_loss_mlp": 0.01265806, "balance_loss_clip": 0.06276549, "balance_loss_mlp": 0.01256317, "epoch": 0.8680595220201412, "flos": 27461840814720.0, "grad_norm": 2.0264852519715375, "language_loss": 0.80648363, "learning_rate": 1.797738571571381e-07, "loss": 0.88329917, "num_input_tokens_seen": 311475460, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09490967, "step": 14438, "time_per_iteration": 2.568737030029297 }, { "auxiliary_loss_clip": 0.06406169, "auxiliary_loss_mlp": 0.01262956, "balance_loss_clip": 0.06271426, "balance_loss_mlp": 0.01253854, "epoch": 0.8681196452728093, "flos": 19214901463680.0, "grad_norm": 1.8669159185284003, "language_loss": 0.67880285, "learning_rate": 1.7961251433321656e-07, "loss": 0.75549412, "num_input_tokens_seen": 311494575, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.09106445, "step": 14439, "time_per_iteration": 3.930180788040161 }, { "auxiliary_loss_clip": 0.06413224, "auxiliary_loss_mlp": 0.0126374, "balance_loss_clip": 0.06274168, "balance_loss_mlp": 0.01255014, "epoch": 0.8681797685254772, "flos": 37569498658560.0, "grad_norm": 2.167804863238224, "language_loss": 0.63774526, "learning_rate": 1.7945124053857085e-07, "loss": 0.71451485, "num_input_tokens_seen": 311515805, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.08734131, "step": 14440, "time_per_iteration": 2.7243354320526123 }, { "auxiliary_loss_clip": 0.06407315, "auxiliary_loss_mlp": 0.01263468, "balance_loss_clip": 0.06273541, "balance_loss_mlp": 0.01253967, "epoch": 0.8682398917781452, "flos": 23295842392320.0, "grad_norm": 7.808985527667455, "language_loss": 0.65448999, "learning_rate": 1.7929003577931722e-07, "loss": 0.73119777, "num_input_tokens_seen": 311536000, "router_z_loss_clip": 1.33886719, "router_z_loss_mlp": 0.09490967, "step": 14441, "time_per_iteration": 2.544867753982544 }, { "auxiliary_loss_clip": 0.06407575, "auxiliary_loss_mlp": 0.012608, "balance_loss_clip": 0.06273606, "balance_loss_mlp": 0.01252283, "epoch": 0.8683000150308132, "flos": 21879433774080.0, "grad_norm": 1.5455266127370098, "language_loss": 0.66177189, "learning_rate": 1.7912890006156722e-07, "loss": 0.73845565, "num_input_tokens_seen": 311556220, "router_z_loss_clip": 1.34082031, "router_z_loss_mlp": 0.08520508, "step": 14442, "time_per_iteration": 2.562180757522583 }, { "auxiliary_loss_clip": 0.06420656, "auxiliary_loss_mlp": 0.01268306, "balance_loss_clip": 0.06276616, "balance_loss_mlp": 0.01257577, "epoch": 0.8683601382834811, "flos": 14652404720640.0, "grad_norm": 1.6855597257207553, "language_loss": 0.72445798, "learning_rate": 1.7896783339143195e-07, "loss": 0.80134755, "num_input_tokens_seen": 311572530, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.1072998, "step": 14443, "time_per_iteration": 2.520029306411743 }, { "auxiliary_loss_clip": 0.06414044, "auxiliary_loss_mlp": 0.01265064, "balance_loss_clip": 0.06274435, "balance_loss_mlp": 0.01255313, "epoch": 0.8684202615361492, "flos": 26367187075200.0, "grad_norm": 1.629878903415412, "language_loss": 0.83517575, "learning_rate": 1.7880683577501877e-07, "loss": 0.9119668, "num_input_tokens_seen": 311591105, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09747314, "step": 14444, "time_per_iteration": 2.595968246459961 }, { "auxiliary_loss_clip": 0.06413113, "auxiliary_loss_mlp": 0.01263146, "balance_loss_clip": 0.06275741, "balance_loss_mlp": 0.01253925, "epoch": 0.8684803847888171, "flos": 20710246227840.0, "grad_norm": 5.249320260351721, "language_loss": 0.77313256, "learning_rate": 1.7864590721843342e-07, "loss": 0.84989518, "num_input_tokens_seen": 311608350, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09228516, "step": 14445, "time_per_iteration": 2.521728038787842 }, { "auxiliary_loss_clip": 0.06415214, "auxiliary_loss_mlp": 0.01262337, "balance_loss_clip": 0.06276359, "balance_loss_mlp": 0.0125221, "epoch": 0.8685405080414851, "flos": 22644743840640.0, "grad_norm": 1.7988309491102636, "language_loss": 0.67718059, "learning_rate": 1.7848504772777728e-07, "loss": 0.75395608, "num_input_tokens_seen": 311626380, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.10125732, "step": 14446, "time_per_iteration": 2.5664234161376953 }, { "auxiliary_loss_clip": 0.06412873, "auxiliary_loss_mlp": 0.0126593, "balance_loss_clip": 0.06274553, "balance_loss_mlp": 0.01256066, "epoch": 0.868600631294153, "flos": 24828181534080.0, "grad_norm": 1.8028132369057424, "language_loss": 0.83210123, "learning_rate": 1.7832425730915102e-07, "loss": 0.90888929, "num_input_tokens_seen": 311644345, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09869385, "step": 14447, "time_per_iteration": 2.562809467315674 }, { "auxiliary_loss_clip": 0.06408842, "auxiliary_loss_mlp": 0.01264104, "balance_loss_clip": 0.06270805, "balance_loss_mlp": 0.0125555, "epoch": 0.868660754546821, "flos": 25120153486080.0, "grad_norm": 1.587057079463506, "language_loss": 0.74181056, "learning_rate": 1.781635359686515e-07, "loss": 0.81854004, "num_input_tokens_seen": 311663340, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.08557129, "step": 14448, "time_per_iteration": 4.0461695194244385 }, { "auxiliary_loss_clip": 0.06413277, "auxiliary_loss_mlp": 0.01263415, "balance_loss_clip": 0.06273669, "balance_loss_mlp": 0.01254099, "epoch": 0.8687208777994889, "flos": 12682841374080.0, "grad_norm": 8.076685240609791, "language_loss": 0.80900562, "learning_rate": 1.7800288371237303e-07, "loss": 0.88577259, "num_input_tokens_seen": 311679860, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09320068, "step": 14449, "time_per_iteration": 2.4984796047210693 }, { "auxiliary_loss_clip": 0.06309749, "auxiliary_loss_mlp": 0.01251573, "balance_loss_clip": 0.06254651, "balance_loss_mlp": 0.01250606, "epoch": 0.8687810010521569, "flos": 65636959656960.0, "grad_norm": 0.8036113198779936, "language_loss": 0.60366881, "learning_rate": 1.7784230054640758e-07, "loss": 0.67928207, "num_input_tokens_seen": 311738135, "router_z_loss_clip": 0.55273438, "router_z_loss_mlp": 0.00965881, "step": 14450, "time_per_iteration": 3.0736193656921387 }, { "auxiliary_loss_clip": 0.06420149, "auxiliary_loss_mlp": 0.01263756, "balance_loss_clip": 0.06278447, "balance_loss_mlp": 0.01254285, "epoch": 0.8688411243048249, "flos": 24250987883520.0, "grad_norm": 1.7205269791196478, "language_loss": 0.7599858, "learning_rate": 1.7768178647684517e-07, "loss": 0.83682489, "num_input_tokens_seen": 311756975, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.09466553, "step": 14451, "time_per_iteration": 2.5793373584747314 }, { "auxiliary_loss_clip": 0.06411415, "auxiliary_loss_mlp": 0.0126638, "balance_loss_clip": 0.06273615, "balance_loss_mlp": 0.01256277, "epoch": 0.8689012475574929, "flos": 18227457423360.0, "grad_norm": 2.516729103416497, "language_loss": 0.72437066, "learning_rate": 1.7752134150977205e-07, "loss": 0.80114859, "num_input_tokens_seen": 311771830, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.10095215, "step": 14452, "time_per_iteration": 2.4895079135894775 }, { "auxiliary_loss_clip": 0.0641454, "auxiliary_loss_mlp": 0.01266556, "balance_loss_clip": 0.06272631, "balance_loss_mlp": 0.01256126, "epoch": 0.8689613708101608, "flos": 19652922282240.0, "grad_norm": 1.3261462249852547, "language_loss": 0.72331274, "learning_rate": 1.7736096565127201e-07, "loss": 0.80012369, "num_input_tokens_seen": 311790130, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.10430908, "step": 14453, "time_per_iteration": 3.840790271759033 }, { "auxiliary_loss_clip": 0.06411746, "auxiliary_loss_mlp": 0.01264644, "balance_loss_clip": 0.06274727, "balance_loss_mlp": 0.0125537, "epoch": 0.8690214940628288, "flos": 11733523741440.0, "grad_norm": 1.903153882151441, "language_loss": 0.73259079, "learning_rate": 1.7720065890742664e-07, "loss": 0.80935466, "num_input_tokens_seen": 311808360, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.0927124, "step": 14454, "time_per_iteration": 2.5291452407836914 }, { "auxiliary_loss_clip": 0.06411939, "auxiliary_loss_mlp": 0.01262285, "balance_loss_clip": 0.06274214, "balance_loss_mlp": 0.0125279, "epoch": 0.8690816173154968, "flos": 34945566451200.0, "grad_norm": 2.0247727429850593, "language_loss": 0.59749877, "learning_rate": 1.7704042128431552e-07, "loss": 0.67424101, "num_input_tokens_seen": 311831325, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09490967, "step": 14455, "time_per_iteration": 2.678992509841919 }, { "auxiliary_loss_clip": 0.06412795, "auxiliary_loss_mlp": 0.01263884, "balance_loss_clip": 0.06272589, "balance_loss_mlp": 0.01254979, "epoch": 0.8691417405681647, "flos": 11618809102080.0, "grad_norm": 2.0940809461223377, "language_loss": 0.80307996, "learning_rate": 1.7688025278801378e-07, "loss": 0.87984675, "num_input_tokens_seen": 311848090, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.08911133, "step": 14456, "time_per_iteration": 2.528191566467285 }, { "auxiliary_loss_clip": 0.06419913, "auxiliary_loss_mlp": 0.01267485, "balance_loss_clip": 0.06276941, "balance_loss_mlp": 0.01257299, "epoch": 0.8692018638208328, "flos": 24614936092800.0, "grad_norm": 2.3002878783173317, "language_loss": 0.75361514, "learning_rate": 1.7672015342459568e-07, "loss": 0.83048904, "num_input_tokens_seen": 311867855, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10180664, "step": 14457, "time_per_iteration": 2.5682826042175293 }, { "auxiliary_loss_clip": 0.06408867, "auxiliary_loss_mlp": 0.01262689, "balance_loss_clip": 0.06273639, "balance_loss_mlp": 0.01253993, "epoch": 0.8692619870735007, "flos": 26002358398080.0, "grad_norm": 1.3937874388278273, "language_loss": 0.78505951, "learning_rate": 1.765601232001328e-07, "loss": 0.8617751, "num_input_tokens_seen": 311888675, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.08703613, "step": 14458, "time_per_iteration": 2.6009185314178467 }, { "auxiliary_loss_clip": 0.06413821, "auxiliary_loss_mlp": 0.01265027, "balance_loss_clip": 0.06276076, "balance_loss_mlp": 0.01254948, "epoch": 0.8693221103261687, "flos": 18047810269440.0, "grad_norm": 1.6260544307572498, "language_loss": 0.71267468, "learning_rate": 1.7640016212069187e-07, "loss": 0.78946316, "num_input_tokens_seen": 311907310, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.10076904, "step": 14459, "time_per_iteration": 2.5338399410247803 }, { "auxiliary_loss_clip": 0.06403196, "auxiliary_loss_mlp": 0.01262504, "balance_loss_clip": 0.06270964, "balance_loss_mlp": 0.01254291, "epoch": 0.8693822335788366, "flos": 27500051076480.0, "grad_norm": 1.280744685576852, "language_loss": 0.74067068, "learning_rate": 1.762402701923398e-07, "loss": 0.81732774, "num_input_tokens_seen": 311929635, "router_z_loss_clip": 1.32128906, "router_z_loss_mlp": 0.08215332, "step": 14460, "time_per_iteration": 2.6226134300231934 }, { "auxiliary_loss_clip": 0.06420431, "auxiliary_loss_mlp": 0.01263993, "balance_loss_clip": 0.06275393, "balance_loss_mlp": 0.01254146, "epoch": 0.8694423568315046, "flos": 24104603600640.0, "grad_norm": 1.9074142647798384, "language_loss": 0.65186304, "learning_rate": 1.7608044742113947e-07, "loss": 0.72870731, "num_input_tokens_seen": 311948800, "router_z_loss_clip": 1.45117188, "router_z_loss_mlp": 0.09838867, "step": 14461, "time_per_iteration": 2.572505474090576 }, { "auxiliary_loss_clip": 0.06412414, "auxiliary_loss_mlp": 0.01266518, "balance_loss_clip": 0.06271986, "balance_loss_mlp": 0.01257023, "epoch": 0.8695024800841725, "flos": 18366839890560.0, "grad_norm": 2.0229578764423635, "language_loss": 0.83045352, "learning_rate": 1.7592069381315123e-07, "loss": 0.90724277, "num_input_tokens_seen": 311964090, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09503174, "step": 14462, "time_per_iteration": 2.5655272006988525 }, { "auxiliary_loss_clip": 0.06415489, "auxiliary_loss_mlp": 0.01266453, "balance_loss_clip": 0.06275851, "balance_loss_mlp": 0.01256374, "epoch": 0.8695626033368405, "flos": 14032975812480.0, "grad_norm": 1.7785139862909236, "language_loss": 0.65526569, "learning_rate": 1.757610093744335e-07, "loss": 0.73208511, "num_input_tokens_seen": 311981460, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.10076904, "step": 14463, "time_per_iteration": 2.5125913619995117 }, { "auxiliary_loss_clip": 0.06419168, "auxiliary_loss_mlp": 0.01268718, "balance_loss_clip": 0.06275149, "balance_loss_mlp": 0.01258776, "epoch": 0.8696227265895085, "flos": 16842508813440.0, "grad_norm": 13.289482967421609, "language_loss": 0.66760731, "learning_rate": 1.7560139411104058e-07, "loss": 0.74448609, "num_input_tokens_seen": 312000115, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.09942627, "step": 14464, "time_per_iteration": 2.5299313068389893 }, { "auxiliary_loss_clip": 0.06420962, "auxiliary_loss_mlp": 0.01264136, "balance_loss_clip": 0.06276951, "balance_loss_mlp": 0.01253968, "epoch": 0.8696828498421765, "flos": 21805570800000.0, "grad_norm": 2.418702593414205, "language_loss": 0.63097143, "learning_rate": 1.7544184802902607e-07, "loss": 0.70782244, "num_input_tokens_seen": 312020770, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.10162354, "step": 14465, "time_per_iteration": 2.582789421081543 }, { "auxiliary_loss_clip": 0.06406212, "auxiliary_loss_mlp": 0.01265632, "balance_loss_clip": 0.06272763, "balance_loss_mlp": 0.01256954, "epoch": 0.8697429730948444, "flos": 22901691985920.0, "grad_norm": 1.4856602418403566, "language_loss": 0.84826803, "learning_rate": 1.7528237113443934e-07, "loss": 0.92498648, "num_input_tokens_seen": 312041870, "router_z_loss_clip": 1.33300781, "router_z_loss_mlp": 0.08673096, "step": 14466, "time_per_iteration": 2.5867555141448975 }, { "auxiliary_loss_clip": 0.06419671, "auxiliary_loss_mlp": 0.01270909, "balance_loss_clip": 0.06276143, "balance_loss_mlp": 0.01260168, "epoch": 0.8698030963475124, "flos": 24724367925120.0, "grad_norm": 2.186785375452789, "language_loss": 0.62060142, "learning_rate": 1.7512296343332779e-07, "loss": 0.69750726, "num_input_tokens_seen": 312058210, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.1072998, "step": 14467, "time_per_iteration": 2.567129373550415 }, { "auxiliary_loss_clip": 0.0640628, "auxiliary_loss_mlp": 0.01262368, "balance_loss_clip": 0.06272222, "balance_loss_mlp": 0.01253923, "epoch": 0.8698632196001803, "flos": 28450291104000.0, "grad_norm": 1.4515898659770379, "language_loss": 0.68942595, "learning_rate": 1.7496362493173655e-07, "loss": 0.76611245, "num_input_tokens_seen": 312082665, "router_z_loss_clip": 1.33886719, "router_z_loss_mlp": 0.08447266, "step": 14468, "time_per_iteration": 2.63602352142334 }, { "auxiliary_loss_clip": 0.06412025, "auxiliary_loss_mlp": 0.01263959, "balance_loss_clip": 0.06274226, "balance_loss_mlp": 0.0125543, "epoch": 0.8699233428528483, "flos": 27643877809920.0, "grad_norm": 1.5143471185573725, "language_loss": 0.71191561, "learning_rate": 1.7480435563570773e-07, "loss": 0.78867543, "num_input_tokens_seen": 312101960, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.08526611, "step": 14469, "time_per_iteration": 2.592212200164795 }, { "auxiliary_loss_clip": 0.06407021, "auxiliary_loss_mlp": 0.0126309, "balance_loss_clip": 0.06274626, "balance_loss_mlp": 0.01254519, "epoch": 0.8699834661055164, "flos": 20051516954880.0, "grad_norm": 2.152741771479762, "language_loss": 0.84359694, "learning_rate": 1.7464515555128024e-07, "loss": 0.92029804, "num_input_tokens_seen": 312117125, "router_z_loss_clip": 1.32324219, "router_z_loss_mlp": 0.08569336, "step": 14470, "time_per_iteration": 2.7067577838897705 }, { "auxiliary_loss_clip": 0.06412049, "auxiliary_loss_mlp": 0.01264067, "balance_loss_clip": 0.06274818, "balance_loss_mlp": 0.01255013, "epoch": 0.8700435893581843, "flos": 23739607215360.0, "grad_norm": 1.7975449860235095, "language_loss": 0.73300678, "learning_rate": 1.7448602468449148e-07, "loss": 0.80976796, "num_input_tokens_seen": 312135775, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09057617, "step": 14471, "time_per_iteration": 2.571669816970825 }, { "auxiliary_loss_clip": 0.06411541, "auxiliary_loss_mlp": 0.01264328, "balance_loss_clip": 0.06274693, "balance_loss_mlp": 0.01255173, "epoch": 0.8701037126108523, "flos": 23554886889600.0, "grad_norm": 1.5551195988070567, "language_loss": 0.79261804, "learning_rate": 1.7432696304137573e-07, "loss": 0.86937678, "num_input_tokens_seen": 312156070, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09155273, "step": 14472, "time_per_iteration": 4.012131929397583 }, { "auxiliary_loss_clip": 0.06413199, "auxiliary_loss_mlp": 0.01261204, "balance_loss_clip": 0.0627484, "balance_loss_mlp": 0.01251971, "epoch": 0.8701638358635202, "flos": 18849401953920.0, "grad_norm": 2.213858052800608, "language_loss": 0.73073405, "learning_rate": 1.741679706279644e-07, "loss": 0.80747807, "num_input_tokens_seen": 312174380, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09234619, "step": 14473, "time_per_iteration": 2.5911667346954346 }, { "auxiliary_loss_clip": 0.06417498, "auxiliary_loss_mlp": 0.01262585, "balance_loss_clip": 0.06276266, "balance_loss_mlp": 0.01252667, "epoch": 0.8702239591161882, "flos": 27935807834880.0, "grad_norm": 1.5369196175470268, "language_loss": 0.72442114, "learning_rate": 1.7400904745028644e-07, "loss": 0.80122197, "num_input_tokens_seen": 312195130, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09918213, "step": 14474, "time_per_iteration": 2.6025424003601074 }, { "auxiliary_loss_clip": 0.06413729, "auxiliary_loss_mlp": 0.01266571, "balance_loss_clip": 0.06273244, "balance_loss_mlp": 0.01256522, "epoch": 0.8702840823688561, "flos": 17239007134080.0, "grad_norm": 2.082084856730815, "language_loss": 0.67450511, "learning_rate": 1.7385019351436925e-07, "loss": 0.75130808, "num_input_tokens_seen": 312212300, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.10058594, "step": 14475, "time_per_iteration": 2.546086549758911 }, { "auxiliary_loss_clip": 0.06417561, "auxiliary_loss_mlp": 0.01265939, "balance_loss_clip": 0.06276669, "balance_loss_mlp": 0.01256468, "epoch": 0.8703442056215241, "flos": 19433681274240.0, "grad_norm": 1.5344943384087193, "language_loss": 0.78148544, "learning_rate": 1.736914088262349e-07, "loss": 0.85832042, "num_input_tokens_seen": 312231735, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09466553, "step": 14476, "time_per_iteration": 2.5572779178619385 }, { "auxiliary_loss_clip": 0.06412202, "auxiliary_loss_mlp": 0.01261453, "balance_loss_clip": 0.06275991, "balance_loss_mlp": 0.01252285, "epoch": 0.8704043288741921, "flos": 22280502142080.0, "grad_norm": 1.661776895732526, "language_loss": 0.72347176, "learning_rate": 1.7353269339190525e-07, "loss": 0.80020833, "num_input_tokens_seen": 312253060, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.0916748, "step": 14477, "time_per_iteration": 2.631990909576416 }, { "auxiliary_loss_clip": 0.06412326, "auxiliary_loss_mlp": 0.01270131, "balance_loss_clip": 0.06274155, "balance_loss_mlp": 0.01260886, "epoch": 0.8704644521268601, "flos": 16653386148480.0, "grad_norm": 1.9722594859719336, "language_loss": 0.59719121, "learning_rate": 1.7337404721739946e-07, "loss": 0.67401576, "num_input_tokens_seen": 312269460, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09246826, "step": 14478, "time_per_iteration": 3.9859166145324707 }, { "auxiliary_loss_clip": 0.06412903, "auxiliary_loss_mlp": 0.01264705, "balance_loss_clip": 0.0627825, "balance_loss_mlp": 0.01256849, "epoch": 0.870524575379528, "flos": 24287143720320.0, "grad_norm": 1.9815882491547445, "language_loss": 0.71721542, "learning_rate": 1.732154703087323e-07, "loss": 0.79399145, "num_input_tokens_seen": 312289830, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.07861328, "step": 14479, "time_per_iteration": 2.593677520751953 }, { "auxiliary_loss_clip": 0.06411869, "auxiliary_loss_mlp": 0.01269296, "balance_loss_clip": 0.06272867, "balance_loss_mlp": 0.01258442, "epoch": 0.870584698632196, "flos": 28776490248960.0, "grad_norm": 1.4704154622854533, "language_loss": 0.71047235, "learning_rate": 1.7305696267191805e-07, "loss": 0.78728396, "num_input_tokens_seen": 312311320, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.10852051, "step": 14480, "time_per_iteration": 2.6078531742095947 }, { "auxiliary_loss_clip": 0.06414844, "auxiliary_loss_mlp": 0.01265747, "balance_loss_clip": 0.06274271, "balance_loss_mlp": 0.01255966, "epoch": 0.8706448218848639, "flos": 32457369058560.0, "grad_norm": 1.5942521899569937, "language_loss": 0.69964015, "learning_rate": 1.728985243129666e-07, "loss": 0.77644604, "num_input_tokens_seen": 312332095, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09783936, "step": 14481, "time_per_iteration": 2.627357244491577 }, { "auxiliary_loss_clip": 0.06413841, "auxiliary_loss_mlp": 0.0126475, "balance_loss_clip": 0.06275196, "balance_loss_mlp": 0.01256078, "epoch": 0.8707049451375319, "flos": 22754720724480.0, "grad_norm": 1.7014342852916637, "language_loss": 0.77283752, "learning_rate": 1.7274015523788643e-07, "loss": 0.84962344, "num_input_tokens_seen": 312351225, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.08673096, "step": 14482, "time_per_iteration": 2.5485475063323975 }, { "auxiliary_loss_clip": 0.06415203, "auxiliary_loss_mlp": 0.01270334, "balance_loss_clip": 0.06276731, "balance_loss_mlp": 0.01261167, "epoch": 0.8707650683902, "flos": 15857496541440.0, "grad_norm": 1.7499205039934824, "language_loss": 0.76702309, "learning_rate": 1.7258185545268234e-07, "loss": 0.84387851, "num_input_tokens_seen": 312369730, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09161377, "step": 14483, "time_per_iteration": 2.5232315063476562 }, { "auxiliary_loss_clip": 0.06419495, "auxiliary_loss_mlp": 0.01267253, "balance_loss_clip": 0.06274547, "balance_loss_mlp": 0.01256543, "epoch": 0.8708251916428679, "flos": 16473068161920.0, "grad_norm": 2.6319040138949292, "language_loss": 0.62286127, "learning_rate": 1.7242362496335749e-07, "loss": 0.69972873, "num_input_tokens_seen": 312386780, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.10705566, "step": 14484, "time_per_iteration": 2.553093433380127 }, { "auxiliary_loss_clip": 0.06416071, "auxiliary_loss_mlp": 0.01268802, "balance_loss_clip": 0.0627753, "balance_loss_mlp": 0.01259569, "epoch": 0.8708853148955359, "flos": 15383319886080.0, "grad_norm": 1.963203572334003, "language_loss": 0.68849003, "learning_rate": 1.7226546377591222e-07, "loss": 0.76533872, "num_input_tokens_seen": 312404875, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09234619, "step": 14485, "time_per_iteration": 2.52907395362854 }, { "auxiliary_loss_clip": 0.06411619, "auxiliary_loss_mlp": 0.01270142, "balance_loss_clip": 0.06272936, "balance_loss_mlp": 0.01260105, "epoch": 0.8709454381482038, "flos": 30558566085120.0, "grad_norm": 1.8748839339109276, "language_loss": 0.63527423, "learning_rate": 1.7210737189634373e-07, "loss": 0.7120918, "num_input_tokens_seen": 312425280, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.1003418, "step": 14486, "time_per_iteration": 2.614776372909546 }, { "auxiliary_loss_clip": 0.06422611, "auxiliary_loss_mlp": 0.0126899, "balance_loss_clip": 0.06279214, "balance_loss_mlp": 0.01258475, "epoch": 0.8710055614008718, "flos": 22608001025280.0, "grad_norm": 2.52976618005002, "language_loss": 0.61780447, "learning_rate": 1.7194934933064653e-07, "loss": 0.69472051, "num_input_tokens_seen": 312443835, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10510254, "step": 14487, "time_per_iteration": 4.03683066368103 }, { "auxiliary_loss_clip": 0.06412648, "auxiliary_loss_mlp": 0.01267276, "balance_loss_clip": 0.06274851, "balance_loss_mlp": 0.012588, "epoch": 0.8710656846535397, "flos": 18449214053760.0, "grad_norm": 2.1377552546835754, "language_loss": 0.68383849, "learning_rate": 1.7179139608481318e-07, "loss": 0.7606377, "num_input_tokens_seen": 312460830, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.08465576, "step": 14488, "time_per_iteration": 2.5284533500671387 }, { "auxiliary_loss_clip": 0.06415385, "auxiliary_loss_mlp": 0.01266703, "balance_loss_clip": 0.0627531, "balance_loss_mlp": 0.01257077, "epoch": 0.8711258079062077, "flos": 16508678947200.0, "grad_norm": 1.848168232497201, "language_loss": 0.85772431, "learning_rate": 1.716335121648338e-07, "loss": 0.93454516, "num_input_tokens_seen": 312477575, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09619141, "step": 14489, "time_per_iteration": 2.5127155780792236 }, { "auxiliary_loss_clip": 0.06422232, "auxiliary_loss_mlp": 0.01268007, "balance_loss_clip": 0.0627767, "balance_loss_mlp": 0.01257433, "epoch": 0.8711859311588757, "flos": 15667786897920.0, "grad_norm": 4.765067812321489, "language_loss": 0.76185822, "learning_rate": 1.7147569757669445e-07, "loss": 0.83876061, "num_input_tokens_seen": 312492140, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10571289, "step": 14490, "time_per_iteration": 2.5078930854797363 }, { "auxiliary_loss_clip": 0.06418468, "auxiliary_loss_mlp": 0.01267729, "balance_loss_clip": 0.06275868, "balance_loss_mlp": 0.0125706, "epoch": 0.8712460544115437, "flos": 15562589696640.0, "grad_norm": 2.189330283229125, "language_loss": 0.75715423, "learning_rate": 1.7131795232638012e-07, "loss": 0.8340162, "num_input_tokens_seen": 312508400, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10681152, "step": 14491, "time_per_iteration": 2.511023998260498 }, { "auxiliary_loss_clip": 0.06412756, "auxiliary_loss_mlp": 0.01265178, "balance_loss_clip": 0.06275174, "balance_loss_mlp": 0.01256023, "epoch": 0.8713061776642116, "flos": 16769148963840.0, "grad_norm": 1.4628921191182853, "language_loss": 0.66933787, "learning_rate": 1.711602764198723e-07, "loss": 0.74611723, "num_input_tokens_seen": 312525915, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09161377, "step": 14492, "time_per_iteration": 2.5423474311828613 }, { "auxiliary_loss_clip": 0.06412068, "auxiliary_loss_mlp": 0.01266009, "balance_loss_clip": 0.06275713, "balance_loss_mlp": 0.01257336, "epoch": 0.8713663009168796, "flos": 24286766376960.0, "grad_norm": 1.8223280940217461, "language_loss": 0.69830817, "learning_rate": 1.7100266986314992e-07, "loss": 0.77508891, "num_input_tokens_seen": 312544735, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.08666992, "step": 14493, "time_per_iteration": 3.9876339435577393 }, { "auxiliary_loss_clip": 0.06411842, "auxiliary_loss_mlp": 0.0126888, "balance_loss_clip": 0.06272963, "balance_loss_mlp": 0.01259063, "epoch": 0.8714264241695475, "flos": 23800724369280.0, "grad_norm": 2.2798530004864435, "language_loss": 0.89388967, "learning_rate": 1.7084513266218936e-07, "loss": 0.97069687, "num_input_tokens_seen": 312557910, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09820557, "step": 14494, "time_per_iteration": 2.5140063762664795 }, { "auxiliary_loss_clip": 0.06408039, "auxiliary_loss_mlp": 0.01263474, "balance_loss_clip": 0.06273123, "balance_loss_mlp": 0.01254766, "epoch": 0.8714865474222155, "flos": 38007016352640.0, "grad_norm": 1.8055925369014316, "language_loss": 0.59688473, "learning_rate": 1.7068766482296514e-07, "loss": 0.67359984, "num_input_tokens_seen": 312580360, "router_z_loss_clip": 1.34863281, "router_z_loss_mlp": 0.08703613, "step": 14495, "time_per_iteration": 2.687696933746338 }, { "auxiliary_loss_clip": 0.06416865, "auxiliary_loss_mlp": 0.01264612, "balance_loss_clip": 0.06276442, "balance_loss_mlp": 0.01255278, "epoch": 0.8715466706748836, "flos": 22462287575040.0, "grad_norm": 2.0036128353914333, "language_loss": 0.80590481, "learning_rate": 1.7053026635144762e-07, "loss": 0.88271958, "num_input_tokens_seen": 312597550, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09338379, "step": 14496, "time_per_iteration": 2.548748016357422 }, { "auxiliary_loss_clip": 0.06415257, "auxiliary_loss_mlp": 0.01264961, "balance_loss_clip": 0.06276136, "balance_loss_mlp": 0.01254852, "epoch": 0.8716067939275515, "flos": 21221501114880.0, "grad_norm": 1.9041901545024635, "language_loss": 0.78860241, "learning_rate": 1.7037293725360624e-07, "loss": 0.86540461, "num_input_tokens_seen": 312616435, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.10107422, "step": 14497, "time_per_iteration": 2.5755867958068848 }, { "auxiliary_loss_clip": 0.06419576, "auxiliary_loss_mlp": 0.01264821, "balance_loss_clip": 0.06277353, "balance_loss_mlp": 0.01255022, "epoch": 0.8716669171802195, "flos": 23003535024000.0, "grad_norm": 1.8544939452599536, "language_loss": 0.67281783, "learning_rate": 1.70215677535406e-07, "loss": 0.7496618, "num_input_tokens_seen": 312632770, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09802246, "step": 14498, "time_per_iteration": 2.542536497116089 }, { "auxiliary_loss_clip": 0.06413315, "auxiliary_loss_mlp": 0.01263958, "balance_loss_clip": 0.06273831, "balance_loss_mlp": 0.01254588, "epoch": 0.8717270404328874, "flos": 29790991958400.0, "grad_norm": 1.5150104835882885, "language_loss": 0.5749045, "learning_rate": 1.700584872028108e-07, "loss": 0.65167725, "num_input_tokens_seen": 312651900, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09368896, "step": 14499, "time_per_iteration": 2.638671398162842 }, { "auxiliary_loss_clip": 0.06419484, "auxiliary_loss_mlp": 0.01264822, "balance_loss_clip": 0.06278083, "balance_loss_mlp": 0.0125513, "epoch": 0.8717871636855554, "flos": 22024686026880.0, "grad_norm": 1.9980986531386953, "language_loss": 0.80106902, "learning_rate": 1.6990136626178097e-07, "loss": 0.87791204, "num_input_tokens_seen": 312671380, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09686279, "step": 14500, "time_per_iteration": 2.572309732437134 }, { "auxiliary_loss_clip": 0.06415517, "auxiliary_loss_mlp": 0.0126987, "balance_loss_clip": 0.06276979, "balance_loss_mlp": 0.01260083, "epoch": 0.8718472869382233, "flos": 16659842912640.0, "grad_norm": 1.7376248461837656, "language_loss": 0.73369932, "learning_rate": 1.6974431471827466e-07, "loss": 0.81055319, "num_input_tokens_seen": 312689215, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09783936, "step": 14501, "time_per_iteration": 2.5251498222351074 }, { "auxiliary_loss_clip": 0.06421091, "auxiliary_loss_mlp": 0.01265209, "balance_loss_clip": 0.06277861, "balance_loss_mlp": 0.01254546, "epoch": 0.8719074101908914, "flos": 19500584359680.0, "grad_norm": 1.717850049033411, "language_loss": 0.64736766, "learning_rate": 1.695873325782482e-07, "loss": 0.72423065, "num_input_tokens_seen": 312706400, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10656738, "step": 14502, "time_per_iteration": 2.5653140544891357 }, { "auxiliary_loss_clip": 0.0641358, "auxiliary_loss_mlp": 0.01263755, "balance_loss_clip": 0.0627443, "balance_loss_mlp": 0.01254737, "epoch": 0.8719675334435593, "flos": 33078894318720.0, "grad_norm": 1.6411564832102796, "language_loss": 0.69203091, "learning_rate": 1.6943041984765262e-07, "loss": 0.76880425, "num_input_tokens_seen": 312727985, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09014893, "step": 14503, "time_per_iteration": 2.6768367290496826 }, { "auxiliary_loss_clip": 0.06413831, "auxiliary_loss_mlp": 0.01264233, "balance_loss_clip": 0.06274283, "balance_loss_mlp": 0.01254786, "epoch": 0.8720276566962273, "flos": 13631404320000.0, "grad_norm": 2.1709732121959515, "language_loss": 0.69764191, "learning_rate": 1.6927357653243912e-07, "loss": 0.77442247, "num_input_tokens_seen": 312745025, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09442139, "step": 14504, "time_per_iteration": 2.521488904953003 }, { "auxiliary_loss_clip": 0.06412445, "auxiliary_loss_mlp": 0.01262884, "balance_loss_clip": 0.0627296, "balance_loss_mlp": 0.01253955, "epoch": 0.8720877799488952, "flos": 23520995112960.0, "grad_norm": 1.690467979933954, "language_loss": 0.704373, "learning_rate": 1.691168026385552e-07, "loss": 0.78112632, "num_input_tokens_seen": 312764170, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.0892334, "step": 14505, "time_per_iteration": 2.5949277877807617 }, { "auxiliary_loss_clip": 0.0641368, "auxiliary_loss_mlp": 0.01262828, "balance_loss_clip": 0.06275778, "balance_loss_mlp": 0.0125421, "epoch": 0.8721479032015632, "flos": 20820516600960.0, "grad_norm": 1.473664781867175, "language_loss": 0.78344923, "learning_rate": 1.6896009817194545e-07, "loss": 0.86021435, "num_input_tokens_seen": 312783830, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.08624268, "step": 14506, "time_per_iteration": 2.5547728538513184 }, { "auxiliary_loss_clip": 0.06418313, "auxiliary_loss_mlp": 0.01264269, "balance_loss_clip": 0.06275742, "balance_loss_mlp": 0.01254809, "epoch": 0.8722080264542311, "flos": 19469711329920.0, "grad_norm": 2.121029836162707, "language_loss": 0.74466336, "learning_rate": 1.6880346313855221e-07, "loss": 0.82148921, "num_input_tokens_seen": 312802015, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.09454346, "step": 14507, "time_per_iteration": 2.5892724990844727 }, { "auxiliary_loss_clip": 0.06422476, "auxiliary_loss_mlp": 0.01266669, "balance_loss_clip": 0.06278233, "balance_loss_mlp": 0.01256584, "epoch": 0.8722681497068991, "flos": 21768241006080.0, "grad_norm": 2.2120870079492176, "language_loss": 0.7277146, "learning_rate": 1.686468975443156e-07, "loss": 0.80460608, "num_input_tokens_seen": 312820650, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10083008, "step": 14508, "time_per_iteration": 2.557619333267212 }, { "auxiliary_loss_clip": 0.06422113, "auxiliary_loss_mlp": 0.01266036, "balance_loss_clip": 0.06278133, "balance_loss_mlp": 0.01256011, "epoch": 0.8723282729595672, "flos": 28884790051200.0, "grad_norm": 2.4442634081130183, "language_loss": 0.6916101, "learning_rate": 1.6849040139517202e-07, "loss": 0.76849157, "num_input_tokens_seen": 312841310, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.10028076, "step": 14509, "time_per_iteration": 2.6053781509399414 }, { "auxiliary_loss_clip": 0.06416443, "auxiliary_loss_mlp": 0.01266157, "balance_loss_clip": 0.06277225, "balance_loss_mlp": 0.01256572, "epoch": 0.8723883962122351, "flos": 26476409272320.0, "grad_norm": 1.8137553056896885, "language_loss": 0.58864927, "learning_rate": 1.683339746970558e-07, "loss": 0.66547525, "num_input_tokens_seen": 312862100, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.0958252, "step": 14510, "time_per_iteration": 2.597684621810913 }, { "auxiliary_loss_clip": 0.06429945, "auxiliary_loss_mlp": 0.012687, "balance_loss_clip": 0.06282291, "balance_loss_mlp": 0.01257649, "epoch": 0.8724485194649031, "flos": 20527664181120.0, "grad_norm": 2.335989596736163, "language_loss": 0.68138361, "learning_rate": 1.6817761745589865e-07, "loss": 0.75837004, "num_input_tokens_seen": 312880220, "router_z_loss_clip": 1.47460938, "router_z_loss_mlp": 0.11047363, "step": 14511, "time_per_iteration": 3.8654677867889404 }, { "auxiliary_loss_clip": 0.06416607, "auxiliary_loss_mlp": 0.01264971, "balance_loss_clip": 0.06274881, "balance_loss_mlp": 0.01255046, "epoch": 0.872508642717571, "flos": 24360335861760.0, "grad_norm": 1.559691714677256, "language_loss": 0.82153654, "learning_rate": 1.6802132967763027e-07, "loss": 0.89835227, "num_input_tokens_seen": 312900765, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09924316, "step": 14512, "time_per_iteration": 2.5825912952423096 }, { "auxiliary_loss_clip": 0.06311737, "auxiliary_loss_mlp": 0.01253946, "balance_loss_clip": 0.06256477, "balance_loss_mlp": 0.01253005, "epoch": 0.872568765970239, "flos": 61427132749440.0, "grad_norm": 0.8005994162085764, "language_loss": 0.58776015, "learning_rate": 1.6786511136817617e-07, "loss": 0.66341698, "num_input_tokens_seen": 312955840, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00939178, "step": 14513, "time_per_iteration": 3.059370756149292 }, { "auxiliary_loss_clip": 0.06413531, "auxiliary_loss_mlp": 0.01266597, "balance_loss_clip": 0.06273884, "balance_loss_mlp": 0.01256333, "epoch": 0.8726288892229069, "flos": 22604059883520.0, "grad_norm": 1.6827368450035864, "language_loss": 0.76720512, "learning_rate": 1.6770896253346112e-07, "loss": 0.84400648, "num_input_tokens_seen": 312973565, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.10266113, "step": 14514, "time_per_iteration": 2.5621442794799805 }, { "auxiliary_loss_clip": 0.06419642, "auxiliary_loss_mlp": 0.0126455, "balance_loss_clip": 0.06276432, "balance_loss_mlp": 0.01254393, "epoch": 0.872689012475575, "flos": 25892339587200.0, "grad_norm": 1.7204009730580092, "language_loss": 0.65527421, "learning_rate": 1.675528831794055e-07, "loss": 0.7321161, "num_input_tokens_seen": 312994660, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.1015625, "step": 14515, "time_per_iteration": 2.587367296218872 }, { "auxiliary_loss_clip": 0.06416277, "auxiliary_loss_mlp": 0.01267122, "balance_loss_clip": 0.06275937, "balance_loss_mlp": 0.01257586, "epoch": 0.8727491357282429, "flos": 21513095723520.0, "grad_norm": 2.0844201160793574, "language_loss": 0.78965962, "learning_rate": 1.6739687331192842e-07, "loss": 0.86649358, "num_input_tokens_seen": 313009860, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09539795, "step": 14516, "time_per_iteration": 2.560307502746582 }, { "auxiliary_loss_clip": 0.06418104, "auxiliary_loss_mlp": 0.01263938, "balance_loss_clip": 0.06275433, "balance_loss_mlp": 0.01253686, "epoch": 0.8728092589809109, "flos": 19213392090240.0, "grad_norm": 3.1812034771449267, "language_loss": 0.72570586, "learning_rate": 1.672409329369453e-07, "loss": 0.80252624, "num_input_tokens_seen": 313027025, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.10241699, "step": 14517, "time_per_iteration": 2.5390923023223877 }, { "auxiliary_loss_clip": 0.06409273, "auxiliary_loss_mlp": 0.01270316, "balance_loss_clip": 0.06272858, "balance_loss_mlp": 0.01261328, "epoch": 0.8728693822335788, "flos": 20601652936320.0, "grad_norm": 2.3215491034243567, "language_loss": 0.72614056, "learning_rate": 1.6708506206036966e-07, "loss": 0.80293649, "num_input_tokens_seen": 313046830, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.08984375, "step": 14518, "time_per_iteration": 4.041727066040039 }, { "auxiliary_loss_clip": 0.06407869, "auxiliary_loss_mlp": 0.01264836, "balance_loss_clip": 0.06271379, "balance_loss_mlp": 0.01255806, "epoch": 0.8729295054862468, "flos": 21735523186560.0, "grad_norm": 1.3051449684189647, "language_loss": 0.74446332, "learning_rate": 1.6692926068811275e-07, "loss": 0.82119042, "num_input_tokens_seen": 313067715, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.090271, "step": 14519, "time_per_iteration": 2.58984112739563 }, { "auxiliary_loss_clip": 0.06421979, "auxiliary_loss_mlp": 0.01266733, "balance_loss_clip": 0.06278322, "balance_loss_mlp": 0.01256076, "epoch": 0.8729896287389147, "flos": 17678788888320.0, "grad_norm": 2.6050645319688037, "language_loss": 0.7673859, "learning_rate": 1.6677352882608142e-07, "loss": 0.84427303, "num_input_tokens_seen": 313082305, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10656738, "step": 14520, "time_per_iteration": 2.511780023574829 }, { "auxiliary_loss_clip": 0.06420887, "auxiliary_loss_mlp": 0.01267416, "balance_loss_clip": 0.06277988, "balance_loss_mlp": 0.01256938, "epoch": 0.8730497519915827, "flos": 24578738328960.0, "grad_norm": 2.2946148574954313, "language_loss": 0.82508308, "learning_rate": 1.666178664801816e-07, "loss": 0.90196609, "num_input_tokens_seen": 313101190, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10479736, "step": 14521, "time_per_iteration": 2.6204569339752197 }, { "auxiliary_loss_clip": 0.06418282, "auxiliary_loss_mlp": 0.01268189, "balance_loss_clip": 0.06277207, "balance_loss_mlp": 0.01257955, "epoch": 0.8731098752442508, "flos": 13448822273280.0, "grad_norm": 1.904985724080745, "language_loss": 0.76704741, "learning_rate": 1.6646227365631616e-07, "loss": 0.84391212, "num_input_tokens_seen": 313118965, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.10235596, "step": 14522, "time_per_iteration": 2.5464541912078857 }, { "auxiliary_loss_clip": 0.06410502, "auxiliary_loss_mlp": 0.01266417, "balance_loss_clip": 0.06273395, "balance_loss_mlp": 0.0125728, "epoch": 0.8731699984969187, "flos": 23480730426240.0, "grad_norm": 1.6837589103146882, "language_loss": 0.76044101, "learning_rate": 1.66306750360385e-07, "loss": 0.83721018, "num_input_tokens_seen": 313139280, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09130859, "step": 14523, "time_per_iteration": 2.571915864944458 }, { "auxiliary_loss_clip": 0.06411114, "auxiliary_loss_mlp": 0.01265938, "balance_loss_clip": 0.06274214, "balance_loss_mlp": 0.01256676, "epoch": 0.8732301217495867, "flos": 17718466596480.0, "grad_norm": 4.106919376215043, "language_loss": 0.78736597, "learning_rate": 1.6615129659828542e-07, "loss": 0.86413646, "num_input_tokens_seen": 313156655, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09265137, "step": 14524, "time_per_iteration": 2.544550657272339 }, { "auxiliary_loss_clip": 0.06406941, "auxiliary_loss_mlp": 0.01264018, "balance_loss_clip": 0.06272671, "balance_loss_mlp": 0.0125534, "epoch": 0.8732902450022546, "flos": 22060883790720.0, "grad_norm": 2.053423357989193, "language_loss": 0.77593052, "learning_rate": 1.6599591237591272e-07, "loss": 0.85264009, "num_input_tokens_seen": 313174050, "router_z_loss_clip": 1.34277344, "router_z_loss_mlp": 0.08673096, "step": 14525, "time_per_iteration": 2.5701422691345215 }, { "auxiliary_loss_clip": 0.06417323, "auxiliary_loss_mlp": 0.01268652, "balance_loss_clip": 0.06275054, "balance_loss_mlp": 0.01259186, "epoch": 0.8733503682549226, "flos": 22279495893120.0, "grad_norm": 1.552762325191518, "language_loss": 0.69496524, "learning_rate": 1.6584059769915902e-07, "loss": 0.77182496, "num_input_tokens_seen": 313192765, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09454346, "step": 14526, "time_per_iteration": 2.552267551422119 }, { "auxiliary_loss_clip": 0.06423287, "auxiliary_loss_mlp": 0.01267025, "balance_loss_clip": 0.06278984, "balance_loss_mlp": 0.012566, "epoch": 0.8734104915075905, "flos": 23370501980160.0, "grad_norm": 1.8751640852841311, "language_loss": 0.61178052, "learning_rate": 1.6568535257391326e-07, "loss": 0.68868363, "num_input_tokens_seen": 313210925, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10418701, "step": 14527, "time_per_iteration": 4.023592472076416 }, { "auxiliary_loss_clip": 0.06425922, "auxiliary_loss_mlp": 0.01269285, "balance_loss_clip": 0.06278109, "balance_loss_mlp": 0.01257614, "epoch": 0.8734706147602586, "flos": 17718047326080.0, "grad_norm": 2.7539585946125653, "language_loss": 0.6597262, "learning_rate": 1.6553017700606265e-07, "loss": 0.73667824, "num_input_tokens_seen": 313228250, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.11676025, "step": 14528, "time_per_iteration": 2.5280497074127197 }, { "auxiliary_loss_clip": 0.06413473, "auxiliary_loss_mlp": 0.01265178, "balance_loss_clip": 0.06276929, "balance_loss_mlp": 0.01255134, "epoch": 0.8735307380129265, "flos": 22055055932160.0, "grad_norm": 1.7802964708309132, "language_loss": 0.89689064, "learning_rate": 1.6537507100149205e-07, "loss": 0.97367716, "num_input_tokens_seen": 313247880, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.10040283, "step": 14529, "time_per_iteration": 2.5815250873565674 }, { "auxiliary_loss_clip": 0.06411281, "auxiliary_loss_mlp": 0.01267128, "balance_loss_clip": 0.06276158, "balance_loss_mlp": 0.01257734, "epoch": 0.8735908612655945, "flos": 25345557768960.0, "grad_norm": 1.8225264637707952, "language_loss": 0.85264987, "learning_rate": 1.6522003456608258e-07, "loss": 0.92943394, "num_input_tokens_seen": 313266790, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.09393311, "step": 14530, "time_per_iteration": 2.5995051860809326 }, { "auxiliary_loss_clip": 0.06414592, "auxiliary_loss_mlp": 0.01269759, "balance_loss_clip": 0.06273419, "balance_loss_mlp": 0.01260855, "epoch": 0.8736509845182624, "flos": 21546903646080.0, "grad_norm": 1.5758971336086323, "language_loss": 0.74489796, "learning_rate": 1.650650677057128e-07, "loss": 0.82174146, "num_input_tokens_seen": 313286805, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.08905029, "step": 14531, "time_per_iteration": 2.5429601669311523 }, { "auxiliary_loss_clip": 0.06408291, "auxiliary_loss_mlp": 0.01268209, "balance_loss_clip": 0.06272367, "balance_loss_mlp": 0.01259113, "epoch": 0.8737111077709304, "flos": 22023637850880.0, "grad_norm": 1.803324898026441, "language_loss": 0.6198985, "learning_rate": 1.6491017042625966e-07, "loss": 0.69666356, "num_input_tokens_seen": 313305415, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.09088135, "step": 14532, "time_per_iteration": 2.560553550720215 }, { "auxiliary_loss_clip": 0.06315902, "auxiliary_loss_mlp": 0.01251164, "balance_loss_clip": 0.0626054, "balance_loss_mlp": 0.01250285, "epoch": 0.8737712310235983, "flos": 70086418842240.0, "grad_norm": 0.808665090074078, "language_loss": 0.58734274, "learning_rate": 1.6475534273359704e-07, "loss": 0.6630134, "num_input_tokens_seen": 313369940, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00880432, "step": 14533, "time_per_iteration": 4.616446018218994 }, { "auxiliary_loss_clip": 0.06413111, "auxiliary_loss_mlp": 0.01262498, "balance_loss_clip": 0.06275867, "balance_loss_mlp": 0.01253175, "epoch": 0.8738313542762663, "flos": 28665968313600.0, "grad_norm": 1.439597024705628, "language_loss": 0.7662425, "learning_rate": 1.646005846335954e-07, "loss": 0.84299856, "num_input_tokens_seen": 313390965, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09313965, "step": 14534, "time_per_iteration": 2.596853733062744 }, { "auxiliary_loss_clip": 0.0641748, "auxiliary_loss_mlp": 0.01264688, "balance_loss_clip": 0.06278028, "balance_loss_mlp": 0.01255562, "epoch": 0.8738914775289344, "flos": 22352981523840.0, "grad_norm": 1.6081535190604952, "language_loss": 0.75120962, "learning_rate": 1.6444589613212357e-07, "loss": 0.8280313, "num_input_tokens_seen": 313409680, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09130859, "step": 14535, "time_per_iteration": 2.577895402908325 }, { "auxiliary_loss_clip": 0.06415904, "auxiliary_loss_mlp": 0.01262133, "balance_loss_clip": 0.06275946, "balance_loss_mlp": 0.01253079, "epoch": 0.8739516007816023, "flos": 31767808682880.0, "grad_norm": 1.787433703767106, "language_loss": 0.74306846, "learning_rate": 1.64291277235048e-07, "loss": 0.81984878, "num_input_tokens_seen": 313431335, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09063721, "step": 14536, "time_per_iteration": 2.634326934814453 }, { "auxiliary_loss_clip": 0.06415668, "auxiliary_loss_mlp": 0.01261783, "balance_loss_clip": 0.06273383, "balance_loss_mlp": 0.01252187, "epoch": 0.8740117240342703, "flos": 21217518046080.0, "grad_norm": 1.593623523847416, "language_loss": 0.63942558, "learning_rate": 1.641367279482304e-07, "loss": 0.71620011, "num_input_tokens_seen": 313449225, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09594727, "step": 14537, "time_per_iteration": 2.600578784942627 }, { "auxiliary_loss_clip": 0.06414805, "auxiliary_loss_mlp": 0.0126668, "balance_loss_clip": 0.06277321, "balance_loss_mlp": 0.01257191, "epoch": 0.8740718472869382, "flos": 25192800576000.0, "grad_norm": 1.9483555691867058, "language_loss": 0.58070457, "learning_rate": 1.6398224827753216e-07, "loss": 0.65751946, "num_input_tokens_seen": 313467715, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09484863, "step": 14538, "time_per_iteration": 2.5805113315582275 }, { "auxiliary_loss_clip": 0.06412318, "auxiliary_loss_mlp": 0.01265828, "balance_loss_clip": 0.06277813, "balance_loss_mlp": 0.01256076, "epoch": 0.8741319705396062, "flos": 19507124977920.0, "grad_norm": 1.846759596055, "language_loss": 0.68571782, "learning_rate": 1.6382783822881142e-07, "loss": 0.76249921, "num_input_tokens_seen": 313486805, "router_z_loss_clip": 1.34277344, "router_z_loss_mlp": 0.09753418, "step": 14539, "time_per_iteration": 2.5678648948669434 }, { "auxiliary_loss_clip": 0.06422752, "auxiliary_loss_mlp": 0.01265435, "balance_loss_clip": 0.0627692, "balance_loss_mlp": 0.01255267, "epoch": 0.8741920937922741, "flos": 14106167953920.0, "grad_norm": 1.9285114676469477, "language_loss": 0.74286163, "learning_rate": 1.6367349780792262e-07, "loss": 0.81974351, "num_input_tokens_seen": 313504880, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10168457, "step": 14540, "time_per_iteration": 2.5301756858825684 }, { "auxiliary_loss_clip": 0.06416781, "auxiliary_loss_mlp": 0.01263455, "balance_loss_clip": 0.06278253, "balance_loss_mlp": 0.01253751, "epoch": 0.8742522170449422, "flos": 27717363440640.0, "grad_norm": 1.7727608867855724, "language_loss": 0.79590297, "learning_rate": 1.635192270207193e-07, "loss": 0.87270534, "num_input_tokens_seen": 313524995, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.0970459, "step": 14541, "time_per_iteration": 2.675139904022217 }, { "auxiliary_loss_clip": 0.06425965, "auxiliary_loss_mlp": 0.01269892, "balance_loss_clip": 0.06282184, "balance_loss_mlp": 0.01259044, "epoch": 0.8743123402976101, "flos": 21149021733120.0, "grad_norm": 2.098215402382077, "language_loss": 0.66754454, "learning_rate": 1.6336502587305035e-07, "loss": 0.74450308, "num_input_tokens_seen": 313541740, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.10852051, "step": 14542, "time_per_iteration": 2.567486047744751 }, { "auxiliary_loss_clip": 0.06311993, "auxiliary_loss_mlp": 0.01250587, "balance_loss_clip": 0.06256768, "balance_loss_mlp": 0.01249586, "epoch": 0.8743724635502781, "flos": 60888275141760.0, "grad_norm": 0.7702351931549439, "language_loss": 0.54530102, "learning_rate": 1.632108943707642e-07, "loss": 0.62092686, "num_input_tokens_seen": 313593445, "router_z_loss_clip": 0.55322266, "router_z_loss_mlp": 0.01000214, "step": 14543, "time_per_iteration": 2.9679553508758545 }, { "auxiliary_loss_clip": 0.06417727, "auxiliary_loss_mlp": 0.01269201, "balance_loss_clip": 0.06276277, "balance_loss_mlp": 0.01259026, "epoch": 0.874432586802946, "flos": 28116545091840.0, "grad_norm": 2.0609287262644935, "language_loss": 0.69838589, "learning_rate": 1.6305683251970458e-07, "loss": 0.7752552, "num_input_tokens_seen": 313615640, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.10186768, "step": 14544, "time_per_iteration": 2.587301015853882 }, { "auxiliary_loss_clip": 0.06410085, "auxiliary_loss_mlp": 0.01261757, "balance_loss_clip": 0.06276381, "balance_loss_mlp": 0.01253338, "epoch": 0.874492710055614, "flos": 23557067095680.0, "grad_norm": 1.5171124001082144, "language_loss": 0.76019442, "learning_rate": 1.62902840325714e-07, "loss": 0.83691287, "num_input_tokens_seen": 313635550, "router_z_loss_clip": 1.33496094, "router_z_loss_mlp": 0.084198, "step": 14545, "time_per_iteration": 2.574509382247925 }, { "auxiliary_loss_clip": 0.06414337, "auxiliary_loss_mlp": 0.01266918, "balance_loss_clip": 0.06274918, "balance_loss_mlp": 0.01255879, "epoch": 0.8745528333082819, "flos": 40925016864000.0, "grad_norm": 1.665989590479788, "language_loss": 0.66591889, "learning_rate": 1.6274891779463217e-07, "loss": 0.74273145, "num_input_tokens_seen": 313659275, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.1104126, "step": 14546, "time_per_iteration": 2.7376646995544434 }, { "auxiliary_loss_clip": 0.06415655, "auxiliary_loss_mlp": 0.01264124, "balance_loss_clip": 0.06277086, "balance_loss_mlp": 0.01254414, "epoch": 0.87461295656095, "flos": 23629630331520.0, "grad_norm": 1.5552089480289188, "language_loss": 0.73082072, "learning_rate": 1.6259506493229536e-07, "loss": 0.8076185, "num_input_tokens_seen": 313680595, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09710693, "step": 14547, "time_per_iteration": 2.5660512447357178 }, { "auxiliary_loss_clip": 0.06427912, "auxiliary_loss_mlp": 0.01266461, "balance_loss_clip": 0.06279972, "balance_loss_mlp": 0.01256239, "epoch": 0.874673079813618, "flos": 38802235127040.0, "grad_norm": 2.067915860267022, "language_loss": 0.6962682, "learning_rate": 1.6244128174453752e-07, "loss": 0.77321196, "num_input_tokens_seen": 313699730, "router_z_loss_clip": 1.47851562, "router_z_loss_mlp": 0.10217285, "step": 14548, "time_per_iteration": 2.684610366821289 }, { "auxiliary_loss_clip": 0.06420968, "auxiliary_loss_mlp": 0.01262704, "balance_loss_clip": 0.06276645, "balance_loss_mlp": 0.01253239, "epoch": 0.8747332030662859, "flos": 23702948254080.0, "grad_norm": 2.0666246288818946, "language_loss": 0.71123827, "learning_rate": 1.6228756823719093e-07, "loss": 0.78807497, "num_input_tokens_seen": 313720090, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.09466553, "step": 14549, "time_per_iteration": 2.620800018310547 }, { "auxiliary_loss_clip": 0.06422991, "auxiliary_loss_mlp": 0.01267543, "balance_loss_clip": 0.06276928, "balance_loss_mlp": 0.01256731, "epoch": 0.8747933263189539, "flos": 24469390350720.0, "grad_norm": 2.3384373673189085, "language_loss": 0.83744764, "learning_rate": 1.6213392441608352e-07, "loss": 0.91435301, "num_input_tokens_seen": 313736795, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.1081543, "step": 14550, "time_per_iteration": 2.564638614654541 }, { "auxiliary_loss_clip": 0.06419733, "auxiliary_loss_mlp": 0.01265264, "balance_loss_clip": 0.06277317, "balance_loss_mlp": 0.01255936, "epoch": 0.8748534495716218, "flos": 13814405637120.0, "grad_norm": 1.633927205888018, "language_loss": 0.72507942, "learning_rate": 1.6198035028704183e-07, "loss": 0.80192935, "num_input_tokens_seen": 313754820, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09338379, "step": 14551, "time_per_iteration": 3.9772186279296875 }, { "auxiliary_loss_clip": 0.06414358, "auxiliary_loss_mlp": 0.01262511, "balance_loss_clip": 0.06277247, "balance_loss_mlp": 0.01253368, "epoch": 0.8749135728242898, "flos": 29869886177280.0, "grad_norm": 1.8864576146126424, "language_loss": 0.64537656, "learning_rate": 1.6182684585588934e-07, "loss": 0.72214526, "num_input_tokens_seen": 313775830, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.0914917, "step": 14552, "time_per_iteration": 2.6525774002075195 }, { "auxiliary_loss_clip": 0.06421239, "auxiliary_loss_mlp": 0.01270725, "balance_loss_clip": 0.06277326, "balance_loss_mlp": 0.01259841, "epoch": 0.8749736960769577, "flos": 24140256312960.0, "grad_norm": 1.9706256512205769, "language_loss": 0.79960328, "learning_rate": 1.616734111284479e-07, "loss": 0.8765229, "num_input_tokens_seen": 313795745, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10882568, "step": 14553, "time_per_iteration": 2.568791627883911 }, { "auxiliary_loss_clip": 0.06420298, "auxiliary_loss_mlp": 0.01263872, "balance_loss_clip": 0.06276987, "balance_loss_mlp": 0.01253727, "epoch": 0.8750338193296258, "flos": 17208385666560.0, "grad_norm": 3.473219982441424, "language_loss": 0.69958758, "learning_rate": 1.6152004611053416e-07, "loss": 0.7764293, "num_input_tokens_seen": 313813895, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10144043, "step": 14554, "time_per_iteration": 2.577291250228882 }, { "auxiliary_loss_clip": 0.06412891, "auxiliary_loss_mlp": 0.01261679, "balance_loss_clip": 0.06274244, "balance_loss_mlp": 0.01252601, "epoch": 0.8750939425822937, "flos": 23740110339840.0, "grad_norm": 1.3968655671174022, "language_loss": 0.84087753, "learning_rate": 1.6136675080796457e-07, "loss": 0.91762316, "num_input_tokens_seen": 313834225, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09075928, "step": 14555, "time_per_iteration": 2.570086717605591 }, { "auxiliary_loss_clip": 0.06417233, "auxiliary_loss_mlp": 0.01268427, "balance_loss_clip": 0.06277845, "balance_loss_mlp": 0.01258676, "epoch": 0.8751540658349617, "flos": 26548888654080.0, "grad_norm": 1.6056192197448658, "language_loss": 0.71324587, "learning_rate": 1.6121352522655252e-07, "loss": 0.79010248, "num_input_tokens_seen": 313854430, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09747314, "step": 14556, "time_per_iteration": 2.5934898853302 }, { "auxiliary_loss_clip": 0.06421598, "auxiliary_loss_mlp": 0.01267555, "balance_loss_clip": 0.06277408, "balance_loss_mlp": 0.01256951, "epoch": 0.8752141890876296, "flos": 19392200703360.0, "grad_norm": 1.7731971822941868, "language_loss": 0.77121234, "learning_rate": 1.6106036937210732e-07, "loss": 0.84810388, "num_input_tokens_seen": 313871600, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.1060791, "step": 14557, "time_per_iteration": 3.9953525066375732 }, { "auxiliary_loss_clip": 0.06421164, "auxiliary_loss_mlp": 0.01265512, "balance_loss_clip": 0.06280103, "balance_loss_mlp": 0.01255307, "epoch": 0.8752743123402976, "flos": 25381462043520.0, "grad_norm": 1.7165230491139938, "language_loss": 0.83358502, "learning_rate": 1.6090728325043767e-07, "loss": 0.91045183, "num_input_tokens_seen": 313891570, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.10189819, "step": 14558, "time_per_iteration": 2.5799829959869385 }, { "auxiliary_loss_clip": 0.06313455, "auxiliary_loss_mlp": 0.01251719, "balance_loss_clip": 0.0625812, "balance_loss_mlp": 0.01250729, "epoch": 0.8753344355929655, "flos": 59969578976640.0, "grad_norm": 0.7925793018742628, "language_loss": 0.56097758, "learning_rate": 1.6075426686734784e-07, "loss": 0.63662934, "num_input_tokens_seen": 313951290, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00989532, "step": 14559, "time_per_iteration": 3.1893389225006104 }, { "auxiliary_loss_clip": 0.06414272, "auxiliary_loss_mlp": 0.01264892, "balance_loss_clip": 0.06276444, "balance_loss_mlp": 0.012561, "epoch": 0.8753945588456336, "flos": 17900419737600.0, "grad_norm": 2.330609790207922, "language_loss": 0.66562438, "learning_rate": 1.606013202286407e-07, "loss": 0.74241602, "num_input_tokens_seen": 313968645, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.08795166, "step": 14560, "time_per_iteration": 2.538303852081299 }, { "auxiliary_loss_clip": 0.06412406, "auxiliary_loss_mlp": 0.012641, "balance_loss_clip": 0.06274696, "balance_loss_mlp": 0.01254456, "epoch": 0.8754546820983016, "flos": 30921969242880.0, "grad_norm": 1.980005210540459, "language_loss": 0.79382873, "learning_rate": 1.6044844334011541e-07, "loss": 0.87059373, "num_input_tokens_seen": 313987580, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09649658, "step": 14561, "time_per_iteration": 2.6016831398010254 }, { "auxiliary_loss_clip": 0.06418285, "auxiliary_loss_mlp": 0.01262923, "balance_loss_clip": 0.06274669, "balance_loss_mlp": 0.01252486, "epoch": 0.8755148053509695, "flos": 20637305648640.0, "grad_norm": 2.184134520076159, "language_loss": 0.78072989, "learning_rate": 1.6029563620756982e-07, "loss": 0.85754192, "num_input_tokens_seen": 314004460, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10430908, "step": 14562, "time_per_iteration": 2.5360219478607178 }, { "auxiliary_loss_clip": 0.06411275, "auxiliary_loss_mlp": 0.01266694, "balance_loss_clip": 0.06277496, "balance_loss_mlp": 0.01257509, "epoch": 0.8755749286036375, "flos": 34978326197760.0, "grad_norm": 1.379630929474751, "language_loss": 0.72073084, "learning_rate": 1.601428988367981e-07, "loss": 0.79751056, "num_input_tokens_seen": 314026855, "router_z_loss_clip": 1.33691406, "router_z_loss_mlp": 0.09191895, "step": 14563, "time_per_iteration": 2.6644766330718994 }, { "auxiliary_loss_clip": 0.06423168, "auxiliary_loss_mlp": 0.01266068, "balance_loss_clip": 0.06278501, "balance_loss_mlp": 0.01255793, "epoch": 0.8756350518563054, "flos": 18192265908480.0, "grad_norm": 2.475966714064481, "language_loss": 0.65872622, "learning_rate": 1.5999023123359235e-07, "loss": 0.73561859, "num_input_tokens_seen": 314042830, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.10272217, "step": 14564, "time_per_iteration": 2.5209450721740723 }, { "auxiliary_loss_clip": 0.06417428, "auxiliary_loss_mlp": 0.01265993, "balance_loss_clip": 0.06277093, "balance_loss_mlp": 0.01256629, "epoch": 0.8756951751089734, "flos": 20090188414080.0, "grad_norm": 1.5809317269575132, "language_loss": 0.7102747, "learning_rate": 1.598376334037408e-07, "loss": 0.7871089, "num_input_tokens_seen": 314062225, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09362793, "step": 14565, "time_per_iteration": 2.5543081760406494 }, { "auxiliary_loss_clip": 0.06424704, "auxiliary_loss_mlp": 0.01265883, "balance_loss_clip": 0.06277262, "balance_loss_mlp": 0.01254856, "epoch": 0.8757552983616413, "flos": 27532349625600.0, "grad_norm": 1.6108915406377144, "language_loss": 0.7804938, "learning_rate": 1.5968510535303102e-07, "loss": 0.8573997, "num_input_tokens_seen": 314082325, "router_z_loss_clip": 1.47363281, "router_z_loss_mlp": 0.11016846, "step": 14566, "time_per_iteration": 4.127814054489136 }, { "auxiliary_loss_clip": 0.06416087, "auxiliary_loss_mlp": 0.012682, "balance_loss_clip": 0.06276084, "balance_loss_mlp": 0.01258866, "epoch": 0.8758154216143094, "flos": 18078138247680.0, "grad_norm": 1.546888763134085, "language_loss": 0.71139634, "learning_rate": 1.5953264708724624e-07, "loss": 0.78823918, "num_input_tokens_seen": 314100310, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09332275, "step": 14567, "time_per_iteration": 2.5845084190368652 }, { "auxiliary_loss_clip": 0.0641579, "auxiliary_loss_mlp": 0.01268352, "balance_loss_clip": 0.06278018, "balance_loss_mlp": 0.01258821, "epoch": 0.8758755448669773, "flos": 25052621495040.0, "grad_norm": 3.1831937851820973, "language_loss": 0.7435171, "learning_rate": 1.5938025861216776e-07, "loss": 0.82035851, "num_input_tokens_seen": 314121330, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09533691, "step": 14568, "time_per_iteration": 2.6010162830352783 }, { "auxiliary_loss_clip": 0.06411356, "auxiliary_loss_mlp": 0.01268043, "balance_loss_clip": 0.06272515, "balance_loss_mlp": 0.01258614, "epoch": 0.8759356681196453, "flos": 22863439797120.0, "grad_norm": 1.9084147348858937, "language_loss": 0.8723253, "learning_rate": 1.5922793993357475e-07, "loss": 0.94911933, "num_input_tokens_seen": 314139875, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09423828, "step": 14569, "time_per_iteration": 2.5556890964508057 }, { "auxiliary_loss_clip": 0.06419777, "auxiliary_loss_mlp": 0.01262976, "balance_loss_clip": 0.06278127, "balance_loss_mlp": 0.01254066, "epoch": 0.8759957913723132, "flos": 21038835214080.0, "grad_norm": 1.6584469381902927, "language_loss": 0.74400568, "learning_rate": 1.5907569105724284e-07, "loss": 0.82083321, "num_input_tokens_seen": 314157850, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.08905029, "step": 14570, "time_per_iteration": 2.549790859222412 }, { "auxiliary_loss_clip": 0.06423268, "auxiliary_loss_mlp": 0.01263334, "balance_loss_clip": 0.06279646, "balance_loss_mlp": 0.01253356, "epoch": 0.8760559146249812, "flos": 20016535075200.0, "grad_norm": 1.5489712832640152, "language_loss": 0.68229347, "learning_rate": 1.5892351198894472e-07, "loss": 0.75915951, "num_input_tokens_seen": 314176720, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.09973145, "step": 14571, "time_per_iteration": 2.5970299243927 }, { "auxiliary_loss_clip": 0.06414479, "auxiliary_loss_mlp": 0.01263792, "balance_loss_clip": 0.06276617, "balance_loss_mlp": 0.01254714, "epoch": 0.8761160378776491, "flos": 19980253457280.0, "grad_norm": 1.8924771860903589, "language_loss": 0.62888789, "learning_rate": 1.5877140273445156e-07, "loss": 0.70567065, "num_input_tokens_seen": 314196645, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09069824, "step": 14572, "time_per_iteration": 2.531372308731079 }, { "auxiliary_loss_clip": 0.06412537, "auxiliary_loss_mlp": 0.01263735, "balance_loss_clip": 0.06276636, "balance_loss_mlp": 0.01254651, "epoch": 0.8761761611303172, "flos": 28812101034240.0, "grad_norm": 1.823185530334644, "language_loss": 0.74062002, "learning_rate": 1.5861936329953162e-07, "loss": 0.81738275, "num_input_tokens_seen": 314217430, "router_z_loss_clip": 1.35742188, "router_z_loss_mlp": 0.09082031, "step": 14573, "time_per_iteration": 4.034012079238892 }, { "auxiliary_loss_clip": 0.06411505, "auxiliary_loss_mlp": 0.01265487, "balance_loss_clip": 0.06275758, "balance_loss_mlp": 0.0125697, "epoch": 0.8762362843829851, "flos": 18338356702080.0, "grad_norm": 1.8596119975324348, "language_loss": 0.73188579, "learning_rate": 1.5846739368994966e-07, "loss": 0.80865568, "num_input_tokens_seen": 314235310, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.08514404, "step": 14574, "time_per_iteration": 2.4991939067840576 }, { "auxiliary_loss_clip": 0.06416912, "auxiliary_loss_mlp": 0.01264063, "balance_loss_clip": 0.06278153, "balance_loss_mlp": 0.0125455, "epoch": 0.8762964076356531, "flos": 15784681743360.0, "grad_norm": 1.7028772893761157, "language_loss": 0.76075375, "learning_rate": 1.5831549391146903e-07, "loss": 0.83756351, "num_input_tokens_seen": 314252355, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09509277, "step": 14575, "time_per_iteration": 2.498279094696045 }, { "auxiliary_loss_clip": 0.06412736, "auxiliary_loss_mlp": 0.01267732, "balance_loss_clip": 0.06277342, "balance_loss_mlp": 0.01258166, "epoch": 0.8763565308883211, "flos": 33184175374080.0, "grad_norm": 1.8619680580667375, "language_loss": 0.66757941, "learning_rate": 1.5816366396984916e-07, "loss": 0.74438405, "num_input_tokens_seen": 314272755, "router_z_loss_clip": 1.35449219, "router_z_loss_mlp": 0.09564209, "step": 14576, "time_per_iteration": 2.631295919418335 }, { "auxiliary_loss_clip": 0.06411318, "auxiliary_loss_mlp": 0.01264824, "balance_loss_clip": 0.06273106, "balance_loss_mlp": 0.01255675, "epoch": 0.876416654140989, "flos": 15893568524160.0, "grad_norm": 14.329602771445064, "language_loss": 0.66685116, "learning_rate": 1.5801190387084806e-07, "loss": 0.74361253, "num_input_tokens_seen": 314291365, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.0914917, "step": 14577, "time_per_iteration": 2.567842960357666 }, { "auxiliary_loss_clip": 0.06420557, "auxiliary_loss_mlp": 0.01265865, "balance_loss_clip": 0.06278703, "balance_loss_mlp": 0.01255696, "epoch": 0.876476777393657, "flos": 25892381514240.0, "grad_norm": 2.158777468973238, "language_loss": 0.70712215, "learning_rate": 1.5786021362021962e-07, "loss": 0.78398633, "num_input_tokens_seen": 314310075, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.10180664, "step": 14578, "time_per_iteration": 2.5719635486602783 }, { "auxiliary_loss_clip": 0.0642305, "auxiliary_loss_mlp": 0.0126732, "balance_loss_clip": 0.06280531, "balance_loss_mlp": 0.01257587, "epoch": 0.876536900646325, "flos": 13594787285760.0, "grad_norm": 1.874338294562412, "language_loss": 0.71551406, "learning_rate": 1.5770859322371676e-07, "loss": 0.79241782, "num_input_tokens_seen": 314325695, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09735107, "step": 14579, "time_per_iteration": 2.488358497619629 }, { "auxiliary_loss_clip": 0.06412187, "auxiliary_loss_mlp": 0.01263676, "balance_loss_clip": 0.06276509, "balance_loss_mlp": 0.01254944, "epoch": 0.876597023898993, "flos": 12208245448320.0, "grad_norm": 1.6980232667945918, "language_loss": 0.70178068, "learning_rate": 1.5755704268708912e-07, "loss": 0.7785393, "num_input_tokens_seen": 314343605, "router_z_loss_clip": 1.35644531, "router_z_loss_mlp": 0.08734131, "step": 14580, "time_per_iteration": 2.520836353302002 }, { "auxiliary_loss_clip": 0.06412383, "auxiliary_loss_mlp": 0.01265213, "balance_loss_clip": 0.06275211, "balance_loss_mlp": 0.01255569, "epoch": 0.8766571471516609, "flos": 25343629125120.0, "grad_norm": 1.542392197253291, "language_loss": 0.65997982, "learning_rate": 1.5740556201608256e-07, "loss": 0.73675573, "num_input_tokens_seen": 314364275, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09643555, "step": 14581, "time_per_iteration": 2.5685102939605713 }, { "auxiliary_loss_clip": 0.06413266, "auxiliary_loss_mlp": 0.01264407, "balance_loss_clip": 0.06276541, "balance_loss_mlp": 0.01255479, "epoch": 0.8767172704043289, "flos": 30120419485440.0, "grad_norm": 1.53278884689288, "language_loss": 0.73933089, "learning_rate": 1.572541512164416e-07, "loss": 0.81610757, "num_input_tokens_seen": 314385140, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.0894165, "step": 14582, "time_per_iteration": 2.6101670265197754 }, { "auxiliary_loss_clip": 0.06413686, "auxiliary_loss_mlp": 0.01267749, "balance_loss_clip": 0.06274544, "balance_loss_mlp": 0.01257807, "epoch": 0.8767773936569968, "flos": 19287171210240.0, "grad_norm": 2.074278461627842, "language_loss": 0.67369419, "learning_rate": 1.5710281029390826e-07, "loss": 0.75050849, "num_input_tokens_seen": 314403715, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.0994873, "step": 14583, "time_per_iteration": 2.5441246032714844 }, { "auxiliary_loss_clip": 0.06420955, "auxiliary_loss_mlp": 0.01262564, "balance_loss_clip": 0.06279086, "balance_loss_mlp": 0.01253171, "epoch": 0.8768375169096648, "flos": 21252877269120.0, "grad_norm": 2.1013816342651546, "language_loss": 0.79453248, "learning_rate": 1.5695153925422067e-07, "loss": 0.87136769, "num_input_tokens_seen": 314421880, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09393311, "step": 14584, "time_per_iteration": 2.539299726486206 }, { "auxiliary_loss_clip": 0.06418361, "auxiliary_loss_mlp": 0.01267501, "balance_loss_clip": 0.06275433, "balance_loss_mlp": 0.01257929, "epoch": 0.8768976401623327, "flos": 23302383010560.0, "grad_norm": 1.583068436581797, "language_loss": 0.72478509, "learning_rate": 1.5680033810311555e-07, "loss": 0.80164367, "num_input_tokens_seen": 314441585, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.09576416, "step": 14585, "time_per_iteration": 2.570953130722046 }, { "auxiliary_loss_clip": 0.06418025, "auxiliary_loss_mlp": 0.01266119, "balance_loss_clip": 0.06280845, "balance_loss_mlp": 0.01256612, "epoch": 0.8769577634150008, "flos": 21367675762560.0, "grad_norm": 2.0628758473767173, "language_loss": 0.74452513, "learning_rate": 1.5664920684632654e-07, "loss": 0.82136655, "num_input_tokens_seen": 314459020, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09503174, "step": 14586, "time_per_iteration": 2.542581558227539 }, { "auxiliary_loss_clip": 0.06415209, "auxiliary_loss_mlp": 0.01262454, "balance_loss_clip": 0.0627618, "balance_loss_mlp": 0.01253054, "epoch": 0.8770178866676687, "flos": 23520869331840.0, "grad_norm": 1.6640797753988688, "language_loss": 0.79146963, "learning_rate": 1.564981454895844e-07, "loss": 0.86824632, "num_input_tokens_seen": 314478935, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09399414, "step": 14587, "time_per_iteration": 2.563652992248535 }, { "auxiliary_loss_clip": 0.06418065, "auxiliary_loss_mlp": 0.01269794, "balance_loss_clip": 0.06278475, "balance_loss_mlp": 0.01260013, "epoch": 0.8770780099203367, "flos": 19725150101760.0, "grad_norm": 1.5167648960079763, "language_loss": 0.74002421, "learning_rate": 1.5634715403861697e-07, "loss": 0.81690282, "num_input_tokens_seen": 314497635, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09771729, "step": 14588, "time_per_iteration": 2.532278299331665 }, { "auxiliary_loss_clip": 0.06412573, "auxiliary_loss_mlp": 0.0126507, "balance_loss_clip": 0.0627635, "balance_loss_mlp": 0.01256141, "epoch": 0.8771381331730047, "flos": 21402028736640.0, "grad_norm": 2.452809049794946, "language_loss": 0.66839731, "learning_rate": 1.5619623249915016e-07, "loss": 0.74517369, "num_input_tokens_seen": 314515445, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.08929443, "step": 14589, "time_per_iteration": 2.567333698272705 }, { "auxiliary_loss_clip": 0.06415258, "auxiliary_loss_mlp": 0.01265839, "balance_loss_clip": 0.06277347, "balance_loss_mlp": 0.01256744, "epoch": 0.8771982564256726, "flos": 20267194164480.0, "grad_norm": 1.990645495503406, "language_loss": 0.71274388, "learning_rate": 1.5604538087690732e-07, "loss": 0.78955483, "num_input_tokens_seen": 314533040, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09094238, "step": 14590, "time_per_iteration": 4.016010761260986 }, { "auxiliary_loss_clip": 0.06428055, "auxiliary_loss_mlp": 0.0126642, "balance_loss_clip": 0.0628149, "balance_loss_mlp": 0.01255834, "epoch": 0.8772583796783406, "flos": 12493341365760.0, "grad_norm": 2.07764295803931, "language_loss": 0.74900007, "learning_rate": 1.558945991776086e-07, "loss": 0.8259449, "num_input_tokens_seen": 314548280, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.105896, "step": 14591, "time_per_iteration": 2.4902291297912598 }, { "auxiliary_loss_clip": 0.06408521, "auxiliary_loss_mlp": 0.01268558, "balance_loss_clip": 0.06275851, "balance_loss_mlp": 0.01259331, "epoch": 0.8773185029310085, "flos": 15925992854400.0, "grad_norm": 1.6094375543125066, "language_loss": 0.80098838, "learning_rate": 1.5574388740697096e-07, "loss": 0.8777591, "num_input_tokens_seen": 314565345, "router_z_loss_clip": 1.32714844, "router_z_loss_mlp": 0.09222412, "step": 14592, "time_per_iteration": 2.526341438293457 }, { "auxiliary_loss_clip": 0.06411988, "auxiliary_loss_mlp": 0.01268646, "balance_loss_clip": 0.06276488, "balance_loss_mlp": 0.0126026, "epoch": 0.8773786261836766, "flos": 21510538174080.0, "grad_norm": 1.7761958352653746, "language_loss": 0.83002543, "learning_rate": 1.5559324557071052e-07, "loss": 0.90683186, "num_input_tokens_seen": 314584190, "router_z_loss_clip": 1.35449219, "router_z_loss_mlp": 0.0838623, "step": 14593, "time_per_iteration": 2.528221607208252 }, { "auxiliary_loss_clip": 0.06412338, "auxiliary_loss_mlp": 0.01264542, "balance_loss_clip": 0.06274647, "balance_loss_mlp": 0.01255714, "epoch": 0.8774387494363445, "flos": 26768884348800.0, "grad_norm": 1.352797529061245, "language_loss": 0.76158321, "learning_rate": 1.5544267367453845e-07, "loss": 0.83835196, "num_input_tokens_seen": 314605625, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.0881958, "step": 14594, "time_per_iteration": 2.5744855403900146 }, { "auxiliary_loss_clip": 0.06416416, "auxiliary_loss_mlp": 0.01264538, "balance_loss_clip": 0.06275544, "balance_loss_mlp": 0.01255031, "epoch": 0.8774988726890125, "flos": 18484782912000.0, "grad_norm": 2.2606227231031153, "language_loss": 0.78051174, "learning_rate": 1.552921717241651e-07, "loss": 0.85732126, "num_input_tokens_seen": 314622630, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09509277, "step": 14595, "time_per_iteration": 2.5143930912017822 }, { "auxiliary_loss_clip": 0.06415742, "auxiliary_loss_mlp": 0.01266454, "balance_loss_clip": 0.06276686, "balance_loss_mlp": 0.01256637, "epoch": 0.8775589959416804, "flos": 24433360295040.0, "grad_norm": 1.448156246040193, "language_loss": 0.70584315, "learning_rate": 1.5514173972529743e-07, "loss": 0.78266513, "num_input_tokens_seen": 314642460, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09820557, "step": 14596, "time_per_iteration": 2.5731751918792725 }, { "auxiliary_loss_clip": 0.06411515, "auxiliary_loss_mlp": 0.01263504, "balance_loss_clip": 0.06275096, "balance_loss_mlp": 0.01254134, "epoch": 0.8776191191943484, "flos": 23446796722560.0, "grad_norm": 1.8196343216680708, "language_loss": 0.8609879, "learning_rate": 1.5499137768364067e-07, "loss": 0.937738, "num_input_tokens_seen": 314659875, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09375, "step": 14597, "time_per_iteration": 3.984241247177124 }, { "auxiliary_loss_clip": 0.0641301, "auxiliary_loss_mlp": 0.0126571, "balance_loss_clip": 0.0627386, "balance_loss_mlp": 0.01256311, "epoch": 0.8776792424470163, "flos": 26837674151040.0, "grad_norm": 1.7914410692907345, "language_loss": 0.73218381, "learning_rate": 1.5484108560489494e-07, "loss": 0.80897105, "num_input_tokens_seen": 314680260, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09399414, "step": 14598, "time_per_iteration": 2.5732388496398926 }, { "auxiliary_loss_clip": 0.0641441, "auxiliary_loss_mlp": 0.01264098, "balance_loss_clip": 0.06274845, "balance_loss_mlp": 0.01254513, "epoch": 0.8777393656996844, "flos": 15630499031040.0, "grad_norm": 2.8817245899209323, "language_loss": 0.77386624, "learning_rate": 1.5469086349476036e-07, "loss": 0.85065126, "num_input_tokens_seen": 314696260, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.0958252, "step": 14599, "time_per_iteration": 2.48661732673645 }, { "auxiliary_loss_clip": 0.06420667, "auxiliary_loss_mlp": 0.01265442, "balance_loss_clip": 0.06280893, "balance_loss_mlp": 0.01256209, "epoch": 0.8777994889523523, "flos": 18885977061120.0, "grad_norm": 2.788611893127973, "language_loss": 0.67967248, "learning_rate": 1.545407113589332e-07, "loss": 0.7565335, "num_input_tokens_seen": 314714215, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09240723, "step": 14600, "time_per_iteration": 2.5126023292541504 }, { "auxiliary_loss_clip": 0.06414242, "auxiliary_loss_mlp": 0.0126602, "balance_loss_clip": 0.06275725, "balance_loss_mlp": 0.01255726, "epoch": 0.8778596122050203, "flos": 48836113850880.0, "grad_norm": 1.8099461467165125, "language_loss": 0.69678819, "learning_rate": 1.543906292031072e-07, "loss": 0.7735908, "num_input_tokens_seen": 314735700, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.10290527, "step": 14601, "time_per_iteration": 2.7383532524108887 }, { "auxiliary_loss_clip": 0.06425138, "auxiliary_loss_mlp": 0.01266857, "balance_loss_clip": 0.06279871, "balance_loss_mlp": 0.01256743, "epoch": 0.8779197354576883, "flos": 25666264471680.0, "grad_norm": 1.7744452390369159, "language_loss": 0.73533773, "learning_rate": 1.542406170329733e-07, "loss": 0.81225765, "num_input_tokens_seen": 314753335, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.10113525, "step": 14602, "time_per_iteration": 2.5840797424316406 }, { "auxiliary_loss_clip": 0.06411055, "auxiliary_loss_mlp": 0.01266742, "balance_loss_clip": 0.06273758, "balance_loss_mlp": 0.01257402, "epoch": 0.8779798587103562, "flos": 18849150391680.0, "grad_norm": 1.7045043356237908, "language_loss": 0.71257567, "learning_rate": 1.5409067485422056e-07, "loss": 0.78935361, "num_input_tokens_seen": 314770800, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09332275, "step": 14603, "time_per_iteration": 2.512082576751709 }, { "auxiliary_loss_clip": 0.06314252, "auxiliary_loss_mlp": 0.0125099, "balance_loss_clip": 0.06259041, "balance_loss_mlp": 0.01250002, "epoch": 0.8780399819630242, "flos": 68634022095360.0, "grad_norm": 0.7298790253832602, "language_loss": 0.5406003, "learning_rate": 1.539408026725344e-07, "loss": 0.61625278, "num_input_tokens_seen": 314837275, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00988007, "step": 14604, "time_per_iteration": 3.160938262939453 }, { "auxiliary_loss_clip": 0.06315894, "auxiliary_loss_mlp": 0.0125096, "balance_loss_clip": 0.06260783, "balance_loss_mlp": 0.01249946, "epoch": 0.8781001052156922, "flos": 65755908927360.0, "grad_norm": 0.704728495114905, "language_loss": 0.59232038, "learning_rate": 1.537910004935976e-07, "loss": 0.66798896, "num_input_tokens_seen": 314902220, "router_z_loss_clip": 0.55322266, "router_z_loss_mlp": 0.01014709, "step": 14605, "time_per_iteration": 4.57100772857666 }, { "auxiliary_loss_clip": 0.06419502, "auxiliary_loss_mlp": 0.0126474, "balance_loss_clip": 0.06277455, "balance_loss_mlp": 0.01255293, "epoch": 0.8781602284683602, "flos": 22055391348480.0, "grad_norm": 2.2990293744818953, "language_loss": 0.8539353, "learning_rate": 1.536412683230912e-07, "loss": 0.93077779, "num_input_tokens_seen": 314921645, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09436035, "step": 14606, "time_per_iteration": 2.563302755355835 }, { "auxiliary_loss_clip": 0.064197, "auxiliary_loss_mlp": 0.01266049, "balance_loss_clip": 0.06277263, "balance_loss_mlp": 0.01256048, "epoch": 0.8782203517210281, "flos": 17568099244800.0, "grad_norm": 2.05085953929546, "language_loss": 0.70952213, "learning_rate": 1.534916061666931e-07, "loss": 0.78637958, "num_input_tokens_seen": 314939390, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.10009766, "step": 14607, "time_per_iteration": 2.5174360275268555 }, { "auxiliary_loss_clip": 0.06413884, "auxiliary_loss_mlp": 0.01264198, "balance_loss_clip": 0.06276956, "balance_loss_mlp": 0.01255419, "epoch": 0.8782804749736961, "flos": 25527510910080.0, "grad_norm": 1.7423859097785375, "language_loss": 0.72510409, "learning_rate": 1.533420140300785e-07, "loss": 0.80188489, "num_input_tokens_seen": 314959205, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.08776855, "step": 14608, "time_per_iteration": 2.6422152519226074 }, { "auxiliary_loss_clip": 0.06427176, "auxiliary_loss_mlp": 0.01266675, "balance_loss_clip": 0.06282796, "balance_loss_mlp": 0.0125687, "epoch": 0.878340598226364, "flos": 21805193456640.0, "grad_norm": 2.173932647677119, "language_loss": 0.87703836, "learning_rate": 1.5319249191891936e-07, "loss": 0.95397681, "num_input_tokens_seen": 314977485, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.09802246, "step": 14609, "time_per_iteration": 2.5368764400482178 }, { "auxiliary_loss_clip": 0.06416748, "auxiliary_loss_mlp": 0.01267355, "balance_loss_clip": 0.06278815, "balance_loss_mlp": 0.01257366, "epoch": 0.878400721479032, "flos": 21108211994880.0, "grad_norm": 1.453649038444674, "language_loss": 0.70540047, "learning_rate": 1.5304303983888643e-07, "loss": 0.78224158, "num_input_tokens_seen": 314997830, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09997559, "step": 14610, "time_per_iteration": 2.5713980197906494 }, { "auxiliary_loss_clip": 0.06411268, "auxiliary_loss_mlp": 0.01265416, "balance_loss_clip": 0.06276226, "balance_loss_mlp": 0.0125554, "epoch": 0.8784608447316999, "flos": 20929906506240.0, "grad_norm": 2.1057916180839604, "language_loss": 0.8016147, "learning_rate": 1.5289365779564612e-07, "loss": 0.87838155, "num_input_tokens_seen": 315016480, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.09875488, "step": 14611, "time_per_iteration": 2.544692277908325 }, { "auxiliary_loss_clip": 0.06413226, "auxiliary_loss_mlp": 0.01264192, "balance_loss_clip": 0.06273147, "balance_loss_mlp": 0.01254482, "epoch": 0.878520967984368, "flos": 23337281036160.0, "grad_norm": 1.5461567281818556, "language_loss": 0.76788765, "learning_rate": 1.5274434579486338e-07, "loss": 0.84466183, "num_input_tokens_seen": 315036135, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09710693, "step": 14612, "time_per_iteration": 2.559490442276001 }, { "auxiliary_loss_clip": 0.06415065, "auxiliary_loss_mlp": 0.01266184, "balance_loss_clip": 0.06277448, "balance_loss_mlp": 0.01256999, "epoch": 0.8785810912370359, "flos": 25525833828480.0, "grad_norm": 1.3529171519344259, "language_loss": 0.72368461, "learning_rate": 1.525951038422002e-07, "loss": 0.80049706, "num_input_tokens_seen": 315057995, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09179688, "step": 14613, "time_per_iteration": 4.270549058914185 }, { "auxiliary_loss_clip": 0.06316155, "auxiliary_loss_mlp": 0.0125126, "balance_loss_clip": 0.06261033, "balance_loss_mlp": 0.01250415, "epoch": 0.8786412144897039, "flos": 61857103576320.0, "grad_norm": 1.0313022198445208, "language_loss": 0.64783734, "learning_rate": 1.5244593194331667e-07, "loss": 0.72351146, "num_input_tokens_seen": 315104010, "router_z_loss_clip": 0.55224609, "router_z_loss_mlp": 0.00846863, "step": 14614, "time_per_iteration": 2.91609787940979 }, { "auxiliary_loss_clip": 0.063144, "auxiliary_loss_mlp": 0.01250751, "balance_loss_clip": 0.06259329, "balance_loss_mlp": 0.01249805, "epoch": 0.8787013377423719, "flos": 71011445990400.0, "grad_norm": 0.6472791678064256, "language_loss": 0.5810082, "learning_rate": 1.5229683010386762e-07, "loss": 0.65665972, "num_input_tokens_seen": 315174550, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.00943756, "step": 14615, "time_per_iteration": 3.243532180786133 }, { "auxiliary_loss_clip": 0.06412965, "auxiliary_loss_mlp": 0.01265133, "balance_loss_clip": 0.06273932, "balance_loss_mlp": 0.01255775, "epoch": 0.8787614609950398, "flos": 17353092867840.0, "grad_norm": 3.8351623840824054, "language_loss": 0.73040134, "learning_rate": 1.5214779832950807e-07, "loss": 0.80718231, "num_input_tokens_seen": 315191825, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09362793, "step": 14616, "time_per_iteration": 2.5503287315368652 }, { "auxiliary_loss_clip": 0.06313768, "auxiliary_loss_mlp": 0.01251321, "balance_loss_clip": 0.06258728, "balance_loss_mlp": 0.01250299, "epoch": 0.8788215842477078, "flos": 72532003633920.0, "grad_norm": 0.8214860990399303, "language_loss": 0.57847917, "learning_rate": 1.5199883662588953e-07, "loss": 0.6541301, "num_input_tokens_seen": 315255075, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.01021576, "step": 14617, "time_per_iteration": 3.315992593765259 }, { "auxiliary_loss_clip": 0.06412325, "auxiliary_loss_mlp": 0.01267127, "balance_loss_clip": 0.06276598, "balance_loss_mlp": 0.01258085, "epoch": 0.8788817075003758, "flos": 24834470590080.0, "grad_norm": 1.8102261420208516, "language_loss": 0.83927298, "learning_rate": 1.5184994499865987e-07, "loss": 0.91606754, "num_input_tokens_seen": 315273995, "router_z_loss_clip": 1.35742188, "router_z_loss_mlp": 0.09039307, "step": 14618, "time_per_iteration": 2.582315683364868 }, { "auxiliary_loss_clip": 0.06410582, "auxiliary_loss_mlp": 0.01263428, "balance_loss_clip": 0.06278352, "balance_loss_mlp": 0.01254857, "epoch": 0.8789418307530438, "flos": 22645498527360.0, "grad_norm": 1.4875704375235437, "language_loss": 0.69286174, "learning_rate": 1.5170112345346598e-07, "loss": 0.76960188, "num_input_tokens_seen": 315294485, "router_z_loss_clip": 1.32226562, "router_z_loss_mlp": 0.08563232, "step": 14619, "time_per_iteration": 2.545243501663208 }, { "auxiliary_loss_clip": 0.06419311, "auxiliary_loss_mlp": 0.01262979, "balance_loss_clip": 0.06276432, "balance_loss_mlp": 0.01253842, "epoch": 0.8790019540057117, "flos": 19790795376000.0, "grad_norm": 2.652354954311631, "language_loss": 0.77337754, "learning_rate": 1.5155237199595016e-07, "loss": 0.85020047, "num_input_tokens_seen": 315310420, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.09136963, "step": 14620, "time_per_iteration": 2.547337055206299 }, { "auxiliary_loss_clip": 0.06417055, "auxiliary_loss_mlp": 0.01267398, "balance_loss_clip": 0.0627672, "balance_loss_mlp": 0.01257474, "epoch": 0.8790620772583797, "flos": 20235943791360.0, "grad_norm": 1.69057770761046, "language_loss": 0.79339683, "learning_rate": 1.514036906317542e-07, "loss": 0.87024128, "num_input_tokens_seen": 315330110, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.0993042, "step": 14621, "time_per_iteration": 2.5297138690948486 }, { "auxiliary_loss_clip": 0.06422499, "auxiliary_loss_mlp": 0.01265286, "balance_loss_clip": 0.06278619, "balance_loss_mlp": 0.01255487, "epoch": 0.8791222005110476, "flos": 24137111784960.0, "grad_norm": 1.6688498418040076, "language_loss": 0.67261642, "learning_rate": 1.5125507936651506e-07, "loss": 0.74949431, "num_input_tokens_seen": 315350080, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.09802246, "step": 14622, "time_per_iteration": 2.5666239261627197 }, { "auxiliary_loss_clip": 0.06415933, "auxiliary_loss_mlp": 0.01265388, "balance_loss_clip": 0.06278943, "balance_loss_mlp": 0.01255482, "epoch": 0.8791823237637156, "flos": 21620263495680.0, "grad_norm": 2.0816247710609717, "language_loss": 0.73009348, "learning_rate": 1.511065382058687e-07, "loss": 0.80690664, "num_input_tokens_seen": 315366360, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09906006, "step": 14623, "time_per_iteration": 2.5120606422424316 }, { "auxiliary_loss_clip": 0.06411001, "auxiliary_loss_mlp": 0.01263081, "balance_loss_clip": 0.06273536, "balance_loss_mlp": 0.01253616, "epoch": 0.8792424470163835, "flos": 24250275123840.0, "grad_norm": 1.6438101787200428, "language_loss": 0.79102504, "learning_rate": 1.5095806715544801e-07, "loss": 0.8677659, "num_input_tokens_seen": 315385890, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09472656, "step": 14624, "time_per_iteration": 2.5529754161834717 }, { "auxiliary_loss_clip": 0.06416957, "auxiliary_loss_mlp": 0.01268362, "balance_loss_clip": 0.06276022, "balance_loss_mlp": 0.01258068, "epoch": 0.8793025702690516, "flos": 24899025761280.0, "grad_norm": 1.7141099479071022, "language_loss": 0.7993201, "learning_rate": 1.5080966622088265e-07, "loss": 0.87617326, "num_input_tokens_seen": 315403400, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.10290527, "step": 14625, "time_per_iteration": 2.569875717163086 }, { "auxiliary_loss_clip": 0.06413286, "auxiliary_loss_mlp": 0.01267325, "balance_loss_clip": 0.0627705, "balance_loss_mlp": 0.01258599, "epoch": 0.8793626935217195, "flos": 25379952670080.0, "grad_norm": 1.504502152412595, "language_loss": 0.73883909, "learning_rate": 1.5066133540779967e-07, "loss": 0.81564522, "num_input_tokens_seen": 315423670, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.08721924, "step": 14626, "time_per_iteration": 2.5877344608306885 }, { "auxiliary_loss_clip": 0.06420066, "auxiliary_loss_mlp": 0.01264675, "balance_loss_clip": 0.06277789, "balance_loss_mlp": 0.01255305, "epoch": 0.8794228167743875, "flos": 34686563880960.0, "grad_norm": 1.5102993471141912, "language_loss": 0.71308875, "learning_rate": 1.505130747218246e-07, "loss": 0.78993618, "num_input_tokens_seen": 315446265, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09375, "step": 14627, "time_per_iteration": 2.668933868408203 }, { "auxiliary_loss_clip": 0.06411655, "auxiliary_loss_mlp": 0.01264795, "balance_loss_clip": 0.06272118, "balance_loss_mlp": 0.01255372, "epoch": 0.8794829400270555, "flos": 19470130600320.0, "grad_norm": 1.69993266923457, "language_loss": 0.72707987, "learning_rate": 1.5036488416857873e-07, "loss": 0.80384445, "num_input_tokens_seen": 315464655, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09423828, "step": 14628, "time_per_iteration": 2.5531795024871826 }, { "auxiliary_loss_clip": 0.06417918, "auxiliary_loss_mlp": 0.01265355, "balance_loss_clip": 0.06279118, "balance_loss_mlp": 0.01255121, "epoch": 0.8795430632797234, "flos": 15236767895040.0, "grad_norm": 2.989371607037312, "language_loss": 0.69154209, "learning_rate": 1.5021676375368175e-07, "loss": 0.76837486, "num_input_tokens_seen": 315481090, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.10235596, "step": 14629, "time_per_iteration": 2.506929874420166 }, { "auxiliary_loss_clip": 0.06406073, "auxiliary_loss_mlp": 0.0126251, "balance_loss_clip": 0.06270522, "balance_loss_mlp": 0.01253599, "epoch": 0.8796031865323914, "flos": 27751967976960.0, "grad_norm": 2.515076194136709, "language_loss": 0.68778396, "learning_rate": 1.5006871348275053e-07, "loss": 0.7644698, "num_input_tokens_seen": 315502010, "router_z_loss_clip": 1.35546875, "router_z_loss_mlp": 0.08911133, "step": 14630, "time_per_iteration": 4.076311111450195 }, { "auxiliary_loss_clip": 0.06411747, "auxiliary_loss_mlp": 0.01264571, "balance_loss_clip": 0.06277258, "balance_loss_mlp": 0.01255041, "epoch": 0.8796633097850594, "flos": 31293506246400.0, "grad_norm": 1.4012345999509546, "language_loss": 0.74463022, "learning_rate": 1.499207333613999e-07, "loss": 0.82139343, "num_input_tokens_seen": 315523040, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.09521484, "step": 14631, "time_per_iteration": 2.6414949893951416 }, { "auxiliary_loss_clip": 0.06405447, "auxiliary_loss_mlp": 0.01266849, "balance_loss_clip": 0.06272045, "balance_loss_mlp": 0.01257533, "epoch": 0.8797234330377274, "flos": 24249981634560.0, "grad_norm": 2.0202340172910103, "language_loss": 0.69647902, "learning_rate": 1.4977282339523954e-07, "loss": 0.77320194, "num_input_tokens_seen": 315541865, "router_z_loss_clip": 1.33300781, "router_z_loss_mlp": 0.09307861, "step": 14632, "time_per_iteration": 2.554344654083252 }, { "auxiliary_loss_clip": 0.06413133, "auxiliary_loss_mlp": 0.0126184, "balance_loss_clip": 0.06275524, "balance_loss_mlp": 0.01252696, "epoch": 0.8797835562903953, "flos": 24173770746240.0, "grad_norm": 1.975176798078704, "language_loss": 0.65126157, "learning_rate": 1.4962498358987929e-07, "loss": 0.72801131, "num_input_tokens_seen": 315561470, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.0914917, "step": 14633, "time_per_iteration": 2.556940793991089 }, { "auxiliary_loss_clip": 0.06418253, "auxiliary_loss_mlp": 0.01267471, "balance_loss_clip": 0.06280537, "balance_loss_mlp": 0.01258543, "epoch": 0.8798436795430633, "flos": 19291280060160.0, "grad_norm": 1.4881177931234804, "language_loss": 0.84031153, "learning_rate": 1.4947721395092528e-07, "loss": 0.9171688, "num_input_tokens_seen": 315583140, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.08935547, "step": 14634, "time_per_iteration": 2.580238103866577 }, { "auxiliary_loss_clip": 0.0641333, "auxiliary_loss_mlp": 0.01265558, "balance_loss_clip": 0.06274208, "balance_loss_mlp": 0.01255527, "epoch": 0.8799038027957312, "flos": 28186173434880.0, "grad_norm": 2.142442526695947, "language_loss": 0.80011916, "learning_rate": 1.4932951448398056e-07, "loss": 0.87690806, "num_input_tokens_seen": 315601935, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.10028076, "step": 14635, "time_per_iteration": 2.6158711910247803 }, { "auxiliary_loss_clip": 0.06419431, "auxiliary_loss_mlp": 0.01265675, "balance_loss_clip": 0.0627977, "balance_loss_mlp": 0.0125621, "epoch": 0.8799639260483992, "flos": 24651636981120.0, "grad_norm": 1.6134224378258994, "language_loss": 0.65220225, "learning_rate": 1.4918188519464648e-07, "loss": 0.72905326, "num_input_tokens_seen": 315619995, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09466553, "step": 14636, "time_per_iteration": 4.053942441940308 }, { "auxiliary_loss_clip": 0.06415831, "auxiliary_loss_mlp": 0.01268522, "balance_loss_clip": 0.06276268, "balance_loss_mlp": 0.01258705, "epoch": 0.8800240493010671, "flos": 22207058438400.0, "grad_norm": 1.4642012639616904, "language_loss": 0.70680547, "learning_rate": 1.4903432608852074e-07, "loss": 0.78364903, "num_input_tokens_seen": 315637895, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09820557, "step": 14637, "time_per_iteration": 2.5726070404052734 }, { "auxiliary_loss_clip": 0.06417733, "auxiliary_loss_mlp": 0.01265556, "balance_loss_clip": 0.06279488, "balance_loss_mlp": 0.01256001, "epoch": 0.8800841725537352, "flos": 14251252498560.0, "grad_norm": 1.8483610853297205, "language_loss": 0.66179103, "learning_rate": 1.4888683717119843e-07, "loss": 0.73862386, "num_input_tokens_seen": 315655520, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09552002, "step": 14638, "time_per_iteration": 2.5134871006011963 }, { "auxiliary_loss_clip": 0.06419288, "auxiliary_loss_mlp": 0.01263086, "balance_loss_clip": 0.06277096, "balance_loss_mlp": 0.01253257, "epoch": 0.8801442958064031, "flos": 37425043019520.0, "grad_norm": 2.13749521610781, "language_loss": 0.58674979, "learning_rate": 1.4873941844827286e-07, "loss": 0.6635735, "num_input_tokens_seen": 315678955, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09820557, "step": 14639, "time_per_iteration": 2.6758766174316406 }, { "auxiliary_loss_clip": 0.06415123, "auxiliary_loss_mlp": 0.01265065, "balance_loss_clip": 0.06274432, "balance_loss_mlp": 0.01255296, "epoch": 0.8802044190590711, "flos": 25054550138880.0, "grad_norm": 1.4838760551047772, "language_loss": 0.74679565, "learning_rate": 1.4859206992533402e-07, "loss": 0.82359749, "num_input_tokens_seen": 315700360, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09765625, "step": 14640, "time_per_iteration": 2.583171844482422 }, { "auxiliary_loss_clip": 0.06412932, "auxiliary_loss_mlp": 0.01262541, "balance_loss_clip": 0.06273584, "balance_loss_mlp": 0.01252462, "epoch": 0.8802645423117391, "flos": 24140717510400.0, "grad_norm": 1.8922816943449898, "language_loss": 0.70263195, "learning_rate": 1.4844479160796985e-07, "loss": 0.77938664, "num_input_tokens_seen": 315719270, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.10083008, "step": 14641, "time_per_iteration": 2.555044651031494 }, { "auxiliary_loss_clip": 0.06422339, "auxiliary_loss_mlp": 0.0126377, "balance_loss_clip": 0.06280334, "balance_loss_mlp": 0.0125359, "epoch": 0.880324665564407, "flos": 17936994844800.0, "grad_norm": 2.0941623841363954, "language_loss": 0.85387415, "learning_rate": 1.4829758350176457e-07, "loss": 0.93073523, "num_input_tokens_seen": 315737425, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10192871, "step": 14642, "time_per_iteration": 2.5318069458007812 }, { "auxiliary_loss_clip": 0.06415761, "auxiliary_loss_mlp": 0.01264397, "balance_loss_clip": 0.06278035, "balance_loss_mlp": 0.01254384, "epoch": 0.880384788817075, "flos": 21293938569600.0, "grad_norm": 1.6658561882672223, "language_loss": 0.79276311, "learning_rate": 1.4815044561230038e-07, "loss": 0.86956465, "num_input_tokens_seen": 315755725, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.10003662, "step": 14643, "time_per_iteration": 2.544884443283081 }, { "auxiliary_loss_clip": 0.06409905, "auxiliary_loss_mlp": 0.01264239, "balance_loss_clip": 0.062749, "balance_loss_mlp": 0.01255668, "epoch": 0.880444912069743, "flos": 12463390730880.0, "grad_norm": 1.538156299327847, "language_loss": 0.73071539, "learning_rate": 1.4800337794515705e-07, "loss": 0.80745685, "num_input_tokens_seen": 315773835, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.08569336, "step": 14644, "time_per_iteration": 2.5393013954162598 }, { "auxiliary_loss_clip": 0.06425385, "auxiliary_loss_mlp": 0.01265647, "balance_loss_clip": 0.06280853, "balance_loss_mlp": 0.01255234, "epoch": 0.880505035322411, "flos": 13631026976640.0, "grad_norm": 1.783869543969494, "language_loss": 0.79511976, "learning_rate": 1.47856380505911e-07, "loss": 0.87203008, "num_input_tokens_seen": 315790615, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10418701, "step": 14645, "time_per_iteration": 3.960568428039551 }, { "auxiliary_loss_clip": 0.06410065, "auxiliary_loss_mlp": 0.01265749, "balance_loss_clip": 0.06276308, "balance_loss_mlp": 0.01256397, "epoch": 0.8805651585750789, "flos": 23189387379840.0, "grad_norm": 1.4508382008962613, "language_loss": 0.64152145, "learning_rate": 1.477094533001364e-07, "loss": 0.7182796, "num_input_tokens_seen": 315811010, "router_z_loss_clip": 1.33789062, "router_z_loss_mlp": 0.09350586, "step": 14646, "time_per_iteration": 2.5599021911621094 }, { "auxiliary_loss_clip": 0.06422578, "auxiliary_loss_mlp": 0.01266388, "balance_loss_clip": 0.06277224, "balance_loss_mlp": 0.01256065, "epoch": 0.8806252818277469, "flos": 14908304689920.0, "grad_norm": 2.123675084987688, "language_loss": 0.77681756, "learning_rate": 1.475625963334055e-07, "loss": 0.85370719, "num_input_tokens_seen": 315828130, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10327148, "step": 14647, "time_per_iteration": 2.5039706230163574 }, { "auxiliary_loss_clip": 0.0641202, "auxiliary_loss_mlp": 0.01265363, "balance_loss_clip": 0.06275164, "balance_loss_mlp": 0.01256464, "epoch": 0.8806854050804148, "flos": 17644897111680.0, "grad_norm": 1.8384299741231178, "language_loss": 0.75264901, "learning_rate": 1.4741580961128652e-07, "loss": 0.82942283, "num_input_tokens_seen": 315844900, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.08898926, "step": 14648, "time_per_iteration": 2.5013649463653564 }, { "auxiliary_loss_clip": 0.06416713, "auxiliary_loss_mlp": 0.01266572, "balance_loss_clip": 0.06275666, "balance_loss_mlp": 0.01257131, "epoch": 0.8807455283330828, "flos": 25338514026240.0, "grad_norm": 1.6225697506345858, "language_loss": 0.65580899, "learning_rate": 1.4726909313934522e-07, "loss": 0.73264182, "num_input_tokens_seen": 315863745, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09448242, "step": 14649, "time_per_iteration": 2.5601441860198975 }, { "auxiliary_loss_clip": 0.06416167, "auxiliary_loss_mlp": 0.01268266, "balance_loss_clip": 0.06276751, "balance_loss_mlp": 0.01257901, "epoch": 0.8808056515857507, "flos": 25272239846400.0, "grad_norm": 1.3774708964765825, "language_loss": 0.624951, "learning_rate": 1.4712244692314578e-07, "loss": 0.70179534, "num_input_tokens_seen": 315885765, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.10369873, "step": 14650, "time_per_iteration": 2.58457088470459 }, { "auxiliary_loss_clip": 0.06411244, "auxiliary_loss_mlp": 0.01261165, "balance_loss_clip": 0.06274557, "balance_loss_mlp": 0.01252386, "epoch": 0.8808657748384188, "flos": 26586176520960.0, "grad_norm": 1.3757908401877408, "language_loss": 0.73143589, "learning_rate": 1.4697587096824914e-07, "loss": 0.80815995, "num_input_tokens_seen": 315907340, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.08770752, "step": 14651, "time_per_iteration": 2.5760374069213867 }, { "auxiliary_loss_clip": 0.06419183, "auxiliary_loss_mlp": 0.01264265, "balance_loss_clip": 0.06278352, "balance_loss_mlp": 0.01254299, "epoch": 0.8809258980910867, "flos": 18667197250560.0, "grad_norm": 1.7386821053994785, "language_loss": 0.71917582, "learning_rate": 1.4682936528021284e-07, "loss": 0.79601032, "num_input_tokens_seen": 315924935, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09979248, "step": 14652, "time_per_iteration": 3.8870902061462402 }, { "auxiliary_loss_clip": 0.06413183, "auxiliary_loss_mlp": 0.01261052, "balance_loss_clip": 0.06274928, "balance_loss_mlp": 0.01251897, "epoch": 0.8809860213437547, "flos": 19798426097280.0, "grad_norm": 1.7802354550058852, "language_loss": 0.75255764, "learning_rate": 1.4668292986459286e-07, "loss": 0.82930005, "num_input_tokens_seen": 315943165, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09155273, "step": 14653, "time_per_iteration": 2.54642391204834 }, { "auxiliary_loss_clip": 0.06418647, "auxiliary_loss_mlp": 0.01267802, "balance_loss_clip": 0.06275573, "balance_loss_mlp": 0.01258272, "epoch": 0.8810461445964227, "flos": 17900210102400.0, "grad_norm": 1.9684443500992057, "language_loss": 0.72101694, "learning_rate": 1.465365647269421e-07, "loss": 0.79788148, "num_input_tokens_seen": 315961340, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.09527588, "step": 14654, "time_per_iteration": 2.5015499591827393 }, { "auxiliary_loss_clip": 0.06418654, "auxiliary_loss_mlp": 0.01270072, "balance_loss_clip": 0.06279704, "balance_loss_mlp": 0.01260333, "epoch": 0.8811062678490906, "flos": 29170766436480.0, "grad_norm": 1.7590915349364855, "language_loss": 0.7221173, "learning_rate": 1.4639026987281012e-07, "loss": 0.79900455, "num_input_tokens_seen": 315981335, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09735107, "step": 14655, "time_per_iteration": 2.610786199569702 }, { "auxiliary_loss_clip": 0.0640927, "auxiliary_loss_mlp": 0.01266808, "balance_loss_clip": 0.06272174, "balance_loss_mlp": 0.01257516, "epoch": 0.8811663911017587, "flos": 20344956353280.0, "grad_norm": 1.7482888658364575, "language_loss": 0.81360579, "learning_rate": 1.462440453077449e-07, "loss": 0.89036655, "num_input_tokens_seen": 316001325, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09289551, "step": 14656, "time_per_iteration": 2.61846923828125 }, { "auxiliary_loss_clip": 0.06417093, "auxiliary_loss_mlp": 0.01265208, "balance_loss_clip": 0.06278124, "balance_loss_mlp": 0.01256131, "epoch": 0.8812265143544266, "flos": 25892926565760.0, "grad_norm": 1.6781618260288047, "language_loss": 0.68371701, "learning_rate": 1.460978910372914e-07, "loss": 0.76054001, "num_input_tokens_seen": 316022540, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09082031, "step": 14657, "time_per_iteration": 2.5894482135772705 }, { "auxiliary_loss_clip": 0.06418636, "auxiliary_loss_mlp": 0.01267139, "balance_loss_clip": 0.06277774, "balance_loss_mlp": 0.01257882, "epoch": 0.8812866376070946, "flos": 27202335120000.0, "grad_norm": 1.8712542802326304, "language_loss": 0.84405738, "learning_rate": 1.4595180706699207e-07, "loss": 0.92091513, "num_input_tokens_seen": 316037735, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.0925293, "step": 14658, "time_per_iteration": 2.5887210369110107 }, { "auxiliary_loss_clip": 0.06422231, "auxiliary_loss_mlp": 0.01267459, "balance_loss_clip": 0.06276366, "balance_loss_mlp": 0.0125707, "epoch": 0.8813467608597625, "flos": 23814266803200.0, "grad_norm": 1.962985427846686, "language_loss": 0.77514362, "learning_rate": 1.4580579340238554e-07, "loss": 0.85204053, "num_input_tokens_seen": 316058105, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.10394287, "step": 14659, "time_per_iteration": 2.582577705383301 }, { "auxiliary_loss_clip": 0.0641388, "auxiliary_loss_mlp": 0.01263182, "balance_loss_clip": 0.06275034, "balance_loss_mlp": 0.01253491, "epoch": 0.8814068841124305, "flos": 21111775793280.0, "grad_norm": 1.9107825713119844, "language_loss": 0.60894173, "learning_rate": 1.4565985004900894e-07, "loss": 0.68571234, "num_input_tokens_seen": 316074415, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09698486, "step": 14660, "time_per_iteration": 2.528184413909912 }, { "auxiliary_loss_clip": 0.06413101, "auxiliary_loss_mlp": 0.0126368, "balance_loss_clip": 0.06274021, "balance_loss_mlp": 0.01254066, "epoch": 0.8814670073650984, "flos": 24723822873600.0, "grad_norm": 1.6621571767421752, "language_loss": 0.77852106, "learning_rate": 1.455139770123972e-07, "loss": 0.85528898, "num_input_tokens_seen": 316094405, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09606934, "step": 14661, "time_per_iteration": 2.6634011268615723 }, { "auxiliary_loss_clip": 0.06416164, "auxiliary_loss_mlp": 0.01264307, "balance_loss_clip": 0.06275052, "balance_loss_mlp": 0.01254979, "epoch": 0.8815271306177664, "flos": 22972913556480.0, "grad_norm": 1.6715709832429273, "language_loss": 0.76969391, "learning_rate": 1.45368174298081e-07, "loss": 0.84649861, "num_input_tokens_seen": 316113390, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09332275, "step": 14662, "time_per_iteration": 2.627736806869507 }, { "auxiliary_loss_clip": 0.06409518, "auxiliary_loss_mlp": 0.01267663, "balance_loss_clip": 0.06274919, "balance_loss_mlp": 0.01259026, "epoch": 0.8815872538704344, "flos": 19465518625920.0, "grad_norm": 2.4638703109817075, "language_loss": 0.73565245, "learning_rate": 1.4522244191158929e-07, "loss": 0.8124243, "num_input_tokens_seen": 316131085, "router_z_loss_clip": 1.34667969, "router_z_loss_mlp": 0.08636475, "step": 14663, "time_per_iteration": 2.5462138652801514 }, { "auxiliary_loss_clip": 0.06411396, "auxiliary_loss_mlp": 0.01263743, "balance_loss_clip": 0.06273529, "balance_loss_mlp": 0.01254826, "epoch": 0.8816473771231024, "flos": 32164097368320.0, "grad_norm": 1.485636446279066, "language_loss": 0.70412064, "learning_rate": 1.450767798584489e-07, "loss": 0.78087205, "num_input_tokens_seen": 316151440, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.08917236, "step": 14664, "time_per_iteration": 2.634878158569336 }, { "auxiliary_loss_clip": 0.06414393, "auxiliary_loss_mlp": 0.01265329, "balance_loss_clip": 0.06276651, "balance_loss_mlp": 0.01256311, "epoch": 0.8817075003757703, "flos": 19688323432320.0, "grad_norm": 1.4044339354163367, "language_loss": 0.81333673, "learning_rate": 1.449311881441828e-07, "loss": 0.89013392, "num_input_tokens_seen": 316170750, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09020996, "step": 14665, "time_per_iteration": 2.5360357761383057 }, { "auxiliary_loss_clip": 0.06413786, "auxiliary_loss_mlp": 0.01263105, "balance_loss_clip": 0.06275059, "balance_loss_mlp": 0.01253658, "epoch": 0.8817676236284383, "flos": 15673950172800.0, "grad_norm": 2.0526988586465986, "language_loss": 0.59197527, "learning_rate": 1.447856667743117e-07, "loss": 0.66874415, "num_input_tokens_seen": 316187265, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09448242, "step": 14666, "time_per_iteration": 2.5094153881073 }, { "auxiliary_loss_clip": 0.06413831, "auxiliary_loss_mlp": 0.01267808, "balance_loss_clip": 0.06274866, "balance_loss_mlp": 0.01257211, "epoch": 0.8818277468811063, "flos": 17901048643200.0, "grad_norm": 4.169367163081421, "language_loss": 0.84094131, "learning_rate": 1.4464021575435403e-07, "loss": 0.91775769, "num_input_tokens_seen": 316206555, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.10595703, "step": 14667, "time_per_iteration": 2.529160976409912 }, { "auxiliary_loss_clip": 0.06417611, "auxiliary_loss_mlp": 0.01265797, "balance_loss_clip": 0.06279075, "balance_loss_mlp": 0.01255682, "epoch": 0.8818878701337742, "flos": 18776461374720.0, "grad_norm": 1.7172611168266025, "language_loss": 0.62529975, "learning_rate": 1.4449483508982563e-07, "loss": 0.70213377, "num_input_tokens_seen": 316225210, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.10107422, "step": 14668, "time_per_iteration": 2.5074379444122314 }, { "auxiliary_loss_clip": 0.0640935, "auxiliary_loss_mlp": 0.01263603, "balance_loss_clip": 0.06272855, "balance_loss_mlp": 0.01254513, "epoch": 0.8819479933864423, "flos": 17718047326080.0, "grad_norm": 2.403512076517864, "language_loss": 0.5775246, "learning_rate": 1.4434952478623918e-07, "loss": 0.65425408, "num_input_tokens_seen": 316242685, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.09082031, "step": 14669, "time_per_iteration": 2.50544810295105 }, { "auxiliary_loss_clip": 0.06415602, "auxiliary_loss_mlp": 0.01265252, "balance_loss_clip": 0.06276162, "balance_loss_mlp": 0.01255799, "epoch": 0.8820081166391102, "flos": 11733523741440.0, "grad_norm": 1.782103916395947, "language_loss": 0.71429402, "learning_rate": 1.442042848491043e-07, "loss": 0.79110253, "num_input_tokens_seen": 316260935, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09454346, "step": 14670, "time_per_iteration": 3.9455678462982178 }, { "auxiliary_loss_clip": 0.06411134, "auxiliary_loss_mlp": 0.01269578, "balance_loss_clip": 0.0627239, "balance_loss_mlp": 0.0125966, "epoch": 0.8820682398917782, "flos": 27497745089280.0, "grad_norm": 2.0648225138220804, "language_loss": 0.73916721, "learning_rate": 1.44059115283929e-07, "loss": 0.81597435, "num_input_tokens_seen": 316281190, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09924316, "step": 14671, "time_per_iteration": 2.574728488922119 }, { "auxiliary_loss_clip": 0.06418122, "auxiliary_loss_mlp": 0.01270308, "balance_loss_clip": 0.06275923, "balance_loss_mlp": 0.01259937, "epoch": 0.8821283631444461, "flos": 16879587045120.0, "grad_norm": 2.0247046096751222, "language_loss": 0.85078418, "learning_rate": 1.43914016096218e-07, "loss": 0.92766845, "num_input_tokens_seen": 316297115, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.1036377, "step": 14672, "time_per_iteration": 2.545781373977661 }, { "auxiliary_loss_clip": 0.06409148, "auxiliary_loss_mlp": 0.01269055, "balance_loss_clip": 0.06274033, "balance_loss_mlp": 0.01259614, "epoch": 0.8821884863971141, "flos": 24288024188160.0, "grad_norm": 1.49085002732392, "language_loss": 0.72780091, "learning_rate": 1.4376898729147336e-07, "loss": 0.80458295, "num_input_tokens_seen": 316318235, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.09442139, "step": 14673, "time_per_iteration": 2.5498692989349365 }, { "auxiliary_loss_clip": 0.06310246, "auxiliary_loss_mlp": 0.01250659, "balance_loss_clip": 0.06255348, "balance_loss_mlp": 0.01249653, "epoch": 0.882248609649782, "flos": 59453990876160.0, "grad_norm": 0.8249980890058307, "language_loss": 0.4923774, "learning_rate": 1.4362402887519487e-07, "loss": 0.56798643, "num_input_tokens_seen": 316384705, "router_z_loss_clip": 0.55029297, "router_z_loss_mlp": 0.01005554, "step": 14674, "time_per_iteration": 3.2630367279052734 }, { "auxiliary_loss_clip": 0.06415442, "auxiliary_loss_mlp": 0.01265009, "balance_loss_clip": 0.06275213, "balance_loss_mlp": 0.01255336, "epoch": 0.88230873290245, "flos": 19943887985280.0, "grad_norm": 1.8522634516009968, "language_loss": 0.7712931, "learning_rate": 1.4347914085287971e-07, "loss": 0.84809756, "num_input_tokens_seen": 316401165, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09674072, "step": 14675, "time_per_iteration": 2.555616617202759 }, { "auxiliary_loss_clip": 0.06409872, "auxiliary_loss_mlp": 0.01264419, "balance_loss_clip": 0.0627443, "balance_loss_mlp": 0.01255013, "epoch": 0.882368856155118, "flos": 16368374085120.0, "grad_norm": 1.8230287641762375, "language_loss": 0.79465169, "learning_rate": 1.4333432323002105e-07, "loss": 0.87139463, "num_input_tokens_seen": 316418780, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.09405518, "step": 14676, "time_per_iteration": 4.018355131149292 }, { "auxiliary_loss_clip": 0.06316634, "auxiliary_loss_mlp": 0.01251714, "balance_loss_clip": 0.06261574, "balance_loss_mlp": 0.01250837, "epoch": 0.882428979407786, "flos": 70617672927360.0, "grad_norm": 0.8129881823511576, "language_loss": 0.54570615, "learning_rate": 1.431895760121109e-07, "loss": 0.62138963, "num_input_tokens_seen": 316482030, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.00878143, "step": 14677, "time_per_iteration": 3.2577927112579346 }, { "auxiliary_loss_clip": 0.06414559, "auxiliary_loss_mlp": 0.01264058, "balance_loss_clip": 0.06274578, "balance_loss_mlp": 0.01253961, "epoch": 0.8824891026604539, "flos": 18156151998720.0, "grad_norm": 2.180072252017404, "language_loss": 0.65106082, "learning_rate": 1.4304489920463847e-07, "loss": 0.72784698, "num_input_tokens_seen": 316499175, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.10101318, "step": 14678, "time_per_iteration": 2.5123143196105957 }, { "auxiliary_loss_clip": 0.0641855, "auxiliary_loss_mlp": 0.01267955, "balance_loss_clip": 0.06277207, "balance_loss_mlp": 0.01257882, "epoch": 0.8825492259131219, "flos": 27239664913920.0, "grad_norm": 1.8758630982803912, "language_loss": 0.71205068, "learning_rate": 1.4290029281308936e-07, "loss": 0.78891575, "num_input_tokens_seen": 316519495, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.10089111, "step": 14679, "time_per_iteration": 2.610584020614624 }, { "auxiliary_loss_clip": 0.06411532, "auxiliary_loss_mlp": 0.01265584, "balance_loss_clip": 0.06273577, "balance_loss_mlp": 0.01257323, "epoch": 0.8826093491657898, "flos": 22281172974720.0, "grad_norm": 1.8105851096539267, "language_loss": 0.64134187, "learning_rate": 1.4275575684294694e-07, "loss": 0.71811295, "num_input_tokens_seen": 316538180, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.08258057, "step": 14680, "time_per_iteration": 2.5442137718200684 }, { "auxiliary_loss_clip": 0.06407358, "auxiliary_loss_mlp": 0.01265379, "balance_loss_clip": 0.06271579, "balance_loss_mlp": 0.01256134, "epoch": 0.8826694724184578, "flos": 14209101095040.0, "grad_norm": 2.227974949921465, "language_loss": 0.77477223, "learning_rate": 1.4261129129969328e-07, "loss": 0.85149956, "num_input_tokens_seen": 316551750, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.0925293, "step": 14681, "time_per_iteration": 2.4999265670776367 }, { "auxiliary_loss_clip": 0.06416562, "auxiliary_loss_mlp": 0.01263018, "balance_loss_clip": 0.0627487, "balance_loss_mlp": 0.01253201, "epoch": 0.8827295956711259, "flos": 20638018408320.0, "grad_norm": 1.7151620723128354, "language_loss": 0.73095322, "learning_rate": 1.424668961888047e-07, "loss": 0.80774903, "num_input_tokens_seen": 316570680, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09820557, "step": 14682, "time_per_iteration": 2.52921724319458 }, { "auxiliary_loss_clip": 0.06420642, "auxiliary_loss_mlp": 0.01270531, "balance_loss_clip": 0.06277558, "balance_loss_mlp": 0.01259969, "epoch": 0.8827897189237938, "flos": 18518632761600.0, "grad_norm": 2.794213115440613, "language_loss": 0.7527371, "learning_rate": 1.4232257151575765e-07, "loss": 0.82964885, "num_input_tokens_seen": 316588635, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.10552979, "step": 14683, "time_per_iteration": 2.541478395462036 }, { "auxiliary_loss_clip": 0.06416416, "auxiliary_loss_mlp": 0.01264075, "balance_loss_clip": 0.06276256, "balance_loss_mlp": 0.01254025, "epoch": 0.8828498421764618, "flos": 22754007964800.0, "grad_norm": 1.552390413002403, "language_loss": 0.66012394, "learning_rate": 1.4217831728602492e-07, "loss": 0.73692882, "num_input_tokens_seen": 316607550, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.1005249, "step": 14684, "time_per_iteration": 4.096042633056641 }, { "auxiliary_loss_clip": 0.0641014, "auxiliary_loss_mlp": 0.01263944, "balance_loss_clip": 0.06272212, "balance_loss_mlp": 0.01253692, "epoch": 0.8829099654291297, "flos": 15017694595200.0, "grad_norm": 1.7166972345405827, "language_loss": 0.69746244, "learning_rate": 1.4203413350507677e-07, "loss": 0.7742033, "num_input_tokens_seen": 316624460, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.10247803, "step": 14685, "time_per_iteration": 2.5138559341430664 }, { "auxiliary_loss_clip": 0.06418465, "auxiliary_loss_mlp": 0.01264658, "balance_loss_clip": 0.06276193, "balance_loss_mlp": 0.01254805, "epoch": 0.8829700886817977, "flos": 16725026989440.0, "grad_norm": 1.780587263740956, "language_loss": 0.74642664, "learning_rate": 1.418900201783806e-07, "loss": 0.82325792, "num_input_tokens_seen": 316640765, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09851074, "step": 14686, "time_per_iteration": 2.556892156600952 }, { "auxiliary_loss_clip": 0.06406619, "auxiliary_loss_mlp": 0.01265438, "balance_loss_clip": 0.06270112, "balance_loss_mlp": 0.01255884, "epoch": 0.8830302119344656, "flos": 15267850560000.0, "grad_norm": 2.185040887737978, "language_loss": 0.63649392, "learning_rate": 1.417459773114007e-07, "loss": 0.71321452, "num_input_tokens_seen": 316656120, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.09552002, "step": 14687, "time_per_iteration": 2.487947463989258 }, { "auxiliary_loss_clip": 0.06417464, "auxiliary_loss_mlp": 0.01264821, "balance_loss_clip": 0.06276411, "balance_loss_mlp": 0.01254599, "epoch": 0.8830903351871336, "flos": 28624697377920.0, "grad_norm": 1.9080903534661549, "language_loss": 0.68993491, "learning_rate": 1.4160200490959984e-07, "loss": 0.76675773, "num_input_tokens_seen": 316676095, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.10217285, "step": 14688, "time_per_iteration": 2.6072275638580322 }, { "auxiliary_loss_clip": 0.06408505, "auxiliary_loss_mlp": 0.012681, "balance_loss_clip": 0.0627415, "balance_loss_mlp": 0.0125926, "epoch": 0.8831504584398016, "flos": 28009167684480.0, "grad_norm": 1.6100841872341995, "language_loss": 0.66627687, "learning_rate": 1.4145810297843697e-07, "loss": 0.74304295, "num_input_tokens_seen": 316696235, "router_z_loss_clip": 1.34472656, "router_z_loss_mlp": 0.08837891, "step": 14689, "time_per_iteration": 2.662757158279419 }, { "auxiliary_loss_clip": 0.06411972, "auxiliary_loss_mlp": 0.01265989, "balance_loss_clip": 0.0627498, "balance_loss_mlp": 0.01257245, "epoch": 0.8832105816924696, "flos": 26587098915840.0, "grad_norm": 1.8294782938621874, "language_loss": 0.74713707, "learning_rate": 1.4131427152336905e-07, "loss": 0.82391673, "num_input_tokens_seen": 316719680, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.08746338, "step": 14690, "time_per_iteration": 2.7075533866882324 }, { "auxiliary_loss_clip": 0.06413655, "auxiliary_loss_mlp": 0.01265719, "balance_loss_clip": 0.06274667, "balance_loss_mlp": 0.01255312, "epoch": 0.8832707049451375, "flos": 24905524452480.0, "grad_norm": 1.453147471126695, "language_loss": 0.73402727, "learning_rate": 1.4117051054985018e-07, "loss": 0.81082106, "num_input_tokens_seen": 316739830, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.10400391, "step": 14691, "time_per_iteration": 3.956364631652832 }, { "auxiliary_loss_clip": 0.06419601, "auxiliary_loss_mlp": 0.01262978, "balance_loss_clip": 0.06276493, "balance_loss_mlp": 0.01253286, "epoch": 0.8833308281978055, "flos": 15456679735680.0, "grad_norm": 1.7117061117302932, "language_loss": 0.52257597, "learning_rate": 1.4102682006333243e-07, "loss": 0.59940177, "num_input_tokens_seen": 316758105, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.09680176, "step": 14692, "time_per_iteration": 2.513564348220825 }, { "auxiliary_loss_clip": 0.06416897, "auxiliary_loss_mlp": 0.0126475, "balance_loss_clip": 0.06276424, "balance_loss_mlp": 0.01254817, "epoch": 0.8833909514504734, "flos": 20307500778240.0, "grad_norm": 4.93171892947284, "language_loss": 0.60982746, "learning_rate": 1.4088320006926346e-07, "loss": 0.68664396, "num_input_tokens_seen": 316777455, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09939575, "step": 14693, "time_per_iteration": 2.5528836250305176 }, { "auxiliary_loss_clip": 0.06407952, "auxiliary_loss_mlp": 0.0126224, "balance_loss_clip": 0.06273907, "balance_loss_mlp": 0.01253412, "epoch": 0.8834510747031414, "flos": 20379938232960.0, "grad_norm": 1.399732049687352, "language_loss": 0.75669324, "learning_rate": 1.407396505730898e-07, "loss": 0.83339518, "num_input_tokens_seen": 316796300, "router_z_loss_clip": 1.34179688, "router_z_loss_mlp": 0.08831787, "step": 14694, "time_per_iteration": 2.520575523376465 }, { "auxiliary_loss_clip": 0.06414376, "auxiliary_loss_mlp": 0.01265252, "balance_loss_clip": 0.06271623, "balance_loss_mlp": 0.01256311, "epoch": 0.8835111979558095, "flos": 29759699658240.0, "grad_norm": 1.6322118299142736, "language_loss": 0.72897232, "learning_rate": 1.4059617158025527e-07, "loss": 0.80576867, "num_input_tokens_seen": 316819090, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.08935547, "step": 14695, "time_per_iteration": 2.642979860305786 }, { "auxiliary_loss_clip": 0.0640509, "auxiliary_loss_mlp": 0.01263441, "balance_loss_clip": 0.06270976, "balance_loss_mlp": 0.01254417, "epoch": 0.8835713212084774, "flos": 24141514124160.0, "grad_norm": 1.4786903489519887, "language_loss": 0.79903424, "learning_rate": 1.404527630961998e-07, "loss": 0.87571955, "num_input_tokens_seen": 316839250, "router_z_loss_clip": 1.34082031, "router_z_loss_mlp": 0.090271, "step": 14696, "time_per_iteration": 2.5582780838012695 }, { "auxiliary_loss_clip": 0.06416654, "auxiliary_loss_mlp": 0.01264821, "balance_loss_clip": 0.06276722, "balance_loss_mlp": 0.01255254, "epoch": 0.8836314444611454, "flos": 27679656303360.0, "grad_norm": 5.330628849001974, "language_loss": 0.74992669, "learning_rate": 1.4030942512636236e-07, "loss": 0.82674134, "num_input_tokens_seen": 316861315, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09564209, "step": 14697, "time_per_iteration": 2.6327152252197266 }, { "auxiliary_loss_clip": 0.06411798, "auxiliary_loss_mlp": 0.01265446, "balance_loss_clip": 0.06274556, "balance_loss_mlp": 0.01256136, "epoch": 0.8836915677138133, "flos": 16842634594560.0, "grad_norm": 2.2819461866877173, "language_loss": 0.71882582, "learning_rate": 1.401661576761779e-07, "loss": 0.79559821, "num_input_tokens_seen": 316879325, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09313965, "step": 14698, "time_per_iteration": 2.5236494541168213 }, { "auxiliary_loss_clip": 0.06311627, "auxiliary_loss_mlp": 0.01250259, "balance_loss_clip": 0.06256281, "balance_loss_mlp": 0.01249291, "epoch": 0.8837516909664813, "flos": 69332261368320.0, "grad_norm": 0.774832555873091, "language_loss": 0.53612036, "learning_rate": 1.4002296075107856e-07, "loss": 0.61173916, "num_input_tokens_seen": 316936425, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00967407, "step": 14699, "time_per_iteration": 3.1587159633636475 }, { "auxiliary_loss_clip": 0.06414512, "auxiliary_loss_mlp": 0.01264809, "balance_loss_clip": 0.06270146, "balance_loss_mlp": 0.01254909, "epoch": 0.8838118142191492, "flos": 21331142582400.0, "grad_norm": 1.6034007736990241, "language_loss": 0.76968503, "learning_rate": 1.3987983435649508e-07, "loss": 0.84647822, "num_input_tokens_seen": 316956360, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.09899902, "step": 14700, "time_per_iteration": 2.546584129333496 }, { "auxiliary_loss_clip": 0.06410955, "auxiliary_loss_mlp": 0.01261069, "balance_loss_clip": 0.06274562, "balance_loss_mlp": 0.01252128, "epoch": 0.8838719374718172, "flos": 21476981813760.0, "grad_norm": 1.7819430828277025, "language_loss": 0.73546612, "learning_rate": 1.3973677849785494e-07, "loss": 0.81218636, "num_input_tokens_seen": 316975295, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.08947754, "step": 14701, "time_per_iteration": 2.5466275215148926 }, { "auxiliary_loss_clip": 0.06419738, "auxiliary_loss_mlp": 0.01263455, "balance_loss_clip": 0.06274962, "balance_loss_mlp": 0.01253161, "epoch": 0.8839320607244852, "flos": 26476157710080.0, "grad_norm": 1.7219776328459957, "language_loss": 0.71798348, "learning_rate": 1.3959379318058262e-07, "loss": 0.79481542, "num_input_tokens_seen": 316994520, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.10284424, "step": 14702, "time_per_iteration": 2.5682950019836426 }, { "auxiliary_loss_clip": 0.06423236, "auxiliary_loss_mlp": 0.01266674, "balance_loss_clip": 0.06282235, "balance_loss_mlp": 0.01257089, "epoch": 0.8839921839771532, "flos": 45232577959680.0, "grad_norm": 1.6103767223037289, "language_loss": 0.71763611, "learning_rate": 1.3945087841010006e-07, "loss": 0.79453522, "num_input_tokens_seen": 317018095, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09588623, "step": 14703, "time_per_iteration": 2.749760866165161 }, { "auxiliary_loss_clip": 0.06406386, "auxiliary_loss_mlp": 0.01266062, "balance_loss_clip": 0.06270938, "balance_loss_mlp": 0.01256716, "epoch": 0.8840523072298211, "flos": 20012342371200.0, "grad_norm": 1.7905971956947535, "language_loss": 0.6698277, "learning_rate": 1.3930803419182645e-07, "loss": 0.74655217, "num_input_tokens_seen": 317035755, "router_z_loss_clip": 1.35449219, "router_z_loss_mlp": 0.09344482, "step": 14704, "time_per_iteration": 2.528175115585327 }, { "auxiliary_loss_clip": 0.06407326, "auxiliary_loss_mlp": 0.01262176, "balance_loss_clip": 0.0627353, "balance_loss_mlp": 0.01253706, "epoch": 0.8841124304824891, "flos": 24432941024640.0, "grad_norm": 1.6129349013415448, "language_loss": 0.70882475, "learning_rate": 1.3916526053117905e-07, "loss": 0.78551984, "num_input_tokens_seen": 317055765, "router_z_loss_clip": 1.33886719, "router_z_loss_mlp": 0.08462524, "step": 14705, "time_per_iteration": 2.5890252590179443 }, { "auxiliary_loss_clip": 0.06409542, "auxiliary_loss_mlp": 0.01267041, "balance_loss_clip": 0.06272553, "balance_loss_mlp": 0.0125825, "epoch": 0.884172553735157, "flos": 31292583851520.0, "grad_norm": 1.4118141850859343, "language_loss": 0.71202308, "learning_rate": 1.3902255743357104e-07, "loss": 0.78878891, "num_input_tokens_seen": 317077955, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.08795166, "step": 14706, "time_per_iteration": 2.6262125968933105 }, { "auxiliary_loss_clip": 0.06410414, "auxiliary_loss_mlp": 0.01266566, "balance_loss_clip": 0.06271562, "balance_loss_mlp": 0.01258091, "epoch": 0.884232676987825, "flos": 21396494367360.0, "grad_norm": 1.951932430323384, "language_loss": 0.74868846, "learning_rate": 1.3887992490441413e-07, "loss": 0.82545829, "num_input_tokens_seen": 317095825, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.08477783, "step": 14707, "time_per_iteration": 2.5393810272216797 }, { "auxiliary_loss_clip": 0.06312877, "auxiliary_loss_mlp": 0.01250849, "balance_loss_clip": 0.06257927, "balance_loss_mlp": 0.01249846, "epoch": 0.8842928002404931, "flos": 57928668278400.0, "grad_norm": 0.8592519890275302, "language_loss": 0.60352468, "learning_rate": 1.387373629491173e-07, "loss": 0.67916197, "num_input_tokens_seen": 317152875, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.01002502, "step": 14708, "time_per_iteration": 3.0088090896606445 }, { "auxiliary_loss_clip": 0.06403784, "auxiliary_loss_mlp": 0.01264234, "balance_loss_clip": 0.0627173, "balance_loss_mlp": 0.01256021, "epoch": 0.884352923493161, "flos": 41473517690880.0, "grad_norm": 1.5669377100626025, "language_loss": 0.67825043, "learning_rate": 1.3859487157308625e-07, "loss": 0.75493068, "num_input_tokens_seen": 317176725, "router_z_loss_clip": 1.3203125, "router_z_loss_mlp": 0.08215332, "step": 14709, "time_per_iteration": 4.334847450256348 }, { "auxiliary_loss_clip": 0.06419801, "auxiliary_loss_mlp": 0.01266392, "balance_loss_clip": 0.06274886, "balance_loss_mlp": 0.01255919, "epoch": 0.884413046745829, "flos": 46552677909120.0, "grad_norm": 1.7151802490966608, "language_loss": 0.62599051, "learning_rate": 1.3845245078172373e-07, "loss": 0.70285249, "num_input_tokens_seen": 317206880, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10461426, "step": 14710, "time_per_iteration": 2.7944467067718506 }, { "auxiliary_loss_clip": 0.06410463, "auxiliary_loss_mlp": 0.01264933, "balance_loss_clip": 0.06275659, "balance_loss_mlp": 0.01256124, "epoch": 0.8844731699984969, "flos": 19141331978880.0, "grad_norm": 2.4282694383493197, "language_loss": 0.64081037, "learning_rate": 1.38310100580431e-07, "loss": 0.71756434, "num_input_tokens_seen": 317224135, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.0880127, "step": 14711, "time_per_iteration": 2.5173730850219727 }, { "auxiliary_loss_clip": 0.06418705, "auxiliary_loss_mlp": 0.01263829, "balance_loss_clip": 0.06274661, "balance_loss_mlp": 0.01254101, "epoch": 0.8845332932511649, "flos": 23267736547200.0, "grad_norm": 1.9815181658554677, "language_loss": 0.76029688, "learning_rate": 1.38167820974606e-07, "loss": 0.8371222, "num_input_tokens_seen": 317244505, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.09729004, "step": 14712, "time_per_iteration": 2.55340838432312 }, { "auxiliary_loss_clip": 0.0641616, "auxiliary_loss_mlp": 0.01265682, "balance_loss_clip": 0.06275748, "balance_loss_mlp": 0.01256616, "epoch": 0.8845934165038328, "flos": 17570027888640.0, "grad_norm": 2.1397672255985123, "language_loss": 0.81605661, "learning_rate": 1.3802561196964368e-07, "loss": 0.89287508, "num_input_tokens_seen": 317257830, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09063721, "step": 14713, "time_per_iteration": 2.4873721599578857 }, { "auxiliary_loss_clip": 0.06410829, "auxiliary_loss_mlp": 0.01261625, "balance_loss_clip": 0.06273851, "balance_loss_mlp": 0.01251862, "epoch": 0.8846535397565009, "flos": 27492336501120.0, "grad_norm": 1.4124731950912668, "language_loss": 0.55806363, "learning_rate": 1.3788347357093688e-07, "loss": 0.63478816, "num_input_tokens_seen": 317278430, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.09759521, "step": 14714, "time_per_iteration": 2.5626142024993896 }, { "auxiliary_loss_clip": 0.06411861, "auxiliary_loss_mlp": 0.01266132, "balance_loss_clip": 0.06272563, "balance_loss_mlp": 0.01256685, "epoch": 0.8847136630091688, "flos": 28768020986880.0, "grad_norm": 1.6291736114289248, "language_loss": 0.73653811, "learning_rate": 1.377414057838755e-07, "loss": 0.81331801, "num_input_tokens_seen": 317295970, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09448242, "step": 14715, "time_per_iteration": 4.02855110168457 }, { "auxiliary_loss_clip": 0.06416672, "auxiliary_loss_mlp": 0.01265612, "balance_loss_clip": 0.06276174, "balance_loss_mlp": 0.0125604, "epoch": 0.8847737862618368, "flos": 23483623392000.0, "grad_norm": 1.7132689657585582, "language_loss": 0.7539739, "learning_rate": 1.375994086138461e-07, "loss": 0.83079672, "num_input_tokens_seen": 317316185, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09570312, "step": 14716, "time_per_iteration": 2.5579099655151367 }, { "auxiliary_loss_clip": 0.06410572, "auxiliary_loss_mlp": 0.012618, "balance_loss_clip": 0.06273259, "balance_loss_mlp": 0.01252591, "epoch": 0.8848339095145047, "flos": 18666777980160.0, "grad_norm": 2.0551802819512983, "language_loss": 0.71428311, "learning_rate": 1.3745748206623397e-07, "loss": 0.7910068, "num_input_tokens_seen": 317333275, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09204102, "step": 14717, "time_per_iteration": 2.540172576904297 }, { "auxiliary_loss_clip": 0.06404924, "auxiliary_loss_mlp": 0.01263316, "balance_loss_clip": 0.06271958, "balance_loss_mlp": 0.01254578, "epoch": 0.8848940327671727, "flos": 32278518518400.0, "grad_norm": 2.7718026477272186, "language_loss": 0.74304342, "learning_rate": 1.373156261464208e-07, "loss": 0.81972581, "num_input_tokens_seen": 317351245, "router_z_loss_clip": 1.32910156, "router_z_loss_mlp": 0.08740234, "step": 14718, "time_per_iteration": 2.6241023540496826 }, { "auxiliary_loss_clip": 0.06414666, "auxiliary_loss_mlp": 0.0126558, "balance_loss_clip": 0.06272382, "balance_loss_mlp": 0.01255745, "epoch": 0.8849541560198406, "flos": 24028225004160.0, "grad_norm": 1.467838552862924, "language_loss": 0.78596234, "learning_rate": 1.3717384085978602e-07, "loss": 0.86276484, "num_input_tokens_seen": 317370740, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.0982666, "step": 14719, "time_per_iteration": 2.576401948928833 }, { "auxiliary_loss_clip": 0.0641517, "auxiliary_loss_mlp": 0.0126757, "balance_loss_clip": 0.06275876, "balance_loss_mlp": 0.01258951, "epoch": 0.8850142792725086, "flos": 16878664650240.0, "grad_norm": 1.8074146810675005, "language_loss": 0.72227979, "learning_rate": 1.3703212621170579e-07, "loss": 0.79910725, "num_input_tokens_seen": 317388370, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.08624268, "step": 14720, "time_per_iteration": 2.513181209564209 }, { "auxiliary_loss_clip": 0.06418508, "auxiliary_loss_mlp": 0.01264553, "balance_loss_clip": 0.06274225, "balance_loss_mlp": 0.01254772, "epoch": 0.8850744025251767, "flos": 24030824480640.0, "grad_norm": 59.08691141136634, "language_loss": 0.82391191, "learning_rate": 1.3689048220755383e-07, "loss": 0.90074253, "num_input_tokens_seen": 317407390, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.09771729, "step": 14721, "time_per_iteration": 2.5741844177246094 }, { "auxiliary_loss_clip": 0.06415391, "auxiliary_loss_mlp": 0.01265199, "balance_loss_clip": 0.06274766, "balance_loss_mlp": 0.01255591, "epoch": 0.8851345257778446, "flos": 47965816218240.0, "grad_norm": 1.907981385482167, "language_loss": 0.62582725, "learning_rate": 1.3674890885270186e-07, "loss": 0.70263314, "num_input_tokens_seen": 317430825, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09606934, "step": 14722, "time_per_iteration": 2.7764065265655518 }, { "auxiliary_loss_clip": 0.06412928, "auxiliary_loss_mlp": 0.01266118, "balance_loss_clip": 0.06271105, "balance_loss_mlp": 0.01256087, "epoch": 0.8851946490305126, "flos": 36619761755520.0, "grad_norm": 1.8308296592671327, "language_loss": 0.68990129, "learning_rate": 1.3660740615251754e-07, "loss": 0.76669174, "num_input_tokens_seen": 317451905, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.10028076, "step": 14723, "time_per_iteration": 2.659937620162964 }, { "auxiliary_loss_clip": 0.0641254, "auxiliary_loss_mlp": 0.0126383, "balance_loss_clip": 0.06273383, "balance_loss_mlp": 0.01254836, "epoch": 0.8852547722831805, "flos": 21550802860800.0, "grad_norm": 1.9179897956671317, "language_loss": 0.78001237, "learning_rate": 1.3646597411236703e-07, "loss": 0.85677606, "num_input_tokens_seen": 317470030, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09002686, "step": 14724, "time_per_iteration": 3.975165843963623 }, { "auxiliary_loss_clip": 0.06313408, "auxiliary_loss_mlp": 0.01250542, "balance_loss_clip": 0.0625834, "balance_loss_mlp": 0.01249636, "epoch": 0.8853148955358485, "flos": 63077876110080.0, "grad_norm": 0.7806076719745162, "language_loss": 0.58847553, "learning_rate": 1.363246127376143e-07, "loss": 0.66411495, "num_input_tokens_seen": 317527460, "router_z_loss_clip": 0.55224609, "router_z_loss_mlp": 0.00904846, "step": 14725, "time_per_iteration": 3.0282340049743652 }, { "auxiliary_loss_clip": 0.06423433, "auxiliary_loss_mlp": 0.0126904, "balance_loss_clip": 0.06276739, "balance_loss_mlp": 0.01259158, "epoch": 0.8853750187885164, "flos": 18155606947200.0, "grad_norm": 1.9288163775841096, "language_loss": 0.69058615, "learning_rate": 1.3618332203361837e-07, "loss": 0.76751089, "num_input_tokens_seen": 317544070, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.09887695, "step": 14726, "time_per_iteration": 2.515890121459961 }, { "auxiliary_loss_clip": 0.06412515, "auxiliary_loss_mlp": 0.01268055, "balance_loss_clip": 0.06274414, "balance_loss_mlp": 0.01258799, "epoch": 0.8854351420411845, "flos": 39580500648960.0, "grad_norm": 1.2760694537330572, "language_loss": 0.69575286, "learning_rate": 1.3604210200573785e-07, "loss": 0.77255857, "num_input_tokens_seen": 317570275, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.0925293, "step": 14727, "time_per_iteration": 2.708299160003662 }, { "auxiliary_loss_clip": 0.06418364, "auxiliary_loss_mlp": 0.01266313, "balance_loss_clip": 0.06278573, "balance_loss_mlp": 0.01256883, "epoch": 0.8854952652938524, "flos": 23776140395520.0, "grad_norm": 1.5973371769638889, "language_loss": 0.69932258, "learning_rate": 1.3590095265932733e-07, "loss": 0.7761693, "num_input_tokens_seen": 317590160, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09429932, "step": 14728, "time_per_iteration": 2.558600902557373 }, { "auxiliary_loss_clip": 0.06417041, "auxiliary_loss_mlp": 0.0126458, "balance_loss_clip": 0.06274836, "balance_loss_mlp": 0.01254739, "epoch": 0.8855553885465204, "flos": 18295199049600.0, "grad_norm": 2.1005118360748884, "language_loss": 0.66260719, "learning_rate": 1.3575987399973987e-07, "loss": 0.73942345, "num_input_tokens_seen": 317608340, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09838867, "step": 14729, "time_per_iteration": 2.5182275772094727 }, { "auxiliary_loss_clip": 0.06411565, "auxiliary_loss_mlp": 0.01260294, "balance_loss_clip": 0.06274346, "balance_loss_mlp": 0.01251944, "epoch": 0.8856155117991883, "flos": 36876374484480.0, "grad_norm": 1.4572595055610689, "language_loss": 0.63371402, "learning_rate": 1.3561886603232453e-07, "loss": 0.71043259, "num_input_tokens_seen": 317629910, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.08349609, "step": 14730, "time_per_iteration": 2.692777395248413 }, { "auxiliary_loss_clip": 0.06408219, "auxiliary_loss_mlp": 0.01263605, "balance_loss_clip": 0.06272987, "balance_loss_mlp": 0.01254498, "epoch": 0.8856756350518563, "flos": 22170441404160.0, "grad_norm": 1.6229258092484207, "language_loss": 0.79344821, "learning_rate": 1.3547792876242904e-07, "loss": 0.87016642, "num_input_tokens_seen": 317650265, "router_z_loss_clip": 1.35253906, "router_z_loss_mlp": 0.09106445, "step": 14731, "time_per_iteration": 3.9847233295440674 }, { "auxiliary_loss_clip": 0.06413625, "auxiliary_loss_mlp": 0.01268502, "balance_loss_clip": 0.06273924, "balance_loss_mlp": 0.01259323, "epoch": 0.8857357583045242, "flos": 20747282532480.0, "grad_norm": 1.893214280809702, "language_loss": 0.83428758, "learning_rate": 1.3533706219539708e-07, "loss": 0.91110885, "num_input_tokens_seen": 317669045, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09173584, "step": 14732, "time_per_iteration": 2.533308744430542 }, { "auxiliary_loss_clip": 0.0631023, "auxiliary_loss_mlp": 0.01250501, "balance_loss_clip": 0.06254989, "balance_loss_mlp": 0.01249584, "epoch": 0.8857958815571922, "flos": 69913815431040.0, "grad_norm": 0.8747784357943916, "language_loss": 0.59726566, "learning_rate": 1.3519626633657045e-07, "loss": 0.67287302, "num_input_tokens_seen": 317728065, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00914764, "step": 14733, "time_per_iteration": 3.1397476196289062 }, { "auxiliary_loss_clip": 0.06414171, "auxiliary_loss_mlp": 0.01263367, "balance_loss_clip": 0.06274826, "balance_loss_mlp": 0.01253729, "epoch": 0.8858560048098603, "flos": 15127294135680.0, "grad_norm": 1.683905444235242, "language_loss": 0.66339362, "learning_rate": 1.3505554119128838e-07, "loss": 0.74016899, "num_input_tokens_seen": 317746120, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09637451, "step": 14734, "time_per_iteration": 2.5667428970336914 }, { "auxiliary_loss_clip": 0.06410375, "auxiliary_loss_mlp": 0.01265845, "balance_loss_clip": 0.06275027, "balance_loss_mlp": 0.01256648, "epoch": 0.8859161280625282, "flos": 16615469376000.0, "grad_norm": 1.9414105719663133, "language_loss": 0.75493103, "learning_rate": 1.3491488676488682e-07, "loss": 0.83169329, "num_input_tokens_seen": 317762280, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.09204102, "step": 14735, "time_per_iteration": 2.511503219604492 }, { "auxiliary_loss_clip": 0.06418672, "auxiliary_loss_mlp": 0.01262294, "balance_loss_clip": 0.06276903, "balance_loss_mlp": 0.01252745, "epoch": 0.8859762513151962, "flos": 18699915070080.0, "grad_norm": 2.13985733066428, "language_loss": 0.70561564, "learning_rate": 1.3477430306270066e-07, "loss": 0.78242528, "num_input_tokens_seen": 317780615, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09545898, "step": 14736, "time_per_iteration": 2.5314927101135254 }, { "auxiliary_loss_clip": 0.06413313, "auxiliary_loss_mlp": 0.01264153, "balance_loss_clip": 0.06274092, "balance_loss_mlp": 0.01254896, "epoch": 0.8860363745678641, "flos": 19542987325440.0, "grad_norm": 1.824963549054055, "language_loss": 0.8479827, "learning_rate": 1.3463379009005892e-07, "loss": 0.92475736, "num_input_tokens_seen": 317798830, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09259033, "step": 14737, "time_per_iteration": 2.529195785522461 }, { "auxiliary_loss_clip": 0.06424178, "auxiliary_loss_mlp": 0.0126513, "balance_loss_clip": 0.06276239, "balance_loss_mlp": 0.01254872, "epoch": 0.8860964978205321, "flos": 35963673886080.0, "grad_norm": 2.2089547857703873, "language_loss": 0.68758893, "learning_rate": 1.3449334785229093e-07, "loss": 0.76448202, "num_input_tokens_seen": 317819235, "router_z_loss_clip": 1.47949219, "router_z_loss_mlp": 0.1026001, "step": 14738, "time_per_iteration": 2.654308795928955 }, { "auxiliary_loss_clip": 0.06419368, "auxiliary_loss_mlp": 0.0126628, "balance_loss_clip": 0.06273955, "balance_loss_mlp": 0.0125601, "epoch": 0.8861566210732, "flos": 21218524295040.0, "grad_norm": 1.650448977580678, "language_loss": 0.7560029, "learning_rate": 1.343529763547222e-07, "loss": 0.8328594, "num_input_tokens_seen": 317836785, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.10272217, "step": 14739, "time_per_iteration": 2.5416290760040283 }, { "auxiliary_loss_clip": 0.06410027, "auxiliary_loss_mlp": 0.01263838, "balance_loss_clip": 0.06272843, "balance_loss_mlp": 0.01255178, "epoch": 0.886216744325868, "flos": 14613984823680.0, "grad_norm": 1.7035302490606898, "language_loss": 0.87131321, "learning_rate": 1.3421267560267559e-07, "loss": 0.94805193, "num_input_tokens_seen": 317854225, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.08660889, "step": 14740, "time_per_iteration": 2.497887372970581 }, { "auxiliary_loss_clip": 0.06411294, "auxiliary_loss_mlp": 0.01262706, "balance_loss_clip": 0.0627379, "balance_loss_mlp": 0.01252985, "epoch": 0.886276867578536, "flos": 26658949392000.0, "grad_norm": 1.849335136049204, "language_loss": 0.63670862, "learning_rate": 1.34072445601471e-07, "loss": 0.71344858, "num_input_tokens_seen": 317874865, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09716797, "step": 14741, "time_per_iteration": 2.5613796710968018 }, { "auxiliary_loss_clip": 0.06414018, "auxiliary_loss_mlp": 0.01269844, "balance_loss_clip": 0.06275675, "balance_loss_mlp": 0.01260384, "epoch": 0.886336990831204, "flos": 16769735942400.0, "grad_norm": 1.6448130957121787, "language_loss": 0.73102164, "learning_rate": 1.3393228635642717e-07, "loss": 0.80786026, "num_input_tokens_seen": 317892830, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09454346, "step": 14742, "time_per_iteration": 2.5159714221954346 }, { "auxiliary_loss_clip": 0.0640894, "auxiliary_loss_mlp": 0.0126465, "balance_loss_clip": 0.06271097, "balance_loss_mlp": 0.01255131, "epoch": 0.8863971140838719, "flos": 25272365627520.0, "grad_norm": 2.1212395297669957, "language_loss": 0.60056722, "learning_rate": 1.3379219787285733e-07, "loss": 0.67730314, "num_input_tokens_seen": 317911780, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09515381, "step": 14743, "time_per_iteration": 2.5551066398620605 }, { "auxiliary_loss_clip": 0.06419355, "auxiliary_loss_mlp": 0.01268237, "balance_loss_clip": 0.06277326, "balance_loss_mlp": 0.01257669, "epoch": 0.8864572373365399, "flos": 23411060156160.0, "grad_norm": 1.659595468652208, "language_loss": 0.60378289, "learning_rate": 1.3365218015607437e-07, "loss": 0.68065888, "num_input_tokens_seen": 317932855, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10565186, "step": 14744, "time_per_iteration": 2.5697150230407715 }, { "auxiliary_loss_clip": 0.06411708, "auxiliary_loss_mlp": 0.01265096, "balance_loss_clip": 0.06272803, "balance_loss_mlp": 0.0125519, "epoch": 0.8865173605892078, "flos": 18554201619840.0, "grad_norm": 1.5789253710806153, "language_loss": 0.77067393, "learning_rate": 1.3351223321138762e-07, "loss": 0.84744197, "num_input_tokens_seen": 317952090, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09906006, "step": 14745, "time_per_iteration": 2.5429608821868896 }, { "auxiliary_loss_clip": 0.06412837, "auxiliary_loss_mlp": 0.01268465, "balance_loss_clip": 0.06274958, "balance_loss_mlp": 0.01258565, "epoch": 0.8865774838418758, "flos": 19031858219520.0, "grad_norm": 1.5965353500073634, "language_loss": 0.77355635, "learning_rate": 1.3337235704410454e-07, "loss": 0.85036945, "num_input_tokens_seen": 317970370, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09899902, "step": 14746, "time_per_iteration": 2.548360586166382 }, { "auxiliary_loss_clip": 0.06411759, "auxiliary_loss_mlp": 0.01265114, "balance_loss_clip": 0.06272417, "balance_loss_mlp": 0.01255446, "epoch": 0.8866376070945439, "flos": 22169602863360.0, "grad_norm": 1.958688984406745, "language_loss": 0.7701484, "learning_rate": 1.3323255165952873e-07, "loss": 0.84691709, "num_input_tokens_seen": 317989125, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09655762, "step": 14747, "time_per_iteration": 2.5761337280273438 }, { "auxiliary_loss_clip": 0.06407152, "auxiliary_loss_mlp": 0.01262637, "balance_loss_clip": 0.0627085, "balance_loss_mlp": 0.0125375, "epoch": 0.8866977303472118, "flos": 20710539717120.0, "grad_norm": 1.7016220537245326, "language_loss": 0.83062863, "learning_rate": 1.3309281706296127e-07, "loss": 0.90732658, "num_input_tokens_seen": 318007820, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.08880615, "step": 14748, "time_per_iteration": 2.5858941078186035 }, { "auxiliary_loss_clip": 0.06412379, "auxiliary_loss_mlp": 0.01269642, "balance_loss_clip": 0.06273142, "balance_loss_mlp": 0.01259998, "epoch": 0.8867578535998798, "flos": 48804779623680.0, "grad_norm": 1.9899313051863854, "language_loss": 0.77831078, "learning_rate": 1.3295315325970148e-07, "loss": 0.85513103, "num_input_tokens_seen": 318030435, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09643555, "step": 14749, "time_per_iteration": 4.2220845222473145 }, { "auxiliary_loss_clip": 0.06421717, "auxiliary_loss_mlp": 0.01266527, "balance_loss_clip": 0.06276724, "balance_loss_mlp": 0.01256388, "epoch": 0.8868179768525477, "flos": 21111608085120.0, "grad_norm": 1.7876224233562394, "language_loss": 0.70135069, "learning_rate": 1.328135602550451e-07, "loss": 0.77823317, "num_input_tokens_seen": 318049465, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.10137939, "step": 14750, "time_per_iteration": 2.5481836795806885 }, { "auxiliary_loss_clip": 0.06413087, "auxiliary_loss_mlp": 0.01262241, "balance_loss_clip": 0.06274839, "balance_loss_mlp": 0.01253461, "epoch": 0.8868781001052157, "flos": 21836653464960.0, "grad_norm": 1.777728325487424, "language_loss": 0.59726316, "learning_rate": 1.3267403805428546e-07, "loss": 0.67401642, "num_input_tokens_seen": 318067760, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.08770752, "step": 14751, "time_per_iteration": 2.525883197784424 }, { "auxiliary_loss_clip": 0.06410929, "auxiliary_loss_mlp": 0.01263775, "balance_loss_clip": 0.06272376, "balance_loss_mlp": 0.01253964, "epoch": 0.8869382233578836, "flos": 13521469363200.0, "grad_norm": 2.102577135006736, "language_loss": 0.81415826, "learning_rate": 1.3253458666271344e-07, "loss": 0.89090532, "num_input_tokens_seen": 318082785, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09814453, "step": 14752, "time_per_iteration": 2.498999834060669 }, { "auxiliary_loss_clip": 0.06418508, "auxiliary_loss_mlp": 0.01264879, "balance_loss_clip": 0.06273171, "balance_loss_mlp": 0.01254454, "epoch": 0.8869983466105517, "flos": 22710598750080.0, "grad_norm": 1.7970902469327277, "language_loss": 0.80624199, "learning_rate": 1.3239520608561793e-07, "loss": 0.88307583, "num_input_tokens_seen": 318101925, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10430908, "step": 14753, "time_per_iteration": 2.6088857650756836 }, { "auxiliary_loss_clip": 0.06412221, "auxiliary_loss_mlp": 0.01266172, "balance_loss_clip": 0.06274302, "balance_loss_mlp": 0.01256594, "epoch": 0.8870584698632196, "flos": 15346115873280.0, "grad_norm": 1.576971527616768, "language_loss": 0.65929627, "learning_rate": 1.3225589632828248e-07, "loss": 0.73608017, "num_input_tokens_seen": 318119945, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.0958252, "step": 14754, "time_per_iteration": 2.541983127593994 }, { "auxiliary_loss_clip": 0.06417294, "auxiliary_loss_mlp": 0.01268889, "balance_loss_clip": 0.06276575, "balance_loss_mlp": 0.01259376, "epoch": 0.8871185931158876, "flos": 26623003190400.0, "grad_norm": 1.707759532158611, "language_loss": 0.74900985, "learning_rate": 1.3211665739599065e-07, "loss": 0.82587165, "num_input_tokens_seen": 318139685, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09515381, "step": 14755, "time_per_iteration": 4.030487775802612 }, { "auxiliary_loss_clip": 0.06414639, "auxiliary_loss_mlp": 0.01272929, "balance_loss_clip": 0.06274781, "balance_loss_mlp": 0.01262844, "epoch": 0.8871787163685555, "flos": 21805528872960.0, "grad_norm": 1.438409932128789, "language_loss": 0.78375947, "learning_rate": 1.3197748929402262e-07, "loss": 0.86063516, "num_input_tokens_seen": 318160375, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.10095215, "step": 14756, "time_per_iteration": 2.623056650161743 }, { "auxiliary_loss_clip": 0.06413127, "auxiliary_loss_mlp": 0.01264226, "balance_loss_clip": 0.06272648, "balance_loss_mlp": 0.01254564, "epoch": 0.8872388396212235, "flos": 14908262762880.0, "grad_norm": 1.832711444815, "language_loss": 0.76906168, "learning_rate": 1.3183839202765535e-07, "loss": 0.84583521, "num_input_tokens_seen": 318177995, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09655762, "step": 14757, "time_per_iteration": 2.5442471504211426 }, { "auxiliary_loss_clip": 0.06409087, "auxiliary_loss_mlp": 0.01268021, "balance_loss_clip": 0.06273824, "balance_loss_mlp": 0.0125908, "epoch": 0.8872989628738914, "flos": 26439331040640.0, "grad_norm": 1.885293706512717, "language_loss": 0.68207842, "learning_rate": 1.316993656021632e-07, "loss": 0.7588495, "num_input_tokens_seen": 318197030, "router_z_loss_clip": 1.35449219, "router_z_loss_mlp": 0.0894165, "step": 14758, "time_per_iteration": 2.5550613403320312 }, { "auxiliary_loss_clip": 0.06413311, "auxiliary_loss_mlp": 0.01265839, "balance_loss_clip": 0.06273579, "balance_loss_mlp": 0.01255969, "epoch": 0.8873590861265594, "flos": 48153597217920.0, "grad_norm": 1.9279229762407415, "language_loss": 0.68971026, "learning_rate": 1.3156041002281915e-07, "loss": 0.76650178, "num_input_tokens_seen": 318221780, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09875488, "step": 14759, "time_per_iteration": 2.775501251220703 }, { "auxiliary_loss_clip": 0.06407914, "auxiliary_loss_mlp": 0.01265658, "balance_loss_clip": 0.06269689, "balance_loss_mlp": 0.012563, "epoch": 0.8874192093792275, "flos": 18338901753600.0, "grad_norm": 1.9008978934930438, "language_loss": 0.74824899, "learning_rate": 1.3142152529489092e-07, "loss": 0.82498473, "num_input_tokens_seen": 318239710, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09350586, "step": 14760, "time_per_iteration": 2.5130796432495117 }, { "auxiliary_loss_clip": 0.06417554, "auxiliary_loss_mlp": 0.01273484, "balance_loss_clip": 0.06274382, "balance_loss_mlp": 0.01263429, "epoch": 0.8874793326318954, "flos": 17899916613120.0, "grad_norm": 2.2461121687965986, "language_loss": 0.76563859, "learning_rate": 1.3128271142364565e-07, "loss": 0.84254897, "num_input_tokens_seen": 318257425, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10058594, "step": 14761, "time_per_iteration": 2.5331220626831055 }, { "auxiliary_loss_clip": 0.06410576, "auxiliary_loss_mlp": 0.01262613, "balance_loss_clip": 0.06270285, "balance_loss_mlp": 0.01253183, "epoch": 0.8875394558845634, "flos": 31110169512960.0, "grad_norm": 1.7151047853822616, "language_loss": 0.6147539, "learning_rate": 1.3114396841434717e-07, "loss": 0.69148582, "num_input_tokens_seen": 318278485, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09429932, "step": 14762, "time_per_iteration": 2.592606544494629 }, { "auxiliary_loss_clip": 0.06409512, "auxiliary_loss_mlp": 0.0126522, "balance_loss_clip": 0.06271741, "balance_loss_mlp": 0.01254777, "epoch": 0.8875995791372313, "flos": 21148392827520.0, "grad_norm": 1.9589137196120545, "language_loss": 0.64711571, "learning_rate": 1.3100529627225697e-07, "loss": 0.72386301, "num_input_tokens_seen": 318297560, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.10437012, "step": 14763, "time_per_iteration": 4.0006208419799805 }, { "auxiliary_loss_clip": 0.06415197, "auxiliary_loss_mlp": 0.01261638, "balance_loss_clip": 0.06275807, "balance_loss_mlp": 0.01252024, "epoch": 0.8876597023898993, "flos": 17460554129280.0, "grad_norm": 2.1881226592676533, "language_loss": 0.70780146, "learning_rate": 1.3086669500263335e-07, "loss": 0.78456974, "num_input_tokens_seen": 318313060, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.0960083, "step": 14764, "time_per_iteration": 2.523343324661255 }, { "auxiliary_loss_clip": 0.06417324, "auxiliary_loss_mlp": 0.01264798, "balance_loss_clip": 0.06273136, "balance_loss_mlp": 0.0125516, "epoch": 0.8877198256425672, "flos": 22714036767360.0, "grad_norm": 2.0523899032241157, "language_loss": 0.65647519, "learning_rate": 1.3072816461073166e-07, "loss": 0.73329639, "num_input_tokens_seen": 318332030, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.09643555, "step": 14765, "time_per_iteration": 2.545527696609497 }, { "auxiliary_loss_clip": 0.06408544, "auxiliary_loss_mlp": 0.01265075, "balance_loss_clip": 0.0627328, "balance_loss_mlp": 0.0125682, "epoch": 0.8877799488952353, "flos": 24541995513600.0, "grad_norm": 1.5644889405872333, "language_loss": 0.76541221, "learning_rate": 1.3058970510180568e-07, "loss": 0.84214842, "num_input_tokens_seen": 318351090, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.08251953, "step": 14766, "time_per_iteration": 2.580967426300049 }, { "auxiliary_loss_clip": 0.06406889, "auxiliary_loss_mlp": 0.01266723, "balance_loss_clip": 0.06271229, "balance_loss_mlp": 0.01257872, "epoch": 0.8878400721479032, "flos": 20965433437440.0, "grad_norm": 1.8470136267680461, "language_loss": 0.73293787, "learning_rate": 1.3045131648110496e-07, "loss": 0.80967402, "num_input_tokens_seen": 318372000, "router_z_loss_clip": 1.35742188, "router_z_loss_mlp": 0.08856201, "step": 14767, "time_per_iteration": 2.540581226348877 }, { "auxiliary_loss_clip": 0.06406975, "auxiliary_loss_mlp": 0.01262593, "balance_loss_clip": 0.06273133, "balance_loss_mlp": 0.01253414, "epoch": 0.8879001954005712, "flos": 25301268086400.0, "grad_norm": 1.8830120909454207, "language_loss": 0.71229172, "learning_rate": 1.303129987538778e-07, "loss": 0.7889874, "num_input_tokens_seen": 318391530, "router_z_loss_clip": 1.33789062, "router_z_loss_mlp": 0.09173584, "step": 14768, "time_per_iteration": 2.562917470932007 }, { "auxiliary_loss_clip": 0.06408523, "auxiliary_loss_mlp": 0.01264631, "balance_loss_clip": 0.0627043, "balance_loss_mlp": 0.01255279, "epoch": 0.8879603186532391, "flos": 23192028783360.0, "grad_norm": 1.6781598553094277, "language_loss": 0.70355868, "learning_rate": 1.3017475192536932e-07, "loss": 0.78029025, "num_input_tokens_seen": 318410690, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09350586, "step": 14769, "time_per_iteration": 2.5588855743408203 }, { "auxiliary_loss_clip": 0.0641582, "auxiliary_loss_mlp": 0.01262551, "balance_loss_clip": 0.06278408, "balance_loss_mlp": 0.01253241, "epoch": 0.8880204419059071, "flos": 13659342456960.0, "grad_norm": 2.022665247285506, "language_loss": 0.67079735, "learning_rate": 1.3003657600082174e-07, "loss": 0.74758112, "num_input_tokens_seen": 318427380, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09301758, "step": 14770, "time_per_iteration": 3.9368841648101807 }, { "auxiliary_loss_clip": 0.06409192, "auxiliary_loss_mlp": 0.01268077, "balance_loss_clip": 0.06276659, "balance_loss_mlp": 0.01259005, "epoch": 0.888080565158575, "flos": 20638228043520.0, "grad_norm": 2.4949688865172845, "language_loss": 0.65418005, "learning_rate": 1.2989847098547424e-07, "loss": 0.73095274, "num_input_tokens_seen": 318448530, "router_z_loss_clip": 1.32324219, "router_z_loss_mlp": 0.09075928, "step": 14771, "time_per_iteration": 2.5839309692382812 }, { "auxiliary_loss_clip": 0.06410801, "auxiliary_loss_mlp": 0.01262213, "balance_loss_clip": 0.06272774, "balance_loss_mlp": 0.0125323, "epoch": 0.888140688411243, "flos": 28627338781440.0, "grad_norm": 1.4045359863725089, "language_loss": 0.82754529, "learning_rate": 1.2976043688456396e-07, "loss": 0.90427542, "num_input_tokens_seen": 318468655, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.08984375, "step": 14772, "time_per_iteration": 2.595707893371582 }, { "auxiliary_loss_clip": 0.06408584, "auxiliary_loss_mlp": 0.01263078, "balance_loss_clip": 0.06274246, "balance_loss_mlp": 0.01254502, "epoch": 0.8882008116639111, "flos": 25527301274880.0, "grad_norm": 1.4575042263741695, "language_loss": 0.76504171, "learning_rate": 1.296224737033258e-07, "loss": 0.84175831, "num_input_tokens_seen": 318488740, "router_z_loss_clip": 1.34179688, "router_z_loss_mlp": 0.08581543, "step": 14773, "time_per_iteration": 2.5794894695281982 }, { "auxiliary_loss_clip": 0.06407898, "auxiliary_loss_mlp": 0.01264336, "balance_loss_clip": 0.0627296, "balance_loss_mlp": 0.01255324, "epoch": 0.888260934916579, "flos": 27681249530880.0, "grad_norm": 1.9089582612469425, "language_loss": 0.75004065, "learning_rate": 1.294845814469907e-07, "loss": 0.82676303, "num_input_tokens_seen": 318508810, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.09020996, "step": 14774, "time_per_iteration": 2.5958216190338135 }, { "auxiliary_loss_clip": 0.06416133, "auxiliary_loss_mlp": 0.01268378, "balance_loss_clip": 0.06276242, "balance_loss_mlp": 0.01258466, "epoch": 0.888321058169247, "flos": 21616615843200.0, "grad_norm": 2.2922537841044313, "language_loss": 0.72601223, "learning_rate": 1.2934676012078783e-07, "loss": 0.80285734, "num_input_tokens_seen": 318526860, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09918213, "step": 14775, "time_per_iteration": 2.5860559940338135 }, { "auxiliary_loss_clip": 0.06411678, "auxiliary_loss_mlp": 0.01268194, "balance_loss_clip": 0.06273989, "balance_loss_mlp": 0.01259277, "epoch": 0.8883811814219149, "flos": 18154768406400.0, "grad_norm": 1.660542345425006, "language_loss": 0.80583751, "learning_rate": 1.292090097299432e-07, "loss": 0.88263625, "num_input_tokens_seen": 318545180, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.08929443, "step": 14776, "time_per_iteration": 2.543671131134033 }, { "auxiliary_loss_clip": 0.06421471, "auxiliary_loss_mlp": 0.01263226, "balance_loss_clip": 0.06276796, "balance_loss_mlp": 0.01253654, "epoch": 0.8884413046745829, "flos": 28331341833600.0, "grad_norm": 1.986125592307387, "language_loss": 0.69776052, "learning_rate": 1.290713302796802e-07, "loss": 0.77460748, "num_input_tokens_seen": 318564350, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.09576416, "step": 14777, "time_per_iteration": 2.6322784423828125 }, { "auxiliary_loss_clip": 0.0640936, "auxiliary_loss_mlp": 0.01265031, "balance_loss_clip": 0.06272131, "balance_loss_mlp": 0.01255869, "epoch": 0.8885014279272508, "flos": 15164162732160.0, "grad_norm": 1.572727019124234, "language_loss": 0.71485531, "learning_rate": 1.2893372177522e-07, "loss": 0.79159915, "num_input_tokens_seen": 318582275, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09161377, "step": 14778, "time_per_iteration": 2.511488199234009 }, { "auxiliary_loss_clip": 0.06414828, "auxiliary_loss_mlp": 0.01262821, "balance_loss_clip": 0.06275654, "balance_loss_mlp": 0.01253689, "epoch": 0.8885615511799189, "flos": 19105721193600.0, "grad_norm": 2.5628298373254284, "language_loss": 0.77575612, "learning_rate": 1.287961842217804e-07, "loss": 0.85253251, "num_input_tokens_seen": 318601230, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09130859, "step": 14779, "time_per_iteration": 2.5761091709136963 }, { "auxiliary_loss_clip": 0.06313452, "auxiliary_loss_mlp": 0.01251592, "balance_loss_clip": 0.06258352, "balance_loss_mlp": 0.01250597, "epoch": 0.8886216744325868, "flos": 51200735270400.0, "grad_norm": 0.8508781021691859, "language_loss": 0.56765634, "learning_rate": 1.2865871762457747e-07, "loss": 0.64330673, "num_input_tokens_seen": 318645595, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00994873, "step": 14780, "time_per_iteration": 3.0003647804260254 }, { "auxiliary_loss_clip": 0.06315687, "auxiliary_loss_mlp": 0.01250865, "balance_loss_clip": 0.0626047, "balance_loss_mlp": 0.01249927, "epoch": 0.8886817976852548, "flos": 61633571281920.0, "grad_norm": 0.7782872694231067, "language_loss": 0.62438977, "learning_rate": 1.2852132198882326e-07, "loss": 0.7000553, "num_input_tokens_seen": 318707850, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.0093689, "step": 14781, "time_per_iteration": 3.251394271850586 }, { "auxiliary_loss_clip": 0.0631348, "auxiliary_loss_mlp": 0.01251016, "balance_loss_clip": 0.06258322, "balance_loss_mlp": 0.01250039, "epoch": 0.8887419209379227, "flos": 60664464086400.0, "grad_norm": 0.7792885437979228, "language_loss": 0.5813852, "learning_rate": 1.2838399731972805e-07, "loss": 0.65703022, "num_input_tokens_seen": 318764915, "router_z_loss_clip": 0.55126953, "router_z_loss_mlp": 0.009758, "step": 14782, "time_per_iteration": 3.0183796882629395 }, { "auxiliary_loss_clip": 0.0641016, "auxiliary_loss_mlp": 0.01264242, "balance_loss_clip": 0.06273434, "balance_loss_mlp": 0.0125644, "epoch": 0.8888020441905907, "flos": 29213630599680.0, "grad_norm": 1.4853993982889835, "language_loss": 0.65866792, "learning_rate": 1.2824674362249922e-07, "loss": 0.73541188, "num_input_tokens_seen": 318785660, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.07800293, "step": 14783, "time_per_iteration": 2.599372625350952 }, { "auxiliary_loss_clip": 0.06416032, "auxiliary_loss_mlp": 0.01264859, "balance_loss_clip": 0.06274306, "balance_loss_mlp": 0.01255501, "epoch": 0.8888621674432586, "flos": 22169057811840.0, "grad_norm": 1.4926232784855742, "language_loss": 0.77564836, "learning_rate": 1.281095609023415e-07, "loss": 0.85245728, "num_input_tokens_seen": 318806080, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09362793, "step": 14784, "time_per_iteration": 2.5632612705230713 }, { "auxiliary_loss_clip": 0.06414147, "auxiliary_loss_mlp": 0.0127447, "balance_loss_clip": 0.06273749, "balance_loss_mlp": 0.01264486, "epoch": 0.8889222906959267, "flos": 27680243281920.0, "grad_norm": 2.1315846726387475, "language_loss": 0.60583973, "learning_rate": 1.279724491644565e-07, "loss": 0.68272591, "num_input_tokens_seen": 318826445, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09985352, "step": 14785, "time_per_iteration": 2.5807106494903564 }, { "auxiliary_loss_clip": 0.06412669, "auxiliary_loss_mlp": 0.01266023, "balance_loss_clip": 0.06274942, "balance_loss_mlp": 0.01256516, "epoch": 0.8889824139485947, "flos": 14173029112320.0, "grad_norm": 1.8173177966432221, "language_loss": 0.65808505, "learning_rate": 1.278354084140445e-07, "loss": 0.73487198, "num_input_tokens_seen": 318843915, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09490967, "step": 14786, "time_per_iteration": 2.5433542728424072 }, { "auxiliary_loss_clip": 0.06421251, "auxiliary_loss_mlp": 0.01265853, "balance_loss_clip": 0.06276481, "balance_loss_mlp": 0.01254749, "epoch": 0.8890425372012626, "flos": 12856828377600.0, "grad_norm": 2.4202225529474672, "language_loss": 0.85483015, "learning_rate": 1.276984386563009e-07, "loss": 0.93170118, "num_input_tokens_seen": 318859670, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.11108398, "step": 14787, "time_per_iteration": 2.533247232437134 }, { "auxiliary_loss_clip": 0.06417905, "auxiliary_loss_mlp": 0.01264041, "balance_loss_clip": 0.0627915, "balance_loss_mlp": 0.01254976, "epoch": 0.8891026604539306, "flos": 21695719697280.0, "grad_norm": 2.5234198940439287, "language_loss": 0.70619255, "learning_rate": 1.2756153989642027e-07, "loss": 0.78301203, "num_input_tokens_seen": 318877855, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09063721, "step": 14788, "time_per_iteration": 3.9729485511779785 }, { "auxiliary_loss_clip": 0.06409104, "auxiliary_loss_mlp": 0.01265039, "balance_loss_clip": 0.06275298, "balance_loss_mlp": 0.01255693, "epoch": 0.8891627837065985, "flos": 21877840546560.0, "grad_norm": 1.5639202960351792, "language_loss": 0.70187163, "learning_rate": 1.274247121395935e-07, "loss": 0.77861309, "num_input_tokens_seen": 318896045, "router_z_loss_clip": 1.33789062, "router_z_loss_mlp": 0.09338379, "step": 14789, "time_per_iteration": 2.547323703765869 }, { "auxiliary_loss_clip": 0.06410492, "auxiliary_loss_mlp": 0.01263673, "balance_loss_clip": 0.06274886, "balance_loss_mlp": 0.01254291, "epoch": 0.8892229069592665, "flos": 21586707135360.0, "grad_norm": 1.5624705108290136, "language_loss": 0.70825404, "learning_rate": 1.2728795539100956e-07, "loss": 0.78499568, "num_input_tokens_seen": 318915515, "router_z_loss_clip": 1.35742188, "router_z_loss_mlp": 0.09387207, "step": 14790, "time_per_iteration": 2.5531842708587646 }, { "auxiliary_loss_clip": 0.06411068, "auxiliary_loss_mlp": 0.0126456, "balance_loss_clip": 0.0627258, "balance_loss_mlp": 0.01255381, "epoch": 0.8892830302119344, "flos": 23082680805120.0, "grad_norm": 1.662082037794738, "language_loss": 0.7321077, "learning_rate": 1.2715126965585387e-07, "loss": 0.80886394, "num_input_tokens_seen": 318934305, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09179688, "step": 14791, "time_per_iteration": 2.5588009357452393 }, { "auxiliary_loss_clip": 0.06409501, "auxiliary_loss_mlp": 0.01268532, "balance_loss_clip": 0.06275214, "balance_loss_mlp": 0.01259281, "epoch": 0.8893431534646025, "flos": 23078194611840.0, "grad_norm": 1.3762209547783781, "language_loss": 0.7392717, "learning_rate": 1.2701465493931008e-07, "loss": 0.81605208, "num_input_tokens_seen": 318953880, "router_z_loss_clip": 1.34277344, "router_z_loss_mlp": 0.09246826, "step": 14792, "time_per_iteration": 2.58992075920105 }, { "auxiliary_loss_clip": 0.06419709, "auxiliary_loss_mlp": 0.01266124, "balance_loss_clip": 0.0627687, "balance_loss_mlp": 0.01256099, "epoch": 0.8894032767172704, "flos": 22461449034240.0, "grad_norm": 2.0715637653234786, "language_loss": 0.66318554, "learning_rate": 1.2687811124655801e-07, "loss": 0.74004388, "num_input_tokens_seen": 318971395, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.1003418, "step": 14793, "time_per_iteration": 2.567258596420288 }, { "auxiliary_loss_clip": 0.06413651, "auxiliary_loss_mlp": 0.01264818, "balance_loss_clip": 0.06272857, "balance_loss_mlp": 0.01255216, "epoch": 0.8894633999699384, "flos": 25345348133760.0, "grad_norm": 1.670153743824817, "language_loss": 0.71336943, "learning_rate": 1.2674163858277552e-07, "loss": 0.7901541, "num_input_tokens_seen": 318990580, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09606934, "step": 14794, "time_per_iteration": 2.5733377933502197 }, { "auxiliary_loss_clip": 0.06417005, "auxiliary_loss_mlp": 0.012656, "balance_loss_clip": 0.0627262, "balance_loss_mlp": 0.0125536, "epoch": 0.8895235232226063, "flos": 21000079900800.0, "grad_norm": 1.7537532914976672, "language_loss": 0.75256789, "learning_rate": 1.2660523695313785e-07, "loss": 0.82939398, "num_input_tokens_seen": 319010040, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10241699, "step": 14795, "time_per_iteration": 4.017632246017456 }, { "auxiliary_loss_clip": 0.06312473, "auxiliary_loss_mlp": 0.0125119, "balance_loss_clip": 0.06257148, "balance_loss_mlp": 0.0125025, "epoch": 0.8895836464752743, "flos": 69752169705600.0, "grad_norm": 0.764027087346609, "language_loss": 0.56155664, "learning_rate": 1.2646890636281727e-07, "loss": 0.63719326, "num_input_tokens_seen": 319063860, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00938416, "step": 14796, "time_per_iteration": 3.084007501602173 }, { "auxiliary_loss_clip": 0.0641302, "auxiliary_loss_mlp": 0.01262356, "balance_loss_clip": 0.06272407, "balance_loss_mlp": 0.0125155, "epoch": 0.8896437697279422, "flos": 23228520036480.0, "grad_norm": 2.010753171585925, "language_loss": 0.70148998, "learning_rate": 1.263326468169843e-07, "loss": 0.77824378, "num_input_tokens_seen": 319082335, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.1081543, "step": 14797, "time_per_iteration": 2.5661425590515137 }, { "auxiliary_loss_clip": 0.06317626, "auxiliary_loss_mlp": 0.012517, "balance_loss_clip": 0.06262408, "balance_loss_mlp": 0.01250845, "epoch": 0.8897038929806103, "flos": 70771786513920.0, "grad_norm": 0.7293898703350944, "language_loss": 0.58007157, "learning_rate": 1.2619645832080417e-07, "loss": 0.65576482, "num_input_tokens_seen": 319147075, "router_z_loss_clip": 0.55419922, "router_z_loss_mlp": 0.00857544, "step": 14798, "time_per_iteration": 3.227447748184204 }, { "auxiliary_loss_clip": 0.06414787, "auxiliary_loss_mlp": 0.01263561, "balance_loss_clip": 0.06275836, "balance_loss_mlp": 0.01254502, "epoch": 0.8897640162332782, "flos": 19251183081600.0, "grad_norm": 1.6990994857913932, "language_loss": 0.79676145, "learning_rate": 1.2606034087944251e-07, "loss": 0.87354493, "num_input_tokens_seen": 319166630, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09063721, "step": 14799, "time_per_iteration": 2.542684555053711 }, { "auxiliary_loss_clip": 0.06314693, "auxiliary_loss_mlp": 0.012507, "balance_loss_clip": 0.06259601, "balance_loss_mlp": 0.01249762, "epoch": 0.8898241394859462, "flos": 41372288830080.0, "grad_norm": 0.8914675687202944, "language_loss": 0.57994825, "learning_rate": 1.2592429449806053e-07, "loss": 0.6556021, "num_input_tokens_seen": 319221865, "router_z_loss_clip": 0.55419922, "router_z_loss_mlp": 0.00936127, "step": 14800, "time_per_iteration": 3.1299262046813965 }, { "auxiliary_loss_clip": 0.06412467, "auxiliary_loss_mlp": 0.01264686, "balance_loss_clip": 0.06274338, "balance_loss_mlp": 0.01255668, "epoch": 0.8898842627386142, "flos": 18991761240960.0, "grad_norm": 1.633495671567337, "language_loss": 0.6651113, "learning_rate": 1.2578831918181698e-07, "loss": 0.7418828, "num_input_tokens_seen": 319240710, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09020996, "step": 14801, "time_per_iteration": 2.5351271629333496 }, { "auxiliary_loss_clip": 0.06420912, "auxiliary_loss_mlp": 0.01267322, "balance_loss_clip": 0.0627949, "balance_loss_mlp": 0.01256504, "epoch": 0.8899443859912821, "flos": 13220944295040.0, "grad_norm": 2.9091339033644403, "language_loss": 0.75283217, "learning_rate": 1.256524149358682e-07, "loss": 0.82971454, "num_input_tokens_seen": 319256495, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.10809326, "step": 14802, "time_per_iteration": 2.533489465713501 }, { "auxiliary_loss_clip": 0.0641167, "auxiliary_loss_mlp": 0.01264692, "balance_loss_clip": 0.06276978, "balance_loss_mlp": 0.01255537, "epoch": 0.8900045092439501, "flos": 22681318947840.0, "grad_norm": 1.8831830457909853, "language_loss": 0.73828828, "learning_rate": 1.2551658176536805e-07, "loss": 0.81505191, "num_input_tokens_seen": 319273620, "router_z_loss_clip": 1.34570312, "router_z_loss_mlp": 0.09155273, "step": 14803, "time_per_iteration": 4.206336975097656 }, { "auxiliary_loss_clip": 0.06412703, "auxiliary_loss_mlp": 0.0126398, "balance_loss_clip": 0.06274255, "balance_loss_mlp": 0.01254217, "epoch": 0.890064632496618, "flos": 21147889703040.0, "grad_norm": 1.6825596098845794, "language_loss": 0.71957695, "learning_rate": 1.2538081967546664e-07, "loss": 0.7963438, "num_input_tokens_seen": 319291720, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09765625, "step": 14804, "time_per_iteration": 2.5689613819122314 }, { "auxiliary_loss_clip": 0.06413629, "auxiliary_loss_mlp": 0.01265113, "balance_loss_clip": 0.06273668, "balance_loss_mlp": 0.01256018, "epoch": 0.8901247557492861, "flos": 23402590894080.0, "grad_norm": 1.66512373015833, "language_loss": 0.81614047, "learning_rate": 1.252451286713123e-07, "loss": 0.89292789, "num_input_tokens_seen": 319310380, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09100342, "step": 14805, "time_per_iteration": 2.579892158508301 }, { "auxiliary_loss_clip": 0.0641838, "auxiliary_loss_mlp": 0.01264376, "balance_loss_clip": 0.06277838, "balance_loss_mlp": 0.01254839, "epoch": 0.890184879001954, "flos": 29177390908800.0, "grad_norm": 2.05192560483432, "language_loss": 0.67312741, "learning_rate": 1.251095087580505e-07, "loss": 0.749955, "num_input_tokens_seen": 319331765, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09539795, "step": 14806, "time_per_iteration": 2.58443284034729 }, { "auxiliary_loss_clip": 0.06413642, "auxiliary_loss_mlp": 0.01264829, "balance_loss_clip": 0.06275371, "balance_loss_mlp": 0.01254881, "epoch": 0.890245002254622, "flos": 14432912150400.0, "grad_norm": 1.7572398835747318, "language_loss": 0.67587066, "learning_rate": 1.2497395994082438e-07, "loss": 0.75265527, "num_input_tokens_seen": 319349135, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09936523, "step": 14807, "time_per_iteration": 2.525108575820923 }, { "auxiliary_loss_clip": 0.06409112, "auxiliary_loss_mlp": 0.01263153, "balance_loss_clip": 0.06271649, "balance_loss_mlp": 0.01254087, "epoch": 0.8903051255072899, "flos": 22388676163200.0, "grad_norm": 1.805830025332912, "language_loss": 0.75363213, "learning_rate": 1.248384822247732e-07, "loss": 0.83035481, "num_input_tokens_seen": 319368410, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09063721, "step": 14808, "time_per_iteration": 2.5379035472869873 }, { "auxiliary_loss_clip": 0.06412905, "auxiliary_loss_mlp": 0.01263623, "balance_loss_clip": 0.06273295, "balance_loss_mlp": 0.01255058, "epoch": 0.8903652487599579, "flos": 20783689931520.0, "grad_norm": 2.6797662733730063, "language_loss": 0.81616974, "learning_rate": 1.2470307561503513e-07, "loss": 0.89293504, "num_input_tokens_seen": 319387535, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.08563232, "step": 14809, "time_per_iteration": 2.554919719696045 }, { "auxiliary_loss_clip": 0.06411994, "auxiliary_loss_mlp": 0.01264544, "balance_loss_clip": 0.06274404, "balance_loss_mlp": 0.01254751, "epoch": 0.8904253720126258, "flos": 24431180088960.0, "grad_norm": 2.6793052362035534, "language_loss": 0.68629432, "learning_rate": 1.2456774011674442e-07, "loss": 0.76305974, "num_input_tokens_seen": 319407210, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09796143, "step": 14810, "time_per_iteration": 3.9842607975006104 }, { "auxiliary_loss_clip": 0.06414748, "auxiliary_loss_mlp": 0.01263008, "balance_loss_clip": 0.06273474, "balance_loss_mlp": 0.01253865, "epoch": 0.8904854952652939, "flos": 19469962892160.0, "grad_norm": 2.5595855320838976, "language_loss": 0.70730531, "learning_rate": 1.2443247573503257e-07, "loss": 0.78408289, "num_input_tokens_seen": 319425340, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09143066, "step": 14811, "time_per_iteration": 2.5386078357696533 }, { "auxiliary_loss_clip": 0.06417513, "auxiliary_loss_mlp": 0.01266704, "balance_loss_clip": 0.06275476, "balance_loss_mlp": 0.01257025, "epoch": 0.8905456185179618, "flos": 50811337347840.0, "grad_norm": 2.020804509496827, "language_loss": 0.6584599, "learning_rate": 1.2429728247502924e-07, "loss": 0.73530209, "num_input_tokens_seen": 319448150, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09674072, "step": 14812, "time_per_iteration": 2.805738687515259 }, { "auxiliary_loss_clip": 0.06409835, "auxiliary_loss_mlp": 0.01264419, "balance_loss_clip": 0.06272466, "balance_loss_mlp": 0.01255675, "epoch": 0.8906057417706298, "flos": 17790568634880.0, "grad_norm": 2.278567554083488, "language_loss": 0.68405384, "learning_rate": 1.24162160341861e-07, "loss": 0.76079631, "num_input_tokens_seen": 319466115, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.08740234, "step": 14813, "time_per_iteration": 2.487245798110962 }, { "auxiliary_loss_clip": 0.06423151, "auxiliary_loss_mlp": 0.01265523, "balance_loss_clip": 0.06275785, "balance_loss_mlp": 0.01254681, "epoch": 0.8906658650232978, "flos": 21951368104320.0, "grad_norm": 1.798661493791184, "language_loss": 0.76047754, "learning_rate": 1.2402710934065198e-07, "loss": 0.83736432, "num_input_tokens_seen": 319485255, "router_z_loss_clip": 1.47363281, "router_z_loss_mlp": 0.10852051, "step": 14814, "time_per_iteration": 2.534093141555786 }, { "auxiliary_loss_clip": 0.06418225, "auxiliary_loss_mlp": 0.01265407, "balance_loss_clip": 0.06274664, "balance_loss_mlp": 0.01254684, "epoch": 0.8907259882759657, "flos": 21294148204800.0, "grad_norm": 2.1612971340416345, "language_loss": 0.74514389, "learning_rate": 1.2389212947652229e-07, "loss": 0.82198018, "num_input_tokens_seen": 319501800, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.10723877, "step": 14815, "time_per_iteration": 2.5363926887512207 }, { "auxiliary_loss_clip": 0.06408238, "auxiliary_loss_mlp": 0.01265885, "balance_loss_clip": 0.06273179, "balance_loss_mlp": 0.01257368, "epoch": 0.8907861115286337, "flos": 20126595813120.0, "grad_norm": 1.6937354538669074, "language_loss": 0.75530195, "learning_rate": 1.237572207545914e-07, "loss": 0.83204317, "num_input_tokens_seen": 319520415, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.08514404, "step": 14816, "time_per_iteration": 2.560655117034912 }, { "auxiliary_loss_clip": 0.06415931, "auxiliary_loss_mlp": 0.01265506, "balance_loss_clip": 0.06275706, "balance_loss_mlp": 0.01256095, "epoch": 0.8908462347813016, "flos": 20090356122240.0, "grad_norm": 1.920550828670468, "language_loss": 0.77691805, "learning_rate": 1.2362238317997476e-07, "loss": 0.85373235, "num_input_tokens_seen": 319538410, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09417725, "step": 14817, "time_per_iteration": 2.531740188598633 }, { "auxiliary_loss_clip": 0.06311178, "auxiliary_loss_mlp": 0.0125238, "balance_loss_clip": 0.06256002, "balance_loss_mlp": 0.01251364, "epoch": 0.8909063580339697, "flos": 65522664288000.0, "grad_norm": 0.7522759402736297, "language_loss": 0.5658195, "learning_rate": 1.2348761675778517e-07, "loss": 0.64145505, "num_input_tokens_seen": 319602565, "router_z_loss_clip": 0.55224609, "router_z_loss_mlp": 0.01016998, "step": 14818, "time_per_iteration": 3.212049961090088 }, { "auxiliary_loss_clip": 0.06414481, "auxiliary_loss_mlp": 0.01269266, "balance_loss_clip": 0.0627583, "balance_loss_mlp": 0.01259914, "epoch": 0.8909664812866376, "flos": 29871018207360.0, "grad_norm": 1.9999416120160611, "language_loss": 0.65022802, "learning_rate": 1.2335292149313325e-07, "loss": 0.72706556, "num_input_tokens_seen": 319624645, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09350586, "step": 14819, "time_per_iteration": 2.634033441543579 }, { "auxiliary_loss_clip": 0.06415487, "auxiliary_loss_mlp": 0.01263804, "balance_loss_clip": 0.06275228, "balance_loss_mlp": 0.01253338, "epoch": 0.8910266045393056, "flos": 25454151060480.0, "grad_norm": 1.7730654352232518, "language_loss": 0.78459728, "learning_rate": 1.2321829739112731e-07, "loss": 0.86139017, "num_input_tokens_seen": 319644040, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.10473633, "step": 14820, "time_per_iteration": 2.627016544342041 }, { "auxiliary_loss_clip": 0.06412558, "auxiliary_loss_mlp": 0.01266643, "balance_loss_clip": 0.0627455, "balance_loss_mlp": 0.01256928, "epoch": 0.8910867277919735, "flos": 24506091239040.0, "grad_norm": 1.8334710828605432, "language_loss": 0.76175374, "learning_rate": 1.2308374445687087e-07, "loss": 0.8385458, "num_input_tokens_seen": 319663930, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.0970459, "step": 14821, "time_per_iteration": 2.588510751724243 }, { "auxiliary_loss_clip": 0.06314045, "auxiliary_loss_mlp": 0.01251124, "balance_loss_clip": 0.0625873, "balance_loss_mlp": 0.01250078, "epoch": 0.8911468510446415, "flos": 60706447781760.0, "grad_norm": 0.7809816943257233, "language_loss": 0.59316164, "learning_rate": 1.2294926269546712e-07, "loss": 0.66881335, "num_input_tokens_seen": 319721245, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.0104599, "step": 14822, "time_per_iteration": 3.0629091262817383 }, { "auxiliary_loss_clip": 0.06413238, "auxiliary_loss_mlp": 0.01261987, "balance_loss_clip": 0.06272923, "balance_loss_mlp": 0.01252313, "epoch": 0.8912069742973094, "flos": 25344467665920.0, "grad_norm": 1.7799535500211663, "language_loss": 0.69238132, "learning_rate": 1.2281485211201515e-07, "loss": 0.76913357, "num_input_tokens_seen": 319741200, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09674072, "step": 14823, "time_per_iteration": 2.6039366722106934 }, { "auxiliary_loss_clip": 0.06408672, "auxiliary_loss_mlp": 0.01264171, "balance_loss_clip": 0.06271859, "balance_loss_mlp": 0.01254843, "epoch": 0.8912670975499775, "flos": 18229427994240.0, "grad_norm": 2.6094987081264147, "language_loss": 0.69500244, "learning_rate": 1.2268051271161262e-07, "loss": 0.7717309, "num_input_tokens_seen": 319759265, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.09326172, "step": 14824, "time_per_iteration": 2.5345675945281982 }, { "auxiliary_loss_clip": 0.06416824, "auxiliary_loss_mlp": 0.01263263, "balance_loss_clip": 0.06274414, "balance_loss_mlp": 0.0125356, "epoch": 0.8913272208026454, "flos": 26511558860160.0, "grad_norm": 1.805808348050452, "language_loss": 0.70744741, "learning_rate": 1.2254624449935303e-07, "loss": 0.78424829, "num_input_tokens_seen": 319777560, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.09698486, "step": 14825, "time_per_iteration": 2.6284053325653076 }, { "auxiliary_loss_clip": 0.06412121, "auxiliary_loss_mlp": 0.01267397, "balance_loss_clip": 0.06274441, "balance_loss_mlp": 0.01257914, "epoch": 0.8913873440553134, "flos": 18807502112640.0, "grad_norm": 3.0970703159104227, "language_loss": 0.71342361, "learning_rate": 1.2241204748032786e-07, "loss": 0.79021877, "num_input_tokens_seen": 319794125, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.0947876, "step": 14826, "time_per_iteration": 2.581033229827881 }, { "auxiliary_loss_clip": 0.06410579, "auxiliary_loss_mlp": 0.01263773, "balance_loss_clip": 0.06272573, "balance_loss_mlp": 0.0125445, "epoch": 0.8914474673079814, "flos": 20890899630720.0, "grad_norm": 1.9954467290822406, "language_loss": 0.75122887, "learning_rate": 1.2227792165962615e-07, "loss": 0.82797235, "num_input_tokens_seen": 319810310, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09320068, "step": 14827, "time_per_iteration": 2.6695473194122314 }, { "auxiliary_loss_clip": 0.06416074, "auxiliary_loss_mlp": 0.01265357, "balance_loss_clip": 0.06276064, "balance_loss_mlp": 0.01255617, "epoch": 0.8915075905606493, "flos": 20957551153920.0, "grad_norm": 1.7972581672155992, "language_loss": 0.78156763, "learning_rate": 1.221438670423336e-07, "loss": 0.85838193, "num_input_tokens_seen": 319828505, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09735107, "step": 14828, "time_per_iteration": 3.9500679969787598 }, { "auxiliary_loss_clip": 0.06410608, "auxiliary_loss_mlp": 0.01263769, "balance_loss_clip": 0.06272897, "balance_loss_mlp": 0.01254992, "epoch": 0.8915677138133173, "flos": 23083058148480.0, "grad_norm": 1.5885421814720635, "language_loss": 0.75211298, "learning_rate": 1.2200988363353392e-07, "loss": 0.82885677, "num_input_tokens_seen": 319848680, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.08773804, "step": 14829, "time_per_iteration": 2.5445711612701416 }, { "auxiliary_loss_clip": 0.06411277, "auxiliary_loss_mlp": 0.01265107, "balance_loss_clip": 0.06270851, "balance_loss_mlp": 0.01255553, "epoch": 0.8916278370659853, "flos": 23446922503680.0, "grad_norm": 1.5094933162605892, "language_loss": 0.84608316, "learning_rate": 1.2187597143830773e-07, "loss": 0.92284703, "num_input_tokens_seen": 319868835, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09552002, "step": 14830, "time_per_iteration": 2.563889741897583 }, { "auxiliary_loss_clip": 0.06408866, "auxiliary_loss_mlp": 0.0126291, "balance_loss_clip": 0.06273081, "balance_loss_mlp": 0.01254471, "epoch": 0.8916879603186533, "flos": 25168342383360.0, "grad_norm": 1.4626277792787439, "language_loss": 0.75206769, "learning_rate": 1.2174213046173299e-07, "loss": 0.82878542, "num_input_tokens_seen": 319891585, "router_z_loss_clip": 1.35742188, "router_z_loss_mlp": 0.08435059, "step": 14831, "time_per_iteration": 2.6346218585968018 }, { "auxiliary_loss_clip": 0.06415869, "auxiliary_loss_mlp": 0.01265066, "balance_loss_clip": 0.06273487, "balance_loss_mlp": 0.012551, "epoch": 0.8917480835713212, "flos": 20236027645440.0, "grad_norm": 1.7587938906550082, "language_loss": 0.73097461, "learning_rate": 1.216083607088847e-07, "loss": 0.80778396, "num_input_tokens_seen": 319910315, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09973145, "step": 14832, "time_per_iteration": 2.574206829071045 }, { "auxiliary_loss_clip": 0.06414804, "auxiliary_loss_mlp": 0.01264512, "balance_loss_clip": 0.06273961, "balance_loss_mlp": 0.0125494, "epoch": 0.8918082068239892, "flos": 26108729556480.0, "grad_norm": 2.1160523116875427, "language_loss": 0.67205179, "learning_rate": 1.214746621848355e-07, "loss": 0.74884492, "num_input_tokens_seen": 319932275, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09576416, "step": 14833, "time_per_iteration": 2.5972342491149902 }, { "auxiliary_loss_clip": 0.0641631, "auxiliary_loss_mlp": 0.01265695, "balance_loss_clip": 0.06274496, "balance_loss_mlp": 0.01255491, "epoch": 0.8918683300766571, "flos": 24839124491520.0, "grad_norm": 1.9790012863417115, "language_loss": 0.7405917, "learning_rate": 1.2134103489465575e-07, "loss": 0.81741178, "num_input_tokens_seen": 319955335, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.10205078, "step": 14834, "time_per_iteration": 2.6318421363830566 }, { "auxiliary_loss_clip": 0.06414942, "auxiliary_loss_mlp": 0.01262848, "balance_loss_clip": 0.06276284, "balance_loss_mlp": 0.01253216, "epoch": 0.8919284533293251, "flos": 22310955901440.0, "grad_norm": 2.0375702584395308, "language_loss": 0.7898373, "learning_rate": 1.2120747884341188e-07, "loss": 0.86661524, "num_input_tokens_seen": 319973990, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09637451, "step": 14835, "time_per_iteration": 4.011335849761963 }, { "auxiliary_loss_clip": 0.06406239, "auxiliary_loss_mlp": 0.01263606, "balance_loss_clip": 0.06271291, "balance_loss_mlp": 0.01254993, "epoch": 0.891988576581993, "flos": 30381518407680.0, "grad_norm": 1.4949663634424508, "language_loss": 0.74358392, "learning_rate": 1.210739940361689e-07, "loss": 0.82028234, "num_input_tokens_seen": 319995555, "router_z_loss_clip": 1.34863281, "router_z_loss_mlp": 0.08612061, "step": 14836, "time_per_iteration": 2.6367008686065674 }, { "auxiliary_loss_clip": 0.06410854, "auxiliary_loss_mlp": 0.01263506, "balance_loss_clip": 0.06272475, "balance_loss_mlp": 0.01254267, "epoch": 0.8920486998346611, "flos": 15557223035520.0, "grad_norm": 4.794921371721858, "language_loss": 0.69490469, "learning_rate": 1.2094058047798838e-07, "loss": 0.77164829, "num_input_tokens_seen": 320012385, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09234619, "step": 14837, "time_per_iteration": 2.511657238006592 }, { "auxiliary_loss_clip": 0.06417853, "auxiliary_loss_mlp": 0.01266111, "balance_loss_clip": 0.06274372, "balance_loss_mlp": 0.01256353, "epoch": 0.892108823087329, "flos": 21221333406720.0, "grad_norm": 1.668541383154065, "language_loss": 0.67546928, "learning_rate": 1.2080723817392913e-07, "loss": 0.75230896, "num_input_tokens_seen": 320032390, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.09753418, "step": 14838, "time_per_iteration": 2.5560648441314697 }, { "auxiliary_loss_clip": 0.064153, "auxiliary_loss_mlp": 0.01264824, "balance_loss_clip": 0.06275465, "balance_loss_mlp": 0.01255734, "epoch": 0.892168946339997, "flos": 21985092172800.0, "grad_norm": 1.8339188222356761, "language_loss": 0.76351434, "learning_rate": 1.2067396712904777e-07, "loss": 0.84031558, "num_input_tokens_seen": 320052885, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09088135, "step": 14839, "time_per_iteration": 2.548922300338745 }, { "auxiliary_loss_clip": 0.06313655, "auxiliary_loss_mlp": 0.01251573, "balance_loss_clip": 0.06258264, "balance_loss_mlp": 0.01250515, "epoch": 0.892229069592665, "flos": 67494869038080.0, "grad_norm": 0.6646500497604942, "language_loss": 0.49242711, "learning_rate": 1.205407673483978e-07, "loss": 0.56807941, "num_input_tokens_seen": 320113685, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01059723, "step": 14840, "time_per_iteration": 3.139770746231079 }, { "auxiliary_loss_clip": 0.06423955, "auxiliary_loss_mlp": 0.01264385, "balance_loss_clip": 0.06277463, "balance_loss_mlp": 0.01253126, "epoch": 0.8922891928453329, "flos": 19464931647360.0, "grad_norm": 2.0309452655490254, "language_loss": 0.64508444, "learning_rate": 1.2040763883703074e-07, "loss": 0.72196782, "num_input_tokens_seen": 320130810, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.11248779, "step": 14841, "time_per_iteration": 2.5318992137908936 }, { "auxiliary_loss_clip": 0.06409288, "auxiliary_loss_mlp": 0.01265087, "balance_loss_clip": 0.06275296, "balance_loss_mlp": 0.01256563, "epoch": 0.8923493160980009, "flos": 23374065778560.0, "grad_norm": 1.436195402113681, "language_loss": 0.685296, "learning_rate": 1.2027458159999438e-07, "loss": 0.76203972, "num_input_tokens_seen": 320152170, "router_z_loss_clip": 1.33984375, "router_z_loss_mlp": 0.08526611, "step": 14842, "time_per_iteration": 4.042694568634033 }, { "auxiliary_loss_clip": 0.06408846, "auxiliary_loss_mlp": 0.01265215, "balance_loss_clip": 0.06273665, "balance_loss_mlp": 0.01256662, "epoch": 0.8924094393506689, "flos": 26184227685120.0, "grad_norm": 2.1307129488316288, "language_loss": 0.80476582, "learning_rate": 1.2014159564233373e-07, "loss": 0.8815065, "num_input_tokens_seen": 320172360, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.08557129, "step": 14843, "time_per_iteration": 2.5893075466156006 }, { "auxiliary_loss_clip": 0.06417543, "auxiliary_loss_mlp": 0.01262922, "balance_loss_clip": 0.06273427, "balance_loss_mlp": 0.01252747, "epoch": 0.8924695626033369, "flos": 22025147224320.0, "grad_norm": 1.8323559881239684, "language_loss": 0.68398452, "learning_rate": 1.2000868096909257e-07, "loss": 0.7607891, "num_input_tokens_seen": 320192130, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.10174561, "step": 14844, "time_per_iteration": 2.5473711490631104 }, { "auxiliary_loss_clip": 0.06412882, "auxiliary_loss_mlp": 0.01268018, "balance_loss_clip": 0.06273679, "balance_loss_mlp": 0.0125869, "epoch": 0.8925296858560048, "flos": 14799292128000.0, "grad_norm": 3.3764215291655693, "language_loss": 0.91322088, "learning_rate": 1.1987583758531038e-07, "loss": 0.99002993, "num_input_tokens_seen": 320207760, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09326172, "step": 14845, "time_per_iteration": 2.512097120285034 }, { "auxiliary_loss_clip": 0.06409354, "auxiliary_loss_mlp": 0.01264399, "balance_loss_clip": 0.06274557, "balance_loss_mlp": 0.01255774, "epoch": 0.8925898091086728, "flos": 22353275013120.0, "grad_norm": 2.0288743435644423, "language_loss": 0.72673416, "learning_rate": 1.1974306549602476e-07, "loss": 0.80347168, "num_input_tokens_seen": 320225325, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.08624268, "step": 14846, "time_per_iteration": 2.5367558002471924 }, { "auxiliary_loss_clip": 0.06414464, "auxiliary_loss_mlp": 0.0126435, "balance_loss_clip": 0.06274614, "balance_loss_mlp": 0.01254867, "epoch": 0.8926499323613407, "flos": 45816773425920.0, "grad_norm": 1.5859819758173714, "language_loss": 0.56771427, "learning_rate": 1.1961036470627094e-07, "loss": 0.6445024, "num_input_tokens_seen": 320247645, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.0947876, "step": 14847, "time_per_iteration": 2.7514400482177734 }, { "auxiliary_loss_clip": 0.06415495, "auxiliary_loss_mlp": 0.01263365, "balance_loss_clip": 0.06275273, "balance_loss_mlp": 0.01254585, "epoch": 0.8927100556140087, "flos": 22133530880640.0, "grad_norm": 1.9386237432714848, "language_loss": 0.7679162, "learning_rate": 1.1947773522108052e-07, "loss": 0.84470481, "num_input_tokens_seen": 320266005, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.08776855, "step": 14848, "time_per_iteration": 2.535449266433716 }, { "auxiliary_loss_clip": 0.06409499, "auxiliary_loss_mlp": 0.01264787, "balance_loss_clip": 0.06273691, "balance_loss_mlp": 0.01255853, "epoch": 0.8927701788666766, "flos": 28337756670720.0, "grad_norm": 1.7528671122893564, "language_loss": 0.69332576, "learning_rate": 1.1934517704548251e-07, "loss": 0.77006859, "num_input_tokens_seen": 320285555, "router_z_loss_clip": 1.35644531, "router_z_loss_mlp": 0.08935547, "step": 14849, "time_per_iteration": 2.572160005569458 }, { "auxiliary_loss_clip": 0.06416251, "auxiliary_loss_mlp": 0.012662, "balance_loss_clip": 0.06275933, "balance_loss_mlp": 0.01256061, "epoch": 0.8928303021193447, "flos": 25300932670080.0, "grad_norm": 1.541697633197063, "language_loss": 0.80837679, "learning_rate": 1.1921269018450364e-07, "loss": 0.88520133, "num_input_tokens_seen": 320305395, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.10137939, "step": 14850, "time_per_iteration": 4.052154064178467 }, { "auxiliary_loss_clip": 0.06414212, "auxiliary_loss_mlp": 0.01268732, "balance_loss_clip": 0.06277756, "balance_loss_mlp": 0.01259302, "epoch": 0.8928904253720126, "flos": 22243256202240.0, "grad_norm": 1.5439617459276427, "language_loss": 0.751454, "learning_rate": 1.1908027464316872e-07, "loss": 0.82828343, "num_input_tokens_seen": 320324220, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.09423828, "step": 14851, "time_per_iteration": 2.573467969894409 }, { "auxiliary_loss_clip": 0.06409314, "auxiliary_loss_mlp": 0.01265443, "balance_loss_clip": 0.06271619, "balance_loss_mlp": 0.01255524, "epoch": 0.8929505486246806, "flos": 27100240519680.0, "grad_norm": 1.583600267022891, "language_loss": 0.78686082, "learning_rate": 1.1894793042649775e-07, "loss": 0.86360848, "num_input_tokens_seen": 320347195, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09918213, "step": 14852, "time_per_iteration": 2.631805658340454 }, { "auxiliary_loss_clip": 0.06412725, "auxiliary_loss_mlp": 0.01261607, "balance_loss_clip": 0.06277545, "balance_loss_mlp": 0.01252916, "epoch": 0.8930106718773486, "flos": 23046021843840.0, "grad_norm": 1.3243965623830432, "language_loss": 0.69241714, "learning_rate": 1.1881565753951006e-07, "loss": 0.76916051, "num_input_tokens_seen": 320366850, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.08691406, "step": 14853, "time_per_iteration": 2.544349431991577 }, { "auxiliary_loss_clip": 0.06412432, "auxiliary_loss_mlp": 0.01263615, "balance_loss_clip": 0.06274096, "balance_loss_mlp": 0.01254251, "epoch": 0.8930707951300165, "flos": 35635378389120.0, "grad_norm": 1.544640208574078, "language_loss": 0.67147976, "learning_rate": 1.1868345598722118e-07, "loss": 0.74824023, "num_input_tokens_seen": 320388895, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09368896, "step": 14854, "time_per_iteration": 2.668121337890625 }, { "auxiliary_loss_clip": 0.06407288, "auxiliary_loss_mlp": 0.01262194, "balance_loss_clip": 0.06273283, "balance_loss_mlp": 0.01253218, "epoch": 0.8931309183826845, "flos": 23046650749440.0, "grad_norm": 1.3987292905627409, "language_loss": 0.75064802, "learning_rate": 1.1855132577464399e-07, "loss": 0.82734287, "num_input_tokens_seen": 320408520, "router_z_loss_clip": 1.34082031, "router_z_loss_mlp": 0.08972168, "step": 14855, "time_per_iteration": 2.5473318099975586 }, { "auxiliary_loss_clip": 0.06412475, "auxiliary_loss_mlp": 0.01265713, "balance_loss_clip": 0.0627527, "balance_loss_mlp": 0.01256754, "epoch": 0.8931910416353525, "flos": 26511726568320.0, "grad_norm": 2.0919403609850584, "language_loss": 0.64759338, "learning_rate": 1.1841926690678893e-07, "loss": 0.72437513, "num_input_tokens_seen": 320427400, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.08953857, "step": 14856, "time_per_iteration": 2.5872106552124023 }, { "auxiliary_loss_clip": 0.06411645, "auxiliary_loss_mlp": 0.01267191, "balance_loss_clip": 0.06273557, "balance_loss_mlp": 0.01258071, "epoch": 0.8932511648880205, "flos": 24980687164800.0, "grad_norm": 1.9661494913863982, "language_loss": 0.66599572, "learning_rate": 1.1828727938866378e-07, "loss": 0.74278408, "num_input_tokens_seen": 320447570, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09118652, "step": 14857, "time_per_iteration": 2.563026189804077 }, { "auxiliary_loss_clip": 0.06416936, "auxiliary_loss_mlp": 0.0126862, "balance_loss_clip": 0.06276464, "balance_loss_mlp": 0.01258314, "epoch": 0.8933112881406884, "flos": 24467377852800.0, "grad_norm": 2.2243592336849334, "language_loss": 0.750718, "learning_rate": 1.1815536322527408e-07, "loss": 0.82757354, "num_input_tokens_seen": 320464405, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.10308838, "step": 14858, "time_per_iteration": 2.5445401668548584 }, { "auxiliary_loss_clip": 0.06411724, "auxiliary_loss_mlp": 0.0126639, "balance_loss_clip": 0.06272718, "balance_loss_mlp": 0.01256973, "epoch": 0.8933714113933564, "flos": 28300594584960.0, "grad_norm": 1.682397366445473, "language_loss": 0.69850391, "learning_rate": 1.1802351842162139e-07, "loss": 0.77528507, "num_input_tokens_seen": 320485525, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09417725, "step": 14859, "time_per_iteration": 2.6091556549072266 }, { "auxiliary_loss_clip": 0.06405791, "auxiliary_loss_mlp": 0.01263759, "balance_loss_clip": 0.06274338, "balance_loss_mlp": 0.0125517, "epoch": 0.8934315346460243, "flos": 21441412955520.0, "grad_norm": 1.5753664896913457, "language_loss": 0.75506675, "learning_rate": 1.1789174498270526e-07, "loss": 0.83176225, "num_input_tokens_seen": 320506725, "router_z_loss_clip": 1.31445312, "router_z_loss_mlp": 0.08587646, "step": 14860, "time_per_iteration": 2.543668270111084 }, { "auxiliary_loss_clip": 0.06413645, "auxiliary_loss_mlp": 0.01267419, "balance_loss_clip": 0.06272434, "balance_loss_mlp": 0.01257358, "epoch": 0.8934916578986923, "flos": 23776475811840.0, "grad_norm": 1.6826415738939524, "language_loss": 0.57999289, "learning_rate": 1.1776004291352303e-07, "loss": 0.65680349, "num_input_tokens_seen": 320525425, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.10064697, "step": 14861, "time_per_iteration": 2.5417404174804688 }, { "auxiliary_loss_clip": 0.06407477, "auxiliary_loss_mlp": 0.01264907, "balance_loss_clip": 0.06271175, "balance_loss_mlp": 0.01255948, "epoch": 0.8935517811513602, "flos": 18922090970880.0, "grad_norm": 1.7109210864787043, "language_loss": 0.63454199, "learning_rate": 1.176284122190685e-07, "loss": 0.71126586, "num_input_tokens_seen": 320543010, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.08966064, "step": 14862, "time_per_iteration": 2.520395040512085 }, { "auxiliary_loss_clip": 0.06406504, "auxiliary_loss_mlp": 0.01263484, "balance_loss_clip": 0.06270994, "balance_loss_mlp": 0.01254793, "epoch": 0.8936119044040283, "flos": 24068280055680.0, "grad_norm": 1.7099721992439967, "language_loss": 0.78586507, "learning_rate": 1.1749685290433298e-07, "loss": 0.86256492, "num_input_tokens_seen": 320562180, "router_z_loss_clip": 1.35546875, "router_z_loss_mlp": 0.08685303, "step": 14863, "time_per_iteration": 2.558030366897583 }, { "auxiliary_loss_clip": 0.06408179, "auxiliary_loss_mlp": 0.01261697, "balance_loss_clip": 0.06272872, "balance_loss_mlp": 0.01253037, "epoch": 0.8936720276566962, "flos": 21330387895680.0, "grad_norm": 1.9261710377115953, "language_loss": 0.71283406, "learning_rate": 1.1736536497430627e-07, "loss": 0.78953284, "num_input_tokens_seen": 320580395, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.08654785, "step": 14864, "time_per_iteration": 2.5289523601531982 }, { "auxiliary_loss_clip": 0.06422874, "auxiliary_loss_mlp": 0.01268579, "balance_loss_clip": 0.06276979, "balance_loss_mlp": 0.01258541, "epoch": 0.8937321509093642, "flos": 18412093895040.0, "grad_norm": 2.308237921427394, "language_loss": 0.76559067, "learning_rate": 1.1723394843397283e-07, "loss": 0.84250516, "num_input_tokens_seen": 320599505, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.1003418, "step": 14865, "time_per_iteration": 2.5181357860565186 }, { "auxiliary_loss_clip": 0.06408655, "auxiliary_loss_mlp": 0.01265033, "balance_loss_clip": 0.06273717, "balance_loss_mlp": 0.01255711, "epoch": 0.8937922741620322, "flos": 22061344988160.0, "grad_norm": 2.109015105420492, "language_loss": 0.72209406, "learning_rate": 1.1710260328831668e-07, "loss": 0.79883099, "num_input_tokens_seen": 320619825, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.09320068, "step": 14866, "time_per_iteration": 2.572391986846924 }, { "auxiliary_loss_clip": 0.06419031, "auxiliary_loss_mlp": 0.01264657, "balance_loss_clip": 0.06275748, "balance_loss_mlp": 0.01253935, "epoch": 0.8938523974147001, "flos": 25671169935360.0, "grad_norm": 1.790653439108871, "language_loss": 0.84362888, "learning_rate": 1.1697132954231869e-07, "loss": 0.92046577, "num_input_tokens_seen": 320638515, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.10723877, "step": 14867, "time_per_iteration": 4.109760522842407 }, { "auxiliary_loss_clip": 0.0641299, "auxiliary_loss_mlp": 0.01265149, "balance_loss_clip": 0.06273459, "balance_loss_mlp": 0.01256375, "epoch": 0.8939125206673681, "flos": 25750567278720.0, "grad_norm": 1.518403158919788, "language_loss": 0.80966061, "learning_rate": 1.168401272009567e-07, "loss": 0.88644207, "num_input_tokens_seen": 320659430, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.08764648, "step": 14868, "time_per_iteration": 2.5884432792663574 }, { "auxiliary_loss_clip": 0.06413535, "auxiliary_loss_mlp": 0.01266783, "balance_loss_clip": 0.06273901, "balance_loss_mlp": 0.01256322, "epoch": 0.8939726439200361, "flos": 27351863930880.0, "grad_norm": 1.8991821640533604, "language_loss": 0.77459085, "learning_rate": 1.167089962692056e-07, "loss": 0.85139394, "num_input_tokens_seen": 320679295, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.10461426, "step": 14869, "time_per_iteration": 2.620819091796875 }, { "auxiliary_loss_clip": 0.064121, "auxiliary_loss_mlp": 0.01263122, "balance_loss_clip": 0.06274953, "balance_loss_mlp": 0.01253704, "epoch": 0.8940327671727041, "flos": 20344956353280.0, "grad_norm": 1.3819568051470146, "language_loss": 0.65848464, "learning_rate": 1.1657793675203853e-07, "loss": 0.73523688, "num_input_tokens_seen": 320697535, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09411621, "step": 14870, "time_per_iteration": 2.5137383937835693 }, { "auxiliary_loss_clip": 0.06319143, "auxiliary_loss_mlp": 0.01250266, "balance_loss_clip": 0.06263766, "balance_loss_mlp": 0.01249276, "epoch": 0.894092890425372, "flos": 58425919534080.0, "grad_norm": 0.7912160431228502, "language_loss": 0.56022429, "learning_rate": 1.1644694865442461e-07, "loss": 0.63591838, "num_input_tokens_seen": 320758635, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.0098877, "step": 14871, "time_per_iteration": 3.21124005317688 }, { "auxiliary_loss_clip": 0.06410707, "auxiliary_loss_mlp": 0.01267884, "balance_loss_clip": 0.06275375, "balance_loss_mlp": 0.0125908, "epoch": 0.89415301367804, "flos": 19835965526400.0, "grad_norm": 1.9169050811976949, "language_loss": 0.77039987, "learning_rate": 1.16316031981331e-07, "loss": 0.84718573, "num_input_tokens_seen": 320777175, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.08807373, "step": 14872, "time_per_iteration": 2.5243403911590576 }, { "auxiliary_loss_clip": 0.06410653, "auxiliary_loss_mlp": 0.01264357, "balance_loss_clip": 0.06275366, "balance_loss_mlp": 0.0125572, "epoch": 0.8942131369307079, "flos": 25782907754880.0, "grad_norm": 1.717238596641702, "language_loss": 0.66914922, "learning_rate": 1.1618518673772215e-07, "loss": 0.74589932, "num_input_tokens_seen": 320797670, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.08636475, "step": 14873, "time_per_iteration": 2.569059133529663 }, { "auxiliary_loss_clip": 0.06406611, "auxiliary_loss_mlp": 0.01266035, "balance_loss_clip": 0.06271888, "balance_loss_mlp": 0.01256254, "epoch": 0.8942732601833759, "flos": 23155747165440.0, "grad_norm": 1.4973312714034868, "language_loss": 0.5994128, "learning_rate": 1.1605441292856033e-07, "loss": 0.67613924, "num_input_tokens_seen": 320817410, "router_z_loss_clip": 1.34667969, "router_z_loss_mlp": 0.09783936, "step": 14874, "time_per_iteration": 4.033392429351807 }, { "auxiliary_loss_clip": 0.06418147, "auxiliary_loss_mlp": 0.0126403, "balance_loss_clip": 0.06277017, "balance_loss_mlp": 0.01254195, "epoch": 0.8943333834360438, "flos": 27863034963840.0, "grad_norm": 2.001133231469623, "language_loss": 0.75584245, "learning_rate": 1.1592371055880356e-07, "loss": 0.83266425, "num_input_tokens_seen": 320836745, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09832764, "step": 14875, "time_per_iteration": 2.579148530960083 }, { "auxiliary_loss_clip": 0.06424637, "auxiliary_loss_mlp": 0.01264045, "balance_loss_clip": 0.06278779, "balance_loss_mlp": 0.01253472, "epoch": 0.8943935066887119, "flos": 22170525258240.0, "grad_norm": 2.3953546119929805, "language_loss": 0.7745297, "learning_rate": 1.1579307963340857e-07, "loss": 0.85141653, "num_input_tokens_seen": 320853305, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10565186, "step": 14876, "time_per_iteration": 2.531466245651245 }, { "auxiliary_loss_clip": 0.06412511, "auxiliary_loss_mlp": 0.01264293, "balance_loss_clip": 0.06273719, "balance_loss_mlp": 0.01254584, "epoch": 0.8944536299413798, "flos": 21476394835200.0, "grad_norm": 1.9496128906880914, "language_loss": 0.78691119, "learning_rate": 1.156625201573287e-07, "loss": 0.86367917, "num_input_tokens_seen": 320872885, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09710693, "step": 14877, "time_per_iteration": 2.519996404647827 }, { "auxiliary_loss_clip": 0.0641048, "auxiliary_loss_mlp": 0.01264829, "balance_loss_clip": 0.0627225, "balance_loss_mlp": 0.01255113, "epoch": 0.8945137531940478, "flos": 17754538579200.0, "grad_norm": 2.1211648739334485, "language_loss": 0.75278544, "learning_rate": 1.155320321355151e-07, "loss": 0.82953858, "num_input_tokens_seen": 320889755, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.097229, "step": 14878, "time_per_iteration": 2.494795322418213 }, { "auxiliary_loss_clip": 0.06416987, "auxiliary_loss_mlp": 0.0126575, "balance_loss_clip": 0.06275444, "balance_loss_mlp": 0.01256016, "epoch": 0.8945738764467158, "flos": 21148644389760.0, "grad_norm": 2.1628071908083433, "language_loss": 0.76204097, "learning_rate": 1.1540161557291539e-07, "loss": 0.83886832, "num_input_tokens_seen": 320907860, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09735107, "step": 14879, "time_per_iteration": 2.5180270671844482 }, { "auxiliary_loss_clip": 0.06415313, "auxiliary_loss_mlp": 0.01264264, "balance_loss_clip": 0.06277157, "balance_loss_mlp": 0.01255055, "epoch": 0.8946339996993837, "flos": 14908304689920.0, "grad_norm": 3.244217662295977, "language_loss": 0.74798566, "learning_rate": 1.1527127047447538e-07, "loss": 0.82478142, "num_input_tokens_seen": 320925825, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09216309, "step": 14880, "time_per_iteration": 2.51415753364563 }, { "auxiliary_loss_clip": 0.06411755, "auxiliary_loss_mlp": 0.01262214, "balance_loss_clip": 0.0627263, "balance_loss_mlp": 0.01253113, "epoch": 0.8946941229520518, "flos": 27389738776320.0, "grad_norm": 1.5953414467010032, "language_loss": 0.83117133, "learning_rate": 1.1514099684513822e-07, "loss": 0.90791106, "num_input_tokens_seen": 320946165, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09112549, "step": 14881, "time_per_iteration": 2.5736615657806396 }, { "auxiliary_loss_clip": 0.06410778, "auxiliary_loss_mlp": 0.01261394, "balance_loss_clip": 0.06274538, "balance_loss_mlp": 0.01252769, "epoch": 0.8947542462047197, "flos": 31804467644160.0, "grad_norm": 1.5992493394434164, "language_loss": 0.67517996, "learning_rate": 1.1501079468984287e-07, "loss": 0.75190175, "num_input_tokens_seen": 320969330, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.08624268, "step": 14882, "time_per_iteration": 4.05056095123291 }, { "auxiliary_loss_clip": 0.06424902, "auxiliary_loss_mlp": 0.01267575, "balance_loss_clip": 0.06280579, "balance_loss_mlp": 0.01257347, "epoch": 0.8948143694573877, "flos": 20889390257280.0, "grad_norm": 2.1296704580829386, "language_loss": 0.75676429, "learning_rate": 1.1488066401352691e-07, "loss": 0.83368909, "num_input_tokens_seen": 320985055, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10229492, "step": 14883, "time_per_iteration": 2.5066094398498535 }, { "auxiliary_loss_clip": 0.06406975, "auxiliary_loss_mlp": 0.01266854, "balance_loss_clip": 0.06273098, "balance_loss_mlp": 0.01257568, "epoch": 0.8948744927100556, "flos": 28222287344640.0, "grad_norm": 1.4584433185986614, "language_loss": 0.72766995, "learning_rate": 1.147506048211253e-07, "loss": 0.80440819, "num_input_tokens_seen": 321004720, "router_z_loss_clip": 1.33984375, "router_z_loss_mlp": 0.09283447, "step": 14884, "time_per_iteration": 2.575214147567749 }, { "auxiliary_loss_clip": 0.06411894, "auxiliary_loss_mlp": 0.01266448, "balance_loss_clip": 0.06274694, "balance_loss_mlp": 0.01257669, "epoch": 0.8949346159627236, "flos": 21908210451840.0, "grad_norm": 1.8610691580865, "language_loss": 0.75944507, "learning_rate": 1.1462061711756987e-07, "loss": 0.83622849, "num_input_tokens_seen": 321022350, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.08776855, "step": 14885, "time_per_iteration": 2.534428834915161 }, { "auxiliary_loss_clip": 0.06415835, "auxiliary_loss_mlp": 0.01266876, "balance_loss_clip": 0.06272096, "balance_loss_mlp": 0.01257375, "epoch": 0.8949947392153915, "flos": 21365202067200.0, "grad_norm": 1.696957138615415, "language_loss": 0.82224089, "learning_rate": 1.1449070090778911e-07, "loss": 0.899068, "num_input_tokens_seen": 321040450, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.09503174, "step": 14886, "time_per_iteration": 2.523308515548706 }, { "auxiliary_loss_clip": 0.06414489, "auxiliary_loss_mlp": 0.01263904, "balance_loss_clip": 0.06274942, "balance_loss_mlp": 0.01254582, "epoch": 0.8950548624680595, "flos": 52456672120320.0, "grad_norm": 1.7007775722966452, "language_loss": 0.63955152, "learning_rate": 1.1436085619671043e-07, "loss": 0.71633542, "num_input_tokens_seen": 321063970, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09326172, "step": 14887, "time_per_iteration": 2.824730157852173 }, { "auxiliary_loss_clip": 0.06417595, "auxiliary_loss_mlp": 0.01265485, "balance_loss_clip": 0.06275705, "balance_loss_mlp": 0.01255889, "epoch": 0.8951149857207275, "flos": 20127643989120.0, "grad_norm": 1.7100943324849043, "language_loss": 0.61304235, "learning_rate": 1.1423108298925698e-07, "loss": 0.68987322, "num_input_tokens_seen": 321083840, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09594727, "step": 14888, "time_per_iteration": 2.52483868598938 }, { "auxiliary_loss_clip": 0.06416602, "auxiliary_loss_mlp": 0.01264273, "balance_loss_clip": 0.06275333, "balance_loss_mlp": 0.01255011, "epoch": 0.8951751089733955, "flos": 29870515082880.0, "grad_norm": 3.246154666114325, "language_loss": 0.70080578, "learning_rate": 1.1410138129034952e-07, "loss": 0.77761453, "num_input_tokens_seen": 321104165, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09259033, "step": 14889, "time_per_iteration": 3.98899245262146 }, { "auxiliary_loss_clip": 0.06416297, "auxiliary_loss_mlp": 0.01269103, "balance_loss_clip": 0.06276122, "balance_loss_mlp": 0.01259156, "epoch": 0.8952352322260634, "flos": 15267305508480.0, "grad_norm": 2.388935692371419, "language_loss": 0.71179748, "learning_rate": 1.1397175110490676e-07, "loss": 0.78865147, "num_input_tokens_seen": 321117290, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09942627, "step": 14890, "time_per_iteration": 2.4767863750457764 }, { "auxiliary_loss_clip": 0.06410623, "auxiliary_loss_mlp": 0.01261522, "balance_loss_clip": 0.06270088, "balance_loss_mlp": 0.01252021, "epoch": 0.8952953554787314, "flos": 26805794872320.0, "grad_norm": 1.5236790657569512, "language_loss": 0.75860399, "learning_rate": 1.1384219243784454e-07, "loss": 0.83532548, "num_input_tokens_seen": 321137115, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09503174, "step": 14891, "time_per_iteration": 2.572124481201172 }, { "auxiliary_loss_clip": 0.06418653, "auxiliary_loss_mlp": 0.01266239, "balance_loss_clip": 0.062757, "balance_loss_mlp": 0.01256375, "epoch": 0.8953554787313994, "flos": 14142449571840.0, "grad_norm": 1.6714998907969658, "language_loss": 0.76604974, "learning_rate": 1.1371270529407517e-07, "loss": 0.84289867, "num_input_tokens_seen": 321154490, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.09863281, "step": 14892, "time_per_iteration": 2.4869489669799805 }, { "auxiliary_loss_clip": 0.06412923, "auxiliary_loss_mlp": 0.01265316, "balance_loss_clip": 0.06273413, "balance_loss_mlp": 0.01255899, "epoch": 0.8954156019840673, "flos": 25710512227200.0, "grad_norm": 1.3338455707793353, "language_loss": 0.81516612, "learning_rate": 1.1358328967850895e-07, "loss": 0.89194846, "num_input_tokens_seen": 321175625, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09417725, "step": 14893, "time_per_iteration": 2.564918279647827 }, { "auxiliary_loss_clip": 0.0641092, "auxiliary_loss_mlp": 0.01264283, "balance_loss_clip": 0.06274832, "balance_loss_mlp": 0.0125539, "epoch": 0.8954757252367354, "flos": 21914415653760.0, "grad_norm": 1.6740706002572756, "language_loss": 0.74943924, "learning_rate": 1.1345394559605348e-07, "loss": 0.82619131, "num_input_tokens_seen": 321193895, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.08892822, "step": 14894, "time_per_iteration": 2.534693717956543 }, { "auxiliary_loss_clip": 0.06420138, "auxiliary_loss_mlp": 0.01264271, "balance_loss_clip": 0.06278978, "balance_loss_mlp": 0.01254001, "epoch": 0.8955358484894033, "flos": 12975568012800.0, "grad_norm": 2.216448615903385, "language_loss": 0.67015177, "learning_rate": 1.1332467305161352e-07, "loss": 0.74699587, "num_input_tokens_seen": 321211610, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.1026001, "step": 14895, "time_per_iteration": 2.500796318054199 }, { "auxiliary_loss_clip": 0.06417021, "auxiliary_loss_mlp": 0.01265112, "balance_loss_clip": 0.06274299, "balance_loss_mlp": 0.0125427, "epoch": 0.8955959717420713, "flos": 17279565310080.0, "grad_norm": 1.4874350583126108, "language_loss": 0.6750685, "learning_rate": 1.1319547205009094e-07, "loss": 0.75188982, "num_input_tokens_seen": 321229805, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10852051, "step": 14896, "time_per_iteration": 2.5344784259796143 }, { "auxiliary_loss_clip": 0.06416354, "auxiliary_loss_mlp": 0.01267233, "balance_loss_clip": 0.06277544, "balance_loss_mlp": 0.01257876, "epoch": 0.8956560949947392, "flos": 14799208273920.0, "grad_norm": 1.7720596055393663, "language_loss": 0.75964111, "learning_rate": 1.1306634259638492e-07, "loss": 0.83647698, "num_input_tokens_seen": 321247165, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09362793, "step": 14897, "time_per_iteration": 2.5148813724517822 }, { "auxiliary_loss_clip": 0.06315793, "auxiliary_loss_mlp": 0.01251146, "balance_loss_clip": 0.06260616, "balance_loss_mlp": 0.01250091, "epoch": 0.8957162182474072, "flos": 63626754280320.0, "grad_norm": 0.7368110284974887, "language_loss": 0.55219352, "learning_rate": 1.129372846953931e-07, "loss": 0.62786287, "num_input_tokens_seen": 321308425, "router_z_loss_clip": 0.55175781, "router_z_loss_mlp": 0.01055908, "step": 14898, "time_per_iteration": 3.1747565269470215 }, { "auxiliary_loss_clip": 0.06412986, "auxiliary_loss_mlp": 0.01265206, "balance_loss_clip": 0.06272361, "balance_loss_mlp": 0.01255133, "epoch": 0.8957763415000751, "flos": 25016884928640.0, "grad_norm": 1.7759342818771344, "language_loss": 0.70435637, "learning_rate": 1.12808298352008e-07, "loss": 0.7811383, "num_input_tokens_seen": 321329295, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.10076904, "step": 14899, "time_per_iteration": 2.691188335418701 }, { "auxiliary_loss_clip": 0.06416048, "auxiliary_loss_mlp": 0.01263734, "balance_loss_clip": 0.06275147, "balance_loss_mlp": 0.01253237, "epoch": 0.8958364647527431, "flos": 19834749642240.0, "grad_norm": 1.6195912698211823, "language_loss": 0.73789275, "learning_rate": 1.1267938357112106e-07, "loss": 0.81469059, "num_input_tokens_seen": 321347580, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.1050415, "step": 14900, "time_per_iteration": 2.5503342151641846 }, { "auxiliary_loss_clip": 0.06319854, "auxiliary_loss_mlp": 0.01252898, "balance_loss_clip": 0.06264697, "balance_loss_mlp": 0.01251784, "epoch": 0.895896588005411, "flos": 65555717523840.0, "grad_norm": 0.7451214205954911, "language_loss": 0.61593109, "learning_rate": 1.1255054035762124e-07, "loss": 0.69165862, "num_input_tokens_seen": 321407820, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01115417, "step": 14901, "time_per_iteration": 3.1668105125427246 }, { "auxiliary_loss_clip": 0.06415384, "auxiliary_loss_mlp": 0.0126936, "balance_loss_clip": 0.06274675, "balance_loss_mlp": 0.01259782, "epoch": 0.8959567112580791, "flos": 25597726231680.0, "grad_norm": 1.578642000376664, "language_loss": 0.70254236, "learning_rate": 1.1242176871639441e-07, "loss": 0.7793898, "num_input_tokens_seen": 321426745, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09576416, "step": 14902, "time_per_iteration": 2.6051979064941406 }, { "auxiliary_loss_clip": 0.06408593, "auxiliary_loss_mlp": 0.01264993, "balance_loss_clip": 0.06274048, "balance_loss_mlp": 0.01256285, "epoch": 0.896016834510747, "flos": 24207788304000.0, "grad_norm": 1.610191650503202, "language_loss": 0.78038794, "learning_rate": 1.1229306865232313e-07, "loss": 0.85712379, "num_input_tokens_seen": 321446165, "router_z_loss_clip": 1.34570312, "router_z_loss_mlp": 0.0871582, "step": 14903, "time_per_iteration": 2.584703207015991 }, { "auxiliary_loss_clip": 0.06417722, "auxiliary_loss_mlp": 0.01265258, "balance_loss_clip": 0.06275527, "balance_loss_mlp": 0.01255459, "epoch": 0.896076957763415, "flos": 23082638878080.0, "grad_norm": 1.7067474711930333, "language_loss": 0.73189378, "learning_rate": 1.121644401702877e-07, "loss": 0.80872357, "num_input_tokens_seen": 321465285, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09802246, "step": 14904, "time_per_iteration": 2.5525240898132324 }, { "auxiliary_loss_clip": 0.06414288, "auxiliary_loss_mlp": 0.01264011, "balance_loss_clip": 0.06273305, "balance_loss_mlp": 0.01253532, "epoch": 0.8961370810160829, "flos": 22243130421120.0, "grad_norm": 1.891882223189114, "language_loss": 0.75277823, "learning_rate": 1.12035883275166e-07, "loss": 0.82956117, "num_input_tokens_seen": 321483670, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.10473633, "step": 14905, "time_per_iteration": 2.538797616958618 }, { "auxiliary_loss_clip": 0.0641045, "auxiliary_loss_mlp": 0.01265927, "balance_loss_clip": 0.06273882, "balance_loss_mlp": 0.01256688, "epoch": 0.8961972042687509, "flos": 23078404247040.0, "grad_norm": 1.6211962659534034, "language_loss": 0.76793504, "learning_rate": 1.1190739797183279e-07, "loss": 0.84469879, "num_input_tokens_seen": 321501190, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09234619, "step": 14906, "time_per_iteration": 2.5458717346191406 }, { "auxiliary_loss_clip": 0.06419182, "auxiliary_loss_mlp": 0.01265687, "balance_loss_clip": 0.06279071, "balance_loss_mlp": 0.01256108, "epoch": 0.896257327521419, "flos": 18191595075840.0, "grad_norm": 1.6575783019001793, "language_loss": 0.74272281, "learning_rate": 1.1177898426515996e-07, "loss": 0.8195715, "num_input_tokens_seen": 321518540, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.0958252, "step": 14907, "time_per_iteration": 3.955207586288452 }, { "auxiliary_loss_clip": 0.06410654, "auxiliary_loss_mlp": 0.01268317, "balance_loss_clip": 0.06274556, "balance_loss_mlp": 0.01258518, "epoch": 0.8963174507740869, "flos": 17901384059520.0, "grad_norm": 1.9957951301090306, "language_loss": 0.8298378, "learning_rate": 1.1165064216001785e-07, "loss": 0.90662754, "num_input_tokens_seen": 321536555, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.09796143, "step": 14908, "time_per_iteration": 2.4888908863067627 }, { "auxiliary_loss_clip": 0.06417468, "auxiliary_loss_mlp": 0.01268036, "balance_loss_clip": 0.06274872, "balance_loss_mlp": 0.01257486, "epoch": 0.8963775740267549, "flos": 21038541724800.0, "grad_norm": 2.8870490280147107, "language_loss": 0.70904863, "learning_rate": 1.1152237166127232e-07, "loss": 0.78590369, "num_input_tokens_seen": 321557655, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10552979, "step": 14909, "time_per_iteration": 2.5266356468200684 }, { "auxiliary_loss_clip": 0.06412958, "auxiliary_loss_mlp": 0.01267852, "balance_loss_clip": 0.06272543, "balance_loss_mlp": 0.01257796, "epoch": 0.8964376972794228, "flos": 23185362384000.0, "grad_norm": 1.9560690753506051, "language_loss": 0.72165471, "learning_rate": 1.113941727737877e-07, "loss": 0.79846281, "num_input_tokens_seen": 321576160, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.10064697, "step": 14910, "time_per_iteration": 2.521967649459839 }, { "auxiliary_loss_clip": 0.06411484, "auxiliary_loss_mlp": 0.01264063, "balance_loss_clip": 0.06274058, "balance_loss_mlp": 0.01255027, "epoch": 0.8964978205320908, "flos": 24979974405120.0, "grad_norm": 4.631983930227519, "language_loss": 0.62782848, "learning_rate": 1.1126604550242502e-07, "loss": 0.70458388, "num_input_tokens_seen": 321596205, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09039307, "step": 14911, "time_per_iteration": 2.5459790229797363 }, { "auxiliary_loss_clip": 0.06420298, "auxiliary_loss_mlp": 0.01264331, "balance_loss_clip": 0.06278844, "balance_loss_mlp": 0.01254842, "epoch": 0.8965579437847587, "flos": 19178074794240.0, "grad_norm": 1.574273153630243, "language_loss": 0.75049853, "learning_rate": 1.111379898520437e-07, "loss": 0.82734478, "num_input_tokens_seen": 321614800, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.0947876, "step": 14912, "time_per_iteration": 2.5075433254241943 }, { "auxiliary_loss_clip": 0.06411813, "auxiliary_loss_mlp": 0.01264847, "balance_loss_clip": 0.06273393, "balance_loss_mlp": 0.01255692, "epoch": 0.8966180670374267, "flos": 24283034870400.0, "grad_norm": 1.6237634238947563, "language_loss": 0.81881505, "learning_rate": 1.1101000582749876e-07, "loss": 0.89558172, "num_input_tokens_seen": 321633445, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09161377, "step": 14913, "time_per_iteration": 2.5645341873168945 }, { "auxiliary_loss_clip": 0.06416014, "auxiliary_loss_mlp": 0.01270977, "balance_loss_clip": 0.06275305, "balance_loss_mlp": 0.01260534, "epoch": 0.8966781902900947, "flos": 13558296032640.0, "grad_norm": 2.84398243265853, "language_loss": 0.61802918, "learning_rate": 1.1088209343364407e-07, "loss": 0.69489908, "num_input_tokens_seen": 321650890, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.10443115, "step": 14914, "time_per_iteration": 3.946341037750244 }, { "auxiliary_loss_clip": 0.06317978, "auxiliary_loss_mlp": 0.0125102, "balance_loss_clip": 0.06262735, "balance_loss_mlp": 0.0125004, "epoch": 0.8967383135427627, "flos": 65085104666880.0, "grad_norm": 0.6989218149211293, "language_loss": 0.54965699, "learning_rate": 1.1075425267532956e-07, "loss": 0.62534696, "num_input_tokens_seen": 321710960, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00979614, "step": 14915, "time_per_iteration": 3.1584465503692627 }, { "auxiliary_loss_clip": 0.06407009, "auxiliary_loss_mlp": 0.01262338, "balance_loss_clip": 0.06271991, "balance_loss_mlp": 0.01253517, "epoch": 0.8967984367954306, "flos": 29720273512320.0, "grad_norm": 1.3955871879422341, "language_loss": 0.72010422, "learning_rate": 1.1062648355740289e-07, "loss": 0.79679769, "num_input_tokens_seen": 321733290, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.08831787, "step": 14916, "time_per_iteration": 2.5901169776916504 }, { "auxiliary_loss_clip": 0.06412317, "auxiliary_loss_mlp": 0.01263276, "balance_loss_clip": 0.0627439, "balance_loss_mlp": 0.01253906, "epoch": 0.8968585600480986, "flos": 25709547905280.0, "grad_norm": 1.7443848003320312, "language_loss": 0.7792511, "learning_rate": 1.1049878608470931e-07, "loss": 0.85600698, "num_input_tokens_seen": 321753120, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09375, "step": 14917, "time_per_iteration": 2.562532424926758 }, { "auxiliary_loss_clip": 0.06421819, "auxiliary_loss_mlp": 0.01266885, "balance_loss_clip": 0.06277719, "balance_loss_mlp": 0.01256222, "epoch": 0.8969186833007665, "flos": 30052552078080.0, "grad_norm": 2.0653713700338536, "language_loss": 0.6848979, "learning_rate": 1.1037116026209137e-07, "loss": 0.76178497, "num_input_tokens_seen": 321772840, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10668945, "step": 14918, "time_per_iteration": 2.596221923828125 }, { "auxiliary_loss_clip": 0.06417768, "auxiliary_loss_mlp": 0.01264225, "balance_loss_clip": 0.06276181, "balance_loss_mlp": 0.0125529, "epoch": 0.8969788065534345, "flos": 22824390994560.0, "grad_norm": 3.0088656454617895, "language_loss": 0.83744967, "learning_rate": 1.102436060943881e-07, "loss": 0.91426957, "num_input_tokens_seen": 321791020, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.08929443, "step": 14919, "time_per_iteration": 2.54140305519104 }, { "auxiliary_loss_clip": 0.06418268, "auxiliary_loss_mlp": 0.01269909, "balance_loss_clip": 0.06275918, "balance_loss_mlp": 0.01260087, "epoch": 0.8970389298061026, "flos": 13266575642880.0, "grad_norm": 3.2213456332294377, "language_loss": 0.72247255, "learning_rate": 1.1011612358643696e-07, "loss": 0.79935431, "num_input_tokens_seen": 321810075, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.0982666, "step": 14920, "time_per_iteration": 2.5255894660949707 }, { "auxiliary_loss_clip": 0.06414425, "auxiliary_loss_mlp": 0.0126394, "balance_loss_clip": 0.06276164, "balance_loss_mlp": 0.01253945, "epoch": 0.8970990530587705, "flos": 10270058256000.0, "grad_norm": 2.567271033763957, "language_loss": 0.90639651, "learning_rate": 1.0998871274307164e-07, "loss": 0.98318011, "num_input_tokens_seen": 321822635, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09991455, "step": 14921, "time_per_iteration": 2.463639259338379 }, { "auxiliary_loss_clip": 0.06416261, "auxiliary_loss_mlp": 0.01264715, "balance_loss_clip": 0.06272855, "balance_loss_mlp": 0.01254773, "epoch": 0.8971591763114385, "flos": 20308884370560.0, "grad_norm": 1.7565245912920548, "language_loss": 0.74014419, "learning_rate": 1.0986137356912384e-07, "loss": 0.8169539, "num_input_tokens_seen": 321841130, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09942627, "step": 14922, "time_per_iteration": 3.990349054336548 }, { "auxiliary_loss_clip": 0.06412384, "auxiliary_loss_mlp": 0.01262895, "balance_loss_clip": 0.06272982, "balance_loss_mlp": 0.01253287, "epoch": 0.8972192995641064, "flos": 23263543843200.0, "grad_norm": 1.7531646785307549, "language_loss": 0.70696461, "learning_rate": 1.097341060694219e-07, "loss": 0.78371739, "num_input_tokens_seen": 321859855, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09606934, "step": 14923, "time_per_iteration": 2.541083574295044 }, { "auxiliary_loss_clip": 0.06419104, "auxiliary_loss_mlp": 0.01266759, "balance_loss_clip": 0.06275119, "balance_loss_mlp": 0.01256071, "epoch": 0.8972794228167744, "flos": 18375560714880.0, "grad_norm": 1.8610668127428185, "language_loss": 0.70794964, "learning_rate": 1.0960691024879221e-07, "loss": 0.78480828, "num_input_tokens_seen": 321877990, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10687256, "step": 14924, "time_per_iteration": 2.511324882507324 }, { "auxiliary_loss_clip": 0.06414127, "auxiliary_loss_mlp": 0.01261853, "balance_loss_clip": 0.06273942, "balance_loss_mlp": 0.01253622, "epoch": 0.8973395460694423, "flos": 23958974004480.0, "grad_norm": 1.5396810445065048, "language_loss": 0.72177631, "learning_rate": 1.0947978611205844e-07, "loss": 0.79853612, "num_input_tokens_seen": 321898120, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.08233643, "step": 14925, "time_per_iteration": 2.5542426109313965 }, { "auxiliary_loss_clip": 0.06415868, "auxiliary_loss_mlp": 0.01267108, "balance_loss_clip": 0.06274227, "balance_loss_mlp": 0.01257297, "epoch": 0.8973996693221103, "flos": 24977458782720.0, "grad_norm": 1.6756322622149795, "language_loss": 0.82390618, "learning_rate": 1.0935273366404008e-07, "loss": 0.90073597, "num_input_tokens_seen": 321918140, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09814453, "step": 14926, "time_per_iteration": 2.5656394958496094 }, { "auxiliary_loss_clip": 0.06416063, "auxiliary_loss_mlp": 0.01263868, "balance_loss_clip": 0.06276078, "balance_loss_mlp": 0.01254397, "epoch": 0.8974597925747783, "flos": 25745997231360.0, "grad_norm": 1.4892875222005406, "language_loss": 0.7904942, "learning_rate": 1.092257529095555e-07, "loss": 0.86729348, "num_input_tokens_seen": 321938580, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09466553, "step": 14927, "time_per_iteration": 2.568970203399658 }, { "auxiliary_loss_clip": 0.06413601, "auxiliary_loss_mlp": 0.01264085, "balance_loss_clip": 0.06275581, "balance_loss_mlp": 0.01255138, "epoch": 0.8975199158274463, "flos": 38081172816000.0, "grad_norm": 1.5496384329500235, "language_loss": 0.6647774, "learning_rate": 1.0909884385341994e-07, "loss": 0.74155426, "num_input_tokens_seen": 321961135, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.08947754, "step": 14928, "time_per_iteration": 4.120391368865967 }, { "auxiliary_loss_clip": 0.06420341, "auxiliary_loss_mlp": 0.01267881, "balance_loss_clip": 0.062773, "balance_loss_mlp": 0.01256526, "epoch": 0.8975800390801142, "flos": 25418875691520.0, "grad_norm": 1.786196676061143, "language_loss": 0.71428132, "learning_rate": 1.0897200650044602e-07, "loss": 0.79116356, "num_input_tokens_seen": 321980945, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.11340332, "step": 14929, "time_per_iteration": 2.579129457473755 }, { "auxiliary_loss_clip": 0.06413467, "auxiliary_loss_mlp": 0.01264944, "balance_loss_clip": 0.06274109, "balance_loss_mlp": 0.01255592, "epoch": 0.8976401623327822, "flos": 21765599602560.0, "grad_norm": 1.8108442311149446, "language_loss": 0.6835041, "learning_rate": 1.0884524085544256e-07, "loss": 0.76028824, "num_input_tokens_seen": 322000350, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09350586, "step": 14930, "time_per_iteration": 2.534350872039795 }, { "auxiliary_loss_clip": 0.06410623, "auxiliary_loss_mlp": 0.01265364, "balance_loss_clip": 0.06273289, "balance_loss_mlp": 0.01256245, "epoch": 0.8977002855854501, "flos": 13850519546880.0, "grad_norm": 1.7108809179796967, "language_loss": 0.75122845, "learning_rate": 1.0871854692321769e-07, "loss": 0.82798839, "num_input_tokens_seen": 322018980, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09118652, "step": 14931, "time_per_iteration": 2.54413104057312 }, { "auxiliary_loss_clip": 0.06411928, "auxiliary_loss_mlp": 0.01269636, "balance_loss_clip": 0.06273949, "balance_loss_mlp": 0.01260654, "epoch": 0.8977604088381181, "flos": 19433639347200.0, "grad_norm": 1.6780620363072705, "language_loss": 0.629924, "learning_rate": 1.0859192470857492e-07, "loss": 0.70673966, "num_input_tokens_seen": 322037675, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.08984375, "step": 14932, "time_per_iteration": 2.5228497982025146 }, { "auxiliary_loss_clip": 0.06407019, "auxiliary_loss_mlp": 0.01266362, "balance_loss_clip": 0.06274487, "balance_loss_mlp": 0.01257874, "epoch": 0.8978205320907862, "flos": 22747802762880.0, "grad_norm": 1.8346266553877006, "language_loss": 0.72044778, "learning_rate": 1.0846537421631552e-07, "loss": 0.79718161, "num_input_tokens_seen": 322055130, "router_z_loss_clip": 1.32617188, "router_z_loss_mlp": 0.08483887, "step": 14933, "time_per_iteration": 2.5479252338409424 }, { "auxiliary_loss_clip": 0.06420861, "auxiliary_loss_mlp": 0.01267439, "balance_loss_clip": 0.06277229, "balance_loss_mlp": 0.0125705, "epoch": 0.8978806553434541, "flos": 21366837221760.0, "grad_norm": 2.0534976074513067, "language_loss": 0.74465269, "learning_rate": 1.0833889545123898e-07, "loss": 0.82153571, "num_input_tokens_seen": 322074850, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.1038208, "step": 14934, "time_per_iteration": 2.582674741744995 }, { "auxiliary_loss_clip": 0.06413917, "auxiliary_loss_mlp": 0.01266458, "balance_loss_clip": 0.06275883, "balance_loss_mlp": 0.01257207, "epoch": 0.8979407785961221, "flos": 20930661192960.0, "grad_norm": 1.6404347160583905, "language_loss": 0.6058917, "learning_rate": 1.0821248841814123e-07, "loss": 0.68269551, "num_input_tokens_seen": 322093315, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09246826, "step": 14935, "time_per_iteration": 2.5405733585357666 }, { "auxiliary_loss_clip": 0.06406631, "auxiliary_loss_mlp": 0.01262019, "balance_loss_clip": 0.0627192, "balance_loss_mlp": 0.01253096, "epoch": 0.89800090184879, "flos": 25236042082560.0, "grad_norm": 1.975120488177689, "language_loss": 0.76775682, "learning_rate": 1.0808615312181512e-07, "loss": 0.84444332, "num_input_tokens_seen": 322112555, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.0892334, "step": 14936, "time_per_iteration": 2.5536916255950928 }, { "auxiliary_loss_clip": 0.06413231, "auxiliary_loss_mlp": 0.01263261, "balance_loss_clip": 0.06274465, "balance_loss_mlp": 0.01253718, "epoch": 0.898061025101458, "flos": 22568868368640.0, "grad_norm": 1.636554359958915, "language_loss": 0.74119818, "learning_rate": 1.0795988956705193e-07, "loss": 0.81796312, "num_input_tokens_seen": 322130440, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09539795, "step": 14937, "time_per_iteration": 2.5293798446655273 }, { "auxiliary_loss_clip": 0.06318127, "auxiliary_loss_mlp": 0.01251079, "balance_loss_clip": 0.06262662, "balance_loss_mlp": 0.01250095, "epoch": 0.8981211483541259, "flos": 56208799699200.0, "grad_norm": 0.8285213942031279, "language_loss": 0.63570827, "learning_rate": 1.0783369775863915e-07, "loss": 0.71140033, "num_input_tokens_seen": 322187295, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00982666, "step": 14938, "time_per_iteration": 3.049588918685913 }, { "auxiliary_loss_clip": 0.06410474, "auxiliary_loss_mlp": 0.01263271, "balance_loss_clip": 0.06275432, "balance_loss_mlp": 0.01253896, "epoch": 0.898181271606794, "flos": 16397234616960.0, "grad_norm": 1.98495938192708, "language_loss": 0.803716, "learning_rate": 1.0770757770136251e-07, "loss": 0.88045347, "num_input_tokens_seen": 322202965, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.09375, "step": 14939, "time_per_iteration": 2.4884607791900635 }, { "auxiliary_loss_clip": 0.06317405, "auxiliary_loss_mlp": 0.01251024, "balance_loss_clip": 0.06262262, "balance_loss_mlp": 0.01249984, "epoch": 0.8982413948594619, "flos": 63461655809280.0, "grad_norm": 0.7108745278073848, "language_loss": 0.52897525, "learning_rate": 1.0758152940000375e-07, "loss": 0.60465956, "num_input_tokens_seen": 322269490, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01040649, "step": 14940, "time_per_iteration": 3.292224645614624 }, { "auxiliary_loss_clip": 0.06415562, "auxiliary_loss_mlp": 0.01266771, "balance_loss_clip": 0.06275027, "balance_loss_mlp": 0.01256423, "epoch": 0.8983015181121299, "flos": 21841810490880.0, "grad_norm": 1.8324890923277217, "language_loss": 0.77792281, "learning_rate": 1.0745555285934327e-07, "loss": 0.8547461, "num_input_tokens_seen": 322288060, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.10351562, "step": 14941, "time_per_iteration": 2.526577949523926 }, { "auxiliary_loss_clip": 0.06415282, "auxiliary_loss_mlp": 0.01267997, "balance_loss_clip": 0.06274255, "balance_loss_mlp": 0.01258001, "epoch": 0.8983616413647978, "flos": 28957604849280.0, "grad_norm": 2.013773649373288, "language_loss": 0.74061257, "learning_rate": 1.0732964808415834e-07, "loss": 0.8174454, "num_input_tokens_seen": 322307930, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09997559, "step": 14942, "time_per_iteration": 2.5953073501586914 }, { "auxiliary_loss_clip": 0.06415249, "auxiliary_loss_mlp": 0.01266092, "balance_loss_clip": 0.06274308, "balance_loss_mlp": 0.0125609, "epoch": 0.8984217646174658, "flos": 17790820197120.0, "grad_norm": 2.120406364006468, "language_loss": 0.80284053, "learning_rate": 1.0720381507922205e-07, "loss": 0.87965393, "num_input_tokens_seen": 322326155, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09997559, "step": 14943, "time_per_iteration": 2.5020201206207275 }, { "auxiliary_loss_clip": 0.06417426, "auxiliary_loss_mlp": 0.01269938, "balance_loss_clip": 0.06275095, "balance_loss_mlp": 0.01259686, "epoch": 0.8984818878701337, "flos": 23411311718400.0, "grad_norm": 2.3402166205905055, "language_loss": 0.71337545, "learning_rate": 1.0707805384930701e-07, "loss": 0.79024911, "num_input_tokens_seen": 322345850, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.10253906, "step": 14944, "time_per_iteration": 2.5526723861694336 }, { "auxiliary_loss_clip": 0.06420342, "auxiliary_loss_mlp": 0.01266552, "balance_loss_clip": 0.06275094, "balance_loss_mlp": 0.01256062, "epoch": 0.8985420111228017, "flos": 22352604180480.0, "grad_norm": 1.9214810895904444, "language_loss": 0.75913846, "learning_rate": 1.0695236439918187e-07, "loss": 0.83600736, "num_input_tokens_seen": 322364715, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10498047, "step": 14945, "time_per_iteration": 2.5313384532928467 }, { "auxiliary_loss_clip": 0.06427765, "auxiliary_loss_mlp": 0.01267991, "balance_loss_clip": 0.06279199, "balance_loss_mlp": 0.01257263, "epoch": 0.8986021343754698, "flos": 21398381084160.0, "grad_norm": 1.9335642267868745, "language_loss": 0.73342192, "learning_rate": 1.0682674673361302e-07, "loss": 0.81037951, "num_input_tokens_seen": 322383570, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.10717773, "step": 14946, "time_per_iteration": 2.54296875 }, { "auxiliary_loss_clip": 0.06415315, "auxiliary_loss_mlp": 0.012639, "balance_loss_clip": 0.06275265, "balance_loss_mlp": 0.01253612, "epoch": 0.8986622576281377, "flos": 21331897269120.0, "grad_norm": 2.041855834428078, "language_loss": 0.6477223, "learning_rate": 1.0670120085736334e-07, "loss": 0.72451442, "num_input_tokens_seen": 322401375, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.10284424, "step": 14947, "time_per_iteration": 4.057248830795288 }, { "auxiliary_loss_clip": 0.06412877, "auxiliary_loss_mlp": 0.01263451, "balance_loss_clip": 0.06274088, "balance_loss_mlp": 0.01254403, "epoch": 0.8987223808808057, "flos": 23995171768320.0, "grad_norm": 6.00720363760191, "language_loss": 0.70051289, "learning_rate": 1.0657572677519411e-07, "loss": 0.77727616, "num_input_tokens_seen": 322421890, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.0904541, "step": 14948, "time_per_iteration": 2.5353705883026123 }, { "auxiliary_loss_clip": 0.06413707, "auxiliary_loss_mlp": 0.01264595, "balance_loss_clip": 0.06275582, "balance_loss_mlp": 0.01255148, "epoch": 0.8987825041334736, "flos": 41510679776640.0, "grad_norm": 1.653976437501672, "language_loss": 0.75188756, "learning_rate": 1.0645032449186309e-07, "loss": 0.82867062, "num_input_tokens_seen": 322445730, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09442139, "step": 14949, "time_per_iteration": 2.707073450088501 }, { "auxiliary_loss_clip": 0.06418358, "auxiliary_loss_mlp": 0.01267233, "balance_loss_clip": 0.06276891, "balance_loss_mlp": 0.01256116, "epoch": 0.8988426273861416, "flos": 27571817698560.0, "grad_norm": 1.6923779319569998, "language_loss": 0.75981003, "learning_rate": 1.0632499401212513e-07, "loss": 0.83666593, "num_input_tokens_seen": 322464595, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.11114502, "step": 14950, "time_per_iteration": 2.5780556201934814 }, { "auxiliary_loss_clip": 0.06417417, "auxiliary_loss_mlp": 0.01263104, "balance_loss_clip": 0.06280456, "balance_loss_mlp": 0.01254426, "epoch": 0.8989027506388095, "flos": 17098408782720.0, "grad_norm": 2.561409421451247, "language_loss": 0.66679758, "learning_rate": 1.0619973534073334e-07, "loss": 0.74360275, "num_input_tokens_seen": 322483305, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.08685303, "step": 14951, "time_per_iteration": 2.517632246017456 }, { "auxiliary_loss_clip": 0.06417961, "auxiliary_loss_mlp": 0.01264215, "balance_loss_clip": 0.06272331, "balance_loss_mlp": 0.01254779, "epoch": 0.8989628738914776, "flos": 20560843198080.0, "grad_norm": 1.8510849733365586, "language_loss": 0.73891318, "learning_rate": 1.0607454848243769e-07, "loss": 0.81573498, "num_input_tokens_seen": 322501905, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.09429932, "step": 14952, "time_per_iteration": 2.51705002784729 }, { "auxiliary_loss_clip": 0.06411214, "auxiliary_loss_mlp": 0.012648, "balance_loss_clip": 0.06273516, "balance_loss_mlp": 0.01255442, "epoch": 0.8990229971441455, "flos": 16256300849280.0, "grad_norm": 2.51280792366346, "language_loss": 0.56985903, "learning_rate": 1.0594943344198481e-07, "loss": 0.64661914, "num_input_tokens_seen": 322518135, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09356689, "step": 14953, "time_per_iteration": 3.9823670387268066 }, { "auxiliary_loss_clip": 0.06412713, "auxiliary_loss_mlp": 0.01263939, "balance_loss_clip": 0.06275127, "balance_loss_mlp": 0.01253985, "epoch": 0.8990831203968135, "flos": 21987817430400.0, "grad_norm": 1.9616165365017058, "language_loss": 0.82024336, "learning_rate": 1.0582439022411915e-07, "loss": 0.89700985, "num_input_tokens_seen": 322537905, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09954834, "step": 14954, "time_per_iteration": 2.543794870376587 }, { "auxiliary_loss_clip": 0.06412511, "auxiliary_loss_mlp": 0.0126583, "balance_loss_clip": 0.06276608, "balance_loss_mlp": 0.01256413, "epoch": 0.8991432436494814, "flos": 27453413479680.0, "grad_norm": 1.8821306972849612, "language_loss": 0.60356367, "learning_rate": 1.0569941883358224e-07, "loss": 0.68034703, "num_input_tokens_seen": 322557945, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.09423828, "step": 14955, "time_per_iteration": 2.560149908065796 }, { "auxiliary_loss_clip": 0.06411232, "auxiliary_loss_mlp": 0.0126234, "balance_loss_clip": 0.06275821, "balance_loss_mlp": 0.0125356, "epoch": 0.8992033669021494, "flos": 21586245937920.0, "grad_norm": 1.9831269931299003, "language_loss": 0.55290753, "learning_rate": 1.0557451927511341e-07, "loss": 0.62964326, "num_input_tokens_seen": 322575765, "router_z_loss_clip": 1.35546875, "router_z_loss_mlp": 0.08770752, "step": 14956, "time_per_iteration": 2.543945550918579 }, { "auxiliary_loss_clip": 0.06412043, "auxiliary_loss_mlp": 0.01261748, "balance_loss_clip": 0.06272417, "balance_loss_mlp": 0.01252139, "epoch": 0.8992634901548173, "flos": 28591644142080.0, "grad_norm": 2.007190625328747, "language_loss": 0.80195498, "learning_rate": 1.0544969155344863e-07, "loss": 0.87869287, "num_input_tokens_seen": 322595665, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09619141, "step": 14957, "time_per_iteration": 2.5841431617736816 }, { "auxiliary_loss_clip": 0.06416638, "auxiliary_loss_mlp": 0.01264723, "balance_loss_clip": 0.06273555, "balance_loss_mlp": 0.01254882, "epoch": 0.8993236134074853, "flos": 19873966152960.0, "grad_norm": 1.7010588328034968, "language_loss": 0.78935546, "learning_rate": 1.0532493567332123e-07, "loss": 0.86616904, "num_input_tokens_seen": 322614755, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09838867, "step": 14958, "time_per_iteration": 2.566082715988159 }, { "auxiliary_loss_clip": 0.0641347, "auxiliary_loss_mlp": 0.01262736, "balance_loss_clip": 0.06275517, "balance_loss_mlp": 0.01253951, "epoch": 0.8993837366601534, "flos": 19396686896640.0, "grad_norm": 2.783569529414731, "language_loss": 0.74833161, "learning_rate": 1.0520025163946277e-07, "loss": 0.82509363, "num_input_tokens_seen": 322633425, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.08782959, "step": 14959, "time_per_iteration": 2.519129514694214 }, { "auxiliary_loss_clip": 0.06408859, "auxiliary_loss_mlp": 0.0126649, "balance_loss_clip": 0.06273007, "balance_loss_mlp": 0.01257096, "epoch": 0.8994438599128213, "flos": 18557681564160.0, "grad_norm": 1.9682627057154798, "language_loss": 0.69040537, "learning_rate": 1.0507563945660015e-07, "loss": 0.76715881, "num_input_tokens_seen": 322652065, "router_z_loss_clip": 1.35644531, "router_z_loss_mlp": 0.09399414, "step": 14960, "time_per_iteration": 2.5348031520843506 }, { "auxiliary_loss_clip": 0.06409987, "auxiliary_loss_mlp": 0.01261817, "balance_loss_clip": 0.06273019, "balance_loss_mlp": 0.01252596, "epoch": 0.8995039831654893, "flos": 24434785814400.0, "grad_norm": 1.958217490809096, "language_loss": 0.6611172, "learning_rate": 1.049510991294591e-07, "loss": 0.73783523, "num_input_tokens_seen": 322673275, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.09222412, "step": 14961, "time_per_iteration": 3.9985859394073486 }, { "auxiliary_loss_clip": 0.06412836, "auxiliary_loss_mlp": 0.01265476, "balance_loss_clip": 0.06276715, "balance_loss_mlp": 0.01256803, "epoch": 0.8995641064181572, "flos": 21257656951680.0, "grad_norm": 1.5579441854659728, "language_loss": 0.8330642, "learning_rate": 1.0482663066276254e-07, "loss": 0.90984738, "num_input_tokens_seen": 322693375, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.08660889, "step": 14962, "time_per_iteration": 2.535893678665161 }, { "auxiliary_loss_clip": 0.06425406, "auxiliary_loss_mlp": 0.01265341, "balance_loss_clip": 0.06282072, "balance_loss_mlp": 0.01254624, "epoch": 0.8996242296708252, "flos": 23520408134400.0, "grad_norm": 1.8572474962920213, "language_loss": 0.77212882, "learning_rate": 1.047022340612298e-07, "loss": 0.84903622, "num_input_tokens_seen": 322712615, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.10717773, "step": 14963, "time_per_iteration": 2.5743114948272705 }, { "auxiliary_loss_clip": 0.06315176, "auxiliary_loss_mlp": 0.01251562, "balance_loss_clip": 0.06260023, "balance_loss_mlp": 0.01250561, "epoch": 0.8996843529234931, "flos": 62421872094720.0, "grad_norm": 0.7628294669715159, "language_loss": 0.57435548, "learning_rate": 1.0457790932957867e-07, "loss": 0.65002292, "num_input_tokens_seen": 322766855, "router_z_loss_clip": 0.55371094, "router_z_loss_mlp": 0.01000214, "step": 14964, "time_per_iteration": 3.0067312717437744 }, { "auxiliary_loss_clip": 0.06421973, "auxiliary_loss_mlp": 0.01265188, "balance_loss_clip": 0.06276827, "balance_loss_mlp": 0.01255061, "epoch": 0.8997444761761612, "flos": 24242602475520.0, "grad_norm": 3.3408482145555767, "language_loss": 0.6844359, "learning_rate": 1.0445365647252269e-07, "loss": 0.76130748, "num_input_tokens_seen": 322781130, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.10137939, "step": 14965, "time_per_iteration": 2.5268824100494385 }, { "auxiliary_loss_clip": 0.06414498, "auxiliary_loss_mlp": 0.01262442, "balance_loss_clip": 0.062745, "balance_loss_mlp": 0.01253013, "epoch": 0.8998045994288291, "flos": 21367508054400.0, "grad_norm": 2.213493439515014, "language_loss": 0.7219708, "learning_rate": 1.0432947549477433e-07, "loss": 0.79874015, "num_input_tokens_seen": 322800310, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09429932, "step": 14966, "time_per_iteration": 2.5125300884246826 }, { "auxiliary_loss_clip": 0.06415936, "auxiliary_loss_mlp": 0.01274721, "balance_loss_clip": 0.06276734, "balance_loss_mlp": 0.01264439, "epoch": 0.8998647226814971, "flos": 28993760686080.0, "grad_norm": 1.8688357452720317, "language_loss": 0.73213571, "learning_rate": 1.0420536640104205e-07, "loss": 0.80904222, "num_input_tokens_seen": 322820955, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.1027832, "step": 14967, "time_per_iteration": 2.590603828430176 }, { "auxiliary_loss_clip": 0.06413302, "auxiliary_loss_mlp": 0.01263331, "balance_loss_clip": 0.06274901, "balance_loss_mlp": 0.01254044, "epoch": 0.899924845934165, "flos": 13630985049600.0, "grad_norm": 1.9817113369226964, "language_loss": 0.7256794, "learning_rate": 1.040813291960323e-07, "loss": 0.80244565, "num_input_tokens_seen": 322838780, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09283447, "step": 14968, "time_per_iteration": 3.7497851848602295 }, { "auxiliary_loss_clip": 0.06414932, "auxiliary_loss_mlp": 0.01264449, "balance_loss_clip": 0.06274287, "balance_loss_mlp": 0.01254787, "epoch": 0.899984969186833, "flos": 20888258227200.0, "grad_norm": 1.9324830758797371, "language_loss": 0.7119419, "learning_rate": 1.0395736388444864e-07, "loss": 0.78873575, "num_input_tokens_seen": 322856710, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09655762, "step": 14969, "time_per_iteration": 2.5968141555786133 }, { "auxiliary_loss_clip": 0.06417461, "auxiliary_loss_mlp": 0.01264344, "balance_loss_clip": 0.06276089, "balance_loss_mlp": 0.01254652, "epoch": 0.9000450924395009, "flos": 20927894008320.0, "grad_norm": 2.006285124790398, "language_loss": 0.76204568, "learning_rate": 1.0383347047099201e-07, "loss": 0.83886373, "num_input_tokens_seen": 322876070, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09692383, "step": 14970, "time_per_iteration": 2.5192291736602783 }, { "auxiliary_loss_clip": 0.06413893, "auxiliary_loss_mlp": 0.01264278, "balance_loss_clip": 0.06273838, "balance_loss_mlp": 0.01255362, "epoch": 0.900105215692169, "flos": 17170720456320.0, "grad_norm": 1.7798192536268018, "language_loss": 0.73578477, "learning_rate": 1.0370964896035972e-07, "loss": 0.81256652, "num_input_tokens_seen": 322895095, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.08911133, "step": 14971, "time_per_iteration": 2.528130292892456 }, { "auxiliary_loss_clip": 0.06416488, "auxiliary_loss_mlp": 0.01265897, "balance_loss_clip": 0.06278005, "balance_loss_mlp": 0.01255168, "epoch": 0.900165338944837, "flos": 19937053877760.0, "grad_norm": 2.203210294105977, "language_loss": 0.8215996, "learning_rate": 1.035858993572476e-07, "loss": 0.89842343, "num_input_tokens_seen": 322911845, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.10742188, "step": 14972, "time_per_iteration": 2.5084450244903564 }, { "auxiliary_loss_clip": 0.06422997, "auxiliary_loss_mlp": 0.0126566, "balance_loss_clip": 0.06278144, "balance_loss_mlp": 0.01256058, "epoch": 0.9002254621975049, "flos": 16112599896960.0, "grad_norm": 1.7425870049180885, "language_loss": 0.81707025, "learning_rate": 1.0346222166634855e-07, "loss": 0.89395684, "num_input_tokens_seen": 322928170, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.0960083, "step": 14973, "time_per_iteration": 2.5251622200012207 }, { "auxiliary_loss_clip": 0.06412679, "auxiliary_loss_mlp": 0.01268313, "balance_loss_clip": 0.06272972, "balance_loss_mlp": 0.01258257, "epoch": 0.9002855854501729, "flos": 28483763610240.0, "grad_norm": 3.9279709035435113, "language_loss": 0.58636159, "learning_rate": 1.0333861589235193e-07, "loss": 0.66317153, "num_input_tokens_seen": 322948165, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.10058594, "step": 14974, "time_per_iteration": 2.5967135429382324 }, { "auxiliary_loss_clip": 0.06421573, "auxiliary_loss_mlp": 0.01267987, "balance_loss_clip": 0.06280096, "balance_loss_mlp": 0.01258468, "epoch": 0.9003457087028408, "flos": 25637487793920.0, "grad_norm": 1.7799731896708644, "language_loss": 0.63418746, "learning_rate": 1.0321508203994489e-07, "loss": 0.71108305, "num_input_tokens_seen": 322968880, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09527588, "step": 14975, "time_per_iteration": 2.5807976722717285 }, { "auxiliary_loss_clip": 0.06417316, "auxiliary_loss_mlp": 0.01264658, "balance_loss_clip": 0.06275703, "balance_loss_mlp": 0.01254961, "epoch": 0.9004058319555088, "flos": 24396323990400.0, "grad_norm": 1.8465710447069006, "language_loss": 0.73026943, "learning_rate": 1.0309162011381257e-07, "loss": 0.80708915, "num_input_tokens_seen": 322989395, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09692383, "step": 14976, "time_per_iteration": 2.58842134475708 }, { "auxiliary_loss_clip": 0.0641522, "auxiliary_loss_mlp": 0.01263192, "balance_loss_clip": 0.06274688, "balance_loss_mlp": 0.01254114, "epoch": 0.9004659552081767, "flos": 29066994754560.0, "grad_norm": 1.662739343819649, "language_loss": 0.69995397, "learning_rate": 1.0296823011863565e-07, "loss": 0.77673811, "num_input_tokens_seen": 323009060, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09075928, "step": 14977, "time_per_iteration": 2.6182332038879395 }, { "auxiliary_loss_clip": 0.06418312, "auxiliary_loss_mlp": 0.01265191, "balance_loss_clip": 0.06276899, "balance_loss_mlp": 0.01254612, "epoch": 0.9005260784608448, "flos": 16769484380160.0, "grad_norm": 2.0838898697528734, "language_loss": 0.65761662, "learning_rate": 1.0284491205909351e-07, "loss": 0.73445159, "num_input_tokens_seen": 323027530, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.10583496, "step": 14978, "time_per_iteration": 2.511570692062378 }, { "auxiliary_loss_clip": 0.0642053, "auxiliary_loss_mlp": 0.01267603, "balance_loss_clip": 0.06276216, "balance_loss_mlp": 0.01257023, "epoch": 0.9005862017135127, "flos": 20382244220160.0, "grad_norm": 1.7832543761408501, "language_loss": 0.79298043, "learning_rate": 1.0272166593986286e-07, "loss": 0.86986178, "num_input_tokens_seen": 323045370, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.10583496, "step": 14979, "time_per_iteration": 2.56034517288208 }, { "auxiliary_loss_clip": 0.06313181, "auxiliary_loss_mlp": 0.01251325, "balance_loss_clip": 0.06257709, "balance_loss_mlp": 0.01250283, "epoch": 0.9006463249661807, "flos": 67599101917440.0, "grad_norm": 0.7227720409407143, "language_loss": 0.53088677, "learning_rate": 1.0259849176561642e-07, "loss": 0.60653186, "num_input_tokens_seen": 323105660, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01042938, "step": 14980, "time_per_iteration": 3.1604809761047363 }, { "auxiliary_loss_clip": 0.06421066, "auxiliary_loss_mlp": 0.01269646, "balance_loss_clip": 0.06275806, "balance_loss_mlp": 0.01259489, "epoch": 0.9007064482188486, "flos": 28300888074240.0, "grad_norm": 1.8284413845862622, "language_loss": 0.82710016, "learning_rate": 1.0247538954102553e-07, "loss": 0.9040072, "num_input_tokens_seen": 323126365, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.10150146, "step": 14981, "time_per_iteration": 2.5742712020874023 }, { "auxiliary_loss_clip": 0.06406803, "auxiliary_loss_mlp": 0.01263739, "balance_loss_clip": 0.06272686, "balance_loss_mlp": 0.01254149, "epoch": 0.9007665714715166, "flos": 21622737191040.0, "grad_norm": 1.4072766490533515, "language_loss": 0.81691974, "learning_rate": 1.0235235927075758e-07, "loss": 0.89362514, "num_input_tokens_seen": 323145655, "router_z_loss_clip": 1.34179688, "router_z_loss_mlp": 0.0958252, "step": 14982, "time_per_iteration": 2.514432191848755 }, { "auxiliary_loss_clip": 0.06410309, "auxiliary_loss_mlp": 0.01265518, "balance_loss_clip": 0.06276985, "balance_loss_mlp": 0.01256691, "epoch": 0.9008266947241845, "flos": 26549098289280.0, "grad_norm": 1.7077447050680667, "language_loss": 0.72113729, "learning_rate": 1.0222940095947885e-07, "loss": 0.79789555, "num_input_tokens_seen": 323164540, "router_z_loss_clip": 1.33203125, "router_z_loss_mlp": 0.08831787, "step": 14983, "time_per_iteration": 2.559314727783203 }, { "auxiliary_loss_clip": 0.06416298, "auxiliary_loss_mlp": 0.01265949, "balance_loss_clip": 0.06279336, "balance_loss_mlp": 0.01257199, "epoch": 0.9008868179768525, "flos": 23116907998080.0, "grad_norm": 1.6091106147528598, "language_loss": 0.75142074, "learning_rate": 1.0210651461185115e-07, "loss": 0.82824326, "num_input_tokens_seen": 323186960, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.08746338, "step": 14984, "time_per_iteration": 2.565925359725952 }, { "auxiliary_loss_clip": 0.06409831, "auxiliary_loss_mlp": 0.01266702, "balance_loss_clip": 0.06274685, "balance_loss_mlp": 0.01257487, "epoch": 0.9009469412295206, "flos": 19066546609920.0, "grad_norm": 1.451493427877911, "language_loss": 0.70676643, "learning_rate": 1.0198370023253456e-07, "loss": 0.78353179, "num_input_tokens_seen": 323206135, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.09222412, "step": 14985, "time_per_iteration": 2.523343324661255 }, { "auxiliary_loss_clip": 0.06415526, "auxiliary_loss_mlp": 0.01264766, "balance_loss_clip": 0.06272865, "balance_loss_mlp": 0.01255349, "epoch": 0.9010070644821885, "flos": 23229065088000.0, "grad_norm": 1.9556725104494592, "language_loss": 0.70431149, "learning_rate": 1.0186095782618643e-07, "loss": 0.7811144, "num_input_tokens_seen": 323225980, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.09423828, "step": 14986, "time_per_iteration": 2.55108642578125 }, { "auxiliary_loss_clip": 0.0641987, "auxiliary_loss_mlp": 0.01263189, "balance_loss_clip": 0.06277531, "balance_loss_mlp": 0.01254251, "epoch": 0.9010671877348565, "flos": 17390674224000.0, "grad_norm": 1.7434261981772283, "language_loss": 0.76932645, "learning_rate": 1.0173828739746104e-07, "loss": 0.84615707, "num_input_tokens_seen": 323243700, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.08932495, "step": 14987, "time_per_iteration": 4.117687463760376 }, { "auxiliary_loss_clip": 0.06417837, "auxiliary_loss_mlp": 0.01264117, "balance_loss_clip": 0.06278843, "balance_loss_mlp": 0.01254652, "epoch": 0.9011273109875244, "flos": 21914625288960.0, "grad_norm": 2.2427404574862817, "language_loss": 0.73486382, "learning_rate": 1.0161568895100981e-07, "loss": 0.8116833, "num_input_tokens_seen": 323261535, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09466553, "step": 14988, "time_per_iteration": 2.6851959228515625 }, { "auxiliary_loss_clip": 0.06421167, "auxiliary_loss_mlp": 0.01264793, "balance_loss_clip": 0.06278285, "balance_loss_mlp": 0.0125407, "epoch": 0.9011874342401924, "flos": 24067651150080.0, "grad_norm": 2.026582315409429, "language_loss": 0.69325459, "learning_rate": 1.0149316249148188e-07, "loss": 0.77011412, "num_input_tokens_seen": 323281855, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.1071167, "step": 14989, "time_per_iteration": 2.563642978668213 }, { "auxiliary_loss_clip": 0.06418368, "auxiliary_loss_mlp": 0.01264505, "balance_loss_clip": 0.06276411, "balance_loss_mlp": 0.01255338, "epoch": 0.9012475574928603, "flos": 16763572667520.0, "grad_norm": 1.8110057844714422, "language_loss": 0.80176395, "learning_rate": 1.0137070802352376e-07, "loss": 0.87859267, "num_input_tokens_seen": 323299505, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.0916748, "step": 14990, "time_per_iteration": 2.5383081436157227 }, { "auxiliary_loss_clip": 0.06421344, "auxiliary_loss_mlp": 0.01264774, "balance_loss_clip": 0.06278843, "balance_loss_mlp": 0.01255118, "epoch": 0.9013076807455284, "flos": 19976689658880.0, "grad_norm": 1.8742952021671837, "language_loss": 0.78179568, "learning_rate": 1.0124832555177842e-07, "loss": 0.85865676, "num_input_tokens_seen": 323318365, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09655762, "step": 14991, "time_per_iteration": 2.5324196815490723 }, { "auxiliary_loss_clip": 0.06313717, "auxiliary_loss_mlp": 0.01251859, "balance_loss_clip": 0.06258152, "balance_loss_mlp": 0.01250931, "epoch": 0.9013678039981963, "flos": 65200070868480.0, "grad_norm": 0.7576128519252238, "language_loss": 0.59867913, "learning_rate": 1.0112601508088726e-07, "loss": 0.67433488, "num_input_tokens_seen": 323371835, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00925446, "step": 14992, "time_per_iteration": 3.0539660453796387 }, { "auxiliary_loss_clip": 0.06409502, "auxiliary_loss_mlp": 0.01267812, "balance_loss_clip": 0.06272236, "balance_loss_mlp": 0.01257935, "epoch": 0.9014279272508643, "flos": 20527370691840.0, "grad_norm": 1.986963374564927, "language_loss": 0.83098376, "learning_rate": 1.0100377661548764e-07, "loss": 0.90775692, "num_input_tokens_seen": 323388495, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09875488, "step": 14993, "time_per_iteration": 4.047800540924072 }, { "auxiliary_loss_clip": 0.06416605, "auxiliary_loss_mlp": 0.01264238, "balance_loss_clip": 0.06276477, "balance_loss_mlp": 0.01254296, "epoch": 0.9014880505035322, "flos": 17314421408640.0, "grad_norm": 2.629924578506766, "language_loss": 0.7352742, "learning_rate": 1.0088161016021502e-07, "loss": 0.81208265, "num_input_tokens_seen": 323405280, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09942627, "step": 14994, "time_per_iteration": 2.6061925888061523 }, { "auxiliary_loss_clip": 0.06409049, "auxiliary_loss_mlp": 0.01264824, "balance_loss_clip": 0.06273633, "balance_loss_mlp": 0.01256014, "epoch": 0.9015481737562002, "flos": 28410445687680.0, "grad_norm": 1.8058682074332497, "language_loss": 0.64806569, "learning_rate": 1.0075951571970187e-07, "loss": 0.72480446, "num_input_tokens_seen": 323425310, "router_z_loss_clip": 1.35253906, "router_z_loss_mlp": 0.08813477, "step": 14995, "time_per_iteration": 2.6045191287994385 }, { "auxiliary_loss_clip": 0.06418642, "auxiliary_loss_mlp": 0.01266958, "balance_loss_clip": 0.06277341, "balance_loss_mlp": 0.01257243, "epoch": 0.9016082970088681, "flos": 29760454344960.0, "grad_norm": 1.5079063369270975, "language_loss": 0.670138, "learning_rate": 1.0063749329857873e-07, "loss": 0.74699396, "num_input_tokens_seen": 323447805, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09710693, "step": 14996, "time_per_iteration": 2.6065497398376465 }, { "auxiliary_loss_clip": 0.06411821, "auxiliary_loss_mlp": 0.01264677, "balance_loss_clip": 0.06274515, "balance_loss_mlp": 0.01255652, "epoch": 0.9016684202615362, "flos": 23519905009920.0, "grad_norm": 1.5886100375184704, "language_loss": 0.65909421, "learning_rate": 1.0051554290147168e-07, "loss": 0.73585916, "num_input_tokens_seen": 323467150, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09020996, "step": 14997, "time_per_iteration": 2.540560722351074 }, { "auxiliary_loss_clip": 0.06411083, "auxiliary_loss_mlp": 0.01266155, "balance_loss_clip": 0.06273414, "balance_loss_mlp": 0.01256964, "epoch": 0.9017285435142042, "flos": 16984323048960.0, "grad_norm": 1.8196823903083363, "language_loss": 0.77646369, "learning_rate": 1.0039366453300613e-07, "loss": 0.85323608, "num_input_tokens_seen": 323484250, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09185791, "step": 14998, "time_per_iteration": 2.511720895767212 }, { "auxiliary_loss_clip": 0.06416053, "auxiliary_loss_mlp": 0.01263794, "balance_loss_clip": 0.06274527, "balance_loss_mlp": 0.01254132, "epoch": 0.9017886667668721, "flos": 21399051916800.0, "grad_norm": 2.64911179205885, "language_loss": 0.75517011, "learning_rate": 1.0027185819780281e-07, "loss": 0.83196855, "num_input_tokens_seen": 323502910, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09661865, "step": 14999, "time_per_iteration": 2.5733423233032227 }, { "auxiliary_loss_clip": 0.06414074, "auxiliary_loss_mlp": 0.01266695, "balance_loss_clip": 0.06276524, "balance_loss_mlp": 0.01256556, "epoch": 0.9018487900195401, "flos": 21002972866560.0, "grad_norm": 2.4479269361460445, "language_loss": 0.76128256, "learning_rate": 1.0015012390048117e-07, "loss": 0.83809024, "num_input_tokens_seen": 323521820, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.10137939, "step": 15000, "time_per_iteration": 4.033568382263184 }, { "auxiliary_loss_clip": 0.06411619, "auxiliary_loss_mlp": 0.01267927, "balance_loss_clip": 0.06275196, "balance_loss_mlp": 0.01259248, "epoch": 0.901908913272208, "flos": 53370085478400.0, "grad_norm": 2.2092468395015494, "language_loss": 0.8077637, "learning_rate": 1.0002846164565704e-07, "loss": 0.88455915, "num_input_tokens_seen": 323543200, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.08679199, "step": 15001, "time_per_iteration": 2.816932201385498 }, { "auxiliary_loss_clip": 0.06415026, "auxiliary_loss_mlp": 0.01265726, "balance_loss_clip": 0.0627673, "balance_loss_mlp": 0.01256815, "epoch": 0.901969036524876, "flos": 22096201086720.0, "grad_norm": 1.4878791082619165, "language_loss": 0.78452873, "learning_rate": 9.990687143794407e-08, "loss": 0.86133623, "num_input_tokens_seen": 323563075, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.08911133, "step": 15002, "time_per_iteration": 2.567392349243164 }, { "auxiliary_loss_clip": 0.06417149, "auxiliary_loss_mlp": 0.01265442, "balance_loss_clip": 0.06278642, "balance_loss_mlp": 0.01255232, "epoch": 0.9020291597775439, "flos": 23840653639680.0, "grad_norm": 2.1212772343269903, "language_loss": 0.68103468, "learning_rate": 9.978535328195347e-08, "loss": 0.7578606, "num_input_tokens_seen": 323579065, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.10211182, "step": 15003, "time_per_iteration": 2.5498883724212646 }, { "auxiliary_loss_clip": 0.06419215, "auxiliary_loss_mlp": 0.01263082, "balance_loss_clip": 0.06277127, "balance_loss_mlp": 0.01253421, "epoch": 0.902089283030212, "flos": 18330767907840.0, "grad_norm": 4.200041040528294, "language_loss": 0.86161971, "learning_rate": 9.9663907182292e-08, "loss": 0.93844265, "num_input_tokens_seen": 323594835, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09667969, "step": 15004, "time_per_iteration": 2.5281403064727783 }, { "auxiliary_loss_clip": 0.06418586, "auxiliary_loss_mlp": 0.01265209, "balance_loss_clip": 0.06277354, "balance_loss_mlp": 0.01254904, "epoch": 0.9021494062828799, "flos": 24177208763520.0, "grad_norm": 2.3999140613018377, "language_loss": 0.72834754, "learning_rate": 9.954253314356575e-08, "loss": 0.80518544, "num_input_tokens_seen": 323611475, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.10302734, "step": 15005, "time_per_iteration": 2.5564522743225098 }, { "auxiliary_loss_clip": 0.06418286, "auxiliary_loss_mlp": 0.01264216, "balance_loss_clip": 0.06274298, "balance_loss_mlp": 0.01254095, "epoch": 0.9022095295355479, "flos": 21623366096640.0, "grad_norm": 1.8677059152676552, "language_loss": 0.71742904, "learning_rate": 9.942123117037748e-08, "loss": 0.79425406, "num_input_tokens_seen": 323629730, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10113525, "step": 15006, "time_per_iteration": 2.5432472229003906 }, { "auxiliary_loss_clip": 0.06416448, "auxiliary_loss_mlp": 0.01264298, "balance_loss_clip": 0.06275181, "balance_loss_mlp": 0.01255649, "epoch": 0.9022696527882158, "flos": 18730871953920.0, "grad_norm": 2.1797849275094725, "language_loss": 0.84844124, "learning_rate": 9.930000126732618e-08, "loss": 0.92524874, "num_input_tokens_seen": 323646000, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.08648682, "step": 15007, "time_per_iteration": 3.9488654136657715 }, { "auxiliary_loss_clip": 0.06411022, "auxiliary_loss_mlp": 0.01264545, "balance_loss_clip": 0.06275001, "balance_loss_mlp": 0.01255586, "epoch": 0.9023297760408838, "flos": 26768548932480.0, "grad_norm": 1.9287760953168405, "language_loss": 0.7861498, "learning_rate": 9.917884343900928e-08, "loss": 0.8629055, "num_input_tokens_seen": 323667250, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.08966064, "step": 15008, "time_per_iteration": 2.5782318115234375 }, { "auxiliary_loss_clip": 0.06411439, "auxiliary_loss_mlp": 0.01266482, "balance_loss_clip": 0.0627871, "balance_loss_mlp": 0.01257595, "epoch": 0.9023898992935517, "flos": 20528921992320.0, "grad_norm": 1.8118910514992854, "language_loss": 0.73794413, "learning_rate": 9.905775769002156e-08, "loss": 0.81472337, "num_input_tokens_seen": 323687150, "router_z_loss_clip": 1.32714844, "router_z_loss_mlp": 0.08886719, "step": 15009, "time_per_iteration": 2.5388829708099365 }, { "auxiliary_loss_clip": 0.06414627, "auxiliary_loss_mlp": 0.0126347, "balance_loss_clip": 0.06276853, "balance_loss_mlp": 0.01253504, "epoch": 0.9024500225462198, "flos": 17462315064960.0, "grad_norm": 1.715370521081744, "language_loss": 0.73555809, "learning_rate": 9.893674402495399e-08, "loss": 0.81233907, "num_input_tokens_seen": 323703660, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09967041, "step": 15010, "time_per_iteration": 2.51100754737854 }, { "auxiliary_loss_clip": 0.06414312, "auxiliary_loss_mlp": 0.01266927, "balance_loss_clip": 0.06273724, "balance_loss_mlp": 0.01256735, "epoch": 0.9025101457988878, "flos": 20819887695360.0, "grad_norm": 2.043190980676178, "language_loss": 0.74543124, "learning_rate": 9.881580244839538e-08, "loss": 0.82224363, "num_input_tokens_seen": 323722060, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.10198975, "step": 15011, "time_per_iteration": 2.5507867336273193 }, { "auxiliary_loss_clip": 0.06418484, "auxiliary_loss_mlp": 0.01262365, "balance_loss_clip": 0.06274705, "balance_loss_mlp": 0.01252608, "epoch": 0.9025702690515557, "flos": 19032445198080.0, "grad_norm": 1.9412310046989498, "language_loss": 0.74011385, "learning_rate": 9.869493296493204e-08, "loss": 0.81692231, "num_input_tokens_seen": 323740645, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.09759521, "step": 15012, "time_per_iteration": 2.5286073684692383 }, { "auxiliary_loss_clip": 0.06411291, "auxiliary_loss_mlp": 0.01263704, "balance_loss_clip": 0.06274824, "balance_loss_mlp": 0.01254317, "epoch": 0.9026303923042237, "flos": 19688952337920.0, "grad_norm": 1.5160510236758873, "language_loss": 0.69289672, "learning_rate": 9.857413557914763e-08, "loss": 0.76964664, "num_input_tokens_seen": 323758905, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.09387207, "step": 15013, "time_per_iteration": 2.542198419570923 }, { "auxiliary_loss_clip": 0.06412484, "auxiliary_loss_mlp": 0.0126239, "balance_loss_clip": 0.06277923, "balance_loss_mlp": 0.01253473, "epoch": 0.9026905155568916, "flos": 24615019946880.0, "grad_norm": 1.349666642809706, "language_loss": 0.72910857, "learning_rate": 9.845341029562249e-08, "loss": 0.8058573, "num_input_tokens_seen": 323780595, "router_z_loss_clip": 1.34472656, "router_z_loss_mlp": 0.0892334, "step": 15014, "time_per_iteration": 2.5830321311950684 }, { "auxiliary_loss_clip": 0.06415457, "auxiliary_loss_mlp": 0.01265802, "balance_loss_clip": 0.06274769, "balance_loss_mlp": 0.01255884, "epoch": 0.9027506388095596, "flos": 20528041524480.0, "grad_norm": 1.8109903037256834, "language_loss": 0.72043771, "learning_rate": 9.833275711893474e-08, "loss": 0.79725027, "num_input_tokens_seen": 323798160, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09918213, "step": 15015, "time_per_iteration": 2.5365517139434814 }, { "auxiliary_loss_clip": 0.06411991, "auxiliary_loss_mlp": 0.0126681, "balance_loss_clip": 0.06272171, "balance_loss_mlp": 0.0125719, "epoch": 0.9028107620622275, "flos": 22791211977600.0, "grad_norm": 1.8922557697284166, "language_loss": 0.69262719, "learning_rate": 9.821217605365895e-08, "loss": 0.76941526, "num_input_tokens_seen": 323816810, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09619141, "step": 15016, "time_per_iteration": 2.5306692123413086 }, { "auxiliary_loss_clip": 0.06413502, "auxiliary_loss_mlp": 0.01262374, "balance_loss_clip": 0.06275459, "balance_loss_mlp": 0.01253659, "epoch": 0.9028708853148956, "flos": 25417534026240.0, "grad_norm": 1.6422636795633945, "language_loss": 0.7063297, "learning_rate": 9.809166710436855e-08, "loss": 0.78308845, "num_input_tokens_seen": 323836900, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.08712769, "step": 15017, "time_per_iteration": 2.6150190830230713 }, { "auxiliary_loss_clip": 0.06416433, "auxiliary_loss_mlp": 0.01266853, "balance_loss_clip": 0.06278032, "balance_loss_mlp": 0.01257031, "epoch": 0.9029310085675635, "flos": 21877714765440.0, "grad_norm": 2.154092494312966, "language_loss": 0.69598877, "learning_rate": 9.797123027563237e-08, "loss": 0.77282166, "num_input_tokens_seen": 323855325, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.0982666, "step": 15018, "time_per_iteration": 2.648113965988159 }, { "auxiliary_loss_clip": 0.06412757, "auxiliary_loss_mlp": 0.01265509, "balance_loss_clip": 0.06274402, "balance_loss_mlp": 0.01255919, "epoch": 0.9029911318202315, "flos": 26221725187200.0, "grad_norm": 1.8591745687395795, "language_loss": 0.69494915, "learning_rate": 9.785086557201782e-08, "loss": 0.77173173, "num_input_tokens_seen": 323875650, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.0960083, "step": 15019, "time_per_iteration": 2.5775396823883057 }, { "auxiliary_loss_clip": 0.06413402, "auxiliary_loss_mlp": 0.01264076, "balance_loss_clip": 0.06275779, "balance_loss_mlp": 0.01255767, "epoch": 0.9030512550728994, "flos": 15966886446720.0, "grad_norm": 1.768957948759449, "language_loss": 0.72130674, "learning_rate": 9.773057299808951e-08, "loss": 0.79808152, "num_input_tokens_seen": 323892920, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.08306885, "step": 15020, "time_per_iteration": 2.523695468902588 }, { "auxiliary_loss_clip": 0.06415705, "auxiliary_loss_mlp": 0.01267131, "balance_loss_clip": 0.06273456, "balance_loss_mlp": 0.01257516, "epoch": 0.9031113783255674, "flos": 23994375154560.0, "grad_norm": 1.3864606456804913, "language_loss": 0.74332702, "learning_rate": 9.7610352558408e-08, "loss": 0.82015538, "num_input_tokens_seen": 323913835, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09613037, "step": 15021, "time_per_iteration": 2.5964925289154053 }, { "auxiliary_loss_clip": 0.06421255, "auxiliary_loss_mlp": 0.01265918, "balance_loss_clip": 0.0627787, "balance_loss_mlp": 0.01255368, "epoch": 0.9031715015782353, "flos": 22243843180800.0, "grad_norm": 2.510967582550143, "language_loss": 0.72624147, "learning_rate": 9.749020425753251e-08, "loss": 0.80311328, "num_input_tokens_seen": 323933440, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.10540771, "step": 15022, "time_per_iteration": 2.5403783321380615 }, { "auxiliary_loss_clip": 0.06407619, "auxiliary_loss_mlp": 0.01266108, "balance_loss_clip": 0.06276106, "balance_loss_mlp": 0.01256464, "epoch": 0.9032316248309034, "flos": 26330402332800.0, "grad_norm": 1.9752201620170329, "language_loss": 0.72598147, "learning_rate": 9.737012810001943e-08, "loss": 0.80271876, "num_input_tokens_seen": 323954090, "router_z_loss_clip": 1.31347656, "router_z_loss_mlp": 0.09649658, "step": 15023, "time_per_iteration": 2.604851007461548 }, { "auxiliary_loss_clip": 0.06416307, "auxiliary_loss_mlp": 0.01263945, "balance_loss_clip": 0.06277286, "balance_loss_mlp": 0.0125498, "epoch": 0.9032917480835713, "flos": 22643066759040.0, "grad_norm": 1.651424088632548, "language_loss": 0.82679689, "learning_rate": 9.725012409042155e-08, "loss": 0.90359938, "num_input_tokens_seen": 323974040, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.08953857, "step": 15024, "time_per_iteration": 2.5538887977600098 }, { "auxiliary_loss_clip": 0.0641634, "auxiliary_loss_mlp": 0.01261628, "balance_loss_clip": 0.06274489, "balance_loss_mlp": 0.01252473, "epoch": 0.9033518713362393, "flos": 23885614154880.0, "grad_norm": 3.208285389426053, "language_loss": 0.69833189, "learning_rate": 9.713019223328966e-08, "loss": 0.77511156, "num_input_tokens_seen": 323996125, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09155273, "step": 15025, "time_per_iteration": 2.5691146850585938 }, { "auxiliary_loss_clip": 0.06412539, "auxiliary_loss_mlp": 0.01265533, "balance_loss_clip": 0.06275089, "balance_loss_mlp": 0.01256747, "epoch": 0.9034119945889073, "flos": 26912333738880.0, "grad_norm": 1.8396262501828087, "language_loss": 0.77469367, "learning_rate": 9.70103325331717e-08, "loss": 0.8514744, "num_input_tokens_seen": 324017645, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.08776855, "step": 15026, "time_per_iteration": 4.0077714920043945 }, { "auxiliary_loss_clip": 0.06417814, "auxiliary_loss_mlp": 0.01266011, "balance_loss_clip": 0.06278753, "balance_loss_mlp": 0.01256516, "epoch": 0.9034721178415752, "flos": 20856462802560.0, "grad_norm": 1.7355736324067503, "language_loss": 0.68748975, "learning_rate": 9.68905449946129e-08, "loss": 0.764328, "num_input_tokens_seen": 324036875, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09490967, "step": 15027, "time_per_iteration": 2.5579042434692383 }, { "auxiliary_loss_clip": 0.0640962, "auxiliary_loss_mlp": 0.01263327, "balance_loss_clip": 0.06276803, "balance_loss_mlp": 0.01253898, "epoch": 0.9035322410942432, "flos": 22240447090560.0, "grad_norm": 1.559217168927451, "language_loss": 0.75909203, "learning_rate": 9.677082962215477e-08, "loss": 0.83582145, "num_input_tokens_seen": 324057045, "router_z_loss_clip": 1.32714844, "router_z_loss_mlp": 0.09436035, "step": 15028, "time_per_iteration": 2.5707976818084717 }, { "auxiliary_loss_clip": 0.0641259, "auxiliary_loss_mlp": 0.01265401, "balance_loss_clip": 0.06275657, "balance_loss_mlp": 0.01255495, "epoch": 0.9035923643469111, "flos": 25930843338240.0, "grad_norm": 1.5658484309921812, "language_loss": 0.69310278, "learning_rate": 9.665118642033765e-08, "loss": 0.76988268, "num_input_tokens_seen": 324079735, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09906006, "step": 15029, "time_per_iteration": 2.5806620121002197 }, { "auxiliary_loss_clip": 0.06421598, "auxiliary_loss_mlp": 0.0126594, "balance_loss_clip": 0.06278881, "balance_loss_mlp": 0.0125564, "epoch": 0.9036524875995792, "flos": 20346088383360.0, "grad_norm": 1.8063654379115721, "language_loss": 0.73991495, "learning_rate": 9.653161539369858e-08, "loss": 0.81679034, "num_input_tokens_seen": 324097785, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10296631, "step": 15030, "time_per_iteration": 2.598292589187622 }, { "auxiliary_loss_clip": 0.06416324, "auxiliary_loss_mlp": 0.01263089, "balance_loss_clip": 0.06273324, "balance_loss_mlp": 0.01253648, "epoch": 0.9037126108522471, "flos": 40124137939200.0, "grad_norm": 1.8027186410342966, "language_loss": 0.68463814, "learning_rate": 9.641211654677151e-08, "loss": 0.76143229, "num_input_tokens_seen": 324121625, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.09442139, "step": 15031, "time_per_iteration": 2.6977367401123047 }, { "auxiliary_loss_clip": 0.06409624, "auxiliary_loss_mlp": 0.01262487, "balance_loss_clip": 0.06272919, "balance_loss_mlp": 0.01253427, "epoch": 0.9037727341049151, "flos": 23338874263680.0, "grad_norm": 1.4864630546572237, "language_loss": 0.76745689, "learning_rate": 9.629268988408723e-08, "loss": 0.84417796, "num_input_tokens_seen": 324142535, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.09057617, "step": 15032, "time_per_iteration": 4.046594858169556 }, { "auxiliary_loss_clip": 0.06418446, "auxiliary_loss_mlp": 0.01265335, "balance_loss_clip": 0.06277305, "balance_loss_mlp": 0.01255828, "epoch": 0.903832857357583, "flos": 12827506648320.0, "grad_norm": 1.8199448014349844, "language_loss": 0.75357163, "learning_rate": 9.617333541017502e-08, "loss": 0.83040953, "num_input_tokens_seen": 324159610, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09503174, "step": 15033, "time_per_iteration": 2.56156325340271 }, { "auxiliary_loss_clip": 0.06414714, "auxiliary_loss_mlp": 0.01263356, "balance_loss_clip": 0.06273788, "balance_loss_mlp": 0.0125379, "epoch": 0.903892980610251, "flos": 25710176810880.0, "grad_norm": 1.6211375836642938, "language_loss": 0.73833442, "learning_rate": 9.605405312956105e-08, "loss": 0.81511515, "num_input_tokens_seen": 324182510, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09564209, "step": 15034, "time_per_iteration": 2.606245517730713 }, { "auxiliary_loss_clip": 0.06413844, "auxiliary_loss_mlp": 0.0126821, "balance_loss_clip": 0.06274961, "balance_loss_mlp": 0.01258268, "epoch": 0.9039531038629189, "flos": 14689357171200.0, "grad_norm": 1.6939418088314402, "language_loss": 0.64162529, "learning_rate": 9.593484304676791e-08, "loss": 0.71844584, "num_input_tokens_seen": 324200555, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09936523, "step": 15035, "time_per_iteration": 2.545931100845337 }, { "auxiliary_loss_clip": 0.06421924, "auxiliary_loss_mlp": 0.01267054, "balance_loss_clip": 0.06282487, "balance_loss_mlp": 0.01256784, "epoch": 0.904013227115587, "flos": 24031830729600.0, "grad_norm": 2.333634401141326, "language_loss": 0.62505502, "learning_rate": 9.581570516631643e-08, "loss": 0.70194477, "num_input_tokens_seen": 324220255, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.10272217, "step": 15036, "time_per_iteration": 2.5735831260681152 }, { "auxiliary_loss_clip": 0.064124, "auxiliary_loss_mlp": 0.01267226, "balance_loss_clip": 0.06278935, "balance_loss_mlp": 0.01258429, "epoch": 0.9040733503682549, "flos": 22863020526720.0, "grad_norm": 2.5150147369829794, "language_loss": 0.82324636, "learning_rate": 9.569663949272455e-08, "loss": 0.90004265, "num_input_tokens_seen": 324237855, "router_z_loss_clip": 1.33398438, "router_z_loss_mlp": 0.0880127, "step": 15037, "time_per_iteration": 2.58347749710083 }, { "auxiliary_loss_clip": 0.06417091, "auxiliary_loss_mlp": 0.01263849, "balance_loss_clip": 0.06275004, "balance_loss_mlp": 0.01254419, "epoch": 0.9041334736209229, "flos": 19981175852160.0, "grad_norm": 1.8291947616640527, "language_loss": 0.67766708, "learning_rate": 9.557764603050667e-08, "loss": 0.75447649, "num_input_tokens_seen": 324257050, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09442139, "step": 15038, "time_per_iteration": 2.561122417449951 }, { "auxiliary_loss_clip": 0.06413242, "auxiliary_loss_mlp": 0.01265555, "balance_loss_clip": 0.06274931, "balance_loss_mlp": 0.01255976, "epoch": 0.9041935968735909, "flos": 17536387674240.0, "grad_norm": 1.9411866156127795, "language_loss": 0.75278413, "learning_rate": 9.545872478417494e-08, "loss": 0.82957214, "num_input_tokens_seen": 324275510, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09576416, "step": 15039, "time_per_iteration": 2.580112934112549 }, { "auxiliary_loss_clip": 0.06410475, "auxiliary_loss_mlp": 0.01265324, "balance_loss_clip": 0.06275655, "balance_loss_mlp": 0.01256509, "epoch": 0.9042537201262588, "flos": 22786138805760.0, "grad_norm": 1.6271815859924292, "language_loss": 0.70403558, "learning_rate": 9.533987575823977e-08, "loss": 0.78079355, "num_input_tokens_seen": 324295150, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.0881958, "step": 15040, "time_per_iteration": 4.003229379653931 }, { "auxiliary_loss_clip": 0.06410246, "auxiliary_loss_mlp": 0.01262514, "balance_loss_clip": 0.06273142, "balance_loss_mlp": 0.01253597, "epoch": 0.9043138433789268, "flos": 20601778717440.0, "grad_norm": 1.7596654660061, "language_loss": 0.68001002, "learning_rate": 9.522109895720709e-08, "loss": 0.75673765, "num_input_tokens_seen": 324313855, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.08911133, "step": 15041, "time_per_iteration": 2.526331663131714 }, { "auxiliary_loss_clip": 0.064132, "auxiliary_loss_mlp": 0.01266672, "balance_loss_clip": 0.06274623, "balance_loss_mlp": 0.0125679, "epoch": 0.9043739666315948, "flos": 32971223422080.0, "grad_norm": 6.9166397893405325, "language_loss": 0.58080292, "learning_rate": 9.510239438558155e-08, "loss": 0.65760159, "num_input_tokens_seen": 324338465, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09881592, "step": 15042, "time_per_iteration": 2.807600736618042 }, { "auxiliary_loss_clip": 0.0631869, "auxiliary_loss_mlp": 0.01251072, "balance_loss_clip": 0.06263708, "balance_loss_mlp": 0.01250121, "epoch": 0.9044340898842628, "flos": 67316563549440.0, "grad_norm": 0.7658309319395269, "language_loss": 0.56982207, "learning_rate": 9.498376204786351e-08, "loss": 0.64551967, "num_input_tokens_seen": 324398740, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.00949097, "step": 15043, "time_per_iteration": 3.1630070209503174 }, { "auxiliary_loss_clip": 0.06413311, "auxiliary_loss_mlp": 0.01262846, "balance_loss_clip": 0.06274284, "balance_loss_mlp": 0.0125291, "epoch": 0.9044942131369307, "flos": 17719053575040.0, "grad_norm": 1.7815157445309608, "language_loss": 0.70319927, "learning_rate": 9.486520194855274e-08, "loss": 0.77996075, "num_input_tokens_seen": 324417335, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.0993042, "step": 15044, "time_per_iteration": 2.5885541439056396 }, { "auxiliary_loss_clip": 0.06417683, "auxiliary_loss_mlp": 0.01267076, "balance_loss_clip": 0.06275363, "balance_loss_mlp": 0.01257182, "epoch": 0.9045543363895987, "flos": 17826137493120.0, "grad_norm": 2.968047020767486, "language_loss": 0.69938242, "learning_rate": 9.474671409214407e-08, "loss": 0.77622998, "num_input_tokens_seen": 324433240, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09893799, "step": 15045, "time_per_iteration": 2.4973835945129395 }, { "auxiliary_loss_clip": 0.06419073, "auxiliary_loss_mlp": 0.01267204, "balance_loss_clip": 0.06276955, "balance_loss_mlp": 0.01256755, "epoch": 0.9046144596422666, "flos": 21879349920000.0, "grad_norm": 1.901986391239634, "language_loss": 0.65988153, "learning_rate": 9.462829848313081e-08, "loss": 0.73674428, "num_input_tokens_seen": 324452675, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10430908, "step": 15046, "time_per_iteration": 2.5562851428985596 }, { "auxiliary_loss_clip": 0.06416433, "auxiliary_loss_mlp": 0.01264148, "balance_loss_clip": 0.06273858, "balance_loss_mlp": 0.0125414, "epoch": 0.9046745828949346, "flos": 17677866493440.0, "grad_norm": 2.0742018938806726, "language_loss": 0.62395108, "learning_rate": 9.450995512600379e-08, "loss": 0.70075685, "num_input_tokens_seen": 324467865, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.10003662, "step": 15047, "time_per_iteration": 3.9120447635650635 }, { "auxiliary_loss_clip": 0.06415622, "auxiliary_loss_mlp": 0.01268792, "balance_loss_clip": 0.06277816, "balance_loss_mlp": 0.0125966, "epoch": 0.9047347061476025, "flos": 25709631759360.0, "grad_norm": 1.3329067995244919, "language_loss": 0.71199238, "learning_rate": 9.439168402525032e-08, "loss": 0.78883648, "num_input_tokens_seen": 324490430, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09130859, "step": 15048, "time_per_iteration": 2.567577600479126 }, { "auxiliary_loss_clip": 0.06416812, "auxiliary_loss_mlp": 0.01266155, "balance_loss_clip": 0.06276204, "balance_loss_mlp": 0.01256493, "epoch": 0.9047948294002706, "flos": 15163449972480.0, "grad_norm": 2.4475932449833016, "language_loss": 0.75457078, "learning_rate": 9.427348518535483e-08, "loss": 0.83140045, "num_input_tokens_seen": 324506620, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09661865, "step": 15049, "time_per_iteration": 2.5327765941619873 }, { "auxiliary_loss_clip": 0.06410147, "auxiliary_loss_mlp": 0.01263752, "balance_loss_clip": 0.06273915, "balance_loss_mlp": 0.01254263, "epoch": 0.9048549526529385, "flos": 21878846795520.0, "grad_norm": 1.9134265279297271, "language_loss": 0.75870812, "learning_rate": 9.415535861079993e-08, "loss": 0.83544707, "num_input_tokens_seen": 324525505, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.0947876, "step": 15050, "time_per_iteration": 2.5635502338409424 }, { "auxiliary_loss_clip": 0.06415567, "auxiliary_loss_mlp": 0.01263249, "balance_loss_clip": 0.06275272, "balance_loss_mlp": 0.01253612, "epoch": 0.9049150759056065, "flos": 23552790537600.0, "grad_norm": 1.683741968061203, "language_loss": 0.82156688, "learning_rate": 9.403730430606472e-08, "loss": 0.89835507, "num_input_tokens_seen": 324544415, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09637451, "step": 15051, "time_per_iteration": 2.6020700931549072 }, { "auxiliary_loss_clip": 0.06413144, "auxiliary_loss_mlp": 0.01264571, "balance_loss_clip": 0.06273619, "balance_loss_mlp": 0.01255272, "epoch": 0.9049751991582745, "flos": 19651957960320.0, "grad_norm": 2.007167267385565, "language_loss": 0.89492762, "learning_rate": 9.391932227562582e-08, "loss": 0.97170472, "num_input_tokens_seen": 324562555, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09301758, "step": 15052, "time_per_iteration": 2.540337324142456 }, { "auxiliary_loss_clip": 0.06418227, "auxiliary_loss_mlp": 0.01265895, "balance_loss_clip": 0.06275249, "balance_loss_mlp": 0.01256179, "epoch": 0.9050353224109424, "flos": 15601638499200.0, "grad_norm": 2.3226204398988903, "language_loss": 0.7720679, "learning_rate": 9.380141252395724e-08, "loss": 0.84890914, "num_input_tokens_seen": 324580865, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.097229, "step": 15053, "time_per_iteration": 2.5431456565856934 }, { "auxiliary_loss_clip": 0.06411009, "auxiliary_loss_mlp": 0.0126488, "balance_loss_clip": 0.06274921, "balance_loss_mlp": 0.01255641, "epoch": 0.9050954456636104, "flos": 28191078898560.0, "grad_norm": 1.4195592416375513, "language_loss": 0.73160332, "learning_rate": 9.368357505553049e-08, "loss": 0.80836213, "num_input_tokens_seen": 324600665, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.09234619, "step": 15054, "time_per_iteration": 2.583399772644043 }, { "auxiliary_loss_clip": 0.064151, "auxiliary_loss_mlp": 0.01263907, "balance_loss_clip": 0.06275646, "balance_loss_mlp": 0.01254751, "epoch": 0.9051555689162784, "flos": 25737444115200.0, "grad_norm": 3.7626015057201174, "language_loss": 0.8332864, "learning_rate": 9.356580987481333e-08, "loss": 0.9100765, "num_input_tokens_seen": 324618145, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.0914917, "step": 15055, "time_per_iteration": 2.558271884918213 }, { "auxiliary_loss_clip": 0.06411611, "auxiliary_loss_mlp": 0.01264389, "balance_loss_clip": 0.06275429, "balance_loss_mlp": 0.01254942, "epoch": 0.9052156921689464, "flos": 23263795405440.0, "grad_norm": 1.5735726786070319, "language_loss": 0.85446882, "learning_rate": 9.344811698627176e-08, "loss": 0.93122882, "num_input_tokens_seen": 324638165, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.09448242, "step": 15056, "time_per_iteration": 2.53625226020813 }, { "auxiliary_loss_clip": 0.06414092, "auxiliary_loss_mlp": 0.01267279, "balance_loss_clip": 0.0627472, "balance_loss_mlp": 0.01257992, "epoch": 0.9052758154216143, "flos": 29571038190720.0, "grad_norm": 1.8917638235538692, "language_loss": 0.72088194, "learning_rate": 9.333049639436863e-08, "loss": 0.79769564, "num_input_tokens_seen": 324658560, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09283447, "step": 15057, "time_per_iteration": 2.5757694244384766 }, { "auxiliary_loss_clip": 0.06410197, "auxiliary_loss_mlp": 0.01263793, "balance_loss_clip": 0.06275068, "balance_loss_mlp": 0.01254947, "epoch": 0.9053359386742823, "flos": 22134285567360.0, "grad_norm": 1.566626522296516, "language_loss": 0.81292498, "learning_rate": 9.321294810356418e-08, "loss": 0.88966489, "num_input_tokens_seen": 324679185, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.08843994, "step": 15058, "time_per_iteration": 2.544379949569702 }, { "auxiliary_loss_clip": 0.06317933, "auxiliary_loss_mlp": 0.01250589, "balance_loss_clip": 0.06262862, "balance_loss_mlp": 0.0124962, "epoch": 0.9053960619269502, "flos": 67112332421760.0, "grad_norm": 0.7523126248271698, "language_loss": 0.51418841, "learning_rate": 9.309547211831592e-08, "loss": 0.58987355, "num_input_tokens_seen": 324744830, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.0096817, "step": 15059, "time_per_iteration": 3.2581875324249268 }, { "auxiliary_loss_clip": 0.06416745, "auxiliary_loss_mlp": 0.01266114, "balance_loss_clip": 0.06276573, "balance_loss_mlp": 0.01256411, "epoch": 0.9054561851796182, "flos": 15820921434240.0, "grad_norm": 2.10637282973138, "language_loss": 0.67413068, "learning_rate": 9.297806844307831e-08, "loss": 0.75095928, "num_input_tokens_seen": 324762905, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09710693, "step": 15060, "time_per_iteration": 2.5226168632507324 }, { "auxiliary_loss_clip": 0.06414069, "auxiliary_loss_mlp": 0.01267144, "balance_loss_clip": 0.06273453, "balance_loss_mlp": 0.01257738, "epoch": 0.9055163084322861, "flos": 17572837000320.0, "grad_norm": 1.8696778554505928, "language_loss": 0.64331627, "learning_rate": 9.286073708230357e-08, "loss": 0.72012842, "num_input_tokens_seen": 324781905, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09411621, "step": 15061, "time_per_iteration": 2.535156011581421 }, { "auxiliary_loss_clip": 0.0641322, "auxiliary_loss_mlp": 0.01264893, "balance_loss_clip": 0.06274577, "balance_loss_mlp": 0.01254492, "epoch": 0.9055764316849542, "flos": 17645358309120.0, "grad_norm": 1.637312230518786, "language_loss": 0.71586007, "learning_rate": 9.274347804044058e-08, "loss": 0.79264116, "num_input_tokens_seen": 324799260, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.10400391, "step": 15062, "time_per_iteration": 2.5021955966949463 }, { "auxiliary_loss_clip": 0.0641521, "auxiliary_loss_mlp": 0.01268073, "balance_loss_clip": 0.06277002, "balance_loss_mlp": 0.01258786, "epoch": 0.9056365549376221, "flos": 20127098937600.0, "grad_norm": 1.695393182216042, "language_loss": 0.71334136, "learning_rate": 9.2626291321936e-08, "loss": 0.79017419, "num_input_tokens_seen": 324817800, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09289551, "step": 15063, "time_per_iteration": 2.5389230251312256 }, { "auxiliary_loss_clip": 0.06409371, "auxiliary_loss_mlp": 0.01264433, "balance_loss_clip": 0.06274392, "balance_loss_mlp": 0.01254759, "epoch": 0.9056966781902901, "flos": 27606002964480.0, "grad_norm": 1.5060088450971227, "language_loss": 0.72483236, "learning_rate": 9.250917693123406e-08, "loss": 0.8015703, "num_input_tokens_seen": 324838445, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.09667969, "step": 15064, "time_per_iteration": 2.5915353298187256 }, { "auxiliary_loss_clip": 0.06413223, "auxiliary_loss_mlp": 0.01262836, "balance_loss_clip": 0.06271501, "balance_loss_mlp": 0.01253651, "epoch": 0.9057568014429581, "flos": 25926986050560.0, "grad_norm": 1.6588377515353963, "language_loss": 0.69929689, "learning_rate": 9.23921348727752e-08, "loss": 0.77605748, "num_input_tokens_seen": 324859895, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09179688, "step": 15065, "time_per_iteration": 2.597255229949951 }, { "auxiliary_loss_clip": 0.06414244, "auxiliary_loss_mlp": 0.0126486, "balance_loss_clip": 0.06276479, "balance_loss_mlp": 0.01255532, "epoch": 0.905816924695626, "flos": 22937093136000.0, "grad_norm": 2.185780999678716, "language_loss": 0.6299358, "learning_rate": 9.227516515099743e-08, "loss": 0.70672685, "num_input_tokens_seen": 324879580, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09326172, "step": 15066, "time_per_iteration": 4.1245481967926025 }, { "auxiliary_loss_clip": 0.0642224, "auxiliary_loss_mlp": 0.0126535, "balance_loss_clip": 0.06275763, "balance_loss_mlp": 0.01254753, "epoch": 0.905877047948294, "flos": 22162894536960.0, "grad_norm": 1.9561530539668264, "language_loss": 0.80360043, "learning_rate": 9.215826777033675e-08, "loss": 0.88047624, "num_input_tokens_seen": 324898950, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.10595703, "step": 15067, "time_per_iteration": 2.547147035598755 }, { "auxiliary_loss_clip": 0.06411273, "auxiliary_loss_mlp": 0.01267176, "balance_loss_clip": 0.06271991, "balance_loss_mlp": 0.01256859, "epoch": 0.905937171200962, "flos": 15310253525760.0, "grad_norm": 1.6881565742726499, "language_loss": 0.70062387, "learning_rate": 9.204144273522563e-08, "loss": 0.77740836, "num_input_tokens_seen": 324917455, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.10327148, "step": 15068, "time_per_iteration": 2.5134785175323486 }, { "auxiliary_loss_clip": 0.06407961, "auxiliary_loss_mlp": 0.01267347, "balance_loss_clip": 0.06273131, "balance_loss_mlp": 0.01258705, "epoch": 0.90599729445363, "flos": 19468914716160.0, "grad_norm": 1.8919770198179995, "language_loss": 0.8563472, "learning_rate": 9.19246900500943e-08, "loss": 0.93310034, "num_input_tokens_seen": 324934495, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.08642578, "step": 15069, "time_per_iteration": 2.510537624359131 }, { "auxiliary_loss_clip": 0.06422167, "auxiliary_loss_mlp": 0.01266162, "balance_loss_clip": 0.06278001, "balance_loss_mlp": 0.01256035, "epoch": 0.9060574177062979, "flos": 23739816850560.0, "grad_norm": 2.172478119757369, "language_loss": 0.59406269, "learning_rate": 9.180800971936987e-08, "loss": 0.670946, "num_input_tokens_seen": 324953230, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10137939, "step": 15070, "time_per_iteration": 2.587470293045044 }, { "auxiliary_loss_clip": 0.06419998, "auxiliary_loss_mlp": 0.01264483, "balance_loss_clip": 0.0627664, "balance_loss_mlp": 0.01254618, "epoch": 0.9061175409589659, "flos": 17316853176960.0, "grad_norm": 1.944900566089708, "language_loss": 0.81633377, "learning_rate": 9.169140174747724e-08, "loss": 0.89317858, "num_input_tokens_seen": 324969880, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09863281, "step": 15071, "time_per_iteration": 2.486431121826172 }, { "auxiliary_loss_clip": 0.0641811, "auxiliary_loss_mlp": 0.01270495, "balance_loss_clip": 0.06274994, "balance_loss_mlp": 0.01260154, "epoch": 0.9061776642116338, "flos": 17783063694720.0, "grad_norm": 2.136156727988082, "language_loss": 0.62162971, "learning_rate": 9.157486613883758e-08, "loss": 0.69851577, "num_input_tokens_seen": 324987005, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.10339355, "step": 15072, "time_per_iteration": 4.0210862159729 }, { "auxiliary_loss_clip": 0.06417632, "auxiliary_loss_mlp": 0.01265781, "balance_loss_clip": 0.06278123, "balance_loss_mlp": 0.0125572, "epoch": 0.9062377874643018, "flos": 42787580146560.0, "grad_norm": 1.655579307313308, "language_loss": 0.73170847, "learning_rate": 9.145840289787021e-08, "loss": 0.80854261, "num_input_tokens_seen": 325010700, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.10070801, "step": 15073, "time_per_iteration": 2.7683584690093994 }, { "auxiliary_loss_clip": 0.06408393, "auxiliary_loss_mlp": 0.01263507, "balance_loss_clip": 0.06273566, "balance_loss_mlp": 0.01254751, "epoch": 0.9062979107169697, "flos": 16367032419840.0, "grad_norm": 2.230925125703999, "language_loss": 0.81222546, "learning_rate": 9.134201202899161e-08, "loss": 0.88894445, "num_input_tokens_seen": 325028760, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.08758545, "step": 15074, "time_per_iteration": 2.5147030353546143 }, { "auxiliary_loss_clip": 0.06316903, "auxiliary_loss_mlp": 0.01252593, "balance_loss_clip": 0.06261843, "balance_loss_mlp": 0.01251643, "epoch": 0.9063580339696378, "flos": 69336286364160.0, "grad_norm": 0.7252371380187516, "language_loss": 0.52284622, "learning_rate": 9.122569353661513e-08, "loss": 0.59854114, "num_input_tokens_seen": 325093545, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.00947571, "step": 15075, "time_per_iteration": 3.2105724811553955 }, { "auxiliary_loss_clip": 0.06320062, "auxiliary_loss_mlp": 0.01251417, "balance_loss_clip": 0.06264732, "balance_loss_mlp": 0.0125041, "epoch": 0.9064181572223057, "flos": 58813388812800.0, "grad_norm": 0.7134348012228713, "language_loss": 0.61943543, "learning_rate": 9.11094474251517e-08, "loss": 0.69515026, "num_input_tokens_seen": 325152295, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.0100708, "step": 15076, "time_per_iteration": 3.070340633392334 }, { "auxiliary_loss_clip": 0.06410736, "auxiliary_loss_mlp": 0.01266748, "balance_loss_clip": 0.06273277, "balance_loss_mlp": 0.01257152, "epoch": 0.9064782804749737, "flos": 21769205328000.0, "grad_norm": 1.8035936115283724, "language_loss": 0.82400429, "learning_rate": 9.09932736990091e-08, "loss": 0.90077919, "num_input_tokens_seen": 325169705, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.0960083, "step": 15077, "time_per_iteration": 2.573647975921631 }, { "auxiliary_loss_clip": 0.06409238, "auxiliary_loss_mlp": 0.0126659, "balance_loss_clip": 0.06273197, "balance_loss_mlp": 0.01258079, "epoch": 0.9065384037276417, "flos": 21403747745280.0, "grad_norm": 1.5211952118845649, "language_loss": 0.8426249, "learning_rate": 9.08771723625934e-08, "loss": 0.91938317, "num_input_tokens_seen": 325189175, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.08514404, "step": 15078, "time_per_iteration": 2.5656208992004395 }, { "auxiliary_loss_clip": 0.06407275, "auxiliary_loss_mlp": 0.01263022, "balance_loss_clip": 0.06274472, "balance_loss_mlp": 0.01254194, "epoch": 0.9065985269803096, "flos": 38291734926720.0, "grad_norm": 2.0764986823385514, "language_loss": 0.65504479, "learning_rate": 9.076114342030617e-08, "loss": 0.73174775, "num_input_tokens_seen": 325211020, "router_z_loss_clip": 1.32910156, "router_z_loss_mlp": 0.08813477, "step": 15079, "time_per_iteration": 4.264319181442261 }, { "auxiliary_loss_clip": 0.06412122, "auxiliary_loss_mlp": 0.01264003, "balance_loss_clip": 0.0627415, "balance_loss_mlp": 0.01254407, "epoch": 0.9066586502329776, "flos": 44828406990720.0, "grad_norm": 1.6183118639693495, "language_loss": 0.71072304, "learning_rate": 9.064518687654765e-08, "loss": 0.78748429, "num_input_tokens_seen": 325236970, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09594727, "step": 15080, "time_per_iteration": 2.7676634788513184 }, { "auxiliary_loss_clip": 0.06424708, "auxiliary_loss_mlp": 0.01262539, "balance_loss_clip": 0.06280024, "balance_loss_mlp": 0.01251971, "epoch": 0.9067187734856456, "flos": 18629825529600.0, "grad_norm": 2.362927090657955, "language_loss": 0.71336055, "learning_rate": 9.052930273571547e-08, "loss": 0.79023302, "num_input_tokens_seen": 325252670, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.10565186, "step": 15081, "time_per_iteration": 2.519916534423828 }, { "auxiliary_loss_clip": 0.06414727, "auxiliary_loss_mlp": 0.01262747, "balance_loss_clip": 0.06277074, "balance_loss_mlp": 0.01252877, "epoch": 0.9067788967383136, "flos": 22754217600000.0, "grad_norm": 1.7549835611937947, "language_loss": 0.74430203, "learning_rate": 9.04134910022032e-08, "loss": 0.82107675, "num_input_tokens_seen": 325273860, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09863281, "step": 15082, "time_per_iteration": 2.5745716094970703 }, { "auxiliary_loss_clip": 0.06409203, "auxiliary_loss_mlp": 0.01265991, "balance_loss_clip": 0.06273258, "balance_loss_mlp": 0.01256842, "epoch": 0.9068390199909815, "flos": 27677853440640.0, "grad_norm": 2.09628917333844, "language_loss": 0.78337574, "learning_rate": 9.029775168040266e-08, "loss": 0.86012769, "num_input_tokens_seen": 325294140, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.0914917, "step": 15083, "time_per_iteration": 2.5952117443084717 }, { "auxiliary_loss_clip": 0.06408315, "auxiliary_loss_mlp": 0.01266307, "balance_loss_clip": 0.06273673, "balance_loss_mlp": 0.01257039, "epoch": 0.9068991432436495, "flos": 24250987883520.0, "grad_norm": 1.5381929995110293, "language_loss": 0.69386208, "learning_rate": 9.01820847747028e-08, "loss": 0.77060837, "num_input_tokens_seen": 325313130, "router_z_loss_clip": 1.34667969, "router_z_loss_mlp": 0.09265137, "step": 15084, "time_per_iteration": 2.568402051925659 }, { "auxiliary_loss_clip": 0.06414123, "auxiliary_loss_mlp": 0.01266351, "balance_loss_clip": 0.0627602, "balance_loss_mlp": 0.01256773, "epoch": 0.9069592664963174, "flos": 28040040714240.0, "grad_norm": 1.862578962605315, "language_loss": 0.67068624, "learning_rate": 9.006649028948965e-08, "loss": 0.747491, "num_input_tokens_seen": 325334880, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.0958252, "step": 15085, "time_per_iteration": 2.5879135131835938 }, { "auxiliary_loss_clip": 0.06319158, "auxiliary_loss_mlp": 0.01251104, "balance_loss_clip": 0.06263703, "balance_loss_mlp": 0.01250057, "epoch": 0.9070193897489854, "flos": 68796479162880.0, "grad_norm": 0.758307161586803, "language_loss": 0.61181915, "learning_rate": 8.995096822914638e-08, "loss": 0.68752176, "num_input_tokens_seen": 325394175, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01048279, "step": 15086, "time_per_iteration": 3.2135729789733887 }, { "auxiliary_loss_clip": 0.0640991, "auxiliary_loss_mlp": 0.01267067, "balance_loss_clip": 0.0627217, "balance_loss_mlp": 0.01257524, "epoch": 0.9070795130016533, "flos": 23448515731200.0, "grad_norm": 1.5667382600776312, "language_loss": 0.72286689, "learning_rate": 8.983551859805416e-08, "loss": 0.7996366, "num_input_tokens_seen": 325415020, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09545898, "step": 15087, "time_per_iteration": 4.054555177688599 }, { "auxiliary_loss_clip": 0.06411871, "auxiliary_loss_mlp": 0.01264436, "balance_loss_clip": 0.06273644, "balance_loss_mlp": 0.01254542, "epoch": 0.9071396362543214, "flos": 18922384460160.0, "grad_norm": 1.9679151421778205, "language_loss": 0.77004492, "learning_rate": 8.972014140059058e-08, "loss": 0.84680796, "num_input_tokens_seen": 325433595, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09899902, "step": 15088, "time_per_iteration": 2.5483930110931396 }, { "auxiliary_loss_clip": 0.06411631, "auxiliary_loss_mlp": 0.01263877, "balance_loss_clip": 0.06279274, "balance_loss_mlp": 0.01255139, "epoch": 0.9071997595069893, "flos": 25235706666240.0, "grad_norm": 1.8047678736547177, "language_loss": 0.73667115, "learning_rate": 8.960483664113038e-08, "loss": 0.81342626, "num_input_tokens_seen": 325451605, "router_z_loss_clip": 1.32519531, "router_z_loss_mlp": 0.08740234, "step": 15089, "time_per_iteration": 2.6515004634857178 }, { "auxiliary_loss_clip": 0.06409746, "auxiliary_loss_mlp": 0.01267444, "balance_loss_clip": 0.06275837, "balance_loss_mlp": 0.01258753, "epoch": 0.9072598827596573, "flos": 24352453578240.0, "grad_norm": 1.6624871693122336, "language_loss": 0.75521064, "learning_rate": 8.948960432404628e-08, "loss": 0.83198261, "num_input_tokens_seen": 325470645, "router_z_loss_clip": 1.33789062, "router_z_loss_mlp": 0.08685303, "step": 15090, "time_per_iteration": 2.5871176719665527 }, { "auxiliary_loss_clip": 0.06413545, "auxiliary_loss_mlp": 0.01267112, "balance_loss_clip": 0.06273305, "balance_loss_mlp": 0.01256854, "epoch": 0.9073200060123253, "flos": 22681654364160.0, "grad_norm": 2.0916838777036806, "language_loss": 0.78018165, "learning_rate": 8.93744444537079e-08, "loss": 0.85698825, "num_input_tokens_seen": 325488070, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.10247803, "step": 15091, "time_per_iteration": 2.546898126602173 }, { "auxiliary_loss_clip": 0.06410541, "auxiliary_loss_mlp": 0.01263584, "balance_loss_clip": 0.0627747, "balance_loss_mlp": 0.01255668, "epoch": 0.9073801292649932, "flos": 23702151640320.0, "grad_norm": 1.5948475335035837, "language_loss": 0.86027986, "learning_rate": 8.925935703448217e-08, "loss": 0.93702114, "num_input_tokens_seen": 325509285, "router_z_loss_clip": 1.33007812, "router_z_loss_mlp": 0.0791626, "step": 15092, "time_per_iteration": 2.5864109992980957 }, { "auxiliary_loss_clip": 0.06413545, "auxiliary_loss_mlp": 0.01263531, "balance_loss_clip": 0.06276447, "balance_loss_mlp": 0.01254346, "epoch": 0.9074402525176612, "flos": 25382636000640.0, "grad_norm": 1.698663208425652, "language_loss": 0.79317582, "learning_rate": 8.914434207073296e-08, "loss": 0.8699466, "num_input_tokens_seen": 325529360, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09179688, "step": 15093, "time_per_iteration": 2.6310317516326904 }, { "auxiliary_loss_clip": 0.06322058, "auxiliary_loss_mlp": 0.01252106, "balance_loss_clip": 0.06266744, "balance_loss_mlp": 0.01251037, "epoch": 0.9075003757703292, "flos": 67667178960000.0, "grad_norm": 0.7520779141420885, "language_loss": 0.57005417, "learning_rate": 8.902939956682188e-08, "loss": 0.64579588, "num_input_tokens_seen": 325583565, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01070404, "step": 15094, "time_per_iteration": 3.1308228969573975 }, { "auxiliary_loss_clip": 0.06415813, "auxiliary_loss_mlp": 0.01263351, "balance_loss_clip": 0.06273177, "balance_loss_mlp": 0.01253344, "epoch": 0.9075604990229972, "flos": 22459897733760.0, "grad_norm": 2.1378002409380596, "language_loss": 0.71814585, "learning_rate": 8.891452952710742e-08, "loss": 0.79493749, "num_input_tokens_seen": 325603690, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09997559, "step": 15095, "time_per_iteration": 2.549154281616211 }, { "auxiliary_loss_clip": 0.06412101, "auxiliary_loss_mlp": 0.01264645, "balance_loss_clip": 0.06274169, "balance_loss_mlp": 0.01255221, "epoch": 0.9076206222756651, "flos": 19542735763200.0, "grad_norm": 1.6398703410469346, "language_loss": 0.74214816, "learning_rate": 8.879973195594526e-08, "loss": 0.81891555, "num_input_tokens_seen": 325622255, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09423828, "step": 15096, "time_per_iteration": 2.6046321392059326 }, { "auxiliary_loss_clip": 0.06415252, "auxiliary_loss_mlp": 0.01262965, "balance_loss_clip": 0.06273586, "balance_loss_mlp": 0.01252171, "epoch": 0.9076807455283331, "flos": 30124654116480.0, "grad_norm": 1.7831294600444416, "language_loss": 0.57083565, "learning_rate": 8.868500685768898e-08, "loss": 0.64761782, "num_input_tokens_seen": 325640165, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.10791016, "step": 15097, "time_per_iteration": 2.592862606048584 }, { "auxiliary_loss_clip": 0.06406213, "auxiliary_loss_mlp": 0.01261235, "balance_loss_clip": 0.06270874, "balance_loss_mlp": 0.01252467, "epoch": 0.907740868781001, "flos": 18703478868480.0, "grad_norm": 1.5639173410084766, "language_loss": 0.79928434, "learning_rate": 8.857035423668935e-08, "loss": 0.8759588, "num_input_tokens_seen": 325659455, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.08770752, "step": 15098, "time_per_iteration": 2.575221538543701 }, { "auxiliary_loss_clip": 0.06417658, "auxiliary_loss_mlp": 0.01261294, "balance_loss_clip": 0.06275289, "balance_loss_mlp": 0.01251805, "epoch": 0.907800992033669, "flos": 22645540454400.0, "grad_norm": 1.893709371132307, "language_loss": 0.66196537, "learning_rate": 8.845577409729266e-08, "loss": 0.73875493, "num_input_tokens_seen": 325678095, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09484863, "step": 15099, "time_per_iteration": 2.5555732250213623 }, { "auxiliary_loss_clip": 0.06415279, "auxiliary_loss_mlp": 0.01265462, "balance_loss_clip": 0.06274261, "balance_loss_mlp": 0.01255001, "epoch": 0.907861115286337, "flos": 21293980496640.0, "grad_norm": 2.0727444272114264, "language_loss": 0.70753664, "learning_rate": 8.834126644384477e-08, "loss": 0.78434408, "num_input_tokens_seen": 325695825, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.10455322, "step": 15100, "time_per_iteration": 2.5737757682800293 }, { "auxiliary_loss_clip": 0.0631808, "auxiliary_loss_mlp": 0.01254543, "balance_loss_clip": 0.06262703, "balance_loss_mlp": 0.01253423, "epoch": 0.907921238539005, "flos": 69759800426880.0, "grad_norm": 0.6142122314462487, "language_loss": 0.53422332, "learning_rate": 8.822683128068775e-08, "loss": 0.60994959, "num_input_tokens_seen": 325764515, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01120758, "step": 15101, "time_per_iteration": 3.225374937057495 }, { "auxiliary_loss_clip": 0.06415159, "auxiliary_loss_mlp": 0.01264171, "balance_loss_clip": 0.06277185, "balance_loss_mlp": 0.01254557, "epoch": 0.9079813617916729, "flos": 23484168443520.0, "grad_norm": 1.8279528495372768, "language_loss": 0.68400544, "learning_rate": 8.811246861216081e-08, "loss": 0.76079869, "num_input_tokens_seen": 325783235, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09613037, "step": 15102, "time_per_iteration": 2.609720230102539 }, { "auxiliary_loss_clip": 0.06411523, "auxiliary_loss_mlp": 0.01266577, "balance_loss_clip": 0.06273594, "balance_loss_mlp": 0.01257142, "epoch": 0.9080414850443409, "flos": 22936590011520.0, "grad_norm": 1.951341755399837, "language_loss": 0.79320645, "learning_rate": 8.799817844260049e-08, "loss": 0.86998749, "num_input_tokens_seen": 325800195, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09442139, "step": 15103, "time_per_iteration": 2.5848426818847656 }, { "auxiliary_loss_clip": 0.06415021, "auxiliary_loss_mlp": 0.01265739, "balance_loss_clip": 0.06273992, "balance_loss_mlp": 0.01256483, "epoch": 0.9081016082970089, "flos": 26184269612160.0, "grad_norm": 1.8073451174524167, "language_loss": 0.72178662, "learning_rate": 8.78839607763413e-08, "loss": 0.79859424, "num_input_tokens_seen": 325820215, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09259033, "step": 15104, "time_per_iteration": 2.608980894088745 }, { "auxiliary_loss_clip": 0.06410496, "auxiliary_loss_mlp": 0.01264197, "balance_loss_clip": 0.06274536, "balance_loss_mlp": 0.01255339, "epoch": 0.9081617315496768, "flos": 24469054934400.0, "grad_norm": 1.633965489993663, "language_loss": 0.775527, "learning_rate": 8.77698156177138e-08, "loss": 0.85227394, "num_input_tokens_seen": 325838415, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.08850098, "step": 15105, "time_per_iteration": 2.5577025413513184 }, { "auxiliary_loss_clip": 0.06417312, "auxiliary_loss_mlp": 0.01264452, "balance_loss_clip": 0.06277694, "balance_loss_mlp": 0.01255148, "epoch": 0.9082218548023449, "flos": 24752599551360.0, "grad_norm": 1.9435382325396253, "language_loss": 0.74149847, "learning_rate": 8.765574297104628e-08, "loss": 0.8183161, "num_input_tokens_seen": 325855580, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09301758, "step": 15106, "time_per_iteration": 3.9736437797546387 }, { "auxiliary_loss_clip": 0.06418771, "auxiliary_loss_mlp": 0.01264271, "balance_loss_clip": 0.06277223, "balance_loss_mlp": 0.01254091, "epoch": 0.9082819780550128, "flos": 24427448582400.0, "grad_norm": 1.5902750025466539, "language_loss": 0.80566216, "learning_rate": 8.754174284066462e-08, "loss": 0.88249254, "num_input_tokens_seen": 325874890, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.10174561, "step": 15107, "time_per_iteration": 2.599802255630493 }, { "auxiliary_loss_clip": 0.06319433, "auxiliary_loss_mlp": 0.01251178, "balance_loss_clip": 0.0626398, "balance_loss_mlp": 0.01250115, "epoch": 0.9083421013076808, "flos": 59630535429120.0, "grad_norm": 0.8212910827646733, "language_loss": 0.59859776, "learning_rate": 8.742781523089205e-08, "loss": 0.67430389, "num_input_tokens_seen": 325935835, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01064301, "step": 15108, "time_per_iteration": 3.1364948749542236 }, { "auxiliary_loss_clip": 0.06411491, "auxiliary_loss_mlp": 0.0126178, "balance_loss_clip": 0.0627095, "balance_loss_mlp": 0.01252803, "epoch": 0.9084022245603487, "flos": 33628652956800.0, "grad_norm": 1.8860790675347707, "language_loss": 0.74106854, "learning_rate": 8.73139601460482e-08, "loss": 0.8178013, "num_input_tokens_seen": 325958035, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.08972168, "step": 15109, "time_per_iteration": 2.6716275215148926 }, { "auxiliary_loss_clip": 0.06408379, "auxiliary_loss_mlp": 0.01264863, "balance_loss_clip": 0.06271668, "balance_loss_mlp": 0.0125557, "epoch": 0.9084623478130167, "flos": 24978465031680.0, "grad_norm": 1.5691605398593584, "language_loss": 0.71857285, "learning_rate": 8.720017759045073e-08, "loss": 0.79530525, "num_input_tokens_seen": 325979870, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09283447, "step": 15110, "time_per_iteration": 2.639317750930786 }, { "auxiliary_loss_clip": 0.06410738, "auxiliary_loss_mlp": 0.01264322, "balance_loss_clip": 0.06274427, "balance_loss_mlp": 0.0125543, "epoch": 0.9085224710656846, "flos": 31468918769280.0, "grad_norm": 1.7845872108702006, "language_loss": 0.6854645, "learning_rate": 8.708646756841421e-08, "loss": 0.76221514, "num_input_tokens_seen": 325998245, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.08898926, "step": 15111, "time_per_iteration": 2.6438183784484863 }, { "auxiliary_loss_clip": 0.06315692, "auxiliary_loss_mlp": 0.01251448, "balance_loss_clip": 0.0626042, "balance_loss_mlp": 0.01250481, "epoch": 0.9085825943183526, "flos": 64935450074880.0, "grad_norm": 0.6867337676004893, "language_loss": 0.5166871, "learning_rate": 8.697283008425026e-08, "loss": 0.59235847, "num_input_tokens_seen": 326061770, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00965881, "step": 15112, "time_per_iteration": 4.740037202835083 }, { "auxiliary_loss_clip": 0.06413032, "auxiliary_loss_mlp": 0.01263762, "balance_loss_clip": 0.06273934, "balance_loss_mlp": 0.0125466, "epoch": 0.9086427175710206, "flos": 18959253056640.0, "grad_norm": 1.8950818728695529, "language_loss": 0.70109451, "learning_rate": 8.685926514226837e-08, "loss": 0.77786243, "num_input_tokens_seen": 326080945, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09106445, "step": 15113, "time_per_iteration": 2.551482915878296 }, { "auxiliary_loss_clip": 0.06410578, "auxiliary_loss_mlp": 0.01266118, "balance_loss_clip": 0.06271577, "balance_loss_mlp": 0.01256546, "epoch": 0.9087028408236886, "flos": 34022258311680.0, "grad_norm": 2.194065097931696, "language_loss": 0.79158908, "learning_rate": 8.674577274677508e-08, "loss": 0.86835599, "num_input_tokens_seen": 326100630, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09576416, "step": 15114, "time_per_iteration": 2.682227373123169 }, { "auxiliary_loss_clip": 0.06424482, "auxiliary_loss_mlp": 0.01267096, "balance_loss_clip": 0.06280908, "balance_loss_mlp": 0.01256075, "epoch": 0.9087629640763565, "flos": 21951032688000.0, "grad_norm": 1.8377539702991115, "language_loss": 0.70863056, "learning_rate": 8.663235290207405e-08, "loss": 0.7855463, "num_input_tokens_seen": 326120145, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.11022949, "step": 15115, "time_per_iteration": 2.5450007915496826 }, { "auxiliary_loss_clip": 0.06422465, "auxiliary_loss_mlp": 0.0126524, "balance_loss_clip": 0.06279109, "balance_loss_mlp": 0.01254863, "epoch": 0.9088230873290245, "flos": 21769456890240.0, "grad_norm": 1.4778328262565408, "language_loss": 0.65949762, "learning_rate": 8.651900561246561e-08, "loss": 0.73637462, "num_input_tokens_seen": 326140715, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.1038208, "step": 15116, "time_per_iteration": 2.594804286956787 }, { "auxiliary_loss_clip": 0.06407989, "auxiliary_loss_mlp": 0.0126706, "balance_loss_clip": 0.06274615, "balance_loss_mlp": 0.01257428, "epoch": 0.9088832105816925, "flos": 21547322916480.0, "grad_norm": 1.562413963573179, "language_loss": 0.69298583, "learning_rate": 8.640573088224812e-08, "loss": 0.76973635, "num_input_tokens_seen": 326159130, "router_z_loss_clip": 1.33496094, "router_z_loss_mlp": 0.09631348, "step": 15117, "time_per_iteration": 2.5474910736083984 }, { "auxiliary_loss_clip": 0.06413186, "auxiliary_loss_mlp": 0.01268931, "balance_loss_clip": 0.06275384, "balance_loss_mlp": 0.01259537, "epoch": 0.9089433338343604, "flos": 26004203187840.0, "grad_norm": 1.3991528573106617, "language_loss": 0.74631441, "learning_rate": 8.629252871571745e-08, "loss": 0.82313561, "num_input_tokens_seen": 326181375, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09399414, "step": 15118, "time_per_iteration": 2.605512857437134 }, { "auxiliary_loss_clip": 0.064226, "auxiliary_loss_mlp": 0.01266114, "balance_loss_clip": 0.0627622, "balance_loss_mlp": 0.0125529, "epoch": 0.9090034570870285, "flos": 21184758299520.0, "grad_norm": 2.0892412868608043, "language_loss": 0.73473668, "learning_rate": 8.617939911716554e-08, "loss": 0.81162375, "num_input_tokens_seen": 326199740, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.1081543, "step": 15119, "time_per_iteration": 3.973588466644287 }, { "auxiliary_loss_clip": 0.06423807, "auxiliary_loss_mlp": 0.0126463, "balance_loss_clip": 0.06279336, "balance_loss_mlp": 0.01253883, "epoch": 0.9090635803396964, "flos": 16147036725120.0, "grad_norm": 2.3257972784151524, "language_loss": 0.71565962, "learning_rate": 8.60663420908827e-08, "loss": 0.79254401, "num_input_tokens_seen": 326214350, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10748291, "step": 15120, "time_per_iteration": 2.53613543510437 }, { "auxiliary_loss_clip": 0.06414554, "auxiliary_loss_mlp": 0.01263477, "balance_loss_clip": 0.0627328, "balance_loss_mlp": 0.01253726, "epoch": 0.9091237035923644, "flos": 20597250597120.0, "grad_norm": 3.2128458317193958, "language_loss": 0.66515028, "learning_rate": 8.595335764115596e-08, "loss": 0.7419306, "num_input_tokens_seen": 326234580, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09747314, "step": 15121, "time_per_iteration": 2.5764482021331787 }, { "auxiliary_loss_clip": 0.06415338, "auxiliary_loss_mlp": 0.01268114, "balance_loss_clip": 0.06275773, "balance_loss_mlp": 0.01258291, "epoch": 0.9091838268450323, "flos": 52239275902080.0, "grad_norm": 1.7842278299934684, "language_loss": 0.70610797, "learning_rate": 8.58404457722699e-08, "loss": 0.78294253, "num_input_tokens_seen": 326259080, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09820557, "step": 15122, "time_per_iteration": 2.8414812088012695 }, { "auxiliary_loss_clip": 0.06409757, "auxiliary_loss_mlp": 0.01262257, "balance_loss_clip": 0.06273136, "balance_loss_mlp": 0.01253489, "epoch": 0.9092439500977003, "flos": 20566084078080.0, "grad_norm": 1.2419594717725981, "language_loss": 0.74566776, "learning_rate": 8.572760648850575e-08, "loss": 0.82238793, "num_input_tokens_seen": 326280175, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.08764648, "step": 15123, "time_per_iteration": 2.5766615867614746 }, { "auxiliary_loss_clip": 0.06410974, "auxiliary_loss_mlp": 0.01263028, "balance_loss_clip": 0.06275328, "balance_loss_mlp": 0.01254028, "epoch": 0.9093040733503682, "flos": 28624823159040.0, "grad_norm": 2.3382766152297383, "language_loss": 0.7617259, "learning_rate": 8.561483979414253e-08, "loss": 0.83846587, "num_input_tokens_seen": 326297990, "router_z_loss_clip": 1.35644531, "router_z_loss_mlp": 0.09008789, "step": 15124, "time_per_iteration": 2.648848056793213 }, { "auxiliary_loss_clip": 0.06409019, "auxiliary_loss_mlp": 0.01268612, "balance_loss_clip": 0.06271614, "balance_loss_mlp": 0.01259088, "epoch": 0.9093641966030362, "flos": 23446838649600.0, "grad_norm": 1.876358311426104, "language_loss": 0.72478986, "learning_rate": 8.55021456934566e-08, "loss": 0.80156618, "num_input_tokens_seen": 326316735, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09527588, "step": 15125, "time_per_iteration": 2.575897216796875 }, { "auxiliary_loss_clip": 0.06409997, "auxiliary_loss_mlp": 0.01262044, "balance_loss_clip": 0.06274711, "balance_loss_mlp": 0.01252907, "epoch": 0.9094243198557042, "flos": 16805807925120.0, "grad_norm": 1.5387264088907147, "language_loss": 0.79364669, "learning_rate": 8.538952419072143e-08, "loss": 0.87036705, "num_input_tokens_seen": 326334370, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.09130859, "step": 15126, "time_per_iteration": 4.052874326705933 }, { "auxiliary_loss_clip": 0.06411502, "auxiliary_loss_mlp": 0.01265944, "balance_loss_clip": 0.06276436, "balance_loss_mlp": 0.01256985, "epoch": 0.9094844431083722, "flos": 24279051801600.0, "grad_norm": 1.6763793054443088, "language_loss": 0.76012969, "learning_rate": 8.527697529020694e-08, "loss": 0.83690417, "num_input_tokens_seen": 326353435, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.08950806, "step": 15127, "time_per_iteration": 2.566458225250244 }, { "auxiliary_loss_clip": 0.06412877, "auxiliary_loss_mlp": 0.01266854, "balance_loss_clip": 0.06272695, "balance_loss_mlp": 0.01257723, "epoch": 0.9095445663610401, "flos": 21951116542080.0, "grad_norm": 1.796335325145551, "language_loss": 0.62491345, "learning_rate": 8.516449899618173e-08, "loss": 0.70171076, "num_input_tokens_seen": 326371810, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09136963, "step": 15128, "time_per_iteration": 2.5770068168640137 }, { "auxiliary_loss_clip": 0.06413125, "auxiliary_loss_mlp": 0.01265245, "balance_loss_clip": 0.06274714, "balance_loss_mlp": 0.01255547, "epoch": 0.9096046896137081, "flos": 19799096929920.0, "grad_norm": 1.7946349704306677, "language_loss": 0.77094185, "learning_rate": 8.505209531291013e-08, "loss": 0.84772563, "num_input_tokens_seen": 326391380, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09698486, "step": 15129, "time_per_iteration": 2.544419050216675 }, { "auxiliary_loss_clip": 0.06417202, "auxiliary_loss_mlp": 0.01263219, "balance_loss_clip": 0.06276132, "balance_loss_mlp": 0.01253611, "epoch": 0.909664812866376, "flos": 22644701913600.0, "grad_norm": 1.808627053984091, "language_loss": 0.83733857, "learning_rate": 8.49397642446552e-08, "loss": 0.91414279, "num_input_tokens_seen": 326408800, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09613037, "step": 15130, "time_per_iteration": 2.5968520641326904 }, { "auxiliary_loss_clip": 0.0641771, "auxiliary_loss_mlp": 0.01262806, "balance_loss_clip": 0.062782, "balance_loss_mlp": 0.01252846, "epoch": 0.909724936119044, "flos": 39860439540480.0, "grad_norm": 1.8882016974482072, "language_loss": 0.75105697, "learning_rate": 8.482750579567644e-08, "loss": 0.82786214, "num_input_tokens_seen": 326431565, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09967041, "step": 15131, "time_per_iteration": 2.709406852722168 }, { "auxiliary_loss_clip": 0.064127, "auxiliary_loss_mlp": 0.01262099, "balance_loss_clip": 0.06274008, "balance_loss_mlp": 0.01252789, "epoch": 0.9097850593717121, "flos": 35078953351680.0, "grad_norm": 2.2344300426785755, "language_loss": 0.60186219, "learning_rate": 8.471531997023085e-08, "loss": 0.67861021, "num_input_tokens_seen": 326451715, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09307861, "step": 15132, "time_per_iteration": 2.671396255493164 }, { "auxiliary_loss_clip": 0.06409238, "auxiliary_loss_mlp": 0.01261849, "balance_loss_clip": 0.06271519, "balance_loss_mlp": 0.0125345, "epoch": 0.90984518262438, "flos": 23374149632640.0, "grad_norm": 1.600030863906877, "language_loss": 0.8255676, "learning_rate": 8.460320677257193e-08, "loss": 0.90227848, "num_input_tokens_seen": 326470855, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.08404541, "step": 15133, "time_per_iteration": 2.548733711242676 }, { "auxiliary_loss_clip": 0.06410608, "auxiliary_loss_mlp": 0.01263212, "balance_loss_clip": 0.06271823, "balance_loss_mlp": 0.01253431, "epoch": 0.909905305877048, "flos": 27530085565440.0, "grad_norm": 5.190957366063886, "language_loss": 0.74099725, "learning_rate": 8.449116620695118e-08, "loss": 0.81773543, "num_input_tokens_seen": 326490480, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09783936, "step": 15134, "time_per_iteration": 2.6631596088409424 }, { "auxiliary_loss_clip": 0.06425627, "auxiliary_loss_mlp": 0.01265301, "balance_loss_clip": 0.06279181, "balance_loss_mlp": 0.01255437, "epoch": 0.9099654291297159, "flos": 24353921024640.0, "grad_norm": 1.4634941296591666, "language_loss": 0.73028851, "learning_rate": 8.437919827761786e-08, "loss": 0.80719781, "num_input_tokens_seen": 326509445, "router_z_loss_clip": 1.46386719, "router_z_loss_mlp": 0.09863281, "step": 15135, "time_per_iteration": 2.548583745956421 }, { "auxiliary_loss_clip": 0.0641198, "auxiliary_loss_mlp": 0.01261681, "balance_loss_clip": 0.06276079, "balance_loss_mlp": 0.01252335, "epoch": 0.9100255523823839, "flos": 21221626896000.0, "grad_norm": 1.6399975078427573, "language_loss": 0.69585186, "learning_rate": 8.426730298881702e-08, "loss": 0.77258849, "num_input_tokens_seen": 326528380, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.09344482, "step": 15136, "time_per_iteration": 2.6409659385681152 }, { "auxiliary_loss_clip": 0.06314959, "auxiliary_loss_mlp": 0.01252014, "balance_loss_clip": 0.06259699, "balance_loss_mlp": 0.01251002, "epoch": 0.9100856756350518, "flos": 46067292005760.0, "grad_norm": 0.8122998713501678, "language_loss": 0.59233081, "learning_rate": 8.415548034479214e-08, "loss": 0.66800058, "num_input_tokens_seen": 326576940, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01011658, "step": 15137, "time_per_iteration": 2.9486145973205566 }, { "auxiliary_loss_clip": 0.06416357, "auxiliary_loss_mlp": 0.01265912, "balance_loss_clip": 0.06276552, "balance_loss_mlp": 0.01256935, "epoch": 0.9101457988877198, "flos": 20236111499520.0, "grad_norm": 1.487038347768811, "language_loss": 0.82436562, "learning_rate": 8.40437303497834e-08, "loss": 0.90118831, "num_input_tokens_seen": 326596100, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.0897522, "step": 15138, "time_per_iteration": 2.5636613368988037 }, { "auxiliary_loss_clip": 0.06408723, "auxiliary_loss_mlp": 0.01260793, "balance_loss_clip": 0.06275154, "balance_loss_mlp": 0.01252437, "epoch": 0.9102059221403878, "flos": 26622458138880.0, "grad_norm": 1.5157407066666282, "language_loss": 0.81473047, "learning_rate": 8.39320530080283e-08, "loss": 0.89142561, "num_input_tokens_seen": 326615700, "router_z_loss_clip": 1.33691406, "router_z_loss_mlp": 0.08361816, "step": 15139, "time_per_iteration": 2.6010777950286865 }, { "auxiliary_loss_clip": 0.06415561, "auxiliary_loss_mlp": 0.0126373, "balance_loss_clip": 0.06278475, "balance_loss_mlp": 0.01254396, "epoch": 0.9102660453930558, "flos": 21915086486400.0, "grad_norm": 1.6052244069690047, "language_loss": 0.77400494, "learning_rate": 8.382044832376167e-08, "loss": 0.85079777, "num_input_tokens_seen": 326635905, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09332275, "step": 15140, "time_per_iteration": 2.5776352882385254 }, { "auxiliary_loss_clip": 0.06411747, "auxiliary_loss_mlp": 0.01262404, "balance_loss_clip": 0.06274043, "balance_loss_mlp": 0.01253512, "epoch": 0.9103261686457237, "flos": 36185933640960.0, "grad_norm": 1.4965973547186942, "language_loss": 0.66638386, "learning_rate": 8.370891630121569e-08, "loss": 0.74312538, "num_input_tokens_seen": 326661855, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.08898926, "step": 15141, "time_per_iteration": 2.706035852432251 }, { "auxiliary_loss_clip": 0.06417264, "auxiliary_loss_mlp": 0.01266317, "balance_loss_clip": 0.06276146, "balance_loss_mlp": 0.0125644, "epoch": 0.9103862918983917, "flos": 23885362592640.0, "grad_norm": 2.1852040241549977, "language_loss": 0.75166285, "learning_rate": 8.359745694462005e-08, "loss": 0.8284986, "num_input_tokens_seen": 326679320, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09875488, "step": 15142, "time_per_iteration": 2.560786724090576 }, { "auxiliary_loss_clip": 0.06410165, "auxiliary_loss_mlp": 0.01264842, "balance_loss_clip": 0.0627256, "balance_loss_mlp": 0.01255789, "epoch": 0.9104464151510596, "flos": 14944837870080.0, "grad_norm": 1.6102966717507725, "language_loss": 0.6458658, "learning_rate": 8.348607025820076e-08, "loss": 0.72261584, "num_input_tokens_seen": 326698110, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09057617, "step": 15143, "time_per_iteration": 2.5219738483428955 }, { "auxiliary_loss_clip": 0.06416455, "auxiliary_loss_mlp": 0.01264752, "balance_loss_clip": 0.06275736, "balance_loss_mlp": 0.01254757, "epoch": 0.9105065384037276, "flos": 33664096033920.0, "grad_norm": 2.0085570832640394, "language_loss": 0.61642951, "learning_rate": 8.337475624618152e-08, "loss": 0.6932416, "num_input_tokens_seen": 326718370, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.10003662, "step": 15144, "time_per_iteration": 2.6678521633148193 }, { "auxiliary_loss_clip": 0.06402871, "auxiliary_loss_mlp": 0.01268148, "balance_loss_clip": 0.06271844, "balance_loss_mlp": 0.01259094, "epoch": 0.9105666616563957, "flos": 24323634973440.0, "grad_norm": 1.6188738631774862, "language_loss": 0.71105182, "learning_rate": 8.326351491278382e-08, "loss": 0.78776199, "num_input_tokens_seen": 326738445, "router_z_loss_clip": 1.31054688, "router_z_loss_mlp": 0.09057617, "step": 15145, "time_per_iteration": 2.562516689300537 }, { "auxiliary_loss_clip": 0.06406651, "auxiliary_loss_mlp": 0.01264009, "balance_loss_clip": 0.0627235, "balance_loss_mlp": 0.01254526, "epoch": 0.9106267849090636, "flos": 29979527644800.0, "grad_norm": 1.4956977155032736, "language_loss": 0.71048999, "learning_rate": 8.315234626222545e-08, "loss": 0.78719664, "num_input_tokens_seen": 326758855, "router_z_loss_clip": 1.34179688, "router_z_loss_mlp": 0.0947876, "step": 15146, "time_per_iteration": 4.078725814819336 }, { "auxiliary_loss_clip": 0.06413657, "auxiliary_loss_mlp": 0.01263953, "balance_loss_clip": 0.06275909, "balance_loss_mlp": 0.01255102, "epoch": 0.9106869081617316, "flos": 25344761155200.0, "grad_norm": 2.0193298623802436, "language_loss": 0.7314086, "learning_rate": 8.304125029872233e-08, "loss": 0.80818468, "num_input_tokens_seen": 326777140, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.08843994, "step": 15147, "time_per_iteration": 2.5815792083740234 }, { "auxiliary_loss_clip": 0.06417474, "auxiliary_loss_mlp": 0.01268803, "balance_loss_clip": 0.06274652, "balance_loss_mlp": 0.01259505, "epoch": 0.9107470314143995, "flos": 18192936741120.0, "grad_norm": 2.0141528922205, "language_loss": 0.79866832, "learning_rate": 8.293022702648711e-08, "loss": 0.87553108, "num_input_tokens_seen": 326794070, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.09301758, "step": 15148, "time_per_iteration": 2.525223731994629 }, { "auxiliary_loss_clip": 0.06416733, "auxiliary_loss_mlp": 0.01263918, "balance_loss_clip": 0.06275209, "balance_loss_mlp": 0.01254143, "epoch": 0.9108071546670675, "flos": 23557696001280.0, "grad_norm": 1.9358971193848737, "language_loss": 0.68017483, "learning_rate": 8.281927644972996e-08, "loss": 0.75698131, "num_input_tokens_seen": 326814695, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09771729, "step": 15149, "time_per_iteration": 2.5708985328674316 }, { "auxiliary_loss_clip": 0.0641683, "auxiliary_loss_mlp": 0.01266653, "balance_loss_clip": 0.06277083, "balance_loss_mlp": 0.01256997, "epoch": 0.9108672779197354, "flos": 25637487793920.0, "grad_norm": 2.7760485855300185, "language_loss": 0.63460863, "learning_rate": 8.270839857265776e-08, "loss": 0.71144354, "num_input_tokens_seen": 326835295, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09655762, "step": 15150, "time_per_iteration": 2.614487886428833 }, { "auxiliary_loss_clip": 0.06414288, "auxiliary_loss_mlp": 0.0126488, "balance_loss_clip": 0.06274574, "balance_loss_mlp": 0.01256166, "epoch": 0.9109274011724035, "flos": 22344470334720.0, "grad_norm": 2.2305570077197974, "language_loss": 0.73338342, "learning_rate": 8.259759339947514e-08, "loss": 0.81017512, "num_input_tokens_seen": 326853350, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.0871582, "step": 15151, "time_per_iteration": 3.9921929836273193 }, { "auxiliary_loss_clip": 0.06410289, "auxiliary_loss_mlp": 0.01265447, "balance_loss_clip": 0.06272022, "balance_loss_mlp": 0.01256256, "epoch": 0.9109875244250714, "flos": 26695524499200.0, "grad_norm": 1.8310894790826289, "language_loss": 0.64651716, "learning_rate": 8.248686093438429e-08, "loss": 0.72327459, "num_input_tokens_seen": 326873425, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09191895, "step": 15152, "time_per_iteration": 2.6692047119140625 }, { "auxiliary_loss_clip": 0.06416015, "auxiliary_loss_mlp": 0.01267633, "balance_loss_clip": 0.06277184, "balance_loss_mlp": 0.01257685, "epoch": 0.9110476476777394, "flos": 22936799646720.0, "grad_norm": 1.9510584313497603, "language_loss": 0.73399746, "learning_rate": 8.23762011815834e-08, "loss": 0.81083393, "num_input_tokens_seen": 326893455, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.0994873, "step": 15153, "time_per_iteration": 2.6537201404571533 }, { "auxiliary_loss_clip": 0.06416717, "auxiliary_loss_mlp": 0.01264, "balance_loss_clip": 0.06277049, "balance_loss_mlp": 0.01254547, "epoch": 0.9111077709304073, "flos": 13476718483200.0, "grad_norm": 1.8727017360158935, "language_loss": 0.72429299, "learning_rate": 8.226561414526956e-08, "loss": 0.80110013, "num_input_tokens_seen": 326910210, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09448242, "step": 15154, "time_per_iteration": 2.5262913703918457 }, { "auxiliary_loss_clip": 0.06410187, "auxiliary_loss_mlp": 0.0126474, "balance_loss_clip": 0.06275052, "balance_loss_mlp": 0.01255221, "epoch": 0.9111678941830753, "flos": 20856924000000.0, "grad_norm": 1.793822934243697, "language_loss": 0.8244642, "learning_rate": 8.215509982963564e-08, "loss": 0.90121347, "num_input_tokens_seen": 326929350, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.09515381, "step": 15155, "time_per_iteration": 2.56331729888916 }, { "auxiliary_loss_clip": 0.06416463, "auxiliary_loss_mlp": 0.01266815, "balance_loss_clip": 0.06279399, "balance_loss_mlp": 0.01257404, "epoch": 0.9112280174357432, "flos": 19688281505280.0, "grad_norm": 1.5765063313601997, "language_loss": 0.59897119, "learning_rate": 8.204465823887252e-08, "loss": 0.67580402, "num_input_tokens_seen": 326949060, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09411621, "step": 15156, "time_per_iteration": 2.58705472946167 }, { "auxiliary_loss_clip": 0.06418441, "auxiliary_loss_mlp": 0.0126539, "balance_loss_clip": 0.06276637, "balance_loss_mlp": 0.01255144, "epoch": 0.9112881406884112, "flos": 25454192987520.0, "grad_norm": 2.70531016720619, "language_loss": 0.74294066, "learning_rate": 8.193428937716796e-08, "loss": 0.81977904, "num_input_tokens_seen": 326968950, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.10241699, "step": 15157, "time_per_iteration": 2.591799259185791 }, { "auxiliary_loss_clip": 0.06413234, "auxiliary_loss_mlp": 0.01261088, "balance_loss_clip": 0.062728, "balance_loss_mlp": 0.01252338, "epoch": 0.9113482639410793, "flos": 33074324271360.0, "grad_norm": 1.8853930517119688, "language_loss": 0.59416854, "learning_rate": 8.182399324870747e-08, "loss": 0.67091167, "num_input_tokens_seen": 326989455, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.08752441, "step": 15158, "time_per_iteration": 4.078821420669556 }, { "auxiliary_loss_clip": 0.06410663, "auxiliary_loss_mlp": 0.01264453, "balance_loss_clip": 0.06273724, "balance_loss_mlp": 0.01255637, "epoch": 0.9114083871937472, "flos": 21842103980160.0, "grad_norm": 1.4562103376500788, "language_loss": 0.68112129, "learning_rate": 8.171376985767375e-08, "loss": 0.75787246, "num_input_tokens_seen": 327009640, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.0881958, "step": 15159, "time_per_iteration": 2.5382938385009766 }, { "auxiliary_loss_clip": 0.06415001, "auxiliary_loss_mlp": 0.01262327, "balance_loss_clip": 0.0627552, "balance_loss_mlp": 0.01253172, "epoch": 0.9114685104464152, "flos": 27096299377920.0, "grad_norm": 1.8449657868543394, "language_loss": 0.78628421, "learning_rate": 8.160361920824588e-08, "loss": 0.86305749, "num_input_tokens_seen": 327027690, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09143066, "step": 15160, "time_per_iteration": 2.5735208988189697 }, { "auxiliary_loss_clip": 0.06420578, "auxiliary_loss_mlp": 0.01263854, "balance_loss_clip": 0.06280611, "balance_loss_mlp": 0.01254193, "epoch": 0.9115286336990831, "flos": 17972731411200.0, "grad_norm": 1.868471325538737, "language_loss": 0.6941635, "learning_rate": 8.149354130460073e-08, "loss": 0.77100778, "num_input_tokens_seen": 327045915, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09674072, "step": 15161, "time_per_iteration": 2.519155263900757 }, { "auxiliary_loss_clip": 0.06414892, "auxiliary_loss_mlp": 0.01264198, "balance_loss_clip": 0.06273881, "balance_loss_mlp": 0.01254006, "epoch": 0.9115887569517511, "flos": 22936506157440.0, "grad_norm": 1.6698053623790792, "language_loss": 0.76387852, "learning_rate": 8.138353615091321e-08, "loss": 0.84066939, "num_input_tokens_seen": 327066355, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.10192871, "step": 15162, "time_per_iteration": 2.520914316177368 }, { "auxiliary_loss_clip": 0.06410982, "auxiliary_loss_mlp": 0.01264053, "balance_loss_clip": 0.0627204, "balance_loss_mlp": 0.01254695, "epoch": 0.911648880204419, "flos": 23995339476480.0, "grad_norm": 1.8575175804604616, "language_loss": 0.6687777, "learning_rate": 8.127360375135395e-08, "loss": 0.74552804, "num_input_tokens_seen": 327086735, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09356689, "step": 15163, "time_per_iteration": 2.5662765502929688 }, { "auxiliary_loss_clip": 0.06421466, "auxiliary_loss_mlp": 0.01262935, "balance_loss_clip": 0.06276174, "balance_loss_mlp": 0.0125332, "epoch": 0.911709003457087, "flos": 17060911280640.0, "grad_norm": 3.1000350718713556, "language_loss": 0.70685732, "learning_rate": 8.116374411009186e-08, "loss": 0.7837013, "num_input_tokens_seen": 327104035, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.09619141, "step": 15164, "time_per_iteration": 2.50892972946167 }, { "auxiliary_loss_clip": 0.06407882, "auxiliary_loss_mlp": 0.0126532, "balance_loss_clip": 0.06274468, "balance_loss_mlp": 0.01256754, "epoch": 0.911769126709755, "flos": 21659857349760.0, "grad_norm": 1.3408666266556297, "language_loss": 0.76031595, "learning_rate": 8.105395723129315e-08, "loss": 0.83704793, "num_input_tokens_seen": 327124370, "router_z_loss_clip": 1.33398438, "router_z_loss_mlp": 0.08557129, "step": 15165, "time_per_iteration": 2.542870044708252 }, { "auxiliary_loss_clip": 0.06413391, "auxiliary_loss_mlp": 0.0126314, "balance_loss_clip": 0.06273954, "balance_loss_mlp": 0.01253276, "epoch": 0.911829249962423, "flos": 24797224650240.0, "grad_norm": 2.4363484977058074, "language_loss": 0.72155416, "learning_rate": 8.094424311912074e-08, "loss": 0.79831946, "num_input_tokens_seen": 327140915, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09869385, "step": 15166, "time_per_iteration": 3.8637592792510986 }, { "auxiliary_loss_clip": 0.06416157, "auxiliary_loss_mlp": 0.01265404, "balance_loss_clip": 0.06274302, "balance_loss_mlp": 0.01255283, "epoch": 0.9118893732150909, "flos": 20965684999680.0, "grad_norm": 2.690158026250377, "language_loss": 0.73550206, "learning_rate": 8.083460177773482e-08, "loss": 0.81231767, "num_input_tokens_seen": 327158940, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.10119629, "step": 15167, "time_per_iteration": 2.537846326828003 }, { "auxiliary_loss_clip": 0.06312051, "auxiliary_loss_mlp": 0.01249715, "balance_loss_clip": 0.06257008, "balance_loss_mlp": 0.01248732, "epoch": 0.9119494964677589, "flos": 67937753393280.0, "grad_norm": 0.7595193278897884, "language_loss": 0.65559471, "learning_rate": 8.072503321129298e-08, "loss": 0.73121238, "num_input_tokens_seen": 327217450, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.00981903, "step": 15168, "time_per_iteration": 3.131540060043335 }, { "auxiliary_loss_clip": 0.06409157, "auxiliary_loss_mlp": 0.012632, "balance_loss_clip": 0.06272057, "balance_loss_mlp": 0.01254745, "epoch": 0.9120096197204268, "flos": 18557430001920.0, "grad_norm": 1.8818910871867447, "language_loss": 0.78597879, "learning_rate": 8.061553742395033e-08, "loss": 0.86270237, "num_input_tokens_seen": 327233905, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.08447266, "step": 15169, "time_per_iteration": 2.542131185531616 }, { "auxiliary_loss_clip": 0.06414281, "auxiliary_loss_mlp": 0.01266295, "balance_loss_clip": 0.06275417, "balance_loss_mlp": 0.01257045, "epoch": 0.9120697429730948, "flos": 19031690511360.0, "grad_norm": 1.5645069126242659, "language_loss": 0.82416904, "learning_rate": 8.05061144198591e-08, "loss": 0.90097475, "num_input_tokens_seen": 327252430, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09246826, "step": 15170, "time_per_iteration": 2.546452045440674 }, { "auxiliary_loss_clip": 0.0641734, "auxiliary_loss_mlp": 0.01266327, "balance_loss_clip": 0.06278239, "balance_loss_mlp": 0.01256355, "epoch": 0.9121298662257629, "flos": 17169127228800.0, "grad_norm": 2.532802964090018, "language_loss": 0.77411652, "learning_rate": 8.039676420316799e-08, "loss": 0.85095322, "num_input_tokens_seen": 327269215, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09973145, "step": 15171, "time_per_iteration": 2.526451587677002 }, { "auxiliary_loss_clip": 0.06407108, "auxiliary_loss_mlp": 0.0126552, "balance_loss_clip": 0.06270748, "balance_loss_mlp": 0.01255578, "epoch": 0.9121899894784308, "flos": 19688826556800.0, "grad_norm": 1.2598262476234166, "language_loss": 0.6719355, "learning_rate": 8.02874867780241e-08, "loss": 0.74866182, "num_input_tokens_seen": 327290320, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.09942627, "step": 15172, "time_per_iteration": 2.563462018966675 }, { "auxiliary_loss_clip": 0.06414051, "auxiliary_loss_mlp": 0.01266602, "balance_loss_clip": 0.06274348, "balance_loss_mlp": 0.01256583, "epoch": 0.9122501127310988, "flos": 22242124172160.0, "grad_norm": 1.7570744762446093, "language_loss": 0.75346649, "learning_rate": 8.017828214857103e-08, "loss": 0.83027303, "num_input_tokens_seen": 327310150, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.10028076, "step": 15173, "time_per_iteration": 2.57388973236084 }, { "auxiliary_loss_clip": 0.06423982, "auxiliary_loss_mlp": 0.01263802, "balance_loss_clip": 0.06279935, "balance_loss_mlp": 0.0125355, "epoch": 0.9123102359837667, "flos": 15961939056000.0, "grad_norm": 2.7382456559606263, "language_loss": 0.66217297, "learning_rate": 8.00691503189499e-08, "loss": 0.73905075, "num_input_tokens_seen": 327326660, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10247803, "step": 15174, "time_per_iteration": 2.5547540187835693 }, { "auxiliary_loss_clip": 0.0641775, "auxiliary_loss_mlp": 0.01267165, "balance_loss_clip": 0.06276698, "balance_loss_mlp": 0.01257479, "epoch": 0.9123703592364347, "flos": 25162849941120.0, "grad_norm": 1.8456445179414795, "language_loss": 0.74941301, "learning_rate": 7.996009129329894e-08, "loss": 0.82626224, "num_input_tokens_seen": 327346700, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09692383, "step": 15175, "time_per_iteration": 2.571474313735962 }, { "auxiliary_loss_clip": 0.06315248, "auxiliary_loss_mlp": 0.01250793, "balance_loss_clip": 0.06259956, "balance_loss_mlp": 0.01249765, "epoch": 0.9124304824891026, "flos": 60820659296640.0, "grad_norm": 0.96626486116068, "language_loss": 0.58517647, "learning_rate": 7.985110507575421e-08, "loss": 0.66083694, "num_input_tokens_seen": 327403050, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01028442, "step": 15176, "time_per_iteration": 3.1992640495300293 }, { "auxiliary_loss_clip": 0.06413797, "auxiliary_loss_mlp": 0.01264071, "balance_loss_clip": 0.0627397, "balance_loss_mlp": 0.01255088, "epoch": 0.9124906057417707, "flos": 18156906685440.0, "grad_norm": 3.056220554261941, "language_loss": 0.65743864, "learning_rate": 7.97421916704475e-08, "loss": 0.73421729, "num_input_tokens_seen": 327422225, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.08984375, "step": 15177, "time_per_iteration": 2.538882255554199 }, { "auxiliary_loss_clip": 0.06415211, "auxiliary_loss_mlp": 0.01265176, "balance_loss_clip": 0.06277858, "balance_loss_mlp": 0.01256111, "epoch": 0.9125507289944386, "flos": 11690617651200.0, "grad_norm": 2.730122639755672, "language_loss": 0.81341588, "learning_rate": 7.963335108150926e-08, "loss": 0.89021981, "num_input_tokens_seen": 327437025, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09063721, "step": 15178, "time_per_iteration": 2.516113758087158 }, { "auxiliary_loss_clip": 0.06412496, "auxiliary_loss_mlp": 0.01264411, "balance_loss_clip": 0.0627536, "balance_loss_mlp": 0.01254814, "epoch": 0.9126108522471066, "flos": 17754580506240.0, "grad_norm": 1.8958753880649484, "language_loss": 0.79176772, "learning_rate": 7.952458331306711e-08, "loss": 0.86853677, "num_input_tokens_seen": 327453915, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09594727, "step": 15179, "time_per_iteration": 2.511268377304077 }, { "auxiliary_loss_clip": 0.06410168, "auxiliary_loss_mlp": 0.0126171, "balance_loss_clip": 0.06272827, "balance_loss_mlp": 0.01253217, "epoch": 0.9126709754997745, "flos": 27643039269120.0, "grad_norm": 1.5411537684161751, "language_loss": 0.68462348, "learning_rate": 7.941588836924507e-08, "loss": 0.76134229, "num_input_tokens_seen": 327474415, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.0848999, "step": 15180, "time_per_iteration": 2.608417272567749 }, { "auxiliary_loss_clip": 0.06406723, "auxiliary_loss_mlp": 0.01265518, "balance_loss_clip": 0.06270916, "balance_loss_mlp": 0.01256786, "epoch": 0.9127310987524425, "flos": 15930520974720.0, "grad_norm": 1.6677437415837706, "language_loss": 0.75198251, "learning_rate": 7.930726625416495e-08, "loss": 0.82870495, "num_input_tokens_seen": 327492750, "router_z_loss_clip": 1.35742188, "router_z_loss_mlp": 0.08721924, "step": 15181, "time_per_iteration": 2.515913963317871 }, { "auxiliary_loss_clip": 0.06422791, "auxiliary_loss_mlp": 0.01266351, "balance_loss_clip": 0.06278405, "balance_loss_mlp": 0.01256671, "epoch": 0.9127912220051104, "flos": 21542207817600.0, "grad_norm": 1.6269440236535413, "language_loss": 0.75274098, "learning_rate": 7.919871697194614e-08, "loss": 0.8296324, "num_input_tokens_seen": 327509470, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.09680176, "step": 15182, "time_per_iteration": 2.538457155227661 }, { "auxiliary_loss_clip": 0.06418783, "auxiliary_loss_mlp": 0.01265724, "balance_loss_clip": 0.06276929, "balance_loss_mlp": 0.01255418, "epoch": 0.9128513452577784, "flos": 24070837605120.0, "grad_norm": 1.4701309685200081, "language_loss": 0.76655281, "learning_rate": 7.909024052670421e-08, "loss": 0.84339792, "num_input_tokens_seen": 327530520, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.10302734, "step": 15183, "time_per_iteration": 2.568540573120117 }, { "auxiliary_loss_clip": 0.06416622, "auxiliary_loss_mlp": 0.0126644, "balance_loss_clip": 0.06274286, "balance_loss_mlp": 0.01256861, "epoch": 0.9129114685104465, "flos": 16221989802240.0, "grad_norm": 5.265532018987261, "language_loss": 0.76700211, "learning_rate": 7.898183692255256e-08, "loss": 0.84383273, "num_input_tokens_seen": 327546960, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09588623, "step": 15184, "time_per_iteration": 2.563977003097534 }, { "auxiliary_loss_clip": 0.0641461, "auxiliary_loss_mlp": 0.01268253, "balance_loss_clip": 0.06276574, "balance_loss_mlp": 0.01259104, "epoch": 0.9129715917631144, "flos": 19389349664640.0, "grad_norm": 1.598391719955495, "language_loss": 0.74774396, "learning_rate": 7.887350616360233e-08, "loss": 0.82457256, "num_input_tokens_seen": 327564830, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09143066, "step": 15185, "time_per_iteration": 4.068664073944092 }, { "auxiliary_loss_clip": 0.06415827, "auxiliary_loss_mlp": 0.01266289, "balance_loss_clip": 0.06277829, "balance_loss_mlp": 0.01256574, "epoch": 0.9130317150157824, "flos": 20595992785920.0, "grad_norm": 2.529058244656767, "language_loss": 0.68937629, "learning_rate": 7.876524825396158e-08, "loss": 0.76619744, "num_input_tokens_seen": 327583675, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09716797, "step": 15186, "time_per_iteration": 2.5553321838378906 }, { "auxiliary_loss_clip": 0.06426622, "auxiliary_loss_mlp": 0.01264086, "balance_loss_clip": 0.0627984, "balance_loss_mlp": 0.01254138, "epoch": 0.9130918382684503, "flos": 20194714782720.0, "grad_norm": 1.8240970906468779, "language_loss": 0.77533066, "learning_rate": 7.865706319773502e-08, "loss": 0.85223776, "num_input_tokens_seen": 327602280, "router_z_loss_clip": 1.46972656, "router_z_loss_mlp": 0.09936523, "step": 15187, "time_per_iteration": 2.5155725479125977 }, { "auxiliary_loss_clip": 0.06411432, "auxiliary_loss_mlp": 0.01264005, "balance_loss_clip": 0.06273021, "balance_loss_mlp": 0.01254802, "epoch": 0.9131519615211183, "flos": 25563960236160.0, "grad_norm": 2.204772679889864, "language_loss": 0.65921241, "learning_rate": 7.854895099902515e-08, "loss": 0.7359668, "num_input_tokens_seen": 327623515, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09210205, "step": 15188, "time_per_iteration": 2.6079230308532715 }, { "auxiliary_loss_clip": 0.06409854, "auxiliary_loss_mlp": 0.01265171, "balance_loss_clip": 0.06272071, "balance_loss_mlp": 0.01255504, "epoch": 0.9132120847737862, "flos": 17937414115200.0, "grad_norm": 1.8275230776575015, "language_loss": 0.76521254, "learning_rate": 7.844091166193157e-08, "loss": 0.84196281, "num_input_tokens_seen": 327642875, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09674072, "step": 15189, "time_per_iteration": 2.516204833984375 }, { "auxiliary_loss_clip": 0.06409547, "auxiliary_loss_mlp": 0.01263841, "balance_loss_clip": 0.06273729, "balance_loss_mlp": 0.01255485, "epoch": 0.9132722080264543, "flos": 20053822942080.0, "grad_norm": 1.6221184465415532, "language_loss": 0.75216436, "learning_rate": 7.8332945190551e-08, "loss": 0.82889831, "num_input_tokens_seen": 327662450, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.08361816, "step": 15190, "time_per_iteration": 2.508155107498169 }, { "auxiliary_loss_clip": 0.06316097, "auxiliary_loss_mlp": 0.01250747, "balance_loss_clip": 0.06260942, "balance_loss_mlp": 0.01249801, "epoch": 0.9133323312791222, "flos": 70461603498240.0, "grad_norm": 0.7018400086621894, "language_loss": 0.57059407, "learning_rate": 7.822505158897797e-08, "loss": 0.64626253, "num_input_tokens_seen": 327723845, "router_z_loss_clip": 0.55273438, "router_z_loss_mlp": 0.00943756, "step": 15191, "time_per_iteration": 4.563427925109863 }, { "auxiliary_loss_clip": 0.06418671, "auxiliary_loss_mlp": 0.01264227, "balance_loss_clip": 0.06277108, "balance_loss_mlp": 0.01255292, "epoch": 0.9133924545317902, "flos": 25490851948800.0, "grad_norm": 1.7742834140652128, "language_loss": 0.74235761, "learning_rate": 7.81172308613034e-08, "loss": 0.81918657, "num_input_tokens_seen": 327742590, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.08935547, "step": 15192, "time_per_iteration": 2.5672874450683594 }, { "auxiliary_loss_clip": 0.06412742, "auxiliary_loss_mlp": 0.01266205, "balance_loss_clip": 0.06275207, "balance_loss_mlp": 0.01256478, "epoch": 0.9134525777844581, "flos": 39939920737920.0, "grad_norm": 1.4648979886770428, "language_loss": 0.69527829, "learning_rate": 7.800948301161647e-08, "loss": 0.77206779, "num_input_tokens_seen": 327764350, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09729004, "step": 15193, "time_per_iteration": 2.7054576873779297 }, { "auxiliary_loss_clip": 0.06408134, "auxiliary_loss_mlp": 0.01263061, "balance_loss_clip": 0.06272062, "balance_loss_mlp": 0.01253715, "epoch": 0.9135127010371261, "flos": 20893037909760.0, "grad_norm": 2.0858029101065796, "language_loss": 0.73127794, "learning_rate": 7.790180804400215e-08, "loss": 0.80798995, "num_input_tokens_seen": 327783120, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.09350586, "step": 15194, "time_per_iteration": 2.535682201385498 }, { "auxiliary_loss_clip": 0.06418404, "auxiliary_loss_mlp": 0.01267941, "balance_loss_clip": 0.06273605, "balance_loss_mlp": 0.01257409, "epoch": 0.913572824289794, "flos": 20819468424960.0, "grad_norm": 2.2506327218745135, "language_loss": 0.61820477, "learning_rate": 7.779420596254383e-08, "loss": 0.69506824, "num_input_tokens_seen": 327801960, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.10522461, "step": 15195, "time_per_iteration": 2.555462598800659 }, { "auxiliary_loss_clip": 0.06410149, "auxiliary_loss_mlp": 0.01263346, "balance_loss_clip": 0.06271116, "balance_loss_mlp": 0.01254066, "epoch": 0.913632947542462, "flos": 25710470300160.0, "grad_norm": 1.5419576318369435, "language_loss": 0.71460861, "learning_rate": 7.768667677132201e-08, "loss": 0.79134351, "num_input_tokens_seen": 327823795, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09283447, "step": 15196, "time_per_iteration": 2.6004908084869385 }, { "auxiliary_loss_clip": 0.0641057, "auxiliary_loss_mlp": 0.01266665, "balance_loss_clip": 0.06274442, "balance_loss_mlp": 0.01257492, "epoch": 0.9136930707951301, "flos": 26293366028160.0, "grad_norm": 1.5228144404156645, "language_loss": 0.71394825, "learning_rate": 7.757922047441411e-08, "loss": 0.79072064, "num_input_tokens_seen": 327845175, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.09185791, "step": 15197, "time_per_iteration": 2.606049060821533 }, { "auxiliary_loss_clip": 0.06420517, "auxiliary_loss_mlp": 0.01261653, "balance_loss_clip": 0.06277721, "balance_loss_mlp": 0.01252152, "epoch": 0.913753194047798, "flos": 22098590928000.0, "grad_norm": 2.2356128269355695, "language_loss": 0.78424942, "learning_rate": 7.747183707589489e-08, "loss": 0.86107111, "num_input_tokens_seen": 327863150, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.0949707, "step": 15198, "time_per_iteration": 4.002096176147461 }, { "auxiliary_loss_clip": 0.06406264, "auxiliary_loss_mlp": 0.01264229, "balance_loss_clip": 0.06271254, "balance_loss_mlp": 0.01255587, "epoch": 0.913813317300466, "flos": 23594061473280.0, "grad_norm": 1.3867726104513152, "language_loss": 0.67825711, "learning_rate": 7.736452657983616e-08, "loss": 0.75496209, "num_input_tokens_seen": 327883445, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.08636475, "step": 15199, "time_per_iteration": 2.637488842010498 }, { "auxiliary_loss_clip": 0.0641975, "auxiliary_loss_mlp": 0.01263255, "balance_loss_clip": 0.06279175, "balance_loss_mlp": 0.01254004, "epoch": 0.9138734405531339, "flos": 28883993437440.0, "grad_norm": 1.53612127471517, "language_loss": 0.67872941, "learning_rate": 7.725728899030714e-08, "loss": 0.75555944, "num_input_tokens_seen": 327905745, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09246826, "step": 15200, "time_per_iteration": 2.6053431034088135 }, { "auxiliary_loss_clip": 0.0641241, "auxiliary_loss_mlp": 0.0126803, "balance_loss_clip": 0.06278097, "balance_loss_mlp": 0.01259196, "epoch": 0.9139335638058019, "flos": 22827829011840.0, "grad_norm": 1.4622404672941927, "language_loss": 0.71666908, "learning_rate": 7.715012431137435e-08, "loss": 0.79347348, "num_input_tokens_seen": 327925435, "router_z_loss_clip": 1.34082031, "router_z_loss_mlp": 0.08822632, "step": 15201, "time_per_iteration": 2.5626306533813477 }, { "auxiliary_loss_clip": 0.06413909, "auxiliary_loss_mlp": 0.01260693, "balance_loss_clip": 0.06274141, "balance_loss_mlp": 0.01251669, "epoch": 0.9139936870584698, "flos": 18009977351040.0, "grad_norm": 1.7687067031405388, "language_loss": 0.70606506, "learning_rate": 7.704303254710165e-08, "loss": 0.78281105, "num_input_tokens_seen": 327944145, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09020996, "step": 15202, "time_per_iteration": 2.5387556552886963 }, { "auxiliary_loss_clip": 0.06411321, "auxiliary_loss_mlp": 0.01265597, "balance_loss_clip": 0.06272003, "balance_loss_mlp": 0.01255923, "epoch": 0.9140538103111379, "flos": 15818992790400.0, "grad_norm": 1.882031212678037, "language_loss": 0.67208886, "learning_rate": 7.693601370155001e-08, "loss": 0.74885798, "num_input_tokens_seen": 327960565, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09667969, "step": 15203, "time_per_iteration": 2.5395469665527344 }, { "auxiliary_loss_clip": 0.06418966, "auxiliary_loss_mlp": 0.01267789, "balance_loss_clip": 0.06279251, "balance_loss_mlp": 0.01258562, "epoch": 0.9141139335638058, "flos": 23993704321920.0, "grad_norm": 2.104134745484326, "language_loss": 0.69250739, "learning_rate": 7.682906777877751e-08, "loss": 0.76937497, "num_input_tokens_seen": 327981180, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09228516, "step": 15204, "time_per_iteration": 2.550884246826172 }, { "auxiliary_loss_clip": 0.06418185, "auxiliary_loss_mlp": 0.01263767, "balance_loss_clip": 0.06277353, "balance_loss_mlp": 0.01253783, "epoch": 0.9141740568164738, "flos": 24031243751040.0, "grad_norm": 2.4314138232742426, "language_loss": 0.5996443, "learning_rate": 7.672219478283915e-08, "loss": 0.67646384, "num_input_tokens_seen": 328001500, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09985352, "step": 15205, "time_per_iteration": 2.5615904331207275 }, { "auxiliary_loss_clip": 0.06407004, "auxiliary_loss_mlp": 0.01263847, "balance_loss_clip": 0.06271815, "balance_loss_mlp": 0.01254733, "epoch": 0.9142341800691417, "flos": 27025958275200.0, "grad_norm": 1.6055706145437119, "language_loss": 0.81498456, "learning_rate": 7.661539471778811e-08, "loss": 0.89169312, "num_input_tokens_seen": 328023025, "router_z_loss_clip": 1.35253906, "router_z_loss_mlp": 0.09124756, "step": 15206, "time_per_iteration": 3.8596997261047363 }, { "auxiliary_loss_clip": 0.06419197, "auxiliary_loss_mlp": 0.01262261, "balance_loss_clip": 0.06277285, "balance_loss_mlp": 0.01252963, "epoch": 0.9142943033218097, "flos": 20418735473280.0, "grad_norm": 3.0413359282564767, "language_loss": 0.74337125, "learning_rate": 7.650866758767382e-08, "loss": 0.8201859, "num_input_tokens_seen": 328041410, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09295654, "step": 15207, "time_per_iteration": 2.5272929668426514 }, { "auxiliary_loss_clip": 0.06417073, "auxiliary_loss_mlp": 0.01267076, "balance_loss_clip": 0.06277695, "balance_loss_mlp": 0.01257658, "epoch": 0.9143544265744776, "flos": 19761389792640.0, "grad_norm": 2.218845686625676, "language_loss": 0.7318933, "learning_rate": 7.640201339654373e-08, "loss": 0.80873477, "num_input_tokens_seen": 328060495, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09417725, "step": 15208, "time_per_iteration": 2.517515182495117 }, { "auxiliary_loss_clip": 0.0641489, "auxiliary_loss_mlp": 0.01262528, "balance_loss_clip": 0.0627794, "balance_loss_mlp": 0.01253933, "epoch": 0.9144145498271457, "flos": 17171181653760.0, "grad_norm": 2.2331268630480614, "language_loss": 0.86831105, "learning_rate": 7.629543214844237e-08, "loss": 0.94508523, "num_input_tokens_seen": 328076905, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.08599854, "step": 15209, "time_per_iteration": 2.4918785095214844 }, { "auxiliary_loss_clip": 0.06413819, "auxiliary_loss_mlp": 0.01266312, "balance_loss_clip": 0.0627676, "balance_loss_mlp": 0.01256924, "epoch": 0.9144746730798137, "flos": 23731766858880.0, "grad_norm": 1.9699751596786732, "language_loss": 0.75465322, "learning_rate": 7.618892384741093e-08, "loss": 0.83145452, "num_input_tokens_seen": 328096960, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09381104, "step": 15210, "time_per_iteration": 2.5609846115112305 }, { "auxiliary_loss_clip": 0.06414272, "auxiliary_loss_mlp": 0.01265878, "balance_loss_clip": 0.06273121, "balance_loss_mlp": 0.01256288, "epoch": 0.9145347963324816, "flos": 25854842085120.0, "grad_norm": 1.7343978177667227, "language_loss": 0.78048587, "learning_rate": 7.6082488497488e-08, "loss": 0.85728741, "num_input_tokens_seen": 328115445, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09588623, "step": 15211, "time_per_iteration": 2.553713321685791 }, { "auxiliary_loss_clip": 0.06415103, "auxiliary_loss_mlp": 0.01261413, "balance_loss_clip": 0.06275548, "balance_loss_mlp": 0.01252204, "epoch": 0.9145949195851496, "flos": 19248457824000.0, "grad_norm": 1.6306888296252382, "language_loss": 0.8318584, "learning_rate": 7.597612610270986e-08, "loss": 0.90862358, "num_input_tokens_seen": 328133965, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09210205, "step": 15212, "time_per_iteration": 2.5632081031799316 }, { "auxiliary_loss_clip": 0.06409869, "auxiliary_loss_mlp": 0.01266436, "balance_loss_clip": 0.06273475, "balance_loss_mlp": 0.0125709, "epoch": 0.9146550428378175, "flos": 18302284719360.0, "grad_norm": 1.7321096304555277, "language_loss": 0.84162456, "learning_rate": 7.586983666711022e-08, "loss": 0.91838765, "num_input_tokens_seen": 328151520, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09350586, "step": 15213, "time_per_iteration": 2.4921743869781494 }, { "auxiliary_loss_clip": 0.06417169, "auxiliary_loss_mlp": 0.01266243, "balance_loss_clip": 0.06277888, "balance_loss_mlp": 0.01256832, "epoch": 0.9147151660904855, "flos": 20090481903360.0, "grad_norm": 1.8138751728677505, "language_loss": 0.71350694, "learning_rate": 7.576362019471894e-08, "loss": 0.79034102, "num_input_tokens_seen": 328171275, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09411621, "step": 15214, "time_per_iteration": 2.5402445793151855 }, { "auxiliary_loss_clip": 0.064192, "auxiliary_loss_mlp": 0.01266063, "balance_loss_clip": 0.06276758, "balance_loss_mlp": 0.01256508, "epoch": 0.9147752893431534, "flos": 24395988574080.0, "grad_norm": 1.6890301790427928, "language_loss": 0.63215625, "learning_rate": 7.565747668956413e-08, "loss": 0.70900881, "num_input_tokens_seen": 328192115, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09545898, "step": 15215, "time_per_iteration": 2.5943520069122314 }, { "auxiliary_loss_clip": 0.06418359, "auxiliary_loss_mlp": 0.01265594, "balance_loss_clip": 0.06275407, "balance_loss_mlp": 0.01255527, "epoch": 0.9148354125958215, "flos": 18156277779840.0, "grad_norm": 2.8079222906367662, "language_loss": 0.7659142, "learning_rate": 7.555140615567058e-08, "loss": 0.84275377, "num_input_tokens_seen": 328208990, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10064697, "step": 15216, "time_per_iteration": 2.512887716293335 }, { "auxiliary_loss_clip": 0.06417762, "auxiliary_loss_mlp": 0.01271727, "balance_loss_clip": 0.06279027, "balance_loss_mlp": 0.01261577, "epoch": 0.9148955358484894, "flos": 23374233486720.0, "grad_norm": 2.579524071601979, "language_loss": 0.68369639, "learning_rate": 7.544540859706062e-08, "loss": 0.76059127, "num_input_tokens_seen": 328227840, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.1015625, "step": 15217, "time_per_iteration": 2.565138816833496 }, { "auxiliary_loss_clip": 0.06411279, "auxiliary_loss_mlp": 0.01264604, "balance_loss_clip": 0.06274321, "balance_loss_mlp": 0.0125524, "epoch": 0.9149556591011574, "flos": 18082205170560.0, "grad_norm": 1.727470782285028, "language_loss": 0.80030119, "learning_rate": 7.533948401775347e-08, "loss": 0.87706006, "num_input_tokens_seen": 328246250, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.09362793, "step": 15218, "time_per_iteration": 2.5334348678588867 }, { "auxiliary_loss_clip": 0.06316814, "auxiliary_loss_mlp": 0.01252314, "balance_loss_clip": 0.06261453, "balance_loss_mlp": 0.01251283, "epoch": 0.9150157823538253, "flos": 54602220240000.0, "grad_norm": 0.821413416783374, "language_loss": 0.58768094, "learning_rate": 7.523363242176595e-08, "loss": 0.66337228, "num_input_tokens_seen": 328303625, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01032257, "step": 15219, "time_per_iteration": 3.1329450607299805 }, { "auxiliary_loss_clip": 0.06411402, "auxiliary_loss_mlp": 0.01264325, "balance_loss_clip": 0.06274959, "balance_loss_mlp": 0.01255617, "epoch": 0.9150759056064933, "flos": 17898616874880.0, "grad_norm": 1.8966090048161692, "language_loss": 0.78809261, "learning_rate": 7.512785381311216e-08, "loss": 0.86484987, "num_input_tokens_seen": 328322135, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.08703613, "step": 15220, "time_per_iteration": 2.519901990890503 }, { "auxiliary_loss_clip": 0.06417057, "auxiliary_loss_mlp": 0.01267351, "balance_loss_clip": 0.06274922, "balance_loss_mlp": 0.01257141, "epoch": 0.9151360288591612, "flos": 18078725226240.0, "grad_norm": 3.263959624083446, "language_loss": 0.65923905, "learning_rate": 7.50221481958031e-08, "loss": 0.73608315, "num_input_tokens_seen": 328340750, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10217285, "step": 15221, "time_per_iteration": 2.520237445831299 }, { "auxiliary_loss_clip": 0.06409271, "auxiliary_loss_mlp": 0.01264277, "balance_loss_clip": 0.06271025, "balance_loss_mlp": 0.01254729, "epoch": 0.9151961521118293, "flos": 19360614913920.0, "grad_norm": 1.6073862954975429, "language_loss": 0.84045035, "learning_rate": 7.491651557384692e-08, "loss": 0.91718584, "num_input_tokens_seen": 328359995, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09545898, "step": 15222, "time_per_iteration": 2.5180981159210205 }, { "auxiliary_loss_clip": 0.06316355, "auxiliary_loss_mlp": 0.01250869, "balance_loss_clip": 0.06261137, "balance_loss_mlp": 0.01249871, "epoch": 0.9152562753644973, "flos": 72167174956800.0, "grad_norm": 0.7204153009323867, "language_loss": 0.4957397, "learning_rate": 7.481095595124953e-08, "loss": 0.57141191, "num_input_tokens_seen": 328426865, "router_z_loss_clip": 0.55419922, "router_z_loss_mlp": 0.00997162, "step": 15223, "time_per_iteration": 3.2003700733184814 }, { "auxiliary_loss_clip": 0.06416233, "auxiliary_loss_mlp": 0.01264659, "balance_loss_clip": 0.06276696, "balance_loss_mlp": 0.01254615, "epoch": 0.9153163986171652, "flos": 20783270661120.0, "grad_norm": 1.9332693395334537, "language_loss": 0.72603399, "learning_rate": 7.470546933201349e-08, "loss": 0.80284286, "num_input_tokens_seen": 328445970, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.1005249, "step": 15224, "time_per_iteration": 2.5696728229522705 }, { "auxiliary_loss_clip": 0.06411813, "auxiliary_loss_mlp": 0.01263521, "balance_loss_clip": 0.0627467, "balance_loss_mlp": 0.0125365, "epoch": 0.9153765218698332, "flos": 23046902311680.0, "grad_norm": 2.2068421829010227, "language_loss": 0.8128655, "learning_rate": 7.460005572013895e-08, "loss": 0.88961881, "num_input_tokens_seen": 328464585, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09869385, "step": 15225, "time_per_iteration": 3.9796977043151855 }, { "auxiliary_loss_clip": 0.06411688, "auxiliary_loss_mlp": 0.01262294, "balance_loss_clip": 0.06273408, "balance_loss_mlp": 0.01253067, "epoch": 0.9154366451225011, "flos": 28999295055360.0, "grad_norm": 1.3341380002332637, "language_loss": 0.71416426, "learning_rate": 7.44947151196238e-08, "loss": 0.79090416, "num_input_tokens_seen": 328490155, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09228516, "step": 15226, "time_per_iteration": 2.6311347484588623 }, { "auxiliary_loss_clip": 0.06413647, "auxiliary_loss_mlp": 0.01264288, "balance_loss_clip": 0.06273542, "balance_loss_mlp": 0.01254787, "epoch": 0.9154967683751691, "flos": 22316029073280.0, "grad_norm": 1.8498189941585008, "language_loss": 0.74584508, "learning_rate": 7.43894475344613e-08, "loss": 0.82262444, "num_input_tokens_seen": 328508275, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09503174, "step": 15227, "time_per_iteration": 2.5344011783599854 }, { "auxiliary_loss_clip": 0.06411357, "auxiliary_loss_mlp": 0.0126501, "balance_loss_clip": 0.06273656, "balance_loss_mlp": 0.01256361, "epoch": 0.915556891627837, "flos": 24578360985600.0, "grad_norm": 1.540716866539696, "language_loss": 0.74287689, "learning_rate": 7.428425296864404e-08, "loss": 0.81964052, "num_input_tokens_seen": 328529425, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.08648682, "step": 15228, "time_per_iteration": 2.575563907623291 }, { "auxiliary_loss_clip": 0.06416914, "auxiliary_loss_mlp": 0.01265463, "balance_loss_clip": 0.06279249, "balance_loss_mlp": 0.01256412, "epoch": 0.9156170148805051, "flos": 22171363799040.0, "grad_norm": 1.4575676948034881, "language_loss": 0.72273922, "learning_rate": 7.417913142616106e-08, "loss": 0.79956305, "num_input_tokens_seen": 328550200, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09051514, "step": 15229, "time_per_iteration": 2.565932512283325 }, { "auxiliary_loss_clip": 0.06414722, "auxiliary_loss_mlp": 0.01264705, "balance_loss_clip": 0.06276469, "balance_loss_mlp": 0.01254888, "epoch": 0.915677138133173, "flos": 20926552343040.0, "grad_norm": 1.677709152362801, "language_loss": 0.83205116, "learning_rate": 7.407408291099848e-08, "loss": 0.90884537, "num_input_tokens_seen": 328568540, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09814453, "step": 15230, "time_per_iteration": 2.5875720977783203 }, { "auxiliary_loss_clip": 0.06410875, "auxiliary_loss_mlp": 0.01262185, "balance_loss_clip": 0.06275599, "balance_loss_mlp": 0.01253233, "epoch": 0.915737261385841, "flos": 24350734569600.0, "grad_norm": 1.7005491639158008, "language_loss": 0.83498406, "learning_rate": 7.396910742713957e-08, "loss": 0.91171467, "num_input_tokens_seen": 328587300, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.08953857, "step": 15231, "time_per_iteration": 4.043320894241333 }, { "auxiliary_loss_clip": 0.06412493, "auxiliary_loss_mlp": 0.01263821, "balance_loss_clip": 0.06274797, "balance_loss_mlp": 0.01254768, "epoch": 0.9157973846385089, "flos": 26768758567680.0, "grad_norm": 1.387468573831272, "language_loss": 0.72526228, "learning_rate": 7.386420497856516e-08, "loss": 0.80202544, "num_input_tokens_seen": 328610055, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09051514, "step": 15232, "time_per_iteration": 2.66115403175354 }, { "auxiliary_loss_clip": 0.06420062, "auxiliary_loss_mlp": 0.01263797, "balance_loss_clip": 0.06279607, "balance_loss_mlp": 0.01254475, "epoch": 0.9158575078911769, "flos": 18484657130880.0, "grad_norm": 1.8857951637948553, "language_loss": 0.67607993, "learning_rate": 7.375937556925338e-08, "loss": 0.75291848, "num_input_tokens_seen": 328626815, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09320068, "step": 15233, "time_per_iteration": 2.5116865634918213 }, { "auxiliary_loss_clip": 0.06421703, "auxiliary_loss_mlp": 0.01269782, "balance_loss_clip": 0.06279945, "balance_loss_mlp": 0.01259197, "epoch": 0.9159176311438448, "flos": 21805403091840.0, "grad_norm": 2.081759382379705, "language_loss": 0.69921654, "learning_rate": 7.365461920317861e-08, "loss": 0.77613145, "num_input_tokens_seen": 328643995, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.105896, "step": 15234, "time_per_iteration": 2.5508642196655273 }, { "auxiliary_loss_clip": 0.06421593, "auxiliary_loss_mlp": 0.01262305, "balance_loss_clip": 0.06281011, "balance_loss_mlp": 0.0125253, "epoch": 0.9159777543965129, "flos": 24789552001920.0, "grad_norm": 1.9382112718827207, "language_loss": 0.88473213, "learning_rate": 7.354993588431391e-08, "loss": 0.9615711, "num_input_tokens_seen": 328659565, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09771729, "step": 15235, "time_per_iteration": 2.5441761016845703 }, { "auxiliary_loss_clip": 0.06415711, "auxiliary_loss_mlp": 0.01265627, "balance_loss_clip": 0.06276041, "balance_loss_mlp": 0.01256209, "epoch": 0.9160378776491809, "flos": 26875800558720.0, "grad_norm": 1.6111610498923097, "language_loss": 0.77680069, "learning_rate": 7.344532561662853e-08, "loss": 0.85361409, "num_input_tokens_seen": 328679045, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09423828, "step": 15236, "time_per_iteration": 2.5725038051605225 }, { "auxiliary_loss_clip": 0.06317656, "auxiliary_loss_mlp": 0.01251388, "balance_loss_clip": 0.06262323, "balance_loss_mlp": 0.01250321, "epoch": 0.9160980009018488, "flos": 70598596124160.0, "grad_norm": 0.7430382095693591, "language_loss": 0.62165248, "learning_rate": 7.334078840409019e-08, "loss": 0.69734299, "num_input_tokens_seen": 328744565, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01068878, "step": 15237, "time_per_iteration": 4.546152114868164 }, { "auxiliary_loss_clip": 0.0641733, "auxiliary_loss_mlp": 0.01263388, "balance_loss_clip": 0.06276983, "balance_loss_mlp": 0.01253297, "epoch": 0.9161581241545168, "flos": 16294846527360.0, "grad_norm": 1.7481545205449038, "language_loss": 0.75010169, "learning_rate": 7.323632425066151e-08, "loss": 0.82690883, "num_input_tokens_seen": 328762455, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.10095215, "step": 15238, "time_per_iteration": 2.5329766273498535 }, { "auxiliary_loss_clip": 0.06415568, "auxiliary_loss_mlp": 0.01264266, "balance_loss_clip": 0.06276138, "balance_loss_mlp": 0.01254861, "epoch": 0.9162182474071847, "flos": 18443386195200.0, "grad_norm": 1.8013818454403998, "language_loss": 0.74880576, "learning_rate": 7.313193316030464e-08, "loss": 0.82560414, "num_input_tokens_seen": 328780320, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09411621, "step": 15239, "time_per_iteration": 2.5733044147491455 }, { "auxiliary_loss_clip": 0.06418893, "auxiliary_loss_mlp": 0.01268932, "balance_loss_clip": 0.06279346, "balance_loss_mlp": 0.01259503, "epoch": 0.9162783706598527, "flos": 19172498497920.0, "grad_norm": 2.305990918437967, "language_loss": 0.64010179, "learning_rate": 7.302761513697819e-08, "loss": 0.71698004, "num_input_tokens_seen": 328797570, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09429932, "step": 15240, "time_per_iteration": 2.5103108882904053 }, { "auxiliary_loss_clip": 0.06414755, "auxiliary_loss_mlp": 0.01263242, "balance_loss_clip": 0.06278653, "balance_loss_mlp": 0.01253967, "epoch": 0.9163384939125206, "flos": 20419322451840.0, "grad_norm": 3.306080925947303, "language_loss": 0.76779717, "learning_rate": 7.292337018463746e-08, "loss": 0.84457713, "num_input_tokens_seen": 328814075, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.09265137, "step": 15241, "time_per_iteration": 2.5340216159820557 }, { "auxiliary_loss_clip": 0.06431396, "auxiliary_loss_mlp": 0.0126712, "balance_loss_clip": 0.06280875, "balance_loss_mlp": 0.01255455, "epoch": 0.9163986171651887, "flos": 19651957960320.0, "grad_norm": 4.800859520276446, "language_loss": 0.68094641, "learning_rate": 7.281919830723549e-08, "loss": 0.75793159, "num_input_tokens_seen": 328831990, "router_z_loss_clip": 1.50488281, "router_z_loss_mlp": 0.11669922, "step": 15242, "time_per_iteration": 2.5197463035583496 }, { "auxiliary_loss_clip": 0.06413619, "auxiliary_loss_mlp": 0.01262668, "balance_loss_clip": 0.06273616, "balance_loss_mlp": 0.01253507, "epoch": 0.9164587404178566, "flos": 12827967845760.0, "grad_norm": 1.7593922477247437, "language_loss": 0.81029159, "learning_rate": 7.271509950872334e-08, "loss": 0.8870545, "num_input_tokens_seen": 328849105, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.0916748, "step": 15243, "time_per_iteration": 2.5236129760742188 }, { "auxiliary_loss_clip": 0.06417252, "auxiliary_loss_mlp": 0.01264641, "balance_loss_clip": 0.06275406, "balance_loss_mlp": 0.0125514, "epoch": 0.9165188636705246, "flos": 22315903292160.0, "grad_norm": 2.194285108272713, "language_loss": 0.82167077, "learning_rate": 7.261107379304721e-08, "loss": 0.89848965, "num_input_tokens_seen": 328866810, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.0949707, "step": 15244, "time_per_iteration": 2.5375003814697266 }, { "auxiliary_loss_clip": 0.0641851, "auxiliary_loss_mlp": 0.01264425, "balance_loss_clip": 0.06274943, "balance_loss_mlp": 0.0125465, "epoch": 0.9165789869231925, "flos": 18229218359040.0, "grad_norm": 2.2102503481978903, "language_loss": 0.72830838, "learning_rate": 7.250712116415214e-08, "loss": 0.80513775, "num_input_tokens_seen": 328885325, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.09777832, "step": 15245, "time_per_iteration": 3.9422500133514404 }, { "auxiliary_loss_clip": 0.06413114, "auxiliary_loss_mlp": 0.01262648, "balance_loss_clip": 0.06274832, "balance_loss_mlp": 0.01253285, "epoch": 0.9166391101758605, "flos": 13695414439680.0, "grad_norm": 1.8438397216637388, "language_loss": 0.75422651, "learning_rate": 7.240324162598033e-08, "loss": 0.83098412, "num_input_tokens_seen": 328902655, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09362793, "step": 15246, "time_per_iteration": 2.6230814456939697 }, { "auxiliary_loss_clip": 0.06419374, "auxiliary_loss_mlp": 0.01264443, "balance_loss_clip": 0.0628081, "balance_loss_mlp": 0.01255044, "epoch": 0.9166992334285284, "flos": 17352380108160.0, "grad_norm": 2.009615773910505, "language_loss": 0.75514561, "learning_rate": 7.229943518247106e-08, "loss": 0.8319838, "num_input_tokens_seen": 328918440, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09393311, "step": 15247, "time_per_iteration": 2.521082639694214 }, { "auxiliary_loss_clip": 0.06418287, "auxiliary_loss_mlp": 0.012629, "balance_loss_clip": 0.06276303, "balance_loss_mlp": 0.01253446, "epoch": 0.9167593566811965, "flos": 23737678571520.0, "grad_norm": 1.6769397141466957, "language_loss": 0.76421797, "learning_rate": 7.219570183756052e-08, "loss": 0.84102988, "num_input_tokens_seen": 328938055, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09448242, "step": 15248, "time_per_iteration": 2.565335273742676 }, { "auxiliary_loss_clip": 0.06414588, "auxiliary_loss_mlp": 0.01269073, "balance_loss_clip": 0.06274784, "balance_loss_mlp": 0.01258988, "epoch": 0.9168194799338644, "flos": 27825537461760.0, "grad_norm": 4.41722589827364, "language_loss": 0.73404348, "learning_rate": 7.209204159518178e-08, "loss": 0.81088006, "num_input_tokens_seen": 328957895, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.10083008, "step": 15249, "time_per_iteration": 2.575768232345581 }, { "auxiliary_loss_clip": 0.06414841, "auxiliary_loss_mlp": 0.0126491, "balance_loss_clip": 0.06276953, "balance_loss_mlp": 0.01255689, "epoch": 0.9168796031865324, "flos": 21722609658240.0, "grad_norm": 2.2506422285670116, "language_loss": 0.75989449, "learning_rate": 7.198845445926616e-08, "loss": 0.83669198, "num_input_tokens_seen": 328971365, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09228516, "step": 15250, "time_per_iteration": 2.533048629760742 }, { "auxiliary_loss_clip": 0.06413216, "auxiliary_loss_mlp": 0.01266789, "balance_loss_clip": 0.06275756, "balance_loss_mlp": 0.01257098, "epoch": 0.9169397264392004, "flos": 23411185937280.0, "grad_norm": 1.6210633458757189, "language_loss": 0.75965732, "learning_rate": 7.188494043374138e-08, "loss": 0.83645737, "num_input_tokens_seen": 328990830, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09686279, "step": 15251, "time_per_iteration": 2.559903144836426 }, { "auxiliary_loss_clip": 0.06417093, "auxiliary_loss_mlp": 0.01263683, "balance_loss_clip": 0.06277288, "balance_loss_mlp": 0.01253807, "epoch": 0.9169998496918683, "flos": 23957716193280.0, "grad_norm": 2.481325852003935, "language_loss": 0.80261594, "learning_rate": 7.178149952253298e-08, "loss": 0.87942374, "num_input_tokens_seen": 329008345, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09869385, "step": 15252, "time_per_iteration": 2.5556957721710205 }, { "auxiliary_loss_clip": 0.06417817, "auxiliary_loss_mlp": 0.01269747, "balance_loss_clip": 0.06278723, "balance_loss_mlp": 0.01259811, "epoch": 0.9170599729445363, "flos": 18338314775040.0, "grad_norm": 1.6118500928371013, "language_loss": 0.77249956, "learning_rate": 7.167813172956316e-08, "loss": 0.84937525, "num_input_tokens_seen": 329027440, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09942627, "step": 15253, "time_per_iteration": 2.5147619247436523 }, { "auxiliary_loss_clip": 0.06417762, "auxiliary_loss_mlp": 0.01264226, "balance_loss_clip": 0.0627847, "balance_loss_mlp": 0.01255238, "epoch": 0.9171200961972042, "flos": 22681528583040.0, "grad_norm": 1.7099471394478687, "language_loss": 0.73368502, "learning_rate": 7.157483705875256e-08, "loss": 0.81050485, "num_input_tokens_seen": 329046445, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.08978271, "step": 15254, "time_per_iteration": 2.5756912231445312 }, { "auxiliary_loss_clip": 0.06407019, "auxiliary_loss_mlp": 0.01265618, "balance_loss_clip": 0.06272784, "balance_loss_mlp": 0.01256052, "epoch": 0.9171802194498723, "flos": 26725726696320.0, "grad_norm": 1.508278745401703, "language_loss": 0.79083741, "learning_rate": 7.14716155140167e-08, "loss": 0.86756378, "num_input_tokens_seen": 329065555, "router_z_loss_clip": 1.34179688, "router_z_loss_mlp": 0.09564209, "step": 15255, "time_per_iteration": 2.5700416564941406 }, { "auxiliary_loss_clip": 0.0641674, "auxiliary_loss_mlp": 0.01268077, "balance_loss_clip": 0.06275091, "balance_loss_mlp": 0.01258547, "epoch": 0.9172403427025402, "flos": 37898423061120.0, "grad_norm": 1.8381307708987331, "language_loss": 0.6868844, "learning_rate": 7.136846709927047e-08, "loss": 0.76373255, "num_input_tokens_seen": 329087515, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09527588, "step": 15256, "time_per_iteration": 2.6953723430633545 }, { "auxiliary_loss_clip": 0.06413269, "auxiliary_loss_mlp": 0.01264116, "balance_loss_clip": 0.06277544, "balance_loss_mlp": 0.01255289, "epoch": 0.9173004659552082, "flos": 17060743572480.0, "grad_norm": 1.6197227993367327, "language_loss": 0.84313303, "learning_rate": 7.126539181842561e-08, "loss": 0.91990691, "num_input_tokens_seen": 329106820, "router_z_loss_clip": 1.35742188, "router_z_loss_mlp": 0.08825684, "step": 15257, "time_per_iteration": 2.5416359901428223 }, { "auxiliary_loss_clip": 0.06412727, "auxiliary_loss_mlp": 0.01264957, "balance_loss_clip": 0.0627667, "balance_loss_mlp": 0.01256589, "epoch": 0.9173605892078761, "flos": 22208358176640.0, "grad_norm": 1.5493059190152525, "language_loss": 0.77613986, "learning_rate": 7.116238967539012e-08, "loss": 0.85291672, "num_input_tokens_seen": 329126515, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.08380127, "step": 15258, "time_per_iteration": 2.5473504066467285 }, { "auxiliary_loss_clip": 0.06417876, "auxiliary_loss_mlp": 0.01264534, "balance_loss_clip": 0.06280422, "balance_loss_mlp": 0.0125464, "epoch": 0.9174207124605441, "flos": 16513248994560.0, "grad_norm": 1.9305121227229387, "language_loss": 0.7872842, "learning_rate": 7.105946067406999e-08, "loss": 0.86410832, "num_input_tokens_seen": 329142660, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09899902, "step": 15259, "time_per_iteration": 2.5071868896484375 }, { "auxiliary_loss_clip": 0.06412184, "auxiliary_loss_mlp": 0.01267468, "balance_loss_clip": 0.06274077, "balance_loss_mlp": 0.01258665, "epoch": 0.917480835713212, "flos": 24542582492160.0, "grad_norm": 1.5046466882707141, "language_loss": 0.76271629, "learning_rate": 7.095660481836895e-08, "loss": 0.83951283, "num_input_tokens_seen": 329162575, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.08807373, "step": 15260, "time_per_iteration": 2.6102123260498047 }, { "auxiliary_loss_clip": 0.06411554, "auxiliary_loss_mlp": 0.0126259, "balance_loss_clip": 0.06273518, "balance_loss_mlp": 0.01253143, "epoch": 0.9175409589658801, "flos": 20886036094080.0, "grad_norm": 1.702572953056612, "language_loss": 0.6125716, "learning_rate": 7.085382211218637e-08, "loss": 0.68931299, "num_input_tokens_seen": 329182090, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09442139, "step": 15261, "time_per_iteration": 2.5586161613464355 }, { "auxiliary_loss_clip": 0.06409848, "auxiliary_loss_mlp": 0.01262948, "balance_loss_clip": 0.06273703, "balance_loss_mlp": 0.01254103, "epoch": 0.917601082218548, "flos": 14280113030400.0, "grad_norm": 1.7629169626186683, "language_loss": 0.7391082, "learning_rate": 7.075111255942002e-08, "loss": 0.81583619, "num_input_tokens_seen": 329196535, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.08843994, "step": 15262, "time_per_iteration": 2.5437073707580566 }, { "auxiliary_loss_clip": 0.06420811, "auxiliary_loss_mlp": 0.01265581, "balance_loss_clip": 0.06275795, "balance_loss_mlp": 0.01256163, "epoch": 0.917661205471216, "flos": 19105301923200.0, "grad_norm": 1.871447045729865, "language_loss": 0.77938724, "learning_rate": 7.064847616396496e-08, "loss": 0.85625118, "num_input_tokens_seen": 329215135, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.09411621, "step": 15263, "time_per_iteration": 2.5315616130828857 }, { "auxiliary_loss_clip": 0.06417173, "auxiliary_loss_mlp": 0.01265779, "balance_loss_clip": 0.06272954, "balance_loss_mlp": 0.01256069, "epoch": 0.917721328723884, "flos": 21113075531520.0, "grad_norm": 1.853311660193918, "language_loss": 0.7600441, "learning_rate": 7.054591292971324e-08, "loss": 0.83687359, "num_input_tokens_seen": 329235150, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.09710693, "step": 15264, "time_per_iteration": 4.027320146560669 }, { "auxiliary_loss_clip": 0.06410497, "auxiliary_loss_mlp": 0.0126634, "balance_loss_clip": 0.06270581, "balance_loss_mlp": 0.01256934, "epoch": 0.9177814519765519, "flos": 21949439460480.0, "grad_norm": 1.6053263195807135, "language_loss": 0.8308965, "learning_rate": 7.044342286055394e-08, "loss": 0.90766484, "num_input_tokens_seen": 329254365, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09405518, "step": 15265, "time_per_iteration": 2.5696349143981934 }, { "auxiliary_loss_clip": 0.06417026, "auxiliary_loss_mlp": 0.01268874, "balance_loss_clip": 0.06274465, "balance_loss_mlp": 0.01258354, "epoch": 0.9178415752292199, "flos": 24212693767680.0, "grad_norm": 1.7539658371113271, "language_loss": 0.73636037, "learning_rate": 7.034100596037306e-08, "loss": 0.81321937, "num_input_tokens_seen": 329274385, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.10522461, "step": 15266, "time_per_iteration": 2.595625400543213 }, { "auxiliary_loss_clip": 0.06415984, "auxiliary_loss_mlp": 0.01267155, "balance_loss_clip": 0.06275456, "balance_loss_mlp": 0.01258161, "epoch": 0.9179016984818879, "flos": 20047324250880.0, "grad_norm": 1.5866940417142532, "language_loss": 0.77834392, "learning_rate": 7.023866223305486e-08, "loss": 0.85517532, "num_input_tokens_seen": 329292160, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.08996582, "step": 15267, "time_per_iteration": 2.587862730026245 }, { "auxiliary_loss_clip": 0.06316435, "auxiliary_loss_mlp": 0.01250457, "balance_loss_clip": 0.06261276, "balance_loss_mlp": 0.01249516, "epoch": 0.9179618217345559, "flos": 65577561511680.0, "grad_norm": 0.7269134009714948, "language_loss": 0.55602717, "learning_rate": 7.013639168247975e-08, "loss": 0.63169611, "num_input_tokens_seen": 329351870, "router_z_loss_clip": 0.55371094, "router_z_loss_mlp": 0.00939178, "step": 15268, "time_per_iteration": 3.176669120788574 }, { "auxiliary_loss_clip": 0.06413526, "auxiliary_loss_mlp": 0.01263154, "balance_loss_clip": 0.06275394, "balance_loss_mlp": 0.01253492, "epoch": 0.9180219449872238, "flos": 21331016801280.0, "grad_norm": 1.718302789282715, "language_loss": 0.76706839, "learning_rate": 7.0034194312526e-08, "loss": 0.84383518, "num_input_tokens_seen": 329370930, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09661865, "step": 15269, "time_per_iteration": 2.5800273418426514 }, { "auxiliary_loss_clip": 0.06413825, "auxiliary_loss_mlp": 0.0126546, "balance_loss_clip": 0.06275567, "balance_loss_mlp": 0.01255673, "epoch": 0.9180820682398918, "flos": 41069137086720.0, "grad_norm": 1.6738234068011975, "language_loss": 0.72524512, "learning_rate": 6.993207012706936e-08, "loss": 0.80203795, "num_input_tokens_seen": 329391275, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09790039, "step": 15270, "time_per_iteration": 4.160146474838257 }, { "auxiliary_loss_clip": 0.06409185, "auxiliary_loss_mlp": 0.012688, "balance_loss_clip": 0.06271834, "balance_loss_mlp": 0.01259282, "epoch": 0.9181421914925597, "flos": 28080179619840.0, "grad_norm": 1.5484954287082002, "language_loss": 0.80043656, "learning_rate": 6.98300191299821e-08, "loss": 0.87721634, "num_input_tokens_seen": 329412775, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09527588, "step": 15271, "time_per_iteration": 2.571373701095581 }, { "auxiliary_loss_clip": 0.06415835, "auxiliary_loss_mlp": 0.01267843, "balance_loss_clip": 0.06275208, "balance_loss_mlp": 0.01258479, "epoch": 0.9182023147452277, "flos": 29177181273600.0, "grad_norm": 2.5875666279811638, "language_loss": 0.72803122, "learning_rate": 6.972804132513355e-08, "loss": 0.80486798, "num_input_tokens_seen": 329432440, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09368896, "step": 15272, "time_per_iteration": 2.594411611557007 }, { "auxiliary_loss_clip": 0.0641333, "auxiliary_loss_mlp": 0.01264332, "balance_loss_clip": 0.06275404, "balance_loss_mlp": 0.01255278, "epoch": 0.9182624379978956, "flos": 24067651150080.0, "grad_norm": 1.9046997409377096, "language_loss": 0.72776401, "learning_rate": 6.962613671639105e-08, "loss": 0.80454063, "num_input_tokens_seen": 329450605, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09048462, "step": 15273, "time_per_iteration": 2.536994695663452 }, { "auxiliary_loss_clip": 0.06408778, "auxiliary_loss_mlp": 0.0126576, "balance_loss_clip": 0.06275786, "balance_loss_mlp": 0.01257475, "epoch": 0.9183225612505637, "flos": 23300035096320.0, "grad_norm": 1.4541353777439674, "language_loss": 0.74116397, "learning_rate": 6.952430530761933e-08, "loss": 0.8179093, "num_input_tokens_seen": 329470550, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.08288574, "step": 15274, "time_per_iteration": 2.539598226547241 }, { "auxiliary_loss_clip": 0.06416713, "auxiliary_loss_mlp": 0.01264232, "balance_loss_clip": 0.06276901, "balance_loss_mlp": 0.01254964, "epoch": 0.9183826845032316, "flos": 19615257072000.0, "grad_norm": 1.4571465275707334, "language_loss": 0.68761051, "learning_rate": 6.942254710267902e-08, "loss": 0.76441991, "num_input_tokens_seen": 329489765, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.0927124, "step": 15275, "time_per_iteration": 2.5273022651672363 }, { "auxiliary_loss_clip": 0.064115, "auxiliary_loss_mlp": 0.01264784, "balance_loss_clip": 0.06274293, "balance_loss_mlp": 0.01255331, "epoch": 0.9184428077558996, "flos": 18485034474240.0, "grad_norm": 1.91046036595737, "language_loss": 0.72692657, "learning_rate": 6.932086210542953e-08, "loss": 0.80368936, "num_input_tokens_seen": 329507040, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09454346, "step": 15276, "time_per_iteration": 2.5202620029449463 }, { "auxiliary_loss_clip": 0.06414099, "auxiliary_loss_mlp": 0.01264961, "balance_loss_clip": 0.0627675, "balance_loss_mlp": 0.01256455, "epoch": 0.9185029310085676, "flos": 20747366386560.0, "grad_norm": 2.245348191832766, "language_loss": 0.73550826, "learning_rate": 6.921925031972642e-08, "loss": 0.81229889, "num_input_tokens_seen": 329525540, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.08508301, "step": 15277, "time_per_iteration": 4.089501142501831 }, { "auxiliary_loss_clip": 0.06312317, "auxiliary_loss_mlp": 0.01251526, "balance_loss_clip": 0.06257324, "balance_loss_mlp": 0.0125053, "epoch": 0.9185630542612355, "flos": 68229641491200.0, "grad_norm": 0.7132262317482314, "language_loss": 0.59233069, "learning_rate": 6.91177117494226e-08, "loss": 0.66796911, "num_input_tokens_seen": 329592905, "router_z_loss_clip": 0.54882812, "router_z_loss_mlp": 0.00995636, "step": 15278, "time_per_iteration": 3.249849796295166 }, { "auxiliary_loss_clip": 0.06408392, "auxiliary_loss_mlp": 0.01261283, "balance_loss_clip": 0.06272908, "balance_loss_mlp": 0.01253171, "epoch": 0.9186231775139035, "flos": 12244317431040.0, "grad_norm": 1.6595274129223703, "language_loss": 0.64524734, "learning_rate": 6.901624639836879e-08, "loss": 0.72194409, "num_input_tokens_seen": 329610150, "router_z_loss_clip": 1.35546875, "router_z_loss_mlp": 0.08111572, "step": 15279, "time_per_iteration": 2.598928451538086 }, { "auxiliary_loss_clip": 0.06316669, "auxiliary_loss_mlp": 0.01251251, "balance_loss_clip": 0.06261586, "balance_loss_mlp": 0.01250266, "epoch": 0.9186833007665715, "flos": 63958739356800.0, "grad_norm": 0.8547063745312223, "language_loss": 0.60042644, "learning_rate": 6.891485427041211e-08, "loss": 0.67610562, "num_input_tokens_seen": 329673650, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.00984192, "step": 15280, "time_per_iteration": 3.162097930908203 }, { "auxiliary_loss_clip": 0.06417361, "auxiliary_loss_mlp": 0.01264275, "balance_loss_clip": 0.06275994, "balance_loss_mlp": 0.01254661, "epoch": 0.9187434240192395, "flos": 19980882362880.0, "grad_norm": 1.6737317448854439, "language_loss": 0.70041442, "learning_rate": 6.881353536939815e-08, "loss": 0.77723074, "num_input_tokens_seen": 329692520, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09613037, "step": 15281, "time_per_iteration": 2.5381877422332764 }, { "auxiliary_loss_clip": 0.06412493, "auxiliary_loss_mlp": 0.0126462, "balance_loss_clip": 0.06272087, "balance_loss_mlp": 0.01254684, "epoch": 0.9188035472719074, "flos": 25234742344320.0, "grad_norm": 1.7746401760970205, "language_loss": 0.84535748, "learning_rate": 6.871228969916831e-08, "loss": 0.92212868, "num_input_tokens_seen": 329713750, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09936523, "step": 15282, "time_per_iteration": 2.5572047233581543 }, { "auxiliary_loss_clip": 0.06412424, "auxiliary_loss_mlp": 0.01270517, "balance_loss_clip": 0.06275564, "balance_loss_mlp": 0.01260235, "epoch": 0.9188636705245754, "flos": 18411423062400.0, "grad_norm": 1.7198229167778163, "language_loss": 0.603809, "learning_rate": 6.861111726356194e-08, "loss": 0.68063843, "num_input_tokens_seen": 329730960, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.10290527, "step": 15283, "time_per_iteration": 2.517845630645752 }, { "auxiliary_loss_clip": 0.06419963, "auxiliary_loss_mlp": 0.01264936, "balance_loss_clip": 0.06275217, "balance_loss_mlp": 0.01255221, "epoch": 0.9189237937772433, "flos": 23775930760320.0, "grad_norm": 2.059283765197434, "language_loss": 0.65616202, "learning_rate": 6.851001806641554e-08, "loss": 0.73301113, "num_input_tokens_seen": 329750975, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.09716797, "step": 15284, "time_per_iteration": 2.5490221977233887 }, { "auxiliary_loss_clip": 0.0641273, "auxiliary_loss_mlp": 0.01262569, "balance_loss_clip": 0.06274681, "balance_loss_mlp": 0.01253467, "epoch": 0.9189839170299113, "flos": 21220914136320.0, "grad_norm": 1.8211067125104776, "language_loss": 0.74059784, "learning_rate": 6.840899211156292e-08, "loss": 0.8173508, "num_input_tokens_seen": 329769645, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09100342, "step": 15285, "time_per_iteration": 3.9282643795013428 }, { "auxiliary_loss_clip": 0.06411023, "auxiliary_loss_mlp": 0.01262908, "balance_loss_clip": 0.06274033, "balance_loss_mlp": 0.0125417, "epoch": 0.9190440402825792, "flos": 16732993127040.0, "grad_norm": 1.9366270652993334, "language_loss": 0.7212854, "learning_rate": 6.830803940283458e-08, "loss": 0.79802465, "num_input_tokens_seen": 329788185, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.08734131, "step": 15286, "time_per_iteration": 2.5134243965148926 }, { "auxiliary_loss_clip": 0.06410791, "auxiliary_loss_mlp": 0.01266251, "balance_loss_clip": 0.062721, "balance_loss_mlp": 0.01256172, "epoch": 0.9191041635352473, "flos": 23448012606720.0, "grad_norm": 1.6833183958162625, "language_loss": 0.73664045, "learning_rate": 6.820715994405945e-08, "loss": 0.81341088, "num_input_tokens_seen": 329806780, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.10083008, "step": 15287, "time_per_iteration": 2.536222457885742 }, { "auxiliary_loss_clip": 0.06417634, "auxiliary_loss_mlp": 0.01264418, "balance_loss_clip": 0.06276672, "balance_loss_mlp": 0.01253886, "epoch": 0.9191642867879152, "flos": 18813581533440.0, "grad_norm": 3.9338653830050894, "language_loss": 0.65316701, "learning_rate": 6.810635373906226e-08, "loss": 0.7299875, "num_input_tokens_seen": 329826350, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.10522461, "step": 15288, "time_per_iteration": 2.5439181327819824 }, { "auxiliary_loss_clip": 0.06414437, "auxiliary_loss_mlp": 0.01263373, "balance_loss_clip": 0.06276618, "balance_loss_mlp": 0.01254342, "epoch": 0.9192244100405832, "flos": 32169170540160.0, "grad_norm": 1.9712165464046862, "language_loss": 0.7136898, "learning_rate": 6.800562079166549e-08, "loss": 0.79046786, "num_input_tokens_seen": 329846160, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09033203, "step": 15289, "time_per_iteration": 2.6275575160980225 }, { "auxiliary_loss_clip": 0.06415124, "auxiliary_loss_mlp": 0.01266817, "balance_loss_clip": 0.06274419, "balance_loss_mlp": 0.01256702, "epoch": 0.9192845332932512, "flos": 16362420445440.0, "grad_norm": 1.8529130813000374, "language_loss": 0.74243647, "learning_rate": 6.790496110568921e-08, "loss": 0.81925595, "num_input_tokens_seen": 329862020, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.10113525, "step": 15290, "time_per_iteration": 2.5605032444000244 }, { "auxiliary_loss_clip": 0.0641072, "auxiliary_loss_mlp": 0.01263697, "balance_loss_clip": 0.0627434, "balance_loss_mlp": 0.01254768, "epoch": 0.9193446565459191, "flos": 26621661525120.0, "grad_norm": 1.759131419503035, "language_loss": 0.71954405, "learning_rate": 6.78043746849506e-08, "loss": 0.79628825, "num_input_tokens_seen": 329880185, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.08929443, "step": 15291, "time_per_iteration": 2.606337547302246 }, { "auxiliary_loss_clip": 0.06413663, "auxiliary_loss_mlp": 0.01264772, "balance_loss_clip": 0.06277237, "balance_loss_mlp": 0.01255193, "epoch": 0.9194047797985871, "flos": 22498778828160.0, "grad_norm": 1.642712558101917, "language_loss": 0.71204764, "learning_rate": 6.770386153326346e-08, "loss": 0.78883207, "num_input_tokens_seen": 329900255, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.09576416, "step": 15292, "time_per_iteration": 2.5813870429992676 }, { "auxiliary_loss_clip": 0.06418133, "auxiliary_loss_mlp": 0.01262861, "balance_loss_clip": 0.06278895, "balance_loss_mlp": 0.01252889, "epoch": 0.9194649030512551, "flos": 25085171606400.0, "grad_norm": 1.732412726625263, "language_loss": 0.7329601, "learning_rate": 6.760342165443988e-08, "loss": 0.80977011, "num_input_tokens_seen": 329919095, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09979248, "step": 15293, "time_per_iteration": 2.5628421306610107 }, { "auxiliary_loss_clip": 0.06412391, "auxiliary_loss_mlp": 0.01267443, "balance_loss_clip": 0.06275657, "balance_loss_mlp": 0.0125793, "epoch": 0.9195250263039231, "flos": 11915938080000.0, "grad_norm": 1.737575279303334, "language_loss": 0.78571731, "learning_rate": 6.750305505228837e-08, "loss": 0.86251563, "num_input_tokens_seen": 329936505, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.09515381, "step": 15294, "time_per_iteration": 2.5271363258361816 }, { "auxiliary_loss_clip": 0.06422643, "auxiliary_loss_mlp": 0.01266269, "balance_loss_clip": 0.06281312, "balance_loss_mlp": 0.01256142, "epoch": 0.919585149556591, "flos": 21840426898560.0, "grad_norm": 2.228976819585457, "language_loss": 0.77102649, "learning_rate": 6.74027617306141e-08, "loss": 0.84791565, "num_input_tokens_seen": 329956795, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.10119629, "step": 15295, "time_per_iteration": 2.569622039794922 }, { "auxiliary_loss_clip": 0.06410816, "auxiliary_loss_mlp": 0.01272225, "balance_loss_clip": 0.06277353, "balance_loss_mlp": 0.01263267, "epoch": 0.919645272809259, "flos": 28191623950080.0, "grad_norm": 1.9975084117786792, "language_loss": 0.71858662, "learning_rate": 6.730254169322114e-08, "loss": 0.79541707, "num_input_tokens_seen": 329977195, "router_z_loss_clip": 1.33496094, "router_z_loss_mlp": 0.08953857, "step": 15296, "time_per_iteration": 2.588630437850952 }, { "auxiliary_loss_clip": 0.06409016, "auxiliary_loss_mlp": 0.01264872, "balance_loss_clip": 0.06272575, "balance_loss_mlp": 0.01255067, "epoch": 0.9197053960619269, "flos": 18338734045440.0, "grad_norm": 1.8189850839750137, "language_loss": 0.75227964, "learning_rate": 6.720239494390912e-08, "loss": 0.82901847, "num_input_tokens_seen": 329992095, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.09802246, "step": 15297, "time_per_iteration": 2.4946887493133545 }, { "auxiliary_loss_clip": 0.06413426, "auxiliary_loss_mlp": 0.01266155, "balance_loss_clip": 0.06274325, "balance_loss_mlp": 0.01256695, "epoch": 0.9197655193145949, "flos": 28190911190400.0, "grad_norm": 1.5752971639516364, "language_loss": 0.7381258, "learning_rate": 6.710232148647676e-08, "loss": 0.81492162, "num_input_tokens_seen": 330011490, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09454346, "step": 15298, "time_per_iteration": 2.5935497283935547 }, { "auxiliary_loss_clip": 0.06418706, "auxiliary_loss_mlp": 0.01265437, "balance_loss_clip": 0.06277798, "balance_loss_mlp": 0.01255334, "epoch": 0.9198256425672628, "flos": 17311234953600.0, "grad_norm": 2.2534959467365274, "language_loss": 0.79245567, "learning_rate": 6.70023213247175e-08, "loss": 0.86929715, "num_input_tokens_seen": 330027885, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.10107422, "step": 15299, "time_per_iteration": 2.4993326663970947 }, { "auxiliary_loss_clip": 0.06411551, "auxiliary_loss_mlp": 0.01263425, "balance_loss_clip": 0.06274749, "balance_loss_mlp": 0.01253895, "epoch": 0.9198857658199309, "flos": 17864347754880.0, "grad_norm": 3.160379304754051, "language_loss": 0.64151919, "learning_rate": 6.690239446242385e-08, "loss": 0.71826893, "num_input_tokens_seen": 330046230, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.09533691, "step": 15300, "time_per_iteration": 2.5125577449798584 }, { "auxiliary_loss_clip": 0.06405756, "auxiliary_loss_mlp": 0.01264328, "balance_loss_clip": 0.06273874, "balance_loss_mlp": 0.01256275, "epoch": 0.9199458890725988, "flos": 22134117859200.0, "grad_norm": 1.7020331762428969, "language_loss": 0.6944434, "learning_rate": 6.680254090338545e-08, "loss": 0.77114415, "num_input_tokens_seen": 330065535, "router_z_loss_clip": 1.31835938, "router_z_loss_mlp": 0.08044434, "step": 15301, "time_per_iteration": 2.5245306491851807 }, { "auxiliary_loss_clip": 0.06416917, "auxiliary_loss_mlp": 0.01263656, "balance_loss_clip": 0.06276026, "balance_loss_mlp": 0.01253368, "epoch": 0.9200060123252668, "flos": 16039533536640.0, "grad_norm": 1.8665262109798832, "language_loss": 0.71176291, "learning_rate": 6.670276065138814e-08, "loss": 0.78856862, "num_input_tokens_seen": 330082920, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.10290527, "step": 15302, "time_per_iteration": 2.5158638954162598 }, { "auxiliary_loss_clip": 0.06416671, "auxiliary_loss_mlp": 0.01264186, "balance_loss_clip": 0.0627623, "balance_loss_mlp": 0.0125484, "epoch": 0.9200661355779348, "flos": 26870853168000.0, "grad_norm": 2.270708885118312, "language_loss": 0.76627517, "learning_rate": 6.660305371021579e-08, "loss": 0.84308374, "num_input_tokens_seen": 330101165, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09344482, "step": 15303, "time_per_iteration": 2.564862012863159 }, { "auxiliary_loss_clip": 0.06414664, "auxiliary_loss_mlp": 0.01269234, "balance_loss_clip": 0.06277828, "balance_loss_mlp": 0.01259751, "epoch": 0.9201262588306027, "flos": 12791686227840.0, "grad_norm": 2.1038721580548736, "language_loss": 0.88332832, "learning_rate": 6.650342008365006e-08, "loss": 0.96016729, "num_input_tokens_seen": 330118775, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09484863, "step": 15304, "time_per_iteration": 3.9455018043518066 }, { "auxiliary_loss_clip": 0.06424981, "auxiliary_loss_mlp": 0.01266885, "balance_loss_clip": 0.06280074, "balance_loss_mlp": 0.01255543, "epoch": 0.9201863820832707, "flos": 20637934554240.0, "grad_norm": 2.049537260078558, "language_loss": 0.77669823, "learning_rate": 6.64038597754677e-08, "loss": 0.85361695, "num_input_tokens_seen": 330135570, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.11334229, "step": 15305, "time_per_iteration": 2.5938918590545654 }, { "auxiliary_loss_clip": 0.06415362, "auxiliary_loss_mlp": 0.01264245, "balance_loss_clip": 0.06274989, "balance_loss_mlp": 0.01255197, "epoch": 0.9202465053359387, "flos": 26403007495680.0, "grad_norm": 2.035176224128594, "language_loss": 0.81606466, "learning_rate": 6.630437278944501e-08, "loss": 0.89286065, "num_input_tokens_seen": 330152840, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.0904541, "step": 15306, "time_per_iteration": 2.579042673110962 }, { "auxiliary_loss_clip": 0.06413073, "auxiliary_loss_mlp": 0.01264721, "balance_loss_clip": 0.06276299, "balance_loss_mlp": 0.01256031, "epoch": 0.9203066285886067, "flos": 10492737281280.0, "grad_norm": 2.0255642775426974, "language_loss": 0.72632337, "learning_rate": 6.62049591293541e-08, "loss": 0.8031013, "num_input_tokens_seen": 330168605, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.08688354, "step": 15307, "time_per_iteration": 2.551593065261841 }, { "auxiliary_loss_clip": 0.06419426, "auxiliary_loss_mlp": 0.01263712, "balance_loss_clip": 0.06276643, "balance_loss_mlp": 0.01253835, "epoch": 0.9203667518412746, "flos": 19396770750720.0, "grad_norm": 2.004798967199003, "language_loss": 0.78517485, "learning_rate": 6.610561879896526e-08, "loss": 0.86200631, "num_input_tokens_seen": 330186160, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.09881592, "step": 15308, "time_per_iteration": 2.542888641357422 }, { "auxiliary_loss_clip": 0.0641019, "auxiliary_loss_mlp": 0.01262186, "balance_loss_clip": 0.06272168, "balance_loss_mlp": 0.0125324, "epoch": 0.9204268750939426, "flos": 15930520974720.0, "grad_norm": 1.6844766345155249, "language_loss": 0.775392, "learning_rate": 6.600635180204484e-08, "loss": 0.85211575, "num_input_tokens_seen": 330201780, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.08947754, "step": 15309, "time_per_iteration": 2.5553393363952637 }, { "auxiliary_loss_clip": 0.06413656, "auxiliary_loss_mlp": 0.01262065, "balance_loss_clip": 0.06274995, "balance_loss_mlp": 0.01253362, "epoch": 0.9204869983466105, "flos": 16477302792960.0, "grad_norm": 1.9229473665778154, "language_loss": 0.66476071, "learning_rate": 6.590715814235781e-08, "loss": 0.74151796, "num_input_tokens_seen": 330219165, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.08700562, "step": 15310, "time_per_iteration": 4.1092047691345215 }, { "auxiliary_loss_clip": 0.06415536, "auxiliary_loss_mlp": 0.01266437, "balance_loss_clip": 0.06275278, "balance_loss_mlp": 0.01257008, "epoch": 0.9205471215992785, "flos": 21544933075200.0, "grad_norm": 1.5693939524315252, "language_loss": 0.66321814, "learning_rate": 6.580803782366495e-08, "loss": 0.74003786, "num_input_tokens_seen": 330238975, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09429932, "step": 15311, "time_per_iteration": 2.637444019317627 }, { "auxiliary_loss_clip": 0.06408712, "auxiliary_loss_mlp": 0.01264076, "balance_loss_clip": 0.06269626, "balance_loss_mlp": 0.01254903, "epoch": 0.9206072448519464, "flos": 25012272954240.0, "grad_norm": 2.0172475849747187, "language_loss": 0.76568055, "learning_rate": 6.570899084972503e-08, "loss": 0.84240848, "num_input_tokens_seen": 330259755, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09173584, "step": 15312, "time_per_iteration": 2.647188186645508 }, { "auxiliary_loss_clip": 0.06410939, "auxiliary_loss_mlp": 0.0126895, "balance_loss_clip": 0.06275554, "balance_loss_mlp": 0.01260123, "epoch": 0.9206673681046145, "flos": 20529047773440.0, "grad_norm": 1.7054553856075023, "language_loss": 0.79321587, "learning_rate": 6.561001722429394e-08, "loss": 0.87001479, "num_input_tokens_seen": 330277660, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.08825684, "step": 15313, "time_per_iteration": 2.6756696701049805 }, { "auxiliary_loss_clip": 0.06415205, "auxiliary_loss_mlp": 0.01262815, "balance_loss_clip": 0.06275349, "balance_loss_mlp": 0.01253284, "epoch": 0.9207274913572824, "flos": 20889222549120.0, "grad_norm": 1.6412780551462713, "language_loss": 0.78541136, "learning_rate": 6.55111169511251e-08, "loss": 0.86219156, "num_input_tokens_seen": 330295455, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09533691, "step": 15314, "time_per_iteration": 2.633404493331909 }, { "auxiliary_loss_clip": 0.06424265, "auxiliary_loss_mlp": 0.01264811, "balance_loss_clip": 0.06278562, "balance_loss_mlp": 0.0125472, "epoch": 0.9207876146099504, "flos": 22714414110720.0, "grad_norm": 2.0040519081081065, "language_loss": 0.79113275, "learning_rate": 6.541229003396864e-08, "loss": 0.86802351, "num_input_tokens_seen": 330315310, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.10095215, "step": 15315, "time_per_iteration": 2.550161838531494 }, { "auxiliary_loss_clip": 0.0642098, "auxiliary_loss_mlp": 0.01268041, "balance_loss_clip": 0.06276402, "balance_loss_mlp": 0.01257742, "epoch": 0.9208477378626184, "flos": 18511966362240.0, "grad_norm": 1.7778919290994404, "language_loss": 0.76127881, "learning_rate": 6.531353647657156e-08, "loss": 0.83816904, "num_input_tokens_seen": 330333260, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10314941, "step": 15316, "time_per_iteration": 4.0161638259887695 }, { "auxiliary_loss_clip": 0.06414209, "auxiliary_loss_mlp": 0.01266263, "balance_loss_clip": 0.06272739, "balance_loss_mlp": 0.01256357, "epoch": 0.9209078611152863, "flos": 23005757157120.0, "grad_norm": 1.642512403044653, "language_loss": 0.69177467, "learning_rate": 6.521485628267931e-08, "loss": 0.76857936, "num_input_tokens_seen": 330352465, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09899902, "step": 15317, "time_per_iteration": 2.5431745052337646 }, { "auxiliary_loss_clip": 0.06416994, "auxiliary_loss_mlp": 0.01266116, "balance_loss_clip": 0.06276862, "balance_loss_mlp": 0.01256359, "epoch": 0.9209679843679544, "flos": 24068447763840.0, "grad_norm": 2.2284624514880056, "language_loss": 0.83819914, "learning_rate": 6.511624945603378e-08, "loss": 0.91503024, "num_input_tokens_seen": 330372685, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09759521, "step": 15318, "time_per_iteration": 2.552680492401123 }, { "auxiliary_loss_clip": 0.06418848, "auxiliary_loss_mlp": 0.01264428, "balance_loss_clip": 0.06280262, "balance_loss_mlp": 0.01255296, "epoch": 0.9210281076206223, "flos": 13558505667840.0, "grad_norm": 1.7915743412248084, "language_loss": 0.85747218, "learning_rate": 6.501771600037354e-08, "loss": 0.93430501, "num_input_tokens_seen": 330388860, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09124756, "step": 15319, "time_per_iteration": 2.5131139755249023 }, { "auxiliary_loss_clip": 0.06318989, "auxiliary_loss_mlp": 0.01251037, "balance_loss_clip": 0.0626395, "balance_loss_mlp": 0.01250053, "epoch": 0.9210882308732903, "flos": 71448292851840.0, "grad_norm": 0.7533402564228212, "language_loss": 0.56111258, "learning_rate": 6.491925591943559e-08, "loss": 0.63681281, "num_input_tokens_seen": 330448735, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.00983429, "step": 15320, "time_per_iteration": 3.1903178691864014 }, { "auxiliary_loss_clip": 0.06418821, "auxiliary_loss_mlp": 0.01268044, "balance_loss_clip": 0.06274854, "balance_loss_mlp": 0.01257202, "epoch": 0.9211483541259582, "flos": 18514020787200.0, "grad_norm": 2.1388089686406495, "language_loss": 0.64311194, "learning_rate": 6.482086921695384e-08, "loss": 0.7199806, "num_input_tokens_seen": 330465600, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10839844, "step": 15321, "time_per_iteration": 2.530526876449585 }, { "auxiliary_loss_clip": 0.06407367, "auxiliary_loss_mlp": 0.01265239, "balance_loss_clip": 0.0627545, "balance_loss_mlp": 0.01256763, "epoch": 0.9212084773786262, "flos": 23264927435520.0, "grad_norm": 1.5857730312978033, "language_loss": 0.72156054, "learning_rate": 6.47225558966582e-08, "loss": 0.79828656, "num_input_tokens_seen": 330485770, "router_z_loss_clip": 1.3203125, "router_z_loss_mlp": 0.08462524, "step": 15322, "time_per_iteration": 2.5759246349334717 }, { "auxiliary_loss_clip": 0.06408475, "auxiliary_loss_mlp": 0.01266327, "balance_loss_clip": 0.06271788, "balance_loss_mlp": 0.01257207, "epoch": 0.9212686006312941, "flos": 16295056162560.0, "grad_norm": 2.089130987254664, "language_loss": 0.6990028, "learning_rate": 6.462431596227725e-08, "loss": 0.77575082, "num_input_tokens_seen": 330504255, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.09112549, "step": 15323, "time_per_iteration": 2.5386850833892822 }, { "auxiliary_loss_clip": 0.06414318, "auxiliary_loss_mlp": 0.01268134, "balance_loss_clip": 0.06273085, "balance_loss_mlp": 0.01257393, "epoch": 0.9213287238839621, "flos": 19790837303040.0, "grad_norm": 2.077845945646962, "language_loss": 0.74760365, "learning_rate": 6.452614941753597e-08, "loss": 0.8244282, "num_input_tokens_seen": 330520705, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.10736084, "step": 15324, "time_per_iteration": 3.952986001968384 }, { "auxiliary_loss_clip": 0.06418967, "auxiliary_loss_mlp": 0.01267772, "balance_loss_clip": 0.06280692, "balance_loss_mlp": 0.01258056, "epoch": 0.92138884713663, "flos": 21036361518720.0, "grad_norm": 2.3441611795681454, "language_loss": 0.71713471, "learning_rate": 6.442805626615744e-08, "loss": 0.79400212, "num_input_tokens_seen": 330539245, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.097229, "step": 15325, "time_per_iteration": 2.6148085594177246 }, { "auxiliary_loss_clip": 0.06410058, "auxiliary_loss_mlp": 0.01262932, "balance_loss_clip": 0.0627223, "balance_loss_mlp": 0.01253717, "epoch": 0.9214489703892981, "flos": 28595207940480.0, "grad_norm": 1.4515997775523921, "language_loss": 0.78471851, "learning_rate": 6.433003651186109e-08, "loss": 0.86144835, "num_input_tokens_seen": 330561815, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09210205, "step": 15326, "time_per_iteration": 2.6226325035095215 }, { "auxiliary_loss_clip": 0.06425577, "auxiliary_loss_mlp": 0.01267088, "balance_loss_clip": 0.06282726, "balance_loss_mlp": 0.01256234, "epoch": 0.921509093641966, "flos": 16366864711680.0, "grad_norm": 2.1973491644922394, "language_loss": 0.71387619, "learning_rate": 6.42320901583635e-08, "loss": 0.79080284, "num_input_tokens_seen": 330579760, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10858154, "step": 15327, "time_per_iteration": 2.6449859142303467 }, { "auxiliary_loss_clip": 0.06419446, "auxiliary_loss_mlp": 0.01264036, "balance_loss_clip": 0.06275834, "balance_loss_mlp": 0.01254016, "epoch": 0.921569216894634, "flos": 26837632224000.0, "grad_norm": 1.7665054187519014, "language_loss": 0.77911824, "learning_rate": 6.413421720937906e-08, "loss": 0.8559531, "num_input_tokens_seen": 330598545, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10009766, "step": 15328, "time_per_iteration": 2.590099573135376 }, { "auxiliary_loss_clip": 0.06411578, "auxiliary_loss_mlp": 0.01262889, "balance_loss_clip": 0.06276077, "balance_loss_mlp": 0.01254133, "epoch": 0.921629340147302, "flos": 24652140105600.0, "grad_norm": 2.3061924352310768, "language_loss": 0.71637487, "learning_rate": 6.4036417668619e-08, "loss": 0.79311955, "num_input_tokens_seen": 330616700, "router_z_loss_clip": 1.35644531, "router_z_loss_mlp": 0.08764648, "step": 15329, "time_per_iteration": 2.5679218769073486 }, { "auxiliary_loss_clip": 0.06411076, "auxiliary_loss_mlp": 0.01263163, "balance_loss_clip": 0.06272309, "balance_loss_mlp": 0.01254389, "epoch": 0.9216894633999699, "flos": 15092018766720.0, "grad_norm": 1.6762612785405935, "language_loss": 0.869847, "learning_rate": 6.393869153979192e-08, "loss": 0.94658947, "num_input_tokens_seen": 330633355, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.08770752, "step": 15330, "time_per_iteration": 2.502420425415039 }, { "auxiliary_loss_clip": 0.06419138, "auxiliary_loss_mlp": 0.01265093, "balance_loss_clip": 0.06278455, "balance_loss_mlp": 0.0125499, "epoch": 0.921749586652638, "flos": 19209912145920.0, "grad_norm": 4.649668223153026, "language_loss": 0.76032448, "learning_rate": 6.384103882660397e-08, "loss": 0.83716679, "num_input_tokens_seen": 330651470, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.10101318, "step": 15331, "time_per_iteration": 2.549673557281494 }, { "auxiliary_loss_clip": 0.06415229, "auxiliary_loss_mlp": 0.01265182, "balance_loss_clip": 0.06275758, "balance_loss_mlp": 0.01256056, "epoch": 0.9218097099053059, "flos": 20528796211200.0, "grad_norm": 1.6721489746873561, "language_loss": 0.75716901, "learning_rate": 6.374345953275794e-08, "loss": 0.83397317, "num_input_tokens_seen": 330669170, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09130859, "step": 15332, "time_per_iteration": 2.529405117034912 }, { "auxiliary_loss_clip": 0.06412155, "auxiliary_loss_mlp": 0.01267248, "balance_loss_clip": 0.06274958, "balance_loss_mlp": 0.01258677, "epoch": 0.9218698331579739, "flos": 17354518387200.0, "grad_norm": 1.7186881968961534, "language_loss": 0.74866986, "learning_rate": 6.364595366195358e-08, "loss": 0.82546395, "num_input_tokens_seen": 330686635, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.08581543, "step": 15333, "time_per_iteration": 2.546229362487793 }, { "auxiliary_loss_clip": 0.06318633, "auxiliary_loss_mlp": 0.01251284, "balance_loss_clip": 0.06263445, "balance_loss_mlp": 0.01250255, "epoch": 0.9219299564106418, "flos": 61975717430400.0, "grad_norm": 0.7787328093202219, "language_loss": 0.52941412, "learning_rate": 6.354852121788879e-08, "loss": 0.60511327, "num_input_tokens_seen": 330749160, "router_z_loss_clip": 0.55224609, "router_z_loss_mlp": 0.01029205, "step": 15334, "time_per_iteration": 3.1408705711364746 }, { "auxiliary_loss_clip": 0.06408498, "auxiliary_loss_mlp": 0.01264128, "balance_loss_clip": 0.0627272, "balance_loss_mlp": 0.01255551, "epoch": 0.9219900796633098, "flos": 15706542211200.0, "grad_norm": 1.7219896200807476, "language_loss": 0.6290344, "learning_rate": 6.345116220425839e-08, "loss": 0.70576072, "num_input_tokens_seen": 330766840, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.08569336, "step": 15335, "time_per_iteration": 2.515177011489868 }, { "auxiliary_loss_clip": 0.06413175, "auxiliary_loss_mlp": 0.0126501, "balance_loss_clip": 0.06274638, "balance_loss_mlp": 0.01255658, "epoch": 0.9220502029159777, "flos": 24938996958720.0, "grad_norm": 1.7626185426844905, "language_loss": 0.71691668, "learning_rate": 6.335387662475366e-08, "loss": 0.79369855, "num_input_tokens_seen": 330785585, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09350586, "step": 15336, "time_per_iteration": 2.589369058609009 }, { "auxiliary_loss_clip": 0.0641433, "auxiliary_loss_mlp": 0.01263695, "balance_loss_clip": 0.06277813, "balance_loss_mlp": 0.01255005, "epoch": 0.9221103261686457, "flos": 15672315018240.0, "grad_norm": 1.8537067253136068, "language_loss": 0.72131342, "learning_rate": 6.325666448306433e-08, "loss": 0.79809368, "num_input_tokens_seen": 330800750, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.08691406, "step": 15337, "time_per_iteration": 2.4993865489959717 }, { "auxiliary_loss_clip": 0.06319115, "auxiliary_loss_mlp": 0.0125181, "balance_loss_clip": 0.06263863, "balance_loss_mlp": 0.0125078, "epoch": 0.9221704494213137, "flos": 67536643098240.0, "grad_norm": 0.8595386678831236, "language_loss": 0.651178, "learning_rate": 6.31595257828763e-08, "loss": 0.72688723, "num_input_tokens_seen": 330863640, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01030731, "step": 15338, "time_per_iteration": 3.1457438468933105 }, { "auxiliary_loss_clip": 0.06420034, "auxiliary_loss_mlp": 0.01264981, "balance_loss_clip": 0.06279501, "balance_loss_mlp": 0.0125579, "epoch": 0.9222305726739817, "flos": 30234798708480.0, "grad_norm": 1.9250815047327563, "language_loss": 0.67173803, "learning_rate": 6.306246052787289e-08, "loss": 0.7485882, "num_input_tokens_seen": 330884675, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09185791, "step": 15339, "time_per_iteration": 2.6093153953552246 }, { "auxiliary_loss_clip": 0.06412325, "auxiliary_loss_mlp": 0.01262887, "balance_loss_clip": 0.0627362, "balance_loss_mlp": 0.01253702, "epoch": 0.9222906959266496, "flos": 25344132249600.0, "grad_norm": 1.6972469879095156, "language_loss": 0.71680534, "learning_rate": 6.296546872173513e-08, "loss": 0.79355747, "num_input_tokens_seen": 330904125, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09185791, "step": 15340, "time_per_iteration": 2.5862207412719727 }, { "auxiliary_loss_clip": 0.06415537, "auxiliary_loss_mlp": 0.01266966, "balance_loss_clip": 0.06277966, "balance_loss_mlp": 0.01257519, "epoch": 0.9223508191793176, "flos": 27607260775680.0, "grad_norm": 1.484027931154371, "language_loss": 0.70622277, "learning_rate": 6.286855036814098e-08, "loss": 0.7830478, "num_input_tokens_seen": 330925140, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09448242, "step": 15341, "time_per_iteration": 2.587176561355591 }, { "auxiliary_loss_clip": 0.06405282, "auxiliary_loss_mlp": 0.01264491, "balance_loss_clip": 0.06271882, "balance_loss_mlp": 0.01256009, "epoch": 0.9224109424319856, "flos": 27314869553280.0, "grad_norm": 1.9765278780432964, "language_loss": 0.67600954, "learning_rate": 6.277170547076571e-08, "loss": 0.75270724, "num_input_tokens_seen": 330946625, "router_z_loss_clip": 1.33496094, "router_z_loss_mlp": 0.08477783, "step": 15342, "time_per_iteration": 2.604600429534912 }, { "auxiliary_loss_clip": 0.0641498, "auxiliary_loss_mlp": 0.01262592, "balance_loss_clip": 0.06277229, "balance_loss_mlp": 0.01253491, "epoch": 0.9224710656846535, "flos": 48218152389120.0, "grad_norm": 2.2538924933657314, "language_loss": 0.6945076, "learning_rate": 6.26749340332815e-08, "loss": 0.77128327, "num_input_tokens_seen": 330967795, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09100342, "step": 15343, "time_per_iteration": 2.771757125854492 }, { "auxiliary_loss_clip": 0.06317218, "auxiliary_loss_mlp": 0.01250687, "balance_loss_clip": 0.06262022, "balance_loss_mlp": 0.01249618, "epoch": 0.9225311889373216, "flos": 66743814165120.0, "grad_norm": 0.7185792901187861, "language_loss": 0.5194186, "learning_rate": 6.257823605935786e-08, "loss": 0.59509766, "num_input_tokens_seen": 331040850, "router_z_loss_clip": 0.55322266, "router_z_loss_mlp": 0.01070404, "step": 15344, "time_per_iteration": 4.788966417312622 }, { "auxiliary_loss_clip": 0.06405304, "auxiliary_loss_mlp": 0.01263025, "balance_loss_clip": 0.06272438, "balance_loss_mlp": 0.0125468, "epoch": 0.9225913121899895, "flos": 22277525322240.0, "grad_norm": 1.5833497063990813, "language_loss": 0.70618457, "learning_rate": 6.248161155266162e-08, "loss": 0.78286791, "num_input_tokens_seen": 331060595, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.08343506, "step": 15345, "time_per_iteration": 2.6425154209136963 }, { "auxiliary_loss_clip": 0.06415741, "auxiliary_loss_mlp": 0.01264682, "balance_loss_clip": 0.06276107, "balance_loss_mlp": 0.01255259, "epoch": 0.9226514354426575, "flos": 20088679040640.0, "grad_norm": 2.276708632011941, "language_loss": 0.77462953, "learning_rate": 6.238506051685677e-08, "loss": 0.85143375, "num_input_tokens_seen": 331080195, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09423828, "step": 15346, "time_per_iteration": 2.560673952102661 }, { "auxiliary_loss_clip": 0.06420451, "auxiliary_loss_mlp": 0.01268921, "balance_loss_clip": 0.06276065, "balance_loss_mlp": 0.01258526, "epoch": 0.9227115586953254, "flos": 16076402133120.0, "grad_norm": 1.8748110092453374, "language_loss": 0.7605437, "learning_rate": 6.228858295560457e-08, "loss": 0.83743739, "num_input_tokens_seen": 331097645, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.10394287, "step": 15347, "time_per_iteration": 2.526628017425537 }, { "auxiliary_loss_clip": 0.06408317, "auxiliary_loss_mlp": 0.01265201, "balance_loss_clip": 0.06274299, "balance_loss_mlp": 0.01256779, "epoch": 0.9227716819479934, "flos": 20451788709120.0, "grad_norm": 1.5851655200125538, "language_loss": 0.77320367, "learning_rate": 6.219217887256367e-08, "loss": 0.84993887, "num_input_tokens_seen": 331116830, "router_z_loss_clip": 1.34277344, "router_z_loss_mlp": 0.08428955, "step": 15348, "time_per_iteration": 2.593111753463745 }, { "auxiliary_loss_clip": 0.06415766, "auxiliary_loss_mlp": 0.01265016, "balance_loss_clip": 0.0627379, "balance_loss_mlp": 0.0125468, "epoch": 0.9228318052006613, "flos": 25014033889920.0, "grad_norm": 1.717631214259022, "language_loss": 0.67760921, "learning_rate": 6.209584827138959e-08, "loss": 0.75441694, "num_input_tokens_seen": 331137235, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.10345459, "step": 15349, "time_per_iteration": 2.588189125061035 }, { "auxiliary_loss_clip": 0.06416421, "auxiliary_loss_mlp": 0.01263608, "balance_loss_clip": 0.06276132, "balance_loss_mlp": 0.01254297, "epoch": 0.9228919284533293, "flos": 12682170541440.0, "grad_norm": 2.5332834035627987, "language_loss": 0.86914313, "learning_rate": 6.199959115573495e-08, "loss": 0.94594342, "num_input_tokens_seen": 331153155, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09307861, "step": 15350, "time_per_iteration": 3.979116678237915 }, { "auxiliary_loss_clip": 0.06319772, "auxiliary_loss_mlp": 0.01251738, "balance_loss_clip": 0.06264736, "balance_loss_mlp": 0.01250689, "epoch": 0.9229520517059973, "flos": 70005050928000.0, "grad_norm": 0.7805550220063343, "language_loss": 0.60260993, "learning_rate": 6.190340752924994e-08, "loss": 0.67832506, "num_input_tokens_seen": 331214895, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.01050568, "step": 15351, "time_per_iteration": 3.115595579147339 }, { "auxiliary_loss_clip": 0.06419332, "auxiliary_loss_mlp": 0.01265381, "balance_loss_clip": 0.06277423, "balance_loss_mlp": 0.01256381, "epoch": 0.9230121749586653, "flos": 14799166346880.0, "grad_norm": 2.642399315862427, "language_loss": 0.78033423, "learning_rate": 6.180729739558233e-08, "loss": 0.85718131, "num_input_tokens_seen": 331232185, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.09002686, "step": 15352, "time_per_iteration": 2.520127058029175 }, { "auxiliary_loss_clip": 0.06419896, "auxiliary_loss_mlp": 0.01263851, "balance_loss_clip": 0.06276189, "balance_loss_mlp": 0.01253528, "epoch": 0.9230722982113332, "flos": 22974003659520.0, "grad_norm": 2.1796935727609705, "language_loss": 0.59830856, "learning_rate": 6.171126075837585e-08, "loss": 0.67514598, "num_input_tokens_seen": 331251065, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10327148, "step": 15353, "time_per_iteration": 2.548182487487793 }, { "auxiliary_loss_clip": 0.06413051, "auxiliary_loss_mlp": 0.01263603, "balance_loss_clip": 0.06276074, "balance_loss_mlp": 0.01254424, "epoch": 0.9231324214640012, "flos": 18557346147840.0, "grad_norm": 1.5294735259307162, "language_loss": 0.74650735, "learning_rate": 6.161529762127293e-08, "loss": 0.8232739, "num_input_tokens_seen": 331269110, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09191895, "step": 15354, "time_per_iteration": 2.5215847492218018 }, { "auxiliary_loss_clip": 0.0642239, "auxiliary_loss_mlp": 0.01267991, "balance_loss_clip": 0.06278089, "balance_loss_mlp": 0.01257358, "epoch": 0.9231925447166691, "flos": 22087899532800.0, "grad_norm": 2.029592806277631, "language_loss": 0.6454888, "learning_rate": 6.1519407987912e-08, "loss": 0.72239262, "num_input_tokens_seen": 331286555, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10638428, "step": 15355, "time_per_iteration": 2.5387794971466064 }, { "auxiliary_loss_clip": 0.06410958, "auxiliary_loss_mlp": 0.01263317, "balance_loss_clip": 0.06276792, "balance_loss_mlp": 0.012544, "epoch": 0.9232526679693371, "flos": 26548259748480.0, "grad_norm": 1.4888999527749582, "language_loss": 0.74396849, "learning_rate": 6.142359186192947e-08, "loss": 0.82071126, "num_input_tokens_seen": 331307660, "router_z_loss_clip": 1.33984375, "router_z_loss_mlp": 0.08917236, "step": 15356, "time_per_iteration": 4.067466497421265 }, { "auxiliary_loss_clip": 0.06416973, "auxiliary_loss_mlp": 0.01265641, "balance_loss_clip": 0.06276221, "balance_loss_mlp": 0.01256009, "epoch": 0.9233127912220052, "flos": 14761878480000.0, "grad_norm": 1.7027184719789237, "language_loss": 0.61463189, "learning_rate": 6.132784924695844e-08, "loss": 0.69145799, "num_input_tokens_seen": 331324885, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09637451, "step": 15357, "time_per_iteration": 2.527022361755371 }, { "auxiliary_loss_clip": 0.06420457, "auxiliary_loss_mlp": 0.01264879, "balance_loss_clip": 0.06277694, "balance_loss_mlp": 0.01255169, "epoch": 0.9233729144746731, "flos": 25268298704640.0, "grad_norm": 1.3970823950530478, "language_loss": 0.70122921, "learning_rate": 6.123218014662956e-08, "loss": 0.77808261, "num_input_tokens_seen": 331345885, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.09710693, "step": 15358, "time_per_iteration": 2.561272382736206 }, { "auxiliary_loss_clip": 0.06413592, "auxiliary_loss_mlp": 0.01263515, "balance_loss_clip": 0.06275566, "balance_loss_mlp": 0.01254693, "epoch": 0.9234330377273411, "flos": 27856368564480.0, "grad_norm": 2.8593101790443267, "language_loss": 0.73958826, "learning_rate": 6.113658456457104e-08, "loss": 0.81635934, "num_input_tokens_seen": 331364320, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.0881958, "step": 15359, "time_per_iteration": 2.5875003337860107 }, { "auxiliary_loss_clip": 0.06413706, "auxiliary_loss_mlp": 0.01266549, "balance_loss_clip": 0.06273988, "balance_loss_mlp": 0.01256977, "epoch": 0.923493160980009, "flos": 24615313436160.0, "grad_norm": 1.7286813156547893, "language_loss": 0.64849818, "learning_rate": 6.104106250440732e-08, "loss": 0.72530067, "num_input_tokens_seen": 331384135, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09576416, "step": 15360, "time_per_iteration": 2.5650815963745117 }, { "auxiliary_loss_clip": 0.06316838, "auxiliary_loss_mlp": 0.01250969, "balance_loss_clip": 0.06261628, "balance_loss_mlp": 0.01249961, "epoch": 0.923553284232677, "flos": 67721656913280.0, "grad_norm": 0.7673370867993129, "language_loss": 0.54922491, "learning_rate": 6.094561396976083e-08, "loss": 0.62490296, "num_input_tokens_seen": 331440645, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01007843, "step": 15361, "time_per_iteration": 3.1005606651306152 }, { "auxiliary_loss_clip": 0.06421639, "auxiliary_loss_mlp": 0.0126312, "balance_loss_clip": 0.0627776, "balance_loss_mlp": 0.01253386, "epoch": 0.9236134074853449, "flos": 18813246117120.0, "grad_norm": 1.7543852061000598, "language_loss": 0.69994724, "learning_rate": 6.085023896425112e-08, "loss": 0.77679485, "num_input_tokens_seen": 331459580, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.09735107, "step": 15362, "time_per_iteration": 2.5239222049713135 }, { "auxiliary_loss_clip": 0.06418125, "auxiliary_loss_mlp": 0.01263821, "balance_loss_clip": 0.06274149, "balance_loss_mlp": 0.01253278, "epoch": 0.923673530738013, "flos": 27789800895360.0, "grad_norm": 1.4300610742590854, "language_loss": 0.75966483, "learning_rate": 6.075493749149463e-08, "loss": 0.83648431, "num_input_tokens_seen": 331481560, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10534668, "step": 15363, "time_per_iteration": 4.070420265197754 }, { "auxiliary_loss_clip": 0.06418295, "auxiliary_loss_mlp": 0.01267123, "balance_loss_clip": 0.06279162, "balance_loss_mlp": 0.01257741, "epoch": 0.9237336539906809, "flos": 26804369352960.0, "grad_norm": 1.9246658064764584, "language_loss": 0.8394081, "learning_rate": 6.065970955510514e-08, "loss": 0.91626233, "num_input_tokens_seen": 331499090, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09381104, "step": 15364, "time_per_iteration": 2.593752384185791 }, { "auxiliary_loss_clip": 0.06410188, "auxiliary_loss_mlp": 0.0126679, "balance_loss_clip": 0.0627226, "balance_loss_mlp": 0.01257844, "epoch": 0.9237937772433489, "flos": 23594648451840.0, "grad_norm": 1.4264217976829554, "language_loss": 0.6810509, "learning_rate": 6.056455515869419e-08, "loss": 0.75782067, "num_input_tokens_seen": 331519420, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.0894165, "step": 15365, "time_per_iteration": 2.5596084594726562 }, { "auxiliary_loss_clip": 0.06411453, "auxiliary_loss_mlp": 0.01264945, "balance_loss_clip": 0.06273635, "balance_loss_mlp": 0.01254997, "epoch": 0.9238539004960168, "flos": 26147736432000.0, "grad_norm": 2.083066958633676, "language_loss": 0.63395894, "learning_rate": 6.046947430586913e-08, "loss": 0.71072292, "num_input_tokens_seen": 331538720, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09942627, "step": 15366, "time_per_iteration": 2.5897233486175537 }, { "auxiliary_loss_clip": 0.06414079, "auxiliary_loss_mlp": 0.01263655, "balance_loss_clip": 0.06277595, "balance_loss_mlp": 0.01254106, "epoch": 0.9239140237486848, "flos": 21074152510080.0, "grad_norm": 1.4153182995417222, "language_loss": 0.74544668, "learning_rate": 6.037446700023619e-08, "loss": 0.82222402, "num_input_tokens_seen": 331558505, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09545898, "step": 15367, "time_per_iteration": 2.5590269565582275 }, { "auxiliary_loss_clip": 0.06401341, "auxiliary_loss_mlp": 0.01266479, "balance_loss_clip": 0.06270849, "balance_loss_mlp": 0.0125764, "epoch": 0.9239741470013527, "flos": 24614810311680.0, "grad_norm": 2.1116765664450745, "language_loss": 0.65106732, "learning_rate": 6.027953324539759e-08, "loss": 0.72774553, "num_input_tokens_seen": 331578440, "router_z_loss_clip": 1.30664062, "router_z_loss_mlp": 0.08850098, "step": 15368, "time_per_iteration": 2.5675008296966553 }, { "auxiliary_loss_clip": 0.0641817, "auxiliary_loss_mlp": 0.01268118, "balance_loss_clip": 0.0627498, "balance_loss_mlp": 0.01258366, "epoch": 0.9240342702540207, "flos": 24725290320000.0, "grad_norm": 2.548592199007535, "language_loss": 0.74907553, "learning_rate": 6.018467304495401e-08, "loss": 0.82593846, "num_input_tokens_seen": 331598945, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.09747314, "step": 15369, "time_per_iteration": 2.5803608894348145 }, { "auxiliary_loss_clip": 0.06424029, "auxiliary_loss_mlp": 0.01265818, "balance_loss_clip": 0.06278363, "balance_loss_mlp": 0.01254392, "epoch": 0.9240943935066888, "flos": 20856253167360.0, "grad_norm": 1.8328405666067695, "language_loss": 0.76789558, "learning_rate": 6.008988640250145e-08, "loss": 0.84479403, "num_input_tokens_seen": 331616700, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.11437988, "step": 15370, "time_per_iteration": 2.596898317337036 }, { "auxiliary_loss_clip": 0.06415437, "auxiliary_loss_mlp": 0.01262709, "balance_loss_clip": 0.0627465, "balance_loss_mlp": 0.0125381, "epoch": 0.9241545167593567, "flos": 24469222642560.0, "grad_norm": 2.054435670730766, "language_loss": 0.66960329, "learning_rate": 5.999517332163528e-08, "loss": 0.74638474, "num_input_tokens_seen": 331635625, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.08898926, "step": 15371, "time_per_iteration": 2.5828044414520264 }, { "auxiliary_loss_clip": 0.06321214, "auxiliary_loss_mlp": 0.01251765, "balance_loss_clip": 0.06266145, "balance_loss_mlp": 0.01250766, "epoch": 0.9242146400120247, "flos": 61847110212480.0, "grad_norm": 0.7210236654302203, "language_loss": 0.57724202, "learning_rate": 5.99005338059464e-08, "loss": 0.6529718, "num_input_tokens_seen": 331698595, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.00998688, "step": 15372, "time_per_iteration": 3.1717021465301514 }, { "auxiliary_loss_clip": 0.06409574, "auxiliary_loss_mlp": 0.0126855, "balance_loss_clip": 0.06274394, "balance_loss_mlp": 0.0126014, "epoch": 0.9242747632646926, "flos": 22053923902080.0, "grad_norm": 2.3898572839407985, "language_loss": 0.69885844, "learning_rate": 5.98059678590237e-08, "loss": 0.77563965, "num_input_tokens_seen": 331717975, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.08404541, "step": 15373, "time_per_iteration": 2.5487937927246094 }, { "auxiliary_loss_clip": 0.06417136, "auxiliary_loss_mlp": 0.01272547, "balance_loss_clip": 0.06277931, "balance_loss_mlp": 0.01262551, "epoch": 0.9243348865173606, "flos": 18484195933440.0, "grad_norm": 2.79068755122242, "language_loss": 0.75634325, "learning_rate": 5.971147548445299e-08, "loss": 0.83324003, "num_input_tokens_seen": 331737220, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.10003662, "step": 15374, "time_per_iteration": 2.611753463745117 }, { "auxiliary_loss_clip": 0.06414378, "auxiliary_loss_mlp": 0.01264443, "balance_loss_clip": 0.06275463, "balance_loss_mlp": 0.01255675, "epoch": 0.9243950097700285, "flos": 23265556341120.0, "grad_norm": 1.562514479988386, "language_loss": 0.64919788, "learning_rate": 5.961705668581784e-08, "loss": 0.72598612, "num_input_tokens_seen": 331757300, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.08776855, "step": 15375, "time_per_iteration": 2.5916361808776855 }, { "auxiliary_loss_clip": 0.06414125, "auxiliary_loss_mlp": 0.01262568, "balance_loss_clip": 0.06277435, "balance_loss_mlp": 0.01252865, "epoch": 0.9244551330226966, "flos": 29756261640960.0, "grad_norm": 2.0581729491230525, "language_loss": 0.66857529, "learning_rate": 5.952271146669829e-08, "loss": 0.74534225, "num_input_tokens_seen": 331776995, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.09710693, "step": 15376, "time_per_iteration": 2.5864851474761963 }, { "auxiliary_loss_clip": 0.06320901, "auxiliary_loss_mlp": 0.01251578, "balance_loss_clip": 0.06265901, "balance_loss_mlp": 0.0125054, "epoch": 0.9245152562753645, "flos": 68885310090240.0, "grad_norm": 0.6399688400070626, "language_loss": 0.61131722, "learning_rate": 5.94284398306717e-08, "loss": 0.687042, "num_input_tokens_seen": 331845015, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.01039124, "step": 15377, "time_per_iteration": 3.2443666458129883 }, { "auxiliary_loss_clip": 0.06410761, "auxiliary_loss_mlp": 0.01265136, "balance_loss_clip": 0.06272492, "balance_loss_mlp": 0.01255361, "epoch": 0.9245753795280325, "flos": 21585575105280.0, "grad_norm": 1.5959031137069388, "language_loss": 0.74115992, "learning_rate": 5.933424178131341e-08, "loss": 0.8179189, "num_input_tokens_seen": 331862795, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09765625, "step": 15378, "time_per_iteration": 2.528395175933838 }, { "auxiliary_loss_clip": 0.06415884, "auxiliary_loss_mlp": 0.01263144, "balance_loss_clip": 0.06275968, "balance_loss_mlp": 0.01253816, "epoch": 0.9246355027807004, "flos": 34504694593920.0, "grad_norm": 2.0848662304617847, "language_loss": 0.62616181, "learning_rate": 5.924011732219503e-08, "loss": 0.70295209, "num_input_tokens_seen": 331882535, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09326172, "step": 15379, "time_per_iteration": 2.6597912311553955 }, { "auxiliary_loss_clip": 0.06410627, "auxiliary_loss_mlp": 0.01264767, "balance_loss_clip": 0.06275457, "balance_loss_mlp": 0.01255576, "epoch": 0.9246956260333684, "flos": 15958123695360.0, "grad_norm": 3.053218290269333, "language_loss": 0.83933896, "learning_rate": 5.914606645688591e-08, "loss": 0.91609293, "num_input_tokens_seen": 331899335, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.09191895, "step": 15380, "time_per_iteration": 2.5176992416381836 }, { "auxiliary_loss_clip": 0.06416351, "auxiliary_loss_mlp": 0.01269219, "balance_loss_clip": 0.06273973, "balance_loss_mlp": 0.01258568, "epoch": 0.9247557492860363, "flos": 23375197808640.0, "grad_norm": 1.4242735830018505, "language_loss": 0.73501837, "learning_rate": 5.905208918895233e-08, "loss": 0.81187397, "num_input_tokens_seen": 331919030, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10656738, "step": 15381, "time_per_iteration": 2.5635550022125244 }, { "auxiliary_loss_clip": 0.06412073, "auxiliary_loss_mlp": 0.01264475, "balance_loss_clip": 0.06274672, "balance_loss_mlp": 0.01254855, "epoch": 0.9248158725387043, "flos": 23046608822400.0, "grad_norm": 1.7881456215463438, "language_loss": 0.78496063, "learning_rate": 5.8958185521958524e-08, "loss": 0.86172611, "num_input_tokens_seen": 331936465, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09625244, "step": 15382, "time_per_iteration": 2.5551679134368896 }, { "auxiliary_loss_clip": 0.06413959, "auxiliary_loss_mlp": 0.0126467, "balance_loss_clip": 0.06274069, "balance_loss_mlp": 0.01254871, "epoch": 0.9248759957913724, "flos": 22527974776320.0, "grad_norm": 2.0046924623231885, "language_loss": 0.75106108, "learning_rate": 5.886435545946455e-08, "loss": 0.82784742, "num_input_tokens_seen": 331954625, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09796143, "step": 15383, "time_per_iteration": 2.5857608318328857 }, { "auxiliary_loss_clip": 0.06409533, "auxiliary_loss_mlp": 0.01260566, "balance_loss_clip": 0.06273382, "balance_loss_mlp": 0.01251971, "epoch": 0.9249361190440403, "flos": 25454318768640.0, "grad_norm": 1.5556908155537166, "language_loss": 0.75784552, "learning_rate": 5.8770599005028456e-08, "loss": 0.83454645, "num_input_tokens_seen": 331975865, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.0859375, "step": 15384, "time_per_iteration": 4.070967435836792 }, { "auxiliary_loss_clip": 0.06406101, "auxiliary_loss_mlp": 0.01261506, "balance_loss_clip": 0.06271965, "balance_loss_mlp": 0.01252744, "epoch": 0.9249962422967083, "flos": 12382358232960.0, "grad_norm": 1.94776136679769, "language_loss": 0.66261858, "learning_rate": 5.8676916162206045e-08, "loss": 0.73929465, "num_input_tokens_seen": 331992760, "router_z_loss_clip": 1.34277344, "router_z_loss_mlp": 0.08758545, "step": 15385, "time_per_iteration": 2.5059316158294678 }, { "auxiliary_loss_clip": 0.06410181, "auxiliary_loss_mlp": 0.01267458, "balance_loss_clip": 0.06273768, "balance_loss_mlp": 0.01258333, "epoch": 0.9250563655493762, "flos": 22936003032960.0, "grad_norm": 1.754875036868346, "language_loss": 0.80588263, "learning_rate": 5.85833069345496e-08, "loss": 0.88265908, "num_input_tokens_seen": 332011890, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.09118652, "step": 15386, "time_per_iteration": 2.565819025039673 }, { "auxiliary_loss_clip": 0.06415713, "auxiliary_loss_mlp": 0.0126285, "balance_loss_clip": 0.06279749, "balance_loss_mlp": 0.01253605, "epoch": 0.9251164888020442, "flos": 18484573276800.0, "grad_norm": 1.7997401873602787, "language_loss": 0.75798213, "learning_rate": 5.8489771325608504e-08, "loss": 0.83476782, "num_input_tokens_seen": 332029485, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.09246826, "step": 15387, "time_per_iteration": 2.520455837249756 }, { "auxiliary_loss_clip": 0.06410865, "auxiliary_loss_mlp": 0.01265246, "balance_loss_clip": 0.0627598, "balance_loss_mlp": 0.01256437, "epoch": 0.9251766120547121, "flos": 33045505666560.0, "grad_norm": 1.3141410986034778, "language_loss": 0.70229346, "learning_rate": 5.839630933893014e-08, "loss": 0.77905458, "num_input_tokens_seen": 332052970, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.08807373, "step": 15388, "time_per_iteration": 2.6552627086639404 }, { "auxiliary_loss_clip": 0.06419222, "auxiliary_loss_mlp": 0.01266854, "balance_loss_clip": 0.06277758, "balance_loss_mlp": 0.01257579, "epoch": 0.9252367353073802, "flos": 24394563054720.0, "grad_norm": 1.9703878365319711, "language_loss": 0.81856, "learning_rate": 5.8302920978058115e-08, "loss": 0.89542079, "num_input_tokens_seen": 332070395, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09277344, "step": 15389, "time_per_iteration": 4.012057542800903 }, { "auxiliary_loss_clip": 0.06426613, "auxiliary_loss_mlp": 0.01265256, "balance_loss_clip": 0.06277268, "balance_loss_mlp": 0.01254837, "epoch": 0.9252968585600481, "flos": 18922887584640.0, "grad_norm": 1.519486612956465, "language_loss": 0.79422307, "learning_rate": 5.820960624653381e-08, "loss": 0.87114173, "num_input_tokens_seen": 332090185, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.10424805, "step": 15390, "time_per_iteration": 2.541121244430542 }, { "auxiliary_loss_clip": 0.06416568, "auxiliary_loss_mlp": 0.0126422, "balance_loss_clip": 0.06274626, "balance_loss_mlp": 0.0125454, "epoch": 0.9253569818127161, "flos": 21731707825920.0, "grad_norm": 1.7334301396084122, "language_loss": 0.75239158, "learning_rate": 5.811636514789597e-08, "loss": 0.82919949, "num_input_tokens_seen": 332109050, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09686279, "step": 15391, "time_per_iteration": 2.5463900566101074 }, { "auxiliary_loss_clip": 0.06415477, "auxiliary_loss_mlp": 0.0126685, "balance_loss_clip": 0.06274456, "balance_loss_mlp": 0.01257069, "epoch": 0.925417105065384, "flos": 34248878478720.0, "grad_norm": 2.1400274085213433, "language_loss": 0.52411377, "learning_rate": 5.80231976856802e-08, "loss": 0.60093707, "num_input_tokens_seen": 332131180, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09783936, "step": 15392, "time_per_iteration": 2.674672842025757 }, { "auxiliary_loss_clip": 0.06417146, "auxiliary_loss_mlp": 0.01263203, "balance_loss_clip": 0.06276231, "balance_loss_mlp": 0.01253952, "epoch": 0.925477228318052, "flos": 25966915320960.0, "grad_norm": 1.8691840748055966, "language_loss": 0.77619547, "learning_rate": 5.7930103863419454e-08, "loss": 0.85299897, "num_input_tokens_seen": 332149555, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.0925293, "step": 15393, "time_per_iteration": 2.5842881202697754 }, { "auxiliary_loss_clip": 0.06409191, "auxiliary_loss_mlp": 0.0126501, "balance_loss_clip": 0.0627288, "balance_loss_mlp": 0.01255676, "epoch": 0.9255373515707199, "flos": 11843039427840.0, "grad_norm": 1.980993678617473, "language_loss": 0.69678617, "learning_rate": 5.783708368464357e-08, "loss": 0.77352816, "num_input_tokens_seen": 332165830, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09332275, "step": 15394, "time_per_iteration": 2.5125908851623535 }, { "auxiliary_loss_clip": 0.06417276, "auxiliary_loss_mlp": 0.01264611, "balance_loss_clip": 0.06276224, "balance_loss_mlp": 0.01254544, "epoch": 0.925597474823388, "flos": 21440784049920.0, "grad_norm": 3.8284185747740582, "language_loss": 0.72740024, "learning_rate": 5.7744137152879956e-08, "loss": 0.80421919, "num_input_tokens_seen": 332185130, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.1005249, "step": 15395, "time_per_iteration": 2.546570301055908 }, { "auxiliary_loss_clip": 0.06410089, "auxiliary_loss_mlp": 0.0126251, "balance_loss_clip": 0.06274606, "balance_loss_mlp": 0.01253421, "epoch": 0.925657598076056, "flos": 22864320264960.0, "grad_norm": 2.206086043165786, "language_loss": 0.71414977, "learning_rate": 5.7651264271653785e-08, "loss": 0.79087579, "num_input_tokens_seen": 332203695, "router_z_loss_clip": 1.35546875, "router_z_loss_mlp": 0.09082031, "step": 15396, "time_per_iteration": 3.993358850479126 }, { "auxiliary_loss_clip": 0.06412224, "auxiliary_loss_mlp": 0.01264225, "balance_loss_clip": 0.06275228, "balance_loss_mlp": 0.01254646, "epoch": 0.9257177213287239, "flos": 25711350768000.0, "grad_norm": 2.0249633224793717, "language_loss": 0.87616396, "learning_rate": 5.755846504448603e-08, "loss": 0.95292842, "num_input_tokens_seen": 332224850, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.0958252, "step": 15397, "time_per_iteration": 2.584878444671631 }, { "auxiliary_loss_clip": 0.06322713, "auxiliary_loss_mlp": 0.01250579, "balance_loss_clip": 0.06267601, "balance_loss_mlp": 0.01249644, "epoch": 0.9257778445813919, "flos": 59610955501440.0, "grad_norm": 0.8355383047804577, "language_loss": 0.55140013, "learning_rate": 5.746573947489586e-08, "loss": 0.62713301, "num_input_tokens_seen": 332278085, "router_z_loss_clip": 0.55371094, "router_z_loss_mlp": 0.00933075, "step": 15398, "time_per_iteration": 3.0354020595550537 }, { "auxiliary_loss_clip": 0.06422837, "auxiliary_loss_mlp": 0.01265552, "balance_loss_clip": 0.0627787, "balance_loss_mlp": 0.0125477, "epoch": 0.9258379678340598, "flos": 27716860316160.0, "grad_norm": 1.9530247103771643, "language_loss": 0.76349759, "learning_rate": 5.7373087566400025e-08, "loss": 0.8403815, "num_input_tokens_seen": 332297875, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10772705, "step": 15399, "time_per_iteration": 2.6354897022247314 }, { "auxiliary_loss_clip": 0.06405994, "auxiliary_loss_mlp": 0.01262818, "balance_loss_clip": 0.06273112, "balance_loss_mlp": 0.01254163, "epoch": 0.9258980910867278, "flos": 24870500645760.0, "grad_norm": 1.7429150121421566, "language_loss": 0.78053939, "learning_rate": 5.7280509322510826e-08, "loss": 0.85722756, "num_input_tokens_seen": 332318500, "router_z_loss_clip": 1.32910156, "router_z_loss_mlp": 0.08651733, "step": 15400, "time_per_iteration": 2.5945441722869873 }, { "auxiliary_loss_clip": 0.06321312, "auxiliary_loss_mlp": 0.01252362, "balance_loss_clip": 0.06266233, "balance_loss_mlp": 0.0125137, "epoch": 0.9259582143393957, "flos": 63153625800960.0, "grad_norm": 0.7130852615740891, "language_loss": 0.51287407, "learning_rate": 5.718800474673946e-08, "loss": 0.58861083, "num_input_tokens_seen": 332381980, "router_z_loss_clip": 0.55371094, "router_z_loss_mlp": 0.00991058, "step": 15401, "time_per_iteration": 3.137993574142456 }, { "auxiliary_loss_clip": 0.06410454, "auxiliary_loss_mlp": 0.01264593, "balance_loss_clip": 0.06277278, "balance_loss_mlp": 0.01255974, "epoch": 0.9260183375920638, "flos": 24132835226880.0, "grad_norm": 1.6961807577303556, "language_loss": 0.82726061, "learning_rate": 5.709557384259378e-08, "loss": 0.90401113, "num_input_tokens_seen": 332399510, "router_z_loss_clip": 1.33105469, "router_z_loss_mlp": 0.08624268, "step": 15402, "time_per_iteration": 2.5691442489624023 }, { "auxiliary_loss_clip": 0.06319173, "auxiliary_loss_mlp": 0.01250711, "balance_loss_clip": 0.06264263, "balance_loss_mlp": 0.0124973, "epoch": 0.9260784608447317, "flos": 63064863999360.0, "grad_norm": 0.7108698702062654, "language_loss": 0.51081526, "learning_rate": 5.700321661357876e-08, "loss": 0.58651412, "num_input_tokens_seen": 332459130, "router_z_loss_clip": 0.54980469, "router_z_loss_mlp": 0.00980377, "step": 15403, "time_per_iteration": 4.544946193695068 }, { "auxiliary_loss_clip": 0.06314329, "auxiliary_loss_mlp": 0.01252313, "balance_loss_clip": 0.06259081, "balance_loss_mlp": 0.01251379, "epoch": 0.9261385840973997, "flos": 70607652364800.0, "grad_norm": 0.701092689846971, "language_loss": 0.58747232, "learning_rate": 5.69109330631965e-08, "loss": 0.66313875, "num_input_tokens_seen": 332526555, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00932312, "step": 15404, "time_per_iteration": 3.195589542388916 }, { "auxiliary_loss_clip": 0.06416568, "auxiliary_loss_mlp": 0.01264117, "balance_loss_clip": 0.06276482, "balance_loss_mlp": 0.01254032, "epoch": 0.9261987073500676, "flos": 20236111499520.0, "grad_norm": 2.0970653217560358, "language_loss": 0.71750313, "learning_rate": 5.681872319494596e-08, "loss": 0.79431003, "num_input_tokens_seen": 332544005, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.10089111, "step": 15405, "time_per_iteration": 2.539970874786377 }, { "auxiliary_loss_clip": 0.06417734, "auxiliary_loss_mlp": 0.0126577, "balance_loss_clip": 0.06275186, "balance_loss_mlp": 0.01255732, "epoch": 0.9262588306027356, "flos": 20959563651840.0, "grad_norm": 4.6287539656830265, "language_loss": 0.68722814, "learning_rate": 5.672658701232458e-08, "loss": 0.76406318, "num_input_tokens_seen": 332563070, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.1003418, "step": 15406, "time_per_iteration": 2.558750629425049 }, { "auxiliary_loss_clip": 0.06415321, "auxiliary_loss_mlp": 0.01262941, "balance_loss_clip": 0.06275269, "balance_loss_mlp": 0.01252528, "epoch": 0.9263189538554035, "flos": 22164361983360.0, "grad_norm": 2.04434110516158, "language_loss": 0.76667166, "learning_rate": 5.663452451882555e-08, "loss": 0.84345424, "num_input_tokens_seen": 332579620, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.10406494, "step": 15407, "time_per_iteration": 2.5480904579162598 }, { "auxiliary_loss_clip": 0.06426079, "auxiliary_loss_mlp": 0.01267779, "balance_loss_clip": 0.06279075, "balance_loss_mlp": 0.01257038, "epoch": 0.9263790771080715, "flos": 18193146376320.0, "grad_norm": 2.579528228208878, "language_loss": 0.72555745, "learning_rate": 5.6542535717940096e-08, "loss": 0.80249608, "num_input_tokens_seen": 332597795, "router_z_loss_clip": 1.47070312, "router_z_loss_mlp": 0.10742188, "step": 15408, "time_per_iteration": 2.5476815700531006 }, { "auxiliary_loss_clip": 0.06408363, "auxiliary_loss_mlp": 0.01261237, "balance_loss_clip": 0.06273566, "balance_loss_mlp": 0.01253197, "epoch": 0.9264392003607396, "flos": 48189501492480.0, "grad_norm": 1.6787116997312321, "language_loss": 0.6845783, "learning_rate": 5.645062061315675e-08, "loss": 0.76127422, "num_input_tokens_seen": 332620375, "router_z_loss_clip": 1.34863281, "router_z_loss_mlp": 0.08044434, "step": 15409, "time_per_iteration": 2.831757068634033 }, { "auxiliary_loss_clip": 0.06420366, "auxiliary_loss_mlp": 0.01267642, "balance_loss_clip": 0.0627971, "balance_loss_mlp": 0.0125733, "epoch": 0.9264993236134075, "flos": 26395586409600.0, "grad_norm": 1.808716556290024, "language_loss": 0.75678217, "learning_rate": 5.6358779207960506e-08, "loss": 0.83366227, "num_input_tokens_seen": 332639510, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.10302734, "step": 15410, "time_per_iteration": 2.626417636871338 }, { "auxiliary_loss_clip": 0.06415652, "auxiliary_loss_mlp": 0.01266259, "balance_loss_clip": 0.06274357, "balance_loss_mlp": 0.01256603, "epoch": 0.9265594468660755, "flos": 20925881510400.0, "grad_norm": 1.5090873225734127, "language_loss": 0.82152092, "learning_rate": 5.6267011505833905e-08, "loss": 0.89833999, "num_input_tokens_seen": 332658350, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09655762, "step": 15411, "time_per_iteration": 2.5621871948242188 }, { "auxiliary_loss_clip": 0.06420573, "auxiliary_loss_mlp": 0.01263379, "balance_loss_clip": 0.06280769, "balance_loss_mlp": 0.01254296, "epoch": 0.9266195701187434, "flos": 17529930910080.0, "grad_norm": 1.650126264777934, "language_loss": 0.75576913, "learning_rate": 5.617531751025728e-08, "loss": 0.8326087, "num_input_tokens_seen": 332676715, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09088135, "step": 15412, "time_per_iteration": 2.550316095352173 }, { "auxiliary_loss_clip": 0.06413072, "auxiliary_loss_mlp": 0.01265797, "balance_loss_clip": 0.06273791, "balance_loss_mlp": 0.01256636, "epoch": 0.9266796933714114, "flos": 33696436510080.0, "grad_norm": 1.5999122611036627, "language_loss": 0.66742837, "learning_rate": 5.6083697224707406e-08, "loss": 0.74421704, "num_input_tokens_seen": 332701470, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09161377, "step": 15413, "time_per_iteration": 2.674180269241333 }, { "auxiliary_loss_clip": 0.06416032, "auxiliary_loss_mlp": 0.01263887, "balance_loss_clip": 0.06274584, "balance_loss_mlp": 0.01254196, "epoch": 0.9267398166240793, "flos": 18922510241280.0, "grad_norm": 1.9456311789660816, "language_loss": 0.762483, "learning_rate": 5.5992150652658167e-08, "loss": 0.83928216, "num_input_tokens_seen": 332719060, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09692383, "step": 15414, "time_per_iteration": 2.58652925491333 }, { "auxiliary_loss_clip": 0.0640999, "auxiliary_loss_mlp": 0.01264824, "balance_loss_clip": 0.06272995, "balance_loss_mlp": 0.01256009, "epoch": 0.9267999398767474, "flos": 20484129185280.0, "grad_norm": 1.864042421742596, "language_loss": 0.81727386, "learning_rate": 5.59006777975819e-08, "loss": 0.89402199, "num_input_tokens_seen": 332736345, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.08813477, "step": 15415, "time_per_iteration": 2.542299270629883 }, { "auxiliary_loss_clip": 0.0642148, "auxiliary_loss_mlp": 0.01267497, "balance_loss_clip": 0.06278493, "balance_loss_mlp": 0.01256834, "epoch": 0.9268600631294153, "flos": 24796092620160.0, "grad_norm": 1.4594510925863333, "language_loss": 0.54072207, "learning_rate": 5.580927866294671e-08, "loss": 0.61761189, "num_input_tokens_seen": 332756270, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.10650635, "step": 15416, "time_per_iteration": 2.590837001800537 }, { "auxiliary_loss_clip": 0.06410694, "auxiliary_loss_mlp": 0.01265031, "balance_loss_clip": 0.06275206, "balance_loss_mlp": 0.01256043, "epoch": 0.9269201863820833, "flos": 18703059598080.0, "grad_norm": 1.6703180871341714, "language_loss": 0.724208, "learning_rate": 5.571795325221807e-08, "loss": 0.80096531, "num_input_tokens_seen": 332775185, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.08990479, "step": 15417, "time_per_iteration": 2.5298519134521484 }, { "auxiliary_loss_clip": 0.06412694, "auxiliary_loss_mlp": 0.01263865, "balance_loss_clip": 0.06273837, "balance_loss_mlp": 0.01254603, "epoch": 0.9269803096347512, "flos": 20930451557760.0, "grad_norm": 2.059410720414968, "language_loss": 0.7544508, "learning_rate": 5.5626701568859624e-08, "loss": 0.83121639, "num_input_tokens_seen": 332794320, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.0927124, "step": 15418, "time_per_iteration": 2.535182237625122 }, { "auxiliary_loss_clip": 0.06414624, "auxiliary_loss_mlp": 0.01266634, "balance_loss_clip": 0.06276623, "balance_loss_mlp": 0.01257181, "epoch": 0.9270404328874192, "flos": 28010425495680.0, "grad_norm": 1.6905423496072107, "language_loss": 0.7633543, "learning_rate": 5.553552361633174e-08, "loss": 0.84016687, "num_input_tokens_seen": 332818095, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09466553, "step": 15419, "time_per_iteration": 2.693098783493042 }, { "auxiliary_loss_clip": 0.06407192, "auxiliary_loss_mlp": 0.01264264, "balance_loss_clip": 0.06273123, "balance_loss_mlp": 0.01255982, "epoch": 0.9271005561400871, "flos": 25897790102400.0, "grad_norm": 1.54436526171876, "language_loss": 0.7609961, "learning_rate": 5.5444419398091636e-08, "loss": 0.83771074, "num_input_tokens_seen": 332839860, "router_z_loss_clip": 1.34082031, "router_z_loss_mlp": 0.08291626, "step": 15420, "time_per_iteration": 2.5982422828674316 }, { "auxiliary_loss_clip": 0.06418227, "auxiliary_loss_mlp": 0.01262639, "balance_loss_clip": 0.06275539, "balance_loss_mlp": 0.01252393, "epoch": 0.9271606793927551, "flos": 27061443279360.0, "grad_norm": 1.4470912621534606, "language_loss": 0.76678276, "learning_rate": 5.535338891759389e-08, "loss": 0.84359145, "num_input_tokens_seen": 332861155, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10241699, "step": 15421, "time_per_iteration": 2.629423141479492 }, { "auxiliary_loss_clip": 0.06412543, "auxiliary_loss_mlp": 0.01264763, "balance_loss_clip": 0.06273371, "balance_loss_mlp": 0.01255423, "epoch": 0.9272208026454232, "flos": 26216442380160.0, "grad_norm": 1.8643472219061472, "language_loss": 0.72807968, "learning_rate": 5.526243217829041e-08, "loss": 0.80485272, "num_input_tokens_seen": 332881110, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09338379, "step": 15422, "time_per_iteration": 2.6834301948547363 }, { "auxiliary_loss_clip": 0.06413883, "auxiliary_loss_mlp": 0.01265549, "balance_loss_clip": 0.06273503, "balance_loss_mlp": 0.01255792, "epoch": 0.9272809258980911, "flos": 12463348803840.0, "grad_norm": 2.126991167023258, "language_loss": 0.77656674, "learning_rate": 5.517154918363065e-08, "loss": 0.85336107, "num_input_tokens_seen": 332899350, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09765625, "step": 15423, "time_per_iteration": 4.03388786315918 }, { "auxiliary_loss_clip": 0.0641803, "auxiliary_loss_mlp": 0.012638, "balance_loss_clip": 0.06275533, "balance_loss_mlp": 0.01254091, "epoch": 0.9273410491507591, "flos": 22863523651200.0, "grad_norm": 1.7304815950556955, "language_loss": 0.74992251, "learning_rate": 5.508073993706053e-08, "loss": 0.82674074, "num_input_tokens_seen": 332918105, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09716797, "step": 15424, "time_per_iteration": 2.540379047393799 }, { "auxiliary_loss_clip": 0.06319684, "auxiliary_loss_mlp": 0.01250573, "balance_loss_clip": 0.06264806, "balance_loss_mlp": 0.01249608, "epoch": 0.927401172403427, "flos": 47681963383680.0, "grad_norm": 0.7627917535726876, "language_loss": 0.60218549, "learning_rate": 5.499000444202351e-08, "loss": 0.6778881, "num_input_tokens_seen": 332969490, "router_z_loss_clip": 0.54736328, "router_z_loss_mlp": 0.00963593, "step": 15425, "time_per_iteration": 2.9385762214660645 }, { "auxiliary_loss_clip": 0.064135, "auxiliary_loss_mlp": 0.01262187, "balance_loss_clip": 0.06273869, "balance_loss_mlp": 0.01252429, "epoch": 0.927461295656095, "flos": 29980324258560.0, "grad_norm": 1.484056150773979, "language_loss": 0.70892888, "learning_rate": 5.489934270196106e-08, "loss": 0.78568578, "num_input_tokens_seen": 332988805, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09753418, "step": 15426, "time_per_iteration": 2.6061198711395264 }, { "auxiliary_loss_clip": 0.06414325, "auxiliary_loss_mlp": 0.01263841, "balance_loss_clip": 0.06276479, "balance_loss_mlp": 0.01254709, "epoch": 0.9275214189087629, "flos": 20381573387520.0, "grad_norm": 2.3857045202168403, "language_loss": 0.83439517, "learning_rate": 5.480875472030977e-08, "loss": 0.91117686, "num_input_tokens_seen": 333007960, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09130859, "step": 15427, "time_per_iteration": 2.5457603931427 }, { "auxiliary_loss_clip": 0.06412293, "auxiliary_loss_mlp": 0.01264342, "balance_loss_clip": 0.06273431, "balance_loss_mlp": 0.01254364, "epoch": 0.927581542161431, "flos": 22389850120320.0, "grad_norm": 1.5340905177701067, "language_loss": 0.76972055, "learning_rate": 5.471824050050555e-08, "loss": 0.84648681, "num_input_tokens_seen": 333026035, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09985352, "step": 15428, "time_per_iteration": 2.5342912673950195 }, { "auxiliary_loss_clip": 0.0641198, "auxiliary_loss_mlp": 0.01266763, "balance_loss_clip": 0.06273713, "balance_loss_mlp": 0.01257054, "epoch": 0.9276416654140989, "flos": 23959435201920.0, "grad_norm": 1.988760797861729, "language_loss": 0.74311996, "learning_rate": 5.4627800045980555e-08, "loss": 0.81990737, "num_input_tokens_seen": 333045590, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09716797, "step": 15429, "time_per_iteration": 4.024100303649902 }, { "auxiliary_loss_clip": 0.06407042, "auxiliary_loss_mlp": 0.01265901, "balance_loss_clip": 0.06272122, "balance_loss_mlp": 0.01257017, "epoch": 0.9277017886667669, "flos": 13922831220480.0, "grad_norm": 1.782285825548984, "language_loss": 0.75035077, "learning_rate": 5.45374333601647e-08, "loss": 0.82708025, "num_input_tokens_seen": 333063355, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.08889771, "step": 15430, "time_per_iteration": 2.492889642715454 }, { "auxiliary_loss_clip": 0.06414288, "auxiliary_loss_mlp": 0.01262971, "balance_loss_clip": 0.06274024, "balance_loss_mlp": 0.01253226, "epoch": 0.9277619119194348, "flos": 35675768856960.0, "grad_norm": 1.3759923464551116, "language_loss": 0.76570773, "learning_rate": 5.444714044648391e-08, "loss": 0.8424803, "num_input_tokens_seen": 333088045, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09741211, "step": 15431, "time_per_iteration": 2.6808807849884033 }, { "auxiliary_loss_clip": 0.06410713, "auxiliary_loss_mlp": 0.01266634, "balance_loss_clip": 0.06276329, "balance_loss_mlp": 0.01256925, "epoch": 0.9278220351721028, "flos": 23847907017600.0, "grad_norm": 1.5663481027071215, "language_loss": 0.70791864, "learning_rate": 5.4356921308363e-08, "loss": 0.78469211, "num_input_tokens_seen": 333108005, "router_z_loss_clip": 1.34570312, "router_z_loss_mlp": 0.09710693, "step": 15432, "time_per_iteration": 2.540332555770874 }, { "auxiliary_loss_clip": 0.06417044, "auxiliary_loss_mlp": 0.01268964, "balance_loss_clip": 0.06275427, "balance_loss_mlp": 0.01259582, "epoch": 0.9278821584247707, "flos": 15232952534400.0, "grad_norm": 2.294396347711923, "language_loss": 0.82930315, "learning_rate": 5.4266775949222354e-08, "loss": 0.90616322, "num_input_tokens_seen": 333124335, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09387207, "step": 15433, "time_per_iteration": 2.502140760421753 }, { "auxiliary_loss_clip": 0.06404494, "auxiliary_loss_mlp": 0.01263208, "balance_loss_clip": 0.06271829, "balance_loss_mlp": 0.01254786, "epoch": 0.9279422816774388, "flos": 24688379796480.0, "grad_norm": 1.9114784145729105, "language_loss": 0.66853446, "learning_rate": 5.417670437248056e-08, "loss": 0.74521148, "num_input_tokens_seen": 333143995, "router_z_loss_clip": 1.32714844, "router_z_loss_mlp": 0.08422852, "step": 15434, "time_per_iteration": 2.5506491661071777 }, { "auxiliary_loss_clip": 0.06403243, "auxiliary_loss_mlp": 0.01262233, "balance_loss_clip": 0.06272738, "balance_loss_mlp": 0.01253543, "epoch": 0.9280024049301068, "flos": 19174762558080.0, "grad_norm": 1.5978092605032046, "language_loss": 0.68657279, "learning_rate": 5.40867065815529e-08, "loss": 0.76322752, "num_input_tokens_seen": 333162805, "router_z_loss_clip": 1.3046875, "router_z_loss_mlp": 0.0869751, "step": 15435, "time_per_iteration": 3.9597928524017334 }, { "auxiliary_loss_clip": 0.06412259, "auxiliary_loss_mlp": 0.01265437, "balance_loss_clip": 0.06272496, "balance_loss_mlp": 0.01255608, "epoch": 0.9280625281827747, "flos": 11397304033920.0, "grad_norm": 2.051598251807218, "language_loss": 0.72688591, "learning_rate": 5.399678257985263e-08, "loss": 0.80366284, "num_input_tokens_seen": 333175770, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.0982666, "step": 15436, "time_per_iteration": 2.476256847381592 }, { "auxiliary_loss_clip": 0.06418061, "auxiliary_loss_mlp": 0.01263831, "balance_loss_clip": 0.06279039, "balance_loss_mlp": 0.01255063, "epoch": 0.9281226514354427, "flos": 24791732208000.0, "grad_norm": 2.0440104007988587, "language_loss": 0.67147487, "learning_rate": 5.390693237078925e-08, "loss": 0.74829376, "num_input_tokens_seen": 333194775, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.08758545, "step": 15437, "time_per_iteration": 2.5458970069885254 }, { "auxiliary_loss_clip": 0.06419501, "auxiliary_loss_mlp": 0.01265635, "balance_loss_clip": 0.06277494, "balance_loss_mlp": 0.01254495, "epoch": 0.9281827746881106, "flos": 15088077624960.0, "grad_norm": 1.9424539949614807, "language_loss": 0.71899962, "learning_rate": 5.3817155957770254e-08, "loss": 0.79585093, "num_input_tokens_seen": 333208920, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.11138916, "step": 15438, "time_per_iteration": 2.4811573028564453 }, { "auxiliary_loss_clip": 0.06416463, "auxiliary_loss_mlp": 0.01265029, "balance_loss_clip": 0.06276107, "balance_loss_mlp": 0.01256285, "epoch": 0.9282428979407786, "flos": 24142101102720.0, "grad_norm": 1.6623107331974465, "language_loss": 0.64658493, "learning_rate": 5.3727453344199366e-08, "loss": 0.72339988, "num_input_tokens_seen": 333229350, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.08746338, "step": 15439, "time_per_iteration": 2.540013074874878 }, { "auxiliary_loss_clip": 0.06414203, "auxiliary_loss_mlp": 0.01265851, "balance_loss_clip": 0.06275681, "balance_loss_mlp": 0.01256708, "epoch": 0.9283030211934465, "flos": 24829523199360.0, "grad_norm": 1.6720964237765645, "language_loss": 0.7027272, "learning_rate": 5.363782453347876e-08, "loss": 0.77952766, "num_input_tokens_seen": 333246125, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09143066, "step": 15440, "time_per_iteration": 2.568849563598633 }, { "auxiliary_loss_clip": 0.06416844, "auxiliary_loss_mlp": 0.01267172, "balance_loss_clip": 0.06272975, "balance_loss_mlp": 0.01257242, "epoch": 0.9283631444461146, "flos": 23986702506240.0, "grad_norm": 1.53612977092463, "language_loss": 0.77120304, "learning_rate": 5.354826952900682e-08, "loss": 0.8480432, "num_input_tokens_seen": 333263685, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.09936523, "step": 15441, "time_per_iteration": 2.561410665512085 }, { "auxiliary_loss_clip": 0.06412807, "auxiliary_loss_mlp": 0.01263312, "balance_loss_clip": 0.06279663, "balance_loss_mlp": 0.01255158, "epoch": 0.9284232676987825, "flos": 22791253904640.0, "grad_norm": 1.5430812346502611, "language_loss": 0.64495039, "learning_rate": 5.345878833417949e-08, "loss": 0.72171164, "num_input_tokens_seen": 333282435, "router_z_loss_clip": 1.33203125, "router_z_loss_mlp": 0.081604, "step": 15442, "time_per_iteration": 2.560626983642578 }, { "auxiliary_loss_clip": 0.06417639, "auxiliary_loss_mlp": 0.01269746, "balance_loss_clip": 0.06274009, "balance_loss_mlp": 0.01259547, "epoch": 0.9284833909514505, "flos": 19506621853440.0, "grad_norm": 1.8174460089984026, "language_loss": 0.81092274, "learning_rate": 5.3369380952390295e-08, "loss": 0.88779664, "num_input_tokens_seen": 333300400, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10198975, "step": 15443, "time_per_iteration": 3.9019699096679688 }, { "auxiliary_loss_clip": 0.06418592, "auxiliary_loss_mlp": 0.01263866, "balance_loss_clip": 0.06277744, "balance_loss_mlp": 0.01254514, "epoch": 0.9285435142041184, "flos": 23192783470080.0, "grad_norm": 1.8181859806412453, "language_loss": 0.65733761, "learning_rate": 5.328004738702896e-08, "loss": 0.73416221, "num_input_tokens_seen": 333318980, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09344482, "step": 15444, "time_per_iteration": 2.540329933166504 }, { "auxiliary_loss_clip": 0.06414459, "auxiliary_loss_mlp": 0.01263186, "balance_loss_clip": 0.06275018, "balance_loss_mlp": 0.0125427, "epoch": 0.9286036374567864, "flos": 17681220656640.0, "grad_norm": 2.0125821484024797, "language_loss": 0.74146199, "learning_rate": 5.3190787641483215e-08, "loss": 0.8182385, "num_input_tokens_seen": 333334135, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.08917236, "step": 15445, "time_per_iteration": 2.500518798828125 }, { "auxiliary_loss_clip": 0.06412862, "auxiliary_loss_mlp": 0.01263465, "balance_loss_clip": 0.06273736, "balance_loss_mlp": 0.01254054, "epoch": 0.9286637607094543, "flos": 20892995982720.0, "grad_norm": 1.7399564341434182, "language_loss": 0.71641922, "learning_rate": 5.3101601719138135e-08, "loss": 0.79318249, "num_input_tokens_seen": 333353325, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09411621, "step": 15446, "time_per_iteration": 2.519712448120117 }, { "auxiliary_loss_clip": 0.06423716, "auxiliary_loss_mlp": 0.01261879, "balance_loss_clip": 0.06277704, "balance_loss_mlp": 0.01251836, "epoch": 0.9287238839621224, "flos": 19032025927680.0, "grad_norm": 2.2722517795761883, "language_loss": 0.69572359, "learning_rate": 5.301248962337523e-08, "loss": 0.77257955, "num_input_tokens_seen": 333371110, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.10040283, "step": 15447, "time_per_iteration": 2.5170161724090576 }, { "auxiliary_loss_clip": 0.06407112, "auxiliary_loss_mlp": 0.0126436, "balance_loss_clip": 0.06274756, "balance_loss_mlp": 0.0125555, "epoch": 0.9287840072147904, "flos": 20563065331200.0, "grad_norm": 1.7191677006740205, "language_loss": 0.72720784, "learning_rate": 5.292345135757403e-08, "loss": 0.80392253, "num_input_tokens_seen": 333391420, "router_z_loss_clip": 1.32324219, "router_z_loss_mlp": 0.08807373, "step": 15448, "time_per_iteration": 2.547828197479248 }, { "auxiliary_loss_clip": 0.06410749, "auxiliary_loss_mlp": 0.0126357, "balance_loss_clip": 0.06273785, "balance_loss_mlp": 0.01253879, "epoch": 0.9288441304674583, "flos": 21257069973120.0, "grad_norm": 1.6740573589786925, "language_loss": 0.74630916, "learning_rate": 5.283448692511072e-08, "loss": 0.82305235, "num_input_tokens_seen": 333410365, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.09698486, "step": 15449, "time_per_iteration": 2.572833299636841 }, { "auxiliary_loss_clip": 0.06414375, "auxiliary_loss_mlp": 0.01260772, "balance_loss_clip": 0.06276976, "balance_loss_mlp": 0.01251593, "epoch": 0.9289042537201263, "flos": 27676763337600.0, "grad_norm": 2.111299536075659, "language_loss": 0.6767503, "learning_rate": 5.27455963293586e-08, "loss": 0.75350183, "num_input_tokens_seen": 333430000, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09185791, "step": 15450, "time_per_iteration": 2.5631229877471924 }, { "auxiliary_loss_clip": 0.06415738, "auxiliary_loss_mlp": 0.01263154, "balance_loss_clip": 0.06275782, "balance_loss_mlp": 0.01253969, "epoch": 0.9289643769727942, "flos": 19323788244480.0, "grad_norm": 2.360951157983877, "language_loss": 0.72091436, "learning_rate": 5.265677957368875e-08, "loss": 0.79770327, "num_input_tokens_seen": 333445800, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09191895, "step": 15451, "time_per_iteration": 2.5342984199523926 }, { "auxiliary_loss_clip": 0.06417622, "auxiliary_loss_mlp": 0.01263945, "balance_loss_clip": 0.06278197, "balance_loss_mlp": 0.01254229, "epoch": 0.9290245002254622, "flos": 14062255614720.0, "grad_norm": 1.9522222224593577, "language_loss": 0.73326826, "learning_rate": 5.25680366614687e-08, "loss": 0.81008387, "num_input_tokens_seen": 333461550, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09710693, "step": 15452, "time_per_iteration": 2.4755959510803223 }, { "auxiliary_loss_clip": 0.06413199, "auxiliary_loss_mlp": 0.01265855, "balance_loss_clip": 0.06276237, "balance_loss_mlp": 0.01256396, "epoch": 0.9290846234781301, "flos": 20053235963520.0, "grad_norm": 1.7454839204455905, "language_loss": 0.74645674, "learning_rate": 5.2479367596064196e-08, "loss": 0.82324731, "num_input_tokens_seen": 333478835, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.09460449, "step": 15453, "time_per_iteration": 2.5203440189361572 }, { "auxiliary_loss_clip": 0.06318272, "auxiliary_loss_mlp": 0.01251847, "balance_loss_clip": 0.06263192, "balance_loss_mlp": 0.01250824, "epoch": 0.9291447467307982, "flos": 61244592629760.0, "grad_norm": 0.8071978204576872, "language_loss": 0.60705352, "learning_rate": 5.2390772380837226e-08, "loss": 0.6827547, "num_input_tokens_seen": 333535250, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.01023102, "step": 15454, "time_per_iteration": 3.0464138984680176 }, { "auxiliary_loss_clip": 0.06415533, "auxiliary_loss_mlp": 0.01268377, "balance_loss_clip": 0.0627517, "balance_loss_mlp": 0.01258519, "epoch": 0.9292048699834661, "flos": 20558746846080.0, "grad_norm": 1.5117898202005404, "language_loss": 0.69061279, "learning_rate": 5.230225101914709e-08, "loss": 0.76745188, "num_input_tokens_seen": 333553805, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09857178, "step": 15455, "time_per_iteration": 2.5427327156066895 }, { "auxiliary_loss_clip": 0.06414508, "auxiliary_loss_mlp": 0.01263957, "balance_loss_clip": 0.06275414, "balance_loss_mlp": 0.01255029, "epoch": 0.9292649932361341, "flos": 23630510799360.0, "grad_norm": 1.7569803847431624, "language_loss": 0.64684933, "learning_rate": 5.22138035143509e-08, "loss": 0.723634, "num_input_tokens_seen": 333572800, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.0894165, "step": 15456, "time_per_iteration": 2.562530994415283 }, { "auxiliary_loss_clip": 0.06410572, "auxiliary_loss_mlp": 0.01265801, "balance_loss_clip": 0.06273822, "balance_loss_mlp": 0.01255632, "epoch": 0.929325116488802, "flos": 15014843556480.0, "grad_norm": 1.9240384786228297, "language_loss": 0.68752182, "learning_rate": 5.2125429869802615e-08, "loss": 0.76428545, "num_input_tokens_seen": 333588520, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.10162354, "step": 15457, "time_per_iteration": 2.508039951324463 }, { "auxiliary_loss_clip": 0.06415096, "auxiliary_loss_mlp": 0.0126184, "balance_loss_clip": 0.0627405, "balance_loss_mlp": 0.01252548, "epoch": 0.92938523974147, "flos": 17973108754560.0, "grad_norm": 2.737989007672978, "language_loss": 0.81430268, "learning_rate": 5.203713008885291e-08, "loss": 0.89107203, "num_input_tokens_seen": 333603435, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09289551, "step": 15458, "time_per_iteration": 2.512345790863037 }, { "auxiliary_loss_clip": 0.06413642, "auxiliary_loss_mlp": 0.01265649, "balance_loss_clip": 0.06274513, "balance_loss_mlp": 0.01256273, "epoch": 0.9294453629941379, "flos": 23009740225920.0, "grad_norm": 1.8422577624632872, "language_loss": 0.72215348, "learning_rate": 5.194890417485065e-08, "loss": 0.79894644, "num_input_tokens_seen": 333623305, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09375, "step": 15459, "time_per_iteration": 2.5665159225463867 }, { "auxiliary_loss_clip": 0.06415464, "auxiliary_loss_mlp": 0.01266481, "balance_loss_clip": 0.06275576, "balance_loss_mlp": 0.01257081, "epoch": 0.929505486246806, "flos": 17060827426560.0, "grad_norm": 2.2177301288167866, "language_loss": 0.58918428, "learning_rate": 5.1860752131141384e-08, "loss": 0.6660037, "num_input_tokens_seen": 333641205, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09399414, "step": 15460, "time_per_iteration": 2.505251884460449 }, { "auxiliary_loss_clip": 0.06419525, "auxiliary_loss_mlp": 0.0126689, "balance_loss_clip": 0.06277893, "balance_loss_mlp": 0.01257318, "epoch": 0.9295656094994739, "flos": 27347084248320.0, "grad_norm": 1.7377323825142221, "language_loss": 0.80371273, "learning_rate": 5.177267396106733e-08, "loss": 0.88057685, "num_input_tokens_seen": 333659615, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09576416, "step": 15461, "time_per_iteration": 2.57303786277771 }, { "auxiliary_loss_clip": 0.06412851, "auxiliary_loss_mlp": 0.0126605, "balance_loss_clip": 0.06275688, "balance_loss_mlp": 0.0125699, "epoch": 0.9296257327521419, "flos": 21477443011200.0, "grad_norm": 1.7920529154541458, "language_loss": 0.78344214, "learning_rate": 5.168466966796869e-08, "loss": 0.86023116, "num_input_tokens_seen": 333678985, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09063721, "step": 15462, "time_per_iteration": 3.9415717124938965 }, { "auxiliary_loss_clip": 0.06413968, "auxiliary_loss_mlp": 0.01265151, "balance_loss_clip": 0.06274772, "balance_loss_mlp": 0.01255919, "epoch": 0.9296858560048099, "flos": 16368248304000.0, "grad_norm": 1.9908792400139992, "language_loss": 0.62893999, "learning_rate": 5.159673925518282e-08, "loss": 0.70573115, "num_input_tokens_seen": 333696410, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09228516, "step": 15463, "time_per_iteration": 2.510920286178589 }, { "auxiliary_loss_clip": 0.06409511, "auxiliary_loss_mlp": 0.01262864, "balance_loss_clip": 0.06271504, "balance_loss_mlp": 0.01253714, "epoch": 0.9297459792574778, "flos": 29865819254400.0, "grad_norm": 1.4706323925969742, "language_loss": 0.71132034, "learning_rate": 5.15088827260437e-08, "loss": 0.7880441, "num_input_tokens_seen": 333716615, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09161377, "step": 15464, "time_per_iteration": 2.5903725624084473 }, { "auxiliary_loss_clip": 0.06418521, "auxiliary_loss_mlp": 0.01262617, "balance_loss_clip": 0.0627659, "balance_loss_mlp": 0.01253474, "epoch": 0.9298061025101458, "flos": 15930353266560.0, "grad_norm": 2.5531495851817585, "language_loss": 0.77537072, "learning_rate": 5.1421100083883115e-08, "loss": 0.85218203, "num_input_tokens_seen": 333732800, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09143066, "step": 15465, "time_per_iteration": 2.47422194480896 }, { "auxiliary_loss_clip": 0.06317858, "auxiliary_loss_mlp": 0.01253127, "balance_loss_clip": 0.06262814, "balance_loss_mlp": 0.01252143, "epoch": 0.9298662257628137, "flos": 64118498365440.0, "grad_norm": 0.6836759714094715, "language_loss": 0.56411088, "learning_rate": 5.133339133202952e-08, "loss": 0.63982075, "num_input_tokens_seen": 333799300, "router_z_loss_clip": 0.55224609, "router_z_loss_mlp": 0.00982666, "step": 15466, "time_per_iteration": 3.258051633834839 }, { "auxiliary_loss_clip": 0.06416017, "auxiliary_loss_mlp": 0.01268975, "balance_loss_clip": 0.06274171, "balance_loss_mlp": 0.01258944, "epoch": 0.9299263490154818, "flos": 24287143720320.0, "grad_norm": 1.4254571934926101, "language_loss": 0.72765195, "learning_rate": 5.1245756473809355e-08, "loss": 0.80450189, "num_input_tokens_seen": 333820360, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.10040283, "step": 15467, "time_per_iteration": 2.6231613159179688 }, { "auxiliary_loss_clip": 0.06417094, "auxiliary_loss_mlp": 0.01264535, "balance_loss_clip": 0.0627615, "balance_loss_mlp": 0.01255117, "epoch": 0.9299864722681497, "flos": 23300999418240.0, "grad_norm": 1.6163406652153056, "language_loss": 0.72249472, "learning_rate": 5.1158195512545076e-08, "loss": 0.79931104, "num_input_tokens_seen": 333840415, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09405518, "step": 15468, "time_per_iteration": 4.003041982650757 }, { "auxiliary_loss_clip": 0.06417497, "auxiliary_loss_mlp": 0.01263222, "balance_loss_clip": 0.06274734, "balance_loss_mlp": 0.01253328, "epoch": 0.9300465955208177, "flos": 21402112590720.0, "grad_norm": 1.6636274489744023, "language_loss": 0.7558918, "learning_rate": 5.107070845155737e-08, "loss": 0.83269906, "num_input_tokens_seen": 333859910, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.09893799, "step": 15469, "time_per_iteration": 2.6049134731292725 }, { "auxiliary_loss_clip": 0.06415203, "auxiliary_loss_mlp": 0.01265513, "balance_loss_clip": 0.06274332, "balance_loss_mlp": 0.01256209, "epoch": 0.9301067187734856, "flos": 24578319058560.0, "grad_norm": 1.815192703991649, "language_loss": 0.76338136, "learning_rate": 5.098329529416379e-08, "loss": 0.84018856, "num_input_tokens_seen": 333880495, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09307861, "step": 15470, "time_per_iteration": 2.5595171451568604 }, { "auxiliary_loss_clip": 0.06409861, "auxiliary_loss_mlp": 0.01263541, "balance_loss_clip": 0.06272575, "balance_loss_mlp": 0.01254756, "epoch": 0.9301668420261536, "flos": 22202949588480.0, "grad_norm": 1.475348740266843, "language_loss": 0.74897462, "learning_rate": 5.089595604367902e-08, "loss": 0.82570863, "num_input_tokens_seen": 333897640, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.08782959, "step": 15471, "time_per_iteration": 2.532682180404663 }, { "auxiliary_loss_clip": 0.0641049, "auxiliary_loss_mlp": 0.0126422, "balance_loss_clip": 0.06273225, "balance_loss_mlp": 0.01254838, "epoch": 0.9302269652788215, "flos": 17753196913920.0, "grad_norm": 2.795785833259003, "language_loss": 0.69081485, "learning_rate": 5.080869070341487e-08, "loss": 0.76756203, "num_input_tokens_seen": 333913670, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09381104, "step": 15472, "time_per_iteration": 2.4977080821990967 }, { "auxiliary_loss_clip": 0.06408215, "auxiliary_loss_mlp": 0.01261748, "balance_loss_clip": 0.06275724, "balance_loss_mlp": 0.01253112, "epoch": 0.9302870885314896, "flos": 19396854604800.0, "grad_norm": 1.6475994246433172, "language_loss": 0.88635665, "learning_rate": 5.0721499276680233e-08, "loss": 0.96305633, "num_input_tokens_seen": 333934105, "router_z_loss_clip": 1.32421875, "router_z_loss_mlp": 0.08642578, "step": 15473, "time_per_iteration": 2.534325361251831 }, { "auxiliary_loss_clip": 0.06420112, "auxiliary_loss_mlp": 0.01264518, "balance_loss_clip": 0.06277794, "balance_loss_mlp": 0.01254307, "epoch": 0.9303472117841575, "flos": 21766396216320.0, "grad_norm": 1.8893132585794592, "language_loss": 0.64221239, "learning_rate": 5.063438176678203e-08, "loss": 0.71905869, "num_input_tokens_seen": 333953635, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.10211182, "step": 15474, "time_per_iteration": 2.5313801765441895 }, { "auxiliary_loss_clip": 0.06413959, "auxiliary_loss_mlp": 0.01264379, "balance_loss_clip": 0.06274802, "balance_loss_mlp": 0.01254586, "epoch": 0.9304073350368255, "flos": 19615844050560.0, "grad_norm": 2.1420609993940603, "language_loss": 0.74840558, "learning_rate": 5.054733817702339e-08, "loss": 0.82518899, "num_input_tokens_seen": 333971825, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09790039, "step": 15475, "time_per_iteration": 3.997134208679199 }, { "auxiliary_loss_clip": 0.06412786, "auxiliary_loss_mlp": 0.01269303, "balance_loss_clip": 0.06274199, "balance_loss_mlp": 0.01260226, "epoch": 0.9304674582894935, "flos": 30448756909440.0, "grad_norm": 2.204858783338773, "language_loss": 0.66802454, "learning_rate": 5.0460368510704786e-08, "loss": 0.74484551, "num_input_tokens_seen": 333990120, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09082031, "step": 15476, "time_per_iteration": 2.594210386276245 }, { "auxiliary_loss_clip": 0.06419363, "auxiliary_loss_mlp": 0.01270256, "balance_loss_clip": 0.06279912, "balance_loss_mlp": 0.01260398, "epoch": 0.9305275815421614, "flos": 17791532956800.0, "grad_norm": 11.254214774949828, "language_loss": 0.69370794, "learning_rate": 5.0373472771124914e-08, "loss": 0.77060413, "num_input_tokens_seen": 334007970, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09857178, "step": 15477, "time_per_iteration": 2.5054214000701904 }, { "auxiliary_loss_clip": 0.06413721, "auxiliary_loss_mlp": 0.01269002, "balance_loss_clip": 0.06276735, "balance_loss_mlp": 0.01259811, "epoch": 0.9305877047948294, "flos": 25304999592960.0, "grad_norm": 1.615447972722632, "language_loss": 0.58737123, "learning_rate": 5.0286650961578027e-08, "loss": 0.6641984, "num_input_tokens_seen": 334027120, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09197998, "step": 15478, "time_per_iteration": 2.5759079456329346 }, { "auxiliary_loss_clip": 0.06423745, "auxiliary_loss_mlp": 0.01267419, "balance_loss_clip": 0.06276955, "balance_loss_mlp": 0.01256577, "epoch": 0.9306478280474973, "flos": 16981975134720.0, "grad_norm": 2.546731937752856, "language_loss": 0.7976917, "learning_rate": 5.01999030853566e-08, "loss": 0.87460339, "num_input_tokens_seen": 334042785, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.10839844, "step": 15479, "time_per_iteration": 2.4933419227600098 }, { "auxiliary_loss_clip": 0.06416968, "auxiliary_loss_mlp": 0.01265264, "balance_loss_clip": 0.0627633, "balance_loss_mlp": 0.01256067, "epoch": 0.9307079513001654, "flos": 35672121204480.0, "grad_norm": 1.5964213642258873, "language_loss": 0.69113255, "learning_rate": 5.0113229145750445e-08, "loss": 0.76795483, "num_input_tokens_seen": 334063480, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09197998, "step": 15480, "time_per_iteration": 2.655064344406128 }, { "auxiliary_loss_clip": 0.06414357, "auxiliary_loss_mlp": 0.01265563, "balance_loss_clip": 0.06274106, "balance_loss_mlp": 0.01256283, "epoch": 0.9307680745528333, "flos": 19214146776960.0, "grad_norm": 1.5943699587876825, "language_loss": 0.68126994, "learning_rate": 5.002662914604583e-08, "loss": 0.75806916, "num_input_tokens_seen": 334082005, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09277344, "step": 15481, "time_per_iteration": 2.5114941596984863 }, { "auxiliary_loss_clip": 0.06410246, "auxiliary_loss_mlp": 0.01263821, "balance_loss_clip": 0.06273624, "balance_loss_mlp": 0.01254952, "epoch": 0.9308281978055013, "flos": 19068684888960.0, "grad_norm": 1.7713309229056233, "language_loss": 0.75011808, "learning_rate": 4.994010308952701e-08, "loss": 0.82685876, "num_input_tokens_seen": 334101375, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.08874512, "step": 15482, "time_per_iteration": 3.933375120162964 }, { "auxiliary_loss_clip": 0.06410538, "auxiliary_loss_mlp": 0.01266225, "balance_loss_clip": 0.0627577, "balance_loss_mlp": 0.01256885, "epoch": 0.9308883210581692, "flos": 20527748035200.0, "grad_norm": 1.83748436449917, "language_loss": 0.80400574, "learning_rate": 4.985365097947469e-08, "loss": 0.88077343, "num_input_tokens_seen": 334119460, "router_z_loss_clip": 1.34863281, "router_z_loss_mlp": 0.09344482, "step": 15483, "time_per_iteration": 2.51072359085083 }, { "auxiliary_loss_clip": 0.06413941, "auxiliary_loss_mlp": 0.01265713, "balance_loss_clip": 0.06274915, "balance_loss_mlp": 0.01255598, "epoch": 0.9309484443108372, "flos": 13005686355840.0, "grad_norm": 3.135277152244589, "language_loss": 0.75156695, "learning_rate": 4.976727281916782e-08, "loss": 0.82836348, "num_input_tokens_seen": 334136065, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.10119629, "step": 15484, "time_per_iteration": 2.5248665809631348 }, { "auxiliary_loss_clip": 0.0641735, "auxiliary_loss_mlp": 0.01266222, "balance_loss_clip": 0.06276695, "balance_loss_mlp": 0.01256906, "epoch": 0.9310085675635051, "flos": 12572654855040.0, "grad_norm": 2.494412460876042, "language_loss": 0.76719749, "learning_rate": 4.968096861188087e-08, "loss": 0.84403318, "num_input_tokens_seen": 334153690, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09320068, "step": 15485, "time_per_iteration": 2.4901082515716553 }, { "auxiliary_loss_clip": 0.06417243, "auxiliary_loss_mlp": 0.01267209, "balance_loss_clip": 0.06274106, "balance_loss_mlp": 0.01257023, "epoch": 0.9310686908161732, "flos": 23484378078720.0, "grad_norm": 1.642896863400032, "language_loss": 0.7820428, "learning_rate": 4.959473836088723e-08, "loss": 0.85888731, "num_input_tokens_seen": 334171880, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.10180664, "step": 15486, "time_per_iteration": 2.5509378910064697 }, { "auxiliary_loss_clip": 0.06420851, "auxiliary_loss_mlp": 0.01268888, "balance_loss_clip": 0.06278578, "balance_loss_mlp": 0.01258636, "epoch": 0.9311288140688411, "flos": 24177124909440.0, "grad_norm": 2.431628296615697, "language_loss": 0.77059686, "learning_rate": 4.950858206945674e-08, "loss": 0.8474943, "num_input_tokens_seen": 334190005, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.1026001, "step": 15487, "time_per_iteration": 2.5674450397491455 }, { "auxiliary_loss_clip": 0.06412056, "auxiliary_loss_mlp": 0.01262525, "balance_loss_clip": 0.06274141, "balance_loss_mlp": 0.01253215, "epoch": 0.9311889373215091, "flos": 35598929063040.0, "grad_norm": 2.0146882737632605, "language_loss": 0.67628288, "learning_rate": 4.942249974085633e-08, "loss": 0.75302869, "num_input_tokens_seen": 334209545, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09313965, "step": 15488, "time_per_iteration": 2.63962459564209 }, { "auxiliary_loss_clip": 0.06413164, "auxiliary_loss_mlp": 0.01265563, "balance_loss_clip": 0.06278323, "balance_loss_mlp": 0.01255722, "epoch": 0.9312490605741771, "flos": 20236824259200.0, "grad_norm": 1.7604898982550423, "language_loss": 0.75316536, "learning_rate": 4.933649137834983e-08, "loss": 0.82995266, "num_input_tokens_seen": 334228900, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.09844971, "step": 15489, "time_per_iteration": 2.5492868423461914 }, { "auxiliary_loss_clip": 0.06419452, "auxiliary_loss_mlp": 0.01265513, "balance_loss_clip": 0.06276381, "balance_loss_mlp": 0.01255738, "epoch": 0.931309183826845, "flos": 13955087842560.0, "grad_norm": 2.3384662693728195, "language_loss": 0.81210923, "learning_rate": 4.925055698519931e-08, "loss": 0.88895893, "num_input_tokens_seen": 334245500, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.09777832, "step": 15490, "time_per_iteration": 2.5152945518493652 }, { "auxiliary_loss_clip": 0.064168, "auxiliary_loss_mlp": 0.01267706, "balance_loss_clip": 0.06275605, "balance_loss_mlp": 0.01257359, "epoch": 0.931369307079513, "flos": 20162877431040.0, "grad_norm": 1.6840610791316095, "language_loss": 0.72629571, "learning_rate": 4.9164696564663264e-08, "loss": 0.80314076, "num_input_tokens_seen": 334264370, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.10351562, "step": 15491, "time_per_iteration": 2.5259501934051514 }, { "auxiliary_loss_clip": 0.06406834, "auxiliary_loss_mlp": 0.01266071, "balance_loss_clip": 0.06272171, "balance_loss_mlp": 0.01257745, "epoch": 0.931429430332181, "flos": 25345725477120.0, "grad_norm": 1.6351563886569722, "language_loss": 0.74486786, "learning_rate": 4.9078910119997096e-08, "loss": 0.82159692, "num_input_tokens_seen": 334283905, "router_z_loss_clip": 1.34570312, "router_z_loss_mlp": 0.08325195, "step": 15492, "time_per_iteration": 2.5715038776397705 }, { "auxiliary_loss_clip": 0.06321237, "auxiliary_loss_mlp": 0.01251052, "balance_loss_clip": 0.06266128, "balance_loss_mlp": 0.01249977, "epoch": 0.931489553584849, "flos": 71245208482560.0, "grad_norm": 0.6994479063453213, "language_loss": 0.53406876, "learning_rate": 4.899319765445442e-08, "loss": 0.60979164, "num_input_tokens_seen": 334339925, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.01076508, "step": 15493, "time_per_iteration": 3.0419323444366455 }, { "auxiliary_loss_clip": 0.06416814, "auxiliary_loss_mlp": 0.01265735, "balance_loss_clip": 0.06278861, "balance_loss_mlp": 0.01257098, "epoch": 0.9315496768375169, "flos": 14648253943680.0, "grad_norm": 1.6831114173291546, "language_loss": 0.71124518, "learning_rate": 4.890755917128531e-08, "loss": 0.78807068, "num_input_tokens_seen": 334357225, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.08636475, "step": 15494, "time_per_iteration": 2.489902973175049 }, { "auxiliary_loss_clip": 0.06418363, "auxiliary_loss_mlp": 0.01265335, "balance_loss_clip": 0.06276028, "balance_loss_mlp": 0.01255923, "epoch": 0.9316098000901849, "flos": 28337505108480.0, "grad_norm": 1.5343888268870391, "language_loss": 0.6855799, "learning_rate": 4.882199467373671e-08, "loss": 0.76241684, "num_input_tokens_seen": 334375945, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09411621, "step": 15495, "time_per_iteration": 2.581075668334961 }, { "auxiliary_loss_clip": 0.06410812, "auxiliary_loss_mlp": 0.01265053, "balance_loss_clip": 0.06275, "balance_loss_mlp": 0.01256464, "epoch": 0.9316699233428528, "flos": 28520338717440.0, "grad_norm": 2.102383036475824, "language_loss": 0.62288547, "learning_rate": 4.8736504165053815e-08, "loss": 0.69964415, "num_input_tokens_seen": 334395310, "router_z_loss_clip": 1.35742188, "router_z_loss_mlp": 0.08581543, "step": 15496, "time_per_iteration": 2.57741641998291 }, { "auxiliary_loss_clip": 0.06416678, "auxiliary_loss_mlp": 0.0126481, "balance_loss_clip": 0.0627635, "balance_loss_mlp": 0.01255315, "epoch": 0.9317300465955208, "flos": 33701887025280.0, "grad_norm": 1.7547170550681408, "language_loss": 0.77097082, "learning_rate": 4.865108764847825e-08, "loss": 0.84778571, "num_input_tokens_seen": 334416965, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09490967, "step": 15497, "time_per_iteration": 2.6417617797851562 }, { "auxiliary_loss_clip": 0.06420733, "auxiliary_loss_mlp": 0.01266565, "balance_loss_clip": 0.0627775, "balance_loss_mlp": 0.01256426, "epoch": 0.9317901698481887, "flos": 23664779919360.0, "grad_norm": 4.882201044343771, "language_loss": 0.66665, "learning_rate": 4.856574512724898e-08, "loss": 0.743523, "num_input_tokens_seen": 334435620, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10131836, "step": 15498, "time_per_iteration": 2.5531935691833496 }, { "auxiliary_loss_clip": 0.06413485, "auxiliary_loss_mlp": 0.01266546, "balance_loss_clip": 0.06274524, "balance_loss_mlp": 0.01256061, "epoch": 0.9318502931008568, "flos": 20966397759360.0, "grad_norm": 1.666880776366871, "language_loss": 0.80515552, "learning_rate": 4.8480476604602305e-08, "loss": 0.8819558, "num_input_tokens_seen": 334456210, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.10479736, "step": 15499, "time_per_iteration": 2.532257318496704 }, { "auxiliary_loss_clip": 0.06409045, "auxiliary_loss_mlp": 0.01268546, "balance_loss_clip": 0.06274744, "balance_loss_mlp": 0.01258878, "epoch": 0.9319104163535247, "flos": 23447844898560.0, "grad_norm": 1.4458129661963992, "language_loss": 0.7734291, "learning_rate": 4.8395282083771196e-08, "loss": 0.850205, "num_input_tokens_seen": 334475485, "router_z_loss_clip": 1.34179688, "router_z_loss_mlp": 0.09667969, "step": 15500, "time_per_iteration": 2.5373120307922363 }, { "auxiliary_loss_clip": 0.064089, "auxiliary_loss_mlp": 0.01266905, "balance_loss_clip": 0.06272615, "balance_loss_mlp": 0.01258119, "epoch": 0.9319705396061927, "flos": 22354197408000.0, "grad_norm": 1.7675652735268246, "language_loss": 0.72535521, "learning_rate": 4.8310161567987064e-08, "loss": 0.80211329, "num_input_tokens_seen": 334494740, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.08782959, "step": 15501, "time_per_iteration": 2.5572292804718018 }, { "auxiliary_loss_clip": 0.06417954, "auxiliary_loss_mlp": 0.01264503, "balance_loss_clip": 0.06275199, "balance_loss_mlp": 0.01254776, "epoch": 0.9320306628588607, "flos": 20999450995200.0, "grad_norm": 1.6710862216954305, "language_loss": 0.66664642, "learning_rate": 4.822511506047666e-08, "loss": 0.74347103, "num_input_tokens_seen": 334511910, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09729004, "step": 15502, "time_per_iteration": 3.969602346420288 }, { "auxiliary_loss_clip": 0.06415954, "auxiliary_loss_mlp": 0.01263786, "balance_loss_clip": 0.06274908, "balance_loss_mlp": 0.01255072, "epoch": 0.9320907861115286, "flos": 24545727020160.0, "grad_norm": 1.4671679863650522, "language_loss": 0.65972227, "learning_rate": 4.814014256446586e-08, "loss": 0.73651969, "num_input_tokens_seen": 334533150, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.0871582, "step": 15503, "time_per_iteration": 2.608161687850952 }, { "auxiliary_loss_clip": 0.06414022, "auxiliary_loss_mlp": 0.01266207, "balance_loss_clip": 0.06273488, "balance_loss_mlp": 0.01255782, "epoch": 0.9321509093641966, "flos": 19790418032640.0, "grad_norm": 1.4938829398271898, "language_loss": 0.75228202, "learning_rate": 4.805524408317652e-08, "loss": 0.82908428, "num_input_tokens_seen": 334550940, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.10418701, "step": 15504, "time_per_iteration": 2.5340216159820557 }, { "auxiliary_loss_clip": 0.0641776, "auxiliary_loss_mlp": 0.01265484, "balance_loss_clip": 0.06277803, "balance_loss_mlp": 0.01255626, "epoch": 0.9322110326168646, "flos": 24979597061760.0, "grad_norm": 2.398181838516042, "language_loss": 0.71883273, "learning_rate": 4.797041961982762e-08, "loss": 0.7956652, "num_input_tokens_seen": 334570935, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09857178, "step": 15505, "time_per_iteration": 2.5729904174804688 }, { "auxiliary_loss_clip": 0.06412636, "auxiliary_loss_mlp": 0.01265643, "balance_loss_clip": 0.06271525, "balance_loss_mlp": 0.01255707, "epoch": 0.9322711558695326, "flos": 16149175004160.0, "grad_norm": 1.7579439541054804, "language_loss": 0.75769913, "learning_rate": 4.788566917763614e-08, "loss": 0.83448195, "num_input_tokens_seen": 334589315, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09936523, "step": 15506, "time_per_iteration": 2.494946002960205 }, { "auxiliary_loss_clip": 0.06403571, "auxiliary_loss_mlp": 0.01262969, "balance_loss_clip": 0.06269873, "balance_loss_mlp": 0.01254034, "epoch": 0.9323312791222005, "flos": 23739187944960.0, "grad_norm": 1.8011935318441878, "language_loss": 0.83586776, "learning_rate": 4.780099275981597e-08, "loss": 0.91253316, "num_input_tokens_seen": 334608990, "router_z_loss_clip": 1.33691406, "router_z_loss_mlp": 0.08929443, "step": 15507, "time_per_iteration": 2.5476088523864746 }, { "auxiliary_loss_clip": 0.06421405, "auxiliary_loss_mlp": 0.0126435, "balance_loss_clip": 0.06279815, "balance_loss_mlp": 0.01254927, "epoch": 0.9323914023748685, "flos": 20784318837120.0, "grad_norm": 1.448151047616097, "language_loss": 0.67737198, "learning_rate": 4.771639036957742e-08, "loss": 0.75422955, "num_input_tokens_seen": 334628655, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09423828, "step": 15508, "time_per_iteration": 3.9889445304870605 }, { "auxiliary_loss_clip": 0.0641165, "auxiliary_loss_mlp": 0.01266113, "balance_loss_clip": 0.06275465, "balance_loss_mlp": 0.01256839, "epoch": 0.9324515256275364, "flos": 23922021553920.0, "grad_norm": 1.606112849538948, "language_loss": 0.72567666, "learning_rate": 4.7631862010129033e-08, "loss": 0.80245435, "num_input_tokens_seen": 334648295, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.0927124, "step": 15509, "time_per_iteration": 2.5382673740386963 }, { "auxiliary_loss_clip": 0.06413918, "auxiliary_loss_mlp": 0.01264005, "balance_loss_clip": 0.06275088, "balance_loss_mlp": 0.01254248, "epoch": 0.9325116488802044, "flos": 18011193235200.0, "grad_norm": 1.8064299679525333, "language_loss": 0.74349988, "learning_rate": 4.754740768467624e-08, "loss": 0.82027912, "num_input_tokens_seen": 334666280, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09759521, "step": 15510, "time_per_iteration": 2.5229227542877197 }, { "auxiliary_loss_clip": 0.06421996, "auxiliary_loss_mlp": 0.01262647, "balance_loss_clip": 0.06277259, "balance_loss_mlp": 0.01253032, "epoch": 0.9325717721328723, "flos": 29029036055040.0, "grad_norm": 2.1581144603865314, "language_loss": 0.70410943, "learning_rate": 4.746302739642161e-08, "loss": 0.78095591, "num_input_tokens_seen": 334688830, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.09613037, "step": 15511, "time_per_iteration": 2.597712755203247 }, { "auxiliary_loss_clip": 0.06416589, "auxiliary_loss_mlp": 0.01267229, "balance_loss_clip": 0.06277829, "balance_loss_mlp": 0.01257508, "epoch": 0.9326318953855404, "flos": 21651681576960.0, "grad_norm": 1.997902449340195, "language_loss": 0.78126556, "learning_rate": 4.737872114856412e-08, "loss": 0.85810375, "num_input_tokens_seen": 334705205, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09716797, "step": 15512, "time_per_iteration": 2.5293045043945312 }, { "auxiliary_loss_clip": 0.06413861, "auxiliary_loss_mlp": 0.01262382, "balance_loss_clip": 0.06276365, "balance_loss_mlp": 0.01252559, "epoch": 0.9326920186382083, "flos": 26072573719680.0, "grad_norm": 1.4951531311836028, "language_loss": 0.80678362, "learning_rate": 4.7294488944301436e-08, "loss": 0.88354605, "num_input_tokens_seen": 334723830, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09820557, "step": 15513, "time_per_iteration": 2.565619468688965 }, { "auxiliary_loss_clip": 0.06426259, "auxiliary_loss_mlp": 0.01267002, "balance_loss_clip": 0.0628032, "balance_loss_mlp": 0.01256774, "epoch": 0.9327521418908763, "flos": 12061945019520.0, "grad_norm": 1.9110864130541703, "language_loss": 0.80422431, "learning_rate": 4.721033078682768e-08, "loss": 0.88115692, "num_input_tokens_seen": 334740825, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10229492, "step": 15514, "time_per_iteration": 2.507685422897339 }, { "auxiliary_loss_clip": 0.06410784, "auxiliary_loss_mlp": 0.01265784, "balance_loss_clip": 0.06275864, "balance_loss_mlp": 0.01256593, "epoch": 0.9328122651435443, "flos": 43844233259520.0, "grad_norm": 1.677008192336929, "language_loss": 0.71580839, "learning_rate": 4.7126246679333626e-08, "loss": 0.79257405, "num_input_tokens_seen": 334765825, "router_z_loss_clip": 1.34863281, "router_z_loss_mlp": 0.09197998, "step": 15515, "time_per_iteration": 4.280287265777588 }, { "auxiliary_loss_clip": 0.06420092, "auxiliary_loss_mlp": 0.012621, "balance_loss_clip": 0.06275046, "balance_loss_mlp": 0.01252015, "epoch": 0.9328723883962122, "flos": 15200318568960.0, "grad_norm": 2.0827502586349156, "language_loss": 0.8133285, "learning_rate": 4.704223662500806e-08, "loss": 0.89015043, "num_input_tokens_seen": 334782680, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.10083008, "step": 15516, "time_per_iteration": 2.5020923614501953 }, { "auxiliary_loss_clip": 0.06418963, "auxiliary_loss_mlp": 0.0126326, "balance_loss_clip": 0.06276843, "balance_loss_mlp": 0.01253711, "epoch": 0.9329325116488802, "flos": 20267194164480.0, "grad_norm": 1.6496081118812476, "language_loss": 0.80769002, "learning_rate": 4.695830062703643e-08, "loss": 0.88451225, "num_input_tokens_seen": 334800160, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09539795, "step": 15517, "time_per_iteration": 2.604250907897949 }, { "auxiliary_loss_clip": 0.06418336, "auxiliary_loss_mlp": 0.01269379, "balance_loss_clip": 0.06277214, "balance_loss_mlp": 0.01259425, "epoch": 0.9329926349015482, "flos": 13119981724800.0, "grad_norm": 2.562329674780172, "language_loss": 0.75329477, "learning_rate": 4.687443868860219e-08, "loss": 0.83017194, "num_input_tokens_seen": 334815840, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.0994873, "step": 15518, "time_per_iteration": 2.528534173965454 }, { "auxiliary_loss_clip": 0.06414521, "auxiliary_loss_mlp": 0.01267157, "balance_loss_clip": 0.062758, "balance_loss_mlp": 0.01258073, "epoch": 0.9330527581542162, "flos": 23047070019840.0, "grad_norm": 1.8536704321918587, "language_loss": 0.75548929, "learning_rate": 4.679065081288458e-08, "loss": 0.83230603, "num_input_tokens_seen": 334834735, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09082031, "step": 15519, "time_per_iteration": 2.543287515640259 }, { "auxiliary_loss_clip": 0.06413709, "auxiliary_loss_mlp": 0.01267125, "balance_loss_clip": 0.06276277, "balance_loss_mlp": 0.01257368, "epoch": 0.9331128814068841, "flos": 15565021464960.0, "grad_norm": 2.198182413478704, "language_loss": 0.83023512, "learning_rate": 4.6706937003061275e-08, "loss": 0.90704346, "num_input_tokens_seen": 334853490, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09753418, "step": 15520, "time_per_iteration": 2.5202345848083496 }, { "auxiliary_loss_clip": 0.06411169, "auxiliary_loss_mlp": 0.01272454, "balance_loss_clip": 0.06274879, "balance_loss_mlp": 0.01263347, "epoch": 0.9331730046595521, "flos": 22278070373760.0, "grad_norm": 1.6145118442971156, "language_loss": 0.76389486, "learning_rate": 4.6623297262306846e-08, "loss": 0.84073108, "num_input_tokens_seen": 334873675, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.09106445, "step": 15521, "time_per_iteration": 2.5318522453308105 }, { "auxiliary_loss_clip": 0.06412422, "auxiliary_loss_mlp": 0.01263027, "balance_loss_clip": 0.06274976, "balance_loss_mlp": 0.01253866, "epoch": 0.93323312791222, "flos": 15782920807680.0, "grad_norm": 1.6830590274018544, "language_loss": 0.77905381, "learning_rate": 4.6539731593792545e-08, "loss": 0.85580832, "num_input_tokens_seen": 334890970, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.0916748, "step": 15522, "time_per_iteration": 3.9298386573791504 }, { "auxiliary_loss_clip": 0.06415536, "auxiliary_loss_mlp": 0.01265489, "balance_loss_clip": 0.06276098, "balance_loss_mlp": 0.01255053, "epoch": 0.933293251164888, "flos": 22016342545920.0, "grad_norm": 1.8876221911253839, "language_loss": 0.63105679, "learning_rate": 4.6456240000687373e-08, "loss": 0.70786703, "num_input_tokens_seen": 334906635, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.10443115, "step": 15523, "time_per_iteration": 2.5061380863189697 }, { "auxiliary_loss_clip": 0.06415458, "auxiliary_loss_mlp": 0.01269963, "balance_loss_clip": 0.06278377, "balance_loss_mlp": 0.01259949, "epoch": 0.933353374417556, "flos": 26038556161920.0, "grad_norm": 1.6556022494104679, "language_loss": 0.68819487, "learning_rate": 4.63728224861577e-08, "loss": 0.7650491, "num_input_tokens_seen": 334926230, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.10015869, "step": 15524, "time_per_iteration": 2.569550037384033 }, { "auxiliary_loss_clip": 0.06417611, "auxiliary_loss_mlp": 0.01265149, "balance_loss_clip": 0.06276564, "balance_loss_mlp": 0.01255195, "epoch": 0.933413497670224, "flos": 24907075752960.0, "grad_norm": 3.0922356097364574, "language_loss": 0.74113953, "learning_rate": 4.628947905336589e-08, "loss": 0.81796718, "num_input_tokens_seen": 334946680, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09954834, "step": 15525, "time_per_iteration": 2.567452907562256 }, { "auxiliary_loss_clip": 0.06412044, "auxiliary_loss_mlp": 0.01263989, "balance_loss_clip": 0.0627519, "balance_loss_mlp": 0.0125475, "epoch": 0.9334736209228919, "flos": 23694227429760.0, "grad_norm": 2.1233335767001247, "language_loss": 0.84200478, "learning_rate": 4.6206209705473175e-08, "loss": 0.91876507, "num_input_tokens_seen": 334964785, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09240723, "step": 15526, "time_per_iteration": 2.538146495819092 }, { "auxiliary_loss_clip": 0.06417502, "auxiliary_loss_mlp": 0.0126647, "balance_loss_clip": 0.06276683, "balance_loss_mlp": 0.01256999, "epoch": 0.9335337441755599, "flos": 15382732907520.0, "grad_norm": 1.7835201349581848, "language_loss": 0.69436419, "learning_rate": 4.61230144456366e-08, "loss": 0.77120388, "num_input_tokens_seen": 334982400, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09472656, "step": 15527, "time_per_iteration": 2.5033140182495117 }, { "auxiliary_loss_clip": 0.06418368, "auxiliary_loss_mlp": 0.01264469, "balance_loss_clip": 0.0627643, "balance_loss_mlp": 0.01254259, "epoch": 0.9335938674282279, "flos": 16112180626560.0, "grad_norm": 3.4568283948228657, "language_loss": 0.65009785, "learning_rate": 4.603989327701141e-08, "loss": 0.72692621, "num_input_tokens_seen": 334999685, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.10198975, "step": 15528, "time_per_iteration": 2.4979286193847656 }, { "auxiliary_loss_clip": 0.06415603, "auxiliary_loss_mlp": 0.01265041, "balance_loss_clip": 0.06274448, "balance_loss_mlp": 0.0125523, "epoch": 0.9336539906808958, "flos": 18958875713280.0, "grad_norm": 1.9099069447291077, "language_loss": 0.75071406, "learning_rate": 4.5956846202748867e-08, "loss": 0.82752049, "num_input_tokens_seen": 335019160, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09802246, "step": 15529, "time_per_iteration": 2.5276899337768555 }, { "auxiliary_loss_clip": 0.06415261, "auxiliary_loss_mlp": 0.01263155, "balance_loss_clip": 0.06276731, "balance_loss_mlp": 0.0125428, "epoch": 0.9337141139335638, "flos": 18114168303360.0, "grad_norm": 1.7283405102409526, "language_loss": 0.62930942, "learning_rate": 4.5873873225998674e-08, "loss": 0.70609361, "num_input_tokens_seen": 335037350, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.08874512, "step": 15530, "time_per_iteration": 2.5148844718933105 }, { "auxiliary_loss_clip": 0.0640856, "auxiliary_loss_mlp": 0.01262612, "balance_loss_clip": 0.06273723, "balance_loss_mlp": 0.01253666, "epoch": 0.9337742371862318, "flos": 17351122296960.0, "grad_norm": 1.7880885084056082, "language_loss": 0.72476292, "learning_rate": 4.5790974349907194e-08, "loss": 0.80147469, "num_input_tokens_seen": 335056060, "router_z_loss_clip": 1.34667969, "router_z_loss_mlp": 0.08953857, "step": 15531, "time_per_iteration": 2.522272825241089 }, { "auxiliary_loss_clip": 0.06412263, "auxiliary_loss_mlp": 0.01264627, "balance_loss_clip": 0.06274996, "balance_loss_mlp": 0.01254262, "epoch": 0.9338343604388998, "flos": 29066575484160.0, "grad_norm": 1.6459853992698161, "language_loss": 0.70950806, "learning_rate": 4.5708149577617925e-08, "loss": 0.78627694, "num_input_tokens_seen": 335075410, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.10375977, "step": 15532, "time_per_iteration": 2.607452869415283 }, { "auxiliary_loss_clip": 0.06416811, "auxiliary_loss_mlp": 0.01263981, "balance_loss_clip": 0.06274539, "balance_loss_mlp": 0.01254331, "epoch": 0.9338944836915677, "flos": 18666819907200.0, "grad_norm": 1.5688621787845218, "language_loss": 0.73232174, "learning_rate": 4.5625398912271016e-08, "loss": 0.80912971, "num_input_tokens_seen": 335095190, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09655762, "step": 15533, "time_per_iteration": 2.538990020751953 }, { "auxiliary_loss_clip": 0.06408411, "auxiliary_loss_mlp": 0.01262899, "balance_loss_clip": 0.06272359, "balance_loss_mlp": 0.0125403, "epoch": 0.9339546069442357, "flos": 16623309732480.0, "grad_norm": 2.827601772515905, "language_loss": 0.79803956, "learning_rate": 4.554272235700507e-08, "loss": 0.87475264, "num_input_tokens_seen": 335113825, "router_z_loss_clip": 1.36035156, "router_z_loss_mlp": 0.08868408, "step": 15534, "time_per_iteration": 2.5130200386047363 }, { "auxiliary_loss_clip": 0.06410243, "auxiliary_loss_mlp": 0.01268197, "balance_loss_clip": 0.06279431, "balance_loss_mlp": 0.01259781, "epoch": 0.9340147301969036, "flos": 23699384455680.0, "grad_norm": 1.6109149376768717, "language_loss": 0.74718976, "learning_rate": 4.546011991495513e-08, "loss": 0.82397407, "num_input_tokens_seen": 335136425, "router_z_loss_clip": 1.30761719, "router_z_loss_mlp": 0.08416748, "step": 15535, "time_per_iteration": 2.5889742374420166 }, { "auxiliary_loss_clip": 0.0641547, "auxiliary_loss_mlp": 0.01263094, "balance_loss_clip": 0.06275237, "balance_loss_mlp": 0.01253659, "epoch": 0.9340748534495716, "flos": 28661440193280.0, "grad_norm": 1.861591179293775, "language_loss": 0.77544224, "learning_rate": 4.537759158925292e-08, "loss": 0.85222787, "num_input_tokens_seen": 335157925, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09436035, "step": 15536, "time_per_iteration": 2.5999577045440674 }, { "auxiliary_loss_clip": 0.06412758, "auxiliary_loss_mlp": 0.01264095, "balance_loss_clip": 0.06273988, "balance_loss_mlp": 0.01254565, "epoch": 0.9341349767022396, "flos": 24906530701440.0, "grad_norm": 1.5162950866279838, "language_loss": 0.81033939, "learning_rate": 4.5295137383028593e-08, "loss": 0.88710797, "num_input_tokens_seen": 335177840, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09533691, "step": 15537, "time_per_iteration": 2.56276535987854 }, { "auxiliary_loss_clip": 0.06418476, "auxiliary_loss_mlp": 0.01266223, "balance_loss_clip": 0.06277102, "balance_loss_mlp": 0.01256925, "epoch": 0.9341950999549076, "flos": 29067204389760.0, "grad_norm": 1.9019459919075985, "language_loss": 0.78162277, "learning_rate": 4.5212757299408764e-08, "loss": 0.85846972, "num_input_tokens_seen": 335199470, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09295654, "step": 15538, "time_per_iteration": 2.5964527130126953 }, { "auxiliary_loss_clip": 0.06415798, "auxiliary_loss_mlp": 0.01264118, "balance_loss_clip": 0.06277967, "balance_loss_mlp": 0.01254814, "epoch": 0.9342552232075755, "flos": 23593893765120.0, "grad_norm": 1.4091471162363747, "language_loss": 0.73184305, "learning_rate": 4.513045134151672e-08, "loss": 0.80864215, "num_input_tokens_seen": 335218885, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09295654, "step": 15539, "time_per_iteration": 2.5587825775146484 }, { "auxiliary_loss_clip": 0.06412135, "auxiliary_loss_mlp": 0.01265274, "balance_loss_clip": 0.06275976, "balance_loss_mlp": 0.01256566, "epoch": 0.9343153464602435, "flos": 36730325617920.0, "grad_norm": 1.4066579058531488, "language_loss": 0.65159249, "learning_rate": 4.504821951247373e-08, "loss": 0.72836655, "num_input_tokens_seen": 335239485, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.0869751, "step": 15540, "time_per_iteration": 2.6476104259490967 }, { "auxiliary_loss_clip": 0.06411189, "auxiliary_loss_mlp": 0.01263813, "balance_loss_clip": 0.06273112, "balance_loss_mlp": 0.01254401, "epoch": 0.9343754697129115, "flos": 22243004640000.0, "grad_norm": 1.7387584880686728, "language_loss": 0.76620454, "learning_rate": 4.496606181539864e-08, "loss": 0.84295452, "num_input_tokens_seen": 335258355, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09405518, "step": 15541, "time_per_iteration": 3.9198079109191895 }, { "auxiliary_loss_clip": 0.064134, "auxiliary_loss_mlp": 0.01264575, "balance_loss_clip": 0.0627641, "balance_loss_mlp": 0.01254568, "epoch": 0.9344355929655794, "flos": 29717128984320.0, "grad_norm": 2.1339445651829765, "language_loss": 0.67650449, "learning_rate": 4.4883978253406066e-08, "loss": 0.75328422, "num_input_tokens_seen": 335276835, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.10009766, "step": 15542, "time_per_iteration": 2.575528383255005 }, { "auxiliary_loss_clip": 0.06410549, "auxiliary_loss_mlp": 0.01264168, "balance_loss_clip": 0.06272831, "balance_loss_mlp": 0.01253993, "epoch": 0.9344957162182475, "flos": 18886438258560.0, "grad_norm": 1.9136785043896911, "language_loss": 0.69403046, "learning_rate": 4.480196882960907e-08, "loss": 0.77077758, "num_input_tokens_seen": 335296220, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.10174561, "step": 15543, "time_per_iteration": 2.51190185546875 }, { "auxiliary_loss_clip": 0.06419228, "auxiliary_loss_mlp": 0.01264762, "balance_loss_clip": 0.06277199, "balance_loss_mlp": 0.0125473, "epoch": 0.9345558394709154, "flos": 27425181853440.0, "grad_norm": 1.9273142630038624, "language_loss": 0.69922912, "learning_rate": 4.4720033547117394e-08, "loss": 0.77606905, "num_input_tokens_seen": 335316335, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.1003418, "step": 15544, "time_per_iteration": 2.5690412521362305 }, { "auxiliary_loss_clip": 0.06417616, "auxiliary_loss_mlp": 0.01267461, "balance_loss_clip": 0.06276239, "balance_loss_mlp": 0.01257346, "epoch": 0.9346159627235834, "flos": 20747659875840.0, "grad_norm": 1.7133714333137824, "language_loss": 0.77230179, "learning_rate": 4.463817240903789e-08, "loss": 0.84915251, "num_input_tokens_seen": 335335545, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.10119629, "step": 15545, "time_per_iteration": 2.542051076889038 }, { "auxiliary_loss_clip": 0.06417264, "auxiliary_loss_mlp": 0.01262348, "balance_loss_clip": 0.06275107, "balance_loss_mlp": 0.01252853, "epoch": 0.9346760859762513, "flos": 21075578029440.0, "grad_norm": 2.992085980804401, "language_loss": 0.69189656, "learning_rate": 4.455638541847495e-08, "loss": 0.76869267, "num_input_tokens_seen": 335355350, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.0949707, "step": 15546, "time_per_iteration": 2.5407259464263916 }, { "auxiliary_loss_clip": 0.06409109, "auxiliary_loss_mlp": 0.01262353, "balance_loss_clip": 0.06274246, "balance_loss_mlp": 0.01253704, "epoch": 0.9347362092289193, "flos": 29212540496640.0, "grad_norm": 1.6790584832603679, "language_loss": 0.82930708, "learning_rate": 4.447467257852966e-08, "loss": 0.90602165, "num_input_tokens_seen": 335375160, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.08648682, "step": 15547, "time_per_iteration": 4.00691294670105 }, { "auxiliary_loss_clip": 0.06409615, "auxiliary_loss_mlp": 0.01265326, "balance_loss_clip": 0.06272899, "balance_loss_mlp": 0.01256404, "epoch": 0.9347963324815872, "flos": 19433429712000.0, "grad_norm": 1.8512595974705521, "language_loss": 0.83729279, "learning_rate": 4.439303389230087e-08, "loss": 0.91404212, "num_input_tokens_seen": 335394080, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.0892334, "step": 15548, "time_per_iteration": 2.514432907104492 }, { "auxiliary_loss_clip": 0.06424351, "auxiliary_loss_mlp": 0.01265453, "balance_loss_clip": 0.06280439, "balance_loss_mlp": 0.01254545, "epoch": 0.9348564557342552, "flos": 36910475896320.0, "grad_norm": 1.5066904373318037, "language_loss": 0.65421516, "learning_rate": 4.4311469362884326e-08, "loss": 0.7311132, "num_input_tokens_seen": 335414230, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.10894775, "step": 15549, "time_per_iteration": 2.671508312225342 }, { "auxiliary_loss_clip": 0.06418577, "auxiliary_loss_mlp": 0.01264453, "balance_loss_clip": 0.06278493, "balance_loss_mlp": 0.01254523, "epoch": 0.9349165789869232, "flos": 21696684019200.0, "grad_norm": 1.9440629301945809, "language_loss": 0.79902393, "learning_rate": 4.4229978993372665e-08, "loss": 0.87585425, "num_input_tokens_seen": 335432890, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09924316, "step": 15550, "time_per_iteration": 2.5561273097991943 }, { "auxiliary_loss_clip": 0.06420361, "auxiliary_loss_mlp": 0.01267117, "balance_loss_clip": 0.06283164, "balance_loss_mlp": 0.0125748, "epoch": 0.9349767022395912, "flos": 18850114713600.0, "grad_norm": 1.6649797285299373, "language_loss": 0.75916725, "learning_rate": 4.4148562786856524e-08, "loss": 0.83604205, "num_input_tokens_seen": 335452085, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09637451, "step": 15551, "time_per_iteration": 2.561260938644409 }, { "auxiliary_loss_clip": 0.06407455, "auxiliary_loss_mlp": 0.01265227, "balance_loss_clip": 0.06272976, "balance_loss_mlp": 0.01257162, "epoch": 0.9350368254922591, "flos": 24980477529600.0, "grad_norm": 1.6469520786178622, "language_loss": 0.73715758, "learning_rate": 4.406722074642255e-08, "loss": 0.8138845, "num_input_tokens_seen": 335472130, "router_z_loss_clip": 1.34570312, "router_z_loss_mlp": 0.08056641, "step": 15552, "time_per_iteration": 2.5661230087280273 }, { "auxiliary_loss_clip": 0.0641263, "auxiliary_loss_mlp": 0.01268059, "balance_loss_clip": 0.06273673, "balance_loss_mlp": 0.0125888, "epoch": 0.9350969487449271, "flos": 23076391749120.0, "grad_norm": 1.845492239515978, "language_loss": 0.77309477, "learning_rate": 4.3985952875155386e-08, "loss": 0.84990168, "num_input_tokens_seen": 335489970, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09173584, "step": 15553, "time_per_iteration": 2.5294651985168457 }, { "auxiliary_loss_clip": 0.06417678, "auxiliary_loss_mlp": 0.01269976, "balance_loss_clip": 0.06274907, "balance_loss_mlp": 0.01259319, "epoch": 0.9351570719975951, "flos": 18631209121920.0, "grad_norm": 1.6910617522799882, "language_loss": 0.78535044, "learning_rate": 4.390475917613723e-08, "loss": 0.86222708, "num_input_tokens_seen": 335509125, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10662842, "step": 15554, "time_per_iteration": 3.948352813720703 }, { "auxiliary_loss_clip": 0.06406737, "auxiliary_loss_mlp": 0.01263413, "balance_loss_clip": 0.06273769, "balance_loss_mlp": 0.01255557, "epoch": 0.935217195250263, "flos": 15893862013440.0, "grad_norm": 1.5316863782818708, "language_loss": 0.69228345, "learning_rate": 4.382363965244695e-08, "loss": 0.76898497, "num_input_tokens_seen": 335525620, "router_z_loss_clip": 1.32714844, "router_z_loss_mlp": 0.07855225, "step": 15555, "time_per_iteration": 2.5493948459625244 }, { "auxiliary_loss_clip": 0.06409609, "auxiliary_loss_mlp": 0.01264445, "balance_loss_clip": 0.06273319, "balance_loss_mlp": 0.01254425, "epoch": 0.935277318502931, "flos": 24397372166400.0, "grad_norm": 1.4703580488916805, "language_loss": 0.75924462, "learning_rate": 4.374259430715965e-08, "loss": 0.83598512, "num_input_tokens_seen": 335547565, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.10015869, "step": 15556, "time_per_iteration": 2.5932116508483887 }, { "auxiliary_loss_clip": 0.06414278, "auxiliary_loss_mlp": 0.01264932, "balance_loss_clip": 0.06276248, "balance_loss_mlp": 0.01256241, "epoch": 0.935337441755599, "flos": 27607721973120.0, "grad_norm": 1.4921790418847138, "language_loss": 0.73162007, "learning_rate": 4.366162314334953e-08, "loss": 0.80841208, "num_input_tokens_seen": 335570285, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.08685303, "step": 15557, "time_per_iteration": 2.584338665008545 }, { "auxiliary_loss_clip": 0.06415829, "auxiliary_loss_mlp": 0.01266527, "balance_loss_clip": 0.06277031, "balance_loss_mlp": 0.01257068, "epoch": 0.935397565008267, "flos": 20488699232640.0, "grad_norm": 1.5551819001789924, "language_loss": 0.6339494, "learning_rate": 4.358072616408681e-08, "loss": 0.71077293, "num_input_tokens_seen": 335588600, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09460449, "step": 15558, "time_per_iteration": 2.5154948234558105 }, { "auxiliary_loss_clip": 0.06414729, "auxiliary_loss_mlp": 0.01268647, "balance_loss_clip": 0.06275944, "balance_loss_mlp": 0.01258532, "epoch": 0.9354576882609349, "flos": 23660293726080.0, "grad_norm": 1.8274117452745673, "language_loss": 0.72873902, "learning_rate": 4.34999033724388e-08, "loss": 0.80557287, "num_input_tokens_seen": 335606235, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.10119629, "step": 15559, "time_per_iteration": 2.5174174308776855 }, { "auxiliary_loss_clip": 0.06413122, "auxiliary_loss_mlp": 0.0126124, "balance_loss_clip": 0.06276086, "balance_loss_mlp": 0.01252902, "epoch": 0.9355178115136029, "flos": 36693834364800.0, "grad_norm": 1.5130567051109132, "language_loss": 0.63908994, "learning_rate": 4.341915477147062e-08, "loss": 0.7158336, "num_input_tokens_seen": 335628240, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.08337402, "step": 15560, "time_per_iteration": 2.658540964126587 }, { "auxiliary_loss_clip": 0.06432803, "auxiliary_loss_mlp": 0.01268536, "balance_loss_clip": 0.06283379, "balance_loss_mlp": 0.0125736, "epoch": 0.9355779347662708, "flos": 14464833356160.0, "grad_norm": 2.1583176487164253, "language_loss": 0.64414799, "learning_rate": 4.3338480364244034e-08, "loss": 0.72116137, "num_input_tokens_seen": 335643755, "router_z_loss_clip": 1.4921875, "router_z_loss_mlp": 0.11175537, "step": 15561, "time_per_iteration": 3.89744234085083 }, { "auxiliary_loss_clip": 0.06410722, "auxiliary_loss_mlp": 0.01265811, "balance_loss_clip": 0.06273764, "balance_loss_mlp": 0.01255959, "epoch": 0.9356380580189388, "flos": 23192783470080.0, "grad_norm": 1.6278103199960194, "language_loss": 0.75486761, "learning_rate": 4.325788015381859e-08, "loss": 0.83163291, "num_input_tokens_seen": 335665160, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09857178, "step": 15562, "time_per_iteration": 2.537881851196289 }, { "auxiliary_loss_clip": 0.06315488, "auxiliary_loss_mlp": 0.01249937, "balance_loss_clip": 0.06260281, "balance_loss_mlp": 0.01248903, "epoch": 0.9356981812716068, "flos": 67490592480000.0, "grad_norm": 0.9202877775285204, "language_loss": 0.62239277, "learning_rate": 4.31773541432503e-08, "loss": 0.69804704, "num_input_tokens_seen": 335715240, "router_z_loss_clip": 0.55224609, "router_z_loss_mlp": 0.01034546, "step": 15563, "time_per_iteration": 3.0602948665618896 }, { "auxiliary_loss_clip": 0.06406985, "auxiliary_loss_mlp": 0.01268413, "balance_loss_clip": 0.06271909, "balance_loss_mlp": 0.01259467, "epoch": 0.9357583045242748, "flos": 24688631358720.0, "grad_norm": 1.5389395664967298, "language_loss": 0.78427982, "learning_rate": 4.3096902335592714e-08, "loss": 0.86103386, "num_input_tokens_seen": 335734970, "router_z_loss_clip": 1.35253906, "router_z_loss_mlp": 0.0894165, "step": 15564, "time_per_iteration": 2.5613725185394287 }, { "auxiliary_loss_clip": 0.06417841, "auxiliary_loss_mlp": 0.01268583, "balance_loss_clip": 0.06275564, "balance_loss_mlp": 0.01258588, "epoch": 0.9358184277769427, "flos": 19469795184000.0, "grad_norm": 2.116056564235588, "language_loss": 0.78080571, "learning_rate": 4.301652473389694e-08, "loss": 0.85766995, "num_input_tokens_seen": 335753435, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09991455, "step": 15565, "time_per_iteration": 2.5532124042510986 }, { "auxiliary_loss_clip": 0.06413084, "auxiliary_loss_mlp": 0.01262298, "balance_loss_clip": 0.06277059, "balance_loss_mlp": 0.01253852, "epoch": 0.9358785510296107, "flos": 18923055292800.0, "grad_norm": 2.1936189175600784, "language_loss": 0.72130817, "learning_rate": 4.2936221341210774e-08, "loss": 0.79806203, "num_input_tokens_seen": 335772105, "router_z_loss_clip": 1.36035156, "router_z_loss_mlp": 0.08447266, "step": 15566, "time_per_iteration": 2.6130356788635254 }, { "auxiliary_loss_clip": 0.06413712, "auxiliary_loss_mlp": 0.01264007, "balance_loss_clip": 0.06273507, "balance_loss_mlp": 0.01254435, "epoch": 0.9359386742822787, "flos": 23448096460800.0, "grad_norm": 1.8899699355236486, "language_loss": 0.67670643, "learning_rate": 4.285599216057889e-08, "loss": 0.75348365, "num_input_tokens_seen": 335789125, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09576416, "step": 15567, "time_per_iteration": 2.592226505279541 }, { "auxiliary_loss_clip": 0.06413443, "auxiliary_loss_mlp": 0.0126407, "balance_loss_clip": 0.0627515, "balance_loss_mlp": 0.0125417, "epoch": 0.9359987975349466, "flos": 32752275903360.0, "grad_norm": 1.8586564098577931, "language_loss": 0.627491, "learning_rate": 4.277583719504418e-08, "loss": 0.70426619, "num_input_tokens_seen": 335810995, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09893799, "step": 15568, "time_per_iteration": 2.6710305213928223 }, { "auxiliary_loss_clip": 0.06413003, "auxiliary_loss_mlp": 0.01262773, "balance_loss_clip": 0.06275891, "balance_loss_mlp": 0.01253123, "epoch": 0.9360589207876147, "flos": 22826151930240.0, "grad_norm": 1.5700152375042078, "language_loss": 0.79044992, "learning_rate": 4.269575644764556e-08, "loss": 0.86720765, "num_input_tokens_seen": 335830580, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.09655762, "step": 15569, "time_per_iteration": 2.5723917484283447 }, { "auxiliary_loss_clip": 0.06417297, "auxiliary_loss_mlp": 0.01266633, "balance_loss_clip": 0.06276843, "balance_loss_mlp": 0.01257114, "epoch": 0.9361190440402826, "flos": 20891318901120.0, "grad_norm": 2.4789879953118437, "language_loss": 0.70328248, "learning_rate": 4.261574992142014e-08, "loss": 0.78012174, "num_input_tokens_seen": 335846515, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09521484, "step": 15570, "time_per_iteration": 2.550747871398926 }, { "auxiliary_loss_clip": 0.06411701, "auxiliary_loss_mlp": 0.01266161, "balance_loss_clip": 0.06272345, "balance_loss_mlp": 0.01256058, "epoch": 0.9361791672929506, "flos": 19323872098560.0, "grad_norm": 2.0618963980060423, "language_loss": 0.79226279, "learning_rate": 4.2535817619401726e-08, "loss": 0.86904138, "num_input_tokens_seen": 335863350, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.10095215, "step": 15571, "time_per_iteration": 2.5053791999816895 }, { "auxiliary_loss_clip": 0.06417812, "auxiliary_loss_mlp": 0.01267727, "balance_loss_clip": 0.06277898, "balance_loss_mlp": 0.01258632, "epoch": 0.9362392905456185, "flos": 15163491899520.0, "grad_norm": 1.7095380507267324, "language_loss": 0.7775206, "learning_rate": 4.2455959544621224e-08, "loss": 0.85437602, "num_input_tokens_seen": 335880510, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09094238, "step": 15572, "time_per_iteration": 2.5197081565856934 }, { "auxiliary_loss_clip": 0.06408932, "auxiliary_loss_mlp": 0.01265262, "balance_loss_clip": 0.06274036, "balance_loss_mlp": 0.0125622, "epoch": 0.9362994137982865, "flos": 22091589112320.0, "grad_norm": 1.6596075125440914, "language_loss": 0.78031147, "learning_rate": 4.237617570010688e-08, "loss": 0.8570534, "num_input_tokens_seen": 335899440, "router_z_loss_clip": 1.34863281, "router_z_loss_mlp": 0.09051514, "step": 15573, "time_per_iteration": 2.535153388977051 }, { "auxiliary_loss_clip": 0.0640825, "auxiliary_loss_mlp": 0.01268034, "balance_loss_clip": 0.06274005, "balance_loss_mlp": 0.01259094, "epoch": 0.9363595370509544, "flos": 23518772979840.0, "grad_norm": 1.4973332974459028, "language_loss": 0.74247992, "learning_rate": 4.2296466088884044e-08, "loss": 0.81924272, "num_input_tokens_seen": 335919540, "router_z_loss_clip": 1.34082031, "router_z_loss_mlp": 0.08935547, "step": 15574, "time_per_iteration": 2.546152114868164 }, { "auxiliary_loss_clip": 0.06407016, "auxiliary_loss_mlp": 0.01262411, "balance_loss_clip": 0.06272461, "balance_loss_mlp": 0.01253029, "epoch": 0.9364196603036224, "flos": 27130442716800.0, "grad_norm": 1.813814723193323, "language_loss": 0.68666369, "learning_rate": 4.221683071397564e-08, "loss": 0.763358, "num_input_tokens_seen": 335939665, "router_z_loss_clip": 1.34667969, "router_z_loss_mlp": 0.09384155, "step": 15575, "time_per_iteration": 2.5808215141296387 }, { "auxiliary_loss_clip": 0.0640705, "auxiliary_loss_mlp": 0.01265553, "balance_loss_clip": 0.06272293, "balance_loss_mlp": 0.01255945, "epoch": 0.9364797835562904, "flos": 18485034474240.0, "grad_norm": 1.9325576394657686, "language_loss": 0.65619493, "learning_rate": 4.2137269578401026e-08, "loss": 0.73292094, "num_input_tokens_seen": 335958580, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.09606934, "step": 15576, "time_per_iteration": 2.5047268867492676 }, { "auxiliary_loss_clip": 0.06411488, "auxiliary_loss_mlp": 0.01265942, "balance_loss_clip": 0.06272189, "balance_loss_mlp": 0.01255976, "epoch": 0.9365399068089584, "flos": 13010507965440.0, "grad_norm": 2.2660817365569055, "language_loss": 0.76367748, "learning_rate": 4.2057782685177566e-08, "loss": 0.84045172, "num_input_tokens_seen": 335974965, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09960938, "step": 15577, "time_per_iteration": 2.5148913860321045 }, { "auxiliary_loss_clip": 0.06412523, "auxiliary_loss_mlp": 0.01264916, "balance_loss_clip": 0.06272019, "balance_loss_mlp": 0.01254724, "epoch": 0.9366000300616263, "flos": 25673559776640.0, "grad_norm": 1.7458751169576012, "language_loss": 0.52399826, "learning_rate": 4.1978370037318855e-08, "loss": 0.60077262, "num_input_tokens_seen": 335996575, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.10186768, "step": 15578, "time_per_iteration": 2.6198904514312744 }, { "auxiliary_loss_clip": 0.06410476, "auxiliary_loss_mlp": 0.01264556, "balance_loss_clip": 0.06275271, "balance_loss_mlp": 0.01255466, "epoch": 0.9366601533142943, "flos": 21439652019840.0, "grad_norm": 1.6471906906711542, "language_loss": 0.70671159, "learning_rate": 4.189903163783692e-08, "loss": 0.78346193, "num_input_tokens_seen": 336017265, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.09082031, "step": 15579, "time_per_iteration": 2.55826473236084 }, { "auxiliary_loss_clip": 0.06412432, "auxiliary_loss_mlp": 0.01262838, "balance_loss_clip": 0.06275934, "balance_loss_mlp": 0.0125404, "epoch": 0.9367202765669622, "flos": 24099362720640.0, "grad_norm": 1.797311869563078, "language_loss": 0.76414967, "learning_rate": 4.181976748973959e-08, "loss": 0.84090233, "num_input_tokens_seen": 336035905, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.08813477, "step": 15580, "time_per_iteration": 2.5532429218292236 }, { "auxiliary_loss_clip": 0.06421518, "auxiliary_loss_mlp": 0.0126409, "balance_loss_clip": 0.06278248, "balance_loss_mlp": 0.01253612, "epoch": 0.9367803998196302, "flos": 20895511605120.0, "grad_norm": 1.536942435778924, "language_loss": 0.66729814, "learning_rate": 4.1740577596033114e-08, "loss": 0.74415421, "num_input_tokens_seen": 336055585, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.10479736, "step": 15581, "time_per_iteration": 3.968590497970581 }, { "auxiliary_loss_clip": 0.06411508, "auxiliary_loss_mlp": 0.01266158, "balance_loss_clip": 0.06273569, "balance_loss_mlp": 0.01256586, "epoch": 0.9368405230722983, "flos": 22570838939520.0, "grad_norm": 1.5012630314209086, "language_loss": 0.76577401, "learning_rate": 4.166146195972042e-08, "loss": 0.84255064, "num_input_tokens_seen": 336076695, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09570312, "step": 15582, "time_per_iteration": 2.5878379344940186 }, { "auxiliary_loss_clip": 0.064134, "auxiliary_loss_mlp": 0.01266039, "balance_loss_clip": 0.06275865, "balance_loss_mlp": 0.01256485, "epoch": 0.9369006463249662, "flos": 18886228623360.0, "grad_norm": 1.6650040552519734, "language_loss": 0.73808992, "learning_rate": 4.1582420583800905e-08, "loss": 0.81488431, "num_input_tokens_seen": 336094740, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09552002, "step": 15583, "time_per_iteration": 2.535963296890259 }, { "auxiliary_loss_clip": 0.0642366, "auxiliary_loss_mlp": 0.012704, "balance_loss_clip": 0.06279162, "balance_loss_mlp": 0.01260636, "epoch": 0.9369607695776342, "flos": 26439750311040.0, "grad_norm": 2.1760983180585765, "language_loss": 0.84509069, "learning_rate": 4.1503453471272376e-08, "loss": 0.92203122, "num_input_tokens_seen": 336113985, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.09765625, "step": 15584, "time_per_iteration": 2.5890250205993652 }, { "auxiliary_loss_clip": 0.06425605, "auxiliary_loss_mlp": 0.01269897, "balance_loss_clip": 0.06279756, "balance_loss_mlp": 0.01259139, "epoch": 0.9370208928303021, "flos": 39576769142400.0, "grad_norm": 1.4402687081411236, "language_loss": 0.72465193, "learning_rate": 4.1424560625129334e-08, "loss": 0.80160695, "num_input_tokens_seen": 336136395, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10766602, "step": 15585, "time_per_iteration": 2.6895182132720947 }, { "auxiliary_loss_clip": 0.06407453, "auxiliary_loss_mlp": 0.01262882, "balance_loss_clip": 0.06272426, "balance_loss_mlp": 0.01254091, "epoch": 0.9370810160829701, "flos": 22969223976960.0, "grad_norm": 1.8251144582690786, "language_loss": 0.80499554, "learning_rate": 4.134574204836316e-08, "loss": 0.88169885, "num_input_tokens_seen": 336156345, "router_z_loss_clip": 1.35253906, "router_z_loss_mlp": 0.08795166, "step": 15586, "time_per_iteration": 4.032634258270264 }, { "auxiliary_loss_clip": 0.06415901, "auxiliary_loss_mlp": 0.01267789, "balance_loss_clip": 0.06278107, "balance_loss_mlp": 0.01258777, "epoch": 0.937141139335638, "flos": 23081590702080.0, "grad_norm": 1.5591370224898564, "language_loss": 0.76301789, "learning_rate": 4.126699774396258e-08, "loss": 0.83985472, "num_input_tokens_seen": 336176760, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09002686, "step": 15587, "time_per_iteration": 2.5401418209075928 }, { "auxiliary_loss_clip": 0.06421909, "auxiliary_loss_mlp": 0.01263474, "balance_loss_clip": 0.06278361, "balance_loss_mlp": 0.01253431, "epoch": 0.937201262588306, "flos": 16361246488320.0, "grad_norm": 1.9032347889143328, "language_loss": 0.88138103, "learning_rate": 4.118832771491387e-08, "loss": 0.95823491, "num_input_tokens_seen": 336193285, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.10046387, "step": 15588, "time_per_iteration": 2.4914417266845703 }, { "auxiliary_loss_clip": 0.06408481, "auxiliary_loss_mlp": 0.01263668, "balance_loss_clip": 0.06274097, "balance_loss_mlp": 0.01255288, "epoch": 0.937261385840974, "flos": 20200374933120.0, "grad_norm": 1.606493159600404, "language_loss": 0.78155315, "learning_rate": 4.11097319642002e-08, "loss": 0.8582747, "num_input_tokens_seen": 336211425, "router_z_loss_clip": 1.34179688, "router_z_loss_mlp": 0.0838623, "step": 15589, "time_per_iteration": 2.5415942668914795 }, { "auxiliary_loss_clip": 0.06410573, "auxiliary_loss_mlp": 0.01264212, "balance_loss_clip": 0.06276179, "balance_loss_mlp": 0.01254925, "epoch": 0.937321509093642, "flos": 18301781594880.0, "grad_norm": 1.7943260169765112, "language_loss": 0.78218973, "learning_rate": 4.103121049480163e-08, "loss": 0.85893756, "num_input_tokens_seen": 336230205, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.09283447, "step": 15590, "time_per_iteration": 2.555124044418335 }, { "auxiliary_loss_clip": 0.06423764, "auxiliary_loss_mlp": 0.01266479, "balance_loss_clip": 0.06280187, "balance_loss_mlp": 0.01256543, "epoch": 0.9373816323463099, "flos": 25891710681600.0, "grad_norm": 1.6586428786663594, "language_loss": 0.71680003, "learning_rate": 4.095276330969577e-08, "loss": 0.79370242, "num_input_tokens_seen": 336252440, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.0993042, "step": 15591, "time_per_iteration": 2.6329538822174072 }, { "auxiliary_loss_clip": 0.06421294, "auxiliary_loss_mlp": 0.0126874, "balance_loss_clip": 0.06277363, "balance_loss_mlp": 0.01257755, "epoch": 0.9374417555989779, "flos": 27206234334720.0, "grad_norm": 2.2119561139783954, "language_loss": 0.53706515, "learning_rate": 4.0874390411857804e-08, "loss": 0.61396551, "num_input_tokens_seen": 336273845, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10986328, "step": 15592, "time_per_iteration": 2.5776853561401367 }, { "auxiliary_loss_clip": 0.06411485, "auxiliary_loss_mlp": 0.01261582, "balance_loss_clip": 0.06275578, "balance_loss_mlp": 0.01252463, "epoch": 0.9375018788516458, "flos": 23627701687680.0, "grad_norm": 1.415745629075797, "language_loss": 0.67367852, "learning_rate": 4.0796091804259136e-08, "loss": 0.75040925, "num_input_tokens_seen": 336292790, "router_z_loss_clip": 1.36035156, "router_z_loss_mlp": 0.09118652, "step": 15593, "time_per_iteration": 2.5455520153045654 }, { "auxiliary_loss_clip": 0.06413617, "auxiliary_loss_mlp": 0.01264141, "balance_loss_clip": 0.06272727, "balance_loss_mlp": 0.01254462, "epoch": 0.9375620021043138, "flos": 22686098630400.0, "grad_norm": 1.749385398173223, "language_loss": 0.74262452, "learning_rate": 4.0717867489868715e-08, "loss": 0.8194021, "num_input_tokens_seen": 336312600, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09686279, "step": 15594, "time_per_iteration": 4.024414300918579 }, { "auxiliary_loss_clip": 0.06407903, "auxiliary_loss_mlp": 0.01262959, "balance_loss_clip": 0.0627251, "balance_loss_mlp": 0.01253446, "epoch": 0.9376221253569819, "flos": 27567121870080.0, "grad_norm": 1.563960719400803, "language_loss": 0.74000263, "learning_rate": 4.063971747165351e-08, "loss": 0.81671131, "num_input_tokens_seen": 336332770, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.09503174, "step": 15595, "time_per_iteration": 2.5866360664367676 }, { "auxiliary_loss_clip": 0.06418432, "auxiliary_loss_mlp": 0.01266096, "balance_loss_clip": 0.0627728, "balance_loss_mlp": 0.01256887, "epoch": 0.9376822486096498, "flos": 24136063608960.0, "grad_norm": 1.7343207785873165, "language_loss": 0.76351225, "learning_rate": 4.056164175257626e-08, "loss": 0.84035754, "num_input_tokens_seen": 336351445, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09204102, "step": 15596, "time_per_iteration": 2.555964231491089 }, { "auxiliary_loss_clip": 0.06417425, "auxiliary_loss_mlp": 0.01268034, "balance_loss_clip": 0.0627877, "balance_loss_mlp": 0.01258807, "epoch": 0.9377423718623178, "flos": 22790666926080.0, "grad_norm": 1.6139961607578404, "language_loss": 0.79087943, "learning_rate": 4.0483640335597926e-08, "loss": 0.86773401, "num_input_tokens_seen": 336368690, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09222412, "step": 15597, "time_per_iteration": 2.560016393661499 }, { "auxiliary_loss_clip": 0.06422848, "auxiliary_loss_mlp": 0.01265185, "balance_loss_clip": 0.06279457, "balance_loss_mlp": 0.0125507, "epoch": 0.9378024951149857, "flos": 19174427141760.0, "grad_norm": 1.3989783242596563, "language_loss": 0.81423438, "learning_rate": 4.0405713223676363e-08, "loss": 0.89111471, "num_input_tokens_seen": 336388165, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10113525, "step": 15598, "time_per_iteration": 2.5383517742156982 }, { "auxiliary_loss_clip": 0.06421343, "auxiliary_loss_mlp": 0.01264525, "balance_loss_clip": 0.06274876, "balance_loss_mlp": 0.01254559, "epoch": 0.9378626183676537, "flos": 23510890696320.0, "grad_norm": 3.0884023143321473, "language_loss": 0.63102639, "learning_rate": 4.0327860419766994e-08, "loss": 0.70788503, "num_input_tokens_seen": 336406475, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.09967041, "step": 15599, "time_per_iteration": 2.5310239791870117 }, { "auxiliary_loss_clip": 0.06414811, "auxiliary_loss_mlp": 0.01263982, "balance_loss_clip": 0.06274401, "balance_loss_mlp": 0.01255161, "epoch": 0.9379227416203216, "flos": 18411548843520.0, "grad_norm": 1.772837982573606, "language_loss": 0.73566604, "learning_rate": 4.0250081926821e-08, "loss": 0.81245399, "num_input_tokens_seen": 336424690, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.08825684, "step": 15600, "time_per_iteration": 2.510329484939575 }, { "auxiliary_loss_clip": 0.06412087, "auxiliary_loss_mlp": 0.01264253, "balance_loss_clip": 0.06276527, "balance_loss_mlp": 0.01255843, "epoch": 0.9379828648729897, "flos": 17827646866560.0, "grad_norm": 1.798429058646183, "language_loss": 0.6945399, "learning_rate": 4.0172377747788474e-08, "loss": 0.77130336, "num_input_tokens_seen": 336443055, "router_z_loss_clip": 1.35644531, "router_z_loss_mlp": 0.08410645, "step": 15601, "time_per_iteration": 3.907432794570923 }, { "auxiliary_loss_clip": 0.06317993, "auxiliary_loss_mlp": 0.01251816, "balance_loss_clip": 0.0626286, "balance_loss_mlp": 0.01250833, "epoch": 0.9380429881256576, "flos": 68044376113920.0, "grad_norm": 0.7331663729487778, "language_loss": 0.58096337, "learning_rate": 4.009474788561573e-08, "loss": 0.65666139, "num_input_tokens_seen": 336510190, "router_z_loss_clip": 0.55371094, "router_z_loss_mlp": 0.00982666, "step": 15602, "time_per_iteration": 3.3166933059692383 }, { "auxiliary_loss_clip": 0.06420302, "auxiliary_loss_mlp": 0.01264128, "balance_loss_clip": 0.06280164, "balance_loss_mlp": 0.01254443, "epoch": 0.9381031113783256, "flos": 20783228734080.0, "grad_norm": 2.176926893729015, "language_loss": 0.71467608, "learning_rate": 4.001719234324663e-08, "loss": 0.79152042, "num_input_tokens_seen": 336529250, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09692383, "step": 15603, "time_per_iteration": 2.5302226543426514 }, { "auxiliary_loss_clip": 0.06404267, "auxiliary_loss_mlp": 0.0126817, "balance_loss_clip": 0.06274328, "balance_loss_mlp": 0.01259832, "epoch": 0.9381632346309935, "flos": 19030935824640.0, "grad_norm": 1.591195632208983, "language_loss": 0.76083773, "learning_rate": 3.993971112362171e-08, "loss": 0.83756208, "num_input_tokens_seen": 336548530, "router_z_loss_clip": 1.29882812, "router_z_loss_mlp": 0.08337402, "step": 15604, "time_per_iteration": 2.5115177631378174 }, { "auxiliary_loss_clip": 0.06416906, "auxiliary_loss_mlp": 0.01264773, "balance_loss_clip": 0.06277046, "balance_loss_mlp": 0.01254885, "epoch": 0.9382233578836615, "flos": 23520617769600.0, "grad_norm": 1.9859641101691727, "language_loss": 0.65007985, "learning_rate": 3.9862304229679734e-08, "loss": 0.72689664, "num_input_tokens_seen": 336568510, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09881592, "step": 15605, "time_per_iteration": 2.539790153503418 }, { "auxiliary_loss_clip": 0.06420736, "auxiliary_loss_mlp": 0.01266245, "balance_loss_clip": 0.06277119, "balance_loss_mlp": 0.0125647, "epoch": 0.9382834811363294, "flos": 43077539600640.0, "grad_norm": 1.6475215527546037, "language_loss": 0.67471492, "learning_rate": 3.9784971664355683e-08, "loss": 0.75158471, "num_input_tokens_seen": 336592020, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.09777832, "step": 15606, "time_per_iteration": 2.729383707046509 }, { "auxiliary_loss_clip": 0.06407079, "auxiliary_loss_mlp": 0.01267011, "balance_loss_clip": 0.06273353, "balance_loss_mlp": 0.01258953, "epoch": 0.9383436043889974, "flos": 16441943569920.0, "grad_norm": 1.668563258969671, "language_loss": 0.77736068, "learning_rate": 3.970771343058166e-08, "loss": 0.85410166, "num_input_tokens_seen": 336610010, "router_z_loss_clip": 1.33789062, "router_z_loss_mlp": 0.08050537, "step": 15607, "time_per_iteration": 2.5022857189178467 }, { "auxiliary_loss_clip": 0.06416162, "auxiliary_loss_mlp": 0.01262038, "balance_loss_clip": 0.06276432, "balance_loss_mlp": 0.01252436, "epoch": 0.9384037276416655, "flos": 20746863262080.0, "grad_norm": 1.7848791754913147, "language_loss": 0.83381391, "learning_rate": 3.963052953128776e-08, "loss": 0.91059589, "num_input_tokens_seen": 336628520, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.0960083, "step": 15608, "time_per_iteration": 2.529827117919922 }, { "auxiliary_loss_clip": 0.06411787, "auxiliary_loss_mlp": 0.01268459, "balance_loss_clip": 0.06274906, "balance_loss_mlp": 0.0125841, "epoch": 0.9384638508943334, "flos": 19068726816000.0, "grad_norm": 1.5971362022743476, "language_loss": 0.69183123, "learning_rate": 3.9553419969400536e-08, "loss": 0.76863366, "num_input_tokens_seen": 336647365, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.10040283, "step": 15609, "time_per_iteration": 2.524299144744873 }, { "auxiliary_loss_clip": 0.06420672, "auxiliary_loss_mlp": 0.01264535, "balance_loss_clip": 0.06277038, "balance_loss_mlp": 0.01254224, "epoch": 0.9385239741470014, "flos": 23411730988800.0, "grad_norm": 2.6207271732213773, "language_loss": 0.75441015, "learning_rate": 3.9476384747844316e-08, "loss": 0.83126223, "num_input_tokens_seen": 336667165, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10308838, "step": 15610, "time_per_iteration": 2.5446598529815674 }, { "auxiliary_loss_clip": 0.06418154, "auxiliary_loss_mlp": 0.01262157, "balance_loss_clip": 0.0627833, "balance_loss_mlp": 0.01253169, "epoch": 0.9385840973996693, "flos": 12829938416640.0, "grad_norm": 1.9651156725081682, "language_loss": 0.75161248, "learning_rate": 3.939942386953987e-08, "loss": 0.82841557, "num_input_tokens_seen": 336684130, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.08984375, "step": 15611, "time_per_iteration": 2.5253303050994873 }, { "auxiliary_loss_clip": 0.06413454, "auxiliary_loss_mlp": 0.01265148, "balance_loss_clip": 0.06274894, "balance_loss_mlp": 0.0125601, "epoch": 0.9386442206523373, "flos": 15492416302080.0, "grad_norm": 2.004439210815771, "language_loss": 0.66383016, "learning_rate": 3.9322537337405756e-08, "loss": 0.74061614, "num_input_tokens_seen": 336701520, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09136963, "step": 15612, "time_per_iteration": 2.615750789642334 }, { "auxiliary_loss_clip": 0.0641014, "auxiliary_loss_mlp": 0.01261999, "balance_loss_clip": 0.06274442, "balance_loss_mlp": 0.01253047, "epoch": 0.9387043439050052, "flos": 21185219496960.0, "grad_norm": 1.6531922761202547, "language_loss": 0.57768452, "learning_rate": 3.924572515435742e-08, "loss": 0.65440589, "num_input_tokens_seen": 336720675, "router_z_loss_clip": 1.35742188, "router_z_loss_mlp": 0.08953857, "step": 15613, "time_per_iteration": 2.5496723651885986 }, { "auxiliary_loss_clip": 0.06414555, "auxiliary_loss_mlp": 0.01270781, "balance_loss_clip": 0.06273563, "balance_loss_mlp": 0.01261339, "epoch": 0.9387644671576733, "flos": 27674918547840.0, "grad_norm": 2.249825040242144, "language_loss": 0.71309674, "learning_rate": 3.916898732330764e-08, "loss": 0.78995013, "num_input_tokens_seen": 336741005, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09448242, "step": 15614, "time_per_iteration": 2.672259569168091 }, { "auxiliary_loss_clip": 0.06414887, "auxiliary_loss_mlp": 0.0126692, "balance_loss_clip": 0.06273615, "balance_loss_mlp": 0.01256978, "epoch": 0.9388245904103412, "flos": 18841100400000.0, "grad_norm": 2.2666066939208744, "language_loss": 0.81126618, "learning_rate": 3.9092323847166544e-08, "loss": 0.88808423, "num_input_tokens_seen": 336757990, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.0993042, "step": 15615, "time_per_iteration": 2.5417470932006836 }, { "auxiliary_loss_clip": 0.06410015, "auxiliary_loss_mlp": 0.0126133, "balance_loss_clip": 0.06274417, "balance_loss_mlp": 0.01251805, "epoch": 0.9388847136630092, "flos": 25490893875840.0, "grad_norm": 1.8320306719606498, "language_loss": 0.72582483, "learning_rate": 3.901573472884134e-08, "loss": 0.80253822, "num_input_tokens_seen": 336777705, "router_z_loss_clip": 1.35546875, "router_z_loss_mlp": 0.09527588, "step": 15616, "time_per_iteration": 2.551394462585449 }, { "auxiliary_loss_clip": 0.0641596, "auxiliary_loss_mlp": 0.01266405, "balance_loss_clip": 0.06277476, "balance_loss_mlp": 0.01256892, "epoch": 0.9389448369156771, "flos": 18741102151680.0, "grad_norm": 1.8808479196453085, "language_loss": 0.66484231, "learning_rate": 3.89392199712355e-08, "loss": 0.74166596, "num_input_tokens_seen": 336798275, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09515381, "step": 15617, "time_per_iteration": 2.5343780517578125 }, { "auxiliary_loss_clip": 0.06417164, "auxiliary_loss_mlp": 0.01266425, "balance_loss_clip": 0.06273767, "balance_loss_mlp": 0.01255618, "epoch": 0.9390049601683451, "flos": 21722945074560.0, "grad_norm": 1.891002940017432, "language_loss": 0.73497069, "learning_rate": 3.886277957725092e-08, "loss": 0.81180656, "num_input_tokens_seen": 336813835, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10803223, "step": 15618, "time_per_iteration": 2.513075828552246 }, { "auxiliary_loss_clip": 0.06423366, "auxiliary_loss_mlp": 0.01264473, "balance_loss_clip": 0.06277518, "balance_loss_mlp": 0.01254078, "epoch": 0.939065083421013, "flos": 19397357729280.0, "grad_norm": 1.841670166762659, "language_loss": 0.69987595, "learning_rate": 3.878641354978662e-08, "loss": 0.77675438, "num_input_tokens_seen": 336832210, "router_z_loss_clip": 1.45800781, "router_z_loss_mlp": 0.10400391, "step": 15619, "time_per_iteration": 2.532409906387329 }, { "auxiliary_loss_clip": 0.06417529, "auxiliary_loss_mlp": 0.01265677, "balance_loss_clip": 0.06278373, "balance_loss_mlp": 0.01255425, "epoch": 0.939125206673681, "flos": 24688505577600.0, "grad_norm": 1.702245187589931, "language_loss": 0.77946985, "learning_rate": 3.8710121891737834e-08, "loss": 0.8563019, "num_input_tokens_seen": 336851380, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.10253906, "step": 15620, "time_per_iteration": 3.9743549823760986 }, { "auxiliary_loss_clip": 0.06410059, "auxiliary_loss_mlp": 0.01264024, "balance_loss_clip": 0.06274693, "balance_loss_mlp": 0.01255238, "epoch": 0.9391853299263491, "flos": 16331505488640.0, "grad_norm": 1.8675005495143318, "language_loss": 0.74107754, "learning_rate": 3.8633904605998025e-08, "loss": 0.81781828, "num_input_tokens_seen": 336868525, "router_z_loss_clip": 1.35449219, "router_z_loss_mlp": 0.08789062, "step": 15621, "time_per_iteration": 2.4981026649475098 }, { "auxiliary_loss_clip": 0.06428047, "auxiliary_loss_mlp": 0.01268244, "balance_loss_clip": 0.06283864, "balance_loss_mlp": 0.01257837, "epoch": 0.939245453179017, "flos": 11660541235200.0, "grad_norm": 2.4475220848987225, "language_loss": 0.66943645, "learning_rate": 3.855776169545688e-08, "loss": 0.7463994, "num_input_tokens_seen": 336886200, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10412598, "step": 15622, "time_per_iteration": 2.490125894546509 }, { "auxiliary_loss_clip": 0.06412545, "auxiliary_loss_mlp": 0.01267065, "balance_loss_clip": 0.06274888, "balance_loss_mlp": 0.0125794, "epoch": 0.939305576431685, "flos": 23155369822080.0, "grad_norm": 1.5643804088336613, "language_loss": 0.72152478, "learning_rate": 3.848169316300209e-08, "loss": 0.79832089, "num_input_tokens_seen": 336905815, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09136963, "step": 15623, "time_per_iteration": 2.5547194480895996 }, { "auxiliary_loss_clip": 0.06418608, "auxiliary_loss_mlp": 0.0126876, "balance_loss_clip": 0.0627933, "balance_loss_mlp": 0.01259391, "epoch": 0.9393656996843529, "flos": 33295493923200.0, "grad_norm": 1.733666391699347, "language_loss": 0.7247504, "learning_rate": 3.84056990115178e-08, "loss": 0.80162412, "num_input_tokens_seen": 336928460, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09381104, "step": 15624, "time_per_iteration": 2.6409831047058105 }, { "auxiliary_loss_clip": 0.06413998, "auxiliary_loss_mlp": 0.01269691, "balance_loss_clip": 0.06277457, "balance_loss_mlp": 0.0126016, "epoch": 0.9394258229370209, "flos": 21695887405440.0, "grad_norm": 1.7621924285513788, "language_loss": 0.89913821, "learning_rate": 3.832977924388614e-08, "loss": 0.97597516, "num_input_tokens_seen": 336948320, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.09527588, "step": 15625, "time_per_iteration": 2.544616222381592 }, { "auxiliary_loss_clip": 0.06412096, "auxiliary_loss_mlp": 0.01263954, "balance_loss_clip": 0.06274525, "balance_loss_mlp": 0.01254316, "epoch": 0.9394859461896888, "flos": 23880289420800.0, "grad_norm": 1.6404957019918172, "language_loss": 0.84158516, "learning_rate": 3.825393386298592e-08, "loss": 0.91834569, "num_input_tokens_seen": 336967670, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09643555, "step": 15626, "time_per_iteration": 3.9828827381134033 }, { "auxiliary_loss_clip": 0.06319265, "auxiliary_loss_mlp": 0.01251172, "balance_loss_clip": 0.06263852, "balance_loss_mlp": 0.01250146, "epoch": 0.9395460694423569, "flos": 61584963114240.0, "grad_norm": 0.7619090222779998, "language_loss": 0.56011689, "learning_rate": 3.8178162871693284e-08, "loss": 0.63582128, "num_input_tokens_seen": 337028395, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01026154, "step": 15627, "time_per_iteration": 3.107232093811035 }, { "auxiliary_loss_clip": 0.06413804, "auxiliary_loss_mlp": 0.01265637, "balance_loss_clip": 0.06275956, "balance_loss_mlp": 0.01256709, "epoch": 0.9396061926950248, "flos": 21001966617600.0, "grad_norm": 1.3455164115113962, "language_loss": 0.70143735, "learning_rate": 3.810246627288105e-08, "loss": 0.77823186, "num_input_tokens_seen": 337048150, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.0892334, "step": 15628, "time_per_iteration": 2.5489320755004883 }, { "auxiliary_loss_clip": 0.06414421, "auxiliary_loss_mlp": 0.01263472, "balance_loss_clip": 0.06277396, "balance_loss_mlp": 0.01254448, "epoch": 0.9396663159476928, "flos": 27494726342400.0, "grad_norm": 1.5879007047604603, "language_loss": 0.75519335, "learning_rate": 3.8026844069420025e-08, "loss": 0.8319723, "num_input_tokens_seen": 337069315, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.09020996, "step": 15629, "time_per_iteration": 2.6219944953918457 }, { "auxiliary_loss_clip": 0.06406038, "auxiliary_loss_mlp": 0.012632, "balance_loss_clip": 0.06272111, "balance_loss_mlp": 0.01254784, "epoch": 0.9397264392003607, "flos": 19433555493120.0, "grad_norm": 1.6047717975134121, "language_loss": 0.74595678, "learning_rate": 3.795129626417748e-08, "loss": 0.82264912, "num_input_tokens_seen": 337087765, "router_z_loss_clip": 1.33886719, "router_z_loss_mlp": 0.08416748, "step": 15630, "time_per_iteration": 2.5146915912628174 }, { "auxiliary_loss_clip": 0.06409998, "auxiliary_loss_mlp": 0.01264368, "balance_loss_clip": 0.06274925, "balance_loss_mlp": 0.01255833, "epoch": 0.9397865624530287, "flos": 18010732037760.0, "grad_norm": 1.7945430466676004, "language_loss": 0.69681644, "learning_rate": 3.787582286001845e-08, "loss": 0.77356005, "num_input_tokens_seen": 337106265, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.08538818, "step": 15631, "time_per_iteration": 2.501709222793579 }, { "auxiliary_loss_clip": 0.06410698, "auxiliary_loss_mlp": 0.01263651, "balance_loss_clip": 0.06274721, "balance_loss_mlp": 0.01254275, "epoch": 0.9398466857056966, "flos": 22571132428800.0, "grad_norm": 1.4460997820163966, "language_loss": 0.75339222, "learning_rate": 3.7800423859805086e-08, "loss": 0.8301357, "num_input_tokens_seen": 337126090, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.09375, "step": 15632, "time_per_iteration": 2.5262768268585205 }, { "auxiliary_loss_clip": 0.06421997, "auxiliary_loss_mlp": 0.01264673, "balance_loss_clip": 0.06276891, "balance_loss_mlp": 0.0125395, "epoch": 0.9399068089583646, "flos": 24542666346240.0, "grad_norm": 1.7311434542322595, "language_loss": 0.74648285, "learning_rate": 3.772509926639622e-08, "loss": 0.82334954, "num_input_tokens_seen": 337145655, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.1072998, "step": 15633, "time_per_iteration": 2.558509349822998 }, { "auxiliary_loss_clip": 0.0641572, "auxiliary_loss_mlp": 0.01265613, "balance_loss_clip": 0.06273787, "balance_loss_mlp": 0.01255748, "epoch": 0.9399669322110327, "flos": 25637529720960.0, "grad_norm": 1.878688212844726, "language_loss": 0.72656047, "learning_rate": 3.764984908264823e-08, "loss": 0.80337381, "num_input_tokens_seen": 337164805, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09869385, "step": 15634, "time_per_iteration": 3.9925484657287598 }, { "auxiliary_loss_clip": 0.06422673, "auxiliary_loss_mlp": 0.01262597, "balance_loss_clip": 0.06280418, "balance_loss_mlp": 0.01252667, "epoch": 0.9400270554637006, "flos": 17094593422080.0, "grad_norm": 3.0935932602791487, "language_loss": 0.69242936, "learning_rate": 3.75746733114144e-08, "loss": 0.7692821, "num_input_tokens_seen": 337182280, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.0993042, "step": 15635, "time_per_iteration": 2.5227675437927246 }, { "auxiliary_loss_clip": 0.06407405, "auxiliary_loss_mlp": 0.01261402, "balance_loss_clip": 0.06272954, "balance_loss_mlp": 0.01252688, "epoch": 0.9400871787163686, "flos": 22061764258560.0, "grad_norm": 1.544573365636433, "language_loss": 0.74502009, "learning_rate": 3.7499571955545985e-08, "loss": 0.8217082, "num_input_tokens_seen": 337203495, "router_z_loss_clip": 1.34472656, "router_z_loss_mlp": 0.08709717, "step": 15636, "time_per_iteration": 2.54270601272583 }, { "auxiliary_loss_clip": 0.06414659, "auxiliary_loss_mlp": 0.01262966, "balance_loss_clip": 0.06274903, "balance_loss_mlp": 0.01253424, "epoch": 0.9401473019690365, "flos": 16988431898880.0, "grad_norm": 1.8914571093976729, "language_loss": 0.82945406, "learning_rate": 3.7424545017890054e-08, "loss": 0.90623033, "num_input_tokens_seen": 337220435, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09539795, "step": 15637, "time_per_iteration": 2.500018358230591 }, { "auxiliary_loss_clip": 0.06415068, "auxiliary_loss_mlp": 0.01267589, "balance_loss_clip": 0.06274765, "balance_loss_mlp": 0.01257606, "epoch": 0.9402074252217045, "flos": 19687946088960.0, "grad_norm": 2.0318129611988685, "language_loss": 0.69160444, "learning_rate": 3.7349592501292325e-08, "loss": 0.76843095, "num_input_tokens_seen": 337238095, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09979248, "step": 15638, "time_per_iteration": 2.5208702087402344 }, { "auxiliary_loss_clip": 0.06410842, "auxiliary_loss_mlp": 0.01264425, "balance_loss_clip": 0.06276646, "balance_loss_mlp": 0.01255579, "epoch": 0.9402675484743724, "flos": 24761278448640.0, "grad_norm": 1.7141994486640382, "language_loss": 0.85164821, "learning_rate": 3.727471440859498e-08, "loss": 0.92840087, "num_input_tokens_seen": 337256645, "router_z_loss_clip": 1.34082031, "router_z_loss_mlp": 0.08837891, "step": 15639, "time_per_iteration": 2.544316530227661 }, { "auxiliary_loss_clip": 0.06414364, "auxiliary_loss_mlp": 0.0126301, "balance_loss_clip": 0.06274418, "balance_loss_mlp": 0.01254266, "epoch": 0.9403276717270405, "flos": 25566014661120.0, "grad_norm": 1.5278297626567072, "language_loss": 0.78193027, "learning_rate": 3.719991074263662e-08, "loss": 0.85870403, "num_input_tokens_seen": 337278360, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.08746338, "step": 15640, "time_per_iteration": 3.884176015853882 }, { "auxiliary_loss_clip": 0.06417302, "auxiliary_loss_mlp": 0.01264717, "balance_loss_clip": 0.06274852, "balance_loss_mlp": 0.0125552, "epoch": 0.9403877949797084, "flos": 26697453143040.0, "grad_norm": 4.462673199568312, "language_loss": 0.74643123, "learning_rate": 3.7125181506254544e-08, "loss": 0.82325143, "num_input_tokens_seen": 337302480, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09197998, "step": 15641, "time_per_iteration": 2.6159422397613525 }, { "auxiliary_loss_clip": 0.06420729, "auxiliary_loss_mlp": 0.01268096, "balance_loss_clip": 0.0627582, "balance_loss_mlp": 0.01257296, "epoch": 0.9404479182323764, "flos": 15016856054400.0, "grad_norm": 1.9987099270865751, "language_loss": 0.8259871, "learning_rate": 3.7050526702282256e-08, "loss": 0.9028753, "num_input_tokens_seen": 337316600, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.10797119, "step": 15642, "time_per_iteration": 2.5189545154571533 }, { "auxiliary_loss_clip": 0.06409889, "auxiliary_loss_mlp": 0.01267476, "balance_loss_clip": 0.06274346, "balance_loss_mlp": 0.01259053, "epoch": 0.9405080414850443, "flos": 24980645237760.0, "grad_norm": 1.999310920067201, "language_loss": 0.68582642, "learning_rate": 3.697594633355084e-08, "loss": 0.76260006, "num_input_tokens_seen": 337336895, "router_z_loss_clip": 1.35253906, "router_z_loss_mlp": 0.08428955, "step": 15643, "time_per_iteration": 2.558581829071045 }, { "auxiliary_loss_clip": 0.06420964, "auxiliary_loss_mlp": 0.0126766, "balance_loss_clip": 0.06279291, "balance_loss_mlp": 0.01257855, "epoch": 0.9405681647377123, "flos": 20850131819520.0, "grad_norm": 2.43550767449952, "language_loss": 0.76949859, "learning_rate": 3.6901440402888226e-08, "loss": 0.84638482, "num_input_tokens_seen": 337355105, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09820557, "step": 15644, "time_per_iteration": 2.5254852771759033 }, { "auxiliary_loss_clip": 0.06409281, "auxiliary_loss_mlp": 0.01265497, "balance_loss_clip": 0.06273961, "balance_loss_mlp": 0.012568, "epoch": 0.9406282879903802, "flos": 23812380086400.0, "grad_norm": 1.5426286862499594, "language_loss": 0.67326683, "learning_rate": 3.682700891311974e-08, "loss": 0.7500146, "num_input_tokens_seen": 337374905, "router_z_loss_clip": 1.35253906, "router_z_loss_mlp": 0.0869751, "step": 15645, "time_per_iteration": 2.555222511291504 }, { "auxiliary_loss_clip": 0.06405397, "auxiliary_loss_mlp": 0.01267464, "balance_loss_clip": 0.06272536, "balance_loss_mlp": 0.01258577, "epoch": 0.9406884112430483, "flos": 27682716977280.0, "grad_norm": 1.4788087718022185, "language_loss": 0.70537621, "learning_rate": 3.6752651867067774e-08, "loss": 0.78210479, "num_input_tokens_seen": 337397130, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.08892822, "step": 15646, "time_per_iteration": 2.5969960689544678 }, { "auxiliary_loss_clip": 0.0641179, "auxiliary_loss_mlp": 0.0126107, "balance_loss_clip": 0.06274118, "balance_loss_mlp": 0.01252082, "epoch": 0.9407485344957163, "flos": 23081590702080.0, "grad_norm": 1.6631917564274472, "language_loss": 0.74725604, "learning_rate": 3.667836926755208e-08, "loss": 0.82398462, "num_input_tokens_seen": 337418660, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.08984375, "step": 15647, "time_per_iteration": 2.5494420528411865 }, { "auxiliary_loss_clip": 0.06316362, "auxiliary_loss_mlp": 0.01252131, "balance_loss_clip": 0.06260989, "balance_loss_mlp": 0.01251063, "epoch": 0.9408086577483842, "flos": 71034143247360.0, "grad_norm": 0.8596484153552603, "language_loss": 0.63427866, "learning_rate": 3.660416111738907e-08, "loss": 0.70996368, "num_input_tokens_seen": 337478055, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01069641, "step": 15648, "time_per_iteration": 3.2391200065612793 }, { "auxiliary_loss_clip": 0.0641219, "auxiliary_loss_mlp": 0.01262883, "balance_loss_clip": 0.06277041, "balance_loss_mlp": 0.01254277, "epoch": 0.9408687810010522, "flos": 23737468936320.0, "grad_norm": 1.3707162267212063, "language_loss": 0.66637158, "learning_rate": 3.653002741939337e-08, "loss": 0.74312234, "num_input_tokens_seen": 337499405, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.08612061, "step": 15649, "time_per_iteration": 2.5505969524383545 }, { "auxiliary_loss_clip": 0.06413887, "auxiliary_loss_mlp": 0.01263631, "balance_loss_clip": 0.06274463, "balance_loss_mlp": 0.01254863, "epoch": 0.9409289042537201, "flos": 18375225298560.0, "grad_norm": 1.7508469437866094, "language_loss": 0.77621365, "learning_rate": 3.645596817637586e-08, "loss": 0.85298878, "num_input_tokens_seen": 337517195, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.08770752, "step": 15650, "time_per_iteration": 2.500910758972168 }, { "auxiliary_loss_clip": 0.06412056, "auxiliary_loss_mlp": 0.01263195, "balance_loss_clip": 0.06274157, "balance_loss_mlp": 0.0125429, "epoch": 0.9409890275063881, "flos": 23885111030400.0, "grad_norm": 2.116023548504355, "language_loss": 0.74711287, "learning_rate": 3.638198339114451e-08, "loss": 0.82386541, "num_input_tokens_seen": 337535245, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.08905029, "step": 15651, "time_per_iteration": 2.5563173294067383 }, { "auxiliary_loss_clip": 0.06412028, "auxiliary_loss_mlp": 0.01263109, "balance_loss_clip": 0.06276312, "balance_loss_mlp": 0.01253888, "epoch": 0.941049150759056, "flos": 16550704569600.0, "grad_norm": 1.797434183319549, "language_loss": 0.72162592, "learning_rate": 3.630807306650507e-08, "loss": 0.79837728, "num_input_tokens_seen": 337553040, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.09222412, "step": 15652, "time_per_iteration": 2.5036447048187256 }, { "auxiliary_loss_clip": 0.06422201, "auxiliary_loss_mlp": 0.01268358, "balance_loss_clip": 0.06276067, "balance_loss_mlp": 0.01258374, "epoch": 0.9411092740117241, "flos": 25125310512000.0, "grad_norm": 1.7544263161205838, "language_loss": 0.66715133, "learning_rate": 3.6234237205260645e-08, "loss": 0.74405694, "num_input_tokens_seen": 337574580, "router_z_loss_clip": 1.45996094, "router_z_loss_mlp": 0.09985352, "step": 15653, "time_per_iteration": 2.5758132934570312 }, { "auxiliary_loss_clip": 0.06416342, "auxiliary_loss_mlp": 0.01264354, "balance_loss_clip": 0.06274329, "balance_loss_mlp": 0.01253774, "epoch": 0.941169397264392, "flos": 21148644389760.0, "grad_norm": 1.818818779361213, "language_loss": 0.77814245, "learning_rate": 3.6160475810210536e-08, "loss": 0.85494936, "num_input_tokens_seen": 337593010, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.105896, "step": 15654, "time_per_iteration": 2.531728506088257 }, { "auxiliary_loss_clip": 0.06428148, "auxiliary_loss_mlp": 0.01265394, "balance_loss_clip": 0.06281462, "balance_loss_mlp": 0.01255625, "epoch": 0.94122952051706, "flos": 38518103531520.0, "grad_norm": 1.590015498230525, "language_loss": 0.70104313, "learning_rate": 3.6086788884152065e-08, "loss": 0.77797854, "num_input_tokens_seen": 337616170, "router_z_loss_clip": 1.46679688, "router_z_loss_mlp": 0.09771729, "step": 15655, "time_per_iteration": 2.6716532707214355 }, { "auxiliary_loss_clip": 0.06413255, "auxiliary_loss_mlp": 0.0126456, "balance_loss_clip": 0.06273997, "balance_loss_mlp": 0.01254863, "epoch": 0.9412896437697279, "flos": 18375099517440.0, "grad_norm": 1.7243319811711717, "language_loss": 0.72513342, "learning_rate": 3.601317642987944e-08, "loss": 0.80191159, "num_input_tokens_seen": 337635215, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09698486, "step": 15656, "time_per_iteration": 2.537215232849121 }, { "auxiliary_loss_clip": 0.06414919, "auxiliary_loss_mlp": 0.01264063, "balance_loss_clip": 0.06276602, "balance_loss_mlp": 0.0125461, "epoch": 0.9413497670223959, "flos": 25892046097920.0, "grad_norm": 1.9837438351667442, "language_loss": 0.78385174, "learning_rate": 3.593963845018377e-08, "loss": 0.8606416, "num_input_tokens_seen": 337654195, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09454346, "step": 15657, "time_per_iteration": 2.5708069801330566 }, { "auxiliary_loss_clip": 0.06414343, "auxiliary_loss_mlp": 0.01265577, "balance_loss_clip": 0.06274769, "balance_loss_mlp": 0.01255796, "epoch": 0.9414098902750638, "flos": 16623980565120.0, "grad_norm": 2.0369385390772936, "language_loss": 0.84622931, "learning_rate": 3.586617494785371e-08, "loss": 0.92302847, "num_input_tokens_seen": 337671810, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09777832, "step": 15658, "time_per_iteration": 2.501765727996826 }, { "auxiliary_loss_clip": 0.06421383, "auxiliary_loss_mlp": 0.01266516, "balance_loss_clip": 0.06277063, "balance_loss_mlp": 0.01255495, "epoch": 0.9414700135277319, "flos": 18631041413760.0, "grad_norm": 1.9597266960822823, "language_loss": 0.70914114, "learning_rate": 3.5792785925675254e-08, "loss": 0.78602016, "num_input_tokens_seen": 337689410, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.11029053, "step": 15659, "time_per_iteration": 2.5065712928771973 }, { "auxiliary_loss_clip": 0.06412663, "auxiliary_loss_mlp": 0.01267689, "balance_loss_clip": 0.06275946, "balance_loss_mlp": 0.01258504, "epoch": 0.9415301367803999, "flos": 26286280358400.0, "grad_norm": 1.639762232316462, "language_loss": 0.79910123, "learning_rate": 3.571947138643172e-08, "loss": 0.87590468, "num_input_tokens_seen": 337709950, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.09182739, "step": 15660, "time_per_iteration": 4.038947582244873 }, { "auxiliary_loss_clip": 0.06407546, "auxiliary_loss_mlp": 0.01263514, "balance_loss_clip": 0.06273209, "balance_loss_mlp": 0.01254335, "epoch": 0.9415902600330678, "flos": 23268617015040.0, "grad_norm": 1.4109249232497867, "language_loss": 0.67925894, "learning_rate": 3.564623133290201e-08, "loss": 0.75596958, "num_input_tokens_seen": 337731320, "router_z_loss_clip": 1.34472656, "router_z_loss_mlp": 0.09173584, "step": 15661, "time_per_iteration": 2.5779831409454346 }, { "auxiliary_loss_clip": 0.06417687, "auxiliary_loss_mlp": 0.01267675, "balance_loss_clip": 0.06279027, "balance_loss_mlp": 0.01258484, "epoch": 0.9416503832857358, "flos": 14724171342720.0, "grad_norm": 1.9648856782757078, "language_loss": 0.66263866, "learning_rate": 3.557306576786434e-08, "loss": 0.7394923, "num_input_tokens_seen": 337747720, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09191895, "step": 15662, "time_per_iteration": 2.565584182739258 }, { "auxiliary_loss_clip": 0.06316748, "auxiliary_loss_mlp": 0.0125278, "balance_loss_clip": 0.06261539, "balance_loss_mlp": 0.01251757, "epoch": 0.9417105065384037, "flos": 70331333927040.0, "grad_norm": 0.7553429153235269, "language_loss": 0.59246153, "learning_rate": 3.5499974694092935e-08, "loss": 0.66815686, "num_input_tokens_seen": 337806930, "router_z_loss_clip": 0.55273438, "router_z_loss_mlp": 0.01023865, "step": 15663, "time_per_iteration": 3.271855592727661 }, { "auxiliary_loss_clip": 0.06417707, "auxiliary_loss_mlp": 0.01266304, "balance_loss_clip": 0.06273997, "balance_loss_mlp": 0.01255361, "epoch": 0.9417706297910717, "flos": 34066380286080.0, "grad_norm": 2.2070119056956123, "language_loss": 0.6682514, "learning_rate": 3.542695811435914e-08, "loss": 0.7450915, "num_input_tokens_seen": 337828100, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.10955811, "step": 15664, "time_per_iteration": 2.6355080604553223 }, { "auxiliary_loss_clip": 0.06414141, "auxiliary_loss_mlp": 0.0126446, "balance_loss_clip": 0.06276701, "balance_loss_mlp": 0.01255773, "epoch": 0.9418307530437396, "flos": 16477135084800.0, "grad_norm": 2.917399145464556, "language_loss": 0.73280674, "learning_rate": 3.535401603143207e-08, "loss": 0.80959278, "num_input_tokens_seen": 337844805, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.08682251, "step": 15665, "time_per_iteration": 3.927459478378296 }, { "auxiliary_loss_clip": 0.06410629, "auxiliary_loss_mlp": 0.01264441, "balance_loss_clip": 0.06274344, "balance_loss_mlp": 0.01255297, "epoch": 0.9418908762964077, "flos": 11258089274880.0, "grad_norm": 2.2865639339640795, "language_loss": 0.63689512, "learning_rate": 3.528114844807773e-08, "loss": 0.71364582, "num_input_tokens_seen": 337860490, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.0914917, "step": 15666, "time_per_iteration": 2.4875149726867676 }, { "auxiliary_loss_clip": 0.06414436, "auxiliary_loss_mlp": 0.01265158, "balance_loss_clip": 0.06275766, "balance_loss_mlp": 0.01255824, "epoch": 0.9419509995490756, "flos": 18444182808960.0, "grad_norm": 1.6520939673230939, "language_loss": 0.79083836, "learning_rate": 3.520835536705902e-08, "loss": 0.8676343, "num_input_tokens_seen": 337878360, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09338379, "step": 15667, "time_per_iteration": 2.5071334838867188 }, { "auxiliary_loss_clip": 0.06413144, "auxiliary_loss_mlp": 0.01263206, "balance_loss_clip": 0.06276484, "balance_loss_mlp": 0.01254977, "epoch": 0.9420111228017436, "flos": 20743760661120.0, "grad_norm": 1.6041098741116533, "language_loss": 0.75204527, "learning_rate": 3.5135636791136404e-08, "loss": 0.82880872, "num_input_tokens_seen": 337895635, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.08236694, "step": 15668, "time_per_iteration": 2.5212602615356445 }, { "auxiliary_loss_clip": 0.06415707, "auxiliary_loss_mlp": 0.0127017, "balance_loss_clip": 0.06274857, "balance_loss_mlp": 0.01259948, "epoch": 0.9420712460544115, "flos": 21148267046400.0, "grad_norm": 2.121685069896677, "language_loss": 0.5944466, "learning_rate": 3.506299272306723e-08, "loss": 0.6713053, "num_input_tokens_seen": 337913940, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.10217285, "step": 15669, "time_per_iteration": 2.5378384590148926 }, { "auxiliary_loss_clip": 0.06408118, "auxiliary_loss_mlp": 0.01260602, "balance_loss_clip": 0.06274633, "balance_loss_mlp": 0.01252096, "epoch": 0.9421313693070795, "flos": 15857244979200.0, "grad_norm": 1.5903368559549278, "language_loss": 0.76963568, "learning_rate": 3.4990423165606406e-08, "loss": 0.84632289, "num_input_tokens_seen": 337932015, "router_z_loss_clip": 1.33300781, "router_z_loss_mlp": 0.08508301, "step": 15670, "time_per_iteration": 2.506558895111084 }, { "auxiliary_loss_clip": 0.06414221, "auxiliary_loss_mlp": 0.01266519, "balance_loss_clip": 0.06276749, "balance_loss_mlp": 0.0125684, "epoch": 0.9421914925597474, "flos": 32424106187520.0, "grad_norm": 1.6627392976585142, "language_loss": 0.65616852, "learning_rate": 3.491792812150574e-08, "loss": 0.7329759, "num_input_tokens_seen": 337953345, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09680176, "step": 15671, "time_per_iteration": 2.6017510890960693 }, { "auxiliary_loss_clip": 0.06419483, "auxiliary_loss_mlp": 0.01268951, "balance_loss_clip": 0.06280084, "balance_loss_mlp": 0.01259445, "epoch": 0.9422516158124155, "flos": 19724521196160.0, "grad_norm": 1.5205764739184748, "language_loss": 0.79865998, "learning_rate": 3.48455075935139e-08, "loss": 0.87554431, "num_input_tokens_seen": 337973685, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09503174, "step": 15672, "time_per_iteration": 2.533346176147461 }, { "auxiliary_loss_clip": 0.06422728, "auxiliary_loss_mlp": 0.0126451, "balance_loss_clip": 0.06277479, "balance_loss_mlp": 0.01253949, "epoch": 0.9423117390650835, "flos": 16258858398720.0, "grad_norm": 1.8902119159317745, "language_loss": 0.73738158, "learning_rate": 3.47731615843776e-08, "loss": 0.81425399, "num_input_tokens_seen": 337989175, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.10559082, "step": 15673, "time_per_iteration": 2.4954099655151367 }, { "auxiliary_loss_clip": 0.064109, "auxiliary_loss_mlp": 0.01265764, "balance_loss_clip": 0.06273455, "balance_loss_mlp": 0.01256585, "epoch": 0.9423718623177514, "flos": 31804803060480.0, "grad_norm": 1.7910496979716064, "language_loss": 0.70557475, "learning_rate": 3.470089009683974e-08, "loss": 0.78234136, "num_input_tokens_seen": 338011800, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09179688, "step": 15674, "time_per_iteration": 4.093324899673462 }, { "auxiliary_loss_clip": 0.06415014, "auxiliary_loss_mlp": 0.01263399, "balance_loss_clip": 0.06274655, "balance_loss_mlp": 0.01254673, "epoch": 0.9424319855704194, "flos": 23338622701440.0, "grad_norm": 1.72138108887783, "language_loss": 0.81537807, "learning_rate": 3.462869313364125e-08, "loss": 0.8921622, "num_input_tokens_seen": 338032120, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.08721924, "step": 15675, "time_per_iteration": 2.5376627445220947 }, { "auxiliary_loss_clip": 0.06415378, "auxiliary_loss_mlp": 0.01270323, "balance_loss_clip": 0.06277935, "balance_loss_mlp": 0.0126149, "epoch": 0.9424921088230873, "flos": 20783983420800.0, "grad_norm": 1.658070628276653, "language_loss": 0.63020003, "learning_rate": 3.4556570697519494e-08, "loss": 0.70705712, "num_input_tokens_seen": 338051880, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.08837891, "step": 15676, "time_per_iteration": 2.5339713096618652 }, { "auxiliary_loss_clip": 0.06411545, "auxiliary_loss_mlp": 0.0126629, "balance_loss_clip": 0.06272727, "balance_loss_mlp": 0.0125627, "epoch": 0.9425522320757553, "flos": 19032780614400.0, "grad_norm": 1.9393455549441423, "language_loss": 0.67223489, "learning_rate": 3.448452279120984e-08, "loss": 0.74901319, "num_input_tokens_seen": 338069665, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.10015869, "step": 15677, "time_per_iteration": 2.5442802906036377 }, { "auxiliary_loss_clip": 0.06417444, "auxiliary_loss_mlp": 0.01263923, "balance_loss_clip": 0.06274997, "balance_loss_mlp": 0.01253719, "epoch": 0.9426123553284232, "flos": 25162346816640.0, "grad_norm": 2.1762977489444144, "language_loss": 0.64384222, "learning_rate": 3.441254941744387e-08, "loss": 0.72065592, "num_input_tokens_seen": 338090490, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.10211182, "step": 15678, "time_per_iteration": 2.561455011367798 }, { "auxiliary_loss_clip": 0.06409362, "auxiliary_loss_mlp": 0.01266826, "balance_loss_clip": 0.06272362, "balance_loss_mlp": 0.01257015, "epoch": 0.9426724785810913, "flos": 21185848402560.0, "grad_norm": 1.5216458039732093, "language_loss": 0.74155688, "learning_rate": 3.434065057895097e-08, "loss": 0.81831872, "num_input_tokens_seen": 338109825, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09814453, "step": 15679, "time_per_iteration": 2.555957794189453 }, { "auxiliary_loss_clip": 0.06419581, "auxiliary_loss_mlp": 0.01267869, "balance_loss_clip": 0.06277904, "balance_loss_mlp": 0.01257748, "epoch": 0.9427326018337592, "flos": 14762171969280.0, "grad_norm": 2.228887369686808, "language_loss": 0.77238077, "learning_rate": 3.426882627845762e-08, "loss": 0.84925526, "num_input_tokens_seen": 338125790, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.10125732, "step": 15680, "time_per_iteration": 3.884457588195801 }, { "auxiliary_loss_clip": 0.06415661, "auxiliary_loss_mlp": 0.01268623, "balance_loss_clip": 0.06277668, "balance_loss_mlp": 0.01259241, "epoch": 0.9427927250864272, "flos": 20930032287360.0, "grad_norm": 1.8309053570326574, "language_loss": 0.75550723, "learning_rate": 3.419707651868742e-08, "loss": 0.83235002, "num_input_tokens_seen": 338145610, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09381104, "step": 15681, "time_per_iteration": 2.5548367500305176 }, { "auxiliary_loss_clip": 0.06417681, "auxiliary_loss_mlp": 0.01268967, "balance_loss_clip": 0.06278124, "balance_loss_mlp": 0.01259674, "epoch": 0.9428528483390951, "flos": 19758119483520.0, "grad_norm": 1.656774391800352, "language_loss": 0.66119158, "learning_rate": 3.412540130236086e-08, "loss": 0.73805809, "num_input_tokens_seen": 338165960, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09283447, "step": 15682, "time_per_iteration": 2.553175687789917 }, { "auxiliary_loss_clip": 0.06411924, "auxiliary_loss_mlp": 0.01264809, "balance_loss_clip": 0.06273053, "balance_loss_mlp": 0.01255046, "epoch": 0.9429129715917631, "flos": 24541869732480.0, "grad_norm": 1.726119932982034, "language_loss": 0.76691711, "learning_rate": 3.405380063219665e-08, "loss": 0.84368443, "num_input_tokens_seen": 338187215, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09759521, "step": 15683, "time_per_iteration": 2.5727970600128174 }, { "auxiliary_loss_clip": 0.06418991, "auxiliary_loss_mlp": 0.01266307, "balance_loss_clip": 0.06277181, "balance_loss_mlp": 0.01255811, "epoch": 0.942973094844431, "flos": 17964304076160.0, "grad_norm": 2.853985920162418, "language_loss": 0.75874591, "learning_rate": 3.398227451090885e-08, "loss": 0.83559889, "num_input_tokens_seen": 338201825, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.1050415, "step": 15684, "time_per_iteration": 2.5240142345428467 }, { "auxiliary_loss_clip": 0.06411465, "auxiliary_loss_mlp": 0.01266003, "balance_loss_clip": 0.06274886, "balance_loss_mlp": 0.0125708, "epoch": 0.9430332180970991, "flos": 26144382268800.0, "grad_norm": 1.5417492298831275, "language_loss": 0.77615291, "learning_rate": 3.391082294121017e-08, "loss": 0.85292757, "num_input_tokens_seen": 338220865, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.0892334, "step": 15685, "time_per_iteration": 2.5793423652648926 }, { "auxiliary_loss_clip": 0.06409488, "auxiliary_loss_mlp": 0.01261377, "balance_loss_clip": 0.06274703, "balance_loss_mlp": 0.01252716, "epoch": 0.943093341349767, "flos": 23958177390720.0, "grad_norm": 1.9975070827461505, "language_loss": 0.76341152, "learning_rate": 3.383944592581023e-08, "loss": 0.8401202, "num_input_tokens_seen": 338240160, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.08660889, "step": 15686, "time_per_iteration": 2.6066200733184814 }, { "auxiliary_loss_clip": 0.06417844, "auxiliary_loss_mlp": 0.01266374, "balance_loss_clip": 0.06276404, "balance_loss_mlp": 0.01256903, "epoch": 0.943153464602435, "flos": 17974324638720.0, "grad_norm": 1.865429679367298, "language_loss": 0.80788255, "learning_rate": 3.376814346741575e-08, "loss": 0.88472468, "num_input_tokens_seen": 338259305, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09466553, "step": 15687, "time_per_iteration": 2.6081080436706543 }, { "auxiliary_loss_clip": 0.06419867, "auxiliary_loss_mlp": 0.01265428, "balance_loss_clip": 0.06277019, "balance_loss_mlp": 0.01254735, "epoch": 0.943213587855103, "flos": 14506733197440.0, "grad_norm": 2.35012295128043, "language_loss": 0.76095355, "learning_rate": 3.369691556873011e-08, "loss": 0.83780658, "num_input_tokens_seen": 338274950, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10699463, "step": 15688, "time_per_iteration": 2.537088394165039 }, { "auxiliary_loss_clip": 0.06406154, "auxiliary_loss_mlp": 0.01265693, "balance_loss_clip": 0.06273597, "balance_loss_mlp": 0.01256192, "epoch": 0.9432737111077709, "flos": 28994054175360.0, "grad_norm": 1.7957009866285525, "language_loss": 0.68650341, "learning_rate": 3.3625762232454504e-08, "loss": 0.76322198, "num_input_tokens_seen": 338295585, "router_z_loss_clip": 1.32519531, "router_z_loss_mlp": 0.09503174, "step": 15689, "time_per_iteration": 2.61255145072937 }, { "auxiliary_loss_clip": 0.06411941, "auxiliary_loss_mlp": 0.01267157, "balance_loss_clip": 0.06275732, "balance_loss_mlp": 0.01258712, "epoch": 0.9433338343604389, "flos": 21614267928960.0, "grad_norm": 1.9211538325188802, "language_loss": 0.80803549, "learning_rate": 3.35546834612872e-08, "loss": 0.88482648, "num_input_tokens_seen": 338314555, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.08447266, "step": 15690, "time_per_iteration": 2.567836046218872 }, { "auxiliary_loss_clip": 0.06412702, "auxiliary_loss_mlp": 0.01262658, "balance_loss_clip": 0.06276253, "balance_loss_mlp": 0.01253151, "epoch": 0.9433939576131068, "flos": 33190632138240.0, "grad_norm": 1.8904424755171816, "language_loss": 0.60629529, "learning_rate": 3.348367925792317e-08, "loss": 0.6830489, "num_input_tokens_seen": 338336260, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09503174, "step": 15691, "time_per_iteration": 2.688689947128296 }, { "auxiliary_loss_clip": 0.06416874, "auxiliary_loss_mlp": 0.01265334, "balance_loss_clip": 0.06277231, "balance_loss_mlp": 0.01255815, "epoch": 0.9434540808657749, "flos": 20492808082560.0, "grad_norm": 1.4638398317926067, "language_loss": 0.66916871, "learning_rate": 3.341274962505514e-08, "loss": 0.74599075, "num_input_tokens_seen": 338354680, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09515381, "step": 15692, "time_per_iteration": 2.545975685119629 }, { "auxiliary_loss_clip": 0.0641192, "auxiliary_loss_mlp": 0.01265546, "balance_loss_clip": 0.06273116, "balance_loss_mlp": 0.01255896, "epoch": 0.9435142041184428, "flos": 21549293487360.0, "grad_norm": 3.0526522641244678, "language_loss": 0.74613392, "learning_rate": 3.334189456537251e-08, "loss": 0.82290858, "num_input_tokens_seen": 338372490, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09649658, "step": 15693, "time_per_iteration": 2.5185866355895996 }, { "auxiliary_loss_clip": 0.06416702, "auxiliary_loss_mlp": 0.01264114, "balance_loss_clip": 0.06278184, "balance_loss_mlp": 0.01254392, "epoch": 0.9435743273711108, "flos": 25016004460800.0, "grad_norm": 1.7086912046405194, "language_loss": 0.73341256, "learning_rate": 3.327111408156291e-08, "loss": 0.81022072, "num_input_tokens_seen": 338390870, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09729004, "step": 15694, "time_per_iteration": 2.546379327774048 }, { "auxiliary_loss_clip": 0.06318494, "auxiliary_loss_mlp": 0.01250331, "balance_loss_clip": 0.06263156, "balance_loss_mlp": 0.01249304, "epoch": 0.9436344506237787, "flos": 60179916723840.0, "grad_norm": 0.7058663322789724, "language_loss": 0.50380087, "learning_rate": 3.3200408176309316e-08, "loss": 0.57948911, "num_input_tokens_seen": 338453075, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01027679, "step": 15695, "time_per_iteration": 3.2065865993499756 }, { "auxiliary_loss_clip": 0.06408711, "auxiliary_loss_mlp": 0.01272331, "balance_loss_clip": 0.06275143, "balance_loss_mlp": 0.01263897, "epoch": 0.9436945738764467, "flos": 22243885107840.0, "grad_norm": 1.7069558616825962, "language_loss": 0.6518895, "learning_rate": 3.312977685229335e-08, "loss": 0.72869992, "num_input_tokens_seen": 338471770, "router_z_loss_clip": 1.3359375, "router_z_loss_mlp": 0.08428955, "step": 15696, "time_per_iteration": 2.5280416011810303 }, { "auxiliary_loss_clip": 0.06417023, "auxiliary_loss_mlp": 0.01263914, "balance_loss_clip": 0.06279002, "balance_loss_mlp": 0.01255266, "epoch": 0.9437546971291146, "flos": 25052034516480.0, "grad_norm": 1.7428712975797076, "language_loss": 0.66289872, "learning_rate": 3.305922011219353e-08, "loss": 0.73970807, "num_input_tokens_seen": 338492190, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.08654785, "step": 15697, "time_per_iteration": 2.56365704536438 }, { "auxiliary_loss_clip": 0.06317008, "auxiliary_loss_mlp": 0.01251416, "balance_loss_clip": 0.0626184, "balance_loss_mlp": 0.01250418, "epoch": 0.9438148203817827, "flos": 56809556346240.0, "grad_norm": 0.8352060144974445, "language_loss": 0.63079917, "learning_rate": 3.298873795868506e-08, "loss": 0.70648336, "num_input_tokens_seen": 338552560, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00997162, "step": 15698, "time_per_iteration": 3.0823025703430176 }, { "auxiliary_loss_clip": 0.06417607, "auxiliary_loss_mlp": 0.01264412, "balance_loss_clip": 0.06276134, "balance_loss_mlp": 0.01254446, "epoch": 0.9438749436344506, "flos": 22352981523840.0, "grad_norm": 1.6901071016114768, "language_loss": 0.69691807, "learning_rate": 3.291833039444092e-08, "loss": 0.77373827, "num_input_tokens_seen": 338571770, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09954834, "step": 15699, "time_per_iteration": 2.5389246940612793 }, { "auxiliary_loss_clip": 0.06408833, "auxiliary_loss_mlp": 0.01262661, "balance_loss_clip": 0.06273977, "balance_loss_mlp": 0.01253416, "epoch": 0.9439350668871186, "flos": 13375881694080.0, "grad_norm": 1.9323234174552568, "language_loss": 0.74664259, "learning_rate": 3.2847997422130734e-08, "loss": 0.82335746, "num_input_tokens_seen": 338587310, "router_z_loss_clip": 1.34863281, "router_z_loss_mlp": 0.09246826, "step": 15700, "time_per_iteration": 3.9283287525177 }, { "auxiliary_loss_clip": 0.0641342, "auxiliary_loss_mlp": 0.01262865, "balance_loss_clip": 0.06276833, "balance_loss_mlp": 0.01253626, "epoch": 0.9439951901397866, "flos": 17791113686400.0, "grad_norm": 1.5298398880023103, "language_loss": 0.70622998, "learning_rate": 3.2777739044421495e-08, "loss": 0.7829929, "num_input_tokens_seen": 338606235, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09240723, "step": 15701, "time_per_iteration": 2.525291681289673 }, { "auxiliary_loss_clip": 0.06424117, "auxiliary_loss_mlp": 0.01263595, "balance_loss_clip": 0.06278318, "balance_loss_mlp": 0.01253998, "epoch": 0.9440553133924545, "flos": 18885473936640.0, "grad_norm": 1.8939416661112822, "language_loss": 0.78152907, "learning_rate": 3.2707555263977505e-08, "loss": 0.85840619, "num_input_tokens_seen": 338624090, "router_z_loss_clip": 1.45605469, "router_z_loss_mlp": 0.0960083, "step": 15702, "time_per_iteration": 2.5362207889556885 }, { "auxiliary_loss_clip": 0.06418987, "auxiliary_loss_mlp": 0.01264809, "balance_loss_clip": 0.06278434, "balance_loss_mlp": 0.01255118, "epoch": 0.9441154366451225, "flos": 19579017381120.0, "grad_norm": 1.6096993060104938, "language_loss": 0.66822863, "learning_rate": 3.2637446083460194e-08, "loss": 0.74506658, "num_input_tokens_seen": 338643695, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09692383, "step": 15703, "time_per_iteration": 2.533998489379883 }, { "auxiliary_loss_clip": 0.06417765, "auxiliary_loss_mlp": 0.01267014, "balance_loss_clip": 0.06276836, "balance_loss_mlp": 0.01257108, "epoch": 0.9441755598977905, "flos": 30302037210240.0, "grad_norm": 1.538481700785891, "language_loss": 0.73411083, "learning_rate": 3.256741150552833e-08, "loss": 0.81095862, "num_input_tokens_seen": 338664725, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09899902, "step": 15704, "time_per_iteration": 2.6263387203216553 }, { "auxiliary_loss_clip": 0.0641135, "auxiliary_loss_mlp": 0.01265863, "balance_loss_clip": 0.06276204, "balance_loss_mlp": 0.01256046, "epoch": 0.9442356831504585, "flos": 20674174245120.0, "grad_norm": 1.9322162461807542, "language_loss": 0.74724805, "learning_rate": 3.2497451532837336e-08, "loss": 0.82402027, "num_input_tokens_seen": 338683990, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.0980835, "step": 15705, "time_per_iteration": 4.008691072463989 }, { "auxiliary_loss_clip": 0.06413785, "auxiliary_loss_mlp": 0.01265487, "balance_loss_clip": 0.0627628, "balance_loss_mlp": 0.01256076, "epoch": 0.9442958064031264, "flos": 16112809532160.0, "grad_norm": 4.956751882048931, "language_loss": 0.77454591, "learning_rate": 3.2427566168039986e-08, "loss": 0.85133862, "num_input_tokens_seen": 338702025, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09411621, "step": 15706, "time_per_iteration": 2.504818916320801 }, { "auxiliary_loss_clip": 0.06407645, "auxiliary_loss_mlp": 0.01264468, "balance_loss_clip": 0.06274246, "balance_loss_mlp": 0.01255491, "epoch": 0.9443559296557944, "flos": 20453381936640.0, "grad_norm": 2.2805052717329697, "language_loss": 0.69301534, "learning_rate": 3.23577554137866e-08, "loss": 0.76973647, "num_input_tokens_seen": 338720920, "router_z_loss_clip": 1.33300781, "router_z_loss_mlp": 0.08978271, "step": 15707, "time_per_iteration": 2.5530598163604736 }, { "auxiliary_loss_clip": 0.06408242, "auxiliary_loss_mlp": 0.0126299, "balance_loss_clip": 0.06275057, "balance_loss_mlp": 0.0125427, "epoch": 0.9444160529084623, "flos": 21616406208000.0, "grad_norm": 1.8223289734865846, "language_loss": 0.69532681, "learning_rate": 3.22880192727244e-08, "loss": 0.77203912, "num_input_tokens_seen": 338739590, "router_z_loss_clip": 1.33203125, "router_z_loss_mlp": 0.08728027, "step": 15708, "time_per_iteration": 2.564849376678467 }, { "auxiliary_loss_clip": 0.06413351, "auxiliary_loss_mlp": 0.01265584, "balance_loss_clip": 0.06277867, "balance_loss_mlp": 0.01256542, "epoch": 0.9444761761611303, "flos": 18447620826240.0, "grad_norm": 3.2528673857105517, "language_loss": 0.70585972, "learning_rate": 3.221835774749748e-08, "loss": 0.78264904, "num_input_tokens_seen": 338757240, "router_z_loss_clip": 1.35449219, "router_z_loss_mlp": 0.09039307, "step": 15709, "time_per_iteration": 2.515043020248413 }, { "auxiliary_loss_clip": 0.06410002, "auxiliary_loss_mlp": 0.01266323, "balance_loss_clip": 0.06273471, "balance_loss_mlp": 0.01257078, "epoch": 0.9445362994137982, "flos": 20963043596160.0, "grad_norm": 2.103140179210413, "language_loss": 0.85037494, "learning_rate": 3.214877084074774e-08, "loss": 0.92713815, "num_input_tokens_seen": 338773750, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09240723, "step": 15710, "time_per_iteration": 2.7033145427703857 }, { "auxiliary_loss_clip": 0.06417267, "auxiliary_loss_mlp": 0.01263482, "balance_loss_clip": 0.06274688, "balance_loss_mlp": 0.01253385, "epoch": 0.9445964226664663, "flos": 20309555203200.0, "grad_norm": 1.5037684836332375, "language_loss": 0.71692479, "learning_rate": 3.2079258555113956e-08, "loss": 0.79373229, "num_input_tokens_seen": 338792115, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10095215, "step": 15711, "time_per_iteration": 2.5588080883026123 }, { "auxiliary_loss_clip": 0.06416757, "auxiliary_loss_mlp": 0.01263283, "balance_loss_clip": 0.06278167, "balance_loss_mlp": 0.01254349, "epoch": 0.9446565459191342, "flos": 26403259057920.0, "grad_norm": 2.0011937031839597, "language_loss": 0.69135439, "learning_rate": 3.200982089323179e-08, "loss": 0.76815486, "num_input_tokens_seen": 338812480, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.08935547, "step": 15712, "time_per_iteration": 2.5643470287323 }, { "auxiliary_loss_clip": 0.06420408, "auxiliary_loss_mlp": 0.01267954, "balance_loss_clip": 0.06276695, "balance_loss_mlp": 0.01257863, "epoch": 0.9447166691718022, "flos": 16550327226240.0, "grad_norm": 2.553966961642873, "language_loss": 0.709207, "learning_rate": 3.1940457857734246e-08, "loss": 0.78609061, "num_input_tokens_seen": 338829105, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.10089111, "step": 15713, "time_per_iteration": 4.037062406539917 }, { "auxiliary_loss_clip": 0.06407852, "auxiliary_loss_mlp": 0.01267265, "balance_loss_clip": 0.06274219, "balance_loss_mlp": 0.01257633, "epoch": 0.9447767924244702, "flos": 29171604977280.0, "grad_norm": 1.527050026566757, "language_loss": 0.77068174, "learning_rate": 3.187116945125212e-08, "loss": 0.84743285, "num_input_tokens_seen": 338850670, "router_z_loss_clip": 1.33496094, "router_z_loss_mlp": 0.09637451, "step": 15714, "time_per_iteration": 2.580132007598877 }, { "auxiliary_loss_clip": 0.06417748, "auxiliary_loss_mlp": 0.01266791, "balance_loss_clip": 0.06275912, "balance_loss_mlp": 0.01257194, "epoch": 0.9448369156771381, "flos": 19279875905280.0, "grad_norm": 2.29529215569223, "language_loss": 0.67814887, "learning_rate": 3.1801955676412194e-08, "loss": 0.75499415, "num_input_tokens_seen": 338867795, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.09594727, "step": 15715, "time_per_iteration": 2.5099215507507324 }, { "auxiliary_loss_clip": 0.06419261, "auxiliary_loss_mlp": 0.01264559, "balance_loss_clip": 0.06277957, "balance_loss_mlp": 0.01254611, "epoch": 0.9448970389298061, "flos": 23847823163520.0, "grad_norm": 2.4173283920147375, "language_loss": 0.74844444, "learning_rate": 3.173281653583948e-08, "loss": 0.82528257, "num_input_tokens_seen": 338887205, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09942627, "step": 15716, "time_per_iteration": 2.5489301681518555 }, { "auxiliary_loss_clip": 0.06419381, "auxiliary_loss_mlp": 0.01265011, "balance_loss_clip": 0.06280537, "balance_loss_mlp": 0.01255682, "epoch": 0.944957162182474, "flos": 22388760017280.0, "grad_norm": 1.885524097234569, "language_loss": 0.62527329, "learning_rate": 3.166375203215565e-08, "loss": 0.70211726, "num_input_tokens_seen": 338906130, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09332275, "step": 15717, "time_per_iteration": 2.5553319454193115 }, { "auxiliary_loss_clip": 0.0641432, "auxiliary_loss_mlp": 0.01266237, "balance_loss_clip": 0.06276129, "balance_loss_mlp": 0.01256772, "epoch": 0.9450172854351421, "flos": 17389584120960.0, "grad_norm": 1.7083832143417719, "language_loss": 0.79401243, "learning_rate": 3.1594762167979514e-08, "loss": 0.87081802, "num_input_tokens_seen": 338923045, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09466553, "step": 15718, "time_per_iteration": 2.4958622455596924 }, { "auxiliary_loss_clip": 0.06312913, "auxiliary_loss_mlp": 0.01250687, "balance_loss_clip": 0.06257799, "balance_loss_mlp": 0.01249733, "epoch": 0.94507740868781, "flos": 68487092760960.0, "grad_norm": 0.6926001753713025, "language_loss": 0.57683241, "learning_rate": 3.152584694592719e-08, "loss": 0.65246838, "num_input_tokens_seen": 338987545, "router_z_loss_clip": 0.55273438, "router_z_loss_mlp": 0.00952911, "step": 15719, "time_per_iteration": 4.5741307735443115 }, { "auxiliary_loss_clip": 0.06419798, "auxiliary_loss_mlp": 0.01268583, "balance_loss_clip": 0.06278773, "balance_loss_mlp": 0.01259106, "epoch": 0.945137531940478, "flos": 21148895952000.0, "grad_norm": 1.5815999353432495, "language_loss": 0.76037055, "learning_rate": 3.145700636861193e-08, "loss": 0.83725441, "num_input_tokens_seen": 339007830, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09472656, "step": 15720, "time_per_iteration": 2.56894588470459 }, { "auxiliary_loss_clip": 0.0640713, "auxiliary_loss_mlp": 0.01263946, "balance_loss_clip": 0.06271082, "balance_loss_mlp": 0.01255643, "epoch": 0.9451976551931459, "flos": 24540611921280.0, "grad_norm": 1.8543823132224038, "language_loss": 0.72913373, "learning_rate": 3.138824043864452e-08, "loss": 0.80584443, "num_input_tokens_seen": 339028980, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.08306885, "step": 15721, "time_per_iteration": 2.6126716136932373 }, { "auxiliary_loss_clip": 0.06416659, "auxiliary_loss_mlp": 0.01263919, "balance_loss_clip": 0.06276375, "balance_loss_mlp": 0.0125437, "epoch": 0.9452577784458139, "flos": 23447299847040.0, "grad_norm": 2.9964677999659415, "language_loss": 0.85608935, "learning_rate": 3.131954915863244e-08, "loss": 0.93289512, "num_input_tokens_seen": 339047950, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09545898, "step": 15722, "time_per_iteration": 2.5439400672912598 }, { "auxiliary_loss_clip": 0.0631917, "auxiliary_loss_mlp": 0.0125216, "balance_loss_clip": 0.06264105, "balance_loss_mlp": 0.01251074, "epoch": 0.9453179016984818, "flos": 52036749054720.0, "grad_norm": 0.8818252399287285, "language_loss": 0.64446354, "learning_rate": 3.125093253118005e-08, "loss": 0.72017682, "num_input_tokens_seen": 339104535, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.01086426, "step": 15723, "time_per_iteration": 3.1326472759246826 }, { "auxiliary_loss_clip": 0.06417753, "auxiliary_loss_mlp": 0.0126471, "balance_loss_clip": 0.06277601, "balance_loss_mlp": 0.01254172, "epoch": 0.9453780249511499, "flos": 13476886191360.0, "grad_norm": 2.208530128454826, "language_loss": 0.72928572, "learning_rate": 3.1182390558889715e-08, "loss": 0.80611038, "num_input_tokens_seen": 339122050, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.10540771, "step": 15724, "time_per_iteration": 2.4957079887390137 }, { "auxiliary_loss_clip": 0.06417715, "auxiliary_loss_mlp": 0.01265869, "balance_loss_clip": 0.06279401, "balance_loss_mlp": 0.01256881, "epoch": 0.9454381482038178, "flos": 23265262851840.0, "grad_norm": 1.8911537791540787, "language_loss": 0.8517344, "learning_rate": 3.111392324436024e-08, "loss": 0.92857027, "num_input_tokens_seen": 339138940, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.08984375, "step": 15725, "time_per_iteration": 2.5546891689300537 }, { "auxiliary_loss_clip": 0.06419484, "auxiliary_loss_mlp": 0.01264938, "balance_loss_clip": 0.06279042, "balance_loss_mlp": 0.01255944, "epoch": 0.9454982714564858, "flos": 19502093733120.0, "grad_norm": 2.129291791256662, "language_loss": 0.7125783, "learning_rate": 3.104553059018822e-08, "loss": 0.78942251, "num_input_tokens_seen": 339158245, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.08996582, "step": 15726, "time_per_iteration": 2.53018856048584 }, { "auxiliary_loss_clip": 0.06416084, "auxiliary_loss_mlp": 0.0126731, "balance_loss_clip": 0.06276709, "balance_loss_mlp": 0.01257404, "epoch": 0.9455583947091538, "flos": 23264801654400.0, "grad_norm": 1.881852572491276, "language_loss": 0.61442125, "learning_rate": 3.097721259896735e-08, "loss": 0.69125521, "num_input_tokens_seen": 339178200, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09906006, "step": 15727, "time_per_iteration": 2.5480785369873047 }, { "auxiliary_loss_clip": 0.06411611, "auxiliary_loss_mlp": 0.01267302, "balance_loss_clip": 0.06276437, "balance_loss_mlp": 0.0125866, "epoch": 0.9456185179618217, "flos": 17678327690880.0, "grad_norm": 1.6445858122020378, "language_loss": 0.81908983, "learning_rate": 3.0908969273287566e-08, "loss": 0.89587897, "num_input_tokens_seen": 339193950, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.08642578, "step": 15728, "time_per_iteration": 2.5032525062561035 }, { "auxiliary_loss_clip": 0.0631896, "auxiliary_loss_mlp": 0.01252231, "balance_loss_clip": 0.06264134, "balance_loss_mlp": 0.01251242, "epoch": 0.9456786412144897, "flos": 61433002535040.0, "grad_norm": 0.7547440922137008, "language_loss": 0.58558095, "learning_rate": 3.08408006157368e-08, "loss": 0.66129291, "num_input_tokens_seen": 339252330, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 0.0098877, "step": 15729, "time_per_iteration": 3.113093614578247 }, { "auxiliary_loss_clip": 0.0640901, "auxiliary_loss_mlp": 0.01266052, "balance_loss_clip": 0.06272425, "balance_loss_mlp": 0.01256808, "epoch": 0.9457387644671577, "flos": 18594340525440.0, "grad_norm": 2.0125630660782066, "language_loss": 0.76720601, "learning_rate": 3.077270662890052e-08, "loss": 0.84395665, "num_input_tokens_seen": 339270325, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.09240723, "step": 15730, "time_per_iteration": 2.5246968269348145 }, { "auxiliary_loss_clip": 0.0641342, "auxiliary_loss_mlp": 0.01267348, "balance_loss_clip": 0.06272958, "balance_loss_mlp": 0.01257167, "epoch": 0.9457988877198257, "flos": 21115381518720.0, "grad_norm": 1.6375022753696888, "language_loss": 0.62797391, "learning_rate": 3.070468731536047e-08, "loss": 0.70478165, "num_input_tokens_seen": 339291980, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.10180664, "step": 15731, "time_per_iteration": 2.5654144287109375 }, { "auxiliary_loss_clip": 0.0641589, "auxiliary_loss_mlp": 0.01262178, "balance_loss_clip": 0.06276026, "balance_loss_mlp": 0.0125223, "epoch": 0.9458590109724936, "flos": 26695734134400.0, "grad_norm": 1.7066215476260849, "language_loss": 0.64378387, "learning_rate": 3.063674267769589e-08, "loss": 0.7205646, "num_input_tokens_seen": 339311795, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09942627, "step": 15732, "time_per_iteration": 2.5695533752441406 }, { "auxiliary_loss_clip": 0.06424099, "auxiliary_loss_mlp": 0.01262057, "balance_loss_clip": 0.06279391, "balance_loss_mlp": 0.01252007, "epoch": 0.9459191342251616, "flos": 18667616520960.0, "grad_norm": 1.7923092409516481, "language_loss": 0.84160936, "learning_rate": 3.056887271848363e-08, "loss": 0.91847098, "num_input_tokens_seen": 339327745, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.10046387, "step": 15733, "time_per_iteration": 2.518975257873535 }, { "auxiliary_loss_clip": 0.06407083, "auxiliary_loss_mlp": 0.01264965, "balance_loss_clip": 0.0627185, "balance_loss_mlp": 0.01256024, "epoch": 0.9459792574778295, "flos": 23404226048640.0, "grad_norm": 1.460060749418407, "language_loss": 0.72218633, "learning_rate": 3.0501077440297173e-08, "loss": 0.7989068, "num_input_tokens_seen": 339346445, "router_z_loss_clip": 1.35253906, "router_z_loss_mlp": 0.0894165, "step": 15734, "time_per_iteration": 2.549071788787842 }, { "auxiliary_loss_clip": 0.06406185, "auxiliary_loss_mlp": 0.01264109, "balance_loss_clip": 0.06273083, "balance_loss_mlp": 0.01255979, "epoch": 0.9460393807304975, "flos": 24400474767360.0, "grad_norm": 1.4352425452138569, "language_loss": 0.86763024, "learning_rate": 3.043335684570692e-08, "loss": 0.94433308, "num_input_tokens_seen": 339367945, "router_z_loss_clip": 1.33203125, "router_z_loss_mlp": 0.08123779, "step": 15735, "time_per_iteration": 2.6054465770721436 }, { "auxiliary_loss_clip": 0.06415881, "auxiliary_loss_mlp": 0.01266952, "balance_loss_clip": 0.06275552, "balance_loss_mlp": 0.0125785, "epoch": 0.9460995039831654, "flos": 21944995194240.0, "grad_norm": 1.8648919317163948, "language_loss": 0.67204404, "learning_rate": 3.036571093728102e-08, "loss": 0.7488724, "num_input_tokens_seen": 339386060, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09106445, "step": 15736, "time_per_iteration": 2.537242889404297 }, { "auxiliary_loss_clip": 0.06314362, "auxiliary_loss_mlp": 0.01251123, "balance_loss_clip": 0.06259313, "balance_loss_mlp": 0.01250162, "epoch": 0.9461596272358335, "flos": 70342738081920.0, "grad_norm": 0.8544288455572201, "language_loss": 0.65368259, "learning_rate": 3.029813971758499e-08, "loss": 0.72933745, "num_input_tokens_seen": 339446695, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.00959778, "step": 15737, "time_per_iteration": 3.2480294704437256 }, { "auxiliary_loss_clip": 0.06320753, "auxiliary_loss_mlp": 0.0125232, "balance_loss_clip": 0.06265658, "balance_loss_mlp": 0.01251216, "epoch": 0.9462197504885014, "flos": 58612427994240.0, "grad_norm": 0.7867976617840937, "language_loss": 0.58769518, "learning_rate": 3.0230643189181225e-08, "loss": 0.66342592, "num_input_tokens_seen": 339510080, "router_z_loss_clip": 0.55126953, "router_z_loss_mlp": 0.01106262, "step": 15738, "time_per_iteration": 3.1777803897857666 }, { "auxiliary_loss_clip": 0.06406592, "auxiliary_loss_mlp": 0.01263143, "balance_loss_clip": 0.06272755, "balance_loss_mlp": 0.01254501, "epoch": 0.9462798737411694, "flos": 23439333709440.0, "grad_norm": 1.6892948825843195, "language_loss": 0.71564174, "learning_rate": 3.016322135462834e-08, "loss": 0.79233909, "num_input_tokens_seen": 339529335, "router_z_loss_clip": 1.33984375, "router_z_loss_mlp": 0.08642578, "step": 15739, "time_per_iteration": 3.989171266555786 }, { "auxiliary_loss_clip": 0.06416865, "auxiliary_loss_mlp": 0.01265355, "balance_loss_clip": 0.06275961, "balance_loss_mlp": 0.01255693, "epoch": 0.9463399969938374, "flos": 25053082692480.0, "grad_norm": 2.152104660238956, "language_loss": 0.64452213, "learning_rate": 3.009587421648363e-08, "loss": 0.72134435, "num_input_tokens_seen": 339548820, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09655762, "step": 15740, "time_per_iteration": 2.562711477279663 }, { "auxiliary_loss_clip": 0.0640888, "auxiliary_loss_mlp": 0.01266844, "balance_loss_clip": 0.06273253, "balance_loss_mlp": 0.0125785, "epoch": 0.9464001202465053, "flos": 24359455393920.0, "grad_norm": 1.6328775715601178, "language_loss": 0.66543758, "learning_rate": 3.0028601777301045e-08, "loss": 0.74219477, "num_input_tokens_seen": 339566775, "router_z_loss_clip": 1.35546875, "router_z_loss_mlp": 0.08990479, "step": 15741, "time_per_iteration": 2.5669753551483154 }, { "auxiliary_loss_clip": 0.06414463, "auxiliary_loss_mlp": 0.0126567, "balance_loss_clip": 0.06275892, "balance_loss_mlp": 0.01256461, "epoch": 0.9464602434991733, "flos": 17171181653760.0, "grad_norm": 1.8309598919546308, "language_loss": 0.76374316, "learning_rate": 2.9961404039630987e-08, "loss": 0.84054446, "num_input_tokens_seen": 339581905, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09204102, "step": 15742, "time_per_iteration": 2.512377977371216 }, { "auxiliary_loss_clip": 0.0641482, "auxiliary_loss_mlp": 0.01264759, "balance_loss_clip": 0.06279297, "balance_loss_mlp": 0.01255997, "epoch": 0.9465203667518413, "flos": 19944265328640.0, "grad_norm": 1.7252765593214836, "language_loss": 0.72552669, "learning_rate": 2.989428100602187e-08, "loss": 0.80232251, "num_input_tokens_seen": 339599870, "router_z_loss_clip": 1.35449219, "router_z_loss_mlp": 0.08758545, "step": 15743, "time_per_iteration": 2.5617825984954834 }, { "auxiliary_loss_clip": 0.06417571, "auxiliary_loss_mlp": 0.01267044, "balance_loss_clip": 0.06277069, "balance_loss_mlp": 0.01257096, "epoch": 0.9465804900045093, "flos": 20126470032000.0, "grad_norm": 1.6300041454335843, "language_loss": 0.79614878, "learning_rate": 2.982723267901943e-08, "loss": 0.8729949, "num_input_tokens_seen": 339620250, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.0994873, "step": 15744, "time_per_iteration": 3.986238479614258 }, { "auxiliary_loss_clip": 0.06414835, "auxiliary_loss_mlp": 0.01266883, "balance_loss_clip": 0.06274043, "balance_loss_mlp": 0.01257287, "epoch": 0.9466406132571772, "flos": 23917870776960.0, "grad_norm": 1.6142345982534143, "language_loss": 0.78504306, "learning_rate": 2.9760259061165417e-08, "loss": 0.86186022, "num_input_tokens_seen": 339639900, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09594727, "step": 15745, "time_per_iteration": 2.5589680671691895 }, { "auxiliary_loss_clip": 0.06416269, "auxiliary_loss_mlp": 0.01267587, "balance_loss_clip": 0.06274353, "balance_loss_mlp": 0.01257341, "epoch": 0.9467007365098452, "flos": 19938563251200.0, "grad_norm": 1.5783532273808125, "language_loss": 0.70495552, "learning_rate": 2.9693360155000014e-08, "loss": 0.78179407, "num_input_tokens_seen": 339658970, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.10241699, "step": 15746, "time_per_iteration": 2.542996644973755 }, { "auxiliary_loss_clip": 0.06412011, "auxiliary_loss_mlp": 0.01262724, "balance_loss_clip": 0.06274928, "balance_loss_mlp": 0.0125258, "epoch": 0.9467608597625131, "flos": 19315318982400.0, "grad_norm": 2.154538663855269, "language_loss": 0.56722432, "learning_rate": 2.962653596305964e-08, "loss": 0.64397168, "num_input_tokens_seen": 339675600, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.10144043, "step": 15747, "time_per_iteration": 2.522864580154419 }, { "auxiliary_loss_clip": 0.0632034, "auxiliary_loss_mlp": 0.01250101, "balance_loss_clip": 0.06265162, "balance_loss_mlp": 0.01249137, "epoch": 0.9468209830151811, "flos": 69650578229760.0, "grad_norm": 0.6426349613196518, "language_loss": 0.53227592, "learning_rate": 2.955978648787871e-08, "loss": 0.60798025, "num_input_tokens_seen": 339744505, "router_z_loss_clip": 0.55126953, "router_z_loss_mlp": 0.0096283, "step": 15748, "time_per_iteration": 3.3602359294891357 }, { "auxiliary_loss_clip": 0.06415124, "auxiliary_loss_mlp": 0.01265385, "balance_loss_clip": 0.06275851, "balance_loss_mlp": 0.01256051, "epoch": 0.946881106267849, "flos": 27024029631360.0, "grad_norm": 1.788622444990915, "language_loss": 0.6673854, "learning_rate": 2.9493111731988096e-08, "loss": 0.74419045, "num_input_tokens_seen": 339765810, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09344482, "step": 15749, "time_per_iteration": 2.5971505641937256 }, { "auxiliary_loss_clip": 0.0641678, "auxiliary_loss_mlp": 0.01263085, "balance_loss_clip": 0.06275539, "balance_loss_mlp": 0.01252553, "epoch": 0.9469412295205171, "flos": 20195721031680.0, "grad_norm": 1.9238861687778328, "language_loss": 0.7673353, "learning_rate": 2.942651169791621e-08, "loss": 0.84413397, "num_input_tokens_seen": 339784125, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.10528564, "step": 15750, "time_per_iteration": 2.5316295623779297 }, { "auxiliary_loss_clip": 0.06410721, "auxiliary_loss_mlp": 0.01263058, "balance_loss_clip": 0.06274901, "balance_loss_mlp": 0.01253885, "epoch": 0.947001352773185, "flos": 21331352217600.0, "grad_norm": 1.809287621499324, "language_loss": 0.67928576, "learning_rate": 2.9359986388188372e-08, "loss": 0.75602353, "num_input_tokens_seen": 339803450, "router_z_loss_clip": 1.35742188, "router_z_loss_mlp": 0.09161377, "step": 15751, "time_per_iteration": 2.5555405616760254 }, { "auxiliary_loss_clip": 0.06414908, "auxiliary_loss_mlp": 0.01265269, "balance_loss_clip": 0.06275285, "balance_loss_mlp": 0.01255851, "epoch": 0.947061476025853, "flos": 21950403782400.0, "grad_norm": 1.6456933314602595, "language_loss": 0.6575917, "learning_rate": 2.929353580532723e-08, "loss": 0.73439342, "num_input_tokens_seen": 339823215, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09417725, "step": 15752, "time_per_iteration": 2.5450472831726074 }, { "auxiliary_loss_clip": 0.06413203, "auxiliary_loss_mlp": 0.01265291, "balance_loss_clip": 0.06275722, "balance_loss_mlp": 0.01255796, "epoch": 0.947121599278521, "flos": 21400645144320.0, "grad_norm": 2.6191117902477776, "language_loss": 0.71931118, "learning_rate": 2.9227159951852764e-08, "loss": 0.79609621, "num_input_tokens_seen": 339842230, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09490967, "step": 15753, "time_per_iteration": 4.021072864532471 }, { "auxiliary_loss_clip": 0.06418034, "auxiliary_loss_mlp": 0.01266834, "balance_loss_clip": 0.06273607, "balance_loss_mlp": 0.01256087, "epoch": 0.9471817225311889, "flos": 23082387315840.0, "grad_norm": 2.106448249777443, "language_loss": 0.69875962, "learning_rate": 2.9160858830281855e-08, "loss": 0.7756083, "num_input_tokens_seen": 339861640, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10754395, "step": 15754, "time_per_iteration": 2.5629255771636963 }, { "auxiliary_loss_clip": 0.06416313, "auxiliary_loss_mlp": 0.01262307, "balance_loss_clip": 0.06273653, "balance_loss_mlp": 0.0125311, "epoch": 0.947241845783857, "flos": 11915476882560.0, "grad_norm": 2.7498122661539433, "language_loss": 0.79089475, "learning_rate": 2.9094632443129153e-08, "loss": 0.86768091, "num_input_tokens_seen": 339878210, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.09197998, "step": 15755, "time_per_iteration": 2.594055414199829 }, { "auxiliary_loss_clip": 0.06422734, "auxiliary_loss_mlp": 0.012683, "balance_loss_clip": 0.0627697, "balance_loss_mlp": 0.0125645, "epoch": 0.9473019690365249, "flos": 20746947116160.0, "grad_norm": 2.177545231640249, "language_loss": 0.75705934, "learning_rate": 2.9028480792904876e-08, "loss": 0.83396959, "num_input_tokens_seen": 339894255, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.11846924, "step": 15756, "time_per_iteration": 2.5432980060577393 }, { "auxiliary_loss_clip": 0.06419829, "auxiliary_loss_mlp": 0.01263209, "balance_loss_clip": 0.06279281, "balance_loss_mlp": 0.01253631, "epoch": 0.9473620922891929, "flos": 17645735652480.0, "grad_norm": 1.9053414041300238, "language_loss": 0.74822474, "learning_rate": 2.8962403882118347e-08, "loss": 0.82505512, "num_input_tokens_seen": 339912425, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.0958252, "step": 15757, "time_per_iteration": 2.6047987937927246 }, { "auxiliary_loss_clip": 0.06418292, "auxiliary_loss_mlp": 0.0126386, "balance_loss_clip": 0.06276387, "balance_loss_mlp": 0.0125455, "epoch": 0.9474222155418608, "flos": 23556731679360.0, "grad_norm": 1.9041057978050837, "language_loss": 0.80020809, "learning_rate": 2.889640171327512e-08, "loss": 0.87702954, "num_input_tokens_seen": 339929635, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09307861, "step": 15758, "time_per_iteration": 2.5433261394500732 }, { "auxiliary_loss_clip": 0.06413522, "auxiliary_loss_mlp": 0.01263824, "balance_loss_clip": 0.06277294, "balance_loss_mlp": 0.0125496, "epoch": 0.9474823387945288, "flos": 27097179845760.0, "grad_norm": 1.4410003727966942, "language_loss": 0.72225177, "learning_rate": 2.8830474288877638e-08, "loss": 0.79902524, "num_input_tokens_seen": 339951200, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.08868408, "step": 15759, "time_per_iteration": 4.05617618560791 }, { "auxiliary_loss_clip": 0.06406602, "auxiliary_loss_mlp": 0.01265085, "balance_loss_clip": 0.06274893, "balance_loss_mlp": 0.01257521, "epoch": 0.9475424620471967, "flos": 22973207045760.0, "grad_norm": 1.6287549446997525, "language_loss": 0.7592575, "learning_rate": 2.8764621611426344e-08, "loss": 0.83597434, "num_input_tokens_seen": 339971820, "router_z_loss_clip": 1.31640625, "router_z_loss_mlp": 0.07562256, "step": 15760, "time_per_iteration": 2.5525124073028564 }, { "auxiliary_loss_clip": 0.06413326, "auxiliary_loss_mlp": 0.01263854, "balance_loss_clip": 0.06275146, "balance_loss_mlp": 0.01254753, "epoch": 0.9476025852998647, "flos": 20053864869120.0, "grad_norm": 2.2017551432102946, "language_loss": 0.73336571, "learning_rate": 2.8698843683418128e-08, "loss": 0.81013751, "num_input_tokens_seen": 339989420, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09106445, "step": 15761, "time_per_iteration": 2.539424419403076 }, { "auxiliary_loss_clip": 0.06411467, "auxiliary_loss_mlp": 0.01263997, "balance_loss_clip": 0.06274861, "balance_loss_mlp": 0.01254758, "epoch": 0.9476627085525327, "flos": 14980700217600.0, "grad_norm": 1.8898321681679915, "language_loss": 0.71894872, "learning_rate": 2.863314050734722e-08, "loss": 0.79570335, "num_input_tokens_seen": 340006690, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09228516, "step": 15762, "time_per_iteration": 2.5028538703918457 }, { "auxiliary_loss_clip": 0.06423722, "auxiliary_loss_mlp": 0.01265367, "balance_loss_clip": 0.0627825, "balance_loss_mlp": 0.01255354, "epoch": 0.9477228318052007, "flos": 18703772357760.0, "grad_norm": 2.652693385694036, "language_loss": 0.6734134, "learning_rate": 2.856751208570518e-08, "loss": 0.75030434, "num_input_tokens_seen": 340025480, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10009766, "step": 15763, "time_per_iteration": 2.5470690727233887 }, { "auxiliary_loss_clip": 0.06417812, "auxiliary_loss_mlp": 0.01264582, "balance_loss_clip": 0.06276783, "balance_loss_mlp": 0.0125566, "epoch": 0.9477829550578686, "flos": 23881295669760.0, "grad_norm": 1.836964725430444, "language_loss": 0.70494521, "learning_rate": 2.8501958420980466e-08, "loss": 0.7817691, "num_input_tokens_seen": 340043785, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.08935547, "step": 15764, "time_per_iteration": 2.5813944339752197 }, { "auxiliary_loss_clip": 0.06406312, "auxiliary_loss_mlp": 0.0126247, "balance_loss_clip": 0.06275681, "balance_loss_mlp": 0.01254584, "epoch": 0.9478430783105366, "flos": 22569119930880.0, "grad_norm": 1.8947589385062313, "language_loss": 0.71217, "learning_rate": 2.8436479515659306e-08, "loss": 0.78885782, "num_input_tokens_seen": 340064360, "router_z_loss_clip": 1.30566406, "router_z_loss_mlp": 0.07891846, "step": 15765, "time_per_iteration": 2.567457914352417 }, { "auxiliary_loss_clip": 0.06315649, "auxiliary_loss_mlp": 0.0125125, "balance_loss_clip": 0.06260397, "balance_loss_mlp": 0.01250238, "epoch": 0.9479032015632046, "flos": 60874103802240.0, "grad_norm": 0.7850237747324104, "language_loss": 0.58836204, "learning_rate": 2.8371075372224384e-08, "loss": 0.66403109, "num_input_tokens_seen": 340114425, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01011658, "step": 15766, "time_per_iteration": 2.9317100048065186 }, { "auxiliary_loss_clip": 0.06414489, "auxiliary_loss_mlp": 0.01266443, "balance_loss_clip": 0.0627702, "balance_loss_mlp": 0.01256745, "epoch": 0.9479633248158725, "flos": 14689105608960.0, "grad_norm": 2.077475335172389, "language_loss": 0.74470884, "learning_rate": 2.8305745993155938e-08, "loss": 0.82151812, "num_input_tokens_seen": 340132200, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09692383, "step": 15767, "time_per_iteration": 2.534381151199341 }, { "auxiliary_loss_clip": 0.0642411, "auxiliary_loss_mlp": 0.01264188, "balance_loss_clip": 0.0628086, "balance_loss_mlp": 0.01254193, "epoch": 0.9480234480685406, "flos": 20339170421760.0, "grad_norm": 2.1483491370699355, "language_loss": 0.7341975, "learning_rate": 2.8240491380931096e-08, "loss": 0.81108046, "num_input_tokens_seen": 340149175, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09985352, "step": 15768, "time_per_iteration": 2.5307259559631348 }, { "auxiliary_loss_clip": 0.06318364, "auxiliary_loss_mlp": 0.01251986, "balance_loss_clip": 0.06263177, "balance_loss_mlp": 0.01250954, "epoch": 0.9480835713212085, "flos": 70314548382720.0, "grad_norm": 0.7200458664046921, "language_loss": 0.55257535, "learning_rate": 2.8175311538024326e-08, "loss": 0.62827885, "num_input_tokens_seen": 340208155, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.0103302, "step": 15769, "time_per_iteration": 3.185688018798828 }, { "auxiliary_loss_clip": 0.06415655, "auxiliary_loss_mlp": 0.01263853, "balance_loss_clip": 0.06275085, "balance_loss_mlp": 0.01255058, "epoch": 0.9481436945738765, "flos": 25457211734400.0, "grad_norm": 1.2829488937844213, "language_loss": 0.77440798, "learning_rate": 2.8110206466907428e-08, "loss": 0.85120308, "num_input_tokens_seen": 340229275, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.08792114, "step": 15770, "time_per_iteration": 2.601454973220825 }, { "auxiliary_loss_clip": 0.06417361, "auxiliary_loss_mlp": 0.0126532, "balance_loss_clip": 0.06278959, "balance_loss_mlp": 0.01255449, "epoch": 0.9482038178265444, "flos": 26987244888960.0, "grad_norm": 2.0343961720873254, "language_loss": 0.80148923, "learning_rate": 2.8045176170049313e-08, "loss": 0.87831604, "num_input_tokens_seen": 340248920, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09875488, "step": 15771, "time_per_iteration": 2.6113717555999756 }, { "auxiliary_loss_clip": 0.06409719, "auxiliary_loss_mlp": 0.01262612, "balance_loss_clip": 0.06273226, "balance_loss_mlp": 0.01253707, "epoch": 0.9482639410792124, "flos": 17791239467520.0, "grad_norm": 1.75730925400223, "language_loss": 0.70314658, "learning_rate": 2.7980220649915566e-08, "loss": 0.77986991, "num_input_tokens_seen": 340266775, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.08911133, "step": 15772, "time_per_iteration": 2.512380838394165 }, { "auxiliary_loss_clip": 0.06410424, "auxiliary_loss_mlp": 0.01261898, "balance_loss_clip": 0.06272884, "balance_loss_mlp": 0.01252874, "epoch": 0.9483240643318803, "flos": 21003098647680.0, "grad_norm": 1.5843357696140448, "language_loss": 0.74131495, "learning_rate": 2.7915339908969327e-08, "loss": 0.81803817, "num_input_tokens_seen": 340285295, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09014893, "step": 15773, "time_per_iteration": 2.5541787147521973 }, { "auxiliary_loss_clip": 0.06416215, "auxiliary_loss_mlp": 0.01265783, "balance_loss_clip": 0.06274202, "balance_loss_mlp": 0.01255453, "epoch": 0.9483841875845483, "flos": 20089349873280.0, "grad_norm": 2.3068029687067115, "language_loss": 0.63725793, "learning_rate": 2.7850533949671072e-08, "loss": 0.71407795, "num_input_tokens_seen": 340304265, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10327148, "step": 15774, "time_per_iteration": 2.5191969871520996 }, { "auxiliary_loss_clip": 0.06416009, "auxiliary_loss_mlp": 0.0126412, "balance_loss_clip": 0.06276514, "balance_loss_mlp": 0.01254577, "epoch": 0.9484443108372163, "flos": 20819929622400.0, "grad_norm": 1.7194491073672813, "language_loss": 0.59248501, "learning_rate": 2.7785802774478396e-08, "loss": 0.66928637, "num_input_tokens_seen": 340323690, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09552002, "step": 15775, "time_per_iteration": 2.53471040725708 }, { "auxiliary_loss_clip": 0.06416374, "auxiliary_loss_mlp": 0.01267176, "balance_loss_clip": 0.06276138, "balance_loss_mlp": 0.01257621, "epoch": 0.9485044340898843, "flos": 36438018249600.0, "grad_norm": 1.6131437563617044, "language_loss": 0.61865175, "learning_rate": 2.772114638584555e-08, "loss": 0.69548726, "num_input_tokens_seen": 340345830, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09552002, "step": 15776, "time_per_iteration": 2.656899929046631 }, { "auxiliary_loss_clip": 0.06417152, "auxiliary_loss_mlp": 0.01264944, "balance_loss_clip": 0.06276661, "balance_loss_mlp": 0.01255282, "epoch": 0.9485645573425522, "flos": 22609300763520.0, "grad_norm": 1.6161349287176832, "language_loss": 0.73932618, "learning_rate": 2.765656478622458e-08, "loss": 0.81614715, "num_input_tokens_seen": 340365910, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09655762, "step": 15777, "time_per_iteration": 2.551882266998291 }, { "auxiliary_loss_clip": 0.06429771, "auxiliary_loss_mlp": 0.01266667, "balance_loss_clip": 0.06279248, "balance_loss_mlp": 0.01255419, "epoch": 0.9486246805952202, "flos": 22024266756480.0, "grad_norm": 3.066729722399395, "language_loss": 0.72015005, "learning_rate": 2.759205797806441e-08, "loss": 0.79711437, "num_input_tokens_seen": 340383935, "router_z_loss_clip": 1.50585938, "router_z_loss_mlp": 0.11236572, "step": 15778, "time_per_iteration": 2.537022590637207 }, { "auxiliary_loss_clip": 0.06406263, "auxiliary_loss_mlp": 0.01266075, "balance_loss_clip": 0.06275731, "balance_loss_mlp": 0.01258034, "epoch": 0.9486848038478882, "flos": 16514297170560.0, "grad_norm": 1.96332357629848, "language_loss": 0.69835585, "learning_rate": 2.7527625963810865e-08, "loss": 0.77507925, "num_input_tokens_seen": 340402760, "router_z_loss_clip": 1.30566406, "router_z_loss_mlp": 0.0803833, "step": 15779, "time_per_iteration": 3.976665496826172 }, { "auxiliary_loss_clip": 0.06420927, "auxiliary_loss_mlp": 0.01266039, "balance_loss_clip": 0.06280917, "balance_loss_mlp": 0.01255543, "epoch": 0.9487449271005561, "flos": 19250344540800.0, "grad_norm": 2.386469605953482, "language_loss": 0.78586704, "learning_rate": 2.7463268745907542e-08, "loss": 0.8627367, "num_input_tokens_seen": 340422105, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.1048584, "step": 15780, "time_per_iteration": 2.5325980186462402 }, { "auxiliary_loss_clip": 0.0641235, "auxiliary_loss_mlp": 0.01268675, "balance_loss_clip": 0.06274334, "balance_loss_mlp": 0.0125918, "epoch": 0.9488050503532242, "flos": 21769205328000.0, "grad_norm": 1.651581062158619, "language_loss": 0.66597104, "learning_rate": 2.7398986326794494e-08, "loss": 0.74278122, "num_input_tokens_seen": 340441160, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.0949707, "step": 15781, "time_per_iteration": 2.602403163909912 }, { "auxiliary_loss_clip": 0.06410672, "auxiliary_loss_mlp": 0.01267603, "balance_loss_clip": 0.06275519, "balance_loss_mlp": 0.0125762, "epoch": 0.9488651736058921, "flos": 18374764101120.0, "grad_norm": 1.9227014062529226, "language_loss": 0.79821908, "learning_rate": 2.733477870890999e-08, "loss": 0.87500179, "num_input_tokens_seen": 340458200, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.09985352, "step": 15782, "time_per_iteration": 2.563737392425537 }, { "auxiliary_loss_clip": 0.0631557, "auxiliary_loss_mlp": 0.01251205, "balance_loss_clip": 0.06260462, "balance_loss_mlp": 0.01250216, "epoch": 0.9489252968585601, "flos": 70107130800000.0, "grad_norm": 0.712441504945573, "language_loss": 0.59882015, "learning_rate": 2.7270645894688082e-08, "loss": 0.67448783, "num_input_tokens_seen": 340526420, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.0098877, "step": 15783, "time_per_iteration": 3.2778947353363037 }, { "auxiliary_loss_clip": 0.06413476, "auxiliary_loss_mlp": 0.01263687, "balance_loss_clip": 0.06273732, "balance_loss_mlp": 0.01254061, "epoch": 0.948985420111228, "flos": 27862909182720.0, "grad_norm": 1.4618136706767257, "language_loss": 0.73662502, "learning_rate": 2.720658788656105e-08, "loss": 0.81339663, "num_input_tokens_seen": 340546325, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09631348, "step": 15784, "time_per_iteration": 3.994271755218506 }, { "auxiliary_loss_clip": 0.06420627, "auxiliary_loss_mlp": 0.01266893, "balance_loss_clip": 0.06280823, "balance_loss_mlp": 0.01257344, "epoch": 0.949045543363896, "flos": 24322880286720.0, "grad_norm": 1.8928171950083608, "language_loss": 0.70057309, "learning_rate": 2.714260468695806e-08, "loss": 0.77744824, "num_input_tokens_seen": 340565145, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09558105, "step": 15785, "time_per_iteration": 2.5739529132843018 }, { "auxiliary_loss_clip": 0.06419647, "auxiliary_loss_mlp": 0.0126699, "balance_loss_clip": 0.06276975, "balance_loss_mlp": 0.01257823, "epoch": 0.9491056666165639, "flos": 24248262625920.0, "grad_norm": 1.585509895294729, "language_loss": 0.76234043, "learning_rate": 2.707869629830495e-08, "loss": 0.83920681, "num_input_tokens_seen": 340585465, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.0916748, "step": 15786, "time_per_iteration": 2.55989670753479 }, { "auxiliary_loss_clip": 0.06411988, "auxiliary_loss_mlp": 0.01262267, "balance_loss_clip": 0.06273837, "balance_loss_mlp": 0.01253892, "epoch": 0.949165789869232, "flos": 24537509320320.0, "grad_norm": 1.927420984789548, "language_loss": 0.79165691, "learning_rate": 2.7014862723025335e-08, "loss": 0.86839944, "num_input_tokens_seen": 340606010, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.08380127, "step": 15787, "time_per_iteration": 2.567013740539551 }, { "auxiliary_loss_clip": 0.06410739, "auxiliary_loss_mlp": 0.01263688, "balance_loss_clip": 0.06275947, "balance_loss_mlp": 0.01254437, "epoch": 0.9492259131218999, "flos": 22241662974720.0, "grad_norm": 1.4999592798739645, "language_loss": 0.7671352, "learning_rate": 2.6951103963540388e-08, "loss": 0.84387946, "num_input_tokens_seen": 340626135, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.09246826, "step": 15788, "time_per_iteration": 2.5395750999450684 }, { "auxiliary_loss_clip": 0.06416676, "auxiliary_loss_mlp": 0.01270229, "balance_loss_clip": 0.06274637, "balance_loss_mlp": 0.01260084, "epoch": 0.9492860363745679, "flos": 22972955483520.0, "grad_norm": 1.769231418214219, "language_loss": 0.71609724, "learning_rate": 2.6887420022266848e-08, "loss": 0.79296625, "num_input_tokens_seen": 340644870, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.10144043, "step": 15789, "time_per_iteration": 2.5640957355499268 }, { "auxiliary_loss_clip": 0.06416938, "auxiliary_loss_mlp": 0.01266322, "balance_loss_clip": 0.06280185, "balance_loss_mlp": 0.01256529, "epoch": 0.9493461596272358, "flos": 18376357328640.0, "grad_norm": 1.7024064886822776, "language_loss": 0.73436987, "learning_rate": 2.682381090161989e-08, "loss": 0.81120253, "num_input_tokens_seen": 340663695, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09796143, "step": 15790, "time_per_iteration": 2.511518716812134 }, { "auxiliary_loss_clip": 0.06416442, "auxiliary_loss_mlp": 0.01264889, "balance_loss_clip": 0.06274538, "balance_loss_mlp": 0.01254995, "epoch": 0.9494062828799038, "flos": 20018002521600.0, "grad_norm": 1.7189460879905012, "language_loss": 0.77994728, "learning_rate": 2.6760276604012033e-08, "loss": 0.85676062, "num_input_tokens_seen": 340682970, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09893799, "step": 15791, "time_per_iteration": 2.545578718185425 }, { "auxiliary_loss_clip": 0.06426875, "auxiliary_loss_mlp": 0.0126603, "balance_loss_clip": 0.06280997, "balance_loss_mlp": 0.0125616, "epoch": 0.9494664061325718, "flos": 27234843304320.0, "grad_norm": 1.7644319761573568, "language_loss": 0.74637008, "learning_rate": 2.6696817131852234e-08, "loss": 0.82329911, "num_input_tokens_seen": 340702275, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.09875488, "step": 15792, "time_per_iteration": 2.57765531539917 }, { "auxiliary_loss_clip": 0.06415022, "auxiliary_loss_mlp": 0.01261392, "balance_loss_clip": 0.06276894, "balance_loss_mlp": 0.01252076, "epoch": 0.9495265293852397, "flos": 18375812277120.0, "grad_norm": 1.7951873007055579, "language_loss": 0.78027761, "learning_rate": 2.663343248754679e-08, "loss": 0.85704178, "num_input_tokens_seen": 340719060, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09320068, "step": 15793, "time_per_iteration": 3.989821434020996 }, { "auxiliary_loss_clip": 0.06410763, "auxiliary_loss_mlp": 0.01263302, "balance_loss_clip": 0.06272383, "balance_loss_mlp": 0.01254516, "epoch": 0.9495866526379078, "flos": 23082429242880.0, "grad_norm": 1.7586287738217234, "language_loss": 0.77644658, "learning_rate": 2.6570122673499562e-08, "loss": 0.8531872, "num_input_tokens_seen": 340737815, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.08782959, "step": 15794, "time_per_iteration": 2.5567238330841064 }, { "auxiliary_loss_clip": 0.06421478, "auxiliary_loss_mlp": 0.01265723, "balance_loss_clip": 0.06278932, "balance_loss_mlp": 0.01255501, "epoch": 0.9496467758905757, "flos": 17535632987520.0, "grad_norm": 2.0329492901210915, "language_loss": 0.61271983, "learning_rate": 2.650688769211107e-08, "loss": 0.68959183, "num_input_tokens_seen": 340756035, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.10223389, "step": 15795, "time_per_iteration": 2.5565032958984375 }, { "auxiliary_loss_clip": 0.06407765, "auxiliary_loss_mlp": 0.01264662, "balance_loss_clip": 0.0627332, "balance_loss_mlp": 0.01254904, "epoch": 0.9497068991432437, "flos": 24140759437440.0, "grad_norm": 1.6019803250503328, "language_loss": 0.79212236, "learning_rate": 2.644372754577895e-08, "loss": 0.86884665, "num_input_tokens_seen": 340775620, "router_z_loss_clip": 1.34472656, "router_z_loss_mlp": 0.09753418, "step": 15796, "time_per_iteration": 2.5691983699798584 }, { "auxiliary_loss_clip": 0.06412356, "auxiliary_loss_mlp": 0.01269042, "balance_loss_clip": 0.06272309, "balance_loss_mlp": 0.01258862, "epoch": 0.9497670223959116, "flos": 20309597130240.0, "grad_norm": 2.0699224707067962, "language_loss": 0.75627351, "learning_rate": 2.6380642236898398e-08, "loss": 0.83308756, "num_input_tokens_seen": 340794510, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.10192871, "step": 15797, "time_per_iteration": 2.558741331100464 }, { "auxiliary_loss_clip": 0.06415009, "auxiliary_loss_mlp": 0.01263663, "balance_loss_clip": 0.0627557, "balance_loss_mlp": 0.01253912, "epoch": 0.9498271456485796, "flos": 13704009482880.0, "grad_norm": 1.9271324022010112, "language_loss": 0.66361445, "learning_rate": 2.6317631767861727e-08, "loss": 0.74040121, "num_input_tokens_seen": 340812955, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09759521, "step": 15798, "time_per_iteration": 2.5105795860290527 }, { "auxiliary_loss_clip": 0.06424902, "auxiliary_loss_mlp": 0.01265279, "balance_loss_clip": 0.06280755, "balance_loss_mlp": 0.01255647, "epoch": 0.9498872689012475, "flos": 20820348892800.0, "grad_norm": 1.8303345982787524, "language_loss": 0.77707851, "learning_rate": 2.6254696141058575e-08, "loss": 0.8539803, "num_input_tokens_seen": 340829200, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.09631348, "step": 15799, "time_per_iteration": 3.859534978866577 }, { "auxiliary_loss_clip": 0.06407839, "auxiliary_loss_mlp": 0.01265983, "balance_loss_clip": 0.06271984, "balance_loss_mlp": 0.012565, "epoch": 0.9499473921539155, "flos": 21039044849280.0, "grad_norm": 1.7968240168100391, "language_loss": 0.71247303, "learning_rate": 2.6191835358874814e-08, "loss": 0.78921127, "num_input_tokens_seen": 340848035, "router_z_loss_clip": 1.35644531, "router_z_loss_mlp": 0.09484863, "step": 15800, "time_per_iteration": 2.536363124847412 }, { "auxiliary_loss_clip": 0.06413356, "auxiliary_loss_mlp": 0.01265878, "balance_loss_clip": 0.06275464, "balance_loss_mlp": 0.01256836, "epoch": 0.9500075154065835, "flos": 21005446561920.0, "grad_norm": 1.6419973004954833, "language_loss": 0.719926, "learning_rate": 2.6129049423694315e-08, "loss": 0.79671836, "num_input_tokens_seen": 340870025, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.0904541, "step": 15801, "time_per_iteration": 2.5998551845550537 }, { "auxiliary_loss_clip": 0.0641902, "auxiliary_loss_mlp": 0.01266305, "balance_loss_clip": 0.06280632, "balance_loss_mlp": 0.0125734, "epoch": 0.9500676386592515, "flos": 25129461288960.0, "grad_norm": 1.7553371936242415, "language_loss": 0.81152344, "learning_rate": 2.6066338337898508e-08, "loss": 0.88837671, "num_input_tokens_seen": 340892290, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.08959961, "step": 15802, "time_per_iteration": 2.6421103477478027 }, { "auxiliary_loss_clip": 0.06414518, "auxiliary_loss_mlp": 0.01265182, "balance_loss_clip": 0.06273776, "balance_loss_mlp": 0.01255764, "epoch": 0.9501277619119194, "flos": 27530462908800.0, "grad_norm": 1.587331344970014, "language_loss": 0.6799885, "learning_rate": 2.60037021038646e-08, "loss": 0.75678551, "num_input_tokens_seen": 340912260, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09417725, "step": 15803, "time_per_iteration": 2.5902693271636963 }, { "auxiliary_loss_clip": 0.06409851, "auxiliary_loss_mlp": 0.01266154, "balance_loss_clip": 0.06273924, "balance_loss_mlp": 0.01256379, "epoch": 0.9501878851645874, "flos": 20820306965760.0, "grad_norm": 1.5519987080906685, "language_loss": 0.76017916, "learning_rate": 2.5941140723968247e-08, "loss": 0.83693933, "num_input_tokens_seen": 340928930, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.09777832, "step": 15804, "time_per_iteration": 2.66300630569458 }, { "auxiliary_loss_clip": 0.06416732, "auxiliary_loss_mlp": 0.0126641, "balance_loss_clip": 0.06275012, "balance_loss_mlp": 0.01256599, "epoch": 0.9502480084172553, "flos": 18375309152640.0, "grad_norm": 1.6308716345122372, "language_loss": 0.7334457, "learning_rate": 2.5878654200581775e-08, "loss": 0.8102771, "num_input_tokens_seen": 340946615, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09802246, "step": 15805, "time_per_iteration": 2.620739221572876 }, { "auxiliary_loss_clip": 0.06414391, "auxiliary_loss_mlp": 0.01267744, "balance_loss_clip": 0.06276043, "balance_loss_mlp": 0.01257862, "epoch": 0.9503081316699233, "flos": 23556270481920.0, "grad_norm": 1.364530870581833, "language_loss": 0.80080551, "learning_rate": 2.5816242536074618e-08, "loss": 0.8776269, "num_input_tokens_seen": 340967545, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09881592, "step": 15806, "time_per_iteration": 2.546983480453491 }, { "auxiliary_loss_clip": 0.06417018, "auxiliary_loss_mlp": 0.01266482, "balance_loss_clip": 0.06276008, "balance_loss_mlp": 0.01257863, "epoch": 0.9503682549225914, "flos": 18046217041920.0, "grad_norm": 2.6728099469750504, "language_loss": 0.82406855, "learning_rate": 2.5753905732813108e-08, "loss": 0.90090358, "num_input_tokens_seen": 340984955, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.08618164, "step": 15807, "time_per_iteration": 2.517324924468994 }, { "auxiliary_loss_clip": 0.06411488, "auxiliary_loss_mlp": 0.01267327, "balance_loss_clip": 0.0627408, "balance_loss_mlp": 0.01258232, "epoch": 0.9504283781752593, "flos": 25893429690240.0, "grad_norm": 2.0265286189452976, "language_loss": 0.72264689, "learning_rate": 2.5691643793161355e-08, "loss": 0.79943502, "num_input_tokens_seen": 341007300, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09094238, "step": 15808, "time_per_iteration": 2.6086833477020264 }, { "auxiliary_loss_clip": 0.0640875, "auxiliary_loss_mlp": 0.01265812, "balance_loss_clip": 0.06273005, "balance_loss_mlp": 0.01256448, "epoch": 0.9504885014279273, "flos": 22130009009280.0, "grad_norm": 1.550564677709732, "language_loss": 0.69937396, "learning_rate": 2.562945671948058e-08, "loss": 0.77611965, "num_input_tokens_seen": 341026695, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.09362793, "step": 15809, "time_per_iteration": 2.5634806156158447 }, { "auxiliary_loss_clip": 0.0640887, "auxiliary_loss_mlp": 0.01263435, "balance_loss_clip": 0.06271043, "balance_loss_mlp": 0.01254328, "epoch": 0.9505486246805952, "flos": 21622317920640.0, "grad_norm": 1.5659147103735775, "language_loss": 0.75990093, "learning_rate": 2.5567344514128452e-08, "loss": 0.83662403, "num_input_tokens_seen": 341047080, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09106445, "step": 15810, "time_per_iteration": 2.5620572566986084 }, { "auxiliary_loss_clip": 0.06413327, "auxiliary_loss_mlp": 0.01268934, "balance_loss_clip": 0.06274439, "balance_loss_mlp": 0.01259141, "epoch": 0.9506087479332632, "flos": 22534766956800.0, "grad_norm": 1.3224449565616812, "language_loss": 0.80229568, "learning_rate": 2.5505307179460643e-08, "loss": 0.87911832, "num_input_tokens_seen": 341067310, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09796143, "step": 15811, "time_per_iteration": 2.558178424835205 }, { "auxiliary_loss_clip": 0.064131, "auxiliary_loss_mlp": 0.01264909, "balance_loss_clip": 0.06273519, "balance_loss_mlp": 0.01255229, "epoch": 0.9506688711859311, "flos": 27534823320960.0, "grad_norm": 2.306601283153931, "language_loss": 0.69961691, "learning_rate": 2.5443344717829495e-08, "loss": 0.77639699, "num_input_tokens_seen": 341085110, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09686279, "step": 15812, "time_per_iteration": 2.5662851333618164 }, { "auxiliary_loss_clip": 0.06419294, "auxiliary_loss_mlp": 0.01262546, "balance_loss_clip": 0.06277643, "balance_loss_mlp": 0.01253063, "epoch": 0.9507289944385992, "flos": 19872037509120.0, "grad_norm": 1.482101651051224, "language_loss": 0.65490651, "learning_rate": 2.538145713158446e-08, "loss": 0.73172486, "num_input_tokens_seen": 341103190, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09472656, "step": 15813, "time_per_iteration": 2.546189308166504 }, { "auxiliary_loss_clip": 0.06414619, "auxiliary_loss_mlp": 0.01264893, "balance_loss_clip": 0.06274743, "balance_loss_mlp": 0.01255243, "epoch": 0.9507891176912671, "flos": 25200515151360.0, "grad_norm": 1.3267139222976356, "language_loss": 0.70453113, "learning_rate": 2.5319644423072327e-08, "loss": 0.78132629, "num_input_tokens_seen": 341125695, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09643555, "step": 15814, "time_per_iteration": 2.5904719829559326 }, { "auxiliary_loss_clip": 0.0640952, "auxiliary_loss_mlp": 0.01262072, "balance_loss_clip": 0.06273116, "balance_loss_mlp": 0.01253537, "epoch": 0.9508492409439351, "flos": 24906446847360.0, "grad_norm": 2.2732950353942787, "language_loss": 0.63124633, "learning_rate": 2.5257906594637445e-08, "loss": 0.70796221, "num_input_tokens_seen": 341143930, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.08532715, "step": 15815, "time_per_iteration": 2.5632717609405518 }, { "auxiliary_loss_clip": 0.06414872, "auxiliary_loss_mlp": 0.0126386, "balance_loss_clip": 0.06276944, "balance_loss_mlp": 0.01254908, "epoch": 0.950909364196603, "flos": 29791033885440.0, "grad_norm": 1.7317138395531113, "language_loss": 0.58933282, "learning_rate": 2.519624364862061e-08, "loss": 0.66612017, "num_input_tokens_seen": 341164280, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.08947754, "step": 15816, "time_per_iteration": 2.615973949432373 }, { "auxiliary_loss_clip": 0.06411705, "auxiliary_loss_mlp": 0.01263744, "balance_loss_clip": 0.06273621, "balance_loss_mlp": 0.01254672, "epoch": 0.950969487449271, "flos": 24724745268480.0, "grad_norm": 1.4836871789637798, "language_loss": 0.7373538, "learning_rate": 2.513465558735994e-08, "loss": 0.81410825, "num_input_tokens_seen": 341183670, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09075928, "step": 15817, "time_per_iteration": 2.5798890590667725 }, { "auxiliary_loss_clip": 0.06417333, "auxiliary_loss_mlp": 0.01264432, "balance_loss_clip": 0.06274273, "balance_loss_mlp": 0.01253661, "epoch": 0.9510296107019389, "flos": 13704302972160.0, "grad_norm": 1.614772104314845, "language_loss": 0.60249734, "learning_rate": 2.5073142413190918e-08, "loss": 0.67931497, "num_input_tokens_seen": 341201900, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.10772705, "step": 15818, "time_per_iteration": 2.514091968536377 }, { "auxiliary_loss_clip": 0.0641278, "auxiliary_loss_mlp": 0.01264069, "balance_loss_clip": 0.06274948, "balance_loss_mlp": 0.01254651, "epoch": 0.9510897339546069, "flos": 17317691717760.0, "grad_norm": 1.928034197965045, "language_loss": 0.69725204, "learning_rate": 2.5011704128446552e-08, "loss": 0.77402049, "num_input_tokens_seen": 341218340, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09423828, "step": 15819, "time_per_iteration": 3.9654743671417236 }, { "auxiliary_loss_clip": 0.0642107, "auxiliary_loss_mlp": 0.01262772, "balance_loss_clip": 0.0628064, "balance_loss_mlp": 0.01253754, "epoch": 0.951149857207275, "flos": 14799292128000.0, "grad_norm": 1.7683620675179488, "language_loss": 0.74374938, "learning_rate": 2.49503407354561e-08, "loss": 0.82058787, "num_input_tokens_seen": 341235885, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09020996, "step": 15820, "time_per_iteration": 2.564516067504883 }, { "auxiliary_loss_clip": 0.06417677, "auxiliary_loss_mlp": 0.01263744, "balance_loss_clip": 0.06275146, "balance_loss_mlp": 0.01253749, "epoch": 0.9512099804599429, "flos": 19397273875200.0, "grad_norm": 3.1573267416635655, "language_loss": 0.78801513, "learning_rate": 2.4889052236546804e-08, "loss": 0.8648293, "num_input_tokens_seen": 341255280, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10003662, "step": 15821, "time_per_iteration": 2.5452613830566406 }, { "auxiliary_loss_clip": 0.06413423, "auxiliary_loss_mlp": 0.01263899, "balance_loss_clip": 0.06275365, "balance_loss_mlp": 0.01253921, "epoch": 0.9512701037126109, "flos": 36766816871040.0, "grad_norm": 1.5231088986358023, "language_loss": 0.7150718, "learning_rate": 2.4827838634042586e-08, "loss": 0.79184502, "num_input_tokens_seen": 341279055, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09979248, "step": 15822, "time_per_iteration": 2.6607718467712402 }, { "auxiliary_loss_clip": 0.06411523, "auxiliary_loss_mlp": 0.01265455, "balance_loss_clip": 0.06274369, "balance_loss_mlp": 0.0125661, "epoch": 0.9513302269652788, "flos": 22644911548800.0, "grad_norm": 1.5392623423732184, "language_loss": 0.66152465, "learning_rate": 2.47666999302647e-08, "loss": 0.73829442, "num_input_tokens_seen": 341298560, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.08850098, "step": 15823, "time_per_iteration": 4.045542001724243 }, { "auxiliary_loss_clip": 0.06410228, "auxiliary_loss_mlp": 0.01263417, "balance_loss_clip": 0.06273448, "balance_loss_mlp": 0.01254005, "epoch": 0.9513903502179468, "flos": 22899847196160.0, "grad_norm": 1.7622411501725495, "language_loss": 0.77580214, "learning_rate": 2.4705636127531292e-08, "loss": 0.85253865, "num_input_tokens_seen": 341316650, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.09411621, "step": 15824, "time_per_iteration": 2.6030144691467285 }, { "auxiliary_loss_clip": 0.06416087, "auxiliary_loss_mlp": 0.01263393, "balance_loss_clip": 0.06273183, "balance_loss_mlp": 0.01253093, "epoch": 0.9514504734706147, "flos": 27936143251200.0, "grad_norm": 2.010819933742803, "language_loss": 0.74268758, "learning_rate": 2.4644647228158065e-08, "loss": 0.81948233, "num_input_tokens_seen": 341336185, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.10302734, "step": 15825, "time_per_iteration": 2.6572723388671875 }, { "auxiliary_loss_clip": 0.06317191, "auxiliary_loss_mlp": 0.01252475, "balance_loss_clip": 0.06261976, "balance_loss_mlp": 0.01251461, "epoch": 0.9515105967232828, "flos": 67386485381760.0, "grad_norm": 0.8038994537118657, "language_loss": 0.53139305, "learning_rate": 2.458373323445806e-08, "loss": 0.60708964, "num_input_tokens_seen": 341395795, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01013947, "step": 15826, "time_per_iteration": 3.1783816814422607 }, { "auxiliary_loss_clip": 0.06415353, "auxiliary_loss_mlp": 0.0126622, "balance_loss_clip": 0.06275248, "balance_loss_mlp": 0.01256314, "epoch": 0.9515707199759507, "flos": 25853290784640.0, "grad_norm": 1.8715008369497799, "language_loss": 0.72636938, "learning_rate": 2.452289414874076e-08, "loss": 0.80318511, "num_input_tokens_seen": 341415675, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09899902, "step": 15827, "time_per_iteration": 2.6827759742736816 }, { "auxiliary_loss_clip": 0.0642072, "auxiliary_loss_mlp": 0.01266643, "balance_loss_clip": 0.06281354, "balance_loss_mlp": 0.01256547, "epoch": 0.9516308432286187, "flos": 21834389404800.0, "grad_norm": 1.803367828611818, "language_loss": 0.74828237, "learning_rate": 2.4462129973313207e-08, "loss": 0.82515603, "num_input_tokens_seen": 341432990, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.10095215, "step": 15828, "time_per_iteration": 2.6726136207580566 }, { "auxiliary_loss_clip": 0.06414217, "auxiliary_loss_mlp": 0.01262896, "balance_loss_clip": 0.06278041, "balance_loss_mlp": 0.01254116, "epoch": 0.9516909664812866, "flos": 27276617364480.0, "grad_norm": 2.1931015679596912, "language_loss": 0.73388755, "learning_rate": 2.440144071047978e-08, "loss": 0.81065869, "num_input_tokens_seen": 341454100, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.08776855, "step": 15829, "time_per_iteration": 2.5839569568634033 }, { "auxiliary_loss_clip": 0.06417, "auxiliary_loss_mlp": 0.01266092, "balance_loss_clip": 0.06277122, "balance_loss_mlp": 0.01257121, "epoch": 0.9517510897339546, "flos": 21221752677120.0, "grad_norm": 2.1520876448943684, "language_loss": 0.61647594, "learning_rate": 2.4340826362541533e-08, "loss": 0.6933068, "num_input_tokens_seen": 341472955, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.08972168, "step": 15830, "time_per_iteration": 2.555689811706543 }, { "auxiliary_loss_clip": 0.06421671, "auxiliary_loss_mlp": 0.01269237, "balance_loss_clip": 0.06280132, "balance_loss_mlp": 0.01258907, "epoch": 0.9518112129866225, "flos": 18739928194560.0, "grad_norm": 1.986445615617076, "language_loss": 0.73747325, "learning_rate": 2.428028693179729e-08, "loss": 0.81438231, "num_input_tokens_seen": 341490165, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.10327148, "step": 15831, "time_per_iteration": 2.5176496505737305 }, { "auxiliary_loss_clip": 0.06412062, "auxiliary_loss_mlp": 0.01262224, "balance_loss_clip": 0.06275181, "balance_loss_mlp": 0.01253075, "epoch": 0.9518713362392905, "flos": 16769274744960.0, "grad_norm": 1.667692583225708, "language_loss": 0.65820169, "learning_rate": 2.4219822420542545e-08, "loss": 0.73494458, "num_input_tokens_seen": 341508055, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.0914917, "step": 15832, "time_per_iteration": 3.9586098194122314 }, { "auxiliary_loss_clip": 0.06412704, "auxiliary_loss_mlp": 0.01267266, "balance_loss_clip": 0.06279269, "balance_loss_mlp": 0.01257479, "epoch": 0.9519314594919586, "flos": 15235887427200.0, "grad_norm": 1.6803119711430625, "language_loss": 0.7828514, "learning_rate": 2.4159432831070135e-08, "loss": 0.85965109, "num_input_tokens_seen": 341526155, "router_z_loss_clip": 1.33496094, "router_z_loss_mlp": 0.09790039, "step": 15833, "time_per_iteration": 2.4900765419006348 }, { "auxiliary_loss_clip": 0.06411719, "auxiliary_loss_mlp": 0.01261383, "balance_loss_clip": 0.0627584, "balance_loss_mlp": 0.01251995, "epoch": 0.9519915827446265, "flos": 19358770124160.0, "grad_norm": 2.5708481958597114, "language_loss": 0.75251681, "learning_rate": 2.4099118165670007e-08, "loss": 0.82924783, "num_input_tokens_seen": 341540450, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.09393311, "step": 15834, "time_per_iteration": 2.515889883041382 }, { "auxiliary_loss_clip": 0.06423073, "auxiliary_loss_mlp": 0.01267531, "balance_loss_clip": 0.06278674, "balance_loss_mlp": 0.01257648, "epoch": 0.9520517059972945, "flos": 22271697463680.0, "grad_norm": 2.168505795330749, "language_loss": 0.76684928, "learning_rate": 2.4038878426629216e-08, "loss": 0.84375525, "num_input_tokens_seen": 341557865, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.09887695, "step": 15835, "time_per_iteration": 2.5653645992279053 }, { "auxiliary_loss_clip": 0.0641633, "auxiliary_loss_mlp": 0.01265626, "balance_loss_clip": 0.0627712, "balance_loss_mlp": 0.01256268, "epoch": 0.9521118292499624, "flos": 14866907973120.0, "grad_norm": 2.45668241865509, "language_loss": 0.66511357, "learning_rate": 2.397871361623238e-08, "loss": 0.74193317, "num_input_tokens_seen": 341573890, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09356689, "step": 15836, "time_per_iteration": 2.5068359375 }, { "auxiliary_loss_clip": 0.06410223, "auxiliary_loss_mlp": 0.01264096, "balance_loss_clip": 0.062746, "balance_loss_mlp": 0.01254655, "epoch": 0.9521719525026304, "flos": 23514747984000.0, "grad_norm": 1.5713191276685816, "language_loss": 0.70392358, "learning_rate": 2.391862373676057e-08, "loss": 0.78066671, "num_input_tokens_seen": 341593770, "router_z_loss_clip": 1.35644531, "router_z_loss_mlp": 0.09442139, "step": 15837, "time_per_iteration": 2.5611417293548584 }, { "auxiliary_loss_clip": 0.06418259, "auxiliary_loss_mlp": 0.01264318, "balance_loss_clip": 0.0627726, "balance_loss_mlp": 0.01254359, "epoch": 0.9522320757552983, "flos": 19720328492160.0, "grad_norm": 1.7323432863942878, "language_loss": 0.7388258, "learning_rate": 2.3858608790492617e-08, "loss": 0.81565154, "num_input_tokens_seen": 341612065, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09954834, "step": 15838, "time_per_iteration": 3.986755609512329 }, { "auxiliary_loss_clip": 0.06411436, "auxiliary_loss_mlp": 0.01265668, "balance_loss_clip": 0.0627258, "balance_loss_mlp": 0.01256066, "epoch": 0.9522921990079664, "flos": 25928369642880.0, "grad_norm": 1.8812486008228677, "language_loss": 0.78553188, "learning_rate": 2.379866877970449e-08, "loss": 0.86230296, "num_input_tokens_seen": 341631365, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.0960083, "step": 15839, "time_per_iteration": 2.5962796211242676 }, { "auxiliary_loss_clip": 0.0641779, "auxiliary_loss_mlp": 0.01267007, "balance_loss_clip": 0.06275974, "balance_loss_mlp": 0.01257971, "epoch": 0.9523523222606343, "flos": 19214104849920.0, "grad_norm": 1.4795578085865217, "language_loss": 0.8068012, "learning_rate": 2.3738803706668585e-08, "loss": 0.88364917, "num_input_tokens_seen": 341650300, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.0904541, "step": 15840, "time_per_iteration": 2.588258981704712 }, { "auxiliary_loss_clip": 0.06410076, "auxiliary_loss_mlp": 0.01262155, "balance_loss_clip": 0.06275873, "balance_loss_mlp": 0.01254317, "epoch": 0.9524124455133023, "flos": 20927265102720.0, "grad_norm": 1.903586723104935, "language_loss": 0.73121917, "learning_rate": 2.3679013573655314e-08, "loss": 0.8079415, "num_input_tokens_seen": 341667680, "router_z_loss_clip": 1.34179688, "router_z_loss_mlp": 0.07843018, "step": 15841, "time_per_iteration": 2.5568833351135254 }, { "auxiliary_loss_clip": 0.06407325, "auxiliary_loss_mlp": 0.01264702, "balance_loss_clip": 0.06274303, "balance_loss_mlp": 0.01256405, "epoch": 0.9524725687659702, "flos": 18849527735040.0, "grad_norm": 1.8438634080412328, "language_loss": 0.79136288, "learning_rate": 2.3619298382931972e-08, "loss": 0.86808324, "num_input_tokens_seen": 341685760, "router_z_loss_clip": 1.32910156, "router_z_loss_mlp": 0.08294678, "step": 15842, "time_per_iteration": 2.5501554012298584 }, { "auxiliary_loss_clip": 0.06417458, "auxiliary_loss_mlp": 0.01266087, "balance_loss_clip": 0.0627903, "balance_loss_mlp": 0.01256484, "epoch": 0.9525326920186382, "flos": 22681318947840.0, "grad_norm": 2.405613611562388, "language_loss": 0.72565788, "learning_rate": 2.3559658136762973e-08, "loss": 0.80249333, "num_input_tokens_seen": 341705300, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.0960083, "step": 15843, "time_per_iteration": 2.576489210128784 }, { "auxiliary_loss_clip": 0.06419691, "auxiliary_loss_mlp": 0.0126574, "balance_loss_clip": 0.06279401, "balance_loss_mlp": 0.01255673, "epoch": 0.9525928152713061, "flos": 22092469580160.0, "grad_norm": 1.5850270342591022, "language_loss": 0.78246242, "learning_rate": 2.3500092837409612e-08, "loss": 0.85931671, "num_input_tokens_seen": 341724565, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.10070801, "step": 15844, "time_per_iteration": 2.566333532333374 }, { "auxiliary_loss_clip": 0.06419183, "auxiliary_loss_mlp": 0.01265413, "balance_loss_clip": 0.06274464, "balance_loss_mlp": 0.01254189, "epoch": 0.9526529385239741, "flos": 20711084768640.0, "grad_norm": 1.777254842785675, "language_loss": 0.70120263, "learning_rate": 2.3440602487130977e-08, "loss": 0.77804863, "num_input_tokens_seen": 341743605, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.11236572, "step": 15845, "time_per_iteration": 2.5288448333740234 }, { "auxiliary_loss_clip": 0.06416465, "auxiliary_loss_mlp": 0.01265763, "balance_loss_clip": 0.06275205, "balance_loss_mlp": 0.01255875, "epoch": 0.9527130617766422, "flos": 23374820465280.0, "grad_norm": 1.5221110049020372, "language_loss": 0.75744516, "learning_rate": 2.338118708818282e-08, "loss": 0.83426744, "num_input_tokens_seen": 341763475, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09893799, "step": 15846, "time_per_iteration": 2.5720884799957275 }, { "auxiliary_loss_clip": 0.06411912, "auxiliary_loss_mlp": 0.01264512, "balance_loss_clip": 0.06271474, "balance_loss_mlp": 0.01255542, "epoch": 0.9527731850293101, "flos": 18991341970560.0, "grad_norm": 1.6421808691162818, "language_loss": 0.78270555, "learning_rate": 2.3321846642817998e-08, "loss": 0.85946977, "num_input_tokens_seen": 341781265, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.08966064, "step": 15847, "time_per_iteration": 2.521392822265625 }, { "auxiliary_loss_clip": 0.06407474, "auxiliary_loss_mlp": 0.0126416, "balance_loss_clip": 0.06270979, "balance_loss_mlp": 0.01255285, "epoch": 0.9528333082819781, "flos": 19324123660800.0, "grad_norm": 2.0458090808122797, "language_loss": 0.78079242, "learning_rate": 2.326258115328672e-08, "loss": 0.85750878, "num_input_tokens_seen": 341798825, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.08874512, "step": 15848, "time_per_iteration": 2.5693907737731934 }, { "auxiliary_loss_clip": 0.0642537, "auxiliary_loss_mlp": 0.01265536, "balance_loss_clip": 0.06281173, "balance_loss_mlp": 0.01255708, "epoch": 0.952893431534646, "flos": 23958135463680.0, "grad_norm": 5.748759814144795, "language_loss": 0.72494757, "learning_rate": 2.320339062183674e-08, "loss": 0.80185664, "num_input_tokens_seen": 341819480, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.09832764, "step": 15849, "time_per_iteration": 2.641702651977539 }, { "auxiliary_loss_clip": 0.06423104, "auxiliary_loss_mlp": 0.01266299, "balance_loss_clip": 0.06278522, "balance_loss_mlp": 0.01255838, "epoch": 0.952953554787314, "flos": 21036529226880.0, "grad_norm": 2.017382062535451, "language_loss": 0.75692558, "learning_rate": 2.314427505071226e-08, "loss": 0.83381963, "num_input_tokens_seen": 341838035, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10455322, "step": 15850, "time_per_iteration": 2.6488444805145264 }, { "auxiliary_loss_clip": 0.06414776, "auxiliary_loss_mlp": 0.0126491, "balance_loss_clip": 0.06275757, "balance_loss_mlp": 0.01256082, "epoch": 0.9530136780399819, "flos": 22389472776960.0, "grad_norm": 2.2418100700327583, "language_loss": 0.72794116, "learning_rate": 2.308523444215482e-08, "loss": 0.80473804, "num_input_tokens_seen": 341855895, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.08825684, "step": 15851, "time_per_iteration": 2.6324527263641357 }, { "auxiliary_loss_clip": 0.06415387, "auxiliary_loss_mlp": 0.01265477, "balance_loss_clip": 0.06278372, "balance_loss_mlp": 0.01256716, "epoch": 0.95307380129265, "flos": 22165452086400.0, "grad_norm": 2.321347380301437, "language_loss": 0.7969479, "learning_rate": 2.3026268798403525e-08, "loss": 0.87375653, "num_input_tokens_seen": 341875240, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.08770752, "step": 15852, "time_per_iteration": 2.5576024055480957 }, { "auxiliary_loss_clip": 0.06412369, "auxiliary_loss_mlp": 0.01265259, "balance_loss_clip": 0.06272723, "balance_loss_mlp": 0.01255889, "epoch": 0.9531339245453179, "flos": 44033607486720.0, "grad_norm": 1.5427074786940835, "language_loss": 0.60279936, "learning_rate": 2.2967378121694138e-08, "loss": 0.67957556, "num_input_tokens_seen": 341901020, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09368896, "step": 15853, "time_per_iteration": 2.8349225521087646 }, { "auxiliary_loss_clip": 0.0640946, "auxiliary_loss_mlp": 0.01264401, "balance_loss_clip": 0.06275545, "balance_loss_mlp": 0.01255448, "epoch": 0.9531940477979859, "flos": 20272938168960.0, "grad_norm": 1.6697484487013436, "language_loss": 0.72567189, "learning_rate": 2.290856241425998e-08, "loss": 0.80241048, "num_input_tokens_seen": 341919365, "router_z_loss_clip": 1.33984375, "router_z_loss_mlp": 0.08953857, "step": 15854, "time_per_iteration": 2.5198421478271484 }, { "auxiliary_loss_clip": 0.0641629, "auxiliary_loss_mlp": 0.01263877, "balance_loss_clip": 0.06275776, "balance_loss_mlp": 0.01254561, "epoch": 0.9532541710506538, "flos": 25342413240960.0, "grad_norm": 1.899961016947268, "language_loss": 0.67817283, "learning_rate": 2.284982167833127e-08, "loss": 0.75497448, "num_input_tokens_seen": 341939985, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09313965, "step": 15855, "time_per_iteration": 2.5753884315490723 }, { "auxiliary_loss_clip": 0.06411512, "auxiliary_loss_mlp": 0.01268343, "balance_loss_clip": 0.06272811, "balance_loss_mlp": 0.01259003, "epoch": 0.9533142943033218, "flos": 26476576980480.0, "grad_norm": 1.6716858039619475, "language_loss": 0.76813084, "learning_rate": 2.279115591613556e-08, "loss": 0.84492946, "num_input_tokens_seen": 341959255, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09350586, "step": 15856, "time_per_iteration": 2.5734035968780518 }, { "auxiliary_loss_clip": 0.06414405, "auxiliary_loss_mlp": 0.01265047, "balance_loss_clip": 0.06276453, "balance_loss_mlp": 0.0125566, "epoch": 0.9533744175559897, "flos": 23663270545920.0, "grad_norm": 1.7824577381864952, "language_loss": 0.77991271, "learning_rate": 2.2732565129897075e-08, "loss": 0.85670727, "num_input_tokens_seen": 341977205, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09393311, "step": 15857, "time_per_iteration": 2.549622058868408 }, { "auxiliary_loss_clip": 0.06316882, "auxiliary_loss_mlp": 0.01251051, "balance_loss_clip": 0.06261718, "balance_loss_mlp": 0.01249923, "epoch": 0.9534345408086577, "flos": 61070270209920.0, "grad_norm": 0.701983286909325, "language_loss": 0.62500334, "learning_rate": 2.267404932183803e-08, "loss": 0.7006827, "num_input_tokens_seen": 342038545, "router_z_loss_clip": 0.55175781, "router_z_loss_mlp": 0.01129913, "step": 15858, "time_per_iteration": 4.564266681671143 }, { "auxiliary_loss_clip": 0.06408811, "auxiliary_loss_mlp": 0.01263053, "balance_loss_clip": 0.06271581, "balance_loss_mlp": 0.01254267, "epoch": 0.9534946640613258, "flos": 18957450193920.0, "grad_norm": 1.5163774779867913, "language_loss": 0.57507944, "learning_rate": 2.2615608494177097e-08, "loss": 0.65179807, "num_input_tokens_seen": 342058195, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.08776855, "step": 15859, "time_per_iteration": 2.524503707885742 }, { "auxiliary_loss_clip": 0.06406306, "auxiliary_loss_mlp": 0.01262833, "balance_loss_clip": 0.06273191, "balance_loss_mlp": 0.01254494, "epoch": 0.9535547873139937, "flos": 16659884839680.0, "grad_norm": 1.8843878842962736, "language_loss": 0.8164134, "learning_rate": 2.2557242649130504e-08, "loss": 0.89310479, "num_input_tokens_seen": 342075025, "router_z_loss_clip": 1.33007812, "router_z_loss_mlp": 0.08343506, "step": 15860, "time_per_iteration": 2.511881113052368 }, { "auxiliary_loss_clip": 0.0641771, "auxiliary_loss_mlp": 0.01262847, "balance_loss_clip": 0.06277865, "balance_loss_mlp": 0.01253537, "epoch": 0.9536149105666617, "flos": 20674048464000.0, "grad_norm": 4.78519458166564, "language_loss": 0.67015767, "learning_rate": 2.249895178891159e-08, "loss": 0.74696326, "num_input_tokens_seen": 342094595, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09307861, "step": 15861, "time_per_iteration": 2.6690990924835205 }, { "auxiliary_loss_clip": 0.0641734, "auxiliary_loss_mlp": 0.01266919, "balance_loss_clip": 0.06276618, "balance_loss_mlp": 0.01256923, "epoch": 0.9536750338193296, "flos": 30708304531200.0, "grad_norm": 1.5887990532438172, "language_loss": 0.65698159, "learning_rate": 2.244073591573037e-08, "loss": 0.73382419, "num_input_tokens_seen": 342115970, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09991455, "step": 15862, "time_per_iteration": 2.6123175621032715 }, { "auxiliary_loss_clip": 0.0641306, "auxiliary_loss_mlp": 0.01268512, "balance_loss_clip": 0.06280038, "balance_loss_mlp": 0.01259691, "epoch": 0.9537351570719976, "flos": 20410559700480.0, "grad_norm": 1.589146454945014, "language_loss": 0.6841346, "learning_rate": 2.238259503179485e-08, "loss": 0.76095033, "num_input_tokens_seen": 342134080, "router_z_loss_clip": 1.33007812, "router_z_loss_mlp": 0.0881958, "step": 15863, "time_per_iteration": 3.9836535453796387 }, { "auxiliary_loss_clip": 0.0641197, "auxiliary_loss_mlp": 0.0126721, "balance_loss_clip": 0.06273659, "balance_loss_mlp": 0.01258174, "epoch": 0.9537952803246655, "flos": 29936076503040.0, "grad_norm": 1.9718414566760685, "language_loss": 0.78522629, "learning_rate": 2.2324529139309267e-08, "loss": 0.86201811, "num_input_tokens_seen": 342154725, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09033203, "step": 15864, "time_per_iteration": 2.6286468505859375 }, { "auxiliary_loss_clip": 0.06414528, "auxiliary_loss_mlp": 0.01265994, "balance_loss_clip": 0.06277222, "balance_loss_mlp": 0.01257393, "epoch": 0.9538554035773336, "flos": 20527580327040.0, "grad_norm": 3.263440877533761, "language_loss": 0.60335451, "learning_rate": 2.226653824047586e-08, "loss": 0.68015969, "num_input_tokens_seen": 342172275, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.08599854, "step": 15865, "time_per_iteration": 2.5246095657348633 }, { "auxiliary_loss_clip": 0.06413968, "auxiliary_loss_mlp": 0.01266673, "balance_loss_clip": 0.06274657, "balance_loss_mlp": 0.01257517, "epoch": 0.9539155268300015, "flos": 18412555092480.0, "grad_norm": 1.7776464115429622, "language_loss": 0.70003659, "learning_rate": 2.2208622337493765e-08, "loss": 0.77684301, "num_input_tokens_seen": 342190880, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.0914917, "step": 15866, "time_per_iteration": 2.5271804332733154 }, { "auxiliary_loss_clip": 0.06413933, "auxiliary_loss_mlp": 0.01269524, "balance_loss_clip": 0.06275654, "balance_loss_mlp": 0.01259832, "epoch": 0.9539756500826695, "flos": 26220425448960.0, "grad_norm": 2.8985125502364912, "language_loss": 0.85099643, "learning_rate": 2.215078143255855e-08, "loss": 0.92783105, "num_input_tokens_seen": 342208165, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09692383, "step": 15867, "time_per_iteration": 2.5695247650146484 }, { "auxiliary_loss_clip": 0.06315457, "auxiliary_loss_mlp": 0.01251603, "balance_loss_clip": 0.06260225, "balance_loss_mlp": 0.01250553, "epoch": 0.9540357733353374, "flos": 68310673989120.0, "grad_norm": 0.7438254767549964, "language_loss": 0.61886066, "learning_rate": 2.2093015527864024e-08, "loss": 0.69453126, "num_input_tokens_seen": 342277110, "router_z_loss_clip": 0.55419922, "router_z_loss_mlp": 0.01050568, "step": 15868, "time_per_iteration": 3.2360503673553467 }, { "auxiliary_loss_clip": 0.06414121, "auxiliary_loss_mlp": 0.01264987, "balance_loss_clip": 0.06276395, "balance_loss_mlp": 0.01255737, "epoch": 0.9540958965880054, "flos": 21294693256320.0, "grad_norm": 2.0290762254788084, "language_loss": 0.59906065, "learning_rate": 2.2035324625600425e-08, "loss": 0.6758517, "num_input_tokens_seen": 342294695, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.0925293, "step": 15869, "time_per_iteration": 2.547361135482788 }, { "auxiliary_loss_clip": 0.06415021, "auxiliary_loss_mlp": 0.01265192, "balance_loss_clip": 0.06276793, "balance_loss_mlp": 0.01256967, "epoch": 0.9541560198406733, "flos": 19756819745280.0, "grad_norm": 1.6605106086588386, "language_loss": 0.71167445, "learning_rate": 2.197770872795579e-08, "loss": 0.78847659, "num_input_tokens_seen": 342314970, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.08227539, "step": 15870, "time_per_iteration": 2.630966901779175 }, { "auxiliary_loss_clip": 0.06410969, "auxiliary_loss_mlp": 0.01263881, "balance_loss_clip": 0.06272799, "balance_loss_mlp": 0.01254273, "epoch": 0.9542161430933414, "flos": 24722229646080.0, "grad_norm": 1.7573140991590428, "language_loss": 0.76963222, "learning_rate": 2.1920167837114368e-08, "loss": 0.84638077, "num_input_tokens_seen": 342334255, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09613037, "step": 15871, "time_per_iteration": 2.7002432346343994 }, { "auxiliary_loss_clip": 0.06417348, "auxiliary_loss_mlp": 0.01267871, "balance_loss_clip": 0.06278667, "balance_loss_mlp": 0.01257679, "epoch": 0.9542762663460094, "flos": 31073762113920.0, "grad_norm": 1.8136148837313955, "language_loss": 0.58966905, "learning_rate": 2.1862701955258634e-08, "loss": 0.66652119, "num_input_tokens_seen": 342354730, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.10180664, "step": 15872, "time_per_iteration": 4.145113945007324 }, { "auxiliary_loss_clip": 0.06416281, "auxiliary_loss_mlp": 0.0126614, "balance_loss_clip": 0.06274521, "balance_loss_mlp": 0.012564, "epoch": 0.9543363895986773, "flos": 20782935244800.0, "grad_norm": 1.460096004786823, "language_loss": 0.74819976, "learning_rate": 2.1805311084567514e-08, "loss": 0.82502395, "num_input_tokens_seen": 342374565, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09741211, "step": 15873, "time_per_iteration": 2.611766815185547 }, { "auxiliary_loss_clip": 0.0641447, "auxiliary_loss_mlp": 0.01264274, "balance_loss_clip": 0.06275365, "balance_loss_mlp": 0.0125466, "epoch": 0.9543965128513453, "flos": 24469725767040.0, "grad_norm": 1.6266008932136276, "language_loss": 0.62998414, "learning_rate": 2.1747995227217265e-08, "loss": 0.70677155, "num_input_tokens_seen": 342394590, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09619141, "step": 15874, "time_per_iteration": 2.6268410682678223 }, { "auxiliary_loss_clip": 0.06413034, "auxiliary_loss_mlp": 0.01267179, "balance_loss_clip": 0.06277356, "balance_loss_mlp": 0.01258358, "epoch": 0.9544566361040132, "flos": 15265838062080.0, "grad_norm": 1.9486016149522858, "language_loss": 0.89569175, "learning_rate": 2.169075438538104e-08, "loss": 0.97249377, "num_input_tokens_seen": 342410445, "router_z_loss_clip": 1.35546875, "router_z_loss_mlp": 0.0881958, "step": 15875, "time_per_iteration": 2.624084949493408 }, { "auxiliary_loss_clip": 0.06420546, "auxiliary_loss_mlp": 0.01264407, "balance_loss_clip": 0.06276181, "balance_loss_mlp": 0.01254173, "epoch": 0.9545167593566812, "flos": 25925434750080.0, "grad_norm": 1.5314165710028866, "language_loss": 0.68022102, "learning_rate": 2.1633588561229765e-08, "loss": 0.7570706, "num_input_tokens_seen": 342430970, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10235596, "step": 15876, "time_per_iteration": 2.560884714126587 }, { "auxiliary_loss_clip": 0.06418531, "auxiliary_loss_mlp": 0.01265282, "balance_loss_clip": 0.06277303, "balance_loss_mlp": 0.01255191, "epoch": 0.9545768826093491, "flos": 25635014098560.0, "grad_norm": 1.8774351365887596, "language_loss": 0.6927588, "learning_rate": 2.1576497756931267e-08, "loss": 0.76959693, "num_input_tokens_seen": 342449505, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.10089111, "step": 15877, "time_per_iteration": 2.565523386001587 }, { "auxiliary_loss_clip": 0.06417503, "auxiliary_loss_mlp": 0.01263041, "balance_loss_clip": 0.06275213, "balance_loss_mlp": 0.01253296, "epoch": 0.9546370058620172, "flos": 22497982214400.0, "grad_norm": 1.6724678003869555, "language_loss": 0.71417606, "learning_rate": 2.1519481974650035e-08, "loss": 0.79098153, "num_input_tokens_seen": 342470390, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09747314, "step": 15878, "time_per_iteration": 3.934981346130371 }, { "auxiliary_loss_clip": 0.06407289, "auxiliary_loss_mlp": 0.0126172, "balance_loss_clip": 0.06271048, "balance_loss_mlp": 0.01252797, "epoch": 0.9546971291146851, "flos": 24616738955520.0, "grad_norm": 1.341709133769854, "language_loss": 0.68377495, "learning_rate": 2.1462541216548335e-08, "loss": 0.76046503, "num_input_tokens_seen": 342492560, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.08929443, "step": 15879, "time_per_iteration": 2.572084903717041 }, { "auxiliary_loss_clip": 0.06414185, "auxiliary_loss_mlp": 0.01265824, "balance_loss_clip": 0.0627747, "balance_loss_mlp": 0.01256639, "epoch": 0.9547572523673531, "flos": 28665297480960.0, "grad_norm": 1.7723230784426558, "language_loss": 0.84928268, "learning_rate": 2.1405675484785334e-08, "loss": 0.92608273, "num_input_tokens_seen": 342512315, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.09185791, "step": 15880, "time_per_iteration": 2.6086652278900146 }, { "auxiliary_loss_clip": 0.06412213, "auxiliary_loss_mlp": 0.01263444, "balance_loss_clip": 0.06272247, "balance_loss_mlp": 0.0125377, "epoch": 0.954817375620021, "flos": 33811067295360.0, "grad_norm": 1.7088347768559584, "language_loss": 0.72269106, "learning_rate": 2.134888478151753e-08, "loss": 0.79944766, "num_input_tokens_seen": 342533060, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09674072, "step": 15881, "time_per_iteration": 2.638157367706299 }, { "auxiliary_loss_clip": 0.06411388, "auxiliary_loss_mlp": 0.01264863, "balance_loss_clip": 0.06274374, "balance_loss_mlp": 0.0125535, "epoch": 0.954877498872689, "flos": 14433373347840.0, "grad_norm": 1.9258384910415793, "language_loss": 0.71730387, "learning_rate": 2.1292169108898083e-08, "loss": 0.79406637, "num_input_tokens_seen": 342550830, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09515381, "step": 15882, "time_per_iteration": 2.5292656421661377 }, { "auxiliary_loss_clip": 0.06417906, "auxiliary_loss_mlp": 0.01264689, "balance_loss_clip": 0.06277183, "balance_loss_mlp": 0.01255868, "epoch": 0.9549376221253569, "flos": 59282129681280.0, "grad_norm": 2.170396323415434, "language_loss": 0.66392159, "learning_rate": 2.1235528469078168e-08, "loss": 0.74074757, "num_input_tokens_seen": 342575070, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.0881958, "step": 15883, "time_per_iteration": 2.878131866455078 }, { "auxiliary_loss_clip": 0.06414437, "auxiliary_loss_mlp": 0.01264294, "balance_loss_clip": 0.06273668, "balance_loss_mlp": 0.01254191, "epoch": 0.954997745378025, "flos": 17280068434560.0, "grad_norm": 2.1656951258555224, "language_loss": 0.77929449, "learning_rate": 2.1178962864205175e-08, "loss": 0.85608172, "num_input_tokens_seen": 342592215, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.10095215, "step": 15884, "time_per_iteration": 2.6170451641082764 }, { "auxiliary_loss_clip": 0.06416526, "auxiliary_loss_mlp": 0.01263795, "balance_loss_clip": 0.06275364, "balance_loss_mlp": 0.01253973, "epoch": 0.955057868630693, "flos": 13011472287360.0, "grad_norm": 1.8216693800413395, "language_loss": 0.78154564, "learning_rate": 2.1122472296424054e-08, "loss": 0.85834885, "num_input_tokens_seen": 342610030, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09814453, "step": 15885, "time_per_iteration": 2.596585750579834 }, { "auxiliary_loss_clip": 0.06415157, "auxiliary_loss_mlp": 0.01265757, "balance_loss_clip": 0.06275827, "balance_loss_mlp": 0.01256732, "epoch": 0.9551179918833609, "flos": 22644240716160.0, "grad_norm": 1.630992551530713, "language_loss": 0.7049545, "learning_rate": 2.1066056767877317e-08, "loss": 0.78176367, "num_input_tokens_seen": 342626475, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09020996, "step": 15886, "time_per_iteration": 2.5986900329589844 }, { "auxiliary_loss_clip": 0.0642291, "auxiliary_loss_mlp": 0.01265357, "balance_loss_clip": 0.06279513, "balance_loss_mlp": 0.01254575, "epoch": 0.9551781151360289, "flos": 21549125779200.0, "grad_norm": 1.7159519462843744, "language_loss": 0.72859025, "learning_rate": 2.1009716280703916e-08, "loss": 0.80547297, "num_input_tokens_seen": 342646645, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.10772705, "step": 15887, "time_per_iteration": 2.589383363723755 }, { "auxiliary_loss_clip": 0.06410322, "auxiliary_loss_mlp": 0.01263229, "balance_loss_clip": 0.06276698, "balance_loss_mlp": 0.01254271, "epoch": 0.9552382383886968, "flos": 20708191802880.0, "grad_norm": 2.482110348886992, "language_loss": 0.56728339, "learning_rate": 2.0953450837040364e-08, "loss": 0.64401889, "num_input_tokens_seen": 342663615, "router_z_loss_clip": 1.33691406, "router_z_loss_mlp": 0.08959961, "step": 15888, "time_per_iteration": 2.551809072494507 }, { "auxiliary_loss_clip": 0.06314037, "auxiliary_loss_mlp": 0.01250627, "balance_loss_clip": 0.06259014, "balance_loss_mlp": 0.01249624, "epoch": 0.9552983616413648, "flos": 67789859736960.0, "grad_norm": 0.6932872923771152, "language_loss": 0.57779425, "learning_rate": 2.0897260439020514e-08, "loss": 0.65344095, "num_input_tokens_seen": 342728275, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.01002502, "step": 15889, "time_per_iteration": 3.223754405975342 }, { "auxiliary_loss_clip": 0.06415637, "auxiliary_loss_mlp": 0.01264775, "balance_loss_clip": 0.06272867, "balance_loss_mlp": 0.01254738, "epoch": 0.9553584848940327, "flos": 21586413646080.0, "grad_norm": 1.361336597577748, "language_loss": 0.67326218, "learning_rate": 2.084114508877466e-08, "loss": 0.75006628, "num_input_tokens_seen": 342748860, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.10046387, "step": 15890, "time_per_iteration": 2.5712151527404785 }, { "auxiliary_loss_clip": 0.06413542, "auxiliary_loss_mlp": 0.01264356, "balance_loss_clip": 0.06275333, "balance_loss_mlp": 0.01255606, "epoch": 0.9554186081467008, "flos": 24215251317120.0, "grad_norm": 1.425087508083716, "language_loss": 0.74145383, "learning_rate": 2.0785104788430874e-08, "loss": 0.81823277, "num_input_tokens_seen": 342769705, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.08752441, "step": 15891, "time_per_iteration": 2.5806500911712646 }, { "auxiliary_loss_clip": 0.06414033, "auxiliary_loss_mlp": 0.01264573, "balance_loss_clip": 0.06278873, "balance_loss_mlp": 0.01255996, "epoch": 0.9554787313993687, "flos": 16256845900800.0, "grad_norm": 1.9055708797842916, "language_loss": 0.78107464, "learning_rate": 2.072913954011435e-08, "loss": 0.85786068, "num_input_tokens_seen": 342787000, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.08575439, "step": 15892, "time_per_iteration": 2.510952949523926 }, { "auxiliary_loss_clip": 0.06417222, "auxiliary_loss_mlp": 0.01265864, "balance_loss_clip": 0.06279708, "balance_loss_mlp": 0.01256185, "epoch": 0.9555388546520367, "flos": 23410850520960.0, "grad_norm": 1.6370305475000306, "language_loss": 0.70318276, "learning_rate": 2.0673249345947386e-08, "loss": 0.78001368, "num_input_tokens_seen": 342807795, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09674072, "step": 15893, "time_per_iteration": 2.684600353240967 }, { "auxiliary_loss_clip": 0.0641337, "auxiliary_loss_mlp": 0.01265405, "balance_loss_clip": 0.06277371, "balance_loss_mlp": 0.01254288, "epoch": 0.9555989779047046, "flos": 14799417909120.0, "grad_norm": 2.6154821517792133, "language_loss": 0.65959102, "learning_rate": 2.0617434208048955e-08, "loss": 0.73637879, "num_input_tokens_seen": 342825490, "router_z_loss_clip": 1.36035156, "router_z_loss_mlp": 0.11114502, "step": 15894, "time_per_iteration": 2.508823871612549 }, { "auxiliary_loss_clip": 0.06415722, "auxiliary_loss_mlp": 0.01263553, "balance_loss_clip": 0.06275073, "balance_loss_mlp": 0.0125407, "epoch": 0.9556591011573726, "flos": 22243298129280.0, "grad_norm": 1.815884081592941, "language_loss": 0.81607848, "learning_rate": 2.056169412853581e-08, "loss": 0.8928712, "num_input_tokens_seen": 342844965, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09484863, "step": 15895, "time_per_iteration": 2.5667505264282227 }, { "auxiliary_loss_clip": 0.06418647, "auxiliary_loss_mlp": 0.01266915, "balance_loss_clip": 0.06280018, "balance_loss_mlp": 0.01257861, "epoch": 0.9557192244100405, "flos": 27862741474560.0, "grad_norm": 1.494618276155355, "language_loss": 0.72725153, "learning_rate": 2.0506029109521593e-08, "loss": 0.80410719, "num_input_tokens_seen": 342865915, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09057617, "step": 15896, "time_per_iteration": 2.6041409969329834 }, { "auxiliary_loss_clip": 0.06411672, "auxiliary_loss_mlp": 0.0126568, "balance_loss_clip": 0.06274828, "balance_loss_mlp": 0.01256102, "epoch": 0.9557793476627086, "flos": 17608531639680.0, "grad_norm": 1.8393140308102116, "language_loss": 0.79147899, "learning_rate": 2.045043915311706e-08, "loss": 0.86825252, "num_input_tokens_seen": 342884000, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.0958252, "step": 15897, "time_per_iteration": 2.5534746646881104 }, { "auxiliary_loss_clip": 0.06416435, "auxiliary_loss_mlp": 0.01262878, "balance_loss_clip": 0.06277804, "balance_loss_mlp": 0.01253357, "epoch": 0.9558394709153766, "flos": 23881798794240.0, "grad_norm": 1.6573434691199411, "language_loss": 0.72809601, "learning_rate": 2.03949242614303e-08, "loss": 0.80488908, "num_input_tokens_seen": 342903095, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.0953064, "step": 15898, "time_per_iteration": 4.087982892990112 }, { "auxiliary_loss_clip": 0.06321008, "auxiliary_loss_mlp": 0.01250157, "balance_loss_clip": 0.06265928, "balance_loss_mlp": 0.01249085, "epoch": 0.9558995941680445, "flos": 53698995152640.0, "grad_norm": 0.9057983912659017, "language_loss": 0.52336329, "learning_rate": 2.033948443656652e-08, "loss": 0.59907496, "num_input_tokens_seen": 342958155, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.01073456, "step": 15899, "time_per_iteration": 3.0969324111938477 }, { "auxiliary_loss_clip": 0.06421405, "auxiliary_loss_mlp": 0.01265588, "balance_loss_clip": 0.06276871, "balance_loss_mlp": 0.01254913, "epoch": 0.9559597174207125, "flos": 13768355018880.0, "grad_norm": 2.194816815128907, "language_loss": 0.69050419, "learning_rate": 2.028411968062782e-08, "loss": 0.76737416, "num_input_tokens_seen": 342972500, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.10675049, "step": 15900, "time_per_iteration": 2.692680835723877 }, { "auxiliary_loss_clip": 0.06413184, "auxiliary_loss_mlp": 0.01266622, "balance_loss_clip": 0.06272954, "balance_loss_mlp": 0.01256662, "epoch": 0.9560198406733804, "flos": 19942210903680.0, "grad_norm": 2.15373856608119, "language_loss": 0.83160305, "learning_rate": 2.0228829995713627e-08, "loss": 0.90840107, "num_input_tokens_seen": 342989035, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09967041, "step": 15901, "time_per_iteration": 2.542362689971924 }, { "auxiliary_loss_clip": 0.06315367, "auxiliary_loss_mlp": 0.0125159, "balance_loss_clip": 0.06260181, "balance_loss_mlp": 0.0125054, "epoch": 0.9560799639260484, "flos": 57306388331520.0, "grad_norm": 1.4330270036581794, "language_loss": 0.54190481, "learning_rate": 2.0173615383920485e-08, "loss": 0.61757433, "num_input_tokens_seen": 343051675, "router_z_loss_clip": 0.55175781, "router_z_loss_mlp": 0.01050568, "step": 15902, "time_per_iteration": 4.676306247711182 }, { "auxiliary_loss_clip": 0.06404495, "auxiliary_loss_mlp": 0.01263174, "balance_loss_clip": 0.06273387, "balance_loss_mlp": 0.01255663, "epoch": 0.9561400871787163, "flos": 18923264928000.0, "grad_norm": 1.5860514580335323, "language_loss": 0.8537311, "learning_rate": 2.01184758473425e-08, "loss": 0.93040776, "num_input_tokens_seen": 343068895, "router_z_loss_clip": 1.30859375, "router_z_loss_mlp": 0.07510376, "step": 15903, "time_per_iteration": 2.565077304840088 }, { "auxiliary_loss_clip": 0.06410842, "auxiliary_loss_mlp": 0.01263651, "balance_loss_clip": 0.06273212, "balance_loss_mlp": 0.01254847, "epoch": 0.9562002104313844, "flos": 18044036835840.0, "grad_norm": 1.7254315729615137, "language_loss": 0.80778897, "learning_rate": 2.0063411388070217e-08, "loss": 0.88453388, "num_input_tokens_seen": 343087115, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.0880127, "step": 15904, "time_per_iteration": 2.504068613052368 }, { "auxiliary_loss_clip": 0.06416728, "auxiliary_loss_mlp": 0.01265876, "balance_loss_clip": 0.0627645, "balance_loss_mlp": 0.0125625, "epoch": 0.9562603336840523, "flos": 24724619487360.0, "grad_norm": 2.183040359656459, "language_loss": 0.60108888, "learning_rate": 2.0008422008191972e-08, "loss": 0.67791486, "num_input_tokens_seen": 343105575, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09625244, "step": 15905, "time_per_iteration": 2.554598808288574 }, { "auxiliary_loss_clip": 0.06411397, "auxiliary_loss_mlp": 0.01265495, "balance_loss_clip": 0.06273073, "balance_loss_mlp": 0.01256322, "epoch": 0.9563204569367203, "flos": 21183332780160.0, "grad_norm": 1.8219178723317837, "language_loss": 0.70490587, "learning_rate": 1.995350770979254e-08, "loss": 0.78167474, "num_input_tokens_seen": 343123025, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09173584, "step": 15906, "time_per_iteration": 2.533961296081543 }, { "auxiliary_loss_clip": 0.06423746, "auxiliary_loss_mlp": 0.01264286, "balance_loss_clip": 0.06280415, "balance_loss_mlp": 0.01253926, "epoch": 0.9563805801893882, "flos": 20235901864320.0, "grad_norm": 1.875212401843134, "language_loss": 0.71169692, "learning_rate": 1.9898668494954473e-08, "loss": 0.78857726, "num_input_tokens_seen": 343141625, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10357666, "step": 15907, "time_per_iteration": 2.5467286109924316 }, { "auxiliary_loss_clip": 0.06412572, "auxiliary_loss_mlp": 0.01266292, "balance_loss_clip": 0.06276643, "balance_loss_mlp": 0.01256719, "epoch": 0.9564407034420562, "flos": 25418079077760.0, "grad_norm": 1.9496402673487268, "language_loss": 0.70252347, "learning_rate": 1.9843904365757447e-08, "loss": 0.77931213, "num_input_tokens_seen": 343161300, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.09570312, "step": 15908, "time_per_iteration": 2.569019317626953 }, { "auxiliary_loss_clip": 0.06415959, "auxiliary_loss_mlp": 0.01264632, "balance_loss_clip": 0.06277634, "balance_loss_mlp": 0.01255537, "epoch": 0.9565008266947241, "flos": 18629699748480.0, "grad_norm": 1.8123978403155474, "language_loss": 0.83227599, "learning_rate": 1.978921532427802e-08, "loss": 0.90908188, "num_input_tokens_seen": 343177815, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09100342, "step": 15909, "time_per_iteration": 2.514801263809204 }, { "auxiliary_loss_clip": 0.06415661, "auxiliary_loss_mlp": 0.01263714, "balance_loss_clip": 0.06278388, "balance_loss_mlp": 0.01254666, "epoch": 0.9565609499473922, "flos": 24868865491200.0, "grad_norm": 1.7651389503300352, "language_loss": 0.68140769, "learning_rate": 1.9734601372590086e-08, "loss": 0.75820136, "num_input_tokens_seen": 343198140, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.0904541, "step": 15910, "time_per_iteration": 2.5729143619537354 }, { "auxiliary_loss_clip": 0.06422522, "auxiliary_loss_mlp": 0.01265072, "balance_loss_clip": 0.0627863, "balance_loss_mlp": 0.01255577, "epoch": 0.9566210732000601, "flos": 21804858040320.0, "grad_norm": 2.606877178920476, "language_loss": 0.74458516, "learning_rate": 1.968006251276444e-08, "loss": 0.82146108, "num_input_tokens_seen": 343218280, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.09490967, "step": 15911, "time_per_iteration": 2.545269727706909 }, { "auxiliary_loss_clip": 0.06415263, "auxiliary_loss_mlp": 0.01265322, "balance_loss_clip": 0.0627528, "balance_loss_mlp": 0.01256537, "epoch": 0.9566811964527281, "flos": 18703562722560.0, "grad_norm": 2.0253682254361287, "language_loss": 0.69873077, "learning_rate": 1.9625598746869198e-08, "loss": 0.7755366, "num_input_tokens_seen": 343236850, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.08789062, "step": 15912, "time_per_iteration": 3.9144253730773926 }, { "auxiliary_loss_clip": 0.06415586, "auxiliary_loss_mlp": 0.01267668, "balance_loss_clip": 0.06277566, "balance_loss_mlp": 0.0125794, "epoch": 0.9567413197053961, "flos": 13004763960960.0, "grad_norm": 2.730916951135782, "language_loss": 0.72550368, "learning_rate": 1.95712100769696e-08, "loss": 0.80233622, "num_input_tokens_seen": 343253065, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09729004, "step": 15913, "time_per_iteration": 2.508826732635498 }, { "auxiliary_loss_clip": 0.0641198, "auxiliary_loss_mlp": 0.012655, "balance_loss_clip": 0.06274739, "balance_loss_mlp": 0.01256601, "epoch": 0.956801442958064, "flos": 19725401664000.0, "grad_norm": 2.016766378986276, "language_loss": 0.73134756, "learning_rate": 1.9516896505128444e-08, "loss": 0.80812234, "num_input_tokens_seen": 343270330, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.08911133, "step": 15914, "time_per_iteration": 2.5600039958953857 }, { "auxiliary_loss_clip": 0.06411529, "auxiliary_loss_mlp": 0.01263057, "balance_loss_clip": 0.06272665, "balance_loss_mlp": 0.01253711, "epoch": 0.956861566210732, "flos": 18228631380480.0, "grad_norm": 1.3932757084278504, "language_loss": 0.6726321, "learning_rate": 1.9462658033404965e-08, "loss": 0.74937797, "num_input_tokens_seen": 343289625, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09344482, "step": 15915, "time_per_iteration": 2.5606369972229004 }, { "auxiliary_loss_clip": 0.06406075, "auxiliary_loss_mlp": 0.0126654, "balance_loss_clip": 0.06271029, "balance_loss_mlp": 0.01257563, "epoch": 0.9569216894634, "flos": 22202949588480.0, "grad_norm": 1.7319131596338553, "language_loss": 0.64646423, "learning_rate": 1.9408494663855967e-08, "loss": 0.72319043, "num_input_tokens_seen": 343309200, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.08972168, "step": 15916, "time_per_iteration": 2.595785140991211 }, { "auxiliary_loss_clip": 0.06408766, "auxiliary_loss_mlp": 0.01265332, "balance_loss_clip": 0.06277043, "balance_loss_mlp": 0.01256761, "epoch": 0.956981812716068, "flos": 21695719697280.0, "grad_norm": 1.748063032107336, "language_loss": 0.80964839, "learning_rate": 1.935440639853536e-08, "loss": 0.88638937, "num_input_tokens_seen": 343326270, "router_z_loss_clip": 1.31738281, "router_z_loss_mlp": 0.08575439, "step": 15917, "time_per_iteration": 3.9928641319274902 }, { "auxiliary_loss_clip": 0.06413779, "auxiliary_loss_mlp": 0.01266154, "balance_loss_clip": 0.06276858, "balance_loss_mlp": 0.01256444, "epoch": 0.9570419359687359, "flos": 13996065288960.0, "grad_norm": 1.7700702008685878, "language_loss": 0.73052949, "learning_rate": 1.9300393239494172e-08, "loss": 0.80732882, "num_input_tokens_seen": 343344430, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09710693, "step": 15918, "time_per_iteration": 2.515691041946411 }, { "auxiliary_loss_clip": 0.06317325, "auxiliary_loss_mlp": 0.0125134, "balance_loss_clip": 0.06262134, "balance_loss_mlp": 0.01250323, "epoch": 0.9571020592214039, "flos": 65219525015040.0, "grad_norm": 0.6229851692168391, "language_loss": 0.5307399, "learning_rate": 1.924645518878032e-08, "loss": 0.6064266, "num_input_tokens_seen": 343416155, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01016235, "step": 15919, "time_per_iteration": 3.2548632621765137 }, { "auxiliary_loss_clip": 0.06424516, "auxiliary_loss_mlp": 0.01267982, "balance_loss_clip": 0.06281408, "balance_loss_mlp": 0.01257885, "epoch": 0.9571621824740718, "flos": 17389793756160.0, "grad_norm": 2.451028008511409, "language_loss": 0.75869739, "learning_rate": 1.919259224843972e-08, "loss": 0.83562231, "num_input_tokens_seen": 343431715, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.10107422, "step": 15920, "time_per_iteration": 2.4978060722351074 }, { "auxiliary_loss_clip": 0.06417029, "auxiliary_loss_mlp": 0.01267773, "balance_loss_clip": 0.06276745, "balance_loss_mlp": 0.01257134, "epoch": 0.9572223057267398, "flos": 14543434085760.0, "grad_norm": 1.6486000311627302, "language_loss": 0.79787242, "learning_rate": 1.9138804420514298e-08, "loss": 0.87472045, "num_input_tokens_seen": 343450425, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.10638428, "step": 15921, "time_per_iteration": 2.5114052295684814 }, { "auxiliary_loss_clip": 0.06423795, "auxiliary_loss_mlp": 0.01263647, "balance_loss_clip": 0.06278528, "balance_loss_mlp": 0.0125318, "epoch": 0.9572824289794077, "flos": 33956151840000.0, "grad_norm": 1.9534435701736612, "language_loss": 0.51614004, "learning_rate": 1.9085091707044197e-08, "loss": 0.59301448, "num_input_tokens_seen": 343470445, "router_z_loss_clip": 1.45410156, "router_z_loss_mlp": 0.10455322, "step": 15922, "time_per_iteration": 2.641112804412842 }, { "auxiliary_loss_clip": 0.06413421, "auxiliary_loss_mlp": 0.01266092, "balance_loss_clip": 0.06272559, "balance_loss_mlp": 0.01256573, "epoch": 0.9573425522320758, "flos": 18700418194560.0, "grad_norm": 2.099825711353889, "language_loss": 0.83833349, "learning_rate": 1.903145411006557e-08, "loss": 0.91512859, "num_input_tokens_seen": 343485200, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09521484, "step": 15923, "time_per_iteration": 2.4932103157043457 }, { "auxiliary_loss_clip": 0.06410748, "auxiliary_loss_mlp": 0.01262122, "balance_loss_clip": 0.0627283, "balance_loss_mlp": 0.0125305, "epoch": 0.9574026754847437, "flos": 28517571532800.0, "grad_norm": 1.5190659109981692, "language_loss": 0.75396979, "learning_rate": 1.8977891631613008e-08, "loss": 0.83069855, "num_input_tokens_seen": 343505080, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09075928, "step": 15924, "time_per_iteration": 2.5879883766174316 }, { "auxiliary_loss_clip": 0.06415883, "auxiliary_loss_mlp": 0.01264498, "balance_loss_clip": 0.06276515, "balance_loss_mlp": 0.01255265, "epoch": 0.9574627987374117, "flos": 24359203831680.0, "grad_norm": 2.3225299710794145, "language_loss": 0.865403, "learning_rate": 1.892440427371711e-08, "loss": 0.9422068, "num_input_tokens_seen": 343523995, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09228516, "step": 15925, "time_per_iteration": 2.561218738555908 }, { "auxiliary_loss_clip": 0.06417982, "auxiliary_loss_mlp": 0.01264844, "balance_loss_clip": 0.06274596, "balance_loss_mlp": 0.01254884, "epoch": 0.9575229219900797, "flos": 23516928190080.0, "grad_norm": 1.8991396152568667, "language_loss": 0.74762952, "learning_rate": 1.8870992038406474e-08, "loss": 0.82445776, "num_input_tokens_seen": 343542015, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.09973145, "step": 15926, "time_per_iteration": 2.5405054092407227 }, { "auxiliary_loss_clip": 0.06418848, "auxiliary_loss_mlp": 0.01263046, "balance_loss_clip": 0.06278548, "balance_loss_mlp": 0.01254588, "epoch": 0.9575830452427476, "flos": 22681486656000.0, "grad_norm": 1.5779207891104896, "language_loss": 0.78227293, "learning_rate": 1.8817654927706373e-08, "loss": 0.85909188, "num_input_tokens_seen": 343561680, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.08459473, "step": 15927, "time_per_iteration": 2.5663602352142334 }, { "auxiliary_loss_clip": 0.06418027, "auxiliary_loss_mlp": 0.01265845, "balance_loss_clip": 0.06276055, "balance_loss_mlp": 0.01255408, "epoch": 0.9576431684954156, "flos": 30493633570560.0, "grad_norm": 1.9095470699189183, "language_loss": 0.69434643, "learning_rate": 1.8764392943639183e-08, "loss": 0.77118516, "num_input_tokens_seen": 343585290, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.10437012, "step": 15928, "time_per_iteration": 2.629976511001587 }, { "auxiliary_loss_clip": 0.06418684, "auxiliary_loss_mlp": 0.01264206, "balance_loss_clip": 0.06278634, "balance_loss_mlp": 0.01254491, "epoch": 0.9577032917480836, "flos": 21693497564160.0, "grad_norm": 1.8673294594167376, "language_loss": 0.8200258, "learning_rate": 1.871120608822485e-08, "loss": 0.89685476, "num_input_tokens_seen": 343604045, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09710693, "step": 15929, "time_per_iteration": 2.5422606468200684 }, { "auxiliary_loss_clip": 0.06425435, "auxiliary_loss_mlp": 0.0126547, "balance_loss_clip": 0.06280339, "balance_loss_mlp": 0.01255975, "epoch": 0.9577634150007516, "flos": 29030838917760.0, "grad_norm": 1.6467616850989202, "language_loss": 0.7252776, "learning_rate": 1.8658094363480202e-08, "loss": 0.80218661, "num_input_tokens_seen": 343626595, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.0949707, "step": 15930, "time_per_iteration": 2.5967090129852295 }, { "auxiliary_loss_clip": 0.06413558, "auxiliary_loss_mlp": 0.01263719, "balance_loss_clip": 0.06276158, "balance_loss_mlp": 0.01254677, "epoch": 0.9578235382534195, "flos": 19288429021440.0, "grad_norm": 1.7439020584522182, "language_loss": 0.6267159, "learning_rate": 1.8605057771419185e-08, "loss": 0.70348871, "num_input_tokens_seen": 343646195, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09033203, "step": 15931, "time_per_iteration": 2.5309364795684814 }, { "auxiliary_loss_clip": 0.06410983, "auxiliary_loss_mlp": 0.01264119, "balance_loss_clip": 0.06275973, "balance_loss_mlp": 0.01255483, "epoch": 0.9578836615060875, "flos": 13704428753280.0, "grad_norm": 1.8833406297269948, "language_loss": 0.69665676, "learning_rate": 1.8552096314052633e-08, "loss": 0.77340782, "num_input_tokens_seen": 343663665, "router_z_loss_clip": 1.34863281, "router_z_loss_mlp": 0.08630371, "step": 15932, "time_per_iteration": 2.579448938369751 }, { "auxiliary_loss_clip": 0.06417368, "auxiliary_loss_mlp": 0.01272222, "balance_loss_clip": 0.06273059, "balance_loss_mlp": 0.0126191, "epoch": 0.9579437847587554, "flos": 17059988885760.0, "grad_norm": 1.6789180460801376, "language_loss": 0.75729334, "learning_rate": 1.849920999338961e-08, "loss": 0.8341893, "num_input_tokens_seen": 343682145, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.10314941, "step": 15933, "time_per_iteration": 2.5290544033050537 }, { "auxiliary_loss_clip": 0.06320041, "auxiliary_loss_mlp": 0.01250682, "balance_loss_clip": 0.062648, "balance_loss_mlp": 0.01249663, "epoch": 0.9580039080114234, "flos": 60587875854720.0, "grad_norm": 0.7100396502820774, "language_loss": 0.57346433, "learning_rate": 1.8446398811434948e-08, "loss": 0.64917159, "num_input_tokens_seen": 343744685, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.0102005, "step": 15934, "time_per_iteration": 3.2618343830108643 }, { "auxiliary_loss_clip": 0.06318922, "auxiliary_loss_mlp": 0.01251225, "balance_loss_clip": 0.06263416, "balance_loss_mlp": 0.01250252, "epoch": 0.9580640312640913, "flos": 66254837264640.0, "grad_norm": 0.8864467157386227, "language_loss": 0.65952229, "learning_rate": 1.8393662770191277e-08, "loss": 0.73522377, "num_input_tokens_seen": 343801835, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00971985, "step": 15935, "time_per_iteration": 3.1080381870269775 }, { "auxiliary_loss_clip": 0.06315613, "auxiliary_loss_mlp": 0.01251127, "balance_loss_clip": 0.06260255, "balance_loss_mlp": 0.01250053, "epoch": 0.9581241545167594, "flos": 62236145520000.0, "grad_norm": 1.0084758140492052, "language_loss": 0.56829095, "learning_rate": 1.8341001871658546e-08, "loss": 0.64395833, "num_input_tokens_seen": 343861515, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01075745, "step": 15936, "time_per_iteration": 3.1405117511749268 }, { "auxiliary_loss_clip": 0.06414773, "auxiliary_loss_mlp": 0.0126662, "balance_loss_clip": 0.06274767, "balance_loss_mlp": 0.01257108, "epoch": 0.9581842777694273, "flos": 23774714876160.0, "grad_norm": 1.5464128033973834, "language_loss": 0.78894401, "learning_rate": 1.8288416117833825e-08, "loss": 0.86575794, "num_input_tokens_seen": 343881240, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09509277, "step": 15937, "time_per_iteration": 4.061129808425903 }, { "auxiliary_loss_clip": 0.0641854, "auxiliary_loss_mlp": 0.0126451, "balance_loss_clip": 0.06279615, "balance_loss_mlp": 0.01254431, "epoch": 0.9582444010220953, "flos": 21219111273600.0, "grad_norm": 1.5502120282394505, "language_loss": 0.68009269, "learning_rate": 1.8235905510710636e-08, "loss": 0.7569232, "num_input_tokens_seen": 343900885, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.10083008, "step": 15938, "time_per_iteration": 2.543849468231201 }, { "auxiliary_loss_clip": 0.06414264, "auxiliary_loss_mlp": 0.01265083, "balance_loss_clip": 0.06275118, "balance_loss_mlp": 0.01255576, "epoch": 0.9583045242747633, "flos": 23811876961920.0, "grad_norm": 2.2351036134031617, "language_loss": 0.66869259, "learning_rate": 1.8183470052280712e-08, "loss": 0.74548602, "num_input_tokens_seen": 343918460, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09503174, "step": 15939, "time_per_iteration": 2.5520596504211426 }, { "auxiliary_loss_clip": 0.06412028, "auxiliary_loss_mlp": 0.01262676, "balance_loss_clip": 0.06272995, "balance_loss_mlp": 0.01253741, "epoch": 0.9583646475274312, "flos": 24137908398720.0, "grad_norm": 1.5070376166821042, "language_loss": 0.74149442, "learning_rate": 1.8131109744532025e-08, "loss": 0.81824142, "num_input_tokens_seen": 343938030, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.08947754, "step": 15940, "time_per_iteration": 2.5883963108062744 }, { "auxiliary_loss_clip": 0.06420304, "auxiliary_loss_mlp": 0.01266616, "balance_loss_clip": 0.06279534, "balance_loss_mlp": 0.01256752, "epoch": 0.9584247707800992, "flos": 20892954055680.0, "grad_norm": 1.8956982926633739, "language_loss": 0.72696996, "learning_rate": 1.8078824589450535e-08, "loss": 0.80383909, "num_input_tokens_seen": 343956635, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09863281, "step": 15941, "time_per_iteration": 2.517179012298584 }, { "auxiliary_loss_clip": 0.06414558, "auxiliary_loss_mlp": 0.01264564, "balance_loss_clip": 0.06276833, "balance_loss_mlp": 0.01255415, "epoch": 0.9584848940327672, "flos": 26074753925760.0, "grad_norm": 1.6637842724663356, "language_loss": 0.71830201, "learning_rate": 1.8026614589018442e-08, "loss": 0.79509324, "num_input_tokens_seen": 343976625, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09155273, "step": 15942, "time_per_iteration": 4.092994689941406 }, { "auxiliary_loss_clip": 0.06419392, "auxiliary_loss_mlp": 0.0126405, "balance_loss_clip": 0.06277925, "balance_loss_mlp": 0.01253965, "epoch": 0.9585450172854352, "flos": 34501088868480.0, "grad_norm": 1.7040736973765567, "language_loss": 0.72123373, "learning_rate": 1.797447974521571e-08, "loss": 0.79806817, "num_input_tokens_seen": 343997790, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.10083008, "step": 15943, "time_per_iteration": 2.6696388721466064 }, { "auxiliary_loss_clip": 0.06421138, "auxiliary_loss_mlp": 0.01266678, "balance_loss_clip": 0.06279065, "balance_loss_mlp": 0.01256349, "epoch": 0.9586051405381031, "flos": 23117159560320.0, "grad_norm": 1.7210807516951865, "language_loss": 0.68419987, "learning_rate": 1.792242006001965e-08, "loss": 0.76107806, "num_input_tokens_seen": 344016935, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10333252, "step": 15944, "time_per_iteration": 2.5779902935028076 }, { "auxiliary_loss_clip": 0.06411272, "auxiliary_loss_mlp": 0.01267247, "balance_loss_clip": 0.06271829, "balance_loss_mlp": 0.01257024, "epoch": 0.9586652637907711, "flos": 19609135724160.0, "grad_norm": 1.6579332286947526, "language_loss": 0.66070497, "learning_rate": 1.7870435535403795e-08, "loss": 0.73749012, "num_input_tokens_seen": 344035590, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.10223389, "step": 15945, "time_per_iteration": 2.5650622844696045 }, { "auxiliary_loss_clip": 0.06314611, "auxiliary_loss_mlp": 0.01251433, "balance_loss_clip": 0.06259412, "balance_loss_mlp": 0.01250336, "epoch": 0.958725387043439, "flos": 72093815107200.0, "grad_norm": 0.7461328185896564, "language_loss": 0.61865246, "learning_rate": 1.7818526173339678e-08, "loss": 0.69431287, "num_input_tokens_seen": 344100845, "router_z_loss_clip": 0.55419922, "router_z_loss_mlp": 0.01098633, "step": 15946, "time_per_iteration": 3.284280776977539 }, { "auxiliary_loss_clip": 0.06411518, "auxiliary_loss_mlp": 0.01263488, "balance_loss_clip": 0.06274992, "balance_loss_mlp": 0.01254672, "epoch": 0.958785510296107, "flos": 28919310733440.0, "grad_norm": 2.352626997880809, "language_loss": 0.75393748, "learning_rate": 1.7766691975795723e-08, "loss": 0.83068752, "num_input_tokens_seen": 344121780, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.0881958, "step": 15947, "time_per_iteration": 2.725238084793091 }, { "auxiliary_loss_clip": 0.06413358, "auxiliary_loss_mlp": 0.01266186, "balance_loss_clip": 0.06274654, "balance_loss_mlp": 0.01256876, "epoch": 0.958845633548775, "flos": 18482854268160.0, "grad_norm": 3.1071519947677197, "language_loss": 0.70135617, "learning_rate": 1.771493294473747e-08, "loss": 0.77815163, "num_input_tokens_seen": 344140150, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09307861, "step": 15948, "time_per_iteration": 2.5272059440612793 }, { "auxiliary_loss_clip": 0.06412303, "auxiliary_loss_mlp": 0.0126236, "balance_loss_clip": 0.06275369, "balance_loss_mlp": 0.0125265, "epoch": 0.958905756801443, "flos": 24213783870720.0, "grad_norm": 2.0725809159018906, "language_loss": 0.79066789, "learning_rate": 1.7663249082127574e-08, "loss": 0.86741447, "num_input_tokens_seen": 344158200, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09710693, "step": 15949, "time_per_iteration": 2.652322292327881 }, { "auxiliary_loss_clip": 0.06415377, "auxiliary_loss_mlp": 0.0126426, "balance_loss_clip": 0.0627448, "balance_loss_mlp": 0.01254449, "epoch": 0.9589658800541109, "flos": 25014662795520.0, "grad_norm": 1.741036974858078, "language_loss": 0.68775463, "learning_rate": 1.761164038992602e-08, "loss": 0.76455104, "num_input_tokens_seen": 344174720, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09802246, "step": 15950, "time_per_iteration": 2.564058303833008 }, { "auxiliary_loss_clip": 0.06413489, "auxiliary_loss_mlp": 0.01265902, "balance_loss_clip": 0.06273773, "balance_loss_mlp": 0.01256985, "epoch": 0.9590260033067789, "flos": 23521456310400.0, "grad_norm": 1.7009915604248027, "language_loss": 0.86550915, "learning_rate": 1.7560106870089687e-08, "loss": 0.94230306, "num_input_tokens_seen": 344192580, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.0892334, "step": 15951, "time_per_iteration": 2.553410053253174 }, { "auxiliary_loss_clip": 0.06417866, "auxiliary_loss_mlp": 0.01263083, "balance_loss_clip": 0.06274673, "balance_loss_mlp": 0.01252897, "epoch": 0.9590861265594469, "flos": 25527427056000.0, "grad_norm": 2.380346906143006, "language_loss": 0.80314201, "learning_rate": 1.7508648524572568e-08, "loss": 0.87995148, "num_input_tokens_seen": 344210345, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.10186768, "step": 15952, "time_per_iteration": 3.9932215213775635 }, { "auxiliary_loss_clip": 0.06416561, "auxiliary_loss_mlp": 0.01265898, "balance_loss_clip": 0.06277865, "balance_loss_mlp": 0.01256284, "epoch": 0.9591462498121148, "flos": 21185806475520.0, "grad_norm": 1.5637769501847059, "language_loss": 0.69889754, "learning_rate": 1.7457265355326434e-08, "loss": 0.77572215, "num_input_tokens_seen": 344229540, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09613037, "step": 15953, "time_per_iteration": 2.540959119796753 }, { "auxiliary_loss_clip": 0.06417154, "auxiliary_loss_mlp": 0.01265343, "balance_loss_clip": 0.06276979, "balance_loss_mlp": 0.01254888, "epoch": 0.9592063730647828, "flos": 21729024495360.0, "grad_norm": 2.4150954121902184, "language_loss": 0.58061838, "learning_rate": 1.7405957364299285e-08, "loss": 0.65744328, "num_input_tokens_seen": 344247830, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.10455322, "step": 15954, "time_per_iteration": 2.542259693145752 }, { "auxiliary_loss_clip": 0.06416409, "auxiliary_loss_mlp": 0.01265381, "balance_loss_clip": 0.0627465, "balance_loss_mlp": 0.01255397, "epoch": 0.9592664963174508, "flos": 29897992022400.0, "grad_norm": 2.5934617652252783, "language_loss": 0.74011093, "learning_rate": 1.7354724553437117e-08, "loss": 0.8169288, "num_input_tokens_seen": 344267760, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09991455, "step": 15955, "time_per_iteration": 2.5891518592834473 }, { "auxiliary_loss_clip": 0.06413653, "auxiliary_loss_mlp": 0.01268837, "balance_loss_clip": 0.06273259, "balance_loss_mlp": 0.01259085, "epoch": 0.9593266195701188, "flos": 18004652616960.0, "grad_norm": 1.8185905262908648, "language_loss": 0.62471139, "learning_rate": 1.7303566924682378e-08, "loss": 0.70153624, "num_input_tokens_seen": 344284905, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09747314, "step": 15956, "time_per_iteration": 2.511786937713623 }, { "auxiliary_loss_clip": 0.06415601, "auxiliary_loss_mlp": 0.01265281, "balance_loss_clip": 0.06275921, "balance_loss_mlp": 0.01255464, "epoch": 0.9593867428227867, "flos": 18843364460160.0, "grad_norm": 1.7615464375797198, "language_loss": 0.60253775, "learning_rate": 1.725248447997507e-08, "loss": 0.67934656, "num_input_tokens_seen": 344302025, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.0980835, "step": 15957, "time_per_iteration": 3.891268253326416 }, { "auxiliary_loss_clip": 0.06415987, "auxiliary_loss_mlp": 0.01266615, "balance_loss_clip": 0.06275698, "balance_loss_mlp": 0.01256065, "epoch": 0.9594468660754547, "flos": 29574266572800.0, "grad_norm": 1.9691758547737128, "language_loss": 0.74206156, "learning_rate": 1.7201477221252314e-08, "loss": 0.81888759, "num_input_tokens_seen": 344321935, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.10540771, "step": 15958, "time_per_iteration": 2.6247973442077637 }, { "auxiliary_loss_clip": 0.06407806, "auxiliary_loss_mlp": 0.01267712, "balance_loss_clip": 0.06270804, "balance_loss_mlp": 0.01258438, "epoch": 0.9595069893281226, "flos": 20709365760000.0, "grad_norm": 1.5068907124822821, "language_loss": 0.74823916, "learning_rate": 1.7150545150448116e-08, "loss": 0.82499433, "num_input_tokens_seen": 344340405, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09277344, "step": 15959, "time_per_iteration": 2.5432870388031006 }, { "auxiliary_loss_clip": 0.06420337, "auxiliary_loss_mlp": 0.01265419, "balance_loss_clip": 0.06278557, "balance_loss_mlp": 0.01255734, "epoch": 0.9595671125807906, "flos": 22459855806720.0, "grad_norm": 2.207505896459676, "language_loss": 0.6543541, "learning_rate": 1.7099688269493816e-08, "loss": 0.7312116, "num_input_tokens_seen": 344359925, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09686279, "step": 15960, "time_per_iteration": 2.5357718467712402 }, { "auxiliary_loss_clip": 0.0641194, "auxiliary_loss_mlp": 0.01266666, "balance_loss_clip": 0.06277511, "balance_loss_mlp": 0.01257398, "epoch": 0.9596272358334585, "flos": 23922063480960.0, "grad_norm": 1.5711069862748186, "language_loss": 0.77793568, "learning_rate": 1.7048906580318544e-08, "loss": 0.85472178, "num_input_tokens_seen": 344379100, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.0927124, "step": 15961, "time_per_iteration": 2.5728986263275146 }, { "auxiliary_loss_clip": 0.06408255, "auxiliary_loss_mlp": 0.01265832, "balance_loss_clip": 0.06271228, "balance_loss_mlp": 0.01256772, "epoch": 0.9596873590861266, "flos": 17677740712320.0, "grad_norm": 1.8648024822986633, "language_loss": 0.76091981, "learning_rate": 1.699820008484698e-08, "loss": 0.83766067, "num_input_tokens_seen": 344396895, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.09051514, "step": 15962, "time_per_iteration": 2.4938855171203613 }, { "auxiliary_loss_clip": 0.0641728, "auxiliary_loss_mlp": 0.0126514, "balance_loss_clip": 0.06274819, "balance_loss_mlp": 0.01254852, "epoch": 0.9597474823387945, "flos": 25815038595840.0, "grad_norm": 4.443682270970219, "language_loss": 0.71614468, "learning_rate": 1.6947568785002698e-08, "loss": 0.79296887, "num_input_tokens_seen": 344415115, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10296631, "step": 15963, "time_per_iteration": 2.568309783935547 }, { "auxiliary_loss_clip": 0.06403521, "auxiliary_loss_mlp": 0.01267657, "balance_loss_clip": 0.06270627, "balance_loss_mlp": 0.01258997, "epoch": 0.9598076055914625, "flos": 23775218000640.0, "grad_norm": 1.4327052505126763, "language_loss": 0.7433607, "learning_rate": 1.689701268270527e-08, "loss": 0.82007241, "num_input_tokens_seen": 344435185, "router_z_loss_clip": 1.32714844, "router_z_loss_mlp": 0.08660889, "step": 15964, "time_per_iteration": 2.5314722061157227 }, { "auxiliary_loss_clip": 0.06313083, "auxiliary_loss_mlp": 0.01250276, "balance_loss_clip": 0.06258069, "balance_loss_mlp": 0.01249324, "epoch": 0.9598677288441305, "flos": 56531435045760.0, "grad_norm": 0.8691753198544739, "language_loss": 0.57663679, "learning_rate": 1.684653177987161e-08, "loss": 0.65227038, "num_input_tokens_seen": 344488950, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.00950623, "step": 15965, "time_per_iteration": 3.1230897903442383 }, { "auxiliary_loss_clip": 0.06415228, "auxiliary_loss_mlp": 0.01267215, "balance_loss_clip": 0.06275143, "balance_loss_mlp": 0.01257893, "epoch": 0.9599278520967984, "flos": 23003241534720.0, "grad_norm": 1.6271101632893954, "language_loss": 0.78967506, "learning_rate": 1.6796126078416627e-08, "loss": 0.86649948, "num_input_tokens_seen": 344506740, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09326172, "step": 15966, "time_per_iteration": 2.5153889656066895 }, { "auxiliary_loss_clip": 0.06411322, "auxiliary_loss_mlp": 0.01263245, "balance_loss_clip": 0.06273571, "balance_loss_mlp": 0.01254429, "epoch": 0.9599879753494664, "flos": 23046399187200.0, "grad_norm": 1.5104265689991159, "language_loss": 0.79334158, "learning_rate": 1.674579558025102e-08, "loss": 0.87008727, "num_input_tokens_seen": 344526670, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.0881958, "step": 15967, "time_per_iteration": 2.5503623485565186 }, { "auxiliary_loss_clip": 0.06417064, "auxiliary_loss_mlp": 0.01264015, "balance_loss_clip": 0.0627553, "balance_loss_mlp": 0.01253894, "epoch": 0.9600480986021344, "flos": 16396731492480.0, "grad_norm": 2.085450433074894, "language_loss": 0.8086679, "learning_rate": 1.669554028728348e-08, "loss": 0.88547862, "num_input_tokens_seen": 344541995, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.10125732, "step": 15968, "time_per_iteration": 2.478527307510376 }, { "auxiliary_loss_clip": 0.06416976, "auxiliary_loss_mlp": 0.01266987, "balance_loss_clip": 0.0627299, "balance_loss_mlp": 0.01256896, "epoch": 0.9601082218548024, "flos": 24282741381120.0, "grad_norm": 2.5008119695700426, "language_loss": 0.67803305, "learning_rate": 1.6645360201420044e-08, "loss": 0.75487268, "num_input_tokens_seen": 344559980, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10083008, "step": 15969, "time_per_iteration": 2.530761957168579 }, { "auxiliary_loss_clip": 0.06411961, "auxiliary_loss_mlp": 0.01263631, "balance_loss_clip": 0.06274432, "balance_loss_mlp": 0.01254696, "epoch": 0.9601683451074703, "flos": 19616137539840.0, "grad_norm": 5.474324885257152, "language_loss": 0.79671144, "learning_rate": 1.6595255324563186e-08, "loss": 0.87346733, "num_input_tokens_seen": 344577765, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.08935547, "step": 15970, "time_per_iteration": 2.5282046794891357 }, { "auxiliary_loss_clip": 0.06412269, "auxiliary_loss_mlp": 0.01262818, "balance_loss_clip": 0.06277141, "balance_loss_mlp": 0.01252977, "epoch": 0.9602284683601383, "flos": 26658320486400.0, "grad_norm": 1.4859549179641673, "language_loss": 0.77780628, "learning_rate": 1.654522565861316e-08, "loss": 0.85455716, "num_input_tokens_seen": 344597650, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.09838867, "step": 15971, "time_per_iteration": 2.572050094604492 }, { "auxiliary_loss_clip": 0.0641913, "auxiliary_loss_mlp": 0.0127003, "balance_loss_clip": 0.06274274, "balance_loss_mlp": 0.01260005, "epoch": 0.9602885916128062, "flos": 15558564700800.0, "grad_norm": 1.9084178701646082, "language_loss": 0.67606336, "learning_rate": 1.64952712054669e-08, "loss": 0.75295496, "num_input_tokens_seen": 344613580, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10028076, "step": 15972, "time_per_iteration": 2.4926209449768066 }, { "auxiliary_loss_clip": 0.06414026, "auxiliary_loss_mlp": 0.01266579, "balance_loss_clip": 0.06275237, "balance_loss_mlp": 0.01257507, "epoch": 0.9603487148654742, "flos": 16506918011520.0, "grad_norm": 2.048785478834872, "language_loss": 0.75998646, "learning_rate": 1.644539196701844e-08, "loss": 0.83679247, "num_input_tokens_seen": 344626910, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09075928, "step": 15973, "time_per_iteration": 2.483433246612549 }, { "auxiliary_loss_clip": 0.06411347, "auxiliary_loss_mlp": 0.01263751, "balance_loss_clip": 0.06274259, "balance_loss_mlp": 0.01254416, "epoch": 0.9604088381181421, "flos": 20850844579200.0, "grad_norm": 1.661568239928399, "language_loss": 0.69530916, "learning_rate": 1.639558794515983e-08, "loss": 0.77206016, "num_input_tokens_seen": 344644330, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09338379, "step": 15974, "time_per_iteration": 2.5214297771453857 }, { "auxiliary_loss_clip": 0.06414521, "auxiliary_loss_mlp": 0.01263862, "balance_loss_clip": 0.06272459, "balance_loss_mlp": 0.01254146, "epoch": 0.9604689613708102, "flos": 19689287754240.0, "grad_norm": 1.6628048807645381, "language_loss": 0.68201029, "learning_rate": 1.6345859141779105e-08, "loss": 0.75879407, "num_input_tokens_seen": 344663910, "router_z_loss_clip": 1.42382812, "router_z_loss_mlp": 0.09716797, "step": 15975, "time_per_iteration": 2.5360193252563477 }, { "auxiliary_loss_clip": 0.06406318, "auxiliary_loss_mlp": 0.01264588, "balance_loss_clip": 0.06273665, "balance_loss_mlp": 0.01255414, "epoch": 0.9605290846234781, "flos": 24104435892480.0, "grad_norm": 2.174533977234062, "language_loss": 0.56100762, "learning_rate": 1.6296205558762322e-08, "loss": 0.63771671, "num_input_tokens_seen": 344682320, "router_z_loss_clip": 1.32617188, "router_z_loss_mlp": 0.09161377, "step": 15976, "time_per_iteration": 2.5636942386627197 }, { "auxiliary_loss_clip": 0.06407331, "auxiliary_loss_mlp": 0.01269461, "balance_loss_clip": 0.06272254, "balance_loss_mlp": 0.01260419, "epoch": 0.9605892078761461, "flos": 27129394540800.0, "grad_norm": 1.7172081007203126, "language_loss": 0.685628, "learning_rate": 1.624662719799219e-08, "loss": 0.76239592, "num_input_tokens_seen": 344701355, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.0904541, "step": 15977, "time_per_iteration": 4.049966096878052 }, { "auxiliary_loss_clip": 0.06412714, "auxiliary_loss_mlp": 0.01265354, "balance_loss_clip": 0.06272858, "balance_loss_mlp": 0.0125621, "epoch": 0.9606493311288141, "flos": 14142114155520.0, "grad_norm": 1.7649394200878477, "language_loss": 0.82212859, "learning_rate": 1.6197124061348766e-08, "loss": 0.89890927, "num_input_tokens_seen": 344717980, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.0914917, "step": 15978, "time_per_iteration": 2.5284383296966553 }, { "auxiliary_loss_clip": 0.06416957, "auxiliary_loss_mlp": 0.01263055, "balance_loss_clip": 0.062751, "balance_loss_mlp": 0.01253036, "epoch": 0.960709454381482, "flos": 15818489665920.0, "grad_norm": 2.126165038671386, "language_loss": 0.83657229, "learning_rate": 1.614769615070921e-08, "loss": 0.91337246, "num_input_tokens_seen": 344733480, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10009766, "step": 15979, "time_per_iteration": 2.5018937587738037 }, { "auxiliary_loss_clip": 0.06418028, "auxiliary_loss_mlp": 0.01264749, "balance_loss_clip": 0.06276601, "balance_loss_mlp": 0.01255611, "epoch": 0.96076957763415, "flos": 22572054823680.0, "grad_norm": 2.0716723396050325, "language_loss": 0.80462503, "learning_rate": 1.6098343467947805e-08, "loss": 0.8814528, "num_input_tokens_seen": 344752130, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09136963, "step": 15980, "time_per_iteration": 2.5861763954162598 }, { "auxiliary_loss_clip": 0.06417276, "auxiliary_loss_mlp": 0.01263151, "balance_loss_clip": 0.06274249, "balance_loss_mlp": 0.01253507, "epoch": 0.960829700886818, "flos": 24688212088320.0, "grad_norm": 2.0554985718413152, "language_loss": 0.68668342, "learning_rate": 1.6049066014935942e-08, "loss": 0.7634877, "num_input_tokens_seen": 344771195, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.09649658, "step": 15981, "time_per_iteration": 4.036841869354248 }, { "auxiliary_loss_clip": 0.06413955, "auxiliary_loss_mlp": 0.01267243, "balance_loss_clip": 0.06275419, "balance_loss_mlp": 0.01257522, "epoch": 0.960889824139486, "flos": 26549517559680.0, "grad_norm": 1.3276003892358994, "language_loss": 0.69710732, "learning_rate": 1.5999863793542344e-08, "loss": 0.77391934, "num_input_tokens_seen": 344793150, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09729004, "step": 15982, "time_per_iteration": 2.5810928344726562 }, { "auxiliary_loss_clip": 0.06316762, "auxiliary_loss_mlp": 0.01251709, "balance_loss_clip": 0.06261663, "balance_loss_mlp": 0.01250737, "epoch": 0.9609499473921539, "flos": 71133638371200.0, "grad_norm": 0.666778854426707, "language_loss": 0.53296387, "learning_rate": 1.595073680563286e-08, "loss": 0.60864854, "num_input_tokens_seen": 344852855, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.00970459, "step": 15983, "time_per_iteration": 3.253181219100952 }, { "auxiliary_loss_clip": 0.06415033, "auxiliary_loss_mlp": 0.01266279, "balance_loss_clip": 0.06275579, "balance_loss_mlp": 0.01257496, "epoch": 0.9610100706448219, "flos": 20557740597120.0, "grad_norm": 2.143715275949337, "language_loss": 0.68109012, "learning_rate": 1.5901685053070212e-08, "loss": 0.75790328, "num_input_tokens_seen": 344869830, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.08786011, "step": 15984, "time_per_iteration": 2.505378246307373 }, { "auxiliary_loss_clip": 0.06408006, "auxiliary_loss_mlp": 0.01265904, "balance_loss_clip": 0.06274798, "balance_loss_mlp": 0.01257041, "epoch": 0.9610701938974898, "flos": 14069425138560.0, "grad_norm": 1.6558674866728238, "language_loss": 0.68006694, "learning_rate": 1.5852708537714477e-08, "loss": 0.75680602, "num_input_tokens_seen": 344888905, "router_z_loss_clip": 1.33398438, "router_z_loss_mlp": 0.08862305, "step": 15985, "time_per_iteration": 2.504744529724121 }, { "auxiliary_loss_clip": 0.06416045, "auxiliary_loss_mlp": 0.0126561, "balance_loss_clip": 0.06275721, "balance_loss_mlp": 0.01256079, "epoch": 0.9611303171501578, "flos": 20236195353600.0, "grad_norm": 2.0274131528634056, "language_loss": 0.7902571, "learning_rate": 1.580380726142283e-08, "loss": 0.86707366, "num_input_tokens_seen": 344907160, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09539795, "step": 15986, "time_per_iteration": 2.5083858966827393 }, { "auxiliary_loss_clip": 0.06412912, "auxiliary_loss_mlp": 0.0126707, "balance_loss_clip": 0.06275718, "balance_loss_mlp": 0.01256461, "epoch": 0.9611904404028258, "flos": 20955957926400.0, "grad_norm": 2.0469313259245734, "language_loss": 0.64339316, "learning_rate": 1.5754981226049792e-08, "loss": 0.72019303, "num_input_tokens_seen": 344922400, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.1060791, "step": 15987, "time_per_iteration": 2.507380485534668 }, { "auxiliary_loss_clip": 0.06410243, "auxiliary_loss_mlp": 0.01264685, "balance_loss_clip": 0.06277459, "balance_loss_mlp": 0.01255834, "epoch": 0.9612505636554938, "flos": 24834806006400.0, "grad_norm": 1.9629452152222373, "language_loss": 0.6720987, "learning_rate": 1.5706230433446544e-08, "loss": 0.74884796, "num_input_tokens_seen": 344941910, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.08850098, "step": 15988, "time_per_iteration": 2.6211025714874268 }, { "auxiliary_loss_clip": 0.06414405, "auxiliary_loss_mlp": 0.0126734, "balance_loss_clip": 0.06274359, "balance_loss_mlp": 0.01258614, "epoch": 0.9613106869081617, "flos": 17170636602240.0, "grad_norm": 1.7376956045690133, "language_loss": 0.74986589, "learning_rate": 1.5657554885462055e-08, "loss": 0.82668334, "num_input_tokens_seen": 344960020, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.08721924, "step": 15989, "time_per_iteration": 2.747509717941284 }, { "auxiliary_loss_clip": 0.06315985, "auxiliary_loss_mlp": 0.01253378, "balance_loss_clip": 0.06260838, "balance_loss_mlp": 0.01252342, "epoch": 0.9613708101608297, "flos": 61582279783680.0, "grad_norm": 0.8019598030443981, "language_loss": 0.63296807, "learning_rate": 1.5608954583941737e-08, "loss": 0.70866168, "num_input_tokens_seen": 345018290, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01036072, "step": 15990, "time_per_iteration": 3.154186725616455 }, { "auxiliary_loss_clip": 0.06410883, "auxiliary_loss_mlp": 0.01263371, "balance_loss_clip": 0.06271477, "balance_loss_mlp": 0.01254478, "epoch": 0.9614309334134977, "flos": 27425349561600.0, "grad_norm": 1.8056410856096268, "language_loss": 0.77975714, "learning_rate": 1.5560429530729003e-08, "loss": 0.85649967, "num_input_tokens_seen": 345040235, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.08892822, "step": 15991, "time_per_iteration": 4.184460878372192 }, { "auxiliary_loss_clip": 0.06422548, "auxiliary_loss_mlp": 0.01266466, "balance_loss_clip": 0.06277667, "balance_loss_mlp": 0.01255904, "epoch": 0.9614910566661656, "flos": 22825564951680.0, "grad_norm": 2.3319880590495803, "language_loss": 0.84457558, "learning_rate": 1.5511979727663493e-08, "loss": 0.92146575, "num_input_tokens_seen": 345054540, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.10565186, "step": 15992, "time_per_iteration": 2.8028228282928467 }, { "auxiliary_loss_clip": 0.06416756, "auxiliary_loss_mlp": 0.01267798, "balance_loss_clip": 0.06276853, "balance_loss_mlp": 0.01257517, "epoch": 0.9615511799188337, "flos": 20674090391040.0, "grad_norm": 2.1314820357031907, "language_loss": 0.72838533, "learning_rate": 1.5463605176582406e-08, "loss": 0.80523086, "num_input_tokens_seen": 345074035, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.10284424, "step": 15993, "time_per_iteration": 2.641911745071411 }, { "auxiliary_loss_clip": 0.06414522, "auxiliary_loss_mlp": 0.01264022, "balance_loss_clip": 0.06273626, "balance_loss_mlp": 0.01254556, "epoch": 0.9616113031715016, "flos": 33158123953920.0, "grad_norm": 1.5473948747628676, "language_loss": 0.6836825, "learning_rate": 1.5415305879320716e-08, "loss": 0.76046789, "num_input_tokens_seen": 345099270, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09466553, "step": 15994, "time_per_iteration": 2.7949795722961426 }, { "auxiliary_loss_clip": 0.06412039, "auxiliary_loss_mlp": 0.01264473, "balance_loss_clip": 0.06273462, "balance_loss_mlp": 0.01255717, "epoch": 0.9616714264241696, "flos": 25016843001600.0, "grad_norm": 1.9198132777327686, "language_loss": 0.84558237, "learning_rate": 1.5367081837709183e-08, "loss": 0.92234743, "num_input_tokens_seen": 345116975, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.08752441, "step": 15995, "time_per_iteration": 2.599072217941284 }, { "auxiliary_loss_clip": 0.06424179, "auxiliary_loss_mlp": 0.01266279, "balance_loss_clip": 0.06278972, "balance_loss_mlp": 0.01256266, "epoch": 0.9617315496768375, "flos": 13551629633280.0, "grad_norm": 1.8971196825837373, "language_loss": 0.76042765, "learning_rate": 1.5318933053576788e-08, "loss": 0.83733225, "num_input_tokens_seen": 345133645, "router_z_loss_clip": 1.45019531, "router_z_loss_mlp": 0.10015869, "step": 15996, "time_per_iteration": 3.896359443664551 }, { "auxiliary_loss_clip": 0.06410132, "auxiliary_loss_mlp": 0.01265949, "balance_loss_clip": 0.06271115, "balance_loss_mlp": 0.01255983, "epoch": 0.9617916729295055, "flos": 11259221304960.0, "grad_norm": 2.035332262575673, "language_loss": 0.77035356, "learning_rate": 1.52708595287494e-08, "loss": 0.84711432, "num_input_tokens_seen": 345150740, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09967041, "step": 15997, "time_per_iteration": 2.4940638542175293 }, { "auxiliary_loss_clip": 0.06406625, "auxiliary_loss_mlp": 0.01263673, "balance_loss_clip": 0.06272525, "balance_loss_mlp": 0.01255006, "epoch": 0.9618517961821734, "flos": 22826235784320.0, "grad_norm": 1.45679528153324, "language_loss": 0.67481446, "learning_rate": 1.522286126505001e-08, "loss": 0.75151742, "num_input_tokens_seen": 345170365, "router_z_loss_clip": 1.34082031, "router_z_loss_mlp": 0.08666992, "step": 15998, "time_per_iteration": 2.541325330734253 }, { "auxiliary_loss_clip": 0.06413092, "auxiliary_loss_mlp": 0.01264784, "balance_loss_clip": 0.06275479, "balance_loss_mlp": 0.01255194, "epoch": 0.9619119194348414, "flos": 16622848535040.0, "grad_norm": 2.5530515698878538, "language_loss": 0.73059595, "learning_rate": 1.5174938264298498e-08, "loss": 0.80737466, "num_input_tokens_seen": 345188930, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.0960083, "step": 15999, "time_per_iteration": 2.5459187030792236 }, { "auxiliary_loss_clip": 0.06407756, "auxiliary_loss_mlp": 0.0126721, "balance_loss_clip": 0.06273547, "balance_loss_mlp": 0.01258597, "epoch": 0.9619720426875094, "flos": 24542037440640.0, "grad_norm": 1.8646690488679787, "language_loss": 0.6589033, "learning_rate": 1.5127090528312514e-08, "loss": 0.73565304, "num_input_tokens_seen": 345209615, "router_z_loss_clip": 1.34179688, "router_z_loss_mlp": 0.08618164, "step": 16000, "time_per_iteration": 2.580972909927368 }, { "auxiliary_loss_clip": 0.06413538, "auxiliary_loss_mlp": 0.01263972, "balance_loss_clip": 0.0627344, "balance_loss_mlp": 0.01253577, "epoch": 0.9620321659401774, "flos": 20638647313920.0, "grad_norm": 1.6064669554357933, "language_loss": 0.75689065, "learning_rate": 1.5079318058905723e-08, "loss": 0.83366567, "num_input_tokens_seen": 345229175, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.10394287, "step": 16001, "time_per_iteration": 2.5295238494873047 }, { "auxiliary_loss_clip": 0.06413268, "auxiliary_loss_mlp": 0.01267191, "balance_loss_clip": 0.06274576, "balance_loss_mlp": 0.01257732, "epoch": 0.9620922891928453, "flos": 18521232238080.0, "grad_norm": 1.7413175200124684, "language_loss": 0.68232477, "learning_rate": 1.5031620857890447e-08, "loss": 0.75912941, "num_input_tokens_seen": 345247815, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09460449, "step": 16002, "time_per_iteration": 2.5019466876983643 }, { "auxiliary_loss_clip": 0.06414378, "auxiliary_loss_mlp": 0.01265226, "balance_loss_clip": 0.06277408, "balance_loss_mlp": 0.01255886, "epoch": 0.9621524124455133, "flos": 28774980875520.0, "grad_norm": 1.887117156815828, "language_loss": 0.64975727, "learning_rate": 1.4983998927074804e-08, "loss": 0.72655332, "num_input_tokens_seen": 345269935, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.09338379, "step": 16003, "time_per_iteration": 2.606667995452881 }, { "auxiliary_loss_clip": 0.06416208, "auxiliary_loss_mlp": 0.01264393, "balance_loss_clip": 0.06276953, "balance_loss_mlp": 0.01254594, "epoch": 0.9622125356981813, "flos": 19104882652800.0, "grad_norm": 1.9470498252903385, "language_loss": 0.7576369, "learning_rate": 1.493645226826512e-08, "loss": 0.83444291, "num_input_tokens_seen": 345288310, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09790039, "step": 16004, "time_per_iteration": 2.5293314456939697 }, { "auxiliary_loss_clip": 0.064116, "auxiliary_loss_mlp": 0.01265961, "balance_loss_clip": 0.06275355, "balance_loss_mlp": 0.01256395, "epoch": 0.9622726589508492, "flos": 20309010151680.0, "grad_norm": 1.768641361460055, "language_loss": 0.79920268, "learning_rate": 1.4888980883263958e-08, "loss": 0.87597829, "num_input_tokens_seen": 345306615, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.09552002, "step": 16005, "time_per_iteration": 2.528986930847168 }, { "auxiliary_loss_clip": 0.06413143, "auxiliary_loss_mlp": 0.01262384, "balance_loss_clip": 0.06277007, "balance_loss_mlp": 0.01253521, "epoch": 0.9623327822035173, "flos": 54942060401280.0, "grad_norm": 1.9670643201907805, "language_loss": 0.68161917, "learning_rate": 1.4841584773871652e-08, "loss": 0.75837445, "num_input_tokens_seen": 345331935, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.08874512, "step": 16006, "time_per_iteration": 2.8697474002838135 }, { "auxiliary_loss_clip": 0.06408949, "auxiliary_loss_mlp": 0.0126178, "balance_loss_clip": 0.06277353, "balance_loss_mlp": 0.01252834, "epoch": 0.9623929054561852, "flos": 21764928769920.0, "grad_norm": 1.4723011884778643, "language_loss": 0.7839784, "learning_rate": 1.479426394188521e-08, "loss": 0.86068565, "num_input_tokens_seen": 345351510, "router_z_loss_clip": 1.31445312, "router_z_loss_mlp": 0.08947754, "step": 16007, "time_per_iteration": 2.5326192378997803 }, { "auxiliary_loss_clip": 0.0641579, "auxiliary_loss_mlp": 0.01263954, "balance_loss_clip": 0.06275681, "balance_loss_mlp": 0.01253971, "epoch": 0.9624530287088532, "flos": 17937414115200.0, "grad_norm": 2.128284779228739, "language_loss": 0.67921501, "learning_rate": 1.4747018389099198e-08, "loss": 0.7560125, "num_input_tokens_seen": 345367750, "router_z_loss_clip": 1.40039062, "router_z_loss_mlp": 0.09979248, "step": 16008, "time_per_iteration": 2.5226945877075195 }, { "auxiliary_loss_clip": 0.06418048, "auxiliary_loss_mlp": 0.0126888, "balance_loss_clip": 0.06276989, "balance_loss_mlp": 0.01257901, "epoch": 0.9625131519615211, "flos": 23259686555520.0, "grad_norm": 2.041865780828123, "language_loss": 0.72966993, "learning_rate": 1.469984811730529e-08, "loss": 0.80653918, "num_input_tokens_seen": 345384790, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.10986328, "step": 16009, "time_per_iteration": 2.5591678619384766 }, { "auxiliary_loss_clip": 0.06413057, "auxiliary_loss_mlp": 0.01263344, "balance_loss_clip": 0.06275869, "balance_loss_mlp": 0.0125398, "epoch": 0.9625732752141891, "flos": 18922636022400.0, "grad_norm": 1.802436594901151, "language_loss": 0.75561035, "learning_rate": 1.4652753128292061e-08, "loss": 0.83237433, "num_input_tokens_seen": 345403390, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09368896, "step": 16010, "time_per_iteration": 2.522686004638672 }, { "auxiliary_loss_clip": 0.06418857, "auxiliary_loss_mlp": 0.01267455, "balance_loss_clip": 0.06275785, "balance_loss_mlp": 0.01256058, "epoch": 0.962633398466857, "flos": 16258439128320.0, "grad_norm": 1.7592757871854074, "language_loss": 0.69933361, "learning_rate": 1.4605733423845635e-08, "loss": 0.77619672, "num_input_tokens_seen": 345418685, "router_z_loss_clip": 1.4296875, "router_z_loss_mlp": 0.1138916, "step": 16011, "time_per_iteration": 2.5038630962371826 }, { "auxiliary_loss_clip": 0.06413554, "auxiliary_loss_mlp": 0.0126417, "balance_loss_clip": 0.06276319, "balance_loss_mlp": 0.0125514, "epoch": 0.962693521719525, "flos": 54209174664960.0, "grad_norm": 1.8893610203067577, "language_loss": 0.68869245, "learning_rate": 1.4558789005748585e-08, "loss": 0.76546967, "num_input_tokens_seen": 345442380, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.090271, "step": 16012, "time_per_iteration": 2.810809373855591 }, { "auxiliary_loss_clip": 0.06423386, "auxiliary_loss_mlp": 0.01266559, "balance_loss_clip": 0.06278643, "balance_loss_mlp": 0.01255657, "epoch": 0.962753644972193, "flos": 33113540782080.0, "grad_norm": 1.7657177772295731, "language_loss": 0.72411394, "learning_rate": 1.4511919875781264e-08, "loss": 0.80101335, "num_input_tokens_seen": 345463815, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.10894775, "step": 16013, "time_per_iteration": 2.632399320602417 }, { "auxiliary_loss_clip": 0.06412499, "auxiliary_loss_mlp": 0.0126481, "balance_loss_clip": 0.06274439, "balance_loss_mlp": 0.01254552, "epoch": 0.962813768224861, "flos": 42240504839040.0, "grad_norm": 2.7000019509609348, "language_loss": 0.63849366, "learning_rate": 1.4465126035720698e-08, "loss": 0.71526676, "num_input_tokens_seen": 345484525, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.1026001, "step": 16014, "time_per_iteration": 2.7176856994628906 }, { "auxiliary_loss_clip": 0.06406854, "auxiliary_loss_mlp": 0.01267225, "balance_loss_clip": 0.06272389, "balance_loss_mlp": 0.01258618, "epoch": 0.9628738914775289, "flos": 43954671340800.0, "grad_norm": 1.655925779239756, "language_loss": 0.72381806, "learning_rate": 1.4418407487341688e-08, "loss": 0.80055887, "num_input_tokens_seen": 345508295, "router_z_loss_clip": 1.34277344, "router_z_loss_mlp": 0.08599854, "step": 16015, "time_per_iteration": 2.729017734527588 }, { "auxiliary_loss_clip": 0.06412238, "auxiliary_loss_mlp": 0.01263247, "balance_loss_clip": 0.06273592, "balance_loss_mlp": 0.01253574, "epoch": 0.9629340147301969, "flos": 15601596572160.0, "grad_norm": 1.9232261542934528, "language_loss": 0.77334774, "learning_rate": 1.4371764232415707e-08, "loss": 0.85010254, "num_input_tokens_seen": 345525155, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09674072, "step": 16016, "time_per_iteration": 3.9618122577667236 }, { "auxiliary_loss_clip": 0.06317902, "auxiliary_loss_mlp": 0.01251793, "balance_loss_clip": 0.06262843, "balance_loss_mlp": 0.01250747, "epoch": 0.9629941379828649, "flos": 62969827870080.0, "grad_norm": 0.81411881035005, "language_loss": 0.63088357, "learning_rate": 1.4325196272711337e-08, "loss": 0.70658052, "num_input_tokens_seen": 345578905, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.01046753, "step": 16017, "time_per_iteration": 3.081264019012451 }, { "auxiliary_loss_clip": 0.06414481, "auxiliary_loss_mlp": 0.01265002, "balance_loss_clip": 0.06274903, "balance_loss_mlp": 0.01255167, "epoch": 0.9630542612355328, "flos": 29907006336000.0, "grad_norm": 1.7480511224070132, "language_loss": 0.66763449, "learning_rate": 1.4278703609994502e-08, "loss": 0.74442929, "num_input_tokens_seen": 345598965, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09838867, "step": 16018, "time_per_iteration": 2.666621208190918 }, { "auxiliary_loss_clip": 0.06414829, "auxiliary_loss_mlp": 0.0126191, "balance_loss_clip": 0.06275356, "balance_loss_mlp": 0.01252606, "epoch": 0.9631143844882009, "flos": 17900335883520.0, "grad_norm": 1.9349816376200524, "language_loss": 0.79709691, "learning_rate": 1.4232286246028457e-08, "loss": 0.87386435, "num_input_tokens_seen": 345617945, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09307861, "step": 16019, "time_per_iteration": 2.5562736988067627 }, { "auxiliary_loss_clip": 0.06409518, "auxiliary_loss_mlp": 0.01264425, "balance_loss_clip": 0.06274458, "balance_loss_mlp": 0.01255973, "epoch": 0.9631745077408688, "flos": 26146101277440.0, "grad_norm": 1.408366531395443, "language_loss": 0.71872413, "learning_rate": 1.4185944182572907e-08, "loss": 0.7954635, "num_input_tokens_seen": 345637920, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.08459473, "step": 16020, "time_per_iteration": 2.6815595626831055 }, { "auxiliary_loss_clip": 0.06415051, "auxiliary_loss_mlp": 0.01269657, "balance_loss_clip": 0.06276035, "balance_loss_mlp": 0.01260055, "epoch": 0.9632346309935368, "flos": 24980729091840.0, "grad_norm": 1.6082812563095927, "language_loss": 0.77206188, "learning_rate": 1.4139677421385331e-08, "loss": 0.84890902, "num_input_tokens_seen": 345656195, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09606934, "step": 16021, "time_per_iteration": 4.045653581619263 }, { "auxiliary_loss_clip": 0.06423344, "auxiliary_loss_mlp": 0.01265639, "balance_loss_clip": 0.0627753, "balance_loss_mlp": 0.01255608, "epoch": 0.9632947542462047, "flos": 23623005859200.0, "grad_norm": 4.969233053264329, "language_loss": 0.652565, "learning_rate": 1.4093485964220331e-08, "loss": 0.72945482, "num_input_tokens_seen": 345676700, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.1003418, "step": 16022, "time_per_iteration": 2.570948839187622 }, { "auxiliary_loss_clip": 0.064137, "auxiliary_loss_mlp": 0.01263908, "balance_loss_clip": 0.06276613, "balance_loss_mlp": 0.01254663, "epoch": 0.9633548774988727, "flos": 26402755933440.0, "grad_norm": 2.3596612073381555, "language_loss": 0.73374677, "learning_rate": 1.4047369812829168e-08, "loss": 0.81052279, "num_input_tokens_seen": 345696725, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.09246826, "step": 16023, "time_per_iteration": 2.578350067138672 }, { "auxiliary_loss_clip": 0.06409009, "auxiliary_loss_mlp": 0.01265284, "balance_loss_clip": 0.06273133, "balance_loss_mlp": 0.01256242, "epoch": 0.9634150007515406, "flos": 23774295605760.0, "grad_norm": 1.5419574657760764, "language_loss": 0.8181507, "learning_rate": 1.4001328968960891e-08, "loss": 0.89489359, "num_input_tokens_seen": 345716245, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.09039307, "step": 16024, "time_per_iteration": 2.5459601879119873 }, { "auxiliary_loss_clip": 0.06423611, "auxiliary_loss_mlp": 0.01264683, "balance_loss_clip": 0.06279816, "balance_loss_mlp": 0.01254932, "epoch": 0.9634751240042086, "flos": 24142436519040.0, "grad_norm": 1.591852308870974, "language_loss": 0.8166604, "learning_rate": 1.3955363434361212e-08, "loss": 0.89354336, "num_input_tokens_seen": 345739060, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.09753418, "step": 16025, "time_per_iteration": 2.6182451248168945 }, { "auxiliary_loss_clip": 0.06418927, "auxiliary_loss_mlp": 0.01265192, "balance_loss_clip": 0.06275697, "balance_loss_mlp": 0.0125587, "epoch": 0.9635352472568766, "flos": 24355346544000.0, "grad_norm": 1.794473467700276, "language_loss": 0.77100021, "learning_rate": 1.3909473210773181e-08, "loss": 0.84784138, "num_input_tokens_seen": 345758325, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09320068, "step": 16026, "time_per_iteration": 2.6341285705566406 }, { "auxiliary_loss_clip": 0.06413812, "auxiliary_loss_mlp": 0.01273317, "balance_loss_clip": 0.06274411, "balance_loss_mlp": 0.01263048, "epoch": 0.9635953705095446, "flos": 23991062918400.0, "grad_norm": 1.9073869436675157, "language_loss": 0.63459718, "learning_rate": 1.3863658299936965e-08, "loss": 0.71146846, "num_input_tokens_seen": 345778530, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.10272217, "step": 16027, "time_per_iteration": 2.603064775466919 }, { "auxiliary_loss_clip": 0.06418765, "auxiliary_loss_mlp": 0.01266784, "balance_loss_clip": 0.06276852, "balance_loss_mlp": 0.01257242, "epoch": 0.9636554937622125, "flos": 19834540007040.0, "grad_norm": 1.8351887294579559, "language_loss": 0.87339288, "learning_rate": 1.3817918703589837e-08, "loss": 0.9502483, "num_input_tokens_seen": 345796535, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.09545898, "step": 16028, "time_per_iteration": 2.514052629470825 }, { "auxiliary_loss_clip": 0.06315552, "auxiliary_loss_mlp": 0.01253197, "balance_loss_clip": 0.06260248, "balance_loss_mlp": 0.01252137, "epoch": 0.9637156170148805, "flos": 67454520497280.0, "grad_norm": 0.6723757290528625, "language_loss": 0.53129804, "learning_rate": 1.3772254423466412e-08, "loss": 0.60698557, "num_input_tokens_seen": 345859700, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01061249, "step": 16029, "time_per_iteration": 3.157257318496704 }, { "auxiliary_loss_clip": 0.06417898, "auxiliary_loss_mlp": 0.01263883, "balance_loss_clip": 0.06276622, "balance_loss_mlp": 0.01253911, "epoch": 0.9637757402675484, "flos": 20306788018560.0, "grad_norm": 1.4463382828826632, "language_loss": 0.74183047, "learning_rate": 1.372666546129797e-08, "loss": 0.81864822, "num_input_tokens_seen": 345878760, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09979248, "step": 16030, "time_per_iteration": 2.6018593311309814 }, { "auxiliary_loss_clip": 0.06412792, "auxiliary_loss_mlp": 0.01265294, "balance_loss_clip": 0.06276211, "balance_loss_mlp": 0.01255984, "epoch": 0.9638358635202164, "flos": 27241803192960.0, "grad_norm": 1.5317555369116393, "language_loss": 0.65900445, "learning_rate": 1.3681151818813575e-08, "loss": 0.73578537, "num_input_tokens_seen": 345900445, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.09313965, "step": 16031, "time_per_iteration": 4.020009279251099 }, { "auxiliary_loss_clip": 0.06318939, "auxiliary_loss_mlp": 0.01253348, "balance_loss_clip": 0.06263835, "balance_loss_mlp": 0.01252325, "epoch": 0.9638959867728845, "flos": 70309768700160.0, "grad_norm": 0.8452190867261276, "language_loss": 0.60669529, "learning_rate": 1.3635713497738955e-08, "loss": 0.68241823, "num_input_tokens_seen": 345961020, "router_z_loss_clip": 0.55126953, "router_z_loss_mlp": 0.01023865, "step": 16032, "time_per_iteration": 3.2109858989715576 }, { "auxiliary_loss_clip": 0.06405345, "auxiliary_loss_mlp": 0.01266549, "balance_loss_clip": 0.06274544, "balance_loss_mlp": 0.01257662, "epoch": 0.9639561100255524, "flos": 25414012154880.0, "grad_norm": 1.8124296975165786, "language_loss": 0.6698125, "learning_rate": 1.3590350499796954e-08, "loss": 0.74653143, "num_input_tokens_seen": 345980210, "router_z_loss_clip": 1.30957031, "router_z_loss_mlp": 0.08886719, "step": 16033, "time_per_iteration": 2.5659825801849365 }, { "auxiliary_loss_clip": 0.06415023, "auxiliary_loss_mlp": 0.01263421, "balance_loss_clip": 0.06277308, "balance_loss_mlp": 0.01254123, "epoch": 0.9640162332782204, "flos": 18119744599680.0, "grad_norm": 2.0073957478390234, "language_loss": 0.65916765, "learning_rate": 1.3545062826707976e-08, "loss": 0.73595214, "num_input_tokens_seen": 345998280, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09301758, "step": 16034, "time_per_iteration": 2.5152053833007812 }, { "auxiliary_loss_clip": 0.06408831, "auxiliary_loss_mlp": 0.01264996, "balance_loss_clip": 0.06272203, "balance_loss_mlp": 0.01255704, "epoch": 0.9640763565308883, "flos": 23446964430720.0, "grad_norm": 2.2494924425759857, "language_loss": 0.74232686, "learning_rate": 1.3499850480189313e-08, "loss": 0.81906515, "num_input_tokens_seen": 346015545, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.09295654, "step": 16035, "time_per_iteration": 2.582139730453491 }, { "auxiliary_loss_clip": 0.06413411, "auxiliary_loss_mlp": 0.01262654, "balance_loss_clip": 0.06276161, "balance_loss_mlp": 0.01254, "epoch": 0.9641364797835563, "flos": 22425964030080.0, "grad_norm": 1.9559150813666988, "language_loss": 0.82691151, "learning_rate": 1.3454713461955591e-08, "loss": 0.90367216, "num_input_tokens_seen": 346034055, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.08648682, "step": 16036, "time_per_iteration": 3.913437604904175 }, { "auxiliary_loss_clip": 0.06412864, "auxiliary_loss_mlp": 0.01264115, "balance_loss_clip": 0.06273465, "balance_loss_mlp": 0.01254715, "epoch": 0.9641966030362242, "flos": 30629284531200.0, "grad_norm": 1.7866990145405717, "language_loss": 0.70300281, "learning_rate": 1.340965177371789e-08, "loss": 0.77977258, "num_input_tokens_seen": 346054130, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09393311, "step": 16037, "time_per_iteration": 2.6344668865203857 }, { "auxiliary_loss_clip": 0.06412497, "auxiliary_loss_mlp": 0.01266496, "balance_loss_clip": 0.06272255, "balance_loss_mlp": 0.0125734, "epoch": 0.9642567262888923, "flos": 20958347767680.0, "grad_norm": 1.9969584895208945, "language_loss": 0.63419336, "learning_rate": 1.3364665417185506e-08, "loss": 0.71098328, "num_input_tokens_seen": 346072990, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09155273, "step": 16038, "time_per_iteration": 2.5861995220184326 }, { "auxiliary_loss_clip": 0.06417625, "auxiliary_loss_mlp": 0.01266606, "balance_loss_clip": 0.06276773, "balance_loss_mlp": 0.0125639, "epoch": 0.9643168495415602, "flos": 22646253214080.0, "grad_norm": 1.5412089153604829, "language_loss": 0.71145761, "learning_rate": 1.3319754394064187e-08, "loss": 0.78829992, "num_input_tokens_seen": 346093745, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.10211182, "step": 16039, "time_per_iteration": 2.6746721267700195 }, { "auxiliary_loss_clip": 0.06419864, "auxiliary_loss_mlp": 0.01266987, "balance_loss_clip": 0.0627895, "balance_loss_mlp": 0.01256586, "epoch": 0.9643769727942282, "flos": 20272435044480.0, "grad_norm": 1.9978276395758228, "language_loss": 0.73280102, "learning_rate": 1.327491870605657e-08, "loss": 0.80966955, "num_input_tokens_seen": 346110115, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.10406494, "step": 16040, "time_per_iteration": 2.595404624938965 }, { "auxiliary_loss_clip": 0.06416129, "auxiliary_loss_mlp": 0.01263492, "balance_loss_clip": 0.06274576, "balance_loss_mlp": 0.01254396, "epoch": 0.9644370960468961, "flos": 13887052727040.0, "grad_norm": 2.287605169500207, "language_loss": 0.73465818, "learning_rate": 1.3230158354863296e-08, "loss": 0.81145442, "num_input_tokens_seen": 346127165, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09094238, "step": 16041, "time_per_iteration": 2.566572666168213 }, { "auxiliary_loss_clip": 0.06403901, "auxiliary_loss_mlp": 0.01263515, "balance_loss_clip": 0.06273086, "balance_loss_mlp": 0.01254622, "epoch": 0.9644972192995641, "flos": 17243912597760.0, "grad_norm": 1.9665832256846185, "language_loss": 0.72352058, "learning_rate": 1.3185473342181674e-08, "loss": 0.80019474, "num_input_tokens_seen": 346145950, "router_z_loss_clip": 1.30859375, "router_z_loss_mlp": 0.08892822, "step": 16042, "time_per_iteration": 2.6495232582092285 }, { "auxiliary_loss_clip": 0.06419733, "auxiliary_loss_mlp": 0.01264268, "balance_loss_clip": 0.06276338, "balance_loss_mlp": 0.01255434, "epoch": 0.964557342552232, "flos": 23846858841600.0, "grad_norm": 1.655943548194597, "language_loss": 0.81264108, "learning_rate": 1.3140863669705683e-08, "loss": 0.88948107, "num_input_tokens_seen": 346165005, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.08831787, "step": 16043, "time_per_iteration": 2.5713984966278076 }, { "auxiliary_loss_clip": 0.06418061, "auxiliary_loss_mlp": 0.0126596, "balance_loss_clip": 0.06280527, "balance_loss_mlp": 0.0125712, "epoch": 0.9646174658049, "flos": 21659605787520.0, "grad_norm": 8.194663375919578, "language_loss": 0.71881908, "learning_rate": 1.3096329339127522e-08, "loss": 0.7956593, "num_input_tokens_seen": 346185095, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.08840942, "step": 16044, "time_per_iteration": 2.646616220474243 }, { "auxiliary_loss_clip": 0.06410256, "auxiliary_loss_mlp": 0.01262655, "balance_loss_clip": 0.06273539, "balance_loss_mlp": 0.01253393, "epoch": 0.9646775890575681, "flos": 17135403160320.0, "grad_norm": 1.7850734829635722, "language_loss": 0.70517653, "learning_rate": 1.3051870352135397e-08, "loss": 0.78190559, "num_input_tokens_seen": 346202580, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.09259033, "step": 16045, "time_per_iteration": 2.503554105758667 }, { "auxiliary_loss_clip": 0.06414974, "auxiliary_loss_mlp": 0.01263154, "balance_loss_clip": 0.06275613, "balance_loss_mlp": 0.01254332, "epoch": 0.964737712310236, "flos": 13010717600640.0, "grad_norm": 1.9642119640175844, "language_loss": 0.75458443, "learning_rate": 1.3007486710415737e-08, "loss": 0.83136576, "num_input_tokens_seen": 346219395, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.0881958, "step": 16046, "time_per_iteration": 2.556116819381714 }, { "auxiliary_loss_clip": 0.06414834, "auxiliary_loss_mlp": 0.01266488, "balance_loss_clip": 0.06274515, "balance_loss_mlp": 0.01256981, "epoch": 0.964797835562904, "flos": 24286011690240.0, "grad_norm": 1.5960857678198337, "language_loss": 0.62914675, "learning_rate": 1.2963178415651199e-08, "loss": 0.70596004, "num_input_tokens_seen": 346239715, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09509277, "step": 16047, "time_per_iteration": 2.5740461349487305 }, { "auxiliary_loss_clip": 0.06418565, "auxiliary_loss_mlp": 0.01265516, "balance_loss_clip": 0.06280594, "balance_loss_mlp": 0.01256468, "epoch": 0.9648579588155719, "flos": 20529089700480.0, "grad_norm": 2.0128622842910193, "language_loss": 0.69749296, "learning_rate": 1.2918945469521992e-08, "loss": 0.77433372, "num_input_tokens_seen": 346258500, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.0904541, "step": 16048, "time_per_iteration": 2.5600991249084473 }, { "auxiliary_loss_clip": 0.0641648, "auxiliary_loss_mlp": 0.01267608, "balance_loss_clip": 0.06274614, "balance_loss_mlp": 0.01257588, "epoch": 0.9649180820682399, "flos": 32162042943360.0, "grad_norm": 1.9864968568329284, "language_loss": 0.64233011, "learning_rate": 1.2874787873705662e-08, "loss": 0.71917099, "num_input_tokens_seen": 346279110, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.10028076, "step": 16049, "time_per_iteration": 2.6481239795684814 }, { "auxiliary_loss_clip": 0.06416679, "auxiliary_loss_mlp": 0.01262271, "balance_loss_clip": 0.06276045, "balance_loss_mlp": 0.01253301, "epoch": 0.9649782053209078, "flos": 20528963919360.0, "grad_norm": 1.5416132298201684, "language_loss": 0.71368754, "learning_rate": 1.2830705629876427e-08, "loss": 0.79047704, "num_input_tokens_seen": 346297860, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.08972168, "step": 16050, "time_per_iteration": 2.5635411739349365 }, { "auxiliary_loss_clip": 0.06422451, "auxiliary_loss_mlp": 0.01264559, "balance_loss_clip": 0.06278585, "balance_loss_mlp": 0.01254134, "epoch": 0.9650383285735759, "flos": 43077623454720.0, "grad_norm": 1.8173185380966914, "language_loss": 0.70109838, "learning_rate": 1.278669873970606e-08, "loss": 0.77796847, "num_input_tokens_seen": 346319860, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.10437012, "step": 16051, "time_per_iteration": 2.7453603744506836 }, { "auxiliary_loss_clip": 0.06315763, "auxiliary_loss_mlp": 0.01251864, "balance_loss_clip": 0.06260448, "balance_loss_mlp": 0.01250902, "epoch": 0.9650984518262438, "flos": 61767083963520.0, "grad_norm": 0.8159463538059478, "language_loss": 0.59163904, "learning_rate": 1.2742767204863004e-08, "loss": 0.66731536, "num_input_tokens_seen": 346379025, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00960541, "step": 16052, "time_per_iteration": 3.20780086517334 }, { "auxiliary_loss_clip": 0.06409713, "auxiliary_loss_mlp": 0.01263381, "balance_loss_clip": 0.06273445, "balance_loss_mlp": 0.012545, "epoch": 0.9651585750789118, "flos": 29797155233280.0, "grad_norm": 1.5630015333361267, "language_loss": 0.74727643, "learning_rate": 1.2698911027013482e-08, "loss": 0.82400739, "num_input_tokens_seen": 346402250, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.08874512, "step": 16053, "time_per_iteration": 2.6599621772766113 }, { "auxiliary_loss_clip": 0.06414229, "auxiliary_loss_mlp": 0.01262837, "balance_loss_clip": 0.06272884, "balance_loss_mlp": 0.01253539, "epoch": 0.9652186983315797, "flos": 16878664650240.0, "grad_norm": 2.034201148835221, "language_loss": 0.68641961, "learning_rate": 1.2655130207820386e-08, "loss": 0.76319033, "num_input_tokens_seen": 346419555, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09307861, "step": 16054, "time_per_iteration": 2.52360463142395 }, { "auxiliary_loss_clip": 0.06411099, "auxiliary_loss_mlp": 0.01262172, "balance_loss_clip": 0.06273673, "balance_loss_mlp": 0.01253863, "epoch": 0.9652788215842477, "flos": 31657831799040.0, "grad_norm": 1.5574996339777645, "language_loss": 0.61994314, "learning_rate": 1.2611424748943944e-08, "loss": 0.69667578, "num_input_tokens_seen": 346441245, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.08306885, "step": 16055, "time_per_iteration": 4.075573921203613 }, { "auxiliary_loss_clip": 0.06410392, "auxiliary_loss_mlp": 0.01263853, "balance_loss_clip": 0.06273955, "balance_loss_mlp": 0.012549, "epoch": 0.9653389448369156, "flos": 24761236521600.0, "grad_norm": 2.0979676927347772, "language_loss": 0.76960456, "learning_rate": 1.2567794652041719e-08, "loss": 0.84634703, "num_input_tokens_seen": 346460065, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.08953857, "step": 16056, "time_per_iteration": 2.542222738265991 }, { "auxiliary_loss_clip": 0.06411803, "auxiliary_loss_mlp": 0.01269126, "balance_loss_clip": 0.06273243, "balance_loss_mlp": 0.0125993, "epoch": 0.9653990680895836, "flos": 20302511460480.0, "grad_norm": 1.5559787036559418, "language_loss": 0.72114986, "learning_rate": 1.2524239918767498e-08, "loss": 0.79795909, "num_input_tokens_seen": 346478005, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09191895, "step": 16057, "time_per_iteration": 2.535431385040283 }, { "auxiliary_loss_clip": 0.06410316, "auxiliary_loss_mlp": 0.012654, "balance_loss_clip": 0.0627338, "balance_loss_mlp": 0.01255952, "epoch": 0.9654591913422517, "flos": 22535395862400.0, "grad_norm": 1.7031429887838423, "language_loss": 0.71955842, "learning_rate": 1.2480760550773295e-08, "loss": 0.79631555, "num_input_tokens_seen": 346497575, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.09460449, "step": 16058, "time_per_iteration": 2.5427451133728027 }, { "auxiliary_loss_clip": 0.06410592, "auxiliary_loss_mlp": 0.01262702, "balance_loss_clip": 0.06275662, "balance_loss_mlp": 0.01253642, "epoch": 0.9655193145949196, "flos": 26770645284480.0, "grad_norm": 1.4494277101391477, "language_loss": 0.7434063, "learning_rate": 1.2437356549708011e-08, "loss": 0.82013923, "num_input_tokens_seen": 346520000, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.09057617, "step": 16059, "time_per_iteration": 2.6099274158477783 }, { "auxiliary_loss_clip": 0.06420363, "auxiliary_loss_mlp": 0.01263903, "balance_loss_clip": 0.06276978, "balance_loss_mlp": 0.01254408, "epoch": 0.9655794378475876, "flos": 41979741333120.0, "grad_norm": 1.6780348126377855, "language_loss": 0.73770416, "learning_rate": 1.239402791721722e-08, "loss": 0.81454676, "num_input_tokens_seen": 346541605, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.09484863, "step": 16060, "time_per_iteration": 4.169268846511841 }, { "auxiliary_loss_clip": 0.06405721, "auxiliary_loss_mlp": 0.01261726, "balance_loss_clip": 0.06271832, "balance_loss_mlp": 0.01253596, "epoch": 0.9656395611002555, "flos": 27716860316160.0, "grad_norm": 1.5074487534167595, "language_loss": 0.76461428, "learning_rate": 1.2350774654944273e-08, "loss": 0.84128881, "num_input_tokens_seen": 346560955, "router_z_loss_clip": 1.34082031, "router_z_loss_mlp": 0.0814209, "step": 16061, "time_per_iteration": 2.570119857788086 }, { "auxiliary_loss_clip": 0.06317985, "auxiliary_loss_mlp": 0.0125226, "balance_loss_clip": 0.06262653, "balance_loss_mlp": 0.01251169, "epoch": 0.9656996843529235, "flos": 68987949742080.0, "grad_norm": 0.7130906805610955, "language_loss": 0.6408115, "learning_rate": 1.2307596764528749e-08, "loss": 0.71651393, "num_input_tokens_seen": 346621615, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01093292, "step": 16062, "time_per_iteration": 3.2146494388580322 }, { "auxiliary_loss_clip": 0.06407955, "auxiliary_loss_mlp": 0.0126628, "balance_loss_clip": 0.0627517, "balance_loss_mlp": 0.01257757, "epoch": 0.9657598076055914, "flos": 20637599137920.0, "grad_norm": 1.9734508681660035, "language_loss": 0.93189961, "learning_rate": 1.226449424760867e-08, "loss": 1.00864196, "num_input_tokens_seen": 346637460, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.08526611, "step": 16063, "time_per_iteration": 2.5317156314849854 }, { "auxiliary_loss_clip": 0.06417108, "auxiliary_loss_mlp": 0.01269206, "balance_loss_clip": 0.06278226, "balance_loss_mlp": 0.01260045, "epoch": 0.9658199308582595, "flos": 20454765528960.0, "grad_norm": 1.8551937151550753, "language_loss": 0.82210296, "learning_rate": 1.2221467105818062e-08, "loss": 0.89896619, "num_input_tokens_seen": 346655625, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.0916748, "step": 16064, "time_per_iteration": 2.554917335510254 }, { "auxiliary_loss_clip": 0.06413838, "auxiliary_loss_mlp": 0.01261261, "balance_loss_clip": 0.06278118, "balance_loss_mlp": 0.01252094, "epoch": 0.9658800541109274, "flos": 24725038757760.0, "grad_norm": 1.4988754064058079, "language_loss": 0.84411412, "learning_rate": 1.2178515340788731e-08, "loss": 0.92086518, "num_input_tokens_seen": 346675220, "router_z_loss_clip": 1.35742188, "router_z_loss_mlp": 0.0916748, "step": 16065, "time_per_iteration": 2.578040599822998 }, { "auxiliary_loss_clip": 0.06414488, "auxiliary_loss_mlp": 0.01263218, "balance_loss_clip": 0.0627529, "balance_loss_mlp": 0.01254271, "epoch": 0.9659401773635954, "flos": 21615399959040.0, "grad_norm": 2.082011957928731, "language_loss": 0.67730999, "learning_rate": 1.2135638954149151e-08, "loss": 0.75408697, "num_input_tokens_seen": 346694710, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.08947754, "step": 16066, "time_per_iteration": 2.5402519702911377 }, { "auxiliary_loss_clip": 0.06410807, "auxiliary_loss_mlp": 0.01268554, "balance_loss_clip": 0.06272882, "balance_loss_mlp": 0.01259458, "epoch": 0.9660003006162633, "flos": 20307123434880.0, "grad_norm": 1.6848177763569172, "language_loss": 0.82177007, "learning_rate": 1.209283794752558e-08, "loss": 0.89856374, "num_input_tokens_seen": 346712645, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09091187, "step": 16067, "time_per_iteration": 2.5313446521759033 }, { "auxiliary_loss_clip": 0.06413782, "auxiliary_loss_mlp": 0.01264185, "balance_loss_clip": 0.06276046, "balance_loss_mlp": 0.01255113, "epoch": 0.9660604238689313, "flos": 24468803372160.0, "grad_norm": 1.8074160377707402, "language_loss": 0.6934697, "learning_rate": 1.2050112322540496e-08, "loss": 0.77024937, "num_input_tokens_seen": 346732375, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09069824, "step": 16068, "time_per_iteration": 2.5890583992004395 }, { "auxiliary_loss_clip": 0.064057, "auxiliary_loss_mlp": 0.01266706, "balance_loss_clip": 0.06275673, "balance_loss_mlp": 0.01258749, "epoch": 0.9661205471215992, "flos": 19869983084160.0, "grad_norm": 1.8386201580938366, "language_loss": 0.6802327, "learning_rate": 1.20074620808146e-08, "loss": 0.7569567, "num_input_tokens_seen": 346750430, "router_z_loss_clip": 1.30175781, "router_z_loss_mlp": 0.07952881, "step": 16069, "time_per_iteration": 2.5247373580932617 }, { "auxiliary_loss_clip": 0.06410424, "auxiliary_loss_mlp": 0.012644, "balance_loss_clip": 0.0627323, "balance_loss_mlp": 0.01254732, "epoch": 0.9661806703742672, "flos": 20564071580160.0, "grad_norm": 1.7427674191634162, "language_loss": 0.89365268, "learning_rate": 1.1964887223964826e-08, "loss": 0.97040093, "num_input_tokens_seen": 346768455, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09667969, "step": 16070, "time_per_iteration": 2.5441458225250244 }, { "auxiliary_loss_clip": 0.06413795, "auxiliary_loss_mlp": 0.01266201, "balance_loss_clip": 0.06273793, "balance_loss_mlp": 0.01255347, "epoch": 0.9662407936269353, "flos": 21436842908160.0, "grad_norm": 1.7602421306515768, "language_loss": 0.77245533, "learning_rate": 1.1922387753605878e-08, "loss": 0.84925532, "num_input_tokens_seen": 346786530, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.10845947, "step": 16071, "time_per_iteration": 4.058960437774658 }, { "auxiliary_loss_clip": 0.06408139, "auxiliary_loss_mlp": 0.01264301, "balance_loss_clip": 0.06272801, "balance_loss_mlp": 0.01254514, "epoch": 0.9663009168796032, "flos": 14908178908800.0, "grad_norm": 1.7397486548756396, "language_loss": 0.66100204, "learning_rate": 1.1879963671349137e-08, "loss": 0.73772645, "num_input_tokens_seen": 346804635, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.09790039, "step": 16072, "time_per_iteration": 2.495800256729126 }, { "auxiliary_loss_clip": 0.06418809, "auxiliary_loss_mlp": 0.01266102, "balance_loss_clip": 0.06278478, "balance_loss_mlp": 0.01256714, "epoch": 0.9663610401322712, "flos": 24316842792960.0, "grad_norm": 1.737554526945808, "language_loss": 0.77841693, "learning_rate": 1.1837614978803534e-08, "loss": 0.85526603, "num_input_tokens_seen": 346823070, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09381104, "step": 16073, "time_per_iteration": 2.5573790073394775 }, { "auxiliary_loss_clip": 0.06423602, "auxiliary_loss_mlp": 0.01265374, "balance_loss_clip": 0.06281342, "balance_loss_mlp": 0.01255145, "epoch": 0.9664211633849391, "flos": 17643345811200.0, "grad_norm": 4.968425802334965, "language_loss": 0.75913489, "learning_rate": 1.1795341677574677e-08, "loss": 0.83602458, "num_input_tokens_seen": 346841180, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10229492, "step": 16074, "time_per_iteration": 2.5153865814208984 }, { "auxiliary_loss_clip": 0.06412791, "auxiliary_loss_mlp": 0.0126584, "balance_loss_clip": 0.06273317, "balance_loss_mlp": 0.01256011, "epoch": 0.9664812866376071, "flos": 29797239087360.0, "grad_norm": 1.8615742412648988, "language_loss": 0.75845599, "learning_rate": 1.1753143769265728e-08, "loss": 0.83524227, "num_input_tokens_seen": 346864250, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09820557, "step": 16075, "time_per_iteration": 3.9824299812316895 }, { "auxiliary_loss_clip": 0.06415658, "auxiliary_loss_mlp": 0.01266113, "balance_loss_clip": 0.06274345, "balance_loss_mlp": 0.01256976, "epoch": 0.966541409890275, "flos": 14287450262400.0, "grad_norm": 1.888535369844185, "language_loss": 0.78886354, "learning_rate": 1.171102125547696e-08, "loss": 0.86568123, "num_input_tokens_seen": 346881955, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09130859, "step": 16076, "time_per_iteration": 2.5055902004241943 }, { "auxiliary_loss_clip": 0.06415714, "auxiliary_loss_mlp": 0.01264942, "balance_loss_clip": 0.06275865, "balance_loss_mlp": 0.01254934, "epoch": 0.9666015331429431, "flos": 19865790380160.0, "grad_norm": 1.6135314220374595, "language_loss": 0.72390378, "learning_rate": 1.166897413780532e-08, "loss": 0.80071032, "num_input_tokens_seen": 346900445, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.10009766, "step": 16077, "time_per_iteration": 2.5385005474090576 }, { "auxiliary_loss_clip": 0.06411783, "auxiliary_loss_mlp": 0.01263419, "balance_loss_clip": 0.06274028, "balance_loss_mlp": 0.012539, "epoch": 0.966661656395611, "flos": 27133335682560.0, "grad_norm": 1.9373748726585052, "language_loss": 0.60036784, "learning_rate": 1.1627002417845533e-08, "loss": 0.67711985, "num_input_tokens_seen": 346920135, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09521484, "step": 16078, "time_per_iteration": 2.568155527114868 }, { "auxiliary_loss_clip": 0.06418507, "auxiliary_loss_mlp": 0.01264562, "balance_loss_clip": 0.06276146, "balance_loss_mlp": 0.01254483, "epoch": 0.966721779648279, "flos": 21514856659200.0, "grad_norm": 1.6899951665802573, "language_loss": 0.72048342, "learning_rate": 1.158510609718899e-08, "loss": 0.79731405, "num_input_tokens_seen": 346940450, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.10083008, "step": 16079, "time_per_iteration": 2.5375077724456787 }, { "auxiliary_loss_clip": 0.06406407, "auxiliary_loss_mlp": 0.01265543, "balance_loss_clip": 0.06273361, "balance_loss_mlp": 0.01256322, "epoch": 0.9667819029009469, "flos": 23884859468160.0, "grad_norm": 1.5147720986899036, "language_loss": 0.72669709, "learning_rate": 1.1543285177424644e-08, "loss": 0.80341655, "num_input_tokens_seen": 346960935, "router_z_loss_clip": 1.33007812, "router_z_loss_mlp": 0.09216309, "step": 16080, "time_per_iteration": 2.543322801589966 }, { "auxiliary_loss_clip": 0.06411502, "auxiliary_loss_mlp": 0.01263627, "balance_loss_clip": 0.06274991, "balance_loss_mlp": 0.01254657, "epoch": 0.9668420261536149, "flos": 21513682702080.0, "grad_norm": 1.8869279954228546, "language_loss": 0.73776251, "learning_rate": 1.1501539660138115e-08, "loss": 0.8145138, "num_input_tokens_seen": 346980100, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.08966064, "step": 16081, "time_per_iteration": 2.524021863937378 }, { "auxiliary_loss_clip": 0.06409755, "auxiliary_loss_mlp": 0.01264728, "balance_loss_clip": 0.06271009, "balance_loss_mlp": 0.01255001, "epoch": 0.9669021494062828, "flos": 26694434396160.0, "grad_norm": 1.7578284998835074, "language_loss": 0.67699707, "learning_rate": 1.145986954691236e-08, "loss": 0.75374192, "num_input_tokens_seen": 347001250, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09735107, "step": 16082, "time_per_iteration": 2.5871963500976562 }, { "auxiliary_loss_clip": 0.06414525, "auxiliary_loss_mlp": 0.01267366, "balance_loss_clip": 0.06276969, "balance_loss_mlp": 0.01257639, "epoch": 0.9669622726589508, "flos": 29832724091520.0, "grad_norm": 1.467099225045995, "language_loss": 0.77039438, "learning_rate": 1.141827483932789e-08, "loss": 0.84721327, "num_input_tokens_seen": 347022975, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.097229, "step": 16083, "time_per_iteration": 2.616544723510742 }, { "auxiliary_loss_clip": 0.0641415, "auxiliary_loss_mlp": 0.01265089, "balance_loss_clip": 0.06274451, "balance_loss_mlp": 0.01256286, "epoch": 0.9670223959116189, "flos": 22927911114240.0, "grad_norm": 1.9550359965694508, "language_loss": 0.79863578, "learning_rate": 1.1376755538961669e-08, "loss": 0.8754282, "num_input_tokens_seen": 347038780, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.08807373, "step": 16084, "time_per_iteration": 2.53887939453125 }, { "auxiliary_loss_clip": 0.06417201, "auxiliary_loss_mlp": 0.01264448, "balance_loss_clip": 0.06275714, "balance_loss_mlp": 0.01254238, "epoch": 0.9670825191642868, "flos": 18630412508160.0, "grad_norm": 2.1207824365174406, "language_loss": 0.68069035, "learning_rate": 1.1335311647387991e-08, "loss": 0.75750685, "num_input_tokens_seen": 347056705, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.10205078, "step": 16085, "time_per_iteration": 2.518605947494507 }, { "auxiliary_loss_clip": 0.06420721, "auxiliary_loss_mlp": 0.01264957, "balance_loss_clip": 0.06276771, "balance_loss_mlp": 0.01254383, "epoch": 0.9671426424169548, "flos": 24504707646720.0, "grad_norm": 1.8655430474233259, "language_loss": 0.68748677, "learning_rate": 1.1293943166178709e-08, "loss": 0.7643435, "num_input_tokens_seen": 347075710, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10571289, "step": 16086, "time_per_iteration": 2.581663131713867 }, { "auxiliary_loss_clip": 0.06414476, "auxiliary_loss_mlp": 0.01265077, "balance_loss_clip": 0.06276582, "balance_loss_mlp": 0.01255331, "epoch": 0.9672027656696227, "flos": 20376625996800.0, "grad_norm": 1.44358513086806, "language_loss": 0.78677344, "learning_rate": 1.125265009690235e-08, "loss": 0.86356896, "num_input_tokens_seen": 347092325, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09741211, "step": 16087, "time_per_iteration": 2.6069400310516357 }, { "auxiliary_loss_clip": 0.06412496, "auxiliary_loss_mlp": 0.01261855, "balance_loss_clip": 0.062757, "balance_loss_mlp": 0.01253058, "epoch": 0.9672628889222907, "flos": 18886186696320.0, "grad_norm": 1.830502412574644, "language_loss": 0.7160542, "learning_rate": 1.1211432441124769e-08, "loss": 0.79279768, "num_input_tokens_seen": 347110595, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.0880127, "step": 16088, "time_per_iteration": 2.510802745819092 }, { "auxiliary_loss_clip": 0.06409287, "auxiliary_loss_mlp": 0.01262884, "balance_loss_clip": 0.06273998, "balance_loss_mlp": 0.01253795, "epoch": 0.9673230121749586, "flos": 28702962691200.0, "grad_norm": 1.405702268101301, "language_loss": 0.70867586, "learning_rate": 1.117029020040916e-08, "loss": 0.78539759, "num_input_tokens_seen": 347131625, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.09088135, "step": 16089, "time_per_iteration": 2.582651376724243 }, { "auxiliary_loss_clip": 0.0641515, "auxiliary_loss_mlp": 0.01264651, "balance_loss_clip": 0.06275527, "balance_loss_mlp": 0.0125487, "epoch": 0.9673831354276267, "flos": 20490544022400.0, "grad_norm": 2.0513235877654448, "language_loss": 0.74926865, "learning_rate": 1.1129223376315167e-08, "loss": 0.82606673, "num_input_tokens_seen": 347147910, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09783936, "step": 16090, "time_per_iteration": 2.6089718341827393 }, { "auxiliary_loss_clip": 0.06421092, "auxiliary_loss_mlp": 0.01263646, "balance_loss_clip": 0.06274807, "balance_loss_mlp": 0.01254187, "epoch": 0.9674432586802946, "flos": 26804872477440.0, "grad_norm": 1.6703459201100204, "language_loss": 0.68748206, "learning_rate": 1.1088231970400653e-08, "loss": 0.76432943, "num_input_tokens_seen": 347168805, "router_z_loss_clip": 1.46289062, "router_z_loss_mlp": 0.09466553, "step": 16091, "time_per_iteration": 2.6134073734283447 }, { "auxiliary_loss_clip": 0.06410336, "auxiliary_loss_mlp": 0.01264743, "balance_loss_clip": 0.06274805, "balance_loss_mlp": 0.01255189, "epoch": 0.9675033819329626, "flos": 22317706154880.0, "grad_norm": 1.6481401264319253, "language_loss": 0.77091706, "learning_rate": 1.1047315984219484e-08, "loss": 0.84766793, "num_input_tokens_seen": 347189455, "router_z_loss_clip": 1.35644531, "router_z_loss_mlp": 0.09558105, "step": 16092, "time_per_iteration": 2.555403709411621 }, { "auxiliary_loss_clip": 0.06411824, "auxiliary_loss_mlp": 0.01264917, "balance_loss_clip": 0.06274937, "balance_loss_mlp": 0.01256054, "epoch": 0.9675635051856305, "flos": 12680367678720.0, "grad_norm": 1.7896080330111548, "language_loss": 0.76881105, "learning_rate": 1.1006475419323313e-08, "loss": 0.84557849, "num_input_tokens_seen": 347206030, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.08868408, "step": 16093, "time_per_iteration": 2.4958655834198 }, { "auxiliary_loss_clip": 0.06415014, "auxiliary_loss_mlp": 0.01266312, "balance_loss_clip": 0.0627647, "balance_loss_mlp": 0.01256132, "epoch": 0.9676236284382985, "flos": 24615439217280.0, "grad_norm": 1.5947063991716393, "language_loss": 0.69176489, "learning_rate": 1.096571027726112e-08, "loss": 0.76857817, "num_input_tokens_seen": 347226250, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.10180664, "step": 16094, "time_per_iteration": 2.5761632919311523 }, { "auxiliary_loss_clip": 0.06418519, "auxiliary_loss_mlp": 0.01263419, "balance_loss_clip": 0.06274685, "balance_loss_mlp": 0.01254151, "epoch": 0.9676837516909664, "flos": 23373772289280.0, "grad_norm": 1.769982344706132, "language_loss": 0.76123929, "learning_rate": 1.0925020559578557e-08, "loss": 0.83805865, "num_input_tokens_seen": 347247350, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.09265137, "step": 16095, "time_per_iteration": 3.999784231185913 }, { "auxiliary_loss_clip": 0.06420524, "auxiliary_loss_mlp": 0.01263896, "balance_loss_clip": 0.06276897, "balance_loss_mlp": 0.01254103, "epoch": 0.9677438749436345, "flos": 20493395061120.0, "grad_norm": 1.8558371763306878, "language_loss": 0.70511222, "learning_rate": 1.0884406267818392e-08, "loss": 0.78195643, "num_input_tokens_seen": 347266870, "router_z_loss_clip": 1.43652344, "router_z_loss_mlp": 0.09783936, "step": 16096, "time_per_iteration": 2.542170286178589 }, { "auxiliary_loss_clip": 0.06419419, "auxiliary_loss_mlp": 0.01263652, "balance_loss_clip": 0.06278738, "balance_loss_mlp": 0.01254026, "epoch": 0.9678039981963025, "flos": 47566341077760.0, "grad_norm": 1.7432709750943822, "language_loss": 0.71672344, "learning_rate": 1.0843867403520946e-08, "loss": 0.79355419, "num_input_tokens_seen": 347290120, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09631348, "step": 16097, "time_per_iteration": 2.7637088298797607 }, { "auxiliary_loss_clip": 0.0641112, "auxiliary_loss_mlp": 0.01265851, "balance_loss_clip": 0.06275229, "balance_loss_mlp": 0.01256678, "epoch": 0.9678641214489704, "flos": 25046542074240.0, "grad_norm": 1.5486731925671364, "language_loss": 0.78693295, "learning_rate": 1.0803403968223434e-08, "loss": 0.86370265, "num_input_tokens_seen": 347308785, "router_z_loss_clip": 1.36035156, "router_z_loss_mlp": 0.09173584, "step": 16098, "time_per_iteration": 2.589855909347534 }, { "auxiliary_loss_clip": 0.06410343, "auxiliary_loss_mlp": 0.01263528, "balance_loss_clip": 0.06274527, "balance_loss_mlp": 0.01254826, "epoch": 0.9679242447016384, "flos": 19246319544960.0, "grad_norm": 1.660904848008809, "language_loss": 0.9101361, "learning_rate": 1.0763015963459965e-08, "loss": 0.98687476, "num_input_tokens_seen": 347326375, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.08691406, "step": 16099, "time_per_iteration": 4.010148048400879 }, { "auxiliary_loss_clip": 0.06417812, "auxiliary_loss_mlp": 0.01267519, "balance_loss_clip": 0.06273927, "balance_loss_mlp": 0.0125747, "epoch": 0.9679843679543063, "flos": 33262943811840.0, "grad_norm": 1.5778808166934475, "language_loss": 0.66289222, "learning_rate": 1.0722703390762643e-08, "loss": 0.7397455, "num_input_tokens_seen": 347348250, "router_z_loss_clip": 1.43847656, "router_z_loss_mlp": 0.10064697, "step": 16100, "time_per_iteration": 2.633054494857788 }, { "auxiliary_loss_clip": 0.06415407, "auxiliary_loss_mlp": 0.01263116, "balance_loss_clip": 0.06277208, "balance_loss_mlp": 0.01253747, "epoch": 0.9680444912069743, "flos": 22790205728640.0, "grad_norm": 1.6693851447931045, "language_loss": 0.73520112, "learning_rate": 1.0682466251659584e-08, "loss": 0.81198633, "num_input_tokens_seen": 347367400, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09375, "step": 16101, "time_per_iteration": 2.5430805683135986 }, { "auxiliary_loss_clip": 0.06412709, "auxiliary_loss_mlp": 0.01263296, "balance_loss_clip": 0.06274911, "balance_loss_mlp": 0.01253742, "epoch": 0.9681046144596422, "flos": 24030866407680.0, "grad_norm": 1.6173441397873862, "language_loss": 0.73515803, "learning_rate": 1.0642304547676672e-08, "loss": 0.81191802, "num_input_tokens_seen": 347387600, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.09552002, "step": 16102, "time_per_iteration": 2.5377612113952637 }, { "auxiliary_loss_clip": 0.06414621, "auxiliary_loss_mlp": 0.01266509, "balance_loss_clip": 0.06275509, "balance_loss_mlp": 0.01255833, "epoch": 0.9681647377123103, "flos": 23447802971520.0, "grad_norm": 1.7110716215944777, "language_loss": 0.77701318, "learning_rate": 1.0602218280337139e-08, "loss": 0.85382438, "num_input_tokens_seen": 347406915, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.10681152, "step": 16103, "time_per_iteration": 2.5464015007019043 }, { "auxiliary_loss_clip": 0.06414741, "auxiliary_loss_mlp": 0.0126275, "balance_loss_clip": 0.06276934, "balance_loss_mlp": 0.01254391, "epoch": 0.9682248609649782, "flos": 22681780145280.0, "grad_norm": 3.882342867446835, "language_loss": 0.80865365, "learning_rate": 1.0562207451160655e-08, "loss": 0.88542855, "num_input_tokens_seen": 347425140, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.08358765, "step": 16104, "time_per_iteration": 2.5437138080596924 }, { "auxiliary_loss_clip": 0.06411067, "auxiliary_loss_mlp": 0.01263588, "balance_loss_clip": 0.06274861, "balance_loss_mlp": 0.01254767, "epoch": 0.9682849842176462, "flos": 24435750136320.0, "grad_norm": 1.391606164529095, "language_loss": 0.77795374, "learning_rate": 1.0522272061664672e-08, "loss": 0.85470039, "num_input_tokens_seen": 347446350, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.08813477, "step": 16105, "time_per_iteration": 2.5829920768737793 }, { "auxiliary_loss_clip": 0.06316805, "auxiliary_loss_mlp": 0.0125125, "balance_loss_clip": 0.06261708, "balance_loss_mlp": 0.01250123, "epoch": 0.9683451074703141, "flos": 60013365534720.0, "grad_norm": 0.7984023574537356, "language_loss": 0.56651044, "learning_rate": 1.0482412113363536e-08, "loss": 0.64219099, "num_input_tokens_seen": 347510135, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.01128387, "step": 16106, "time_per_iteration": 3.205489158630371 }, { "auxiliary_loss_clip": 0.06314558, "auxiliary_loss_mlp": 0.01251553, "balance_loss_clip": 0.06259466, "balance_loss_mlp": 0.01250515, "epoch": 0.9684052307229821, "flos": 52712850850560.0, "grad_norm": 0.8657575338821065, "language_loss": 0.61468333, "learning_rate": 1.0442627607768707e-08, "loss": 0.69034445, "num_input_tokens_seen": 347562505, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.01038361, "step": 16107, "time_per_iteration": 3.0409460067749023 }, { "auxiliary_loss_clip": 0.06414768, "auxiliary_loss_mlp": 0.01265391, "balance_loss_clip": 0.0627611, "balance_loss_mlp": 0.01255533, "epoch": 0.96846535397565, "flos": 22790457290880.0, "grad_norm": 2.353295662903774, "language_loss": 0.74141467, "learning_rate": 1.040291854638875e-08, "loss": 0.81821626, "num_input_tokens_seen": 347579150, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09863281, "step": 16108, "time_per_iteration": 2.5496068000793457 }, { "auxiliary_loss_clip": 0.06417157, "auxiliary_loss_mlp": 0.01261968, "balance_loss_clip": 0.06276621, "balance_loss_mlp": 0.01252574, "epoch": 0.968525477228318, "flos": 23329482606720.0, "grad_norm": 2.231614120836551, "language_loss": 0.56771123, "learning_rate": 1.0363284930729576e-08, "loss": 0.64450252, "num_input_tokens_seen": 347596705, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09399414, "step": 16109, "time_per_iteration": 2.53391432762146 }, { "auxiliary_loss_clip": 0.06312944, "auxiliary_loss_mlp": 0.01252127, "balance_loss_clip": 0.06257804, "balance_loss_mlp": 0.01251116, "epoch": 0.9685856004809861, "flos": 67903651981440.0, "grad_norm": 0.6506214863078694, "language_loss": 0.54188275, "learning_rate": 1.0323726762294205e-08, "loss": 0.61753345, "num_input_tokens_seen": 347661870, "router_z_loss_clip": 0.55224609, "router_z_loss_mlp": 0.01011658, "step": 16110, "time_per_iteration": 4.574189901351929 }, { "auxiliary_loss_clip": 0.06418983, "auxiliary_loss_mlp": 0.01264269, "balance_loss_clip": 0.06276425, "balance_loss_mlp": 0.01254446, "epoch": 0.968645723733654, "flos": 33956277621120.0, "grad_norm": 1.3936417951164006, "language_loss": 0.62604284, "learning_rate": 1.0284244042582325e-08, "loss": 0.70287538, "num_input_tokens_seen": 347684295, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.0982666, "step": 16111, "time_per_iteration": 2.688758134841919 }, { "auxiliary_loss_clip": 0.06413704, "auxiliary_loss_mlp": 0.01262414, "balance_loss_clip": 0.06276992, "balance_loss_mlp": 0.0125348, "epoch": 0.968705846986322, "flos": 18557388074880.0, "grad_norm": 2.429982570727651, "language_loss": 0.74121499, "learning_rate": 1.024483677309118e-08, "loss": 0.81797618, "num_input_tokens_seen": 347702585, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.08935547, "step": 16112, "time_per_iteration": 2.610057830810547 }, { "auxiliary_loss_clip": 0.06409698, "auxiliary_loss_mlp": 0.01264894, "balance_loss_clip": 0.06274804, "balance_loss_mlp": 0.01256382, "epoch": 0.9687659702389899, "flos": 17426704279680.0, "grad_norm": 2.344939442138717, "language_loss": 0.66908014, "learning_rate": 1.020550495531558e-08, "loss": 0.74582601, "num_input_tokens_seen": 347721810, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.08514404, "step": 16113, "time_per_iteration": 2.5401906967163086 }, { "auxiliary_loss_clip": 0.06317814, "auxiliary_loss_mlp": 0.01251631, "balance_loss_clip": 0.06262671, "balance_loss_mlp": 0.01250593, "epoch": 0.9688260934916579, "flos": 62067231688320.0, "grad_norm": 0.6830382168915499, "language_loss": 0.5646168, "learning_rate": 1.0166248590746329e-08, "loss": 0.64031124, "num_input_tokens_seen": 347782330, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.01039124, "step": 16114, "time_per_iteration": 4.584575414657593 }, { "auxiliary_loss_clip": 0.06414622, "auxiliary_loss_mlp": 0.01268198, "balance_loss_clip": 0.0627621, "balance_loss_mlp": 0.01258322, "epoch": 0.9688862167443258, "flos": 15080363049600.0, "grad_norm": 2.025176558067491, "language_loss": 0.82416749, "learning_rate": 1.0127067680872458e-08, "loss": 0.90099573, "num_input_tokens_seen": 347794835, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09887695, "step": 16115, "time_per_iteration": 2.5268051624298096 }, { "auxiliary_loss_clip": 0.06408772, "auxiliary_loss_mlp": 0.01261878, "balance_loss_clip": 0.06277055, "balance_loss_mlp": 0.01253504, "epoch": 0.9689463399969939, "flos": 19944391109760.0, "grad_norm": 1.5332179161477633, "language_loss": 0.72230113, "learning_rate": 1.0087962227179448e-08, "loss": 0.79900765, "num_input_tokens_seen": 347814320, "router_z_loss_clip": 1.31640625, "router_z_loss_mlp": 0.08374023, "step": 16116, "time_per_iteration": 2.565117835998535 }, { "auxiliary_loss_clip": 0.06418776, "auxiliary_loss_mlp": 0.01261759, "balance_loss_clip": 0.06277981, "balance_loss_mlp": 0.0125249, "epoch": 0.9690064632496618, "flos": 19579101235200.0, "grad_norm": 2.0059656808893527, "language_loss": 0.7576319, "learning_rate": 1.0048932231150553e-08, "loss": 0.83443725, "num_input_tokens_seen": 347832125, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09265137, "step": 16117, "time_per_iteration": 2.505464553833008 }, { "auxiliary_loss_clip": 0.06418559, "auxiliary_loss_mlp": 0.01263002, "balance_loss_clip": 0.06277627, "balance_loss_mlp": 0.01253042, "epoch": 0.9690665865023298, "flos": 21878846795520.0, "grad_norm": 2.3184354142017534, "language_loss": 0.7757659, "learning_rate": 1.000997769426548e-08, "loss": 0.8525815, "num_input_tokens_seen": 347850765, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09960938, "step": 16118, "time_per_iteration": 2.542370319366455 }, { "auxiliary_loss_clip": 0.06414767, "auxiliary_loss_mlp": 0.01266694, "balance_loss_clip": 0.06274578, "balance_loss_mlp": 0.01256942, "epoch": 0.9691267097549977, "flos": 21000541098240.0, "grad_norm": 1.6963214470987995, "language_loss": 0.78398919, "learning_rate": 9.971098618001272e-09, "loss": 0.86080378, "num_input_tokens_seen": 347870125, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09747314, "step": 16119, "time_per_iteration": 2.5263562202453613 }, { "auxiliary_loss_clip": 0.06408501, "auxiliary_loss_mlp": 0.0126317, "balance_loss_clip": 0.06275963, "balance_loss_mlp": 0.01254641, "epoch": 0.9691868330076657, "flos": 24285885909120.0, "grad_norm": 1.4184246427347929, "language_loss": 0.75934672, "learning_rate": 9.932295003832747e-09, "loss": 0.83606339, "num_input_tokens_seen": 347890615, "router_z_loss_clip": 1.32421875, "router_z_loss_mlp": 0.08532715, "step": 16120, "time_per_iteration": 2.575281858444214 }, { "auxiliary_loss_clip": 0.06410585, "auxiliary_loss_mlp": 0.01263294, "balance_loss_clip": 0.06272854, "balance_loss_mlp": 0.01254103, "epoch": 0.9692469562603336, "flos": 17681430291840.0, "grad_norm": 1.925740411565378, "language_loss": 0.70432156, "learning_rate": 9.89356685323095e-09, "loss": 0.78106034, "num_input_tokens_seen": 347908685, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09191895, "step": 16121, "time_per_iteration": 2.493919849395752 }, { "auxiliary_loss_clip": 0.0641022, "auxiliary_loss_mlp": 0.0126207, "balance_loss_clip": 0.06273013, "balance_loss_mlp": 0.01252354, "epoch": 0.9693070795130017, "flos": 26841783000960.0, "grad_norm": 1.6581128346583813, "language_loss": 0.69451332, "learning_rate": 9.854914167664486e-09, "loss": 0.77123624, "num_input_tokens_seen": 347926385, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09710693, "step": 16122, "time_per_iteration": 2.571794033050537 }, { "auxiliary_loss_clip": 0.0641599, "auxiliary_loss_mlp": 0.01260528, "balance_loss_clip": 0.0627649, "balance_loss_mlp": 0.01251951, "epoch": 0.9693672027656697, "flos": 18083127565440.0, "grad_norm": 1.8863489558248165, "language_loss": 0.75984621, "learning_rate": 9.81633694859907e-09, "loss": 0.83661145, "num_input_tokens_seen": 347945290, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.08581543, "step": 16123, "time_per_iteration": 2.5039618015289307 }, { "auxiliary_loss_clip": 0.06415638, "auxiliary_loss_mlp": 0.01264633, "balance_loss_clip": 0.06274346, "balance_loss_mlp": 0.01255472, "epoch": 0.9694273260183376, "flos": 21769582671360.0, "grad_norm": 1.4541389791020152, "language_loss": 0.74444509, "learning_rate": 9.777835197497753e-09, "loss": 0.82124782, "num_input_tokens_seen": 347966330, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.0916748, "step": 16124, "time_per_iteration": 2.5749690532684326 }, { "auxiliary_loss_clip": 0.06414723, "auxiliary_loss_mlp": 0.01265716, "balance_loss_clip": 0.06274687, "balance_loss_mlp": 0.01256608, "epoch": 0.9694874492710056, "flos": 24433066805760.0, "grad_norm": 2.111023882594631, "language_loss": 0.74437308, "learning_rate": 9.739408915820258e-09, "loss": 0.82117748, "num_input_tokens_seen": 347982590, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09100342, "step": 16125, "time_per_iteration": 2.599246025085449 }, { "auxiliary_loss_clip": 0.06315272, "auxiliary_loss_mlp": 0.01251449, "balance_loss_clip": 0.06260216, "balance_loss_mlp": 0.01250415, "epoch": 0.9695475725236735, "flos": 67669191457920.0, "grad_norm": 0.8527356030245078, "language_loss": 0.61560214, "learning_rate": 9.70105810502364e-09, "loss": 0.6912694, "num_input_tokens_seen": 348043310, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.01034546, "step": 16126, "time_per_iteration": 3.1324946880340576 }, { "auxiliary_loss_clip": 0.06409452, "auxiliary_loss_mlp": 0.01265045, "balance_loss_clip": 0.0627487, "balance_loss_mlp": 0.01255538, "epoch": 0.9696076957763415, "flos": 19134330163200.0, "grad_norm": 1.6056495134823865, "language_loss": 0.74989444, "learning_rate": 9.662782766562738e-09, "loss": 0.82663941, "num_input_tokens_seen": 348062200, "router_z_loss_clip": 1.34570312, "router_z_loss_mlp": 0.09503174, "step": 16127, "time_per_iteration": 2.6357972621917725 }, { "auxiliary_loss_clip": 0.06419317, "auxiliary_loss_mlp": 0.01264901, "balance_loss_clip": 0.0627723, "balance_loss_mlp": 0.01254554, "epoch": 0.9696678190290094, "flos": 15492248593920.0, "grad_norm": 1.7398553261397725, "language_loss": 0.69699448, "learning_rate": 9.62458290188839e-09, "loss": 0.77383673, "num_input_tokens_seen": 348080685, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.10339355, "step": 16128, "time_per_iteration": 2.5254223346710205 }, { "auxiliary_loss_clip": 0.06412844, "auxiliary_loss_mlp": 0.01267492, "balance_loss_clip": 0.0627512, "balance_loss_mlp": 0.01258152, "epoch": 0.9697279422816775, "flos": 36217225941120.0, "grad_norm": 1.6082764007868264, "language_loss": 0.65307623, "learning_rate": 9.586458512449213e-09, "loss": 0.72987956, "num_input_tokens_seen": 348102500, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09344482, "step": 16129, "time_per_iteration": 2.674312114715576 }, { "auxiliary_loss_clip": 0.06419669, "auxiliary_loss_mlp": 0.01264328, "balance_loss_clip": 0.06274931, "balance_loss_mlp": 0.01254297, "epoch": 0.9697880655343454, "flos": 25491103511040.0, "grad_norm": 1.8437851420471694, "language_loss": 0.64139521, "learning_rate": 9.548409599691166e-09, "loss": 0.71823514, "num_input_tokens_seen": 348122515, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.10028076, "step": 16130, "time_per_iteration": 2.5723228454589844 }, { "auxiliary_loss_clip": 0.06418763, "auxiliary_loss_mlp": 0.01266435, "balance_loss_clip": 0.06275105, "balance_loss_mlp": 0.01256433, "epoch": 0.9698481887870134, "flos": 15337688538240.0, "grad_norm": 2.220304831212386, "language_loss": 0.70413685, "learning_rate": 9.510436165056867e-09, "loss": 0.78098881, "num_input_tokens_seen": 348138775, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.09997559, "step": 16131, "time_per_iteration": 2.676119327545166 }, { "auxiliary_loss_clip": 0.06417838, "auxiliary_loss_mlp": 0.01266897, "balance_loss_clip": 0.06276669, "balance_loss_mlp": 0.01256657, "epoch": 0.9699083120396813, "flos": 21988907533440.0, "grad_norm": 1.8475259493044234, "language_loss": 0.76988, "learning_rate": 9.472538209986058e-09, "loss": 0.84672737, "num_input_tokens_seen": 348157115, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.10247803, "step": 16132, "time_per_iteration": 2.5501739978790283 }, { "auxiliary_loss_clip": 0.06411688, "auxiliary_loss_mlp": 0.01265844, "balance_loss_clip": 0.06271793, "balance_loss_mlp": 0.0125539, "epoch": 0.9699684352923493, "flos": 15668625438720.0, "grad_norm": 2.368080911902331, "language_loss": 0.78731704, "learning_rate": 9.434715735916477e-09, "loss": 0.86409235, "num_input_tokens_seen": 348173035, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.10455322, "step": 16133, "time_per_iteration": 2.495290994644165 }, { "auxiliary_loss_clip": 0.06409085, "auxiliary_loss_mlp": 0.01265859, "balance_loss_clip": 0.062738, "balance_loss_mlp": 0.01256746, "epoch": 0.9700285585450172, "flos": 21914876851200.0, "grad_norm": 5.265533749106058, "language_loss": 0.64632374, "learning_rate": 9.396968744281863e-09, "loss": 0.72307312, "num_input_tokens_seen": 348192960, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.09124756, "step": 16134, "time_per_iteration": 4.031613111495972 }, { "auxiliary_loss_clip": 0.06415101, "auxiliary_loss_mlp": 0.01262161, "balance_loss_clip": 0.0627601, "balance_loss_mlp": 0.0125291, "epoch": 0.9700886817976853, "flos": 23921686137600.0, "grad_norm": 2.120243575170399, "language_loss": 0.81284899, "learning_rate": 9.359297236513519e-09, "loss": 0.88962162, "num_input_tokens_seen": 348212805, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.0925293, "step": 16135, "time_per_iteration": 2.537407636642456 }, { "auxiliary_loss_clip": 0.06419604, "auxiliary_loss_mlp": 0.01264779, "balance_loss_clip": 0.06277905, "balance_loss_mlp": 0.01254366, "epoch": 0.9701488050503532, "flos": 25454989601280.0, "grad_norm": 1.6185986763484264, "language_loss": 0.73759043, "learning_rate": 9.321701214040079e-09, "loss": 0.81443423, "num_input_tokens_seen": 348232900, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.10412598, "step": 16136, "time_per_iteration": 2.565288543701172 }, { "auxiliary_loss_clip": 0.06408452, "auxiliary_loss_mlp": 0.01265271, "balance_loss_clip": 0.06271198, "balance_loss_mlp": 0.01256294, "epoch": 0.9702089283030212, "flos": 20596453983360.0, "grad_norm": 1.5321707746520103, "language_loss": 0.76387942, "learning_rate": 9.28418067828729e-09, "loss": 0.8406167, "num_input_tokens_seen": 348253065, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.08978271, "step": 16137, "time_per_iteration": 2.6572306156158447 }, { "auxiliary_loss_clip": 0.06319299, "auxiliary_loss_mlp": 0.01252597, "balance_loss_clip": 0.06264216, "balance_loss_mlp": 0.01251649, "epoch": 0.9702690515556892, "flos": 70671955973760.0, "grad_norm": 0.7338334498125911, "language_loss": 0.5490275, "learning_rate": 9.246735630678015e-09, "loss": 0.62474644, "num_input_tokens_seen": 348316075, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.00946045, "step": 16138, "time_per_iteration": 3.3109514713287354 }, { "auxiliary_loss_clip": 0.06413481, "auxiliary_loss_mlp": 0.01264333, "balance_loss_clip": 0.06273051, "balance_loss_mlp": 0.01255142, "epoch": 0.9703291748083571, "flos": 35890104401280.0, "grad_norm": 1.8785346113538919, "language_loss": 0.70980597, "learning_rate": 9.209366072632007e-09, "loss": 0.78658402, "num_input_tokens_seen": 348337605, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.09191895, "step": 16139, "time_per_iteration": 4.088456630706787 }, { "auxiliary_loss_clip": 0.0641802, "auxiliary_loss_mlp": 0.01264798, "balance_loss_clip": 0.06276951, "balance_loss_mlp": 0.01255017, "epoch": 0.9703892980610251, "flos": 24323383411200.0, "grad_norm": 1.418218738166712, "language_loss": 0.72210735, "learning_rate": 9.172072005566134e-09, "loss": 0.79893553, "num_input_tokens_seen": 348359430, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09783936, "step": 16140, "time_per_iteration": 2.5786256790161133 }, { "auxiliary_loss_clip": 0.06418725, "auxiliary_loss_mlp": 0.01268159, "balance_loss_clip": 0.06277405, "balance_loss_mlp": 0.01257824, "epoch": 0.970449421313693, "flos": 18009474226560.0, "grad_norm": 2.4486624481323656, "language_loss": 0.68204534, "learning_rate": 9.13485343089504e-09, "loss": 0.75891423, "num_input_tokens_seen": 348377890, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.10333252, "step": 16141, "time_per_iteration": 2.5136454105377197 }, { "auxiliary_loss_clip": 0.06411448, "auxiliary_loss_mlp": 0.01263901, "balance_loss_clip": 0.06276917, "balance_loss_mlp": 0.01254746, "epoch": 0.9705095445663611, "flos": 25345054644480.0, "grad_norm": 1.8273819175119927, "language_loss": 0.6785382, "learning_rate": 9.097710350029597e-09, "loss": 0.7552917, "num_input_tokens_seen": 348396550, "router_z_loss_clip": 1.34472656, "router_z_loss_mlp": 0.0914917, "step": 16142, "time_per_iteration": 2.578821897506714 }, { "auxiliary_loss_clip": 0.06412995, "auxiliary_loss_mlp": 0.0126241, "balance_loss_clip": 0.06273142, "balance_loss_mlp": 0.01253231, "epoch": 0.970569667819029, "flos": 26840860606080.0, "grad_norm": 1.9581925490543393, "language_loss": 0.55569184, "learning_rate": 9.060642764378457e-09, "loss": 0.63244593, "num_input_tokens_seen": 348417120, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09179688, "step": 16143, "time_per_iteration": 2.5623364448547363 }, { "auxiliary_loss_clip": 0.06417464, "auxiliary_loss_mlp": 0.01266403, "balance_loss_clip": 0.06276587, "balance_loss_mlp": 0.01257677, "epoch": 0.970629791071697, "flos": 25855764480000.0, "grad_norm": 2.7918810824544016, "language_loss": 0.67211378, "learning_rate": 9.023650675347382e-09, "loss": 0.74895245, "num_input_tokens_seen": 348437750, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.08728027, "step": 16144, "time_per_iteration": 2.5671443939208984 }, { "auxiliary_loss_clip": 0.06408797, "auxiliary_loss_mlp": 0.01264935, "balance_loss_clip": 0.06273548, "balance_loss_mlp": 0.0125556, "epoch": 0.9706899143243649, "flos": 36549294871680.0, "grad_norm": 1.8145314088996423, "language_loss": 0.72306514, "learning_rate": 8.986734084339253e-09, "loss": 0.79980254, "num_input_tokens_seen": 348460935, "router_z_loss_clip": 1.35253906, "router_z_loss_mlp": 0.09375, "step": 16145, "time_per_iteration": 2.666142225265503 }, { "auxiliary_loss_clip": 0.06417041, "auxiliary_loss_mlp": 0.01263481, "balance_loss_clip": 0.0627533, "balance_loss_mlp": 0.01253563, "epoch": 0.9707500375770329, "flos": 12271794370560.0, "grad_norm": 3.208923182997866, "language_loss": 0.80497533, "learning_rate": 8.949892992753395e-09, "loss": 0.88178056, "num_input_tokens_seen": 348474480, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09918213, "step": 16146, "time_per_iteration": 2.4902219772338867 }, { "auxiliary_loss_clip": 0.06318317, "auxiliary_loss_mlp": 0.01250495, "balance_loss_clip": 0.0626323, "balance_loss_mlp": 0.01249471, "epoch": 0.9708101608297008, "flos": 60874550271360.0, "grad_norm": 0.8092444102182238, "language_loss": 0.54590619, "learning_rate": 8.91312740198713e-09, "loss": 0.62159431, "num_input_tokens_seen": 348541220, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.01023865, "step": 16147, "time_per_iteration": 3.158968448638916 }, { "auxiliary_loss_clip": 0.06415626, "auxiliary_loss_mlp": 0.01266951, "balance_loss_clip": 0.0627266, "balance_loss_mlp": 0.01257343, "epoch": 0.9708702840823689, "flos": 27131952090240.0, "grad_norm": 3.421212134505539, "language_loss": 0.60691029, "learning_rate": 8.876437313434682e-09, "loss": 0.68373609, "num_input_tokens_seen": 348559230, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.09606934, "step": 16148, "time_per_iteration": 2.5571374893188477 }, { "auxiliary_loss_clip": 0.06414063, "auxiliary_loss_mlp": 0.01265222, "balance_loss_clip": 0.06277414, "balance_loss_mlp": 0.01255793, "epoch": 0.9709304073350368, "flos": 20784067274880.0, "grad_norm": 1.8412923397565208, "language_loss": 0.74098015, "learning_rate": 8.839822728487155e-09, "loss": 0.81777298, "num_input_tokens_seen": 348577850, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09429932, "step": 16149, "time_per_iteration": 2.534696340560913 }, { "auxiliary_loss_clip": 0.06415278, "auxiliary_loss_mlp": 0.01264733, "balance_loss_clip": 0.06277062, "balance_loss_mlp": 0.01255518, "epoch": 0.9709905305877048, "flos": 41943627423360.0, "grad_norm": 2.196635527902061, "language_loss": 0.75403082, "learning_rate": 8.803283648533222e-09, "loss": 0.83083093, "num_input_tokens_seen": 348598345, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09210205, "step": 16150, "time_per_iteration": 4.214098215103149 }, { "auxiliary_loss_clip": 0.06423248, "auxiliary_loss_mlp": 0.01269212, "balance_loss_clip": 0.06277446, "balance_loss_mlp": 0.01257619, "epoch": 0.9710506538403728, "flos": 17171349361920.0, "grad_norm": 2.177479803658136, "language_loss": 0.73763323, "learning_rate": 8.766820074958214e-09, "loss": 0.81455785, "num_input_tokens_seen": 348616300, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.1159668, "step": 16151, "time_per_iteration": 2.5362462997436523 }, { "auxiliary_loss_clip": 0.06405751, "auxiliary_loss_mlp": 0.01264777, "balance_loss_clip": 0.06271017, "balance_loss_mlp": 0.01255216, "epoch": 0.9711107770930407, "flos": 21178972368000.0, "grad_norm": 1.7133377384498851, "language_loss": 0.74820411, "learning_rate": 8.730432009145027e-09, "loss": 0.82490939, "num_input_tokens_seen": 348633845, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.09564209, "step": 16152, "time_per_iteration": 2.5346832275390625 }, { "auxiliary_loss_clip": 0.06413849, "auxiliary_loss_mlp": 0.01266197, "balance_loss_clip": 0.06276265, "balance_loss_mlp": 0.0125641, "epoch": 0.9711709003457087, "flos": 22243675472640.0, "grad_norm": 1.970300459052968, "language_loss": 0.67591816, "learning_rate": 8.694119452473448e-09, "loss": 0.75271863, "num_input_tokens_seen": 348653070, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09790039, "step": 16153, "time_per_iteration": 2.557600498199463 }, { "auxiliary_loss_clip": 0.06412496, "auxiliary_loss_mlp": 0.01269436, "balance_loss_clip": 0.06273071, "balance_loss_mlp": 0.01260412, "epoch": 0.9712310235983767, "flos": 26221096281600.0, "grad_norm": 1.6176488317827875, "language_loss": 0.70924956, "learning_rate": 8.65788240632037e-09, "loss": 0.7860688, "num_input_tokens_seen": 348672145, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09014893, "step": 16154, "time_per_iteration": 4.022677421569824 }, { "auxiliary_loss_clip": 0.0642066, "auxiliary_loss_mlp": 0.01266734, "balance_loss_clip": 0.06277274, "balance_loss_mlp": 0.01256477, "epoch": 0.9712911468510447, "flos": 20674509661440.0, "grad_norm": 1.7962421982346641, "language_loss": 0.80931914, "learning_rate": 8.621720872059812e-09, "loss": 0.88619304, "num_input_tokens_seen": 348690615, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.1026001, "step": 16155, "time_per_iteration": 2.5301601886749268 }, { "auxiliary_loss_clip": 0.06423482, "auxiliary_loss_mlp": 0.01267302, "balance_loss_clip": 0.06279378, "balance_loss_mlp": 0.01257139, "epoch": 0.9713512701037126, "flos": 13557960616320.0, "grad_norm": 1.8812472182938633, "language_loss": 0.67586529, "learning_rate": 8.58563485106334e-09, "loss": 0.75277317, "num_input_tokens_seen": 348708665, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.1015625, "step": 16156, "time_per_iteration": 2.53191876411438 }, { "auxiliary_loss_clip": 0.06417137, "auxiliary_loss_mlp": 0.01265415, "balance_loss_clip": 0.06274781, "balance_loss_mlp": 0.01256212, "epoch": 0.9714113933563806, "flos": 25855890261120.0, "grad_norm": 2.4684947833285054, "language_loss": 0.91155881, "learning_rate": 8.54962434469919e-09, "loss": 0.98838431, "num_input_tokens_seen": 348726105, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09197998, "step": 16157, "time_per_iteration": 2.5622408390045166 }, { "auxiliary_loss_clip": 0.06417418, "auxiliary_loss_mlp": 0.01263955, "balance_loss_clip": 0.06275921, "balance_loss_mlp": 0.01255491, "epoch": 0.9714715166090485, "flos": 12746809566720.0, "grad_norm": 1.7619475891495533, "language_loss": 0.72800827, "learning_rate": 8.513689354332721e-09, "loss": 0.80482197, "num_input_tokens_seen": 348743360, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.0847168, "step": 16158, "time_per_iteration": 2.6225147247314453 }, { "auxiliary_loss_clip": 0.0641191, "auxiliary_loss_mlp": 0.01264613, "balance_loss_clip": 0.06273982, "balance_loss_mlp": 0.01255803, "epoch": 0.9715316398617165, "flos": 18411423062400.0, "grad_norm": 2.123536517129609, "language_loss": 0.60372001, "learning_rate": 8.477829881326836e-09, "loss": 0.68048525, "num_input_tokens_seen": 348759045, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.08810425, "step": 16159, "time_per_iteration": 2.528317451477051 }, { "auxiliary_loss_clip": 0.06408381, "auxiliary_loss_mlp": 0.0126323, "balance_loss_clip": 0.06274284, "balance_loss_mlp": 0.01254665, "epoch": 0.9715917631143844, "flos": 28921490939520.0, "grad_norm": 1.5830370440152057, "language_loss": 0.7892828, "learning_rate": 8.44204592704112e-09, "loss": 0.86599892, "num_input_tokens_seen": 348779910, "router_z_loss_clip": 1.33886719, "router_z_loss_mlp": 0.08563232, "step": 16160, "time_per_iteration": 2.6548287868499756 }, { "auxiliary_loss_clip": 0.06310865, "auxiliary_loss_mlp": 0.012534, "balance_loss_clip": 0.06255719, "balance_loss_mlp": 0.01252316, "epoch": 0.9716518863670525, "flos": 65958504900480.0, "grad_norm": 0.7546825830584136, "language_loss": 0.54342067, "learning_rate": 8.406337492832704e-09, "loss": 0.61906332, "num_input_tokens_seen": 348838995, "router_z_loss_clip": 0.55419922, "router_z_loss_mlp": 0.01084137, "step": 16161, "time_per_iteration": 3.190291166305542 }, { "auxiliary_loss_clip": 0.06406531, "auxiliary_loss_mlp": 0.01263844, "balance_loss_clip": 0.06271791, "balance_loss_mlp": 0.01254576, "epoch": 0.9717120096197204, "flos": 17718592377600.0, "grad_norm": 1.9437797793102052, "language_loss": 0.72363222, "learning_rate": 8.3707045800554e-09, "loss": 0.800336, "num_input_tokens_seen": 348858090, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.09265137, "step": 16162, "time_per_iteration": 2.532317638397217 }, { "auxiliary_loss_clip": 0.06407736, "auxiliary_loss_mlp": 0.01265789, "balance_loss_clip": 0.06270962, "balance_loss_mlp": 0.01256086, "epoch": 0.9717721328723884, "flos": 24470522380800.0, "grad_norm": 1.6199561473733348, "language_loss": 0.79214257, "learning_rate": 8.335147190060787e-09, "loss": 0.86887777, "num_input_tokens_seen": 348877885, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09692383, "step": 16163, "time_per_iteration": 2.5717239379882812 }, { "auxiliary_loss_clip": 0.06412152, "auxiliary_loss_mlp": 0.01265171, "balance_loss_clip": 0.06275342, "balance_loss_mlp": 0.01256343, "epoch": 0.9718322561250564, "flos": 20782641755520.0, "grad_norm": 1.6505426479130052, "language_loss": 0.72922742, "learning_rate": 8.299665324196903e-09, "loss": 0.80600059, "num_input_tokens_seen": 348897720, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.08831787, "step": 16164, "time_per_iteration": 2.530381202697754 }, { "auxiliary_loss_clip": 0.06415282, "auxiliary_loss_mlp": 0.01264861, "balance_loss_clip": 0.06274885, "balance_loss_mlp": 0.01255044, "epoch": 0.9718923793777243, "flos": 19031900146560.0, "grad_norm": 1.961654867705526, "language_loss": 0.84431756, "learning_rate": 8.264258983809114e-09, "loss": 0.92111903, "num_input_tokens_seen": 348915410, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09820557, "step": 16165, "time_per_iteration": 2.508769989013672 }, { "auxiliary_loss_clip": 0.06413032, "auxiliary_loss_mlp": 0.01263316, "balance_loss_clip": 0.06274492, "balance_loss_mlp": 0.01254918, "epoch": 0.9719525026303923, "flos": 21878175962880.0, "grad_norm": 1.4949807962975856, "language_loss": 0.79747546, "learning_rate": 8.228928170240345e-09, "loss": 0.87423897, "num_input_tokens_seen": 348934335, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.08398438, "step": 16166, "time_per_iteration": 2.518616199493408 }, { "auxiliary_loss_clip": 0.06412307, "auxiliary_loss_mlp": 0.01265957, "balance_loss_clip": 0.062746, "balance_loss_mlp": 0.01256247, "epoch": 0.9720126258830603, "flos": 14434631159040.0, "grad_norm": 1.8408156581733965, "language_loss": 0.71123785, "learning_rate": 8.193672884830195e-09, "loss": 0.78802049, "num_input_tokens_seen": 348952405, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09710693, "step": 16167, "time_per_iteration": 2.505483627319336 }, { "auxiliary_loss_clip": 0.06413741, "auxiliary_loss_mlp": 0.01265672, "balance_loss_clip": 0.06276943, "balance_loss_mlp": 0.01256487, "epoch": 0.9720727491357283, "flos": 26258551856640.0, "grad_norm": 1.401573858176758, "language_loss": 0.75753343, "learning_rate": 8.158493128915812e-09, "loss": 0.83432758, "num_input_tokens_seen": 348973580, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09185791, "step": 16168, "time_per_iteration": 2.569406032562256 }, { "auxiliary_loss_clip": 0.0641738, "auxiliary_loss_mlp": 0.01262892, "balance_loss_clip": 0.0627701, "balance_loss_mlp": 0.01253474, "epoch": 0.9721328723883962, "flos": 22680648115200.0, "grad_norm": 1.9813374548782572, "language_loss": 0.7270124, "learning_rate": 8.123388903830797e-09, "loss": 0.80381513, "num_input_tokens_seen": 348992035, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09423828, "step": 16169, "time_per_iteration": 2.531007766723633 }, { "auxiliary_loss_clip": 0.06416541, "auxiliary_loss_mlp": 0.01265779, "balance_loss_clip": 0.06273372, "balance_loss_mlp": 0.01255968, "epoch": 0.9721929956410642, "flos": 28081647066240.0, "grad_norm": 1.7932228652960722, "language_loss": 0.57777035, "learning_rate": 8.088360210906309e-09, "loss": 0.65459359, "num_input_tokens_seen": 349013160, "router_z_loss_clip": 1.43457031, "router_z_loss_mlp": 0.0980835, "step": 16170, "time_per_iteration": 2.577887535095215 }, { "auxiliary_loss_clip": 0.06413145, "auxiliary_loss_mlp": 0.01266302, "balance_loss_clip": 0.06273651, "balance_loss_mlp": 0.01256109, "epoch": 0.9722531188937321, "flos": 21002595523200.0, "grad_norm": 4.960228946257162, "language_loss": 0.72092295, "learning_rate": 8.053407051471062e-09, "loss": 0.79771745, "num_input_tokens_seen": 349033485, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.10186768, "step": 16171, "time_per_iteration": 2.5518229007720947 }, { "auxiliary_loss_clip": 0.06415899, "auxiliary_loss_mlp": 0.01264991, "balance_loss_clip": 0.06275339, "balance_loss_mlp": 0.01255055, "epoch": 0.9723132421464001, "flos": 16076108643840.0, "grad_norm": 1.6507199981007716, "language_loss": 0.68454617, "learning_rate": 8.018529426850218e-09, "loss": 0.76135504, "num_input_tokens_seen": 349051705, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09936523, "step": 16172, "time_per_iteration": 2.505767345428467 }, { "auxiliary_loss_clip": 0.06411431, "auxiliary_loss_mlp": 0.01266351, "balance_loss_clip": 0.06275396, "balance_loss_mlp": 0.01257619, "epoch": 0.972373365399068, "flos": 27753183861120.0, "grad_norm": 1.7263146294878098, "language_loss": 0.86238599, "learning_rate": 7.983727338366274e-09, "loss": 0.9391638, "num_input_tokens_seen": 349070825, "router_z_loss_clip": 1.36035156, "router_z_loss_mlp": 0.08734131, "step": 16173, "time_per_iteration": 2.6045279502868652 }, { "auxiliary_loss_clip": 0.06421804, "auxiliary_loss_mlp": 0.01268113, "balance_loss_clip": 0.06276304, "balance_loss_mlp": 0.01256866, "epoch": 0.9724334886517361, "flos": 23009614444800.0, "grad_norm": 1.9095674969396332, "language_loss": 0.65046448, "learning_rate": 7.949000787339289e-09, "loss": 0.72736359, "num_input_tokens_seen": 349089730, "router_z_loss_clip": 1.45507812, "router_z_loss_mlp": 0.11254883, "step": 16174, "time_per_iteration": 4.062329530715942 }, { "auxiliary_loss_clip": 0.06414255, "auxiliary_loss_mlp": 0.01266798, "balance_loss_clip": 0.06278027, "balance_loss_mlp": 0.01258364, "epoch": 0.972493611904404, "flos": 25454067206400.0, "grad_norm": 1.6595423904422082, "language_loss": 0.78428042, "learning_rate": 7.914349775085538e-09, "loss": 0.8610909, "num_input_tokens_seen": 349111315, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.08435059, "step": 16175, "time_per_iteration": 2.6297175884246826 }, { "auxiliary_loss_clip": 0.06415401, "auxiliary_loss_mlp": 0.01267208, "balance_loss_clip": 0.06278096, "balance_loss_mlp": 0.01256861, "epoch": 0.972553735157072, "flos": 16988767315200.0, "grad_norm": 2.1823995630491426, "language_loss": 0.5721432, "learning_rate": 7.879774302919307e-09, "loss": 0.64896929, "num_input_tokens_seen": 349129495, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.10351562, "step": 16176, "time_per_iteration": 2.537506341934204 }, { "auxiliary_loss_clip": 0.0641157, "auxiliary_loss_mlp": 0.01264244, "balance_loss_clip": 0.0627517, "balance_loss_mlp": 0.01255482, "epoch": 0.97261385840974, "flos": 26111916011520.0, "grad_norm": 1.9516195081901764, "language_loss": 0.72361714, "learning_rate": 7.845274372151545e-09, "loss": 0.80037522, "num_input_tokens_seen": 349148850, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.08764648, "step": 16177, "time_per_iteration": 2.5992231369018555 }, { "auxiliary_loss_clip": 0.06419349, "auxiliary_loss_mlp": 0.01265931, "balance_loss_clip": 0.06276678, "balance_loss_mlp": 0.01256764, "epoch": 0.9726739816624079, "flos": 25455031528320.0, "grad_norm": 1.754947953769222, "language_loss": 0.68447816, "learning_rate": 7.810849984090984e-09, "loss": 0.76133096, "num_input_tokens_seen": 349167620, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.0916748, "step": 16178, "time_per_iteration": 4.0440757274627686 }, { "auxiliary_loss_clip": 0.06418654, "auxiliary_loss_mlp": 0.01264596, "balance_loss_clip": 0.06276226, "balance_loss_mlp": 0.01254255, "epoch": 0.972734104915076, "flos": 29021237625600.0, "grad_norm": 2.0656244652646394, "language_loss": 0.6742909, "learning_rate": 7.776501140042358e-09, "loss": 0.75112337, "num_input_tokens_seen": 349185845, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.10345459, "step": 16179, "time_per_iteration": 2.599957227706909 }, { "auxiliary_loss_clip": 0.06410199, "auxiliary_loss_mlp": 0.01261974, "balance_loss_clip": 0.06274322, "balance_loss_mlp": 0.01253182, "epoch": 0.9727942281677439, "flos": 23443861829760.0, "grad_norm": 1.956709601912439, "language_loss": 0.77473044, "learning_rate": 7.742227841308624e-09, "loss": 0.85145217, "num_input_tokens_seen": 349204525, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.08795166, "step": 16180, "time_per_iteration": 2.5725295543670654 }, { "auxiliary_loss_clip": 0.06418953, "auxiliary_loss_mlp": 0.01266327, "balance_loss_clip": 0.06275174, "balance_loss_mlp": 0.01256331, "epoch": 0.9728543514204119, "flos": 31732994511360.0, "grad_norm": 1.489045548201952, "language_loss": 0.7671991, "learning_rate": 7.708030089189188e-09, "loss": 0.8440519, "num_input_tokens_seen": 349228075, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.09991455, "step": 16181, "time_per_iteration": 2.696439743041992 }, { "auxiliary_loss_clip": 0.06414717, "auxiliary_loss_mlp": 0.01265304, "balance_loss_clip": 0.06276213, "balance_loss_mlp": 0.01256125, "epoch": 0.9729144746730798, "flos": 16294888454400.0, "grad_norm": 1.5622467799219495, "language_loss": 0.63525867, "learning_rate": 7.67390788498079e-09, "loss": 0.7120589, "num_input_tokens_seen": 349246990, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.0916748, "step": 16182, "time_per_iteration": 2.590742349624634 }, { "auxiliary_loss_clip": 0.06415915, "auxiliary_loss_mlp": 0.01263506, "balance_loss_clip": 0.06275286, "balance_loss_mlp": 0.01254053, "epoch": 0.9729745979257478, "flos": 25047632177280.0, "grad_norm": 1.7837541314167316, "language_loss": 0.62478328, "learning_rate": 7.639861229977507e-09, "loss": 0.70157754, "num_input_tokens_seen": 349265890, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09454346, "step": 16183, "time_per_iteration": 2.570452928543091 }, { "auxiliary_loss_clip": 0.06406733, "auxiliary_loss_mlp": 0.01269479, "balance_loss_clip": 0.06270911, "balance_loss_mlp": 0.01259979, "epoch": 0.9730347211784157, "flos": 22645456600320.0, "grad_norm": 2.4390655885722214, "language_loss": 0.77829409, "learning_rate": 7.605890125470527e-09, "loss": 0.85505623, "num_input_tokens_seen": 349285275, "router_z_loss_clip": 1.35742188, "router_z_loss_mlp": 0.0949707, "step": 16184, "time_per_iteration": 2.6088709831237793 }, { "auxiliary_loss_clip": 0.06412031, "auxiliary_loss_mlp": 0.01263587, "balance_loss_clip": 0.06274052, "balance_loss_mlp": 0.01254587, "epoch": 0.9730948444310837, "flos": 11003195554560.0, "grad_norm": 2.4456075534232546, "language_loss": 0.79558849, "learning_rate": 7.571994572747709e-09, "loss": 0.87234461, "num_input_tokens_seen": 349301515, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09008789, "step": 16185, "time_per_iteration": 2.631728410720825 }, { "auxiliary_loss_clip": 0.06417871, "auxiliary_loss_mlp": 0.01262148, "balance_loss_clip": 0.06276486, "balance_loss_mlp": 0.0125313, "epoch": 0.9731549676837516, "flos": 16804969384320.0, "grad_norm": 1.7151597587058878, "language_loss": 0.77591825, "learning_rate": 7.538174573094469e-09, "loss": 0.85271847, "num_input_tokens_seen": 349319590, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09014893, "step": 16186, "time_per_iteration": 2.5503158569335938 }, { "auxiliary_loss_clip": 0.06415021, "auxiliary_loss_mlp": 0.01265266, "balance_loss_clip": 0.06277108, "balance_loss_mlp": 0.01255855, "epoch": 0.9732150909364197, "flos": 21148057411200.0, "grad_norm": 1.596173863234303, "language_loss": 0.65241969, "learning_rate": 7.504430127793337e-09, "loss": 0.72922254, "num_input_tokens_seen": 349339230, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09405518, "step": 16187, "time_per_iteration": 2.558546781539917 }, { "auxiliary_loss_clip": 0.06409585, "auxiliary_loss_mlp": 0.0126406, "balance_loss_clip": 0.06272531, "balance_loss_mlp": 0.01254404, "epoch": 0.9732752141890876, "flos": 33735401458560.0, "grad_norm": 1.7600130266603546, "language_loss": 0.80984443, "learning_rate": 7.47076123812418e-09, "loss": 0.88658094, "num_input_tokens_seen": 349361155, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.09655762, "step": 16188, "time_per_iteration": 2.6544675827026367 }, { "auxiliary_loss_clip": 0.06412359, "auxiliary_loss_mlp": 0.0126624, "balance_loss_clip": 0.06278164, "balance_loss_mlp": 0.01257645, "epoch": 0.9733353374417556, "flos": 23411144010240.0, "grad_norm": 1.7032171735292052, "language_loss": 0.78286374, "learning_rate": 7.437167905363084e-09, "loss": 0.85964972, "num_input_tokens_seen": 349379335, "router_z_loss_clip": 1.34179688, "router_z_loss_mlp": 0.08587646, "step": 16189, "time_per_iteration": 4.041364908218384 }, { "auxiliary_loss_clip": 0.06413203, "auxiliary_loss_mlp": 0.01262719, "balance_loss_clip": 0.0627502, "balance_loss_mlp": 0.01253128, "epoch": 0.9733954606944236, "flos": 39175113795840.0, "grad_norm": 1.7523058228116064, "language_loss": 0.51611054, "learning_rate": 7.403650130784367e-09, "loss": 0.59286982, "num_input_tokens_seen": 349401575, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.0958252, "step": 16190, "time_per_iteration": 2.706453561782837 }, { "auxiliary_loss_clip": 0.06412762, "auxiliary_loss_mlp": 0.0126373, "balance_loss_clip": 0.06272866, "balance_loss_mlp": 0.01254086, "epoch": 0.9734555839470915, "flos": 21988404408960.0, "grad_norm": 2.2674804004022047, "language_loss": 0.809645, "learning_rate": 7.3702079156590105e-09, "loss": 0.88640994, "num_input_tokens_seen": 349420650, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09637451, "step": 16191, "time_per_iteration": 2.5458834171295166 }, { "auxiliary_loss_clip": 0.06412297, "auxiliary_loss_mlp": 0.01265886, "balance_loss_clip": 0.06274069, "balance_loss_mlp": 0.01256975, "epoch": 0.9735157071997596, "flos": 16580152080000.0, "grad_norm": 1.6281066407689624, "language_loss": 0.82863277, "learning_rate": 7.336841261255111e-09, "loss": 0.90541458, "num_input_tokens_seen": 349436830, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.08905029, "step": 16192, "time_per_iteration": 2.5132644176483154 }, { "auxiliary_loss_clip": 0.06417388, "auxiliary_loss_mlp": 0.01265373, "balance_loss_clip": 0.06277037, "balance_loss_mlp": 0.01256081, "epoch": 0.9735758304524275, "flos": 20228313070080.0, "grad_norm": 1.8726961735743635, "language_loss": 0.75245881, "learning_rate": 7.303550168837658e-09, "loss": 0.8292864, "num_input_tokens_seen": 349454325, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09283447, "step": 16193, "time_per_iteration": 3.958383798599243 }, { "auxiliary_loss_clip": 0.06409369, "auxiliary_loss_mlp": 0.01263094, "balance_loss_clip": 0.06274895, "balance_loss_mlp": 0.01254523, "epoch": 0.9736359537050955, "flos": 23659077841920.0, "grad_norm": 1.5892509755490989, "language_loss": 0.85547292, "learning_rate": 7.270334639669417e-09, "loss": 0.93219763, "num_input_tokens_seen": 349470230, "router_z_loss_clip": 1.34472656, "router_z_loss_mlp": 0.08563232, "step": 16194, "time_per_iteration": 2.551267147064209 }, { "auxiliary_loss_clip": 0.06406099, "auxiliary_loss_mlp": 0.01266558, "balance_loss_clip": 0.06273649, "balance_loss_mlp": 0.0125788, "epoch": 0.9736960769577634, "flos": 15565692297600.0, "grad_norm": 1.4648977965861103, "language_loss": 0.75991106, "learning_rate": 7.237194675009828e-09, "loss": 0.83663762, "num_input_tokens_seen": 349486250, "router_z_loss_clip": 1.32519531, "router_z_loss_mlp": 0.08679199, "step": 16195, "time_per_iteration": 2.49748158454895 }, { "auxiliary_loss_clip": 0.06313166, "auxiliary_loss_mlp": 0.01251686, "balance_loss_clip": 0.06258039, "balance_loss_mlp": 0.0125068, "epoch": 0.9737562002104314, "flos": 65369781313920.0, "grad_norm": 0.7556338158529072, "language_loss": 0.52437556, "learning_rate": 7.204130276115439e-09, "loss": 0.6000241, "num_input_tokens_seen": 349545865, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.01005554, "step": 16196, "time_per_iteration": 3.110431432723999 }, { "auxiliary_loss_clip": 0.06414167, "auxiliary_loss_mlp": 0.01265371, "balance_loss_clip": 0.06275059, "balance_loss_mlp": 0.01255853, "epoch": 0.9738163234630993, "flos": 27203760639360.0, "grad_norm": 1.487141757843979, "language_loss": 0.76839602, "learning_rate": 7.171141444240136e-09, "loss": 0.84519136, "num_input_tokens_seen": 349566080, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09521484, "step": 16197, "time_per_iteration": 2.589381217956543 }, { "auxiliary_loss_clip": 0.06420669, "auxiliary_loss_mlp": 0.01267085, "balance_loss_clip": 0.06276091, "balance_loss_mlp": 0.01256583, "epoch": 0.9738764467157673, "flos": 21075745737600.0, "grad_norm": 1.7149063950882382, "language_loss": 0.68037909, "learning_rate": 7.13822818063492e-09, "loss": 0.75725663, "num_input_tokens_seen": 349585665, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.10491943, "step": 16198, "time_per_iteration": 2.5488109588623047 }, { "auxiliary_loss_clip": 0.0641083, "auxiliary_loss_mlp": 0.01263083, "balance_loss_clip": 0.06270907, "balance_loss_mlp": 0.01253976, "epoch": 0.9739365699684353, "flos": 21367633835520.0, "grad_norm": 1.808732142172339, "language_loss": 0.78443807, "learning_rate": 7.10539048654768e-09, "loss": 0.86117721, "num_input_tokens_seen": 349605125, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09106445, "step": 16199, "time_per_iteration": 2.528110980987549 }, { "auxiliary_loss_clip": 0.06415221, "auxiliary_loss_mlp": 0.01264609, "balance_loss_clip": 0.06275575, "balance_loss_mlp": 0.01254035, "epoch": 0.9739966932211033, "flos": 21907497692160.0, "grad_norm": 2.0633697734192467, "language_loss": 0.79639196, "learning_rate": 7.072628363223865e-09, "loss": 0.87319028, "num_input_tokens_seen": 349623360, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.10565186, "step": 16200, "time_per_iteration": 2.551051378250122 }, { "auxiliary_loss_clip": 0.06420498, "auxiliary_loss_mlp": 0.01264635, "balance_loss_clip": 0.06273255, "balance_loss_mlp": 0.0125443, "epoch": 0.9740568164737712, "flos": 24834344808960.0, "grad_norm": 1.9004679791023484, "language_loss": 0.69004881, "learning_rate": 7.039941811905592e-09, "loss": 0.76690018, "num_input_tokens_seen": 349644390, "router_z_loss_clip": 1.47265625, "router_z_loss_mlp": 0.10205078, "step": 16201, "time_per_iteration": 2.5846951007843018 }, { "auxiliary_loss_clip": 0.06417034, "auxiliary_loss_mlp": 0.01263963, "balance_loss_clip": 0.06277879, "balance_loss_mlp": 0.01255091, "epoch": 0.9741169397264392, "flos": 23630426945280.0, "grad_norm": 1.3238874351025642, "language_loss": 0.72736478, "learning_rate": 7.0073308338325364e-09, "loss": 0.80417478, "num_input_tokens_seen": 349663200, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.0887146, "step": 16202, "time_per_iteration": 2.590177297592163 }, { "auxiliary_loss_clip": 0.06416369, "auxiliary_loss_mlp": 0.01263693, "balance_loss_clip": 0.06275345, "balance_loss_mlp": 0.01254353, "epoch": 0.9741770629791072, "flos": 18846718623360.0, "grad_norm": 1.8871720617183367, "language_loss": 0.73109078, "learning_rate": 6.974795430241265e-09, "loss": 0.80789143, "num_input_tokens_seen": 349681975, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09344482, "step": 16203, "time_per_iteration": 2.5685794353485107 }, { "auxiliary_loss_clip": 0.06413585, "auxiliary_loss_mlp": 0.01265707, "balance_loss_clip": 0.06274314, "balance_loss_mlp": 0.01256105, "epoch": 0.9742371862317751, "flos": 22352813815680.0, "grad_norm": 1.6570482800076418, "language_loss": 0.77158225, "learning_rate": 6.942335602365235e-09, "loss": 0.8483752, "num_input_tokens_seen": 349701185, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09606934, "step": 16204, "time_per_iteration": 2.5981359481811523 }, { "auxiliary_loss_clip": 0.06421942, "auxiliary_loss_mlp": 0.01266343, "balance_loss_clip": 0.06281674, "balance_loss_mlp": 0.01256407, "epoch": 0.9742973094844432, "flos": 21769289182080.0, "grad_norm": 2.4054053867645253, "language_loss": 0.79837179, "learning_rate": 6.909951351435905e-09, "loss": 0.87525463, "num_input_tokens_seen": 349720360, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09924316, "step": 16205, "time_per_iteration": 2.6056854724884033 }, { "auxiliary_loss_clip": 0.06410494, "auxiliary_loss_mlp": 0.01264827, "balance_loss_clip": 0.06272975, "balance_loss_mlp": 0.01255874, "epoch": 0.9743574327371111, "flos": 26255700817920.0, "grad_norm": 1.5568525508874582, "language_loss": 0.74375725, "learning_rate": 6.87764267868074e-09, "loss": 0.82051045, "num_input_tokens_seen": 349741040, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.08947754, "step": 16206, "time_per_iteration": 2.6713309288024902 }, { "auxiliary_loss_clip": 0.06414133, "auxiliary_loss_mlp": 0.01262418, "balance_loss_clip": 0.06273499, "balance_loss_mlp": 0.01252613, "epoch": 0.9744175559897791, "flos": 12354252387840.0, "grad_norm": 1.9773887556856329, "language_loss": 0.84342486, "learning_rate": 6.8454095853252015e-09, "loss": 0.92019033, "num_input_tokens_seen": 349758895, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.0980835, "step": 16207, "time_per_iteration": 2.510035514831543 }, { "auxiliary_loss_clip": 0.06412171, "auxiliary_loss_mlp": 0.01264118, "balance_loss_clip": 0.062759, "balance_loss_mlp": 0.01255022, "epoch": 0.974477679242447, "flos": 28404575902080.0, "grad_norm": 1.520334528651236, "language_loss": 0.71236914, "learning_rate": 6.813252072591425e-09, "loss": 0.789132, "num_input_tokens_seen": 349779740, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.09094238, "step": 16208, "time_per_iteration": 2.587256908416748 }, { "auxiliary_loss_clip": 0.06407264, "auxiliary_loss_mlp": 0.01263979, "balance_loss_clip": 0.06275812, "balance_loss_mlp": 0.01255694, "epoch": 0.974537802495115, "flos": 17791155613440.0, "grad_norm": 1.6527527784161171, "language_loss": 0.77424574, "learning_rate": 6.781170141698878e-09, "loss": 0.85095811, "num_input_tokens_seen": 349796820, "router_z_loss_clip": 1.31542969, "router_z_loss_mlp": 0.08282471, "step": 16209, "time_per_iteration": 2.5291359424591064 }, { "auxiliary_loss_clip": 0.06414527, "auxiliary_loss_mlp": 0.01263628, "balance_loss_clip": 0.06272517, "balance_loss_mlp": 0.01253644, "epoch": 0.9745979257477829, "flos": 23849164828800.0, "grad_norm": 1.8919096300324902, "language_loss": 0.79457939, "learning_rate": 6.749163793864144e-09, "loss": 0.8713609, "num_input_tokens_seen": 349816550, "router_z_loss_clip": 1.41894531, "router_z_loss_mlp": 0.09985352, "step": 16210, "time_per_iteration": 2.549146890640259 }, { "auxiliary_loss_clip": 0.0641482, "auxiliary_loss_mlp": 0.01261005, "balance_loss_clip": 0.06275837, "balance_loss_mlp": 0.01251545, "epoch": 0.9746580490004509, "flos": 27023484579840.0, "grad_norm": 1.9474387259317127, "language_loss": 0.78089875, "learning_rate": 6.7172330303009176e-09, "loss": 0.85765702, "num_input_tokens_seen": 349834350, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09454346, "step": 16211, "time_per_iteration": 2.5674703121185303 }, { "auxiliary_loss_clip": 0.06423371, "auxiliary_loss_mlp": 0.01266711, "balance_loss_clip": 0.06278475, "balance_loss_mlp": 0.01255762, "epoch": 0.9747181722531189, "flos": 19798132608000.0, "grad_norm": 2.177365481690824, "language_loss": 0.78279376, "learning_rate": 6.685377852219787e-09, "loss": 0.85969466, "num_input_tokens_seen": 349853460, "router_z_loss_clip": 1.44921875, "router_z_loss_mlp": 0.10943604, "step": 16212, "time_per_iteration": 2.5307931900024414 }, { "auxiliary_loss_clip": 0.06410964, "auxiliary_loss_mlp": 0.0126563, "balance_loss_clip": 0.06275658, "balance_loss_mlp": 0.0125697, "epoch": 0.9747782955057869, "flos": 31438590791040.0, "grad_norm": 1.502606632356964, "language_loss": 0.80395234, "learning_rate": 6.653598260829118e-09, "loss": 0.88071829, "num_input_tokens_seen": 349874830, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.08660889, "step": 16213, "time_per_iteration": 4.007638692855835 }, { "auxiliary_loss_clip": 0.06413076, "auxiliary_loss_mlp": 0.01263992, "balance_loss_clip": 0.06275289, "balance_loss_mlp": 0.01254682, "epoch": 0.9748384187584548, "flos": 15966802592640.0, "grad_norm": 1.8952701125490434, "language_loss": 0.66819263, "learning_rate": 6.6218942573335044e-09, "loss": 0.74496335, "num_input_tokens_seen": 349893690, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09301758, "step": 16214, "time_per_iteration": 2.5139660835266113 }, { "auxiliary_loss_clip": 0.0641728, "auxiliary_loss_mlp": 0.01269765, "balance_loss_clip": 0.06274985, "balance_loss_mlp": 0.01259566, "epoch": 0.9748985420111228, "flos": 20565035902080.0, "grad_norm": 1.6151045099718344, "language_loss": 0.74593997, "learning_rate": 6.5902658429355386e-09, "loss": 0.82281041, "num_input_tokens_seen": 349912480, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.10192871, "step": 16215, "time_per_iteration": 2.525679349899292 }, { "auxiliary_loss_clip": 0.06412129, "auxiliary_loss_mlp": 0.01267374, "balance_loss_clip": 0.06274635, "balance_loss_mlp": 0.01258356, "epoch": 0.9749586652637908, "flos": 36730577180160.0, "grad_norm": 1.6928053820580768, "language_loss": 0.67050344, "learning_rate": 6.558713018834483e-09, "loss": 0.74729848, "num_input_tokens_seen": 349932470, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09020996, "step": 16216, "time_per_iteration": 2.691735029220581 }, { "auxiliary_loss_clip": 0.06418721, "auxiliary_loss_mlp": 0.01264237, "balance_loss_clip": 0.06276134, "balance_loss_mlp": 0.01254766, "epoch": 0.9750187885164587, "flos": 11003908314240.0, "grad_norm": 2.807576205862948, "language_loss": 0.72136223, "learning_rate": 6.527235786226937e-09, "loss": 0.79819185, "num_input_tokens_seen": 349949060, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.09466553, "step": 16217, "time_per_iteration": 2.53729510307312 }, { "auxiliary_loss_clip": 0.06412561, "auxiliary_loss_mlp": 0.01262144, "balance_loss_clip": 0.0627484, "balance_loss_mlp": 0.01252875, "epoch": 0.9750789117691268, "flos": 25746668064000.0, "grad_norm": 1.5620030797808762, "language_loss": 0.78735054, "learning_rate": 6.495834146306167e-09, "loss": 0.8640976, "num_input_tokens_seen": 349968010, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.0927124, "step": 16218, "time_per_iteration": 4.0710930824279785 }, { "auxiliary_loss_clip": 0.0641059, "auxiliary_loss_mlp": 0.01264585, "balance_loss_clip": 0.06276405, "balance_loss_mlp": 0.01254899, "epoch": 0.9751390350217947, "flos": 13338971170560.0, "grad_norm": 5.203912646683935, "language_loss": 0.77839506, "learning_rate": 6.464508100263222e-09, "loss": 0.85514683, "num_input_tokens_seen": 349985270, "router_z_loss_clip": 1.34277344, "router_z_loss_mlp": 0.09680176, "step": 16219, "time_per_iteration": 2.5144524574279785 }, { "auxiliary_loss_clip": 0.06413458, "auxiliary_loss_mlp": 0.01261106, "balance_loss_clip": 0.06274143, "balance_loss_mlp": 0.01252273, "epoch": 0.9751991582744627, "flos": 22827283960320.0, "grad_norm": 1.6855479697253961, "language_loss": 0.81322926, "learning_rate": 6.433257649285817e-09, "loss": 0.88997483, "num_input_tokens_seen": 350003935, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.08828735, "step": 16220, "time_per_iteration": 2.60711669921875 }, { "auxiliary_loss_clip": 0.06412522, "auxiliary_loss_mlp": 0.01261792, "balance_loss_clip": 0.06275894, "balance_loss_mlp": 0.01252911, "epoch": 0.9752592815271306, "flos": 19652293376640.0, "grad_norm": 2.679146970319411, "language_loss": 0.75676852, "learning_rate": 6.402082794559227e-09, "loss": 0.83351165, "num_input_tokens_seen": 350023595, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.08880615, "step": 16221, "time_per_iteration": 2.547182083129883 }, { "auxiliary_loss_clip": 0.0641254, "auxiliary_loss_mlp": 0.01265157, "balance_loss_clip": 0.06276188, "balance_loss_mlp": 0.01256198, "epoch": 0.9753194047797986, "flos": 26698165902720.0, "grad_norm": 1.5118708100462201, "language_loss": 0.66726691, "learning_rate": 6.370983537265395e-09, "loss": 0.74404395, "num_input_tokens_seen": 350045920, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.08947754, "step": 16222, "time_per_iteration": 2.629544258117676 }, { "auxiliary_loss_clip": 0.0641093, "auxiliary_loss_mlp": 0.01264109, "balance_loss_clip": 0.06274691, "balance_loss_mlp": 0.01255193, "epoch": 0.9753795280324665, "flos": 23228478109440.0, "grad_norm": 1.680205355039328, "language_loss": 0.88502544, "learning_rate": 6.3399598785836004e-09, "loss": 0.96177584, "num_input_tokens_seen": 350063925, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.08917236, "step": 16223, "time_per_iteration": 2.567805051803589 }, { "auxiliary_loss_clip": 0.06406072, "auxiliary_loss_mlp": 0.0126369, "balance_loss_clip": 0.06270567, "balance_loss_mlp": 0.01254827, "epoch": 0.9754396512851345, "flos": 19469920965120.0, "grad_norm": 1.614519712612873, "language_loss": 0.7467556, "learning_rate": 6.309011819690457e-09, "loss": 0.82345319, "num_input_tokens_seen": 350080900, "router_z_loss_clip": 1.35546875, "router_z_loss_mlp": 0.08868408, "step": 16224, "time_per_iteration": 2.555323362350464 }, { "auxiliary_loss_clip": 0.06320626, "auxiliary_loss_mlp": 0.01251676, "balance_loss_clip": 0.06265606, "balance_loss_mlp": 0.01250618, "epoch": 0.9754997745378025, "flos": 68478875061120.0, "grad_norm": 0.7922782356197136, "language_loss": 0.58987486, "learning_rate": 6.278139361759249e-09, "loss": 0.66559786, "num_input_tokens_seen": 350144550, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.01059723, "step": 16225, "time_per_iteration": 3.1462879180908203 }, { "auxiliary_loss_clip": 0.06414178, "auxiliary_loss_mlp": 0.01265317, "balance_loss_clip": 0.06277914, "balance_loss_mlp": 0.01256198, "epoch": 0.9755598977904705, "flos": 26402252808960.0, "grad_norm": 2.1455608021571257, "language_loss": 0.69348747, "learning_rate": 6.247342505960818e-09, "loss": 0.77028245, "num_input_tokens_seen": 350164050, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.09112549, "step": 16226, "time_per_iteration": 2.600255250930786 }, { "auxiliary_loss_clip": 0.06413369, "auxiliary_loss_mlp": 0.01262113, "balance_loss_clip": 0.06274566, "balance_loss_mlp": 0.01252887, "epoch": 0.9756200210431384, "flos": 16623225878400.0, "grad_norm": 1.6526163187024268, "language_loss": 0.8328017, "learning_rate": 6.216621253462894e-09, "loss": 0.90955657, "num_input_tokens_seen": 350181350, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09216309, "step": 16227, "time_per_iteration": 2.568519115447998 }, { "auxiliary_loss_clip": 0.06413145, "auxiliary_loss_mlp": 0.01265746, "balance_loss_clip": 0.06275685, "balance_loss_mlp": 0.01256448, "epoch": 0.9756801442958064, "flos": 23629798039680.0, "grad_norm": 1.5374991957010822, "language_loss": 0.77791691, "learning_rate": 6.185975605430549e-09, "loss": 0.85470581, "num_input_tokens_seen": 350199765, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09301758, "step": 16228, "time_per_iteration": 2.5831222534179688 }, { "auxiliary_loss_clip": 0.06315953, "auxiliary_loss_mlp": 0.01250463, "balance_loss_clip": 0.06260814, "balance_loss_mlp": 0.01249472, "epoch": 0.9757402675484744, "flos": 61642432615680.0, "grad_norm": 0.8241124936423578, "language_loss": 0.55769575, "learning_rate": 6.155405563025962e-09, "loss": 0.63335991, "num_input_tokens_seen": 350256420, "router_z_loss_clip": 0.55126953, "router_z_loss_mlp": 0.00991058, "step": 16229, "time_per_iteration": 4.6572301387786865 }, { "auxiliary_loss_clip": 0.06413471, "auxiliary_loss_mlp": 0.01267671, "balance_loss_clip": 0.06275627, "balance_loss_mlp": 0.01258546, "epoch": 0.9758003908011423, "flos": 24065470944000.0, "grad_norm": 3.806377707250815, "language_loss": 0.75203228, "learning_rate": 6.124911127407984e-09, "loss": 0.82884371, "num_input_tokens_seen": 350276270, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09124756, "step": 16230, "time_per_iteration": 2.6124634742736816 }, { "auxiliary_loss_clip": 0.06406829, "auxiliary_loss_mlp": 0.01264347, "balance_loss_clip": 0.06273966, "balance_loss_mlp": 0.01255615, "epoch": 0.9758605140538104, "flos": 17498764391040.0, "grad_norm": 1.7925929151889834, "language_loss": 0.71626896, "learning_rate": 6.094492299733245e-09, "loss": 0.79298073, "num_input_tokens_seen": 350295000, "router_z_loss_clip": 1.32910156, "router_z_loss_mlp": 0.08734131, "step": 16231, "time_per_iteration": 2.6132290363311768 }, { "auxiliary_loss_clip": 0.06420596, "auxiliary_loss_mlp": 0.01267721, "balance_loss_clip": 0.06277195, "balance_loss_mlp": 0.01257207, "epoch": 0.9759206373064783, "flos": 24833883611520.0, "grad_norm": 2.1399096763003183, "language_loss": 0.76742291, "learning_rate": 6.064149081155267e-09, "loss": 0.84430605, "num_input_tokens_seen": 350314980, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.10510254, "step": 16232, "time_per_iteration": 2.7731082439422607 }, { "auxiliary_loss_clip": 0.06315696, "auxiliary_loss_mlp": 0.01250544, "balance_loss_clip": 0.06260648, "balance_loss_mlp": 0.01249474, "epoch": 0.9759807605591463, "flos": 68179649731200.0, "grad_norm": 0.7335058430199183, "language_loss": 0.5383538, "learning_rate": 6.033881472824465e-09, "loss": 0.61401623, "num_input_tokens_seen": 350371985, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.01071167, "step": 16233, "time_per_iteration": 4.465801477432251 }, { "auxiliary_loss_clip": 0.06413692, "auxiliary_loss_mlp": 0.01266195, "balance_loss_clip": 0.06275955, "balance_loss_mlp": 0.01257099, "epoch": 0.9760408838118142, "flos": 18995199258240.0, "grad_norm": 1.679431476893574, "language_loss": 0.71766359, "learning_rate": 6.003689475888807e-09, "loss": 0.79446244, "num_input_tokens_seen": 350390590, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09100342, "step": 16234, "time_per_iteration": 2.547111988067627 }, { "auxiliary_loss_clip": 0.06420608, "auxiliary_loss_mlp": 0.01266911, "balance_loss_clip": 0.06275938, "balance_loss_mlp": 0.01256701, "epoch": 0.9761010070644822, "flos": 17131210456320.0, "grad_norm": 2.498473699236987, "language_loss": 0.79407108, "learning_rate": 5.973573091493156e-09, "loss": 0.87094629, "num_input_tokens_seen": 350403770, "router_z_loss_clip": 1.44628906, "router_z_loss_mlp": 0.10211182, "step": 16235, "time_per_iteration": 2.502993583679199 }, { "auxiliary_loss_clip": 0.06413335, "auxiliary_loss_mlp": 0.01266023, "balance_loss_clip": 0.06276149, "balance_loss_mlp": 0.01255943, "epoch": 0.9761611303171501, "flos": 22058829365760.0, "grad_norm": 1.8066831221982194, "language_loss": 0.76965457, "learning_rate": 5.943532320779265e-09, "loss": 0.84644818, "num_input_tokens_seen": 350421870, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.10076904, "step": 16236, "time_per_iteration": 2.558054208755493 }, { "auxiliary_loss_clip": 0.06413807, "auxiliary_loss_mlp": 0.01265189, "balance_loss_clip": 0.06274775, "balance_loss_mlp": 0.01256075, "epoch": 0.9762212535698181, "flos": 21763167834240.0, "grad_norm": 1.7290392665910916, "language_loss": 0.75787503, "learning_rate": 5.913567164886446e-09, "loss": 0.83466494, "num_input_tokens_seen": 350440025, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09112549, "step": 16237, "time_per_iteration": 2.5151360034942627 }, { "auxiliary_loss_clip": 0.0641401, "auxiliary_loss_mlp": 0.01264771, "balance_loss_clip": 0.06274351, "balance_loss_mlp": 0.01254203, "epoch": 0.9762813768224861, "flos": 25928746986240.0, "grad_norm": 1.6313372488976636, "language_loss": 0.73228246, "learning_rate": 5.8836776249509e-09, "loss": 0.80907023, "num_input_tokens_seen": 350459435, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.10571289, "step": 16238, "time_per_iteration": 2.5853681564331055 }, { "auxiliary_loss_clip": 0.06416374, "auxiliary_loss_mlp": 0.01269705, "balance_loss_clip": 0.06276625, "balance_loss_mlp": 0.01260175, "epoch": 0.9763415000751541, "flos": 24057169390080.0, "grad_norm": 2.210277647272603, "language_loss": 0.84378666, "learning_rate": 5.8538637021063875e-09, "loss": 0.92064744, "num_input_tokens_seen": 350472655, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09527588, "step": 16239, "time_per_iteration": 2.5184788703918457 }, { "auxiliary_loss_clip": 0.06415769, "auxiliary_loss_mlp": 0.01267067, "balance_loss_clip": 0.06277186, "balance_loss_mlp": 0.01257929, "epoch": 0.976401623327822, "flos": 17024252319360.0, "grad_norm": 5.135048228812485, "language_loss": 0.60666764, "learning_rate": 5.824125397483115e-09, "loss": 0.683496, "num_input_tokens_seen": 350488160, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09143066, "step": 16240, "time_per_iteration": 2.5557620525360107 }, { "auxiliary_loss_clip": 0.06409279, "auxiliary_loss_mlp": 0.01264945, "balance_loss_clip": 0.06273526, "balance_loss_mlp": 0.01255641, "epoch": 0.97646174658049, "flos": 16112432188800.0, "grad_norm": 1.8154953661629645, "language_loss": 0.82603604, "learning_rate": 5.7944627122088474e-09, "loss": 0.90277833, "num_input_tokens_seen": 350506065, "router_z_loss_clip": 1.35742188, "router_z_loss_mlp": 0.09307861, "step": 16241, "time_per_iteration": 2.511758327484131 }, { "auxiliary_loss_clip": 0.06417164, "auxiliary_loss_mlp": 0.01266883, "balance_loss_clip": 0.06278247, "balance_loss_mlp": 0.01257215, "epoch": 0.9765218698331579, "flos": 21259292106240.0, "grad_norm": 1.639110819575365, "language_loss": 0.83366555, "learning_rate": 5.764875647408463e-09, "loss": 0.91050601, "num_input_tokens_seen": 350524495, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09667969, "step": 16242, "time_per_iteration": 2.602781057357788 }, { "auxiliary_loss_clip": 0.06415042, "auxiliary_loss_mlp": 0.01267875, "balance_loss_clip": 0.06276235, "balance_loss_mlp": 0.0125832, "epoch": 0.9765819930858259, "flos": 18593963182080.0, "grad_norm": 1.5995937506992197, "language_loss": 0.75705469, "learning_rate": 5.7353642042037294e-09, "loss": 0.83388388, "num_input_tokens_seen": 350544185, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09552002, "step": 16243, "time_per_iteration": 2.5644748210906982 }, { "auxiliary_loss_clip": 0.06411866, "auxiliary_loss_mlp": 0.01271087, "balance_loss_clip": 0.06273157, "balance_loss_mlp": 0.01260424, "epoch": 0.976642116338494, "flos": 20273105877120.0, "grad_norm": 1.5016638078926583, "language_loss": 0.70186955, "learning_rate": 5.705928383713754e-09, "loss": 0.7786991, "num_input_tokens_seen": 350562675, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.10662842, "step": 16244, "time_per_iteration": 2.575324296951294 }, { "auxiliary_loss_clip": 0.06417044, "auxiliary_loss_mlp": 0.01265755, "balance_loss_clip": 0.06276674, "balance_loss_mlp": 0.0125607, "epoch": 0.9767022395911619, "flos": 25556497223040.0, "grad_norm": 2.0228556757696903, "language_loss": 0.84138393, "learning_rate": 5.676568187055197e-09, "loss": 0.91821194, "num_input_tokens_seen": 350581535, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09686279, "step": 16245, "time_per_iteration": 2.5837483406066895 }, { "auxiliary_loss_clip": 0.06409509, "auxiliary_loss_mlp": 0.01262901, "balance_loss_clip": 0.06272203, "balance_loss_mlp": 0.0125371, "epoch": 0.9767623628438299, "flos": 21769163400960.0, "grad_norm": 1.2923400481176253, "language_loss": 0.78504145, "learning_rate": 5.647283615340726e-09, "loss": 0.8617655, "num_input_tokens_seen": 350601615, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.09185791, "step": 16246, "time_per_iteration": 2.544171094894409 }, { "auxiliary_loss_clip": 0.06401456, "auxiliary_loss_mlp": 0.01261433, "balance_loss_clip": 0.06272434, "balance_loss_mlp": 0.01253327, "epoch": 0.9768224860964978, "flos": 15856490292480.0, "grad_norm": 1.3534582095812513, "language_loss": 0.74179238, "learning_rate": 5.6180746696812275e-09, "loss": 0.81842124, "num_input_tokens_seen": 350619580, "router_z_loss_clip": 1.29003906, "router_z_loss_mlp": 0.08099365, "step": 16247, "time_per_iteration": 2.530221700668335 }, { "auxiliary_loss_clip": 0.06415753, "auxiliary_loss_mlp": 0.01264614, "balance_loss_clip": 0.0627676, "balance_loss_mlp": 0.01255119, "epoch": 0.9768826093491658, "flos": 25157441352960.0, "grad_norm": 1.7255735689214018, "language_loss": 0.80235279, "learning_rate": 5.58894135118404e-09, "loss": 0.87915653, "num_input_tokens_seen": 350640015, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.0949707, "step": 16248, "time_per_iteration": 2.6001064777374268 }, { "auxiliary_loss_clip": 0.06425917, "auxiliary_loss_mlp": 0.01268842, "balance_loss_clip": 0.06283358, "balance_loss_mlp": 0.0125766, "epoch": 0.9769427326018337, "flos": 22973794024320.0, "grad_norm": 1.672005022248949, "language_loss": 0.79592311, "learning_rate": 5.559883660954278e-09, "loss": 0.87287068, "num_input_tokens_seen": 350659155, "router_z_loss_clip": 1.42480469, "router_z_loss_mlp": 0.11187744, "step": 16249, "time_per_iteration": 2.578239917755127 }, { "auxiliary_loss_clip": 0.06408194, "auxiliary_loss_mlp": 0.01270023, "balance_loss_clip": 0.06273467, "balance_loss_mlp": 0.01260242, "epoch": 0.9770028558545018, "flos": 15269066444160.0, "grad_norm": 2.313262399972215, "language_loss": 0.67399561, "learning_rate": 5.530901600093507e-09, "loss": 0.75077772, "num_input_tokens_seen": 350676615, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.09771729, "step": 16250, "time_per_iteration": 2.597257137298584 }, { "auxiliary_loss_clip": 0.06315689, "auxiliary_loss_mlp": 0.01250954, "balance_loss_clip": 0.0626058, "balance_loss_mlp": 0.01249881, "epoch": 0.9770629791071697, "flos": 71470277349120.0, "grad_norm": 0.7730399499735742, "language_loss": 0.59759128, "learning_rate": 5.501995169700846e-09, "loss": 0.67325771, "num_input_tokens_seen": 350736805, "router_z_loss_clip": 0.55371094, "router_z_loss_mlp": 0.01074219, "step": 16251, "time_per_iteration": 3.283423900604248 }, { "auxiliary_loss_clip": 0.06415727, "auxiliary_loss_mlp": 0.01263951, "balance_loss_clip": 0.06276157, "balance_loss_mlp": 0.01254808, "epoch": 0.9771231023598377, "flos": 22418375235840.0, "grad_norm": 1.9860095981553731, "language_loss": 0.78795183, "learning_rate": 5.473164370872307e-09, "loss": 0.86474866, "num_input_tokens_seen": 350753600, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09143066, "step": 16252, "time_per_iteration": 3.9855034351348877 }, { "auxiliary_loss_clip": 0.06412859, "auxiliary_loss_mlp": 0.01263334, "balance_loss_clip": 0.06275123, "balance_loss_mlp": 0.01253368, "epoch": 0.9771832256125056, "flos": 19031942073600.0, "grad_norm": 3.0280892239659796, "language_loss": 0.64985645, "learning_rate": 5.444409204701461e-09, "loss": 0.72661841, "num_input_tokens_seen": 350771225, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09967041, "step": 16253, "time_per_iteration": 2.5528676509857178 }, { "auxiliary_loss_clip": 0.06419931, "auxiliary_loss_mlp": 0.01266079, "balance_loss_clip": 0.06278785, "balance_loss_mlp": 0.01255213, "epoch": 0.9772433488651736, "flos": 17827982282880.0, "grad_norm": 2.0666984231832273, "language_loss": 0.76988399, "learning_rate": 5.415729672278324e-09, "loss": 0.84674406, "num_input_tokens_seen": 350789100, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.10864258, "step": 16254, "time_per_iteration": 2.5666584968566895 }, { "auxiliary_loss_clip": 0.06417134, "auxiliary_loss_mlp": 0.01264495, "balance_loss_clip": 0.06275566, "balance_loss_mlp": 0.01254511, "epoch": 0.9773034721178415, "flos": 37638246533760.0, "grad_norm": 1.9520531619137433, "language_loss": 0.64053321, "learning_rate": 5.387125774690471e-09, "loss": 0.71734947, "num_input_tokens_seen": 350811085, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09991455, "step": 16255, "time_per_iteration": 2.6915018558502197 }, { "auxiliary_loss_clip": 0.06418423, "auxiliary_loss_mlp": 0.01264482, "balance_loss_clip": 0.0627524, "balance_loss_mlp": 0.01254546, "epoch": 0.9773635953705095, "flos": 20308590881280.0, "grad_norm": 1.7794463445590922, "language_loss": 0.76190746, "learning_rate": 5.358597513023033e-09, "loss": 0.83873653, "num_input_tokens_seen": 350831065, "router_z_loss_clip": 1.43164062, "router_z_loss_mlp": 0.0994873, "step": 16256, "time_per_iteration": 2.620305061340332 }, { "auxiliary_loss_clip": 0.06416309, "auxiliary_loss_mlp": 0.0126988, "balance_loss_clip": 0.06281208, "balance_loss_mlp": 0.01259747, "epoch": 0.9774237186231776, "flos": 22315735584000.0, "grad_norm": 8.508946455130838, "language_loss": 0.78470373, "learning_rate": 5.330144888357369e-09, "loss": 0.86156565, "num_input_tokens_seen": 350849675, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.10125732, "step": 16257, "time_per_iteration": 3.989645481109619 }, { "auxiliary_loss_clip": 0.06418596, "auxiliary_loss_mlp": 0.0126568, "balance_loss_clip": 0.0628093, "balance_loss_mlp": 0.01255947, "epoch": 0.9774838418758455, "flos": 24211435956480.0, "grad_norm": 1.6319728093523376, "language_loss": 0.75186908, "learning_rate": 5.301767901772391e-09, "loss": 0.82871187, "num_input_tokens_seen": 350868955, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09735107, "step": 16258, "time_per_iteration": 2.5868711471557617 }, { "auxiliary_loss_clip": 0.06313621, "auxiliary_loss_mlp": 0.01251724, "balance_loss_clip": 0.06258471, "balance_loss_mlp": 0.01250708, "epoch": 0.9775439651285135, "flos": 66378691998720.0, "grad_norm": 0.6791951813421274, "language_loss": 0.59711528, "learning_rate": 5.273466554344353e-09, "loss": 0.67276871, "num_input_tokens_seen": 350935110, "router_z_loss_clip": 0.55419922, "router_z_loss_mlp": 0.01016235, "step": 16259, "time_per_iteration": 3.2322020530700684 }, { "auxiliary_loss_clip": 0.06421532, "auxiliary_loss_mlp": 0.01265326, "balance_loss_clip": 0.06278739, "balance_loss_mlp": 0.01255492, "epoch": 0.9776040883811814, "flos": 22608168733440.0, "grad_norm": 2.389270923235087, "language_loss": 0.73474872, "learning_rate": 5.2452408471461705e-09, "loss": 0.81161731, "num_input_tokens_seen": 350953220, "router_z_loss_clip": 1.42578125, "router_z_loss_mlp": 0.09832764, "step": 16260, "time_per_iteration": 2.5722384452819824 }, { "auxiliary_loss_clip": 0.06414472, "auxiliary_loss_mlp": 0.01265211, "balance_loss_clip": 0.0627663, "balance_loss_mlp": 0.01255024, "epoch": 0.9776642116338494, "flos": 18448082023680.0, "grad_norm": 2.063904881596099, "language_loss": 0.79810816, "learning_rate": 5.2170907812485456e-09, "loss": 0.87490499, "num_input_tokens_seen": 350971915, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.10192871, "step": 16261, "time_per_iteration": 2.543781042098999 }, { "auxiliary_loss_clip": 0.06414086, "auxiliary_loss_mlp": 0.0126242, "balance_loss_clip": 0.06272607, "balance_loss_mlp": 0.01252836, "epoch": 0.9777243348865173, "flos": 22645121184000.0, "grad_norm": 2.18112687377505, "language_loss": 0.74506283, "learning_rate": 5.189016357718845e-09, "loss": 0.82182795, "num_input_tokens_seen": 350990470, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09588623, "step": 16262, "time_per_iteration": 2.6066079139709473 }, { "auxiliary_loss_clip": 0.06419182, "auxiliary_loss_mlp": 0.01266077, "balance_loss_clip": 0.06278487, "balance_loss_mlp": 0.01255205, "epoch": 0.9777844581391854, "flos": 31329410520960.0, "grad_norm": 2.231166619704655, "language_loss": 0.70055151, "learning_rate": 5.16101757762133e-09, "loss": 0.77740407, "num_input_tokens_seen": 351010755, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.10876465, "step": 16263, "time_per_iteration": 2.6640408039093018 }, { "auxiliary_loss_clip": 0.06413867, "auxiliary_loss_mlp": 0.01264309, "balance_loss_clip": 0.06274104, "balance_loss_mlp": 0.012551, "epoch": 0.9778445813918533, "flos": 23045728354560.0, "grad_norm": 1.5945023268933187, "language_loss": 0.667018, "learning_rate": 5.133094442018038e-09, "loss": 0.74379975, "num_input_tokens_seen": 351029965, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09204102, "step": 16264, "time_per_iteration": 2.7025504112243652 }, { "auxiliary_loss_clip": 0.06420606, "auxiliary_loss_mlp": 0.0126531, "balance_loss_clip": 0.06276266, "balance_loss_mlp": 0.01254974, "epoch": 0.9779047046445213, "flos": 17572082313600.0, "grad_norm": 1.8422829459728207, "language_loss": 0.73038232, "learning_rate": 5.105246951967679e-09, "loss": 0.8072415, "num_input_tokens_seen": 351046205, "router_z_loss_clip": 1.44433594, "router_z_loss_mlp": 0.10327148, "step": 16265, "time_per_iteration": 2.564239263534546 }, { "auxiliary_loss_clip": 0.06409565, "auxiliary_loss_mlp": 0.01263974, "balance_loss_clip": 0.06274579, "balance_loss_mlp": 0.01255105, "epoch": 0.9779648278971892, "flos": 20747492167680.0, "grad_norm": 2.32167870498934, "language_loss": 0.68687582, "learning_rate": 5.077475108526297e-09, "loss": 0.7636112, "num_input_tokens_seen": 351065390, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.08868408, "step": 16266, "time_per_iteration": 2.587221384048462 }, { "auxiliary_loss_clip": 0.06411298, "auxiliary_loss_mlp": 0.01263764, "balance_loss_clip": 0.06277871, "balance_loss_mlp": 0.01255115, "epoch": 0.9780249511498572, "flos": 21032336522880.0, "grad_norm": 1.632540893757627, "language_loss": 0.87012321, "learning_rate": 5.049778912747049e-09, "loss": 0.94687384, "num_input_tokens_seen": 351084355, "router_z_loss_clip": 1.33398438, "router_z_loss_mlp": 0.08654785, "step": 16267, "time_per_iteration": 2.613819122314453 }, { "auxiliary_loss_clip": 0.06415147, "auxiliary_loss_mlp": 0.01265132, "balance_loss_clip": 0.06274952, "balance_loss_mlp": 0.01254665, "epoch": 0.9780850744025251, "flos": 30782167505280.0, "grad_norm": 1.7214061396612774, "language_loss": 0.70370352, "learning_rate": 5.022158365679985e-09, "loss": 0.78050637, "num_input_tokens_seen": 351105870, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.10467529, "step": 16268, "time_per_iteration": 4.067834854125977 }, { "auxiliary_loss_clip": 0.06415167, "auxiliary_loss_mlp": 0.01267407, "balance_loss_clip": 0.06275067, "balance_loss_mlp": 0.01258175, "epoch": 0.9781451976551931, "flos": 20309219786880.0, "grad_norm": 1.6339524629602142, "language_loss": 0.74263352, "learning_rate": 4.994613468372711e-09, "loss": 0.8194592, "num_input_tokens_seen": 351124760, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09228516, "step": 16269, "time_per_iteration": 2.5543134212493896 }, { "auxiliary_loss_clip": 0.06418963, "auxiliary_loss_mlp": 0.01269161, "balance_loss_clip": 0.06278981, "balance_loss_mlp": 0.01258676, "epoch": 0.9782053209078612, "flos": 24323383411200.0, "grad_norm": 1.76816950582415, "language_loss": 0.71428424, "learning_rate": 4.967144221869501e-09, "loss": 0.79116547, "num_input_tokens_seen": 351142820, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.1048584, "step": 16270, "time_per_iteration": 2.5703041553497314 }, { "auxiliary_loss_clip": 0.06410772, "auxiliary_loss_mlp": 0.01263826, "balance_loss_clip": 0.06271157, "balance_loss_mlp": 0.01253979, "epoch": 0.9782654441605291, "flos": 32497717599360.0, "grad_norm": 2.1796594938856155, "language_loss": 0.64350361, "learning_rate": 4.939750627212191e-09, "loss": 0.72024959, "num_input_tokens_seen": 351164805, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09857178, "step": 16271, "time_per_iteration": 2.640817403793335 }, { "auxiliary_loss_clip": 0.06409211, "auxiliary_loss_mlp": 0.01263651, "balance_loss_clip": 0.06275448, "balance_loss_mlp": 0.01254406, "epoch": 0.9783255674131971, "flos": 26986280567040.0, "grad_norm": 1.398506192575482, "language_loss": 0.70749515, "learning_rate": 4.912432685439505e-09, "loss": 0.78422379, "num_input_tokens_seen": 351187005, "router_z_loss_clip": 1.33886719, "router_z_loss_mlp": 0.09240723, "step": 16272, "time_per_iteration": 4.010678052902222 }, { "auxiliary_loss_clip": 0.06415702, "auxiliary_loss_mlp": 0.01265693, "balance_loss_clip": 0.06274956, "balance_loss_mlp": 0.01256276, "epoch": 0.978385690665865, "flos": 23118920496000.0, "grad_norm": 1.6456589116207692, "language_loss": 0.67102492, "learning_rate": 4.88519039758728e-09, "loss": 0.74783885, "num_input_tokens_seen": 351208450, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09417725, "step": 16273, "time_per_iteration": 2.555811643600464 }, { "auxiliary_loss_clip": 0.06415378, "auxiliary_loss_mlp": 0.01266699, "balance_loss_clip": 0.0627595, "balance_loss_mlp": 0.01257472, "epoch": 0.978445813918533, "flos": 25416527777280.0, "grad_norm": 2.0030987828705933, "language_loss": 0.74316484, "learning_rate": 4.85802376468869e-09, "loss": 0.81998557, "num_input_tokens_seen": 351229585, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09222412, "step": 16274, "time_per_iteration": 2.583693504333496 }, { "auxiliary_loss_clip": 0.06412354, "auxiliary_loss_mlp": 0.01265117, "balance_loss_clip": 0.0627441, "balance_loss_mlp": 0.01256075, "epoch": 0.9785059371712009, "flos": 23556983241600.0, "grad_norm": 1.5629854271716945, "language_loss": 0.77886122, "learning_rate": 4.830932787773579e-09, "loss": 0.855636, "num_input_tokens_seen": 351249525, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09033203, "step": 16275, "time_per_iteration": 2.5770623683929443 }, { "auxiliary_loss_clip": 0.06420475, "auxiliary_loss_mlp": 0.01263606, "balance_loss_clip": 0.06278746, "balance_loss_mlp": 0.01253825, "epoch": 0.978566060423869, "flos": 34359945465600.0, "grad_norm": 1.6983495392584995, "language_loss": 0.71286476, "learning_rate": 4.803917467869567e-09, "loss": 0.78970557, "num_input_tokens_seen": 351272530, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09777832, "step": 16276, "time_per_iteration": 2.7357356548309326 }, { "auxiliary_loss_clip": 0.06410907, "auxiliary_loss_mlp": 0.01266348, "balance_loss_clip": 0.06276735, "balance_loss_mlp": 0.01257509, "epoch": 0.9786261836765369, "flos": 11623546857600.0, "grad_norm": 1.7967006752812662, "language_loss": 0.85978281, "learning_rate": 4.776977806000726e-09, "loss": 0.93655539, "num_input_tokens_seen": 351288530, "router_z_loss_clip": 1.34082031, "router_z_loss_mlp": 0.08837891, "step": 16277, "time_per_iteration": 2.5454092025756836 }, { "auxiliary_loss_clip": 0.06410879, "auxiliary_loss_mlp": 0.01261959, "balance_loss_clip": 0.06275724, "balance_loss_mlp": 0.01253149, "epoch": 0.9786863069292049, "flos": 17426746206720.0, "grad_norm": 1.8132595827972418, "language_loss": 0.71048611, "learning_rate": 4.7501138031891264e-09, "loss": 0.78721452, "num_input_tokens_seen": 351305890, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.08813477, "step": 16278, "time_per_iteration": 2.5589616298675537 }, { "auxiliary_loss_clip": 0.06408894, "auxiliary_loss_mlp": 0.01261901, "balance_loss_clip": 0.06271792, "balance_loss_mlp": 0.01252335, "epoch": 0.9787464301818728, "flos": 20850341454720.0, "grad_norm": 1.9368522390055385, "language_loss": 0.84544933, "learning_rate": 4.723325460453065e-09, "loss": 0.92215729, "num_input_tokens_seen": 351325010, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09570312, "step": 16279, "time_per_iteration": 2.803633451461792 }, { "auxiliary_loss_clip": 0.06411743, "auxiliary_loss_mlp": 0.01264005, "balance_loss_clip": 0.06272648, "balance_loss_mlp": 0.01254558, "epoch": 0.9788065534345408, "flos": 18228757161600.0, "grad_norm": 1.8970379804746993, "language_loss": 0.7960369, "learning_rate": 4.696612778808395e-09, "loss": 0.87279439, "num_input_tokens_seen": 351343060, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09448242, "step": 16280, "time_per_iteration": 2.585679531097412 }, { "auxiliary_loss_clip": 0.06408647, "auxiliary_loss_mlp": 0.01267931, "balance_loss_clip": 0.06275684, "balance_loss_mlp": 0.01258961, "epoch": 0.9788666766872087, "flos": 21584359221120.0, "grad_norm": 1.5887019093351789, "language_loss": 0.79736209, "learning_rate": 4.669975759268085e-09, "loss": 0.87412792, "num_input_tokens_seen": 351363260, "router_z_loss_clip": 1.32910156, "router_z_loss_mlp": 0.08972168, "step": 16281, "time_per_iteration": 2.6210811138153076 }, { "auxiliary_loss_clip": 0.06411327, "auxiliary_loss_mlp": 0.01266543, "balance_loss_clip": 0.06271878, "balance_loss_mlp": 0.01256971, "epoch": 0.9789267999398767, "flos": 24907536950400.0, "grad_norm": 1.5814917403789743, "language_loss": 0.80646443, "learning_rate": 4.643414402842216e-09, "loss": 0.88324308, "num_input_tokens_seen": 351382610, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09576416, "step": 16282, "time_per_iteration": 2.574986457824707 }, { "auxiliary_loss_clip": 0.0641061, "auxiliary_loss_mlp": 0.01262976, "balance_loss_clip": 0.06273118, "balance_loss_mlp": 0.01253392, "epoch": 0.9789869231925448, "flos": 19579185089280.0, "grad_norm": 2.233854008516117, "language_loss": 0.83586311, "learning_rate": 4.616928710538204e-09, "loss": 0.91259897, "num_input_tokens_seen": 351401075, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.0958252, "step": 16283, "time_per_iteration": 2.5370395183563232 }, { "auxiliary_loss_clip": 0.06413625, "auxiliary_loss_mlp": 0.01266509, "balance_loss_clip": 0.06274172, "balance_loss_mlp": 0.01256555, "epoch": 0.9790470464452127, "flos": 16801657148160.0, "grad_norm": 2.1638819944526353, "language_loss": 0.71948528, "learning_rate": 4.590518683360134e-09, "loss": 0.79628664, "num_input_tokens_seen": 351419275, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09954834, "step": 16284, "time_per_iteration": 2.515850067138672 }, { "auxiliary_loss_clip": 0.06412496, "auxiliary_loss_mlp": 0.01265367, "balance_loss_clip": 0.06277558, "balance_loss_mlp": 0.01256999, "epoch": 0.9791071696978807, "flos": 18375267225600.0, "grad_norm": 4.039825787085932, "language_loss": 0.64833874, "learning_rate": 4.56418432230965e-09, "loss": 0.72511733, "num_input_tokens_seen": 351437375, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.0836792, "step": 16285, "time_per_iteration": 2.5604445934295654 }, { "auxiliary_loss_clip": 0.06413946, "auxiliary_loss_mlp": 0.01268184, "balance_loss_clip": 0.06278074, "balance_loss_mlp": 0.012591, "epoch": 0.9791672929505486, "flos": 24177166836480.0, "grad_norm": 1.5857322716041224, "language_loss": 0.70929444, "learning_rate": 4.537925628385286e-09, "loss": 0.78611577, "num_input_tokens_seen": 351457810, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.09075928, "step": 16286, "time_per_iteration": 2.566293478012085 }, { "auxiliary_loss_clip": 0.06407788, "auxiliary_loss_mlp": 0.01264579, "balance_loss_clip": 0.06272918, "balance_loss_mlp": 0.01255168, "epoch": 0.9792274162032166, "flos": 24361216329600.0, "grad_norm": 1.6351030926871661, "language_loss": 0.59019327, "learning_rate": 4.511742602582691e-09, "loss": 0.66691697, "num_input_tokens_seen": 351478825, "router_z_loss_clip": 1.34863281, "router_z_loss_mlp": 0.09411621, "step": 16287, "time_per_iteration": 2.637629985809326 }, { "auxiliary_loss_clip": 0.06413388, "auxiliary_loss_mlp": 0.01262448, "balance_loss_clip": 0.06276245, "balance_loss_mlp": 0.0125318, "epoch": 0.9792875394558845, "flos": 26402965568640.0, "grad_norm": 1.59861837669671, "language_loss": 0.81541485, "learning_rate": 4.485635245894626e-09, "loss": 0.89217317, "num_input_tokens_seen": 351498785, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09259033, "step": 16288, "time_per_iteration": 2.5687966346740723 }, { "auxiliary_loss_clip": 0.0641128, "auxiliary_loss_mlp": 0.01265997, "balance_loss_clip": 0.06272692, "balance_loss_mlp": 0.01256359, "epoch": 0.9793476627085526, "flos": 28155635821440.0, "grad_norm": 2.930803340313233, "language_loss": 0.7219286, "learning_rate": 4.459603559311631e-09, "loss": 0.79870141, "num_input_tokens_seen": 351520235, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09637451, "step": 16289, "time_per_iteration": 2.6293764114379883 }, { "auxiliary_loss_clip": 0.06413108, "auxiliary_loss_mlp": 0.01261998, "balance_loss_clip": 0.06277312, "balance_loss_mlp": 0.01253111, "epoch": 0.9794077859612205, "flos": 16769568234240.0, "grad_norm": 2.1788693773953067, "language_loss": 0.75575566, "learning_rate": 4.43364754382003e-09, "loss": 0.83250666, "num_input_tokens_seen": 351538900, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.08880615, "step": 16290, "time_per_iteration": 2.5235848426818848 }, { "auxiliary_loss_clip": 0.06413789, "auxiliary_loss_mlp": 0.01264462, "balance_loss_clip": 0.06273046, "balance_loss_mlp": 0.01254442, "epoch": 0.9794679092138885, "flos": 19286793866880.0, "grad_norm": 1.670089054345553, "language_loss": 0.6771602, "learning_rate": 4.4077672004048105e-09, "loss": 0.75394267, "num_input_tokens_seen": 351558715, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.10009766, "step": 16291, "time_per_iteration": 4.0049097537994385 }, { "auxiliary_loss_clip": 0.06420451, "auxiliary_loss_mlp": 0.01264115, "balance_loss_clip": 0.06278282, "balance_loss_mlp": 0.01254114, "epoch": 0.9795280324665564, "flos": 32164139295360.0, "grad_norm": 1.9218682752281226, "language_loss": 0.62844014, "learning_rate": 4.3819625300467456e-09, "loss": 0.70528579, "num_input_tokens_seen": 351578450, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09997559, "step": 16292, "time_per_iteration": 2.66416335105896 }, { "auxiliary_loss_clip": 0.06410632, "auxiliary_loss_mlp": 0.01266197, "balance_loss_clip": 0.06272637, "balance_loss_mlp": 0.01256219, "epoch": 0.9795881557192244, "flos": 19066714318080.0, "grad_norm": 2.1339353711258964, "language_loss": 0.73714852, "learning_rate": 4.356233533724829e-09, "loss": 0.8139168, "num_input_tokens_seen": 351597195, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09979248, "step": 16293, "time_per_iteration": 2.735325574874878 }, { "auxiliary_loss_clip": 0.06416337, "auxiliary_loss_mlp": 0.01263027, "balance_loss_clip": 0.06274146, "balance_loss_mlp": 0.0125352, "epoch": 0.9796482789718923, "flos": 28337505108480.0, "grad_norm": 1.725976605126845, "language_loss": 0.84263945, "learning_rate": 4.330580212414503e-09, "loss": 0.91943306, "num_input_tokens_seen": 351617460, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09509277, "step": 16294, "time_per_iteration": 2.6682848930358887 }, { "auxiliary_loss_clip": 0.06404845, "auxiliary_loss_mlp": 0.01268615, "balance_loss_clip": 0.06271632, "balance_loss_mlp": 0.01259948, "epoch": 0.9797084022245603, "flos": 17973821514240.0, "grad_norm": 1.8297513840490545, "language_loss": 0.72446299, "learning_rate": 4.305002567088767e-09, "loss": 0.80119759, "num_input_tokens_seen": 351635900, "router_z_loss_clip": 1.33203125, "router_z_loss_mlp": 0.08673096, "step": 16295, "time_per_iteration": 2.558277130126953 }, { "auxiliary_loss_clip": 0.06418809, "auxiliary_loss_mlp": 0.01265854, "balance_loss_clip": 0.06277876, "balance_loss_mlp": 0.01255966, "epoch": 0.9797685254772284, "flos": 20272980096000.0, "grad_norm": 1.6071459981825156, "language_loss": 0.80954927, "learning_rate": 4.2795005987170674e-09, "loss": 0.88639587, "num_input_tokens_seen": 351655400, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09887695, "step": 16296, "time_per_iteration": 2.560302257537842 }, { "auxiliary_loss_clip": 0.06409008, "auxiliary_loss_mlp": 0.01264792, "balance_loss_clip": 0.06272486, "balance_loss_mlp": 0.01255256, "epoch": 0.9798286487298963, "flos": 26914513944960.0, "grad_norm": 1.7045528520324733, "language_loss": 0.75748289, "learning_rate": 4.254074308266853e-09, "loss": 0.83422095, "num_input_tokens_seen": 351675505, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.09539795, "step": 16297, "time_per_iteration": 4.07452392578125 }, { "auxiliary_loss_clip": 0.06421775, "auxiliary_loss_mlp": 0.01267336, "balance_loss_clip": 0.06278047, "balance_loss_mlp": 0.01257752, "epoch": 0.9798887719825643, "flos": 27168233708160.0, "grad_norm": 1.5635488503331458, "language_loss": 0.78548789, "learning_rate": 4.228723696702019e-09, "loss": 0.86237895, "num_input_tokens_seen": 351697920, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.09588623, "step": 16298, "time_per_iteration": 2.613751173019409 }, { "auxiliary_loss_clip": 0.06410188, "auxiliary_loss_mlp": 0.01261601, "balance_loss_clip": 0.06275454, "balance_loss_mlp": 0.01252797, "epoch": 0.9799488952352322, "flos": 20674803150720.0, "grad_norm": 1.5597436385246959, "language_loss": 0.7257753, "learning_rate": 4.203448764984019e-09, "loss": 0.80249316, "num_input_tokens_seen": 351717615, "router_z_loss_clip": 1.34765625, "router_z_loss_mlp": 0.08795166, "step": 16299, "time_per_iteration": 2.539485454559326 }, { "auxiliary_loss_clip": 0.06411891, "auxiliary_loss_mlp": 0.01263577, "balance_loss_clip": 0.06271201, "balance_loss_mlp": 0.01254482, "epoch": 0.9800090184879002, "flos": 21987691649280.0, "grad_norm": 2.089751051449995, "language_loss": 0.89561999, "learning_rate": 4.178249514071419e-09, "loss": 0.97237474, "num_input_tokens_seen": 351735260, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09100342, "step": 16300, "time_per_iteration": 2.5401477813720703 }, { "auxiliary_loss_clip": 0.06418727, "auxiliary_loss_mlp": 0.0126566, "balance_loss_clip": 0.06276161, "balance_loss_mlp": 0.01255533, "epoch": 0.9800691417405681, "flos": 21294860964480.0, "grad_norm": 2.2872893327546175, "language_loss": 0.78094304, "learning_rate": 4.1531259449194555e-09, "loss": 0.85778689, "num_input_tokens_seen": 351755800, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.10119629, "step": 16301, "time_per_iteration": 2.540314197540283 }, { "auxiliary_loss_clip": 0.06413707, "auxiliary_loss_mlp": 0.01266345, "balance_loss_clip": 0.06274837, "balance_loss_mlp": 0.01256457, "epoch": 0.9801292649932362, "flos": 18445398693120.0, "grad_norm": 1.9409798028468992, "language_loss": 0.75132847, "learning_rate": 4.128078058480921e-09, "loss": 0.82812899, "num_input_tokens_seen": 351774790, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09887695, "step": 16302, "time_per_iteration": 2.5368309020996094 }, { "auxiliary_loss_clip": 0.06414, "auxiliary_loss_mlp": 0.01264969, "balance_loss_clip": 0.06274895, "balance_loss_mlp": 0.01255289, "epoch": 0.9801893882459041, "flos": 25053418108800.0, "grad_norm": 1.7034523274942452, "language_loss": 0.79863423, "learning_rate": 4.103105855705724e-09, "loss": 0.87542385, "num_input_tokens_seen": 351792855, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09667969, "step": 16303, "time_per_iteration": 2.5627951622009277 }, { "auxiliary_loss_clip": 0.06418533, "auxiliary_loss_mlp": 0.01263475, "balance_loss_clip": 0.06276188, "balance_loss_mlp": 0.01253766, "epoch": 0.9802495114985721, "flos": 18516787971840.0, "grad_norm": 1.9134796057221246, "language_loss": 0.83447301, "learning_rate": 4.078209337540883e-09, "loss": 0.91129309, "num_input_tokens_seen": 351811450, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09710693, "step": 16304, "time_per_iteration": 2.5007083415985107 }, { "auxiliary_loss_clip": 0.06403887, "auxiliary_loss_mlp": 0.01262721, "balance_loss_clip": 0.06270674, "balance_loss_mlp": 0.01253929, "epoch": 0.98030963475124, "flos": 21476143272960.0, "grad_norm": 1.9969145358473148, "language_loss": 0.70444292, "learning_rate": 4.053388504930089e-09, "loss": 0.78110904, "num_input_tokens_seen": 351831960, "router_z_loss_clip": 1.33203125, "router_z_loss_mlp": 0.08795166, "step": 16305, "time_per_iteration": 2.559469699859619 }, { "auxiliary_loss_clip": 0.06417592, "auxiliary_loss_mlp": 0.01264866, "balance_loss_clip": 0.06276439, "balance_loss_mlp": 0.01255323, "epoch": 0.980369758003908, "flos": 20418483911040.0, "grad_norm": 1.839029218172707, "language_loss": 0.72205663, "learning_rate": 4.028643358815032e-09, "loss": 0.79888123, "num_input_tokens_seen": 351851585, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09539795, "step": 16306, "time_per_iteration": 2.523686408996582 }, { "auxiliary_loss_clip": 0.06406879, "auxiliary_loss_mlp": 0.01262259, "balance_loss_clip": 0.06272839, "balance_loss_mlp": 0.01253891, "epoch": 0.9804298812565759, "flos": 23405064589440.0, "grad_norm": 1.5486841798401885, "language_loss": 0.73884374, "learning_rate": 4.00397390013385e-09, "loss": 0.81553513, "num_input_tokens_seen": 351871085, "router_z_loss_clip": 1.33886719, "router_z_loss_mlp": 0.08374023, "step": 16307, "time_per_iteration": 2.622271776199341 }, { "auxiliary_loss_clip": 0.06405522, "auxiliary_loss_mlp": 0.01263199, "balance_loss_clip": 0.06273268, "balance_loss_mlp": 0.01255307, "epoch": 0.980490004509244, "flos": 23299028847360.0, "grad_norm": 1.653281166808259, "language_loss": 0.7477597, "learning_rate": 3.979380129822018e-09, "loss": 0.82444692, "num_input_tokens_seen": 351891775, "router_z_loss_clip": 1.32324219, "router_z_loss_mlp": 0.07891846, "step": 16308, "time_per_iteration": 4.0412211418151855 }, { "auxiliary_loss_clip": 0.06314175, "auxiliary_loss_mlp": 0.0125106, "balance_loss_clip": 0.06258852, "balance_loss_mlp": 0.01250039, "epoch": 0.980550127761912, "flos": 56067991712640.0, "grad_norm": 0.7738157810220359, "language_loss": 0.57675469, "learning_rate": 3.954862048811902e-09, "loss": 0.65240711, "num_input_tokens_seen": 351946770, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01020813, "step": 16309, "time_per_iteration": 3.0620014667510986 }, { "auxiliary_loss_clip": 0.06411485, "auxiliary_loss_mlp": 0.01263549, "balance_loss_clip": 0.06271495, "balance_loss_mlp": 0.01254031, "epoch": 0.9806102510145799, "flos": 25339562202240.0, "grad_norm": 1.7900595163383923, "language_loss": 0.66599739, "learning_rate": 3.930419658033646e-09, "loss": 0.74274766, "num_input_tokens_seen": 351966155, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09521484, "step": 16310, "time_per_iteration": 2.574824571609497 }, { "auxiliary_loss_clip": 0.06315024, "auxiliary_loss_mlp": 0.01250499, "balance_loss_clip": 0.06259646, "balance_loss_mlp": 0.01249417, "epoch": 0.9806703742672479, "flos": 67297472017920.0, "grad_norm": 0.798729073405123, "language_loss": 0.54577947, "learning_rate": 3.906052958413841e-09, "loss": 0.62143469, "num_input_tokens_seen": 352031655, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01083374, "step": 16311, "time_per_iteration": 3.247593879699707 }, { "auxiliary_loss_clip": 0.06414013, "auxiliary_loss_mlp": 0.01263032, "balance_loss_clip": 0.06275661, "balance_loss_mlp": 0.01253746, "epoch": 0.9807304975199158, "flos": 25236084009600.0, "grad_norm": 1.3893050668517581, "language_loss": 0.79802454, "learning_rate": 3.881761950876638e-09, "loss": 0.87479496, "num_input_tokens_seen": 352051920, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09289551, "step": 16312, "time_per_iteration": 3.948404312133789 }, { "auxiliary_loss_clip": 0.06411362, "auxiliary_loss_mlp": 0.01263663, "balance_loss_clip": 0.06274444, "balance_loss_mlp": 0.01255247, "epoch": 0.9807906207725838, "flos": 17462021575680.0, "grad_norm": 1.9978798961251598, "language_loss": 0.63840532, "learning_rate": 3.8575466363430785e-09, "loss": 0.71515554, "num_input_tokens_seen": 352069315, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.08410645, "step": 16313, "time_per_iteration": 2.5282106399536133 }, { "auxiliary_loss_clip": 0.06409898, "auxiliary_loss_mlp": 0.01262859, "balance_loss_clip": 0.06271815, "balance_loss_mlp": 0.01253638, "epoch": 0.9808507440252517, "flos": 21038709432960.0, "grad_norm": 2.3534155888100434, "language_loss": 0.73130429, "learning_rate": 3.833407015731316e-09, "loss": 0.80803192, "num_input_tokens_seen": 352089480, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09222412, "step": 16314, "time_per_iteration": 2.5204949378967285 }, { "auxiliary_loss_clip": 0.06311376, "auxiliary_loss_mlp": 0.01251144, "balance_loss_clip": 0.06256271, "balance_loss_mlp": 0.01250142, "epoch": 0.9809108672779198, "flos": 64063307652480.0, "grad_norm": 0.6749922169352792, "language_loss": 0.516868, "learning_rate": 3.80934308995684e-09, "loss": 0.59249318, "num_input_tokens_seen": 352150000, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.01002502, "step": 16315, "time_per_iteration": 3.1817736625671387 }, { "auxiliary_loss_clip": 0.06414013, "auxiliary_loss_mlp": 0.0126388, "balance_loss_clip": 0.06274483, "balance_loss_mlp": 0.0125482, "epoch": 0.9809709905305877, "flos": 22786683857280.0, "grad_norm": 1.2969238970329713, "language_loss": 0.6990751, "learning_rate": 3.785354859932033e-09, "loss": 0.77585405, "num_input_tokens_seen": 352170990, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09057617, "step": 16316, "time_per_iteration": 2.565335273742676 }, { "auxiliary_loss_clip": 0.06415869, "auxiliary_loss_mlp": 0.01263898, "balance_loss_clip": 0.06275944, "balance_loss_mlp": 0.01254922, "epoch": 0.9810311137832557, "flos": 37022423351040.0, "grad_norm": 1.8710784854076852, "language_loss": 0.54802454, "learning_rate": 3.76144232656661e-09, "loss": 0.6248222, "num_input_tokens_seen": 352195335, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.08972168, "step": 16317, "time_per_iteration": 2.6886441707611084 }, { "auxiliary_loss_clip": 0.06417292, "auxiliary_loss_mlp": 0.01269233, "balance_loss_clip": 0.06280875, "balance_loss_mlp": 0.01259386, "epoch": 0.9810912370359236, "flos": 18922258679040.0, "grad_norm": 1.7178491443731179, "language_loss": 0.7338962, "learning_rate": 3.737605490767404e-09, "loss": 0.81076145, "num_input_tokens_seen": 352214170, "router_z_loss_clip": 1.36230469, "router_z_loss_mlp": 0.09844971, "step": 16318, "time_per_iteration": 2.5352249145507812 }, { "auxiliary_loss_clip": 0.06411569, "auxiliary_loss_mlp": 0.01264564, "balance_loss_clip": 0.06276989, "balance_loss_mlp": 0.01255403, "epoch": 0.9811513602885916, "flos": 18447411191040.0, "grad_norm": 2.9470028246282634, "language_loss": 0.8199048, "learning_rate": 3.7138443534383555e-09, "loss": 0.89666605, "num_input_tokens_seen": 352231470, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.0916748, "step": 16319, "time_per_iteration": 2.5548009872436523 }, { "auxiliary_loss_clip": 0.06316975, "auxiliary_loss_mlp": 0.01250852, "balance_loss_clip": 0.06261878, "balance_loss_mlp": 0.01249858, "epoch": 0.9812114835412595, "flos": 68078603306880.0, "grad_norm": 0.7043570073195506, "language_loss": 0.53473848, "learning_rate": 3.6901589154803014e-09, "loss": 0.61041671, "num_input_tokens_seen": 352291770, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.00993347, "step": 16320, "time_per_iteration": 3.0401549339294434 }, { "auxiliary_loss_clip": 0.06411409, "auxiliary_loss_mlp": 0.01264626, "balance_loss_clip": 0.06272069, "balance_loss_mlp": 0.01255137, "epoch": 0.9812716067939276, "flos": 25379826888960.0, "grad_norm": 1.6395509993655852, "language_loss": 0.73419774, "learning_rate": 3.6665491777914116e-09, "loss": 0.81095809, "num_input_tokens_seen": 352310735, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09490967, "step": 16321, "time_per_iteration": 2.59625244140625 }, { "auxiliary_loss_clip": 0.06408571, "auxiliary_loss_mlp": 0.0126301, "balance_loss_clip": 0.06273801, "balance_loss_mlp": 0.01253962, "epoch": 0.9813317300465956, "flos": 22863439797120.0, "grad_norm": 1.5247187723141506, "language_loss": 0.78785485, "learning_rate": 3.6430151412669698e-09, "loss": 0.86457068, "num_input_tokens_seen": 352329545, "router_z_loss_clip": 1.34863281, "router_z_loss_mlp": 0.09051514, "step": 16322, "time_per_iteration": 2.560964822769165 }, { "auxiliary_loss_clip": 0.06413293, "auxiliary_loss_mlp": 0.01266509, "balance_loss_clip": 0.06276452, "balance_loss_mlp": 0.01257306, "epoch": 0.9813918532992635, "flos": 23593767984000.0, "grad_norm": 1.5692669422287921, "language_loss": 0.80663788, "learning_rate": 3.619556806799595e-09, "loss": 0.8834359, "num_input_tokens_seen": 352352080, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09197998, "step": 16323, "time_per_iteration": 2.6513800621032715 }, { "auxiliary_loss_clip": 0.06417805, "auxiliary_loss_mlp": 0.01265263, "balance_loss_clip": 0.06275388, "balance_loss_mlp": 0.01255917, "epoch": 0.9814519765519315, "flos": 19611860981760.0, "grad_norm": 2.442681677875762, "language_loss": 0.85099685, "learning_rate": 3.596174175278799e-09, "loss": 0.92782748, "num_input_tokens_seen": 352366455, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09350586, "step": 16324, "time_per_iteration": 2.609271287918091 }, { "auxiliary_loss_clip": 0.06413798, "auxiliary_loss_mlp": 0.01264765, "balance_loss_clip": 0.06275136, "balance_loss_mlp": 0.01255026, "epoch": 0.9815120998045994, "flos": 33954390904320.0, "grad_norm": 1.3920979958812947, "language_loss": 0.74738872, "learning_rate": 3.5728672475909827e-09, "loss": 0.82417434, "num_input_tokens_seen": 352386090, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.09735107, "step": 16325, "time_per_iteration": 2.6756203174591064 }, { "auxiliary_loss_clip": 0.06409381, "auxiliary_loss_mlp": 0.01263266, "balance_loss_clip": 0.06276512, "balance_loss_mlp": 0.01254213, "epoch": 0.9815722230572674, "flos": 20856295094400.0, "grad_norm": 2.143667301302791, "language_loss": 0.76625907, "learning_rate": 3.5496360246201063e-09, "loss": 0.84298557, "num_input_tokens_seen": 352404000, "router_z_loss_clip": 1.33007812, "router_z_loss_mlp": 0.09051514, "step": 16326, "time_per_iteration": 2.5648980140686035 }, { "auxiliary_loss_clip": 0.06419037, "auxiliary_loss_mlp": 0.01266296, "balance_loss_clip": 0.06279396, "balance_loss_mlp": 0.0125636, "epoch": 0.9816323463099353, "flos": 22901356569600.0, "grad_norm": 1.6714458311574458, "language_loss": 0.6743325, "learning_rate": 3.5264805072470205e-09, "loss": 0.75118589, "num_input_tokens_seen": 352423540, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09936523, "step": 16327, "time_per_iteration": 2.718738317489624 }, { "auxiliary_loss_clip": 0.06421177, "auxiliary_loss_mlp": 0.0126539, "balance_loss_clip": 0.0627684, "balance_loss_mlp": 0.01255108, "epoch": 0.9816924695626034, "flos": 31547351790720.0, "grad_norm": 1.3945365423933358, "language_loss": 0.73818135, "learning_rate": 3.5034006963501337e-09, "loss": 0.81504703, "num_input_tokens_seen": 352445530, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10284424, "step": 16328, "time_per_iteration": 2.6627659797668457 }, { "auxiliary_loss_clip": 0.06429395, "auxiliary_loss_mlp": 0.01269182, "balance_loss_clip": 0.06279937, "balance_loss_mlp": 0.01258924, "epoch": 0.9817525928152713, "flos": 21513305358720.0, "grad_norm": 2.5150643846704805, "language_loss": 0.81419384, "learning_rate": 3.4803965928040802e-09, "loss": 0.89117968, "num_input_tokens_seen": 352466325, "router_z_loss_clip": 1.49414062, "router_z_loss_mlp": 0.10253906, "step": 16329, "time_per_iteration": 2.5928502082824707 }, { "auxiliary_loss_clip": 0.06419982, "auxiliary_loss_mlp": 0.01265593, "balance_loss_clip": 0.06276046, "balance_loss_mlp": 0.01255323, "epoch": 0.9818127160679393, "flos": 25556539150080.0, "grad_norm": 1.937873352242106, "language_loss": 0.76107734, "learning_rate": 3.4574681974817168e-09, "loss": 0.83793306, "num_input_tokens_seen": 352485505, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.1026001, "step": 16330, "time_per_iteration": 2.583369493484497 }, { "auxiliary_loss_clip": 0.06431054, "auxiliary_loss_mlp": 0.01267328, "balance_loss_clip": 0.06280002, "balance_loss_mlp": 0.01255138, "epoch": 0.9818728393206072, "flos": 28811220566400.0, "grad_norm": 2.2088776062451565, "language_loss": 0.66510886, "learning_rate": 3.434615511252126e-09, "loss": 0.74209273, "num_input_tokens_seen": 352505360, "router_z_loss_clip": 1.51171875, "router_z_loss_mlp": 0.12194824, "step": 16331, "time_per_iteration": 4.140071630477905 }, { "auxiliary_loss_clip": 0.06410632, "auxiliary_loss_mlp": 0.01263439, "balance_loss_clip": 0.06273196, "balance_loss_mlp": 0.01254796, "epoch": 0.9819329625732752, "flos": 23229023160960.0, "grad_norm": 1.917923681077088, "language_loss": 0.73485476, "learning_rate": 3.411838534981948e-09, "loss": 0.81159544, "num_input_tokens_seen": 352524035, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.08642578, "step": 16332, "time_per_iteration": 2.5610828399658203 }, { "auxiliary_loss_clip": 0.06413454, "auxiliary_loss_mlp": 0.01262785, "balance_loss_clip": 0.06275217, "balance_loss_mlp": 0.01253856, "epoch": 0.9819930858259431, "flos": 17536261893120.0, "grad_norm": 1.5545470661138416, "language_loss": 0.77335358, "learning_rate": 3.389137269534936e-09, "loss": 0.85011601, "num_input_tokens_seen": 352543210, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.08929443, "step": 16333, "time_per_iteration": 2.540435314178467 }, { "auxiliary_loss_clip": 0.06412587, "auxiliary_loss_mlp": 0.01264116, "balance_loss_clip": 0.06274652, "balance_loss_mlp": 0.01255199, "epoch": 0.9820532090786112, "flos": 12534570374400.0, "grad_norm": 4.555285902788485, "language_loss": 0.73177111, "learning_rate": 3.366511715771958e-09, "loss": 0.80853808, "num_input_tokens_seen": 352559770, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.08917236, "step": 16334, "time_per_iteration": 2.516230821609497 }, { "auxiliary_loss_clip": 0.06417112, "auxiliary_loss_mlp": 0.01265884, "balance_loss_clip": 0.06276561, "balance_loss_mlp": 0.01256163, "epoch": 0.9821133323312792, "flos": 18845586593280.0, "grad_norm": 1.7494563376859615, "language_loss": 0.78288233, "learning_rate": 3.3439618745509934e-09, "loss": 0.85971224, "num_input_tokens_seen": 352577690, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.097229, "step": 16335, "time_per_iteration": 2.564829111099243 }, { "auxiliary_loss_clip": 0.06417745, "auxiliary_loss_mlp": 0.01266388, "balance_loss_clip": 0.06275557, "balance_loss_mlp": 0.01256017, "epoch": 0.9821734555839471, "flos": 34832612747520.0, "grad_norm": 2.1107387936354423, "language_loss": 0.64662409, "learning_rate": 3.3214877467271362e-09, "loss": 0.72346544, "num_input_tokens_seen": 352598850, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.10369873, "step": 16336, "time_per_iteration": 2.664428949356079 }, { "auxiliary_loss_clip": 0.06421476, "auxiliary_loss_mlp": 0.01265466, "balance_loss_clip": 0.06275126, "balance_loss_mlp": 0.01254601, "epoch": 0.9822335788366151, "flos": 17133768005760.0, "grad_norm": 1.9322929683884353, "language_loss": 0.73529422, "learning_rate": 3.299089333152372e-09, "loss": 0.81216371, "num_input_tokens_seen": 352616130, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.10864258, "step": 16337, "time_per_iteration": 4.031511068344116 }, { "auxiliary_loss_clip": 0.0641612, "auxiliary_loss_mlp": 0.01265801, "balance_loss_clip": 0.06275684, "balance_loss_mlp": 0.01255734, "epoch": 0.982293702089283, "flos": 20819468424960.0, "grad_norm": 1.6644961571491788, "language_loss": 0.73458248, "learning_rate": 3.2767666346764645e-09, "loss": 0.81140172, "num_input_tokens_seen": 352636885, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.10064697, "step": 16338, "time_per_iteration": 2.5337820053100586 }, { "auxiliary_loss_clip": 0.06412205, "auxiliary_loss_mlp": 0.01266515, "balance_loss_clip": 0.06273341, "balance_loss_mlp": 0.01257574, "epoch": 0.982353825341951, "flos": 24687708963840.0, "grad_norm": 1.564999378068667, "language_loss": 0.81482708, "learning_rate": 3.2545196521454045e-09, "loss": 0.89161432, "num_input_tokens_seen": 352657905, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.0894165, "step": 16339, "time_per_iteration": 2.576601028442383 }, { "auxiliary_loss_clip": 0.06409813, "auxiliary_loss_mlp": 0.01266295, "balance_loss_clip": 0.06275151, "balance_loss_mlp": 0.01257236, "epoch": 0.982413948594619, "flos": 20856840145920.0, "grad_norm": 1.672535509833246, "language_loss": 0.62766451, "learning_rate": 3.232348386403405e-09, "loss": 0.70442563, "num_input_tokens_seen": 352676320, "router_z_loss_clip": 1.34667969, "router_z_loss_mlp": 0.09051514, "step": 16340, "time_per_iteration": 2.521289825439453 }, { "auxiliary_loss_clip": 0.06419696, "auxiliary_loss_mlp": 0.01264925, "balance_loss_clip": 0.06278333, "balance_loss_mlp": 0.01255508, "epoch": 0.982474071847287, "flos": 15382774834560.0, "grad_norm": 2.7490106039707793, "language_loss": 0.85910726, "learning_rate": 3.2102528382904613e-09, "loss": 0.9359535, "num_input_tokens_seen": 352692665, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09423828, "step": 16341, "time_per_iteration": 2.5295491218566895 }, { "auxiliary_loss_clip": 0.0640787, "auxiliary_loss_mlp": 0.01267503, "balance_loss_clip": 0.06274351, "balance_loss_mlp": 0.01258962, "epoch": 0.9825341950999549, "flos": 23782471378560.0, "grad_norm": 1.4954334800269802, "language_loss": 0.67024744, "learning_rate": 3.188233008645014e-09, "loss": 0.74700123, "num_input_tokens_seen": 352716130, "router_z_loss_clip": 1.33398438, "router_z_loss_mlp": 0.08544922, "step": 16342, "time_per_iteration": 2.6238465309143066 }, { "auxiliary_loss_clip": 0.06416757, "auxiliary_loss_mlp": 0.01263954, "balance_loss_clip": 0.06275609, "balance_loss_mlp": 0.01254661, "epoch": 0.9825943183526229, "flos": 22752708226560.0, "grad_norm": 1.4711083713312698, "language_loss": 0.7764107, "learning_rate": 3.16628889830195e-09, "loss": 0.85321784, "num_input_tokens_seen": 352734705, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09301758, "step": 16343, "time_per_iteration": 2.529538631439209 }, { "auxiliary_loss_clip": 0.06411639, "auxiliary_loss_mlp": 0.01263876, "balance_loss_clip": 0.06274825, "balance_loss_mlp": 0.01255448, "epoch": 0.9826544416052908, "flos": 27717489221760.0, "grad_norm": 1.568172073833647, "language_loss": 0.75755179, "learning_rate": 3.1444205080932707e-09, "loss": 0.83430696, "num_input_tokens_seen": 352756225, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.08435059, "step": 16344, "time_per_iteration": 2.5746452808380127 }, { "auxiliary_loss_clip": 0.06412649, "auxiliary_loss_mlp": 0.01264693, "balance_loss_clip": 0.06273345, "balance_loss_mlp": 0.01254817, "epoch": 0.9827145648579588, "flos": 26948699210880.0, "grad_norm": 1.9914666587632686, "language_loss": 0.6659013, "learning_rate": 3.122627838848313e-09, "loss": 0.74267477, "num_input_tokens_seen": 352776210, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09875488, "step": 16345, "time_per_iteration": 2.560825824737549 }, { "auxiliary_loss_clip": 0.06407957, "auxiliary_loss_mlp": 0.01261185, "balance_loss_clip": 0.06274627, "balance_loss_mlp": 0.01252858, "epoch": 0.9827746881106267, "flos": 21872138469120.0, "grad_norm": 1.3836885069534999, "language_loss": 0.79537272, "learning_rate": 3.1009108913933045e-09, "loss": 0.87206417, "num_input_tokens_seen": 352795455, "router_z_loss_clip": 1.33496094, "router_z_loss_mlp": 0.08325195, "step": 16346, "time_per_iteration": 2.539651393890381 }, { "auxiliary_loss_clip": 0.06428296, "auxiliary_loss_mlp": 0.01264855, "balance_loss_clip": 0.06281674, "balance_loss_mlp": 0.01254877, "epoch": 0.9828348113632948, "flos": 20857175562240.0, "grad_norm": 1.9002889642765546, "language_loss": 0.75549448, "learning_rate": 3.079269666552031e-09, "loss": 0.83242595, "num_input_tokens_seen": 352812895, "router_z_loss_clip": 1.46484375, "router_z_loss_mlp": 0.09973145, "step": 16347, "time_per_iteration": 3.9638545513153076 }, { "auxiliary_loss_clip": 0.06406352, "auxiliary_loss_mlp": 0.01263371, "balance_loss_clip": 0.06272629, "balance_loss_mlp": 0.01255176, "epoch": 0.9828949346159628, "flos": 34577886735360.0, "grad_norm": 1.5265234662912295, "language_loss": 0.66985923, "learning_rate": 3.0577041651449474e-09, "loss": 0.74655646, "num_input_tokens_seen": 352835470, "router_z_loss_clip": 1.33691406, "router_z_loss_mlp": 0.08197021, "step": 16348, "time_per_iteration": 2.69378399848938 }, { "auxiliary_loss_clip": 0.06412598, "auxiliary_loss_mlp": 0.01266924, "balance_loss_clip": 0.06275734, "balance_loss_mlp": 0.0125694, "epoch": 0.9829550578686307, "flos": 24463562492160.0, "grad_norm": 1.7144931132990155, "language_loss": 0.69326091, "learning_rate": 3.0362143879898437e-09, "loss": 0.77005613, "num_input_tokens_seen": 352854295, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09985352, "step": 16349, "time_per_iteration": 2.6199209690093994 }, { "auxiliary_loss_clip": 0.0640834, "auxiliary_loss_mlp": 0.01264444, "balance_loss_clip": 0.06275806, "balance_loss_mlp": 0.01255623, "epoch": 0.9830151811212987, "flos": 16915784808960.0, "grad_norm": 1.7921388494683568, "language_loss": 0.76140034, "learning_rate": 3.0148003359014018e-09, "loss": 0.83812821, "num_input_tokens_seen": 352869695, "router_z_loss_clip": 1.32324219, "router_z_loss_mlp": 0.08831787, "step": 16350, "time_per_iteration": 2.536699056625366 }, { "auxiliary_loss_clip": 0.06413102, "auxiliary_loss_mlp": 0.01264922, "balance_loss_clip": 0.0627245, "balance_loss_mlp": 0.012547, "epoch": 0.9830753043739666, "flos": 21294735183360.0, "grad_norm": 2.0477090271464555, "language_loss": 0.84720564, "learning_rate": 2.9934620096920826e-09, "loss": 0.9239859, "num_input_tokens_seen": 352887430, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.10217285, "step": 16351, "time_per_iteration": 4.014637470245361 }, { "auxiliary_loss_clip": 0.06411305, "auxiliary_loss_mlp": 0.01261651, "balance_loss_clip": 0.06271961, "balance_loss_mlp": 0.01253038, "epoch": 0.9831354276266346, "flos": 31731736700160.0, "grad_norm": 1.6242020199179168, "language_loss": 0.69038475, "learning_rate": 2.972199410170795e-09, "loss": 0.76711428, "num_input_tokens_seen": 352907555, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.08612061, "step": 16352, "time_per_iteration": 2.6254000663757324 }, { "auxiliary_loss_clip": 0.06413402, "auxiliary_loss_mlp": 0.0126235, "balance_loss_clip": 0.06275328, "balance_loss_mlp": 0.01253213, "epoch": 0.9831955508793025, "flos": 21625923646080.0, "grad_norm": 1.5381658037219619, "language_loss": 0.66823643, "learning_rate": 2.951012538143782e-09, "loss": 0.74499393, "num_input_tokens_seen": 352928670, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09136963, "step": 16353, "time_per_iteration": 2.5996932983398438 }, { "auxiliary_loss_clip": 0.06408072, "auxiliary_loss_mlp": 0.01263707, "balance_loss_clip": 0.06273606, "balance_loss_mlp": 0.01255309, "epoch": 0.9832556741319706, "flos": 22975177616640.0, "grad_norm": 1.5370196250967156, "language_loss": 0.74845123, "learning_rate": 2.9299013944144025e-09, "loss": 0.82516897, "num_input_tokens_seen": 352948345, "router_z_loss_clip": 1.34277344, "router_z_loss_mlp": 0.08404541, "step": 16354, "time_per_iteration": 2.602069854736328 }, { "auxiliary_loss_clip": 0.06411034, "auxiliary_loss_mlp": 0.01266609, "balance_loss_clip": 0.06272817, "balance_loss_mlp": 0.01257263, "epoch": 0.9833157973846385, "flos": 21330178260480.0, "grad_norm": 1.855994341038005, "language_loss": 0.7828747, "learning_rate": 2.9088659797835702e-09, "loss": 0.85965121, "num_input_tokens_seen": 352967250, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09344482, "step": 16355, "time_per_iteration": 2.611534595489502 }, { "auxiliary_loss_clip": 0.06411057, "auxiliary_loss_mlp": 0.01264535, "balance_loss_clip": 0.06273883, "balance_loss_mlp": 0.01256167, "epoch": 0.9833759206373065, "flos": 21074991050880.0, "grad_norm": 1.7349455994058742, "language_loss": 0.73458052, "learning_rate": 2.8879062950484256e-09, "loss": 0.81133646, "num_input_tokens_seen": 352984725, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.0836792, "step": 16356, "time_per_iteration": 2.5305097103118896 }, { "auxiliary_loss_clip": 0.06410496, "auxiliary_loss_mlp": 0.01264551, "balance_loss_clip": 0.06273827, "balance_loss_mlp": 0.0125499, "epoch": 0.9834360438899744, "flos": 18703227306240.0, "grad_norm": 1.5996753653823583, "language_loss": 0.75998497, "learning_rate": 2.8670223410041104e-09, "loss": 0.83673543, "num_input_tokens_seen": 353003480, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.09558105, "step": 16357, "time_per_iteration": 2.6931204795837402 }, { "auxiliary_loss_clip": 0.06414491, "auxiliary_loss_mlp": 0.01262832, "balance_loss_clip": 0.0627697, "balance_loss_mlp": 0.01253134, "epoch": 0.9834961671426424, "flos": 21111524231040.0, "grad_norm": 2.0096050432607457, "language_loss": 0.80492318, "learning_rate": 2.846214118442436e-09, "loss": 0.88169634, "num_input_tokens_seen": 353021425, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.0970459, "step": 16358, "time_per_iteration": 2.541663885116577 }, { "auxiliary_loss_clip": 0.06412097, "auxiliary_loss_mlp": 0.01262209, "balance_loss_clip": 0.06274609, "balance_loss_mlp": 0.01253495, "epoch": 0.9835562903953103, "flos": 26694853666560.0, "grad_norm": 1.930594887894124, "language_loss": 0.68285954, "learning_rate": 2.8254816281523263e-09, "loss": 0.75960267, "num_input_tokens_seen": 353039870, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.0871582, "step": 16359, "time_per_iteration": 2.5846502780914307 }, { "auxiliary_loss_clip": 0.06409478, "auxiliary_loss_mlp": 0.01263041, "balance_loss_clip": 0.06274819, "balance_loss_mlp": 0.01254399, "epoch": 0.9836164136479784, "flos": 22096578430080.0, "grad_norm": 1.7463500795163114, "language_loss": 0.70048183, "learning_rate": 2.804824870920264e-09, "loss": 0.77720702, "num_input_tokens_seen": 353059750, "router_z_loss_clip": 1.34570312, "router_z_loss_mlp": 0.08642578, "step": 16360, "time_per_iteration": 2.5588905811309814 }, { "auxiliary_loss_clip": 0.06415147, "auxiliary_loss_mlp": 0.01267438, "balance_loss_clip": 0.06274819, "balance_loss_mlp": 0.01258032, "epoch": 0.9836765369006463, "flos": 23885194884480.0, "grad_norm": 1.7070469336868936, "language_loss": 0.84310091, "learning_rate": 2.7842438475293996e-09, "loss": 0.91992676, "num_input_tokens_seen": 353079940, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.09411621, "step": 16361, "time_per_iteration": 2.6342337131500244 }, { "auxiliary_loss_clip": 0.06414767, "auxiliary_loss_mlp": 0.01262467, "balance_loss_clip": 0.06275883, "balance_loss_mlp": 0.01254391, "epoch": 0.9837366601533143, "flos": 25851529848960.0, "grad_norm": 1.7180837319666402, "language_loss": 0.76014394, "learning_rate": 2.76373855876022e-09, "loss": 0.83691627, "num_input_tokens_seen": 353099990, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.08081055, "step": 16362, "time_per_iteration": 2.575589656829834 }, { "auxiliary_loss_clip": 0.06412841, "auxiliary_loss_mlp": 0.01266079, "balance_loss_clip": 0.0627463, "balance_loss_mlp": 0.01256453, "epoch": 0.9837967834059823, "flos": 21363902328960.0, "grad_norm": 2.9183218523286283, "language_loss": 0.71171957, "learning_rate": 2.7433090053901043e-09, "loss": 0.78850877, "num_input_tokens_seen": 353118710, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09625244, "step": 16363, "time_per_iteration": 2.535668134689331 }, { "auxiliary_loss_clip": 0.06406052, "auxiliary_loss_mlp": 0.01263076, "balance_loss_clip": 0.06273429, "balance_loss_mlp": 0.01254248, "epoch": 0.9838569066586502, "flos": 18521819216640.0, "grad_norm": 1.6474316131751434, "language_loss": 0.63179433, "learning_rate": 2.7229551881937653e-09, "loss": 0.7084856, "num_input_tokens_seen": 353136415, "router_z_loss_clip": 1.32617188, "router_z_loss_mlp": 0.08825684, "step": 16364, "time_per_iteration": 2.5031230449676514 }, { "auxiliary_loss_clip": 0.06412323, "auxiliary_loss_mlp": 0.01262183, "balance_loss_clip": 0.06274007, "balance_loss_mlp": 0.01253397, "epoch": 0.9839170299113182, "flos": 22458430287360.0, "grad_norm": 1.5352722349921848, "language_loss": 0.75195587, "learning_rate": 2.702677107943252e-09, "loss": 0.8287009, "num_input_tokens_seen": 353154650, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.08776855, "step": 16365, "time_per_iteration": 2.630718469619751 }, { "auxiliary_loss_clip": 0.06412083, "auxiliary_loss_mlp": 0.01264951, "balance_loss_clip": 0.06275894, "balance_loss_mlp": 0.01255945, "epoch": 0.9839771531639862, "flos": 27899861633280.0, "grad_norm": 1.8753646562920046, "language_loss": 0.76766711, "learning_rate": 2.6824747654072832e-09, "loss": 0.84443748, "num_input_tokens_seen": 353174065, "router_z_loss_clip": 1.36035156, "router_z_loss_mlp": 0.09002686, "step": 16366, "time_per_iteration": 2.641162633895874 }, { "auxiliary_loss_clip": 0.06410684, "auxiliary_loss_mlp": 0.01265124, "balance_loss_clip": 0.06274076, "balance_loss_mlp": 0.01255927, "epoch": 0.9840372764166542, "flos": 28221071460480.0, "grad_norm": 1.5716317991232376, "language_loss": 0.77214181, "learning_rate": 2.662348161352357e-09, "loss": 0.84889996, "num_input_tokens_seen": 353193560, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09191895, "step": 16367, "time_per_iteration": 2.567272663116455 }, { "auxiliary_loss_clip": 0.06410752, "auxiliary_loss_mlp": 0.01265889, "balance_loss_clip": 0.0627386, "balance_loss_mlp": 0.01256335, "epoch": 0.9840973996693221, "flos": 23410682812800.0, "grad_norm": 1.4165818975053432, "language_loss": 0.61781991, "learning_rate": 2.642297296540974e-09, "loss": 0.6945864, "num_input_tokens_seen": 353213525, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.09552002, "step": 16368, "time_per_iteration": 2.5351040363311768 }, { "auxiliary_loss_clip": 0.06408029, "auxiliary_loss_mlp": 0.01265876, "balance_loss_clip": 0.0627517, "balance_loss_mlp": 0.01257394, "epoch": 0.9841575229219901, "flos": 21401986809600.0, "grad_norm": 1.528607213065385, "language_loss": 0.65983808, "learning_rate": 2.6223221717340816e-09, "loss": 0.73657715, "num_input_tokens_seen": 353234000, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.08477783, "step": 16369, "time_per_iteration": 2.587232828140259 }, { "auxiliary_loss_clip": 0.06415105, "auxiliary_loss_mlp": 0.01267716, "balance_loss_clip": 0.06276099, "balance_loss_mlp": 0.01257494, "epoch": 0.984217646174658, "flos": 24471277067520.0, "grad_norm": 2.3005168434598438, "language_loss": 0.68217033, "learning_rate": 2.6024227876886295e-09, "loss": 0.75899857, "num_input_tokens_seen": 353254940, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.10223389, "step": 16370, "time_per_iteration": 4.025588274002075 }, { "auxiliary_loss_clip": 0.06417538, "auxiliary_loss_mlp": 0.01263092, "balance_loss_clip": 0.06274636, "balance_loss_mlp": 0.01253287, "epoch": 0.984277769427326, "flos": 16440559977600.0, "grad_norm": 1.7546776595812534, "language_loss": 0.74037659, "learning_rate": 2.582599145159792e-09, "loss": 0.8171829, "num_input_tokens_seen": 353272590, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.09802246, "step": 16371, "time_per_iteration": 2.5721216201782227 }, { "auxiliary_loss_clip": 0.06317811, "auxiliary_loss_mlp": 0.01251471, "balance_loss_clip": 0.06262626, "balance_loss_mlp": 0.01250424, "epoch": 0.9843378926799939, "flos": 64551487939200.0, "grad_norm": 0.7689921283454573, "language_loss": 0.65015173, "learning_rate": 2.562851244898745e-09, "loss": 0.7258445, "num_input_tokens_seen": 353334380, "router_z_loss_clip": 0.55371094, "router_z_loss_mlp": 0.01047516, "step": 16372, "time_per_iteration": 3.2038164138793945 }, { "auxiliary_loss_clip": 0.06407987, "auxiliary_loss_mlp": 0.0126397, "balance_loss_clip": 0.06271903, "balance_loss_mlp": 0.01255083, "epoch": 0.984398015932662, "flos": 17388326309760.0, "grad_norm": 1.6430561761410651, "language_loss": 0.70928431, "learning_rate": 2.5431790876544456e-09, "loss": 0.78600389, "num_input_tokens_seen": 353351640, "router_z_loss_clip": 1.36132812, "router_z_loss_mlp": 0.08898926, "step": 16373, "time_per_iteration": 2.5320889949798584 }, { "auxiliary_loss_clip": 0.06411559, "auxiliary_loss_mlp": 0.01266215, "balance_loss_clip": 0.06276101, "balance_loss_mlp": 0.01257387, "epoch": 0.9844581391853299, "flos": 23885991498240.0, "grad_norm": 1.606160320156113, "language_loss": 0.8163079, "learning_rate": 2.523582674173186e-09, "loss": 0.89308566, "num_input_tokens_seen": 353372555, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.08831787, "step": 16374, "time_per_iteration": 2.6473941802978516 }, { "auxiliary_loss_clip": 0.06413298, "auxiliary_loss_mlp": 0.01265489, "balance_loss_clip": 0.06274863, "balance_loss_mlp": 0.01256316, "epoch": 0.9845182624379979, "flos": 19871534384640.0, "grad_norm": 1.702869380132088, "language_loss": 0.69481474, "learning_rate": 2.504062005197927e-09, "loss": 0.77160257, "num_input_tokens_seen": 353391385, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.09173584, "step": 16375, "time_per_iteration": 2.5782740116119385 }, { "auxiliary_loss_clip": 0.06419103, "auxiliary_loss_mlp": 0.01263892, "balance_loss_clip": 0.06276698, "balance_loss_mlp": 0.01254414, "epoch": 0.9845783856906659, "flos": 28261839271680.0, "grad_norm": 2.0635134881632187, "language_loss": 0.8064307, "learning_rate": 2.484617081468521e-09, "loss": 0.88326061, "num_input_tokens_seen": 353411630, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.09472656, "step": 16376, "time_per_iteration": 4.0626630783081055 }, { "auxiliary_loss_clip": 0.06414726, "auxiliary_loss_mlp": 0.01264122, "balance_loss_clip": 0.06279098, "balance_loss_mlp": 0.012548, "epoch": 0.9846385089433338, "flos": 28335702245760.0, "grad_norm": 1.7582869703670874, "language_loss": 0.62244213, "learning_rate": 2.4652479037228224e-09, "loss": 0.69923061, "num_input_tokens_seen": 353432895, "router_z_loss_clip": 1.35644531, "router_z_loss_mlp": 0.09313965, "step": 16377, "time_per_iteration": 2.6138360500335693 }, { "auxiliary_loss_clip": 0.06414548, "auxiliary_loss_mlp": 0.01266907, "balance_loss_clip": 0.06275657, "balance_loss_mlp": 0.01257599, "epoch": 0.9846986321960018, "flos": 24323718827520.0, "grad_norm": 1.5623728611866556, "language_loss": 0.73174548, "learning_rate": 2.445954472695133e-09, "loss": 0.80856001, "num_input_tokens_seen": 353454195, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.09317017, "step": 16378, "time_per_iteration": 2.575773000717163 }, { "auxiliary_loss_clip": 0.06412631, "auxiliary_loss_mlp": 0.01265531, "balance_loss_clip": 0.06273862, "balance_loss_mlp": 0.01256036, "epoch": 0.9847587554486698, "flos": 27279426476160.0, "grad_norm": 1.7351734024109589, "language_loss": 0.70877373, "learning_rate": 2.426736789116868e-09, "loss": 0.7855553, "num_input_tokens_seen": 353475125, "router_z_loss_clip": 1.38867188, "router_z_loss_mlp": 0.0949707, "step": 16379, "time_per_iteration": 2.6072633266448975 }, { "auxiliary_loss_clip": 0.06415947, "auxiliary_loss_mlp": 0.01263847, "balance_loss_clip": 0.06275538, "balance_loss_mlp": 0.01254394, "epoch": 0.9848188787013378, "flos": 16547937384960.0, "grad_norm": 1.7195951679698247, "language_loss": 0.68363357, "learning_rate": 2.407594853716999e-09, "loss": 0.76043147, "num_input_tokens_seen": 353493265, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09454346, "step": 16380, "time_per_iteration": 2.5153679847717285 }, { "auxiliary_loss_clip": 0.06416499, "auxiliary_loss_mlp": 0.01264616, "balance_loss_clip": 0.06273241, "balance_loss_mlp": 0.0125496, "epoch": 0.9848790019540057, "flos": 20199871808640.0, "grad_norm": 2.2088595519671337, "language_loss": 0.79495156, "learning_rate": 2.38852866722139e-09, "loss": 0.87176275, "num_input_tokens_seen": 353511650, "router_z_loss_clip": 1.43261719, "router_z_loss_mlp": 0.09661865, "step": 16381, "time_per_iteration": 2.568166971206665 }, { "auxiliary_loss_clip": 0.06413287, "auxiliary_loss_mlp": 0.01262531, "balance_loss_clip": 0.06272531, "balance_loss_mlp": 0.01252655, "epoch": 0.9849391252066737, "flos": 28267750984320.0, "grad_norm": 1.4520797143949724, "language_loss": 0.82497454, "learning_rate": 2.3695382303527965e-09, "loss": 0.90173268, "num_input_tokens_seen": 353534035, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.09869385, "step": 16382, "time_per_iteration": 2.613469362258911 }, { "auxiliary_loss_clip": 0.06425092, "auxiliary_loss_mlp": 0.01264036, "balance_loss_clip": 0.06280409, "balance_loss_mlp": 0.01254607, "epoch": 0.9849992484593416, "flos": 22461407107200.0, "grad_norm": 2.4337392449236988, "language_loss": 0.74539065, "learning_rate": 2.3506235438315316e-09, "loss": 0.82228196, "num_input_tokens_seen": 353549950, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.09429932, "step": 16383, "time_per_iteration": 2.5644428730010986 }, { "auxiliary_loss_clip": 0.064142, "auxiliary_loss_mlp": 0.01265438, "balance_loss_clip": 0.06275676, "balance_loss_mlp": 0.01256319, "epoch": 0.9850593717120096, "flos": 34505994332160.0, "grad_norm": 1.47193786013319, "language_loss": 0.66410422, "learning_rate": 2.3317846083750203e-09, "loss": 0.74090064, "num_input_tokens_seen": 353573745, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09124756, "step": 16384, "time_per_iteration": 2.6544251441955566 }, { "auxiliary_loss_clip": 0.06421968, "auxiliary_loss_mlp": 0.01266031, "balance_loss_clip": 0.06279156, "balance_loss_mlp": 0.0125588, "epoch": 0.9851194949646775, "flos": 38846524809600.0, "grad_norm": 1.6778761889407543, "language_loss": 0.70475626, "learning_rate": 2.313021424697359e-09, "loss": 0.78163624, "num_input_tokens_seen": 353595335, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.1015625, "step": 16385, "time_per_iteration": 2.6959519386291504 }, { "auxiliary_loss_clip": 0.06420287, "auxiliary_loss_mlp": 0.01269804, "balance_loss_clip": 0.06279245, "balance_loss_mlp": 0.01260291, "epoch": 0.9851796182173456, "flos": 17718215034240.0, "grad_norm": 1.851270316141162, "language_loss": 0.81305736, "learning_rate": 2.294333993509978e-09, "loss": 0.88995826, "num_input_tokens_seen": 353614270, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09521484, "step": 16386, "time_per_iteration": 2.5368094444274902 }, { "auxiliary_loss_clip": 0.06416366, "auxiliary_loss_mlp": 0.01264912, "balance_loss_clip": 0.06274827, "balance_loss_mlp": 0.01254547, "epoch": 0.9852397414700135, "flos": 27461756960640.0, "grad_norm": 1.9286892757857412, "language_loss": 0.68371874, "learning_rate": 2.2757223155216442e-09, "loss": 0.76053154, "num_input_tokens_seen": 353634900, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.10369873, "step": 16387, "time_per_iteration": 4.126010417938232 }, { "auxiliary_loss_clip": 0.06403522, "auxiliary_loss_mlp": 0.01263169, "balance_loss_clip": 0.06271288, "balance_loss_mlp": 0.0125495, "epoch": 0.9852998647226815, "flos": 18302662062720.0, "grad_norm": 1.6573851229850982, "language_loss": 0.73994613, "learning_rate": 2.257186391438237e-09, "loss": 0.81661296, "num_input_tokens_seen": 353652890, "router_z_loss_clip": 1.32128906, "router_z_loss_mlp": 0.08215332, "step": 16388, "time_per_iteration": 2.5300962924957275 }, { "auxiliary_loss_clip": 0.06412286, "auxiliary_loss_mlp": 0.01264756, "balance_loss_clip": 0.06274624, "balance_loss_mlp": 0.01255893, "epoch": 0.9853599879753495, "flos": 19648058745600.0, "grad_norm": 1.9087440942306877, "language_loss": 0.8242408, "learning_rate": 2.238726221962528e-09, "loss": 0.90101123, "num_input_tokens_seen": 353671295, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.08874512, "step": 16389, "time_per_iteration": 2.516167402267456 }, { "auxiliary_loss_clip": 0.06412523, "auxiliary_loss_mlp": 0.01264186, "balance_loss_clip": 0.06274893, "balance_loss_mlp": 0.01255251, "epoch": 0.9854201112280174, "flos": 23848745558400.0, "grad_norm": 2.1862802511174677, "language_loss": 0.67293501, "learning_rate": 2.2203418077946234e-09, "loss": 0.7497021, "num_input_tokens_seen": 353690560, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.08929443, "step": 16390, "time_per_iteration": 2.5590617656707764 }, { "auxiliary_loss_clip": 0.06416557, "auxiliary_loss_mlp": 0.01264989, "balance_loss_clip": 0.06276786, "balance_loss_mlp": 0.01254785, "epoch": 0.9854802344806854, "flos": 30088330571520.0, "grad_norm": 1.5529681843765666, "language_loss": 0.77547705, "learning_rate": 2.2020331496312994e-09, "loss": 0.85229248, "num_input_tokens_seen": 353710660, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.10211182, "step": 16391, "time_per_iteration": 4.143389463424683 }, { "auxiliary_loss_clip": 0.06406993, "auxiliary_loss_mlp": 0.01264522, "balance_loss_clip": 0.06275138, "balance_loss_mlp": 0.01256022, "epoch": 0.9855403577333534, "flos": 21913744821120.0, "grad_norm": 1.7909708169096517, "language_loss": 0.68461454, "learning_rate": 2.1838002481673333e-09, "loss": 0.76132977, "num_input_tokens_seen": 353730440, "router_z_loss_clip": 1.3203125, "router_z_loss_mlp": 0.08502197, "step": 16392, "time_per_iteration": 2.543910026550293 }, { "auxiliary_loss_clip": 0.06422853, "auxiliary_loss_mlp": 0.0126357, "balance_loss_clip": 0.06277071, "balance_loss_mlp": 0.01253187, "epoch": 0.9856004809860214, "flos": 15419182233600.0, "grad_norm": 2.2117604017816452, "language_loss": 0.56518352, "learning_rate": 2.1656431040937286e-09, "loss": 0.64204776, "num_input_tokens_seen": 353748360, "router_z_loss_clip": 1.45703125, "router_z_loss_mlp": 0.1038208, "step": 16393, "time_per_iteration": 2.5139286518096924 }, { "auxiliary_loss_clip": 0.06423946, "auxiliary_loss_mlp": 0.01265353, "balance_loss_clip": 0.06278183, "balance_loss_mlp": 0.01255554, "epoch": 0.9856606042386893, "flos": 13656742980480.0, "grad_norm": 2.7139751968867944, "language_loss": 0.79076672, "learning_rate": 2.1475617180990444e-09, "loss": 0.86765969, "num_input_tokens_seen": 353760880, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.09790039, "step": 16394, "time_per_iteration": 2.477640151977539 }, { "auxiliary_loss_clip": 0.06414964, "auxiliary_loss_mlp": 0.01265411, "balance_loss_clip": 0.06272897, "balance_loss_mlp": 0.01255535, "epoch": 0.9857207274913573, "flos": 23486222868480.0, "grad_norm": 1.3693177532811407, "language_loss": 0.76232839, "learning_rate": 2.129556090869178e-09, "loss": 0.83913219, "num_input_tokens_seen": 353782255, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.09875488, "step": 16395, "time_per_iteration": 2.5689280033111572 }, { "auxiliary_loss_clip": 0.06411262, "auxiliary_loss_mlp": 0.01263562, "balance_loss_clip": 0.06273003, "balance_loss_mlp": 0.01255075, "epoch": 0.9857808507440252, "flos": 21071217617280.0, "grad_norm": 1.936089071181666, "language_loss": 0.75405359, "learning_rate": 2.1116262230866933e-09, "loss": 0.83080184, "num_input_tokens_seen": 353803580, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.08496094, "step": 16396, "time_per_iteration": 2.5996811389923096 }, { "auxiliary_loss_clip": 0.06413713, "auxiliary_loss_mlp": 0.01261732, "balance_loss_clip": 0.06275921, "balance_loss_mlp": 0.01252542, "epoch": 0.9858409739966932, "flos": 25308395683200.0, "grad_norm": 1.4211384860223095, "language_loss": 0.71581399, "learning_rate": 2.0937721154317133e-09, "loss": 0.79256845, "num_input_tokens_seen": 353824200, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09197998, "step": 16397, "time_per_iteration": 2.614459276199341 }, { "auxiliary_loss_clip": 0.06405619, "auxiliary_loss_mlp": 0.01263371, "balance_loss_clip": 0.06274522, "balance_loss_mlp": 0.01255035, "epoch": 0.9859010972493611, "flos": 20565077829120.0, "grad_norm": 1.62107584272828, "language_loss": 0.71387053, "learning_rate": 2.0759937685810304e-09, "loss": 0.79056048, "num_input_tokens_seen": 353843350, "router_z_loss_clip": 1.31152344, "router_z_loss_mlp": 0.08340454, "step": 16398, "time_per_iteration": 2.5361037254333496 }, { "auxiliary_loss_clip": 0.06411681, "auxiliary_loss_mlp": 0.01264005, "balance_loss_clip": 0.06275944, "balance_loss_mlp": 0.01255148, "epoch": 0.9859612205020292, "flos": 24762075062400.0, "grad_norm": 1.5027015040052942, "language_loss": 0.74111545, "learning_rate": 2.058291183208771e-09, "loss": 0.81787235, "num_input_tokens_seen": 353864520, "router_z_loss_clip": 1.35449219, "router_z_loss_mlp": 0.08856201, "step": 16399, "time_per_iteration": 2.5923023223876953 }, { "auxiliary_loss_clip": 0.06410855, "auxiliary_loss_mlp": 0.0126398, "balance_loss_clip": 0.0627259, "balance_loss_mlp": 0.01254688, "epoch": 0.9860213437546971, "flos": 21112236990720.0, "grad_norm": 2.0299950325162124, "language_loss": 0.57763135, "learning_rate": 2.0406643599863993e-09, "loss": 0.65437967, "num_input_tokens_seen": 353882240, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09289551, "step": 16400, "time_per_iteration": 2.5460057258605957 }, { "auxiliary_loss_clip": 0.06427433, "auxiliary_loss_mlp": 0.01266932, "balance_loss_clip": 0.06279266, "balance_loss_mlp": 0.01256352, "epoch": 0.9860814670073651, "flos": 19142212446720.0, "grad_norm": 1.5862155008823058, "language_loss": 0.8076368, "learning_rate": 2.023113299582491e-09, "loss": 0.88458049, "num_input_tokens_seen": 353901590, "router_z_loss_clip": 1.48046875, "router_z_loss_mlp": 0.10583496, "step": 16401, "time_per_iteration": 2.563178062438965 }, { "auxiliary_loss_clip": 0.06410306, "auxiliary_loss_mlp": 0.01263943, "balance_loss_clip": 0.06273746, "balance_loss_mlp": 0.01253739, "epoch": 0.9861415902600331, "flos": 17242570932480.0, "grad_norm": 1.7394796030167783, "language_loss": 0.78156471, "learning_rate": 2.005638002662069e-09, "loss": 0.85830718, "num_input_tokens_seen": 353918785, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.10205078, "step": 16402, "time_per_iteration": 2.50270414352417 }, { "auxiliary_loss_clip": 0.06413987, "auxiliary_loss_mlp": 0.01265476, "balance_loss_clip": 0.06273667, "balance_loss_mlp": 0.01255754, "epoch": 0.986201713512701, "flos": 27790052457600.0, "grad_norm": 7.122474757443916, "language_loss": 0.70376021, "learning_rate": 1.9882384698881596e-09, "loss": 0.78055489, "num_input_tokens_seen": 353940390, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09710693, "step": 16403, "time_per_iteration": 2.6053428649902344 }, { "auxiliary_loss_clip": 0.0641117, "auxiliary_loss_mlp": 0.01264145, "balance_loss_clip": 0.06274019, "balance_loss_mlp": 0.01255746, "epoch": 0.986261836765369, "flos": 28737902643840.0, "grad_norm": 1.6945120065209374, "language_loss": 0.74627578, "learning_rate": 1.9709147019204566e-09, "loss": 0.82302892, "num_input_tokens_seen": 353962180, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.08404541, "step": 16404, "time_per_iteration": 2.618473529815674 }, { "auxiliary_loss_clip": 0.06411017, "auxiliary_loss_mlp": 0.01265455, "balance_loss_clip": 0.06270719, "balance_loss_mlp": 0.01256026, "epoch": 0.986321960018037, "flos": 34322028693120.0, "grad_norm": 1.6385793168860319, "language_loss": 0.70144427, "learning_rate": 1.953666699415768e-09, "loss": 0.77820897, "num_input_tokens_seen": 353984305, "router_z_loss_clip": 1.40136719, "router_z_loss_mlp": 0.09442139, "step": 16405, "time_per_iteration": 2.6810097694396973 }, { "auxiliary_loss_clip": 0.06408515, "auxiliary_loss_mlp": 0.01267854, "balance_loss_clip": 0.06273928, "balance_loss_mlp": 0.01259116, "epoch": 0.986382083270705, "flos": 25196406301440.0, "grad_norm": 1.587835977915224, "language_loss": 0.70082211, "learning_rate": 1.93649446302846e-09, "loss": 0.77758586, "num_input_tokens_seen": 354004495, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.08734131, "step": 16406, "time_per_iteration": 2.556269645690918 }, { "auxiliary_loss_clip": 0.06410918, "auxiliary_loss_mlp": 0.01263915, "balance_loss_clip": 0.06274799, "balance_loss_mlp": 0.01254742, "epoch": 0.9864422065233729, "flos": 11028953485440.0, "grad_norm": 4.2306161174207455, "language_loss": 0.74293911, "learning_rate": 1.9193979934095663e-09, "loss": 0.81968743, "num_input_tokens_seen": 354015985, "router_z_loss_clip": 1.36035156, "router_z_loss_mlp": 0.09179688, "step": 16407, "time_per_iteration": 2.4629950523376465 }, { "auxiliary_loss_clip": 0.06414042, "auxiliary_loss_mlp": 0.01267242, "balance_loss_clip": 0.06276031, "balance_loss_mlp": 0.01257795, "epoch": 0.9865023297760409, "flos": 16551291548160.0, "grad_norm": 1.8940027012905636, "language_loss": 0.7704097, "learning_rate": 1.9023772912072357e-09, "loss": 0.84722251, "num_input_tokens_seen": 354033260, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09436035, "step": 16408, "time_per_iteration": 2.4831860065460205 }, { "auxiliary_loss_clip": 0.06421299, "auxiliary_loss_mlp": 0.0126571, "balance_loss_clip": 0.06277347, "balance_loss_mlp": 0.01254987, "epoch": 0.9865624530287088, "flos": 18886186696320.0, "grad_norm": 1.7889398464223092, "language_loss": 0.68223923, "learning_rate": 1.8854323570669515e-09, "loss": 0.75910926, "num_input_tokens_seen": 354052825, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10717773, "step": 16409, "time_per_iteration": 2.5169804096221924 }, { "auxiliary_loss_clip": 0.06320553, "auxiliary_loss_mlp": 0.01251622, "balance_loss_clip": 0.06265328, "balance_loss_mlp": 0.01250542, "epoch": 0.9866225762813768, "flos": 68905869068160.0, "grad_norm": 0.7906119413456552, "language_loss": 0.60743165, "learning_rate": 1.8685631916313118e-09, "loss": 0.68315339, "num_input_tokens_seen": 354113920, "router_z_loss_clip": 0.55419922, "router_z_loss_mlp": 0.01081085, "step": 16410, "time_per_iteration": 4.636219501495361 }, { "auxiliary_loss_clip": 0.06417669, "auxiliary_loss_mlp": 0.01265942, "balance_loss_clip": 0.06277734, "balance_loss_mlp": 0.01255923, "epoch": 0.9866826995340447, "flos": 29030796990720.0, "grad_norm": 2.3425151057604143, "language_loss": 0.6667558, "learning_rate": 1.8517697955400258e-09, "loss": 0.7435919, "num_input_tokens_seen": 354134210, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.10021973, "step": 16411, "time_per_iteration": 2.5852813720703125 }, { "auxiliary_loss_clip": 0.06317952, "auxiliary_loss_mlp": 0.01250541, "balance_loss_clip": 0.06262878, "balance_loss_mlp": 0.01249586, "epoch": 0.9867428227867128, "flos": 65399004460800.0, "grad_norm": 0.7124128867102586, "language_loss": 0.56190062, "learning_rate": 1.8350521694299182e-09, "loss": 0.63758552, "num_input_tokens_seen": 354198010, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.00952911, "step": 16412, "time_per_iteration": 3.237082004547119 }, { "auxiliary_loss_clip": 0.06424712, "auxiliary_loss_mlp": 0.01265785, "balance_loss_clip": 0.06279933, "balance_loss_mlp": 0.01254907, "epoch": 0.9868029460393807, "flos": 26513697139200.0, "grad_norm": 1.5753553550957287, "language_loss": 0.72967929, "learning_rate": 1.818410313934926e-09, "loss": 0.80658424, "num_input_tokens_seen": 354220000, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.10876465, "step": 16413, "time_per_iteration": 2.6020872592926025 }, { "auxiliary_loss_clip": 0.06415344, "auxiliary_loss_mlp": 0.012644, "balance_loss_clip": 0.0627432, "balance_loss_mlp": 0.01255263, "epoch": 0.9868630692920487, "flos": 22974087513600.0, "grad_norm": 1.4777115644455203, "language_loss": 0.71780932, "learning_rate": 1.8018442296858782e-09, "loss": 0.7946068, "num_input_tokens_seen": 354240910, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.09136963, "step": 16414, "time_per_iteration": 2.597790479660034 }, { "auxiliary_loss_clip": 0.06408904, "auxiliary_loss_mlp": 0.01266344, "balance_loss_clip": 0.06275591, "balance_loss_mlp": 0.01257677, "epoch": 0.9869231925447167, "flos": 19834833496320.0, "grad_norm": 1.6019134645727076, "language_loss": 0.70252192, "learning_rate": 1.7853539173111608e-09, "loss": 0.7792744, "num_input_tokens_seen": 354259430, "router_z_loss_clip": 1.3359375, "router_z_loss_mlp": 0.08660889, "step": 16415, "time_per_iteration": 4.059832572937012 }, { "auxiliary_loss_clip": 0.06404535, "auxiliary_loss_mlp": 0.01263022, "balance_loss_clip": 0.06272775, "balance_loss_mlp": 0.01254529, "epoch": 0.9869833157973846, "flos": 20201716598400.0, "grad_norm": 1.4279692151749679, "language_loss": 0.75624722, "learning_rate": 1.7689393774362737e-09, "loss": 0.83292282, "num_input_tokens_seen": 354279490, "router_z_loss_clip": 1.31738281, "router_z_loss_mlp": 0.08496094, "step": 16416, "time_per_iteration": 2.5473835468292236 }, { "auxiliary_loss_clip": 0.06411315, "auxiliary_loss_mlp": 0.01267388, "balance_loss_clip": 0.06275773, "balance_loss_mlp": 0.01258167, "epoch": 0.9870434390500527, "flos": 16103753291520.0, "grad_norm": 1.8588366833231307, "language_loss": 0.71097732, "learning_rate": 1.7526006106833858e-09, "loss": 0.78776443, "num_input_tokens_seen": 354295080, "router_z_loss_clip": 1.35546875, "router_z_loss_mlp": 0.09216309, "step": 16417, "time_per_iteration": 2.5368716716766357 }, { "auxiliary_loss_clip": 0.06423812, "auxiliary_loss_mlp": 0.01266606, "balance_loss_clip": 0.06278616, "balance_loss_mlp": 0.01256616, "epoch": 0.9871035623027206, "flos": 21766941267840.0, "grad_norm": 1.5908975828735052, "language_loss": 0.70773375, "learning_rate": 1.7363376176720013e-09, "loss": 0.78463793, "num_input_tokens_seen": 354314610, "router_z_loss_clip": 1.45214844, "router_z_loss_mlp": 0.09985352, "step": 16418, "time_per_iteration": 2.569326400756836 }, { "auxiliary_loss_clip": 0.0631368, "auxiliary_loss_mlp": 0.01251237, "balance_loss_clip": 0.06258595, "balance_loss_mlp": 0.0125025, "epoch": 0.9871636855553886, "flos": 70240936970880.0, "grad_norm": 0.6441587648022113, "language_loss": 0.53762293, "learning_rate": 1.7201503990189603e-09, "loss": 0.61327207, "num_input_tokens_seen": 354383115, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.00986481, "step": 16419, "time_per_iteration": 3.351790428161621 }, { "auxiliary_loss_clip": 0.06416113, "auxiliary_loss_mlp": 0.01266931, "balance_loss_clip": 0.06272729, "balance_loss_mlp": 0.01257048, "epoch": 0.9872238088080565, "flos": 25052789203200.0, "grad_norm": 1.5466144342275183, "language_loss": 0.78312433, "learning_rate": 1.7040389553382162e-09, "loss": 0.85995471, "num_input_tokens_seen": 354403115, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.09881592, "step": 16420, "time_per_iteration": 2.6302075386047363 }, { "auxiliary_loss_clip": 0.06408645, "auxiliary_loss_mlp": 0.01267529, "balance_loss_clip": 0.06274365, "balance_loss_mlp": 0.01258439, "epoch": 0.9872839320607245, "flos": 19472268879360.0, "grad_norm": 1.7997114275267565, "language_loss": 0.70998156, "learning_rate": 1.6880032872403916e-09, "loss": 0.78674328, "num_input_tokens_seen": 354424520, "router_z_loss_clip": 1.34179688, "router_z_loss_mlp": 0.09094238, "step": 16421, "time_per_iteration": 2.613616943359375 }, { "auxiliary_loss_clip": 0.06421421, "auxiliary_loss_mlp": 0.01264605, "balance_loss_clip": 0.06277487, "balance_loss_mlp": 0.01254628, "epoch": 0.9873440553133924, "flos": 26950166657280.0, "grad_norm": 2.2168429178741444, "language_loss": 0.82242405, "learning_rate": 1.6720433953338886e-09, "loss": 0.8992843, "num_input_tokens_seen": 354444800, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.09991455, "step": 16422, "time_per_iteration": 2.6859397888183594 }, { "auxiliary_loss_clip": 0.06411013, "auxiliary_loss_mlp": 0.01264592, "balance_loss_clip": 0.06275328, "balance_loss_mlp": 0.01255907, "epoch": 0.9874041785660604, "flos": 19068181764480.0, "grad_norm": 1.7040307831182346, "language_loss": 0.8608073, "learning_rate": 1.656159280223779e-09, "loss": 0.93756342, "num_input_tokens_seen": 354464590, "router_z_loss_clip": 1.35644531, "router_z_loss_mlp": 0.08685303, "step": 16423, "time_per_iteration": 2.5341732501983643 }, { "auxiliary_loss_clip": 0.06412689, "auxiliary_loss_mlp": 0.012638, "balance_loss_clip": 0.06272502, "balance_loss_mlp": 0.01254604, "epoch": 0.9874643018187284, "flos": 21112195063680.0, "grad_norm": 2.137134091096297, "language_loss": 0.70831871, "learning_rate": 1.6403509425122475e-09, "loss": 0.78508365, "num_input_tokens_seen": 354484145, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09197998, "step": 16424, "time_per_iteration": 2.5596861839294434 }, { "auxiliary_loss_clip": 0.06414261, "auxiliary_loss_mlp": 0.0126822, "balance_loss_clip": 0.0627488, "balance_loss_mlp": 0.01258522, "epoch": 0.9875244250713964, "flos": 24432982951680.0, "grad_norm": 1.9695617744186462, "language_loss": 0.80689633, "learning_rate": 1.6246183827990366e-09, "loss": 0.88372117, "num_input_tokens_seen": 354502475, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09698486, "step": 16425, "time_per_iteration": 2.5634138584136963 }, { "auxiliary_loss_clip": 0.06414701, "auxiliary_loss_mlp": 0.01267023, "balance_loss_clip": 0.06274776, "balance_loss_mlp": 0.01257224, "epoch": 0.9875845483240643, "flos": 25124388117120.0, "grad_norm": 1.9048543614959963, "language_loss": 0.80159289, "learning_rate": 1.6089616016803364e-09, "loss": 0.8784101, "num_input_tokens_seen": 354521855, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09802246, "step": 16426, "time_per_iteration": 2.5645270347595215 }, { "auxiliary_loss_clip": 0.06413919, "auxiliary_loss_mlp": 0.0126807, "balance_loss_clip": 0.06277464, "balance_loss_mlp": 0.01258682, "epoch": 0.9876446715767323, "flos": 16587447384960.0, "grad_norm": 1.9603098639763479, "language_loss": 0.84923989, "learning_rate": 1.593380599750338e-09, "loss": 0.92605972, "num_input_tokens_seen": 354539535, "router_z_loss_clip": 1.36523438, "router_z_loss_mlp": 0.09381104, "step": 16427, "time_per_iteration": 4.066830635070801 }, { "auxiliary_loss_clip": 0.06407771, "auxiliary_loss_mlp": 0.01265501, "balance_loss_clip": 0.06270751, "balance_loss_mlp": 0.01256036, "epoch": 0.9877047948294003, "flos": 21622527555840.0, "grad_norm": 2.0304103322810847, "language_loss": 0.70258081, "learning_rate": 1.577875377599458e-09, "loss": 0.77931345, "num_input_tokens_seen": 354557430, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09466553, "step": 16428, "time_per_iteration": 2.5743441581726074 }, { "auxiliary_loss_clip": 0.06408308, "auxiliary_loss_mlp": 0.0126691, "balance_loss_clip": 0.06272832, "balance_loss_mlp": 0.0125797, "epoch": 0.9877649180820682, "flos": 21184842153600.0, "grad_norm": 2.021130064733843, "language_loss": 0.80140209, "learning_rate": 1.5624459358158926e-09, "loss": 0.87815434, "num_input_tokens_seen": 354574735, "router_z_loss_clip": 1.35546875, "router_z_loss_mlp": 0.08947754, "step": 16429, "time_per_iteration": 2.5376272201538086 }, { "auxiliary_loss_clip": 0.06409979, "auxiliary_loss_mlp": 0.01267548, "balance_loss_clip": 0.06271745, "balance_loss_mlp": 0.01258131, "epoch": 0.9878250413347363, "flos": 39758596502400.0, "grad_norm": 1.5568716057533696, "language_loss": 0.62478954, "learning_rate": 1.5470922749845073e-09, "loss": 0.70156485, "num_input_tokens_seen": 354597050, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09411621, "step": 16430, "time_per_iteration": 2.700805902481079 }, { "auxiliary_loss_clip": 0.06419156, "auxiliary_loss_mlp": 0.01268221, "balance_loss_clip": 0.06279558, "balance_loss_mlp": 0.01258708, "epoch": 0.9878851645874042, "flos": 29433584367360.0, "grad_norm": 1.3084551551153614, "language_loss": 0.73199481, "learning_rate": 1.531814395687725e-09, "loss": 0.80886865, "num_input_tokens_seen": 354619095, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09521484, "step": 16431, "time_per_iteration": 4.019779682159424 }, { "auxiliary_loss_clip": 0.06420676, "auxiliary_loss_mlp": 0.01269871, "balance_loss_clip": 0.06282213, "balance_loss_mlp": 0.01260221, "epoch": 0.9879452878400722, "flos": 15810230039040.0, "grad_norm": 2.1306750926172375, "language_loss": 0.81239539, "learning_rate": 1.5166122985048602e-09, "loss": 0.88930082, "num_input_tokens_seen": 354633790, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09643555, "step": 16432, "time_per_iteration": 2.5104334354400635 }, { "auxiliary_loss_clip": 0.064087, "auxiliary_loss_mlp": 0.0126118, "balance_loss_clip": 0.06271316, "balance_loss_mlp": 0.01252763, "epoch": 0.9880054110927401, "flos": 22239985893120.0, "grad_norm": 1.5761937312905026, "language_loss": 0.80998611, "learning_rate": 1.5014859840123405e-09, "loss": 0.88668489, "num_input_tokens_seen": 354653180, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.08410645, "step": 16433, "time_per_iteration": 2.5567314624786377 }, { "auxiliary_loss_clip": 0.06411079, "auxiliary_loss_mlp": 0.01262815, "balance_loss_clip": 0.0627605, "balance_loss_mlp": 0.01253716, "epoch": 0.9880655343454081, "flos": 28770830098560.0, "grad_norm": 2.462043124741031, "language_loss": 0.65022093, "learning_rate": 1.4864354527837075e-09, "loss": 0.72695982, "num_input_tokens_seen": 354669900, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.0909729, "step": 16434, "time_per_iteration": 2.569749116897583 }, { "auxiliary_loss_clip": 0.06416495, "auxiliary_loss_mlp": 0.01264261, "balance_loss_clip": 0.06275617, "balance_loss_mlp": 0.01254617, "epoch": 0.988125657598076, "flos": 32861581954560.0, "grad_norm": 1.5703925201508508, "language_loss": 0.69675744, "learning_rate": 1.4714607053896154e-09, "loss": 0.77356493, "num_input_tokens_seen": 354693165, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09643555, "step": 16435, "time_per_iteration": 2.6314189434051514 }, { "auxiliary_loss_clip": 0.0641337, "auxiliary_loss_mlp": 0.01264656, "balance_loss_clip": 0.0627652, "balance_loss_mlp": 0.01255399, "epoch": 0.988185780850744, "flos": 19396728823680.0, "grad_norm": 1.551147620046388, "language_loss": 0.75603974, "learning_rate": 1.4565617423980548e-09, "loss": 0.83282006, "num_input_tokens_seen": 354711915, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09259033, "step": 16436, "time_per_iteration": 2.51491379737854 }, { "auxiliary_loss_clip": 0.06413049, "auxiliary_loss_mlp": 0.01263038, "balance_loss_clip": 0.06274842, "balance_loss_mlp": 0.01253501, "epoch": 0.988245904103412, "flos": 22534976592000.0, "grad_norm": 2.1154352736135134, "language_loss": 0.74821234, "learning_rate": 1.4417385643741286e-09, "loss": 0.82497311, "num_input_tokens_seen": 354729135, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.09545898, "step": 16437, "time_per_iteration": 2.5412285327911377 }, { "auxiliary_loss_clip": 0.06404258, "auxiliary_loss_mlp": 0.01266918, "balance_loss_clip": 0.06270915, "balance_loss_mlp": 0.01257947, "epoch": 0.98830602735608, "flos": 28666974562560.0, "grad_norm": 2.6332745034968528, "language_loss": 0.60380983, "learning_rate": 1.4269911718796103e-09, "loss": 0.68052161, "num_input_tokens_seen": 354752530, "router_z_loss_clip": 1.33398438, "router_z_loss_mlp": 0.08972168, "step": 16438, "time_per_iteration": 2.606476306915283 }, { "auxiliary_loss_clip": 0.06410363, "auxiliary_loss_mlp": 0.01266887, "balance_loss_clip": 0.06272557, "balance_loss_mlp": 0.0125682, "epoch": 0.9883661506087479, "flos": 21002343960960.0, "grad_norm": 1.787268911108383, "language_loss": 0.72005808, "learning_rate": 1.4123195654738295e-09, "loss": 0.79683053, "num_input_tokens_seen": 354771135, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.10070801, "step": 16439, "time_per_iteration": 2.541870594024658 }, { "auxiliary_loss_clip": 0.06408369, "auxiliary_loss_mlp": 0.01265688, "balance_loss_clip": 0.06273065, "balance_loss_mlp": 0.01256604, "epoch": 0.9884262738614159, "flos": 32714065641600.0, "grad_norm": 1.8025779766161056, "language_loss": 0.60186744, "learning_rate": 1.3977237457134528e-09, "loss": 0.678608, "num_input_tokens_seen": 354791800, "router_z_loss_clip": 1.35449219, "router_z_loss_mlp": 0.09082031, "step": 16440, "time_per_iteration": 2.6240501403808594 }, { "auxiliary_loss_clip": 0.06414877, "auxiliary_loss_mlp": 0.01264522, "balance_loss_clip": 0.0627327, "balance_loss_mlp": 0.012552, "epoch": 0.9884863971140839, "flos": 17570153669760.0, "grad_norm": 2.3681301996844177, "language_loss": 0.75998831, "learning_rate": 1.3832037131513707e-09, "loss": 0.83678222, "num_input_tokens_seen": 354809200, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.09332275, "step": 16441, "time_per_iteration": 2.509089946746826 }, { "auxiliary_loss_clip": 0.06417589, "auxiliary_loss_mlp": 0.01264919, "balance_loss_clip": 0.0627787, "balance_loss_mlp": 0.01255483, "epoch": 0.9885465203667518, "flos": 40562116830720.0, "grad_norm": 1.7594459961955606, "language_loss": 0.67870402, "learning_rate": 1.3687594683386982e-09, "loss": 0.75552911, "num_input_tokens_seen": 354829945, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09429932, "step": 16442, "time_per_iteration": 2.698335886001587 }, { "auxiliary_loss_clip": 0.0640804, "auxiliary_loss_mlp": 0.01266248, "balance_loss_clip": 0.06272978, "balance_loss_mlp": 0.01257152, "epoch": 0.9886066436194199, "flos": 13813022044800.0, "grad_norm": 2.708935127247499, "language_loss": 0.74777198, "learning_rate": 1.3543910118227753e-09, "loss": 0.82451487, "num_input_tokens_seen": 354845055, "router_z_loss_clip": 1.34960938, "router_z_loss_mlp": 0.09094238, "step": 16443, "time_per_iteration": 2.5264041423797607 }, { "auxiliary_loss_clip": 0.06412245, "auxiliary_loss_mlp": 0.01266249, "balance_loss_clip": 0.06272925, "balance_loss_mlp": 0.01256468, "epoch": 0.9886667668720878, "flos": 23330824272000.0, "grad_norm": 2.726495725131972, "language_loss": 0.73856241, "learning_rate": 1.3400983441487213e-09, "loss": 0.81534731, "num_input_tokens_seen": 354864680, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09783936, "step": 16444, "time_per_iteration": 2.556472063064575 }, { "auxiliary_loss_clip": 0.0641023, "auxiliary_loss_mlp": 0.01266987, "balance_loss_clip": 0.06274685, "balance_loss_mlp": 0.01258436, "epoch": 0.9887268901247558, "flos": 22711814634240.0, "grad_norm": 1.9355742362188266, "language_loss": 0.69764715, "learning_rate": 1.325881465858547e-09, "loss": 0.77441931, "num_input_tokens_seen": 354885685, "router_z_loss_clip": 1.35253906, "router_z_loss_mlp": 0.08547974, "step": 16445, "time_per_iteration": 2.6369788646698 }, { "auxiliary_loss_clip": 0.06421059, "auxiliary_loss_mlp": 0.01265707, "balance_loss_clip": 0.06282511, "balance_loss_mlp": 0.012557, "epoch": 0.9887870133774237, "flos": 13046118750720.0, "grad_norm": 2.791912181963286, "language_loss": 0.60983503, "learning_rate": 1.311740377491155e-09, "loss": 0.68670273, "num_input_tokens_seen": 354901505, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.10009766, "step": 16446, "time_per_iteration": 2.5276238918304443 }, { "auxiliary_loss_clip": 0.06417438, "auxiliary_loss_mlp": 0.01262494, "balance_loss_clip": 0.06280065, "balance_loss_mlp": 0.01254161, "epoch": 0.9888471366300917, "flos": 15164288513280.0, "grad_norm": 2.194262455369493, "language_loss": 0.70912021, "learning_rate": 1.297675079582783e-09, "loss": 0.78591955, "num_input_tokens_seen": 354920060, "router_z_loss_clip": 1.37402344, "router_z_loss_mlp": 0.08331299, "step": 16447, "time_per_iteration": 2.521742343902588 }, { "auxiliary_loss_clip": 0.06406993, "auxiliary_loss_mlp": 0.01264961, "balance_loss_clip": 0.0626996, "balance_loss_mlp": 0.01256044, "epoch": 0.9889072598827596, "flos": 25125771709440.0, "grad_norm": 2.4777761652915076, "language_loss": 0.83726269, "learning_rate": 1.2836855726667818e-09, "loss": 0.91398215, "num_input_tokens_seen": 354938690, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.08911133, "step": 16448, "time_per_iteration": 2.5733680725097656 }, { "auxiliary_loss_clip": 0.06411684, "auxiliary_loss_mlp": 0.01262145, "balance_loss_clip": 0.06276339, "balance_loss_mlp": 0.01253603, "epoch": 0.9889673831354276, "flos": 16734502500480.0, "grad_norm": 1.491711154500127, "language_loss": 0.70273554, "learning_rate": 1.26977185727406e-09, "loss": 0.77947384, "num_input_tokens_seen": 354956955, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.08544922, "step": 16449, "time_per_iteration": 3.9870240688323975 }, { "auxiliary_loss_clip": 0.06417073, "auxiliary_loss_mlp": 0.01262973, "balance_loss_clip": 0.06275982, "balance_loss_mlp": 0.01253871, "epoch": 0.9890275063880956, "flos": 35593059277440.0, "grad_norm": 2.2258682619302483, "language_loss": 0.74163461, "learning_rate": 1.25593393393153e-09, "loss": 0.81843507, "num_input_tokens_seen": 354976800, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09100342, "step": 16450, "time_per_iteration": 2.7037761211395264 }, { "auxiliary_loss_clip": 0.06414802, "auxiliary_loss_mlp": 0.01266356, "balance_loss_clip": 0.06272397, "balance_loss_mlp": 0.01255854, "epoch": 0.9890876296407636, "flos": 18958246807680.0, "grad_norm": 1.624891418627831, "language_loss": 0.7957927, "learning_rate": 1.242171803164549e-09, "loss": 0.87260425, "num_input_tokens_seen": 354996625, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.1050415, "step": 16451, "time_per_iteration": 2.5407729148864746 }, { "auxiliary_loss_clip": 0.06419107, "auxiliary_loss_mlp": 0.01265191, "balance_loss_clip": 0.06276101, "balance_loss_mlp": 0.01255463, "epoch": 0.9891477528934315, "flos": 23776140395520.0, "grad_norm": 2.086569260253861, "language_loss": 0.70337093, "learning_rate": 1.2284854654946996e-09, "loss": 0.78021389, "num_input_tokens_seen": 355014535, "router_z_loss_clip": 1.42871094, "router_z_loss_mlp": 0.09729004, "step": 16452, "time_per_iteration": 2.539734363555908 }, { "auxiliary_loss_clip": 0.06409118, "auxiliary_loss_mlp": 0.01262212, "balance_loss_clip": 0.06275223, "balance_loss_mlp": 0.01253742, "epoch": 0.9892078761460995, "flos": 20778490978560.0, "grad_norm": 1.5399180682341842, "language_loss": 0.74043047, "learning_rate": 1.2148749214409004e-09, "loss": 0.81714368, "num_input_tokens_seen": 355033280, "router_z_loss_clip": 1.33886719, "router_z_loss_mlp": 0.08465576, "step": 16453, "time_per_iteration": 2.519483804702759 }, { "auxiliary_loss_clip": 0.06415309, "auxiliary_loss_mlp": 0.01264861, "balance_loss_clip": 0.06276169, "balance_loss_mlp": 0.01255235, "epoch": 0.9892679993987675, "flos": 23374568903040.0, "grad_norm": 2.10733260041616, "language_loss": 0.70493966, "learning_rate": 1.2013401715191828e-09, "loss": 0.78174138, "num_input_tokens_seen": 355053320, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09625244, "step": 16454, "time_per_iteration": 2.548739433288574 }, { "auxiliary_loss_clip": 0.06407521, "auxiliary_loss_mlp": 0.0126684, "balance_loss_clip": 0.06273887, "balance_loss_mlp": 0.0125781, "epoch": 0.9893281226514354, "flos": 22711101874560.0, "grad_norm": 2.1268439245026123, "language_loss": 0.75949621, "learning_rate": 1.1878812162433583e-09, "loss": 0.83623976, "num_input_tokens_seen": 355070230, "router_z_loss_clip": 1.3359375, "router_z_loss_mlp": 0.09039307, "step": 16455, "time_per_iteration": 3.976356267929077 }, { "auxiliary_loss_clip": 0.06405322, "auxiliary_loss_mlp": 0.01263226, "balance_loss_clip": 0.06271136, "balance_loss_mlp": 0.01254303, "epoch": 0.9893882459041035, "flos": 21802761688320.0, "grad_norm": 2.0181701413055557, "language_loss": 0.65837091, "learning_rate": 1.1744980561230188e-09, "loss": 0.7350564, "num_input_tokens_seen": 355090125, "router_z_loss_clip": 1.34179688, "router_z_loss_mlp": 0.0892334, "step": 16456, "time_per_iteration": 2.5704314708709717 }, { "auxiliary_loss_clip": 0.06418429, "auxiliary_loss_mlp": 0.01264282, "balance_loss_clip": 0.06276944, "balance_loss_mlp": 0.01254668, "epoch": 0.9894483691567714, "flos": 18119618818560.0, "grad_norm": 1.8780363019568072, "language_loss": 0.7432664, "learning_rate": 1.161190691666203e-09, "loss": 0.82009351, "num_input_tokens_seen": 355107890, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09613037, "step": 16457, "time_per_iteration": 2.5019781589508057 }, { "auxiliary_loss_clip": 0.06418405, "auxiliary_loss_mlp": 0.01262915, "balance_loss_clip": 0.06278202, "balance_loss_mlp": 0.01253212, "epoch": 0.9895084924094394, "flos": 31219559418240.0, "grad_norm": 2.2507789656322874, "language_loss": 0.69215769, "learning_rate": 1.1479591233773954e-09, "loss": 0.76897085, "num_input_tokens_seen": 355126340, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.0970459, "step": 16458, "time_per_iteration": 2.608889579772949 }, { "auxiliary_loss_clip": 0.06408873, "auxiliary_loss_mlp": 0.01265934, "balance_loss_clip": 0.06274676, "balance_loss_mlp": 0.01256731, "epoch": 0.9895686156621073, "flos": 19683376041600.0, "grad_norm": 1.5628747019729696, "language_loss": 0.797894, "learning_rate": 1.1348033517581956e-09, "loss": 0.87464207, "num_input_tokens_seen": 355144025, "router_z_loss_clip": 1.34082031, "router_z_loss_mlp": 0.09197998, "step": 16459, "time_per_iteration": 2.553558349609375 }, { "auxiliary_loss_clip": 0.06415261, "auxiliary_loss_mlp": 0.01264409, "balance_loss_clip": 0.06275363, "balance_loss_mlp": 0.01255111, "epoch": 0.9896287389147753, "flos": 23587604709120.0, "grad_norm": 1.8488605076831315, "language_loss": 0.71027052, "learning_rate": 1.1217233773075373e-09, "loss": 0.78706717, "num_input_tokens_seen": 355163125, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09301758, "step": 16460, "time_per_iteration": 2.644104242324829 }, { "auxiliary_loss_clip": 0.06415058, "auxiliary_loss_mlp": 0.01264952, "balance_loss_clip": 0.06274386, "balance_loss_mlp": 0.01255469, "epoch": 0.9896888621674432, "flos": 29612854177920.0, "grad_norm": 1.5456528570762413, "language_loss": 0.87429047, "learning_rate": 1.1087192005214685e-09, "loss": 0.95109057, "num_input_tokens_seen": 355184060, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09484863, "step": 16461, "time_per_iteration": 2.6040353775024414 }, { "auxiliary_loss_clip": 0.06408371, "auxiliary_loss_mlp": 0.01268203, "balance_loss_clip": 0.06270093, "balance_loss_mlp": 0.01258547, "epoch": 0.9897489854201112, "flos": 23701648515840.0, "grad_norm": 1.8890216232389847, "language_loss": 0.6359489, "learning_rate": 1.09579082189315e-09, "loss": 0.71271461, "num_input_tokens_seen": 355204505, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.09661865, "step": 16462, "time_per_iteration": 2.564004421234131 }, { "auxiliary_loss_clip": 0.06411687, "auxiliary_loss_mlp": 0.01264009, "balance_loss_clip": 0.06274337, "balance_loss_mlp": 0.01254997, "epoch": 0.9898091086727792, "flos": 13230252097920.0, "grad_norm": 1.7964466890082706, "language_loss": 0.7311523, "learning_rate": 1.0829382419126343e-09, "loss": 0.80790925, "num_input_tokens_seen": 355223055, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09008789, "step": 16463, "time_per_iteration": 2.532118320465088 }, { "auxiliary_loss_clip": 0.06413007, "auxiliary_loss_mlp": 0.01266486, "balance_loss_clip": 0.06273332, "balance_loss_mlp": 0.01256782, "epoch": 0.9898692319254472, "flos": 22937135063040.0, "grad_norm": 1.9979669206980146, "language_loss": 0.70635498, "learning_rate": 1.0701614610675314e-09, "loss": 0.78314984, "num_input_tokens_seen": 355242000, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.0970459, "step": 16464, "time_per_iteration": 2.5341956615448 }, { "auxiliary_loss_clip": 0.06412932, "auxiliary_loss_mlp": 0.01264725, "balance_loss_clip": 0.06271654, "balance_loss_mlp": 0.01254908, "epoch": 0.9899293551781151, "flos": 12463223022720.0, "grad_norm": 1.9929507822951242, "language_loss": 0.74130994, "learning_rate": 1.0574604798421204e-09, "loss": 0.8180865, "num_input_tokens_seen": 355260175, "router_z_loss_clip": 1.41601562, "router_z_loss_mlp": 0.09814453, "step": 16465, "time_per_iteration": 2.5119776725769043 }, { "auxiliary_loss_clip": 0.06408405, "auxiliary_loss_mlp": 0.01262916, "balance_loss_clip": 0.06272622, "balance_loss_mlp": 0.01254243, "epoch": 0.9899894784307831, "flos": 26878567743360.0, "grad_norm": 1.6175245235479039, "language_loss": 0.86459821, "learning_rate": 1.0448352987182386e-09, "loss": 0.94131142, "num_input_tokens_seen": 355281930, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.08666992, "step": 16466, "time_per_iteration": 2.586888313293457 }, { "auxiliary_loss_clip": 0.06410158, "auxiliary_loss_mlp": 0.01265784, "balance_loss_clip": 0.06273596, "balance_loss_mlp": 0.01256557, "epoch": 0.990049601683451, "flos": 21548287238400.0, "grad_norm": 1.66924888624807, "language_loss": 0.72064012, "learning_rate": 1.0322859181743915e-09, "loss": 0.79739952, "num_input_tokens_seen": 355301555, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09222412, "step": 16467, "time_per_iteration": 3.9932260513305664 }, { "auxiliary_loss_clip": 0.06411776, "auxiliary_loss_mlp": 0.01264968, "balance_loss_clip": 0.06274725, "balance_loss_mlp": 0.01256051, "epoch": 0.990109724936119, "flos": 28780137901440.0, "grad_norm": 1.3627510783782915, "language_loss": 0.65045714, "learning_rate": 1.019812338686643e-09, "loss": 0.72722459, "num_input_tokens_seen": 355324925, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.08905029, "step": 16468, "time_per_iteration": 2.630420684814453 }, { "auxiliary_loss_clip": 0.06417444, "auxiliary_loss_mlp": 0.01263644, "balance_loss_clip": 0.06274006, "balance_loss_mlp": 0.01253947, "epoch": 0.9901698481887871, "flos": 29281288371840.0, "grad_norm": 1.6899959994980873, "language_loss": 0.62118673, "learning_rate": 1.0074145607281704e-09, "loss": 0.69799763, "num_input_tokens_seen": 355343875, "router_z_loss_clip": 1.43359375, "router_z_loss_mlp": 0.09698486, "step": 16469, "time_per_iteration": 2.5739593505859375 }, { "auxiliary_loss_clip": 0.0641707, "auxiliary_loss_mlp": 0.01264626, "balance_loss_clip": 0.06275906, "balance_loss_mlp": 0.01254636, "epoch": 0.990229971441455, "flos": 15964161189120.0, "grad_norm": 2.981520317601219, "language_loss": 0.70907027, "learning_rate": 9.950925847685976e-10, "loss": 0.78588724, "num_input_tokens_seen": 355358835, "router_z_loss_clip": 1.41015625, "router_z_loss_mlp": 0.09985352, "step": 16470, "time_per_iteration": 3.9055769443511963 }, { "auxiliary_loss_clip": 0.06320659, "auxiliary_loss_mlp": 0.01251647, "balance_loss_clip": 0.06265471, "balance_loss_mlp": 0.01250615, "epoch": 0.990290094694123, "flos": 69801322924800.0, "grad_norm": 0.6925033057662029, "language_loss": 0.55490351, "learning_rate": 9.828464112755509e-10, "loss": 0.63062656, "num_input_tokens_seen": 355431225, "router_z_loss_clip": 0.55126953, "router_z_loss_mlp": 0.01032257, "step": 16471, "time_per_iteration": 3.357696056365967 }, { "auxiliary_loss_clip": 0.06412053, "auxiliary_loss_mlp": 0.01266602, "balance_loss_clip": 0.06274216, "balance_loss_mlp": 0.01257208, "epoch": 0.9903502179467909, "flos": 16257894076800.0, "grad_norm": 2.127960495654915, "language_loss": 0.8437506, "learning_rate": 9.706760407131032e-10, "loss": 0.92053717, "num_input_tokens_seen": 355448250, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09387207, "step": 16472, "time_per_iteration": 2.5053553581237793 }, { "auxiliary_loss_clip": 0.06415591, "auxiliary_loss_mlp": 0.01265889, "balance_loss_clip": 0.06276402, "balance_loss_mlp": 0.01256448, "epoch": 0.9904103411994589, "flos": 21694671521280.0, "grad_norm": 1.7380194635481807, "language_loss": 0.86344171, "learning_rate": 9.585814735431075e-10, "loss": 0.94025648, "num_input_tokens_seen": 355467040, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09442139, "step": 16473, "time_per_iteration": 2.537484884262085 }, { "auxiliary_loss_clip": 0.06409004, "auxiliary_loss_mlp": 0.01263695, "balance_loss_clip": 0.06270878, "balance_loss_mlp": 0.01255404, "epoch": 0.9904704644521268, "flos": 25746584209920.0, "grad_norm": 1.5495413674706204, "language_loss": 0.84710813, "learning_rate": 9.465627102240859e-10, "loss": 0.9238351, "num_input_tokens_seen": 355487825, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.08294678, "step": 16474, "time_per_iteration": 2.579911947250366 }, { "auxiliary_loss_clip": 0.06409825, "auxiliary_loss_mlp": 0.0126354, "balance_loss_clip": 0.0627223, "balance_loss_mlp": 0.01254803, "epoch": 0.9905305877047949, "flos": 21914834924160.0, "grad_norm": 1.710553900318055, "language_loss": 0.76842928, "learning_rate": 9.346197512116738e-10, "loss": 0.84516299, "num_input_tokens_seen": 355507445, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.08740234, "step": 16475, "time_per_iteration": 2.5390472412109375 }, { "auxiliary_loss_clip": 0.06412852, "auxiliary_loss_mlp": 0.01262501, "balance_loss_clip": 0.06272951, "balance_loss_mlp": 0.01253542, "epoch": 0.9905907109574628, "flos": 21397961813760.0, "grad_norm": 1.4088770392113001, "language_loss": 0.75611746, "learning_rate": 9.227525969588423e-10, "loss": 0.83287096, "num_input_tokens_seen": 355527205, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.08959961, "step": 16476, "time_per_iteration": 2.5435400009155273 }, { "auxiliary_loss_clip": 0.06429391, "auxiliary_loss_mlp": 0.01264477, "balance_loss_clip": 0.06280817, "balance_loss_mlp": 0.01253515, "epoch": 0.9906508342101308, "flos": 20527831889280.0, "grad_norm": 2.1272375017713987, "language_loss": 0.67305613, "learning_rate": 9.109612479154538e-10, "loss": 0.74999487, "num_input_tokens_seen": 355544740, "router_z_loss_clip": 1.48632812, "router_z_loss_mlp": 0.10968018, "step": 16477, "time_per_iteration": 2.527484655380249 }, { "auxiliary_loss_clip": 0.0642063, "auxiliary_loss_mlp": 0.01265115, "balance_loss_clip": 0.06277815, "balance_loss_mlp": 0.01254458, "epoch": 0.9907109574627987, "flos": 21367633835520.0, "grad_norm": 2.073939666147212, "language_loss": 0.72186834, "learning_rate": 8.992457045289282e-10, "loss": 0.79872578, "num_input_tokens_seen": 355564385, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10662842, "step": 16478, "time_per_iteration": 2.52317476272583 }, { "auxiliary_loss_clip": 0.06412423, "auxiliary_loss_mlp": 0.01264477, "balance_loss_clip": 0.06272872, "balance_loss_mlp": 0.01254923, "epoch": 0.9907710807154667, "flos": 17342820743040.0, "grad_norm": 2.1609849971140043, "language_loss": 0.81662017, "learning_rate": 8.876059672433545e-10, "loss": 0.89338917, "num_input_tokens_seen": 355579260, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09558105, "step": 16479, "time_per_iteration": 2.4759042263031006 }, { "auxiliary_loss_clip": 0.06418796, "auxiliary_loss_mlp": 0.01263831, "balance_loss_clip": 0.06277354, "balance_loss_mlp": 0.01254706, "epoch": 0.9908312039681346, "flos": 28629518987520.0, "grad_norm": 1.4885444346867547, "language_loss": 0.66630173, "learning_rate": 8.760420364999355e-10, "loss": 0.74312794, "num_input_tokens_seen": 355599790, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.09130859, "step": 16480, "time_per_iteration": 2.579805612564087 }, { "auxiliary_loss_clip": 0.06409354, "auxiliary_loss_mlp": 0.01263948, "balance_loss_clip": 0.06272365, "balance_loss_mlp": 0.0125481, "epoch": 0.9908913272208026, "flos": 35779079341440.0, "grad_norm": 1.791515209047282, "language_loss": 0.72656107, "learning_rate": 8.645539127374313e-10, "loss": 0.80329406, "num_input_tokens_seen": 355620925, "router_z_loss_clip": 1.37011719, "router_z_loss_mlp": 0.09130859, "step": 16481, "time_per_iteration": 2.6507785320281982 }, { "auxiliary_loss_clip": 0.0641138, "auxiliary_loss_mlp": 0.01261702, "balance_loss_clip": 0.06275369, "balance_loss_mlp": 0.01252773, "epoch": 0.9909514504734707, "flos": 19908444908160.0, "grad_norm": 1.7244706536266095, "language_loss": 0.77668172, "learning_rate": 8.531415963912713e-10, "loss": 0.85341251, "num_input_tokens_seen": 355639165, "router_z_loss_clip": 1.36035156, "router_z_loss_mlp": 0.08929443, "step": 16482, "time_per_iteration": 2.5116376876831055 }, { "auxiliary_loss_clip": 0.06412537, "auxiliary_loss_mlp": 0.01264635, "balance_loss_clip": 0.0627322, "balance_loss_mlp": 0.01255415, "epoch": 0.9910115737261386, "flos": 20009910602880.0, "grad_norm": 1.8833679252294353, "language_loss": 0.75420976, "learning_rate": 8.418050878944427e-10, "loss": 0.83098143, "num_input_tokens_seen": 355657320, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09222412, "step": 16483, "time_per_iteration": 2.4965014457702637 }, { "auxiliary_loss_clip": 0.06315921, "auxiliary_loss_mlp": 0.01250065, "balance_loss_clip": 0.06260764, "balance_loss_mlp": 0.01249145, "epoch": 0.9910716969788066, "flos": 70708950351360.0, "grad_norm": 0.6869075719758873, "language_loss": 0.53662193, "learning_rate": 8.305443876768237e-10, "loss": 0.6122818, "num_input_tokens_seen": 355726370, "router_z_loss_clip": 0.55126953, "router_z_loss_mlp": 0.00917816, "step": 16484, "time_per_iteration": 3.270362377166748 }, { "auxiliary_loss_clip": 0.0640789, "auxiliary_loss_mlp": 0.01264919, "balance_loss_clip": 0.06274384, "balance_loss_mlp": 0.01256297, "epoch": 0.9911318202314745, "flos": 21440448633600.0, "grad_norm": 1.7080740627550208, "language_loss": 0.82253021, "learning_rate": 8.19359496165184e-10, "loss": 0.89925838, "num_input_tokens_seen": 355745840, "router_z_loss_clip": 1.33398438, "router_z_loss_mlp": 0.08621216, "step": 16485, "time_per_iteration": 2.5430545806884766 }, { "auxiliary_loss_clip": 0.06410843, "auxiliary_loss_mlp": 0.0126831, "balance_loss_clip": 0.06275035, "balance_loss_mlp": 0.0125919, "epoch": 0.9911919434841425, "flos": 19832653290240.0, "grad_norm": 1.6014798883109678, "language_loss": 0.81583858, "learning_rate": 8.082504137836288e-10, "loss": 0.89263004, "num_input_tokens_seen": 355763385, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.09124756, "step": 16486, "time_per_iteration": 2.5264275074005127 }, { "auxiliary_loss_clip": 0.06418886, "auxiliary_loss_mlp": 0.01263539, "balance_loss_clip": 0.06277683, "balance_loss_mlp": 0.0125445, "epoch": 0.9912520667368104, "flos": 41729040316800.0, "grad_norm": 1.3443298840158069, "language_loss": 0.66178668, "learning_rate": 7.972171409538209e-10, "loss": 0.73861098, "num_input_tokens_seen": 355786075, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09082031, "step": 16487, "time_per_iteration": 2.7124671936035156 }, { "auxiliary_loss_clip": 0.06406567, "auxiliary_loss_mlp": 0.0126399, "balance_loss_clip": 0.06271373, "balance_loss_mlp": 0.01255133, "epoch": 0.9913121899894785, "flos": 23776559665920.0, "grad_norm": 1.603336375815571, "language_loss": 0.76860595, "learning_rate": 7.862596780936481e-10, "loss": 0.84531152, "num_input_tokens_seen": 355806295, "router_z_loss_clip": 1.35253906, "router_z_loss_mlp": 0.08862305, "step": 16488, "time_per_iteration": 3.9904444217681885 }, { "auxiliary_loss_clip": 0.06421064, "auxiliary_loss_mlp": 0.01263353, "balance_loss_clip": 0.06275123, "balance_loss_mlp": 0.01253149, "epoch": 0.9913723132421464, "flos": 23776559665920.0, "grad_norm": 1.9763726301771114, "language_loss": 0.69034803, "learning_rate": 7.753780256190001e-10, "loss": 0.76719218, "num_input_tokens_seen": 355825730, "router_z_loss_clip": 1.45898438, "router_z_loss_mlp": 0.10217285, "step": 16489, "time_per_iteration": 2.549764394760132 }, { "auxiliary_loss_clip": 0.06315656, "auxiliary_loss_mlp": 0.01250324, "balance_loss_clip": 0.06260377, "balance_loss_mlp": 0.01249349, "epoch": 0.9914324364948144, "flos": 71287234104960.0, "grad_norm": 0.5966198943465845, "language_loss": 0.52495861, "learning_rate": 7.645721839424357e-10, "loss": 0.60061836, "num_input_tokens_seen": 355891545, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00975037, "step": 16490, "time_per_iteration": 3.2695298194885254 }, { "auxiliary_loss_clip": 0.06423114, "auxiliary_loss_mlp": 0.01269494, "balance_loss_clip": 0.06279166, "balance_loss_mlp": 0.0125926, "epoch": 0.9914925597474823, "flos": 23702109713280.0, "grad_norm": 1.5450074556521625, "language_loss": 0.75945795, "learning_rate": 7.538421534734052e-10, "loss": 0.83638406, "num_input_tokens_seen": 355909920, "router_z_loss_clip": 1.43945312, "router_z_loss_mlp": 0.10229492, "step": 16491, "time_per_iteration": 2.5632622241973877 }, { "auxiliary_loss_clip": 0.06422338, "auxiliary_loss_mlp": 0.01266418, "balance_loss_clip": 0.06280212, "balance_loss_mlp": 0.01255725, "epoch": 0.9915526830001503, "flos": 13437250410240.0, "grad_norm": 2.2709521654689255, "language_loss": 0.70732594, "learning_rate": 7.431879346191383e-10, "loss": 0.78421354, "num_input_tokens_seen": 355923130, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.10693359, "step": 16492, "time_per_iteration": 2.5562446117401123 }, { "auxiliary_loss_clip": 0.06411287, "auxiliary_loss_mlp": 0.01264508, "balance_loss_clip": 0.06273298, "balance_loss_mlp": 0.01254667, "epoch": 0.9916128062528182, "flos": 20747282532480.0, "grad_norm": 2.249432189051555, "language_loss": 0.68461871, "learning_rate": 7.326095277837563e-10, "loss": 0.76137662, "num_input_tokens_seen": 355941960, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09844971, "step": 16493, "time_per_iteration": 2.538262128829956 }, { "auxiliary_loss_clip": 0.06416029, "auxiliary_loss_mlp": 0.01265126, "balance_loss_clip": 0.06273557, "balance_loss_mlp": 0.01255667, "epoch": 0.9916729295054862, "flos": 22492825188480.0, "grad_norm": 1.6457744875851477, "language_loss": 0.71702594, "learning_rate": 7.221069333678276e-10, "loss": 0.79383743, "num_input_tokens_seen": 355961640, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09448242, "step": 16494, "time_per_iteration": 4.0979578495025635 }, { "auxiliary_loss_clip": 0.06418863, "auxiliary_loss_mlp": 0.0126298, "balance_loss_clip": 0.06276982, "balance_loss_mlp": 0.01252913, "epoch": 0.9917330527581543, "flos": 14797573119360.0, "grad_norm": 4.31712130161273, "language_loss": 0.683375, "learning_rate": 7.116801517701443e-10, "loss": 0.76019347, "num_input_tokens_seen": 355977980, "router_z_loss_clip": 1.41699219, "router_z_loss_mlp": 0.10070801, "step": 16495, "time_per_iteration": 2.4973602294921875 }, { "auxiliary_loss_clip": 0.06317462, "auxiliary_loss_mlp": 0.01251461, "balance_loss_clip": 0.06262295, "balance_loss_mlp": 0.01250458, "epoch": 0.9917931760108222, "flos": 59209551717120.0, "grad_norm": 0.699734567934256, "language_loss": 0.53464329, "learning_rate": 7.013291833859458e-10, "loss": 0.61033249, "num_input_tokens_seen": 356042900, "router_z_loss_clip": 0.55273438, "router_z_loss_mlp": 0.01002502, "step": 16496, "time_per_iteration": 3.2524521350860596 }, { "auxiliary_loss_clip": 0.06417458, "auxiliary_loss_mlp": 0.01266294, "balance_loss_clip": 0.06276078, "balance_loss_mlp": 0.01256143, "epoch": 0.9918532992634902, "flos": 26769052056960.0, "grad_norm": 1.8024631321178315, "language_loss": 0.71712238, "learning_rate": 6.91054028607585e-10, "loss": 0.79395992, "num_input_tokens_seen": 356063000, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.10150146, "step": 16497, "time_per_iteration": 2.573854923248291 }, { "auxiliary_loss_clip": 0.06418661, "auxiliary_loss_mlp": 0.01267516, "balance_loss_clip": 0.06274634, "balance_loss_mlp": 0.01257509, "epoch": 0.9919134225161581, "flos": 14980993706880.0, "grad_norm": 2.8200272865157032, "language_loss": 0.81770885, "learning_rate": 6.808546878249721e-10, "loss": 0.89457071, "num_input_tokens_seen": 356078130, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10021973, "step": 16498, "time_per_iteration": 2.5020217895507812 }, { "auxiliary_loss_clip": 0.06415694, "auxiliary_loss_mlp": 0.01264388, "balance_loss_clip": 0.06277323, "balance_loss_mlp": 0.01254744, "epoch": 0.9919735457688261, "flos": 27825537461760.0, "grad_norm": 1.7215623674951686, "language_loss": 0.68761683, "learning_rate": 6.707311614246869e-10, "loss": 0.76441765, "num_input_tokens_seen": 356101655, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09643555, "step": 16499, "time_per_iteration": 2.6029610633850098 }, { "auxiliary_loss_clip": 0.06417097, "auxiliary_loss_mlp": 0.01265098, "balance_loss_clip": 0.0627486, "balance_loss_mlp": 0.01255722, "epoch": 0.992033669021494, "flos": 22568994149760.0, "grad_norm": 1.931184082396353, "language_loss": 0.82592547, "learning_rate": 6.606834497904223e-10, "loss": 0.90274739, "num_input_tokens_seen": 356121425, "router_z_loss_clip": 1.42285156, "router_z_loss_mlp": 0.09381104, "step": 16500, "time_per_iteration": 2.521888017654419 }, { "auxiliary_loss_clip": 0.06412087, "auxiliary_loss_mlp": 0.01266786, "balance_loss_clip": 0.06272569, "balance_loss_mlp": 0.01256885, "epoch": 0.9920937922741621, "flos": 25381671678720.0, "grad_norm": 1.8523449508526064, "language_loss": 0.82194626, "learning_rate": 6.507115533036511e-10, "loss": 0.89873493, "num_input_tokens_seen": 356140710, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09899902, "step": 16501, "time_per_iteration": 2.5753889083862305 }, { "auxiliary_loss_clip": 0.06413984, "auxiliary_loss_mlp": 0.01265695, "balance_loss_clip": 0.06274737, "balance_loss_mlp": 0.01256265, "epoch": 0.99215391552683, "flos": 22061009571840.0, "grad_norm": 1.8981977299877597, "language_loss": 0.76562065, "learning_rate": 6.408154723420711e-10, "loss": 0.84241742, "num_input_tokens_seen": 356159835, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09436035, "step": 16502, "time_per_iteration": 2.5251874923706055 }, { "auxiliary_loss_clip": 0.06419977, "auxiliary_loss_mlp": 0.01264339, "balance_loss_clip": 0.0627588, "balance_loss_mlp": 0.01254236, "epoch": 0.992214038779498, "flos": 15419349941760.0, "grad_norm": 2.1431438827901954, "language_loss": 0.71935648, "learning_rate": 6.309952072811597e-10, "loss": 0.79619962, "num_input_tokens_seen": 356177555, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.10101318, "step": 16503, "time_per_iteration": 2.5372769832611084 }, { "auxiliary_loss_clip": 0.06317812, "auxiliary_loss_mlp": 0.01251729, "balance_loss_clip": 0.06262333, "balance_loss_mlp": 0.01250751, "epoch": 0.9922741620321659, "flos": 62035184701440.0, "grad_norm": 0.6407175476786473, "language_loss": 0.5513128, "learning_rate": 6.212507584932858e-10, "loss": 0.6270082, "num_input_tokens_seen": 356244975, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00977325, "step": 16504, "time_per_iteration": 3.2433114051818848 }, { "auxiliary_loss_clip": 0.06411198, "auxiliary_loss_mlp": 0.01264683, "balance_loss_clip": 0.06272726, "balance_loss_mlp": 0.01256088, "epoch": 0.9923342852848339, "flos": 17171223580800.0, "grad_norm": 1.7445235166140658, "language_loss": 0.70279711, "learning_rate": 6.115821263481536e-10, "loss": 0.77955592, "num_input_tokens_seen": 356262605, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.0859375, "step": 16505, "time_per_iteration": 2.5326430797576904 }, { "auxiliary_loss_clip": 0.06416531, "auxiliary_loss_mlp": 0.0126589, "balance_loss_clip": 0.06272969, "balance_loss_mlp": 0.01254887, "epoch": 0.9923944085375018, "flos": 23189555088000.0, "grad_norm": 1.9423252189904463, "language_loss": 0.66049916, "learning_rate": 6.019893112119146e-10, "loss": 0.73732334, "num_input_tokens_seen": 356278935, "router_z_loss_clip": 1.43554688, "router_z_loss_mlp": 0.10998535, "step": 16506, "time_per_iteration": 4.0204126834869385 }, { "auxiliary_loss_clip": 0.06411004, "auxiliary_loss_mlp": 0.01264201, "balance_loss_clip": 0.06272691, "balance_loss_mlp": 0.01254694, "epoch": 0.9924545317901698, "flos": 20820181184640.0, "grad_norm": 2.076387088368759, "language_loss": 0.63393098, "learning_rate": 5.924723134487219e-10, "loss": 0.71068299, "num_input_tokens_seen": 356295675, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09503174, "step": 16507, "time_per_iteration": 2.5295591354370117 }, { "auxiliary_loss_clip": 0.06412837, "auxiliary_loss_mlp": 0.01262917, "balance_loss_clip": 0.06271691, "balance_loss_mlp": 0.01253213, "epoch": 0.9925146550428379, "flos": 20089517581440.0, "grad_norm": 2.1178858926079385, "language_loss": 0.72608161, "learning_rate": 5.830311334193983e-10, "loss": 0.80283916, "num_input_tokens_seen": 356312885, "router_z_loss_clip": 1.41210938, "router_z_loss_mlp": 0.09698486, "step": 16508, "time_per_iteration": 2.528810739517212 }, { "auxiliary_loss_clip": 0.06413951, "auxiliary_loss_mlp": 0.01263362, "balance_loss_clip": 0.06274144, "balance_loss_mlp": 0.01253998, "epoch": 0.9925747782955058, "flos": 24980812945920.0, "grad_norm": 1.586871652425449, "language_loss": 0.704687, "learning_rate": 5.736657714818793e-10, "loss": 0.78146005, "num_input_tokens_seen": 356334070, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09356689, "step": 16509, "time_per_iteration": 2.582366704940796 }, { "auxiliary_loss_clip": 0.06417111, "auxiliary_loss_mlp": 0.01266503, "balance_loss_clip": 0.06276444, "balance_loss_mlp": 0.01256698, "epoch": 0.9926349015481738, "flos": 60485250931200.0, "grad_norm": 1.8368212566888864, "language_loss": 0.68480551, "learning_rate": 5.643762279912146e-10, "loss": 0.76164168, "num_input_tokens_seen": 356359410, "router_z_loss_clip": 1.40820312, "router_z_loss_mlp": 0.09802246, "step": 16510, "time_per_iteration": 4.149740219116211 }, { "auxiliary_loss_clip": 0.06418633, "auxiliary_loss_mlp": 0.01265327, "balance_loss_clip": 0.06278272, "balance_loss_mlp": 0.01255128, "epoch": 0.9926950248008417, "flos": 20748163000320.0, "grad_norm": 2.19609714416502, "language_loss": 0.82109851, "learning_rate": 5.551625032997886e-10, "loss": 0.89793813, "num_input_tokens_seen": 356378345, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.10192871, "step": 16511, "time_per_iteration": 2.542668581008911 }, { "auxiliary_loss_clip": 0.06411804, "auxiliary_loss_mlp": 0.01263856, "balance_loss_clip": 0.06275, "balance_loss_mlp": 0.01254832, "epoch": 0.9927551480535097, "flos": 24359874664320.0, "grad_norm": 1.777382163624662, "language_loss": 0.91683882, "learning_rate": 5.460245977570998e-10, "loss": 0.99359542, "num_input_tokens_seen": 356397345, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.09020996, "step": 16512, "time_per_iteration": 2.55564284324646 }, { "auxiliary_loss_clip": 0.06316211, "auxiliary_loss_mlp": 0.01250793, "balance_loss_clip": 0.06260949, "balance_loss_mlp": 0.01249827, "epoch": 0.9928152713061776, "flos": 71296751543040.0, "grad_norm": 0.6935248108993338, "language_loss": 0.55025864, "learning_rate": 5.369625117095378e-10, "loss": 0.6259287, "num_input_tokens_seen": 356459160, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.00964355, "step": 16513, "time_per_iteration": 3.289703607559204 }, { "auxiliary_loss_clip": 0.06412078, "auxiliary_loss_mlp": 0.01264725, "balance_loss_clip": 0.06274322, "balance_loss_mlp": 0.01254867, "epoch": 0.9928753945588457, "flos": 57821850650880.0, "grad_norm": 1.3025593768903103, "language_loss": 0.65194005, "learning_rate": 5.279762455006054e-10, "loss": 0.72870815, "num_input_tokens_seen": 356486405, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09863281, "step": 16514, "time_per_iteration": 2.8830180168151855 }, { "auxiliary_loss_clip": 0.0641695, "auxiliary_loss_mlp": 0.0126689, "balance_loss_clip": 0.06276642, "balance_loss_mlp": 0.01256036, "epoch": 0.9929355178115136, "flos": 19574363479680.0, "grad_norm": 1.922190198017857, "language_loss": 0.73415166, "learning_rate": 5.190657994713632e-10, "loss": 0.81099004, "num_input_tokens_seen": 356502905, "router_z_loss_clip": 1.40332031, "router_z_loss_mlp": 0.10858154, "step": 16515, "time_per_iteration": 2.514094829559326 }, { "auxiliary_loss_clip": 0.06417997, "auxiliary_loss_mlp": 0.01265077, "balance_loss_clip": 0.06279194, "balance_loss_mlp": 0.01255504, "epoch": 0.9929956410641816, "flos": 22971026839680.0, "grad_norm": 1.4834412365507403, "language_loss": 0.77663976, "learning_rate": 5.102311739593191e-10, "loss": 0.8534705, "num_input_tokens_seen": 356523830, "router_z_loss_clip": 1.38964844, "router_z_loss_mlp": 0.09558105, "step": 16516, "time_per_iteration": 2.60917067527771 }, { "auxiliary_loss_clip": 0.06409849, "auxiliary_loss_mlp": 0.0126631, "balance_loss_clip": 0.06272642, "balance_loss_mlp": 0.0125694, "epoch": 0.9930557643168495, "flos": 22573228780800.0, "grad_norm": 1.3993919853659387, "language_loss": 0.78041631, "learning_rate": 5.014723692997602e-10, "loss": 0.85717797, "num_input_tokens_seen": 356543965, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.09381104, "step": 16517, "time_per_iteration": 2.569716691970825 }, { "auxiliary_loss_clip": 0.06419489, "auxiliary_loss_mlp": 0.01265277, "balance_loss_clip": 0.06275117, "balance_loss_mlp": 0.01255425, "epoch": 0.9931158875695175, "flos": 17206624730880.0, "grad_norm": 1.9358230800675342, "language_loss": 0.67540956, "learning_rate": 4.927893858248655e-10, "loss": 0.75225723, "num_input_tokens_seen": 356561530, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.09857178, "step": 16518, "time_per_iteration": 2.510206699371338 }, { "auxiliary_loss_clip": 0.06318524, "auxiliary_loss_mlp": 0.01250588, "balance_loss_clip": 0.06263463, "balance_loss_mlp": 0.01249629, "epoch": 0.9931760108221854, "flos": 63729142369920.0, "grad_norm": 0.72820178598545, "language_loss": 0.53356153, "learning_rate": 4.84182223863483e-10, "loss": 0.60925263, "num_input_tokens_seen": 356616845, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 0.00958252, "step": 16519, "time_per_iteration": 3.0329599380493164 }, { "auxiliary_loss_clip": 0.06412176, "auxiliary_loss_mlp": 0.01264409, "balance_loss_clip": 0.06275022, "balance_loss_mlp": 0.0125523, "epoch": 0.9932361340748534, "flos": 15310253525760.0, "grad_norm": 2.3550483292498243, "language_loss": 0.60187083, "learning_rate": 4.756508837426842e-10, "loss": 0.67863667, "num_input_tokens_seen": 356633560, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.09173584, "step": 16520, "time_per_iteration": 2.5308244228363037 }, { "auxiliary_loss_clip": 0.06415763, "auxiliary_loss_mlp": 0.01265801, "balance_loss_clip": 0.06276374, "balance_loss_mlp": 0.01256354, "epoch": 0.9932962573275215, "flos": 36073776551040.0, "grad_norm": 1.978202163832175, "language_loss": 0.62709242, "learning_rate": 4.671953657853223e-10, "loss": 0.70390809, "num_input_tokens_seen": 356657600, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.09442139, "step": 16521, "time_per_iteration": 2.6939682960510254 }, { "auxiliary_loss_clip": 0.06417525, "auxiliary_loss_mlp": 0.01266077, "balance_loss_clip": 0.06275624, "balance_loss_mlp": 0.01255765, "epoch": 0.9933563805801894, "flos": 21476939886720.0, "grad_norm": 1.675240261891197, "language_loss": 0.74711049, "learning_rate": 4.5881567031225145e-10, "loss": 0.82394648, "num_input_tokens_seen": 356675880, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10308838, "step": 16522, "time_per_iteration": 2.535465717315674 }, { "auxiliary_loss_clip": 0.06411047, "auxiliary_loss_mlp": 0.01268002, "balance_loss_clip": 0.06274078, "balance_loss_mlp": 0.01257977, "epoch": 0.9934165038328574, "flos": 23993117343360.0, "grad_norm": 1.460285132658965, "language_loss": 0.73411226, "learning_rate": 4.5051179764143964e-10, "loss": 0.81090277, "num_input_tokens_seen": 356696000, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.10021973, "step": 16523, "time_per_iteration": 2.558443546295166 }, { "auxiliary_loss_clip": 0.06412004, "auxiliary_loss_mlp": 0.01262985, "balance_loss_clip": 0.06273462, "balance_loss_mlp": 0.01253662, "epoch": 0.9934766270855253, "flos": 21914206018560.0, "grad_norm": 1.4379311362196554, "language_loss": 0.71109998, "learning_rate": 4.422837480875241e-10, "loss": 0.78784984, "num_input_tokens_seen": 356716845, "router_z_loss_clip": 1.38574219, "router_z_loss_mlp": 0.09320068, "step": 16524, "time_per_iteration": 2.5460283756256104 }, { "auxiliary_loss_clip": 0.06412061, "auxiliary_loss_mlp": 0.01263558, "balance_loss_clip": 0.06273003, "balance_loss_mlp": 0.0125448, "epoch": 0.9935367503381933, "flos": 17134900035840.0, "grad_norm": 10.490701217564997, "language_loss": 0.79590309, "learning_rate": 4.341315219624775e-10, "loss": 0.87265927, "num_input_tokens_seen": 356732100, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09082031, "step": 16525, "time_per_iteration": 2.50067138671875 }, { "auxiliary_loss_clip": 0.06416604, "auxiliary_loss_mlp": 0.01265629, "balance_loss_clip": 0.0627889, "balance_loss_mlp": 0.01256211, "epoch": 0.9935968735908612, "flos": 22352813815680.0, "grad_norm": 1.8391911596336983, "language_loss": 0.75111854, "learning_rate": 4.2605511957582995e-10, "loss": 0.82794088, "num_input_tokens_seen": 356751480, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.09423828, "step": 16526, "time_per_iteration": 2.5419368743896484 }, { "auxiliary_loss_clip": 0.064078, "auxiliary_loss_mlp": 0.01267558, "balance_loss_clip": 0.06273429, "balance_loss_mlp": 0.01258307, "epoch": 0.9936569968435293, "flos": 29468230830720.0, "grad_norm": 3.1892584895088776, "language_loss": 0.72467935, "learning_rate": 4.180545412333369e-10, "loss": 0.80143291, "num_input_tokens_seen": 356772650, "router_z_loss_clip": 1.34472656, "router_z_loss_mlp": 0.0925293, "step": 16527, "time_per_iteration": 2.587726354598999 }, { "auxiliary_loss_clip": 0.06417979, "auxiliary_loss_mlp": 0.01265328, "balance_loss_clip": 0.06275992, "balance_loss_mlp": 0.01255618, "epoch": 0.9937171200961972, "flos": 16549488685440.0, "grad_norm": 1.967476806956771, "language_loss": 0.7637502, "learning_rate": 4.1012978723875547e-10, "loss": 0.84058332, "num_input_tokens_seen": 356788510, "router_z_loss_clip": 1.42089844, "router_z_loss_mlp": 0.09716797, "step": 16528, "time_per_iteration": 3.944841146469116 }, { "auxiliary_loss_clip": 0.06417641, "auxiliary_loss_mlp": 0.01265157, "balance_loss_clip": 0.06275497, "balance_loss_mlp": 0.01254876, "epoch": 0.9937772433488652, "flos": 24397330239360.0, "grad_norm": 2.369507337542031, "language_loss": 0.68997121, "learning_rate": 4.022808578922898e-10, "loss": 0.76679915, "num_input_tokens_seen": 356809115, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.10284424, "step": 16529, "time_per_iteration": 2.5853374004364014 }, { "auxiliary_loss_clip": 0.06424026, "auxiliary_loss_mlp": 0.01267313, "balance_loss_clip": 0.06280073, "balance_loss_mlp": 0.0125628, "epoch": 0.9938373666015331, "flos": 15675459546240.0, "grad_norm": 2.4787337160077043, "language_loss": 0.65395772, "learning_rate": 3.9450775349170186e-10, "loss": 0.73087108, "num_input_tokens_seen": 356826410, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.11047363, "step": 16530, "time_per_iteration": 2.5367815494537354 }, { "auxiliary_loss_clip": 0.06414095, "auxiliary_loss_mlp": 0.01264374, "balance_loss_clip": 0.06275092, "balance_loss_mlp": 0.01255267, "epoch": 0.9938974898542011, "flos": 19501590608640.0, "grad_norm": 2.1907199100913664, "language_loss": 0.71660852, "learning_rate": 3.8681047433186676e-10, "loss": 0.79339314, "num_input_tokens_seen": 356844990, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09112549, "step": 16531, "time_per_iteration": 2.551035165786743 }, { "auxiliary_loss_clip": 0.06415297, "auxiliary_loss_mlp": 0.01269658, "balance_loss_clip": 0.06274867, "balance_loss_mlp": 0.01259674, "epoch": 0.993957613106869, "flos": 26914220455680.0, "grad_norm": 1.4568139888730645, "language_loss": 0.74561572, "learning_rate": 3.791890207045512e-10, "loss": 0.8224653, "num_input_tokens_seen": 356866530, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09985352, "step": 16532, "time_per_iteration": 2.57118558883667 }, { "auxiliary_loss_clip": 0.06404242, "auxiliary_loss_mlp": 0.01262863, "balance_loss_clip": 0.06273145, "balance_loss_mlp": 0.0125459, "epoch": 0.994017736359537, "flos": 14944921724160.0, "grad_norm": 1.7905571782168872, "language_loss": 0.70586658, "learning_rate": 3.7164339289885717e-10, "loss": 0.78253764, "num_input_tokens_seen": 356884660, "router_z_loss_clip": 1.31152344, "router_z_loss_mlp": 0.08282471, "step": 16533, "time_per_iteration": 2.5649638175964355 }, { "auxiliary_loss_clip": 0.06415972, "auxiliary_loss_mlp": 0.01263457, "balance_loss_clip": 0.0627476, "balance_loss_mlp": 0.01253789, "epoch": 0.9940778596122051, "flos": 15383361813120.0, "grad_norm": 2.1635332892387877, "language_loss": 0.84425998, "learning_rate": 3.641735912007782e-10, "loss": 0.9210543, "num_input_tokens_seen": 356900895, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09667969, "step": 16534, "time_per_iteration": 3.9905238151550293 }, { "auxiliary_loss_clip": 0.06405256, "auxiliary_loss_mlp": 0.01264055, "balance_loss_clip": 0.06273922, "balance_loss_mlp": 0.01255388, "epoch": 0.994137982864873, "flos": 25235077760640.0, "grad_norm": 1.3069769074812991, "language_loss": 0.66055918, "learning_rate": 3.567796158934211e-10, "loss": 0.73725224, "num_input_tokens_seen": 356920985, "router_z_loss_clip": 1.31347656, "router_z_loss_mlp": 0.08666992, "step": 16535, "time_per_iteration": 2.5856375694274902 }, { "auxiliary_loss_clip": 0.06412797, "auxiliary_loss_mlp": 0.01261906, "balance_loss_clip": 0.06276846, "balance_loss_mlp": 0.01253639, "epoch": 0.994198106117541, "flos": 18448040096640.0, "grad_norm": 1.907708343147979, "language_loss": 0.65044802, "learning_rate": 3.4946146725767235e-10, "loss": 0.72719502, "num_input_tokens_seen": 356939800, "router_z_loss_clip": 1.36035156, "router_z_loss_mlp": 0.08276367, "step": 16536, "time_per_iteration": 2.514932155609131 }, { "auxiliary_loss_clip": 0.06407461, "auxiliary_loss_mlp": 0.01268612, "balance_loss_clip": 0.06271514, "balance_loss_mlp": 0.0125932, "epoch": 0.9942582293702089, "flos": 16659675204480.0, "grad_norm": 1.848561271584233, "language_loss": 0.78577965, "learning_rate": 3.4221914557064357e-10, "loss": 0.86254042, "num_input_tokens_seen": 356957780, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.09289551, "step": 16537, "time_per_iteration": 2.518523931503296 }, { "auxiliary_loss_clip": 0.06422174, "auxiliary_loss_mlp": 0.01263901, "balance_loss_clip": 0.06275539, "balance_loss_mlp": 0.01253625, "epoch": 0.9943183526228769, "flos": 21951032688000.0, "grad_norm": 1.6347310617572088, "language_loss": 0.68759197, "learning_rate": 3.35052651107004e-10, "loss": 0.76445276, "num_input_tokens_seen": 356979185, "router_z_loss_clip": 1.46582031, "router_z_loss_mlp": 0.10284424, "step": 16538, "time_per_iteration": 2.5424277782440186 }, { "auxiliary_loss_clip": 0.06406428, "auxiliary_loss_mlp": 0.01265767, "balance_loss_clip": 0.06273709, "balance_loss_mlp": 0.01257029, "epoch": 0.9943784758755448, "flos": 23849458318080.0, "grad_norm": 2.5246946689737904, "language_loss": 0.75662428, "learning_rate": 3.2796198413853614e-10, "loss": 0.83334625, "num_input_tokens_seen": 356997735, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.08728027, "step": 16539, "time_per_iteration": 2.7643377780914307 }, { "auxiliary_loss_clip": 0.06420685, "auxiliary_loss_mlp": 0.01263209, "balance_loss_clip": 0.06279335, "balance_loss_mlp": 0.01254077, "epoch": 0.9944385991282129, "flos": 21476310981120.0, "grad_norm": 2.1062168581011966, "language_loss": 0.70884061, "learning_rate": 3.209471449341361e-10, "loss": 0.78567946, "num_input_tokens_seen": 357015660, "router_z_loss_clip": 1.41308594, "router_z_loss_mlp": 0.09136963, "step": 16540, "time_per_iteration": 2.5361785888671875 }, { "auxiliary_loss_clip": 0.06407997, "auxiliary_loss_mlp": 0.01260725, "balance_loss_clip": 0.06271299, "balance_loss_mlp": 0.01252029, "epoch": 0.9944987223808808, "flos": 22933193921280.0, "grad_norm": 2.39524349861116, "language_loss": 0.75290263, "learning_rate": 3.140081337600353e-10, "loss": 0.82958984, "num_input_tokens_seen": 357034800, "router_z_loss_clip": 1.36816406, "router_z_loss_mlp": 0.0869751, "step": 16541, "time_per_iteration": 2.533907651901245 }, { "auxiliary_loss_clip": 0.06412235, "auxiliary_loss_mlp": 0.01267571, "balance_loss_clip": 0.06273589, "balance_loss_mlp": 0.01257372, "epoch": 0.9945588456335488, "flos": 22389640485120.0, "grad_norm": 1.6910162926028856, "language_loss": 0.76983047, "learning_rate": 3.0714495087891255e-10, "loss": 0.84662855, "num_input_tokens_seen": 357053785, "router_z_loss_clip": 1.38671875, "router_z_loss_mlp": 0.10198975, "step": 16542, "time_per_iteration": 2.5452075004577637 }, { "auxiliary_loss_clip": 0.0641898, "auxiliary_loss_mlp": 0.01264171, "balance_loss_clip": 0.06278217, "balance_loss_mlp": 0.01253931, "epoch": 0.9946189688862167, "flos": 21403915453440.0, "grad_norm": 2.283227744001861, "language_loss": 0.75423932, "learning_rate": 3.0035759655122615e-10, "loss": 0.8310709, "num_input_tokens_seen": 357072025, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.10241699, "step": 16543, "time_per_iteration": 2.5204591751098633 }, { "auxiliary_loss_clip": 0.06423169, "auxiliary_loss_mlp": 0.01264173, "balance_loss_clip": 0.06278963, "balance_loss_mlp": 0.01254171, "epoch": 0.9946790921388847, "flos": 12420526567680.0, "grad_norm": 2.470422558545765, "language_loss": 0.82256585, "learning_rate": 2.9364607103454785e-10, "loss": 0.89943922, "num_input_tokens_seen": 357086960, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.09997559, "step": 16544, "time_per_iteration": 2.5046169757843018 }, { "auxiliary_loss_clip": 0.06412773, "auxiliary_loss_mlp": 0.01263707, "balance_loss_clip": 0.06275333, "balance_loss_mlp": 0.01254505, "epoch": 0.9947392153915526, "flos": 19063611717120.0, "grad_norm": 1.7912914523497478, "language_loss": 0.79028857, "learning_rate": 2.870103745831187e-10, "loss": 0.86705339, "num_input_tokens_seen": 357105095, "router_z_loss_clip": 1.37695312, "router_z_loss_mlp": 0.09197998, "step": 16545, "time_per_iteration": 2.504661798477173 }, { "auxiliary_loss_clip": 0.06418665, "auxiliary_loss_mlp": 0.01264236, "balance_loss_clip": 0.06277185, "balance_loss_mlp": 0.01254967, "epoch": 0.9947993386442207, "flos": 27316295072640.0, "grad_norm": 1.635865206558022, "language_loss": 0.72143513, "learning_rate": 2.8045050744873733e-10, "loss": 0.79826415, "num_input_tokens_seen": 357125065, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09265137, "step": 16546, "time_per_iteration": 4.013949632644653 }, { "auxiliary_loss_clip": 0.06412321, "auxiliary_loss_mlp": 0.01262802, "balance_loss_clip": 0.06275716, "balance_loss_mlp": 0.01253831, "epoch": 0.9948594618968887, "flos": 20811586141440.0, "grad_norm": 2.3897756455513908, "language_loss": 0.77644372, "learning_rate": 2.739664698798716e-10, "loss": 0.85319495, "num_input_tokens_seen": 357141600, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.08959961, "step": 16547, "time_per_iteration": 2.530975341796875 }, { "auxiliary_loss_clip": 0.06418385, "auxiliary_loss_mlp": 0.01263957, "balance_loss_clip": 0.06277956, "balance_loss_mlp": 0.0125507, "epoch": 0.9949195851495566, "flos": 23299364263680.0, "grad_norm": 2.2856474225309995, "language_loss": 0.70207822, "learning_rate": 2.67558262122769e-10, "loss": 0.77890158, "num_input_tokens_seen": 357157880, "router_z_loss_clip": 1.40429688, "router_z_loss_mlp": 0.08880615, "step": 16548, "time_per_iteration": 2.5476903915405273 }, { "auxiliary_loss_clip": 0.06412663, "auxiliary_loss_mlp": 0.01261783, "balance_loss_clip": 0.06274719, "balance_loss_mlp": 0.01252747, "epoch": 0.9949797084022246, "flos": 18521441873280.0, "grad_norm": 1.646032168030992, "language_loss": 0.75393945, "learning_rate": 2.6122588442012427e-10, "loss": 0.83068389, "num_input_tokens_seen": 357176705, "router_z_loss_clip": 1.37988281, "router_z_loss_mlp": 0.09033203, "step": 16549, "time_per_iteration": 3.7852578163146973 }, { "auxiliary_loss_clip": 0.06418619, "auxiliary_loss_mlp": 0.01266372, "balance_loss_clip": 0.06275946, "balance_loss_mlp": 0.01255959, "epoch": 0.9950398316548925, "flos": 30415326330240.0, "grad_norm": 1.5581287894279567, "language_loss": 0.74740285, "learning_rate": 2.5496933701241177e-10, "loss": 0.82425284, "num_input_tokens_seen": 357197630, "router_z_loss_clip": 1.42773438, "router_z_loss_mlp": 0.10406494, "step": 16550, "time_per_iteration": 2.5896520614624023 }, { "auxiliary_loss_clip": 0.06412856, "auxiliary_loss_mlp": 0.0126392, "balance_loss_clip": 0.06274598, "balance_loss_mlp": 0.0125523, "epoch": 0.9950999549075605, "flos": 19906893607680.0, "grad_norm": 1.404104565458959, "language_loss": 0.78111207, "learning_rate": 2.4878862013655297e-10, "loss": 0.85787988, "num_input_tokens_seen": 357215445, "router_z_loss_clip": 1.38183594, "router_z_loss_mlp": 0.08691406, "step": 16551, "time_per_iteration": 2.603128433227539 }, { "auxiliary_loss_clip": 0.06406831, "auxiliary_loss_mlp": 0.01266356, "balance_loss_clip": 0.06276838, "balance_loss_mlp": 0.01258289, "epoch": 0.9951600781602284, "flos": 17609412107520.0, "grad_norm": 1.3768523013115839, "language_loss": 0.66580576, "learning_rate": 2.426837340270271e-10, "loss": 0.74253762, "num_input_tokens_seen": 357234285, "router_z_loss_clip": 1.29785156, "router_z_loss_mlp": 0.08065796, "step": 16552, "time_per_iteration": 2.5438694953918457 }, { "auxiliary_loss_clip": 0.06413333, "auxiliary_loss_mlp": 0.01265318, "balance_loss_clip": 0.06272899, "balance_loss_mlp": 0.01255912, "epoch": 0.9952202014128965, "flos": 28958485317120.0, "grad_norm": 1.3345552116418538, "language_loss": 0.81665128, "learning_rate": 2.3665467891520465e-10, "loss": 0.89343774, "num_input_tokens_seen": 357257565, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09405518, "step": 16553, "time_per_iteration": 2.612623691558838 }, { "auxiliary_loss_clip": 0.063194, "auxiliary_loss_mlp": 0.01252953, "balance_loss_clip": 0.0626416, "balance_loss_mlp": 0.01251949, "epoch": 0.9952803246655644, "flos": 70833014720640.0, "grad_norm": 0.706459844650445, "language_loss": 0.57309365, "learning_rate": 2.3070145503001348e-10, "loss": 0.64881724, "num_input_tokens_seen": 357320205, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 0.01004028, "step": 16554, "time_per_iteration": 3.252598285675049 }, { "auxiliary_loss_clip": 0.06418547, "auxiliary_loss_mlp": 0.01267656, "balance_loss_clip": 0.06277344, "balance_loss_mlp": 0.01258704, "epoch": 0.9953404479182324, "flos": 21805570800000.0, "grad_norm": 1.6347506606449482, "language_loss": 0.77385128, "learning_rate": 2.24824062597051e-10, "loss": 0.85071331, "num_input_tokens_seen": 357340695, "router_z_loss_clip": 1.41113281, "router_z_loss_mlp": 0.08959961, "step": 16555, "time_per_iteration": 2.541984796524048 }, { "auxiliary_loss_clip": 0.06413866, "auxiliary_loss_mlp": 0.0126538, "balance_loss_clip": 0.06275748, "balance_loss_mlp": 0.01256081, "epoch": 0.9954005711709003, "flos": 21942647280000.0, "grad_norm": 2.2218853689194176, "language_loss": 0.86041659, "learning_rate": 2.1902250183902793e-10, "loss": 0.93720901, "num_input_tokens_seen": 357357505, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09295654, "step": 16556, "time_per_iteration": 2.5402419567108154 }, { "auxiliary_loss_clip": 0.06407676, "auxiliary_loss_mlp": 0.01264718, "balance_loss_clip": 0.06272803, "balance_loss_mlp": 0.01255336, "epoch": 0.9954606944235683, "flos": 19360656840960.0, "grad_norm": 1.5791871468847223, "language_loss": 0.73440528, "learning_rate": 2.132967729762125e-10, "loss": 0.81112921, "num_input_tokens_seen": 357375395, "router_z_loss_clip": 1.34863281, "router_z_loss_mlp": 0.09387207, "step": 16557, "time_per_iteration": 2.5161120891571045 }, { "auxiliary_loss_clip": 0.06410331, "auxiliary_loss_mlp": 0.01263729, "balance_loss_clip": 0.06276022, "balance_loss_mlp": 0.01255203, "epoch": 0.9955208176762362, "flos": 30526477171200.0, "grad_norm": 1.6690100033975432, "language_loss": 0.76667285, "learning_rate": 2.0764687622554233e-10, "loss": 0.84341347, "num_input_tokens_seen": 357397375, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.08529663, "step": 16558, "time_per_iteration": 2.6081011295318604 }, { "auxiliary_loss_clip": 0.06412807, "auxiliary_loss_mlp": 0.01263334, "balance_loss_clip": 0.06273058, "balance_loss_mlp": 0.01253362, "epoch": 0.9955809409289043, "flos": 30016102752000.0, "grad_norm": 1.8745787574982797, "language_loss": 0.64174283, "learning_rate": 2.0207281180129044e-10, "loss": 0.71850431, "num_input_tokens_seen": 357418880, "router_z_loss_clip": 1.39941406, "router_z_loss_mlp": 0.09967041, "step": 16559, "time_per_iteration": 2.6030824184417725 }, { "auxiliary_loss_clip": 0.06411846, "auxiliary_loss_mlp": 0.0126741, "balance_loss_clip": 0.06274429, "balance_loss_mlp": 0.01257689, "epoch": 0.9956410641815723, "flos": 21549670830720.0, "grad_norm": 1.83719743579433, "language_loss": 0.74392909, "learning_rate": 1.965745799148433e-10, "loss": 0.82072163, "num_input_tokens_seen": 357438310, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.097229, "step": 16560, "time_per_iteration": 2.5400454998016357 }, { "auxiliary_loss_clip": 0.0641345, "auxiliary_loss_mlp": 0.01265079, "balance_loss_clip": 0.06276263, "balance_loss_mlp": 0.01256145, "epoch": 0.9957011874342402, "flos": 21695929332480.0, "grad_norm": 1.7521590026737062, "language_loss": 0.79584372, "learning_rate": 1.9115218077470073e-10, "loss": 0.87262905, "num_input_tokens_seen": 357457155, "router_z_loss_clip": 1.37109375, "router_z_loss_mlp": 0.08929443, "step": 16561, "time_per_iteration": 2.5849571228027344 }, { "auxiliary_loss_clip": 0.06409304, "auxiliary_loss_mlp": 0.01263101, "balance_loss_clip": 0.06274077, "balance_loss_mlp": 0.01254518, "epoch": 0.9957613106869082, "flos": 17706810879360.0, "grad_norm": 2.2378281585051356, "language_loss": 0.65774029, "learning_rate": 1.8580561458647614e-10, "loss": 0.73446435, "num_input_tokens_seen": 357468060, "router_z_loss_clip": 1.35351562, "router_z_loss_mlp": 0.08587646, "step": 16562, "time_per_iteration": 2.6501262187957764 }, { "auxiliary_loss_clip": 0.06415334, "auxiliary_loss_mlp": 0.01271315, "balance_loss_clip": 0.06273794, "balance_loss_mlp": 0.01261349, "epoch": 0.9958214339395761, "flos": 30564016600320.0, "grad_norm": 1.8437779673504593, "language_loss": 0.64500654, "learning_rate": 1.805348815528962e-10, "loss": 0.72187304, "num_input_tokens_seen": 357489665, "router_z_loss_clip": 1.41503906, "router_z_loss_mlp": 0.09960938, "step": 16563, "time_per_iteration": 2.613024950027466 }, { "auxiliary_loss_clip": 0.06416532, "auxiliary_loss_mlp": 0.01266365, "balance_loss_clip": 0.06277022, "balance_loss_mlp": 0.01257084, "epoch": 0.9958815571922441, "flos": 24175825171200.0, "grad_norm": 1.4913455032146659, "language_loss": 0.65043902, "learning_rate": 1.7533998187380105e-10, "loss": 0.72726798, "num_input_tokens_seen": 357511975, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.09277344, "step": 16564, "time_per_iteration": 2.594846248626709 }, { "auxiliary_loss_clip": 0.06412475, "auxiliary_loss_mlp": 0.01263582, "balance_loss_clip": 0.06275299, "balance_loss_mlp": 0.01253926, "epoch": 0.995941680444912, "flos": 15492458229120.0, "grad_norm": 1.7351638832314553, "language_loss": 0.74394953, "learning_rate": 1.7022091574636633e-10, "loss": 0.82071012, "num_input_tokens_seen": 357529345, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09649658, "step": 16565, "time_per_iteration": 2.5070855617523193 }, { "auxiliary_loss_clip": 0.06410469, "auxiliary_loss_mlp": 0.01265841, "balance_loss_clip": 0.06273177, "balance_loss_mlp": 0.01256864, "epoch": 0.9960018036975801, "flos": 18626597147520.0, "grad_norm": 1.8373717226137325, "language_loss": 0.79799497, "learning_rate": 1.6517768336443694e-10, "loss": 0.87475812, "num_input_tokens_seen": 357547615, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.08978271, "step": 16566, "time_per_iteration": 2.543191432952881 }, { "auxiliary_loss_clip": 0.06408236, "auxiliary_loss_mlp": 0.01267392, "balance_loss_clip": 0.06273268, "balance_loss_mlp": 0.01259154, "epoch": 0.996061926950248, "flos": 20090314195200.0, "grad_norm": 2.0931325009724064, "language_loss": 0.7142542, "learning_rate": 1.6021028491941535e-10, "loss": 0.7910105, "num_input_tokens_seen": 357567380, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.08230591, "step": 16567, "time_per_iteration": 2.531636953353882 }, { "auxiliary_loss_clip": 0.06419089, "auxiliary_loss_mlp": 0.01265969, "balance_loss_clip": 0.06277132, "balance_loss_mlp": 0.01255961, "epoch": 0.996122050202916, "flos": 24353879097600.0, "grad_norm": 2.308503065070397, "language_loss": 0.78952318, "learning_rate": 1.5531872059959538e-10, "loss": 0.86637378, "num_input_tokens_seen": 357586435, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.10009766, "step": 16568, "time_per_iteration": 4.1178014278411865 }, { "auxiliary_loss_clip": 0.06407161, "auxiliary_loss_mlp": 0.01266614, "balance_loss_clip": 0.06274211, "balance_loss_mlp": 0.01258406, "epoch": 0.9961821734555839, "flos": 24204895338240.0, "grad_norm": 1.8235115631056693, "language_loss": 0.82333219, "learning_rate": 1.5050299059060634e-10, "loss": 0.90006995, "num_input_tokens_seen": 357604720, "router_z_loss_clip": 1.32910156, "router_z_loss_mlp": 0.08203125, "step": 16569, "time_per_iteration": 2.5585949420928955 }, { "auxiliary_loss_clip": 0.06410584, "auxiliary_loss_mlp": 0.01268112, "balance_loss_clip": 0.06276083, "balance_loss_mlp": 0.01259112, "epoch": 0.9962422967082519, "flos": 22639628741760.0, "grad_norm": 1.746298343852877, "language_loss": 0.70164376, "learning_rate": 1.457630950747468e-10, "loss": 0.7784307, "num_input_tokens_seen": 357622345, "router_z_loss_clip": 1.34570312, "router_z_loss_mlp": 0.09002686, "step": 16570, "time_per_iteration": 2.5410454273223877 }, { "auxiliary_loss_clip": 0.06414153, "auxiliary_loss_mlp": 0.01265423, "balance_loss_clip": 0.06275788, "balance_loss_mlp": 0.01256089, "epoch": 0.9963024199609198, "flos": 26403259057920.0, "grad_norm": 1.5634449958667103, "language_loss": 0.75139886, "learning_rate": 1.4109903423209502e-10, "loss": 0.82819462, "num_input_tokens_seen": 357642710, "router_z_loss_clip": 1.38378906, "router_z_loss_mlp": 0.09332275, "step": 16571, "time_per_iteration": 2.551980495452881 }, { "auxiliary_loss_clip": 0.06411296, "auxiliary_loss_mlp": 0.01264533, "balance_loss_clip": 0.06273875, "balance_loss_mlp": 0.01255187, "epoch": 0.9963625432135879, "flos": 16587153895680.0, "grad_norm": 1.732010761036275, "language_loss": 0.79873639, "learning_rate": 1.3651080823939843e-10, "loss": 0.87549472, "num_input_tokens_seen": 357659870, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09344482, "step": 16572, "time_per_iteration": 2.4988794326782227 }, { "auxiliary_loss_clip": 0.06411651, "auxiliary_loss_mlp": 0.01264336, "balance_loss_clip": 0.06275319, "balance_loss_mlp": 0.0125483, "epoch": 0.9964226664662559, "flos": 26475696512640.0, "grad_norm": 1.920466312693879, "language_loss": 0.70667148, "learning_rate": 1.3199841727074e-10, "loss": 0.78343141, "num_input_tokens_seen": 357677075, "router_z_loss_clip": 1.36425781, "router_z_loss_mlp": 0.09503174, "step": 16573, "time_per_iteration": 3.9950742721557617 }, { "auxiliary_loss_clip": 0.06419235, "auxiliary_loss_mlp": 0.01265926, "balance_loss_clip": 0.06275209, "balance_loss_mlp": 0.01255441, "epoch": 0.9964827897189238, "flos": 27454755144960.0, "grad_norm": 1.7878825846068864, "language_loss": 0.63720381, "learning_rate": 1.275618614968721e-10, "loss": 0.71405542, "num_input_tokens_seen": 357696715, "router_z_loss_clip": 1.44238281, "router_z_loss_mlp": 0.1048584, "step": 16574, "time_per_iteration": 2.584071159362793 }, { "auxiliary_loss_clip": 0.06423879, "auxiliary_loss_mlp": 0.01267172, "balance_loss_clip": 0.06279501, "balance_loss_mlp": 0.01257075, "epoch": 0.9965429129715918, "flos": 11725138333440.0, "grad_norm": 2.2389804115108287, "language_loss": 0.7666887, "learning_rate": 1.2320114108654856e-10, "loss": 0.8435992, "num_input_tokens_seen": 357712345, "router_z_loss_clip": 1.44335938, "router_z_loss_mlp": 0.10095215, "step": 16575, "time_per_iteration": 2.509018898010254 }, { "auxiliary_loss_clip": 0.06417348, "auxiliary_loss_mlp": 0.01264637, "balance_loss_clip": 0.06278215, "balance_loss_mlp": 0.01254641, "epoch": 0.9966030362242597, "flos": 19762186406400.0, "grad_norm": 1.6561234050400582, "language_loss": 0.70355237, "learning_rate": 1.1891625620474855e-10, "loss": 0.78037226, "num_input_tokens_seen": 357731815, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09997559, "step": 16576, "time_per_iteration": 2.5197694301605225 }, { "auxiliary_loss_clip": 0.06409934, "auxiliary_loss_mlp": 0.01264685, "balance_loss_clip": 0.06276716, "balance_loss_mlp": 0.01255428, "epoch": 0.9966631594769277, "flos": 23922021553920.0, "grad_norm": 1.6010521307653398, "language_loss": 0.71855438, "learning_rate": 1.1470720701400871e-10, "loss": 0.79530048, "num_input_tokens_seen": 357751640, "router_z_loss_clip": 1.33398438, "router_z_loss_mlp": 0.09259033, "step": 16577, "time_per_iteration": 2.5730857849121094 }, { "auxiliary_loss_clip": 0.06414382, "auxiliary_loss_mlp": 0.01267435, "balance_loss_clip": 0.06275929, "balance_loss_mlp": 0.01258483, "epoch": 0.9967232827295956, "flos": 15564979537920.0, "grad_norm": 2.1177582471531453, "language_loss": 0.79178727, "learning_rate": 1.1057399367397912e-10, "loss": 0.86860543, "num_input_tokens_seen": 357769850, "router_z_loss_clip": 1.38476562, "router_z_loss_mlp": 0.08947754, "step": 16578, "time_per_iteration": 2.5338339805603027 }, { "auxiliary_loss_clip": 0.06413777, "auxiliary_loss_mlp": 0.01263749, "balance_loss_clip": 0.06274278, "balance_loss_mlp": 0.0125457, "epoch": 0.9967834059822637, "flos": 20819216862720.0, "grad_norm": 1.6259725669628173, "language_loss": 0.75808179, "learning_rate": 1.0651661634142328e-10, "loss": 0.83485711, "num_input_tokens_seen": 357789550, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09173584, "step": 16579, "time_per_iteration": 2.5377304553985596 }, { "auxiliary_loss_clip": 0.06417719, "auxiliary_loss_mlp": 0.01267978, "balance_loss_clip": 0.0627677, "balance_loss_mlp": 0.01257244, "epoch": 0.9968435292349316, "flos": 36727809995520.0, "grad_norm": 1.982384004787474, "language_loss": 0.6970396, "learning_rate": 1.0253507516999604e-10, "loss": 0.77389657, "num_input_tokens_seen": 357809525, "router_z_loss_clip": 1.40917969, "router_z_loss_mlp": 0.10736084, "step": 16580, "time_per_iteration": 2.676011085510254 }, { "auxiliary_loss_clip": 0.06413854, "auxiliary_loss_mlp": 0.01262433, "balance_loss_clip": 0.0627425, "balance_loss_mlp": 0.01253504, "epoch": 0.9969036524875996, "flos": 26768213516160.0, "grad_norm": 2.59724285920231, "language_loss": 0.80210638, "learning_rate": 9.862937031113184e-11, "loss": 0.87886918, "num_input_tokens_seen": 357829795, "router_z_loss_clip": 1.39453125, "router_z_loss_mlp": 0.08935547, "step": 16581, "time_per_iteration": 2.5792040824890137 }, { "auxiliary_loss_clip": 0.06410065, "auxiliary_loss_mlp": 0.01264063, "balance_loss_clip": 0.06273345, "balance_loss_mlp": 0.0125523, "epoch": 0.9969637757402675, "flos": 24834219027840.0, "grad_norm": 1.7389654776937813, "language_loss": 0.80179685, "learning_rate": 9.479950191249031e-11, "loss": 0.87853819, "num_input_tokens_seen": 357851655, "router_z_loss_clip": 1.36621094, "router_z_loss_mlp": 0.08831787, "step": 16582, "time_per_iteration": 2.575791120529175 }, { "auxiliary_loss_clip": 0.06410038, "auxiliary_loss_mlp": 0.01262132, "balance_loss_clip": 0.06275763, "balance_loss_mlp": 0.01253353, "epoch": 0.9970238989929355, "flos": 23045309084160.0, "grad_norm": 1.689656855849271, "language_loss": 0.60697871, "learning_rate": 9.104547011951069e-11, "loss": 0.68370038, "num_input_tokens_seen": 357871205, "router_z_loss_clip": 1.34277344, "router_z_loss_mlp": 0.08764648, "step": 16583, "time_per_iteration": 2.547081470489502 }, { "auxiliary_loss_clip": 0.06415441, "auxiliary_loss_mlp": 0.01264266, "balance_loss_clip": 0.06275585, "balance_loss_mlp": 0.01254836, "epoch": 0.9970840222456034, "flos": 25305418863360.0, "grad_norm": 1.5476206652892524, "language_loss": 0.78133601, "learning_rate": 8.736727507452357e-11, "loss": 0.85813308, "num_input_tokens_seen": 357892145, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09423828, "step": 16584, "time_per_iteration": 2.608205556869507 }, { "auxiliary_loss_clip": 0.06415118, "auxiliary_loss_mlp": 0.01266421, "balance_loss_clip": 0.06279016, "balance_loss_mlp": 0.01257778, "epoch": 0.9971441454982715, "flos": 21621898650240.0, "grad_norm": 1.3543262973154806, "language_loss": 0.69590378, "learning_rate": 8.376491691697297e-11, "loss": 0.77271914, "num_input_tokens_seen": 357911205, "router_z_loss_clip": 1.36035156, "router_z_loss_mlp": 0.08648682, "step": 16585, "time_per_iteration": 3.97949481010437 }, { "auxiliary_loss_clip": 0.06410498, "auxiliary_loss_mlp": 0.01263034, "balance_loss_clip": 0.06275867, "balance_loss_mlp": 0.01254021, "epoch": 0.9972042687509394, "flos": 14980867925760.0, "grad_norm": 3.127212783568863, "language_loss": 0.813227, "learning_rate": 8.023839578363834e-11, "loss": 0.88996232, "num_input_tokens_seen": 357928190, "router_z_loss_clip": 1.34667969, "router_z_loss_mlp": 0.09014893, "step": 16586, "time_per_iteration": 2.5268330574035645 }, { "auxiliary_loss_clip": 0.06414136, "auxiliary_loss_mlp": 0.01263717, "balance_loss_clip": 0.0627515, "balance_loss_mlp": 0.01254538, "epoch": 0.9972643920036074, "flos": 25812858389760.0, "grad_norm": 1.5834157783479843, "language_loss": 0.78122866, "learning_rate": 7.678771180796851e-11, "loss": 0.85800713, "num_input_tokens_seen": 357946985, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09185791, "step": 16587, "time_per_iteration": 2.732484817504883 }, { "auxiliary_loss_clip": 0.06417668, "auxiliary_loss_mlp": 0.01266333, "balance_loss_clip": 0.06276977, "balance_loss_mlp": 0.01256993, "epoch": 0.9973245152562754, "flos": 23332124010240.0, "grad_norm": 1.6281394881412903, "language_loss": 0.72730708, "learning_rate": 7.341286512074773e-11, "loss": 0.80414706, "num_input_tokens_seen": 357966720, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.09344482, "step": 16588, "time_per_iteration": 2.594585657119751 }, { "auxiliary_loss_clip": 0.06419817, "auxiliary_loss_mlp": 0.01263869, "balance_loss_clip": 0.06274927, "balance_loss_mlp": 0.01253916, "epoch": 0.9973846385089433, "flos": 12170999508480.0, "grad_norm": 2.5051492441663696, "language_loss": 0.83159381, "learning_rate": 7.011385585031781e-11, "loss": 0.9084307, "num_input_tokens_seen": 357981375, "router_z_loss_clip": 1.44824219, "router_z_loss_mlp": 0.09954834, "step": 16589, "time_per_iteration": 3.93924617767334 }, { "auxiliary_loss_clip": 0.06420074, "auxiliary_loss_mlp": 0.01264625, "balance_loss_clip": 0.06275241, "balance_loss_mlp": 0.01254302, "epoch": 0.9974447617616113, "flos": 20050929976320.0, "grad_norm": 2.1200768557957357, "language_loss": 0.71052122, "learning_rate": 6.689068412168986e-11, "loss": 0.78736818, "num_input_tokens_seen": 358000290, "router_z_loss_clip": 1.44726562, "router_z_loss_mlp": 0.10327148, "step": 16590, "time_per_iteration": 2.5698838233947754 }, { "auxiliary_loss_clip": 0.0641761, "auxiliary_loss_mlp": 0.01265174, "balance_loss_clip": 0.06276654, "balance_loss_mlp": 0.01255608, "epoch": 0.9975048850142793, "flos": 32022744330240.0, "grad_norm": 1.7389163652090587, "language_loss": 0.64063513, "learning_rate": 6.374335005676634e-11, "loss": 0.7174629, "num_input_tokens_seen": 358022075, "router_z_loss_clip": 1.40722656, "router_z_loss_mlp": 0.09558105, "step": 16591, "time_per_iteration": 2.6742424964904785 }, { "auxiliary_loss_clip": 0.06411637, "auxiliary_loss_mlp": 0.01265987, "balance_loss_clip": 0.06272455, "balance_loss_mlp": 0.01256975, "epoch": 0.9975650082669473, "flos": 36941600488320.0, "grad_norm": 1.632169724182881, "language_loss": 0.73155552, "learning_rate": 6.067185377522933e-11, "loss": 0.80833179, "num_input_tokens_seen": 358043940, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09014893, "step": 16592, "time_per_iteration": 2.6622238159179688 }, { "auxiliary_loss_clip": 0.06411026, "auxiliary_loss_mlp": 0.01265795, "balance_loss_clip": 0.06272322, "balance_loss_mlp": 0.01256395, "epoch": 0.9976251315196152, "flos": 16477722063360.0, "grad_norm": 1.517605253559676, "language_loss": 0.8529368, "learning_rate": 5.767619539343016e-11, "loss": 0.92970502, "num_input_tokens_seen": 358062720, "router_z_loss_clip": 1.38769531, "router_z_loss_mlp": 0.09405518, "step": 16593, "time_per_iteration": 2.567110061645508 }, { "auxiliary_loss_clip": 0.06412858, "auxiliary_loss_mlp": 0.01265993, "balance_loss_clip": 0.06275433, "balance_loss_mlp": 0.01257237, "epoch": 0.9976852547722832, "flos": 19653048063360.0, "grad_norm": 1.6091464551511658, "language_loss": 0.69694477, "learning_rate": 5.4756375024833656e-11, "loss": 0.77373326, "num_input_tokens_seen": 358081560, "router_z_loss_clip": 1.37207031, "router_z_loss_mlp": 0.08758545, "step": 16594, "time_per_iteration": 2.533597946166992 }, { "auxiliary_loss_clip": 0.0641581, "auxiliary_loss_mlp": 0.01265242, "balance_loss_clip": 0.06274012, "balance_loss_mlp": 0.01256034, "epoch": 0.9977453780249511, "flos": 20454597820800.0, "grad_norm": 1.879236259792407, "language_loss": 0.72995526, "learning_rate": 5.1912392780462113e-11, "loss": 0.80676579, "num_input_tokens_seen": 358099065, "router_z_loss_clip": 1.41796875, "router_z_loss_mlp": 0.09216309, "step": 16595, "time_per_iteration": 2.596459150314331 }, { "auxiliary_loss_clip": 0.06316892, "auxiliary_loss_mlp": 0.01251321, "balance_loss_clip": 0.06261711, "balance_loss_mlp": 0.01250377, "epoch": 0.9978055012776191, "flos": 65472085549440.0, "grad_norm": 0.7884565197645217, "language_loss": 0.60431039, "learning_rate": 4.9144248768007156e-11, "loss": 0.6799925, "num_input_tokens_seen": 358156095, "router_z_loss_clip": 0.55224609, "router_z_loss_mlp": 0.0094223, "step": 16596, "time_per_iteration": 3.039524555206299 }, { "auxiliary_loss_clip": 0.06412112, "auxiliary_loss_mlp": 0.01263923, "balance_loss_clip": 0.06274065, "balance_loss_mlp": 0.01254177, "epoch": 0.997865624530287, "flos": 20637808773120.0, "grad_norm": 2.0138776127336357, "language_loss": 0.77587152, "learning_rate": 4.645194309227385e-11, "loss": 0.85263181, "num_input_tokens_seen": 358175230, "router_z_loss_clip": 1.38085938, "router_z_loss_mlp": 0.09747314, "step": 16597, "time_per_iteration": 2.5945422649383545 }, { "auxiliary_loss_clip": 0.06416707, "auxiliary_loss_mlp": 0.0126522, "balance_loss_clip": 0.06276128, "balance_loss_mlp": 0.01255671, "epoch": 0.9979257477829551, "flos": 29394703272960.0, "grad_norm": 1.749855449801116, "language_loss": 0.81722951, "learning_rate": 4.383547585562475e-11, "loss": 0.89404869, "num_input_tokens_seen": 358197075, "router_z_loss_clip": 1.40527344, "router_z_loss_mlp": 0.09552002, "step": 16598, "time_per_iteration": 2.6161742210388184 }, { "auxiliary_loss_clip": 0.06419125, "auxiliary_loss_mlp": 0.01266512, "balance_loss_clip": 0.06274474, "balance_loss_mlp": 0.01255395, "epoch": 0.997985871035623, "flos": 22641180042240.0, "grad_norm": 2.1536349209780377, "language_loss": 0.6488601, "learning_rate": 4.129484715709175e-11, "loss": 0.72571647, "num_input_tokens_seen": 358215925, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.11114502, "step": 16599, "time_per_iteration": 2.594242811203003 }, { "auxiliary_loss_clip": 0.06317773, "auxiliary_loss_mlp": 0.0125117, "balance_loss_clip": 0.06262612, "balance_loss_mlp": 0.01250241, "epoch": 0.998045994288291, "flos": 61823421434880.0, "grad_norm": 0.8361596611730041, "language_loss": 0.62340021, "learning_rate": 3.8830057093264256e-11, "loss": 0.69908965, "num_input_tokens_seen": 358269035, "router_z_loss_clip": 0.55126953, "router_z_loss_mlp": 0.00927734, "step": 16600, "time_per_iteration": 3.096358299255371 }, { "auxiliary_loss_clip": 0.064097, "auxiliary_loss_mlp": 0.0126377, "balance_loss_clip": 0.06272204, "balance_loss_mlp": 0.01254966, "epoch": 0.998106117540959, "flos": 19251686206080.0, "grad_norm": 1.5462535599928044, "language_loss": 0.79068762, "learning_rate": 3.644110575717896e-11, "loss": 0.86742234, "num_input_tokens_seen": 358287680, "router_z_loss_clip": 1.37597656, "router_z_loss_mlp": 0.08795166, "step": 16601, "time_per_iteration": 2.563758611679077 }, { "auxiliary_loss_clip": 0.06421974, "auxiliary_loss_mlp": 0.01266221, "balance_loss_clip": 0.06278677, "balance_loss_mlp": 0.01256792, "epoch": 0.9981662407936269, "flos": 21112656261120.0, "grad_norm": 2.1174482110128845, "language_loss": 0.83001894, "learning_rate": 3.412799323987414e-11, "loss": 0.90690088, "num_input_tokens_seen": 358304080, "router_z_loss_clip": 1.43066406, "router_z_loss_mlp": 0.09436035, "step": 16602, "time_per_iteration": 2.5407657623291016 }, { "auxiliary_loss_clip": 0.06413461, "auxiliary_loss_mlp": 0.01265765, "balance_loss_clip": 0.06274379, "balance_loss_mlp": 0.01256396, "epoch": 0.998226364046295, "flos": 24323802681600.0, "grad_norm": 2.157835018834239, "language_loss": 0.62375307, "learning_rate": 3.189071962883538e-11, "loss": 0.70054531, "num_input_tokens_seen": 358323670, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.09362793, "step": 16603, "time_per_iteration": 2.5770633220672607 }, { "auxiliary_loss_clip": 0.06415757, "auxiliary_loss_mlp": 0.01267385, "balance_loss_clip": 0.06275942, "balance_loss_mlp": 0.01257593, "epoch": 0.9982864872989629, "flos": 23842246867200.0, "grad_norm": 1.9040767517645407, "language_loss": 0.71523786, "learning_rate": 2.972928500866168e-11, "loss": 0.79206932, "num_input_tokens_seen": 358341980, "router_z_loss_clip": 1.39648438, "router_z_loss_mlp": 0.09796143, "step": 16604, "time_per_iteration": 2.5662765502929688 }, { "auxiliary_loss_clip": 0.06414452, "auxiliary_loss_mlp": 0.0126191, "balance_loss_clip": 0.06275237, "balance_loss_mlp": 0.01251897, "epoch": 0.9983466105516309, "flos": 18339069461760.0, "grad_norm": 1.510375203675967, "language_loss": 0.64508545, "learning_rate": 2.7643689461953613e-11, "loss": 0.72184902, "num_input_tokens_seen": 358360400, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.10015869, "step": 16605, "time_per_iteration": 2.542088031768799 }, { "auxiliary_loss_clip": 0.06410658, "auxiliary_loss_mlp": 0.01262835, "balance_loss_clip": 0.06274021, "balance_loss_mlp": 0.01253197, "epoch": 0.9984067338042988, "flos": 17242235516160.0, "grad_norm": 1.9802638091454614, "language_loss": 0.71482015, "learning_rate": 2.5633933067092938e-11, "loss": 0.79155505, "num_input_tokens_seen": 358378990, "router_z_loss_clip": 1.36328125, "router_z_loss_mlp": 0.09631348, "step": 16606, "time_per_iteration": 2.548577070236206 }, { "auxiliary_loss_clip": 0.06414983, "auxiliary_loss_mlp": 0.01265515, "balance_loss_clip": 0.06277017, "balance_loss_mlp": 0.01256539, "epoch": 0.9984668570569668, "flos": 20674174245120.0, "grad_norm": 1.8565514781730028, "language_loss": 0.82055396, "learning_rate": 2.370001590090709e-11, "loss": 0.89735895, "num_input_tokens_seen": 358395970, "router_z_loss_clip": 1.37890625, "router_z_loss_mlp": 0.08972168, "step": 16607, "time_per_iteration": 3.986057996749878 }, { "auxiliary_loss_clip": 0.06421256, "auxiliary_loss_mlp": 0.01265261, "balance_loss_clip": 0.06277361, "balance_loss_mlp": 0.01255045, "epoch": 0.9985269803096347, "flos": 30270241785600.0, "grad_norm": 1.5258286064962303, "language_loss": 0.67028439, "learning_rate": 2.184193803622669e-11, "loss": 0.74714959, "num_input_tokens_seen": 358417355, "router_z_loss_clip": 1.44140625, "router_z_loss_mlp": 0.10211182, "step": 16608, "time_per_iteration": 2.6382007598876953 }, { "auxiliary_loss_clip": 0.06417657, "auxiliary_loss_mlp": 0.01262255, "balance_loss_clip": 0.06277904, "balance_loss_mlp": 0.01253034, "epoch": 0.9985871035623027, "flos": 10565510152320.0, "grad_norm": 2.0270744186035223, "language_loss": 0.80819356, "learning_rate": 2.0059699543883978e-11, "loss": 0.88499272, "num_input_tokens_seen": 358434345, "router_z_loss_clip": 1.39550781, "router_z_loss_mlp": 0.09222412, "step": 16609, "time_per_iteration": 2.5928781032562256 }, { "auxiliary_loss_clip": 0.06415799, "auxiliary_loss_mlp": 0.01264785, "balance_loss_clip": 0.06275518, "balance_loss_mlp": 0.01255594, "epoch": 0.9986472268149706, "flos": 16879125847680.0, "grad_norm": 1.5304103839727605, "language_loss": 0.63035083, "learning_rate": 1.8353300491158462e-11, "loss": 0.70715672, "num_input_tokens_seen": 358452870, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09191895, "step": 16610, "time_per_iteration": 2.542905807495117 }, { "auxiliary_loss_clip": 0.06409512, "auxiliary_loss_mlp": 0.01263905, "balance_loss_clip": 0.06271941, "balance_loss_mlp": 0.01255238, "epoch": 0.9987073500676387, "flos": 22061093425920.0, "grad_norm": 2.324523681459263, "language_loss": 0.67403871, "learning_rate": 1.672274094288717e-11, "loss": 0.75077289, "num_input_tokens_seen": 358472210, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.08673096, "step": 16611, "time_per_iteration": 2.5580859184265137 }, { "auxiliary_loss_clip": 0.06416328, "auxiliary_loss_mlp": 0.01265354, "balance_loss_clip": 0.06276952, "balance_loss_mlp": 0.01255769, "epoch": 0.9987674733203066, "flos": 30490866385920.0, "grad_norm": 1.601614822434397, "language_loss": 0.70024145, "learning_rate": 1.5168020961020544e-11, "loss": 0.77705824, "num_input_tokens_seen": 358493840, "router_z_loss_clip": 1.39355469, "router_z_loss_mlp": 0.0958252, "step": 16612, "time_per_iteration": 2.635896682739258 }, { "auxiliary_loss_clip": 0.06409249, "auxiliary_loss_mlp": 0.01262675, "balance_loss_clip": 0.06275681, "balance_loss_mlp": 0.01254146, "epoch": 0.9988275965729746, "flos": 27752554955520.0, "grad_norm": 1.5229887519757337, "language_loss": 0.74186945, "learning_rate": 1.3689140604400407e-11, "loss": 0.81858873, "num_input_tokens_seen": 358515060, "router_z_loss_clip": 1.33496094, "router_z_loss_mlp": 0.08526611, "step": 16613, "time_per_iteration": 4.112160682678223 }, { "auxiliary_loss_clip": 0.06415891, "auxiliary_loss_mlp": 0.0126737, "balance_loss_clip": 0.06275882, "balance_loss_mlp": 0.01257291, "epoch": 0.9988877198256426, "flos": 17528966588160.0, "grad_norm": 2.004547042040472, "language_loss": 0.74075603, "learning_rate": 1.2286099928981996e-11, "loss": 0.81758863, "num_input_tokens_seen": 358528200, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.10089111, "step": 16614, "time_per_iteration": 2.5302703380584717 }, { "auxiliary_loss_clip": 0.06413882, "auxiliary_loss_mlp": 0.01267154, "balance_loss_clip": 0.06274468, "balance_loss_mlp": 0.01257254, "epoch": 0.9989478430783105, "flos": 21002889012480.0, "grad_norm": 1.5774105456983087, "language_loss": 0.72812665, "learning_rate": 1.0958898988278065e-11, "loss": 0.80493689, "num_input_tokens_seen": 358548360, "router_z_loss_clip": 1.39257812, "router_z_loss_mlp": 0.09893799, "step": 16615, "time_per_iteration": 2.5798916816711426 }, { "auxiliary_loss_clip": 0.06419629, "auxiliary_loss_mlp": 0.01263094, "balance_loss_clip": 0.06277589, "balance_loss_mlp": 0.01253432, "epoch": 0.9990079663309785, "flos": 13375672058880.0, "grad_norm": 2.526168770647519, "language_loss": 0.77896804, "learning_rate": 9.70753783247069e-12, "loss": 0.85579526, "num_input_tokens_seen": 358566270, "router_z_loss_clip": 1.41992188, "router_z_loss_mlp": 0.09661865, "step": 16616, "time_per_iteration": 2.530620813369751 }, { "auxiliary_loss_clip": 0.06412687, "auxiliary_loss_mlp": 0.01268521, "balance_loss_clip": 0.06274866, "balance_loss_mlp": 0.01259259, "epoch": 0.9990680895836465, "flos": 17315805000960.0, "grad_norm": 2.4815928509490317, "language_loss": 0.82960796, "learning_rate": 8.532016508855378e-12, "loss": 0.90642005, "num_input_tokens_seen": 358584710, "router_z_loss_clip": 1.37792969, "router_z_loss_mlp": 0.09259033, "step": 16617, "time_per_iteration": 2.5429790019989014 }, { "auxiliary_loss_clip": 0.06410252, "auxiliary_loss_mlp": 0.01263012, "balance_loss_clip": 0.06273306, "balance_loss_mlp": 0.01254519, "epoch": 0.9991282128363145, "flos": 24215041681920.0, "grad_norm": 2.439870397362243, "language_loss": 0.78587651, "learning_rate": 7.43233506206309e-12, "loss": 0.86260915, "num_input_tokens_seen": 358606750, "router_z_loss_clip": 1.36914062, "router_z_loss_mlp": 0.0848999, "step": 16618, "time_per_iteration": 2.5706913471221924 }, { "auxiliary_loss_clip": 0.06408813, "auxiliary_loss_mlp": 0.01264786, "balance_loss_clip": 0.06271481, "balance_loss_mlp": 0.012551, "epoch": 0.9991883360889824, "flos": 21181110647040.0, "grad_norm": 1.5592325769903175, "language_loss": 0.75000763, "learning_rate": 6.408493534060255e-12, "loss": 0.8267436, "num_input_tokens_seen": 358624675, "router_z_loss_clip": 1.37304688, "router_z_loss_mlp": 0.09692383, "step": 16619, "time_per_iteration": 2.546924591064453 }, { "auxiliary_loss_clip": 0.06408544, "auxiliary_loss_mlp": 0.01264509, "balance_loss_clip": 0.06273443, "balance_loss_mlp": 0.01256153, "epoch": 0.9992484593416504, "flos": 19907229024000.0, "grad_norm": 1.946358784723238, "language_loss": 0.86927664, "learning_rate": 5.460491963260594e-12, "loss": 0.94600719, "num_input_tokens_seen": 358640715, "router_z_loss_clip": 1.35058594, "router_z_loss_mlp": 0.08355713, "step": 16620, "time_per_iteration": 2.5827760696411133 }, { "auxiliary_loss_clip": 0.06408563, "auxiliary_loss_mlp": 0.01266408, "balance_loss_clip": 0.06272762, "balance_loss_mlp": 0.01257968, "epoch": 0.9993085825943183, "flos": 24863834246400.0, "grad_norm": 2.172177901077028, "language_loss": 0.72748303, "learning_rate": 4.58833038607942e-12, "loss": 0.80423278, "num_input_tokens_seen": 358659630, "router_z_loss_clip": 1.35839844, "router_z_loss_mlp": 0.08441162, "step": 16621, "time_per_iteration": 2.5814592838287354 }, { "auxiliary_loss_clip": 0.06316176, "auxiliary_loss_mlp": 0.01251232, "balance_loss_clip": 0.06260981, "balance_loss_mlp": 0.0125028, "epoch": 0.9993687058469863, "flos": 71307149448960.0, "grad_norm": 0.7280139233091107, "language_loss": 0.56441545, "learning_rate": 3.79200883515729e-12, "loss": 0.64008951, "num_input_tokens_seen": 358727840, "router_z_loss_clip": 0.55224609, "router_z_loss_mlp": 0.00950623, "step": 16622, "time_per_iteration": 3.357063055038452 }, { "auxiliary_loss_clip": 0.06412692, "auxiliary_loss_mlp": 0.01266348, "balance_loss_clip": 0.06273264, "balance_loss_mlp": 0.01256675, "epoch": 0.9994288290996542, "flos": 12203843109120.0, "grad_norm": 1.8079472759839297, "language_loss": 0.71053648, "learning_rate": 3.071527340914315e-12, "loss": 0.78732693, "num_input_tokens_seen": 358744125, "router_z_loss_clip": 1.39746094, "router_z_loss_mlp": 0.09667969, "step": 16623, "time_per_iteration": 2.5295517444610596 }, { "auxiliary_loss_clip": 0.06415717, "auxiliary_loss_mlp": 0.01264558, "balance_loss_clip": 0.06278373, "balance_loss_mlp": 0.01255051, "epoch": 0.9994889523523223, "flos": 17894927295360.0, "grad_norm": 1.7904736734099538, "language_loss": 0.74828893, "learning_rate": 2.4268859304399368e-12, "loss": 0.82509166, "num_input_tokens_seen": 358761420, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.09515381, "step": 16624, "time_per_iteration": 2.5314252376556396 }, { "auxiliary_loss_clip": 0.06417157, "auxiliary_loss_mlp": 0.01264354, "balance_loss_clip": 0.06276938, "balance_loss_mlp": 0.01254889, "epoch": 0.9995490756049902, "flos": 26586218448000.0, "grad_norm": 1.4414209972806935, "language_loss": 0.73939145, "learning_rate": 1.8580846286031514e-12, "loss": 0.81620651, "num_input_tokens_seen": 358782600, "router_z_loss_clip": 1.40234375, "router_z_loss_mlp": 0.09472656, "step": 16625, "time_per_iteration": 4.0158960819244385 }, { "auxiliary_loss_clip": 0.06407734, "auxiliary_loss_mlp": 0.01264855, "balance_loss_clip": 0.06273259, "balance_loss_mlp": 0.01255277, "epoch": 0.9996091988576582, "flos": 22206555313920.0, "grad_norm": 2.341288381221055, "language_loss": 0.77753353, "learning_rate": 1.3651234567202408e-12, "loss": 0.85425943, "num_input_tokens_seen": 358801220, "router_z_loss_clip": 1.34472656, "router_z_loss_mlp": 0.09570312, "step": 16626, "time_per_iteration": 2.5611071586608887 }, { "auxiliary_loss_clip": 0.06411076, "auxiliary_loss_mlp": 0.01267565, "balance_loss_clip": 0.06274457, "balance_loss_mlp": 0.01258147, "epoch": 0.9996693221103262, "flos": 27379257016320.0, "grad_norm": 1.8520346486666854, "language_loss": 0.82202733, "learning_rate": 9.480024334429515e-13, "loss": 0.89881372, "num_input_tokens_seen": 358819190, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.09411621, "step": 16627, "time_per_iteration": 2.63284969329834 }, { "auxiliary_loss_clip": 0.06418199, "auxiliary_loss_mlp": 0.01267868, "balance_loss_clip": 0.06275615, "balance_loss_mlp": 0.01257741, "epoch": 0.9997294453629941, "flos": 26877729202560.0, "grad_norm": 1.8342231957248316, "language_loss": 0.70816696, "learning_rate": 6.067215747584952e-13, "loss": 0.78502762, "num_input_tokens_seen": 358839850, "router_z_loss_clip": 1.42675781, "router_z_loss_mlp": 0.10125732, "step": 16628, "time_per_iteration": 4.188854455947876 }, { "auxiliary_loss_clip": 0.06414019, "auxiliary_loss_mlp": 0.01264457, "balance_loss_clip": 0.06275015, "balance_loss_mlp": 0.01255361, "epoch": 0.9997895686156621, "flos": 23483707246080.0, "grad_norm": 1.3464690263004402, "language_loss": 0.75835133, "learning_rate": 3.4128089332341456e-13, "loss": 0.83513612, "num_input_tokens_seen": 358859805, "router_z_loss_clip": 1.39160156, "router_z_loss_mlp": 0.09100342, "step": 16629, "time_per_iteration": 2.53574800491333 }, { "auxiliary_loss_clip": 0.06419675, "auxiliary_loss_mlp": 0.01266778, "balance_loss_clip": 0.06275659, "balance_loss_mlp": 0.01256919, "epoch": 0.9998496918683301, "flos": 20230325568000.0, "grad_norm": 1.6527256065572242, "language_loss": 0.60943401, "learning_rate": 1.5168039935176126e-13, "loss": 0.68629849, "num_input_tokens_seen": 358877900, "router_z_loss_clip": 1.44042969, "router_z_loss_mlp": 0.09857178, "step": 16630, "time_per_iteration": 2.6088366508483887 }, { "auxiliary_loss_clip": 0.0641562, "auxiliary_loss_mlp": 0.01264771, "balance_loss_clip": 0.06275777, "balance_loss_mlp": 0.01255402, "epoch": 0.9999098151209981, "flos": 21659354225280.0, "grad_norm": 1.9969060508317158, "language_loss": 0.61029291, "learning_rate": 3.792010017100722e-14, "loss": 0.68709683, "num_input_tokens_seen": 358897285, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.09375, "step": 16631, "time_per_iteration": 2.5161359310150146 }, { "auxiliary_loss_clip": 0.06409271, "auxiliary_loss_mlp": 0.0126539, "balance_loss_clip": 0.06275015, "balance_loss_mlp": 0.01256897, "epoch": 0.999969938373666, "flos": 11549054977920.0, "grad_norm": 1.8406002652203113, "language_loss": 0.72653902, "learning_rate": 0.0, "loss": 0.80328566, "num_input_tokens_seen": 358911570, "router_z_loss_clip": 1.34277344, "router_z_loss_mlp": 0.0848999, "step": 16632, "time_per_iteration": 2.4930031299591064 } ], "logging_steps": 1.0, "max_steps": 16632, "num_input_tokens_seen": 358911570, "num_train_epochs": 1, "save_steps": 3328, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.399648566653223e+18, "train_batch_size": 5, "trial_name": null, "trial_params": null }